crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crfm-helm might be problematic.

Files changed (268)
  1. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
  2. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
  3. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
  5. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  6. helm/benchmark/annotation/air_bench_annotator.py +2 -2
  7. helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
  8. helm/benchmark/annotation/bird_sql_annotator.py +2 -2
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
  10. helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
  11. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
  12. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  13. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  14. helm/benchmark/annotation/model_as_judge.py +12 -16
  15. helm/benchmark/annotation/omni_math_annotator.py +13 -14
  16. helm/benchmark/annotation/wildbench_annotator.py +9 -9
  17. helm/benchmark/executor.py +11 -12
  18. helm/benchmark/metrics/aci_bench_metrics.py +9 -29
  19. helm/benchmark/metrics/bias_word_lists.py +1 -1
  20. helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
  21. helm/benchmark/metrics/classification_metrics.py +3 -3
  22. helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
  23. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  24. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  25. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  26. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  27. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  28. helm/benchmark/metrics/comet_metric.py +1 -1
  29. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
  30. helm/benchmark/metrics/copyright_metrics.py +1 -1
  31. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  32. helm/benchmark/metrics/dischargeme_metrics.py +9 -29
  33. helm/benchmark/metrics/efficiency_metrics.py +3 -3
  34. helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
  35. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  36. helm/benchmark/metrics/ifeval_metrics.py +2 -2
  37. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  38. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  39. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  40. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  41. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  42. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  43. helm/benchmark/metrics/med_dialog_metrics.py +9 -29
  44. helm/benchmark/metrics/medalign_metrics.py +9 -29
  45. helm/benchmark/metrics/medi_qa_metrics.py +9 -29
  46. helm/benchmark/metrics/medication_qa_metrics.py +10 -30
  47. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  48. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  49. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  50. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  51. helm/benchmark/metrics/mental_health_metrics.py +9 -29
  52. helm/benchmark/metrics/metric_service.py +11 -11
  53. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  54. helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
  55. helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
  56. helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
  57. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  58. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  59. helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
  60. helm/benchmark/metrics/summac/model_summac.py +2 -3
  61. helm/benchmark/metrics/summarization_metrics.py +2 -1
  62. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
  63. helm/benchmark/metrics/toxicity_metrics.py +2 -2
  64. helm/benchmark/metrics/unitxt_metrics.py +3 -4
  65. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  66. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  67. helm/benchmark/model_deployment_registry.py +16 -26
  68. helm/benchmark/presentation/contamination.py +3 -3
  69. helm/benchmark/presentation/create_plots.py +43 -13
  70. helm/benchmark/presentation/run_display.py +13 -0
  71. helm/benchmark/presentation/schema.py +7 -1
  72. helm/benchmark/presentation/summarize.py +84 -61
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/reeval_run.py +3 -4
  75. helm/benchmark/reeval_runner.py +3 -3
  76. helm/benchmark/run.py +84 -73
  77. helm/benchmark/run_expander.py +12 -1
  78. helm/benchmark/run_spec_factory.py +7 -6
  79. helm/benchmark/run_specs/arabic_run_specs.py +73 -0
  80. helm/benchmark/run_specs/audio_run_specs.py +52 -8
  81. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  82. helm/benchmark/run_specs/classic_run_specs.py +0 -53
  83. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  84. helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
  85. helm/benchmark/run_specs/experimental_run_specs.py +31 -1
  86. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  87. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  88. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  89. helm/benchmark/run_specs/long_context_run_specs.py +114 -15
  90. helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
  91. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  92. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  93. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
  94. helm/benchmark/run_specs/vlm_run_specs.py +28 -0
  95. helm/benchmark/runner.py +5 -5
  96. helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
  97. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  98. helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
  99. helm/benchmark/scenarios/aratrust_scenario.py +76 -0
  100. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
  101. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
  102. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  103. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  104. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  105. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
  106. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
  107. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
  108. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
  109. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
  110. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
  111. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
  112. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
  113. helm/benchmark/scenarios/bluex_scenario.py +66 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
  115. helm/benchmark/scenarios/clear_scenario.py +11 -7
  116. helm/benchmark/scenarios/cleva_scenario.py +1 -1
  117. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  118. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  119. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  120. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  121. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  122. helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
  123. helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
  124. helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
  125. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  126. helm/benchmark/scenarios/grammar.py +2 -2
  127. helm/benchmark/scenarios/headqa_scenario.py +6 -1
  128. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  129. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
  130. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  131. helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
  132. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  133. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  134. helm/benchmark/scenarios/math_scenario.py +21 -20
  135. helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
  136. helm/benchmark/scenarios/medalign_scenario.py +9 -3
  137. helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
  138. helm/benchmark/scenarios/medbullets_scenario.py +7 -2
  139. helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
  140. helm/benchmark/scenarios/medec_scenario.py +6 -1
  141. helm/benchmark/scenarios/medhallu_scenario.py +7 -1
  142. helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
  143. helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
  144. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  145. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  146. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  147. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  148. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  149. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  150. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  151. helm/benchmark/scenarios/mental_health_scenario.py +16 -5
  152. helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
  153. helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
  154. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
  155. helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
  156. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  157. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
  158. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
  159. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
  160. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  161. helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
  162. helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
  163. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
  164. helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
  165. helm/benchmark/scenarios/seahelm_scenario.py +2 -2
  166. helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
  167. helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
  168. helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
  169. helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
  170. helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
  171. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  172. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  173. helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
  174. helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
  175. helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
  176. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
  177. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  178. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  179. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  180. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  181. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  182. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  183. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  184. helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
  185. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  186. helm/benchmark/server.py +2 -1
  187. helm/benchmark/slurm_jobs.py +1 -2
  188. helm/benchmark/slurm_runner.py +8 -1
  189. helm/benchmark/static/schema_arabic.yaml +228 -0
  190. helm/benchmark/static/schema_audio.yaml +60 -49
  191. helm/benchmark/static/schema_classic.yaml +0 -17
  192. helm/benchmark/static/schema_enterprise.yaml +21 -0
  193. helm/benchmark/static/schema_long_context.yaml +81 -20
  194. helm/benchmark/static/schema_medhelm.yaml +272 -213
  195. helm/benchmark/static/schema_melt.yaml +1257 -0
  196. helm/benchmark/static/schema_slphelm.yaml +162 -0
  197. helm/benchmark/static/schema_vhelm.yaml +26 -26
  198. helm/benchmark/static/schema_video.yaml +219 -0
  199. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  200. helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
  201. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  202. helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
  203. helm/benchmark/static_build/index.html +4 -4
  204. helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
  205. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  206. helm/benchmark/window_services/test_utils.py +3 -4
  207. helm/benchmark/window_services/tokenizer_service.py +7 -8
  208. helm/clients/anthropic_client.py +69 -29
  209. helm/clients/audio_language/diva_llama_client.py +4 -2
  210. helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
  211. helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
  212. helm/clients/audio_language/qwen_audiolm_client.py +4 -2
  213. helm/clients/audio_language/test.py +62 -0
  214. helm/clients/bedrock_client.py +3 -1
  215. helm/clients/client.py +7 -7
  216. helm/clients/grok_client.py +36 -0
  217. helm/clients/huggingface_client.py +42 -3
  218. helm/clients/huggingface_pipeline_client.py +138 -0
  219. helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
  220. helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
  221. helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
  222. helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
  223. helm/clients/openai_client.py +102 -55
  224. helm/clients/openai_responses_client.py +176 -0
  225. helm/clients/palmyra_client.py +2 -5
  226. helm/clients/reka_client.py +2 -2
  227. helm/clients/test_huggingface_client.py +3 -3
  228. helm/clients/together_client.py +31 -6
  229. helm/clients/vertexai_client.py +17 -9
  230. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  231. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  232. helm/clients/vision_language/idefics_client.py +6 -2
  233. helm/clients/vision_language/paligemma_client.py +2 -2
  234. helm/clients/vision_language/qwen2_vlm_client.py +66 -53
  235. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  236. helm/clients/vllm_client.py +43 -7
  237. helm/clients/vllm_granite_thinking_client.py +56 -0
  238. helm/clients/writer_client.py +102 -0
  239. helm/common/context.py +80 -0
  240. helm/common/credentials_utils.py +5 -5
  241. helm/common/critique_request.py +0 -1
  242. helm/common/general.py +9 -2
  243. helm/common/hierarchical_logger.py +104 -12
  244. helm/common/local_context.py +140 -0
  245. helm/common/object_spec.py +23 -8
  246. helm/common/remote_context.py +61 -0
  247. helm/common/request.py +8 -0
  248. helm/common/test_logging.py +94 -0
  249. helm/config/model_deployments.yaml +995 -45
  250. helm/config/model_metadata.yaml +780 -59
  251. helm/config/tokenizer_configs.yaml +224 -3
  252. helm/proxy/cli.py +4 -2
  253. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  254. helm/proxy/retry.py +5 -0
  255. helm/proxy/services/server_service.py +21 -85
  256. helm/tokenizers/grok_tokenizer.py +55 -0
  257. helm/tokenizers/huggingface_tokenizer.py +1 -1
  258. helm/tokenizers/test_grok_tokenizer.py +33 -0
  259. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  260. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  261. helm/benchmark/scenarios/numeracy_scenario.py +0 -793
  262. helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
  263. helm/benchmark/static_build/assets/index-262903c1.js +0 -10
  264. helm/benchmark/static_build/assets/index-42060d71.css +0 -1
  265. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
  266. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
  267. {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
  268. /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/config/tokenizer_configs.yaml CHANGED
@@ -252,6 +252,25 @@ tokenizer_configs:
  end_of_text_token: "<eos>"
  prefix_token: "<bos>"
 
+ # Grok
+ - name: xai/grok-3-beta
+ tokenizer_spec:
+ class_name: "helm.tokenizers.grok_tokenizer.GrokAPITokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+
+ - name: xai/grok-3-mini-beta
+ tokenizer_spec:
+ class_name: "helm.tokenizers.grok_tokenizer.GrokAPITokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+
+ - name: xai/grok-4-0709
+ tokenizer_spec:
+ class_name: "helm.tokenizers.grok_tokenizer.GrokAPITokenizer"
+ end_of_text_token: ""
+ prefix_token: ""
+
 
  # Hf-internal-testing
 
  # Tokenizer name hf-internal-testing/llama-tokenizer is taken from:
@@ -299,6 +318,22 @@ tokenizer_configs:
  end_of_text_token: "<|endoftext|>"
  prefix_token: "<|endoftext|>"
 
+ - name: huggingface/smollm2-135m
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M
+ end_of_text_token: "<|endoftext|>"
+ prefix_token: "<|endoftext|>"
+
+ - name: huggingface/smollm2-135m-instruct
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M-Instruct
+ end_of_text_token: "<|endoftext|>"
+ prefix_token: "<|im_end|>"
+
  # Lighting AI
  - name: lightningai/lit-gpt
  tokenizer_spec:
@@ -396,6 +431,14 @@ tokenizer_configs:
  prefix_token: "<|begin_of_text|>"
  end_of_text_token: "<|eot_id|>"
 
+ - name: meta/llama-4-scout-17b-16e-instruct
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: meta-llama/Llama-4-Scout-17B-16E-Instruct
+ prefix_token: "<|begin_of_text|>"
+ end_of_text_token: "<|end_of_text|>"
+
  # 01-ai
  - name: 01-ai/Yi-6B
  tokenizer_spec:
@@ -432,9 +475,33 @@ tokenizer_configs:
  end_of_text_token: "<|endoftext|>"
  prefix_token: ""
 
+ - name: allenai/olmo-2-1124-7b-instruct
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: "<|endoftext|>"
+ prefix_token: "<|endoftext|>"
+
+ - name: allenai/olmo-2-0325-32b-instruct
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: "<|endoftext|>"
+ prefix_token: "<|endoftext|>"
+
+ - name: allenai/olmoe-1b-7b-0125-instruct
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: "|||IP_ADDRESS|||"
+ prefix_token: "|||IP_ADDRESS|||"
+
+ # Marin Community
+ - name: marin-community/marin-8b-instruct
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ end_of_text_token: "<|eot_id|>"
+ prefix_token: "<|begin_of_text|>"
 
  # Microsoft
- - name: microsoft/phi-2
+ - name: microsoft/phi-2
  tokenizer_spec:
  class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
  end_of_text_token: "<|endoftext|>"
@@ -521,6 +588,17 @@ tokenizer_configs:
  end_of_text_token: "</s>"
  prefix_token: "<s>"
 
+ # Moonshot AI
+ - name: moonshotai/kimi-k2-instruct
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: moonshotai/Kimi-K2-Instruct
+ trust_remote_code: true
+ revision: 4f239503ad9d1a042f0a4bacac457931ab972cfc
+ end_of_text_token: "[EOS]"
+ prefix_token: "[BOS]"
+
  # Nectec
  - name: nectec/OpenThaiLLM-Prebuilt-7B
  tokenizer_spec:
@@ -619,6 +697,14 @@ tokenizer_configs:
  end_of_text_token: "<|im_end|>"
  prefix_token: "<|im_start|>"
 
+ - name: qwen/qwen3-235b-a22b
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: Qwen/Qwen3-235B-A22B
+ end_of_text_token: "<|im_end|>"
+ prefix_token: "<|im_start|>"
+
  - name: qwen/qwq-32b-preview
  tokenizer_spec:
  class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -664,6 +750,15 @@ tokenizer_configs:
  end_of_text_token: "<|endoftext|>"
  prefix_token: ""
 
+ - name: qwen/qwen2.5-omni-7b
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: Qwen/Qwen2.5-Omni-7B
+ trust_remote_code: false
+ end_of_text_token: "<|endoftext|>"
+ prefix_token: ""
+
  # SambaLingo
  - name: sambanova/sambalingo-thai-base
  tokenizer_spec:
@@ -814,6 +909,7 @@ tokenizer_configs:
  end_of_text_token: ""
  prefix_token: ""
 
+ # Maritaca AI
  - name: maritaca-ai/sabia-7b
  tokenizer_spec:
  class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -822,6 +918,14 @@ tokenizer_configs:
  end_of_text_token: "</s>"
  prefix_token: "<s>"
 
+ - name: maritaca-ai/sabia-2-tokenizer-medium
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: maritaca-ai/sabia-2-tokenizer-medium
+ end_of_text_token: "</s>"
+ prefix_token: "<s>"
+
  # Granite-3.1-8b-base
  - name: ibm-granite/granite-3.1-8b-base
  tokenizer_spec:
@@ -910,8 +1014,6 @@ tokenizer_configs:
  prefix_token: ""
  end_of_text_token: ""
 
-
-
  - name: ibm-granite/granite-34b-code-instruct-8k
  tokenizer_spec:
  class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -945,7 +1047,17 @@ tokenizer_configs:
  prefix_token: ""
  end_of_text_token: ""
 
+ # IBM Granite 3.3
 
+ - name: ibm/granite-3.3-8b-instruct
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: ibm-granite/granite-3.3-8b-instruct
+ end_of_text_token: "<|end_of_text|>"
+ prefix_token: "<|end_of_text|>"
+
+
 
  # DeepSeek-R1-Distill-Llama-3.1-8b
  - name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
@@ -964,3 +1076,112 @@ tokenizer_configs:
  pretrained_model_name_or_path: deepseek-ai/deepseek-coder-6.7b-instruct
  end_of_text_token: "<|end▁of▁sentence|>"
  prefix_token: "<|begin▁of▁sentence|>"
+
+
+ # vilm/vinallama-2.7b-chat
+ - name: vilm/vinallama-2.7b-chat
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: vilm/vinallama-2.7b-chat
+ end_of_text_token: "<im_end>"
+ prefix_token: "<im_start>"
+
+ # vilm/vinallama-7b-chat
+ - name: vilm/vinallama-7b-chat
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: vilm/vinallama-7b-chat
+ end_of_text_token: "<im_end>"
+ prefix_token: "<im_start>"
+
+ # vilm/vietcuna-7b-v3
+ - name: vilm/vietcuna-7b-v3
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: vilm/vietcuna-7b-v3
+ end_of_text_token: "</s>"
+ prefix_token: "<s>"
+
+ # Viet-Mistral/Vistral-7B-Chat
+ - name: Viet-Mistral/Vistral-7B-Chat
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: Viet-Mistral/Vistral-7B-Chat
+ end_of_text_token: "</s>"
+ prefix_token: "<s>"
+
+ # vinai/PhoGPT-7B5-Instruct
+ - name: vinai/PhoGPT-7B5-Instruct
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: vinai/PhoGPT-7B5-Instruct
+ end_of_text_token: "</s>"
+ prefix_token: "<s>"
+
+ # vinai/PhoGPT-4B-Chat
+ - name: vinai/PhoGPT-4B-Chat
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: vinai/PhoGPT-4B-Chat
+ end_of_text_token: "</s>"
+ prefix_token: "<s>"
+
+ # Gemma-3-Gaia-PT-BR-4b-it
+ - name: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
+ end_of_text_token: "<eos>"
+ prefix_token: "<bos>"
+
+ # Bode 13B Alpaca PT-BR
+ - name: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
+ end_of_text_token: "</s>"
+ prefix_token: "<s>"
+
+ # Cabrita 7B PT-BR tokenizer
+ - name: 22h/cabrita_7b_pt_850000
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: 22h/cabrita_7b_pt_850000
+ end_of_text_token: "</s>"
+ prefix_token: "<s>"
+
+ # Gervásio 7B PT‑BR/PT‑PT tokenizer
+ - name: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
+ end_of_text_token: "</s>"
+ prefix_token: "<s>"
+
+ # Tucano 2b4 PT-BR tokenizer
+ - name: TucanoBR/Tucano-2b4
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: TucanoBR/Tucano-2b4
+ end_of_text_token: "</s>"
+ prefix_token: "<s>"
+
+ # TeenyTinyLlama 460M PT-BR tokenizer
+ - name: nicholasKluge/TeenyTinyLlama-460m
+ tokenizer_spec:
+ class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+ args:
+ pretrained_model_name_or_path: nicholasKluge/TeenyTinyLlama-460m
+ end_of_text_token: "</s>"
+ prefix_token: "<s>"
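
All of the new entries above follow the same shape as the existing ones: a name, a tokenizer_spec with a class_name (plus optional args such as pretrained_model_name_or_path, trust_remote_code, or a pinned revision), and the prefix_token/end_of_text_token pair. A minimal standalone sketch for sanity-checking a locally edited copy of the file; the script, its default path, and the duplicate/field checks are illustrative assumptions, not part of this release:

import sys
from collections import Counter

import yaml  # assumed available; HELM reads this config through its own loaders

REQUIRED_FIELDS = {"name", "tokenizer_spec", "end_of_text_token", "prefix_token"}


def check_tokenizer_configs(path: str = "helm/config/tokenizer_configs.yaml") -> None:
    with open(path) as f:
        configs = yaml.safe_load(f)["tokenizer_configs"]
    # Every entry should carry the four fields used throughout this diff.
    for entry in configs:
        missing = REQUIRED_FIELDS - entry.keys()
        if missing:
            print(f"{entry.get('name', '<unnamed>')}: missing {sorted(missing)}")
    # Names such as xai/grok-3-beta or qwen/qwen3-235b-a22b must be unique.
    duplicates = [name for name, count in Counter(e["name"] for e in configs).items() if count > 1]
    if duplicates:
        print(f"duplicate tokenizer names: {duplicates}")
        sys.exit(1)


if __name__ == "__main__":
    check_tokenizer_configs(*sys.argv[1:])
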
helm/proxy/cli.py CHANGED
@@ -21,7 +21,7 @@ from typing import List, Dict
  import re
  import sys
 
- from helm.common.hierarchical_logger import hlog
+ from helm.common.hierarchical_logger import hlog, setup_default_logging
  from helm.common.authentication import Authentication
  from helm.proxy.accounts import Usage, Account
  from helm.proxy.services.remote_service import RemoteService, add_service_args, create_authentication
@@ -123,7 +123,7 @@ def do_create_update_command(service: RemoteService, auth: Authentication, args)
 
  # Update quotas
  for quota_str in args.quotas:
- m = re.match(f"(\w+)\.(\w+)=(\d+|{UNLIMITED_QUOTA})", quota_str)
+ m = re.match(rf"(\w+)\.(\w+)=(\d+|{UNLIMITED_QUOTA})", quota_str)
  if not m:
  raise Exception(
  f"Invalid format: {quota_str}, expect <model_group>.<granularity>=<quota> "
@@ -198,6 +198,8 @@ def main():
 
  args = parser.parse_args()
 
+ setup_default_logging()
+
  service = create_remote_service(args)
  auth = create_authentication(args)
 
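
The first cli.py hunk only adds setup_default_logging to the hierarchical_logger import; the second turns the quota pattern into a raw string, which silences Python's invalid-escape-sequence warning without changing what it matches. A hedged sketch of parsing the <model_group>.<granularity>=<quota> format; the UNLIMITED_QUOTA value used here is a placeholder, since the real constant lives elsewhere in the proxy code:

import re

UNLIMITED_QUOTA = "unlimited"  # assumption: stand-in for the real constant

def parse_quota(quota_str: str) -> tuple[str, str, str]:
    # Mirrors the <model_group>.<granularity>=<quota> format expected by do_create_update_command.
    m = re.match(rf"(\w+)\.(\w+)=(\d+|{UNLIMITED_QUOTA})", quota_str)
    if not m:
        raise ValueError(f"Invalid format: {quota_str}, expect <model_group>.<granularity>=<quota>")
    return m.group(1), m.group(2), m.group(3)

print(parse_quota("gpt4.daily=10000"))        # ('gpt4', 'daily', '10000')
print(parse_quota("gpt3.monthly=unlimited"))  # ('gpt3', 'monthly', 'unlimited')
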
helm/proxy/critique/mechanical_turk_utils.py CHANGED
@@ -38,7 +38,7 @@ def replace_emoji_characters(s: str) -> str:
  highpoints = re.compile("[\U00010000-\U0010ffff]")
  elif sys.maxunicode == 65535:
  # Python was built with '--enable-unicode=ucs2'
- highpoints = re.compile("[\uD800-\uDBFF][\uDC00-\uDFFF]")
+ highpoints = re.compile("[\ud800-\udbff][\udc00-\udfff]")
  else:
  raise UnicodeError("Unable to determine if Python was built using UCS-2 or UCS-4")
 
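
This hunk only lowercases the hex escapes in the UCS-2 surrogate-pair pattern; Python treats \uD800 and \ud800 as the same code point, so the pattern string is unchanged (the rewrite presumably just satisfies a style or lint check). A one-line check of that equivalence:

# The two spellings decode to the identical pattern string.
assert "[\uD800-\uDBFF][\uDC00-\uDFFF]" == "[\ud800-\udbff][\udc00-\udfff]"
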
helm/proxy/retry.py CHANGED
@@ -5,6 +5,7 @@ from retrying import Retrying
  from helm.common.request import RequestResult
  from helm.common.tokenization_request import TokenizationRequestResult
  from helm.common.hierarchical_logger import hlog
+ import os
  import traceback
  import threading
 
@@ -19,6 +20,10 @@ Example usage:
  ...
  """
 
+ # TODO: make these configurable at a config / cli level
+ HELM_RETRIES = int(os.environ.get("HELM_RETRIES", "5"))
+ HELM_TOKENIZER_RETRIES = int(os.environ.get("HELM_TOKENIZER_RETRIES", HELM_RETRIES))
+
  # The lock is used to prevent multiple threads from printing at the same time.
  # This can cause issues when printing the stack trace.
  # (The stack traces can get mixed up and become unreadable.)
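
With this change the retry budget comes from the environment, with HELM_TOKENIZER_RETRIES falling back to whatever HELM_RETRIES resolves to. A minimal standard-library sketch of the same pattern; the call_with_retries helper and its backoff schedule are illustrative, not HELM's retrying-based retry_request decorator:

import os
import time

HELM_RETRIES = int(os.environ.get("HELM_RETRIES", "5"))
HELM_TOKENIZER_RETRIES = int(os.environ.get("HELM_TOKENIZER_RETRIES", HELM_RETRIES))


def call_with_retries(fn, *args, max_attempts: int = HELM_RETRIES, base_delay: float = 1.0):
    """Retry fn up to max_attempts times with simple exponential backoff."""
    for attempt in range(1, max_attempts + 1):
        try:
            return fn(*args)
        except Exception:  # HELM's decorator is more selective about which results count as failures
            if attempt == max_attempts:
                raise
            time.sleep(base_delay * 2 ** (attempt - 1))
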
helm/proxy/services/server_service.py CHANGED
@@ -1,8 +1,8 @@
- import dataclasses
  import os
  import signal
- from typing import List, Optional
+ from typing import List
 
+ from helm.common.local_context import LocalContext
  from helm.common.cache import CacheConfig
  from helm.common.cache_backend_config import CacheBackendConfig, BlackHoleCacheBackendConfig
  from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult
@@ -11,7 +11,6 @@ from helm.common.moderations_api_request import ModerationAPIRequest, Moderation
  from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
  from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult
  from helm.common.file_upload_request import FileUploadRequest, FileUploadResult
- from helm.common.general import ensure_directory_exists, parse_hocon, get_credentials
  from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
  from helm.common.tokenization_request import (
  TokenizationRequest,
@@ -22,27 +21,13 @@ from helm.common.tokenization_request import (
  from helm.common.request import Request, RequestResult
  from helm.common.hierarchical_logger import hlog
  from helm.proxy.accounts import Accounts, Account
- from helm.clients.auto_client import AutoClient
- from helm.clients.moderation_api_client import ModerationAPIClient
- from helm.clients.image_generation.nudity_check_client import NudityCheckClient
- from helm.clients.gcs_client import GCSClient
- from helm.clients.clip_score_client import CLIPScoreClient
- from helm.clients.toxicity_classifier_client import ToxicityClassifierClient
- from helm.proxy.example_queries import example_queries
- from helm.benchmark.model_metadata_registry import ALL_MODELS_METADATA
  from helm.benchmark.model_deployment_registry import get_model_deployment_host_organization
  from helm.proxy.query import Query, QueryResult
- from helm.proxy.retry import retry_request
  from helm.proxy.token_counters.auto_token_counter import AutoTokenCounter
- from helm.tokenizers.auto_tokenizer import AutoTokenizer
  from helm.proxy.services.service import (
  Service,
- CACHE_DIR,
  ACCOUNTS_FILE,
  GeneralInfo,
- VERSION,
- expand_environments,
- synthesize_request,
  )
 
 
@@ -57,43 +42,17 @@ class ServerService(Service):
  root_mode: bool = False,
  cache_backend_config: CacheBackendConfig = BlackHoleCacheBackendConfig(),
  ):
- ensure_directory_exists(base_path)
- client_file_storage_path = os.path.join(base_path, CACHE_DIR)
- ensure_directory_exists(client_file_storage_path)
-
- credentials = get_credentials(base_path)
  accounts_path = os.path.join(base_path, ACCOUNTS_FILE)
 
- self.cache_backend_config = cache_backend_config
- self.client = AutoClient(credentials, client_file_storage_path, cache_backend_config)
- self.tokenizer = AutoTokenizer(credentials, cache_backend_config)
- self.token_counter = AutoTokenCounter(self.tokenizer)
+ self.context = LocalContext(base_path, cache_backend_config)
+ self.token_counter = AutoTokenCounter(self.context.tokenizer)
  self.accounts = Accounts(accounts_path, root_mode=root_mode)
 
- # Lazily instantiate the following clients
- self.moderation_api_client: Optional[ModerationAPIClient] = None
- self.toxicity_classifier_client: Optional[ToxicityClassifierClient] = None
- self.perspective_api_client: Optional[ToxicityClassifierClient] = None
- self.nudity_check_client: Optional[NudityCheckClient] = None
- self.clip_score_client: Optional[CLIPScoreClient] = None
- self.gcs_client: Optional[GCSClient] = None
-
  def get_general_info(self) -> GeneralInfo:
- # Can't send release_dates in ModelMetadata bacause dates cannot be round-tripped to and from JSON easily.
- # TODO(#2158): Either fix this or delete get_general_info.
- all_models = [dataclasses.replace(model_metadata, release_date=None) for model_metadata in ALL_MODELS_METADATA]
- return GeneralInfo(version=VERSION, example_queries=example_queries, all_models=all_models)
+ return self.context.get_general_info()
 
  def expand_query(self, query: Query) -> QueryResult:
- """Turn the `query` into requests."""
- prompt = query.prompt
- settings = query.settings
- environments = parse_hocon(query.environments)
- requests = []
- for environment in expand_environments(environments):
- request = synthesize_request(prompt, settings, environment)
- requests.append(request)
- return QueryResult(requests=requests)
+ return self.context.expand_query(query)
 
  def _get_model_group_for_model_deployment(self, model_deployment: str) -> str:
  if model_deployment.startswith("openai/"):
@@ -105,7 +64,11 @@ class ServerService(Service):
  return "gpt4"
  elif model_deployment.startswith("openai/gpt-3"):
  return "gpt3"
- elif model_deployment.startswith("openai/o1"):
+ elif (
+ model_deployment.startswith("openai/o1")
+ or model_deployment.startswith("openai/o3")
+ or model_deployment.startswith("openai/o4")
+ ):
  return "o1"
  else:
  return "openai"
@@ -126,7 +89,7 @@ class ServerService(Service):
  self.accounts.check_can_use(auth.api_key, model_group)
 
  # Use!
- request_result: RequestResult = self.client.make_request(request)
+ request_result: RequestResult = self.context.make_request(request)
 
  # Only deduct if not cached
  if not request_result.cached:
@@ -139,66 +102,39 @@ class ServerService(Service):
  def tokenize(self, auth: Authentication, request: TokenizationRequest) -> TokenizationRequestResult:
  """Tokenize via an API."""
  self.accounts.authenticate(auth)
- return self.tokenizer.tokenize(request)
+ return self.context.tokenize(request)
 
  def decode(self, auth: Authentication, request: DecodeRequest) -> DecodeRequestResult:
  """Decodes to text."""
  self.accounts.authenticate(auth)
- return self.tokenizer.decode(request)
+ return self.context.decode(request)
 
  def upload(self, auth: Authentication, request: FileUploadRequest) -> FileUploadResult:
  """Uploads a file to external storage."""
  self.accounts.authenticate(auth)
-
- if not self.gcs_client:
- self.gcs_client = self.client.get_gcs_client()
-
- assert self.gcs_client
- return self.gcs_client.upload(request)
+ return self.context.upload(request)
 
  def check_nudity(self, auth: Authentication, request: NudityCheckRequest) -> NudityCheckResult:
  """Check for nudity."""
  self.accounts.authenticate(auth)
-
- if not self.nudity_check_client:
- self.nudity_check_client = self.client.get_nudity_check_client()
-
- assert self.nudity_check_client
- return self.nudity_check_client.check_nudity(request)
+ return self.context.check_nudity(request)
 
  def compute_clip_score(self, auth: Authentication, request: CLIPScoreRequest) -> CLIPScoreResult:
  """Computes CLIPScore for a given caption and image."""
  self.accounts.authenticate(auth)
-
- if not self.clip_score_client:
- self.clip_score_client = self.client.get_clip_score_client()
-
- assert self.clip_score_client
- return self.clip_score_client.compute_score(request)
+ return self.context.compute_clip_score(request)
 
  def get_toxicity_scores(self, auth: Authentication, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
- @retry_request
- def get_toxicity_scores_with_retry(request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
- if not self.toxicity_classifier_client:
- self.toxicity_classifier_client = self.client.get_toxicity_classifier_client()
- return self.toxicity_classifier_client.get_toxicity_scores(request)
-
  self.accounts.authenticate(auth)
- return get_toxicity_scores_with_retry(request)
+ return self.context.get_toxicity_scores(request)
 
  def get_moderation_results(self, auth: Authentication, request: ModerationAPIRequest) -> ModerationAPIRequestResult:
- @retry_request
- def get_moderation_results_with_retry(request: ModerationAPIRequest) -> ModerationAPIRequestResult:
- if not self.moderation_api_client:
- self.moderation_api_client = self.client.get_moderation_api_client()
- return self.moderation_api_client.get_moderation_results(request)
-
  self.accounts.authenticate(auth)
- return get_moderation_results_with_retry(request)
+ return self.context.get_moderation_results(request)
 
  def make_critique_request(self, auth: Authentication, request: CritiqueRequest) -> CritiqueRequestResult:
  self.accounts.authenticate(auth)
- return self.client.get_critique_client().make_critique_request(request)
+ return self.context.make_critique_request(request)
 
  def create_account(self, auth: Authentication) -> Account:
  """Creates a new account."""
@@ -233,4 +169,4 @@ class ServerService(Service):
  hlog("Done.")
 
  def get_cache_config(self, shard_name: str) -> CacheConfig:
- return self.cache_backend_config.get_cache_config(shard_name)
+ return self.context.get_cache_config(shard_name)
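
Most of the server_service.py diff is a mechanical refactor: the hand-built AutoClient/AutoTokenizer fields and the half-dozen lazily created helper clients are replaced by a single LocalContext (added in this release under helm/common/local_context.py), and each service method becomes authenticate-then-delegate. A schematic sketch of the resulting shape; the Context protocol and class names below are illustrative and much smaller than the real ones:

from typing import Any, Protocol


class Context(Protocol):
    # Two representative operations; the real context exposes the full request/tokenize/score surface.
    def make_request(self, request: Any) -> Any: ...
    def tokenize(self, request: Any) -> Any: ...


class ServerServiceSketch:
    def __init__(self, accounts: Any, context: Context) -> None:
        self.accounts = accounts
        self.context = context  # replaces self.client, self.tokenizer, and the lazy per-API clients

    def make_request(self, auth: Any, request: Any) -> Any:
        self.accounts.authenticate(auth)           # unchanged: authenticate first
        return self.context.make_request(request)  # then forward to the shared context

    def tokenize(self, auth: Any, request: Any) -> Any:
        self.accounts.authenticate(auth)
        return self.context.tokenize(request)
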
helm/tokenizers/grok_tokenizer.py ADDED
@@ -0,0 +1,55 @@
+ import dataclasses
+ import os
+ from typing import Any, Dict, List, Optional
+
+ import requests
+
+ from helm.common.cache import CacheConfig
+ from helm.common.tokenization_request import (
+ TokenizationRequest,
+ TokenizationToken,
+ )
+ from helm.tokenizers.caching_tokenizer import CachingTokenizer
+
+
+ class GrokAPITokenizer(CachingTokenizer):
+ """Tokenizer that uses the xAI Grok Tokenize Text API
+
+ Doc: https://docs.x.ai/docs/api-reference#tokenize-text"""
+
+ def __init__(self, cache_config: CacheConfig, api_key: Optional[str] = None) -> None:
+ super().__init__(cache_config)
+ self.api_key = api_key or os.environ.get("XAI_API_KEY")
+
+ def _tokenization_request_to_cache_key(self, request: TokenizationRequest) -> Dict[str, Any]:
+ cache_key = dataclasses.asdict(request)
+ # Delete encode because the Grok API simulateously gives string and integer tokens.
+ del cache_key["encode"]
+ return cache_key
+
+ def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
+ if not self.api_key:
+ raise Exception(
+ "No Grok API key found. "
+ "Set grokApiKey in credentials.conf or set the GROK_API_KEY environment variable"
+ )
+ text = request["text"]
+ if not text:
+ return {"token_ids": []}
+ model = request["tokenizer"].split("/")[-1]
+ response = requests.post(
+ url="https://api.x.ai/v1/tokenize-text",
+ headers={"Authorization": f"Bearer {self.api_key}"},
+ json={"text": text, "model": model},
+ )
+ response.raise_for_status()
+ return response.json()
+
+ def _tokenization_raw_response_to_tokens(
+ self, response: Dict[str, Any], request: TokenizationRequest
+ ) -> List[TokenizationToken]:
+ raw_token_field_name = "token_id" if request.encode else "string_token"
+ return [TokenizationToken(raw_token[raw_token_field_name]) for raw_token in response["token_ids"]]
+
+ def _decode_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
+ raise NotImplementedError("The xAI API does not support decoding.")
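
Since GrokAPITokenizer wraps a single HTTP endpoint, the call it makes can be reproduced directly for debugging. A standalone sketch of that request; the endpoint, payload, and the token_id/string_token fields are taken from the code above, while the helper name and default model are assumptions:

import os

import requests


def tokenize_with_grok(text: str, model: str = "grok-3-beta") -> list[tuple[int, str]]:
    """Call the xAI tokenize-text endpoint and return (token_id, string_token) pairs."""
    api_key = os.environ["XAI_API_KEY"]  # same variable the tokenizer falls back to
    response = requests.post(
        "https://api.x.ai/v1/tokenize-text",
        headers={"Authorization": f"Bearer {api_key}"},
        json={"text": text, "model": model},
    )
    response.raise_for_status()
    # Each element carries both representations, which is why the cache key above drops `encode`.
    return [(tok["token_id"], tok["string_token"]) for tok in response.json()["token_ids"]]


if __name__ == "__main__":
    print(tokenize_with_grok("otter 🦦"))
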
helm/tokenizers/huggingface_tokenizer.py CHANGED
@@ -119,7 +119,7 @@ class HuggingFaceTokenizer(CachingTokenizer):
  tokens = tokenizer.encode(
  request["text"],
  truncation=request["truncation"],
- max_length=request["max_length"],
+ max_length=max(request["max_length"], 0),
  add_special_tokens=False,
  )
  else:
helm/tokenizers/test_grok_tokenizer.py ADDED
@@ -0,0 +1,33 @@
+ import os
+ import pytest
+
+ from helm.common.cache import BlackHoleCacheConfig
+ from helm.common.tokenization_request import (
+ TokenizationRequest,
+ TokenizationToken,
+ )
+ from helm.tokenizers.grok_tokenizer import GrokAPITokenizer
+
+
+ @pytest.mark.models
+ def test_tokenize():
+ if not os.environ.get("XAI_API_KEY"):
+ pytest.skip("No xAI API key found; skipping test")
+ tokenizer = GrokAPITokenizer(cache_config=BlackHoleCacheConfig())
+ request = TokenizationRequest(tokenizer="xai/grok-3-beta", text="otter 🦦")
+ result = tokenizer.tokenize(request)
+ assert result.success
+ assert not result.cached
+ assert result.tokens == [TokenizationToken(token) for token in ["otter", "", "", ""]]
+
+
+ @pytest.mark.models
+ def test_encode():
+ if not os.environ.get("XAI_API_KEY"):
+ pytest.skip("No xAI API key found; skipping test")
+ tokenizer = GrokAPITokenizer(cache_config=BlackHoleCacheConfig())
+ request = TokenizationRequest(tokenizer="xai/grok-3-beta", text="otter 🦦", encode=True)
+ result = tokenizer.tokenize(request)
+ assert result.success
+ assert not result.cached
+ assert result.tokens == [TokenizationToken(token) for token in [142507, 11637, 294, 294]]