crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
- helm/benchmark/adaptation/adapter_spec.py +10 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +10 -22
- helm/benchmark/presentation/summarize.py +189 -14
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +15 -4
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +2 -55
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
- helm/benchmark/runner.py +7 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +480 -1
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +32 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +21 -0
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +19 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +54 -20
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +20 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +21 -0
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +350 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +17 -18
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/bedrock_client.py +63 -6
- helm/clients/cohere_client.py +3 -0
- helm/clients/dspy_client.py +135 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +4 -3
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +38 -21
- helm/clients/openai_responses_client.py +34 -8
- helm/clients/openrouter_client.py +31 -0
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +52 -13
- helm/clients/vertexai_client.py +23 -11
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +5 -2
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +103 -34
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +1001 -187
- helm/config/model_metadata.yaml +602 -18
- helm/config/tokenizer_configs.yaml +202 -5
- helm/proxy/cli.py +1 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/tokenizers/auto_tokenizer.py +2 -2
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
|
@@ -251,6 +251,13 @@ tokenizer_configs:
|
|
|
251
251
|
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
252
252
|
end_of_text_token: "<eos>"
|
|
253
253
|
prefix_token: "<bos>"
|
|
254
|
+
- name: google/medgemma-4b-it
|
|
255
|
+
tokenizer_spec:
|
|
256
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
257
|
+
args:
|
|
258
|
+
trust_remote_code: true
|
|
259
|
+
end_of_text_token: "<eos>"
|
|
260
|
+
prefix_token: "<bos>"
|
|
254
261
|
|
|
255
262
|
# Grok
|
|
256
263
|
- name: xai/grok-3-beta
|
|
@@ -265,6 +272,12 @@ tokenizer_configs:
|
|
|
265
272
|
end_of_text_token: ""
|
|
266
273
|
prefix_token: ""
|
|
267
274
|
|
|
275
|
+
- name: xai/grok-4-0709
|
|
276
|
+
tokenizer_spec:
|
|
277
|
+
class_name: "helm.tokenizers.grok_tokenizer.GrokAPITokenizer"
|
|
278
|
+
end_of_text_token: ""
|
|
279
|
+
prefix_token: ""
|
|
280
|
+
|
|
268
281
|
# Hf-internal-testing
|
|
269
282
|
|
|
270
283
|
# Tokenizer name hf-internal-testing/llama-tokenizer is taken from:
|
|
@@ -454,7 +467,7 @@ tokenizer_configs:
|
|
|
454
467
|
|
|
455
468
|
# Allen Institute for AI
|
|
456
469
|
# The allenai/olmo-7b requires Python 3.9 or newer.
|
|
457
|
-
# To use the allenai/olmo-7b tokenizer, run `pip install crfm-helm[allenai]` first.
|
|
470
|
+
# To use the allenai/olmo-7b tokenizer, run `pip install "crfm-helm[allenai]"` first.
|
|
458
471
|
- name: allenai/olmo-7b
|
|
459
472
|
tokenizer_spec:
|
|
460
473
|
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
@@ -582,6 +595,17 @@ tokenizer_configs:
|
|
|
582
595
|
end_of_text_token: "</s>"
|
|
583
596
|
prefix_token: "<s>"
|
|
584
597
|
|
|
598
|
+
# Moonshot AI
|
|
599
|
+
- name: moonshotai/kimi-k2-instruct
|
|
600
|
+
tokenizer_spec:
|
|
601
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
602
|
+
args:
|
|
603
|
+
pretrained_model_name_or_path: moonshotai/Kimi-K2-Instruct
|
|
604
|
+
trust_remote_code: true
|
|
605
|
+
revision: 4f239503ad9d1a042f0a4bacac457931ab972cfc
|
|
606
|
+
end_of_text_token: "[EOS]"
|
|
607
|
+
prefix_token: "[BOS]"
|
|
608
|
+
|
|
585
609
|
# Nectec
|
|
586
610
|
- name: nectec/OpenThaiLLM-Prebuilt-7B
|
|
587
611
|
tokenizer_spec:
|
|
@@ -633,6 +657,12 @@ tokenizer_configs:
|
|
|
633
657
|
end_of_text_token: "<|endoftext|>"
|
|
634
658
|
prefix_token: "<|endoftext|>"
|
|
635
659
|
|
|
660
|
+
- name: openai/o200k_harmony
|
|
661
|
+
tokenizer_spec:
|
|
662
|
+
class_name: "helm.tokenizers.tiktoken_tokenizer.TiktokenTokenizer"
|
|
663
|
+
end_of_text_token: "<|endoftext|>"
|
|
664
|
+
prefix_token: "<|startoftext|>"
|
|
665
|
+
|
|
636
666
|
- name: openai/clip-vit-large-patch14
|
|
637
667
|
tokenizer_spec:
|
|
638
668
|
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
@@ -688,6 +718,18 @@ tokenizer_configs:
|
|
|
688
718
|
end_of_text_token: "<|im_end|>"
|
|
689
719
|
prefix_token: "<|im_start|>"
|
|
690
720
|
|
|
721
|
+
- name: qwen/qwen3-235b-a22b-instruct-2507-fp8
|
|
722
|
+
tokenizer_spec:
|
|
723
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
724
|
+
end_of_text_token: "<|im_end|>"
|
|
725
|
+
prefix_token: ""
|
|
726
|
+
|
|
727
|
+
- name: qwen/qwen3-next-80b-a3b-thinking
|
|
728
|
+
tokenizer_spec:
|
|
729
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
730
|
+
end_of_text_token: "<|im_end|>"
|
|
731
|
+
prefix_token: ""
|
|
732
|
+
|
|
691
733
|
- name: qwen/qwq-32b-preview
|
|
692
734
|
tokenizer_spec:
|
|
693
735
|
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
@@ -768,6 +810,12 @@ tokenizer_configs:
|
|
|
768
810
|
end_of_text_token: "<|endoftext|>"
|
|
769
811
|
prefix_token: ""
|
|
770
812
|
|
|
813
|
+
- name: tiiuae/falcon3-1b-instruct
|
|
814
|
+
tokenizer_spec:
|
|
815
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
816
|
+
end_of_text_token: "<|endoftext|>"
|
|
817
|
+
prefix_token: ""
|
|
818
|
+
|
|
771
819
|
# TsinghuaKEG
|
|
772
820
|
- name: TsinghuaKEG/ice
|
|
773
821
|
tokenizer_spec:
|
|
@@ -892,6 +940,23 @@ tokenizer_configs:
|
|
|
892
940
|
end_of_text_token: ""
|
|
893
941
|
prefix_token: ""
|
|
894
942
|
|
|
943
|
+
- name: ibm/granite-4.0-micro
|
|
944
|
+
tokenizer_spec:
|
|
945
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
946
|
+
args:
|
|
947
|
+
pretrained_model_name_or_path: ibm-granite/granite-4.0-micro
|
|
948
|
+
end_of_text_token: "<|end_of_text|>"
|
|
949
|
+
prefix_token: "<|end_of_text|>"
|
|
950
|
+
|
|
951
|
+
- name: ibm/granite-4.0-h-small
|
|
952
|
+
tokenizer_spec:
|
|
953
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
954
|
+
args:
|
|
955
|
+
pretrained_model_name_or_path: ibm-granite/granite-4.0-h-small
|
|
956
|
+
end_of_text_token: "<|end_of_text|>"
|
|
957
|
+
prefix_token: "<|end_of_text|>"
|
|
958
|
+
|
|
959
|
+
# Maritaca AI
|
|
895
960
|
- name: maritaca-ai/sabia-7b
|
|
896
961
|
tokenizer_spec:
|
|
897
962
|
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
@@ -900,6 +965,14 @@ tokenizer_configs:
|
|
|
900
965
|
end_of_text_token: "</s>"
|
|
901
966
|
prefix_token: "<s>"
|
|
902
967
|
|
|
968
|
+
- name: maritaca-ai/sabia-2-tokenizer-medium
|
|
969
|
+
tokenizer_spec:
|
|
970
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
971
|
+
args:
|
|
972
|
+
pretrained_model_name_or_path: maritaca-ai/sabia-2-tokenizer-medium
|
|
973
|
+
end_of_text_token: "</s>"
|
|
974
|
+
prefix_token: "<s>"
|
|
975
|
+
|
|
903
976
|
# Granite-3.1-8b-base
|
|
904
977
|
- name: ibm-granite/granite-3.1-8b-base
|
|
905
978
|
tokenizer_spec:
|
|
@@ -1022,7 +1095,6 @@ tokenizer_configs:
|
|
|
1022
1095
|
end_of_text_token: ""
|
|
1023
1096
|
|
|
1024
1097
|
# IBM Granite 3.3
|
|
1025
|
-
|
|
1026
1098
|
- name: ibm/granite-3.3-8b-instruct
|
|
1027
1099
|
tokenizer_spec:
|
|
1028
1100
|
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
@@ -1031,7 +1103,12 @@ tokenizer_configs:
|
|
|
1031
1103
|
end_of_text_token: "<|end_of_text|>"
|
|
1032
1104
|
prefix_token: "<|end_of_text|>"
|
|
1033
1105
|
|
|
1034
|
-
|
|
1106
|
+
# Z.ai GLM-4.5-AIR-FP8
|
|
1107
|
+
- name: zai-org/glm-4.5-air-fp8
|
|
1108
|
+
tokenizer_spec:
|
|
1109
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1110
|
+
end_of_text_token: "<|endoftext|>"
|
|
1111
|
+
prefix_token: ""
|
|
1035
1112
|
|
|
1036
1113
|
# DeepSeek-R1-Distill-Llama-3.1-8b
|
|
1037
1114
|
- name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
|
|
@@ -1042,6 +1119,20 @@ tokenizer_configs:
|
|
|
1042
1119
|
end_of_text_token: "<|end▁of▁sentence|>"
|
|
1043
1120
|
prefix_token: "<|begin▁of▁sentence|>"
|
|
1044
1121
|
|
|
1122
|
+
# DeepSeek-R1-Distill-Llama-70B
|
|
1123
|
+
- name: deepseek-ai/deepseek-r1-distill-llama-70b
|
|
1124
|
+
tokenizer_spec:
|
|
1125
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1126
|
+
end_of_text_token: "<|end▁of▁sentence|>"
|
|
1127
|
+
prefix_token: "<|begin▁of▁sentence|>"
|
|
1128
|
+
|
|
1129
|
+
# DeepSeek-R1-Distill-Qwen-14B
|
|
1130
|
+
- name: deepseek-ai/deepseek-r1-distill-qwen-14b
|
|
1131
|
+
tokenizer_spec:
|
|
1132
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1133
|
+
end_of_text_token: "<|end▁of▁sentence|>"
|
|
1134
|
+
prefix_token: "<|begin▁of▁sentence|>"
|
|
1135
|
+
|
|
1045
1136
|
# deepseek-ai/deepseek-coder-6.7b-instruct
|
|
1046
1137
|
- name: deepseek-ai/deepseek-coder-6.7b-instruct
|
|
1047
1138
|
tokenizer_spec:
|
|
@@ -1051,7 +1142,6 @@ tokenizer_configs:
|
|
|
1051
1142
|
end_of_text_token: "<|end▁of▁sentence|>"
|
|
1052
1143
|
prefix_token: "<|begin▁of▁sentence|>"
|
|
1053
1144
|
|
|
1054
|
-
|
|
1055
1145
|
# vilm/vinallama-2.7b-chat
|
|
1056
1146
|
- name: vilm/vinallama-2.7b-chat
|
|
1057
1147
|
tokenizer_spec:
|
|
@@ -1104,4 +1194,111 @@ tokenizer_configs:
|
|
|
1104
1194
|
args:
|
|
1105
1195
|
pretrained_model_name_or_path: vinai/PhoGPT-4B-Chat
|
|
1106
1196
|
end_of_text_token: "</s>"
|
|
1107
|
-
prefix_token: "<s>"
|
|
1197
|
+
prefix_token: "<s>"
|
|
1198
|
+
|
|
1199
|
+
# Gemma-3-Gaia-PT-BR-4b-it
|
|
1200
|
+
- name: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
|
|
1201
|
+
tokenizer_spec:
|
|
1202
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1203
|
+
args:
|
|
1204
|
+
pretrained_model_name_or_path: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
|
|
1205
|
+
end_of_text_token: "<eos>"
|
|
1206
|
+
prefix_token: "<bos>"
|
|
1207
|
+
|
|
1208
|
+
# Bode 13B Alpaca PT-BR
|
|
1209
|
+
- name: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
|
|
1210
|
+
tokenizer_spec:
|
|
1211
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1212
|
+
args:
|
|
1213
|
+
pretrained_model_name_or_path: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
|
|
1214
|
+
end_of_text_token: "</s>"
|
|
1215
|
+
prefix_token: "<s>"
|
|
1216
|
+
|
|
1217
|
+
# Cabrita 7B PT-BR tokenizer
|
|
1218
|
+
- name: 22h/cabrita_7b_pt_850000
|
|
1219
|
+
tokenizer_spec:
|
|
1220
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1221
|
+
args:
|
|
1222
|
+
pretrained_model_name_or_path: 22h/cabrita_7b_pt_850000
|
|
1223
|
+
end_of_text_token: "</s>"
|
|
1224
|
+
prefix_token: "<s>"
|
|
1225
|
+
|
|
1226
|
+
# Gervásio 7B PT‑BR/PT‑PT tokenizer
|
|
1227
|
+
- name: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
|
|
1228
|
+
tokenizer_spec:
|
|
1229
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1230
|
+
args:
|
|
1231
|
+
pretrained_model_name_or_path: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
|
|
1232
|
+
end_of_text_token: "</s>"
|
|
1233
|
+
prefix_token: "<s>"
|
|
1234
|
+
|
|
1235
|
+
# Tucano 2b4 PT-BR tokenizer
|
|
1236
|
+
- name: TucanoBR/Tucano-2b4
|
|
1237
|
+
tokenizer_spec:
|
|
1238
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1239
|
+
args:
|
|
1240
|
+
pretrained_model_name_or_path: TucanoBR/Tucano-2b4
|
|
1241
|
+
end_of_text_token: "</s>"
|
|
1242
|
+
prefix_token: "<s>"
|
|
1243
|
+
|
|
1244
|
+
# TeenyTinyLlama 460M PT-BR tokenizer
|
|
1245
|
+
- name: nicholasKluge/TeenyTinyLlama-460m
|
|
1246
|
+
tokenizer_spec:
|
|
1247
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1248
|
+
args:
|
|
1249
|
+
pretrained_model_name_or_path: nicholasKluge/TeenyTinyLlama-460m
|
|
1250
|
+
end_of_text_token: "</s>"
|
|
1251
|
+
prefix_token: "<s>"
|
|
1252
|
+
|
|
1253
|
+
# AceGPT-v2
|
|
1254
|
+
- name: freedomintelligence/acegpt-v2-8b-chat
|
|
1255
|
+
tokenizer_spec:
|
|
1256
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1257
|
+
end_of_text_token: "<|end_of_text|>"
|
|
1258
|
+
prefix_token: "<|begin_of_text|>"
|
|
1259
|
+
|
|
1260
|
+
- name: freedomintelligence/acegpt-v2-32b-chat
|
|
1261
|
+
tokenizer_spec:
|
|
1262
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1263
|
+
end_of_text_token: "<|endoftext|>"
|
|
1264
|
+
prefix_token: ""
|
|
1265
|
+
|
|
1266
|
+
- name: freedomintelligence/acegpt-v2-70b-chat
|
|
1267
|
+
tokenizer_spec:
|
|
1268
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1269
|
+
end_of_text_token: "<|end_of_text|>"
|
|
1270
|
+
prefix_token: "<|begin_of_text|>"
|
|
1271
|
+
|
|
1272
|
+
# ALLaM
|
|
1273
|
+
- name: allam-ai/allam-7b-instruct-preview
|
|
1274
|
+
tokenizer_spec:
|
|
1275
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1276
|
+
end_of_text_token: "</s>"
|
|
1277
|
+
prefix_token: "<s>"
|
|
1278
|
+
|
|
1279
|
+
# SILMA
|
|
1280
|
+
- name: silma-ai/silma-9b-instruct-v1.0
|
|
1281
|
+
tokenizer_spec:
|
|
1282
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1283
|
+
end_of_text_token: "<eos>"
|
|
1284
|
+
prefix_token: "<bos>"
|
|
1285
|
+
|
|
1286
|
+
# Jais Family
|
|
1287
|
+
- name: inceptionai/jais-family-590m-chat
|
|
1288
|
+
tokenizer_spec:
|
|
1289
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1290
|
+
end_of_text_token: "<|endoftext|>"
|
|
1291
|
+
prefix_token: "<|endoftext|>"
|
|
1292
|
+
|
|
1293
|
+
# Jais Adapted
|
|
1294
|
+
- name: inceptionai/jais-adapted-7b-chat
|
|
1295
|
+
tokenizer_spec:
|
|
1296
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1297
|
+
end_of_text_token: "</s>"
|
|
1298
|
+
prefix_token: "<s>"
|
|
1299
|
+
|
|
1300
|
+
- name: inceptionai/jais-adapted-13b-chat
|
|
1301
|
+
tokenizer_spec:
|
|
1302
|
+
class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
|
|
1303
|
+
end_of_text_token: "</s>"
|
|
1304
|
+
prefix_token: "<s>"
|
helm/proxy/cli.py
CHANGED
|
@@ -123,7 +123,7 @@ def do_create_update_command(service: RemoteService, auth: Authentication, args)
|
|
|
123
123
|
|
|
124
124
|
# Update quotas
|
|
125
125
|
for quota_str in args.quotas:
|
|
126
|
-
m = re.match(
|
|
126
|
+
m = re.match(rf"(\w+)\.(\w+)=(\d+|{UNLIMITED_QUOTA})", quota_str)
|
|
127
127
|
if not m:
|
|
128
128
|
raise Exception(
|
|
129
129
|
f"Invalid format: {quota_str}, expect <model_group>.<granularity>=<quota> "
|
helm/proxy/example_queries.py
CHANGED
|
@@ -21,7 +21,7 @@ example_queries = [
|
|
|
21
21
|
"""
|
|
22
22
|
temperature: 0.5 # Medium amount of randomness
|
|
23
23
|
stop_sequences: [.] # Stop when you hit a period
|
|
24
|
-
model: openai/gpt-
|
|
24
|
+
model: openai/gpt-4.1-nano-2025-04-14
|
|
25
25
|
"""
|
|
26
26
|
),
|
|
27
27
|
environments="",
|
|
@@ -33,7 +33,7 @@ example_queries = [
|
|
|
33
33
|
temperature: 0.5 # Medium amount of randomness
|
|
34
34
|
stop_sequences: [\\n] # Stop when you hit a newline
|
|
35
35
|
num_completions: 5 # Generate many samples
|
|
36
|
-
model: openai/gpt-
|
|
36
|
+
model: openai/gpt-4.1-nano-2025-04-14
|
|
37
37
|
"""
|
|
38
38
|
),
|
|
39
39
|
environments="",
|
|
@@ -58,7 +58,7 @@ example_queries = [
|
|
|
58
58
|
"""
|
|
59
59
|
temperature: 0 # Deterministic
|
|
60
60
|
max_tokens: 50
|
|
61
|
-
model: openai/gpt-
|
|
61
|
+
model: openai/gpt-4.1-nano-2025-04-14
|
|
62
62
|
"""
|
|
63
63
|
),
|
|
64
64
|
environments="",
|
|
@@ -76,7 +76,7 @@ example_queries = [
|
|
|
76
76
|
environments=dedent(
|
|
77
77
|
"""
|
|
78
78
|
occupation: [mathematician, lawyer, doctor]
|
|
79
|
-
model: [openai/gpt-
|
|
79
|
+
model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
|
|
80
80
|
"""
|
|
81
81
|
),
|
|
82
82
|
),
|
|
@@ -101,7 +101,7 @@ example_queries = [
|
|
|
101
101
|
),
|
|
102
102
|
environments=dedent(
|
|
103
103
|
"""
|
|
104
|
-
model: [openai/gpt-
|
|
104
|
+
model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
|
|
105
105
|
"""
|
|
106
106
|
),
|
|
107
107
|
),
|
|
@@ -136,7 +136,7 @@ example_queries = [
|
|
|
136
136
|
),
|
|
137
137
|
environments=dedent(
|
|
138
138
|
"""
|
|
139
|
-
model: [openai/gpt-
|
|
139
|
+
model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
|
|
140
140
|
"""
|
|
141
141
|
),
|
|
142
142
|
),
|
|
@@ -144,7 +144,7 @@ example_queries = [
|
|
|
144
144
|
prompt="Write a Python function that takes two vectors a and b and returns their Euclidean distance.",
|
|
145
145
|
settings=dedent(
|
|
146
146
|
"""
|
|
147
|
-
model: openai/gpt-
|
|
147
|
+
model: openai/gpt-4.1-nano-2025-04-14
|
|
148
148
|
"""
|
|
149
149
|
),
|
|
150
150
|
environments="",
|
|
@@ -161,7 +161,7 @@ example_queries = [
|
|
|
161
161
|
),
|
|
162
162
|
environments=dedent(
|
|
163
163
|
"""
|
|
164
|
-
model: [openai/gpt-
|
|
164
|
+
model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
|
|
165
165
|
"""
|
|
166
166
|
),
|
|
167
167
|
),
|
helm/proxy/retry.py
CHANGED
|
@@ -5,6 +5,7 @@ from retrying import Retrying
|
|
|
5
5
|
from helm.common.request import RequestResult
|
|
6
6
|
from helm.common.tokenization_request import TokenizationRequestResult
|
|
7
7
|
from helm.common.hierarchical_logger import hlog
|
|
8
|
+
import os
|
|
8
9
|
import traceback
|
|
9
10
|
import threading
|
|
10
11
|
|
|
@@ -19,6 +20,10 @@ Example usage:
|
|
|
19
20
|
...
|
|
20
21
|
"""
|
|
21
22
|
|
|
23
|
+
# TODO: make these configurable at a config / cli level
|
|
24
|
+
HELM_RETRIES = int(os.environ.get("HELM_RETRIES", "5"))
|
|
25
|
+
HELM_TOKENIZER_RETRIES = int(os.environ.get("HELM_TOKENIZER_RETRIES", HELM_RETRIES))
|
|
26
|
+
|
|
22
27
|
# The lock is used to prevent multiple threads from printing at the same time.
|
|
23
28
|
# This can cause issues when printing the stack trace.
|
|
24
29
|
# (The stack traces can get mixed up and become unreadable.)
|
helm/proxy/server.py
CHANGED
|
@@ -23,7 +23,7 @@ from helm.benchmark.model_deployment_registry import get_default_model_deploymen
|
|
|
23
23
|
from helm.common.authentication import Authentication
|
|
24
24
|
from helm.common.cache_backend_config import CacheBackendConfig, MongoCacheBackendConfig, SqliteCacheBackendConfig
|
|
25
25
|
from helm.common.general import ensure_directory_exists
|
|
26
|
-
from helm.common.hierarchical_logger import hlog
|
|
26
|
+
from helm.common.hierarchical_logger import hlog, setup_default_logging
|
|
27
27
|
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
28
28
|
from helm.common.request import Request
|
|
29
29
|
from helm.common.perspective_api_request import PerspectiveAPIRequest
|
|
@@ -273,6 +273,7 @@ def main():
|
|
|
273
273
|
default="",
|
|
274
274
|
)
|
|
275
275
|
args = parser.parse_args()
|
|
276
|
+
setup_default_logging()
|
|
276
277
|
|
|
277
278
|
register_builtin_configs_from_helm_package()
|
|
278
279
|
register_configs_from_directory(args.base_path)
|
helm/proxy/static/index.css
CHANGED
helm/proxy/static/index.js
CHANGED
|
@@ -282,7 +282,13 @@ $(function () {
|
|
|
282
282
|
requestResult.completions.forEach((completion) => {
|
|
283
283
|
const $contents = $("<span>", {
|
|
284
284
|
title: `logprob: ${completion.logprob}`,
|
|
285
|
-
})
|
|
285
|
+
});
|
|
286
|
+
if (completion.thinking) {
|
|
287
|
+
const $thinking = $("<span>", { class: "thinking" }).append(completion.thinking.text);
|
|
288
|
+
$contents.append($thinking);
|
|
289
|
+
}
|
|
290
|
+
const $resultText = completion.tokens.length > 0 ?renderTokens(completion.tokens) : $("<div>").append(completion.text);
|
|
291
|
+
$contents.append($resultText);
|
|
286
292
|
const $metadata = $("<span>", { class: "metadata" });
|
|
287
293
|
$metadata.append(
|
|
288
294
|
$("<span>", { title: "Log probability" }).append(
|
|
@@ -8,7 +8,7 @@ from helm.common.credentials_utils import provide_api_key
|
|
|
8
8
|
from helm.common.cache_backend_config import CacheBackendConfig, CacheConfig
|
|
9
9
|
from helm.common.hierarchical_logger import hlog
|
|
10
10
|
from helm.common.object_spec import create_object, inject_object_spec_args
|
|
11
|
-
from helm.proxy.retry import retry_tokenizer_request
|
|
11
|
+
from helm.proxy.retry import NonRetriableException, retry_tokenizer_request
|
|
12
12
|
from helm.common.tokenization_request import (
|
|
13
13
|
DecodeRequest,
|
|
14
14
|
DecodeRequestResult,
|
|
@@ -50,7 +50,7 @@ class AutoTokenizer(Tokenizer):
|
|
|
50
50
|
)
|
|
51
51
|
tokenizer = create_object(tokenizer_spec)
|
|
52
52
|
else:
|
|
53
|
-
|
|
53
|
+
raise NonRetriableException(f"Could not find tokenizer config for {tokenizer_name}")
|
|
54
54
|
|
|
55
55
|
# Cache the tokenizer
|
|
56
56
|
assert isinstance(tokenizer, Tokenizer) # To make mypy happy
|
|
@@ -34,6 +34,8 @@ class GrokAPITokenizer(CachingTokenizer):
|
|
|
34
34
|
"Set grokApiKey in credentials.conf or set the GROK_API_KEY environment variable"
|
|
35
35
|
)
|
|
36
36
|
text = request["text"]
|
|
37
|
+
if not text:
|
|
38
|
+
return {"token_ids": []}
|
|
37
39
|
model = request["tokenizer"].split("/")[-1]
|
|
38
40
|
response = requests.post(
|
|
39
41
|
url="https://api.x.ai/v1/tokenize-text",
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.aci_bench_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class ACIBenchMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for ACIBench."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="aci_bench_accuracy",
|
|
11
|
-
scenario_name="aci_bench",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.chw_care_plan_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class CHWCarePlanMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for CHWCarePlan."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="chw_care_plan_accuracy",
|
|
11
|
-
scenario_name="chw_care_plan",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.dischargeme_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class DischargeMeMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for DischargeMe."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="dischargeme_accuracy",
|
|
11
|
-
scenario_name="dischargeme",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.med_dialog_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class MedDialogMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for MedDialog."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="med_dialog_accuracy",
|
|
11
|
-
scenario_name="med_dialog",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.medalign_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class MedalignMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for Medalign."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="medalign_accuracy",
|
|
11
|
-
scenario_name="medalign",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.medi_qa_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class MediQAMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for MediQA."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="medi_qa_accuracy",
|
|
11
|
-
scenario_name="medi_qa",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.medication_qa_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class MedicationQAMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for MedicationQA."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="medication_qa_accuracy",
|
|
11
|
-
scenario_name="medication_qa",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.mental_health_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class MentalHealthMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for MentalHealth."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="mental_health_accuracy",
|
|
11
|
-
scenario_name="mental_health",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.mimic_bhc_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class MIMICBHCMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for MIMICBHC."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="mimic_bhc_accuracy",
|
|
11
|
-
scenario_name="mimic_bhc",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.mimic_rrs_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class MIMICRRSMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for MIMICRRS."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="mimic_rrs_accuracy",
|
|
11
|
-
scenario_name="mimic_rrs",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.mtsamples_procedures_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class MTSamplesProceduresMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for MTSamplesProcedures."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="mtsamples_procedures_accuracy",
|
|
11
|
-
scenario_name="mtsamples_procedures",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
from helm.benchmark.annotation.mtsamples_replicate_annotator import ANNOTATOR_MODELS
|
|
2
|
-
from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
class MTSamplesReplicateMetric(LLMJuryMetric):
|
|
6
|
-
"""Score metrics for MTSamplesReplicate."""
|
|
7
|
-
|
|
8
|
-
def __init__(self):
|
|
9
|
-
super().__init__(
|
|
10
|
-
metric_name="mtsamples_replicate_accuracy",
|
|
11
|
-
scenario_name="mtsamples_replicate",
|
|
12
|
-
annotator_models=ANNOTATOR_MODELS,
|
|
13
|
-
default_score=1.0,
|
|
14
|
-
)
|