crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of crfm-helm might be problematic.

Files changed (394)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/bbq_metrics.py +12 -0
  27. helm/benchmark/metrics/classification_metrics.py +19 -1
  28. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  29. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  30. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  31. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  32. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  33. helm/benchmark/metrics/comet_metric.py +1 -1
  34. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  35. helm/benchmark/metrics/copyright_metrics.py +1 -1
  36. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  37. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  38. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  39. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  40. helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
  41. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  42. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  43. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  44. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  45. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  46. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  47. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  48. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  49. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  50. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  51. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  52. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  53. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  54. helm/benchmark/metrics/medec_metrics.py +25 -2
  55. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  56. helm/benchmark/metrics/metric.py +25 -0
  57. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  58. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  59. helm/benchmark/metrics/safety_metrics.py +13 -1
  60. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  61. helm/benchmark/metrics/summac/model_summac.py +3 -3
  62. helm/benchmark/metrics/summarization_metrics.py +129 -1
  63. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  64. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  65. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  66. helm/benchmark/model_deployment_registry.py +11 -19
  67. helm/benchmark/presentation/create_plots.py +11 -2
  68. helm/benchmark/presentation/run_display.py +13 -3
  69. helm/benchmark/presentation/run_entry.py +2 -2
  70. helm/benchmark/presentation/schema.py +10 -22
  71. helm/benchmark/presentation/summarize.py +189 -14
  72. helm/benchmark/presentation/taxonomy_info.py +20 -0
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/run.py +15 -4
  75. helm/benchmark/run_expander.py +4 -0
  76. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  77. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  78. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  79. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  80. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  81. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  82. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  83. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  84. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  85. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  86. helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
  87. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  88. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
  89. helm/benchmark/runner.py +7 -0
  90. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  91. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  92. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  93. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  94. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  95. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  96. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  97. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  98. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  99. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  100. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  101. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  102. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  103. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
  104. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
  105. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
  106. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  107. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  108. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  109. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  110. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  111. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  112. helm/benchmark/scenarios/bold_scenario.py +15 -0
  113. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  115. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  116. helm/benchmark/scenarios/clear_scenario.py +23 -0
  117. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  118. helm/benchmark/scenarios/code_scenario.py +28 -0
  119. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  120. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  121. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  122. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  123. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  124. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  125. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  126. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  127. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  128. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  129. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  130. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  131. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  132. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  133. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  134. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  135. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  136. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  137. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  138. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  139. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  140. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  141. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  142. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  143. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  144. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  145. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  146. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  147. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  148. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  149. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  150. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  151. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  152. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  153. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  154. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  155. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  156. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  157. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  158. helm/benchmark/scenarios/ice_scenario.py +21 -1
  159. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  160. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  161. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  162. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  163. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  164. helm/benchmark/scenarios/koala_scenario.py +21 -1
  165. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  166. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  167. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  168. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  169. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  170. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  171. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  172. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  173. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  175. helm/benchmark/scenarios/math_scenario.py +54 -20
  176. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  177. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  178. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  179. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  180. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  181. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  182. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  183. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  184. helm/benchmark/scenarios/medec_scenario.py +23 -0
  185. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  186. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  187. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  188. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  189. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  190. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  191. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  192. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  193. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  194. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  195. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  196. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  197. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  198. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  199. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  200. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  201. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  202. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  203. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  204. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  205. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  206. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  207. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  208. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  209. helm/benchmark/scenarios/quac_scenario.py +14 -0
  210. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  211. helm/benchmark/scenarios/raft_scenario.py +15 -0
  212. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  213. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  214. helm/benchmark/scenarios/scenario.py +31 -0
  215. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  216. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  217. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  218. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  219. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  220. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  221. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  222. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  223. helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
  224. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  225. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  226. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  227. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  228. helm/benchmark/scenarios/spider_scenario.py +18 -0
  229. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  230. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  231. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  232. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  233. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  234. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  235. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  236. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  237. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  238. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  239. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  240. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  241. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  242. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  243. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  244. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  245. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  246. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  247. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  248. helm/benchmark/slurm_jobs.py +1 -2
  249. helm/benchmark/slurm_runner.py +8 -1
  250. helm/benchmark/static/schema_arabic.yaml +271 -0
  251. helm/benchmark/static/schema_classic.yaml +0 -17
  252. helm/benchmark/static/schema_long_context.yaml +17 -18
  253. helm/benchmark/static/schema_medhelm.yaml +36 -0
  254. helm/benchmark/static/schema_slp.yaml +219 -0
  255. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  256. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  257. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  258. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  259. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  260. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  261. helm/benchmark/static_build/index.html +5 -6
  262. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  263. helm/clients/ai21_client.py +2 -0
  264. helm/clients/aleph_alpha_client.py +2 -0
  265. helm/clients/anthropic_client.py +7 -1
  266. helm/clients/audio_language/diva_llama_client.py +2 -0
  267. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  268. helm/clients/audio_language/llama_omni/constants.py +9 -0
  269. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  270. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  271. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  272. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  273. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  274. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  275. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  276. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  277. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  278. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  279. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  280. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  281. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  282. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  283. helm/clients/audio_language/llama_omni/utils.py +202 -0
  284. helm/clients/audio_language/llama_omni_client.py +2 -1
  285. helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
  286. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  287. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  288. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  289. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  290. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  291. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  292. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  293. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  294. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  295. helm/clients/bedrock_client.py +63 -6
  296. helm/clients/cohere_client.py +3 -0
  297. helm/clients/dspy_client.py +135 -0
  298. helm/clients/google_client.py +2 -0
  299. helm/clients/http_model_client.py +2 -0
  300. helm/clients/huggingface_client.py +4 -3
  301. helm/clients/ibm_client.py +3 -1
  302. helm/clients/image_generation/adobe_vision_client.py +2 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  304. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  305. helm/clients/image_generation/cogview2_client.py +2 -1
  306. helm/clients/image_generation/dalle2_client.py +2 -0
  307. helm/clients/image_generation/dalle_mini_client.py +2 -1
  308. helm/clients/image_generation/deep_floyd_client.py +2 -0
  309. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  310. helm/clients/image_generation/lexica_client.py +2 -0
  311. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  312. helm/clients/image_generation/mindalle_client.py +2 -1
  313. helm/clients/image_generation/together_image_generation_client.py +2 -0
  314. helm/clients/megatron_client.py +2 -0
  315. helm/clients/mistral_client.py +2 -0
  316. helm/clients/moderation_api_client.py +2 -0
  317. helm/clients/openai_client.py +38 -21
  318. helm/clients/openai_responses_client.py +34 -8
  319. helm/clients/openrouter_client.py +31 -0
  320. helm/clients/palmyra_client.py +2 -1
  321. helm/clients/reka_client.py +2 -1
  322. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  323. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  324. helm/clients/test_huggingface_client.py +3 -3
  325. helm/clients/test_openrouter_client.py +69 -0
  326. helm/clients/together_client.py +52 -13
  327. helm/clients/vertexai_client.py +23 -11
  328. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  329. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  330. helm/clients/vision_language/idefics_client.py +2 -1
  331. helm/clients/vision_language/open_flamingo_client.py +2 -1
  332. helm/clients/vision_language/paligemma_client.py +2 -1
  333. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  334. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  335. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  336. helm/clients/vllm_client.py +43 -7
  337. helm/clients/vllm_granite_thinking_client.py +56 -0
  338. helm/clients/writer_client.py +5 -2
  339. helm/common/critique_request.py +0 -1
  340. helm/common/hierarchical_logger.py +103 -34
  341. helm/common/object_spec.py +23 -8
  342. helm/common/optional_dependencies.py +1 -1
  343. helm/common/test_general.py +4 -0
  344. helm/common/test_logging.py +94 -0
  345. helm/config/model_deployments.yaml +1001 -187
  346. helm/config/model_metadata.yaml +602 -18
  347. helm/config/tokenizer_configs.yaml +202 -5
  348. helm/proxy/cli.py +1 -1
  349. helm/proxy/example_queries.py +8 -8
  350. helm/proxy/retry.py +5 -0
  351. helm/proxy/server.py +2 -1
  352. helm/proxy/static/index.css +4 -0
  353. helm/proxy/static/index.js +7 -1
  354. helm/tokenizers/auto_tokenizer.py +2 -2
  355. helm/tokenizers/grok_tokenizer.py +2 -0
  356. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  357. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  358. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  359. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  360. helm/benchmark/metrics/medalign_metrics.py +0 -14
  361. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  362. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  363. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  364. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  365. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  366. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  367. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  368. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  369. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  370. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  371. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
  372. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  373. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  374. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  375. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  376. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  377. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  378. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
  379. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  380. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
  381. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  382. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  383. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  384. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  385. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  386. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  387. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  388. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  389. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  390. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  391. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  392. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  393. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  394. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0

helm/config/tokenizer_configs.yaml CHANGED
@@ -251,6 +251,13 @@ tokenizer_configs:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
     end_of_text_token: "<eos>"
     prefix_token: "<bos>"
+  - name: google/medgemma-4b-it
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        trust_remote_code: true
+    end_of_text_token: "<eos>"
+    prefix_token: "<bos>"
 
   # Grok
   - name: xai/grok-3-beta
@@ -265,6 +272,12 @@ tokenizer_configs:
     end_of_text_token: ""
     prefix_token: ""
 
+  - name: xai/grok-4-0709
+    tokenizer_spec:
+      class_name: "helm.tokenizers.grok_tokenizer.GrokAPITokenizer"
+    end_of_text_token: ""
+    prefix_token: ""
+
   # Hf-internal-testing
 
   # Tokenizer name hf-internal-testing/llama-tokenizer is taken from:
@@ -454,7 +467,7 @@ tokenizer_configs:
 
   # Allen Institute for AI
   # The allenai/olmo-7b requires Python 3.9 or newer.
-  # To use the allenai/olmo-7b tokenizer, run `pip install crfm-helm[allenai]` first.
+  # To use the allenai/olmo-7b tokenizer, run `pip install "crfm-helm[allenai]"` first.
   - name: allenai/olmo-7b
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -582,6 +595,17 @@ tokenizer_configs:
     end_of_text_token: "</s>"
     prefix_token: "<s>"
 
+  # Moonshot AI
+  - name: moonshotai/kimi-k2-instruct
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: moonshotai/Kimi-K2-Instruct
+        trust_remote_code: true
+        revision: 4f239503ad9d1a042f0a4bacac457931ab972cfc
+    end_of_text_token: "[EOS]"
+    prefix_token: "[BOS]"
+
   # Nectec
   - name: nectec/OpenThaiLLM-Prebuilt-7B
     tokenizer_spec:
@@ -633,6 +657,12 @@ tokenizer_configs:
     end_of_text_token: "<|endoftext|>"
     prefix_token: "<|endoftext|>"
 
+  - name: openai/o200k_harmony
+    tokenizer_spec:
+      class_name: "helm.tokenizers.tiktoken_tokenizer.TiktokenTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|startoftext|>"
+
   - name: openai/clip-vit-large-patch14
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -688,6 +718,18 @@ tokenizer_configs:
     end_of_text_token: "<|im_end|>"
     prefix_token: "<|im_start|>"
 
+  - name: qwen/qwen3-235b-a22b-instruct-2507-fp8
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|im_end|>"
+    prefix_token: ""
+
+  - name: qwen/qwen3-next-80b-a3b-thinking
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|im_end|>"
+    prefix_token: ""
+
   - name: qwen/qwq-32b-preview
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -768,6 +810,12 @@ tokenizer_configs:
     end_of_text_token: "<|endoftext|>"
     prefix_token: ""
 
+  - name: tiiuae/falcon3-1b-instruct
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: ""
+
   # TsinghuaKEG
   - name: TsinghuaKEG/ice
     tokenizer_spec:
@@ -892,6 +940,23 @@ tokenizer_configs:
     end_of_text_token: ""
     prefix_token: ""
 
+  - name: ibm/granite-4.0-micro
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: ibm-granite/granite-4.0-micro
+    end_of_text_token: "<|end_of_text|>"
+    prefix_token: "<|end_of_text|>"
+
+  - name: ibm/granite-4.0-h-small
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: ibm-granite/granite-4.0-h-small
+    end_of_text_token: "<|end_of_text|>"
+    prefix_token: "<|end_of_text|>"
+
+  # Maritaca AI
   - name: maritaca-ai/sabia-7b
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -900,6 +965,14 @@ tokenizer_configs:
     end_of_text_token: "</s>"
     prefix_token: "<s>"
 
+  - name: maritaca-ai/sabia-2-tokenizer-medium
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: maritaca-ai/sabia-2-tokenizer-medium
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
   # Granite-3.1-8b-base
   - name: ibm-granite/granite-3.1-8b-base
     tokenizer_spec:
@@ -1022,7 +1095,6 @@ tokenizer_configs:
     end_of_text_token: ""
 
   # IBM Granite 3.3
-
   - name: ibm/granite-3.3-8b-instruct
     tokenizer_spec:
       class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
@@ -1031,7 +1103,12 @@ tokenizer_configs:
     end_of_text_token: "<|end_of_text|>"
     prefix_token: "<|end_of_text|>"
 
-
+  # Z.ai GLM-4.5-AIR-FP8
+  - name: zai-org/glm-4.5-air-fp8
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: ""
 
   # DeepSeek-R1-Distill-Llama-3.1-8b
   - name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
@@ -1042,6 +1119,20 @@ tokenizer_configs:
     end_of_text_token: "<|end▁of▁sentence|>"
     prefix_token: "<|begin▁of▁sentence|>"
 
+  # DeepSeek-R1-Distill-Llama-3.1-8b
+  - name: deepseek-ai/deepseek-r1-distill-llama-70b
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|end▁of▁sentence|>"
+    prefix_token: "<|begin▁of▁sentence|>"
+
+  # DeepSeek-R1-Distill-Qwen-14B
+  - name: deepseek-ai/deepseek-r1-distill-qwen-14b
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|end▁of▁sentence|>"
+    prefix_token: "<|begin▁of▁sentence|>"
+
   # deepseek-ai/deepseek-coder-6.7b-instruct
   - name: deepseek-ai/deepseek-coder-6.7b-instruct
     tokenizer_spec:
@@ -1051,7 +1142,6 @@ tokenizer_configs:
     end_of_text_token: "<|end▁of▁sentence|>"
     prefix_token: "<|begin▁of▁sentence|>"
 
-
   # vilm/vinallama-2.7b-chat
   - name: vilm/vinallama-2.7b-chat
     tokenizer_spec:
@@ -1104,4 +1194,111 @@ tokenizer_configs:
     args:
       pretrained_model_name_or_path: vinai/PhoGPT-4B-Chat
     end_of_text_token: "</s>"
-    prefix_token: "<s>"
+    prefix_token: "<s>"
+
+  # Gemma-3-Gaia-PT-BR-4b-it
+  - name: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: CEIA-UFG/Gemma-3-Gaia-PT-BR-4b-it
+    end_of_text_token: "<eos>"
+    prefix_token: "<bos>"
+
+  # Bode 13B Alpaca PT-BR
+  - name: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: recogna-nlp/bode-13b-alpaca-pt-br-no-peft
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  # Cabrita 7B PT-BR tokenizer
+  - name: 22h/cabrita_7b_pt_850000
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: 22h/cabrita_7b_pt_850000
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  # Gervásio 7B PT‑BR/PT‑PT tokenizer
+  - name: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: PORTULAN/gervasio-7b-portuguese-ptbr-decoder
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  # Tucano 2b4 PT-BR tokenizer
+  - name: TucanoBR/Tucano-2b4
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: TucanoBR/Tucano-2b4
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  # TeenyTinyLlama 460M PT-BR tokenizer
+  - name: nicholasKluge/TeenyTinyLlama-460m
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+      args:
+        pretrained_model_name_or_path: nicholasKluge/TeenyTinyLlama-460m
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  # AceGPT-v2
+  - name: freedomintelligence/acegpt-v2-8b-chat
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|end_of_text|>"
+    prefix_token: "<|begin_of_text|>"
+
+  - name: freedomintelligence/acegpt-v2-32b-chat
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: ""
+
+  - name: freedomintelligence/acegpt-v2-70b-chat
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|end_of_text|>"
+    prefix_token: "<|begin_of_text|>"
+
+  # ALLaM
+  - name: allam-ai/allam-7b-instruct-preview
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  # SILMA
+  - name: silma-ai/silma-9b-instruct-v1.0
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<eos>"
+    prefix_token: "<bos>"
+
+  # Jais Family
+  - name: inceptionai/jais-family-590m-chat
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "<|endoftext|>"
+    prefix_token: "<|endoftext|>"
+
+  # Jais Adapted
+  - name: inceptionai/jais-adapted-7b-chat
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
+
+  - name: inceptionai/jais-adapted-13b-chat
+    tokenizer_spec:
+      class_name: "helm.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer"
+    end_of_text_token: "</s>"
+    prefix_token: "<s>"
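
The entries above all share the same shape: a name, a tokenizer_spec (an object spec with a class_name and optional args), plus end_of_text_token and prefix_token. A minimal sketch for inspecting one of the new entries, assuming a source checkout and PyYAML installed (this is not the loader HELM itself uses):

import yaml  # PyYAML

# Load the shipped config file and index the entries by name.
with open("helm/config/tokenizer_configs.yaml") as f:
    configs = yaml.safe_load(f)["tokenizer_configs"]
by_name = {entry["name"]: entry for entry in configs}

# One of the entries added in 0.5.10 (see the hunk at +251 above).
medgemma = by_name["google/medgemma-4b-it"]
print(medgemma["tokenizer_spec"]["class_name"])  # HuggingFaceTokenizer class path
print(medgemma["end_of_text_token"], medgemma["prefix_token"])  # <eos> <bos>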
helm/proxy/cli.py CHANGED
@@ -123,7 +123,7 @@ def do_create_update_command(service: RemoteService, auth: Authentication, args)
 
     # Update quotas
     for quota_str in args.quotas:
-        m = re.match(f"(\w+)\.(\w+)=(\d+|{UNLIMITED_QUOTA})", quota_str)
+        m = re.match(rf"(\w+)\.(\w+)=(\d+|{UNLIMITED_QUOTA})", quota_str)
         if not m:
             raise Exception(
                 f"Invalid format: {quota_str}, expect <model_group>.<granularity>=<quota> "
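
The change above only adds the r prefix: in a plain f-string, sequences like \w and \d are invalid string escapes (currently a warning, eventually an error), while a raw f-string passes them through to the regex engine untouched. A small illustrative sketch, with UNLIMITED_QUOTA as a hypothetical stand-in value since its definition is not part of this diff:

import re

UNLIMITED_QUOTA = "unlimited"  # assumption: placeholder for the constant defined in cli.py

# Raw f-string: "\w" and "\d" reach re.match as regex escapes, not string escapes.
pattern = rf"(\w+)\.(\w+)=(\d+|{UNLIMITED_QUOTA})"
assert re.match(pattern, "gpt4.daily=10000")
assert re.match(pattern, "gpt4.monthly=unlimited")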
helm/proxy/example_queries.py CHANGED
@@ -21,7 +21,7 @@ example_queries = [
             """
             temperature: 0.5 # Medium amount of randomness
             stop_sequences: [.] # Stop when you hit a period
-            model: openai/gpt-3.5-turbo-0613
+            model: openai/gpt-4.1-nano-2025-04-14
             """
         ),
         environments="",
@@ -33,7 +33,7 @@ example_queries = [
             temperature: 0.5 # Medium amount of randomness
             stop_sequences: [\\n] # Stop when you hit a newline
             num_completions: 5 # Generate many samples
-            model: openai/gpt-3.5-turbo-0613
+            model: openai/gpt-4.1-nano-2025-04-14
             """
         ),
         environments="",
@@ -58,7 +58,7 @@ example_queries = [
             """
             temperature: 0 # Deterministic
             max_tokens: 50
-            model: openai/gpt-3.5-turbo-0613
+            model: openai/gpt-4.1-nano-2025-04-14
             """
         ),
         environments="",
@@ -76,7 +76,7 @@ example_queries = [
         environments=dedent(
             """
             occupation: [mathematician, lawyer, doctor]
-            model: [openai/gpt-3.5-turbo-0613, openai/gpt-3.5-turbo-1106]
+            model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
             """
         ),
     ),
@@ -101,7 +101,7 @@ example_queries = [
         ),
         environments=dedent(
             """
-            model: [openai/gpt-3.5-turbo-0613, openai/gpt-3.5-turbo-1106]
+            model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
             """
         ),
     ),
@@ -136,7 +136,7 @@ example_queries = [
         ),
         environments=dedent(
             """
-            model: [openai/gpt-3.5-turbo-0613, openai/gpt-3.5-turbo-1106]
+            model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
             """
         ),
     ),
@@ -144,7 +144,7 @@ example_queries = [
         prompt="Write a Python function that takes two vectors a and b and returns their Euclidean distance.",
         settings=dedent(
             """
-            model: openai/gpt-3.5-turbo-0613
+            model: openai/gpt-4.1-nano-2025-04-14
             """
         ),
         environments="",
@@ -161,7 +161,7 @@ example_queries = [
         ),
         environments=dedent(
             """
-            model: [openai/gpt-3.5-turbo-0613, openai/gpt-3.5-turbo-1106]
+            model: [openai/gpt-4.1-nano-2025-04-14, openai/gpt-4.1-mini-2025-04-14]
             """
         ),
     ),
helm/proxy/retry.py CHANGED
@@ -5,6 +5,7 @@ from retrying import Retrying
 from helm.common.request import RequestResult
 from helm.common.tokenization_request import TokenizationRequestResult
 from helm.common.hierarchical_logger import hlog
+import os
 import traceback
 import threading
 
@@ -19,6 +20,10 @@ Example usage:
         ...
 """
 
+# TODO: make these configurable at a config / cli level
+HELM_RETRIES = int(os.environ.get("HELM_RETRIES", "5"))
+HELM_TOKENIZER_RETRIES = int(os.environ.get("HELM_TOKENIZER_RETRIES", HELM_RETRIES))
+
 # The lock is used to prevent multiple threads from printing at the same time.
 # This can cause issues when printing the stack trace.
 # (The stack traces can get mixed up and become unreadable.)
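
With this change the retry counts are read from the environment at module load: HELM_RETRIES (default 5) and HELM_TOKENIZER_RETRIES (defaulting to whatever HELM_RETRIES resolved to). A minimal sketch of overriding them, assuming the variables are set before helm.proxy.retry is imported, since the lookups happen at import time:

import os

# Assumed usage: set these before anything imports helm.proxy.retry so the
# module-level os.environ.get() calls shown in the hunk above pick them up.
os.environ["HELM_RETRIES"] = "3"            # fewer request retries than the default of 5
os.environ["HELM_TOKENIZER_RETRIES"] = "1"  # tokenizer requests retried once

import helm.proxy.retry as retry  # noqa: E402  (import after setting the environment)
print(retry.HELM_RETRIES, retry.HELM_TOKENIZER_RETRIES)  # 3 1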
helm/proxy/server.py CHANGED
@@ -23,7 +23,7 @@ from helm.benchmark.model_deployment_registry import get_default_model_deploymen
 from helm.common.authentication import Authentication
 from helm.common.cache_backend_config import CacheBackendConfig, MongoCacheBackendConfig, SqliteCacheBackendConfig
 from helm.common.general import ensure_directory_exists
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, setup_default_logging
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import Request
 from helm.common.perspective_api_request import PerspectiveAPIRequest
@@ -273,6 +273,7 @@ def main():
         default="",
     )
     args = parser.parse_args()
+    setup_default_logging()
 
     register_builtin_configs_from_helm_package()
     register_configs_from_directory(args.base_path)
helm/proxy/static/index.css CHANGED
@@ -35,6 +35,10 @@
   font-style: italic;
 }
 
+.thinking {
+  font-style: italic;
+}
+
 .token:hover {
   background-color: lightgreen;
 }
helm/proxy/static/index.js CHANGED
@@ -282,7 +282,13 @@ $(function () {
     requestResult.completions.forEach((completion) => {
       const $contents = $("<span>", {
         title: `logprob: ${completion.logprob}`,
-      }).append(renderTokens(completion.tokens));
+      });
+      if (completion.thinking) {
+        const $thinking = $("<span>", { class: "thinking" }).append(completion.thinking.text);
+        $contents.append($thinking);
+      }
+      const $resultText = completion.tokens.length > 0 ?renderTokens(completion.tokens) : $("<div>").append(completion.text);
+      $contents.append($resultText);
       const $metadata = $("<span>", { class: "metadata" });
       $metadata.append(
         $("<span>", { title: "Log probability" }).append(
helm/tokenizers/auto_tokenizer.py CHANGED
@@ -8,7 +8,7 @@ from helm.common.credentials_utils import provide_api_key
 from helm.common.cache_backend_config import CacheBackendConfig, CacheConfig
 from helm.common.hierarchical_logger import hlog
 from helm.common.object_spec import create_object, inject_object_spec_args
-from helm.proxy.retry import retry_tokenizer_request
+from helm.proxy.retry import NonRetriableException, retry_tokenizer_request
 from helm.common.tokenization_request import (
     DecodeRequest,
     DecodeRequestResult,
@@ -50,7 +50,7 @@ class AutoTokenizer(Tokenizer):
             )
             tokenizer = create_object(tokenizer_spec)
         else:
-            hlog(f"No tokenizer config for {tokenizer_name}")
+            raise NonRetriableException(f"Could not find tokenizer config for {tokenizer_name}")
 
         # Cache the tokenizer
         assert isinstance(tokenizer, Tokenizer) # To make mypy happy
helm/tokenizers/grok_tokenizer.py CHANGED
@@ -34,6 +34,8 @@ class GrokAPITokenizer(CachingTokenizer):
                 "Set grokApiKey in credentials.conf or set the GROK_API_KEY environment variable"
             )
         text = request["text"]
+        if not text:
+            return {"token_ids": []}
         model = request["tokenizer"].split("/")[-1]
         response = requests.post(
             url="https://api.x.ai/v1/tokenize-text",
helm/benchmark/metrics/aci_bench_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.aci_bench_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class ACIBenchMetric(LLMJuryMetric):
-    """Score metrics for ACIBench."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="aci_bench_accuracy",
-            scenario_name="aci_bench",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/chw_care_plan_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.chw_care_plan_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class CHWCarePlanMetric(LLMJuryMetric):
-    """Score metrics for CHWCarePlan."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="chw_care_plan_accuracy",
-            scenario_name="chw_care_plan",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/dischargeme_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.dischargeme_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class DischargeMeMetric(LLMJuryMetric):
-    """Score metrics for DischargeMe."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="dischargeme_accuracy",
-            scenario_name="dischargeme",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/med_dialog_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.med_dialog_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MedDialogMetric(LLMJuryMetric):
-    """Score metrics for MedDialog."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="med_dialog_accuracy",
-            scenario_name="med_dialog",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/medalign_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.medalign_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MedalignMetric(LLMJuryMetric):
-    """Score metrics for Medalign."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="medalign_accuracy",
-            scenario_name="medalign",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/medi_qa_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.medi_qa_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MediQAMetric(LLMJuryMetric):
-    """Score metrics for MediQA."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="medi_qa_accuracy",
-            scenario_name="medi_qa",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/medication_qa_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.medication_qa_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MedicationQAMetric(LLMJuryMetric):
-    """Score metrics for MedicationQA."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="medication_qa_accuracy",
-            scenario_name="medication_qa",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/mental_health_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.mental_health_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MentalHealthMetric(LLMJuryMetric):
-    """Score metrics for MentalHealth."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="mental_health_accuracy",
-            scenario_name="mental_health",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/mimic_bhc_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.mimic_bhc_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MIMICBHCMetric(LLMJuryMetric):
-    """Score metrics for MIMICBHC."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="mimic_bhc_accuracy",
-            scenario_name="mimic_bhc",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/mimic_rrs_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.mimic_rrs_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MIMICRRSMetric(LLMJuryMetric):
-    """Score metrics for MIMICRRS."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="mimic_rrs_accuracy",
-            scenario_name="mimic_rrs",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/mtsamples_procedures_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.mtsamples_procedures_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MTSamplesProceduresMetric(LLMJuryMetric):
-    """Score metrics for MTSamplesProcedures."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="mtsamples_procedures_accuracy",
-            scenario_name="mtsamples_procedures",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )
helm/benchmark/metrics/mtsamples_replicate_metrics.py DELETED
@@ -1,14 +0,0 @@
-from helm.benchmark.annotation.mtsamples_replicate_annotator import ANNOTATOR_MODELS
-from helm.benchmark.metrics.llm_jury_metrics import LLMJuryMetric
-
-
-class MTSamplesReplicateMetric(LLMJuryMetric):
-    """Score metrics for MTSamplesReplicate."""
-
-    def __init__(self):
-        super().__init__(
-            metric_name="mtsamples_replicate_accuracy",
-            scenario_name="mtsamples_replicate",
-            annotator_models=ANNOTATOR_MODELS,
-            default_score=1.0,
-        )