crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl

This diff compares the contents of two package versions that were publicly released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.

Files changed (394)
  1. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
  2. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
  3. helm/benchmark/adaptation/adapter_spec.py +10 -0
  4. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
  5. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
  6. helm/benchmark/annotation/aci_bench_annotator.py +11 -22
  7. helm/benchmark/annotation/air_bench_annotator.py +1 -1
  8. helm/benchmark/annotation/alrage_annotator.py +90 -0
  9. helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
  10. helm/benchmark/annotation/dischargeme_annotator.py +11 -22
  11. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  12. helm/benchmark/annotation/med_dialog_annotator.py +11 -22
  13. helm/benchmark/annotation/medalign_annotator.py +11 -22
  14. helm/benchmark/annotation/medi_qa_annotator.py +11 -22
  15. helm/benchmark/annotation/medication_qa_annotator.py +11 -22
  16. helm/benchmark/annotation/mental_health_annotator.py +11 -22
  17. helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
  18. helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
  19. helm/benchmark/annotation/model_as_judge.py +23 -18
  20. helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
  21. helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
  22. helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
  23. helm/benchmark/metrics/air_bench_metrics.py +3157 -1
  24. helm/benchmark/metrics/alrage_metric.py +35 -0
  25. helm/benchmark/metrics/basic_metrics.py +267 -2
  26. helm/benchmark/metrics/bbq_metrics.py +12 -0
  27. helm/benchmark/metrics/classification_metrics.py +19 -1
  28. helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
  29. helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
  30. helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
  31. helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
  32. helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
  33. helm/benchmark/metrics/comet_metric.py +1 -1
  34. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
  35. helm/benchmark/metrics/copyright_metrics.py +1 -1
  36. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
  37. helm/benchmark/metrics/dry_run_metrics.py +30 -1
  38. helm/benchmark/metrics/efficiency_metrics.py +74 -0
  39. helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
  40. helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
  41. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
  42. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
  43. helm/benchmark/metrics/ifeval_metrics.py +13 -1
  44. helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
  45. helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
  46. helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
  47. helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
  48. helm/benchmark/metrics/language_modeling_metrics.py +13 -1
  49. helm/benchmark/metrics/live_qa_metrics.py +13 -1
  50. helm/benchmark/metrics/llm_jury_metrics.py +13 -1
  51. helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
  52. helm/benchmark/metrics/lmkt_metrics.py +47 -0
  53. helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
  54. helm/benchmark/metrics/medec_metrics.py +25 -2
  55. helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
  56. helm/benchmark/metrics/metric.py +25 -0
  57. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
  58. helm/benchmark/metrics/omni_math_metrics.py +13 -1
  59. helm/benchmark/metrics/safety_metrics.py +13 -1
  60. helm/benchmark/metrics/seahelm_metrics.py +14 -1
  61. helm/benchmark/metrics/summac/model_summac.py +3 -3
  62. helm/benchmark/metrics/summarization_metrics.py +129 -1
  63. helm/benchmark/metrics/toxicity_metrics.py +31 -1
  64. helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
  65. helm/benchmark/metrics/wildbench_metrics.py +21 -1
  66. helm/benchmark/model_deployment_registry.py +11 -19
  67. helm/benchmark/presentation/create_plots.py +11 -2
  68. helm/benchmark/presentation/run_display.py +13 -3
  69. helm/benchmark/presentation/run_entry.py +2 -2
  70. helm/benchmark/presentation/schema.py +10 -22
  71. helm/benchmark/presentation/summarize.py +189 -14
  72. helm/benchmark/presentation/taxonomy_info.py +20 -0
  73. helm/benchmark/presentation/test_create_plots.py +4 -1
  74. helm/benchmark/run.py +15 -4
  75. helm/benchmark/run_expander.py +4 -0
  76. helm/benchmark/run_specs/arabic_run_specs.py +197 -0
  77. helm/benchmark/run_specs/bluex_run_specs.py +40 -0
  78. helm/benchmark/run_specs/classic_run_specs.py +2 -55
  79. helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
  80. helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
  81. helm/benchmark/run_specs/heim_run_specs.py +3 -1
  82. helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
  83. helm/benchmark/run_specs/long_context_run_specs.py +48 -1
  84. helm/benchmark/run_specs/medhelm/__init__.py +0 -0
  85. helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
  86. helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
  87. helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
  88. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
  89. helm/benchmark/runner.py +7 -0
  90. helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
  91. helm/benchmark/scenarios/air_bench_scenario.py +21 -0
  92. helm/benchmark/scenarios/alghafa_scenario.py +126 -0
  93. helm/benchmark/scenarios/alrage_scenario.py +54 -0
  94. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
  95. helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
  96. helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
  97. helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
  98. helm/benchmark/scenarios/aratrust_scenario.py +95 -0
  99. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
  100. helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
  101. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
  102. helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
  103. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
  104. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
  105. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
  106. helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
  107. helm/benchmark/scenarios/banking77_scenario.py +21 -0
  108. helm/benchmark/scenarios/bbq_scenario.py +15 -0
  109. helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
  110. helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
  111. helm/benchmark/scenarios/bluex_scenario.py +70 -0
  112. helm/benchmark/scenarios/bold_scenario.py +15 -0
  113. helm/benchmark/scenarios/boolq_scenario.py +20 -0
  114. helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
  115. helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
  116. helm/benchmark/scenarios/clear_scenario.py +23 -0
  117. helm/benchmark/scenarios/cleva_scenario.py +480 -1
  118. helm/benchmark/scenarios/code_scenario.py +28 -0
  119. helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
  120. helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
  121. helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
  122. helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
  123. helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
  124. helm/benchmark/scenarios/commonsense_scenario.py +32 -0
  125. helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
  126. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
  127. helm/benchmark/scenarios/copyright_scenario.py +35 -1
  128. helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
  129. helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
  130. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
  131. helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
  132. helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
  133. helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
  134. helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
  135. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
  136. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
  137. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
  138. helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
  139. helm/benchmark/scenarios/disinformation_scenario.py +22 -0
  140. helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
  141. helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
  142. helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
  143. helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
  144. helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
  145. helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
  146. helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
  147. helm/benchmark/scenarios/financebench_scenario.py +21 -0
  148. helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
  149. helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
  150. helm/benchmark/scenarios/gpqa_scenario.py +18 -0
  151. helm/benchmark/scenarios/grammar_scenario.py +20 -1
  152. helm/benchmark/scenarios/gsm_scenario.py +21 -0
  153. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
  154. helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
  155. helm/benchmark/scenarios/headqa_scenario.py +22 -0
  156. helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
  157. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
  158. helm/benchmark/scenarios/ice_scenario.py +21 -1
  159. helm/benchmark/scenarios/ifeval_scenario.py +18 -0
  160. helm/benchmark/scenarios/imdb_scenario.py +15 -0
  161. helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
  162. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
  163. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
  164. helm/benchmark/scenarios/koala_scenario.py +21 -1
  165. helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
  166. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
  167. helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
  168. helm/benchmark/scenarios/legal_support_scenario.py +13 -0
  169. helm/benchmark/scenarios/legalbench_scenario.py +19 -0
  170. helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
  171. helm/benchmark/scenarios/lextreme_scenario.py +11 -0
  172. helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
  173. helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
  174. helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
  175. helm/benchmark/scenarios/math_scenario.py +54 -20
  176. helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
  177. helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
  178. helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
  179. helm/benchmark/scenarios/med_qa_scenario.py +20 -0
  180. helm/benchmark/scenarios/medalign_scenario.py +23 -0
  181. helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
  182. helm/benchmark/scenarios/medbullets_scenario.py +22 -0
  183. helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
  184. helm/benchmark/scenarios/medec_scenario.py +23 -0
  185. helm/benchmark/scenarios/medhallu_scenario.py +23 -0
  186. helm/benchmark/scenarios/medhelm/__init__.py +0 -0
  187. helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
  188. helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
  189. helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
  190. helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
  191. helm/benchmark/scenarios/melt_scenarios.py +2 -2
  192. helm/benchmark/scenarios/mental_health_scenario.py +23 -0
  193. helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
  194. helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
  195. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
  196. helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
  197. helm/benchmark/scenarios/mmlu_scenario.py +21 -0
  198. helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
  199. helm/benchmark/scenarios/msmarco_scenario.py +30 -0
  200. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
  201. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
  202. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
  203. helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
  204. helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
  205. helm/benchmark/scenarios/omni_math_scenario.py +18 -0
  206. helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
  207. helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
  208. helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
  209. helm/benchmark/scenarios/quac_scenario.py +14 -0
  210. helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
  211. helm/benchmark/scenarios/raft_scenario.py +15 -0
  212. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
  213. helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
  214. helm/benchmark/scenarios/scenario.py +31 -0
  215. helm/benchmark/scenarios/seahelm_scenario.py +350 -2
  216. helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
  217. helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
  218. helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
  219. helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
  220. helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
  221. helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
  222. helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
  223. helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
  224. helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
  225. helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
  226. helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
  227. helm/benchmark/scenarios/situation_prompts.yaml +49 -0
  228. helm/benchmark/scenarios/spider_scenario.py +18 -0
  229. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
  230. helm/benchmark/scenarios/summarization_scenario.py +37 -0
  231. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
  232. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
  233. helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
  234. helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
  235. helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
  236. helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
  237. helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
  238. helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
  239. helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
  240. helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
  241. helm/benchmark/scenarios/the_pile_scenario.py +13 -1
  242. helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
  243. helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
  244. helm/benchmark/scenarios/vicuna_scenario.py +21 -1
  245. helm/benchmark/scenarios/wikifact_scenario.py +20 -0
  246. helm/benchmark/scenarios/wildbench_scenario.py +18 -0
  247. helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
  248. helm/benchmark/slurm_jobs.py +1 -2
  249. helm/benchmark/slurm_runner.py +8 -1
  250. helm/benchmark/static/schema_arabic.yaml +271 -0
  251. helm/benchmark/static/schema_classic.yaml +0 -17
  252. helm/benchmark/static/schema_long_context.yaml +17 -18
  253. helm/benchmark/static/schema_medhelm.yaml +36 -0
  254. helm/benchmark/static/schema_slp.yaml +219 -0
  255. helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
  256. helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
  257. helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
  258. helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
  259. helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
  260. helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
  261. helm/benchmark/static_build/index.html +5 -6
  262. helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
  263. helm/clients/ai21_client.py +2 -0
  264. helm/clients/aleph_alpha_client.py +2 -0
  265. helm/clients/anthropic_client.py +7 -1
  266. helm/clients/audio_language/diva_llama_client.py +2 -0
  267. helm/clients/audio_language/llama_omni/arguments.py +61 -0
  268. helm/clients/audio_language/llama_omni/constants.py +9 -0
  269. helm/clients/audio_language/llama_omni/conversation.py +213 -0
  270. helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
  271. helm/clients/audio_language/llama_omni/model/builder.py +88 -0
  272. helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
  273. helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
  274. helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
  275. helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
  276. helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
  277. helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
  278. helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
  279. helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
  280. helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
  281. helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
  282. helm/clients/audio_language/llama_omni/preprocess.py +295 -0
  283. helm/clients/audio_language/llama_omni/utils.py +202 -0
  284. helm/clients/audio_language/llama_omni_client.py +2 -1
  285. helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
  286. helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
  287. helm/clients/audio_language/qwen_audiolm_client.py +2 -1
  288. helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
  289. helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
  290. helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
  291. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
  292. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
  293. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
  294. helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
  295. helm/clients/bedrock_client.py +63 -6
  296. helm/clients/cohere_client.py +3 -0
  297. helm/clients/dspy_client.py +135 -0
  298. helm/clients/google_client.py +2 -0
  299. helm/clients/http_model_client.py +2 -0
  300. helm/clients/huggingface_client.py +4 -3
  301. helm/clients/ibm_client.py +3 -1
  302. helm/clients/image_generation/adobe_vision_client.py +2 -0
  303. helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
  304. helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
  305. helm/clients/image_generation/cogview2_client.py +2 -1
  306. helm/clients/image_generation/dalle2_client.py +2 -0
  307. helm/clients/image_generation/dalle_mini_client.py +2 -1
  308. helm/clients/image_generation/deep_floyd_client.py +2 -0
  309. helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
  310. helm/clients/image_generation/lexica_client.py +2 -0
  311. helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
  312. helm/clients/image_generation/mindalle_client.py +2 -1
  313. helm/clients/image_generation/together_image_generation_client.py +2 -0
  314. helm/clients/megatron_client.py +2 -0
  315. helm/clients/mistral_client.py +2 -0
  316. helm/clients/moderation_api_client.py +2 -0
  317. helm/clients/openai_client.py +38 -21
  318. helm/clients/openai_responses_client.py +34 -8
  319. helm/clients/openrouter_client.py +31 -0
  320. helm/clients/palmyra_client.py +2 -1
  321. helm/clients/reka_client.py +2 -1
  322. helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
  323. helm/clients/stanfordhealthcare_http_model_client.py +2 -0
  324. helm/clients/test_huggingface_client.py +3 -3
  325. helm/clients/test_openrouter_client.py +69 -0
  326. helm/clients/together_client.py +52 -13
  327. helm/clients/vertexai_client.py +23 -11
  328. helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
  329. helm/clients/vision_language/huggingface_vlm_client.py +2 -0
  330. helm/clients/vision_language/idefics_client.py +2 -1
  331. helm/clients/vision_language/open_flamingo_client.py +2 -1
  332. helm/clients/vision_language/paligemma_client.py +2 -1
  333. helm/clients/vision_language/palmyra_vision_client.py +2 -0
  334. helm/clients/vision_language/qwen2_vlm_client.py +2 -1
  335. helm/clients/vision_language/qwen_vlm_client.py +2 -1
  336. helm/clients/vllm_client.py +43 -7
  337. helm/clients/vllm_granite_thinking_client.py +56 -0
  338. helm/clients/writer_client.py +5 -2
  339. helm/common/critique_request.py +0 -1
  340. helm/common/hierarchical_logger.py +103 -34
  341. helm/common/object_spec.py +23 -8
  342. helm/common/optional_dependencies.py +1 -1
  343. helm/common/test_general.py +4 -0
  344. helm/common/test_logging.py +94 -0
  345. helm/config/model_deployments.yaml +1001 -187
  346. helm/config/model_metadata.yaml +602 -18
  347. helm/config/tokenizer_configs.yaml +202 -5
  348. helm/proxy/cli.py +1 -1
  349. helm/proxy/example_queries.py +8 -8
  350. helm/proxy/retry.py +5 -0
  351. helm/proxy/server.py +2 -1
  352. helm/proxy/static/index.css +4 -0
  353. helm/proxy/static/index.js +7 -1
  354. helm/tokenizers/auto_tokenizer.py +2 -2
  355. helm/tokenizers/grok_tokenizer.py +2 -0
  356. helm/benchmark/metrics/aci_bench_metrics.py +0 -14
  357. helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
  358. helm/benchmark/metrics/dischargeme_metrics.py +0 -14
  359. helm/benchmark/metrics/med_dialog_metrics.py +0 -14
  360. helm/benchmark/metrics/medalign_metrics.py +0 -14
  361. helm/benchmark/metrics/medi_qa_metrics.py +0 -14
  362. helm/benchmark/metrics/medication_qa_metrics.py +0 -14
  363. helm/benchmark/metrics/mental_health_metrics.py +0 -14
  364. helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
  365. helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
  366. helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
  367. helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
  368. helm/benchmark/metrics/numeracy_metrics.py +0 -72
  369. helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
  370. helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
  371. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
  372. helm/benchmark/scenarios/numeracy_scenario.py +0 -794
  373. helm/benchmark/static_build/assets/index-94295e78.js +0 -10
  374. helm/benchmark/static_build/assets/index-b9779128.css +0 -1
  375. helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
  376. helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
  377. helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
  378. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
  379. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
  380. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
  381. {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
  382. /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
  383. /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
  384. /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
  385. /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
  386. /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
  387. /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
  388. /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
  389. /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
  390. /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
  391. /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
  392. /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
  393. /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
  394. /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
--- /dev/null
+++ b/helm/clients/test_openrouter_client.py
@@ -0,0 +1,69 @@
+import os
+import pytest
+import tempfile
+
+from helm.common.cache import BlackHoleCacheConfig, SqliteCacheConfig
+from helm.common.request import Request
+from helm.clients.openrouter_client import OpenRouterClient
+
+from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
+
+
+class TestOpenRouterClient:
+    def setup_method(self, method):
+        cache_file = tempfile.NamedTemporaryFile(delete=False)
+        self.cache_path: str = cache_file.name
+        self.tokenizer_name = "mistralai/Mistral-7B-v0.1"
+        self.tokenizer = HuggingFaceTokenizer(
+            cache_config=BlackHoleCacheConfig(),
+            tokenizer_name=self.tokenizer_name,
+        )
+
+    def teardown_method(self, method):
+        os.remove(self.cache_path)
+
+    @pytest.mark.parametrize(
+        "model_name,test_input,expected_model",
+        [
+            (
+                "mistralai/mistral-medium-3.1",
+                Request(
+                    model="mistralai/mistral-medium-3.1",
+                    model_deployment="openrouter/mistral-medium-3.1",
+                ),
+                "mistralai/mistral-medium-3.1",
+            ),
+            (
+                None,
+                Request(model="openai/gpt-oss-20b:free", model_deployment="openrouter/gpt-oss-20b:free"),
+                "openai/gpt-oss-20b:free",
+            ),
+        ],
+    )
+    def test_get_model_for_request(self, model_name, test_input, expected_model):
+        client = OpenRouterClient(
+            tokenizer_name=self.tokenizer_name,
+            tokenizer=self.tokenizer,
+            cache_config=SqliteCacheConfig(self.cache_path),
+            model_name=model_name,
+            api_key="test_key",
+        )
+        assert client._get_model_for_request(test_input) == expected_model
+
+    def test_api_key_env_var(self, monkeypatch):
+        monkeypatch.setenv("OPENROUTER_API_KEY", "test_key")
+        client = OpenRouterClient(
+            tokenizer_name=self.tokenizer_name,
+            tokenizer=self.tokenizer,
+            cache_config=SqliteCacheConfig(self.cache_path),
+        )
+        assert client.api_key == "test_key"
+
+    def test_api_key_argument(self):
+        client = OpenRouterClient(
+            tokenizer_name=self.tokenizer_name,
+            tokenizer=self.tokenizer,
+            cache_config=BlackHoleCacheConfig(),
+            api_key="explicit_key",
+        )
+        assert client.api_key == "explicit_key"
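The parametrized test above pins down the model-resolution rule: a model_name passed to the OpenRouterClient constructor takes precedence, and otherwise the model named in the Request is sent to OpenRouter. A minimal sketch of that dispatch, inferred from the test expectations rather than copied from openrouter_client.py:

# Sketch of the model-resolution rule exercised by test_get_model_for_request.
# Inferred from the test cases; the real logic lives in
# helm/clients/openrouter_client.py and may differ in detail.
from typing import Optional

from helm.common.request import Request


def get_model_for_request(request: Request, model_name: Optional[str]) -> str:
    # A constructor-supplied model_name (first test case) overrides the request;
    # otherwise the request's own model (second test case) is sent to OpenRouter.
    return model_name if model_name is not None else request.model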
--- a/helm/clients/together_client.py
+++ b/helm/clients/together_client.py
@@ -9,6 +9,7 @@ import requests
 from retrying import retry
 
 from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.media_object import IMAGE_TYPE, TEXT_TYPE
 from helm.common.object_spec import get_class_by_name
 from helm.common.optional_dependencies import handle_module_not_found_error
@@ -25,8 +26,6 @@ except ModuleNotFoundError as e:
 class _RewriteRequestTags:
     """Tags that indicate that the request for the model must be rewritten before sending to Together."""
 
-    # TODO: Convert to StrEnum after upgrading to Python 3.11
-
     ADD_EOS_TOKEN_AS_STOP_SEQUENCE = "ADD_EOS_TOKEN_AS_STOP_SEQUENCE"
     """Indicates that the EOS token should be added as an extra stop sequence.
 
@@ -101,7 +100,20 @@ class JobNotFinishedError(TogetherClientError):
     pass
 
 
-def _parse_thinking(input: str) -> Tuple[str, str]:
+def _parse_thinking_deepseek_r1(input: str) -> Tuple[str, str]:
+    """Return a tuple of thinking text and output text."""
+    match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), match.group(2))
+
+    match = re.match(r"<think>\n?(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), "")
+
+    return (input, "")
+
+
+def _parse_thinking_qwen3(input: str) -> Tuple[str, str]:
     """Return a tuple of thinking text and output text."""
     match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
     if match:
@@ -114,6 +126,31 @@ def _parse_thinking(input: str) -> Tuple[str, str]:
     return (input, "")
 
 
+def _parse_thinking_glm_4_5(input: str) -> Tuple[str, str]:
+    """Return a tuple of thinking text and output text."""
+    match = re.match(r"\n<think>(.*)</think>(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), match.group(2))
+
+    match = re.match(r"\n<think>(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), "")
+
+    return (input, "")
+
+
+def _parse_thinking(input: str, model_name: str) -> Tuple[str, str]:
+    # TODO: Come up with a more sustainable extensible way of doing this.
+    if "deepseek-r1" in model_name:
+        return _parse_thinking_deepseek_r1(input)
+    elif "qwen3" in model_name:
+        return _parse_thinking_qwen3(input)
+    elif "glm-4.5" in model_name:
+        return _parse_thinking_glm_4_5(input)
+    else:
+        raise Exception(f"No thinking parser available for model {model_name}")
+
+
 class TogetherClient(CachingClient):
     """
     Client for the models where we evaluate offline. Since the queries are handled offline, the `TogetherClient` just
@@ -237,6 +274,7 @@ class TogetherClient(CachingClient):
         try:
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it_sync))
         except Exception as error:
+            hexception(error)
             return RequestResult(
                 success=False,
                 cached=False,
@@ -348,9 +386,8 @@ class TogetherChatClient(CachingClient):
         self._client = Together(api_key=api_key)
         self._together_model = together_model
         self._disable_logprobs = bool(disable_logprobs)
-        # self.output_processor is actually a function, not a class
         self._parse_thinking = bool(parse_thinking)
-
+        # self.output_processor is actually a function, not a class
         self.output_processor: Optional[Callable[[str], str]] = (
             get_class_by_name(output_processor) if output_processor else None
         )
@@ -420,6 +457,7 @@ class TogetherChatClient(CachingClient):
             raw_response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
             response = ChatCompletionResponse.model_validate(raw_response)
         except Exception as error:
+            hexception(error)
             return RequestResult(
                 success=False,
                 cached=False,
@@ -446,15 +484,15 @@ class TogetherChatClient(CachingClient):
             if self.output_processor:
                 output_text = self.output_processor(output_text)
 
+            thinking: Optional[Thinking] = None
             if self._parse_thinking:
-                thinking_text, output_text = _parse_thinking(output_text)
-                generated_outputs.append(
-                    GeneratedOutput(
-                        text=output_text, logprob=logprob, tokens=tokens, thinking=Thinking(text=thinking_text)
-                    )
-                )
-            else:
-                generated_outputs.append(GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens))
+                thinking_text, output_text = _parse_thinking(output_text, request.model)
+                thinking = Thinking(text=thinking_text)
+            elif hasattr(choice.message, "reasoning_content"):
+                thinking = Thinking(text=choice.message.reasoning_content)
+            generated_outputs.append(
+                GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens, thinking=thinking)
+            )
 
         return RequestResult(
             success=True,
             cached=cached,
@@ -527,6 +565,7 @@ class TogetherCompletionClient(CachingClient):
             raw_response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
             response = CompletionResponse.model_validate(raw_response)
         except Exception as error:
+            hexception(error)
             return RequestResult(
                 success=False,
                 cached=False,
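The per-model _parse_thinking_* helpers above differ only in the whitespace and tag layout each model family emits around <think> blocks. A quick self-contained illustration of the DeepSeek-R1 variant, reusing the exact regexes from the diff (the sample strings are invented):

import re
from typing import Tuple


def _parse_thinking_deepseek_r1(input: str) -> Tuple[str, str]:
    """Return a tuple of thinking text and output text (regexes copied from the diff above)."""
    match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
    if match:
        return (match.group(1), match.group(2))
    match = re.match(r"<think>\n?(.*)", input, re.DOTALL)
    if match:
        return (match.group(1), "")
    return (input, "")


# A completed generation yields both parts; a generation truncated before
# </think> is treated as all thinking and no output.
assert _parse_thinking_deepseek_r1("<think>\nstep 1\n</think>\n\n42") == ("step 1", "42")
assert _parse_thinking_deepseek_r1("<think>\nstep 1") == ("step 1", "")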
--- a/helm/clients/vertexai_client.py
+++ b/helm/clients/vertexai_client.py
@@ -1,9 +1,10 @@
 import requests
 from abc import ABC, abstractmethod
 from threading import Lock
-from typing import Any, Dict, Mapping, Optional, List, Union
+from typing import Any, Dict, Mapping, Optional, List, Union, cast
 
 from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.multimodal_request_utils import get_contents_as_bytes
 from helm.common.media_object import TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error
@@ -107,7 +108,7 @@ class VertexAITextClient(VertexAIClient):
 
     def make_request(self, request: Request) -> RequestResult:
         """Make a request"""
-        parameters = {
+        parameters: Dict[str, Any] = {
             "temperature": request.temperature,
             "max_output_tokens": request.max_tokens,
             "top_k": request.top_k_per_token,
@@ -152,6 +153,7 @@ class VertexAITextClient(VertexAIClient):
 
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except (requests.exceptions.RequestException, AssertionError) as e:
+            hexception(e)
            error: str = f"VertexAITextClient error: {e}"
            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
@@ -207,21 +209,23 @@ class VertexAIChatClient(VertexAIClient):
 
     def make_request(self, request: Request) -> RequestResult:
         """Make a request"""
-        contents = [request.prompt]
+        # mypy is unhappy without this cast
+        contents: Union[List[Union[str, Image, Part]], List[Content]] = cast(
+            List[Union[str, Image, Part]], [request.prompt]
+        )
 
         # For the multimodal case, build up the content with the media objects of `request.multimodal_prompt`
         if request.multimodal_prompt is not None:
             return self._make_multimodal_request(request)
 
         if request.messages is not None:
-            contents = []
             role_mapping = {"user": "user", "assistant": "model"}
-            for msg in request.messages:
-                contents.append(
-                    Content(role=role_mapping.get(msg["role"], "user"), parts=[Part.from_text(msg["content"])])
-                )
+            contents = [
+                Content(role=role_mapping.get(msg["role"], "user"), parts=[Part.from_text(msg["content"])])
+                for msg in request.messages
+            ]
 
-        parameters = {
+        parameters: Dict[str, Any] = {
             "temperature": request.temperature,
             "max_output_tokens": request.max_tokens,
             "top_k": request.top_k_per_token,
@@ -274,8 +278,14 @@ class VertexAIChatClient(VertexAIClient):
             if not candidate.content:
                 raise VertexAIContentBlockedError(f"No content in candidate: {candidate}")
             if not candidate.content.parts:
-                raise VertexAIContentBlockedError(f"No content parts in candidate: {candidate}")
-            predictions.append({"text": candidate.content.text})
+                if candidate.finish_reason == 2:  # MAX_TOKENS
+                    # This means that there is no text output because the maximum number of tokens were
+                    # reached during thinking.
+                    predictions.append({"text": ""})
+                else:
+                    raise VertexAIContentBlockedError(f"No content parts in candidate: {candidate}")
+            else:
+                predictions.append({"text": candidate.content.text})
             # TODO: Extract more information from the response
             return {"predictions": predictions}
 
@@ -302,6 +312,7 @@ class VertexAIChatClient(VertexAIClient):
                 error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
             )
         except (requests.exceptions.RequestException, AssertionError) as e:
+            hexception(e)
            error: str = f"VertexAITextClient error: {e}"
            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 
@@ -432,6 +443,7 @@ class VertexAIChatClient(VertexAIClient):
             cache_key = self.make_cache_key_with_safety_settings_preset(raw_cache_key, request)
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except requests.exceptions.RequestException as e:
+            hexception(e)
            error: str = f"Gemini Vision error: {e}"
            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
        except VertexAIContentBlockedError as e:
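The new branch in VertexAIChatClient treats finish_reason == 2 as Vertex AI's MAX_TOKENS, i.e. the token budget was exhausted while the model was still thinking, so an empty completion is returned instead of raising VertexAIContentBlockedError. A condensed sketch of that logic (the candidate argument stands in for a Vertex AI candidate object, not a real import):

# Condensed restatement of the new fallback. finish_reason == 2 corresponds to
# MAX_TOKENS in Vertex AI's FinishReason enum; "candidate" is a stand-in for
# the vertexai Candidate object.
def extract_prediction(candidate) -> dict:
    if not candidate.content.parts:
        if candidate.finish_reason == 2:  # MAX_TOKENS: budget spent while thinking
            return {"text": ""}
        raise ValueError(f"No content parts in candidate: {candidate}")
    return {"text": candidate.content.text}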
--- a/helm/clients/vision_language/huggingface_vision2seq_client.py
+++ b/helm/clients/vision_language/huggingface_vision2seq_client.py
@@ -8,7 +8,7 @@ import torch
 
 from helm.common.cache import CacheConfig
 from helm.common.gpu_utils import get_torch_device_name, is_cuda_available
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token
 from helm.common.request import wrap_request_time
@@ -125,6 +125,7 @@ class HuggingFaceVision2SeqClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as model_error:
+            hexception(model_error)
             return RequestResult(success=False, cached=False, error=str(model_error), completions=[], embedding=[])
 
         for text in result["output"]:
--- a/helm/clients/vision_language/huggingface_vlm_client.py
+++ b/helm/clients/vision_language/huggingface_vlm_client.py
@@ -5,6 +5,7 @@ from transformers import pipeline
 from transformers.pipelines import ImageToTextPipeline
 
 from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.images_utils import open_image
 from helm.common.media_object import TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error
@@ -93,6 +94,7 @@ class HuggingFaceVLMClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as e:
+            hexception(e)
             return RequestResult(success=False, cached=False, error=str(e), completions=[], embedding=[])
 
         output: str = result["generated_text"]
--- a/helm/clients/vision_language/idefics_client.py
+++ b/helm/clients/vision_language/idefics_client.py
@@ -8,7 +8,7 @@ from transformers import IdeficsForVisionText2Text, AutoProcessor, IdeficsProcessor
 from helm.common.cache import CacheConfig
 from helm.common.images_utils import open_image
 from helm.common.gpu_utils import get_torch_device_name
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.media_object import TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token
@@ -137,6 +137,7 @@ class IDEFICSClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as model_error:
+            hexception(model_error)
             return RequestResult(success=False, cached=False, error=str(model_error), completions=[], embedding=[])
 
         for text in result["output"]:
--- a/helm/clients/vision_language/open_flamingo_client.py
+++ b/helm/clients/vision_language/open_flamingo_client.py
@@ -5,7 +5,7 @@ import torch
 from huggingface_hub import hf_hub_download
 
 from helm.common.cache import CacheConfig
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.images_utils import open_image
 from helm.common.gpu_utils import get_torch_device_name
 from helm.common.media_object import TEXT_TYPE
@@ -131,6 +131,7 @@ class OpenFlamingoClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as ex:
+            hexception(ex)
             return RequestResult(success=False, cached=False, error=str(ex), completions=[], embedding=[])
 
         completions: List[GeneratedOutput] = []
--- a/helm/clients/vision_language/paligemma_client.py
+++ b/helm/clients/vision_language/paligemma_client.py
@@ -8,7 +8,7 @@ from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
 from helm.common.cache import CacheConfig
 from helm.common.images_utils import open_image
 from helm.common.gpu_utils import get_torch_device_name
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.media_object import TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token
@@ -126,6 +126,7 @@ class PaliGemmaClient(CachingClient):
                 result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
                 concat_results.append(result)
             except RuntimeError as model_error:
+                hexception(model_error)
                 return RequestResult(success=False, cached=False, error=str(model_error), completions=[], embedding=[])
 
         for result in concat_results:
--- a/helm/clients/vision_language/palmyra_vision_client.py
+++ b/helm/clients/vision_language/palmyra_vision_client.py
@@ -5,6 +5,7 @@ import requests
 
 from helm.common.cache import CacheConfig
 from helm.common.images_utils import encode_base64
+from helm.common.hierarchical_logger import hexception
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import Request, RequestResult, GeneratedOutput, ErrorFlags
 from helm.common.request import wrap_request_time
@@ -76,6 +77,7 @@ class PalmyraVisionClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except PalmyraVisionContentBlockedError as ex:
+            hexception(ex)
             return RequestResult(
                 success=False,
                 cached=False,
--- a/helm/clients/vision_language/qwen2_vlm_client.py
+++ b/helm/clients/vision_language/qwen2_vlm_client.py
@@ -8,7 +8,7 @@ import torch
 
 from helm.common.cache import CacheConfig
 from helm.common.gpu_utils import get_torch_device_name
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token
 from helm.common.request import wrap_request_time
@@ -157,6 +157,7 @@ class Qwen2VLMClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as model_error:
+            hexception(model_error)
             return RequestResult(
                 success=False,
                 cached=False,
--- a/helm/clients/vision_language/qwen_vlm_client.py
+++ b/helm/clients/vision_language/qwen_vlm_client.py
@@ -7,7 +7,7 @@ from transformers.generation import GenerationConfig
 
 from helm.common.cache import CacheConfig
 from helm.common.gpu_utils import get_torch_device_name
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token
 from helm.common.request import wrap_request_time
@@ -139,6 +139,7 @@ class QwenVLMClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as model_error:
+            hexception(model_error)
             return RequestResult(
                 success=False, cached=False, error=str(model_error), completions=[], embedding=[]
             )
--- a/helm/clients/vllm_client.py
+++ b/helm/clients/vllm_client.py
@@ -2,7 +2,7 @@ from typing import Any, Dict, Optional
 
 from helm.common.cache import CacheConfig
 from helm.common.request import Request
-from helm.clients.openai_client import OpenAILegacyCompletionsClient
+from helm.clients.openai_client import OpenAIClient, OpenAILegacyCompletionsClient
 from helm.tokenizers.tokenizer import Tokenizer
 
 
@@ -19,6 +19,8 @@ class VLLMClient(OpenAILegacyCompletionsClient):
         tokenizer_name: str,
         cache_config: CacheConfig,
         base_url: Optional[str] = None,
+        vllm_model_name: Optional[str] = None,
+        **kwargs,
     ):
         super().__init__(
             tokenizer=tokenizer,
@@ -27,18 +29,52 @@ class VLLMClient(OpenAILegacyCompletionsClient):
             api_key="EMPTY",
             org_id=None,
             base_url=base_url,
+            openai_model_name=vllm_model_name,
+            **kwargs,
         )
         self.tokenizer = tokenizer
         self.tokenizer_name = tokenizer_name
-
-    def _get_model_for_request(self, request: Request) -> str:
-        # The `model` parameter for vLLM should be the whole model name including the creator organization,
-        # unlike OpenAI which only uses the model engine.
-        return request.model
+        self.vllm_model_name = vllm_model_name
 
     def _to_raw_completion_request(self, request: Request) -> Dict[str, Any]:
         raw_request = super()._to_raw_completion_request(request)
         # This avoids the error: best_of must be 1 when using greedy sampling
-        if "best_of" in raw_request and raw_request["best_of"] > 1:
+        if (
+            "temperature" in raw_request
+            and raw_request["temperature"] == 0.0
+            and "best_of" in raw_request
+            and raw_request["best_of"] > 1
+        ):
             raw_request["best_of"] = 1
         return raw_request
+
+
+class VLLMChatClient(OpenAIClient):
+    """Sends request to a vLLM server using the OpenAI-compatible API.
+
+    Only uses the Chat Completions API.
+
+    See: https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server"""
+
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        tokenizer_name: str,
+        cache_config: CacheConfig,
+        base_url: Optional[str] = None,
+        vllm_model_name: Optional[str] = None,
+        **kwargs,
+    ):
+        super().__init__(
+            tokenizer=tokenizer,
+            tokenizer_name=tokenizer_name,
+            cache_config=cache_config,
+            api_key="EMPTY",
+            org_id=None,
+            base_url=base_url,
+            openai_model_name=vllm_model_name,
+            **kwargs,
+        )
+        self.tokenizer = tokenizer
+        self.tokenizer_name = tokenizer_name
+        self.vllm_model_name = vllm_model_name
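A usage sketch for the new VLLMChatClient, assuming a locally running vLLM OpenAI-compatible server; the base URL and model name below are illustrative placeholders (HELM normally builds clients from model_deployments.yaml rather than by hand like this):

# Illustrative wiring of VLLMChatClient against a local vLLM server.
from helm.clients.vllm_client import VLLMChatClient
from helm.common.cache import BlackHoleCacheConfig
from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

tokenizer_name = "meta-llama/Llama-3.1-8B-Instruct"  # example value only
client = VLLMChatClient(
    tokenizer=HuggingFaceTokenizer(cache_config=BlackHoleCacheConfig(), tokenizer_name=tokenizer_name),
    tokenizer_name=tokenizer_name,
    cache_config=BlackHoleCacheConfig(),
    base_url="http://localhost:8000/v1",  # example value only
    vllm_model_name="meta-llama/Llama-3.1-8B-Instruct",  # forwarded as openai_model_name
)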
--- /dev/null
+++ b/helm/clients/vllm_granite_thinking_client.py
@@ -0,0 +1,56 @@
+from dataclasses import replace
+import re
+from typing import Any, Dict, List, Tuple
+
+from helm.clients.vllm_client import VLLMChatClient
+from helm.common.request import GeneratedOutput, Request, RequestResult, Thinking
+
+
+class VLLMGraniteThinkingClient(VLLMChatClient):
+    """Sends request to a Granite model on vLLM server with thinking enabled.
+
+    From vLLM documentation at
+    https://docs.vllm.ai/en/v0.9.1/features/reasoning_outputs.html
+
+    IBM Granite 3.2 reasoning is disabled by default;
+    to enable it, you must also pass thinking=True in your chat_template_kwargs.
+    """
+
+    def _make_chat_raw_request(self, request: Request) -> Dict[str, Any]:
+        raw_request = super()._make_chat_raw_request(request)
+        raw_request["extra_body"] = {"chat_template_kwargs": {"thinking": True}}
+        return raw_request
+
+    def _parse_thinking(self, input: str) -> Tuple[str, str]:
+        """Return a tuple of thinking text and output text."""
+        match = re.match(r"<think>(.*)</think>\s*<response>(.*)</response>", input, re.DOTALL)
+        if match:
+            return (match.group(1), match.group(2))
+
+        match = re.match(r"<think>(.*)</think>\s*<response>(.*)", input, re.DOTALL)
+        if match:
+            return (match.group(1), match.group(2))
+
+        match = re.match(r"<think>(.*)</think>\s*", input, re.DOTALL)
+        if match:
+            return (match.group(1), "")
+
+        match = re.match(r"<think>(.*)", input, re.DOTALL)
+        if match:
+            return (match.group(1), "")
+
+        return (input, "")
+
+    def _make_chat_request(self, request: Request) -> RequestResult:
+        request_result = super()._make_chat_request(request)
+        modified_completions: List[GeneratedOutput] = []
+        for completion in request_result.completions:
+            thinking, modified_text = self._parse_thinking(completion.text)
+            modified_completions.append(
+                replace(
+                    completion,
+                    text=modified_text,
+                    thinking=Thinking(text=thinking),
+                )
+            )
+        return replace(request_result, completions=modified_completions)
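Unlike the Together parsers, Granite wraps the answer in an explicit <response> block, so _parse_thinking tries the most specific pattern first and degrades gracefully on truncated output. For example, the first pattern above behaves as follows (sample text invented for illustration):

import re

# First pattern from _parse_thinking above, applied to a fully formed Granite completion.
sample = "<think>weigh the options</think>\n<response>final answer</response>"
match = re.match(r"<think>(.*)</think>\s*<response>(.*)</response>", sample, re.DOTALL)
assert match is not None
assert match.group(1) == "weigh the options"
assert match.group(2) == "final answer"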
--- a/helm/clients/writer_client.py
+++ b/helm/clients/writer_client.py
@@ -2,8 +2,10 @@ from typing import Any, Dict, List, Mapping, Optional
 
 from helm.clients.client import CachingClient
 from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
+from helm.proxy.retry import NonRetriableException
 
 try:
     from writerai import Writer
@@ -19,9 +21,9 @@ class WriterClient(CachingClient):
 
     def _get_messages_from_request(self, request: Request) -> List[Dict]:
         if request.prompt and request.messages:
-            raise ValueError(f"Only one of `prompt` and `messages` may be set in request: {request}")
+            raise NonRetriableException(f"Only one of `prompt` and `messages` may be set in request: {request}")
         if request.multimodal_prompt:
-            raise ValueError("`multimodal_prompt` is not supported by WriterClient")
+            raise NonRetriableException("`multimodal_prompt` is not supported by WriterClient")
         if request.messages:
             return [{"role": message["role"], "content": message["content"]} for message in request.messages]
         else:
@@ -82,6 +84,7 @@ class WriterClient(CachingClient):
             raw_response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
             chat_completion: ChatCompletion = ChatCompletion.model_validate(raw_response)
         except Exception as error:
+            hexception(error)
             return RequestResult(
                 success=False,
                 cached=False,
--- a/helm/common/critique_request.py
+++ b/helm/common/critique_request.py
@@ -6,7 +6,6 @@ from helm.common.media_object import MediaObject
 class QuestionType:
     """String enum of question types."""
 
-    # TODO: Make this a StrEnum after upgrading to Python 3.11
     MULTIPLE_CHOICE: str = "multiple_choice"
     CHECKBOX: str = "checkbox"
     FREE_RESPONSE: str = "free_response"