crfm-helm 0.5.6__py3-none-any.whl → 0.5.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/METADATA +72 -130
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/RECORD +372 -305
- helm/benchmark/adaptation/adapter_spec.py +10 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +312 -1
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +10 -22
- helm/benchmark/presentation/summarize.py +189 -14
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +15 -4
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +2 -55
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +363 -53
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +11 -13
- helm/benchmark/runner.py +7 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -53
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -21
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -52
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +480 -1
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +32 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +21 -0
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +19 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +54 -20
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +20 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +21 -0
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +350 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +23 -1
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +17 -18
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +21 -8
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/bedrock_client.py +63 -6
- helm/clients/cohere_client.py +3 -0
- helm/clients/dspy_client.py +135 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +4 -3
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +38 -21
- helm/clients/openai_responses_client.py +34 -8
- helm/clients/openrouter_client.py +31 -0
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +52 -13
- helm/clients/vertexai_client.py +23 -11
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +5 -2
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +103 -34
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +1001 -187
- helm/config/model_metadata.yaml +602 -18
- helm/config/tokenizer_configs.yaml +202 -5
- helm/proxy/cli.py +1 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/tokenizers/auto_tokenizer.py +2 -2
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +0 -103
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/clients/test_openrouter_client.py ADDED

@@ -0,0 +1,69 @@
+import os
+import pytest
+import tempfile
+
+from helm.common.cache import BlackHoleCacheConfig, SqliteCacheConfig
+from helm.common.request import Request
+from helm.clients.openrouter_client import OpenRouterClient
+
+from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
+
+
+class TestOpenRouterClient:
+    def setup_method(self, method):
+        cache_file = tempfile.NamedTemporaryFile(delete=False)
+        self.cache_path: str = cache_file.name
+        self.tokenizer_name = "mistralai/Mistral-7B-v0.1"
+        self.tokenizer = HuggingFaceTokenizer(
+            cache_config=BlackHoleCacheConfig(),
+            tokenizer_name=self.tokenizer_name,
+        )
+
+    def teardown_method(self, method):
+        os.remove(self.cache_path)
+
+    @pytest.mark.parametrize(
+        "model_name,test_input,expected_model",
+        [
+            (
+                "mistralai/mistral-medium-3.1",
+                Request(
+                    model="mistralai/mistral-medium-3.1",
+                    model_deployment="openrouter/mistral-medium-3.1",
+                ),
+                "mistralai/mistral-medium-3.1",
+            ),
+            (
+                None,
+                Request(model="openai/gpt-oss-20b:free", model_deployment="openrouter/gpt-oss-20b:free"),
+                "openai/gpt-oss-20b:free",
+            ),
+        ],
+    )
+    def test_get_model_for_request(self, model_name, test_input, expected_model):
+        client = OpenRouterClient(
+            tokenizer_name=self.tokenizer_name,
+            tokenizer=self.tokenizer,
+            cache_config=SqliteCacheConfig(self.cache_path),
+            model_name=model_name,
+            api_key="test_key",
+        )
+        assert client._get_model_for_request(test_input) == expected_model
+
+    def test_api_key_env_var(self, monkeypatch):
+        monkeypatch.setenv("OPENROUTER_API_KEY", "test_key")
+        client = OpenRouterClient(
+            tokenizer_name=self.tokenizer_name,
+            tokenizer=self.tokenizer,
+            cache_config=SqliteCacheConfig(self.cache_path),
+        )
+        assert client.api_key == "test_key"
+
+    def test_api_key_argument(self):
+        client = OpenRouterClient(
+            tokenizer_name=self.tokenizer_name,
+            tokenizer=self.tokenizer,
+            cache_config=BlackHoleCacheConfig(),
+            api_key="explicit_key",
+        )
+        assert client.api_key == "explicit_key"
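For readers skimming the diff, the parametrized cases above assert a simple model-resolution rule. The sketch below is one plausible reading of what the tests check, stated in plain Python; it is an assumption, not the actual OpenRouterClient implementation.

# Assumed rule: an explicit model_name configured on the client is used as-is;
# otherwise the request's full "org/model" name is sent to OpenRouter.
def resolve_openrouter_model(client_model_name, request_model):
    return client_model_name if client_model_name is not None else request_model

assert resolve_openrouter_model("mistralai/mistral-medium-3.1", "mistralai/mistral-medium-3.1") == "mistralai/mistral-medium-3.1"
assert resolve_openrouter_model(None, "openai/gpt-oss-20b:free") == "openai/gpt-oss-20b:free"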
helm/clients/together_client.py CHANGED

@@ -9,6 +9,7 @@ import requests
 from retrying import retry
 
 from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.media_object import IMAGE_TYPE, TEXT_TYPE
 from helm.common.object_spec import get_class_by_name
 from helm.common.optional_dependencies import handle_module_not_found_error

@@ -25,8 +26,6 @@ except ModuleNotFoundError as e:
 class _RewriteRequestTags:
     """Tags that indicate that the request for the model must be rewritten before sending to Together."""
 
-    # TODO: Convert to StrEnum after upgrading to Python 3.11
-
     ADD_EOS_TOKEN_AS_STOP_SEQUENCE = "ADD_EOS_TOKEN_AS_STOP_SEQUENCE"
     """Indicates that the EOS token should be added as an extra stop sequence.
 
@@ -101,7 +100,20 @@ class JobNotFinishedError(TogetherClientError):
     pass
 
 
-def _parse_thinking(input: str) -> Tuple[str, str]:
+def _parse_thinking_deepseek_r1(input: str) -> Tuple[str, str]:
+    """Return a tuple of thinking text and output text."""
+    match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), match.group(2))
+
+    match = re.match(r"<think>\n?(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), "")
+
+    return (input, "")
+
+
+def _parse_thinking_qwen3(input: str) -> Tuple[str, str]:
     """Return a tuple of thinking text and output text."""
     match = re.match(r"<think>\n(.*)\n</think>\n{0,2}(.*)", input, re.DOTALL)
     if match:

@@ -114,6 +126,31 @@ def _parse_thinking(input: str) -> Tuple[str, str]:
     return (input, "")
 
 
+def _parse_thinking_glm_4_5(input: str) -> Tuple[str, str]:
+    """Return a tuple of thinking text and output text."""
+    match = re.match(r"\n<think>(.*)</think>(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), match.group(2))
+
+    match = re.match(r"\n<think>(.*)", input, re.DOTALL)
+    if match:
+        return (match.group(1), "")
+
+    return (input, "")
+
+
+def _parse_thinking(input: str, model_name: str) -> Tuple[str, str]:
+    # TODO: Come up with a more sustainable extensible way of doing this.
+    if "deepseek-r1" in model_name:
+        return _parse_thinking_deepseek_r1(input)
+    elif "qwen3" in model_name:
+        return _parse_thinking_qwen3(input)
+    elif "glm-4.5" in model_name:
+        return _parse_thinking_glm_4_5(input)
+    else:
+        raise Exception(f"No thinking parser available for model {model_name}")
+
+
 class TogetherClient(CachingClient):
     """
     Client for the models where we evaluate offline. Since the queries are handled offline, the `TogetherClient` just

@@ -237,6 +274,7 @@ class TogetherClient(CachingClient):
         try:
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it_sync))
         except Exception as error:
+            hexception(error)
             return RequestResult(
                 success=False,
                 cached=False,

@@ -348,9 +386,8 @@ class TogetherChatClient(CachingClient):
         self._client = Together(api_key=api_key)
         self._together_model = together_model
         self._disable_logprobs = bool(disable_logprobs)
-        # self.output_processor is actually a function, not a class
         self._parse_thinking = bool(parse_thinking)
-
+        # self.output_processor is actually a function, not a class
         self.output_processor: Optional[Callable[[str], str]] = (
             get_class_by_name(output_processor) if output_processor else None
         )

@@ -420,6 +457,7 @@ class TogetherChatClient(CachingClient):
             raw_response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
             response = ChatCompletionResponse.model_validate(raw_response)
         except Exception as error:
+            hexception(error)
             return RequestResult(
                 success=False,
                 cached=False,

@@ -446,15 +484,15 @@
             if self.output_processor:
                 output_text = self.output_processor(output_text)
 
+            thinking: Optional[Thinking] = None
             if self._parse_thinking:
-                thinking_text, output_text = _parse_thinking(output_text)
-
-
-
-
-                )
-
-            generated_outputs.append(GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens))
+                thinking_text, output_text = _parse_thinking(output_text, request.model)
+                thinking = Thinking(text=thinking_text)
+            elif hasattr(choice.message, "reasoning_content"):
+                thinking = Thinking(text=choice.message.reasoning_content)
+            generated_outputs.append(
+                GeneratedOutput(text=output_text, logprob=logprob, tokens=tokens, thinking=thinking)
+            )
         return RequestResult(
             success=True,
             cached=cached,

@@ -527,6 +565,7 @@ class TogetherCompletionClient(CachingClient):
             raw_response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
             response = CompletionResponse.model_validate(raw_response)
         except Exception as error:
+            hexception(error)
             return RequestResult(
                 success=False,
                 cached=False,
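A minimal usage sketch of the new per-model dispatcher above, assuming the module-level helpers land as shown in the hunk; the model name strings are illustrative, and anything without a registered parser raises rather than silently passing text through.

from helm.clients.together_client import _parse_thinking

# DeepSeek-R1-style completions wrap reasoning in <think>...</think> before the answer.
thinking, answer = _parse_thinking("<think>\nCheck both units.\n</think>\n\n42 km", "deepseek-r1-distill-llama-70b")
assert thinking == "Check both units."
assert answer == "42 km"

try:
    _parse_thinking("<think>\nhmm\n</think>\nhi", "unknown-model")
except Exception as err:
    print(err)  # No thinking parser available for model unknown-model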
helm/clients/vertexai_client.py CHANGED

@@ -1,9 +1,10 @@
 import requests
 from abc import ABC, abstractmethod
 from threading import Lock
-from typing import Any, Dict, Mapping, Optional, List, Union
+from typing import Any, Dict, Mapping, Optional, List, Union, cast
 
 from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.multimodal_request_utils import get_contents_as_bytes
 from helm.common.media_object import TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error

@@ -107,7 +108,7 @@ class VertexAITextClient(VertexAIClient):
 
     def make_request(self, request: Request) -> RequestResult:
         """Make a request"""
-        parameters = {
+        parameters: Dict[str, Any] = {
             "temperature": request.temperature,
             "max_output_tokens": request.max_tokens,
             "top_k": request.top_k_per_token,

@@ -152,6 +153,7 @@
 
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except (requests.exceptions.RequestException, AssertionError) as e:
+            hexception(e)
             error: str = f"VertexAITextClient error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 

@@ -207,21 +209,23 @@ class VertexAIChatClient(VertexAIClient):
 
     def make_request(self, request: Request) -> RequestResult:
         """Make a request"""
-
+        # mypy is unhappy without this cast
+        contents: Union[List[Union[str, Image, Part]], List[Content]] = cast(
+            List[Union[str, Image, Part]], [request.prompt]
+        )
 
         # For the multimodal case, build up the content with the media objects of `request.multimodal_prompt`
         if request.multimodal_prompt is not None:
             return self._make_multimodal_request(request)
 
         if request.messages is not None:
-            contents = []
             role_mapping = {"user": "user", "assistant": "model"}
-
-
-
-
+            contents = [
+                Content(role=role_mapping.get(msg["role"], "user"), parts=[Part.from_text(msg["content"])])
+                for msg in request.messages
+            ]
 
-        parameters = {
+        parameters: Dict[str, Any] = {
             "temperature": request.temperature,
             "max_output_tokens": request.max_tokens,
             "top_k": request.top_k_per_token,

@@ -274,8 +278,14 @@
             if not candidate.content:
                 raise VertexAIContentBlockedError(f"No content in candidate: {candidate}")
             if not candidate.content.parts:
-
-
+                if candidate.finish_reason == 2:  # MAX_TOKENS
+                    # This means that there is no text output because the maximum number of tokens were
+                    # reached during thinking.
+                    predictions.append({"text": ""})
+                else:
+                    raise VertexAIContentBlockedError(f"No content parts in candidate: {candidate}")
+            else:
+                predictions.append({"text": candidate.content.text})
             # TODO: Extract more information from the response
             return {"predictions": predictions}
 

@@ -302,6 +312,7 @@
                 error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
             )
         except (requests.exceptions.RequestException, AssertionError) as e:
+            hexception(e)
             error: str = f"VertexAITextClient error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
 

@@ -432,6 +443,7 @@
             cache_key = self.make_cache_key_with_safety_settings_preset(raw_cache_key, request)
             response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except requests.exceptions.RequestException as e:
+            hexception(e)
             error: str = f"Gemini Vision error: {e}"
             return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
         except VertexAIContentBlockedError as e:
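The empty-candidate handling added above can be paraphrased without the Vertex AI SDK as follows. This is a sketch with invented stand-in objects, not HELM code; finish_reason value 2 corresponds to MAX_TOKENS in the Gemini API.

class _FakeContent:
    def __init__(self, parts, text=""):
        self.parts = parts
        self.text = text

class _FakeCandidate:
    def __init__(self, content, finish_reason):
        self.content = content
        self.finish_reason = finish_reason

def prediction_for(candidate):
    # A candidate with no content parts is only an error when the finish reason
    # is not MAX_TOKENS; otherwise an empty text prediction is kept.
    if not candidate.content.parts:
        if candidate.finish_reason == 2:  # MAX_TOKENS reached while thinking
            return {"text": ""}
        raise ValueError("No content parts in candidate")
    return {"text": candidate.content.text}

print(prediction_for(_FakeCandidate(_FakeContent([]), finish_reason=2)))                   # {'text': ''}
print(prediction_for(_FakeCandidate(_FakeContent(["p"], text="Hello"), finish_reason=1)))  # {'text': 'Hello'}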
helm/clients/vision_language/huggingface_vision2seq_client.py CHANGED

@@ -8,7 +8,7 @@ import torch
 
 from helm.common.cache import CacheConfig
 from helm.common.gpu_utils import get_torch_device_name, is_cuda_available
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token
 from helm.common.request import wrap_request_time

@@ -125,6 +125,7 @@ class HuggingFaceVision2SeqClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as model_error:
+            hexception(model_error)
             return RequestResult(success=False, cached=False, error=str(model_error), completions=[], embedding=[])
 
         for text in result["output"]:

helm/clients/vision_language/huggingface_vlm_client.py CHANGED

@@ -5,6 +5,7 @@ from transformers import pipeline
 from transformers.pipelines import ImageToTextPipeline
 
 from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.images_utils import open_image
 from helm.common.media_object import TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error

@@ -93,6 +94,7 @@ class HuggingFaceVLMClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as e:
+            hexception(e)
             return RequestResult(success=False, cached=False, error=str(e), completions=[], embedding=[])
 
         output: str = result["generated_text"]

helm/clients/vision_language/idefics_client.py CHANGED

@@ -8,7 +8,7 @@ from transformers import IdeficsForVisionText2Text, AutoProcessor, IdeficsProces
 from helm.common.cache import CacheConfig
 from helm.common.images_utils import open_image
 from helm.common.gpu_utils import get_torch_device_name
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.media_object import TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token

@@ -137,6 +137,7 @@ class IDEFICSClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as model_error:
+            hexception(model_error)
             return RequestResult(success=False, cached=False, error=str(model_error), completions=[], embedding=[])
 
         for text in result["output"]:

helm/clients/vision_language/open_flamingo_client.py CHANGED

@@ -5,7 +5,7 @@ import torch
 from huggingface_hub import hf_hub_download
 
 from helm.common.cache import CacheConfig
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.images_utils import open_image
 from helm.common.gpu_utils import get_torch_device_name
 from helm.common.media_object import TEXT_TYPE

@@ -131,6 +131,7 @@ class OpenFlamingoClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as ex:
+            hexception(ex)
             return RequestResult(success=False, cached=False, error=str(ex), completions=[], embedding=[])
 
         completions: List[GeneratedOutput] = []

helm/clients/vision_language/paligemma_client.py CHANGED

@@ -8,7 +8,7 @@ from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
 from helm.common.cache import CacheConfig
 from helm.common.images_utils import open_image
 from helm.common.gpu_utils import get_torch_device_name
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.media_object import TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token

@@ -126,6 +126,7 @@ class PaliGemmaClient(CachingClient):
                 result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
                 concat_results.append(result)
             except RuntimeError as model_error:
+                hexception(model_error)
                 return RequestResult(success=False, cached=False, error=str(model_error), completions=[], embedding=[])
 
         for result in concat_results:

helm/clients/vision_language/palmyra_vision_client.py CHANGED

@@ -5,6 +5,7 @@ import requests
 
 from helm.common.cache import CacheConfig
 from helm.common.images_utils import encode_base64
+from helm.common.hierarchical_logger import hexception
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import Request, RequestResult, GeneratedOutput, ErrorFlags
 from helm.common.request import wrap_request_time

@@ -76,6 +77,7 @@ class PalmyraVisionClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except PalmyraVisionContentBlockedError as ex:
+            hexception(ex)
             return RequestResult(
                 success=False,
                 cached=False,

helm/clients/vision_language/qwen2_vlm_client.py CHANGED

@@ -8,7 +8,7 @@ import torch
 
 from helm.common.cache import CacheConfig
 from helm.common.gpu_utils import get_torch_device_name
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token
 from helm.common.request import wrap_request_time

@@ -157,6 +157,7 @@ class Qwen2VLMClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as model_error:
+            hexception(model_error)
             return RequestResult(
                 success=False,
                 cached=False,

helm/clients/vision_language/qwen_vlm_client.py CHANGED

@@ -7,7 +7,7 @@ from transformers.generation import GenerationConfig
 
 from helm.common.cache import CacheConfig
 from helm.common.gpu_utils import get_torch_device_name
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hexception, hlog, htrack_block
 from helm.common.media_object import TEXT_TYPE
 from helm.common.request import Request, RequestResult, GeneratedOutput, Token
 from helm.common.request import wrap_request_time

@@ -139,6 +139,7 @@ class QwenVLMClient(CachingClient):
             )
             result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
         except RuntimeError as model_error:
+            hexception(model_error)
             return RequestResult(
                 success=False, cached=False, error=str(model_error), completions=[], embedding=[]
             )
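All eight vision-language client hunks above apply the same change. Shown once here as a generic sketch, not any one client's code: the exception is logged into HELM's hierarchical log before a failed RequestResult is returned, instead of the traceback being discarded.

from helm.common.hierarchical_logger import hexception

def run_with_logging(do_it):
    try:
        return do_it()
    except RuntimeError as model_error:
        hexception(model_error)  # record the stack trace in the hierarchical log
        return None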
helm/clients/vllm_client.py CHANGED

@@ -2,7 +2,7 @@ from typing import Any, Dict, Optional
 
 from helm.common.cache import CacheConfig
 from helm.common.request import Request
-from helm.clients.openai_client import OpenAILegacyCompletionsClient
+from helm.clients.openai_client import OpenAIClient, OpenAILegacyCompletionsClient
 from helm.tokenizers.tokenizer import Tokenizer
 
 

@@ -19,6 +19,8 @@ class VLLMClient(OpenAILegacyCompletionsClient):
         tokenizer_name: str,
         cache_config: CacheConfig,
         base_url: Optional[str] = None,
+        vllm_model_name: Optional[str] = None,
+        **kwargs,
     ):
         super().__init__(
             tokenizer=tokenizer,

@@ -27,18 +29,52 @@
             api_key="EMPTY",
             org_id=None,
             base_url=base_url,
+            openai_model_name=vllm_model_name,
+            **kwargs,
         )
         self.tokenizer = tokenizer
         self.tokenizer_name = tokenizer_name
-
-    def _get_model_for_request(self, request: Request) -> str:
-        # The `model` parameter for vLLM should be the whole model name including the creator organization,
-        # unlike OpenAI which only uses the model engine.
-        return request.model
+        self.vllm_model_name = vllm_model_name
 
     def _to_raw_completion_request(self, request: Request) -> Dict[str, Any]:
         raw_request = super()._to_raw_completion_request(request)
         # This avoids the error: best_of must be 1 when using greedy sampling
-        if
+        if (
+            "temperature" in raw_request
+            and raw_request["temperature"] == 0.0
+            and "best_of" in raw_request
+            and raw_request["best_of"] > 1
+        ):
             raw_request["best_of"] = 1
         return raw_request
+
+
+class VLLMChatClient(OpenAIClient):
+    """Sends request to a vLLM server using the OpenAI-compatible API.
+
+    Only uses the Chat Completions API.
+
+    See: https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server"""
+
+    def __init__(
+        self,
+        tokenizer: Tokenizer,
+        tokenizer_name: str,
+        cache_config: CacheConfig,
+        base_url: Optional[str] = None,
+        vllm_model_name: Optional[str] = None,
+        **kwargs,
+    ):
+        super().__init__(
+            tokenizer=tokenizer,
+            tokenizer_name=tokenizer_name,
+            cache_config=cache_config,
+            api_key="EMPTY",
+            org_id=None,
+            base_url=base_url,
+            openai_model_name=vllm_model_name,
+            **kwargs,
+        )
+        self.tokenizer = tokenizer
+        self.tokenizer_name = tokenizer_name
+        self.vllm_model_name = vllm_model_name
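As a construction example for the new VLLMChatClient above: the base_url, tokenizer, and model name below are placeholders, and in practice HELM wires these arguments up from model_deployments.yaml rather than by direct construction.

from helm.common.cache import BlackHoleCacheConfig
from helm.clients.vllm_client import VLLMChatClient
from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

tokenizer_name = "meta-llama/Llama-3.1-8B-Instruct"  # placeholder
client = VLLMChatClient(
    tokenizer=HuggingFaceTokenizer(cache_config=BlackHoleCacheConfig(), tokenizer_name=tokenizer_name),
    tokenizer_name=tokenizer_name,
    cache_config=BlackHoleCacheConfig(),
    base_url="http://localhost:8000/v1",  # a local vLLM OpenAI-compatible server
    vllm_model_name="meta-llama/Llama-3.1-8B-Instruct",
)

The new VLLMGraniteThinkingClient in the next file subclasses this client to enable and parse Granite-style reasoning output.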
helm/clients/vllm_granite_thinking_client.py ADDED

@@ -0,0 +1,56 @@
+from dataclasses import replace
+import re
+from typing import Any, Dict, List, Tuple
+
+from helm.clients.vllm_client import VLLMChatClient
+from helm.common.request import GeneratedOutput, Request, RequestResult, Thinking
+
+
+class VLLMGraniteThinkingClient(VLLMChatClient):
+    """Sends request to a Granite model on vLLM server with thinking enabled.
+
+    From vLLM documentation at
+    https://docs.vllm.ai/en/v0.9.1/features/reasoning_outputs.html
+
+    IBM Granite 3.2 reasoning is disabled by default;
+    to enable it, you must also pass thinking=True in your chat_template_kwargs.
+    """
+
+    def _make_chat_raw_request(self, request: Request) -> Dict[str, Any]:
+        raw_request = super()._make_chat_raw_request(request)
+        raw_request["extra_body"] = {"chat_template_kwargs": {"thinking": True}}
+        return raw_request
+
+    def _parse_thinking(self, input: str) -> Tuple[str, str]:
+        """Return a tuple of thinking text and output text."""
+        match = re.match(r"<think>(.*)</think>\s*<response>(.*)</response>", input, re.DOTALL)
+        if match:
+            return (match.group(1), match.group(2))
+
+        match = re.match(r"<think>(.*)</think>\s*<response>(.*)", input, re.DOTALL)
+        if match:
+            return (match.group(1), match.group(2))
+
+        match = re.match(r"<think>(.*)</think>\s*", input, re.DOTALL)
+        if match:
+            return (match.group(1), "")
+
+        match = re.match(r"<think>(.*)", input, re.DOTALL)
+        if match:
+            return (match.group(1), "")
+
+        return (input, "")
+
+    def _make_chat_request(self, request: Request) -> RequestResult:
+        request_result = super()._make_chat_request(request)
+        modified_completions: List[GeneratedOutput] = []
+        for completion in request_result.completions:
+            thinking, modified_text = self._parse_thinking(completion.text)
+            modified_completions.append(
+                replace(
+                    completion,
+                    text=modified_text,
+                    thinking=Thinking(text=thinking),
+                )
+            )
+        return replace(request_result, completions=modified_completions)

helm/clients/writer_client.py CHANGED

@@ -2,8 +2,10 @@ from typing import Any, Dict, List, Mapping, Optional
 
 from helm.clients.client import CachingClient
 from helm.common.cache import CacheConfig
+from helm.common.hierarchical_logger import hexception
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
+from helm.proxy.retry import NonRetriableException
 
 try:
     from writerai import Writer

@@ -19,9 +21,9 @@ class WriterClient(CachingClient):
 
     def _get_messages_from_request(self, request: Request) -> List[Dict]:
         if request.prompt and request.messages:
-            raise
+            raise NonRetriableException(f"Only one of `prompt` and `messages` may be set in request: {request}")
         if request.multimodal_prompt:
-            raise
+            raise NonRetriableException("`multimodal_prompt` is not supported by WriterClient")
         if request.messages:
             return [{"role": message["role"], "content": message["content"]} for message in request.messages]
         else:

@@ -82,6 +84,7 @@
             raw_response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
             chat_completion: ChatCompletion = ChatCompletion.model_validate(raw_response)
         except Exception as error:
+            hexception(error)
             return RequestResult(
                 success=False,
                 cached=False,

helm/common/critique_request.py CHANGED

@@ -6,7 +6,6 @@ from helm.common.media_object import MediaObject
 class QuestionType:
     """String enum of question types."""
 
-    # TODO: Make this a StrEnum after upgrading to Python 3.11
     MULTIPLE_CHOICE: str = "multiple_choice"
     CHECKBOX: str = "checkbox"
     FREE_RESPONSE: str = "free_response"