crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crfm-helm might be problematic. See the package registry's advisory page for more details.

Files changed (652):
  1. crfm_helm-0.5.6.dist-info/METADATA +427 -0
  2. crfm_helm-0.5.6.dist-info/RECORD +941 -0
  3. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
  4. helm/benchmark/adaptation/adapter_spec.py +13 -1
  5. helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
  6. helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
  7. helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
  8. helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
  9. helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
  10. helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
  11. helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
  12. helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
  13. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
  14. helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
  15. helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
  16. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
  17. helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
  18. helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
  19. helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
  20. helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
  21. helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
  22. helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
  23. helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
  24. helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
  25. helm/benchmark/adaptation/common_adapter_specs.py +69 -4
  26. helm/benchmark/adaptation/prompt.py +1 -1
  27. helm/benchmark/annotation/aci_bench_annotator.py +95 -0
  28. helm/benchmark/annotation/air_bench_annotator.py +21 -6
  29. helm/benchmark/annotation/annotator.py +5 -0
  30. helm/benchmark/annotation/annotator_factory.py +3 -20
  31. helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
  32. helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
  33. helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
  34. helm/benchmark/annotation/bird_sql_annotator.py +58 -0
  35. helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
  36. helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
  37. helm/benchmark/annotation/dischargeme_annotator.py +107 -0
  38. helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
  39. helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
  40. helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
  41. helm/benchmark/annotation/live_qa_annotator.py +1 -1
  42. helm/benchmark/annotation/med_dialog_annotator.py +99 -0
  43. helm/benchmark/annotation/medalign_annotator.py +100 -0
  44. helm/benchmark/annotation/medi_qa_annotator.py +98 -0
  45. helm/benchmark/annotation/medication_qa_annotator.py +87 -63
  46. helm/benchmark/annotation/mental_health_annotator.py +98 -0
  47. helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
  48. helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
  49. helm/benchmark/annotation/model_as_judge.py +214 -6
  50. helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
  51. helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
  52. helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
  53. helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
  54. helm/benchmark/annotation/omni_math_annotator.py +131 -0
  55. helm/benchmark/annotation/spider_annotator.py +18 -0
  56. helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
  57. helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
  58. helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
  59. helm/benchmark/annotation/wildbench_annotator.py +119 -0
  60. helm/benchmark/annotation_executor.py +35 -15
  61. helm/benchmark/augmentations/cleva_perturbation.py +9 -8
  62. helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
  63. helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
  64. helm/benchmark/augmentations/dialect_perturbation.py +4 -5
  65. helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
  66. helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
  67. helm/benchmark/augmentations/gender_perturbation.py +2 -2
  68. helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
  69. helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
  70. helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
  71. helm/benchmark/augmentations/person_name_perturbation.py +4 -5
  72. helm/benchmark/augmentations/perturbation.py +1 -1
  73. helm/benchmark/augmentations/space_perturbation.py +2 -2
  74. helm/benchmark/augmentations/suffix_perturbation.py +2 -2
  75. helm/benchmark/augmentations/synonym_perturbation.py +4 -3
  76. helm/benchmark/augmentations/test_perturbation.py +16 -13
  77. helm/benchmark/augmentations/translate_perturbation.py +2 -2
  78. helm/benchmark/augmentations/typos_perturbation.py +2 -2
  79. helm/benchmark/data_preprocessor.py +2 -2
  80. helm/benchmark/executor.py +11 -12
  81. helm/benchmark/huggingface_registration.py +2 -7
  82. helm/benchmark/metrics/aci_bench_metrics.py +14 -0
  83. helm/benchmark/metrics/basic_metrics.py +6 -6
  84. helm/benchmark/metrics/bbq_metrics.py +2 -2
  85. helm/benchmark/metrics/bias_metrics.py +12 -3
  86. helm/benchmark/metrics/bias_word_lists.py +1 -1
  87. helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
  88. helm/benchmark/metrics/bird_sql_metrics.py +28 -0
  89. helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
  90. helm/benchmark/metrics/classification_metrics.py +76 -12
  91. helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
  92. helm/benchmark/metrics/code_metrics.py +5 -5
  93. helm/benchmark/metrics/comet_metric.py +125 -0
  94. helm/benchmark/metrics/common_metric_specs.py +9 -2
  95. helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
  96. helm/benchmark/metrics/copyright_metrics.py +4 -4
  97. helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
  98. helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
  99. helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
  100. helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
  101. helm/benchmark/metrics/dischargeme_metrics.py +14 -0
  102. helm/benchmark/metrics/disinformation_metrics.py +4 -4
  103. helm/benchmark/metrics/dry_run_metrics.py +5 -5
  104. helm/benchmark/metrics/efficiency_metrics.py +6 -6
  105. helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
  106. helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
  107. helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
  108. helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
  109. helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
  110. helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
  111. helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
  112. helm/benchmark/metrics/ifeval/__init__.py +0 -0
  113. helm/benchmark/metrics/ifeval/instructions.py +1574 -0
  114. helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
  115. helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
  116. helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
  117. helm/benchmark/metrics/ifeval_metrics.py +55 -0
  118. helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
  119. helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
  120. helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
  121. helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
  122. helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
  123. helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
  124. helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
  125. helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
  126. helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
  127. helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
  128. helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
  129. helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
  130. helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
  131. helm/benchmark/metrics/language_modeling_metrics.py +4 -4
  132. helm/benchmark/metrics/llm_jury_metrics.py +46 -0
  133. helm/benchmark/metrics/machine_translation_metrics.py +2 -2
  134. helm/benchmark/metrics/med_dialog_metrics.py +14 -0
  135. helm/benchmark/metrics/medalign_metrics.py +14 -0
  136. helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
  137. helm/benchmark/metrics/medec_metrics.py +101 -0
  138. helm/benchmark/metrics/medi_qa_metrics.py +14 -0
  139. helm/benchmark/metrics/medication_qa_metrics.py +10 -19
  140. helm/benchmark/metrics/melt_bias_metric.py +234 -0
  141. helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
  142. helm/benchmark/metrics/melt_metric_specs.py +43 -0
  143. helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
  144. helm/benchmark/metrics/mental_health_metrics.py +14 -0
  145. helm/benchmark/metrics/metric.py +3 -3
  146. helm/benchmark/metrics/metric_service.py +11 -11
  147. helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
  148. helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
  149. helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
  150. helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
  151. helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
  152. helm/benchmark/metrics/nltk_helper.py +32 -0
  153. helm/benchmark/metrics/numeracy_metrics.py +4 -4
  154. helm/benchmark/metrics/omni_math_metrics.py +32 -0
  155. helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
  156. helm/benchmark/metrics/output_processing_metric.py +60 -0
  157. helm/benchmark/metrics/output_processors.py +15 -0
  158. helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
  159. helm/benchmark/metrics/ranking_metrics.py +3 -3
  160. helm/benchmark/metrics/reference_metric.py +3 -3
  161. helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
  162. helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
  163. helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
  164. helm/benchmark/metrics/spider_metrics.py +7 -0
  165. helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
  166. helm/benchmark/metrics/statistic.py +1 -1
  167. helm/benchmark/metrics/summac/model_summac.py +2 -3
  168. helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
  169. helm/benchmark/metrics/summarization_metrics.py +20 -9
  170. helm/benchmark/metrics/test_bias_metrics.py +5 -1
  171. helm/benchmark/metrics/test_classification_metrics.py +140 -68
  172. helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
  173. helm/benchmark/metrics/test_metric.py +1 -1
  174. helm/benchmark/metrics/test_statistic.py +2 -2
  175. helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
  176. helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
  177. helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
  178. helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
  179. helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
  180. helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
  181. helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
  182. helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
  183. helm/benchmark/metrics/toxicity_metrics.py +6 -6
  184. helm/benchmark/metrics/unitxt_metrics.py +7 -5
  185. helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
  186. helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
  187. helm/benchmark/metrics/vision_language/image_utils.py +2 -2
  188. helm/benchmark/metrics/wildbench_metrics.py +34 -0
  189. helm/benchmark/model_deployment_registry.py +6 -8
  190. helm/benchmark/model_metadata_registry.py +16 -0
  191. helm/benchmark/presentation/contamination.py +3 -3
  192. helm/benchmark/presentation/create_plots.py +33 -12
  193. helm/benchmark/presentation/run_display.py +13 -0
  194. helm/benchmark/presentation/schema.py +2 -1
  195. helm/benchmark/presentation/summarize.py +97 -67
  196. helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
  197. helm/benchmark/reeval_run.py +202 -0
  198. helm/benchmark/reeval_runner.py +355 -0
  199. helm/benchmark/run.py +86 -90
  200. helm/benchmark/run_expander.py +90 -9
  201. helm/benchmark/run_spec_factory.py +13 -0
  202. helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
  203. helm/benchmark/run_specs/audio_run_specs.py +657 -0
  204. helm/benchmark/run_specs/call_center_run_specs.py +49 -0
  205. helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
  206. helm/benchmark/run_specs/classic_run_specs.py +1 -69
  207. helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
  208. helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
  209. helm/benchmark/run_specs/experimental_run_specs.py +142 -3
  210. helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
  211. helm/benchmark/run_specs/lite_run_specs.py +2 -2
  212. helm/benchmark/run_specs/long_context_run_specs.py +141 -0
  213. helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
  214. helm/benchmark/run_specs/melt_run_specs.py +783 -0
  215. helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
  216. helm/benchmark/run_specs/oab_exams_specs.py +32 -0
  217. helm/benchmark/run_specs/safety_run_specs.py +37 -0
  218. helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
  219. helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
  220. helm/benchmark/run_specs/sql_run_specs.py +54 -0
  221. helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
  222. helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
  223. helm/benchmark/run_specs/vlm_run_specs.py +103 -2
  224. helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
  225. helm/benchmark/runner.py +5 -5
  226. helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
  227. helm/benchmark/scenarios/air_bench_scenario.py +6 -1
  228. helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
  229. helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
  230. helm/benchmark/scenarios/audio_language/__init__.py +0 -0
  231. helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
  232. helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
  233. helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
  234. helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
  235. helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
  236. helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
  237. helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
  238. helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
  239. helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
  240. helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
  241. helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
  242. helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
  243. helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
  244. helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
  245. helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
  246. helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
  247. helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
  248. helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
  249. helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
  250. helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
  251. helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
  252. helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
  253. helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
  254. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
  255. helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
  256. helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
  257. helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
  258. helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
  259. helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
  260. helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
  261. helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
  262. helm/benchmark/scenarios/banking77_scenario.py +6 -1
  263. helm/benchmark/scenarios/bbq_scenario.py +1 -1
  264. helm/benchmark/scenarios/big_bench_scenario.py +11 -1
  265. helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
  266. helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
  267. helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
  268. helm/benchmark/scenarios/blimp_scenario.py +1 -1
  269. helm/benchmark/scenarios/bold_scenario.py +1 -1
  270. helm/benchmark/scenarios/boolq_scenario.py +1 -1
  271. helm/benchmark/scenarios/casehold_scenario.py +79 -0
  272. helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
  273. helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
  274. helm/benchmark/scenarios/clear_scenario.py +157 -0
  275. helm/benchmark/scenarios/cleva_scenario.py +2 -2
  276. helm/benchmark/scenarios/code_scenario.py +17 -4
  277. helm/benchmark/scenarios/commonsense_scenario.py +1 -1
  278. helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
  279. helm/benchmark/scenarios/copyright_scenario.py +1 -1
  280. helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
  281. helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
  282. helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
  283. helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
  284. helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
  285. helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
  286. helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
  287. helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
  288. helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
  289. helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
  290. helm/benchmark/scenarios/disinformation_scenario.py +10 -1
  291. helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
  292. helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
  293. helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
  294. helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
  295. helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
  296. helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
  297. helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
  298. helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
  299. helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
  300. helm/benchmark/scenarios/gpqa_scenario.py +80 -0
  301. helm/benchmark/scenarios/grammar.py +2 -2
  302. helm/benchmark/scenarios/grammar_scenario.py +2 -2
  303. helm/benchmark/scenarios/gsm_scenario.py +10 -1
  304. helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
  305. helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
  306. helm/benchmark/scenarios/headqa_scenario.py +136 -0
  307. helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
  308. helm/benchmark/scenarios/ice_scenario.py +8 -4
  309. helm/benchmark/scenarios/ifeval_scenario.py +53 -0
  310. helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
  311. helm/benchmark/scenarios/imdb_scenario.py +11 -2
  312. helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
  313. helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
  314. helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
  315. helm/benchmark/scenarios/koala_scenario.py +1 -1
  316. helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
  317. helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
  318. helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
  319. helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
  320. helm/benchmark/scenarios/legal_support_scenario.py +11 -1
  321. helm/benchmark/scenarios/legalbench_scenario.py +22 -3
  322. helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
  323. helm/benchmark/scenarios/lextreme_scenario.py +11 -1
  324. helm/benchmark/scenarios/live_qa_scenario.py +1 -1
  325. helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
  326. helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
  327. helm/benchmark/scenarios/math_scenario.py +9 -1
  328. helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
  329. helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
  330. helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
  331. helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
  332. helm/benchmark/scenarios/med_qa_scenario.py +10 -1
  333. helm/benchmark/scenarios/medalign_scenario.py +94 -0
  334. helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
  335. helm/benchmark/scenarios/medbullets_scenario.py +145 -0
  336. helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
  337. helm/benchmark/scenarios/medec_scenario.py +125 -0
  338. helm/benchmark/scenarios/medhallu_scenario.py +72 -0
  339. helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
  340. helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
  341. helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
  342. helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
  343. helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
  344. helm/benchmark/scenarios/melt_scenarios.py +793 -0
  345. helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
  346. helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
  347. helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
  348. helm/benchmark/scenarios/mental_health_scenario.py +123 -0
  349. helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
  350. helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
  351. helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
  352. helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
  353. helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
  354. helm/benchmark/scenarios/mmlu_scenario.py +11 -1
  355. helm/benchmark/scenarios/msmarco_scenario.py +1 -1
  356. helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
  357. helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
  358. helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
  359. helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
  360. helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
  361. helm/benchmark/scenarios/newsqa_scenario.py +1 -1
  362. helm/benchmark/scenarios/numeracy_scenario.py +12 -2
  363. helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
  364. helm/benchmark/scenarios/omni_math_scenario.py +53 -0
  365. helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
  366. helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
  367. helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
  368. helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
  369. helm/benchmark/scenarios/quac_scenario.py +10 -1
  370. helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
  371. helm/benchmark/scenarios/raft_scenario.py +17 -2
  372. helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
  373. helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
  374. helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
  375. helm/benchmark/scenarios/scenario.py +9 -1
  376. helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
  377. helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
  378. helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
  379. helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
  380. helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
  381. helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
  382. helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
  383. helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
  384. helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
  385. helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
  386. helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
  387. helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
  388. helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
  389. helm/benchmark/scenarios/spider_scenario.py +91 -0
  390. helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
  391. helm/benchmark/scenarios/summarization_scenario.py +11 -1
  392. helm/benchmark/scenarios/sumosum_scenario.py +157 -0
  393. helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
  394. helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
  395. helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
  396. helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
  397. helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
  398. helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
  399. helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
  400. helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
  401. helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
  402. helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
  403. helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
  404. helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
  405. helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
  406. helm/benchmark/scenarios/test_math_scenario.py +1 -0
  407. helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
  408. helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
  409. helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
  410. helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
  411. helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
  412. helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
  413. helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
  414. helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
  415. helm/benchmark/scenarios/the_pile_scenario.py +1 -1
  416. helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
  417. helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
  418. helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
  419. helm/benchmark/scenarios/unitxt_scenario.py +8 -2
  420. helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
  421. helm/benchmark/scenarios/vicuna_scenario.py +1 -1
  422. helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
  423. helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
  424. helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
  425. helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
  426. helm/benchmark/scenarios/wikifact_scenario.py +11 -1
  427. helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
  428. helm/benchmark/scenarios/wildbench_scenario.py +83 -0
  429. helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
  430. helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
  431. helm/benchmark/scenarios/xstest_scenario.py +1 -1
  432. helm/benchmark/server.py +13 -1
  433. helm/benchmark/slurm_runner.py +1 -1
  434. helm/benchmark/static/schema_audio.yaml +763 -0
  435. helm/benchmark/static/schema_autobencher.yaml +150 -0
  436. helm/benchmark/static/schema_call_center.yaml +97 -60
  437. helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
  438. helm/benchmark/static/schema_czech_bank.yaml +148 -0
  439. helm/benchmark/static/schema_enem_challenge.yaml +146 -0
  440. helm/benchmark/static/schema_enterprise.yaml +319 -0
  441. helm/benchmark/static/schema_finance.yaml +14 -12
  442. helm/benchmark/static/schema_heim.yaml +1389 -0
  443. helm/benchmark/static/schema_long_context.yaml +283 -0
  444. helm/benchmark/static/schema_medhelm.yaml +1140 -0
  445. helm/benchmark/static/schema_melt.yaml +1257 -0
  446. helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
  447. helm/benchmark/static/schema_safety.yaml +18 -1
  448. helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
  449. helm/benchmark/static/schema_slphelm.yaml +162 -0
  450. helm/benchmark/static/schema_social_audio.yaml +224 -0
  451. helm/benchmark/static/schema_sql.yaml +171 -0
  452. helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
  453. helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
  454. helm/benchmark/static/schema_vhelm.yaml +129 -56
  455. helm/benchmark/static/schema_video.yaml +219 -0
  456. helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
  457. helm/benchmark/static_build/assets/index-94295e78.js +10 -0
  458. helm/benchmark/static_build/assets/index-b9779128.css +1 -0
  459. helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
  460. helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
  461. helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
  462. helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
  463. helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
  464. helm/benchmark/static_build/config.js +1 -1
  465. helm/benchmark/static_build/index.html +6 -6
  466. helm/benchmark/window_services/default_window_service.py +1 -1
  467. helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
  468. helm/benchmark/window_services/ice_window_service.py +1 -1
  469. helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
  470. helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
  471. helm/benchmark/window_services/local_window_service.py +2 -2
  472. helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
  473. helm/benchmark/window_services/test_bloom_window_service.py +3 -3
  474. helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
  475. helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
  476. helm/benchmark/window_services/test_gptj_window_service.py +8 -3
  477. helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
  478. helm/benchmark/window_services/test_openai_window_service.py +8 -3
  479. helm/benchmark/window_services/test_opt_window_service.py +3 -3
  480. helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
  481. helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
  482. helm/benchmark/window_services/test_t511b_window_service.py +3 -3
  483. helm/benchmark/window_services/test_ul2_window_service.py +3 -3
  484. helm/benchmark/window_services/test_utils.py +4 -5
  485. helm/benchmark/window_services/test_yalm_window_service.py +3 -3
  486. helm/benchmark/window_services/tokenizer_service.py +7 -8
  487. helm/benchmark/window_services/yalm_window_service.py +1 -1
  488. helm/clients/ai21_client.py +3 -3
  489. helm/clients/aleph_alpha_client.py +1 -1
  490. helm/clients/anthropic_client.py +69 -29
  491. helm/clients/audio_language/__init__.py +0 -0
  492. helm/clients/audio_language/diva_llama_client.py +120 -0
  493. helm/clients/audio_language/llama_omni_client.py +198 -0
  494. helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
  495. helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
  496. helm/clients/audio_language/qwen_audiolm_client.py +152 -0
  497. helm/clients/audio_language/test.py +62 -0
  498. helm/clients/auto_client.py +4 -2
  499. helm/clients/azure_openai_client.py +55 -0
  500. helm/clients/bedrock_client.py +203 -7
  501. helm/clients/bedrock_utils.py +33 -0
  502. helm/clients/client.py +7 -7
  503. helm/clients/clip_scorers/clip_scorer.py +1 -1
  504. helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
  505. helm/clients/cohere_client.py +3 -3
  506. helm/clients/google_client.py +1 -1
  507. helm/clients/grok_client.py +36 -0
  508. helm/clients/http_model_client.py +1 -1
  509. helm/clients/huggingface_client.py +52 -21
  510. helm/clients/huggingface_pipeline_client.py +138 -0
  511. helm/clients/ibm_client.py +267 -0
  512. helm/clients/image_generation/adobe_vision_client.py +1 -1
  513. helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
  514. helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
  515. helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
  516. helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
  517. helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
  518. helm/clients/image_generation/cogview2_client.py +1 -1
  519. helm/clients/image_generation/dalle2_client.py +1 -1
  520. helm/clients/image_generation/dalle3_client.py +2 -2
  521. helm/clients/image_generation/dalle_mini/__init__.py +1 -1
  522. helm/clients/image_generation/dalle_mini/data.py +1 -1
  523. helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
  524. helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
  525. helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
  526. helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
  527. helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
  528. helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
  529. helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
  530. helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
  531. helm/clients/image_generation/dalle_mini_client.py +1 -1
  532. helm/clients/image_generation/deep_floyd_client.py +1 -1
  533. helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
  534. helm/clients/image_generation/lexica_client.py +1 -1
  535. helm/clients/image_generation/mindalle/models/__init__.py +6 -6
  536. helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
  537. helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
  538. helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
  539. helm/clients/image_generation/mindalle_client.py +1 -1
  540. helm/clients/image_generation/together_image_generation_client.py +1 -1
  541. helm/clients/lit_gpt_client.py +2 -2
  542. helm/clients/mistral_client.py +62 -18
  543. helm/clients/nvidia_nim_client.py +0 -3
  544. helm/clients/openai_client.py +308 -43
  545. helm/clients/openai_responses_client.py +174 -0
  546. helm/clients/palmyra_client.py +3 -9
  547. helm/clients/reka_client.py +3 -3
  548. helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
  549. helm/clients/stanfordhealthcare_claude_client.py +31 -0
  550. helm/clients/stanfordhealthcare_google_client.py +43 -0
  551. helm/clients/stanfordhealthcare_http_model_client.py +93 -0
  552. helm/clients/stanfordhealthcare_openai_client.py +62 -0
  553. helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
  554. helm/clients/test_client.py +1 -1
  555. helm/clients/test_together_client.py +6 -1
  556. helm/clients/together_client.py +76 -9
  557. helm/clients/upstage_client.py +23 -0
  558. helm/clients/vertexai_client.py +45 -13
  559. helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
  560. helm/clients/vision_language/huggingface_vlm_client.py +2 -2
  561. helm/clients/vision_language/idefics_client.py +6 -2
  562. helm/clients/vision_language/open_flamingo/__init__.py +2 -2
  563. helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
  564. helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
  565. helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
  566. helm/clients/vision_language/paligemma_client.py +2 -2
  567. helm/clients/vision_language/qwen2_vlm_client.py +188 -0
  568. helm/clients/vision_language/qwen_vlm_client.py +7 -5
  569. helm/clients/vllm_client.py +4 -6
  570. helm/clients/writer_client.py +102 -0
  571. helm/clients/yi_client.py +0 -3
  572. helm/common/audio_utils.py +111 -0
  573. helm/common/context.py +80 -0
  574. helm/common/credentials_utils.py +5 -5
  575. helm/common/file_caches/local_file_cache.py +1 -1
  576. helm/common/file_caches/test_local_file_cache.py +1 -1
  577. helm/common/general.py +9 -2
  578. helm/common/hierarchical_logger.py +46 -3
  579. helm/common/images_utils.py +2 -2
  580. helm/common/local_context.py +140 -0
  581. helm/common/media_object.py +2 -2
  582. helm/common/multimodal_request_utils.py +26 -0
  583. helm/common/reeval_parameters.py +12 -0
  584. helm/common/remote_context.py +61 -0
  585. helm/common/request.py +14 -2
  586. helm/common/response_format.py +18 -0
  587. helm/common/test_media_object.py +1 -1
  588. helm/config/model_deployments.yaml +1792 -28
  589. helm/config/model_metadata.yaml +1606 -51
  590. helm/config/tokenizer_configs.yaml +521 -4
  591. helm/proxy/cli.py +5 -3
  592. helm/proxy/critique/mechanical_turk_utils.py +1 -1
  593. helm/proxy/example_queries.py +1 -1
  594. helm/proxy/server.py +11 -4
  595. helm/proxy/services/remote_service.py +1 -1
  596. helm/proxy/services/server_service.py +22 -86
  597. helm/proxy/services/test_remote_service.py +2 -2
  598. helm/proxy/services/test_service.py +1 -1
  599. helm/proxy/static/general.js +122 -0
  600. helm/proxy/static/help.html +99 -0
  601. helm/proxy/static/index.css +57 -0
  602. helm/proxy/static/index.html +40 -0
  603. helm/proxy/static/index.js +456 -0
  604. helm/proxy/static/info-icon.png +0 -0
  605. helm/proxy/test_retry.py +1 -1
  606. helm/proxy/token_counters/auto_token_counter.py +1 -1
  607. helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
  608. helm/tokenizers/caching_tokenizer.py +2 -30
  609. helm/tokenizers/grok_tokenizer.py +53 -0
  610. helm/tokenizers/http_model_tokenizer.py +1 -1
  611. helm/tokenizers/huggingface_tokenizer.py +3 -3
  612. helm/tokenizers/lit_gpt_tokenizer.py +1 -1
  613. helm/tokenizers/test_anthropic_tokenizer.py +6 -2
  614. helm/tokenizers/test_grok_tokenizer.py +33 -0
  615. helm/tokenizers/test_huggingface_tokenizer.py +1 -1
  616. helm/tokenizers/test_yalm_tokenizer.py +1 -1
  617. helm/tokenizers/tiktoken_tokenizer.py +1 -1
  618. helm/tokenizers/tokenizer.py +3 -1
  619. helm/tokenizers/yalm_tokenizer.py +3 -3
  620. helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
  621. crfm_helm-0.5.4.dist-info/METADATA +0 -350
  622. crfm_helm-0.5.4.dist-info/RECORD +0 -697
  623. helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
  624. helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
  625. helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
  626. helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
  627. helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
  628. helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
  629. helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
  630. helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
  631. helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
  632. helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
  633. helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
  634. helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
  635. helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
  636. helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
  637. helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
  638. helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
  639. helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
  640. helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
  641. helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
  642. helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
  643. helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
  644. helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
  645. helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
  646. helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
  647. helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
  648. helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
  649. helm/tokenizers/anthropic_tokenizer.py +0 -52
  650. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
  651. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
  652. {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
@@ -4,6 +4,7 @@ from threading import Lock
4
4
  from typing import Any, Dict, Mapping, Optional, List, Union
5
5
 
6
6
  from helm.common.cache import CacheConfig
7
+ from helm.common.multimodal_request_utils import get_contents_as_bytes
7
8
  from helm.common.media_object import TEXT_TYPE
8
9
  from helm.common.optional_dependencies import handle_module_not_found_error
9
10
  from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, ErrorFlags
@@ -12,7 +13,14 @@ from helm.clients.client import CachingClient, truncate_sequence, generate_uid_f
12
13
  try:
13
14
  import vertexai
14
15
  from vertexai.language_models import TextGenerationModel, TextGenerationResponse # PaLM2
15
- from vertexai.preview.generative_models import GenerativeModel, GenerationResponse, Candidate, Part, Image # Gemini
16
+ from vertexai.preview.generative_models import (
17
+ GenerativeModel,
18
+ GenerationResponse,
19
+ Candidate,
20
+ Content,
21
+ Part,
22
+ Image,
23
+ ) # Gemini
16
24
  from google.cloud.aiplatform_v1beta1.types import SafetySetting, HarmCategory
17
25
  except ModuleNotFoundError as e:
18
26
  handle_module_not_found_error(e, ["google"])
@@ -48,17 +56,16 @@ def _get_safety_settings_for_preset(
48
56
  raise ValueError(f"Unknown safety_settings_preset: {safety_settings_preset}")
49
57
 
50
58
 
51
- def _get_model_name_for_request(request: Request) -> str:
52
- # We have to strip "-safety-" suffixes from model names because they are not part of the Vertex AI model name
53
- # TODO: Clean up this hack
54
- return request.model_engine.split("-safety-")[0]
55
-
56
-
57
59
  class VertexAIClient(CachingClient, ABC):
58
60
  """Client for Vertex AI models"""
59
61
 
60
62
  def __init__(
61
- self, cache_config: CacheConfig, project_id: str, location: str, safety_settings_preset: Optional[str] = None
63
+ self,
64
+ cache_config: CacheConfig,
65
+ project_id: str,
66
+ location: str,
67
+ safety_settings_preset: Optional[str] = None,
68
+ vertexai_model: Optional[str] = None,
62
69
  ) -> None:
63
70
  super().__init__(cache_config=cache_config)
64
71
  self.project_id = project_id
@@ -67,8 +74,15 @@ class VertexAIClient(CachingClient, ABC):
67
74
  self.safety_settings_preset = safety_settings_preset
68
75
  self.safety_settings = _get_safety_settings_for_preset(safety_settings_preset)
69
76
 
77
+ self.vertexai_model = vertexai_model
78
+
70
79
  vertexai.init(project=self.project_id, location=self.location)
71
80
 
81
+ def _get_model_name_for_request(self, request: Request) -> str:
82
+ if self.vertexai_model is not None:
83
+ return self.vertexai_model
84
+ return request.model_engine
85
+
72
86
  def make_cache_key_with_safety_settings_preset(self, raw_request: Mapping, request: Request) -> Mapping:
73
87
  """Construct the key for the cache using the raw request.
74
88
 
@@ -111,7 +125,7 @@ class VertexAITextClient(VertexAIClient):
111
125
  }
112
126
 
113
127
  completions: List[GeneratedOutput] = []
114
- model_name: str = _get_model_name_for_request(request)
128
+ model_name: str = self._get_model_name_for_request(request)
115
129
 
116
130
  try:
117
131
 
@@ -193,12 +207,20 @@ class VertexAIChatClient(VertexAIClient):
193
207
 
194
208
  def make_request(self, request: Request) -> RequestResult:
195
209
  """Make a request"""
196
- contents: str = request.prompt
210
+ contents = [request.prompt]
197
211
 
198
212
  # For the multimodal case, build up the content with the media objects of `request.multimodal_prompt`
199
213
  if request.multimodal_prompt is not None:
200
214
  return self._make_multimodal_request(request)
201
215
 
216
+ if request.messages is not None:
217
+ contents = []
218
+ role_mapping = {"user": "user", "assistant": "model"}
219
+ for msg in request.messages:
220
+ contents.append(
221
+ Content(role=role_mapping.get(msg["role"], "user"), parts=[Part.from_text(msg["content"])])
222
+ )
223
+
202
224
  parameters = {
203
225
  "temperature": request.temperature,
204
226
  "max_output_tokens": request.max_tokens,
@@ -217,7 +239,7 @@ class VertexAIChatClient(VertexAIClient):
217
239
  }
218
240
 
219
241
  completions: List[GeneratedOutput] = []
220
- model_name: str = _get_model_name_for_request(request)
242
+ model_name: str = self._get_model_name_for_request(request)
221
243
  model = self.get_model(model_name)
222
244
 
223
245
  try:
@@ -263,7 +285,7 @@ class VertexAIChatClient(VertexAIClient):
263
285
  cache_key = self.make_cache_key_with_safety_settings_preset(
264
286
  {
265
287
  "model_name": model_name,
266
- "prompt": request.prompt,
288
+ "prompt": request.messages or request.prompt,
267
289
  **parameters,
268
290
  },
269
291
  request,
@@ -338,6 +360,16 @@ class VertexAIChatClient(VertexAIClient):
338
360
  for media_object in request.multimodal_prompt.media_objects:
339
361
  if media_object.is_type("image") and media_object.location:
340
362
  contents.append(Part.from_image(Image.load_from_file(media_object.location)))
363
+ elif media_object.is_type("video") and media_object.location:
364
+ # Following this example
365
+ # https://cloud.google.com/vertex-ai/generative-ai/docs/samples/googlegenaisdk-textgen-with-local-video
366
+ with open(media_object.location, "rb") as fp:
367
+ video_content = fp.read()
368
+ contents.append(Part.from_data(data=video_content, mime_type=media_object.content_type))
369
+ elif media_object.is_type("audio") and media_object.location:
370
+ contents.append(
371
+ Part.from_data(get_contents_as_bytes(media_object.location), mime_type=media_object.content_type)
372
+ )
341
373
  elif media_object.is_type(TEXT_TYPE):
342
374
  if media_object.text is None:
343
375
  raise ValueError("MediaObject of text type has missing text field value")
@@ -355,7 +387,7 @@ class VertexAIChatClient(VertexAIClient):
355
387
  }
356
388
 
357
389
  completions: List[GeneratedOutput] = []
358
- model_name: str = _get_model_name_for_request(request)
390
+ model_name: str = self._get_model_name_for_request(request)
359
391
  model = self.get_model(model_name)
360
392
 
361
393
  request_time = 0
@@ -95,8 +95,8 @@ class HuggingFaceVision2SeqClient(CachingClient):
95
95
 
96
96
  def do_it() -> Dict[str, Any]:
97
97
  messages = [{"role": "user", "content": multimodal_prompt}]
98
- prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
99
- inputs = processor(
98
+ prompt = processor.apply_chat_template(messages, add_generation_prompt=True) # type: ignore
99
+ inputs = processor( # type: ignore
100
100
  text=[prompt] * request.num_completions,
101
101
  images=[
102
102
  [load_image(image_path) for image_path in image_paths]
@@ -107,8 +107,10 @@ class HuggingFaceVision2SeqClient(CachingClient):
107
107
  inputs = {k: v.to(self._device) for k, v in inputs.items()}
108
108
 
109
109
  # Generate
110
- generated_ids = model.generate(**inputs, **generation_args)
111
- generated_texts: List[str] = processor.batch_decode(generated_ids, skip_special_tokens=True)
110
+ generated_ids = model.generate(**inputs, **generation_args) # type: ignore
111
+ generated_texts: List[str] = processor.batch_decode( # type: ignore
112
+ generated_ids, skip_special_tokens=True
113
+ )
112
114
  return {"output": generated_texts}
113
115
 
114
116
  # Include the prompt and model name in the cache key
@@ -50,7 +50,7 @@ class HuggingFaceVLMClient(CachingClient):
50
50
  with self._models_lock:
51
51
  model_id: str = self._models_aliases.get(model_name, model_name)
52
52
  if model_id not in self._models:
53
- self._models[model_id] = pipeline("image-to-text", model=model_id, device_map="auto")
53
+ self._models[model_id] = pipeline("image-to-text", model=model_id, device_map="auto") # type: ignore
54
54
  return self._models[model_id]
55
55
 
56
56
  def make_request(self, request: Request) -> RequestResult:
@@ -80,7 +80,7 @@ class HuggingFaceVLMClient(CachingClient):
80
80
 
81
81
  def do_it() -> Dict[str, Any]:
82
82
  model: ImageToTextPipeline = self._get_model(request.model_deployment)
83
- outputs = model(image, prompt=prompt, generate_kwargs=generation_args)
83
+ outputs = model(image, prompt=prompt, generate_kwargs=generation_args) # type: ignore
84
84
  return outputs[0]
85
85
 
86
86
  cache_key = CachingClient.make_cache_key(
@@ -89,14 +89,18 @@ class IDEFICSClient(CachingClient):
89
89
  input_args: Dict[str, Union[str, bool]] = {"return_tensors": "pt"}
90
90
  generation_args = {
91
91
  "max_new_tokens": request.max_tokens,
92
- "bad_words_ids": processor.tokenizer(self.BAD_WORD_TOKENS, add_special_tokens=False).input_ids,
92
+ "bad_words_ids": processor.tokenizer( # type: ignore
93
+ self.BAD_WORD_TOKENS, add_special_tokens=False
94
+ ).input_ids,
93
95
  }
94
96
 
95
97
  if self.END_OF_UTTERANCE_TOKEN in request.stop_sequences:
96
98
  # Following https://huggingface.co/HuggingFaceM4/idefics-80b-instruct,
97
99
  # specify <end_of_utterance> as an exit condition.
98
100
  input_args["add_end_of_utterance_token"] = False
99
- exit_condition = processor.tokenizer(self.END_OF_UTTERANCE_TOKEN, add_special_tokens=False).input_ids
101
+ exit_condition = processor.tokenizer( # type: ignore
102
+ self.END_OF_UTTERANCE_TOKEN, add_special_tokens=False
103
+ ).input_ids
100
104
  generation_args["eos_token_id"] = exit_condition
101
105
 
102
106
  multimodal_prompt: List[Union[str, Image.Image]] = []
@@ -1,2 +1,2 @@
1
- from .src.flamingo import Flamingo
2
- from .src.factory import create_model_and_transforms
1
+ from helm.clients.vision_language.open_flamingo.src.flamingo import Flamingo
2
+ from helm.clients.vision_language.open_flamingo.src.factory import create_model_and_transforms
@@ -7,9 +7,9 @@ from typing import Optional
7
7
  from transformers import AutoModelForCausalLM, AutoTokenizer
8
8
 
9
9
  from helm.common.general import handle_module_not_found_error
10
- from .flamingo import Flamingo
11
- from .flamingo_lm import FlamingoLMMixin
12
- from .utils import extend_instance
10
+ from helm.clients.vision_language.open_flamingo.src.flamingo import Flamingo
11
+ from helm.clients.vision_language.open_flamingo.src.flamingo_lm import FlamingoLMMixin
12
+ from helm.clients.vision_language.open_flamingo.src.utils import extend_instance
13
13
 
14
14
 
15
15
  def create_model_and_transforms(
@@ -5,7 +5,7 @@ Source: https://github.com/mlfoundations/open_flamingo
5
5
  import torch
6
6
  from einops import rearrange
7
7
  from torch import nn
8
- from .helpers import PerceiverResampler
8
+ from helm.clients.vision_language.open_flamingo.src.helpers import PerceiverResampler
9
9
  from torch.distributed.fsdp.wrap import (
10
10
  enable_wrap,
11
11
  wrap,
@@ -15,7 +15,7 @@ from torch.distributed.fsdp import (
15
15
  FullyShardedDataParallel as FSDP,
16
16
  )
17
17
 
18
- from .utils import apply_with_stopping_condition
18
+ from helm.clients.vision_language.open_flamingo.src.utils import apply_with_stopping_condition
19
19
 
20
20
 
21
21
  class Flamingo(nn.Module):
@@ -3,8 +3,8 @@ Source: https://github.com/mlfoundations/open_flamingo
3
3
  """
4
4
 
5
5
  import torch.nn as nn
6
- from .helpers import GatedCrossAttentionBlock
7
- from .utils import getattr_recursive, setattr_recursive
6
+ from helm.clients.vision_language.open_flamingo.src.helpers import GatedCrossAttentionBlock
7
+ from helm.clients.vision_language.open_flamingo.src.utils import getattr_recursive, setattr_recursive
8
8
 
9
9
 
10
10
  class FlamingoLayer(nn.Module):
@@ -93,7 +93,7 @@ class PaliGemmaClient(CachingClient):
93
93
  else:
94
94
  raise ValueError(f"Unrecognized MediaObject type {media_object.type}")
95
95
  prompt_text: str = "\n".join(prompt_pieces)
96
- model_inputs = processor(text=prompt_text, images=images, return_tensors="pt").to(self._device)
96
+ model_inputs = processor(text=prompt_text, images=images, return_tensors="pt").to(self._device) # type: ignore
97
97
  input_len = model_inputs["input_ids"].shape[-1]
98
98
 
99
99
  completions: List[GeneratedOutput] = []
@@ -109,7 +109,7 @@ class PaliGemmaClient(CachingClient):
109
109
  )[0]
110
110
  if not request.echo_prompt:
111
111
  generation = generation[input_len:]
112
- decoded = processor.decode(generation, skip_special_tokens=True)
112
+ decoded = processor.decode(generation, skip_special_tokens=True) # type: ignore
113
113
  return {"output": decoded}
114
114
 
115
115
  # Include the prompt and model name in the cache key
@@ -0,0 +1,188 @@
1
+ from threading import Lock
2
+ from typing import Any, Dict, List, Optional
3
+ from dataclasses import dataclass
4
+
5
+ from transformers import AutoProcessor
6
+ from qwen_vl_utils import process_vision_info
7
+ import torch
8
+
9
+ from helm.common.cache import CacheConfig
10
+ from helm.common.gpu_utils import get_torch_device_name
11
+ from helm.common.hierarchical_logger import hlog, htrack_block
12
+ from helm.common.media_object import TEXT_TYPE
13
+ from helm.common.request import Request, RequestResult, GeneratedOutput, Token
14
+ from helm.common.request import wrap_request_time
15
+ from helm.clients.client import CachingClient, generate_uid_for_multimodal_prompt
16
+
17
+
18
+ @dataclass(frozen=True)
19
+ class LoadedModelProcessor:
20
+ model: Any
21
+ processor: AutoProcessor
22
+
23
+
24
+ # Global cache for all models
25
+ _models_lock: Lock = Lock()
26
+ _models: Dict[str, Optional[LoadedModelProcessor]] = {
27
+ "Qwen/Qwen2-VL-7B-Instruct": None,
28
+ "Qwen/Qwen2-VL-72B-Instruct": None,
29
+ "Qwen/Qwen2.5-VL-3B-Instruct": None,
30
+ "Qwen/Qwen2.5-VL-7B-Instruct": None,
31
+ "Qwen/Qwen2.5-VL-32B-Instruct": None,
32
+ "Qwen/Qwen2.5-VL-72B-Instruct": None,
33
+ }
34
+
35
+
36
+ class Qwen2VLMClient(CachingClient):
37
+ def __init__(self, cache_config: CacheConfig):
38
+ super().__init__(cache_config=cache_config)
39
+ self._device: str = get_torch_device_name()
40
+
41
+ def _get_model_name(self, helm_model_name: str) -> str:
42
+ if helm_model_name == "qwen2-vl-7b-instruct":
43
+ return "Qwen/Qwen2-VL-7B-Instruct"
44
+ elif helm_model_name == "qwen2-vl-72b-instruct":
45
+ return "Qwen/Qwen2-VL-72B-Instruct"
46
+ elif helm_model_name == "qwen2.5-vl-3b-instruct":
47
+ return "Qwen/Qwen2.5-VL-3B-Instruct"
48
+ elif helm_model_name == "qwen2.5-vl-7b-instruct":
49
+ return "Qwen/Qwen2.5-VL-7B-Instruct"
50
+ elif helm_model_name == "qwen2.5-vl-32b-instruct":
51
+ return "Qwen/Qwen2.5-VL-32B-Instruct"
52
+ elif helm_model_name == "qwen2.5-vl-72b-instruct":
53
+ return "Qwen/Qwen2.5-VL-72B-Instruct"
54
+ else:
55
+ raise ValueError(f"Unhandled model name: {helm_model_name}")
56
+
57
+ def _get_model(self, helm_model_name: str) -> LoadedModelProcessor:
58
+ from transformers import Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration
59
+
60
+ global _models_lock, _models
61
+
62
+ model_name = self._get_model_name(helm_model_name)
63
+ with _models_lock:
64
+ loaded = _models[model_name]
65
+ if loaded is None:
66
+ hlog(f"Loading model {model_name} and caching in memory...")
67
+ # Use different loading routines depending on whether it's Qwen2.5 or Qwen2.
68
+ if "2.5" in model_name:
69
+ # Qwen2.5: by default use torch_dtype="auto". You can enable flash_attention_2 if desired.
70
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
71
+ model_name,
72
+ torch_dtype=torch.bfloat16,
73
+ device_map="auto",
74
+ attn_implementation="flash_attention_2",
75
+ ).eval()
76
+ else:
77
+ model = Qwen2VLForConditionalGeneration.from_pretrained(
78
+ model_name,
79
+ torch_dtype=torch.bfloat16,
80
+ device_map="auto",
81
+ attn_implementation="flash_attention_2",
82
+ ).eval()
83
+ processor = AutoProcessor.from_pretrained(model_name)
84
+ loaded = LoadedModelProcessor(model=model, processor=processor)
85
+ _models[model_name] = loaded
86
+ return loaded
87
+
88
+ def make_request(self, request: Request) -> RequestResult:
89
+ assert request.multimodal_prompt is not None, "Multimodal prompt is required"
90
+
91
+ # Build messages by collating all media objects into a single "user" message.
92
+ message_content = []
93
+ for media_object in request.multimodal_prompt.media_objects:
94
+ if media_object.is_type("image") and media_object.location:
95
+ message_content.append({"type": "image", "image": media_object.location})
96
+ elif media_object.is_type(TEXT_TYPE):
97
+ if media_object.text is None:
98
+ raise ValueError("MediaObject of text type has missing text field value")
99
+ message_content.append({"type": "text", "text": media_object.text})
100
+ else:
101
+ raise ValueError(f"Unrecognized MediaObject type {media_object.type}")
102
+
103
+ messages = [{"role": "user", "content": message_content}]
104
+
105
+ generation_args = {
106
+ "max_new_tokens": request.max_tokens,
107
+ }
108
+
109
+ completions: List[GeneratedOutput] = []
110
+ request_time: float = 0
111
+ request_datetime: Optional[int] = None
112
+ all_cached: bool = True
113
+
114
+ with htrack_block(f"Generating for prompt: {request.multimodal_prompt.text}"):
115
+ for completion_index in range(request.num_completions):
116
+ try:
117
+
118
+ def do_it() -> Dict[str, Any]:
119
+ loaded = self._get_model(request.model_engine)
120
+ model = loaded.model
121
+ processor = loaded.processor
122
+
123
+ # Prepare text and vision inputs.
124
+ text = processor.apply_chat_template( # type: ignore
125
+ messages, tokenize=False, add_generation_prompt=True
126
+ )
127
+ image_inputs, video_inputs = process_vision_info(messages)
128
+ inputs = processor( # type: ignore
129
+ text=[text],
130
+ images=image_inputs,
131
+ videos=video_inputs,
132
+ padding=True,
133
+ return_tensors="pt",
134
+ ).to(self._device)
135
+
136
+ generated_ids = model.generate(**inputs, **generation_args)
137
+ # Remove the input prefix from outputs.
138
+ generated_ids_trimmed = [
139
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
140
+ ]
141
+ output_text = processor.batch_decode( # type: ignore
142
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
143
+ )
144
+ # For simplicity, we split tokens by whitespace.
145
+ completion = output_text[0]
146
+ tokens = completion.split()
147
+ return {"output": (completion, tokens)}
148
+
149
+ cache_key = CachingClient.make_cache_key(
150
+ raw_request={
151
+ "completion_index": completion_index,
152
+ "model": request.model,
153
+ "prompt": generate_uid_for_multimodal_prompt(request.multimodal_prompt),
154
+ **generation_args,
155
+ },
156
+ request=request,
157
+ )
158
+ result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
159
+ except RuntimeError as model_error:
160
+ return RequestResult(
161
+ success=False,
162
+ cached=False,
163
+ error=str(model_error),
164
+ completions=[],
165
+ embedding=[],
166
+ )
167
+
168
+ text_out, tokens = result["output"]
169
+ completions.append(
170
+ GeneratedOutput(
171
+ text=text_out,
172
+ logprob=0,
173
+ tokens=[Token(text=str(token), logprob=0) for token in tokens],
174
+ )
175
+ )
176
+ hlog(f"Generated: {text_out}")
177
+ request_time += result["request_time"]
178
+ request_datetime = request_datetime or result.get("request_datetime")
179
+ all_cached = all_cached and cached
180
+
181
+ return RequestResult(
182
+ success=True,
183
+ cached=all_cached,
184
+ request_time=request_time,
185
+ request_datetime=request_datetime,
186
+ completions=completions,
187
+ embedding=[],
188
+ )
@@ -115,14 +115,16 @@ class QwenVLMClient(CachingClient):
115
115
 
116
116
  def do_it() -> Dict[str, Any]:
117
117
  if request.model_engine == "qwen-vl-chat":
118
- completion, _ = model.chat(tokenizer, query=tokenizer.from_list_format(query), history=None)
118
+ completion, _ = model.chat( # type: ignore
119
+ tokenizer, query=tokenizer.from_list_format(query), history=None # type: ignore
120
+ )
119
121
  else:
120
- inputs = tokenizer(tokenizer.from_list_format(query), return_tensors="pt")
122
+ inputs = tokenizer(tokenizer.from_list_format(query), return_tensors="pt") # type: ignore
121
123
  inputs = inputs.to(self._device)
122
- pred = model.generate(**inputs, **generation_args)
123
- completion = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)
124
+ pred = model.generate(**inputs, **generation_args) # type: ignore
125
+ completion = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False) # type: ignore
124
126
 
125
- tokens: List[str] = tokenizer.tokenize(completion)
127
+ tokens: List[str] = tokenizer.tokenize(completion) # type: ignore
126
128
  return {"output": (completion, tokens)}
127
129
 
128
130
  # Include the prompt and model name in the cache key
@@ -2,13 +2,15 @@ from typing import Any, Dict, Optional
2
2
 
3
3
  from helm.common.cache import CacheConfig
4
4
  from helm.common.request import Request
5
- from helm.clients.openai_client import OpenAIClient
5
+ from helm.clients.openai_client import OpenAILegacyCompletionsClient
6
6
  from helm.tokenizers.tokenizer import Tokenizer
7
7
 
8
8
 
9
- class VLLMClient(OpenAIClient):
9
+ class VLLMClient(OpenAILegacyCompletionsClient):
10
10
  """Sends request to a vLLM server using the OpenAI-compatible API.
11
11
 
12
+ Only supports the legacy Text Completions API, rather than the Chat Completions API.
13
+
12
14
  See: https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server"""
13
15
 
14
16
  def __init__(
@@ -29,10 +31,6 @@ class VLLMClient(OpenAIClient):
29
31
  self.tokenizer = tokenizer
30
32
  self.tokenizer_name = tokenizer_name
31
33
 
32
- def _is_chat_model_engine(self, model_engine: str) -> bool:
33
- # Only support vLLM completion models for now.
34
- return False
35
-
36
34
  def _get_model_for_request(self, request: Request) -> str:
37
35
  # The `model` parameter for vLLM should be the whole model name including the creator organization,
38
36
  # unlike OpenAI which only uses the model engine.
@@ -0,0 +1,102 @@
1
+ from typing import Any, Dict, List, Mapping, Optional
2
+
3
+ from helm.clients.client import CachingClient
4
+ from helm.common.cache import CacheConfig
5
+ from helm.common.optional_dependencies import handle_module_not_found_error
6
+ from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
7
+
8
+ try:
9
+ from writerai import Writer
10
+ from writerai.types.chat_completion import ChatCompletion
11
+ except ModuleNotFoundError as e:
12
+ handle_module_not_found_error(e, ["openai"])
13
+
14
+
15
+ class WriterClient(CachingClient):
16
+ def __init__(self, cache_config: CacheConfig, api_key: Optional[str] = None):
17
+ super().__init__(cache_config=cache_config)
18
+ self._writer_client = Writer(api_key=api_key)
19
+
20
+ def _get_messages_from_request(self, request: Request) -> List[Dict]:
21
+ if request.prompt and request.messages:
22
+ raise ValueError(f"Only one of `prompt` and `messages` may be set in request: {request}")
23
+ if request.multimodal_prompt:
24
+ raise ValueError("`multimodal_prompt` is not supported by WriterClient")
25
+ if request.messages:
26
+ return [{"role": message["role"], "content": message["content"]} for message in request.messages]
27
+ else:
28
+ return [{"role": "user", "content": request.prompt}]
29
+
30
+ def _convert_chat_completion_to_generated_outputs(
31
+ self, chat_completion: ChatCompletion, request: Request
32
+ ) -> List[GeneratedOutput]:
33
+ generated_outputs: List[GeneratedOutput] = []
34
+ for choice in chat_completion.choices:
35
+ raw_completion_content = choice.message.content
36
+ # The Writer chat completion API doesn't support echo.
37
+ # If `echo_prompt` is true, combine the prompt and completion.
38
+ text: str = request.prompt + raw_completion_content if request.echo_prompt else raw_completion_content
39
+ tokens: List[Token] = []
40
+ if choice.logprobs and choice.logprobs.content:
41
+ tokens = [
42
+ Token(text=choice_token.token, logprob=choice_token.logprob)
43
+ for choice_token in choice.logprobs.content
44
+ ]
45
+ generated_output = GeneratedOutput(
46
+ text=text,
47
+ logprob=sum(token.logprob for token in tokens) if tokens else 0.0,
48
+ tokens=tokens,
49
+ finish_reason={"reason": choice.finish_reason},
50
+ )
51
+ generated_outputs.append(generated_output)
52
+ return generated_outputs
53
+
54
+ def _convert_request_to_raw_request(self, request: Request) -> Dict:
55
+ raw_request = {
56
+ "messages": self._get_messages_from_request(request),
57
+ "model": request.model.split("/")[-1],
58
+ "logprobs": bool(request.top_k_per_token),
59
+ "max_tokens": request.max_tokens,
60
+ "n": request.num_completions,
61
+ "stop": request.stop_sequences,
62
+ "temperature": request.temperature,
63
+ "top_p": request.top_p,
64
+ }
65
+ if request.response_format and request.response_format.json_schema:
66
+ raw_request["response_format"] = {
67
+ "type": "json_schema",
68
+ "json_schema": {
69
+ "schema": request.response_format.json_schema,
70
+ },
71
+ }
72
+ return raw_request
73
+
74
+ def make_request(self, request: Request) -> RequestResult:
75
+ raw_request = self._convert_request_to_raw_request(request)
76
+ cache_key: Mapping = CachingClient.make_cache_key(raw_request, request)
77
+
78
+ def do_it() -> Dict[Any, Any]:
79
+ return self._writer_client.chat.chat(**raw_request).model_dump()
80
+
81
+ try:
82
+ raw_response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
83
+ chat_completion: ChatCompletion = ChatCompletion.model_validate(raw_response)
84
+ except Exception as error:
85
+ return RequestResult(
86
+ success=False,
87
+ cached=False,
88
+ error=str(error),
89
+ completions=[],
90
+ embedding=[],
91
+ )
92
+
93
+ generated_outputs = self._convert_chat_completion_to_generated_outputs(chat_completion, request)
94
+
95
+ return RequestResult(
96
+ success=True,
97
+ cached=cached,
98
+ request_time=raw_response["request_time"],
99
+ request_datetime=raw_response["request_datetime"],
100
+ completions=generated_outputs,
101
+ embedding=[],
102
+ )
helm/clients/yi_client.py CHANGED
@@ -26,6 +26,3 @@ class YiChatClient(OpenAIClient):
26
26
  org_id=None,
27
27
  base_url=YiChatClient.BASE_URL,
28
28
  )
29
-
30
- def _is_chat_model_engine(self, model_engine: str) -> bool:
31
- return True