crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +218 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +4 -1
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/summarize.py +23 -10
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +78 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +75 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +109 -36
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +241 -22
- helm/clients/palmyra_client.py +1 -4
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +47 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1112 -19
- helm/config/model_metadata.yaml +985 -44
- helm/config/tokenizer_configs.yaml +379 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +1 -1
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
from threading import Lock
|
|
2
|
+
from typing import Any, Dict, List, Optional
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
|
|
6
|
+
from qwen_vl_utils import process_vision_info
|
|
7
|
+
import torch
|
|
8
|
+
|
|
9
|
+
from helm.common.cache import CacheConfig
|
|
10
|
+
from helm.common.gpu_utils import get_torch_device_name
|
|
11
|
+
from helm.common.hierarchical_logger import hlog, htrack_block
|
|
12
|
+
from helm.common.media_object import TEXT_TYPE
|
|
13
|
+
from helm.common.request import Request, RequestResult, GeneratedOutput, Token
|
|
14
|
+
from helm.common.request import wrap_request_time
|
|
15
|
+
from helm.clients.client import CachingClient, generate_uid_for_multimodal_prompt
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass(frozen=True)
class LoadedQwen2ModelProcessor:
    """Immutable pairing of a loaded Qwen2-VL model with its matching processor."""

    # The generation model.
    model: Qwen2VLForConditionalGeneration

    # The processor used to build model inputs and decode generated ids.
    processor: AutoProcessor
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# Held by `Qwen2VLMClient._get_model` while it reads or fills `_models`.
_models_lock: Lock = Lock()
# In-memory cache of loaded models, keyed by Hugging Face model name.
# A value of None means the model has not been loaded yet.
_models: Dict[str, Optional[LoadedQwen2ModelProcessor]] = {
    "Qwen/Qwen2-VL-7B-Instruct": None,
    "Qwen/Qwen2-VL-72B-Instruct": None,
}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class Qwen2VLMClient(CachingClient):
    """Client that runs Qwen2-VL instruct models in-process via Hugging Face Transformers.

    Loaded models are kept in the module-level `_models` dict so that they are only
    loaded once per process. Individual completions are cached through `self.cache`.
    """

    def __init__(self, cache_config: CacheConfig):
        super().__init__(cache_config=cache_config)
        # Device that the processed inputs are moved to before generation.
        self._device: str = get_torch_device_name()

    def _get_model_name(self, helm_model_name: str) -> str:
        """Translate a HELM model engine name into the Hugging Face model name.

        Raises:
            ValueError: If the engine name is not one of the supported Qwen2-VL models.
        """
        if helm_model_name == "qwen2-vl-7b-instruct":
            return "Qwen/Qwen2-VL-7B-Instruct"
        if helm_model_name == "qwen2-vl-72b-instruct":
            return "Qwen/Qwen2-VL-72B-Instruct"
        raise ValueError(f"Unhandled model name: {helm_model_name}")

    def _get_model(self, helm_model_name: str) -> LoadedQwen2ModelProcessor:
        """Return the model and processor for the engine, loading them on first use.

        The check-and-load is done while holding `_models_lock`.
        """
        model_name = self._get_model_name(helm_model_name)

        # `_models` is only mutated in place (never rebound), so no `global` statement is needed.
        with _models_lock:
            loaded = _models[model_name]
            if loaded is None:
                hlog(f"Loading model {model_name} and caching in memory...")
                # https://huggingface.co/docs/transformers/model_doc/qwen2_vl#flash-attention-2-to-speed-up-generation
                model = Qwen2VLForConditionalGeneration.from_pretrained(
                    model_name,
                    torch_dtype=torch.bfloat16,
                    device_map="auto",
                    attn_implementation="flash_attention_2",
                ).eval()
                processor = AutoProcessor.from_pretrained(model_name)
                loaded = LoadedQwen2ModelProcessor(model=model, processor=processor)
                _models[model_name] = loaded

        return loaded

    def make_request(self, request: Request) -> RequestResult:
        """Generate `request.num_completions` completions for a multimodal prompt.

        All media objects of the prompt are placed into a single "user" message.
        Token log probabilities are not computed: every logprob is reported as 0 and
        the returned tokens are a whitespace split of the completion text.

        Returns:
            A successful `RequestResult` with the completions, or an unsuccessful one
            carrying the error message if a `RuntimeError` is raised while generating.

        Raises:
            ValueError: If a media object is neither image nor text, if an image media
                object has no location, or if a text media object has no text.
        """
        assert request.multimodal_prompt is not None, "Multimodal prompt is required"
        loaded = self._get_model(request.model_engine)
        model = loaded.model
        processor = loaded.processor

        # Build Qwen2 messages
        # We assume all media objects go into a single "user" message:
        # messages = [
        #     {
        #         "role": "user",
        #         "content": [
        #             {"type": "image", "image": "file:///path/to/image1.jpg"},
        #             {"type": "image", "image": "file:///path/to/image2.jpg"},
        #             {"type": "text", "text": "Describe these images."}
        #         ]
        #     }
        # ]
        message_content = []
        for media_object in request.multimodal_prompt.media_objects:
            if media_object.is_type("image"):
                # Report a missing location explicitly instead of falling through to the
                # misleading "Unrecognized MediaObject type image" error.
                if not media_object.location:
                    raise ValueError("MediaObject of image type has missing location field value")
                message_content.append({"type": "image", "image": media_object.location})
            elif media_object.is_type(TEXT_TYPE):
                if media_object.text is None:
                    raise ValueError("MediaObject of text type has missing text field value")
                message_content.append({"type": "text", "text": media_object.text})
            else:
                raise ValueError(f"Unrecognized MediaObject type {media_object.type}")

        messages = [{"role": "user", "content": message_content}]

        # Prepare text and vision inputs
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)

        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to(self._device)

        generation_args = {
            "max_new_tokens": request.max_tokens,
        }

        completions: List[GeneratedOutput] = []
        request_time: float = 0
        request_datetime: Optional[int] = None
        all_cached: bool = True

        with htrack_block(f"Generating for prompt: {text}"):
            for completion_index in range(request.num_completions):
                try:

                    def do_it() -> Dict[str, Any]:
                        generated_ids = model.generate(**inputs, **generation_args)
                        # Remove the input prefix from outputs
                        generated_ids_trimmed = [
                            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
                        ]
                        output_text = processor.batch_decode(
                            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
                        )
                        # There's only one batch element
                        completion = output_text[0]
                        # For simplicity, we split tokens by whitespace.
                        # A more accurate tokenization would require a tokenizer for Qwen2, if desired.
                        tokens = completion.split()
                        return {"output": (completion, tokens)}

                    # The completion index is part of the key so that each of the
                    # `num_completions` generations gets its own cache entry.
                    cache_key = CachingClient.make_cache_key(
                        raw_request={
                            "completion_index": completion_index,
                            "model": request.model,
                            "prompt": generate_uid_for_multimodal_prompt(request.multimodal_prompt),
                            **generation_args,
                        },
                        request=request,
                    )
                    result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
                except RuntimeError as model_error:
                    return RequestResult(
                        success=False, cached=False, error=str(model_error), completions=[], embedding=[]
                    )

                text_out, tokens = result["output"]
                completions.append(
                    GeneratedOutput(
                        text=text_out,
                        logprob=0,
                        tokens=[Token(text=str(token), logprob=0) for token in tokens],
                    )
                )
                hlog(f"Generated: {text_out}")

                # Sum the time over all completions; keep the datetime of the first one that has it.
                request_time += result["request_time"]
                request_datetime = request_datetime or result.get("request_datetime")
                all_cached = all_cached and cached

        return RequestResult(
            success=True,
            cached=all_cached,
            request_time=request_time,
            request_datetime=request_datetime,
            completions=completions,
            embedding=[],
        )
|
helm/clients/vllm_client.py
CHANGED
|
@@ -2,13 +2,15 @@ from typing import Any, Dict, Optional
|
|
|
2
2
|
|
|
3
3
|
from helm.common.cache import CacheConfig
|
|
4
4
|
from helm.common.request import Request
|
|
5
|
-
from helm.clients.openai_client import
|
|
5
|
+
from helm.clients.openai_client import OpenAILegacyCompletionsClient
|
|
6
6
|
from helm.tokenizers.tokenizer import Tokenizer
|
|
7
7
|
|
|
8
8
|
|
|
9
|
-
class VLLMClient(
|
|
9
|
+
class VLLMClient(OpenAILegacyCompletionsClient):
|
|
10
10
|
"""Sends request to a vLLM server using the OpenAI-compatible API.
|
|
11
11
|
|
|
12
|
+
Only supports the legacy Text Completions API, rather than the Chat Completions API.
|
|
13
|
+
|
|
12
14
|
See: https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server"""
|
|
13
15
|
|
|
14
16
|
def __init__(
|
|
@@ -29,10 +31,6 @@ class VLLMClient(OpenAIClient):
|
|
|
29
31
|
self.tokenizer = tokenizer
|
|
30
32
|
self.tokenizer_name = tokenizer_name
|
|
31
33
|
|
|
32
|
-
def _is_chat_model_engine(self, model_engine: str) -> bool:
|
|
33
|
-
# Only support vLLM completion models for now.
|
|
34
|
-
return False
|
|
35
|
-
|
|
36
34
|
def _get_model_for_request(self, request: Request) -> str:
|
|
37
35
|
# The `model` parameter for vLLM should be the whole model name including the creator organization,
|
|
38
36
|
# unlike OpenAI which only uses the model engine.
|
helm/clients/yi_client.py
CHANGED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
from io import BytesIO
|
|
2
|
+
from typing import Optional
|
|
3
|
+
from filelock import FileLock
|
|
4
|
+
import base64
|
|
5
|
+
import os
|
|
6
|
+
|
|
7
|
+
import ffmpeg
|
|
8
|
+
import numpy as np
|
|
9
|
+
import soundfile as sf
|
|
10
|
+
import subprocess
|
|
11
|
+
|
|
12
|
+
from helm.common.hierarchical_logger import hlog
|
|
13
|
+
from helm.common.multimodal_request_utils import get_contents_as_bytes
|
|
14
|
+
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
15
|
+
|
|
16
|
+
try:
|
|
17
|
+
import librosa
|
|
18
|
+
except ModuleNotFoundError as e:
|
|
19
|
+
handle_module_not_found_error(e, ["audiolm"])
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def ensure_audio_file_exists_from_array(path: str, array: np.ndarray, sample_rate: int) -> None:
    """Create the wav or mp3 file at `path` from `array` unless the file is already there.

    The write happens under a file lock on `{path}.lock`, goes to a temporary sibling
    file first, and is then renamed into place, to avoid file corruption due to
    incomplete writes and concurrent writes.

    Raises:
        ValueError: If `path` does not end with .wav or .mp3.
    """
    path_prefix, file_extension = os.path.splitext(path)
    if file_extension not in (".wav", ".mp3"):
        raise ValueError(f"Path must end with .wav or .mp3: {path}")
    with FileLock(f"{path}.lock"):
        if not os.path.exists(path):
            # Keep the real extension at the end of the temporary name.
            tmp_path = f"{path_prefix}.tmp{file_extension}"
            sf.write(tmp_path, array, samplerate=sample_rate)
            os.rename(tmp_path, path)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def get_array_from_audio_file(path: str, sample_rate: Optional[int]) -> np.ndarray:
    """Load the audio at `path` (a local path or an http(s) URL) into an array.

    `sample_rate` is forwarded to `librosa.load` as `sr`.
    """
    is_remote = path.startswith(("http://", "https://"))
    # librosa accepts a local file path or a file-like object, so remote audio is
    # fetched into memory first.
    audio_source = BytesIO(get_contents_as_bytes(path)) if is_remote else path
    samples, _ = librosa.load(audio_source, sr=sample_rate)
    return samples
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def use_ffmpeg_to_convert_audio_file(input_path: str, output_path: str) -> None:
    """Use ffmpeg to convert an audio file type.

    Runs `ffmpeg -i input_path output_path`. Does nothing if `output_path` already exists.

    Raises:
        ValueError: If the `ffmpeg` executable cannot be found or exits with a
            non-zero status. The original error is attached as the cause.
    """
    if os.path.exists(output_path):
        return
    try:
        subprocess.run(["ffmpeg", "-i", input_path, output_path], check=True)
    except (subprocess.CalledProcessError, FileNotFoundError) as err:
        # Chain the original error so a failed conversion can be told apart from a missing binary.
        raise ValueError(
            "Please install ffmpeg using `bash install-shelm-extras.sh` first to convert audio files."
        ) from err
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def is_invalid_audio_file(audio_path: str) -> bool:
    """
    Report whether the audio file at `audio_path` is unusable.

    The file is considered invalid when:
    1. The file does not exist.
    2. The file is empty (contains zero frames).
    3. Opening it with `soundfile` raises a `RuntimeError`.
    """
    if os.path.exists(audio_path):
        try:
            with sf.SoundFile(audio_path) as handle:
                return len(handle) == 0
        except RuntimeError:
            return True
    return True
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def extract_audio(video_path: str, output_audio_path: str) -> None:
    """
    Extracts audio from an MP4 video file and saves it as an MP3 file.

    An existing file at `output_audio_path` is overwritten.

    Args:
        video_path (str): Path to the input MP4 video file.
        output_audio_path (str): Path to save the extracted MP3 audio file.

    Returns:
        None

    Raises:
        ffmpeg.Error: If ffmpeg fails. The error is logged and then re-raised.
    """
    try:
        (
            ffmpeg.input(video_path)
            .output(output_audio_path, format="mp3", acodec="libmp3lame", audio_bitrate="192k")
            .run(overwrite_output=True)
        )
    except ffmpeg.Error as e:
        # run() is called without capture_stderr, so e.stderr may be None. Guard it so
        # that logging cannot raise AttributeError and hide the real ffmpeg failure.
        stderr_text = e.stderr.decode(errors="replace") if e.stderr else "<stderr not captured>"
        hlog(f"Error extracting audio from video: {video_path}: {stderr_text}")
        raise
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def encode_audio_to_base64(file_path: str) -> str:
    """
    Read an audio file and return its bytes as a Base64 string.

    Args:
        file_path (str): Path to the audio file. The file must exist.

    Returns:
        str: Base64-encoded string of the audio file.
    """
    assert os.path.exists(file_path), f"Audio file does not exist at path: {file_path}"
    with open(file_path, "rb") as audio_file:
        raw_bytes = audio_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
|
|
@@ -2,7 +2,7 @@ import os
|
|
|
2
2
|
from typing import Callable
|
|
3
3
|
|
|
4
4
|
from helm.common.general import ensure_directory_exists, generate_unique_id
|
|
5
|
-
from .file_cache import FileCache
|
|
5
|
+
from helm.common.file_caches.file_cache import FileCache
|
|
6
6
|
|
|
7
7
|
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
8
8
|
|
helm/common/images_utils.py
CHANGED
|
@@ -10,7 +10,7 @@ from urllib.request import urlopen
|
|
|
10
10
|
|
|
11
11
|
import numpy as np
|
|
12
12
|
|
|
13
|
-
from .general import is_url
|
|
13
|
+
from helm.common.general import is_url
|
|
14
14
|
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
15
15
|
|
|
16
16
|
try:
|
|
@@ -25,7 +25,7 @@ def open_image(image_location: str) -> Image.Image:
|
|
|
25
25
|
"""
|
|
26
26
|
image: Image.Image
|
|
27
27
|
if is_url(image_location):
|
|
28
|
-
image = Image.open(requests.get(image_location, stream=True).raw)
|
|
28
|
+
image = Image.open(requests.get(image_location, stream=True).raw) # type: ignore
|
|
29
29
|
else:
|
|
30
30
|
image = Image.open(image_location)
|
|
31
31
|
return image.convert("RGB")
|
helm/common/media_object.py
CHANGED
|
@@ -2,7 +2,7 @@ import os
|
|
|
2
2
|
import urllib
|
|
3
3
|
from copy import deepcopy
|
|
4
4
|
from dataclasses import dataclass, field, replace
|
|
5
|
-
from typing import List, Optional
|
|
5
|
+
from typing import Any, Dict, List, Optional
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
IMAGE_TYPE = "image"
|
|
@@ -27,7 +27,7 @@ class MediaObject:
|
|
|
27
27
|
location: Optional[str] = None
|
|
28
28
|
"""When the media object is a file, specify the location of the media object, which can be a local path or URL."""
|
|
29
29
|
|
|
30
|
-
def to_dict(self) ->
|
|
30
|
+
def to_dict(self) -> Dict[str, Any]:
|
|
31
31
|
"""Converts the media object to a dictionary."""
|
|
32
32
|
return {key: value for key, value in self.__dict__.items() if value is not None}
|
|
33
33
|
|
|
@@ -1,10 +1,36 @@
|
|
|
1
|
+
import base64
|
|
1
2
|
from typing import List, Optional
|
|
2
3
|
|
|
4
|
+
import requests
|
|
5
|
+
import urllib.parse
|
|
6
|
+
|
|
3
7
|
from helm.benchmark.adaptation.request_state import RequestState
|
|
4
8
|
from helm.benchmark.scenarios.scenario import Reference
|
|
5
9
|
from helm.common.request import RequestResult
|
|
6
10
|
|
|
7
11
|
|
|
12
|
+
def get_contents_as_bytes(path: str) -> bytes:
    """Return the raw bytes found at the location.

    The location can be a local path or a URL."""
    scheme = urllib.parse.urlparse(path).scheme
    if scheme not in ["http", "https"]:
        # Anything that is not an http(s) URL is read as a local file.
        with open(path, "rb") as local_file:
            return local_file.read()

    # Remote location: download it and fail on a non-success HTTP status.
    response = requests.get(path)
    response.raise_for_status()
    return response.content
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def get_contents_as_base64(path: str) -> str:
    """Return the contents at the location encoded as a base64 string.

    The location can be a local path or a URL."""
    raw_contents = get_contents_as_bytes(path)
    return base64.b64encode(raw_contents).decode("utf-8")
|
|
32
|
+
|
|
33
|
+
|
|
8
34
|
def gather_generated_image_locations(request_result: RequestResult) -> List[str]:
|
|
9
35
|
"""Gathers the locations (file paths or URLs) of the generated images."""
|
|
10
36
|
image_locations: List[str] = []
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass(frozen=True)
class REEvalParameters:
    """
    Parameters for reeval evaluation.
    """

    model_ability: Optional[float] = None
    """The initial ability of the model to perform the task. Used for reeval evaluation."""
|
helm/common/request.py
CHANGED
|
@@ -4,7 +4,8 @@ from typing import Any, Callable, Dict, List, Optional
|
|
|
4
4
|
|
|
5
5
|
from helm.common.media_object import MultimediaObject
|
|
6
6
|
from helm.common.image_generation_parameters import ImageGenerationParameters
|
|
7
|
-
from .general import indent_lines, format_text
|
|
7
|
+
from helm.common.general import indent_lines, format_text
|
|
8
|
+
from helm.common.response_format import ResponseFormat
|
|
8
9
|
|
|
9
10
|
|
|
10
11
|
@dataclass(frozen=True)
|
|
@@ -72,6 +73,9 @@ class Request:
|
|
|
72
73
|
image_generation_parameters: Optional[ImageGenerationParameters] = None
|
|
73
74
|
"""Parameters for image generation."""
|
|
74
75
|
|
|
76
|
+
response_format: Optional[ResponseFormat] = None
|
|
77
|
+
"""EXPERIMENTAL: Response format. Currently only supported by OpenAI and Together."""
|
|
78
|
+
|
|
75
79
|
def validate(self):
|
|
76
80
|
if (
|
|
77
81
|
(self.messages and self.prompt)
|
|
@@ -193,7 +197,7 @@ class RequestResult:
|
|
|
193
197
|
"""Whether the request was actually cached"""
|
|
194
198
|
|
|
195
199
|
request_time: Optional[float] = None
|
|
196
|
-
"""How long
|
|
200
|
+
"""How long the request took in seconds"""
|
|
197
201
|
|
|
198
202
|
request_datetime: Optional[int] = None
|
|
199
203
|
"""When was the request sent?
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import Any, Dict, Optional
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass(frozen=True)
class ResponseFormat:
    """EXPERIMENTAL: Format that the model response should follow.

    JSON schema is the only format supported at the moment.

    Currently only supported by OpenAI and Together.

    See:
    - https://platform.openai.com/docs/guides/structured-outputs
    - https://docs.together.ai/docs/json-mode"""

    json_schema: Optional[Dict[str, Any]] = None
    """EXPERIMENTAL: JSON schema that the model output should conform to."""
|