crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. The information is provided for informational purposes only.
Note: this release of crfm-helm has been flagged as potentially problematic; review the release details on the package registry before upgrading.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +22 -11
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/harm_bench_annotator.py +11 -24
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +10 -5
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +90 -61
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +281 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +20 -30
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/safety_metrics.py +39 -17
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +21 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +119 -256
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +105 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/finance_run_specs.py +6 -2
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +83 -5
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +18 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +42 -6
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +151 -47
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +255 -21
- helm/clients/palmyra_client.py +2 -6
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +69 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +8 -30
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/key_value_store.py +9 -9
- helm/common/media_object.py +2 -2
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +1258 -33
- helm/config/model_metadata.yaml +1110 -41
- helm/config/tokenizer_configs.yaml +403 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -13
- helm/proxy/services/remote_service.py +1 -7
- helm/proxy/services/server_service.py +6 -19
- helm/proxy/services/service.py +0 -6
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.3.dist-info/METADATA +0 -355
- crfm_helm-0.5.3.dist-info/RECORD +0 -699
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
|
@@ -2,9 +2,14 @@ import shutil
|
|
|
2
2
|
import tempfile
|
|
3
3
|
|
|
4
4
|
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
5
|
-
from .test_utils import
|
|
6
|
-
|
|
7
|
-
|
|
5
|
+
from helm.benchmark.window_services.test_utils import (
|
|
6
|
+
get_tokenizer_service,
|
|
7
|
+
TEST_PROMPT,
|
|
8
|
+
GPT2_TEST_TOKENS,
|
|
9
|
+
GPT2_TEST_TOKEN_IDS,
|
|
10
|
+
)
|
|
11
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
12
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
8
13
|
|
|
9
14
|
|
|
10
15
|
class TestOpenAIWindowService:
|
|
@@ -2,9 +2,9 @@ import shutil
|
|
|
2
2
|
import tempfile
|
|
3
3
|
|
|
4
4
|
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
5
|
-
from .test_utils import get_tokenizer_service, TEST_PROMPT
|
|
6
|
-
from .tokenizer_service import TokenizerService
|
|
7
|
-
from .window_service_factory import WindowServiceFactory
|
|
5
|
+
from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
|
|
6
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
7
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
class TestOPTWindowService:
|
|
@@ -2,9 +2,9 @@ from tempfile import TemporaryDirectory
|
|
|
2
2
|
from typing import List
|
|
3
3
|
|
|
4
4
|
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
5
|
-
from .tokenizer_service import TokenizerService
|
|
6
|
-
from .window_service_factory import WindowServiceFactory
|
|
7
|
-
from .test_utils import get_tokenizer_service, TEST_PROMPT
|
|
5
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
6
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
7
|
+
from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
class TestPalmyraWindowService:
|
|
@@ -3,9 +3,9 @@ import tempfile
|
|
|
3
3
|
from typing import List
|
|
4
4
|
|
|
5
5
|
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
6
|
-
from .tokenizer_service import TokenizerService
|
|
7
|
-
from .window_service_factory import WindowServiceFactory
|
|
8
|
-
from .test_utils import get_tokenizer_service, TEST_PROMPT
|
|
6
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
7
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
8
|
+
from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class TestT0ppWindowService:
|
|
@@ -3,9 +3,9 @@ import tempfile
|
|
|
3
3
|
from typing import List
|
|
4
4
|
|
|
5
5
|
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
6
|
-
from .tokenizer_service import TokenizerService
|
|
7
|
-
from .window_service_factory import WindowServiceFactory
|
|
8
|
-
from .test_utils import get_tokenizer_service, TEST_PROMPT
|
|
6
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
7
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
8
|
+
from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class TestT511bWindowService:
|
|
@@ -3,9 +3,9 @@ import tempfile
|
|
|
3
3
|
from typing import List
|
|
4
4
|
|
|
5
5
|
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
6
|
-
from .tokenizer_service import TokenizerService
|
|
7
|
-
from .window_service_factory import WindowServiceFactory
|
|
8
|
-
from .test_utils import get_tokenizer_service, TEST_PROMPT
|
|
6
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
7
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
8
|
+
from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class TestUL2WindowService:
|
|
@@ -4,7 +4,7 @@ from helm.common.authentication import Authentication
|
|
|
4
4
|
from helm.common.cache_backend_config import CacheBackendConfig
|
|
5
5
|
from helm.proxy.services.server_service import ServerService
|
|
6
6
|
from helm.benchmark.metrics.metric_service import MetricService
|
|
7
|
-
from .tokenizer_service import TokenizerService
|
|
7
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
TEST_PROMPT: str = (
|
|
@@ -2,9 +2,9 @@ import shutil
|
|
|
2
2
|
import tempfile
|
|
3
3
|
|
|
4
4
|
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
5
|
-
from .test_utils import get_tokenizer_service, TEST_PROMPT
|
|
6
|
-
from .tokenizer_service import TokenizerService
|
|
7
|
-
from .window_service_factory import WindowServiceFactory
|
|
5
|
+
from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
|
|
6
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
7
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
class TestYaLMWindowService:
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
from helm.common.authentication import Authentication
|
|
2
2
|
from helm.common.tokenization_request import (
|
|
3
|
-
WindowServiceInfo,
|
|
4
3
|
TokenizationRequest,
|
|
5
4
|
TokenizationRequestResult,
|
|
6
5
|
DecodeRequest,
|
|
@@ -25,7 +24,3 @@ class TokenizerService:
|
|
|
25
24
|
def decode(self, request: DecodeRequest) -> DecodeRequestResult:
|
|
26
25
|
"""Decode via an API."""
|
|
27
26
|
return self._service.decode(self._auth, request)
|
|
28
|
-
|
|
29
|
-
def get_info(self, model_name: str) -> WindowServiceInfo:
|
|
30
|
-
"""Get info via an API."""
|
|
31
|
-
return self._service.get_window_service_info(model_name)
|
helm/clients/ai21_client.py
CHANGED
|
@@ -11,8 +11,8 @@ from helm.common.request import (
|
|
|
11
11
|
GeneratedOutput,
|
|
12
12
|
Token,
|
|
13
13
|
)
|
|
14
|
-
from .client import CachingClient, truncate_sequence, cleanup_str
|
|
15
|
-
from .ai21_utils import AI21RequestError, handle_failed_request
|
|
14
|
+
from helm.clients.client import CachingClient, truncate_sequence, cleanup_str
|
|
15
|
+
from helm.clients.ai21_utils import AI21RequestError, handle_failed_request
|
|
16
16
|
|
|
17
17
|
try:
|
|
18
18
|
from ai21 import AI21Client as AISDKClient
|
|
@@ -186,7 +186,7 @@ class AI21ChatClient(CachingClient):
|
|
|
186
186
|
completions: List[GeneratedOutput] = []
|
|
187
187
|
|
|
188
188
|
for choice in response["choices"]:
|
|
189
|
-
completions.append(GeneratedOutput(text=choice["message"]["content"], logprob=0.0, tokens=[]))
|
|
189
|
+
completions.append(GeneratedOutput(text=choice["message"]["content"] or "", logprob=0.0, tokens=[]))
|
|
190
190
|
|
|
191
191
|
return RequestResult(
|
|
192
192
|
success=True,
|
|
@@ -4,7 +4,7 @@ from helm.common.cache import CacheConfig
|
|
|
4
4
|
from helm.common.media_object import TEXT_TYPE
|
|
5
5
|
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
6
6
|
from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
|
|
7
|
-
from .client import CachingClient, truncate_sequence, generate_uid_for_multimodal_prompt
|
|
7
|
+
from helm.clients.client import CachingClient, truncate_sequence, generate_uid_for_multimodal_prompt
|
|
8
8
|
|
|
9
9
|
try:
|
|
10
10
|
from aleph_alpha_client import Client, CompletionRequest, CompletionResponse, Image, Prompt
|
|
File without changes
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
import threading
|
|
2
|
+
from typing import Any, Dict, List, Optional, Tuple, TypedDict
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
from transformers import AutoModel, PreTrainedModel
|
|
6
|
+
|
|
7
|
+
from helm.clients.client import CachingClient
|
|
8
|
+
from helm.common.cache import CacheConfig
|
|
9
|
+
from helm.common.media_object import TEXT_TYPE
|
|
10
|
+
from helm.common.request import (
|
|
11
|
+
GeneratedOutput,
|
|
12
|
+
Request,
|
|
13
|
+
RequestResult,
|
|
14
|
+
wrap_request_time,
|
|
15
|
+
)
|
|
16
|
+
from helm.common.audio_utils import get_array_from_audio_file
|
|
17
|
+
from helm.proxy.retry import NonRetriableException
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
_LOCK = threading.Lock()
|
|
21
|
+
_PRE_TRAINED_MODEL: Optional[PreTrainedModel] = None
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _get_pre_trained_model(model_name: str, **kwargs) -> PreTrainedModel:
|
|
25
|
+
global _LOCK
|
|
26
|
+
global _PRE_TRAINED_MODEL
|
|
27
|
+
with _LOCK:
|
|
28
|
+
if _PRE_TRAINED_MODEL is None:
|
|
29
|
+
_PRE_TRAINED_MODEL = AutoModel.from_pretrained(model_name, **kwargs)
|
|
30
|
+
return _PRE_TRAINED_MODEL
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class DivaLlamaRequest(TypedDict):
|
|
34
|
+
"""Cache key for DivaLlamaClient"""
|
|
35
|
+
|
|
36
|
+
model: str
|
|
37
|
+
media_objects: List[Dict[str, Any]]
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class DivaLlamaClient(CachingClient):
|
|
41
|
+
SAMPLE_RATE = 16000
|
|
42
|
+
|
|
43
|
+
def __init__(
|
|
44
|
+
self,
|
|
45
|
+
cache_config: CacheConfig,
|
|
46
|
+
**kwargs,
|
|
47
|
+
):
|
|
48
|
+
super().__init__(cache_config)
|
|
49
|
+
self.pre_trained_model = _get_pre_trained_model("WillHeld/DiVA-llama-3-v0-8b", trust_remote_code=True, **kwargs)
|
|
50
|
+
|
|
51
|
+
@staticmethod
|
|
52
|
+
def _get_generate_input(request: Request) -> Tuple[np.ndarray, Optional[str]]:
|
|
53
|
+
if request.prompt:
|
|
54
|
+
raise NonRetriableException("request.prompt must be empty for DivaLlamaClient")
|
|
55
|
+
if request.embedding:
|
|
56
|
+
raise NonRetriableException("request.embedding must be empty for DivaLlamaClient")
|
|
57
|
+
if request.messages:
|
|
58
|
+
raise NonRetriableException("request.messages must be empty for DivaLlamaClient")
|
|
59
|
+
if request.multimodal_prompt is None:
|
|
60
|
+
raise NonRetriableException("request.multimodal_prompt must not be None for DivaLlamaClient")
|
|
61
|
+
text_input: Optional[str] = None
|
|
62
|
+
audio_input: Optional[np.ndarray] = None
|
|
63
|
+
for media_object in request.multimodal_prompt.media_objects:
|
|
64
|
+
if media_object.is_type("audio"):
|
|
65
|
+
if audio_input is not None:
|
|
66
|
+
raise NonRetriableException(
|
|
67
|
+
"Only one audio object allowed in request.multimodal_prompt.media_objects"
|
|
68
|
+
)
|
|
69
|
+
assert media_object.location
|
|
70
|
+
audio_input = get_array_from_audio_file(media_object.location, DivaLlamaClient.SAMPLE_RATE)
|
|
71
|
+
elif media_object.is_type(TEXT_TYPE):
|
|
72
|
+
if text_input is not None:
|
|
73
|
+
raise NonRetriableException(
|
|
74
|
+
"Only one text object allowed in request.multimodal_prompt.media_objects"
|
|
75
|
+
)
|
|
76
|
+
assert media_object.text is not None
|
|
77
|
+
text_input = media_object.text
|
|
78
|
+
else:
|
|
79
|
+
raise NonRetriableException(f"Unsupported media content type type: {media_object.content_type}")
|
|
80
|
+
if audio_input is None:
|
|
81
|
+
raise NonRetriableException(
|
|
82
|
+
"Expected a single audio object allowed in request.multimodal_prompt.media_objects"
|
|
83
|
+
)
|
|
84
|
+
return audio_input, text_input
|
|
85
|
+
|
|
86
|
+
def make_request(self, request: Request) -> RequestResult:
|
|
87
|
+
assert request.multimodal_prompt is not None
|
|
88
|
+
raw_request: DivaLlamaRequest = {
|
|
89
|
+
"model": request.model,
|
|
90
|
+
"media_objects": [media_object.to_dict() for media_object in request.multimodal_prompt.media_objects],
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
try:
|
|
94
|
+
|
|
95
|
+
def do_it() -> Dict[str, Any]:
|
|
96
|
+
with _LOCK:
|
|
97
|
+
audio_input, text_input = DivaLlamaClient._get_generate_input(request)
|
|
98
|
+
if text_input is None:
|
|
99
|
+
return {"completions": self.pre_trained_model.generate([audio_input])}
|
|
100
|
+
else:
|
|
101
|
+
return {"completions": self.pre_trained_model.generate([audio_input], [text_input])}
|
|
102
|
+
|
|
103
|
+
cache_key = CachingClient.make_cache_key(raw_request, request)
|
|
104
|
+
response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
|
|
105
|
+
except Exception as e: # Do something if error is encountered.
|
|
106
|
+
error: str = f"HuggingFace error: {e}"
|
|
107
|
+
return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
|
|
108
|
+
|
|
109
|
+
generated_output = GeneratedOutput(text=response["completions"][0], logprob=0, tokens=[])
|
|
110
|
+
|
|
111
|
+
return RequestResult(
|
|
112
|
+
success=True,
|
|
113
|
+
cached=cached,
|
|
114
|
+
request_time=response["request_time"],
|
|
115
|
+
request_datetime=response.get("request_datetime"),
|
|
116
|
+
completions=[generated_output],
|
|
117
|
+
embedding=[],
|
|
118
|
+
)
|
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
from threading import Lock
|
|
2
|
+
import torch
|
|
3
|
+
from typing import Any, Dict, List, Optional
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from transformers import AutoTokenizer
|
|
7
|
+
import whisper
|
|
8
|
+
from helm.clients.audio_language.llama_omni.model.builder import load_pretrained_model as load_llama_omni
|
|
9
|
+
from helm.clients.audio_language.llama_omni.model.language_model.omni_speech2s_llama import OmniSpeech2SLlamaForCausalLM
|
|
10
|
+
from helm.clients.audio_language.llama_omni.conversation import conv_templates, Conversation
|
|
11
|
+
from helm.clients.audio_language.llama_omni.preprocess import tokenizer_speech_token
|
|
12
|
+
|
|
13
|
+
from helm.common.cache import CacheConfig
|
|
14
|
+
from helm.common.gpu_utils import get_torch_device_name
|
|
15
|
+
from helm.common.hierarchical_logger import hlog, htrack_block
|
|
16
|
+
from helm.common.media_object import TEXT_TYPE
|
|
17
|
+
from helm.common.request import Request, RequestResult, GeneratedOutput, Token
|
|
18
|
+
from helm.common.request import wrap_request_time
|
|
19
|
+
from helm.clients.client import CachingClient, generate_uid_for_multimodal_prompt
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass(frozen=True)
|
|
23
|
+
class LoadedLlamaOmniModelProcessor:
|
|
24
|
+
"""Loaded model and processor for Qwen."""
|
|
25
|
+
|
|
26
|
+
model: OmniSpeech2SLlamaForCausalLM
|
|
27
|
+
tokenizer: AutoTokenizer
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
_models_lock: Lock = Lock()
|
|
31
|
+
_models: Dict[str, Optional[LoadedLlamaOmniModelProcessor]] = {
|
|
32
|
+
"ICTNLP/Llama-3.1-8B-Omni": None,
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class LlamaOmniAudioLMClient(CachingClient):
|
|
37
|
+
"""
|
|
38
|
+
From https://github.com/ictnlp/LLaMA-Omni,
|
|
39
|
+
LLaMA-Omni is the audio multimodal version based on the LLaMA-3.1-8B large language model,
|
|
40
|
+
developed by ICTNLP group. LLaMA-Omni accepts audio, text as inputs, and outputs text.
|
|
41
|
+
|
|
42
|
+
Paper: https://arxiv.org/abs/2409.06666
|
|
43
|
+
"""
|
|
44
|
+
|
|
45
|
+
END_OF_TEXT_TOKEN: str = "<|im_end|>"
|
|
46
|
+
CONV_MODE: str = "llama_3"
|
|
47
|
+
PAD_ID: int = 128004
|
|
48
|
+
MEL_NUM: int = 128
|
|
49
|
+
|
|
50
|
+
def __init__(self, cache_config: CacheConfig):
|
|
51
|
+
super().__init__(cache_config=cache_config)
|
|
52
|
+
self._device: str = get_torch_device_name()
|
|
53
|
+
|
|
54
|
+
def _get_model(self, helm_model_name: str) -> LoadedLlamaOmniModelProcessor:
|
|
55
|
+
global _models_lock
|
|
56
|
+
global _models
|
|
57
|
+
|
|
58
|
+
model_name: str
|
|
59
|
+
if helm_model_name == "llama-3.1-8b-omni":
|
|
60
|
+
model_name = "ICTNLP/Llama-3.1-8B-Omni"
|
|
61
|
+
else:
|
|
62
|
+
raise ValueError(f"Unhandled model name: {helm_model_name}")
|
|
63
|
+
|
|
64
|
+
# Ensure that only one thread is loading the model at a time
|
|
65
|
+
with _models_lock:
|
|
66
|
+
loaded_model_processor = _models[model_name]
|
|
67
|
+
if loaded_model_processor is None:
|
|
68
|
+
hlog(f"Loading model {model_name} and caching in memory...")
|
|
69
|
+
# Follow the official LLaMA-Omni model loading pattern:
|
|
70
|
+
# https://github.com/ictnlp/LLaMA-Omni/blob/main/omni_speech/infer/run.sh
|
|
71
|
+
tokenizer, model, _ = load_llama_omni(model_name, None, s2s=True)
|
|
72
|
+
_models[model_name] = LoadedLlamaOmniModelProcessor(model, tokenizer)
|
|
73
|
+
loaded_model_processor = _models[model_name]
|
|
74
|
+
|
|
75
|
+
assert loaded_model_processor is not None
|
|
76
|
+
return loaded_model_processor
|
|
77
|
+
|
|
78
|
+
def _load_local_audio(self, media_object) -> torch.Tensor:
|
|
79
|
+
assert media_object.is_local_file, "LLaMA-Omni only supports local audio file input"
|
|
80
|
+
audio_media = whisper.load_audio(media_object.location)
|
|
81
|
+
audio_media = whisper.pad_or_trim(audio_media)
|
|
82
|
+
audio_media = whisper.log_mel_spectrogram(audio_media, n_mels=self.MEL_NUM).permute(1, 0)
|
|
83
|
+
return audio_media
|
|
84
|
+
|
|
85
|
+
def make_request(self, request: Request) -> RequestResult:
|
|
86
|
+
assert request.multimodal_prompt is not None, "Multimodal prompt is required"
|
|
87
|
+
|
|
88
|
+
loaded_model_processor: LoadedLlamaOmniModelProcessor = self._get_model(request.model_engine)
|
|
89
|
+
model = loaded_model_processor.model
|
|
90
|
+
tokenizer = loaded_model_processor.tokenizer
|
|
91
|
+
|
|
92
|
+
# The generation configs are taken from the official LLaMA-Omni repository
|
|
93
|
+
# https://github.com/ictnlp/LLaMA-Omni/blob/main/omni_speech/infer/infer.py#L116
|
|
94
|
+
generation_args = {
|
|
95
|
+
"max_new_tokens": 25,
|
|
96
|
+
"do_sample": False,
|
|
97
|
+
"use_cache": False,
|
|
98
|
+
"pad_token_id": self.PAD_ID,
|
|
99
|
+
"streaming_unit_gen": False,
|
|
100
|
+
"top_p": None,
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
input_text_query: Dict[str, str]
|
|
104
|
+
input_audio_query: Dict[str, Any]
|
|
105
|
+
prompt_text: str = ""
|
|
106
|
+
|
|
107
|
+
for media_object in request.multimodal_prompt.media_objects:
|
|
108
|
+
if media_object.is_type("audio") and media_object.location:
|
|
109
|
+
input_audio_query = {"audio": self._load_local_audio(media_object)}
|
|
110
|
+
elif media_object.is_type(TEXT_TYPE):
|
|
111
|
+
if media_object.text is None:
|
|
112
|
+
raise ValueError("MediaObject of text type has missing text field value")
|
|
113
|
+
input_text_query = {"text": "<speech>\n" + media_object.text}
|
|
114
|
+
prompt_text += media_object.text
|
|
115
|
+
else:
|
|
116
|
+
raise ValueError(f"Unrecognized MediaObject type {media_object.type}")
|
|
117
|
+
|
|
118
|
+
completions: List[GeneratedOutput] = []
|
|
119
|
+
request_time: float = 0
|
|
120
|
+
request_datetime: Optional[int] = None
|
|
121
|
+
all_cached: bool = True
|
|
122
|
+
|
|
123
|
+
with htrack_block(f"Generating for prompt: {prompt_text}"):
|
|
124
|
+
for completion_index in range(request.num_completions):
|
|
125
|
+
try:
|
|
126
|
+
|
|
127
|
+
def do_it() -> Dict[str, Any]:
|
|
128
|
+
conv: Conversation = conv_templates[self.CONV_MODE].copy()
|
|
129
|
+
conv.append_message(conv.roles[0], input_text_query["text"])
|
|
130
|
+
conv.append_message(conv.roles[1], None)
|
|
131
|
+
query: str = conv.get_prompt()
|
|
132
|
+
# LLama-Omni requires a batch input
|
|
133
|
+
text_inputs = (
|
|
134
|
+
tokenizer_speech_token(query, tokenizer, return_tensors="pt").unsqueeze(0).to(self._device)
|
|
135
|
+
)
|
|
136
|
+
audio_inputs = (
|
|
137
|
+
input_audio_query["audio"].to(dtype=torch.float16, device=self._device).unsqueeze(0)
|
|
138
|
+
)
|
|
139
|
+
speech_length = torch.LongTensor([audio_inputs.shape[1]])
|
|
140
|
+
pred, _ = model.generate(
|
|
141
|
+
text_inputs,
|
|
142
|
+
audio_inputs,
|
|
143
|
+
speech_length,
|
|
144
|
+
None,
|
|
145
|
+
None,
|
|
146
|
+
None,
|
|
147
|
+
None,
|
|
148
|
+
None,
|
|
149
|
+
None,
|
|
150
|
+
None,
|
|
151
|
+
None,
|
|
152
|
+
False,
|
|
153
|
+
None,
|
|
154
|
+
None,
|
|
155
|
+
**generation_args,
|
|
156
|
+
)
|
|
157
|
+
completion = tokenizer.decode(pred.cpu()[0], skip_special_tokens=True)
|
|
158
|
+
tokens: List[str] = tokenizer.tokenize(completion)
|
|
159
|
+
return {"output": (completion, tokens)}
|
|
160
|
+
|
|
161
|
+
# Include the prompt and model name in the cache key
|
|
162
|
+
cache_key = CachingClient.make_cache_key(
|
|
163
|
+
raw_request={
|
|
164
|
+
"completion_index": completion_index,
|
|
165
|
+
"model": request.model,
|
|
166
|
+
"prompt": generate_uid_for_multimodal_prompt(request.multimodal_prompt),
|
|
167
|
+
**generation_args,
|
|
168
|
+
},
|
|
169
|
+
request=request,
|
|
170
|
+
)
|
|
171
|
+
result, cached = self.cache.get(cache_key, wrap_request_time(do_it))
|
|
172
|
+
except RuntimeError as model_error:
|
|
173
|
+
return RequestResult(
|
|
174
|
+
success=False, cached=False, error=str(model_error), completions=[], embedding=[]
|
|
175
|
+
)
|
|
176
|
+
|
|
177
|
+
text, tokens = result["output"]
|
|
178
|
+
|
|
179
|
+
# Tokenize truncated text to get the list of tokens
|
|
180
|
+
completions.append(
|
|
181
|
+
GeneratedOutput(
|
|
182
|
+
text=text, logprob=0, tokens=[Token(text=str(token), logprob=0) for token in tokens]
|
|
183
|
+
)
|
|
184
|
+
)
|
|
185
|
+
|
|
186
|
+
request_time += result["request_time"]
|
|
187
|
+
# Use the datetime from the first completion because that's when the request was fired
|
|
188
|
+
request_datetime = request_datetime or result.get("request_datetime")
|
|
189
|
+
all_cached = all_cached and cached
|
|
190
|
+
|
|
191
|
+
return RequestResult(
|
|
192
|
+
success=True,
|
|
193
|
+
cached=all_cached,
|
|
194
|
+
request_time=request_time,
|
|
195
|
+
request_datetime=request_datetime,
|
|
196
|
+
completions=completions,
|
|
197
|
+
embedding=[],
|
|
198
|
+
)
|