crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between the two versions as published in their respective public registries.
Potentially problematic release.
This version of crfm-helm has been flagged as potentially problematic. Consult the package registry's advisory page for more details before installing it.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +22 -11
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/harm_bench_annotator.py +11 -24
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +10 -5
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +90 -61
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +281 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +20 -30
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/safety_metrics.py +39 -17
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +21 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +119 -256
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +105 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/finance_run_specs.py +6 -2
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +83 -5
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +18 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +42 -6
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +151 -47
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +255 -21
- helm/clients/palmyra_client.py +2 -6
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +69 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +8 -30
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/key_value_store.py +9 -9
- helm/common/media_object.py +2 -2
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +1258 -33
- helm/config/model_metadata.yaml +1110 -41
- helm/config/tokenizer_configs.yaml +403 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -13
- helm/proxy/services/remote_service.py +1 -7
- helm/proxy/services/server_service.py +6 -19
- helm/proxy/services/service.py +0 -6
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.3.dist-info/METADATA +0 -355
- crfm_helm-0.5.3.dist-info/RECORD +0 -699
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
helm/clients/bedrock_client.py
CHANGED
|
@@ -2,12 +2,13 @@ from abc import abstractmethod
|
|
|
2
2
|
from copy import deepcopy
|
|
3
3
|
import json
|
|
4
4
|
import os
|
|
5
|
-
from typing import Any, Dict, List, Mapping, Optional
|
|
5
|
+
from typing import Any, Dict, List, Mapping, Optional, TypedDict
|
|
6
|
+
from datetime import datetime
|
|
6
7
|
|
|
7
8
|
from helm.common.cache import CacheConfig
|
|
8
9
|
from helm.clients.client import CachingClient, truncate_and_tokenize_response_text
|
|
9
10
|
from helm.common.request import Request, RequestResult, GeneratedOutput, wrap_request_time
|
|
10
|
-
from helm.clients.bedrock_utils import get_bedrock_client
|
|
11
|
+
from helm.clients.bedrock_utils import get_bedrock_client, get_bedrock_client_v1
|
|
11
12
|
from helm.tokenizers.tokenizer import Tokenizer
|
|
12
13
|
|
|
13
14
|
|
|
@@ -23,27 +24,41 @@ class BedrockClient(CachingClient):
|
|
|
23
24
|
def convert_raw_response_to_completions(self, response: Dict, request: Request) -> List[GeneratedOutput]:
|
|
24
25
|
raise NotImplementedError()
|
|
25
26
|
|
|
27
|
+
"""
|
|
28
|
+
Amazon Bedrock is a fully managed service that provides s selection of leading foundation models (FMs) from Amazon
|
|
29
|
+
and other partner model providers.
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
@property
|
|
33
|
+
@abstractmethod
|
|
34
|
+
def model_provider(self) -> str:
|
|
35
|
+
raise NotImplementedError()
|
|
36
|
+
|
|
26
37
|
def __init__(
|
|
27
38
|
self,
|
|
28
39
|
cache_config: CacheConfig,
|
|
29
40
|
tokenizer: Tokenizer,
|
|
30
41
|
tokenizer_name: str,
|
|
31
|
-
bedrock_model_id: Optional[str] = None,
|
|
32
42
|
assumed_role: Optional[str] = None,
|
|
33
43
|
region: Optional[str] = None,
|
|
34
44
|
):
|
|
35
45
|
super().__init__(cache_config=cache_config)
|
|
36
46
|
self.tokenizer = tokenizer
|
|
37
47
|
self.tokenizer_name = tokenizer_name
|
|
38
|
-
self.bedrock_model_id = bedrock_model_id
|
|
39
48
|
self.bedrock_client = get_bedrock_client(
|
|
40
49
|
assumed_role=assumed_role or os.environ.get("BEDROCK_ASSUME_ROLE", None),
|
|
41
|
-
region=region
|
|
50
|
+
region=region,
|
|
42
51
|
)
|
|
43
52
|
|
|
44
53
|
def make_request(self, request: Request) -> RequestResult:
|
|
45
|
-
# model_id should be something like "amazon.titan-tg1-large"
|
|
46
|
-
|
|
54
|
+
# model_id should be something like "amazon.titan-tg1-large", replace amazon- prefix with model creator name
|
|
55
|
+
model_name = request.model.split("/")[-1]
|
|
56
|
+
# check if model_name starts with "amazon-"
|
|
57
|
+
if self.model_provider == "amazon":
|
|
58
|
+
model_id = f"{self.model_provider}.{model_name}"
|
|
59
|
+
else:
|
|
60
|
+
model_id = model_name.replace("amazon-", f"{self.model_provider}.")
|
|
61
|
+
|
|
47
62
|
raw_request = self.convert_request_to_raw_request(request)
|
|
48
63
|
|
|
49
64
|
# modelId isn't part of raw_request, so it must be explicitly passed into the input to
|
|
@@ -58,6 +73,7 @@ class BedrockClient(CachingClient):
|
|
|
58
73
|
|
|
59
74
|
try:
|
|
60
75
|
response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
|
|
76
|
+
|
|
61
77
|
except Exception as error:
|
|
62
78
|
return RequestResult(
|
|
63
79
|
success=False,
|
|
@@ -79,12 +95,109 @@ class BedrockClient(CachingClient):
|
|
|
79
95
|
)
|
|
80
96
|
|
|
81
97
|
|
|
98
|
+
class _ContentBlock(TypedDict):
|
|
99
|
+
text: str
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class _Message(TypedDict):
|
|
103
|
+
role: str
|
|
104
|
+
content: List[_ContentBlock]
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class BedrockNovaClient(CachingClient):
|
|
108
|
+
"""
|
|
109
|
+
Amazon Bedrock is a fully managed service that provides s selection of leading foundation models (FMs) from Amazon
|
|
110
|
+
and other partner model providers.
|
|
111
|
+
"""
|
|
112
|
+
|
|
113
|
+
def __init__(
|
|
114
|
+
self,
|
|
115
|
+
cache_config: CacheConfig,
|
|
116
|
+
tokenizer: Tokenizer,
|
|
117
|
+
tokenizer_name: str,
|
|
118
|
+
assumed_role: Optional[str] = None,
|
|
119
|
+
region: Optional[str] = None,
|
|
120
|
+
):
|
|
121
|
+
super().__init__(cache_config=cache_config)
|
|
122
|
+
self.tokenizer = tokenizer
|
|
123
|
+
self.tokenizer_name = tokenizer_name
|
|
124
|
+
self.bedrock_client = get_bedrock_client_v1(
|
|
125
|
+
assumed_role=assumed_role or os.environ.get("BEDROCK_ASSUME_ROLE", None),
|
|
126
|
+
region=region,
|
|
127
|
+
)
|
|
128
|
+
|
|
129
|
+
def _get_messages_from_request(self, request: Request) -> List[_Message]:
|
|
130
|
+
if request.prompt and request.messages:
|
|
131
|
+
raise ValueError(f"Only one of `prompt` and `messages` may be set in request: {request}")
|
|
132
|
+
if request.multimodal_prompt:
|
|
133
|
+
raise ValueError(f"`multimodal_prompt` is not supported in request: {request}")
|
|
134
|
+
|
|
135
|
+
if request.messages:
|
|
136
|
+
return [
|
|
137
|
+
{"role": message["role"], "content": [{"text": message["content"]}]} for message in request.messages
|
|
138
|
+
]
|
|
139
|
+
else:
|
|
140
|
+
return [{"role": "user", "content": [{"text": request.prompt}]}]
|
|
141
|
+
|
|
142
|
+
def convert_request_to_raw_request(self, request: Request) -> Dict:
|
|
143
|
+
model_id = request.model.replace("/", ".")
|
|
144
|
+
messages = self._get_messages_from_request(request)
|
|
145
|
+
|
|
146
|
+
return {
|
|
147
|
+
"modelId": model_id,
|
|
148
|
+
"inferenceConfig": {
|
|
149
|
+
"temperature": request.temperature,
|
|
150
|
+
"maxTokens": request.max_tokens,
|
|
151
|
+
"topP": request.top_p,
|
|
152
|
+
},
|
|
153
|
+
"messages": messages,
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
def make_request(self, request: Request) -> RequestResult:
|
|
157
|
+
raw_request = self.convert_request_to_raw_request(request)
|
|
158
|
+
cache_key = CachingClient.make_cache_key(raw_request, request)
|
|
159
|
+
|
|
160
|
+
def do_it() -> Dict[Any, Any]:
|
|
161
|
+
return self.bedrock_client.converse(**raw_request)
|
|
162
|
+
|
|
163
|
+
response, cached = self.cache.get(cache_key, do_it)
|
|
164
|
+
|
|
165
|
+
completions = self.convert_raw_response_to_completions(response, request)
|
|
166
|
+
dt = datetime.strptime(response["ResponseMetadata"]["HTTPHeaders"]["date"], "%a, %d %b %Y %H:%M:%S GMT")
|
|
167
|
+
# Use API reported latency rather than client measured latency
|
|
168
|
+
request_time = response["metrics"]["latencyMs"] / 1000
|
|
169
|
+
|
|
170
|
+
return RequestResult(
|
|
171
|
+
success=True,
|
|
172
|
+
cached=cached,
|
|
173
|
+
request_time=request_time,
|
|
174
|
+
request_datetime=int(dt.timestamp()),
|
|
175
|
+
completions=completions,
|
|
176
|
+
embedding=[],
|
|
177
|
+
)
|
|
178
|
+
|
|
179
|
+
def convert_raw_response_to_completions(self, response: Dict, request: Request) -> List[GeneratedOutput]:
|
|
180
|
+
completions: List[GeneratedOutput] = []
|
|
181
|
+
raw_completion = response["output"]
|
|
182
|
+
output_text = raw_completion["message"]["content"][0]["text"]
|
|
183
|
+
finish_reason = response["stopReason"]
|
|
184
|
+
completion = truncate_and_tokenize_response_text(
|
|
185
|
+
output_text.lstrip(), request, self.tokenizer, self.tokenizer_name, finish_reason
|
|
186
|
+
)
|
|
187
|
+
completions.append(completion)
|
|
188
|
+
return completions
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
# Amazon Bedrock Client for Titan Models
|
|
82
192
|
class BedrockTitanClient(BedrockClient):
|
|
83
193
|
_COMPLETION_REASON_TO_FINISH_REASON = {
|
|
84
194
|
"LENGTH": "length",
|
|
85
195
|
"FINISH": "endoftext",
|
|
86
196
|
}
|
|
87
197
|
|
|
198
|
+
# creator org for titan
|
|
199
|
+
model_provider = "amazon"
|
|
200
|
+
|
|
88
201
|
def convert_request_to_raw_request(self, request: Request) -> Dict:
|
|
89
202
|
# TODO: Support the following:
|
|
90
203
|
# - top_k_per_token
|
|
@@ -115,6 +228,7 @@ class BedrockTitanClient(BedrockClient):
|
|
|
115
228
|
# - tokens
|
|
116
229
|
# - logprob
|
|
117
230
|
completions: List[GeneratedOutput] = []
|
|
231
|
+
|
|
118
232
|
for raw_completion in response["results"]:
|
|
119
233
|
output_text = raw_completion["outputText"]
|
|
120
234
|
# Call lstrip() Titan has the tendency to emit "\n" as the first token in the generated text output.
|
|
@@ -126,3 +240,83 @@ class BedrockTitanClient(BedrockClient):
|
|
|
126
240
|
)
|
|
127
241
|
completions.append(completion)
|
|
128
242
|
return completions
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
# Amazon Bedrock Client for Mistral Models
|
|
246
|
+
class BedrockMistralClient(BedrockClient):
|
|
247
|
+
_COMPLETION_REASON_TO_FINISH_REASON = {
|
|
248
|
+
"length": "length",
|
|
249
|
+
"stop": "endoftext",
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
model_provider = "mistral"
|
|
253
|
+
|
|
254
|
+
def convert_request_to_raw_request(self, request: Request) -> Dict:
|
|
255
|
+
# TODO: Support the following:
|
|
256
|
+
# - top_k_per_token
|
|
257
|
+
# - echo_prompt
|
|
258
|
+
# - num_completions
|
|
259
|
+
return {
|
|
260
|
+
"prompt": f"[INST]{request.prompt}[/INST]",
|
|
261
|
+
"temperature": request.temperature,
|
|
262
|
+
"top_p": request.top_p,
|
|
263
|
+
"max_tokens": request.max_tokens,
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
def convert_raw_response_to_completions(self, response: Dict, request: Request) -> List[GeneratedOutput]:
|
|
267
|
+
# - logprob
|
|
268
|
+
completions: List[GeneratedOutput] = []
|
|
269
|
+
|
|
270
|
+
for raw_completion in response["outputs"]:
|
|
271
|
+
output_text = raw_completion["text"]
|
|
272
|
+
|
|
273
|
+
finish_reason = BedrockMistralClient._COMPLETION_REASON_TO_FINISH_REASON.get(
|
|
274
|
+
raw_completion["stop_reason"], raw_completion["stop_reason"].lower()
|
|
275
|
+
)
|
|
276
|
+
# Work around generated outputs with leading whitespace due to issue #2467
|
|
277
|
+
# TODO(#2467): Remove workaround
|
|
278
|
+
completion = truncate_and_tokenize_response_text(
|
|
279
|
+
output_text.lstrip(), request, self.tokenizer, self.tokenizer_name, finish_reason
|
|
280
|
+
)
|
|
281
|
+
completions.append(completion)
|
|
282
|
+
|
|
283
|
+
return completions
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
# Amazon Bedrock Client for LLAMA Models
|
|
287
|
+
class BedrockLlamaClient(BedrockClient):
|
|
288
|
+
_COMPLETION_REASON_TO_FINISH_REASON = {
|
|
289
|
+
"length": "length",
|
|
290
|
+
"stop": "endoftext",
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
model_provider = "meta"
|
|
294
|
+
|
|
295
|
+
def convert_request_to_raw_request(self, request: Request) -> Dict:
|
|
296
|
+
# TODO: Support the following:
|
|
297
|
+
# - top_k_per_token
|
|
298
|
+
# - echo_prompt
|
|
299
|
+
# - num_completions
|
|
300
|
+
return {
|
|
301
|
+
"prompt": f"[INST]{request.prompt}[/INST]",
|
|
302
|
+
"temperature": request.temperature,
|
|
303
|
+
"top_p": request.top_p,
|
|
304
|
+
"max_gen_len": request.max_tokens,
|
|
305
|
+
}
|
|
306
|
+
|
|
307
|
+
def convert_raw_response_to_completions(self, response: Dict, request: Request) -> List[GeneratedOutput]:
|
|
308
|
+
# - logprob
|
|
309
|
+
completions: List[GeneratedOutput] = []
|
|
310
|
+
output_text = response["generation"]
|
|
311
|
+
|
|
312
|
+
finish_reason = BedrockLlamaClient._COMPLETION_REASON_TO_FINISH_REASON.get(
|
|
313
|
+
response["stop_reason"], response["stop_reason"].lower()
|
|
314
|
+
)
|
|
315
|
+
# Work around generated outputs with leading whitespace due to issue #2467
|
|
316
|
+
# TODO(#2467): Remove workaround
|
|
317
|
+
completion = truncate_and_tokenize_response_text(
|
|
318
|
+
output_text.lstrip(), request, self.tokenizer, self.tokenizer_name, finish_reason
|
|
319
|
+
)
|
|
320
|
+
completions.append(completion)
|
|
321
|
+
|
|
322
|
+
return completions
|
helm/clients/bedrock_utils.py
CHANGED
|
@@ -8,6 +8,7 @@ from helm.common.optional_dependencies import handle_module_not_found_error
|
|
|
8
8
|
|
|
9
9
|
try:
|
|
10
10
|
import boto3
|
|
11
|
+
from boto3 import Session
|
|
11
12
|
from botocore.config import Config
|
|
12
13
|
except ModuleNotFoundError as e:
|
|
13
14
|
handle_module_not_found_error(e, ["aws"])
|
|
@@ -70,3 +71,35 @@ def get_bedrock_client(
|
|
|
70
71
|
|
|
71
72
|
hlog(f"Amazon Bedrock client successfully created with endpoint {bedrock_client._endpoint}")
|
|
72
73
|
return bedrock_client
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def get_bedrock_client_v1(
|
|
77
|
+
region: Optional[str] = None,
|
|
78
|
+
service_name: str = "bedrock-runtime",
|
|
79
|
+
assumed_role: Optional[str] = None,
|
|
80
|
+
read_timeout: int = 5000,
|
|
81
|
+
connect_timeout: int = 5000,
|
|
82
|
+
max_attempts: int = 10,
|
|
83
|
+
):
|
|
84
|
+
boto_config = Config(
|
|
85
|
+
read_timeout=read_timeout, connect_timeout=connect_timeout, retries={"max_attempts": max_attempts}
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
if assumed_role:
|
|
89
|
+
session = boto3.Session(region_name=region)
|
|
90
|
+
# Assume role and get credentials
|
|
91
|
+
sts = session.client("sts")
|
|
92
|
+
creds = sts.assume_role(RoleArn=str(assumed_role), RoleSessionName="crfm-helm")["Credentials"]
|
|
93
|
+
session = Session(
|
|
94
|
+
aws_access_key_id=creds["AccessKeyId"],
|
|
95
|
+
aws_secret_access_key=creds["SecretAccessKey"],
|
|
96
|
+
aws_session_token=creds["SessionToken"],
|
|
97
|
+
)
|
|
98
|
+
return session.client(
|
|
99
|
+
service_name=service_name,
|
|
100
|
+
region_name=region,
|
|
101
|
+
config=boto_config,
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
# default to instance role to get the aws credentials or aws configured credentials
|
|
105
|
+
return boto3.client(service_name=service_name, region_name=region, config=boto_config)
|
|
@@ -6,7 +6,7 @@ import torch
|
|
|
6
6
|
from helm.common.gpu_utils import get_torch_device
|
|
7
7
|
from helm.common.images_utils import open_image
|
|
8
8
|
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
9
|
-
from .base_clip_scorer import BaseCLIPScorer
|
|
9
|
+
from helm.clients.clip_scorers.base_clip_scorer import BaseCLIPScorer
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
_ = torch.manual_seed(42)
|
|
@@ -4,7 +4,7 @@ import transformers
|
|
|
4
4
|
from helm.common.gpu_utils import get_torch_device, get_torch_device_name
|
|
5
5
|
from helm.common.images_utils import open_image
|
|
6
6
|
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
7
|
-
from .base_clip_scorer import BaseCLIPScorer
|
|
7
|
+
from helm.clients.clip_scorers.base_clip_scorer import BaseCLIPScorer
|
|
8
8
|
|
|
9
9
|
_ = torch.manual_seed(42)
|
|
10
10
|
|
helm/clients/cohere_client.py
CHANGED
|
@@ -164,12 +164,12 @@ class CohereRawChatRequest(TypedDict):
|
|
|
164
164
|
message: str
|
|
165
165
|
model: Optional[str]
|
|
166
166
|
preamble: Optional[str]
|
|
167
|
-
chat_history: Optional[Sequence[cohere.
|
|
167
|
+
chat_history: Optional[Sequence[cohere.ChatbotMessage]]
|
|
168
168
|
temperature: Optional[float]
|
|
169
169
|
max_tokens: Optional[int]
|
|
170
170
|
k: Optional[int]
|
|
171
171
|
p: Optional[float]
|
|
172
|
-
seed: Optional[
|
|
172
|
+
seed: Optional[int]
|
|
173
173
|
stop_sequences: Optional[Sequence[str]]
|
|
174
174
|
frequency_penalty: Optional[float]
|
|
175
175
|
presence_penalty: Optional[float]
|
|
@@ -188,7 +188,7 @@ def convert_to_raw_chat_request(request: Request) -> CohereRawChatRequest:
|
|
|
188
188
|
"k": request.top_k_per_token,
|
|
189
189
|
"p": request.top_p,
|
|
190
190
|
"stop_sequences": request.stop_sequences,
|
|
191
|
-
"seed":
|
|
191
|
+
"seed": int(request.random) if request.random is not None else None,
|
|
192
192
|
"frequency_penalty": request.frequency_penalty,
|
|
193
193
|
"presence_penalty": request.presence_penalty,
|
|
194
194
|
}
|
helm/clients/google_client.py
CHANGED
|
@@ -2,7 +2,7 @@ from typing import List, Dict
|
|
|
2
2
|
|
|
3
3
|
from helm.common.cache import CacheConfig
|
|
4
4
|
from helm.common.request import Request, RequestResult, GeneratedOutput, Token
|
|
5
|
-
from .client import CachingClient, truncate_sequence
|
|
5
|
+
from helm.clients.client import CachingClient, truncate_sequence
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
class GoogleClient(CachingClient):
|
|
@@ -19,7 +19,7 @@ from helm.common.request import (
|
|
|
19
19
|
Token,
|
|
20
20
|
)
|
|
21
21
|
from helm.tokenizers.tokenizer import Tokenizer
|
|
22
|
-
from .client import CachingClient, truncate_sequence
|
|
22
|
+
from helm.clients.client import CachingClient, truncate_sequence
|
|
23
23
|
from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer, WrappedPreTrainedTokenizer
|
|
24
24
|
from threading import Lock
|
|
25
25
|
|
|
@@ -59,17 +59,23 @@ class HuggingFaceServer:
|
|
|
59
59
|
self,
|
|
60
60
|
pretrained_model_name_or_path: str,
|
|
61
61
|
wrapped_tokenizer: WrappedPreTrainedTokenizer,
|
|
62
|
-
openvino: bool = False,
|
|
63
62
|
**kwargs,
|
|
64
63
|
):
|
|
65
64
|
self.device: Optional[str]
|
|
66
65
|
if "device_map" in kwargs:
|
|
66
|
+
if "device" in kwargs:
|
|
67
|
+
raise ValueError("At most one of one of `device` and `device_map` may be specified.")
|
|
67
68
|
try:
|
|
68
69
|
import accelerate # noqa: F401
|
|
69
70
|
except ModuleNotFoundError as e:
|
|
70
71
|
handle_module_not_found_error(e, ["accelerate"])
|
|
71
|
-
hlog(f'Hugging Face device_map set to "{kwargs["device_map"]}".')
|
|
72
|
+
hlog(f'Hugging Face device_map set to "{kwargs["device_map"]}" from kwargs.')
|
|
72
73
|
self.device = None
|
|
74
|
+
elif "device" in kwargs:
|
|
75
|
+
if "device_map" in kwargs:
|
|
76
|
+
raise ValueError("At most one of one of `device` and `device_map` may be specified.")
|
|
77
|
+
hlog(f'Hugging Face device set to "{kwargs["device"]}" from kwargs.')
|
|
78
|
+
self.device = kwargs.pop("device")
|
|
73
79
|
elif torch.cuda.is_available():
|
|
74
80
|
hlog('Hugging Face device set to "cuda:0" because CUDA is available.')
|
|
75
81
|
self.device = "cuda:0"
|
|
@@ -85,20 +91,7 @@ class HuggingFaceServer:
|
|
|
85
91
|
|
|
86
92
|
with htrack_block(f"Loading Hugging Face model {pretrained_model_name_or_path}"):
|
|
87
93
|
# WARNING this may fail if your GPU does not have enough memory
|
|
88
|
-
if
|
|
89
|
-
# Optimum Intel provides a simple interface to optimize Transformer models and convert them to \
|
|
90
|
-
# OpenVINO™ Intermediate Representation (IR) format to accelerate end-to-end pipelines on \
|
|
91
|
-
# Intel® architectures using OpenVINO™ runtime.
|
|
92
|
-
try:
|
|
93
|
-
from optimum.intel.openvino import OVModelForCausalLM
|
|
94
|
-
except ModuleNotFoundError as e:
|
|
95
|
-
handle_module_not_found_error(e, ["openvino"])
|
|
96
|
-
|
|
97
|
-
self.device = "cpu"
|
|
98
|
-
self.model = OVModelForCausalLM.from_pretrained(
|
|
99
|
-
pretrained_model_name_or_path, export=True, **kwargs
|
|
100
|
-
).to(self.device)
|
|
101
|
-
elif self.device is None:
|
|
94
|
+
if self.device is None:
|
|
102
95
|
# kwargs contains device_map=auto
|
|
103
96
|
# Do not call to() because accelerate will take care of model device placement.
|
|
104
97
|
self.model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **kwargs)
|
|
@@ -113,7 +106,6 @@ class HuggingFaceServer:
|
|
|
113
106
|
encoded_input = tokenizer(raw_request["prompt"], return_tensors="pt", return_token_type_ids=False).to(
|
|
114
107
|
0 if self.device is None else self.device
|
|
115
108
|
)
|
|
116
|
-
|
|
117
109
|
stopping_criteria: Optional[StoppingCriteriaList] = None
|
|
118
110
|
optional_args = {}
|
|
119
111
|
if len(raw_request["stop_sequences"]) > 0:
|