crfm-helm 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +218 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +4 -1
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/summarize.py +23 -10
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +78 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +75 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +109 -36
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +241 -22
- helm/clients/palmyra_client.py +1 -4
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +47 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1112 -19
- helm/config/model_metadata.yaml +985 -44
- helm/config/tokenizer_configs.yaml +379 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +1 -1
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
helm/clients/mistral_client.py
CHANGED
@@ -1,16 +1,17 @@
 import requests
-from typing import Any, Dict, List, Optional, TypedDict
+from typing import Any, Dict, List, Optional, TypedDict, Union

 from helm.proxy.retry import NonRetriableException
 from helm.common.cache import CacheConfig
+from helm.common.media_object import IMAGE_TYPE, TEXT_TYPE
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput
 from helm.tokenizers.tokenizer import Tokenizer
-from .client import CachingClient, truncate_and_tokenize_response_text
+from helm.clients.client import CachingClient, truncate_and_tokenize_response_text

 try:
-    from mistralai…                 [removed line truncated in source]
-    from mistralai.models…          [removed line truncated in source]
+    from mistralai import Mistral
+    from mistralai.models import ChatCompletionResponse
 except ModuleNotFoundError as e:
     handle_module_not_found_error(e, ["mistral"])

@@ -19,11 +20,14 @@ class MistralAIRequest(TypedDict):
     """Data passed between make_request and _send_request. Used as the cache key."""

     model: str
-    prompt…                         [removed line truncated in source]
+    # The prompt can be either a string or a list of messages that can be multimodal
+    prompt: Optional[Union[str, List[Dict[str, str]]]]
+    messages: Optional[List[Dict[str, Any]]]
     max_tokens: int
     temperature: float
     top_p: float
     random_seed: Optional[int]
+    stop: Optional[List[str]]


 class MistralAIClient(CachingClient):

@@ -43,21 +47,24 @@ class MistralAIClient(CachingClient):
         self.api_key: str = api_key
         self.tokenizer = tokenizer
         self.tokenizer_name = tokenizer_name
-        self._client =…             [removed line truncated in source]
+        self._client = Mistral(api_key=self.api_key)

[diff continues; the remainder of this hunk and file was truncated in the source]
|
|
47
51
|
self.mistral_model = mistral_model
|
|
48
52
|
|
|
49
53
|
def _send_request(self, raw_request: MistralAIRequest) -> Dict[str, Any]:
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
54
|
+
if raw_request["messages"] is not None:
|
|
55
|
+
messages = raw_request["messages"]
|
|
56
|
+
else:
|
|
57
|
+
messages = [{"role": "user", "content": raw_request["prompt"]}]
|
|
58
|
+
chat_response: Optional[ChatCompletionResponse] = self._client.chat.complete(
|
|
53
59
|
model=raw_request["model"],
|
|
54
|
-
messages=messages,
|
|
60
|
+
messages=messages, # type: ignore
|
|
55
61
|
temperature=raw_request["temperature"],
|
|
56
62
|
max_tokens=raw_request["max_tokens"],
|
|
57
63
|
top_p=raw_request["top_p"],
|
|
58
64
|
random_seed=raw_request["random_seed"],
|
|
59
65
|
safe_prompt=False, # Disable safe_prompt
|
|
60
66
|
)
|
|
67
|
+
assert chat_response is not None
|
|
61
68
|
# Documentation: "If mode is 'json', the output will only contain JSON serializable types."
|
|
62
69
|
# Source: https://docs.pydantic.dev/latest/api/base_model/#pydantic.BaseModel.model_dump
|
|
63
70
|
#
|
|
@@ -86,17 +93,54 @@ class MistralAIClient(CachingClient):
|
|
|
86
93
|
"""Make a request"""
|
|
87
94
|
completions: List[GeneratedOutput] = []
|
|
88
95
|
|
|
96
|
+
prompt: Union[str, List[Dict[str, str]]] = request.prompt
|
|
97
|
+
if request.multimodal_prompt:
|
|
98
|
+
# Following https://docs.mistral.ai/capabilities/vision
|
|
99
|
+
multimodal_content: List[Dict[str, str]] = []
|
|
100
|
+
for media_object in request.multimodal_prompt.media_objects:
|
|
101
|
+
if media_object.is_type(IMAGE_TYPE) and media_object.location:
|
|
102
|
+
assert media_object.location
|
|
103
|
+
if media_object.is_local_file:
|
|
104
|
+
from helm.common.images_utils import encode_base64
|
|
105
|
+
|
|
106
|
+
base64_image: str = encode_base64(media_object.location)
|
|
107
|
+
image_url = f"data:image/jpeg;base64,{base64_image}"
|
|
108
|
+
else:
|
|
109
|
+
image_url = media_object.location
|
|
110
|
+
multimodal_content.append({"type": "image_url", "image_url": image_url})
|
|
111
|
+
elif media_object.is_type(TEXT_TYPE):
|
|
112
|
+
assert media_object.text
|
|
113
|
+
multimodal_content.append({"type": "text", "text": media_object.text})
|
|
114
|
+
else:
|
|
115
|
+
raise ValueError(f"Unrecognized MediaObject type {media_object.type}")
|
|
116
|
+
|
|
117
|
+
prompt = multimodal_content
|
|
118
|
+
|
|
89
119
|
# `num_completions` is not supported, so instead make `num_completions` separate requests.
|
|
90
120
|
for completion_index in range(request.num_completions):
|
|
91
121
|
try:
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
122
|
+
if request.messages:
|
|
123
|
+
raw_request: MistralAIRequest = {
|
|
124
|
+
"model": self.mistral_model or request.model_engine,
|
|
125
|
+
"prompt": None,
|
|
126
|
+
"messages": request.messages,
|
|
127
|
+
"max_tokens": request.max_tokens,
|
|
128
|
+
"temperature": request.temperature,
|
|
129
|
+
"top_p": request.top_p,
|
|
130
|
+
"random_seed": self._get_random_seed(request, completion_index),
|
|
131
|
+
"stop": request.stop_sequences or None,
|
|
132
|
+
}
|
|
133
|
+
else:
|
|
134
|
+
raw_request = {
|
|
135
|
+
"model": self.mistral_model or request.model_engine,
|
|
136
|
+
"prompt": prompt,
|
|
137
|
+
"messages": None,
|
|
138
|
+
"max_tokens": request.max_tokens,
|
|
139
|
+
"temperature": request.temperature,
|
|
140
|
+
"top_p": request.top_p,
|
|
141
|
+
"random_seed": self._get_random_seed(request, completion_index),
|
|
142
|
+
"stop": request.stop_sequences or None,
|
|
143
|
+
}
|
|
100
144
|
|
|
101
145
|
def do_it() -> Dict[str, Any]:
|
|
102
146
|
result: Dict[str, Any] = self._send_request(raw_request)
|
helm/clients/openai_client.py
CHANGED
|
@@ -1,18 +1,20 @@
|
|
|
1
1
|
# mypy: check_untyped_defs = False
|
|
2
2
|
from dataclasses import replace
|
|
3
|
-
from typing import Any, Dict, List, Optional, cast, Union
|
|
3
|
+
from typing import Any, Dict, List, Optional, cast, Union, Callable
|
|
4
4
|
|
|
5
5
|
from helm.benchmark.model_metadata_registry import is_vlm
|
|
6
|
+
from helm.common import multimodal_request_utils
|
|
6
7
|
from helm.common.cache import CacheConfig
|
|
7
|
-
from helm.common.media_object import TEXT_TYPE
|
|
8
|
-
from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
|
|
8
|
+
from helm.common.media_object import TEXT_TYPE, MultimediaObject
|
|
9
|
+
from helm.common.request import ErrorFlags, wrap_request_time, Request, RequestResult, GeneratedOutput, Token
|
|
9
10
|
from helm.common.hierarchical_logger import hlog
|
|
11
|
+
from helm.common.object_spec import get_class_by_name
|
|
10
12
|
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
11
13
|
from helm.common.tokenization_request import (
|
|
12
14
|
TokenizationRequest,
|
|
13
15
|
TokenizationRequestResult,
|
|
14
16
|
)
|
|
15
|
-
from .client import CachingClient, truncate_sequence, generate_uid_for_multimodal_prompt
|
|
17
|
+
from helm.clients.client import Client, CachingClient, truncate_sequence, generate_uid_for_multimodal_prompt
|
|
16
18
|
from helm.tokenizers.tokenizer import Tokenizer
|
|
17
19
|
|
|
18
20
|
try:
|
|
@@ -27,6 +29,19 @@ class OpenAIClient(CachingClient):
|
|
|
27
29
|
|
|
28
30
|
# Error OpenAI throws when the image in the prompt violates their content policy
|
|
29
31
|
INAPPROPRIATE_IMAGE_ERROR: str = "Your input image may contain content that is not allowed by our safety system"
|
|
32
|
+
INAPPROPRIATE_PROMPT_ERROR: str = "Invalid prompt: your prompt was flagged"
|
|
33
|
+
INAPPROPRIATE_PROMPT_AZURE_ERROR: str = (
|
|
34
|
+
"The response was filtered due to the prompt triggering Azure OpenAI's content management policy."
|
|
35
|
+
)
|
|
36
|
+
INAPPROPRIATE_PROMPT_MICROSOFT_ERROR: str = (
|
|
37
|
+
"The response was filtered due to the prompt triggering Microsoft's content management policy."
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
# OpenAI server error
|
|
41
|
+
OPENAI_SERVER_ERROR: str = (
|
|
42
|
+
"The server had an error processing your request. Sorry about that! You can retry your request, "
|
|
43
|
+
"or contact us through our help center at help.openai.com if you keep seeing this error."
|
|
44
|
+
)
|
|
30
45
|
|
|
31
46
|
# Set the finish reason to this if the prompt violates OpenAI's content policy
|
|
32
47
|
CONTENT_POLICY_VIOLATED_FINISH_REASON: str = (
|
|
@@ -42,28 +57,31 @@ class OpenAIClient(CachingClient):
|
|
|
42
57
|
api_key: Optional[str] = None,
|
|
43
58
|
org_id: Optional[str] = None,
|
|
44
59
|
base_url: Optional[str] = None,
|
|
60
|
+
reasoning_effort: Optional[str] = None,
|
|
61
|
+
openai_model_name: Optional[str] = None,
|
|
62
|
+
output_processor: Optional[str] = None,
|
|
45
63
|
):
|
|
46
64
|
super().__init__(cache_config=cache_config)
|
|
47
65
|
self.tokenizer = tokenizer
|
|
48
66
|
self.tokenizer_name = tokenizer_name
|
|
49
67
|
self.client = OpenAI(api_key=api_key, organization=org_id, base_url=base_url)
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
return True
|
|
56
|
-
return False
|
|
68
|
+
self.reasoning_effort = reasoning_effort
|
|
69
|
+
self.openai_model_name = openai_model_name
|
|
70
|
+
self.output_processor: Optional[Callable[[str], str]] = (
|
|
71
|
+
get_class_by_name(output_processor) if output_processor else None
|
|
72
|
+
)
|
|
57
73
|
|
|
58
74
|
def _get_model_for_request(self, request: Request) -> str:
|
|
59
|
-
return request.model_engine
|
|
75
|
+
return self.openai_model_name or request.model_engine
|
|
60
76
|
|
|
61
77
|
def _get_cache_key(self, raw_request: Dict, request: Request):
|
|
62
78
|
cache_key = CachingClient.make_cache_key(raw_request, request)
|
|
63
79
|
if request.multimodal_prompt:
|
|
64
80
|
prompt_key: str = generate_uid_for_multimodal_prompt(request.multimodal_prompt)
|
|
65
81
|
cache_key = {**cache_key, "multimodal_prompt": prompt_key}
|
|
66
|
-
|
|
82
|
+
|
|
83
|
+
if "messages" in cache_key:
|
|
84
|
+
del cache_key["messages"]
|
|
67
85
|
return cache_key
|
|
68
86
|
|
|
69
87
|
def _make_embedding_request(self, request: Request) -> RequestResult:
|
|
@@ -140,6 +158,19 @@ class OpenAIClient(CachingClient):
|
|
|
140
158
|
base64_image: str = encode_base64(media_object.location)
|
|
141
159
|
image_object: Dict[str, str] = {"url": f"data:image/jpeg;base64,{base64_image}"}
|
|
142
160
|
content.append({"type": "image_url", "image_url": image_object})
|
|
161
|
+
elif media_object.is_type("audio") and media_object.location:
|
|
162
|
+
base64_audio: str = multimodal_request_utils.get_contents_as_base64(media_object.location)
|
|
163
|
+
format: str = media_object.content_type.split("/")[1]
|
|
164
|
+
if format == "mpeg":
|
|
165
|
+
# OpenAI expects "mp3" for mpeg audio
|
|
166
|
+
format = "mp3"
|
|
167
|
+
|
|
168
|
+
content.append(
|
|
169
|
+
{
|
|
170
|
+
"type": "input_audio",
|
|
171
|
+
"input_audio": {"data": base64_audio, "format": format},
|
|
172
|
+
}
|
|
173
|
+
)
|
|
143
174
|
elif media_object.is_type(TEXT_TYPE):
|
|
144
175
|
content.append({"type": media_object.type, "text": media_object.text})
|
|
145
176
|
else:
|
|
@@ -164,15 +195,35 @@ class OpenAIClient(CachingClient):
|
|
|
164
195
|
"frequency_penalty": request.frequency_penalty,
|
|
165
196
|
}
|
|
166
197
|
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
198
|
+
if request.response_format and request.response_format.json_schema:
|
|
199
|
+
# Copy and modify JSON schema to conform to OpenAI's requirements
|
|
200
|
+
json_schema = dict(request.response_format.json_schema)
|
|
201
|
+
|
|
202
|
+
# additionalProperties: false must always be set in objects
|
|
203
|
+
# https://platform.openai.com/docs/guides/structured-outputs#additionalproperties-false-must-always-be-set-in-objects
|
|
204
|
+
if "additionalProperties" not in json_schema:
|
|
205
|
+
json_schema["additionalProperties"] = False
|
|
206
|
+
|
|
207
|
+
# All fields must be required
|
|
208
|
+
# https://platform.openai.com/docs/guides/structured-outputs#all-fields-must-be-required
|
|
209
|
+
if "required" not in json_schema:
|
|
210
|
+
json_schema["required"] = list(json_schema["properties"].keys())
|
|
211
|
+
|
|
212
|
+
raw_request["response_format"] = {
|
|
213
|
+
"type": "json_schema",
|
|
214
|
+
"json_schema": {
|
|
215
|
+
"name": "response",
|
|
216
|
+
"description": "",
|
|
217
|
+
"schema": json_schema,
|
|
218
|
+
"strict": True,
|
|
219
|
+
},
|
|
220
|
+
}
|
|
171
221
|
|
|
172
222
|
# Special handling for o1 models.
|
|
173
223
|
# Refer to the "Reasoning models" documentation further discussion of o1 model limitations:
|
|
174
224
|
# https://platform.openai.com/docs/guides/reasoning
|
|
175
|
-
|
|
225
|
+
model_engine: str = request.model_engine
|
|
226
|
+
if model_engine.startswith("o1") or model_engine.startswith("o3"):
|
|
176
227
|
# Avoid error:
|
|
177
228
|
# "Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead." # noqa: E501
|
|
178
229
|
# Note that openai>=1.45 is needed for this
|
|
@@ -184,6 +235,30 @@ class OpenAIClient(CachingClient):
|
|
|
184
235
|
if raw_request["stop"] is None:
|
|
185
236
|
raw_request.pop("stop")
|
|
186
237
|
|
|
238
|
+
# Avoid error:
|
|
239
|
+
# "Error code: 400 - {'error': {'message': "Unsupported parameter: 'temperature' is
|
|
240
|
+
# not supported with this model.", 'type': 'invalid_request_error', 'param': 'temperature',
|
|
241
|
+
# 'code': 'unsupported_parameter'}}"
|
|
242
|
+
raw_request.pop("temperature", None)
|
|
243
|
+
|
|
244
|
+
if self.reasoning_effort:
|
|
245
|
+
raw_request["reasoning_effort"] = "self.reasoning_effort"
|
|
246
|
+
elif is_vlm(request.model):
|
|
247
|
+
# Avoid error:
|
|
248
|
+
# "Invalid type for 'stop': expected an unsupported value, but got null instead."
|
|
249
|
+
if raw_request["stop"] is None:
|
|
250
|
+
raw_request.pop("stop")
|
|
251
|
+
|
|
252
|
+
# Special handling for gpt-4o-audio-preview
|
|
253
|
+
# See: https://platform.openai.com/docs/guides/audio
|
|
254
|
+
if model_engine.startswith("gpt-4o-audio-preview") or model_engine.startswith("gpt-4o-mini-audio-preview"):
|
|
255
|
+
raw_request["modalities"] = ["text"]
|
|
256
|
+
|
|
257
|
+
# Avoid error:
|
|
258
|
+
# OpenAI error: Error code: 400 - {'error': {'message': "[{'type': 'string_type', 'loc': ('body', 'stop', 'str'), 'msg': 'Input should be a valid string', 'input': None}, {'type': 'list_type', 'loc': ('body', 'stop', 'list[str]'), 'msg': 'Input should be a valid list', 'input': None}, {'type': 'list_type', 'loc': ('body', 'stop', 'list[list[int]]'), 'msg': 'Input should be a valid list', 'input': None}]", 'type': 'invalid_request_error', 'param': None, 'code': None}} # noqa: 3501
|
|
259
|
+
if raw_request["stop"] is None:
|
|
260
|
+
raw_request.pop("stop")
|
|
261
|
+
|
|
187
262
|
def do_it() -> Dict[str, Any]:
|
|
188
263
|
return self.client.chat.completions.create(**raw_request).model_dump(mode="json")
|
|
189
264
|
|
|
@@ -191,7 +266,7 @@ class OpenAIClient(CachingClient):
|
|
|
191
266
|
cache_key = self._get_cache_key(raw_request, request)
|
|
192
267
|
response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
|
|
193
268
|
except openai.OpenAIError as e:
|
|
194
|
-
if self.INAPPROPRIATE_IMAGE_ERROR in str(e):
|
|
269
|
+
if self.INAPPROPRIATE_IMAGE_ERROR in str(e) or self.INAPPROPRIATE_PROMPT_ERROR in str(e):
|
|
195
270
|
hlog(f"Failed safety check: {str(request)}")
|
|
196
271
|
empty_completion = GeneratedOutput(
|
|
197
272
|
text="",
|
|
@@ -206,15 +281,54 @@ class OpenAIClient(CachingClient):
|
|
|
206
281
|
completions=[empty_completion] * request.num_completions,
|
|
207
282
|
embedding=[],
|
|
208
283
|
)
|
|
284
|
+
elif self.OPENAI_SERVER_ERROR in str(e):
|
|
285
|
+
# Handle these errors by returning an empty completion to unblock
|
|
286
|
+
hlog(f"OpenAI server error for request: {str(request)}")
|
|
287
|
+
empty_completion = GeneratedOutput(
|
|
288
|
+
text="",
|
|
289
|
+
logprob=0,
|
|
290
|
+
tokens=[],
|
|
291
|
+
finish_reason={"reason": self.OPENAI_SERVER_ERROR},
|
|
292
|
+
)
|
|
293
|
+
return RequestResult(
|
|
294
|
+
success=True,
|
|
295
|
+
cached=False,
|
|
296
|
+
request_time=0,
|
|
297
|
+
completions=[empty_completion] * request.num_completions,
|
|
298
|
+
embedding=[],
|
|
299
|
+
)
|
|
300
|
+
elif self.INAPPROPRIATE_PROMPT_AZURE_ERROR in str(e) or self.INAPPROPRIATE_PROMPT_MICROSOFT_ERROR in str(e):
|
|
301
|
+
return RequestResult(
|
|
302
|
+
success=False,
|
|
303
|
+
cached=False,
|
|
304
|
+
error="Content blocked by Azure's content management filter",
|
|
305
|
+
completions=[],
|
|
306
|
+
embedding=[],
|
|
307
|
+
error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
|
|
308
|
+
)
|
|
209
309
|
|
|
210
310
|
error: str = f"OpenAI error: {e}"
|
|
211
311
|
return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
|
|
212
312
|
|
|
213
313
|
completions: List[GeneratedOutput] = []
|
|
214
314
|
for raw_completion in response["choices"]:
|
|
315
|
+
# Handle Azure OpenAI content filter
|
|
316
|
+
# See: https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/content-filter
|
|
317
|
+
if raw_completion["finish_reason"] == "content_filter":
|
|
318
|
+
hlog(f"Content blocked by OpenAI filter: {str(raw_request)}")
|
|
319
|
+
return RequestResult(
|
|
320
|
+
success=False,
|
|
321
|
+
cached=False,
|
|
322
|
+
error="Content blocked by OpenAI filter",
|
|
323
|
+
completions=[],
|
|
324
|
+
embedding=[],
|
|
325
|
+
error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
|
|
326
|
+
)
|
|
215
327
|
# The OpenAI chat completion API doesn't support echo.
|
|
216
328
|
# If `echo_prompt` is true, combine the prompt and completion.
|
|
217
329
|
raw_completion_content = raw_completion["message"]["content"]
|
|
330
|
+
if self.output_processor:
|
|
331
|
+
raw_completion_content = self.output_processor(raw_completion_content)
|
|
218
332
|
text: str = request.prompt + raw_completion_content if request.echo_prompt else raw_completion_content
|
|
219
333
|
# The OpenAI chat completion API doesn't return us tokens or logprobs, so we tokenize ourselves.
|
|
220
334
|
tokenization_result: TokenizationRequestResult = self.tokenizer.tokenize(
|
|
@@ -313,10 +427,115 @@ class OpenAIClient(CachingClient):
|
|
|
313
427
|
embedding=[],
|
|
314
428
|
)
|
|
315
429
|
|
|
430
|
+
def _make_transcription_request(self, request: Request) -> RequestResult:
|
|
431
|
+
assert (
|
|
432
|
+
request.multimodal_prompt is not None and request.multimodal_prompt.size == 1
|
|
433
|
+
), "Expected just a single audio file."
|
|
434
|
+
media_object = request.multimodal_prompt.media_objects[0]
|
|
435
|
+
assert media_object.is_type("audio") and media_object.location, "Expected an audio file."
|
|
436
|
+
audio_path: str = media_object.location
|
|
437
|
+
model: str = self._get_model_for_request(request)
|
|
438
|
+
|
|
439
|
+
def do_it() -> Dict[str, Any]:
|
|
440
|
+
transcription = self.client.audio.transcriptions.create(model=model, file=open(audio_path, "rb"))
|
|
441
|
+
return {"transcription": transcription.text}
|
|
442
|
+
|
|
443
|
+
try:
|
|
444
|
+
cache_key = self._get_cache_key({"audio": audio_path, "model": model}, request)
|
|
445
|
+
response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
|
|
446
|
+
except openai.OpenAIError as e:
|
|
447
|
+
error: str = f"OpenAI error: {e}"
|
|
448
|
+
return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
|
|
449
|
+
|
|
450
|
+
return RequestResult(
|
|
451
|
+
success=True,
|
|
452
|
+
cached=cached,
|
|
453
|
+
request_time=response["request_time"],
|
|
454
|
+
request_datetime=response.get("request_datetime"),
|
|
455
|
+
completions=[GeneratedOutput(text=response["transcription"], logprob=0, tokens=[])],
|
|
456
|
+
embedding=[],
|
|
457
|
+
)
|
|
458
|
+
|
|
316
459
|
def make_request(self, request: Request) -> RequestResult:
|
|
317
460
|
if request.embedding:
|
|
318
461
|
return self._make_embedding_request(request)
|
|
319
|
-
elif
|
|
320
|
-
return self.
|
|
462
|
+
elif "whisper" in request.model_engine:
|
|
463
|
+
return self._make_transcription_request(request)
|
|
321
464
|
else:
|
|
322
|
-
return self.
|
|
465
|
+
return self._make_chat_request(request)
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
class OpenAILegacyCompletionsClient(OpenAIClient):
|
|
469
|
+
def make_request(self, request: Request) -> RequestResult:
|
|
470
|
+
return self._make_completion_request(request)
|
|
471
|
+
|
|
472
|
+
|
|
473
|
+
class OpenAITranscriptionThenCompletionClient(Client):
|
|
474
|
+
"""
|
|
475
|
+
Wrapper around `OpenAIClient` that transcribes audio to text with a
|
|
476
|
+
speech-to-text model (e.g., Whisper) before making a completion request.
|
|
477
|
+
"""
|
|
478
|
+
|
|
479
|
+
@staticmethod
|
|
480
|
+
def wrap_transcribed_indicator(transcription: str) -> str:
|
|
481
|
+
return f"\n[TRANSCRIBED AUDIO START]\n{transcription}\n[TRANSCRIBED AUDIO END]\n"
|
|
482
|
+
|
|
483
|
+
def __init__(
|
|
484
|
+
self,
|
|
485
|
+
tokenizer: Tokenizer,
|
|
486
|
+
tokenizer_name: str,
|
|
487
|
+
cache_config: CacheConfig,
|
|
488
|
+
api_key: Optional[str] = None,
|
|
489
|
+
org_id: Optional[str] = None,
|
|
490
|
+
):
|
|
491
|
+
self._openai_client = OpenAIClient(
|
|
492
|
+
tokenizer=tokenizer,
|
|
493
|
+
tokenizer_name=tokenizer_name,
|
|
494
|
+
cache_config=cache_config,
|
|
495
|
+
api_key=api_key,
|
|
496
|
+
org_id=org_id,
|
|
497
|
+
)
|
|
498
|
+
|
|
499
|
+
def make_request(self, request: Request) -> RequestResult:
|
|
500
|
+
# Ensure that there is only one _ in the model engine name as the format is
|
|
501
|
+
# `transcription-model_completion-model`
|
|
502
|
+
assert request.model_engine.count("_") == 1, f"Invalid model name: {request.model_engine}"
|
|
503
|
+
# Use `model_engine` to determine both the models for transcription and completion
|
|
504
|
+
transcription_model, completion_model = request.model_engine.split("_")
|
|
505
|
+
|
|
506
|
+
# Only multimodal prompts are supported
|
|
507
|
+
assert request.multimodal_prompt is not None, "Expected a multimodal prompt"
|
|
508
|
+
|
|
509
|
+
# Gather all the text content and transcribe any audio to text
|
|
510
|
+
text_content: List[str] = []
|
|
511
|
+
for media_object in request.multimodal_prompt.media_objects:
|
|
512
|
+
if media_object.is_type("audio") and media_object.location:
|
|
513
|
+
request = Request(
|
|
514
|
+
model=f"openai/{transcription_model}",
|
|
515
|
+
multimodal_prompt=MultimediaObject(media_objects=[media_object]),
|
|
516
|
+
)
|
|
517
|
+
response = self._openai_client.make_request(request)
|
|
518
|
+
|
|
519
|
+
transcribed_text: str
|
|
520
|
+
if response.success and response.completions:
|
|
521
|
+
transcribed_text = response.completions[0].text
|
|
522
|
+
else:
|
|
523
|
+
transcribed_text = ""
|
|
524
|
+
hlog(f"Failed to transcribe audio: {response.error}")
|
|
525
|
+
|
|
526
|
+
text_content.append(self.wrap_transcribed_indicator(transcribed_text))
|
|
527
|
+
elif media_object.is_type(TEXT_TYPE):
|
|
528
|
+
assert media_object.text is not None, "Expected text content"
|
|
529
|
+
text_content.append(media_object.text)
|
|
530
|
+
else:
|
|
531
|
+
raise ValueError(f"Unrecognized media type: {media_object.type}")
|
|
532
|
+
|
|
533
|
+
text_prompt: str = "\n".join(text_content)
|
|
534
|
+
hlog(f"Transcribed prompt:\n{text_prompt}")
|
|
535
|
+
|
|
536
|
+
# Now make the request to the completion model with just a text-only prompt and no audio
|
|
537
|
+
# Use the same decoding parameters as the original request
|
|
538
|
+
# Ensure to set multimodal_prompt to None so the request is treated as text-only.
|
|
539
|
+
return self._openai_client.make_request(
|
|
540
|
+
replace(request, prompt=text_prompt, model=f"openai/{completion_model}", multimodal_prompt=None)
|
|
541
|
+
)
|
helm/clients/palmyra_client.py
CHANGED
|
@@ -12,7 +12,7 @@ from helm.common.tokenization_request import (
|
|
|
12
12
|
TokenizationRequestResult,
|
|
13
13
|
)
|
|
14
14
|
from helm.tokenizers.tokenizer import Tokenizer
|
|
15
|
-
from .client import CachingClient, truncate_sequence
|
|
15
|
+
from helm.clients.client import CachingClient, truncate_sequence
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
_CONTENT_MODERATION_KEY = "fail.content.moderation.failed"
|
|
@@ -163,6 +163,3 @@ class PalmyraChatClient(OpenAIClient):
|
|
|
163
163
|
org_id=None,
|
|
164
164
|
base_url="https://api.writer.com/v1/chat",
|
|
165
165
|
)
|
|
166
|
-
|
|
167
|
-
def _is_chat_model_engine(self, model_engine: str) -> bool:
|
|
168
|
-
return True
|
helm/clients/reka_client.py
CHANGED
|
@@ -9,7 +9,7 @@ from helm.common.request import wrap_request_time, Request, RequestResult, Gener
|
|
|
9
9
|
from helm.common.hierarchical_logger import hlog
|
|
10
10
|
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
11
11
|
from helm.tokenizers.tokenizer import Tokenizer
|
|
12
|
-
from .client import CachingClient, truncate_and_tokenize_response_text
|
|
12
|
+
from helm.clients.client import CachingClient, truncate_and_tokenize_response_text
|
|
13
13
|
|
|
14
14
|
try:
|
|
15
15
|
import reka
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from helm.clients.azure_openai_client import AzureOpenAIClient
|
|
4
|
+
from helm.common.cache import CacheConfig
|
|
5
|
+
from helm.proxy.retry import NonRetriableException
|
|
6
|
+
from helm.tokenizers.tokenizer import Tokenizer
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class StanfordHealthCareAzureOpenAIClient(AzureOpenAIClient):
|
|
10
|
+
"""
|
|
11
|
+
Client for accessing OpenAI models hosted on Stanford Health Care's model API.
|
|
12
|
+
|
|
13
|
+
Configure by setting the following in prod_env/credentials.conf:
|
|
14
|
+
|
|
15
|
+
```
|
|
16
|
+
stanfordhealthcareEndpoint: https://your-domain-name/
|
|
17
|
+
stanfordhealthcareApiKey: your-private-key
|
|
18
|
+
```
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
CREDENTIAL_HEADER_NAME = "Ocp-Apim-Subscription-Key"
|
|
22
|
+
|
|
23
|
+
def __init__(
|
|
24
|
+
self,
|
|
25
|
+
tokenizer: Tokenizer,
|
|
26
|
+
tokenizer_name: str,
|
|
27
|
+
cache_config: CacheConfig,
|
|
28
|
+
openai_model_name: str,
|
|
29
|
+
api_version: str,
|
|
30
|
+
api_key: Optional[str] = None,
|
|
31
|
+
endpoint: Optional[str] = None,
|
|
32
|
+
base_url: Optional[str] = None,
|
|
33
|
+
):
|
|
34
|
+
if not api_key:
|
|
35
|
+
raise NonRetriableException("Must provide API key through credentials.conf")
|
|
36
|
+
if base_url:
|
|
37
|
+
base_url = base_url.format(endpoint=endpoint)
|
|
38
|
+
super().__init__(
|
|
39
|
+
tokenizer=tokenizer,
|
|
40
|
+
tokenizer_name=tokenizer_name,
|
|
41
|
+
cache_config=cache_config,
|
|
42
|
+
api_key="unused",
|
|
43
|
+
base_url=base_url,
|
|
44
|
+
azure_openai_deployment_name=openai_model_name,
|
|
45
|
+
api_version=api_version,
|
|
46
|
+
default_headers={StanfordHealthCareAzureOpenAIClient.CREDENTIAL_HEADER_NAME: api_key},
|
|
47
|
+
)
|
|
48
|
+
else:
|
|
49
|
+
super().__init__(
|
|
50
|
+
tokenizer=tokenizer,
|
|
51
|
+
tokenizer_name=tokenizer_name,
|
|
52
|
+
cache_config=cache_config,
|
|
53
|
+
api_key="unused",
|
|
54
|
+
endpoint=endpoint,
|
|
55
|
+
azure_openai_deployment_name=openai_model_name,
|
|
56
|
+
api_version=api_version,
|
|
57
|
+
default_headers={StanfordHealthCareAzureOpenAIClient.CREDENTIAL_HEADER_NAME: api_key},
|
|
58
|
+
)
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from typing import Any, Dict, List
|
|
2
|
+
|
|
3
|
+
from helm.common.request import (
|
|
4
|
+
Request,
|
|
5
|
+
GeneratedOutput,
|
|
6
|
+
)
|
|
7
|
+
from helm.clients.stanfordhealthcare_http_model_client import StanfordHealthCareHTTPModelClient
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class StanfordHealthCareClaudeClient(StanfordHealthCareHTTPModelClient):
|
|
11
|
+
"""
|
|
12
|
+
Client for accessing Claude models hosted on Stanford Health Care's model API.
|
|
13
|
+
|
|
14
|
+
Configure by setting the following in prod_env/credentials.conf:
|
|
15
|
+
|
|
16
|
+
```
|
|
17
|
+
stanfordhealthcareEndpoint: https://your-domain-name/
|
|
18
|
+
stanfordhealthcareApiKey: your-private-key
|
|
19
|
+
```
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
def get_request(self, request: Request) -> Dict[str, Any]:
|
|
23
|
+
return {
|
|
24
|
+
"model_id": self.model,
|
|
25
|
+
"prompt_text": request.prompt,
|
|
26
|
+
"max_tokens": request.max_tokens,
|
|
27
|
+
"temperature": request.temperature,
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
def parse_response(self, response: Dict[str, Any]) -> List[GeneratedOutput]:
|
|
31
|
+
return [GeneratedOutput(text=item["text"], logprob=0, tokens=[]) for item in response["content"]]
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from typing import Any, Dict, List
|
|
2
|
+
|
|
3
|
+
from helm.common.request import (
|
|
4
|
+
Request,
|
|
5
|
+
GeneratedOutput,
|
|
6
|
+
)
|
|
7
|
+
from helm.clients.stanfordhealthcare_http_model_client import StanfordHealthCareHTTPModelClient
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class StanfordHealthCareGoogleClient(StanfordHealthCareHTTPModelClient):
|
|
11
|
+
"""
|
|
12
|
+
Client for accessing Google models hosted on Stanford Health Care's model API.
|
|
13
|
+
|
|
14
|
+
Configure by setting the following in prod_env/credentials.conf:
|
|
15
|
+
|
|
16
|
+
```
|
|
17
|
+
stanfordhealthcareEndpoint: https://your-domain-name/
|
|
18
|
+
stanfordhealthcareApiKey: your-private-key
|
|
19
|
+
```
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
def get_request(self, request: Request) -> Dict[str, Any]:
|
|
23
|
+
return {
|
|
24
|
+
"contents": {
|
|
25
|
+
"role": "user",
|
|
26
|
+
"parts": {"text": request.prompt},
|
|
27
|
+
},
|
|
28
|
+
"generation_config": {
|
|
29
|
+
"temperature": request.temperature,
|
|
30
|
+
"topP": request.top_p,
|
|
31
|
+
"topK": request.top_k_per_token,
|
|
32
|
+
"candidateCount": 1,
|
|
33
|
+
"maxOutputTokens": request.max_tokens,
|
|
34
|
+
"stopSequences": request.stop_sequences,
|
|
35
|
+
},
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
def parse_response(self, response: Dict[str, Any]) -> List[GeneratedOutput]:
|
|
39
|
+
completion = ""
|
|
40
|
+
for item in response["content"]:
|
|
41
|
+
if "content" in item["candidates"][0]:
|
|
42
|
+
completion += item["candidates"][0]["content"]["parts"][0]["text"]
|
|
43
|
+
return [GeneratedOutput(text=completion, logprob=0, tokens=[])]
|