crfm-helm 0.5.4-py3-none-any.whl → 0.5.6-py3-none-any.whl
This diff shows the changes between publicly available package versions released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. See the advisory details on the registry page for more information.
- crfm_helm-0.5.6.dist-info/METADATA +427 -0
- crfm_helm-0.5.6.dist-info/RECORD +941 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +21 -6
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +214 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +11 -12
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +14 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +14 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +6 -6
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +14 -0
- helm/benchmark/metrics/medalign_metrics.py +14 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +14 -0
- helm/benchmark/metrics/medication_qa_metrics.py +10 -19
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +14 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +20 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/toxicity_metrics.py +6 -6
- helm/benchmark/metrics/unitxt_metrics.py +7 -5
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_deployment_registry.py +6 -8
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +33 -12
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +2 -1
- helm/benchmark/presentation/summarize.py +97 -67
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +86 -90
- helm/benchmark/run_expander.py +90 -9
- helm/benchmark/run_spec_factory.py +13 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +142 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +141 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +103 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +157 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +136 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +94 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
- helm/benchmark/scenarios/medbullets_scenario.py +145 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
- helm/benchmark/scenarios/medec_scenario.py +125 -0
- helm/benchmark/scenarios/medhallu_scenario.py +72 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +123 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +12 -2
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +13 -1
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_long_context.yaml +283 -0
- helm/benchmark/static/schema_medhelm.yaml +1140 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +129 -56
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-94295e78.js +10 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +6 -6
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +4 -5
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +120 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
- helm/clients/audio_language/qwen_audiolm_client.py +152 -0
- helm/clients/audio_language/test.py +62 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +203 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/client.py +7 -7
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/grok_client.py +36 -0
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +52 -21
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
- helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
- helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +308 -43
- helm/clients/openai_responses_client.py +174 -0
- helm/clients/palmyra_client.py +3 -9
- helm/clients/reka_client.py +3 -3
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +76 -9
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +45 -13
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +188 -0
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +4 -6
- helm/clients/writer_client.py +102 -0
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +46 -3
- helm/common/images_utils.py +2 -2
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +14 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1792 -28
- helm/config/model_metadata.yaml +1606 -51
- helm/config/tokenizer_configs.yaml +521 -4
- helm/proxy/cli.py +5 -3
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +22 -86
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/grok_tokenizer.py +53 -0
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +3 -3
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
helm/clients/openai_client.py
CHANGED
|
@@ -1,18 +1,23 @@
|
|
|
1
1
|
# mypy: check_untyped_defs = False
|
|
2
2
|
from dataclasses import replace
|
|
3
|
-
|
|
3
|
+
import re
|
|
4
|
+
from typing import Any, Dict, List, Optional, cast, Union, Callable
|
|
5
|
+
|
|
6
|
+
from openai import OpenAIError
|
|
4
7
|
|
|
5
8
|
from helm.benchmark.model_metadata_registry import is_vlm
|
|
9
|
+
from helm.common import multimodal_request_utils
|
|
6
10
|
from helm.common.cache import CacheConfig
|
|
7
|
-
from helm.common.media_object import TEXT_TYPE
|
|
8
|
-
from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
|
|
9
|
-
from helm.common.hierarchical_logger import hlog
|
|
11
|
+
from helm.common.media_object import TEXT_TYPE, MultimediaObject, MediaObject
|
|
12
|
+
from helm.common.request import ErrorFlags, Thinking, wrap_request_time, Request, RequestResult, GeneratedOutput, Token
|
|
13
|
+
from helm.common.hierarchical_logger import hlog, hwarn
|
|
14
|
+
from helm.common.object_spec import get_class_by_name
|
|
10
15
|
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
11
16
|
from helm.common.tokenization_request import (
|
|
12
17
|
TokenizationRequest,
|
|
13
18
|
TokenizationRequestResult,
|
|
14
19
|
)
|
|
15
|
-
from .client import CachingClient, truncate_sequence, generate_uid_for_multimodal_prompt
|
|
20
|
+
from helm.clients.client import Client, CachingClient, truncate_sequence, generate_uid_for_multimodal_prompt
|
|
16
21
|
from helm.tokenizers.tokenizer import Tokenizer
|
|
17
22
|
|
|
18
23
|
try:
|
|
@@ -22,11 +27,29 @@ except ModuleNotFoundError as e:
|
|
|
22
27
|
handle_module_not_found_error(e, ["openai"])
|
|
23
28
|
|
|
24
29
|
|
|
25
|
-
class
|
|
26
|
-
|
|
30
|
+
class OpenAIClientUtils:
|
|
31
|
+
"""Methods used by both the chat completions client and the responses API client"""
|
|
32
|
+
|
|
33
|
+
@classmethod
|
|
34
|
+
def is_reasoning_model(cls, model_engine: str) -> bool:
|
|
35
|
+
# All OpenAI reasoning models start "o[somenumber]", so we regexp for that to future proof things
|
|
36
|
+
return bool(re.match(r"^o\d+", model_engine))
|
|
27
37
|
|
|
28
38
|
# Error OpenAI throws when the image in the prompt violates their content policy
|
|
29
39
|
INAPPROPRIATE_IMAGE_ERROR: str = "Your input image may contain content that is not allowed by our safety system"
|
|
40
|
+
INAPPROPRIATE_PROMPT_ERROR: str = "Invalid prompt: your prompt was flagged"
|
|
41
|
+
INAPPROPRIATE_PROMPT_AZURE_ERROR: str = (
|
|
42
|
+
"The response was filtered due to the prompt triggering Azure OpenAI's content management policy."
|
|
43
|
+
)
|
|
44
|
+
INAPPROPRIATE_PROMPT_MICROSOFT_ERROR: str = (
|
|
45
|
+
"The response was filtered due to the prompt triggering Microsoft's content management policy."
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
# OpenAI server error
|
|
49
|
+
OPENAI_SERVER_ERROR: str = (
|
|
50
|
+
"The server had an error processing your request. Sorry about that! You can retry your request, "
|
|
51
|
+
"or contact us through our help center at help.openai.com if you keep seeing this error."
|
|
52
|
+
)
|
|
30
53
|
|
|
31
54
|
# Set the finish reason to this if the prompt violates OpenAI's content policy
|
|
32
55
|
CONTENT_POLICY_VIOLATED_FINISH_REASON: str = (
|
|
@@ -34,6 +57,56 @@ class OpenAIClient(CachingClient):
|
|
|
34
57
|
"See https://labs.openai.com/policies/content-policy for more information."
|
|
35
58
|
)
|
|
36
59
|
|
|
60
|
+
@classmethod
|
|
61
|
+
def handle_openai_error(cls, e: OpenAIError, request: Request):
|
|
62
|
+
if cls.INAPPROPRIATE_IMAGE_ERROR in str(e) or cls.INAPPROPRIATE_PROMPT_ERROR in str(e):
|
|
63
|
+
hwarn(f"Failed safety check: {str(request)}")
|
|
64
|
+
empty_completion = GeneratedOutput(
|
|
65
|
+
text="",
|
|
66
|
+
logprob=0,
|
|
67
|
+
tokens=[],
|
|
68
|
+
finish_reason={"reason": cls.CONTENT_POLICY_VIOLATED_FINISH_REASON},
|
|
69
|
+
)
|
|
70
|
+
return RequestResult(
|
|
71
|
+
success=True,
|
|
72
|
+
cached=False,
|
|
73
|
+
request_time=0,
|
|
74
|
+
completions=[empty_completion] * request.num_completions,
|
|
75
|
+
embedding=[],
|
|
76
|
+
)
|
|
77
|
+
elif cls.OPENAI_SERVER_ERROR in str(e):
|
|
78
|
+
# Handle these errors by returning an empty completion to unblock
|
|
79
|
+
hwarn(f"OpenAI server error for request: {str(request)}")
|
|
80
|
+
empty_completion = GeneratedOutput(
|
|
81
|
+
text="",
|
|
82
|
+
logprob=0,
|
|
83
|
+
tokens=[],
|
|
84
|
+
finish_reason={"reason": cls.OPENAI_SERVER_ERROR},
|
|
85
|
+
)
|
|
86
|
+
return RequestResult(
|
|
87
|
+
success=True,
|
|
88
|
+
cached=False,
|
|
89
|
+
request_time=0,
|
|
90
|
+
completions=[empty_completion] * request.num_completions,
|
|
91
|
+
embedding=[],
|
|
92
|
+
)
|
|
93
|
+
elif cls.INAPPROPRIATE_PROMPT_AZURE_ERROR in str(e) or cls.INAPPROPRIATE_PROMPT_MICROSOFT_ERROR in str(e):
|
|
94
|
+
return RequestResult(
|
|
95
|
+
success=False,
|
|
96
|
+
cached=False,
|
|
97
|
+
error="Content blocked by Azure's content management filter",
|
|
98
|
+
completions=[],
|
|
99
|
+
embedding=[],
|
|
100
|
+
error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
|
|
101
|
+
)
|
|
102
|
+
|
|
103
|
+
error: str = f"OpenAI error: {e}"
|
|
104
|
+
return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class OpenAIClient(CachingClient):
|
|
108
|
+
END_OF_TEXT: str = "<|endoftext|>"
|
|
109
|
+
|
|
37
110
|
def __init__(
|
|
38
111
|
self,
|
|
39
112
|
tokenizer: Tokenizer,
|
|
@@ -42,28 +115,31 @@ class OpenAIClient(CachingClient):
|
|
|
42
115
|
api_key: Optional[str] = None,
|
|
43
116
|
org_id: Optional[str] = None,
|
|
44
117
|
base_url: Optional[str] = None,
|
|
118
|
+
reasoning_effort: Optional[str] = None,
|
|
119
|
+
openai_model_name: Optional[str] = None,
|
|
120
|
+
output_processor: Optional[str] = None,
|
|
45
121
|
):
|
|
46
122
|
super().__init__(cache_config=cache_config)
|
|
47
123
|
self.tokenizer = tokenizer
|
|
48
124
|
self.tokenizer_name = tokenizer_name
|
|
49
125
|
self.client = OpenAI(api_key=api_key, organization=org_id, base_url=base_url)
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
return True
|
|
56
|
-
return False
|
|
126
|
+
self.reasoning_effort = reasoning_effort
|
|
127
|
+
self.openai_model_name = openai_model_name
|
|
128
|
+
self.output_processor: Optional[Callable[[str], str]] = (
|
|
129
|
+
get_class_by_name(output_processor) if output_processor else None
|
|
130
|
+
)
|
|
57
131
|
|
|
58
132
|
def _get_model_for_request(self, request: Request) -> str:
|
|
59
|
-
return request.model_engine
|
|
133
|
+
return self.openai_model_name or request.model_engine
|
|
60
134
|
|
|
61
135
|
def _get_cache_key(self, raw_request: Dict, request: Request):
|
|
62
136
|
cache_key = CachingClient.make_cache_key(raw_request, request)
|
|
63
137
|
if request.multimodal_prompt:
|
|
64
138
|
prompt_key: str = generate_uid_for_multimodal_prompt(request.multimodal_prompt)
|
|
65
139
|
cache_key = {**cache_key, "multimodal_prompt": prompt_key}
|
|
66
|
-
|
|
140
|
+
|
|
141
|
+
if "messages" in cache_key:
|
|
142
|
+
del cache_key["messages"]
|
|
67
143
|
return cache_key
|
|
68
144
|
|
|
69
145
|
def _make_embedding_request(self, request: Request) -> RequestResult:
|
|
@@ -100,7 +176,7 @@ class OpenAIClient(CachingClient):
|
|
|
100
176
|
embedding=embedding,
|
|
101
177
|
)
|
|
102
178
|
|
|
103
|
-
def
|
|
179
|
+
def _make_chat_raw_request(self, request: Request) -> Dict[str, Any]:
|
|
104
180
|
messages: Optional[List[Dict[str, Union[str, Any]]]] = request.messages
|
|
105
181
|
if (
|
|
106
182
|
(request.prompt and request.messages)
|
|
@@ -119,7 +195,7 @@ class OpenAIClient(CachingClient):
|
|
|
119
195
|
if request.messages[-1]["role"] != "user":
|
|
120
196
|
raise ValueError("Last message must have role 'user'")
|
|
121
197
|
if request.prompt != "":
|
|
122
|
-
|
|
198
|
+
hwarn("Since message is set, prompt will be ignored")
|
|
123
199
|
else:
|
|
124
200
|
# Convert prompt into a single message
|
|
125
201
|
# For now, put the whole prompt in a single user message, and expect the response
|
|
@@ -140,6 +216,19 @@ class OpenAIClient(CachingClient):
|
|
|
140
216
|
base64_image: str = encode_base64(media_object.location)
|
|
141
217
|
image_object: Dict[str, str] = {"url": f"data:image/jpeg;base64,{base64_image}"}
|
|
142
218
|
content.append({"type": "image_url", "image_url": image_object})
|
|
219
|
+
elif media_object.is_type("audio") and media_object.location:
|
|
220
|
+
base64_audio: str = multimodal_request_utils.get_contents_as_base64(media_object.location)
|
|
221
|
+
format: str = media_object.content_type.split("/")[1]
|
|
222
|
+
if format == "mpeg":
|
|
223
|
+
# OpenAI expects "mp3" for mpeg audio
|
|
224
|
+
format = "mp3"
|
|
225
|
+
|
|
226
|
+
content.append(
|
|
227
|
+
{
|
|
228
|
+
"type": "input_audio",
|
|
229
|
+
"input_audio": {"data": base64_audio, "format": format},
|
|
230
|
+
}
|
|
231
|
+
)
|
|
143
232
|
elif media_object.is_type(TEXT_TYPE):
|
|
144
233
|
content.append({"type": media_object.type, "text": media_object.text})
|
|
145
234
|
else:
|
|
@@ -164,15 +253,35 @@ class OpenAIClient(CachingClient):
|
|
|
164
253
|
"frequency_penalty": request.frequency_penalty,
|
|
165
254
|
}
|
|
166
255
|
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
256
|
+
if request.response_format and request.response_format.json_schema:
|
|
257
|
+
# Copy and modify JSON schema to conform to OpenAI's requirements
|
|
258
|
+
json_schema = dict(request.response_format.json_schema)
|
|
259
|
+
|
|
260
|
+
# additionalProperties: false must always be set in objects
|
|
261
|
+
# https://platform.openai.com/docs/guides/structured-outputs#additionalproperties-false-must-always-be-set-in-objects
|
|
262
|
+
if "additionalProperties" not in json_schema:
|
|
263
|
+
json_schema["additionalProperties"] = False
|
|
264
|
+
|
|
265
|
+
# All fields must be required
|
|
266
|
+
# https://platform.openai.com/docs/guides/structured-outputs#all-fields-must-be-required
|
|
267
|
+
if "required" not in json_schema:
|
|
268
|
+
json_schema["required"] = list(json_schema["properties"].keys())
|
|
269
|
+
|
|
270
|
+
raw_request["response_format"] = {
|
|
271
|
+
"type": "json_schema",
|
|
272
|
+
"json_schema": {
|
|
273
|
+
"name": "response",
|
|
274
|
+
"description": "",
|
|
275
|
+
"schema": json_schema,
|
|
276
|
+
"strict": True,
|
|
277
|
+
},
|
|
278
|
+
}
|
|
171
279
|
|
|
172
280
|
# Special handling for o1 models.
|
|
173
281
|
# Refer to the "Reasoning models" documentation further discussion of o1 model limitations:
|
|
174
282
|
# https://platform.openai.com/docs/guides/reasoning
|
|
175
|
-
|
|
283
|
+
model_engine: str = request.model_engine
|
|
284
|
+
if OpenAIClientUtils.is_reasoning_model(model_engine):
|
|
176
285
|
# Avoid error:
|
|
177
286
|
# "Unsupported parameter: 'max_tokens' is not supported with this model. Use 'max_completion_tokens' instead." # noqa: E501
|
|
178
287
|
# Note that openai>=1.45 is needed for this
|
|
@@ -184,6 +293,39 @@ class OpenAIClient(CachingClient):
|
|
|
184
293
|
if raw_request["stop"] is None:
|
|
185
294
|
raw_request.pop("stop")
|
|
186
295
|
|
|
296
|
+
# Avoid error:
|
|
297
|
+
# "Error code: 400 - {'error': {'message': "Unsupported parameter: 'temperature' is
|
|
298
|
+
# not supported with this model.", 'type': 'invalid_request_error', 'param': 'temperature',
|
|
299
|
+
# 'code': 'unsupported_parameter'}}"
|
|
300
|
+
raw_request.pop("temperature", None)
|
|
301
|
+
|
|
302
|
+
# The following parameters also happen to be unsupported by the o-series (code unsupported_parameter)
|
|
303
|
+
raw_request.pop("top_p", None)
|
|
304
|
+
raw_request.pop("frequency_penalty", None)
|
|
305
|
+
raw_request.pop("presence_penalty", None)
|
|
306
|
+
|
|
307
|
+
if self.reasoning_effort:
|
|
308
|
+
raw_request["reasoning_effort"] = self.reasoning_effort
|
|
309
|
+
elif is_vlm(request.model):
|
|
310
|
+
# Avoid error:
|
|
311
|
+
# "Invalid type for 'stop': expected an unsupported value, but got null instead."
|
|
312
|
+
if raw_request["stop"] is None:
|
|
313
|
+
raw_request.pop("stop")
|
|
314
|
+
|
|
315
|
+
# Special handling for gpt-4o-audio-preview
|
|
316
|
+
# See: https://platform.openai.com/docs/guides/audio
|
|
317
|
+
if model_engine.startswith("gpt-4o-audio-preview") or model_engine.startswith("gpt-4o-mini-audio-preview"):
|
|
318
|
+
raw_request["modalities"] = ["text"]
|
|
319
|
+
|
|
320
|
+
# Avoid error:
|
|
321
|
+
# OpenAI error: Error code: 400 - {'error': {'message': "[{'type': 'string_type', 'loc': ('body', 'stop', 'str'), 'msg': 'Input should be a valid string', 'input': None}, {'type': 'list_type', 'loc': ('body', 'stop', 'list[str]'), 'msg': 'Input should be a valid list', 'input': None}, {'type': 'list_type', 'loc': ('body', 'stop', 'list[list[int]]'), 'msg': 'Input should be a valid list', 'input': None}]", 'type': 'invalid_request_error', 'param': None, 'code': None}} # noqa: 3501
|
|
322
|
+
if raw_request["stop"] is None:
|
|
323
|
+
raw_request.pop("stop")
|
|
324
|
+
return raw_request
|
|
325
|
+
|
|
326
|
+
def _make_chat_request(self, request: Request) -> RequestResult:
|
|
327
|
+
raw_request = self._make_chat_raw_request(request)
|
|
328
|
+
|
|
187
329
|
def do_it() -> Dict[str, Any]:
|
|
188
330
|
return self.client.chat.completions.create(**raw_request).model_dump(mode="json")
|
|
189
331
|
|
|
@@ -191,30 +333,27 @@ class OpenAIClient(CachingClient):
|
|
|
191
333
|
cache_key = self._get_cache_key(raw_request, request)
|
|
192
334
|
response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
|
|
193
335
|
except openai.OpenAIError as e:
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
)
|
|
336
|
+
return OpenAIClientUtils.handle_openai_error(e, request)
|
|
337
|
+
|
|
338
|
+
completions: List[GeneratedOutput] = []
|
|
339
|
+
for raw_completion in response["choices"]:
|
|
340
|
+
# Handle Azure OpenAI content filter
|
|
341
|
+
# See: https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/content-filter
|
|
342
|
+
if raw_completion["finish_reason"] == "content_filter":
|
|
343
|
+
hlog(f"Content blocked by OpenAI filter: {str(raw_request)}")
|
|
202
344
|
return RequestResult(
|
|
203
|
-
success=
|
|
345
|
+
success=False,
|
|
204
346
|
cached=False,
|
|
205
|
-
|
|
206
|
-
completions=[
|
|
347
|
+
error="Content blocked by OpenAI filter",
|
|
348
|
+
completions=[],
|
|
207
349
|
embedding=[],
|
|
350
|
+
error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
|
|
208
351
|
)
|
|
209
|
-
|
|
210
|
-
error: str = f"OpenAI error: {e}"
|
|
211
|
-
return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
|
|
212
|
-
|
|
213
|
-
completions: List[GeneratedOutput] = []
|
|
214
|
-
for raw_completion in response["choices"]:
|
|
215
352
|
# The OpenAI chat completion API doesn't support echo.
|
|
216
353
|
# If `echo_prompt` is true, combine the prompt and completion.
|
|
217
354
|
raw_completion_content = raw_completion["message"]["content"]
|
|
355
|
+
if self.output_processor:
|
|
356
|
+
raw_completion_content = self.output_processor(raw_completion_content)
|
|
218
357
|
text: str = request.prompt + raw_completion_content if request.echo_prompt else raw_completion_content
|
|
219
358
|
# The OpenAI chat completion API doesn't return us tokens or logprobs, so we tokenize ourselves.
|
|
220
359
|
tokenization_result: TokenizationRequestResult = self.tokenizer.tokenize(
|
|
@@ -224,11 +363,20 @@ class OpenAIClient(CachingClient):
|
|
|
224
363
|
tokens: List[Token] = [
|
|
225
364
|
Token(text=cast(str, raw_token), logprob=0) for raw_token in tokenization_result.raw_tokens
|
|
226
365
|
]
|
|
366
|
+
# vLLM has a optional `reasoning_content` field in the message
|
|
367
|
+
# that is not in the standard OpenAI API.
|
|
368
|
+
# This field is also used by some model providers such as Grok.
|
|
369
|
+
thinking = (
|
|
370
|
+
Thinking(text=raw_completion["message"]["reasoning_content"])
|
|
371
|
+
if "reasoning_content" in raw_completion["message"]
|
|
372
|
+
else None
|
|
373
|
+
)
|
|
227
374
|
completion = GeneratedOutput(
|
|
228
375
|
text=text,
|
|
229
376
|
logprob=0, # OpenAI does not provide logprobs
|
|
230
377
|
tokens=tokens,
|
|
231
378
|
finish_reason={"reason": raw_completion["finish_reason"]},
|
|
379
|
+
thinking=thinking,
|
|
232
380
|
)
|
|
233
381
|
completions.append(truncate_sequence(completion, request)) # Truncate the text by stop sequences
|
|
234
382
|
|
|
@@ -313,10 +461,127 @@ class OpenAIClient(CachingClient):
|
|
|
313
461
|
embedding=[],
|
|
314
462
|
)
|
|
315
463
|
|
|
464
|
+
def _make_transcription_request(self, request: Request) -> RequestResult:
|
|
465
|
+
assert (
|
|
466
|
+
request.multimodal_prompt is not None and request.multimodal_prompt.size == 1
|
|
467
|
+
), "Expected just a single audio file."
|
|
468
|
+
media_object = request.multimodal_prompt.media_objects[0]
|
|
469
|
+
assert media_object.is_type("audio") and media_object.location, "Expected an audio file."
|
|
470
|
+
audio_path: str = media_object.location
|
|
471
|
+
model: str = self._get_model_for_request(request)
|
|
472
|
+
|
|
473
|
+
def do_it() -> Dict[str, Any]:
|
|
474
|
+
transcription = self.client.audio.transcriptions.create(model=model, file=open(audio_path, "rb"))
|
|
475
|
+
return {"transcription": transcription.text}
|
|
476
|
+
|
|
477
|
+
try:
|
|
478
|
+
cache_key = self._get_cache_key({"audio": audio_path, "model": model}, request)
|
|
479
|
+
response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
|
|
480
|
+
except openai.OpenAIError as e:
|
|
481
|
+
error: str = f"OpenAI error: {e}"
|
|
482
|
+
return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
|
|
483
|
+
|
|
484
|
+
return RequestResult(
|
|
485
|
+
success=True,
|
|
486
|
+
cached=cached,
|
|
487
|
+
request_time=response["request_time"],
|
|
488
|
+
request_datetime=response.get("request_datetime"),
|
|
489
|
+
completions=[GeneratedOutput(text=response["transcription"], logprob=0, tokens=[])],
|
|
490
|
+
embedding=[],
|
|
491
|
+
)
|
|
492
|
+
|
|
316
493
|
def make_request(self, request: Request) -> RequestResult:
|
|
317
494
|
if request.embedding:
|
|
318
495
|
return self._make_embedding_request(request)
|
|
319
|
-
elif
|
|
320
|
-
return self.
|
|
496
|
+
elif "whisper" in request.model_engine or "transcribe" in request.model_engine:
|
|
497
|
+
return self._make_transcription_request(request)
|
|
321
498
|
else:
|
|
322
|
-
return self.
|
|
499
|
+
return self._make_chat_request(request)
|
|
500
|
+
|
|
501
|
+
|
|
502
|
+
class OpenAILegacyCompletionsClient(OpenAIClient):
|
|
503
|
+
def make_request(self, request: Request) -> RequestResult:
|
|
504
|
+
return self._make_completion_request(request)
|
|
505
|
+
|
|
506
|
+
|
|
507
|
+
class OpenAITranscriptionThenCompletionClient(Client):
|
|
508
|
+
"""
|
|
509
|
+
Wrapper around `OpenAIClient` that transcribes audio to text with a
|
|
510
|
+
speech-to-text model (e.g., Whisper) before making a completion request.
|
|
511
|
+
"""
|
|
512
|
+
|
|
513
|
+
@staticmethod
|
|
514
|
+
def wrap_transcribed_indicator(transcription: str) -> str:
|
|
515
|
+
return f"\n[TRANSCRIBED AUDIO START]\n{transcription}\n[TRANSCRIBED AUDIO END]\n"
|
|
516
|
+
|
|
517
|
+
def __init__(
|
|
518
|
+
self,
|
|
519
|
+
tokenizer: Tokenizer,
|
|
520
|
+
tokenizer_name: str,
|
|
521
|
+
cache_config: CacheConfig,
|
|
522
|
+
api_key: Optional[str] = None,
|
|
523
|
+
org_id: Optional[str] = None,
|
|
524
|
+
):
|
|
525
|
+
self._openai_client = OpenAIClient(
|
|
526
|
+
tokenizer=tokenizer,
|
|
527
|
+
tokenizer_name=tokenizer_name,
|
|
528
|
+
cache_config=cache_config,
|
|
529
|
+
api_key=api_key,
|
|
530
|
+
org_id=org_id,
|
|
531
|
+
)
|
|
532
|
+
|
|
533
|
+
def make_request(self, request: Request) -> RequestResult:
|
|
534
|
+
# Ensure that there is only one _ in the model engine name as the format is
|
|
535
|
+
# `transcription-model_completion-model`
|
|
536
|
+
assert request.model_engine.count("_") == 1, f"Invalid model name: {request.model_engine}"
|
|
537
|
+
# Use `model_engine` to determine both the models for transcription and completion
|
|
538
|
+
transcription_model, completion_model = request.model_engine.split("_")
|
|
539
|
+
|
|
540
|
+
# Only multimodal prompts are supported
|
|
541
|
+
assert request.multimodal_prompt is not None, "Expected a multimodal prompt"
|
|
542
|
+
|
|
543
|
+
# Gather all the text content and transcribe any audio to text
|
|
544
|
+
text_content: List[str] = []
|
|
545
|
+
for media_object in request.multimodal_prompt.media_objects:
|
|
546
|
+
if media_object.is_type("audio") and media_object.location:
|
|
547
|
+
request = Request(
|
|
548
|
+
model=f"openai/{transcription_model}",
|
|
549
|
+
multimodal_prompt=MultimediaObject(media_objects=[media_object]),
|
|
550
|
+
)
|
|
551
|
+
response = self._openai_client.make_request(request)
|
|
552
|
+
|
|
553
|
+
transcribed_text: str
|
|
554
|
+
if response.success and response.completions:
|
|
555
|
+
transcribed_text = response.completions[0].text
|
|
556
|
+
else:
|
|
557
|
+
transcribed_text = ""
|
|
558
|
+
hlog(f"Failed to transcribe audio: {response.error}")
|
|
559
|
+
|
|
560
|
+
text_content.append(self.wrap_transcribed_indicator(transcribed_text))
|
|
561
|
+
elif media_object.is_type(TEXT_TYPE):
|
|
562
|
+
assert media_object.text is not None, "Expected text content"
|
|
563
|
+
text_content.append(media_object.text)
|
|
564
|
+
else:
|
|
565
|
+
raise ValueError(f"Unrecognized media type: {media_object.type}")
|
|
566
|
+
|
|
567
|
+
text_prompt: str = "\n".join(text_content)
|
|
568
|
+
hlog(f"Transcribed prompt:\n{text_prompt}")
|
|
569
|
+
|
|
570
|
+
# Now make the request to the completion model with just a text-only prompt and no audio
|
|
571
|
+
# Use the same decoding parameters as the original request
|
|
572
|
+
# Ensure to set multimodal_prompt to None so the request is treated as text-only.
|
|
573
|
+
request_result: RequestResult = self._openai_client.make_request(
|
|
574
|
+
replace(request, prompt=text_prompt, model=f"openai/{completion_model}", multimodal_prompt=None)
|
|
575
|
+
)
|
|
576
|
+
|
|
577
|
+
# Also include the generated transcript to the request result
|
|
578
|
+
completions_with_transcript: List[GeneratedOutput] = [
|
|
579
|
+
replace(
|
|
580
|
+
completion,
|
|
581
|
+
multimodal_content=MultimediaObject(
|
|
582
|
+
media_objects=[MediaObject(text=text_prompt, content_type="text/plain")]
|
|
583
|
+
),
|
|
584
|
+
)
|
|
585
|
+
for completion in request_result.completions
|
|
586
|
+
]
|
|
587
|
+
return replace(request_result, completions=completions_with_transcript)
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
# mypy: check_untyped_defs = False
|
|
2
|
+
import dataclasses
|
|
3
|
+
from typing import Any, Dict, List, Optional, Union
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
from helm.clients.openai_client import OpenAIClientUtils
|
|
7
|
+
from helm.common.cache import CacheConfig
|
|
8
|
+
from helm.common.media_object import TEXT_TYPE
|
|
9
|
+
from helm.common.request import (
|
|
10
|
+
Thinking,
|
|
11
|
+
wrap_request_time,
|
|
12
|
+
Request,
|
|
13
|
+
RequestResult,
|
|
14
|
+
GeneratedOutput,
|
|
15
|
+
)
|
|
16
|
+
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
17
|
+
from helm.clients.client import (
|
|
18
|
+
CachingClient,
|
|
19
|
+
truncate_and_tokenize_response_text,
|
|
20
|
+
generate_uid_for_multimodal_prompt,
|
|
21
|
+
)
|
|
22
|
+
from helm.tokenizers.tokenizer import Tokenizer
|
|
23
|
+
|
|
24
|
+
try:
|
|
25
|
+
import openai
|
|
26
|
+
from openai import OpenAI
|
|
27
|
+
except ModuleNotFoundError as e:
|
|
28
|
+
handle_module_not_found_error(e, ["openai"])
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class OpenAIResponseClient(CachingClient):
    """Client for the OpenAI Responses API.

    Supports plain-text and multimodal (text + image) prompts, optional
    reasoning effort/summaries for o-series reasoning models, and caching of
    responses via `CachingClient`.
    See: https://platform.openai.com/docs/api-reference/responses
    """

    def __init__(
        self,
        tokenizer: Tokenizer,
        tokenizer_name: str,
        cache_config: CacheConfig,
        api_key: Optional[str] = None,
        org_id: Optional[str] = None,
        base_url: Optional[str] = None,
        reasoning_effort: Optional[str] = None,
        openai_model_name: Optional[str] = None,
    ):
        """Initialize the client.

        :param tokenizer: Tokenizer used to tokenize/truncate response text.
        :param tokenizer_name: Name of the tokenizer passed through to helpers.
        :param cache_config: Cache configuration for `CachingClient`.
        :param api_key: Optional OpenAI API key.
        :param org_id: Optional OpenAI organization ID.
        :param base_url: Optional override of the OpenAI API base URL.
        :param reasoning_effort: Optional reasoning effort ("low"/"medium"/"high")
            forwarded as `reasoning.effort` in requests.
        :param openai_model_name: Optional explicit model name; when unset, the
            request's `model_engine` is used instead.
        """
        super().__init__(cache_config=cache_config)
        self.tokenizer = tokenizer
        self.tokenizer_name = tokenizer_name
        self.client = OpenAI(
            api_key=api_key,
            organization=org_id,
            base_url=base_url,
        )
        self.reasoning_effort = reasoning_effort
        self.openai_model_name = openai_model_name

    def _get_cache_key(self, raw_request: Dict, request: Request):
        """Build a cache key for the request.

        Multimodal prompts carry image data that is not fully captured by the
        raw request dict, so a UID derived from the multimodal prompt is added
        to disambiguate cache entries.
        """
        cache_key = CachingClient.make_cache_key(raw_request, request)
        if request.multimodal_prompt:
            prompt_key: str = generate_uid_for_multimodal_prompt(request.multimodal_prompt)
            cache_key = {**cache_key, "multimodal_prompt": prompt_key}
        return cache_key

    def _make_raw_request(self, request: Request) -> dict[str, Any]:
        """Translate a HELM `Request` into keyword arguments for `responses.create`."""
        input: Union[str, List[Dict[str, Any]]]
        if request.multimodal_prompt is not None:
            # Build a single user message whose content mixes input_text and
            # input_image items, per the Responses API content format.
            content = []
            request.validate()
            for media_object in request.multimodal_prompt.media_objects:
                if media_object.is_type("image") and media_object.location:
                    from helm.common.images_utils import encode_base64

                    base64_image: str = encode_base64(media_object.location)
                    content.append(
                        {
                            "type": "input_image",
                            "image_url": f"data:image/jpeg;base64,{base64_image}",
                        }
                    )
                elif media_object.is_type(TEXT_TYPE):
                    assert media_object.text is not None
                    content.append({"type": "input_text", "text": media_object.text})
                else:
                    raise ValueError(f"Unrecognized MediaObject type {media_object.type}")
            input = [{"role": "user", "content": content}]
        else:
            input = request.prompt

        raw_request: Dict[str, Any] = {
            "model": self._get_model_for_request(request),
            "input": input,
            "top_p": request.top_p,
            # API errors if max_output_tokens is less than 16
            # (Error you get: "Invalid 'max_output_tokens': integer below minimum value.
            # Expected a value >= 16, but got 5 instead.")
            "max_output_tokens": max(16, request.max_tokens),
            "temperature": request.temperature,
            # Don't store responses for later retrieval
            "store": False,
        }
        if self.reasoning_effort:
            raw_request["reasoning"] = {"effort": self.reasoning_effort}
        model_engine: str = request.model_engine
        if OpenAIClientUtils.is_reasoning_model(model_engine):
            # If o-series model, request detailed reasoning summaries.
            # Use setdefault so this works even when no reasoning_effort was
            # configured; previously this indexed raw_request["reasoning"]
            # directly and raised KeyError in that case.
            raw_request.setdefault("reasoning", {})["summary"] = "detailed"
            # Avoid error:
            # "Error code: 400 - {'error': {'message': "Unsupported parameter: 'temperature' is
            # not supported with this model.", 'type': 'invalid_request_error', 'param': 'temperature',
            # 'code': 'unsupported_parameter'}}"
            raw_request.pop("temperature", None)

            # The following parameters also happen to be unsupported by the o-series (code unsupported_parameter)
            raw_request.pop("top_p", None)

        return raw_request

    def _get_model_for_request(self, request: Request) -> str:
        """Return the explicit OpenAI model name if configured, else the request's model engine."""
        return self.openai_model_name or request.model_engine

    def make_request(self, request: Request) -> RequestResult:
        """Issue the request against the Responses API and return a `RequestResult`.

        Content can either be text or a list of multimodal content made up of
        text and images:
        https://platform.openai.com/docs/api-reference/responses/create
        """
        raw_request = self._make_raw_request(request)

        # The Responses API does not support a "num_completions" parameter,
        # so we need to handle it ourselves with a simple loop.
        # NOTE(review): this assumes request.num_completions >= 1; with 0,
        # `response`/`cached` below would be unbound — confirm upstream guarantees.
        completions: list[GeneratedOutput] = []
        for _ in range(request.num_completions):

            def do_it() -> Dict[str, Any]:
                raw_response = self.client.responses.create(**raw_request).model_dump(mode="json")
                assert not raw_response.get("error", None), f"Error in response: {raw_response}"
                return raw_response

            try:
                cache_key = self._get_cache_key(raw_request, request)
                response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
            except openai.OpenAIError as e:
                return OpenAIClientUtils.handle_openai_error(e, request)

            # We can only return one completion per API call, but we get an
            # array of output items back, so we need to concatenate them.
            reasoning_output = ""
            text_output = ""

            if request.echo_prompt:
                text_output += request.prompt
            for output in response["output"]:
                output_type = output["type"]  # one of "message" or "reasoning" from API observation
                is_reasoning_output = output_type == "reasoning"

                if is_reasoning_output:
                    reasoning_output += "\n".join([raw_output["text"] for raw_output in output["summary"]])
                else:
                    text_output += "\n".join([raw_output["text"] for raw_output in output["content"]])

            completion = truncate_and_tokenize_response_text(
                text_output,
                request,
                self.tokenizer,
                self.tokenizer_name,
                original_finish_reason="",
            )
            if reasoning_output:
                completion = dataclasses.replace(completion, thinking=Thinking(text=reasoning_output))
            completions.append(completion)

        return RequestResult(
            success=True,
            cached=cached,
            request_time=response["request_time"],
            request_datetime=response.get("request_datetime"),
            completions=completions,
            embedding=[],
        )