crfm-helm: crfm_helm-0.5.4-py3-none-any.whl → crfm_helm-0.5.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- crfm_helm-0.5.6.dist-info/METADATA +427 -0
- crfm_helm-0.5.6.dist-info/RECORD +941 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +21 -6
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +214 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +11 -12
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +14 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +14 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +6 -6
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +14 -0
- helm/benchmark/metrics/medalign_metrics.py +14 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +14 -0
- helm/benchmark/metrics/medication_qa_metrics.py +10 -19
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +14 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +20 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/toxicity_metrics.py +6 -6
- helm/benchmark/metrics/unitxt_metrics.py +7 -5
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_deployment_registry.py +6 -8
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +33 -12
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +2 -1
- helm/benchmark/presentation/summarize.py +97 -67
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +86 -90
- helm/benchmark/run_expander.py +90 -9
- helm/benchmark/run_spec_factory.py +13 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +142 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +141 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +103 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +157 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +136 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +94 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
- helm/benchmark/scenarios/medbullets_scenario.py +145 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
- helm/benchmark/scenarios/medec_scenario.py +125 -0
- helm/benchmark/scenarios/medhallu_scenario.py +72 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +123 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +12 -2
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +13 -1
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_long_context.yaml +283 -0
- helm/benchmark/static/schema_medhelm.yaml +1140 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +129 -56
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-94295e78.js +10 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +6 -6
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +4 -5
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +120 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
- helm/clients/audio_language/qwen_audiolm_client.py +152 -0
- helm/clients/audio_language/test.py +62 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +203 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/client.py +7 -7
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/grok_client.py +36 -0
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +52 -21
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
- helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
- helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +308 -43
- helm/clients/openai_responses_client.py +174 -0
- helm/clients/palmyra_client.py +3 -9
- helm/clients/reka_client.py +3 -3
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +76 -9
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +45 -13
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +188 -0
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +4 -6
- helm/clients/writer_client.py +102 -0
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +46 -3
- helm/common/images_utils.py +2 -2
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +14 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1792 -28
- helm/config/model_metadata.yaml +1606 -51
- helm/config/tokenizer_configs.yaml +521 -4
- helm/proxy/cli.py +5 -3
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +22 -86
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/grok_tokenizer.py +53 -0
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +3 -3
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
|
@@ -2,16 +2,16 @@
|
|
|
2
2
|
<html lang="en">
|
|
3
3
|
<head>
|
|
4
4
|
<meta charset="UTF-8" />
|
|
5
|
-
<link rel="icon" type="image/svg+xml" href="
|
|
5
|
+
<link rel="icon" type="image/svg+xml" href="https://crfm.stanford.edu/helm/helm.svg" />
|
|
6
6
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
|
7
7
|
<title>Holistic Evaluation of Language Models (HELM)</title>
|
|
8
8
|
<meta name="description" content="The Holistic Evaluation of Language Models (HELM) serves as a living benchmark for transparency in language models. Providing broad coverage and recognizing incompleteness, multi-metric measurements, and standardization. All data and analysis are freely accessible on the website for exploration and study." />
|
|
9
9
|
<script type="text/javascript" src="./config.js"></script>
|
|
10
|
-
<script type="module" crossorigin src="./assets/index-
|
|
11
|
-
<link rel="modulepreload" crossorigin href="./assets/react-
|
|
12
|
-
<link rel="modulepreload" crossorigin href="./assets/recharts-
|
|
13
|
-
<link rel="modulepreload" crossorigin href="./assets/tremor-
|
|
14
|
-
<link rel="stylesheet" href="./assets/index-
|
|
10
|
+
<script type="module" crossorigin src="./assets/index-94295e78.js"></script>
|
|
11
|
+
<link rel="modulepreload" crossorigin href="./assets/react-f82877fd.js">
|
|
12
|
+
<link rel="modulepreload" crossorigin href="./assets/recharts-4037aff0.js">
|
|
13
|
+
<link rel="modulepreload" crossorigin href="./assets/tremor-38a10867.js">
|
|
14
|
+
<link rel="stylesheet" href="./assets/index-b9779128.css">
|
|
15
15
|
</head>
|
|
16
16
|
<body class="block">
|
|
17
17
|
<div id="root"></div>
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from abc import ABC
|
|
2
2
|
|
|
3
|
-
from helm.common.hierarchical_logger import
|
|
4
|
-
from .local_window_service import LocalWindowService
|
|
3
|
+
from helm.common.hierarchical_logger import hwarn
|
|
4
|
+
from helm.benchmark.window_services.local_window_service import LocalWindowService
|
|
5
5
|
|
|
6
6
|
|
|
7
7
|
class EncoderDecoderWindowService(LocalWindowService, ABC):
|
|
@@ -21,8 +21,8 @@ class EncoderDecoderWindowService(LocalWindowService, ABC):
|
|
|
21
21
|
vs. the completions, we check the two values separately.
|
|
22
22
|
"""
|
|
23
23
|
if expected_completion_token_length > self.max_output_length:
|
|
24
|
-
|
|
25
|
-
f"
|
|
24
|
+
hwarn(
|
|
25
|
+
f"The expected completion token length ({expected_completion_token_length}) "
|
|
26
26
|
f"exceeds the max output length ({self.max_output_length})."
|
|
27
27
|
)
|
|
28
28
|
return self.get_num_tokens(text) <= self.max_request_length
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
from abc import ABC
|
|
2
2
|
from typing import List, Optional, cast
|
|
3
3
|
|
|
4
|
-
from .window_service import ConfigurableWindowService, EncodeResult
|
|
5
|
-
from .tokenizer_service import TokenizerService
|
|
4
|
+
from helm.benchmark.window_services.window_service import ConfigurableWindowService, EncodeResult
|
|
5
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
6
6
|
from helm.common.tokenization_request import (
|
|
7
7
|
DecodeRequest,
|
|
8
8
|
DecodeRequestResult,
|
|
@@ -3,9 +3,9 @@ import tempfile
|
|
|
3
3
|
from typing import List
|
|
4
4
|
|
|
5
5
|
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
6
|
-
from .tokenizer_service import TokenizerService
|
|
7
|
-
from .window_service_factory import WindowServiceFactory
|
|
8
|
-
from .test_utils import get_tokenizer_service, TEST_PROMPT
|
|
6
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
7
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
8
|
+
from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class TestAnthropicWindowService:
|
|
@@ -3,9 +3,9 @@ import tempfile
|
|
|
3
3
|
from typing import List
|
|
4
4
|
|
|
5
5
|
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
6
|
-
from .tokenizer_service import TokenizerService
|
|
7
|
-
from .window_service_factory import WindowServiceFactory
|
|
8
|
-
from .test_utils import get_tokenizer_service, TEST_PROMPT
|
|
6
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
7
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
8
|
+
from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class TestBloomWindowService:
|
|
@@ -3,8 +3,13 @@ import tempfile
|
|
|
3
3
|
|
|
4
4
|
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
5
5
|
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
6
|
-
from .test_utils import
|
|
7
|
-
|
|
6
|
+
from helm.benchmark.window_services.test_utils import (
|
|
7
|
+
get_tokenizer_service,
|
|
8
|
+
TEST_PROMPT,
|
|
9
|
+
GPT2_TEST_TOKENS,
|
|
10
|
+
GPT2_TEST_TOKEN_IDS,
|
|
11
|
+
)
|
|
12
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
8
13
|
|
|
9
14
|
|
|
10
15
|
class TestGPT2WindowService:
|
|
@@ -2,9 +2,14 @@ import shutil
|
|
|
2
2
|
import tempfile
|
|
3
3
|
|
|
4
4
|
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
5
|
-
from .test_utils import
|
|
6
|
-
|
|
7
|
-
|
|
5
|
+
from helm.benchmark.window_services.test_utils import (
|
|
6
|
+
get_tokenizer_service,
|
|
7
|
+
TEST_PROMPT,
|
|
8
|
+
GPT4_TEST_TOKEN_IDS,
|
|
9
|
+
GPT4_TEST_TOKENS,
|
|
10
|
+
)
|
|
11
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
12
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
8
13
|
|
|
9
14
|
|
|
10
15
|
class TestOpenAIWindowService:
|
|
@@ -2,9 +2,14 @@ import shutil
|
|
|
2
2
|
import tempfile
|
|
3
3
|
|
|
4
4
|
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
5
|
-
from .tokenizer_service import TokenizerService
|
|
6
|
-
from .window_service_factory import WindowServiceFactory
|
|
7
|
-
from .test_utils import
|
|
5
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
6
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
7
|
+
from helm.benchmark.window_services.test_utils import (
|
|
8
|
+
get_tokenizer_service,
|
|
9
|
+
GPT2_TEST_TOKENS,
|
|
10
|
+
GPT2_TEST_TOKEN_IDS,
|
|
11
|
+
TEST_PROMPT,
|
|
12
|
+
)
|
|
8
13
|
|
|
9
14
|
|
|
10
15
|
class TestGPTJWindowService:
|
|
@@ -3,9 +3,9 @@ import tempfile
|
|
|
3
3
|
from typing import List
|
|
4
4
|
|
|
5
5
|
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
6
|
-
from .tokenizer_service import TokenizerService
|
|
7
|
-
from .window_service_factory import WindowServiceFactory
|
|
8
|
-
from .test_utils import get_tokenizer_service, TEST_PROMPT
|
|
6
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
7
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
8
|
+
from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class TestGPTNeoXWindowService:
|
|
@@ -2,9 +2,14 @@ import shutil
|
|
|
2
2
|
import tempfile
|
|
3
3
|
|
|
4
4
|
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
5
|
-
from .test_utils import
|
|
6
|
-
|
|
7
|
-
|
|
5
|
+
from helm.benchmark.window_services.test_utils import (
|
|
6
|
+
get_tokenizer_service,
|
|
7
|
+
TEST_PROMPT,
|
|
8
|
+
GPT2_TEST_TOKENS,
|
|
9
|
+
GPT2_TEST_TOKEN_IDS,
|
|
10
|
+
)
|
|
11
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
12
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
8
13
|
|
|
9
14
|
|
|
10
15
|
class TestOpenAIWindowService:
|
|
@@ -2,9 +2,9 @@ import shutil
|
|
|
2
2
|
import tempfile
|
|
3
3
|
|
|
4
4
|
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
5
|
-
from .test_utils import get_tokenizer_service, TEST_PROMPT
|
|
6
|
-
from .tokenizer_service import TokenizerService
|
|
7
|
-
from .window_service_factory import WindowServiceFactory
|
|
5
|
+
from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
|
|
6
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
7
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
class TestOPTWindowService:
|
|
@@ -2,9 +2,9 @@ from tempfile import TemporaryDirectory
|
|
|
2
2
|
from typing import List
|
|
3
3
|
|
|
4
4
|
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
5
|
-
from .tokenizer_service import TokenizerService
|
|
6
|
-
from .window_service_factory import WindowServiceFactory
|
|
7
|
-
from .test_utils import get_tokenizer_service, TEST_PROMPT
|
|
5
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
6
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
7
|
+
from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
class TestPalmyraWindowService:
|
|
@@ -3,9 +3,9 @@ import tempfile
|
|
|
3
3
|
from typing import List
|
|
4
4
|
|
|
5
5
|
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
6
|
-
from .tokenizer_service import TokenizerService
|
|
7
|
-
from .window_service_factory import WindowServiceFactory
|
|
8
|
-
from .test_utils import get_tokenizer_service, TEST_PROMPT
|
|
6
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
7
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
8
|
+
from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class TestT0ppWindowService:
|
|
@@ -3,9 +3,9 @@ import tempfile
|
|
|
3
3
|
from typing import List
|
|
4
4
|
|
|
5
5
|
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
6
|
-
from .tokenizer_service import TokenizerService
|
|
7
|
-
from .window_service_factory import WindowServiceFactory
|
|
8
|
-
from .test_utils import get_tokenizer_service, TEST_PROMPT
|
|
6
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
7
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
8
|
+
from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class TestT511bWindowService:
|
|
@@ -3,9 +3,9 @@ import tempfile
|
|
|
3
3
|
from typing import List
|
|
4
4
|
|
|
5
5
|
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
6
|
-
from .tokenizer_service import TokenizerService
|
|
7
|
-
from .window_service_factory import WindowServiceFactory
|
|
8
|
-
from .test_utils import get_tokenizer_service, TEST_PROMPT
|
|
6
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
7
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
8
|
+
from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
class TestUL2WindowService:
|
|
@@ -1,10 +1,9 @@
|
|
|
1
1
|
from typing import List
|
|
2
2
|
|
|
3
|
-
from helm.common.
|
|
3
|
+
from helm.common.local_context import LocalContext
|
|
4
4
|
from helm.common.cache_backend_config import CacheBackendConfig
|
|
5
|
-
from helm.proxy.services.server_service import ServerService
|
|
6
5
|
from helm.benchmark.metrics.metric_service import MetricService
|
|
7
|
-
from .tokenizer_service import TokenizerService
|
|
6
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
8
7
|
|
|
9
8
|
|
|
10
9
|
TEST_PROMPT: str = (
|
|
@@ -229,5 +228,5 @@ GPT4_TEST_TOKENS: List[str] = [
|
|
|
229
228
|
|
|
230
229
|
|
|
231
230
|
def get_tokenizer_service(local_path: str, cache_backend_config: CacheBackendConfig) -> TokenizerService:
|
|
232
|
-
|
|
233
|
-
return MetricService(
|
|
231
|
+
context = LocalContext(base_path=local_path, cache_backend_config=cache_backend_config)
|
|
232
|
+
return MetricService(context)
|
|
@@ -2,9 +2,9 @@ import shutil
|
|
|
2
2
|
import tempfile
|
|
3
3
|
|
|
4
4
|
from helm.common.cache_backend_config import BlackHoleCacheBackendConfig
|
|
5
|
-
from .test_utils import get_tokenizer_service, TEST_PROMPT
|
|
6
|
-
from .tokenizer_service import TokenizerService
|
|
7
|
-
from .window_service_factory import WindowServiceFactory
|
|
5
|
+
from helm.benchmark.window_services.test_utils import get_tokenizer_service, TEST_PROMPT
|
|
6
|
+
from helm.benchmark.window_services.tokenizer_service import TokenizerService
|
|
7
|
+
from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
class TestYaLMWindowService:
|
|
@@ -1,26 +1,25 @@
|
|
|
1
|
-
from helm.common.
|
|
1
|
+
from helm.common.context import Context
|
|
2
2
|
from helm.common.tokenization_request import (
|
|
3
3
|
TokenizationRequest,
|
|
4
4
|
TokenizationRequestResult,
|
|
5
5
|
DecodeRequest,
|
|
6
6
|
DecodeRequestResult,
|
|
7
7
|
)
|
|
8
|
-
from helm.proxy.services.service import Service
|
|
9
8
|
|
|
10
9
|
|
|
10
|
+
# TODO: Rename this to TokenizerContext
|
|
11
11
|
class TokenizerService:
|
|
12
12
|
"""
|
|
13
|
-
A wrapper around `
|
|
13
|
+
A wrapper around `Context` that makes only necessary server requests to tokenize.
|
|
14
14
|
"""
|
|
15
15
|
|
|
16
|
-
def __init__(self,
|
|
17
|
-
self.
|
|
18
|
-
self._auth: Authentication = auth
|
|
16
|
+
def __init__(self, context: Context):
|
|
17
|
+
self._context: Context = context
|
|
19
18
|
|
|
20
19
|
def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
|
|
21
20
|
"""Tokenize via an API."""
|
|
22
|
-
return self.
|
|
21
|
+
return self._context.tokenize(request)
|
|
23
22
|
|
|
24
23
|
def decode(self, request: DecodeRequest) -> DecodeRequestResult:
|
|
25
24
|
"""Decode via an API."""
|
|
26
|
-
return self.
|
|
25
|
+
return self._context.decode(request)
|
helm/clients/ai21_client.py
CHANGED
|
@@ -11,8 +11,8 @@ from helm.common.request import (
|
|
|
11
11
|
GeneratedOutput,
|
|
12
12
|
Token,
|
|
13
13
|
)
|
|
14
|
-
from .client import CachingClient, truncate_sequence, cleanup_str
|
|
15
|
-
from .ai21_utils import AI21RequestError, handle_failed_request
|
|
14
|
+
from helm.clients.client import CachingClient, truncate_sequence, cleanup_str
|
|
15
|
+
from helm.clients.ai21_utils import AI21RequestError, handle_failed_request
|
|
16
16
|
|
|
17
17
|
try:
|
|
18
18
|
from ai21 import AI21Client as AISDKClient
|
|
@@ -186,7 +186,7 @@ class AI21ChatClient(CachingClient):
|
|
|
186
186
|
completions: List[GeneratedOutput] = []
|
|
187
187
|
|
|
188
188
|
for choice in response["choices"]:
|
|
189
|
-
completions.append(GeneratedOutput(text=choice["message"]["content"], logprob=0.0, tokens=[]))
|
|
189
|
+
completions.append(GeneratedOutput(text=choice["message"]["content"] or "", logprob=0.0, tokens=[]))
|
|
190
190
|
|
|
191
191
|
return RequestResult(
|
|
192
192
|
success=True,
|
|
@@ -4,7 +4,7 @@ from helm.common.cache import CacheConfig
|
|
|
4
4
|
from helm.common.media_object import TEXT_TYPE
|
|
5
5
|
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
6
6
|
from helm.common.request import wrap_request_time, Request, RequestResult, GeneratedOutput, Token
|
|
7
|
-
from .client import CachingClient, truncate_sequence, generate_uid_for_multimodal_prompt
|
|
7
|
+
from helm.clients.client import CachingClient, truncate_sequence, generate_uid_for_multimodal_prompt
|
|
8
8
|
|
|
9
9
|
try:
|
|
10
10
|
from aleph_alpha_client import Client, CompletionRequest, CompletionResponse, Image, Prompt
|
helm/clients/anthropic_client.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import dataclasses
|
|
1
2
|
from typing import Any, Dict, List, Optional, TypedDict, Union, cast
|
|
2
3
|
import json
|
|
3
4
|
import os
|
|
@@ -7,10 +8,11 @@ import time
|
|
|
7
8
|
import urllib.parse
|
|
8
9
|
|
|
9
10
|
from helm.common.cache import CacheConfig
|
|
10
|
-
from helm.common.hierarchical_logger import htrack_block, hlog
|
|
11
|
+
from helm.common.hierarchical_logger import htrack_block, hlog, hwarn
|
|
11
12
|
from helm.common.media_object import IMAGE_TYPE, TEXT_TYPE
|
|
12
13
|
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
13
14
|
from helm.common.request import (
|
|
15
|
+
Thinking,
|
|
14
16
|
wrap_request_time,
|
|
15
17
|
EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
|
|
16
18
|
Request,
|
|
@@ -30,8 +32,12 @@ from helm.clients.client import CachingClient, truncate_sequence, truncate_and_t
|
|
|
30
32
|
try:
|
|
31
33
|
from anthropic import Anthropic, BadRequestError
|
|
32
34
|
from anthropic.types import MessageParam
|
|
35
|
+
from anthropic.types.message import Message
|
|
36
|
+
from anthropic.types.text_block import TextBlock
|
|
37
|
+
from anthropic.types.thinking_block import ThinkingBlock
|
|
33
38
|
from anthropic.types.image_block_param import ImageBlockParam
|
|
34
39
|
from anthropic.types.text_block_param import TextBlockParam
|
|
40
|
+
from anthropic.types.thinking_config_enabled_param import ThinkingConfigEnabledParam
|
|
35
41
|
import websocket
|
|
36
42
|
except ModuleNotFoundError as e:
|
|
37
43
|
handle_module_not_found_error(e, ["anthropic"])
|
|
@@ -231,30 +237,41 @@ class AnthropicMessagesRequest(TypedDict, total=False):
|
|
|
231
237
|
temperature: float
|
|
232
238
|
top_k: int
|
|
233
239
|
top_p: float
|
|
240
|
+
thinking: ThinkingConfigEnabledParam
|
|
234
241
|
|
|
235
242
|
|
|
236
243
|
class AnthropicMessagesRequestError(NonRetriableException):
|
|
237
244
|
pass
|
|
238
245
|
|
|
239
246
|
|
|
240
|
-
class
|
|
247
|
+
class AnthropicMessagesEmptyContentError(Exception):
|
|
241
248
|
pass
|
|
242
249
|
|
|
243
250
|
|
|
244
251
|
class AnthropicMessagesClient(CachingClient):
|
|
245
252
|
# Source: https://docs.anthropic.com/claude/docs/models-overview
|
|
246
|
-
MAX_OUTPUT_TOKENS: int =
|
|
253
|
+
MAX_OUTPUT_TOKENS: int = 64000
|
|
247
254
|
|
|
248
255
|
MAX_IMAGE_SIZE_BYTES: int = 5242880 # 5MB
|
|
249
256
|
|
|
250
257
|
def __init__(
|
|
251
|
-
self,
|
|
258
|
+
self,
|
|
259
|
+
tokenizer: Tokenizer,
|
|
260
|
+
tokenizer_name: str,
|
|
261
|
+
cache_config: CacheConfig,
|
|
262
|
+
thinking_budget_tokens: Optional[int] = None,
|
|
263
|
+
anthropic_model_name: Optional[str] = None,
|
|
264
|
+
api_key: Optional[str] = None,
|
|
265
|
+
stream: Optional[bool] = None,
|
|
252
266
|
):
|
|
253
267
|
super().__init__(cache_config=cache_config)
|
|
254
268
|
self.tokenizer = tokenizer
|
|
255
269
|
self.tokenizer_name = tokenizer_name
|
|
256
270
|
self.client = Anthropic(api_key=api_key)
|
|
257
271
|
self.api_key: Optional[str] = api_key
|
|
272
|
+
self.anthropic_model_name: Optional[str] = anthropic_model_name
|
|
273
|
+
self.thinking_budget_tokens: Optional[int] = thinking_budget_tokens
|
|
274
|
+
self.stream: Optional[bool] = stream
|
|
258
275
|
|
|
259
276
|
def make_request(self, request: Request) -> RequestResult:
|
|
260
277
|
if request.max_tokens > AnthropicMessagesClient.MAX_OUTPUT_TOKENS:
|
|
@@ -293,8 +310,8 @@ class AnthropicMessagesClient(CachingClient):
|
|
|
293
310
|
image_width > AnthropicClient.MAX_IMAGE_DIMENSION
|
|
294
311
|
or image_height > AnthropicClient.MAX_IMAGE_DIMENSION
|
|
295
312
|
):
|
|
296
|
-
|
|
297
|
-
f"
|
|
313
|
+
hwarn(
|
|
314
|
+
f"Image {image_location} exceeds max allowed size: "
|
|
298
315
|
f"{AnthropicClient.MAX_IMAGE_DIMENSION} pixels"
|
|
299
316
|
)
|
|
300
317
|
# Save the resized image to a temporary file
|
|
@@ -309,8 +326,8 @@ class AnthropicMessagesClient(CachingClient):
|
|
|
309
326
|
base64_image = encode_base64(temp_file.name, format="JPEG")
|
|
310
327
|
|
|
311
328
|
elif os.path.getsize(image_location) > AnthropicMessagesClient.MAX_IMAGE_SIZE_BYTES:
|
|
312
|
-
|
|
313
|
-
f"
|
|
329
|
+
hwarn(
|
|
330
|
+
f"Image {image_location} exceeds max allowed size: "
|
|
314
331
|
f"{AnthropicMessagesClient.MAX_IMAGE_SIZE_BYTES} bytes"
|
|
315
332
|
)
|
|
316
333
|
# Resize the image so it is smaller than the max allowed size
|
|
@@ -351,7 +368,7 @@ class AnthropicMessagesClient(CachingClient):
|
|
|
351
368
|
|
|
352
369
|
raw_request: AnthropicMessagesRequest = {
|
|
353
370
|
"messages": messages,
|
|
354
|
-
"model": request.model_engine,
|
|
371
|
+
"model": self.anthropic_model_name or request.model_engine,
|
|
355
372
|
"stop_sequences": request.stop_sequences,
|
|
356
373
|
"max_tokens": request.max_tokens,
|
|
357
374
|
"temperature": request.temperature,
|
|
@@ -360,6 +377,15 @@ class AnthropicMessagesClient(CachingClient):
|
|
|
360
377
|
}
|
|
361
378
|
if system_message is not None:
|
|
362
379
|
raw_request["system"] = cast(str, system_message["content"])
|
|
380
|
+
if self.thinking_budget_tokens:
|
|
381
|
+
raw_request["thinking"] = {
|
|
382
|
+
"type": "enabled",
|
|
383
|
+
"budget_tokens": self.thinking_budget_tokens,
|
|
384
|
+
}
|
|
385
|
+
# Avoid error:
|
|
386
|
+
# `top_k` must be unset when thinking is enabled. Please consult our documentation at https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking#important-considerations-when-using-extended-thinking # noqa: E501
|
|
387
|
+
del raw_request["top_k"]
|
|
388
|
+
|
|
363
389
|
completions: List[GeneratedOutput] = []
|
|
364
390
|
|
|
365
391
|
# `num_completions` is not supported, so instead make `num_completions` separate requests.
|
|
@@ -367,11 +393,15 @@ class AnthropicMessagesClient(CachingClient):
|
|
|
367
393
|
|
|
368
394
|
def do_it() -> Dict[str, Any]:
|
|
369
395
|
try:
|
|
370
|
-
|
|
396
|
+
if self.stream:
|
|
397
|
+
with self.client.messages.stream(**raw_request) as message_stream:
|
|
398
|
+
result = message_stream.get_final_message().model_dump()
|
|
399
|
+
else:
|
|
400
|
+
result = self.client.messages.create(**raw_request).model_dump()
|
|
371
401
|
if "content" not in result or not result["content"]:
|
|
372
|
-
raise
|
|
373
|
-
elif "text" not in result["content"][
|
|
374
|
-
raise
|
|
402
|
+
raise AnthropicMessagesEmptyContentError(f"Anthropic response has empty content: {result}")
|
|
403
|
+
elif "text" not in result["content"][-1]:
|
|
404
|
+
raise AnthropicMessagesEmptyContentError(f"Anthropic response has non-text content: {result}")
|
|
375
405
|
return result
|
|
376
406
|
except BadRequestError as e:
|
|
377
407
|
response = e.response.json()
|
|
@@ -387,9 +417,10 @@ class AnthropicMessagesClient(CachingClient):
|
|
|
387
417
|
},
|
|
388
418
|
request,
|
|
389
419
|
)
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
420
|
+
raw_response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
|
|
421
|
+
|
|
422
|
+
except AnthropicMessagesEmptyContentError:
|
|
423
|
+
hwarn("Anthropic response has empty content")
|
|
393
424
|
return RequestResult(
|
|
394
425
|
success=False,
|
|
395
426
|
cached=False,
|
|
@@ -399,32 +430,41 @@ class AnthropicMessagesClient(CachingClient):
|
|
|
399
430
|
error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
|
|
400
431
|
)
|
|
401
432
|
|
|
402
|
-
if _is_content_moderation_failure(
|
|
403
|
-
|
|
404
|
-
f"WARNING: Returning empty request for {request.model_deployment} "
|
|
405
|
-
"due to content moderation filter"
|
|
406
|
-
)
|
|
433
|
+
if _is_content_moderation_failure(raw_response):
|
|
434
|
+
hwarn(f"Returning empty request for {request.model_deployment} " "due to content moderation filter")
|
|
407
435
|
return RequestResult(
|
|
408
436
|
success=False,
|
|
409
437
|
cached=cached,
|
|
410
|
-
error=
|
|
438
|
+
error=raw_response["error"]["message"],
|
|
411
439
|
completions=[],
|
|
412
440
|
embedding=[],
|
|
413
441
|
error_flags=ErrorFlags(is_retriable=False, is_fatal=False),
|
|
414
|
-
request_time=
|
|
415
|
-
request_datetime=
|
|
442
|
+
request_time=raw_response["request_time"],
|
|
443
|
+
request_datetime=raw_response["request_datetime"],
|
|
416
444
|
)
|
|
417
445
|
|
|
446
|
+
response_message: Message = Message.model_validate(raw_response)
|
|
447
|
+
response_text: Optional[str] = None
|
|
448
|
+
response_thinking: Optional[str] = None
|
|
449
|
+
for content in response_message.content:
|
|
450
|
+
if isinstance(content, TextBlock):
|
|
451
|
+
response_text = content.text
|
|
452
|
+
elif isinstance(content, ThinkingBlock):
|
|
453
|
+
response_thinking = content.thinking
|
|
454
|
+
if response_text is None:
|
|
455
|
+
raise Exception("Anthropic response did not contain text block")
|
|
418
456
|
completion = truncate_and_tokenize_response_text(
|
|
419
|
-
|
|
457
|
+
response_text, request, self.tokenizer, self.tokenizer_name, original_finish_reason=""
|
|
420
458
|
)
|
|
459
|
+
if response_thinking is not None:
|
|
460
|
+
completion = dataclasses.replace(completion, thinking=Thinking(text=response_thinking))
|
|
421
461
|
completions.append(completion)
|
|
422
462
|
|
|
423
463
|
return RequestResult(
|
|
424
464
|
success=True,
|
|
425
465
|
cached=cached,
|
|
426
|
-
request_time=
|
|
427
|
-
request_datetime=
|
|
466
|
+
request_time=raw_response["request_time"],
|
|
467
|
+
request_datetime=raw_response["request_datetime"],
|
|
428
468
|
completions=completions,
|
|
429
469
|
embedding=[],
|
|
430
470
|
)
|
|
@@ -617,8 +657,8 @@ class AnthropicLegacyClient(CachingClient):
|
|
|
617
657
|
if logprobs["tokens"] != tokens:
|
|
618
658
|
# This is a known limitation with the Anthropic API. For now keep track of the
|
|
619
659
|
# entries with the mismatch.
|
|
620
|
-
|
|
621
|
-
f"
|
|
660
|
+
hwarn(
|
|
661
|
+
f"naive truncation for logprobs did not work."
|
|
622
662
|
f"\nRequest:{raw_request}\nExpected: {tokens}\nActual: {logprobs['tokens']}"
|
|
623
663
|
)
|
|
624
664
|
check_logprobs = True
|
|
File without changes
|