crfm-helm 0.4.0-py3-none-any.whl → 0.5.10-py3-none-any.whl
This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm has been flagged as potentially problematic; see the package registry for details.
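The file-level summary below can be reproduced locally by comparing the contents of the two wheels, since a wheel is just a zip archive. Here is a minimal sketch using only the Python standard library, assuming both wheels have already been downloaded (for example with `pip download crfm-helm==0.4.0 --no-deps -d wheels/`, and likewise for 0.5.10); the local paths are illustrative.

```python
# Minimal sketch: list files added to / removed from a package between two wheel releases.
# Assumes the wheels were fetched beforehand, e.g.:
#   pip download crfm-helm==0.4.0 --no-deps -d wheels/
#   pip download crfm-helm==0.5.10 --no-deps -d wheels/
import zipfile


def wheel_files(path: str) -> set[str]:
    """Return the set of file paths contained in a wheel (a zip archive)."""
    with zipfile.ZipFile(path) as wheel:
        return set(wheel.namelist())


old = wheel_files("wheels/crfm_helm-0.4.0-py3-none-any.whl")
new = wheel_files("wheels/crfm_helm-0.5.10-py3-none-any.whl")

print(f"added:   {len(new - old)} files")
print(f"removed: {len(old - new)} files")
for name in sorted(new - old):
    print("+", name)
for name in sorted(old - new):
    print("-", name)
```

This only recovers file additions, removals, and renames; the per-file `+N -M` line counts shown in the listing below come from diffing the extracted file contents as well.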
- crfm_helm-0.5.10.dist-info/METADATA +369 -0
- crfm_helm-0.5.10.dist-info/RECORD +1008 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +80 -29
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
- helm/benchmark/adaptation/common_adapter_specs.py +443 -0
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/aci_bench_annotator.py +84 -0
- helm/benchmark/annotation/air_bench_annotator.py +79 -0
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/annotator.py +48 -0
- helm/benchmark/annotation/annotator_factory.py +50 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +258 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +96 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +55 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
- helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
- helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/live_qa_annotator.py +76 -0
- helm/benchmark/annotation/med_dialog_annotator.py +88 -0
- helm/benchmark/annotation/medalign_annotator.py +89 -0
- helm/benchmark/annotation/medi_qa_annotator.py +87 -0
- helm/benchmark/annotation/medication_qa_annotator.py +86 -0
- helm/benchmark/annotation/mental_health_annotator.py +87 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
- helm/benchmark/annotation/model_as_judge.py +309 -0
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +100 -0
- helm/benchmark/annotation_executor.py +144 -0
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +26 -4
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +56 -19
- helm/benchmark/augmentations/translate_perturbation.py +31 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +54 -25
- helm/benchmark/huggingface_registration.py +28 -10
- helm/benchmark/metrics/air_bench_metrics.py +3212 -0
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/basic_metrics.py +437 -667
- helm/benchmark/metrics/bbq_metrics.py +17 -6
- helm/benchmark/metrics/bias_metrics.py +18 -9
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/classification_metrics.py +107 -22
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/code_metrics_helper.py +11 -3
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +174 -0
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
- helm/benchmark/metrics/copyright_metrics.py +5 -5
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +8 -114
- helm/benchmark/metrics/dry_run_metrics.py +35 -6
- helm/benchmark/metrics/efficiency_metrics.py +287 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +67 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
- helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
- helm/benchmark/metrics/language_modeling_metrics.py +111 -0
- helm/benchmark/metrics/live_qa_metrics.py +35 -0
- helm/benchmark/metrics/llm_jury_metrics.py +58 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
- helm/benchmark/metrics/medec_metrics.py +124 -0
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/metric.py +121 -175
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +23 -7
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/omni_math_metrics.py +44 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/ranking_metrics.py +5 -5
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/safety_metrics.py +91 -0
- helm/benchmark/metrics/seahelm_metrics.py +201 -0
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +8 -11
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +150 -11
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +145 -70
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
- helm/benchmark/metrics/test_metric.py +3 -3
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
- helm/benchmark/metrics/toxicity_metrics.py +37 -7
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/unitxt_metrics.py +107 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/metrics/wildbench_metrics.py +54 -0
- helm/benchmark/model_deployment_registry.py +69 -5
- helm/benchmark/model_metadata_registry.py +58 -2
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +51 -20
- helm/benchmark/presentation/run_display.py +51 -12
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +83 -66
- helm/benchmark/presentation/summarize.py +483 -388
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/presentation/test_run_entry.py +2 -2
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/presentation/test_summarize.py +148 -6
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +151 -87
- helm/benchmark/run_expander.py +418 -33
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +180 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/call_center_run_specs.py +201 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1393 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +224 -0
- helm/benchmark/run_specs/finance_run_specs.py +114 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +625 -0
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +188 -0
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +191 -0
- helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
- helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +63 -62
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
- helm/benchmark/scenarios/air_bench_scenario.py +76 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
- helm/benchmark/scenarios/banking77_scenario.py +77 -0
- helm/benchmark/scenarios/bbq_scenario.py +17 -2
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +18 -3
- helm/benchmark/scenarios/boolq_scenario.py +21 -1
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
- helm/benchmark/scenarios/clear_scenario.py +180 -0
- helm/benchmark/scenarios/cleva_scenario.py +482 -3
- helm/benchmark/scenarios/code_scenario.py +46 -4
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +33 -1
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
- helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
- helm/benchmark/scenarios/disinformation_scenario.py +32 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
- helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
- helm/benchmark/scenarios/financebench_scenario.py +74 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
- helm/benchmark/scenarios/gpqa_scenario.py +98 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +21 -2
- helm/benchmark/scenarios/gsm_scenario.py +31 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
- helm/benchmark/scenarios/headqa_scenario.py +158 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/ice_scenario.py +28 -4
- helm/benchmark/scenarios/ifeval_scenario.py +71 -0
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +26 -3
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
- helm/benchmark/scenarios/legal_support_scenario.py +24 -1
- helm/benchmark/scenarios/legalbench_scenario.py +45 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
- helm/benchmark/scenarios/lextreme_scenario.py +22 -1
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +81 -22
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +30 -1
- helm/benchmark/scenarios/medalign_scenario.py +117 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
- helm/benchmark/scenarios/medbullets_scenario.py +167 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
- helm/benchmark/scenarios/medec_scenario.py +148 -0
- helm/benchmark/scenarios/medhallu_scenario.py +95 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +146 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
- helm/benchmark/scenarios/mmlu_scenario.py +32 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +31 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +71 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
- helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
- helm/benchmark/scenarios/quac_scenario.py +24 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
- helm/benchmark/scenarios/raft_scenario.py +33 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
- helm/benchmark/scenarios/scenario.py +44 -1
- helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +109 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
- helm/benchmark/scenarios/summarization_scenario.py +48 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +4 -3
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/unitxt_scenario.py +62 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
- helm/benchmark/scenarios/vicuna_scenario.py +22 -2
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
- helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
- helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
- helm/benchmark/scenarios/wikifact_scenario.py +31 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +101 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +32 -2
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +78 -50
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +269 -0
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_classic.yaml +259 -1140
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +191 -0
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_image2struct.yaml +588 -0
- helm/benchmark/static/schema_instruction_following.yaml +161 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/schema_lite.yaml +3 -286
- helm/benchmark/static/schema_long_context.yaml +282 -0
- helm/benchmark/static/schema_medhelm.yaml +1176 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu.yaml +1449 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +283 -0
- helm/benchmark/static/schema_seahelm.yaml +723 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/schema_thai.yaml +244 -0
- helm/benchmark/static/schema_torr.yaml +474 -0
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_unitxt.yaml +370 -0
- helm/benchmark/static/schema_vhelm.yaml +933 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
- helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
- helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
- helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
- helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +19 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/default_window_service.py +3 -45
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
- helm/benchmark/window_services/ice_window_service.py +1 -35
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +22 -5
- helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
- helm/benchmark/window_services/test_bloom_window_service.py +5 -4
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
- helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
- helm/benchmark/window_services/test_gptj_window_service.py +11 -5
- helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
- helm/benchmark/window_services/test_openai_window_service.py +18 -12
- helm/benchmark/window_services/test_opt_window_service.py +6 -5
- helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
- helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
- helm/benchmark/window_services/test_t511b_window_service.py +5 -4
- helm/benchmark/window_services/test_ul2_window_service.py +5 -4
- helm/benchmark/window_services/test_utils.py +6 -6
- helm/benchmark/window_services/test_yalm_window_service.py +5 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -13
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +1 -28
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +78 -12
- helm/clients/aleph_alpha_client.py +114 -0
- helm/{proxy/clients → clients}/anthropic_client.py +304 -21
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +122 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +199 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
- helm/clients/audio_language/qwen_audiolm_client.py +153 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/audio_language/test.py +62 -0
- helm/{proxy/clients → clients}/auto_client.py +72 -31
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +381 -0
- helm/clients/bedrock_utils.py +105 -0
- helm/{proxy/clients → clients}/client.py +92 -17
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +105 -14
- helm/clients/dspy_client.py +135 -0
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +8 -6
- helm/clients/google_translate_client.py +35 -0
- helm/clients/grok_client.py +36 -0
- helm/{proxy/clients → clients}/http_model_client.py +8 -8
- helm/{proxy/clients → clients}/huggingface_client.py +157 -86
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +269 -0
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +80 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +192 -0
- helm/clients/image_generation/dalle2_client.py +194 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +191 -0
- helm/clients/image_generation/deep_floyd_client.py +80 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +88 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +116 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +113 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
- helm/{proxy/clients → clients}/megatron_client.py +7 -5
- helm/clients/mistral_client.py +180 -0
- helm/clients/moderation_api_client.py +111 -0
- helm/clients/nvidia_nim_client.py +32 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +604 -0
- helm/clients/openai_responses_client.py +200 -0
- helm/clients/openrouter_client.py +31 -0
- helm/{proxy/clients → clients}/palmyra_client.py +31 -14
- helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
- helm/clients/reka_client.py +190 -0
- helm/clients/simple_client.py +64 -0
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +95 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +98 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/test_simple_client.py +19 -0
- helm/clients/test_together_client.py +184 -0
- helm/clients/together_client.py +599 -0
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +488 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
- helm/clients/vision_language/huggingface_vlm_client.py +114 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/paligemma_client.py +147 -0
- helm/clients/vision_language/palmyra_vision_client.py +101 -0
- helm/clients/vision_language/qwen2_vlm_client.py +189 -0
- helm/clients/vision_language/qwen_vlm_client.py +174 -0
- helm/clients/vllm_client.py +80 -0
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +105 -0
- helm/clients/yi_client.py +28 -0
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +23 -33
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +10 -2
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +10 -3
- helm/common/hierarchical_logger.py +124 -12
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +60 -5
- helm/common/key_value_store.py +41 -10
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +14 -1
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +8 -7
- helm/common/multimodal_request_utils.py +57 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +45 -19
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_general.py +10 -0
- helm/common/test_logging.py +94 -0
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +1 -10
- helm/config/model_deployments.yaml +4713 -1005
- helm/config/model_metadata.yaml +4045 -255
- helm/config/tokenizer_configs.yaml +1091 -50
- helm/proxy/accounts.py +31 -4
- helm/proxy/cli.py +6 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/critique/model_critique_client.py +40 -10
- helm/proxy/example_queries.py +33 -28
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +82 -18
- helm/proxy/services/remote_service.py +32 -7
- helm/proxy/services/server_service.py +71 -69
- helm/proxy/services/service.py +30 -6
- helm/proxy/services/test_remote_service.py +6 -5
- helm/proxy/services/test_service.py +1 -13
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +61 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +462 -0
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/tokenizers/ai21_tokenizer.py +52 -0
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
- helm/tokenizers/cohere_tokenizer.py +50 -0
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
- helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/METADATA +0 -264
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/scenarios/numeracy_scenario.py +0 -784
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/test_model_deployment_definition.py +0 -92
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/ai21_window_service.py +0 -258
- helm/benchmark/window_services/cohere_window_service.py +0 -163
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -74
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -326
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/test_together_client.py +0 -97
- helm/proxy/clients/together_client.py +0 -334
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
- helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
- helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
- helm/proxy/tokenizers/ice_tokenizer.py +0 -30
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
- /helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{benchmark → proxy}/static/general.js +0 -0
- /helm/{benchmark → proxy}/static/info-icon.png +0 -0
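
Note: the listing above moves the tokenizer and client implementations out of helm.proxy into top-level helm.tokenizers and helm.clients packages. Below is a minimal import-shim sketch for downstream code that must run against both versions; the module paths come from the rename entries above, while the class name HuggingFaceTokenizer is an assumption inferred from the module name, not a documented API.

    # Hypothetical compatibility shim (sketch only); the imported class name is assumed.
    try:
        # crfm-helm 0.5.x: tokenizers live in the top-level helm.tokenizers package
        from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
    except ImportError:
        # crfm-helm 0.4.x: tokenizers lived under helm.proxy.tokenizers
        from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer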
@@ -1,24 +1,19 @@
+from collections import defaultdict
 import math
-from dataclasses import dataclass
-from typing import List,
+from dataclasses import dataclass
+from typing import List, Dict, Set
 from urllib.parse import unquote
-from functools import partial

-import json
-import string
-import nltk
 import numpy as np
-import
-import
-
-import
-from
-from
-from nltk.translate.bleu_score import sentence_bleu
-from rouge_score import rouge_scorer
+import scipy  # type: ignore
+import calibration as cal  # type: ignore
+from helm.benchmark.adaptation.scenario_state import ScenarioState
+from helm.benchmark.metrics.evaluate_reference_metrics import compute_reference_metrics, get_reference_metrics_metadata
+from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric
+from helm.benchmark.metrics.reference_metric import ReferenceMetric

 from helm.common.hierarchical_logger import hlog
-from helm.common.request import Token,
+from helm.common.request import Token, GeneratedOutput
 from helm.benchmark.adaptation.adapters.adapter_factory import (
     ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
     ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
@@ -29,196 +24,18 @@ from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.window_services.window_service import WindowService
 from helm.benchmark.window_services.window_service_factory import WindowServiceFactory
 from helm.benchmark.window_services.tokenizer_service import TokenizerService
-from helm.benchmark.scenarios.scenario import CORRECT_TAG, Instance
-from helm.benchmark.
-
-
-
-
-
-
-
-
-
-
-    nltk.data.find("tokenizers/punkt")
-except LookupError:
-    nltk.download("punkt")  # Required for rouge
-
-
-EFFICIENCY_DATA_PACKAGE: str = "helm.benchmark.efficiency_data"
-
-INFERENCE_IDEALIZED_RUNTIMES_JSON_FILENAME: str = "inference_idealized_runtimes.json"
-INFERENCE_DENOISED_RUNTIMES_JSON_FILENAME: str = "inference_denoised_runtimes.json"
-TRAINING_EFFICIENCY_JSON_FILENAME: str = "training_efficiency.json"
-
-
-def compute_estimated_time_from_prompt_size_and_num_output_tokens(
-    request_state: RequestState,
-    inference_runtimes_dict: Dict[str, Dict],
-    num_prompt_tokens: int,
-    num_output_tokens: int,
-) -> Optional[float]:
-    estimated_runtime: Optional[float]
-    if request_state.request.model_deployment in inference_runtimes_dict:
-        inference_runtimes_dict_for_model = inference_runtimes_dict[request_state.request.model_deployment]
-        runtime_per_output_token: float = inference_runtimes_dict_for_model["runtime_per_output_token"]
-        raw_runtimes_for_prompt_tokens: Dict[str, float] = inference_runtimes_dict_for_model[
-            "runtime_for_prompt_tokens"
-        ]
-        runtimes_for_prompt_tokens: Dict[int, float] = {int(k): v for (k, v) in raw_runtimes_for_prompt_tokens.items()}
-
-        runtime_for_prompt_tokens: Optional[float] = None
-        largest_num_tokens_in_efficiency_dict: int = max(runtimes_for_prompt_tokens.keys())
-        # Find the smallest num_prompt_tokens larger than the number of tokens in the given prompt,
-        # then scale runtime in dict by (num_prompt_tokens / key) to get more accurate estimate: we
-        # assume that we can encode the prompt at the same throughput as the smallest key larger than
-        # num_prompt_tokens, and number of compute operations scales linearly with num_prompt_tokens.
-        for key in sorted(runtimes_for_prompt_tokens.keys()):
-            if num_prompt_tokens <= key:
-                runtime_for_prompt_tokens = runtimes_for_prompt_tokens[key] * (num_prompt_tokens / key)
-                break
-        # If number of tokens in the prompt exceeds the largest key in the efficiency dict, then
-        # estimate the prompt encoding time by linearly scaling up the runtime for the largest
-        # key (this is reasonably accurate under certain simplifying assumptions).
-        if runtime_for_prompt_tokens is None:
-            runtime_for_prompt_tokens = runtimes_for_prompt_tokens[largest_num_tokens_in_efficiency_dict] * (
-                num_prompt_tokens / largest_num_tokens_in_efficiency_dict
-            )
-        overhead: Optional[float] = inference_runtimes_dict_for_model.get("overhead")
-
-        # Idealized runtime is sum of the runtime of encoding the input tokens, the runtime of
-        # generating `num_output_tokens` (`runtime_per_output_token` * (`num_output_tokens` - 1))
-        # if number of output tokens is greater than 0, otherwise just `runtime_for_prompt_tokens`,
-        # and the overhead if available.
-        estimated_runtime = runtime_for_prompt_tokens
-        if num_output_tokens > 0:
-            estimated_runtime += runtime_per_output_token * (num_output_tokens - 1)
-        # Add overhead if it is available.
-        if overhead is not None:
-            estimated_runtime += overhead
-    else:
-        estimated_runtime = None
-
-    return estimated_runtime
-
-
-def pass_at_k_estimator(n: int, c: int, k: int) -> float:
-    """Calculates 1 - comb(n - c, k) / comb(n, k).
-
-    Numerically stable version defined in
-    https://arxiv.org/pdf/2107.03374.pdf
-    """
-    if n - c < k:
-        return 1.0
-    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
-
-
-def normalize_text(text: str) -> str:
-    """Lower text and remove punctuation, articles and extra whitespace.
-    Copied from the [QuAC](http://quac.ai/) evaluation script found at
-    https://s3.amazonaws.com/my89public/quac/scorer.py"""
-
-    def remove_articles(text: str) -> str:
-        return re.sub(r"\b(a|an|the)\b", " ", text)
-
-    def white_space_fix(text: str) -> str:
-        return " ".join(text.split())
-
-    def remove_punc(text: str) -> str:
-        exclude = set(string.punctuation)
-        return "".join(ch for ch in text if ch not in exclude)
-
-    def lower(text: str) -> str:
-        return text.lower()
-
-    return white_space_fix(remove_articles(remove_punc(lower(text))))
-
-
-def exact_match(gold: str, pred: str) -> float:
-    if not pred:
-        return 0
-
-    return 1 if gold.strip() == pred.strip() else 0
-
-
-def quasi_exact_match(gold: str, pred: str) -> float:
-    if not pred:
-        return 0
-
-    return 1 if normalize_text(gold) == normalize_text(pred) else 0
-
-
-def prefix_exact_match(gold: str, pred: str) -> float:
-    """
-    The `prefix_exact_match` metric is particularly useful in the zero-shot setting, where the model is
-    not given examples of the expected outputs and tends to output more tokens than it should.
-
-    For example, for this zero-shot prompt from BoolQ,
-
-    Passage: Elmendorf Air Force Base (IATA: EDF, ICAO: PAED, FAA LID: EDF) is a United States military facility
-    in Anchorage, the largest city in Alaska. Originally known as Elmendorf Field, it became Elmendorf Air Force
-    Base after World War II, and in 2010 it merged with nearby Fort Richardson to form Joint Base Elmendorf-Richardson.
-    Question: Is there an air force base in anchorage alaska?
-    Answer:
-
-    the model could output up to `max_tokens` number of tokens "Yes, Elmendorf" instead of just "Yes".
-    """
-    if not pred:
-        return 0
-
-    return 1 if pred.strip().startswith(gold.strip()) else 0
-
-
-def quasi_prefix_exact_match(gold: str, pred: str) -> float:
-    """
-    Same thing as `prefix_exact_match` but we normalize the text before checking if the prefix match.
-    """
-    if not pred:
-        return 0
-
-    return 1 if normalize_text(pred).startswith(normalize_text(gold)) else 0
-
-
-def f1_score(gold: str, pred: str) -> float:
-    ret = f_measure(set(normalize_text(gold).split()), set(normalize_text(pred).split()))
-    if ret is None:  # answer is the empty string after normalizing
-        return 0.0
-
-    return ret
-
-
-def exact_match_indicator(gold: str, pred: str, indicator: str = " ") -> float:
-    """
-    Exact match, allowing for some preceding context.
-    For example, the following two answers are considered matching:
-    - Because of x and y, the answer is ## <answer>
-    - Given reasons y and z, the answer is ## <answer>
-    While the following is considered different from the earlier two
-    - Given reasons x and a, the answer is ## <other answer>
-    """
-    pred = pred.split(indicator)[-1].strip()
-    gold = gold.split(indicator)[-1].strip()
-    return exact_match(gold, pred)
-
-
-def final_number_exact_match(gold: str, pred: str) -> float:
-    """
-    Returns 1 iff the final number in gold and pred match.
-    Similar to exact_match_indicator.
-    Example:
-    - gold = "The answer is 15."
-    - pred = "The answer is 15 eggs."
-    - Returns 1
-    """
-
-    def get_final_number(x: str) -> str:
-        matches = re.findall(r"-?[\d,]+(?:.\d+)?", x)
-        if not matches:
-            return ""
-        return matches[-1].replace(",", "")
-
-    return exact_match(get_final_number(gold), get_final_number(pred))
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Instance
+from helm.benchmark.metrics.metric import (
+    Metric,
+    MetricInterface,
+    MetricMetadata,
+    MetricResult,
+    add_context,
+    get_unique_stat_by_name,
+)
+from helm.benchmark.metrics.metric_name import MetricContext, MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat, merge_stat


 def get_num_bytes(tokens: List[Token]) -> int:
@@ -270,123 +87,6 @@ def convert_tokens_to_text(tokens: List[Token]) -> List[Dict]:
     return groups


-def rouge_score(gold: str, pred: str, rouge_type: str, scorer: rouge_scorer.RougeScorer) -> float:
-    scores = scorer.score(gold, pred)
-    return scores[rouge_type].fmeasure
-
-
-def get_rouge_function(rouge_type: str) -> Callable[[str, str], float]:
-    scorer = rouge_scorer.RougeScorer([rouge_type], use_stemmer=True)
-    return partial(rouge_score, scorer=scorer, rouge_type=rouge_type)
-
-
-def bleu_1(gold: str, pred: str) -> float:
-    return sentence_bleu([word_tokenize(gold)], word_tokenize(pred), weights=(1, 0, 0, 0))
-
-
-def chinese_bleu_1(gold: str, pred: str) -> float:
-    char_tokenizer = ChineseTokenizer()
-    return sentence_bleu([char_tokenizer.tokenize(gold)], char_tokenizer.tokenize(pred), weights=(1, 0, 0, 0))
-
-
-def get_chinese_rouge_function(rouge_type: str) -> Callable[[str, str], float]:
-    char_tokenizer = ChineseTokenizer()
-    scorer = rouge_scorer.RougeScorer([rouge_type], use_stemmer=True, tokenizer=char_tokenizer)
-    return partial(rouge_score, scorer=scorer, rouge_type=rouge_type)
-
-
-def cleva_math_result_match(gold: str, pred: str) -> float:
-    """
-    Exact match that only cares the last math expression.
-    Common math expressions are numbers and fractions.
-    """
-    pattern = r"[-+*/%\.\(\)\d]+"
-    matches = re.findall(pattern, pred)
-    if matches:
-        pred = matches[-1].lstrip(")")
-    # remove space in front or at the end
-    pred = pred.strip()
-    return exact_match(gold, pred)
-
-
-def bleu_4(gold: str, pred: str) -> float:
-    return sentence_bleu([word_tokenize(gold)], word_tokenize(pred), weights=(0, 0, 0, 1))
-
-
-def extract_set_from_text(
-    set_str: str,
-    set_start_str: str = " is ",
-    set_separator: str = " and ",
-    empty_set_str: str = "Nothing.",
-) -> Set[str]:
-    """
-    Given a string, extract the set of strings implied by that string.
-    set_start_str denotes the start of the set
-    set_separator denotes the string separating set elements
-    empty_set_str is the string which denotes the empty set
-    """
-    if set_str == empty_set_str:
-        return set()
-    set_str = set_str.replace(".", "")
-    extracted_set = set(set_str.split(set_start_str)[-1].split(set_separator))
-    return extracted_set
-
-
-def extract_gold_pred_sets(gold: str, pred: str) -> Tuple[Set[str], Set[str]]:
-    """Extract the set of strings implied by the gold and pred strings"""
-    gold_set = extract_set_from_text(gold)
-    pred_set = extract_set_from_text(pred.split("\n")[0])
-    return gold_set, pred_set
-
-
-def iou_set_match(gold: str, pred: str) -> float:
-    """Compute the intersection over union of the gold and pred sets"""
-    gold_set, pred_set = extract_gold_pred_sets(gold, pred)
-    if len(gold_set) == 0:  # If gold is empty, just check if the pred set is also empty
-        return float(gold_set == pred_set)
-    return len(gold_set.intersection(pred_set)) / len(gold_set.union(pred_set))
-
-
-def f1_set_match(gold: str, pred: str) -> float:
-    """Compute the F1 score of the gold and pred sets"""
-    gold_set, pred_set = extract_gold_pred_sets(gold, pred)
-    if len(gold_set) == 0:  # If gold is empty, just check if the pred set is also empty
-        return float(gold_set == pred_set)
-    true_positives = gold_set.intersection(pred_set)
-    return 2 * len(true_positives) / (len(gold_set) + len(pred_set))
-
-
-def exact_set_match(gold: str, pred: str) -> float:
-    """Compute whether the sets generated exactly match"""
-    gold_set, pred_set = extract_gold_pred_sets(gold, pred)
-    return float(gold_set == pred_set)
-
-
-def absolute_value_difference(gold: str, pred: str) -> float:
-    """Compute the absolute value of the difference between two numbers (provided as strings),
-    or 0.0 if invalid input.
-    """
-
-    def maybe_int(text: str):
-        """Parse int, ignoring commas in numbers."""
-        try:
-            val = int(text.replace(",", ""))
-        except ValueError:
-            return 0.0
-        return val
-
-    gold_val = maybe_int(gold)
-    pred_val = maybe_int(pred)
-    return abs(gold_val - pred_val)
-
-
-def code_eval(gold: Tuple[str, Optional[Dict]], pred: str) -> float:
-    """Evaluate Code Correctness on test examples."""
-    assert gold[1] is not None  # gold[1]["canonical_solution"]
-    # Warning: will execute machine generated code; need to sandbox before executing
-    return float(code_metrics_helper.check_correctness(gold[1], pred, 3.0)["passed"])  # type: ignore
-
-
 def compute_perplexity_metrics(stats: Dict[MetricName, Stat]) -> List[Stat]:
     # TODO: find out the root cause and undo num_X > 0 check
     # https://github.com/stanford-crfm/benchmarking/issues/350
@@ -411,7 +111,76 @@ def compute_perplexity_metrics(stats: Dict[MetricName, Stat]) -> List[Stat]:
     return derived_stats


-
+def _get_perplexity_metrics_metadata() -> List[MetricMetadata]:
+    return [
+        MetricMetadata(
+            name="perplexity",
+            display_name="Perplexity",
+            short_display_name="PPL",
+            description="Perplexity of the output completion (effective branching factor per output token).",
+            lower_is_better=True,
+            group=None,
+        ),
+        MetricMetadata(
+            name="logprob_per_byte",
+            display_name="Log probability / byte",
+            short_display_name="Logprob/byte",
+            description="Predicted output's average log probability normalized by the number of bytes.",
+            lower_is_better=False,
+            group=None,
+        ),
+        MetricMetadata(
+            name="bits_per_byte",
+            display_name="Bits/byte",
+            short_display_name="BPB",
+            description="Average number of bits per byte according to model probabilities.",
+            lower_is_better=True,
+            group=None,
+        ),
+    ]
+
+
+class InstancesPerSplitMetric(MetricInterface):
+    """Report the average num_instances in each MetricContext across train_trials."""
+
+    def evaluate(
+        self, scenario_state: ScenarioState, metric_service: MetricService, eval_cache_path: str, parallelism: int
+    ) -> MetricResult:
+        adapter_spec = scenario_state.adapter_spec
+        global_stats: Dict[MetricName, Stat] = {}
+
+        for train_trial_index in range(adapter_spec.num_train_trials):
+            trial_stats: Dict[MetricName, Stat] = {}  # Statistics just for this trial
+            # Group instances in this train_trial by context.
+            instances_per_metric_context: Dict[MetricContext, Set[Instance]] = defaultdict(set)
+            for request_state in scenario_state.request_states:
+                if request_state.train_trial_index == train_trial_index:
+                    instances_per_metric_context[MetricContext.from_instance(request_state.instance)].add(
+                        request_state.instance
+                    )
+            for context, instance_set in instances_per_metric_context.items():
+                stat = Stat(MetricName("num_instances")).add(len(instance_set))
+                merge_stat(trial_stats, add_context(stat, context))
+
+            # We take the mean value for each trial.
+            for stat in trial_stats.values():
+                merge_stat(global_stats, stat.take_mean())
+
+        # There are no per-instance Stats.
+        return MetricResult(list(global_stats.values()), [])
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return [
+            MetricMetadata(
+                name="num_instances",
+                display_name="# eval",
+                description="Number of evaluation instances.",
+                lower_is_better=None,
+            )
+        ]
+
+
+class BasicGenerationMetric(Metric):
     """
     Defines basic metrics which don't require domain knowledge. This should be
     fairly comprehensive already, and we should try to use this as much as possible.
@@ -422,338 +191,11 @@ class BasicMetric(Metric):

     def __init__(self, names: List[str]):
         self.names: List[str] = names
-
-        # For Efficiency metrics:
-        # The `inference_efficiency.json` file contains a `runtime_per_output_token` value
-        # (the estimated runtime of generating one output token) and a
-        # `runtime_for_prompt_tokens` dict (a mapping from various num_prompt_tokens values to
-        # the estimated runtime of encoding a prompt with that many tokens).
-        # For example:
-        # "openai/davinci": {
-        #   "runtime_per_output_token": 0.080,
-        #   "runtime_for_prompt_tokens": {
-        #     "1": 0.016,
-        #     "16": 0.018,
-        #     "32": 0.020,
-        #     ...
-        #
-        # These runtimes are generated by initializing Megatron with a model of the right size,
-        # obtaining end-to-end generation times for different numbers of prompt and output tokens,
-        # and then fitting a linear regression model to the runtimes: the resulting slope is the
-        # runtime_per_output_token, which is the processing time for generating each output token,
-        # and the y-intercept is the runtime_for_prompt_tokens, with different values for different
-        # num_prompt_tokens values.
-        # Profiling code and logs, and code to fit the regression model is available at
-        # https://github.com/stanford-crfm/benchmarking_efficiency.
-        data_package = resources.files(EFFICIENCY_DATA_PACKAGE)
-        with data_package.joinpath(INFERENCE_IDEALIZED_RUNTIMES_JSON_FILENAME).open("r") as f:
-            self.inference_idealized_runtimes_dict = json.load(f)
-        with data_package.joinpath(INFERENCE_DENOISED_RUNTIMES_JSON_FILENAME).open("r") as f:
-            self.inference_denoised_runtimes_dict = json.load(f)
-
-        # We use estimated emitted CO2 during training (in tons of CO2) as a proxy metric
-        # for training efficiency. We use reported metrics where applicable, otherwise
-        # we estimate them from runtime information, type and number of hardware accelerators
-        # used, region, etc.
-        with data_package.joinpath(TRAINING_EFFICIENCY_JSON_FILENAME).open("r") as f:
-            self.training_efficiency_dict = json.load(f)
+        self.efficiency_metric = EfficiencyMetric()

     def __repr__(self):
         return f"BasicMetric({','.join(self.names)})"

-    def compute_reference_metrics(
-        self, adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService
-    ) -> List[Stat]:
-        """
-        Setup:
-
-        - Gold (correct references): G1 ... Gm
-        - Predictions (completions): P1 ... Pk
-
-        For each pair (G, P), we can define a ${score} (e.g., exact match, F1, BLEU).
-
-        We define the following stats:
-
-        - ${score}: max_i score(Gi, P1)
-        - ${score}@k: max_{i,j} score(Gi, Pj)
-        """
-
-        def compute_metrics_helper(
-            name: MetricName,
-            score_func: Callable,
-            group: Optional[str] = None,
-        ) -> List[Stat]:
-            if name.name == "pass":  # Calculate pass@k for HumanEval from CodeScenario.
-                score_func = cast(Callable[[Tuple[str, Optional[Dict]], str], float], score_func)  # Make mypy happy.
-                code_golds = cast(List[CodeReference], golds)
-                results = [
-                    score_func((gold.output.text, gold.test_cases), pred) for gold in code_golds for pred in preds
-                ]
-                _len, _sum = len(results), int(sum(results))  # Cast to int to make type match.
-                score_1 = pass_at_k_estimator(_len, _sum, 1)
-                score_k = pass_at_k_estimator(_len, _sum, adapter_spec.num_outputs)
-            elif name.name == "code_eval_acc":
-                score_func = cast(Callable[[Tuple[str, Optional[Dict]], str], float], score_func)  # Make mypy happy.
-                code_golds = cast(List[CodeReference], golds)
-                score_1 = max(score_func((gold.output.text, gold.test_cases), preds[0]) for gold in code_golds)
-                score_k = max(
-                    score_func((gold.output.text, gold.test_cases), pred) for gold in code_golds for pred in preds
-                )
-            else:
-                score_func = cast(Callable[[str, str], float], score_func)  # Make mypy happy.
-                score_1 = max(score_func(gold.output.text, preds[0]) for gold in golds)
-                score_k = max(score_func(gold.output.text, pred) for gold in golds for pred in preds)
-
-            metrics = [Stat(name).add(score_1)]  # score_1 corresponds using one prediction
-            if adapter_spec.num_outputs != 1:
-                metrics.append(Stat(replace(name, name=f"{name.name}@{adapter_spec.num_outputs}")).add(score_k))
-            return metrics
-
-        # maps each string metric name to its associated function
-        metric_fn_mapping: Dict[str, Callable] = {
-            "exact_match": exact_match,
-            "quasi_exact_match": quasi_exact_match,
-            "prefix_exact_match": prefix_exact_match,
-            "quasi_prefix_exact_match": quasi_prefix_exact_match,
-            "exact_match_indicator": exact_match_indicator,
-            "final_number_exact_match": final_number_exact_match,
-            "exact_set_match": exact_set_match,
-            "iou_set_match": iou_set_match,
-            "f1_set_match": f1_set_match,
-            "math_equiv": is_equiv,
-            "math_equiv_chain_of_thought": is_equiv_chain_of_thought,
-            "code_eval_acc": code_eval,
-            "pass": code_eval,
-            "f1_score": f1_score,
-            "rouge_1": get_rouge_function("rouge1"),
-            "rouge_2": get_rouge_function("rouge2"),
-            "rouge_l": get_rouge_function("rougeL"),
-            "bleu_1": bleu_1,
-            "bleu_4": bleu_4,
-            "chinese_bleu_1": chinese_bleu_1,
-            "chinese_rouge_1": get_chinese_rouge_function("rouge1"),
-            "chinese_rouge_2": get_chinese_rouge_function("rouge2"),
-            "cleva_math_result_match": cleva_math_result_match,
-            "absolute_value_difference": absolute_value_difference,
-        }
-
-        stats: List[Stat] = []
-
-        # Gold outputs
-        golds: List[Reference] = [reference for reference in request_state.instance.references if reference.is_correct]
-        assert len(golds) > 0
-
-        # Predicted outputs
-        assert request_state.result is not None
-        sorted_completions: List[Sequence] = sorted(request_state.result.completions, key=lambda x: -x.logprob)
-        preds: List[str] = [completion.text.strip() for completion in sorted_completions]
-
-        # Apply mapping if exists (e.g., for multiple-choice questions A -> Boston, B -> New York)
-        # Note: If 'A' and 'B' were the only possible choices, smaller language models like GPT-2 would
-        # sometimes predict a random letter like 'M'.
-        if request_state.output_mapping is not None:
-            preds = [request_state.output_mapping.get(pred) for pred in preds]  # type: ignore
-
-        # Compute max_prob, the probability that the model assigns to its generated text.
-        # Use the log prob of sorted_completions[0], which is the completion with the highest
-        # log_prob. We use this since that's what's used for computing metrics like exact_match.
-        # One subtlety is that when computing exact_match, we strip whitespace, so the actual
-        # max_prob is the sum of all the probabilities in the set {x : strip(x) = prediction}.
-        # In practice, we think this may not make much of a difference because models may not place
-        # high probabilities on having additional spaces (should check this). Also, the sum
-        # involves computing the log_prob for many completions which could be intractable.
-        max_prob = np.exp(sorted_completions[0].logprob)
-        stats.append(Stat(MetricName("max_prob")).add(max_prob))
-
-        # Add other metrics
-        for metric_name in self.names:
-            if metric_name in metric_fn_mapping:
-                stats.extend(compute_metrics_helper(MetricName(metric_name), metric_fn_mapping[metric_name]))
-            else:
-                raise NameError(f"{metric_name} is not in the list of metric functions.")
-
-        return stats
-
-    def compute_efficiency_metrics(
-        self, adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService
-    ) -> List[Stat]:
-        """Compute efficiency metrics for both inference and training.
-        For inference, we record both the actual runtime and an estimated idealized runtime
-        for the given request with an optimized software implementation run on A100 GPU(s),
-        taking into account both the number of tokens in the prompt of the request, and the
-        number of generated output tokens.
-        For training, we report the estimated total metric tons of CO2 emitted to train the
-        model. This is the same for each request."""
-        # Compute efficiency metrics for inference.
-        assert request_state.result is not None
-
-        runtime: Optional[float] = None
-        batch_size: Optional[int] = None
-        # Compute efficiency metrics for inference.
-        if request_state.result.request_time is not None:
-            runtime = request_state.result.request_time
-            batch_size = 1
-        # For models that perform offline batch inference, effective runtime is batch_request_time, but also
-        # record batch_size to provide nuance.
-        if request_state.result.batch_request_time is not None and request_state.result.batch_size is not None:
-            runtime = request_state.result.batch_request_time
-            batch_size = request_state.result.batch_size
-
-        # Compute total number of prompt and output tokens.
-        # Fetch the right `Tokenizer` depending on the model defined in `AdapterSpec`
-        # and calculate the number of tokens in the prompt.
-        tokenizer_service: TokenizerService = metric_service
-        window_service: WindowService = WindowServiceFactory.get_window_service(
-            adapter_spec.model_deployment, tokenizer_service
-        )
-        prompt: str = request_state.request.prompt
-        num_prompt_tokens: int = window_service.get_num_tokens(prompt)
-
-        # Total number of tokens in the completion.
-        num_completion_tokens: int = sum([len(completion.tokens) for completion in request_state.result.completions])
-        # Don't include prompt in number of generated tokens (e.g., for language modeling).
-        # Assume that tokens for different completions are generated sequentially (instead of batched) when
-        # computing num_output_tokens (for the purpose of runtime estimation).
-        num_output_tokens: int = num_completion_tokens
-        if request_state.request.echo_prompt:
-            # num_prompt_tokens > num_output_tokens can happen if tokenizer doesn't round trip.
-            if num_prompt_tokens <= num_output_tokens:
-                num_output_tokens -= num_prompt_tokens
-            else:
-                hlog(
-                    f"WARNING: num_prompt_tokens ({num_prompt_tokens}) > num_output_tokens ({num_output_tokens}) "
-                    f"for prompt: {prompt}"
-                )
-                num_output_tokens = 0
-
-        idealized_runtime: Optional[float] = compute_estimated_time_from_prompt_size_and_num_output_tokens(
-            request_state, self.inference_idealized_runtimes_dict, num_prompt_tokens, num_output_tokens
-        )
-
-        denoised_runtime: Optional[float] = compute_estimated_time_from_prompt_size_and_num_output_tokens(
-            request_state, self.inference_denoised_runtimes_dict, num_prompt_tokens, num_output_tokens
-        )
-        # Denoised runtime for offline models is just runtime.
-        # We divide by batch_size to get approximate per-input runtime.
-        if runtime is not None and request_state.result.batch_size is not None:
-            denoised_runtime = runtime / request_state.result.batch_size
-
-        # Compute efficiency metrics for training.
-        training_co2_cost: Optional[float]
-        if request_state.request.model_deployment in self.training_efficiency_dict["carbon"]:
-            training_co2_cost = self.training_efficiency_dict["carbon"][request_state.request.model_deployment]["value"]
-        else:
-            training_co2_cost = None
-
-        training_energy_cost: Optional[float]
-        if request_state.request.model_deployment in self.training_efficiency_dict["energy"]:
-            training_energy_cost = self.training_efficiency_dict["energy"][request_state.request.model_deployment][
-                "value"
-            ]
-        else:
-            training_energy_cost = None
-
-        stats = [
-            Stat(MetricName("num_prompt_tokens")).add(num_prompt_tokens),
-            Stat(MetricName("num_completion_tokens")).add(num_completion_tokens),
-            Stat(MetricName("num_output_tokens")).add(num_output_tokens),
-            Stat(MetricName("training_co2_cost")).add(training_co2_cost),
-            Stat(MetricName("training_energy_cost")).add(training_energy_cost),
-        ]
-        if runtime is not None:
-            stats.append(Stat(MetricName("inference_runtime")).add(runtime))
-        if batch_size is not None:
-            stats.append(Stat(MetricName("batch_size")).add(batch_size))
-        if denoised_runtime is not None:
-            stats.append(Stat(MetricName("inference_denoised_runtime")).add(denoised_runtime))
-        if idealized_runtime is not None:
-            stats.append(Stat(MetricName("inference_idealized_runtime")).add(idealized_runtime))
-        return stats
-
-    def compute_finish_reason_metrics(
-        self, adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService
-    ) -> List[Stat]:
-        """Record how often generation finished due to reaching token limit, stop token(s), or end of text"""
-        assert request_state.result is not None
-        sequence = request_state.result.completions[0]
-        valid_reasons = [
-            "length",
-            "stop",
-            "endoftext",
-            "unknown",
-        ]
-        if sequence.finish_reason is None or sequence.finish_reason["reason"] not in valid_reasons:
-            reason = "unknown"
-        else:
-            reason = sequence.finish_reason["reason"]
-        return [
-            Stat(MetricName(f"finish_reason_{valid_reason}")).add(int(reason == valid_reason))
-            for valid_reason in valid_reasons
-        ]
-
-    def compute_truncation_metrics(
-        self, adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService
-    ) -> List[Stat]:
-        """
-        Record the number of training instances used in the prompt and whether
-        even the prompt needed to be truncated (once we hit zero training instances).
-        """
-        return [
-            Stat(MetricName("num_train_instances")).add(request_state.num_train_instances),
-            Stat(MetricName("prompt_truncated")).add(request_state.prompt_truncated),
-        ]
-
-    def compute_all_general_metrics(
-        self, adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService
-    ) -> List[Stat]:
-        """
-        Compute metrics that are common to both `evaluate_generation` and `evaluate_references`.
-        """
-        stats: List[Stat] = []
-
-        stats.append(Stat(MetricName("num_references")).add(len(request_state.instance.references)))
-
-        # Copy from adapter spec
-        stats.append(Stat(MetricName("num_train_trials")).add(adapter_spec.num_train_trials))
-
-        stats.extend(self.compute_efficiency_metrics(adapter_spec, request_state, metric_service))
-        stats.extend(self.compute_finish_reason_metrics(adapter_spec, request_state, metric_service))
-        stats.extend(self.compute_truncation_metrics(adapter_spec, request_state, metric_service))
-
-        return stats
-
-    def compute_language_modeling_metrics(
-        self, adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService
-    ) -> List[Stat]:
-        """Compute the logprob and normalization factors for the first completion"""
-        assert request_state.result is not None
-        sequence = request_state.result.completions[0]
-
-        # Remove the empty tokens (typically generated by the AI21 tokenizer in the beginning of the text)
-        #
-        # Some more details about AI21 tokenizer: If the input prompt begins with a space, then
-        # the tokenizer inserts an empty token to the beginning.
-        # e.g. " burying him" -> ["▁"(0,0), "▁burying"(0,8), "▁him"(8,12)].
-        # TODO(#1522): Update this comment once solved.
-        # Since this empty token is introduced by our chunking approach, we need to remove it.
-        tokens: List[Token]
-        if request_state.num_conditioning_tokens > 0 and sequence.tokens[0].text == "":
-            tokens = sequence.tokens[1:]
-        else:
-            tokens = sequence.tokens
-        pred_tokens = tokens[request_state.num_conditioning_tokens :]
-        logprob, num_perplexity_tokens, num_bytes = (
-            sum(token.logprob for token in pred_tokens),
-            len(pred_tokens),
-            get_num_bytes(pred_tokens),
-        )
-
-        return [
-            Stat(MetricName("logprob")).add(logprob),
-            Stat(MetricName("num_perplexity_tokens")).add(num_perplexity_tokens),
-            Stat(MetricName("num_bytes")).add(num_bytes),
-        ]
-
     def evaluate_generation(
         self,
         adapter_spec: AdapterSpec,
@@ -763,15 +205,49 @@ class BasicMetric(Metric):
     ) -> List[Stat]:
         """Compute all metrics."""
        stats: List[Stat] = []
-        stats.extend(self.
+        stats.extend(compute_request_state_metrics(self.efficiency_metric, adapter_spec, request_state, metric_service))

         if len(request_state.instance.references) > 0:
-            stats.extend(self.
+            stats.extend(compute_reference_metrics(self.names, adapter_spec, request_state, metric_service))

-        stats.extend(
+        stats.extend(compute_language_modeling_metrics(adapter_spec, request_state, metric_service))

         return stats

+    def derive_stats(self, stats_dict: Dict[MetricName, Stat]) -> List[Stat]:
+        """Derive perplexity metrics if applicable. We don't worry about splits and perturbations here."""
+        derived_stats: List[Stat] = []
+        derived_stats.extend(compute_perplexity_metrics(stats_dict))
+        return derived_stats
+
+    def derive_per_instance_stats(self, per_instance_stats: Dict[Instance, List[Stat]]) -> List[Stat]:
+        """Derive calibration metrics if applicable. We don't worry about splits and perturbations here."""
+        derived_stats: List[Stat] = []
+        derived_stats.extend(compute_calibration_metrics(per_instance_stats))
+        return derived_stats
+
+    def get_metadata(self) -> List[MetricMetadata]:
+        return (
+            get_request_state_metrics_metadata(self.efficiency_metric)
+            + get_reference_metrics_metadata(self.names)
+            + _get_language_modeling_metrics_metadata()
+            + _get_perplexity_metrics_metadata()
+            + _get_calibration_metrics_metadata()
+        )
+
+
+class BasicReferenceMetric(ReferenceMetric):
+    """
+    Defines basic metrics for Scenarios that use one Request per Reference instead of
+    one per Instance.
+    """
+
+    def __init__(self):
+        self.efficiency_metric = EfficiencyMetric()
+
+    def __repr__(self):
+        return "BasicReferenceMetric"
+
     def evaluate_references(
         self,
         adapter_spec: AdapterSpec,
@@ -801,7 +277,7 @@ class BasicMetric(Metric):
         assert len(request_state.result.completions) == 1

         reference_index = request_state.reference_index
-        sequence:
+        sequence: GeneratedOutput = request_state.result.completions[0]
         reference: str = request_state.instance.references[reference_index].output.text

         # Find the span of the completion that matches the reference.
@@ -848,8 +324,14 @@ class BasicMetric(Metric):
             raise ValueError(f"Unknown adapter method: {adapter_spec.method}")

         stats: List[Stat] = []
-        stats.extend(self.compute_all_general_metrics(adapter_spec, request_state, metric_service))

+        general_metrics: Dict[MetricName, Stat] = {}
+        for request_state in reference_request_states:
+            for stat in compute_request_state_metrics(
+                self.efficiency_metric, adapter_spec, request_state, metric_service
+            ):
+                merge_stat(general_metrics, stat)
+        stats.extend(general_metrics.values())
         max_prob = np.max(scipy.special.softmax(reference_scores))

         # Multiple references may attain the same maximal score; in such cases,
@@ -868,18 +350,229 @@ class BasicMetric(Metric):
|
|
|
868
350
|
)
|
|
869
351
|
return stats
|
|
870
352
|
|
|
871
|
-
def
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
353
|
+
def get_metadata(self) -> List[MetricMetadata]:
|
|
354
|
+
return [
|
|
355
|
+
MetricMetadata(
|
|
356
|
+
name="max_prob",
|
|
357
|
+
display_name="Max prob",
|
|
358
|
+
description="Model's average confidence in its prediction (only computed for classification tasks)",
|
|
359
|
+
lower_is_better=False,
|
|
360
|
+
group="calibration_detailed",
|
|
361
|
+
),
|
|
362
|
+
MetricMetadata(
|
|
363
|
+
name="exact_match",
|
|
364
|
+
display_name="Exact match",
|
|
365
|
+
short_display_name="EM",
|
|
366
|
+
description="Fraction of instances that the predicted output matches a correct reference exactly.",
|
|
367
|
+
lower_is_better=False,
|
|
368
|
+
group="accuracy",
|
|
369
|
+
),
|
|
370
|
+
MetricMetadata(
|
|
371
|
+
name="predicted_index",
|
|
372
|
+
display_name="Predicted index",
|
|
373
|
+
description="Integer index of the reference (0, 1, ...) that was predicted by the model (for "
|
|
374
|
+
"multiple-choice).",
|
|
375
|
+
lower_is_better=None,
|
|
376
|
+
group=None,
|
|
377
|
+
),
|
|
378
|
+
]
|
|
876
379
|
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
380
|
+
|
|
381
|
+
def compute_request_state_metrics(
|
|
382
|
+
efficiency_metric: EfficiencyMetric,
|
|
383
|
+
adapter_spec: AdapterSpec,
|
|
384
|
+
request_state: RequestState,
|
|
385
|
+
metric_service: MetricService,
|
|
386
|
+
) -> List[Stat]:
|
|
387
|
+
"""
|
|
388
|
+
Compute metrics that are common to both `evaluate_generation` and `evaluate_references`.
|
|
389
|
+
"""
|
|
390
|
+
stats: List[Stat] = []
|
|
391
|
+
|
|
392
|
+
stats.append(Stat(MetricName("num_references")).add(len(request_state.instance.references)))
|
|
393
|
+
|
|
394
|
+
# Copy from adapter spec
|
|
395
|
+
stats.append(Stat(MetricName("num_train_trials")).add(adapter_spec.num_train_trials))
|
|
396
|
+
|
|
397
|
+
stats.extend(efficiency_metric.compute_efficiency_metrics(adapter_spec, request_state, metric_service))
|
|
398
|
+
stats.extend(_compute_finish_reason_metrics(adapter_spec, request_state, metric_service))
|
|
399
|
+
stats.extend(_compute_truncation_metrics(adapter_spec, request_state, metric_service))
|
|
400
|
+
|
|
401
|
+
return stats
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
def get_request_state_metrics_metadata(
|
|
405
|
+
efficiency_metric: EfficiencyMetric,
|
|
406
|
+
) -> List[MetricMetadata]:
|
|
407
|
+
metric_metadata = [
|
|
408
|
+
MetricMetadata(
|
|
409
|
+
name="num_references",
|
|
410
|
+
display_name="# ref",
|
|
411
|
+
description="Number of references.",
|
|
412
|
+
lower_is_better=None,
|
|
413
|
+
group=None,
|
|
414
|
+
),
|
|
415
|
+
MetricMetadata(
|
|
416
|
+
name="num_train_trials",
|
|
417
|
+
display_name="# trials",
|
|
418
|
+
description="Number of trials, where in each trial we choose an independent, random set of training "
|
|
419
|
+
"instances.",
|
|
420
|
+
lower_is_better=None,
|
|
421
|
+
group="general_information",
|
|
422
|
+
),
|
|
423
|
+
]
|
|
424
|
+
return (
|
|
425
|
+
metric_metadata
|
|
426
|
+
+ efficiency_metric.get_metadata()
|
|
427
|
+
+ _get_finish_reason_metrics_metadata()
|
|
428
|
+
+ _get_truncation_metrics_metadata()
|
|
429
|
+
)
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
def _compute_finish_reason_metrics(
|
|
433
|
+
adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService
|
|
434
|
+
) -> List[Stat]:
|
|
435
|
+
"""Record how often generation finished due to reaching token limit, stop token(s), or end of text"""
|
|
436
|
+
assert request_state.result is not None
|
|
437
|
+
sequence = request_state.result.completions[0]
|
|
438
|
+
valid_reasons = [
|
|
439
|
+
"length",
|
|
440
|
+
"stop",
|
|
441
|
+
"endoftext",
|
|
442
|
+
"unknown",
|
|
443
|
+
]
|
|
444
|
+
if sequence.finish_reason is None or sequence.finish_reason["reason"] not in valid_reasons:
|
|
445
|
+
reason = "unknown"
|
|
446
|
+
else:
|
|
447
|
+
reason = sequence.finish_reason["reason"]
|
|
448
|
+
return [
|
|
449
|
+
Stat(MetricName(f"finish_reason_{valid_reason}")).add(int(reason == valid_reason))
|
|
450
|
+
for valid_reason in valid_reasons
|
|
451
|
+
]
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
def _get_finish_reason_metrics_metadata():
|
|
455
|
+
return [
|
|
456
|
+
MetricMetadata(
|
|
457
|
+
name="finish_reason_endoftext",
|
|
458
|
+
display_name="finish b/c endoftext",
|
|
459
|
+
description="Fraction of instances where the the output was terminated because the end of text token "
|
|
460
|
+
"was generated.",
|
|
461
|
+
lower_is_better=None,
|
|
462
|
+
group=None,
|
|
463
|
+
),
|
|
464
|
+
MetricMetadata(
|
|
465
|
+
name="finish_reason_length",
|
|
466
|
+
display_name="finish b/c length",
|
|
467
|
+
description="Fraction of instances where the the output was terminated because of the max tokens limit.",
|
|
468
|
+
lower_is_better=None,
|
|
469
|
+
group=None,
|
|
470
|
+
),
|
|
471
|
+
MetricMetadata(
|
|
472
|
+
name="finish_reason_stop",
|
|
473
|
+
display_name="finish b/c stop",
|
|
474
|
+
description="Fraction of instances where the the output was terminated because of the stop sequences.",
|
|
475
|
+
lower_is_better=None,
|
|
476
|
+
group=None,
|
|
477
|
+
),
|
|
478
|
+
MetricMetadata(
|
|
479
|
+
name="finish_reason_unknown",
|
|
480
|
+
display_name="finish b/c unknown",
|
|
481
|
+
description="Fraction of instances where the the output was terminated for unknown reasons.",
|
|
482
|
+
lower_is_better=None,
|
|
483
|
+
group=None,
|
|
484
|
+
),
|
|
485
|
+
]
|
|
486
|
+
|
|
487
|
+
|
|
488
|
+
def _compute_truncation_metrics(
|
|
489
|
+
adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService
|
|
490
|
+
) -> List[Stat]:
|
|
491
|
+
"""
|
|
492
|
+
Record the number of training instances used in the prompt and whether
|
|
493
|
+
even the prompt needed to be truncated (once we hit zero training instances).
|
|
494
|
+
"""
|
|
495
|
+
return [
|
|
496
|
+
Stat(MetricName("num_train_instances")).add(request_state.num_train_instances),
|
|
497
|
+
Stat(MetricName("prompt_truncated")).add(request_state.prompt_truncated),
|
|
498
|
+
]
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
def _get_truncation_metrics_metadata() -> List[MetricMetadata]:
|
|
502
|
+
return [
|
|
503
|
+
MetricMetadata(
|
|
504
|
+
name="num_train_instances",
|
|
505
|
+
display_name="# train",
|
|
506
|
+
description="Number of training instances (e.g., in-context examples).",
|
|
507
|
+
lower_is_better=None,
|
|
508
|
+
),
|
|
509
|
+
MetricMetadata(
|
|
510
|
+
name="prompt_truncated",
|
|
511
|
+
display_name="truncated",
|
|
512
|
+
description="Fraction of instances where the "
|
|
513
|
+
"prompt itself was truncated (implies "
|
|
514
|
+
"that there were no in-context "
|
|
515
|
+
"examples).",
|
|
516
|
+
lower_is_better=None,
|
|
517
|
+
),
|
|
518
|
+
]
+
+
+def compute_language_modeling_metrics(
+    adapter_spec: AdapterSpec, request_state: RequestState, metric_service: MetricService
+) -> List[Stat]:
+    """Compute the logprob and normalization factors for the first completion"""
+    assert request_state.result is not None
+    sequence = request_state.result.completions[0]
+
+    # Remove the empty tokens (typically generated by the AI21 tokenizer in the beginning of the text)
+    #
+    # Some more details about AI21 tokenizer: If the input prompt begins with a space, then
+    # the tokenizer inserts an empty token to the beginning.
+    # e.g. " burying him" -> ["▁"(0,0), "▁burying"(0,8), "▁him"(8,12)].
+    # TODO(#1522): Update this comment once solved.
+    # Since this empty token is introduced by our chunking approach, we need to remove it.
+    tokens: List[Token]
+    if request_state.num_conditioning_tokens > 0 and sequence.tokens[0].text == "":
+        tokens = sequence.tokens[1:]
+    else:
+        tokens = sequence.tokens
+    pred_tokens = tokens[request_state.num_conditioning_tokens :]
+    logprob, num_perplexity_tokens, num_bytes = (
+        sum(token.logprob for token in pred_tokens),
+        len(pred_tokens),
+        get_num_bytes(pred_tokens),
+    )
+
+    return [
+        Stat(MetricName("logprob")).add(logprob),
+        Stat(MetricName("num_perplexity_tokens")).add(num_perplexity_tokens),
+        Stat(MetricName("num_bytes")).add(num_bytes),
+    ]
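The three factors recorded above (summed logprob, token count, byte count) are the usual normalization terms for language-modeling quality. As a standalone sketch of how such factors are typically turned into per-token perplexity and bits per byte, assuming natural-log probabilities as above; `derived_lm_metrics` and the derived metric names are illustrative for this note, not values produced by the code in this diff.

# Illustrative sketch only: standard conversions from a summed log probability
# to per-token perplexity and bits per byte.
import math


def derived_lm_metrics(logprob: float, num_perplexity_tokens: int, num_bytes: int) -> dict:
    return {
        "perplexity": math.exp(-logprob / num_perplexity_tokens),
        "bits_per_byte": -logprob / num_bytes / math.log(2),
    }


# e.g. 10 predicted tokens spanning 40 bytes with a summed logprob of -23.0:
print(derived_lm_metrics(logprob=-23.0, num_perplexity_tokens=10, num_bytes=40))
# -> perplexity ≈ 9.97, bits_per_byte ≈ 0.83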
+
+
+def _get_language_modeling_metrics_metadata() -> List[MetricMetadata]:
+    return [
+        MetricMetadata(
+            name="logprob",
+            display_name="Log probability",
+            short_display_name="Logprob",
+            description="Predicted output's average log probability (input's log prob for language modeling).",
+            lower_is_better=False,
+        ),
+        MetricMetadata(
+            name="num_perplexity_tokens",
+            display_name="# tokens",
+            description="Average number of tokens in the predicted output (for language modeling, the input too).",
+            lower_is_better=None,
+        ),
+        MetricMetadata(
+            name="num_bytes",
+            display_name="# bytes",
+            description="Average number of bytes in the predicted output (for language modeling, the input too).",
+            lower_is_better=None,
+        ),
+    ]
 
 
 def _has_non_zero_valued_logprobs(per_instance_stats: Dict[Instance, List[Stat]]) -> bool:
@@ -943,3 +636,80 @@ def compute_calibration_metrics(per_instance_stats: Dict[Instance, List[Stat]])
     stats.append(Stat(MetricName("platt_ece_1_bin")).add(platt_ece_1_bin))
 
     return stats
+
+
+def _get_calibration_metrics_metadata() -> List[MetricMetadata]:
+    return [
+        MetricMetadata(
+            name="ece_10_bin",
+            display_name="10-bin expected calibration error",
+            short_display_name="ECE (10-bin)",
+            description="The average difference between the model's confidence and accuracy, averaged across 10 "
+            "bins where each bin contains an equal number of points (only computed for classification "
+            "tasks). Warning - not reliable for small datasets (e.g., with < 300 examples) because "
+            "each bin will have very few examples.",
+            lower_is_better=True,
+            group="calibration",
+        ),
+        MetricMetadata(
+            name="ece_1_bin",
+            display_name="1-bin expected calibration error",
+            short_display_name="ECE (1-bin)",
+            description="The (absolute value) difference between the model's average confidence and accuracy "
+            "(only computed for classification tasks).",
+            lower_is_better=True,
+            group="calibration_detailed",
+        ),
+        MetricMetadata(
+            name="selective_acc@10",
+            display_name="Accuracy at 10% coverage",
+            short_display_name="Acc@10%",
+            description="The accuracy for the 10% of predictions that the model is most confident on (only "
+            "computed for classification tasks).",
+            lower_is_better=False,
+            group="calibration_detailed",
+        ),
+        MetricMetadata(
+            name="selective_cov_acc_area",
+            display_name="Selective coverage-accuracy area",
+            short_display_name="Selective Acc",
+            description="The area under the coverage-accuracy curve, a standard selective classification metric "
+            "(only computed for classification tasks).",
+            lower_is_better=False,
+            group="calibration_detailed",
+        ),
+        MetricMetadata(
+            name="platt_coef",
+            display_name="Platt Scaling Coefficient",
+            short_display_name="Platt Coef",
+            description="Coefficient of the Platt scaling classifier (can compare this across tasks).",
+            lower_is_better=False,
+            group="calibration_detailed",
+        ),
+        MetricMetadata(
+            name="platt_intercept",
+            display_name="Platt Scaling Intercept",
+            short_display_name="Platt Intercept",
+            description="Intercept of the Platt scaling classifier (can compare this across tasks).",
+            lower_is_better=False,
+            group="calibration_detailed",
+        ),
+        MetricMetadata(
+            name="platt_ece_10_bin",
+            display_name="10-bin Expected Calibration Error (after Platt scaling)",
+            short_display_name="Platt-scaled ECE (10-bin)",
+            description="10-bin ECE computed after applying Platt scaling to recalibrate the model's predicted "
+            "probabilities.",
+            lower_is_better=True,
+            group="calibration_detailed",
+        ),
+        MetricMetadata(
+            name="platt_ece_1_bin",
+            display_name="1-bin expected calibration error (after Platt scaling)",
+            short_display_name="Platt-scaled ECE (1-bin)",
+            description="1-bin ECE computed after applying Platt scaling to recalibrate the model's predicted "
+            "probabilities.",
+            lower_is_better=True,
+            group="calibration_detailed",
+        ),
+    ]
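The metadata above references 1-bin and 10-bin expected calibration error with equal-count bins. As a concrete reference for those definitions, here is a standalone sketch of the computation; `ece_equal_count_bins` is a hypothetical helper written for this note (it uses numpy and is not the helm implementation), and the sample confidences and labels are made up.

# Illustrative sketch only: expected calibration error over equal-count bins,
# matching the descriptions above (1 bin reduces to |mean confidence - accuracy|).
from typing import List

import numpy as np


def ece_equal_count_bins(confidences: List[float], correct: List[int], num_bins: int) -> float:
    order = np.argsort(confidences)
    conf = np.asarray(confidences)[order]
    corr = np.asarray(correct)[order]
    total = len(conf)
    ece = 0.0
    for idx in np.array_split(np.arange(total), num_bins):
        if len(idx) == 0:
            continue
        # Weight each bin's |confidence - accuracy| gap by its share of the points.
        ece += (len(idx) / total) * abs(conf[idx].mean() - corr[idx].mean())
    return ece


# 1-bin ECE is just the gap between average confidence and accuracy:
confs = [0.9, 0.8, 0.7, 0.6]
labels = [1, 1, 0, 1]
print(ece_equal_count_bins(confs, labels, num_bins=1))  # |0.75 - 0.75| = 0.0
print(ece_equal_count_bins(confs, labels, num_bins=2))  # 0.5*|0.65-0.5| + 0.5*|0.85-1.0| = 0.15

With few examples per bin the 10-bin estimate becomes noisy, which is why the ece_10_bin description above warns against relying on it for small datasets.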