crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crfm_helm-0.5.6.dist-info/METADATA +427 -0
- crfm_helm-0.5.6.dist-info/RECORD +941 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +21 -6
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +214 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +11 -12
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +14 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +14 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +6 -6
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +14 -0
- helm/benchmark/metrics/medalign_metrics.py +14 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +14 -0
- helm/benchmark/metrics/medication_qa_metrics.py +10 -19
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +14 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +20 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/toxicity_metrics.py +6 -6
- helm/benchmark/metrics/unitxt_metrics.py +7 -5
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_deployment_registry.py +6 -8
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +33 -12
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +2 -1
- helm/benchmark/presentation/summarize.py +97 -67
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +86 -90
- helm/benchmark/run_expander.py +90 -9
- helm/benchmark/run_spec_factory.py +13 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +142 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +141 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +103 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +157 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +136 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +94 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
- helm/benchmark/scenarios/medbullets_scenario.py +145 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
- helm/benchmark/scenarios/medec_scenario.py +125 -0
- helm/benchmark/scenarios/medhallu_scenario.py +72 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +123 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +12 -2
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +13 -1
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_long_context.yaml +283 -0
- helm/benchmark/static/schema_medhelm.yaml +1140 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +129 -56
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-94295e78.js +10 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +6 -6
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +4 -5
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +120 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
- helm/clients/audio_language/qwen_audiolm_client.py +152 -0
- helm/clients/audio_language/test.py +62 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +203 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/client.py +7 -7
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/grok_client.py +36 -0
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +52 -21
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
- helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
- helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +308 -43
- helm/clients/openai_responses_client.py +174 -0
- helm/clients/palmyra_client.py +3 -9
- helm/clients/reka_client.py +3 -3
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +76 -9
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +45 -13
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +188 -0
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +4 -6
- helm/clients/writer_client.py +102 -0
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +46 -3
- helm/common/images_utils.py +2 -2
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +14 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1792 -28
- helm/config/model_metadata.yaml +1606 -51
- helm/config/tokenizer_configs.yaml +521 -4
- helm/proxy/cli.py +5 -3
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +22 -86
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/grok_tokenizer.py +53 -0
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +3 -3
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
helm/benchmark/metrics/unitxt_metrics.py

```diff
@@ -5,12 +5,12 @@ from typing import Dict, List, Set
 from datasets import load_dataset
 import evaluate
 
-from helm.common.general import hlog
 from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats
 from helm.benchmark.adaptation.scenario_state import ScenarioState
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
+from helm.common.hierarchical_logger import hwarn
 
 
 class UnitxtMetric(MetricInterface):
@@ -18,7 +18,10 @@ class UnitxtMetric(MetricInterface):
 
     def __init__(self, **kwargs):
         super().__init__()
-        dataset_name = ",".join(f"{key}={value}" for key, value in kwargs.items())
+        if len(kwargs) == 1 and "recipe" in kwargs:
+            dataset_name = kwargs["recipe"]
+        else:
+            dataset_name = ",".join(f"{key}={value}" for key, value in kwargs.items())
         self.dataset = load_dataset("unitxt/data", dataset_name, trust_remote_code=True)
 
     def evaluate(
@@ -83,9 +86,8 @@ class UnitxtMetric(MetricInterface):
             )
         )
         if non_number_instance_metric_names:
-            hlog(
-                "WARNING: Ignored Unitxt instance metrics because "
-                f"they were not numbers: {non_number_instance_metric_names}"
+            hwarn(
+                "Ignored Unitxt instance metrics because " f"they were not numbers: {non_number_instance_metric_names}"
            )
 
         # Extract global metrics
```
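Note: the `__init__` change above passes a lone `recipe` keyword through verbatim and only falls back to joining `key=value` pairs when other keywords are given. A minimal sketch of that name-resolution logic, pulled out as a hypothetical standalone helper (the real code runs inside `UnitxtMetric.__init__` and feeds the result to `load_dataset`):

```python
def resolve_unitxt_dataset_name(**kwargs) -> str:
    # A full recipe string is forwarded unchanged.
    if len(kwargs) == 1 and "recipe" in kwargs:
        return kwargs["recipe"]
    # Otherwise, keyword arguments are flattened into Unitxt's comma-separated recipe syntax.
    return ",".join(f"{key}={value}" for key, value in kwargs.items())


# Both call styles resolve to the same dataset name:
assert resolve_unitxt_dataset_name(recipe="card=cards.mmlu,template_card_index=0") == (
    resolve_unitxt_dataset_name(card="cards.mmlu", template_card_index=0)
)
```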
helm/benchmark/metrics/vision_language/emd_utils.py

```diff
@@ -320,8 +320,10 @@ def compute_emd_recursive(
     mask1 = np.any(sig1[:, 1:-2] != gray_most_frequent_color, axis=1)
     mask2 = np.any(sig2[:, 1:-2] != gray_most_frequent_color, axis=1)
     mask = np.logical_or(mask1, mask2)
-    sig1 = sig1[mask]
-    sig2 = sig2[mask]
+
+    if np.any(mask):
+        sig1 = sig1[mask]
+        sig2 = sig2[mask]
 
     # Normalize the weights
     weight1 = sig1[:, 0]
```
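Note: the added `np.any(mask)` guard appears to handle the edge case where every row matches the most frequent color, since filtering with an all-False mask would leave empty signatures and break the weight normalization that follows. A small sketch of the failure mode, using toy NumPy signatures:

```python
import numpy as np

sig1 = np.array([[0.5, 10.0], [0.5, 10.0]])  # toy signature rows: [weight, feature]
mask = np.array([False, False])  # no row differs from the most frequent color

# Unguarded filtering empties the signature, so normalizing the
# (now empty) weight column afterwards would divide by zero:
print(sig1[mask].shape)  # (0, 2)

# The new guard keeps the original signature in that case:
if np.any(mask):
    sig1 = sig1[mask]
print(sig1.shape)  # (2, 2)
```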
helm/benchmark/metrics/vision_language/image_metrics.py

```diff
@@ -324,7 +324,7 @@ class AnnotatedImageMetrics(Metric):
 
         # Compute the LPIPS score
         assert self._lpips_metric is not None
-        score: float = self._lpips_metric(img1, img2).detach().item()
+        score: float = 1.0 - self._lpips_metric(img1, img2).detach().item()
         return score
 
     def _calculate_fid(self, act1, act2):
```
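Note: LPIPS is a perceptual distance, where 0.0 means the images are identical, so the old code returned a score that ranked identical images worst. Subtracting the distance from 1.0 flips it into a similarity where higher is better:

```python
# LPIPS returns a distance (0.0 for identical images); the fix converts
# it into a similarity score so that higher means a closer match.
def lpips_similarity(lpips_distance: float) -> float:
    return 1.0 - lpips_distance


assert lpips_similarity(0.0) == 1.0  # identical images now get the best score
assert lpips_similarity(0.1) > lpips_similarity(0.8)  # closer images rank higher
```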
helm/benchmark/metrics/vision_language/image_utils.py

```diff
@@ -84,8 +84,8 @@ def sift_similarity(img_a: np.ndarray, img_b: np.ndarray) -> float:
     orb = cv2.ORB_create() if hasattr(cv2, "ORB_create") else cv2.ORB()
 
     # Find the keypoints and descriptors with ORB
-    _, desc_a = orb.detectAndCompute(img_a, None)
-    _, desc_b = orb.detectAndCompute(img_b, None)
+    _, desc_a = orb.detectAndCompute(img_a, None)  # type: ignore
+    _, desc_b = orb.detectAndCompute(img_b, None)  # type: ignore
 
     # Initialize the brute force matcher
     bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=False)
```
helm/benchmark/metrics/wildbench_metrics.py (new file)

```diff
@@ -0,0 +1,34 @@
+from typing import Any, Dict, List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class WildBenchScoreMetric(Metric):
+    """Score metrics for WildBench."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        annotations: Dict[str, Any] = request_state.annotations["wildbench"]
+        scores: List[float] = []
+        for annotation_key, annotation_value in annotations.items():
+            if annotation_key.endswith("_score") and annotation_value is not None:
+                scores.append(annotation_value)
+        if not scores:
+            raise ValueError("Could not compute WB Score because all annotators failed.")
+        score = sum(scores) / len(scores)
+        score_rescaled = (score - 1) / 9
+        return [
+            Stat(MetricName("wildbench_score")).add(score),
+            Stat(MetricName("wildbench_score_rescaled")).add(score_rescaled),
+        ]
```
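Note: the rescaling in `WildBenchScoreMetric` assumes each judge score lies on a 1-10 scale, so `(score - 1) / 9` maps the average linearly onto [0, 1]:

```python
def rescale_wildbench(score: float) -> float:
    # Linear map from the judges' 1-10 scale onto [0, 1].
    return (score - 1) / 9


assert rescale_wildbench(1.0) == 0.0   # worst possible judge score
assert rescale_wildbench(10.0) == 1.0  # best possible judge score
assert rescale_wildbench(5.5) == 0.5   # midpoint of the scale
```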
helm/benchmark/model_deployment_registry.py

```diff
@@ -4,7 +4,7 @@ from dataclasses import dataclass
 import cattrs
 import yaml
 
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, hwarn
 from helm.common.object_spec import ObjectSpec
 from helm.benchmark.model_metadata_registry import (
     ModelMetadata,
@@ -104,9 +104,7 @@ def register_model_deployment(model_deployment: ModelDeployment) -> None:
     try:
         model_metadata = get_model_metadata(model_name)
     except ValueError:
-        hlog(
-            f"WARNING: Could not find model metadata for model {model_name} of model deployment {model_deployment.name}"
-        )
+        hwarn(f"Could not find model metadata for model {model_name} of model deployment {model_deployment.name}")
         model_metadata = get_unknown_model_metadata(model_name)
         register_model_metadata(model_metadata)
     deployment_names: List[str] = model_metadata.deployment_names or [model_metadata.name]
@@ -130,7 +128,7 @@ def get_model_deployment(name: str, warn_deprecated: bool = False) -> ModelDeplo
         raise ValueError(f"Model deployment {name} not found")
     deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[name]
     if deployment.deprecated and warn_deprecated:
-        hlog(f"WARNING: DEPLOYMENT Model deployment {name} is deprecated")
+        hwarn(f"DEPLOYMENT Model deployment {name} is deprecated")
     return deployment
 
 
@@ -182,7 +180,7 @@ def get_default_model_deployment_for_model(
     deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[model_name]
     if deployment.deprecated and ignore_deprecated:
         if warn_arg_deprecated:
-            hlog(f"WARNING: Model deployment {model_name} is deprecated")
+            hwarn(f"Model deployment {model_name} is deprecated")
         return None
     return deployment.name
 
@@ -193,7 +191,7 @@ def get_default_model_deployment_for_model(
     if len(available_deployments) > 0:
         available_deployment_names: List[str] = [deployment.name for deployment in available_deployments]
         if warn_arg_deprecated:
-            hlog("WARNING: Model name is deprecated. Please use the model deployment name instead.")
+            hwarn("Model name is deprecated. Please use the model deployment name instead.")
         hlog(f"Available model deployments for model {model_name}: {available_deployment_names}")
 
         # Additionally, if there is a non-deprecated deployment, use it.
@@ -210,7 +208,7 @@ def get_default_model_deployment_for_model(
         else:
             chosen_deployment = available_deployments[0]
             if warn_arg_deprecated:
-                hlog(f"WARNING: All model deployments for model {model_name} are deprecated.")
+                hwarn(f"All model deployments for model {model_name} are deprecated.")
         if warn_arg_deprecated:
             hlog(
                 f"Choosing {chosen_deployment.name} (the first one) as "
```
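Note: the logging changes in this file (and in most files below) follow one mechanical pattern: `hlog(f"WARNING: ...")` call sites become `hwarn(...)`. A plausible sketch of the convention, assuming `hwarn` simply centralizes the warning prefix (the actual implementation lives in `helm/common/hierarchical_logger.py`, which this diff does not show):

```python
# Hypothetical reconstruction of the hwarn convention, not the real implementation.
def hlog(message: str) -> None:
    print(message)


def hwarn(message: str) -> None:
    # Call sites no longer hand-write the "WARNING:" prefix.
    hlog(f"WARNING: {message}")


hwarn("Model deployment foo is deprecated")
# -> WARNING: Model deployment foo is deprecated
```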
helm/benchmark/model_metadata_registry.py

```diff
@@ -22,6 +22,9 @@ CHATML_MODEL_TAG: str = "CHATML_MODEL_TAG"
 # OpenAI Chat format
 OPENAI_CHATGPT_MODEL_TAG: str = "OPENAI_CHATGPT_MODEL_TAG"
 
+# For NOVA models
+NOVA_MODEL_TAG: str = "NOVA_MODEL_TAG"
+
 # For Anthropic models
 ANTHROPIC_CLAUDE_1_MODEL_TAG: str = "ANTHROPIC_CLAUDE_1_MODEL_TAG"
 ANTHROPIC_CLAUDE_2_MODEL_TAG: str = "ANTHROPIC_CLAUDE_2_MODEL_TAG"
@@ -66,10 +69,18 @@ OPEN_FLAMINGO_MODEL_TAG: str = "OPEN_FLAMINGO_MODEL_TAG"
 LIMITED_FUNCTIONALITY_VLM_TAG: str = "LIMITED_FUNCTIONALITY_VLM_TAG"
 FULL_FUNCTIONALITY_VLM_TAG: str = "FULL_FUNCTIONALITY_VLM_TAG"
 
+# For audio-language models (AudioLMs)
+AUDIO_LANGUAGE_MODEL_TAG: str = "AUDIO_LANGUAGE_MODEL_TAG"
+
 # Deprecated models that are no longer available.
 # These are usually closed API models that have been permanently removed
 DEPRECATED_MODEL_TAG: str = "DEPRECATED_MODEL_TAG"
 
+# Unsupported models.
+# These are models that we have chosen not to support because they are
+# private, stale, non-notable, or difficult to implement.
+UNSUPPORTED_MODEL_TAG: str = "UNSUPPORTED_MODEL_TAG"
+
 # Frozen is set to false as the model_deployment_registry.py file
 # might populate the deployment_names field.
 
@@ -208,6 +219,11 @@ def is_vlm(model_name: str) -> bool:
     return model_has_tag(model_name, VISION_LANGUAGE_MODEL_TAG)
 
 
+def is_audiolm(model_name: str) -> bool:
+    """Returns True if the model is an audio-language model (AudioLM). False otherwise."""
+    return model_has_tag(model_name, AUDIO_LANGUAGE_MODEL_TAG)
+
+
 def get_unknown_model_metadata(helm_model_name: str) -> ModelMetadata:
     """Return placeholder ModelMetadata for an unknown model."""
     return ModelMetadata(
```
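Note: the new `is_audiolm` predicate mirrors the existing `is_vlm`: both reduce to a tag lookup against the model's metadata. A minimal sketch of that pattern with a stand-in registry (the real `model_has_tag` resolves `ModelMetadata` from the registry; the model names here are illustrative only):

```python
# Illustrative stand-in for the metadata registry's tag lookup.
MODEL_TAGS = {
    "example/audio-model": ["AUDIO_LANGUAGE_MODEL_TAG"],
    "example/vision-model": ["VISION_LANGUAGE_MODEL_TAG"],
}


def model_has_tag(model_name: str, tag: str) -> bool:
    return tag in MODEL_TAGS.get(model_name, [])


def is_audiolm(model_name: str) -> bool:
    return model_has_tag(model_name, "AUDIO_LANGUAGE_MODEL_TAG")


assert is_audiolm("example/audio-model")
assert not is_audiolm("example/vision-model")
```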
helm/benchmark/presentation/contamination.py

```diff
@@ -4,7 +4,7 @@ import dacite
 import importlib_resources as resources
 import yaml
 
-from helm.common.hierarchical_logger import htrack, hlog
+from helm.common.hierarchical_logger import htrack, hlog, hwarn
 from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA
 from helm.benchmark.presentation.schema import Schema
 
@@ -71,10 +71,10 @@ def validate_contamination(contamination: Contamination, schema: Schema):
     for point in contamination.points:
         for model in point.models:
             if model not in MODEL_NAME_TO_MODEL_METADATA:
-                hlog(f"WARNING: model {model} not defined in schema")
+                hwarn(f"model {model} not defined in schema")
         for group in point.groups:
             if group not in schema.name_to_run_group:
-                hlog(f"WARNING: group {group} not defined in schema")
+                hwarn(f"group {group} not defined in schema")
 
 
 def read_contamination():
```
helm/benchmark/presentation/create_plots.py

```diff
@@ -11,7 +11,7 @@ import numpy as np
 from scipy.stats import pearsonr
 
 from helm.benchmark.config_registry import register_builtin_configs_from_helm_package
-from helm.common.hierarchical_logger import hlog
+from helm.common.hierarchical_logger import hlog, setup_default_logging
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA
 
@@ -600,17 +600,7 @@ class Plotter:
         self.create_constrast_set_plots()
 
 
-def main():
-    """
-    This script creates the plots used in the HELM paper (https://arxiv.org/abs/2211.09110).
-    It should be run _after_ running `summarize.py` with the same `benchmark_output` and `suite` arguments and through
-    the top-level command `helm-create-plots`.
-    """
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-o", "--output-path", type=str, help="Path to benchmarking output", default="benchmark_output")
-    parser.add_argument("--suite", type=str, help="Name of the suite that we are plotting", required=True)
-    parser.add_argument("--plot-format", help="Format for saving plots", default="png", choices=["png", "pdf"])
-    args = parser.parse_args()
+def create_plots(args):
     register_builtin_configs_from_helm_package()
     base_path = os.path.join(args.output_path, "runs", args.suite)
     if not os.path.exists(os.path.join(base_path, "groups")):
@@ -621,5 +611,36 @@ def main():
     plotter.create_all_plots()
 
 
+def main():
+    """
+    This script creates the plots used in the HELM paper (https://arxiv.org/abs/2211.09110).
+    It should be run _after_ running `summarize.py` with the same `benchmark_output` and `suite` arguments and through
+    the top-level command `helm-create-plots`.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-o",
+        "--output-path",
+        type=str,
+        help="Path to benchmarking output",
+        default="benchmark_output",
+    )
+    parser.add_argument(
+        "--suite",
+        type=str,
+        help="Name of the suite that we are plotting",
+        required=True,
+    )
+    parser.add_argument(
+        "--plot-format",
+        help="Format for saving plots",
+        default="png",
+        choices=["png", "pdf"],
+    )
+    args = parser.parse_args()
+    setup_default_logging()
+    create_plots(args)
+
+
 if __name__ == "__main__":
     main()
```
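Note: with `main()` now reduced to argument parsing plus a call into `create_plots(args)`, the plotting logic can be driven programmatically as well as through the `helm-create-plots` CLI named in the docstring. A sketch of the programmatic path, assuming a suite has already been summarized under `benchmark_output/` (the suite name here is a placeholder):

```python
import argparse

from helm.benchmark.presentation.create_plots import create_plots

# Mirror the CLI flags visible in the diff above.
args = argparse.Namespace(
    output_path="benchmark_output",  # -o / --output-path default
    suite="my_suite",                # --suite (required; placeholder value)
    plot_format="png",               # --plot-format, "png" or "pdf"
)
create_plots(args)
```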
helm/benchmark/presentation/run_display.py

```diff
@@ -59,6 +59,9 @@ class DisplayPrediction:
 
     annotations: Optional[Dict[str, Any]]
 
+    thinking_text: Optional[str]
+    """Thinking text from thinking models."""
+
 
 @dataclass(frozen=True)
 class DisplayRequest:
@@ -266,6 +269,11 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
             request_state.instance
         )
 
+        if request_state.result.completions[0].multimodal_content:
+            additional_prediction: str = request_state.result.completions[0].multimodal_content.text
+            if additional_prediction:
+                predicted_text = f"{additional_prediction} {predicted_text}"
+
         # Process images and include if they exist
         images: List[str] = [
             encode_base64(image_location)
@@ -273,6 +281,10 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
             if os.path.exists(image_location)
         ]
 
+        thinking_text: Optional[str] = (
+            request_state.result.completions[0].thinking.text if request_state.result.completions[0].thinking else None
+        )
+
         predictions.append(
             DisplayPrediction(
                 instance_id=request_state.instance.id,
@@ -285,6 +297,7 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
                 reference_index=request_state.reference_index,
                 stats=trial_stats,
                 annotations=request_state.annotations,
+                thinking_text=thinking_text,
             )
         )
         requests.append(
```
helm/benchmark/presentation/schema.py

```diff
@@ -11,6 +11,7 @@ import importlib_resources as resources
 from helm.common.general import hlog
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.augmentations.perturbation_description import PERTURBATION_WORST
+from helm.common.hierarchical_logger import hwarn
 
 
 # TODO: change to `helm.benchmark.config`
@@ -281,5 +282,5 @@ def read_schema(schema_path: str) -> Schema:
         raw = yaml.safe_load(f)
     schema = dacite.from_dict(Schema, raw)
     if schema.adapter:
-        hlog(f"WARNING: The `adapter` field is deprecated and should be removed from schema file {schema_path}")
+        hwarn(f"The `adapter` field is deprecated and should be removed from schema file {schema_path}")
     return dataclasses.replace(schema, adapter=get_adapter_fields())
```
@@ -30,7 +30,7 @@ from helm.common.general import (
     unique_simplification,
 )
 from helm.common.codec import from_json
-from helm.common.hierarchical_logger import hlog, htrack, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack, htrack_block, hwarn, setup_default_logging
 from helm.benchmark.scenarios.scenario import ScenarioSpec
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from helm.benchmark.metrics.metric_name import MetricName
@@ -52,6 +52,10 @@ from helm.benchmark.presentation.schema import (
 from helm.benchmark.config_registry import register_builtin_configs_from_helm_package, register_configs_from_directory
 from helm.benchmark.presentation.run_display import write_run_display_json
 from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata, get_all_models
+from helm.common.object_spec import get_class_by_name
+
+
+MODEL_HEADER_CELL_VALUE = "Model"


 @dataclass(frozen=True)
@@ -98,7 +102,7 @@ def get_unique_stat_by_matcher(stats: List[Stat], matcher: MetricNameMatcher) ->
     # This is necessary for prompting ablations at the moment, since some scenarios normally have quasi_exact_match
     # as the main metric but multiple_choice_separate_original only generates exact_match
     if matcher.name == "quasi_exact_match":
-
+        hwarn("No quasi_exact_match metric found, looking for exact_match instead")
         matcher = replace(matcher, name="exact_match")
     matching_stats = [stat for stat in stats if matcher.matches(stat.name)]
     if len(matching_stats) == 0:
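The summarizer now defines `MODEL_HEADER_CELL_VALUE = "Model"` as the sentinel header for the model-name column and warns via `hwarn` before falling back from `quasi_exact_match` to `exact_match`. The fallback leans on `dataclasses.replace` to copy the frozen matcher with a new name; a toy version with a simplified matcher in place of HELM's `MetricNameMatcher`:

```python
# Simplified matcher for illustration; not HELM's MetricNameMatcher.
from dataclasses import dataclass, replace
from typing import List


@dataclass(frozen=True)
class Matcher:
    name: str

    def matches(self, stat_name: str) -> bool:
        return stat_name == self.name


def find_stats(stat_names: List[str], matcher: Matcher) -> List[str]:
    matching = [s for s in stat_names if matcher.matches(s)]
    if not matching and matcher.name == "quasi_exact_match":
        # Mirrors the diff: fall back to exact_match for prompting ablations.
        matcher = replace(matcher, name="exact_match")
        matching = [s for s in stat_names if matcher.matches(s)]
    return matching


print(find_stats(["exact_match"], Matcher("quasi_exact_match")))  # ['exact_match']
```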
@@ -262,18 +266,22 @@ def compute_aggregate_row_means(table: Table) -> List[Optional[float]]:
     """

     row_means: List[Optional[float]] = []
+    # if the first column contains the names of models, do not treat it like a value column
+    skip_first_column = table.header and table.header[0].value == MODEL_HEADER_CELL_VALUE

     # check for all header cells where specified, that lower_is_better is consistent
     orderings = []
-
-
+    header_cells = table.header[1:] if skip_first_column else table.header
+    for header_cell in header_cells:
+        orderings.append(header_cell.lower_is_better)
     if len(set(orderings)) != 1:
         raise Exception("Cannot mean columns with different values for lower_is_better")

     for row in table.rows:
         total = 0.0
         count = 0
-
+        row_cells = row[1:] if skip_first_column else row
+        for cell in row_cells:
             if cell.value is not None:
                 total += float(cell.value)
                 count += 1
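`compute_aggregate_row_means` previously averaged every column, which could sweep the model-name column into the mean; it now skips the first header cell and each row's first cell whenever the header starts with `MODEL_HEADER_CELL_VALUE`. A toy version of the updated logic, using plain lists instead of HELM's `Table`/`HeaderCell` types:

```python
from typing import List, Optional

MODEL_HEADER_CELL_VALUE = "Model"


def row_means(header: List[str], rows: List[list]) -> List[Optional[float]]:
    # If the first column holds model names, exclude it from the averages.
    skip_first = bool(header) and header[0] == MODEL_HEADER_CELL_VALUE
    means: List[Optional[float]] = []
    for row in rows:
        cells = row[1:] if skip_first else row
        values = [float(c) for c in cells if c is not None]
        means.append(sum(values) / len(values) if values else None)
    return means


header = ["Model", "Accuracy", "F1"]
rows = [["model-a", 0.8, 0.6], ["model-b", 0.4, None]]
print(row_means(header, rows))  # [0.7, 0.4] -- the model names never reach float()
```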
@@ -398,8 +406,8 @@ class Summarizer:
             included = False
             for run_group_name in run.run_spec.groups:  # go through the groups of the run to determine visibility
                 if run_group_name not in self.schema.name_to_run_group:
-
-                        f"
+                    hwarn(
+                        f"group {run_group_name} mentioned in run spec {run.run_spec.name} "
                         f"but undefined in {self.schema_path}, skipping"
                     )
                     continue
@@ -432,14 +440,14 @@ class Summarizer:
         run_spec_path: str = os.path.join(run_suite_path, run_dir_name, "run_spec.json")
         stats_path: str = os.path.join(run_suite_path, run_dir_name, "stats.json")
         if not os.path.exists(run_spec_path) or not os.path.exists(stats_path):
-
+            hwarn(f"{run_dir_name} doesn't have run_spec.json or stats.json, skipping")
             continue
         run_path: str = os.path.join(run_suite_path, run_dir_name)
         run = self.read_run(run_path)
         self.runs.append(run)
         if run.run_spec.name in self.runs_to_run_suites:
-
-                f"
+            hwarn(
+                f"Run entry {run.run_spec.name} is present in two different Run Suites. "
                 f"Defaulting to the latest assigned suite: {suite}"
             )
         self.runs_to_run_suites[run.run_spec.name] = suite
@@ -536,8 +544,8 @@ class Summarizer:

         for metric_name, run_spec_names in metric_name_to_run_spec_names.items():
             if metric_name not in defined_metric_names:
-
-                    f"
+                hwarn(
+                    f"metric name {metric_name} undefined in {self.schema_path} "
                     f"but appears in {len(run_spec_names)} run specs, including {run_spec_names[0]}"
                 )

@@ -730,8 +738,8 @@ class Summarizer:
         if stat is None:
             # Print out near misses to provide a more informative warning
             near_misses = [stat for stat in run.stats if stat.name.name == matcher.name]
-
-                f"
+            hwarn(
+                f"run spec {run.run_spec.name} does not have any stat matched by {matcher}, "
                 f"{len(near_misses)} near misses matching just the name"
             )
             if len(near_misses) > 0:
@@ -802,7 +810,7 @@ class Summarizer:
         # Create header (cells to display) and the list of metric name filters
         # (to pull out information later).
         if not columns or not adapter_to_runs:
-
+            hwarn(f"table {title}, has no rows or columns, leaving empty")
             return Table("empty", [], [])

         header: List[HeaderCell] = []
@@ -811,7 +819,7 @@ class Summarizer:
         num_groups = len(set(run_group.name for run_group, _ in columns))  # number of unique groups, determines headers

         # Column headers
-        header.append(HeaderCell(
+        header.append(HeaderCell(MODEL_HEADER_CELL_VALUE))
         for run_group, metric_group_name in columns:
             # check if at least the basic version of a metric group is evaluated (e.g., "bias" for "bias_detailed")
             if metric_group_name.replace("_detailed", "") not in run_group.metric_groups:
@@ -823,7 +831,7 @@ class Summarizer:
                 matcher = replace(matcher, sub_split=sub_split)
             header_field = self.schema.name_to_metric.get(matcher.name)
             if header_field is None:
-
+                hwarn(f"metric name {matcher.name} undefined in {self.schema_path}, skipping")
                 continue
             metadata = {
                 "metric": header_field.get_short_display_name(),
@@ -951,8 +959,8 @@ class Summarizer:
         all_run_spec_names = []
         for adapter_spec, runs in adapter_to_runs.items():
             if len(runs) > 1:
-
-                    f"
+                hwarn(
+                    f"table row corresponding to adapter spec {adapter_spec} has {len(runs)} > 1 runs:"
                     f" {[run.run_spec.name for run in runs]}"
                 )
             for run in runs:
@@ -969,22 +977,20 @@ class Summarizer:
         if strategy == AggregationStrategy.WIN_RATE:
             WIN_RATE_AGGREGATION = "mean"
             win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION)
-            description = "How many models this model outperforms on average (over columns)."
             aggregate_header_cells.append(
                 HeaderCell(
                     f"{WIN_RATE_AGGREGATION.capitalize()} win rate",
-                    description=
+                    description="How many models this model outperforms on average (over columns).",
                     lower_is_better=False,
                 )
             )
             aggregate_row_values.append(win_rates)
         elif strategy == AggregationStrategy.MEAN:
             means = compute_aggregate_row_means(table)
-            description = "An average over columns representing the mean performance."
             aggregate_header_cells.append(
                 HeaderCell(
-                    "Mean
-                    description=
+                    "Mean score",
+                    description="The mean of the scores from all columns.",
                     lower_is_better=table.header[0].lower_is_better,
                 )
             )
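The aggregation hunk inlines the header-cell descriptions and renames the mean column to "Mean score". Of the two strategies, win rate is the less obvious: a model's win rate in one column is the fraction of other models it beats there, and the aggregate is the mean of those per-column rates. An illustrative computation (not HELM's `compute_aggregate_row_win_rates`, and assuming higher scores are better):

```python
# Illustrative mean win rate over columns; ties count as losses here, which
# is an assumption, not necessarily HELM's tie-breaking rule.
from typing import List


def mean_win_rate(scores: List[List[float]]) -> List[float]:
    num_models = len(scores)
    num_cols = len(scores[0])
    result = []
    for i in range(num_models):
        col_rates = []
        for c in range(num_cols):
            wins = sum(1 for j in range(num_models) if j != i and scores[i][c] > scores[j][c])
            col_rates.append(wins / (num_models - 1))
        result.append(sum(col_rates) / num_cols)
    return result


# Three models scored on two columns.
print(mean_win_rate([[0.9, 0.7], [0.5, 0.8], [0.3, 0.2]]))  # [0.75, 0.75, 0.0]
```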
@@ -1226,10 +1232,57 @@ class Summarizer:


 @htrack("summarize")
+def summarize(args):
+    release: Optional[str] = None
+    suites: Optional[str] = None
+    suite: Optional[str] = None
+    if args.suite and (args.release or args.suites):
+        raise ValueError("If --suite is specified, then --release and --suites must NOT be specified.")
+    elif args.suite:
+        # Comment this out while we have a trial period for the `release` method.
+        # hlog(
+        #     "WARNING: The --suite flag is deprecated. Using --release and --suites is now preferred, "
+        #     "where --release specifies the name of a release and --suites specifies several run suites "
+        #     "to be included in that release."
+        # )
+        suite = args.suite
+    elif args.release or args.suites:
+        if not args.release or not args.suites:
+            raise ValueError("If --release is specified, then --suites must also be specified and vice versa")
+        release = args.release
+        suites = args.suites
+    else:
+        raise ValueError("Exactly one of --release or --suite must be specified.")
+
+    schema_path = args.schema_path if args.schema_path else get_default_schema_path()
+
+    register_builtin_configs_from_helm_package()
+    register_configs_from_directory(args.local_path)
+
+    # Output JSON files summarizing the benchmark results which will be loaded in the web interface
+    summarizer_cls = get_class_by_name(args.summarizer_class_name) if args.summarizer_class_name else Summarizer
+    summarizer = summarizer_cls(
+        release=release,
+        suites=suites,
+        suite=suite,
+        schema_path=schema_path,
+        output_path=args.output_path,
+        verbose=args.debug,
+        num_threads=args.num_threads,
+        allow_unknown_models=args.allow_unknown_models,
+    )
+    summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json)
+    hlog("Done.")
+
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "-o",
+        "-o",
+        "--output-path",
+        type=str,
+        help="Where the benchmarking output lives",
+        default="benchmark_output",
     )
     parser.add_argument(
         "--schema-path",
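The largest change is structural: the body of `main()` moves into a standalone `summarize(args)`, which now carries the `@htrack("summarize")` decorator, while `main()` is reduced to argument parsing, `setup_default_logging()`, and a call into it. That makes the pipeline callable without the CLI. A hypothetical programmatic invocation; the import path and the exact attribute set are assumptions read off the flags in this diff:

```python
# Hypothetical: drive the refactored entry point from Python instead of the CLI.
import argparse

from helm.benchmark.presentation.summarize import summarize  # assumed module path

args = argparse.Namespace(
    suite=None,
    release="v1",                 # --release and --suites must be set together
    suites=["my-suite"],
    schema_path=None,             # falls back to get_default_schema_path()
    local_path="prod_env",
    output_path="benchmark_output",
    debug=False,
    num_threads=8,
    allow_unknown_models=True,
    summarizer_class_name=None,   # None selects the default Summarizer
    skip_completed_run_display_json=False,
)
summarize(args)
```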
@@ -1247,9 +1300,18 @@ def main():
         help="Experimental: Name of the release this summarization should go under.",
     )
     parser.add_argument(
-        "--suites",
+        "--suites",
+        type=str,
+        nargs="+",
+        help="Experimental: List of suites to summarize for this this release.",
+    )
+    parser.add_argument(
+        "-n",
+        "--num-threads",
+        type=int,
+        help="Max number of threads used to summarize",
+        default=8,
     )
-    parser.add_argument("-n", "--num-threads", type=int, help="Max number of threads used to summarize", default=8)
     parser.add_argument(
         "--debug",
         action="store_true",
@@ -1272,47 +1334,15 @@ def main():
         help="Whether to allow unknown models in the metadata file",
         default=True,
     )
-
-
-
-
-
-    if args.suite and (args.release or args.suites):
-        raise ValueError("If --suite is specified, then --release and --suites must NOT be specified.")
-    elif args.suite:
-        # Comment this out while we have a trial period for the `release` method.
-        # hlog(
-        #     "WARNING: The --suite flag is deprecated. Using --release and --suites is now preferred, "
-        #     "where --release specifies the name of a release and --suites specifies several run suites "
-        #     "to be included in that release."
-        # )
-        suite = args.suite
-    elif args.release or args.suites:
-        if not args.release or not args.suites:
-            raise ValueError("If --release is specified, then --suites must also be specified and vice versa")
-        release = args.release
-        suites = args.suites
-    else:
-        raise ValueError("Exactly one of --release or --suite must be specified.")
-
-    schema_path = args.schema_path if args.schema_path else get_default_schema_path()
-
-    register_builtin_configs_from_helm_package()
-    register_configs_from_directory(args.local_path)
-
-    # Output JSON files summarizing the benchmark results which will be loaded in the web interface
-    summarizer = Summarizer(
-        release=release,
-        suites=suites,
-        suite=suite,
-        schema_path=schema_path,
-        output_path=args.output_path,
-        verbose=args.debug,
-        num_threads=args.num_threads,
-        allow_unknown_models=args.allow_unknown_models,
+    parser.add_argument(
+        "--summarizer-class-name",
+        type=str,
+        default=None,
+        help="EXPERIMENTAL: Full class name of the Summarizer class to use. If unset, uses the default Summarizer.",
     )
-
-
+    args = parser.parse_args()
+    setup_default_logging()
+    summarize(args)


 if __name__ == "__main__":
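The new `--summarizer-class-name` flag, resolved through `get_class_by_name`, lets a deployment substitute its own `Summarizer` subclass without forking `main()`. A sketch of what such a plug-in might look like; the subclass body, module layout, and the `run_pipeline` signature shown are assumptions, and only the dynamic loading by fully qualified class name comes from the diff:

```python
# my_package/quiet_summarizer.py (hypothetical module)
from helm.benchmark.presentation.summarize import Summarizer  # assumed import path


class QuietSummarizer(Summarizer):
    """A drop-in Summarizer variant; override hooks as needed."""

    def run_pipeline(self, skip_completed: bool) -> None:
        # Insert custom pre/post-processing around the standard pipeline here.
        super().run_pipeline(skip_completed=skip_completed)
```

It would then be selected with something like `--summarizer-class-name my_package.quiet_summarizer.QuietSummarizer` when invoking the summarization CLI.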