crfm-helm 0.4.0__py3-none-any.whl → 0.5.10__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- crfm_helm-0.5.10.dist-info/METADATA +369 -0
- crfm_helm-0.5.10.dist-info/RECORD +1008 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +80 -29
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
- helm/benchmark/adaptation/common_adapter_specs.py +443 -0
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/aci_bench_annotator.py +84 -0
- helm/benchmark/annotation/air_bench_annotator.py +79 -0
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/annotator.py +48 -0
- helm/benchmark/annotation/annotator_factory.py +50 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +258 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +96 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +55 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
- helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
- helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/live_qa_annotator.py +76 -0
- helm/benchmark/annotation/med_dialog_annotator.py +88 -0
- helm/benchmark/annotation/medalign_annotator.py +89 -0
- helm/benchmark/annotation/medi_qa_annotator.py +87 -0
- helm/benchmark/annotation/medication_qa_annotator.py +86 -0
- helm/benchmark/annotation/mental_health_annotator.py +87 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
- helm/benchmark/annotation/model_as_judge.py +309 -0
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +100 -0
- helm/benchmark/annotation_executor.py +144 -0
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +26 -4
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +56 -19
- helm/benchmark/augmentations/translate_perturbation.py +31 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +54 -25
- helm/benchmark/huggingface_registration.py +28 -10
- helm/benchmark/metrics/air_bench_metrics.py +3212 -0
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/basic_metrics.py +437 -667
- helm/benchmark/metrics/bbq_metrics.py +17 -6
- helm/benchmark/metrics/bias_metrics.py +18 -9
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/classification_metrics.py +107 -22
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/code_metrics_helper.py +11 -3
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +174 -0
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
- helm/benchmark/metrics/copyright_metrics.py +5 -5
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +8 -114
- helm/benchmark/metrics/dry_run_metrics.py +35 -6
- helm/benchmark/metrics/efficiency_metrics.py +287 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +67 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
- helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
- helm/benchmark/metrics/language_modeling_metrics.py +111 -0
- helm/benchmark/metrics/live_qa_metrics.py +35 -0
- helm/benchmark/metrics/llm_jury_metrics.py +58 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
- helm/benchmark/metrics/medec_metrics.py +124 -0
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/metric.py +121 -175
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +23 -7
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/omni_math_metrics.py +44 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/ranking_metrics.py +5 -5
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/safety_metrics.py +91 -0
- helm/benchmark/metrics/seahelm_metrics.py +201 -0
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +8 -11
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +150 -11
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +145 -70
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
- helm/benchmark/metrics/test_metric.py +3 -3
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
- helm/benchmark/metrics/toxicity_metrics.py +37 -7
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/unitxt_metrics.py +107 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/metrics/wildbench_metrics.py +54 -0
- helm/benchmark/model_deployment_registry.py +69 -5
- helm/benchmark/model_metadata_registry.py +58 -2
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +51 -20
- helm/benchmark/presentation/run_display.py +51 -12
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +83 -66
- helm/benchmark/presentation/summarize.py +483 -388
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/presentation/test_run_entry.py +2 -2
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/presentation/test_summarize.py +148 -6
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +151 -87
- helm/benchmark/run_expander.py +418 -33
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +180 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/call_center_run_specs.py +201 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1393 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +224 -0
- helm/benchmark/run_specs/finance_run_specs.py +114 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +625 -0
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +188 -0
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +191 -0
- helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
- helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +63 -62
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
- helm/benchmark/scenarios/air_bench_scenario.py +76 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
- helm/benchmark/scenarios/banking77_scenario.py +77 -0
- helm/benchmark/scenarios/bbq_scenario.py +17 -2
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +18 -3
- helm/benchmark/scenarios/boolq_scenario.py +21 -1
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
- helm/benchmark/scenarios/clear_scenario.py +180 -0
- helm/benchmark/scenarios/cleva_scenario.py +482 -3
- helm/benchmark/scenarios/code_scenario.py +46 -4
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +33 -1
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
- helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
- helm/benchmark/scenarios/disinformation_scenario.py +32 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
- helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
- helm/benchmark/scenarios/financebench_scenario.py +74 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
- helm/benchmark/scenarios/gpqa_scenario.py +98 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +21 -2
- helm/benchmark/scenarios/gsm_scenario.py +31 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
- helm/benchmark/scenarios/headqa_scenario.py +158 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/ice_scenario.py +28 -4
- helm/benchmark/scenarios/ifeval_scenario.py +71 -0
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +26 -3
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
- helm/benchmark/scenarios/legal_support_scenario.py +24 -1
- helm/benchmark/scenarios/legalbench_scenario.py +45 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
- helm/benchmark/scenarios/lextreme_scenario.py +22 -1
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +81 -22
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +30 -1
- helm/benchmark/scenarios/medalign_scenario.py +117 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
- helm/benchmark/scenarios/medbullets_scenario.py +167 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
- helm/benchmark/scenarios/medec_scenario.py +148 -0
- helm/benchmark/scenarios/medhallu_scenario.py +95 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +146 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
- helm/benchmark/scenarios/mmlu_scenario.py +32 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +31 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +71 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
- helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
- helm/benchmark/scenarios/quac_scenario.py +24 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
- helm/benchmark/scenarios/raft_scenario.py +33 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
- helm/benchmark/scenarios/scenario.py +44 -1
- helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +109 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
- helm/benchmark/scenarios/summarization_scenario.py +48 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +4 -3
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/unitxt_scenario.py +62 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
- helm/benchmark/scenarios/vicuna_scenario.py +22 -2
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
- helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
- helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
- helm/benchmark/scenarios/wikifact_scenario.py +31 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +101 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +32 -2
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +78 -50
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +269 -0
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_classic.yaml +259 -1140
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +191 -0
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_image2struct.yaml +588 -0
- helm/benchmark/static/schema_instruction_following.yaml +161 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/schema_lite.yaml +3 -286
- helm/benchmark/static/schema_long_context.yaml +282 -0
- helm/benchmark/static/schema_medhelm.yaml +1176 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu.yaml +1449 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +283 -0
- helm/benchmark/static/schema_seahelm.yaml +723 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/schema_thai.yaml +244 -0
- helm/benchmark/static/schema_torr.yaml +474 -0
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_unitxt.yaml +370 -0
- helm/benchmark/static/schema_vhelm.yaml +933 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
- helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
- helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
- helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
- helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +19 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/default_window_service.py +3 -45
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
- helm/benchmark/window_services/ice_window_service.py +1 -35
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +22 -5
- helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
- helm/benchmark/window_services/test_bloom_window_service.py +5 -4
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
- helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
- helm/benchmark/window_services/test_gptj_window_service.py +11 -5
- helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
- helm/benchmark/window_services/test_openai_window_service.py +18 -12
- helm/benchmark/window_services/test_opt_window_service.py +6 -5
- helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
- helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
- helm/benchmark/window_services/test_t511b_window_service.py +5 -4
- helm/benchmark/window_services/test_ul2_window_service.py +5 -4
- helm/benchmark/window_services/test_utils.py +6 -6
- helm/benchmark/window_services/test_yalm_window_service.py +5 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -13
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +1 -28
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +78 -12
- helm/clients/aleph_alpha_client.py +114 -0
- helm/{proxy/clients → clients}/anthropic_client.py +304 -21
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +122 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +199 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
- helm/clients/audio_language/qwen_audiolm_client.py +153 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/audio_language/test.py +62 -0
- helm/{proxy/clients → clients}/auto_client.py +72 -31
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +381 -0
- helm/clients/bedrock_utils.py +105 -0
- helm/{proxy/clients → clients}/client.py +92 -17
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +105 -14
- helm/clients/dspy_client.py +135 -0
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +8 -6
- helm/clients/google_translate_client.py +35 -0
- helm/clients/grok_client.py +36 -0
- helm/{proxy/clients → clients}/http_model_client.py +8 -8
- helm/{proxy/clients → clients}/huggingface_client.py +157 -86
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +269 -0
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +80 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +192 -0
- helm/clients/image_generation/dalle2_client.py +194 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +191 -0
- helm/clients/image_generation/deep_floyd_client.py +80 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +88 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +116 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +113 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
- helm/{proxy/clients → clients}/megatron_client.py +7 -5
- helm/clients/mistral_client.py +180 -0
- helm/clients/moderation_api_client.py +111 -0
- helm/clients/nvidia_nim_client.py +32 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +604 -0
- helm/clients/openai_responses_client.py +200 -0
- helm/clients/openrouter_client.py +31 -0
- helm/{proxy/clients → clients}/palmyra_client.py +31 -14
- helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
- helm/clients/reka_client.py +190 -0
- helm/clients/simple_client.py +64 -0
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +95 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +98 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/test_simple_client.py +19 -0
- helm/clients/test_together_client.py +184 -0
- helm/clients/together_client.py +599 -0
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +488 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
- helm/clients/vision_language/huggingface_vlm_client.py +114 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/paligemma_client.py +147 -0
- helm/clients/vision_language/palmyra_vision_client.py +101 -0
- helm/clients/vision_language/qwen2_vlm_client.py +189 -0
- helm/clients/vision_language/qwen_vlm_client.py +174 -0
- helm/clients/vllm_client.py +80 -0
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +105 -0
- helm/clients/yi_client.py +28 -0
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +23 -33
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +10 -2
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +10 -3
- helm/common/hierarchical_logger.py +124 -12
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +60 -5
- helm/common/key_value_store.py +41 -10
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +14 -1
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +8 -7
- helm/common/multimodal_request_utils.py +57 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +45 -19
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_general.py +10 -0
- helm/common/test_logging.py +94 -0
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +1 -10
- helm/config/model_deployments.yaml +4713 -1005
- helm/config/model_metadata.yaml +4045 -255
- helm/config/tokenizer_configs.yaml +1091 -50
- helm/proxy/accounts.py +31 -4
- helm/proxy/cli.py +6 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/critique/model_critique_client.py +40 -10
- helm/proxy/example_queries.py +33 -28
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +82 -18
- helm/proxy/services/remote_service.py +32 -7
- helm/proxy/services/server_service.py +71 -69
- helm/proxy/services/service.py +30 -6
- helm/proxy/services/test_remote_service.py +6 -5
- helm/proxy/services/test_service.py +1 -13
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +61 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +462 -0
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/tokenizers/ai21_tokenizer.py +52 -0
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
- helm/tokenizers/cohere_tokenizer.py +50 -0
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
- helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/METADATA +0 -264
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/scenarios/numeracy_scenario.py +0 -784
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/test_model_deployment_definition.py +0 -92
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/ai21_window_service.py +0 -258
- helm/benchmark/window_services/cohere_window_service.py +0 -163
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -74
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -326
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/test_together_client.py +0 -97
- helm/proxy/clients/together_client.py +0 -334
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
- helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
- helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
- helm/proxy/tokenizers/ice_tokenizer.py +0 -30
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
- /helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{benchmark → proxy}/static/general.js +0 -0
- /helm/{benchmark → proxy}/static/info-icon.png +0 -0
@@ -9,12 +9,11 @@ Usage:
 """
 
 import argparse
-import cattrs
+import dataclasses
 import os
 import datetime
 import urllib.parse
 import json
-import yaml
 from collections import defaultdict
 from dataclasses import dataclass, replace
 from statistics import mean, median
@@ -22,50 +21,50 @@ from typing import List, Optional, Dict, Any, Tuple, Set
 
 from tqdm import tqdm
 from helm.benchmark.model_deployment_registry import get_model_deployment
-
 from helm.benchmark.model_metadata_registry import get_unknown_model_metadata
 from helm.common.general import (
     write,
     ensure_directory_exists,
     asdict_without_nones,
-    serialize_dates,
     parallel_map,
     singleton,
     unique_simplification,
 )
 from helm.common.codec import from_json
-from helm.common.hierarchical_logger import hlog, htrack, htrack_block
-from helm.benchmark.scenarios.scenario import ScenarioSpec
+from helm.common.hierarchical_logger import hlog, htrack, htrack_block, hwarn, setup_default_logging
+from helm.benchmark.scenarios.scenario import Scenario, ScenarioMetadata, ScenarioSpec, create_scenario
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from helm.benchmark.data_overlap.data_overlap_spec import DataOverlapStats, GroupOverlapStats
-from helm.benchmark.data_overlap.light_scenario import ScenarioSpecInstanceIds
 from helm.benchmark.metrics.metric_name import MetricName
-from helm.benchmark.metrics.metric import
+from helm.benchmark.metrics.metric import (
+    MetricInterface,
+    MetricMetadata,
+    MetricSpec,
+    create_metric,
+    get_all_stats_by_name,
+)
 from helm.benchmark.metrics.statistic import Stat, merge_stat
-from helm.benchmark.
+from helm.benchmark.run_spec import RunSpec
+from helm.benchmark.runner import LATEST_SYMLINK
 from helm.benchmark.presentation.table import Cell, HeaderCell, Table, Hyperlink, table_to_latex
 from helm.benchmark.presentation.schema import (
+    MetricGroup,
     MetricNameMatcher,
     RunGroup,
+    Field,
+    Schema,
     read_schema,
-    SCHEMA_CLASSIC_YAML_FILENAME,
+    get_default_schema_path,
     BY_GROUP,
     THIS_GROUP_ONLY,
     NO_GROUPS,
 )
-from helm.benchmark.presentation.contamination import (
-    read_contamination,
-    validate_contamination,
-    CONTAMINATION_SYMBOLS,
-    CONTAMINATION_STYLES,
-    CONTAMINATION_LEVEL_STRONG,
-)
 from helm.benchmark.config_registry import register_builtin_configs_from_helm_package, register_configs_from_directory
 from helm.benchmark.presentation.run_display import write_run_display_json
-from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata
+from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata, get_all_models
+from helm.common.object_spec import get_class_by_name
 
 
-
+MODEL_HEADER_CELL_VALUE = "Model"
 
 
 @dataclass(frozen=True)
@@ -112,7 +111,7 @@ def get_unique_stat_by_matcher(stats: List[Stat], matcher: MetricNameMatcher) ->
     # This is necessary for prompting ablations at the moment, since some scenarios normally have quasi_exact_match
     # as the main metric but multiple_choice_separate_original only generates exact_match
     if matcher.name == "quasi_exact_match":
-
+        hwarn("No quasi_exact_match metric found, looking for exact_match instead")
         matcher = replace(matcher, name="exact_match")
     matching_stats = [stat for stat in stats if matcher.matches(stat.name)]
     if len(matching_stats) == 0:
@@ -172,7 +171,7 @@ def get_model_metadata_for_adapter_spec(adapter_spec: AdapterSpec) -> ModelMetad
     except ValueError:
         pass
 
-    # Return a placeholder "
+    # Return a placeholder "unknown model" model metadata.
    return get_unknown_model_metadata(adapter_spec.model)
 
 
@@ -233,25 +232,27 @@ def compute_aggregate_row_win_rates(table: Table, aggregation: str = "mean") ->
     """
     assert aggregation in ["mean", "median"]
     win_rates_per_row: List[List[float]] = [[] for _ in table.rows]
-    for
+    for column_index, header_cell in enumerate(table.header):
         lower_is_better = header_cell.lower_is_better
         if lower_is_better is None:  # column does not have a meaningful ordering
             continue
-
-
-
-        if
-
-
-
-
-
-
-
+        value_to_count: Dict[float, int] = defaultdict(int)
+        for row in table.rows:
+            value = row[column_index].value
+            if value is not None:
+                value_to_count[value] += 1
+        value_to_wins: Dict[float, float] = {}
+        acc_count = 0
+        for value, value_count in sorted(value_to_count.items(), reverse=lower_is_better):
+            value_to_wins[value] = acc_count + ((value_count - 1) / 2)
+            acc_count += value_count
+        total_count = acc_count
+        if total_count < 2:
             continue
-        for
-
-
+        for row_index, row in enumerate(table.rows):
+            value = row[column_index].value
+            if value is not None:
+                win_rates_per_row[row_index].append(value_to_wins[row[column_index].value] / (total_count - 1))
 
     # Note: the logic up to here is somewhat general as it simply computes win rates across columns for each row.
     # Here, we simply average these win rates but we might want some more involved later (e.g., weighted average).
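
The tie handling in compute_aggregate_row_win_rates is worth a worked example: each value wins against every strictly worse value in its column, tied values split their mutual wins via the (value_count - 1) / 2 term, and dividing by (total_count - 1) normalizes so the best value scores 1.0. Here is a standalone sketch of the same arithmetic (toy values, not the HELM Table API):

    from collections import defaultdict
    from typing import Dict, List, Optional

    def toy_column_win_rates(values: List[Optional[float]], lower_is_better: bool = False) -> Dict[float, float]:
        # Count occurrences of each non-null value in the column.
        value_to_count: Dict[float, int] = defaultdict(int)
        for value in values:
            if value is not None:
                value_to_count[value] += 1
        # Walk from worst to best, accumulating how many values each one beats.
        value_to_wins: Dict[float, float] = {}
        acc_count = 0
        for value, count in sorted(value_to_count.items(), reverse=lower_is_better):
            value_to_wins[value] = acc_count + (count - 1) / 2
            acc_count += count
        return {value: wins / (acc_count - 1) for value, wins in value_to_wins.items()}

    print(toy_column_win_rates([0.9, 0.7, 0.7, 0.1]))
    # {0.1: 0.0, 0.7: 0.5, 0.9: 1.0}
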
@@ -266,7 +267,47 @@ def compute_aggregate_row_win_rates(table: Table, aggregation: str = "mean") ->
     return aggregate_win_rates
 
 
-
+def compute_aggregate_row_means(table: Table) -> List[Optional[float]]:
+    """
+    Computes the aggregate mean of each row across columns.
+    Returns a list of means, one per row, with None if a row was never meaningfully comparable (i.e., all
+    non-null values of the row are in columns we skip).
+    """
+
+    row_means: List[Optional[float]] = []
+    # if the first column contains the names of models, do not treat it like a value column
+    skip_first_column = table.header and table.header[0].value == MODEL_HEADER_CELL_VALUE
+
+    # check for all header cells where specified, that lower_is_better is consistent
+    orderings = []
+    header_cells = table.header[1:] if skip_first_column else table.header
+    for header_cell in header_cells:
+        orderings.append(header_cell.lower_is_better)
+    if len(set(orderings)) != 1:
+        raise Exception("Cannot mean columns with different values for lower_is_better")
+
+    for row in table.rows:
+        total = 0.0
+        count = 0
+        row_cells = row[1:] if skip_first_column else row
+        for cell in row_cells:
+            if cell.value is not None:
+                total += float(cell.value)
+                count += 1
+        if count == 0:
+            row_means.append(None)
+        else:
+            row_means.append(total / count)
+
+    return row_means
+
+
+class AggregationStrategy:
+    WIN_RATE = "win_rate"
+    MEAN = "mean"
+
+
+ALL_AGGREGATION_STRATEGIES = [AggregationStrategy.WIN_RATE, AggregationStrategy.MEAN]
 
 
 class Summarizer:
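
compute_aggregate_row_means refuses to average columns whose lower_is_better flags disagree, since a mean over mixed-direction metrics is meaningless. A minimal sketch of the same guard, with plain lists standing in for HELM's header and row cells (names here are illustrative only):

    from typing import List, Optional

    def safe_row_mean(row: List[Optional[float]], lower_is_better_flags: List[Optional[bool]]) -> Optional[float]:
        # All value columns must agree on direction before averaging.
        if len(set(lower_is_better_flags)) != 1:
            raise ValueError("Cannot mean columns with different values for lower_is_better")
        values = [v for v in row if v is not None]
        return sum(values) / len(values) if values else None

    print(safe_row_mean([1.0, None, 0.5], [False, False, False]))  # 0.75
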
@@ -309,7 +350,7 @@ class Summarizer:
         release: Optional[str],
         suites: Optional[List[str]],
         suite: Optional[str],
-
+        schema_path: Optional[str],
         output_path: str,
         verbose: bool,
         num_threads: int,
@@ -329,7 +370,7 @@ class Summarizer:
         self.suites: List[str]
         self.run_suite_paths: List[str]
         self.suite: Optional[str] = None
-        self.
+        self.schema_path = schema_path
         self.release: Optional[str] = None
         if suite:
             self.suite = suite
@@ -344,12 +385,8 @@ class Summarizer:
         self.verbose: bool = verbose
         self.num_threads: int = num_threads
         self.allow_unknown_models: bool = allow_unknown_models
-
-
-
-        self.schema = read_schema(schema_file)
-        self.contamination = read_contamination()
-        validate_contamination(self.contamination, self.schema)
+        self.schema = read_schema(schema_path) if schema_path else Schema()
+        self.metric_metadata: List[MetricMetadata] = []
 
     def read_run(self, run_path: str) -> Run:
         """Load the `Run` object from `run_path`."""
@@ -375,9 +412,9 @@ class Summarizer:
             included = False
             for run_group_name in run.run_spec.groups:  # go through the groups of the run to determine visibility
                 if run_group_name not in self.schema.name_to_run_group:
-
-                        f"
-                        f"but undefined in {self.
+                    hwarn(
+                        f"group {run_group_name} mentioned in run spec {run.run_spec.name} "
+                        f"but undefined in {self.schema_path}, skipping"
                     )
                     continue
                 run_group = self.schema.name_to_run_group[run_group_name]
@@ -396,6 +433,8 @@ class Summarizer:
 
     def read_runs_for_suite(self, suite, run_suite_path):
         """Load the runs in the run suite path."""
+        if not os.path.exists(run_suite_path):
+            raise Exception(f"Suite {suite} does not exist at {run_suite_path}")
         # run_suite_path can contain subdirectories that are not runs (e.g. eval_cache, groups)
         # so filter them out.
         run_dir_names = sorted(
@@ -409,14 +448,14 @@ class Summarizer:
             run_spec_path: str = os.path.join(run_suite_path, run_dir_name, "run_spec.json")
             stats_path: str = os.path.join(run_suite_path, run_dir_name, "stats.json")
             if not os.path.exists(run_spec_path) or not os.path.exists(stats_path):
-
+                hwarn(f"{run_dir_name} doesn't have run_spec.json or stats.json, skipping")
                 continue
             run_path: str = os.path.join(run_suite_path, run_dir_name)
             run = self.read_run(run_path)
             self.runs.append(run)
             if run.run_spec.name in self.runs_to_run_suites:
-
-                    f"
+                hwarn(
+                    f"Run entry {run.run_spec.name} is present in two different Run Suites. "
                     f"Defaulting to the latest assigned suite: {suite}"
                 )
             self.runs_to_run_suites[run.run_spec.name] = suite
@@ -433,11 +472,205 @@ class Summarizer:
                 self.group_adapter_to_runs[group_name][adapter_spec].append(run)
                 self.group_scenario_adapter_to_runs[group_name][scenario_spec][adapter_spec].append(run)
 
-
+    @dataclass(frozen=True)
+    class _ModelField(Field):
+        """The frontend version of ModelMetadata.
+
+        The frontend expects schema.json to contains a field under "model" that contains a list of `ModelField`s.
+
+        All attributes have the same meaning as in ModelMetadata."""
+
+        # TODO: Migrate frontend to use ModelMetadata instead of ModelField and delete this.
+        creator_organization: Optional[str] = None
+        access: Optional[str] = None
+        todo: bool = False
+        release_date: Optional[str] = None
+        num_parameters: Optional[int] = None
+
+    def get_model_field_dicts(self) -> List[Dict]:
+        """Get a list of `ModelField`s dicts that will be written to schema.json.
+
+        The frontend expects schema.json to contains a field under "model" that contains a list of `ModelField`s.
+
+        This is populated by reading the `ModelMetadata` configs and filtering down to models that were
+        actually used, and converting each `ModelMetadata` to a `ModelField`."""
+        # TODO: Migrate frontend to use ModelMetadata instead of ModelField and delete this.
+        used_model_names: Set[str] = set()
+        for run in self.runs:
+            used_model_names.add(get_model_metadata_for_adapter_spec(run.run_spec.adapter_spec).name)
+
+        model_field_dicts: List[Dict] = []
+        for model_name in get_all_models():
+            if model_name not in used_model_names:
+                continue
+            model_metadata = get_model_metadata(model_name)
+            model_field = Summarizer._ModelField(
+                name=model_metadata.name,
+                display_name=model_metadata.display_name,
+                short_display_name=model_metadata.display_name,
+                description=model_metadata.description,
+                creator_organization=model_metadata.creator_organization_name,
+                access=model_metadata.access,
+                todo=False,
+                release_date=model_metadata.release_date.isoformat() if model_metadata.release_date else None,
+                num_parameters=model_metadata.num_parameters,
+            )
+            model_field_dicts.append(asdict_without_nones(model_field))
+        return model_field_dicts
+
+    def get_metric_metadata(self) -> List[MetricMetadata]:
+        if self.metric_metadata:
+            return self.metric_metadata
+        metric_specs: List[MetricSpec] = []
+        for run in self.runs:
+            metric_specs.extend(run.run_spec.metric_specs)
+        metric_specs = list(set(metric_specs))
+        metric_name_to_metadata: Dict[str, MetricMetadata] = {}
+        for metric_spec in metric_specs:
+            try:
+                metric: MetricInterface = create_metric(metric_spec)
+                metric_metadata_list = metric.get_metadata()
+                for metric_metadata in metric_metadata_list:
+                    metric_name_to_metadata[metric_metadata.name] = metric_metadata
+            except NotImplementedError:
+                pass
+            except (ModuleNotFoundError, AttributeError, TypeError):
+                pass
+
+        run_stat_names: Set[str] = set()
+        for run in self.runs:
+            for stat in run.stats:
+                run_stat_names.add(stat.name.name)
+
+        metric_names_to_prune = set(metric_name_to_metadata.keys()) - run_stat_names
+        for metric_name_to_prune in metric_names_to_prune:
+            del metric_name_to_metadata[metric_name_to_prune]
+        self.metric_metadata = list(metric_name_to_metadata.values())
+        return self.metric_metadata
+
+    def metric_metadata_to_field(self, metric_metadata: MetricMetadata) -> Field:
+        return Field(
+            name=metric_metadata.name,
+            display_name=metric_metadata.display_name,
+            short_display_name=metric_metadata.short_display_name,
+            description=metric_metadata.description,
+            lower_is_better=metric_metadata.lower_is_better,
+        )
+
+    def auto_generate_metric_fields(self) -> List[Field]:
+        return [self.metric_metadata_to_field(metric_metadata) for metric_metadata in self.get_metric_metadata()]
+
+    def auto_generate_metric_groups(self) -> List[MetricGroup]:
+        metric_groups = [
+            MetricGroup(
+                name="main_metric",
+                display_name="Main Metric",
+                description="Main Metric",
+                metrics=[MetricNameMatcher(name="${main_name}", split="${main_split}")],
+            )
+        ]
+        metric_group_to_metrics: Dict[str, List[str]] = {}
+        for metric_metadata in self.metric_metadata:
+            if metric_metadata.group:
+                if metric_metadata.group not in metric_group_to_metrics:
+                    metric_group_to_metrics[metric_metadata.group] = []
+                metric_group_to_metrics[metric_metadata.group].append(metric_metadata.name)
+        for metric_group, metric_names in metric_group_to_metrics.items():
+            display_name = metric_group.replace("_", " ").capitalize()
+            metric_groups.append(
+                MetricGroup(
+                    name=metric_group,
+                    # TODO: Make display_name and description nicer
+                    display_name=display_name,
+                    description=display_name,
+                    aggregation_strategies=[],
+                    metrics=[
+                        MetricNameMatcher(name=metric_name, split="${main_split}") for metric_name in metric_names
+                    ],
+                )
+            )
+        return metric_groups
+
+    def get_scenario_metadata(self) -> List[ScenarioMetadata]:
+        scenario_specs = [run.run_spec.scenario_spec for run in self.runs]
+        scenario_specs = list(set(scenario_specs))
+        scenario_name_to_metadata: Dict[str, ScenarioMetadata] = {}
+        for scenario_spec in scenario_specs:
+            try:
+                scenario: Scenario = create_scenario(scenario_spec)
+                scenario_metadata = scenario.get_metadata()
+                scenario_name_to_metadata[scenario_metadata.name] = scenario_metadata
+            except NotImplementedError:
+                pass
+            except (ModuleNotFoundError, AttributeError, TypeError):
+                pass
+
+        run_groups: Set[str] = set()
+        for run in self.runs:
+            for run_group in run.run_spec.groups:
+                run_groups.add(run_group)
+
+        scenario_names_to_prune = set(scenario_name_to_metadata.keys()) - run_groups
+        for scenario_name_to_prune in scenario_names_to_prune:
+            del scenario_name_to_metadata[scenario_name_to_prune]
+        return list(scenario_name_to_metadata.values())
+
+    def scenario_metadata_to_run_group(self, scenario_metadata: ScenarioMetadata) -> RunGroup:
+        metric_group_names = [metric_group.name for metric_group in self.schema.metric_groups]
+        return RunGroup(
+            name=scenario_metadata.name,
+            display_name=scenario_metadata.display_name,
+            short_display_name=scenario_metadata.short_display_name,
+            description=scenario_metadata.description,
+            metric_groups=metric_group_names,
+            environment={
+                "main_name": scenario_metadata.main_metric,
+                "main_split": scenario_metadata.main_split,
+            },
+            taxonomy=scenario_metadata.taxonomy,
+        )
+
+    def auto_generate_all_scenarios_run_group(self) -> RunGroup:
+        return RunGroup(
+            name="all_scenarios",
+            display_name="All Scenarios",
+            description="All scenarios",
+            category="Scenario Groups",
+            subgroups=[run_group.name for run_group in self.schema.run_groups if len(run_group.subgroups) == 0],
+        )
+
+    def auto_generate_scenario_run_groups(self) -> List[RunGroup]:
+        return [
+            self.scenario_metadata_to_run_group(scenario_metadata) for scenario_metadata in self.get_scenario_metadata()
+        ]
+
+    def fix_up_schema(self) -> None:
+        # if not self.schema.run_groups:
+        if not self.schema.metrics:
+            self.schema = dataclasses.replace(self.schema, metrics=self.auto_generate_metric_fields())
+        # Can only auto-generate metric groups if metrics were also auto-generated
+        # because auto_generate_metric_groups() requires self.metric_metadata()
+        # which is populated by auto_generate_metric_fields()
+        if not self.schema.metric_groups:
+            self.schema = dataclasses.replace(self.schema, metric_groups=self.auto_generate_metric_groups())
+        if not any([len(run_group.subgroups) == 0 for run_group in self.schema.run_groups]):
+            self.schema = dataclasses.replace(
+                self.schema, run_groups=self.schema.run_groups + self.auto_generate_scenario_run_groups()
+            )
+        if not any([len(run_group.subgroups) > 0 for run_group in self.schema.run_groups]):
+            self.schema = dataclasses.replace(
+                self.schema, run_groups=[self.auto_generate_all_scenarios_run_group()] + self.schema.run_groups
+            )
+
+    def write_schema(self) -> None:
         """Write the schema file to benchmark_output so the frontend knows about it."""
+        # Manually add the model metadata to the schema.json, where the frontend expects it.
+        # TODO: Move model metadata out of schema.json into its own model_metadata.json file.
+        raw_schema = asdict_without_nones(self.schema)
+        raw_schema["models"] = self.get_model_field_dicts()
         write(
             os.path.join(self.run_release_path, "schema.json"),
-            json.dumps(
+            json.dumps(raw_schema, indent=2),
         )
 
     def read_runs(self):
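
Because Schema is a frozen dataclass, fix_up_schema never mutates it; each auto-generation step swaps in a fresh copy via dataclasses.replace. A minimal sketch of that update pattern (FrozenSchema is a stand-in here, not the real helm.benchmark.presentation.schema.Schema):

    import dataclasses
    from typing import List

    @dataclasses.dataclass(frozen=True)
    class FrozenSchema:
        metrics: List[str] = dataclasses.field(default_factory=list)
        metric_groups: List[str] = dataclasses.field(default_factory=list)

    schema = FrozenSchema()
    if not schema.metrics:
        # replace() returns a new frozen instance; the original is untouched.
        schema = dataclasses.replace(schema, metrics=["exact_match"])
    print(schema)  # FrozenSchema(metrics=['exact_match'], metric_groups=[])
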
@@ -450,136 +683,6 @@ class Summarizer:
         for suite, run_suite_path in zip(self.suites, self.run_suite_paths):
             self.read_runs_for_suite(suite, run_suite_path)
 
-    def read_overlap_stats(self):
-        """
-        Load the overlap stats in the run suite path.
-        Concretely:
-        - get group -> scenario_spec information from self.runs
-          run_spec data
-        - read the files in the data_overlap directory in run_suite_path
-          which are scenario_spec -> overlap ids
-        - get aggregate stats for group -> overlap ratio
-        """
-
-        def get_group_to_scenario_specs(run_specs: List[RunSpec]) -> Dict[str, List[ScenarioSpec]]:
-            scenario_specs_to_groups: Dict[ScenarioSpec, List[str]] = {}
-            for run_spec in run_specs:
-                scenario_spec = run_spec.scenario_spec
-                groups = run_spec.groups
-                if (
-                    scenario_spec.class_name
-                    != "helm.benchmark.scenarios.synthetic_efficiency_scenario.SyntheticEfficiencyScenario"
-                ):
-                    scenario_specs_to_groups[scenario_spec] = groups
-
-            group_to_scenario_specs: Dict[str, List[ScenarioSpec]] = {}
-            for scenario_spec, groups in scenario_specs_to_groups.items():
-                for group in groups:
-                    if group not in group_to_scenario_specs:
-                        group_to_scenario_specs[group] = []
-                    group_to_scenario_specs[group].append(scenario_spec)
-            return group_to_scenario_specs
-
-        def get_stats_file_metadata(data_overlap_dir: str) -> Dict[str, List[str]]:
-            """
-            Takes the data_overlap_dir as input and returns a dictionary
-            of stats_file_path -> List(model_names)
-
-            Sample input:
-            file_models_mapping:
-              - file_name: file1
-                model_names:
-                  - model1
-                  - model2
-              - file_name: file2
-                model_names:
-                  - model2
-                  - model3
-
-            """
-            metadata_file_path: str = os.path.join(data_overlap_dir, "metadata.yaml")
-            if not os.path.exists(metadata_file_path):
-                return {}
-
-            with open(metadata_file_path, "r") as yaml_file:
-                data = yaml.safe_load(yaml_file)
-
-            file_metadata: Dict[str, List[str]] = {}
-            for entry in data["file_models_mapping"]:
-                if "file_name" in entry and "model_names" in entry:
-                    file_path: str = os.path.join(data_overlap_dir, entry["file_name"])
-                    file_metadata[file_path] = entry["model_names"]
-
-            return file_metadata
-
-        self._model_group_overlap_stats: Dict[Tuple[str, str], GroupOverlapStats] = {}
-
-        data_overlap_dir = os.path.join(self.run_release_path, "data_overlap")
-        if not os.path.isdir(data_overlap_dir):
-            hlog(f"Directory {data_overlap_dir} not found; skipped import of overlap results.")
-            return
-
-        group_to_scenario_specs = get_group_to_scenario_specs([run.run_spec for run in self.runs])
-
-        stats_file_metadata = get_stats_file_metadata(data_overlap_dir)
-
-        for file_path, model_names in stats_file_metadata.items():
-            overlap_stats_jsons = open(file_path, "r").readlines()
-
-            data_overlap_stats_list: List[DataOverlapStats] = []
-            for overlap_stats_json in overlap_stats_jsons:
-                overlap_stats_dict = json.loads(overlap_stats_json)
-                data_overlap_stats_list.append(cattrs.structure(overlap_stats_dict, DataOverlapStats))
-
-            scenario_spec_overlap_counts: Dict[ScenarioSpec, Tuple[int, int, int]] = {}
-            for data_overlap_stats in data_overlap_stats_list:
-                data_overlap_stats_key = data_overlap_stats.data_overlap_stats_key
-                n = data_overlap_stats_key.overlap_protocol_spec.n
-                if n == OVERLAP_N_COUNT:
-                    light_scenario_key = data_overlap_stats_key.light_scenario_key
-                    scenario_spec = light_scenario_key.scenario_spec
-                    if scenario_spec in self.scenario_spec_instance_id_dict:
-                        # Get statistics based on the subset of instance_ids that HELM uses for a scenario
-                        instance_ids = self.scenario_spec_instance_id_dict[scenario_spec]
-                        num_instances = len(instance_ids)
-                        num_overlapping_inputs = len(
-                            set(data_overlap_stats.instance_ids_with_overlapping_input) & set(instance_ids)
-                        )
-                        num_overlapping_references = len(
-                            set(data_overlap_stats.instance_ids_with_overlapping_reference) & set(instance_ids)
-                        )
-                        scenario_spec_overlap_counts[scenario_spec] = (
-                            num_instances,
-                            num_overlapping_inputs,
-                            num_overlapping_references,
-                        )
-
-            for group, scenario_specs in group_to_scenario_specs.items():
-                group_num_instances = 0
-                group_num_overlapping_inputs = 0
-                group_num_overlapping_references = 0
-                for scenario_spec in scenario_specs:
-                    if scenario_spec in scenario_spec_overlap_counts:
-                        (
-                            num_instances,
-                            num_overlapping_inputs,
-                            num_overlapping_references,
-                        ) = scenario_spec_overlap_counts[scenario_spec]
-                        group_num_instances += num_instances
-                        group_num_overlapping_inputs += num_overlapping_inputs
-                        group_num_overlapping_references += num_overlapping_references
-                if group_num_instances != 0:
-                    group_overlap_stats = GroupOverlapStats(
-                        group=group,
-                        num_instances=group_num_instances,
-                        num_overlapping_inputs=group_num_overlapping_inputs,
-                        num_overlapping_references=group_num_overlapping_references,
-                    )
-                    for model_name in model_names:
-                        # Assume model name will only be associated with single group overlap list for now
-                        # can update to join lists if need arises
-                        self._model_group_overlap_stats[(model_name, group)] = group_overlap_stats
-
     @htrack(None)
     def check_metrics_defined(self):
         """Check that all the metrics that appear in stats are defined."""
@@ -593,8 +696,8 @@ class Summarizer:
 
         for metric_name, run_spec_names in metric_name_to_run_spec_names.items():
             if metric_name not in defined_metric_names:
-
-                    f"
+                hwarn(
+                    f"metric name {metric_name} undefined in {self.schema_path} "
                     f"but appears in {len(run_spec_names)} run specs, including {run_spec_names[0]}"
                 )
 
@@ -726,9 +829,6 @@ class Summarizer:
             num_prompt_tokens.extend(get_all_stats_by_name(run.stats, "num_prompt_tokens"))
             num_completion_tokens.extend(get_all_stats_by_name(run.stats, "num_completion_tokens"))
 
-            if len(num_instances) == 0:
-                continue
-
             rows.append(
                 [
                     Cell(group.display_name, href=get_benchmarking_url({"group": group.name})),
@@ -762,9 +862,9 @@ class Summarizer:
         self,
         runs: List[Run],
         matcher: MetricNameMatcher,
-        contamination_level: Optional[str],
         additional_info: Optional[str],
         hide_value: bool = False,
+        is_scenario_table: bool = False,
     ) -> Cell:
         """
         Use the metric name identified by `matcher` to pull out the stats from
@@ -790,8 +890,8 @@ class Summarizer:
             if stat is None:
                 # Print out near misses to provide a more informative warning
                 near_misses = [stat for stat in run.stats if stat.name.name == matcher.name]
-
-                    f"
+                hwarn(
+                    f"run spec {run.run_spec.name} does not have any stat matched by {matcher}, "
                     f"{len(near_misses)} near misses matching just the name"
                 )
                 if len(near_misses) > 0:
@@ -818,23 +918,38 @@ class Summarizer:
         if self.verbose:
             description += "\n-- ".join(["\nRun specs:", *aggregated_run_specs])
 
-
-
-
-
-
+        # Link the runs that this cell was aggregated from, if this is not a scenario table.
+        # Scenario tables link to the runs in the model cells,
+        # whereas non-scenario tables link to the runs in the metrics cells.
+        run_spec_names: Optional[List] = None
+        if not is_scenario_table:
+            # Deduplicate run spec names becuase aggregated_run_specs may have duplicated
+            # run specs if a run spec belongs to multiple groups.
+            run_spec_names = []
+            run_spec_names_set = set()
+            for run_spec_name in aggregated_run_specs:
+                if run_spec_name not in run_spec_names_set:
+                    run_spec_names.append(run_spec_name)
+                    run_spec_names_set.add(run_spec_name)
+
+        return Cell(
+            value=value,
+            description=description,
+            style={},
+            run_spec_names=run_spec_names,
+        )
 
     def create_group_table(
         self,
         name: str,
         title: str,
         adapter_to_runs: Dict[AdapterSpec, List[Run]],
-
+        is_scenario_table: bool,
         columns: List[Tuple[RunGroup, str]],  # run_group, metric_group
         sort_by_model_order: bool = True,
         sub_split: Optional[str] = None,
         bold_columns: bool = True,
-
+        aggregation_strategies: List[str] = [],
     ) -> Table:
         """
         Create a table for where each row is an adapter (for which we have a set of runs) and columns are pairs of
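
The run-spec deduplication in create_cell above preserves first-seen order with an explicit list-plus-set pair. On Python 3.7+ the same order-preserving dedup can be written with dict.fromkeys; shown here only as an equivalent idiom, not as what the source does:

    aggregated_run_specs = ["mmlu:model=a", "mmlu:model=a", "boolq:model=a"]  # toy values
    run_spec_names = list(dict.fromkeys(aggregated_run_specs))
    print(run_spec_names)  # ['mmlu:model=a', 'boolq:model=a']
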
@@ -847,7 +962,7 @@ class Summarizer:
         # Create header (cells to display) and the list of metric name filters
         # (to pull out information later).
         if not columns or not adapter_to_runs:
-
+            hwarn(f"table {title}, has no rows or columns, leaving empty")
             return Table("empty", [], [])
 
         header: List[HeaderCell] = []
@@ -856,7 +971,7 @@ class Summarizer:
         num_groups = len(set(run_group.name for run_group, _ in columns))  # number of unique groups, determines headers
 
         # Column headers
-        header.append(HeaderCell(
+        header.append(HeaderCell(MODEL_HEADER_CELL_VALUE))
         for run_group, metric_group_name in columns:
             # check if at least the basic version of a metric group is evaluated (e.g., "bias" for "bias_detailed")
             if metric_group_name.replace("_detailed", "") not in run_group.metric_groups:
@@ -868,7 +983,7 @@ class Summarizer:
                 matcher = replace(matcher, sub_split=sub_split)
             header_field = self.schema.name_to_metric.get(matcher.name)
             if header_field is None:
-
+                hwarn(f"metric name {matcher.name} undefined in {self.schema_path}, skipping")
                 continue
             metadata = {
                 "metric": header_field.get_short_display_name(),
@@ -876,7 +991,8 @@ class Summarizer:
             }
 
             header_name = header_field.get_short_display_name()
-
+            run_group_short_description = run_group.short_description or run_group.description or ""
+            description = (run_group_short_description + "\n\n" if run_group_short_description else "") + (
                 (header_field.display_name if header_field.display_name else header_field.name)
                 + ": "
                 + (header_field.description if header_field.description is not None else "")
@@ -921,10 +1037,10 @@ class Summarizer:
 
         adapter_specs: List[AdapterSpec] = list(adapter_to_runs.keys())
         if sort_by_model_order:
-            # Sort models by the order defined in the
-            # Models not defined in the
-            # placed before models in defined the
-            model_order =
+            # Sort models by the order defined in the the model metadata config.
+            # Models not defined in the model metadata config will be sorted alphabetically and
+            # placed before models in defined the model metadata config.
+            model_order = get_all_models()
 
             def _adapter_spec_sort_key(spec):
                 index = model_order.index(spec.model_deployment) if spec.model_deployment in model_order else -1
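
The sort key explains the comment above: unknown deployments get index -1, and since tuples compare element-wise, they sort alphabetically among themselves ahead of every model listed in the metadata config. A self-contained sketch with made-up deployment names:

    from typing import List, Tuple

    model_order: List[str] = ["openai/gpt-4", "anthropic/claude-2"]
    deployments = ["zlab/new-model", "anthropic/claude-2", "alab/new-model", "openai/gpt-4"]

    def sort_key(deployment: str) -> Tuple[int, str]:
        # -1 for anything not in model_order, so unknown models sort first.
        index = model_order.index(deployment) if deployment in model_order else -1
        return (index, deployment)

    print(sorted(deployments, key=sort_key))
    # ['alab/new-model', 'zlab/new-model', 'openai/gpt-4', 'anthropic/claude-2']
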
@@ -948,21 +1064,18 @@ class Summarizer:
             runs = adapter_to_runs[adapter_spec]
             display_name = get_method_display_name(model_metadata.display_name, info)
 
-            # Link
-
+            # Link the runs that this row was aggregated from, if this is a scenario table.
+            # Scenario tables link to the runs in the model cells,
+            # whereas non-scenario tables link to the runs in the metrics cells.
+            run_spec_names: Optional[List[str]]
+            if is_scenario_table:
                 run_spec_names = [run.run_spec.name for run in runs]
                 href = run_spec_names_to_url(run_spec_names)
             else:
+                run_spec_names = None
                 href = None
 
-
-            point = self.contamination.get_point(model_name, columns[0][0].name)
-            if num_groups == 1 and point is not None:  # display contamination information at the adapter level
-                cells = [
-                    Cell(display_name + CONTAMINATION_SYMBOLS[point.level], description=point.description, href=href)
-                ]
-            else:
-                cells = [Cell(display_name, description="", href=href)]
+            cells = [Cell(display_name, description="", href=href, run_spec_names=run_spec_names)]
             assert len(group_names) == len(matchers)
             for group_name, matcher in zip(group_names, matchers):
                 group_runs = [run for run in runs if group_name in run.run_spec.groups]
@@ -971,23 +1084,7 @@ class Summarizer:
                 if "babi" in group_name and "task:" not in name:
                     group_runs = [run for run in group_runs if "task=all" in run.run_spec.name]
 
-
-                if point is not None:
-                    description = CONTAMINATION_SYMBOLS[point.level] + " " + point.description
-                    contamination_level = point.level
-                else:
-                    description = ""
-                    contamination_level = None
-
-                group_overlap_stats = None
-                if (model_name, group_name) in self._model_group_overlap_stats:
-                    group_overlap_stats = self._model_group_overlap_stats[(model_name, group_name)]
-
-                    description = (
-                        f"Overlapping input ratio: {group_overlap_stats.overlapping_input_ratio:.3f}\n"
-                        f"Overlapping reference ratio: {group_overlap_stats.overlapping_reference_ratio:.3f}\n"
-                        f"{description}"
-                    )
+                description = ""
 
                 # HACK: we want to hide stats for the following model-metric combinations:
                 # 1. Calibration metrics + AI21/Anthropic
@@ -999,9 +1096,9 @@ class Summarizer:
                     self.create_cell(
                         group_runs,
                         matcher,
-                        contamination_level,
                         additional_info=description,
                         hide_value=hide_value,
+                        is_scenario_table=is_scenario_table,
                     )
                 )
 
@@ -1011,12 +1108,12 @@ class Summarizer:
         # There could be a ton of runs, so only do this if there are 2-5
         # TODO: replace in frontend with a selector to choose which rows to visualize.
         links = []
-        if
+        if is_scenario_table:
             all_run_spec_names = []
             for adapter_spec, runs in adapter_to_runs.items():
                 if len(runs) > 1:
-
-                        f"
+                    hwarn(
+                        f"table row corresponding to adapter spec {adapter_spec} has {len(runs)} > 1 runs:"
                         f" {[run.run_spec.name for run in runs]}"
                     )
                 for run in runs:
@@ -1026,21 +1123,42 @@ class Summarizer:
 
         table = Table(title=title, header=header, rows=rows, links=links, name=name)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        aggregate_header_cells: List[HeaderCell] = []
+        aggregate_row_values: List[List[Optional[float]]] = []
+
+        for strategy in aggregation_strategies:
+            if strategy == AggregationStrategy.WIN_RATE:
+                WIN_RATE_AGGREGATION = "mean"
+                win_rates = compute_aggregate_row_win_rates(table, aggregation=WIN_RATE_AGGREGATION)
+                aggregate_header_cells.append(
+                    HeaderCell(
+                        f"{WIN_RATE_AGGREGATION.capitalize()} win rate",
+                        description="How many models this model outperforms on average (over columns).",
+                        lower_is_better=False,
+                    )
+                )
+                aggregate_row_values.append(win_rates)
+            elif strategy == AggregationStrategy.MEAN:
+                means = compute_aggregate_row_means(table)
+                aggregate_header_cells.append(
+                    HeaderCell(
+                        "Mean score",
+                        description="The mean of the scores from all columns.",
+                        lower_is_better=table.header[0].lower_is_better,
+                    )
+                )
+                aggregate_row_values.append(means)
+            else:
+                raise Exception(
+                    f"Unknown aggregation strategy found: {strategy}. Please use one of: {ALL_AGGREGATION_STRATEGIES}"
+                )
+
+        for i in range(len(aggregate_header_cells)):
+            aggregate_header_cell = aggregate_header_cells[i]
+            aggregate_rows = aggregate_row_values[i]
+            table.header.insert(i + 1, aggregate_header_cell)
+            for row, row_val in zip(table.rows, aggregate_rows):
+                row.insert(i + 1, Cell(row_val))
 
         if bold_columns:
             for i, header_cell in enumerate(table.header):
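
The insert-at-i-plus-one loop places each aggregate column directly after the first ("Model") column, in the order the strategies were configured. A toy illustration with plain strings in place of HeaderCell objects:

    header = ["Model", "MMLU", "BoolQ"]  # illustrative column names
    aggregates = ["Mean win rate", "Mean score"]
    for i, name in enumerate(aggregates):
        header.insert(i + 1, name)
    print(header)  # ['Model', 'Mean win rate', 'Mean score', 'MMLU', 'BoolQ']
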
@@ -1088,16 +1206,25 @@ class Summarizer:
 
         if len(adapter_to_runs) > 0:
             for metric_group in all_metric_groups:
-
+                metric_group_config = self.schema.name_to_metric_group[metric_group]
+                display_name = metric_group_config.get_short_display_name()
+                aggregate_strategies: List[str]
+                if metric_group_config.aggregation_strategies is not None:
+                    aggregate_strategies = metric_group_config.aggregation_strategies
+                elif metric_group_config.hide_win_rates:
+                    aggregate_strategies = []
+                else:
+                    aggregate_strategies = [AggregationStrategy.WIN_RATE]
                 table = self.create_group_table(
                     name=metric_group,
                     title=display_name,
                     adapter_to_runs=adapter_to_runs,
                     columns=[(subgroup, metric_group) for subgroup in subgroups],
-
-
+                    is_scenario_table=False,
+                    aggregation_strategies=aggregate_strategies,
                 )
-
+                if len(table.header) > 1:
+                    tables.append(table)
         return tables
 
     def create_group_tables_by_subgroup(self, group: RunGroup) -> List[Table]:
@@ -1126,7 +1253,7 @@ class Summarizer:
                         name=scenario_name,
                         adapter_to_runs=adapter_to_runs,
                         columns=columns,
-
+                        is_scenario_table=True,
                     )
                     tables.append(table)
                     scenarios_shown += 1
@@ -1138,7 +1265,7 @@ class Summarizer:
                         name=f"{subgroup.name}:sub_split={sub_split}",
                         adapter_to_runs=adapter_to_runs,
                         columns=columns,
-
+                        is_scenario_table=False,
                         sub_split=sub_split,
                     )
                     tables.append(table)
@@ -1158,7 +1285,7 @@ class Summarizer:
                 name=subgroup.name,
                 adapter_to_runs=adapter_to_runs,
                 columns=columns,
-
+                is_scenario_table=False,
             )
             tables = [table] + tables
         all_tables.extend(tables)
@@ -1225,72 +1352,6 @@ class Summarizer:
 
         parallel_map(process, self.runs, parallelism=self.num_threads)
 
-    def read_scenario_spec_instance_ids(self, num_instances) -> None:
-        """
-        This file checks if there exists a file, scenario_spec_instance_ids.json
-        that it can read the instance_ids associated with scenario_specs.
-
-        It will write the num_instances used in the run as part of the file name
-
-        If it doesn't exist, it will go through all the scenario_state files
-        and parse the instance_ids and output it to the file for future uses
-
-        Only when the scenario_specs for the data overlap script change
-        (or num_instances are different), will this need to be rerun.
-
-        In such cases, do not include the file as part of the data_overlap directory.
-        """
-        self.scenario_spec_instance_id_dict: Dict[ScenarioSpec, List[str]] = dict()
-
-        data_overlap_dir = os.path.join(self.run_release_path, "data_overlap")
-        if not os.path.isdir(data_overlap_dir):
-            hlog(f"Directory {data_overlap_dir} not found; skipped producing instance ids file.")
-            return
-
-        scenario_spec_instance_ids_json = os.path.join(
-            data_overlap_dir, f"scenario_spec_instance_ids_{num_instances}.jsonl"
-        )
-        if not os.path.exists(scenario_spec_instance_ids_json):
-            hlog(f"No scenario spec instance ids json, writing to {scenario_spec_instance_ids_json}")
-            self.write_scenario_spec_instance_ids_json(scenario_spec_instance_ids_json)
-        else:
-            hlog(f"Reading scenario spec instance ids json from {scenario_spec_instance_ids_json}")
-            scenario_spec_instance_ids_jsons = open(scenario_spec_instance_ids_json, "r").readlines()
-
-            for scenario_spec_instance_ids_json in scenario_spec_instance_ids_jsons:
-                scenario_spec_instance_ids_dict = json.loads(scenario_spec_instance_ids_json)
-                scenario_spec_instance_ids = cattrs.structure(scenario_spec_instance_ids_dict, ScenarioSpecInstanceIds)
-                self.scenario_spec_instance_id_dict[
-                    scenario_spec_instance_ids.scenario_spec
-                ] = scenario_spec_instance_ids.instance_ids
-
-    def write_scenario_spec_instance_ids_json(self, file_path) -> None:
-        for run in self.runs:
-            run_spec = run.run_spec
-            scenario_spec = run_spec.scenario_spec
-            if scenario_spec in self.scenario_spec_instance_id_dict:
-                continue
-
-            run_path = run.run_path
-            instances_file_path = os.path.join(run_path, "instances.json")
-            with open(instances_file_path, "r") as f:
-                raw_instances = json.load(f)
-
-            # Optimization: Don't structure to dataclass, since we only need to read `id`
-            instance_ids = [raw_instance["id"] for raw_instance in raw_instances]
-            self.scenario_spec_instance_id_dict[scenario_spec] = instance_ids
-
-        all_scenario_spec_instance_ids = []
-        for scenario_spec, instance_ids in self.scenario_spec_instance_id_dict.items():
-            scenario_spec_instance_ids = ScenarioSpecInstanceIds(scenario_spec=scenario_spec, instance_ids=instance_ids)
-            all_scenario_spec_instance_ids.append(scenario_spec_instance_ids)
-
-        with open(file_path, "w") as f:
-            f.writelines(
-                f"{json.dumps(asdict_without_nones(scenario_spec_instance_ids))}\n"
-                for scenario_spec_instance_ids in all_scenario_spec_instance_ids
-            )
-
     def symlink_latest(self) -> None:
         # Create a symlink runs/latest -> runs/<name_of_suite>,
         # so runs/latest always points to the latest run suite.
@@ -1302,24 +1363,20 @@ class Summarizer:
             os.unlink(symlink_path)
         os.symlink(os.path.basename(self.run_release_path), symlink_path)
 
-    def run_pipeline(self, skip_completed: bool
+    def run_pipeline(self, skip_completed: bool) -> None:
         """Run the entire summarization pipeline."""
-        self.write_schema()
-
         self.read_runs()
         self.group_runs()
-        self.check_metrics_defined()
 
-        self.
-
-        # Must happen after summarizer.write_run_display_json()
-        # because it uses instances.json files
-        self.read_scenario_spec_instance_ids(num_instances)
+        ensure_directory_exists(self.run_release_path)
 
-        # Must happen after
-        # because it uses self.
-        self.
+        # Must happen after self.read_runs()
+        # because it uses self.runs
+        self.fix_up_schema()
+        self.check_metrics_defined()
+        self.write_schema()
 
+        self.write_run_display_json(skip_completed)
         self.write_executive_summary()
         self.write_runs()
         self.write_run_specs()
@@ -1331,16 +1388,70 @@ def main():


 @htrack("summarize")
+def summarize(args):
+    release: Optional[str] = None
+    suites: Optional[str] = None
+    suite: Optional[str] = None
+    if args.suite and (args.release or args.suites):
+        raise ValueError("If --suite is specified, then --release and --suites must NOT be specified.")
+    elif args.suite:
+        # Comment this out while we have a trial period for the `release` method.
+        # hlog(
+        #     "WARNING: The --suite flag is deprecated. Using --release and --suites is now preferred, "
+        #     "where --release specifies the name of a release and --suites specifies several run suites "
+        #     "to be included in that release."
+        # )
+        suite = args.suite
+    elif args.release or args.suites:
+        if not args.release or not args.suites:
+            raise ValueError("If --release is specified, then --suites must also be specified and vice versa")
+        release = args.release
+        suites = args.suites
+    else:
+        raise ValueError("Exactly one of --release or --suite must be specified.")
+
+    schema_path: Optional[str]
+    if args.auto_generate_schema:
+        if args.schema_path:
+            raise ValueError("--schema-path must be unset if --auto-generate-schema is set")
+        schema_path = None
+    elif args.schema_path:
+        schema_path = args.schema_path
+    else:
+        schema_path = get_default_schema_path()
+
+    register_builtin_configs_from_helm_package()
+    register_configs_from_directory(args.local_path)
+
+    # Output JSON files summarizing the benchmark results which will be loaded in the web interface
+    summarizer_cls = get_class_by_name(args.summarizer_class_name) if args.summarizer_class_name else Summarizer
+    summarizer = summarizer_cls(
+        release=release,
+        suites=suites,
+        suite=suite,
+        schema_path=schema_path,
+        output_path=args.output_path,
+        verbose=args.debug,
+        num_threads=args.num_threads,
+        allow_unknown_models=args.allow_unknown_models,
+    )
+    summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json)
+    hlog("Done.")
+
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "-o", "--output-path", type=str, help="Where the benchmarking output lives", default="benchmark_output"
+        "-o",
+        "--output-path",
+        type=str,
+        help="Where the benchmarking output lives",
+        default="benchmark_output",
     )
     parser.add_argument(
-        "--schema-file",
+        "--schema-path",
         type=str,
-        help="
-        default=SCHEMA_CLASSIC_YAML_FILENAME,
+        help="Path to the schema file (e.g., schema_classic.yaml).",
     )
     parser.add_argument(
         "--suite",
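summarize() resolves --summarizer-class-name through get_class_by_name and falls back to the default Summarizer, so a deployment can substitute its own subclass without forking the CLI. That helper is not shown in this diff; a minimal stand-in illustrating the usual importlib approach:

import importlib


def get_class_by_name(full_class_name: str) -> type:
    # Hypothetical re-implementation: split "pkg.module.ClassName" into module
    # path and class name, import the module, and fetch the class attribute.
    module_name, class_name = full_class_name.rsplit(".", 1)
    return getattr(importlib.import_module(module_name), class_name)


# Example with a standard-library class standing in for a Summarizer subclass:
cls = get_class_by_name("collections.OrderedDict")
assert cls.__name__ == "OrderedDict"

Resolving to a class rather than an instance lets the caller pass the same constructor keywords to the default and custom summarizers alike.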
@@ -1353,9 +1464,18 @@ def main():
         help="Experimental: Name of the release this summarization should go under.",
     )
     parser.add_argument(
-        "--suites",
+        "--suites",
+        type=str,
+        nargs="+",
+        help="Experimental: List of suites to summarize for this release.",
+    )
+    parser.add_argument(
+        "-n",
+        "--num-threads",
+        type=int,
+        help="Max number of threads used to summarize",
+        default=8,
     )
-    parser.add_argument("-n", "--num-threads", type=int, help="Max number of threads used to summarize", default=8)
     parser.add_argument(
         "--debug",
         action="store_true",
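Because --suites is now declared with nargs="+", argparse collects one or more space-separated suite names into a list, matching the plural help text. A self-contained check of that behavior (the suite names here are made up):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--release", type=str)
parser.add_argument("--suites", type=str, nargs="+")
parser.add_argument("-n", "--num-threads", type=int, default=8)

args = parser.parse_args(["--release", "v1", "--suites", "suite_a", "suite_b"])
assert args.suites == ["suite_a", "suite_b"]  # one or more values become a list
assert args.num_threads == 8                  # default applies when -n is omitted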
@@ -1366,12 +1486,6 @@ def main():
         action="store_true",
         help="Skip write_run_display_json() for runs which already have all output display JSON files",
     )
-    parser.add_argument(
-        "-num-instances",
-        type=int,
-        help="Number of instance ids we're using; only for annotating scenario spec instance ids file",
-        default=1000,
-    )
     parser.add_argument(
         "--local-path",
         type=str,
@@ -1384,45 +1498,26 @@ def main():
         help="Whether to allow unknown models in the metadata file",
         default=True,
     )
-    args = parser.parse_args()
-
-    release: Optional[str] = None
-    suites: Optional[str] = None
-    suite: Optional[str] = None
-    if args.suite and (args.release or args.suites):
-        raise ValueError("If --suite is specified, then --release and --suites must NOT be specified.")
-    elif args.suite:
-        # Comment this out while we have a trial period for the `release` method.
-        # hlog(
-        #     "WARNING: The --suite flag is deprecated. Using --release and --suites is now preferred, "
-        #     "where --release specifies the name of a release and --suites specifies several run suites "
-        #     "to be included in that release."
-        # )
-        suite = args.suite
-    elif args.release or args.suites:
-        if not args.release or not args.suites:
-            raise ValueError("If --release is specified, then --suites must also be specified and vice versa")
-        release = args.release
-        suites = args.suites
-    else:
-        raise ValueError("Exactly one of --release or --suite must be specified.")
-
-    register_builtin_configs_from_helm_package()
-    register_configs_from_directory(args.local_path)
-
-    # Output JSON files summarizing the benchmark results which will be loaded in the web interface
-    summarizer = Summarizer(
-        release=release,
-        suites=suites,
-        suite=suite,
-        schema_file=args.schema_file,
-        output_path=args.output_path,
-        verbose=args.debug,
-        num_threads=args.num_threads,
-        allow_unknown_models=args.allow_unknown_models,
+    parser.add_argument(
+        "--summarizer-class-name",
+        type=str,
+        default=None,
+        help="EXPERIMENTAL: Full class name of the Summarizer class to use. If unset, uses the default Summarizer.",
     )
-    summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json, num_instances=args.num_instances)
-    hlog("Done.")
+    parser.add_argument(
+        "--log-config",
+        type=str,
+        default=None,
+        help="PATH to a YAML file to customize logging",
+    )
+    parser.add_argument(
+        "--auto-generate-schema",
+        action="store_true",
+        help="EXPERIMENTAL: Auto-generate schema",
+    )
+    args = parser.parse_args()
+    setup_default_logging(args.log_config)
+    summarize(args)


 if __name__ == "__main__":