crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
- crfm_helm-0.5.0.dist-info/RECORD +642 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +37 -2
- helm/benchmark/adaptation/adapters/adapter.py +4 -42
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/prompt.py +7 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/cleva_perturbation.py +7 -14
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
- helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +2 -2
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +0 -7
- helm/benchmark/augmentations/perturbation.py +20 -7
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +2 -2
- helm/benchmark/augmentations/test_perturbation.py +11 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +38 -0
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +37 -7
- helm/benchmark/metrics/basic_metrics.py +172 -641
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics.py +4 -3
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +6 -112
- helm/benchmark/metrics/dry_run_metrics.py +5 -3
- helm/benchmark/metrics/efficiency_metrics.py +206 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +5 -5
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +6 -7
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +8 -8
- helm/benchmark/metrics/test_classification_metrics.py +9 -6
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +164 -41
- helm/benchmark/model_metadata_registry.py +181 -35
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +50 -17
- helm/benchmark/presentation/schema.py +28 -46
- helm/benchmark/presentation/summarize.py +213 -96
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +14 -9
- helm/benchmark/presentation/test_summarize.py +5 -0
- helm/benchmark/run.py +66 -54
- helm/benchmark/run_expander.py +342 -31
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +162 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +501 -0
- helm/benchmark/runner.py +116 -69
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/cleva_scenario.py +43 -46
- helm/benchmark/scenarios/code_scenario.py +3 -2
- helm/benchmark/scenarios/commonsense_scenario.py +171 -191
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/legalbench_scenario.py +123 -0
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +3 -3
- helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
- helm/benchmark/scenarios/raft_scenario.py +2 -6
- helm/benchmark/scenarios/scenario.py +14 -2
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +22 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/the_pile_scenario.py +6 -7
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
- helm/benchmark/server.py +59 -2
- helm/benchmark/slurm_jobs.py +12 -0
- helm/benchmark/slurm_runner.py +79 -51
- helm/benchmark/static/benchmarking.js +3 -4
- helm/benchmark/static/contamination.yaml +1 -1
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +824 -0
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vlm.yaml +576 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
- helm/benchmark/static_build/assets/index-d839df55.js +9 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_model_deployment_definition.py +90 -0
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/tokenizer_config_registry.py +10 -14
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -35
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/no_decoding_window_service.py +32 -0
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +24 -269
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +5 -12
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +213 -24
- helm/clients/auto_client.py +215 -0
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +67 -55
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +6 -17
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +7 -8
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +6 -10
- helm/{proxy/clients → clients}/huggingface_client.py +134 -92
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
- helm/{proxy/clients → clients}/megatron_client.py +13 -7
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +302 -0
- helm/{proxy/clients → clients}/palmyra_client.py +15 -12
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +15 -15
- helm/clients/test_client.py +100 -0
- helm/clients/test_huggingface_client.py +70 -0
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +23 -12
- helm/{proxy/clients → clients}/together_client.py +18 -71
- helm/clients/vertexai_client.py +391 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vlm_client.py +104 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +24 -179
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/concurrency.py +32 -0
- helm/common/credentials_utils.py +28 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +29 -10
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +24 -1
- helm/common/key_value_store.py +113 -0
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +88 -0
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +2 -2
- helm/common/request.py +36 -27
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +6 -3
- helm/config/__init__.py +0 -0
- helm/config/model_deployments.yaml +1942 -0
- helm/config/model_metadata.yaml +2201 -0
- helm/config/tokenizer_configs.yaml +362 -0
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +13 -5
- helm/proxy/example_queries.py +29 -17
- helm/proxy/retry.py +8 -2
- helm/proxy/server.py +77 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +103 -20
- helm/proxy/services/service.py +34 -2
- helm/proxy/services/test_remote_service.py +7 -6
- helm/proxy/services/test_service.py +27 -18
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/py.typed +0 -0
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
- helm/tokenizers/auto_tokenizer.py +93 -0
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_anthropic_tokenizer.py +82 -0
- helm/tokenizers/test_huggingface_tokenizer.py +136 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/tokenizers/vertexai_tokenizer.py +97 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.3.0.dist-info/RECORD +0 -396
- helm/benchmark/vlm_run_specs.py +0 -71
- helm/benchmark/window_services/anthropic_window_service.py +0 -68
- helm/benchmark/window_services/bloom_window_service.py +0 -35
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/gptj_window_service.py +0 -38
- helm/benchmark/window_services/gptneox_window_service.py +0 -41
- helm/benchmark/window_services/http_model_window_service.py +0 -28
- helm/benchmark/window_services/huggingface_window_service.py +0 -59
- helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
- helm/benchmark/window_services/llama_window_service.py +0 -28
- helm/benchmark/window_services/luminous_window_service.py +0 -67
- helm/benchmark/window_services/megatron_window_service.py +0 -10
- helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
- helm/benchmark/window_services/openai_window_service.py +0 -13
- helm/benchmark/window_services/opt_window_service.py +0 -35
- helm/benchmark/window_services/palmyra_window_service.py +0 -45
- helm/benchmark/window_services/remote_window_service.py +0 -48
- helm/benchmark/window_services/santacoder_window_service.py +0 -27
- helm/benchmark/window_services/starcoder_window_service.py +0 -27
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/benchmark/window_services/wider_openai_window_service.py +0 -52
- helm/proxy/clients/aleph_alpha_client.py +0 -99
- helm/proxy/clients/auto_client.py +0 -461
- helm/proxy/clients/goose_ai_client.py +0 -100
- helm/proxy/clients/microsoft_client.py +0 -182
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/remote_model_registry.py +0 -28
- helm/proxy/clients/simple_client.py +0 -61
- helm/proxy/clients/test_anthropic_client.py +0 -63
- helm/proxy/clients/test_client.py +0 -31
- helm/proxy/clients/test_huggingface_client.py +0 -87
- helm/proxy/models.py +0 -963
- helm/proxy/test_models.py +0 -27
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
- helm/proxy/token_counters/test_openai_token_counter.py +0 -79
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
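Two of the largest mechanical changes in this list are the moves of `helm/proxy/clients/*` to `helm/clients/*` and of `helm/proxy/tokenizers/*` to `helm/tokenizers/*`. A minimal sketch of what that means for downstream imports, using module paths taken from the list above (the class names are assumptions for illustration, not confirmed by this diff):

```python
# Import-path migration implied by the renames above (0.3.0 -> 0.5.0).
# Only the module paths come from the file list; the class names are
# the usual ones in HELM and are an assumption here.

# Before (0.3.0):
#   from helm.proxy.clients.huggingface_client import HuggingFaceClient
#   from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

# After (0.5.0):
from helm.clients.huggingface_client import HuggingFaceClient
from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
```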
|
@@ -21,7 +21,8 @@ from statistics import mean, median
|
|
|
21
21
|
from typing import List, Optional, Dict, Any, Tuple, Set
|
|
22
22
|
|
|
23
23
|
from tqdm import tqdm
|
|
24
|
-
|
|
24
|
+
from helm.benchmark.model_deployment_registry import get_model_deployment
|
|
25
|
+
from helm.benchmark.model_metadata_registry import get_unknown_model_metadata
|
|
25
26
|
from helm.common.general import (
|
|
26
27
|
write,
|
|
27
28
|
ensure_directory_exists,
|
|
@@ -39,25 +40,22 @@ from helm.benchmark.data_overlap.light_scenario import ScenarioSpecInstanceIds
|
|
|
39
40
|
from helm.benchmark.metrics.metric_name import MetricName
|
|
40
41
|
from helm.benchmark.metrics.metric import get_all_stats_by_name
|
|
41
42
|
from helm.benchmark.metrics.statistic import Stat, merge_stat
|
|
42
|
-
from helm.benchmark.
|
|
43
|
+
from helm.benchmark.run_spec import RunSpec
|
|
44
|
+
from helm.benchmark.runner import LATEST_SYMLINK
|
|
43
45
|
from helm.benchmark.presentation.table import Cell, HeaderCell, Table, Hyperlink, table_to_latex
|
|
44
46
|
from helm.benchmark.presentation.schema import (
|
|
45
47
|
MetricNameMatcher,
|
|
46
48
|
RunGroup,
|
|
49
|
+
Field,
|
|
47
50
|
read_schema,
|
|
48
|
-
|
|
51
|
+
get_default_schema_path,
|
|
49
52
|
BY_GROUP,
|
|
50
53
|
THIS_GROUP_ONLY,
|
|
51
54
|
NO_GROUPS,
|
|
52
55
|
)
|
|
53
|
-
from helm.benchmark.
|
|
54
|
-
read_contamination,
|
|
55
|
-
validate_contamination,
|
|
56
|
-
CONTAMINATION_SYMBOLS,
|
|
57
|
-
CONTAMINATION_STYLES,
|
|
58
|
-
CONTAMINATION_LEVEL_STRONG,
|
|
59
|
-
)
|
|
56
|
+
from helm.benchmark.config_registry import register_builtin_configs_from_helm_package, register_configs_from_directory
|
|
60
57
|
from helm.benchmark.presentation.run_display import write_run_display_json
|
|
58
|
+
from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata, get_all_models
|
|
61
59
|
|
|
62
60
|
|
|
63
61
|
OVERLAP_N_COUNT = 13
|
|
@@ -139,6 +137,38 @@ def get_scenario_name(group: RunGroup, scenario_spec: ScenarioSpec):
|
|
|
139
137
|
return group.name + "_" + dict_to_str(scenario_spec.args).replace(" ", "").replace("/", "_")
|
|
140
138
|
|
|
141
139
|
|
|
140
|
+
def get_model_metadata_for_adapter_spec(adapter_spec: AdapterSpec) -> ModelMetadata:
|
|
141
|
+
"""Return the ModelMetadata for the model in the given AdapterSpec."""
|
|
142
|
+
# Get model metadata based on `model` in `adapter_spec`
|
|
143
|
+
try:
|
|
144
|
+
return get_model_metadata(adapter_spec.model)
|
|
145
|
+
except ValueError:
|
|
146
|
+
pass
|
|
147
|
+
|
|
148
|
+
# Get model metadata based on `model_deployment` in `adapter_spec`
|
|
149
|
+
try:
|
|
150
|
+
model_deployment = get_model_deployment(adapter_spec.model_deployment)
|
|
151
|
+
if model_deployment.model_name:
|
|
152
|
+
return get_model_metadata(model_deployment.model_name)
|
|
153
|
+
except ValueError:
|
|
154
|
+
pass
|
|
155
|
+
|
|
156
|
+
# In some cases, some models were renamed such that the old model name is now the model deployment name
|
|
157
|
+
# For instance, the model called "huggingface/gpt2" is now called "openai/gpt2", but its model deployment
|
|
158
|
+
# is still called "huggingface/gpt2".
|
|
159
|
+
# Handle these cases here.
|
|
160
|
+
# TODO: Delete this block eventually.
|
|
161
|
+
try:
|
|
162
|
+
model_deployment = get_model_deployment(adapter_spec.model)
|
|
163
|
+
if model_deployment.model_name:
|
|
164
|
+
return get_model_metadata(model_deployment.model_name)
|
|
165
|
+
except ValueError:
|
|
166
|
+
pass
|
|
167
|
+
|
|
168
|
+
# Return a placeholder "unknown model" model metadata.
|
|
169
|
+
return get_unknown_model_metadata(adapter_spec.model)
|
|
170
|
+
|
|
171
|
+
|
|
142
172
|
def get_coarse_adapter_spec(
|
|
143
173
|
adapter_spec: AdapterSpec, scenario_spec: Optional[ScenarioSpec] = None, adapter_keys_shown: List[str] = []
|
|
144
174
|
) -> AdapterSpec:
|
|
@@ -165,7 +195,7 @@ def get_coarse_adapter_spec(
|
|
|
165
195
|
|
|
166
196
|
# Create a new adapter_spec, keeping only the model and the keys in adapter_keys_shown
|
|
167
197
|
adapter_spec_kwargs = {key: adapter_spec.__dict__[key] for key in adapter_keys_shown}
|
|
168
|
-
return AdapterSpec(**adapter_spec_kwargs)
|
|
198
|
+
return AdapterSpec(**adapter_spec_kwargs)
|
|
169
199
|
|
|
170
200
|
|
|
171
201
|
def get_method_display_name(model_display_name: Optional[str], info: Dict[str, Any]) -> str:
|
|
@@ -178,6 +208,8 @@ def get_method_display_name(model_display_name: Optional[str], info: Dict[str, A
|
|
|
178
208
|
info = dict(info)
|
|
179
209
|
if "model" in info:
|
|
180
210
|
del info["model"]
|
|
211
|
+
if "model_deployment" in info:
|
|
212
|
+
del info["model_deployment"]
|
|
181
213
|
|
|
182
214
|
return (model_display_name or "???") + (f" [{dict_to_str(info)}]" if len(info) > 0 else "")
|
|
183
215
|
|
|
@@ -199,15 +231,7 @@ def compute_aggregate_row_win_rates(table: Table, aggregation: str = "mean") ->
|
|
|
199
231
|
if lower_is_better is None: # column does not have a meaningful ordering
|
|
200
232
|
continue
|
|
201
233
|
|
|
202
|
-
|
|
203
|
-
def is_cell_valid(cell: Cell) -> bool: # ignore cells which are strongly contaminated or have no value
|
|
204
|
-
if cell.value is None:
|
|
205
|
-
return False
|
|
206
|
-
if cell.contamination_level and cell.contamination_level == CONTAMINATION_LEVEL_STRONG:
|
|
207
|
-
return False
|
|
208
|
-
return True
|
|
209
|
-
|
|
210
|
-
values = [(row[i].value, j) for j, row in enumerate(table.rows) if is_cell_valid(row[i])]
|
|
234
|
+
values = [(row[i].value, j) for j, row in enumerate(table.rows) if row[i].value is not None]
|
|
211
235
|
if len(values) < 2: # don't rank a single model
|
|
212
236
|
continue
|
|
213
237
|
for wins, (v, j) in enumerate(sorted(values, reverse=lower_is_better)):
|
|
@@ -270,9 +294,11 @@ class Summarizer:
|
|
|
270
294
|
release: Optional[str],
|
|
271
295
|
suites: Optional[List[str]],
|
|
272
296
|
suite: Optional[str],
|
|
297
|
+
schema_path: str,
|
|
273
298
|
output_path: str,
|
|
274
299
|
verbose: bool,
|
|
275
300
|
num_threads: int,
|
|
301
|
+
allow_unknown_models: bool,
|
|
276
302
|
):
|
|
277
303
|
"""
|
|
278
304
|
A note on the relation between `release`, `suites`, and `suite`:
|
|
@@ -288,6 +314,7 @@ class Summarizer:
|
|
|
288
314
|
self.suites: List[str]
|
|
289
315
|
self.run_suite_paths: List[str]
|
|
290
316
|
self.suite: Optional[str] = None
|
|
317
|
+
self.schema_path = schema_path
|
|
291
318
|
self.release: Optional[str] = None
|
|
292
319
|
if suite:
|
|
293
320
|
self.suite = suite
|
|
@@ -301,12 +328,11 @@ class Summarizer:
|
|
|
301
328
|
self.run_suite_paths = [os.path.join(output_path, "runs", suite) for suite in suites]
|
|
302
329
|
self.verbose: bool = verbose
|
|
303
330
|
self.num_threads: int = num_threads
|
|
331
|
+
self.allow_unknown_models: bool = allow_unknown_models
|
|
304
332
|
|
|
305
333
|
ensure_directory_exists(self.run_release_path)
|
|
306
334
|
|
|
307
|
-
self.schema = read_schema()
|
|
308
|
-
self.contamination = read_contamination()
|
|
309
|
-
validate_contamination(self.contamination, self.schema)
|
|
335
|
+
self.schema = read_schema(schema_path)
|
|
310
336
|
|
|
311
337
|
def read_run(self, run_path: str) -> Run:
|
|
312
338
|
"""Load the `Run` object from `run_path`."""
|
|
@@ -334,7 +360,7 @@ class Summarizer:
|
|
|
334
360
|
if run_group_name not in self.schema.name_to_run_group:
|
|
335
361
|
hlog(
|
|
336
362
|
f"WARNING: group {run_group_name} mentioned in run spec {run.run_spec.name} "
|
|
337
|
-
f"but undefined in {
|
|
363
|
+
f"but undefined in {self.schema_path}, skipping"
|
|
338
364
|
)
|
|
339
365
|
continue
|
|
340
366
|
run_group = self.schema.name_to_run_group[run_group_name]
|
|
@@ -355,7 +381,13 @@ class Summarizer:
|
|
|
355
381
|
"""Load the runs in the run suite path."""
|
|
356
382
|
# run_suite_path can contain subdirectories that are not runs (e.g. eval_cache, groups)
|
|
357
383
|
# so filter them out.
|
|
358
|
-
run_dir_names = sorted(
|
|
384
|
+
run_dir_names = sorted(
|
|
385
|
+
[
|
|
386
|
+
p
|
|
387
|
+
for p in os.listdir(run_suite_path)
|
|
388
|
+
if p != "eval_cache" and p != "groups" and os.path.isdir(os.path.join(run_suite_path, p))
|
|
389
|
+
]
|
|
390
|
+
)
|
|
359
391
|
for run_dir_name in tqdm(run_dir_names, disable=None):
|
|
360
392
|
run_spec_path: str = os.path.join(run_suite_path, run_dir_name, "run_spec.json")
|
|
361
393
|
stats_path: str = os.path.join(run_suite_path, run_dir_name, "stats.json")
|
|
@@ -363,13 +395,8 @@ class Summarizer:
|
|
|
363
395
|
hlog(f"WARNING: {run_dir_name} doesn't have run_spec.json or stats.json, skipping")
|
|
364
396
|
continue
|
|
365
397
|
run_path: str = os.path.join(run_suite_path, run_dir_name)
|
|
366
|
-
self.
|
|
367
|
-
|
|
368
|
-
# For each group (e.g., natural_qa), map
|
|
369
|
-
# (i) scenario spec (e.g., subject=philosophy) [optional] and
|
|
370
|
-
# (ii) adapter spec (e.g., model = openai/davinci)
|
|
371
|
-
# to list of runs
|
|
372
|
-
for run in self.runs:
|
|
398
|
+
run = self.read_run(run_path)
|
|
399
|
+
self.runs.append(run)
|
|
373
400
|
if run.run_spec.name in self.runs_to_run_suites:
|
|
374
401
|
hlog(
|
|
375
402
|
f"WARNING: Run entry {run.run_spec.name} is present in two different Run Suites. "
|
|
@@ -377,12 +404,75 @@ class Summarizer:
|
|
|
377
404
|
)
|
|
378
405
|
self.runs_to_run_suites[run.run_spec.name] = suite
|
|
379
406
|
|
|
407
|
+
def group_runs(self):
|
|
408
|
+
# For each group (e.g., natural_qa), map
|
|
409
|
+
# (i) scenario spec (e.g., subject=philosophy) [optional] and
|
|
410
|
+
# (ii) adapter spec (e.g., model = openai/davinci)
|
|
411
|
+
# to list of runs
|
|
412
|
+
for run in self.runs:
|
|
380
413
|
scenario_spec = run.run_spec.scenario_spec
|
|
381
414
|
adapter_spec = run.run_spec.adapter_spec
|
|
382
415
|
for group_name in run.run_spec.groups:
|
|
383
416
|
self.group_adapter_to_runs[group_name][adapter_spec].append(run)
|
|
384
417
|
self.group_scenario_adapter_to_runs[group_name][scenario_spec][adapter_spec].append(run)
|
|
385
418
|
|
|
419
|
+
@dataclass(frozen=True)
|
|
420
|
+
class _ModelField(Field):
|
|
421
|
+
"""The frontend version of ModelMetadata.
|
|
422
|
+
|
|
423
|
+
The frontend expects schema.json to contains a field under "model" that contains a list of `ModelField`s.
|
|
424
|
+
|
|
425
|
+
All attributes have the same meaning as in ModelMetadata."""
|
|
426
|
+
|
|
427
|
+
# TODO: Migrate frontend to use ModelMetadata instead of ModelField and delete this.
|
|
428
|
+
creator_organization: Optional[str] = None
|
|
429
|
+
access: Optional[str] = None
|
|
430
|
+
todo: bool = False
|
|
431
|
+
release_date: Optional[str] = None
|
|
432
|
+
num_parameters: Optional[int] = None
|
|
433
|
+
|
|
434
|
+
def get_model_field_dicts(self) -> List[Dict]:
|
|
435
|
+
"""Get a list of `ModelField`s dicts that will be written to schema.json.
|
|
436
|
+
|
|
437
|
+
The frontend expects schema.json to contains a field under "model" that contains a list of `ModelField`s.
|
|
438
|
+
|
|
439
|
+
This is populated by reading the `ModelMetadata` configs and filtering down to models that were
|
|
440
|
+
actually used, and converting each `ModelMetadata` to a `ModelField`."""
|
|
441
|
+
# TODO: Migrate frontend to use ModelMetadata instead of ModelField and delete this.
|
|
442
|
+
used_model_names: Set[str] = set()
|
|
443
|
+
for run in self.runs:
|
|
444
|
+
used_model_names.add(get_model_metadata_for_adapter_spec(run.run_spec.adapter_spec).name)
|
|
445
|
+
|
|
446
|
+
model_field_dicts: List[Dict] = []
|
|
447
|
+
for model_name in get_all_models():
|
|
448
|
+
if model_name not in used_model_names:
|
|
449
|
+
continue
|
|
450
|
+
model_metadata = get_model_metadata(model_name)
|
|
451
|
+
model_field = Summarizer._ModelField(
|
|
452
|
+
name=model_metadata.name,
|
|
453
|
+
display_name=model_metadata.display_name,
|
|
454
|
+
short_display_name=model_metadata.display_name,
|
|
455
|
+
description=model_metadata.description,
|
|
456
|
+
creator_organization=model_metadata.creator_organization_name,
|
|
457
|
+
access=model_metadata.access,
|
|
458
|
+
todo=False,
|
|
459
|
+
release_date=model_metadata.release_date.isoformat() if model_metadata.release_date else None,
|
|
460
|
+
num_parameters=model_metadata.num_parameters,
|
|
461
|
+
)
|
|
462
|
+
model_field_dicts.append(asdict_without_nones(model_field))
|
|
463
|
+
return model_field_dicts
|
|
464
|
+
|
|
465
|
+
def write_schema(self) -> None:
|
|
466
|
+
"""Write the schema file to benchmark_output so the frontend knows about it."""
|
|
467
|
+
# Manually add the model metadata to the schema.json, where the frontend expects it.
|
|
468
|
+
# TODO: Move model metadata out of schema.json into its own model_metadata.json file.
|
|
469
|
+
raw_schema = asdict_without_nones(self.schema)
|
|
470
|
+
raw_schema["models"] = self.get_model_field_dicts()
|
|
471
|
+
write(
|
|
472
|
+
os.path.join(self.run_release_path, "schema.json"),
|
|
473
|
+
json.dumps(raw_schema, indent=2),
|
|
474
|
+
)
|
|
475
|
+
|
|
386
476
|
def read_runs(self):
|
|
387
477
|
self.runs: List[Run] = []
|
|
388
478
|
self.runs_to_run_suites: Dict[str, str] = {}
|
|
@@ -455,6 +545,7 @@ class Summarizer:
|
|
|
455
545
|
|
|
456
546
|
return file_metadata
|
|
457
547
|
|
|
548
|
+
# TODO: Delete this after @andyzorigin's project is done.
|
|
458
549
|
self._model_group_overlap_stats: Dict[Tuple[str, str], GroupOverlapStats] = {}
|
|
459
550
|
|
|
460
551
|
data_overlap_dir = os.path.join(self.run_release_path, "data_overlap")
|
|
@@ -537,7 +628,7 @@ class Summarizer:
|
|
|
537
628
|
for metric_name, run_spec_names in metric_name_to_run_spec_names.items():
|
|
538
629
|
if metric_name not in defined_metric_names:
|
|
539
630
|
hlog(
|
|
540
|
-
f"WARNING: metric name {metric_name} undefined in {
|
|
631
|
+
f"WARNING: metric name {metric_name} undefined in {self.schema_path} "
|
|
541
632
|
f"but appears in {len(run_spec_names)} run specs, including {run_spec_names[0]}"
|
|
542
633
|
)
|
|
543
634
|
|
|
@@ -564,12 +655,12 @@ class Summarizer:
|
|
|
564
655
|
# TODO: move to write_executive_summary()
|
|
565
656
|
models_to_costs: Dict[str, Dict[str]] = defaultdict(lambda: defaultdict(int))
|
|
566
657
|
for run in self.runs:
|
|
567
|
-
|
|
658
|
+
deployment: str = run.run_spec.adapter_spec.model_deployment
|
|
568
659
|
|
|
569
660
|
for stat in run.stats:
|
|
570
661
|
stat_name = stat.name.name
|
|
571
662
|
if stat_name in Summarizer.COST_REPORT_FIELDS and not stat.name.split:
|
|
572
|
-
models_to_costs[
|
|
663
|
+
models_to_costs[deployment][stat_name] += stat.sum
|
|
573
664
|
|
|
574
665
|
# Do a second pass to add up the total number of tokens
|
|
575
666
|
for costs in models_to_costs.values():
|
|
@@ -639,7 +730,8 @@ class Summarizer:
|
|
|
639
730
|
header = [
|
|
640
731
|
HeaderCell("Group"),
|
|
641
732
|
HeaderCell("Description"),
|
|
642
|
-
# Synchronize these names with
|
|
733
|
+
# Synchronize these names with the appropriate schema file
|
|
734
|
+
# TODO: different schema files might have different fields (for multimodal)
|
|
643
735
|
HeaderCell("Adaptation method", description="Adaptation strategy (e.g., generation)"),
|
|
644
736
|
HeaderCell("# instances", description="Number of instances evaluated on"),
|
|
645
737
|
HeaderCell("# references", description="Number of references provided per instance"),
|
|
@@ -660,7 +752,7 @@ class Summarizer:
|
|
|
660
752
|
for subgroup in self.expand_subgroups(group):
|
|
661
753
|
for adapter_spec, runs in self.group_adapter_to_runs[subgroup.name].items():
|
|
662
754
|
filtered_runs = self.filter_runs_by_visibility(runs, subgroup)
|
|
663
|
-
models.add(adapter_spec.
|
|
755
|
+
models.add(adapter_spec.model_deployment)
|
|
664
756
|
methods.add(adapter_spec.method)
|
|
665
757
|
for run in filtered_runs:
|
|
666
758
|
num_instances.extend(get_all_stats_by_name(run.stats, "num_instances"))
|
|
@@ -668,9 +760,6 @@ class Summarizer:
|
|
|
668
760
|
num_prompt_tokens.extend(get_all_stats_by_name(run.stats, "num_prompt_tokens"))
|
|
669
761
|
num_completion_tokens.extend(get_all_stats_by_name(run.stats, "num_completion_tokens"))
|
|
670
762
|
|
|
671
|
-
if len(num_instances) == 0:
|
|
672
|
-
continue
|
|
673
|
-
|
|
674
763
|
rows.append(
|
|
675
764
|
[
|
|
676
765
|
Cell(group.display_name, href=get_benchmarking_url({"group": group.name})),
|
|
@@ -704,9 +793,9 @@ class Summarizer:
|
|
|
704
793
|
self,
|
|
705
794
|
runs: List[Run],
|
|
706
795
|
matcher: MetricNameMatcher,
|
|
707
|
-
contamination_level: Optional[str],
|
|
708
796
|
additional_info: Optional[str],
|
|
709
797
|
hide_value: bool = False,
|
|
798
|
+
is_scenario_table: bool = False,
|
|
710
799
|
) -> Cell:
|
|
711
800
|
"""
|
|
712
801
|
Use the metric name identified by `matcher` to pull out the stats from
|
|
@@ -760,18 +849,33 @@ class Summarizer:
|
|
|
760
849
|
if self.verbose:
|
|
761
850
|
description += "\n-- ".join(["\nRun specs:", *aggregated_run_specs])
|
|
762
851
|
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
852
|
+
# Link the runs that this cell was aggregated from, if this is not a scenario table.
|
|
853
|
+
# Scenario tables link to the runs in the model cells,
|
|
854
|
+
# whereas non-scenario tables link to the runs in the metrics cells.
|
|
855
|
+
run_spec_names: Optional[List] = None
|
|
856
|
+
if not is_scenario_table:
|
|
857
|
+
# Deduplicate run spec names becuase aggregated_run_specs may have duplicated
|
|
858
|
+
# run specs if a run spec belongs to multiple groups.
|
|
859
|
+
run_spec_names = []
|
|
860
|
+
run_spec_names_set = set()
|
|
861
|
+
for run_spec_name in aggregated_run_specs:
|
|
862
|
+
if run_spec_name not in run_spec_names_set:
|
|
863
|
+
run_spec_names.append(run_spec_name)
|
|
864
|
+
run_spec_names_set.add(run_spec_name)
|
|
865
|
+
|
|
866
|
+
return Cell(
|
|
867
|
+
value=value,
|
|
868
|
+
description=description,
|
|
869
|
+
style={},
|
|
870
|
+
run_spec_names=run_spec_names,
|
|
871
|
+
)
|
|
768
872
|
|
|
769
873
|
def create_group_table(
|
|
770
874
|
self,
|
|
771
875
|
name: str,
|
|
772
876
|
title: str,
|
|
773
877
|
adapter_to_runs: Dict[AdapterSpec, List[Run]],
|
|
774
|
-
|
|
878
|
+
is_scenario_table: bool,
|
|
775
879
|
columns: List[Tuple[RunGroup, str]], # run_group, metric_group
|
|
776
880
|
sort_by_model_order: bool = True,
|
|
777
881
|
sub_split: Optional[str] = None,
|
|
@@ -810,7 +914,7 @@ class Summarizer:
|
|
|
810
914
|
matcher = replace(matcher, sub_split=sub_split)
|
|
811
915
|
header_field = self.schema.name_to_metric.get(matcher.name)
|
|
812
916
|
if header_field is None:
|
|
813
|
-
hlog(f"WARNING: metric name {matcher.name} undefined in {
|
|
917
|
+
hlog(f"WARNING: metric name {matcher.name} undefined in {self.schema_path}, skipping")
|
|
814
918
|
continue
|
|
815
919
|
metadata = {
|
|
816
920
|
"metric": header_field.get_short_display_name(),
|
|
@@ -863,55 +967,45 @@ class Summarizer:
|
|
|
863
967
|
|
|
864
968
|
adapter_specs: List[AdapterSpec] = list(adapter_to_runs.keys())
|
|
865
969
|
if sort_by_model_order:
|
|
866
|
-
# Sort models by the order defined in the
|
|
867
|
-
# Models not defined in the
|
|
868
|
-
# placed before models in defined the
|
|
869
|
-
model_order =
|
|
970
|
+
# Sort models by the order defined in the the model metadata config.
|
|
971
|
+
# Models not defined in the model metadata config will be sorted alphabetically and
|
|
972
|
+
# placed before models in defined the model metadata config.
|
|
973
|
+
model_order = get_all_models()
|
|
870
974
|
|
|
871
975
|
def _adapter_spec_sort_key(spec):
|
|
872
|
-
index = model_order.index(spec.
|
|
873
|
-
return (index, spec.
|
|
976
|
+
index = model_order.index(spec.model_deployment) if spec.model_deployment in model_order else -1
|
|
977
|
+
return (index, spec.model_deployment)
|
|
874
978
|
|
|
875
979
|
adapter_specs = list(sorted(adapter_specs, key=_adapter_spec_sort_key))
|
|
876
980
|
|
|
877
981
|
# Pull out only the keys of the method adapter_spec that is needed to
|
|
878
982
|
# uniquely identify the method.
|
|
879
|
-
infos = unique_simplification(list(map(asdict_without_nones, adapter_specs)), ["model"])
|
|
983
|
+
infos = unique_simplification(list(map(asdict_without_nones, adapter_specs)), ["model_deployment", "model"])
|
|
880
984
|
|
|
881
985
|
assert len(adapter_specs) == len(infos), [adapter_specs, infos]
|
|
882
986
|
|
|
883
987
|
# Populate the contents of the table
|
|
884
988
|
rows = []
|
|
885
989
|
for adapter_spec, info in zip(adapter_specs, infos):
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
# Fall back to using the model name as the model display name if the model is not
|
|
890
|
-
# defined in the schema.
|
|
891
|
-
model_display_name = (
|
|
892
|
-
self.schema.name_to_model[model_name].display_name
|
|
893
|
-
if model_name in self.schema.name_to_model
|
|
894
|
-
else model_name
|
|
895
|
-
)
|
|
990
|
+
model_metadata = get_model_metadata_for_adapter_spec(adapter_spec)
|
|
991
|
+
|
|
992
|
+
model_name: str = model_metadata.name
|
|
896
993
|
|
|
897
994
|
runs = adapter_to_runs[adapter_spec]
|
|
898
|
-
display_name = get_method_display_name(
|
|
995
|
+
display_name = get_method_display_name(model_metadata.display_name, info)
|
|
899
996
|
|
|
900
|
-
# Link
|
|
901
|
-
|
|
997
|
+
# Link the runs that this row was aggregated from, if this is a scenario table.
|
|
998
|
+
# Scenario tables link to the runs in the model cells,
|
|
999
|
+
# whereas non-scenario tables link to the runs in the metrics cells.
|
|
1000
|
+
run_spec_names: Optional[List[str]]
|
|
1001
|
+
if is_scenario_table:
|
|
902
1002
|
run_spec_names = [run.run_spec.name for run in runs]
|
|
903
1003
|
href = run_spec_names_to_url(run_spec_names)
|
|
904
1004
|
else:
|
|
1005
|
+
run_spec_names = None
|
|
905
1006
|
href = None
|
|
906
1007
|
|
|
907
|
-
|
|
908
|
-
point = self.contamination.get_point(model_name, columns[0][0].name)
|
|
909
|
-
if num_groups == 1 and point is not None: # display contamination information at the adapter level
|
|
910
|
-
cells = [
|
|
911
|
-
Cell(display_name + CONTAMINATION_SYMBOLS[point.level], description=point.description, href=href)
|
|
912
|
-
]
|
|
913
|
-
else:
|
|
914
|
-
cells = [Cell(display_name, description="", href=href)]
|
|
1008
|
+
cells = [Cell(display_name, description="", href=href, run_spec_names=run_spec_names)]
|
|
915
1009
|
assert len(group_names) == len(matchers)
|
|
916
1010
|
for group_name, matcher in zip(group_names, matchers):
|
|
917
1011
|
group_runs = [run for run in runs if group_name in run.run_spec.groups]
|
|
@@ -920,13 +1014,7 @@ class Summarizer:
                 if "babi" in group_name and "task:" not in name:
                     group_runs = [run for run in group_runs if "task=all" in run.run_spec.name]

-
-                if point is not None:
-                    description = CONTAMINATION_SYMBOLS[point.level] + " " + point.description
-                    contamination_level = point.level
-                else:
-                    description = ""
-                    contamination_level = None
+                description = ""

                 group_overlap_stats = None
                 if (model_name, group_name) in self._model_group_overlap_stats:
@@ -948,9 +1036,9 @@ class Summarizer:
                         self.create_cell(
                             group_runs,
                             matcher,
-                            contamination_level,
                             additional_info=description,
                             hide_value=hide_value,
+                            is_scenario_table=is_scenario_table,
                         )
                     )

@@ -960,7 +1048,7 @@ class Summarizer:
         # There could be a ton of runs, so only do this if there are 2-5
         # TODO: replace in frontend with a selector to choose which rows to visualize.
         links = []
-        if
+        if is_scenario_table:
             all_run_spec_names = []
             for adapter_spec, runs in adapter_to_runs.items():
                 if len(runs) > 1:
@@ -1043,8 +1131,8 @@ class Summarizer:
                 title=display_name,
                 adapter_to_runs=adapter_to_runs,
                 columns=[(subgroup, metric_group) for subgroup in subgroups],
-
-                add_win_rate=
+                is_scenario_table=False,
+                add_win_rate=not self.schema.name_to_metric_group[metric_group].hide_win_rates,
             )
             tables.append(table)
         return tables
@@ -1075,7 +1163,7 @@ class Summarizer:
                 name=scenario_name,
                 adapter_to_runs=adapter_to_runs,
                 columns=columns,
-
+                is_scenario_table=True,
             )
             tables.append(table)
             scenarios_shown += 1
@@ -1087,7 +1175,7 @@ class Summarizer:
                     name=f"{subgroup.name}:sub_split={sub_split}",
                     adapter_to_runs=adapter_to_runs,
                     columns=columns,
-
+                    is_scenario_table=False,
                     sub_split=sub_split,
                 )
                 tables.append(table)
@@ -1107,7 +1195,7 @@ class Summarizer:
                 name=subgroup.name,
                 adapter_to_runs=adapter_to_runs,
                 columns=columns,
-
+                is_scenario_table=False,
             )
             tables = [table] + tables
         all_tables.extend(tables)
@@ -1209,9 +1297,9 @@ class Summarizer:
         for scenario_spec_instance_ids_json in scenario_spec_instance_ids_jsons:
             scenario_spec_instance_ids_dict = json.loads(scenario_spec_instance_ids_json)
             scenario_spec_instance_ids = cattrs.structure(scenario_spec_instance_ids_dict, ScenarioSpecInstanceIds)
-            self.scenario_spec_instance_id_dict[
-                scenario_spec_instance_ids.scenario_spec
-            ] = scenario_spec_instance_ids.instance_ids
+            self.scenario_spec_instance_id_dict[scenario_spec_instance_ids.scenario_spec] = (
+                scenario_spec_instance_ids.instance_ids
+            )

     def write_scenario_spec_instance_ids_json(self, file_path) -> None:
         for run in self.runs:
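The `cattrs.structure` call above is what turns each JSON line back into a typed record. A minimal sketch of that round-trip, using a simplified stand-in for `ScenarioSpecInstanceIds` (the real class keys on a `ScenarioSpec` object rather than a plain string, and the JSON payload below is illustrative):

```python
import json
from dataclasses import dataclass
from typing import List

import cattrs

@dataclass(frozen=True)
class ScenarioSpecInstanceIds:
    scenario_spec: str  # simplified: the real field is a ScenarioSpec object
    instance_ids: List[str]

# One line of the scenario spec instance ids file (illustrative content).
line = '{"scenario_spec": "mmlu:subject=anatomy", "instance_ids": ["id123", "id456"]}'
record = cattrs.structure(json.loads(line), ScenarioSpecInstanceIds)
print(record.scenario_spec, record.instance_ids)
```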
@@ -1249,11 +1337,12 @@ class Summarizer:
         if os.path.islink(symlink_path):
             # Remove the previous symlink if it exists.
             os.unlink(symlink_path)
-        os.symlink(os.path.
+        os.symlink(os.path.basename(self.run_release_path), symlink_path)

     def run_pipeline(self, skip_completed: bool, num_instances: int) -> None:
-        """Run the entire summarization pipeline
+        """Run the entire summarization pipeline."""
         self.read_runs()
+        self.group_runs()
         self.check_metrics_defined()

         self.write_run_display_json(skip_completed)
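The symlink change is subtle but useful: targeting `os.path.basename(...)` creates a *relative* symlink inside the releases directory, so the `latest` pointer stays valid if `benchmark_output` is moved or copied. A sketch under assumed paths (POSIX; the real values come from the Summarizer's configuration):

```python
import os

# Illustrative paths, not taken from the package.
releases_dir = "benchmark_output/releases"
run_release_path = os.path.join(releases_dir, "v0.5.0")
symlink_path = os.path.join(releases_dir, "latest")

os.makedirs(run_release_path, exist_ok=True)
if os.path.islink(symlink_path):
    os.unlink(symlink_path)  # replace a stale "latest" pointer
# Relative target: "latest" -> "v0.5.0", valid wherever the directory lives.
os.symlink(os.path.basename(run_release_path), symlink_path)
print(os.readlink(symlink_path))  # v0.5.0
```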
@@ -1266,6 +1355,10 @@ class Summarizer:
         # because it uses self.scenario_spec_instance_id_dict
         self.read_overlap_stats()

+        # Must happen after self.read_runs()
+        # because it uses self.runs
+        self.write_schema()
+
         self.write_executive_summary()
         self.write_runs()
         self.write_run_specs()
@@ -1276,12 +1369,17 @@ class Summarizer:
         self.symlink_latest()


-@htrack(
+@htrack("summarize")
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "-o", "--output-path", type=str, help="Where the benchmarking output lives", default="benchmark_output"
     )
+    parser.add_argument(
+        "--schema-path",
+        type=str,
+        help="Path to the schema file (e.g., schema_classic.yaml).",
+    )
     parser.add_argument(
         "--suite",
         type=str,
@@ -1312,6 +1410,18 @@ def main():
         help="Number of instance ids we're using; only for annotating scenario spec instance ids file",
         default=1000,
     )
+    parser.add_argument(
+        "--local-path",
+        type=str,
+        help="If running locally, the path for `ServerService`.",
+        default="prod_env",
+    )
+    parser.add_argument(
+        "--allow-unknown-models",
+        type=bool,
+        help="Whether to allow unknown models in the metadata file",
+        default=True,
+    )
     args = parser.parse_args()

     release: Optional[str] = None
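One caveat on the new `--allow-unknown-models` flag: argparse applies `type=bool` to the raw command-line string, and `bool()` is truthy for any non-empty string, so `--allow-unknown-models False` still parses as `True`. A sketch of the pitfall and one common workaround (the `str_to_bool` helper below is illustrative, not part of the package):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--allow-unknown-models", type=bool, default=True)
args = parser.parse_args(["--allow-unknown-models", "False"])
print(args.allow_unknown_models)  # True, because bool("False") is True

def str_to_bool(value: str) -> bool:
    # Explicit string-to-bool conversion for argparse.
    if value.lower() in ("true", "1", "yes"):
        return True
    if value.lower() in ("false", "0", "no"):
        return False
    raise argparse.ArgumentTypeError(f"invalid boolean: {value!r}")

strict = argparse.ArgumentParser()
strict.add_argument("--allow-unknown-models", type=str_to_bool, default=True)
print(strict.parse_args(["--allow-unknown-models", "False"]).allow_unknown_models)  # False
```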
@@ -1335,14 +1445,21 @@ def main():
     else:
         raise ValueError("Exactly one of --release or --suite must be specified.")

+    schema_path = args.schema_path if args.schema_path else get_default_schema_path()
+
+    register_builtin_configs_from_helm_package()
+    register_configs_from_directory(args.local_path)
+
     # Output JSON files summarizing the benchmark results which will be loaded in the web interface
     summarizer = Summarizer(
         release=release,
         suites=suites,
         suite=suite,
+        schema_path=schema_path,
         output_path=args.output_path,
         verbose=args.debug,
         num_threads=args.num_threads,
+        allow_unknown_models=args.allow_unknown_models,
     )
     summarizer.run_pipeline(skip_completed=args.skip_completed_run_display_json, num_instances=args.num_instances)
     hlog("Done.")
@@ -4,26 +4,26 @@ from typing import Any, Optional, List, Dict

 @dataclass(frozen=True)
 class Cell:
-    # Semantic value (that can be used for sorting)
     value: Any
+    """Semantic value (that can be used for sorting)"""

-    # Optionally, if we want to render things specially (floating points to 3 decimal points)
     display_value: Optional[str] = None
+    """Optionally, if we want to render things specially (floating points to 3 decimal points)"""

-    # Detailed description if hover over the cell
     description: Optional[str] = None
+    """Detailed description shown when hovering over the cell"""

-    # If we click on the link for this cell, it takes us somewhere
     href: Optional[str] = None
+    """If we click on the link for this cell, it takes us somewhere"""

-    # Styling
     style: Optional[Dict[str, Any]] = None
+    """Styling"""

-    # If the value or display_value is markdown that needs to be interpreted
     markdown: bool = False
+    """If the value or display_value is markdown that needs to be interpreted"""

-
-
+    run_spec_names: Optional[List[str]] = None
+    """The names of the runs that this cell's value was aggregated from, if the cell contains an aggregate value."""


 @dataclass(frozen=True)
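For orientation, a minimal usage sketch of the updated `Cell`: the new `run_spec_names` field lets the frontend recover which runs an aggregate value came from, independently of the `href` link. The module path is assumed to be `helm.benchmark.presentation.table`, and every field value below is illustrative:

```python
from helm.benchmark.presentation.table import Cell

cell = Cell(
    value=0.847,  # semantic value, used for sorting
    display_value="0.847",  # rendered form
    description="Mean accuracy aggregated over two runs",
    href="?runs=...",  # illustrative link target
    run_spec_names=["scenario:model=model_a", "scenario:model=model_b"],
)
print(cell.run_spec_names)
```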
@@ -1,9 +1,9 @@
-from helm.benchmark.presentation.schema import read_schema
+from helm.benchmark.presentation.schema import read_schema, get_default_schema_path
 from helm.benchmark.presentation.contamination import read_contamination, validate_contamination


 def test_contamination_schema():
-    schema = read_schema()
+    schema = read_schema(get_default_schema_path())
     contamination = read_contamination()
     validate_contamination(contamination, schema)
