crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Note: this version of crfm-helm has been flagged as potentially problematic.
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +138 -31
- crfm_helm-0.5.1.dist-info/RECORD +654 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +31 -3
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/gender_perturbation.py +1 -1
- helm/benchmark/augmentations/perturbation.py +25 -3
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/test_perturbation.py +41 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +20 -7
- helm/benchmark/metrics/basic_metrics.py +169 -664
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -110
- helm/benchmark/metrics/dry_run_metrics.py +2 -2
- helm/benchmark/metrics/efficiency_metrics.py +213 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +392 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +2 -2
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +2 -2
- helm/benchmark/metrics/test_classification_metrics.py +8 -5
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +575 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +74 -0
- helm/benchmark/model_metadata_registry.py +41 -1
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +26 -10
- helm/benchmark/presentation/schema.py +15 -40
- helm/benchmark/presentation/summarize.py +119 -79
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +1 -2
- helm/benchmark/presentation/test_summarize.py +3 -3
- helm/benchmark/run.py +54 -26
- helm/benchmark/run_expander.py +205 -35
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +163 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/classic_run_specs.py +1510 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +757 -0
- helm/benchmark/runner.py +51 -57
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +1 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/legalbench_scenario.py +6 -2
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +1 -1
- helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
- helm/benchmark/scenarios/scenario.py +4 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +6 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +3 -4
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +5 -3
- helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
- helm/benchmark/server.py +24 -1
- helm/benchmark/slurm_runner.py +70 -49
- helm/benchmark/static/benchmarking.js +1 -1
- helm/benchmark/static/schema_classic.yaml +258 -1066
- helm/benchmark/static/schema_image2structure.yaml +304 -0
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +2 -227
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
- helm/benchmark/static/schema_vlm.yaml +823 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
- helm/benchmark/static_build/assets/index-878a1094.css +1 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -44
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +3 -9
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +233 -18
- helm/{proxy/clients → clients}/auto_client.py +59 -31
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +65 -7
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +4 -11
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +5 -5
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +5 -7
- helm/{proxy/clients → clients}/huggingface_client.py +43 -64
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
- helm/{proxy/clients → clients}/megatron_client.py +5 -5
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +301 -0
- helm/{proxy/clients → clients}/palmyra_client.py +6 -8
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +100 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +20 -8
- helm/{proxy/clients → clients}/together_client.py +104 -73
- helm/clients/vertexai_client.py +400 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
- helm/clients/vision_language/huggingface_vlm_client.py +111 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +54 -49
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +16 -4
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +1 -1
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +33 -3
- helm/common/key_value_store.py +35 -4
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/request.py +15 -17
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +1 -1
- helm/config/model_deployments.yaml +1159 -538
- helm/config/model_metadata.yaml +868 -41
- helm/config/tokenizer_configs.yaml +149 -43
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +8 -6
- helm/proxy/example_queries.py +29 -17
- helm/proxy/server.py +70 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +96 -16
- helm/proxy/services/service.py +30 -0
- helm/proxy/services/test_remote_service.py +4 -3
- helm/proxy/services/test_service.py +0 -12
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/test_model_deployment_definition.py +0 -92
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
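
Before the per-file diffs, the release's main structural change is visible in the renames above: model clients move from helm.proxy.clients to the new top-level helm.clients package, tokenizers move from helm.proxy.tokenizers to helm.tokenizers, and the monolithic helm/benchmark/run_specs.py (2,762 lines deleted) is split into the helm/benchmark/run_specs/ package plus the new run_spec.py and run_spec_factory.py modules. A minimal sketch of the import migration for downstream code (the class names are illustrative picks from the moved modules, not an exhaustive list):

    # crfm-helm 0.4.0
    from helm.proxy.clients.auto_client import AutoClient
    from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
    from helm.benchmark.run_specs import construct_run_specs

    # crfm-helm 0.5.1
    from helm.clients.auto_client import AutoClient
    from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
    from helm.benchmark.run_spec_factory import construct_run_specs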
helm/benchmark/presentation/summarize.py
CHANGED
@@ -22,13 +22,11 @@ from typing import List, Optional, Dict, Any, Tuple, Set

 from tqdm import tqdm
 from helm.benchmark.model_deployment_registry import get_model_deployment
-
 from helm.benchmark.model_metadata_registry import get_unknown_model_metadata
 from helm.common.general import (
     write,
     ensure_directory_exists,
     asdict_without_nones,
-    serialize_dates,
     parallel_map,
     singleton,
     unique_simplification,
@@ -42,27 +40,22 @@ from helm.benchmark.data_overlap.light_scenario import ScenarioSpecInstanceIds
 from helm.benchmark.metrics.metric_name import MetricName
 from helm.benchmark.metrics.metric import get_all_stats_by_name
 from helm.benchmark.metrics.statistic import Stat, merge_stat
-from helm.benchmark.
+from helm.benchmark.run_spec import RunSpec
+from helm.benchmark.runner import LATEST_SYMLINK
 from helm.benchmark.presentation.table import Cell, HeaderCell, Table, Hyperlink, table_to_latex
 from helm.benchmark.presentation.schema import (
     MetricNameMatcher,
     RunGroup,
+    Field,
     read_schema,
-
+    get_default_schema_path,
     BY_GROUP,
     THIS_GROUP_ONLY,
     NO_GROUPS,
 )
-from helm.benchmark.presentation.contamination import (
-    read_contamination,
-    validate_contamination,
-    CONTAMINATION_SYMBOLS,
-    CONTAMINATION_STYLES,
-    CONTAMINATION_LEVEL_STRONG,
-)
 from helm.benchmark.config_registry import register_builtin_configs_from_helm_package, register_configs_from_directory
 from helm.benchmark.presentation.run_display import write_run_display_json
-from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata
+from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata, get_all_models


 OVERLAP_N_COUNT = 13
@@ -172,7 +165,7 @@ def get_model_metadata_for_adapter_spec(adapter_spec: AdapterSpec) -> ModelMetad
     except ValueError:
         pass

-    # Return a placeholder "
+    # Return a placeholder "unknown model" model metadata.
     return get_unknown_model_metadata(adapter_spec.model)


@@ -238,15 +231,7 @@ def compute_aggregate_row_win_rates(table: Table, aggregation: str = "mean") ->
         if lower_is_better is None:  # column does not have a meaningful ordering
             continue

-
-        def is_cell_valid(cell: Cell) -> bool:  # ignore cells which are strongly contaminated or have no value
-            if cell.value is None:
-                return False
-            if cell.contamination_level and cell.contamination_level == CONTAMINATION_LEVEL_STRONG:
-                return False
-            return True
-
-        values = [(row[i].value, j) for j, row in enumerate(table.rows) if is_cell_valid(row[i])]
+        values = [(row[i].value, j) for j, row in enumerate(table.rows) if row[i].value is not None]
         if len(values) < 2:  # don't rank a single model
             continue
         for wins, (v, j) in enumerate(sorted(values, reverse=lower_is_better)):
@@ -309,7 +294,7 @@ class Summarizer:
         release: Optional[str],
         suites: Optional[List[str]],
         suite: Optional[str],
-
+        schema_path: str,
         output_path: str,
         verbose: bool,
         num_threads: int,
@@ -329,7 +314,7 @@ class Summarizer:
         self.suites: List[str]
         self.run_suite_paths: List[str]
         self.suite: Optional[str] = None
-        self.
+        self.schema_path = schema_path
         self.release: Optional[str] = None
         if suite:
             self.suite = suite
@@ -347,9 +332,7 @@ class Summarizer:

         ensure_directory_exists(self.run_release_path)

-        self.schema = read_schema(
-        self.contamination = read_contamination()
-        validate_contamination(self.contamination, self.schema)
+        self.schema = read_schema(schema_path)

     def read_run(self, run_path: str) -> Run:
         """Load the `Run` object from `run_path`."""
@@ -377,7 +360,7 @@ class Summarizer:
             if run_group_name not in self.schema.name_to_run_group:
                 hlog(
                     f"WARNING: group {run_group_name} mentioned in run spec {run.run_spec.name} "
-                    f"but undefined in {self.
+                    f"but undefined in {self.schema_path}, skipping"
                 )
                 continue
             run_group = self.schema.name_to_run_group[run_group_name]
@@ -433,11 +416,61 @@ class Summarizer:
             self.group_adapter_to_runs[group_name][adapter_spec].append(run)
             self.group_scenario_adapter_to_runs[group_name][scenario_spec][adapter_spec].append(run)

-
+    @dataclass(frozen=True)
+    class _ModelField(Field):
+        """The frontend version of ModelMetadata.
+
+        The frontend expects schema.json to contains a field under "model" that contains a list of `ModelField`s.
+
+        All attributes have the same meaning as in ModelMetadata."""
+
+        # TODO: Migrate frontend to use ModelMetadata instead of ModelField and delete this.
+        creator_organization: Optional[str] = None
+        access: Optional[str] = None
+        todo: bool = False
+        release_date: Optional[str] = None
+        num_parameters: Optional[int] = None
+
+    def get_model_field_dicts(self) -> List[Dict]:
+        """Get a list of `ModelField`s dicts that will be written to schema.json.
+
+        The frontend expects schema.json to contains a field under "model" that contains a list of `ModelField`s.
+
+        This is populated by reading the `ModelMetadata` configs and filtering down to models that were
+        actually used, and converting each `ModelMetadata` to a `ModelField`."""
+        # TODO: Migrate frontend to use ModelMetadata instead of ModelField and delete this.
+        used_model_names: Set[str] = set()
+        for run in self.runs:
+            used_model_names.add(get_model_metadata_for_adapter_spec(run.run_spec.adapter_spec).name)
+
+        model_field_dicts: List[Dict] = []
+        for model_name in get_all_models():
+            if model_name not in used_model_names:
+                continue
+            model_metadata = get_model_metadata(model_name)
+            model_field = Summarizer._ModelField(
+                name=model_metadata.name,
+                display_name=model_metadata.display_name,
+                short_display_name=model_metadata.display_name,
+                description=model_metadata.description,
+                creator_organization=model_metadata.creator_organization_name,
+                access=model_metadata.access,
+                todo=False,
+                release_date=model_metadata.release_date.isoformat() if model_metadata.release_date else None,
+                num_parameters=model_metadata.num_parameters,
+            )
+            model_field_dicts.append(asdict_without_nones(model_field))
+        return model_field_dicts
+
+    def write_schema(self) -> None:
         """Write the schema file to benchmark_output so the frontend knows about it."""
+        # Manually add the model metadata to the schema.json, where the frontend expects it.
+        # TODO: Move model metadata out of schema.json into its own model_metadata.json file.
+        raw_schema = asdict_without_nones(self.schema)
+        raw_schema["models"] = self.get_model_field_dicts()
         write(
             os.path.join(self.run_release_path, "schema.json"),
-            json.dumps(
+            json.dumps(raw_schema, indent=2),
         )

     def read_runs(self):
@@ -512,6 +545,7 @@ class Summarizer:

             return file_metadata

+        # TODO: Delete this after @andyzorigin's project is done.
         self._model_group_overlap_stats: Dict[Tuple[str, str], GroupOverlapStats] = {}

         data_overlap_dir = os.path.join(self.run_release_path, "data_overlap")
@@ -594,7 +628,7 @@ class Summarizer:
         for metric_name, run_spec_names in metric_name_to_run_spec_names.items():
             if metric_name not in defined_metric_names:
                 hlog(
-                    f"WARNING: metric name {metric_name} undefined in {self.
+                    f"WARNING: metric name {metric_name} undefined in {self.schema_path} "
                     f"but appears in {len(run_spec_names)} run specs, including {run_spec_names[0]}"
                 )

@@ -726,9 +760,6 @@ class Summarizer:
             num_prompt_tokens.extend(get_all_stats_by_name(run.stats, "num_prompt_tokens"))
             num_completion_tokens.extend(get_all_stats_by_name(run.stats, "num_completion_tokens"))

-            if len(num_instances) == 0:
-                continue
-
             rows.append(
                 [
                     Cell(group.display_name, href=get_benchmarking_url({"group": group.name})),
@@ -762,9 +793,9 @@ class Summarizer:
         self,
         runs: List[Run],
         matcher: MetricNameMatcher,
-        contamination_level: Optional[str],
         additional_info: Optional[str],
         hide_value: bool = False,
+        is_scenario_table: bool = False,
     ) -> Cell:
         """
         Use the metric name identified by `matcher` to pull out the stats from
@@ -818,18 +849,33 @@ class Summarizer:
         if self.verbose:
             description += "\n-- ".join(["\nRun specs:", *aggregated_run_specs])

-
-
-
-
-
+        # Link the runs that this cell was aggregated from, if this is not a scenario table.
+        # Scenario tables link to the runs in the model cells,
+        # whereas non-scenario tables link to the runs in the metrics cells.
+        run_spec_names: Optional[List] = None
+        if not is_scenario_table:
+            # Deduplicate run spec names becuase aggregated_run_specs may have duplicated
+            # run specs if a run spec belongs to multiple groups.
+            run_spec_names = []
+            run_spec_names_set = set()
+            for run_spec_name in aggregated_run_specs:
+                if run_spec_name not in run_spec_names_set:
+                    run_spec_names.append(run_spec_name)
+                    run_spec_names_set.add(run_spec_name)
+
+        return Cell(
+            value=value,
+            description=description,
+            style={},
+            run_spec_names=run_spec_names,
+        )

     def create_group_table(
         self,
         name: str,
         title: str,
         adapter_to_runs: Dict[AdapterSpec, List[Run]],
-
+        is_scenario_table: bool,
         columns: List[Tuple[RunGroup, str]],  # run_group, metric_group
         sort_by_model_order: bool = True,
         sub_split: Optional[str] = None,
@@ -868,7 +914,7 @@ class Summarizer:
             matcher = replace(matcher, sub_split=sub_split)
             header_field = self.schema.name_to_metric.get(matcher.name)
             if header_field is None:
-                hlog(f"WARNING: metric name {matcher.name} undefined in {self.
+                hlog(f"WARNING: metric name {matcher.name} undefined in {self.schema_path}, skipping")
                 continue
             metadata = {
                 "metric": header_field.get_short_display_name(),
@@ -921,10 +967,10 @@ class Summarizer:

         adapter_specs: List[AdapterSpec] = list(adapter_to_runs.keys())
         if sort_by_model_order:
-            # Sort models by the order defined in the
-            # Models not defined in the
-            # placed before models in defined the
-            model_order =
+            # Sort models by the order defined in the the model metadata config.
+            # Models not defined in the model metadata config will be sorted alphabetically and
+            # placed before models in defined the model metadata config.
+            model_order = get_all_models()

             def _adapter_spec_sort_key(spec):
                 index = model_order.index(spec.model_deployment) if spec.model_deployment in model_order else -1
@@ -948,21 +994,18 @@ class Summarizer:
             runs = adapter_to_runs[adapter_spec]
             display_name = get_method_display_name(model_metadata.display_name, info)

-            # Link
-
+            # Link the runs that this row was aggregated from, if this is a scenario table.
+            # Scenario tables link to the runs in the model cells,
+            # whereas non-scenario tables link to the runs in the metrics cells.
+            run_spec_names: Optional[List[str]]
+            if is_scenario_table:
                 run_spec_names = [run.run_spec.name for run in runs]
                 href = run_spec_names_to_url(run_spec_names)
             else:
+                run_spec_names = None
                 href = None

-
-            point = self.contamination.get_point(model_name, columns[0][0].name)
-            if num_groups == 1 and point is not None:  # display contamination information at the adapter level
-                cells = [
-                    Cell(display_name + CONTAMINATION_SYMBOLS[point.level], description=point.description, href=href)
-                ]
-            else:
-                cells = [Cell(display_name, description="", href=href)]
+            cells = [Cell(display_name, description="", href=href, run_spec_names=run_spec_names)]
             assert len(group_names) == len(matchers)
             for group_name, matcher in zip(group_names, matchers):
                 group_runs = [run for run in runs if group_name in run.run_spec.groups]
@@ -971,13 +1014,7 @@ class Summarizer:
                 if "babi" in group_name and "task:" not in name:
                     group_runs = [run for run in group_runs if "task=all" in run.run_spec.name]

-
-                if point is not None:
-                    description = CONTAMINATION_SYMBOLS[point.level] + " " + point.description
-                    contamination_level = point.level
-                else:
-                    description = ""
-                    contamination_level = None
+                description = ""

                 group_overlap_stats = None
                 if (model_name, group_name) in self._model_group_overlap_stats:
@@ -999,9 +1036,9 @@ class Summarizer:
                     self.create_cell(
                         group_runs,
                         matcher,
-                        contamination_level,
                         additional_info=description,
                         hide_value=hide_value,
+                        is_scenario_table=is_scenario_table,
                     )
                 )
@@ -1011,7 +1048,7 @@ class Summarizer:
         # There could be a ton of runs, so only do this if there are 2-5
         # TODO: replace in frontend with a selector to choose which rows to visualize.
         links = []
-        if
+        if is_scenario_table:
             all_run_spec_names = []
             for adapter_spec, runs in adapter_to_runs.items():
                 if len(runs) > 1:
@@ -1094,8 +1131,8 @@ class Summarizer:
                     title=display_name,
                     adapter_to_runs=adapter_to_runs,
                     columns=[(subgroup, metric_group) for subgroup in subgroups],
-
-                    add_win_rate=
+                    is_scenario_table=False,
+                    add_win_rate=not self.schema.name_to_metric_group[metric_group].hide_win_rates,
                 )
                 tables.append(table)
         return tables
@@ -1126,7 +1163,7 @@ class Summarizer:
                 name=scenario_name,
                 adapter_to_runs=adapter_to_runs,
                 columns=columns,
-
+                is_scenario_table=True,
             )
             tables.append(table)
             scenarios_shown += 1
@@ -1138,7 +1175,7 @@ class Summarizer:
                 name=f"{subgroup.name}:sub_split={sub_split}",
                 adapter_to_runs=adapter_to_runs,
                 columns=columns,
-
+                is_scenario_table=False,
                 sub_split=sub_split,
             )
             tables.append(table)
@@ -1158,7 +1195,7 @@ class Summarizer:
             name=subgroup.name,
             adapter_to_runs=adapter_to_runs,
             columns=columns,
-
+            is_scenario_table=False,
         )
         tables = [table] + tables
         all_tables.extend(tables)
@@ -1260,9 +1297,9 @@ class Summarizer:
         for scenario_spec_instance_ids_json in scenario_spec_instance_ids_jsons:
             scenario_spec_instance_ids_dict = json.loads(scenario_spec_instance_ids_json)
             scenario_spec_instance_ids = cattrs.structure(scenario_spec_instance_ids_dict, ScenarioSpecInstanceIds)
-            self.scenario_spec_instance_id_dict[
-                scenario_spec_instance_ids.
-
+            self.scenario_spec_instance_id_dict[scenario_spec_instance_ids.scenario_spec] = (
+                scenario_spec_instance_ids.instance_ids
+            )

     def write_scenario_spec_instance_ids_json(self, file_path) -> None:
         for run in self.runs:
@@ -1304,8 +1341,6 @@ class Summarizer:

     def run_pipeline(self, skip_completed: bool, num_instances: int) -> None:
         """Run the entire summarization pipeline."""
-        self.write_schema()
-
         self.read_runs()
         self.group_runs()
         self.check_metrics_defined()
@@ -1320,6 +1355,10 @@ class Summarizer:
         # because it uses self.scenario_spec_instance_id_dict
         self.read_overlap_stats()

+        # Must happen after self.read_runs()
+        # because it uses self.runs
+        self.write_schema()
+
         self.write_executive_summary()
         self.write_runs()
         self.write_run_specs()
@@ -1337,10 +1376,9 @@ def main():
         "-o", "--output-path", type=str, help="Where the benchmarking output lives", default="benchmark_output"
     )
     parser.add_argument(
-        "--schema-
+        "--schema-path",
         type=str,
-        help="
-        default=SCHEMA_CLASSIC_YAML_FILENAME,
+        help="Path to the schema file (e.g., schema_classic.yaml).",
     )
     parser.add_argument(
         "--suite",
@@ -1407,6 +1445,8 @@ def main():
     else:
         raise ValueError("Exactly one of --release or --suite must be specified.")

+    schema_path = args.schema_path if args.schema_path else get_default_schema_path()
+
    register_builtin_configs_from_helm_package()
    register_configs_from_directory(args.local_path)

@@ -1415,7 +1455,7 @@ def main():
         release=release,
         suites=suites,
         suite=suite,
-
+        schema_path=schema_path,
         output_path=args.output_path,
         verbose=args.debug,
         num_threads=args.num_threads,
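Net effect of the summarize.py changes above: contamination annotations no longer drive cell styling, write_schema() now embeds the used models' metadata into schema.json, and the schema file is chosen by an explicit path with get_default_schema_path() as the fallback. Assuming the helm-summarize console entry point (the flag names below are taken from the diff; the suite and schema values are placeholders):

    helm-summarize --suite my-suite --schema-path schema_lite.yaml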
helm/benchmark/presentation/table.py
CHANGED
@@ -4,26 +4,26 @@ from typing import Any, Optional, List, Dict

 @dataclass(frozen=True)
 class Cell:
-    # Semantic value (that can be used for sorting)
     value: Any
+    """Semantic value (that can be used for sorting)"""

-    # Optionally, if we want to render things specially (floating points to 3 decimal points)
     display_value: Optional[str] = None
+    """Optionally, if we want to render things specially (floating points to 3 decimal points)"""

-    # Detailed description if hover over the cell
     description: Optional[str] = None
+    """Detailed description if hover over the cell"""

-    # If we click on the link for this cell, it takes us somewhere
     href: Optional[str] = None
+    """If we click on the link for this cell, it takes us somewhere"""

-    # Styling
     style: Optional[Dict[str, Any]] = None
+    """Styling"""

-    # If the value or display_value is markdown that needs to be interpreted
     markdown: bool = False
+    """If the value or display_value is markdown that needs to be interpreted"""

-
-
+    run_spec_names: Optional[List[str]] = None
+    """The names of the runs that this cell's value was aggregated from, if the cell contains an aggregate value."""


 @dataclass(frozen=True)
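The Cell changes above turn the field comments into attribute docstrings and add run_spec_names, which the summarizer uses to link an aggregate cell back to the runs it was computed from. A minimal sketch of constructing such a cell (the values and run spec names are made up for illustration):

    from helm.benchmark.presentation.table import Cell

    # An aggregate cell that remembers which runs produced its value.
    cell = Cell(
        value=0.713,
        display_value="0.713",
        description="mean over 2 runs",
        run_spec_names=[
            "mmlu:subject=anatomy,model=example_model",  # hypothetical run spec names
            "mmlu:subject=philosophy,model=example_model",
        ],
    )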
helm/benchmark/presentation/test_contamination.py
CHANGED
@@ -1,9 +1,9 @@
-from helm.benchmark.presentation.schema import read_schema,
+from helm.benchmark.presentation.schema import read_schema, get_default_schema_path
 from helm.benchmark.presentation.contamination import read_contamination, validate_contamination


 def test_contamination_schema():
-    schema = read_schema(
+    schema = read_schema(get_default_schema_path())
     contamination = read_contamination()
     validate_contamination(contamination, schema)

helm/benchmark/presentation/test_run_entry.py
CHANGED
@@ -3,8 +3,7 @@ import pytest

 from helm.common.object_spec import parse_object_spec
 from helm.benchmark.presentation.run_entry import read_run_entries
-from helm.benchmark.
-from helm.benchmark import vlm_run_specs  # noqa
+from helm.benchmark.run_spec_factory import construct_run_specs


 def list_fnames():
helm/benchmark/presentation/test_summarize.py
CHANGED
@@ -2,7 +2,7 @@ import os
 import tempfile

 from helm.benchmark.presentation.summarize import Summarizer
-from helm.benchmark.presentation.schema import
+from helm.benchmark.presentation.schema import get_default_schema_path
 from helm.common.general import ensure_directory_exists


@@ -13,7 +13,7 @@ def test_summarize_suite():
         release=None,
         suites=None,
         suite="test_suite",
-
+        schema_path=get_default_schema_path(),
         output_path=output_path,
         verbose=False,
         num_threads=4,
@@ -31,7 +31,7 @@ def test_summarize_release():
         release="test_release",
         suites=["test_suite_1", "test_suite_2"],
         suite=None,
-
+        schema_path=get_default_schema_path(),
         output_path=output_path,
         verbose=False,
         num_threads=4,
helm/benchmark/run.py
CHANGED
@@ -1,27 +1,26 @@
 import argparse
 from dataclasses import replace
+import os
 from typing import List, Optional
-
-    register_huggingface_hub_model_from_flag_value,
-    register_huggingface_local_model_from_flag_value,
-)
+

 from helm.benchmark.presentation.run_entry import RunEntry, read_run_entries
+from helm.common.cache_backend_config import MongoCacheBackendConfig, SqliteCacheBackendConfig
 from helm.common.general import ensure_directory_exists
 from helm.common.hierarchical_logger import hlog, htrack, htrack_block
 from helm.common.authentication import Authentication
 from helm.common.object_spec import parse_object_spec, get_class_by_name
 from helm.proxy.services.remote_service import create_authentication, add_service_args
+from helm.proxy.services.service import CACHE_DIR

 from helm.benchmark.config_registry import (
     register_configs_from_directory,
     register_builtin_configs_from_helm_package,
 )
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
-from helm.benchmark import
-from .
-from .
-from .run_specs import construct_run_specs
+from helm.benchmark.executor import ExecutionSpec
+from helm.benchmark.runner import Runner, RunSpec, LATEST_SYMLINK, set_benchmark_output_path
+from helm.benchmark.run_spec_factory import construct_run_specs


 def run_entries_to_run_specs(
@@ -85,16 +84,29 @@ def run_benchmarking(
     skip_completed_runs: bool,
     exit_on_error: bool,
     runner_class_name: Optional[str],
-    mongo_uri: str =
+    mongo_uri: Optional[str] = None,
+    disable_cache: Optional[bool] = None,
 ) -> List[RunSpec]:
     """Runs RunSpecs given a list of RunSpec descriptions."""
+    sqlite_cache_backend_config: Optional[SqliteCacheBackendConfig] = None
+    mongo_cache_backend_config: Optional[MongoCacheBackendConfig] = None
+
+    if not disable_cache:
+        if mongo_uri:
+            mongo_cache_backend_config = MongoCacheBackendConfig(mongo_uri)
+        else:
+            sqlite_cache_path = os.path.join(local_path, CACHE_DIR)
+            ensure_directory_exists(sqlite_cache_path)
+            sqlite_cache_backend_config = SqliteCacheBackendConfig(sqlite_cache_path)
+
     execution_spec = ExecutionSpec(
         auth=auth,
         url=url,
         local_path=local_path,
         parallelism=num_threads,
         dry_run=dry_run,
-
+        sqlite_cache_backend_config=sqlite_cache_backend_config,
+        mongo_cache_backend_config=mongo_cache_backend_config,
     )
     with htrack_block("run_specs"):
         for run_spec in run_specs:
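The run_benchmarking() changes above replace ExecutionSpec's single MongoDB URI with explicit cache backend configs and add a disable_cache switch. A sketch of selecting a backend the same way (the URI and cache directory are placeholders; both config classes take the single constructor argument shown in the diff):

    import os
    from typing import Optional, Union
    from helm.common.cache_backend_config import MongoCacheBackendConfig, SqliteCacheBackendConfig

    # Prefer MongoDB when a URI is given; otherwise fall back to a local SQLite cache.
    mongo_uri = ""  # placeholder, e.g. "mongodb://localhost:27017/helm"
    cache_config: Optional[Union[MongoCacheBackendConfig, SqliteCacheBackendConfig]] = None
    if mongo_uri:
        cache_config = MongoCacheBackendConfig(mongo_uri)
    else:
        sqlite_cache_path = os.path.join("prod_env", "cache")  # placeholder local path
        os.makedirs(sqlite_cache_path, exist_ok=True)
        cache_config = SqliteCacheBackendConfig(sqlite_cache_path)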
@@ -160,13 +172,6 @@ def add_run_args(parser: argparse.ArgumentParser):
         help="Name of the suite this run belongs to (default is today's date).",
         required=True,
     )
-    parser.add_argument(
-        "--local",
-        action="store_true",
-        help="DEPRECATED: Does nothing. Do not use. Previously enabled local mode. "
-        "Now does nothing and will be removed in the next released version. "
-        "Local mode is enabled by default, and only disabled if the --server_url flag is set.",
-    )
     parser.add_argument(
         "--local-path",
         type=str,
@@ -179,6 +184,11 @@ def add_run_args(parser: argparse.ArgumentParser):
         help="If non-empty, the URL of the MongoDB database that will be used for caching instead of SQLite",
         default="",
     )
+    parser.add_argument(
+        "--disable-cache",
+        action="store_true",
+        help="If true, the request-response cache for model clients and tokenizers will be disabled.",
+    )


 def validate_args(args):
@@ -227,7 +237,14 @@ def main():
         help="Run RunSpecs with priority less than or equal to this number. "
         "If a value for --priority is not specified, run on everything",
     )
-    parser.add_argument(
+    parser.add_argument(
+        "--run-specs",
+        nargs="*",
+        help="DEPRECATED: Use --run-entries instead. Will be removed in a future release. "
+        "Specifies run entries to run.",
+        default=[],
+    )
+    parser.add_argument("-r", "--run-entries", nargs="*", help="Specifies run entries to run", default=[])
     parser.add_argument(
         "--enable-huggingface-models",
         nargs="+",
@@ -254,14 +271,25 @@ def main():
     register_builtin_configs_from_helm_package()
     register_configs_from_directory(args.local_path)

-
-        register_huggingface_hub_model_from_flag_value
-
-
+    if args.enable_huggingface_models:
+        from helm.benchmark.huggingface_registration import register_huggingface_hub_model_from_flag_value
+
+        for huggingface_model_name in args.enable_huggingface_models:
+            register_huggingface_hub_model_from_flag_value(huggingface_model_name)
+    if args.enable_local_huggingface_models:
+        from helm.benchmark.huggingface_registration import register_huggingface_local_model_from_flag_value
+
+        for huggingface_model_path in args.enable_local_huggingface_models:
+            register_huggingface_local_model_from_flag_value(huggingface_model_path)

     run_entries: List[RunEntry] = []
     if args.conf_paths:
         run_entries.extend(read_run_entries(args.conf_paths).entries)
+    if args.run_entries:
+        run_entries.extend(
+            [RunEntry(description=description, priority=1, groups=None) for description in args.run_entries]
+        )
+    # TODO: Remove this eventually.
     if args.run_specs:
         run_entries.extend(
             [RunEntry(description=description, priority=1, groups=None) for description in args.run_specs]
@@ -306,13 +334,13 @@ def main():
         exit_on_error=args.exit_on_error,
         runner_class_name=args.runner_class_name,
         mongo_uri=args.mongo_uri,
+        disable_cache=args.disable_cache,
     )

-    if args.
+    if args.run_specs:
         hlog(
-            "WARNING: The --
-            "
-            "--server_url flag is set. Please remove --local from your command."
+            "WARNING: The --run-specs flag is deprecated and will be removed in a future release. "
+            "Use --run-entries instead."
         )

     hlog("Done.")