crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
- crfm_helm-0.5.0.dist-info/RECORD +642 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +37 -2
- helm/benchmark/adaptation/adapters/adapter.py +4 -42
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/prompt.py +7 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/cleva_perturbation.py +7 -14
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
- helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +2 -2
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +0 -7
- helm/benchmark/augmentations/perturbation.py +20 -7
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +2 -2
- helm/benchmark/augmentations/test_perturbation.py +11 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +38 -0
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +37 -7
- helm/benchmark/metrics/basic_metrics.py +172 -641
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics.py +4 -3
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +6 -112
- helm/benchmark/metrics/dry_run_metrics.py +5 -3
- helm/benchmark/metrics/efficiency_metrics.py +206 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +5 -5
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +6 -7
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +8 -8
- helm/benchmark/metrics/test_classification_metrics.py +9 -6
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +164 -41
- helm/benchmark/model_metadata_registry.py +181 -35
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +50 -17
- helm/benchmark/presentation/schema.py +28 -46
- helm/benchmark/presentation/summarize.py +213 -96
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +14 -9
- helm/benchmark/presentation/test_summarize.py +5 -0
- helm/benchmark/run.py +66 -54
- helm/benchmark/run_expander.py +342 -31
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +162 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +501 -0
- helm/benchmark/runner.py +116 -69
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/cleva_scenario.py +43 -46
- helm/benchmark/scenarios/code_scenario.py +3 -2
- helm/benchmark/scenarios/commonsense_scenario.py +171 -191
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/legalbench_scenario.py +123 -0
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +3 -3
- helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
- helm/benchmark/scenarios/raft_scenario.py +2 -6
- helm/benchmark/scenarios/scenario.py +14 -2
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +22 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/the_pile_scenario.py +6 -7
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
- helm/benchmark/server.py +59 -2
- helm/benchmark/slurm_jobs.py +12 -0
- helm/benchmark/slurm_runner.py +79 -51
- helm/benchmark/static/benchmarking.js +3 -4
- helm/benchmark/static/contamination.yaml +1 -1
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +824 -0
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vlm.yaml +576 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
- helm/benchmark/static_build/assets/index-d839df55.js +9 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_model_deployment_definition.py +90 -0
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/tokenizer_config_registry.py +10 -14
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -35
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/no_decoding_window_service.py +32 -0
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +24 -269
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +5 -12
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +213 -24
- helm/clients/auto_client.py +215 -0
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +67 -55
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +6 -17
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +7 -8
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +6 -10
- helm/{proxy/clients → clients}/huggingface_client.py +134 -92
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
- helm/{proxy/clients → clients}/megatron_client.py +13 -7
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +302 -0
- helm/{proxy/clients → clients}/palmyra_client.py +15 -12
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +15 -15
- helm/clients/test_client.py +100 -0
- helm/clients/test_huggingface_client.py +70 -0
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +23 -12
- helm/{proxy/clients → clients}/together_client.py +18 -71
- helm/clients/vertexai_client.py +391 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vlm_client.py +104 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +24 -179
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/concurrency.py +32 -0
- helm/common/credentials_utils.py +28 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +29 -10
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +24 -1
- helm/common/key_value_store.py +113 -0
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +88 -0
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +2 -2
- helm/common/request.py +36 -27
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +6 -3
- helm/config/__init__.py +0 -0
- helm/config/model_deployments.yaml +1942 -0
- helm/config/model_metadata.yaml +2201 -0
- helm/config/tokenizer_configs.yaml +362 -0
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +13 -5
- helm/proxy/example_queries.py +29 -17
- helm/proxy/retry.py +8 -2
- helm/proxy/server.py +77 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +103 -20
- helm/proxy/services/service.py +34 -2
- helm/proxy/services/test_remote_service.py +7 -6
- helm/proxy/services/test_service.py +27 -18
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/py.typed +0 -0
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
- helm/tokenizers/auto_tokenizer.py +93 -0
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_anthropic_tokenizer.py +82 -0
- helm/tokenizers/test_huggingface_tokenizer.py +136 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/tokenizers/vertexai_tokenizer.py +97 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.3.0.dist-info/RECORD +0 -396
- helm/benchmark/vlm_run_specs.py +0 -71
- helm/benchmark/window_services/anthropic_window_service.py +0 -68
- helm/benchmark/window_services/bloom_window_service.py +0 -35
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/gptj_window_service.py +0 -38
- helm/benchmark/window_services/gptneox_window_service.py +0 -41
- helm/benchmark/window_services/http_model_window_service.py +0 -28
- helm/benchmark/window_services/huggingface_window_service.py +0 -59
- helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
- helm/benchmark/window_services/llama_window_service.py +0 -28
- helm/benchmark/window_services/luminous_window_service.py +0 -67
- helm/benchmark/window_services/megatron_window_service.py +0 -10
- helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
- helm/benchmark/window_services/openai_window_service.py +0 -13
- helm/benchmark/window_services/opt_window_service.py +0 -35
- helm/benchmark/window_services/palmyra_window_service.py +0 -45
- helm/benchmark/window_services/remote_window_service.py +0 -48
- helm/benchmark/window_services/santacoder_window_service.py +0 -27
- helm/benchmark/window_services/starcoder_window_service.py +0 -27
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/benchmark/window_services/wider_openai_window_service.py +0 -52
- helm/proxy/clients/aleph_alpha_client.py +0 -99
- helm/proxy/clients/auto_client.py +0 -461
- helm/proxy/clients/goose_ai_client.py +0 -100
- helm/proxy/clients/microsoft_client.py +0 -182
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/remote_model_registry.py +0 -28
- helm/proxy/clients/simple_client.py +0 -61
- helm/proxy/clients/test_anthropic_client.py +0 -63
- helm/proxy/clients/test_client.py +0 -31
- helm/proxy/clients/test_huggingface_client.py +0 -87
- helm/proxy/models.py +0 -963
- helm/proxy/test_models.py +0 -27
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
- helm/proxy/token_counters/test_openai_token_counter.py +0 -79
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
helm/benchmark/run_expander.py
CHANGED

@@ -3,7 +3,7 @@ from abc import ABC, abstractmethod
 from dataclasses import replace
 from typing import Any, List, Dict, Optional, Tuple, Type
 
-from helm.proxy.models import (
+from helm.benchmark.model_metadata_registry import (
     get_all_instruction_following_models,
     get_all_code_models,
     get_all_models,
@@ -11,20 +11,17 @@ from helm.proxy.models import (
     get_model_names_with_tag,
     FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
     LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
-    GPT2_TOKENIZER_TAG,
-    AI21_TOKENIZER_TAG,
-    COHERE_TOKENIZER_TAG,
-    OPT_TOKENIZER_TAG,
-    GPTJ_TOKENIZER_TAG,
-    GPTNEO_TOKENIZER_TAG,
-    GPT4_TOKENIZER_TAG,
     ABLATION_MODEL_TAG,
+    TEXT_TO_IMAGE_MODEL_TAG,
     VISION_LANGUAGE_MODEL_TAG,
 )
-from .
-from helm.benchmark.
+from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
+from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer
+from .run_spec import RunSpec
+from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT, AdapterSpec, Substitution
 from .augmentations.perturbation import PerturbationSpec
 from .augmentations.data_augmenter import DataAugmenterSpec
+from helm.benchmark.scenarios.scenario import TEST_SPLIT, VALID_SPLIT
 
 
 class RunExpander(ABC):
@@ -227,12 +224,16 @@ class AddToStopRunExpander(RunExpander):
         self.value = value
 
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if self.value == "newline":
+            stop_sequence = "\n"
+        else:
+            stop_sequence = self.value
         return [
             replace(
                 run_spec,
                 name=run_spec.name,
                 adapter_spec=replace(
-                    run_spec.adapter_spec, stop_sequences=run_spec.adapter_spec.stop_sequences + [self.value]
+                    run_spec.adapter_spec, stop_sequences=run_spec.adapter_spec.stop_sequences + [stop_sequence]
                 ),
             ),
         ]
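This change lets run entries request a newline stop sequence by name, since a literal "\n" is awkward to pass through configuration. A minimal sketch of the resulting behavior (`run_spec` stands for any existing RunSpec; "newline" is the only value treated specially):

    # Sketch: "newline" is mapped to an actual "\n" stop sequence;
    # any other value is appended verbatim.
    expander = AddToStopRunExpander("newline")
    (expanded,) = expander.expand(run_spec)  # run_spec: an existing RunSpec
    assert expanded.adapter_spec.stop_sequences[-1] == "\n"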
@@ -261,6 +262,238 @@ class GlobalPrefixRunExpander(RunExpander):
         ]
 
 
+# Instruction-following models like GPT-4, Claude, PaLM 2 don't do in-context
+# learning naturally like base models, and they prefer to respond in a wordy
+# way as an assistant. Therefore, for these models, we must provide explicit
+# instructions to follow the format of the in-context examples.
+IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX = (
+    "Here are some input-output examples. "
+    + "Read the examples carefully to figure out the mapping. "
+    + "The output of the last example is not given, "
+    + "and your job is to figure out what it is."
+)
+
+IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX = (
+    "Please provide the output to this last example. " + "It is critical to follow the format of the preceding outputs!"
+)
+
+
+class AnthropicClaude2RunExpander(RunExpander):
+    """
+    Custom prompt for Anthropic Claude 1 and Claude 2 models.
+    These models need more explicit instructions about following the format.
+    """
+
+    name = "anthropic"
+
+    # These strings must be added to the prompt in order to pass prompt validation,
+    # otherwise the Anthropic API will return an error.
+    # See: https://docs.anthropic.com/claude/reference/prompt-validation
+    HUMAN_PROMPT = "\n\nHuman:"
+    AI_PROMPT = "\n\nAssistant:"
+
+    def __init__(self):
+        pass
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(
+                    run_spec.adapter_spec,
+                    global_prefix=AnthropicClaude2RunExpander.HUMAN_PROMPT
+                    + " "
+                    + IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX
+                    + "\n\n",
+                    global_suffix="\n\n"
+                    + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
+                    + AnthropicClaude2RunExpander.AI_PROMPT
+                    + " "
+                    + run_spec.adapter_spec.output_prefix.strip(),
+                ),
+            ),
+        ]
+
+
+class AnthropicClaude3RunExpander(RunExpander):
+    """Custom prompts for Anthropic Claude 3 models."""
+
+    name = "claude_3"
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
+            instructions = "Answer with only a single letter."
+            if run_spec.adapter_spec.instructions:
+                instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
+            return [
+                replace(
+                    run_spec,
+                    adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
+                ),
+            ]
+        return [run_spec]
+
+
+class OpenAIRunExpander(RunExpander):
+    """
+    Custom prompt for OpenAI models.
+    These models need more explicit instructions about following the format.
+    """
+
+    # TODO: Refactor out common logic between this and GoogleRunExpander and MistralRunExpander.
+
+    name = "openai"
+
+    def __init__(self):
+        pass
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if run_spec.adapter_spec.method != ADAPT_GENERATION:
+            return [run_spec]
+
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(
+                    run_spec.adapter_spec,
+                    global_prefix=IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
+                    global_suffix="\n\n"
+                    + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
+                    + "\n"
+                    + run_spec.adapter_spec.output_prefix.strip(),
+                ),
+            ),
+        ]
+
+
+class GoogleRunExpander(RunExpander):
+    """
+    Custom prompt for Google models.
+    These models need more explicit instructions about following the format.
+    """
+
+    # TODO: Refactor out common logic between this and OpenAIRunExpander and MistralRunExpander.
+
+    name = "google"
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if run_spec.adapter_spec.method != ADAPT_GENERATION:
+            return [run_spec]
+
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(
+                    run_spec.adapter_spec,
+                    global_prefix=IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
+                    global_suffix="\n\n"
+                    + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
+                    + "\n"
+                    + run_spec.adapter_spec.output_prefix.strip(),
+                ),
+            ),
+        ]
+
+
+class MistralRunExpander(RunExpander):
+    """Custom prompt for Mistral models."""
+
+    # TODO: Refactor out common logic between this and GoogleRunExpander and OpenAIRunExpander.
+
+    name = "output_format_instructions"
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if run_spec.adapter_spec.method != ADAPT_GENERATION:
+            return [run_spec]
+
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(
+                    run_spec.adapter_spec,
+                    global_prefix=IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
+                    global_suffix="\n\n"
+                    + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
+                    + "\n"
+                    + run_spec.adapter_spec.output_prefix.strip(),
+                ),
+            ),
+        ]
+
+
+class IDEFICSInstructRunExpander(RunExpander):
+    """
+    Custom prompt for IDEFICS instruct models which require a specific format.
+    See https://huggingface.co/HuggingFaceM4/idefics-80b-instruct for more information.
+    """
+
+    name = "idefics_instruct"
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(
+                    run_spec.adapter_spec,
+                    input_prefix="User: ",
+                    input_suffix="<end_of_utterance>",
+                    output_prefix="\nAssistant: ",
+                    output_suffix="<end_of_utterance>",
+                    stop_sequences=["<end_of_utterance>"],
+                ),
+            ),
+        ]
+
+
+class LlavaRunExpander(RunExpander):
+    """
+    Custom prompt for Llava 1.5 models which should use a specific format.
+    See https://colab.research.google.com/drive/1qsl6cd2c8gGtEW1xV5io7S8NHh-Cp1TV?usp=sharing for more information.
+    """
+
+    name = "llava"
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(
+                    run_spec.adapter_spec,
+                    input_prefix="USER: <image>",
+                    input_suffix="",
+                    output_prefix="\nASSISTANT: ",
+                    output_suffix="",
+                ),
+            ),
+        ]
+
+
+class OpenFlamingoRunExpander(RunExpander):
+    """
+    Custom prompt for OpenFlamingo following: https://huggingface.co/openflamingo/OpenFlamingo-9B-vitl-mpt7b
+    """
+
+    name = "open_flamingo"
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(
+                    run_spec.adapter_spec,
+                    input_prefix=f"<|endofchunk|>{run_spec.adapter_spec.input_prefix}",
+                ),
+            ),
+        ]
+
+
 class FormatPromptRunExpander(RunExpander):
     """Adds a prefix and suffix to the prompt."""
 
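To make the new expanders concrete, this is the framing that `AnthropicClaude2RunExpander` wraps around the usual in-context prompt; a sketch assuming an adapter spec whose `output_prefix` is "Answer:" (that value is illustrative):

    # The adapted prompt is wrapped as:
    #   "\n\nHuman: <ICL prefix instructions>\n\n"
    #   ... few-shot examples and the eval instance ...
    #   "\n\n<ICL suffix instructions>\n\nAssistant: Answer:"
    global_prefix = "\n\nHuman: " + IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n"
    global_suffix = "\n\n" + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX + "\n\nAssistant: " + "Answer:"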
@@ -277,7 +510,7 @@ class FormatPromptRunExpander(RunExpander):
                 name=run_spec.name,
                 adapter_spec=replace(
                     run_spec.adapter_spec,
-
+                    input_prefix=self.prefix,
                     output_prefix=self.suffix,
                 ),
             ),
@@ -306,6 +539,7 @@ class MaxTrainInstancesRunExpander(ReplaceValueRunExpander):
         "one": [1],
         "all": [0, 1, 2, 4, 8, 16],  # Cap at 16 due to limited context length
         "big_bench_few_shot_setting": [0, 1, 2, 3],  # Commonly used few-shot setting in BIG-bench
+        "heim_human_eval": [0, 1, 2, 4, 8],
     }
 
 
@@ -313,7 +547,12 @@ class MaxEvalInstancesRunExpander(ReplaceValueRunExpander):
     """For overriding the number of eval instances at the run level."""
 
     name = "max_eval_instances"
-    values_dict: Dict[str, List[Any]] = {
+    values_dict: Dict[str, List[Any]] = {
+        "default": [1_000],
+        "heim_default": [100],
+        "heim_fid": [30_000],
+        "heim_art_styles": [17],
+    }
 
 
 class NumOutputsRunExpander(ReplaceValueRunExpander):
@@ -326,6 +565,15 @@ class NumOutputsRunExpander(ReplaceValueRunExpander):
     }
 
 
+class NumTrialRunExpander(ReplaceValueRunExpander):
+    """For getting different generations for the same requests."""
+
+    name = "num_trials"
+    values_dict = {
+        "heim_efficiency": [5],
+    }
+
+
 class ModelRunExpander(ReplaceValueRunExpander):
     """
     For specifying different models.
@@ -355,10 +603,6 @@ class ModelRunExpander(ReplaceValueRunExpander):
             "code": get_all_code_models(),
             "instruction_following": get_all_instruction_following_models(),
             "limited_functionality_text": get_model_names_with_tag(LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG),
-            "gpt2_tokenizer": get_model_names_with_tag(GPT2_TOKENIZER_TAG),
-            "ai21_tokenizer": get_model_names_with_tag(AI21_TOKENIZER_TAG),
-            "cohere_tokenizer": get_model_names_with_tag(COHERE_TOKENIZER_TAG),
-            "opt_tokenizer": get_model_names_with_tag(OPT_TOKENIZER_TAG),
             "summarization_zs": ["openai/davinci", "openai/curie", "openai/text-davinci-002", "openai/text-curie-001"],
             "biomedical": ["openai/text-davinci-003"],  # TODO: add https://huggingface.co/stanford-crfm/BioMedLM
             "interactive_qa": ["openai/text-davinci-001", "openai/davinci", "ai21/j1-jumbo", "openai/text-babbage-001"],
@@ -371,6 +615,7 @@ class ModelRunExpander(ReplaceValueRunExpander):
                 "openai/text-davinci-003",
             ],
             "opinions_qa_ai21": ["ai21/j1-grande", "ai21/j1-jumbo", "ai21/j1-grande-v2-beta"],
+            "text_to_image": get_model_names_with_tag(TEXT_TO_IMAGE_MODEL_TAG),
             "vlm": get_model_names_with_tag(VISION_LANGUAGE_MODEL_TAG),
         }
 
@@ -388,6 +633,40 @@ class ModelRunExpander(ReplaceValueRunExpander):
         return values_dict
 
 
+class ModelDeploymentRunExpander(ReplaceValueRunExpander):
+    """For overriding model deployment"""
+
+    name = "model_deployment"
+    values_dict: Dict[str, List[Any]] = {}
+
+
+class EvalSplitRunExpander(RunExpander):
+    """Sets the evaluation split.
+
+    By default, evaluation instances are drawn from both test and validation splits.
+    This run expander allows drawing evaluation instances from only the test split or
+    only the validation split."""
+
+    # NOTE: This does not subclass `ReplaceValueRunExpander` because we want the
+    # run expander name to be "eval_split", not "eval_splits".
+
+    name = "eval_split"
+
+    def __init__(self, value):
+        if value != TEST_SPLIT and value != VALID_SPLIT:
+            raise ValueError(f'Split must be "{TEST_SPLIT}" or "{VALID_SPLIT}", but got "{value}"')
+        self.split = value
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        return [
+            replace(
+                run_spec,
+                name=f"{run_spec.name}{',' if ':' in run_spec.name else ':'}eval_split={self.split}",
+                adapter_spec=replace(run_spec.adapter_spec, eval_splits=[self.split]),
+            )
+        ]
+
+
 ############################################################
 
 
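`EvalSplitRunExpander` is the only new expander here with its own name mangling; a sketch of its effect, assuming `TEST_SPLIT == "test"` (its value in `helm.benchmark.scenarios.scenario`):

    expander = EvalSplitRunExpander("test")  # raises ValueError for anything but test/valid
    (expanded,) = expander.expand(run_spec)  # run_spec: an existing RunSpec
    # The run name gains an "eval_split=test" suffix and evaluation instances
    # are drawn from the test split only.
    assert expanded.adapter_spec.eval_splits == ["test"]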
@@ -576,6 +855,20 @@ def mandarin_to_cantonese() -> PerturbationSpec:
     )
 
 
+def translate(language_code: str) -> PerturbationSpec:
+    return PerturbationSpec(
+        class_name="helm.benchmark.augmentations.translate_perturbation.TranslatePerturbation",
+        args={"language_code": language_code},
+    )
+
+
+def suffix(text: str) -> PerturbationSpec:
+    return PerturbationSpec(
+        class_name="helm.benchmark.augmentations.suffix_perturbation.SuffixPerturbation",
+        args={"suffix": text},
+    )
+
+
 # Specifies the data augmentations that we're interested in trying out.
 # Concretely, this is a mapping from the name (which is specified in a conf
 # file or the CLI) to a list of options to try, where each option is a list of perturbations.
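Like the other factories in this file, `translate` and `suffix` only build declarative specs; the named class is imported and instantiated at run time. A hand-rolled sketch of that resolution (HELM's own helper for this pattern lives in `helm.common.object_spec`):

    import importlib

    def instantiate(class_name: str, args: dict):
        # "pkg.module.ClassName" -> import pkg.module, then call ClassName(**args)
        module_name, cls_name = class_name.rsplit(".", 1)
        cls = getattr(importlib.import_module(module_name), cls_name)
        return cls(**args)

    spec = translate(language_code="hi")
    perturbation = instantiate(spec.class_name, dict(spec.args))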
@@ -767,6 +1060,21 @@ PERTURBATION_SPECS_DICT: Dict[str, Dict[str, List[PerturbationSpec]]] = {
             mandarin_to_cantonese(),
         ]
     },
+    # Multilinguality
+    "chinese": {"chinese": [translate(language_code="zh-CN")]},
+    "hindi": {"hindi": [translate(language_code="hi")]},
+    "spanish": {"spanish": [translate(language_code="es")]},
+    # Styles
+    "art": {
+        "art": [
+            suffix("oil painting"),
+            suffix("watercolor"),
+            suffix("pencil sketch"),
+            suffix("animation"),
+            suffix("vector graphics"),
+            suffix("pixel art"),
+        ]
+    },
 }
 
 
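The new dictionary keys are selected through `DataAugmentationRunExpander` just like the existing ones; for example (illustrative lookup):

    # The "art" option fans one prompt out into six stylistic variants.
    art_specs = PERTURBATION_SPECS_DICT["art"]["art"]
    assert art_specs[0].args == {"suffix": "oil painting"}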
@@ -880,18 +1188,18 @@ class TokenizerRunExpander(ScenarioSpecRunExpander):
         "huggingface/santacoder": ["bigcode/santacoder"],
         "huggingface/starcoder": ["bigcode/starcoder"],
     }
-
-
-
-
-
-
-
-
+    list_tokenizers = [
+        "huggingface/gpt2",
+        "ai21/j1",
+        "cohere/cohere",
+        "meta/opt",
+        "eleutherai/gptj",
+        "openai/cl100k_base",
+        "eleutherai/gptneox",
     ]
-    for
-    for model in
-    model_to_tokenizer_mapping[model] = [
+    for tokenizer_name in list_tokenizers:
+        for model in get_model_names_with_tokenizer(tokenizer_name):
+            model_to_tokenizer_mapping[model] = [tokenizer_name]
     # tokenizer=default will map to using the right tokenizer for a given model.
     values_dict = {"default": model_to_tokenizer_mapping}
 
@@ -907,10 +1215,10 @@ class TokenizerRunExpander(ScenarioSpecRunExpander):
             self.all_values = [value]
 
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
-        # Find right tokenizer given model.
+        # Find right tokenizer given model deployment name.
         if isinstance(self.all_values, dict):
-
-            self.values = self.all_values[
+            deployment: str = run_spec.adapter_spec.model_deployment
+            self.values = self.all_values[deployment] if deployment in self.all_values else []
         else:
             self.values = self.all_values
         return super().expand(run_spec)
@@ -1113,12 +1421,15 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     MaxTrainInstancesRunExpander,
     MaxEvalInstancesRunExpander,
     NumOutputsRunExpander,
+    NumTrialRunExpander,
     ModelRunExpander,
+    ModelDeploymentRunExpander,
     DataAugmentationRunExpander,
     TokenizerRunExpander,
     NumPromptTokensRunExpander,
     NumOutputTokensRunExpander,
     ChatMLRunExpander,
+    EvalSplitRunExpander,
 ]
 
 
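`RUN_EXPANDER_SUBCLASSES` is what makes run-entry keys such as `num_trials=heim_efficiency` or `eval_split=test` resolvable; a sketch of the name-to-class lookup that HELM builds from this list:

    RUN_EXPANDERS = dict((expander.name, expander) for expander in RUN_EXPANDER_SUBCLASSES)

    def construct_run_expander(key: str, value: str) -> RunExpander:
        # e.g. construct_run_expander("eval_split", "test") -> EvalSplitRunExpander("test")
        return RUN_EXPANDERS[key](value)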
helm/benchmark/run_spec.py
ADDED

@@ -0,0 +1,93 @@
+from dataclasses import dataclass, field
+import importlib
+import os
+import pkgutil
+from typing import Callable, Dict, Iterable, List, Optional, TypeVar
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.augmentations.data_augmenter import DataAugmenterSpec
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+
+
+@dataclass(frozen=True)
+class RunSpec:
+    """
+    Specifies how to do a single run, which gets a scenario, adapts it, and
+    computes a list of stats based on the defined metrics.
+    """
+
+    name: str
+    """Unique identifier of the RunSpec"""
+
+    scenario_spec: ScenarioSpec
+    """Which scenario"""
+
+    adapter_spec: AdapterSpec
+    """Specifies how to adapt an instance into a set of requests"""
+
+    metric_specs: List[MetricSpec]
+    """What to evaluate on"""
+
+    data_augmenter_spec: DataAugmenterSpec = DataAugmenterSpec()
+    """Data augmenter. The default `DataAugmenterSpec` does nothing."""
+
+    groups: List[str] = field(default_factory=list)
+    """Groups that this run spec belongs to (for aggregation)"""
+
+    annotators: Optional[List[AnnotatorSpec]] = None
+    """Annotators to use for this run spec"""
+
+    def __post_init__(self):
+        """
+        `self.name` is used as the name of the output folder for the `RunSpec`.
+        Clean up `self.name` by replacing any "/"'s with "_".
+        """
+        # TODO: Don't mutate name! clean this up before passing it into the constructor here
+        object.__setattr__(self, "name", self.name.replace(os.path.sep, "_"))
+
+
+RunSpecFunction = Callable[..., RunSpec]
+
+
+_REGISTERED_RUN_SPEC_FUNCTIONS: Dict[str, RunSpecFunction] = {}
+"""Dict of run spec function names to run spec functions."""
+
+
+F = TypeVar("F", bound=RunSpecFunction)
+
+
+def run_spec_function(name: str) -> Callable[[F], F]:
+    """Register the run spec function under the given name."""
+
+    def wrap(func: F) -> F:
+        if name in _REGISTERED_RUN_SPEC_FUNCTIONS:
+            raise ValueError(f"A run spec function with name {name} already exists")
+        _REGISTERED_RUN_SPEC_FUNCTIONS[name] = func
+        return func
+
+    return wrap
+
+
+# Copied from https://docs.python.org/3/library/pkgutil.html#pkgutil.iter_modules
+def _iter_namespace(ns_pkg) -> Iterable[pkgutil.ModuleInfo]:
+    # Specifying the second argument (prefix) to iter_modules makes the
+    # returned name an absolute name instead of a relative one. This allows
+    # import_module to work without having to do additional modification to
+    # the name.
+    return pkgutil.iter_modules(ns_pkg.__path__, ns_pkg.__name__ + ".")
+
+
+def discover_run_spec_functions() -> None:
+    """Discover and register all run spec functions under helm.benchmark.run_specs"""
+    import helm.benchmark.run_specs  # noqa
+
+    for finder, name, ispkg in _iter_namespace(helm.benchmark.run_specs):
+        importlib.import_module(name)
+
+
+def get_run_spec_function(name: str) -> Optional[RunSpecFunction]:
+    """Return the run spec function registered under the given name."""
+    discover_run_spec_functions()
+    return _REGISTERED_RUN_SPEC_FUNCTIONS.get(name)