crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +134 -31
- crfm_helm-0.5.0.dist-info/RECORD +642 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +31 -3
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/gender_perturbation.py +1 -1
- helm/benchmark/augmentations/perturbation.py +8 -2
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/test_perturbation.py +11 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +20 -7
- helm/benchmark/metrics/basic_metrics.py +169 -664
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -110
- helm/benchmark/metrics/dry_run_metrics.py +2 -2
- helm/benchmark/metrics/efficiency_metrics.py +206 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +2 -2
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +2 -2
- helm/benchmark/metrics/test_classification_metrics.py +8 -5
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +74 -0
- helm/benchmark/model_metadata_registry.py +36 -0
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +26 -10
- helm/benchmark/presentation/schema.py +15 -40
- helm/benchmark/presentation/summarize.py +119 -79
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +1 -2
- helm/benchmark/presentation/test_summarize.py +3 -3
- helm/benchmark/run.py +54 -26
- helm/benchmark/run_expander.py +214 -16
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +162 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/classic_run_specs.py +1510 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +501 -0
- helm/benchmark/runner.py +51 -57
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +1 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +1 -1
- helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
- helm/benchmark/scenarios/scenario.py +4 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +6 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -2
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
- helm/benchmark/server.py +24 -1
- helm/benchmark/slurm_runner.py +70 -49
- helm/benchmark/static/benchmarking.js +1 -1
- helm/benchmark/static/schema_classic.yaml +258 -1066
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +2 -227
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vlm.yaml +576 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
- helm/benchmark/static_build/assets/index-d839df55.js +9 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_model_deployment_definition.py +14 -16
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -44
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +3 -9
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +203 -18
- helm/{proxy/clients → clients}/auto_client.py +59 -31
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +65 -7
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +4 -11
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +5 -5
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +5 -7
- helm/{proxy/clients → clients}/huggingface_client.py +43 -64
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
- helm/{proxy/clients → clients}/megatron_client.py +5 -5
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +302 -0
- helm/{proxy/clients → clients}/palmyra_client.py +6 -8
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +100 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +20 -8
- helm/{proxy/clients → clients}/together_client.py +12 -72
- helm/clients/vertexai_client.py +391 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vlm_client.py +104 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +53 -48
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +16 -4
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +1 -1
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +24 -1
- helm/common/key_value_store.py +35 -4
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/request.py +15 -17
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +1 -1
- helm/config/model_deployments.yaml +1069 -546
- helm/config/model_metadata.yaml +753 -31
- helm/config/tokenizer_configs.yaml +142 -43
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +8 -6
- helm/proxy/example_queries.py +29 -17
- helm/proxy/server.py +70 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +96 -16
- helm/proxy/services/service.py +30 -0
- helm/proxy/services/test_remote_service.py +4 -3
- helm/proxy/services/test_service.py +0 -12
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
helm/benchmark/run_specs/vlm_run_specs.py
ADDED
@@ -0,0 +1,501 @@
```python
"""Run spec functions for evaluating Vision-Language Models."""

from typing import List, Optional, Dict

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.adapters.adapter_factory import (
    ADAPT_GENERATION_MULTIMODAL,
    ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
)
from helm.benchmark.metrics.common_metric_specs import (
    get_basic_reference_metric_specs,
    get_exact_match_metric_specs,
    get_open_ended_generation_metric_specs,
)
from helm.benchmark.metrics.metric import MetricSpec
from helm.benchmark.run_spec import RunSpec, run_spec_function
from helm.benchmark.scenarios.scenario import ScenarioSpec
from helm.benchmark.annotation.annotator import AnnotatorSpec


############################################################
# Prototypical adapter specs for VLM evaluation


def get_generation_adapter_spec(
    instructions: str = "",
    input_prefix: str = "",
    input_suffix: str = "",
    output_prefix: str = "",
    output_suffix: str = "",
    max_tokens: int = 100,
    stop_sequences: Optional[List[str]] = None,
) -> AdapterSpec:
    return AdapterSpec(
        method=ADAPT_GENERATION_MULTIMODAL,
        global_prefix="",
        instructions=instructions,
        input_prefix=input_prefix,
        input_suffix=input_suffix,
        output_prefix=output_prefix,
        output_suffix=output_suffix,
        instance_prefix="\n",
        # We focus on zero-shot evaluation for now as most open VLMs only support a single image input
        max_train_instances=0,
        num_outputs=1,
        max_tokens=max_tokens,
        stop_sequences=stop_sequences if stop_sequences is not None else [],
        random=None,
    )


def get_short_answer_generation_adapter_spec():
    return get_generation_adapter_spec(
        instructions="Just give a short answer without answering in a complete sentence.",
        max_tokens=20,
    )


def get_multiple_choice_joint_adapter_spec(
    input_noun: Optional[str],
    output_noun: str,
    max_train_instances: int = 0,
    num_outputs: int = 1,
) -> AdapterSpec:
    return AdapterSpec(
        method=ADAPT_MULTIPLE_CHOICE_JOINT_MULTIMODAL,
        global_prefix="",
        instructions="Answer the multiple choice question by just giving the letter of the correct answer.",
        input_prefix=f"{input_noun}: " if input_noun is not None else "",
        input_suffix="\n",
        output_prefix=f"{output_noun}: ",
        output_suffix="\n",
        instance_prefix="\n",
        max_train_instances=max_train_instances,
        num_outputs=num_outputs,
        max_tokens=1,
        stop_sequences=["\n"],
        temperature=0.0,
        random=None,
    )


############################################################
# VHELM metric specs


def get_image2structure_metric_specs(
    generation_type: str,
    metric_names: Optional[List[str]] = None,
    args: Optional[Dict] = None,
    include_edit_similarity: bool = True,
    size_handling_method: str = "resize",
) -> List[MetricSpec]:
    from helm.benchmark.metrics.vision_language.image_metrics import AnnotatedImageMetrics

    if metric_names is None:
        metric_names = [
            AnnotatedImageMetrics.PIXEL_SIMILARITY,
            AnnotatedImageMetrics.FID_SIMILARITY,
            AnnotatedImageMetrics.EARTH_MOVER_SIMILARITY,
        ]
        if include_edit_similarity:
            metric_names.append(AnnotatedImageMetrics.EDIT_SIMILARITY)
    if args is None:
        args = {}
    metric_specs = [
        MetricSpec(
            class_name="helm.benchmark.metrics.vision_language.image_metrics.AnnotatedImageMetrics",
            args={
                "generation_type": generation_type,
                "metric_names": metric_names,
                "size_handling_method": size_handling_method,
                **args,
            },
        ),
    ]
    return metric_specs + get_basic_reference_metric_specs()


############################################################
# VHELM run specs


@run_spec_function("chart2csv")
def get_chart2csv_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.vision_language.image2structure.chart2csv_scenario.Chart2CSVScenario",
        args={},
    )
    adapter_spec: AdapterSpec = get_generation_adapter_spec(
        instructions="Generate the CSV for the chart. Some of the labels may be missing due to the size of the chart. "
        "Please infer the missing labels based on the surrounding context. "
        "Just give the CSV without any explanation.",
        max_tokens=1000,
    )
    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()

    run_spec_name: str = "chart2csv"
    return RunSpec(
        name=run_spec_name,
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=metric_specs,
        groups=[run_spec_name],
    )


@run_spec_function("hateful_memes")
def get_hateful_memes_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.vision_language.hateful_memes_scenario.HatefulMemesScenario", args={}
    )
    adapter_spec: AdapterSpec = get_generation_adapter_spec(
        instructions="Answer Yes or No without an explanation.",
        max_tokens=3,
    )
    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()

    run_spec_name: str = "hateful_memes"
    return RunSpec(
        name=run_spec_name,
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=metric_specs,
        groups=[run_spec_name],
    )


@run_spec_function("viz_wiz")
def get_viz_wiz_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.vision_language.viz_wiz_scenario.VizWizScenario", args={}
    )
    adapter_spec: AdapterSpec = get_short_answer_generation_adapter_spec()
    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()

    run_spec_name: str = "viz_wiz"
    return RunSpec(
        name=run_spec_name,
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=metric_specs,
        groups=[run_spec_name],
    )


@run_spec_function("vqa")
def get_vqa_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.vision_language.vqa_scenario.VQAScenario", args={}
    )
    adapter_spec: AdapterSpec = get_short_answer_generation_adapter_spec()
    metric_specs: List[MetricSpec] = get_exact_match_metric_specs() + get_open_ended_generation_metric_specs()

    run_spec_name: str = "vqa"
    return RunSpec(
        name=run_spec_name,
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=metric_specs,
        groups=[run_spec_name],
    )


@run_spec_function("image2latex")
def get_image2latex_spec(subset: str, recompile_prompt: bool = False, args: Optional[Dict] = None) -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.vision_language.image2structure.latex_scenario.LatexScenario",
        args={"subset": subset, "recompile_prompt": recompile_prompt},
    )
    adapter_spec: AdapterSpec = get_generation_adapter_spec(
        instructions="Just give a short answer without answering in a complete sentence.",
        max_tokens=2000,
    )
    metric_specs: List[MetricSpec] = get_image2structure_metric_specs(
        generation_type="latex",
        args=args,
        include_edit_similarity=True,
        size_handling_method="padding",
    )
    annotator_specs: List[AnnotatorSpec] = [
        AnnotatorSpec(
            class_name="helm.benchmark.annotation.image2structure.latex_compiler_annotator.LatexCompilerAnnotator",
        )
    ]

    run_spec_name: str = "image2latex"
    return RunSpec(
        name=f"{run_spec_name}:subset={subset}",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=metric_specs,
        groups=[run_spec_name],
        annotators=annotator_specs,
    )


@run_spec_function("image2webpage")
def get_image2webpage_spec(subset: str, recompile_prompt: bool = False, args: Optional[Dict] = None) -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.vision_language.image2structure.webpage_scenario.WebpageScenario",
        args={"subset": subset, "recompile_prompt": recompile_prompt},
    )
    adapter_spec: AdapterSpec = get_generation_adapter_spec(
        instructions="Just give a short answer without answering in a complete sentence.",
        max_tokens=2000,
    )
    metric_specs: List[MetricSpec] = get_image2structure_metric_specs(
        generation_type="webpage",
        args=args,
        include_edit_similarity=True,
        size_handling_method="none",
    )
    annotator_specs: List[AnnotatorSpec] = [
        AnnotatorSpec(
            class_name="helm.benchmark.annotation.image2structure.webpage_compiler_annotator.WebpageCompilerAnnotator",
        )
    ]

    run_spec_name: str = "image2webpage"
    return RunSpec(
        name=f"{run_spec_name}:subset={subset}",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=metric_specs,
        groups=[run_spec_name],
        annotators=annotator_specs,
    )


@run_spec_function("image2musicsheet")
def get_image2musicsheet_spec(args: Optional[Dict] = None) -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.vision_language.image2structure.musicsheet_scenario.MusicSheetScenario",
        args={"subset": "music", "recompile_prompt": False},  # There is only one subset for music sheets
    )
    adapter_spec: AdapterSpec = get_generation_adapter_spec(
        instructions="Just give a short answer without answering in a complete sentence.",
        max_tokens=2000,
    )
    metric_specs: List[MetricSpec] = get_image2structure_metric_specs(
        generation_type="lilypond",
        args=args,
        include_edit_similarity=False,  # No ground truth for music sheets
        size_handling_method="padding",
    )
    annotator_specs: List[AnnotatorSpec] = [
        AnnotatorSpec(
            class_name="helm.benchmark.annotation.image2structure.lilypond_compiler_annotator.LilypondCompilerAnnotator",  # noqa: E501
        )
    ]

    run_spec_name: str = "image2musicsheet"
    return RunSpec(
        name=f"{run_spec_name}",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=metric_specs,
        groups=[run_spec_name],
        annotators=annotator_specs,
    )


@run_spec_function("mmmu")
def get_mmmu_spec(subject: str, question_type: str) -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.vision_language.mmmu_scenario.MMMUScenario",
        args={"subject": subject, "question_type": question_type},
    )

    adapter_spec: AdapterSpec
    if question_type == "open":
        adapter_spec = get_short_answer_generation_adapter_spec()
    elif question_type == "multiple-choice":
        adapter_spec = get_multiple_choice_joint_adapter_spec(
            input_noun=None, output_noun="Answer", max_train_instances=0
        )
    else:
        raise ValueError(f"Invalid question type: {question_type}")

    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
    run_spec_name: str = "mmmu"
    return RunSpec(
        name=f"{run_spec_name}:subject={subject},question_type={question_type}",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=metric_specs,
        groups=[run_spec_name],
    )


@run_spec_function("unicorn")
def get_unicorn_spec(subject: str) -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.vision_language.unicorn_scenario.UnicornScenario",
        args={"subject": subject},
    )
    adapter_spec: AdapterSpec = get_generation_adapter_spec(
        instructions="Only give numerical or boolean answer without an explanation."
    )
    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()

    run_spec_name: str = "unicorn"
    return RunSpec(
        name=run_spec_name,
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=metric_specs,
        groups=[run_spec_name],
    )


@run_spec_function("bingo")
def get_bingo_spec(subject: str) -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.vision_language.bingo_scenario.BingoScenario", args={"subject": subject}
    )
    adapter_spec: AdapterSpec = get_short_answer_generation_adapter_spec()
    metric_specs: List[MetricSpec] = get_open_ended_generation_metric_specs()

    run_spec_name: str = "bingo"
    return RunSpec(
        name=run_spec_name,
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=metric_specs,
        groups=[run_spec_name],
    )


@run_spec_function("multipanelvqa")
def get_multipanelvqa_spec(subject: str, question_type: str) -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.vision_language.multipanelvqa_scenario.MultipanelVQAScenario",
        args={"subject": subject, "question_type": question_type},
    )

    adapter_spec: AdapterSpec
    if question_type == "open":
        adapter_spec = get_short_answer_generation_adapter_spec()
    elif question_type == "multiple-choice":
        adapter_spec = get_multiple_choice_joint_adapter_spec(
            input_noun=None, output_noun="Answer", max_train_instances=0
        )
    else:
        raise ValueError(f"Invalid question type: {question_type}")

    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
    run_spec_name: str = "multipanelvqa"
    return RunSpec(
        name=f"{run_spec_name}:subject={subject},question_type={question_type}",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=metric_specs,
        groups=[run_spec_name],
    )


@run_spec_function("pope")
def get_pope_spec() -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.vision_language.pope_scenario.POPEScenario",
    )
    adapter_spec: AdapterSpec = get_multiple_choice_joint_adapter_spec(
        input_noun=None, output_noun="Answer", max_train_instances=0
    )
    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()

    run_spec_name: str = "pope"
    return RunSpec(
        name=run_spec_name,
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=metric_specs,
        groups=[run_spec_name],
    )


@run_spec_function("seed_bench")
def get_seed_bench_spec(subject: str) -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.vision_language.seed_bench_scenario.SEEDBenchScenario",
        args={"subject": subject},
    )
    adapter_spec: AdapterSpec = get_multiple_choice_joint_adapter_spec(
        input_noun=None, output_noun="Answer", max_train_instances=0
    )
    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()

    run_spec_name: str = "seed_bench"
    return RunSpec(
        name=run_spec_name,
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=metric_specs,
        groups=[run_spec_name],
    )


@run_spec_function("mme")
def get_mme_spec(subject: str) -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.vision_language.mme_scenario.MMEScenario",
        args={"subject": subject},
    )
    adapter_spec: AdapterSpec = get_multiple_choice_joint_adapter_spec(
        input_noun=None, output_noun="Answer", max_train_instances=0
    )
    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()

    run_spec_name: str = "mme"
    return RunSpec(
        name=run_spec_name,
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=metric_specs,
        groups=[run_spec_name],
    )


@run_spec_function("heim_human_eval")
def get_heim_human_eval_spec(question_type: str) -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.vision_language.heim_human_eval_scenario.HEIMHumanEvalScenario",
        args={"question_type": question_type},
    )
    adapter_spec: AdapterSpec = get_multiple_choice_joint_adapter_spec(
        input_noun=None,
        output_noun="Answer",
        num_outputs=1,
        max_train_instances=0,
    )
    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()

    run_spec_name: str = "heim_human_eval"
    return RunSpec(
        name=f"{run_spec_name}:question_type={question_type}",
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=metric_specs,
        groups=[run_spec_name],
    )


@run_spec_function("mementos")
def get_mementos_spec(subject: str) -> RunSpec:
    scenario_spec = ScenarioSpec(
        class_name="helm.benchmark.scenarios.vision_language.mementos_scenario.MementosScenario",
        args={"subject": subject},
    )
    adapter_spec: AdapterSpec = get_short_answer_generation_adapter_spec()
    metric_specs: List[MetricSpec] = get_open_ended_generation_metric_specs()

    run_spec_name: str = "mementos"
    return RunSpec(
        name=run_spec_name,
        scenario_spec=scenario_spec,
        adapter_spec=adapter_spec,
        metric_specs=metric_specs,
        groups=[run_spec_name],
    )
```
helm/benchmark/runner.py
CHANGED
```diff
@@ -6,45 +6,45 @@ import traceback
 import typing
 from collections import Counter
 import dataclasses
-from dataclasses import dataclass, field
 from typing import Any, Dict, List
 import numpy as np
 
 from tqdm import tqdm
 
+from helm.benchmark.adaptation.request_state import RequestState
 from helm.common.general import ensure_directory_exists, write, asdict_without_nones
 from helm.common.hierarchical_logger import hlog, htrack_block
 from helm.common.cache import cache_stats
-from .
-from .scenarios.scenario import (
+from helm.benchmark.scenarios.scenario import (
     EVAL_SPLITS,
     TRAIN_SPLIT,
     Scenario,
-    ScenarioSpec,
     create_scenario,
     Instance,
     get_scenario_cache_path,
     with_instance_ids,
 )
-from .adaptation.adapters.adapter import Adapter
-from .adaptation.adapters.adapter_factory import AdapterFactory
-from .adaptation.scenario_state import ScenarioState
-from .
-from .data_preprocessor import DataPreprocessor
-from .executor import ExecutionSpec, Executor
-from .
-from .metrics.
-from .metrics.
-from .metrics.
-from .
+from helm.benchmark.adaptation.adapters.adapter import Adapter
+from helm.benchmark.adaptation.adapters.adapter_factory import AdapterFactory
+from helm.benchmark.adaptation.scenario_state import ScenarioState
+from helm.benchmark.run_spec import RunSpec
+from helm.benchmark.data_preprocessor import DataPreprocessor
+from helm.benchmark.executor import ExecutionSpec, Executor
+from helm.benchmark.annotation_executor import AnnotationExecutionSpec, AnnotationExecutor
+from helm.benchmark.metrics.dry_run_metrics import DryRunMetric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats, create_metric, Stat
+from helm.benchmark.window_services.tokenizer_service import TokenizerService
 
 
 LATEST_SYMLINK: str = "latest"
 _BENCHMARK_OUTPUT_PATH: str = "benchmark_output"
+_CACHED_MODELS_FOLDER: str = "models"
 
 
 def get_benchmark_output_path() -> str:
-    """Get the
+    """Get the benchmark output path.
 
     Many run spec functions need to know the benchmark output path,
     but there is no way to pass it via the run spec function,
@@ -52,8 +52,15 @@ def get_benchmark_output_path() -> str:
     return _BENCHMARK_OUTPUT_PATH
 
 
+def get_cached_models_path() -> str:
+    """Get the cached models path within the benchmark output path."""
+    path: str = os.path.join(get_benchmark_output_path(), _CACHED_MODELS_FOLDER)
+    ensure_directory_exists(path)
+    return path
+
+
 def set_benchmark_output_path(benchmark_output_path: str) -> None:
-    """Set the
+    """Set the benchmark output path."""
     global _BENCHMARK_OUTPUT_PATH
     _BENCHMARK_OUTPUT_PATH = benchmark_output_path
 
@@ -64,40 +71,6 @@ class RunnerError(Exception):
     pass
 
 
-@dataclass(frozen=True)
-class RunSpec:
-    """
-    Specifies how to do a single run, which gets a scenario, adapts it, and
-    computes a list of stats based on the defined metrics.
-    """
-
-    # Unique identifier of the RunSpec
-    name: str
-
-    # Which scenario
-    scenario_spec: ScenarioSpec
-
-    # Specifies how to adapt an instance into a set of requests
-    adapter_spec: AdapterSpec
-
-    # What to evaluate on
-    metric_specs: List[MetricSpec]
-
-    # Data augmenter. The default `DataAugmenterSpec` does nothing.
-    data_augmenter_spec: DataAugmenterSpec = DataAugmenterSpec()
-
-    # Groups that this run spec belongs to (for aggregation)
-    groups: List[str] = field(default_factory=list)
-
-    def __post_init__(self):
-        """
-        `self.name` is used as the name of the output folder for the `RunSpec`.
-        Clean up `self.name` by replacing any "/"'s with "_".
-        """
-        # TODO: Don't mutate name! clean this up before passing it into the constructor here
-        object.__setattr__(self, "name", self.name.replace(os.path.sep, "_"))
-
-
 def remove_stats_nans(stats: List[Stat]) -> List[Stat]:
     """Return a new list of stats with stats with NaNs removed.
 
@@ -129,7 +102,9 @@ def remove_per_instance_stats_nans(per_instance_stats_list: List[PerInstanceStat
     return result
 
 
-def downsample_eval_instances(
+def downsample_eval_instances(
+    instances: List[Instance], max_eval_instances: int, eval_splits: List[str]
+) -> List[Instance]:
     """
     Get the instances necessary for this run:
     Train instances (split=train): keep all (if any) for in-context learning
@@ -138,7 +113,7 @@ def downsample_eval_instances(instances: List[Instance], max_eval_instances: int
     """
     all_train_instances: List[Instance] = [instance for instance in instances if instance.split == TRAIN_SPLIT]
 
-    all_eval_instances: List[Instance] = [instance for instance in instances if instance.split in
+    all_eval_instances: List[Instance] = [instance for instance in instances if instance.split in eval_splits]
     if len(all_eval_instances) > max_eval_instances:
         # The random sampling includes instances monotonically.
         np.random.seed(0)
@@ -179,6 +154,15 @@ class Runner:
         exit_on_error: bool,
     ):
         self.executor = Executor(execution_spec)
+        self.annotator_executor = AnnotationExecutor(
+            AnnotationExecutionSpec(
+                local_path=execution_spec.local_path if execution_spec.local_path is not None else "",
+                parallelism=execution_spec.parallelism,
+                dry_run=execution_spec.dry_run,
+                sqlite_cache_backend_config=execution_spec.sqlite_cache_backend_config,
+                mongo_cache_backend_config=execution_spec.mongo_cache_backend_config,
+            )
+        )
         self.dry_run: bool = execution_spec.dry_run
         self.tokenizer_service = TokenizerService(self.executor.service, execution_spec.auth)
         self.metric_service = MetricService(self.executor.service, execution_spec.auth)
@@ -280,12 +264,14 @@ class Runner:
             return  # Exit after saving the instances.
 
         # Give each instance a unique ID
-
+        if any([instance.id is None for instance in instances]):
+            instances = with_instance_ids(instances)
 
         # Get the instances necessary for this run.
         max_eval_instances = run_spec.adapter_spec.max_eval_instances
+        eval_splits = run_spec.adapter_spec.eval_splits or EVAL_SPLITS
        if max_eval_instances is not None:
-            instances = downsample_eval_instances(instances, max_eval_instances)
+            instances = downsample_eval_instances(instances, max_eval_instances, eval_splits)
 
         # Data preprocessing
         instances = DataPreprocessor(run_spec.data_augmenter_spec).preprocess(
@@ -294,15 +280,23 @@ class Runner:
 
         # Adapt (convert to requests)
         adapter: Adapter = AdapterFactory.get_adapter(run_spec.adapter_spec, self.tokenizer_service)
-
+        request_states: List[RequestState] = adapter.adapt(instances, self.executor.execution_spec.parallelism)
+        scenario_state: ScenarioState = ScenarioState(
+            adapter_spec=run_spec.adapter_spec,
+            request_states=request_states,
+            annotator_specs=run_spec.annotators,
+        )
 
         # Execute (fill up results)
         scenario_state = self.executor.execute(scenario_state)
 
+        # Annotate (post-process the results)
+        scenario_state = self.annotator_executor.execute(scenario_state)
+
         # Apply the metrics
         # When performing a dry run, only estimate the number of tokens instead
         # of calculating the metrics.
-        metrics: List[
+        metrics: List[MetricInterface] = (
             [DryRunMetric()] if self.dry_run else [create_metric(metric_spec) for metric_spec in run_spec.metric_specs]
         )
         stats: List[Stat] = []
```