crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +134 -31
- crfm_helm-0.5.0.dist-info/RECORD +642 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +31 -3
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/gender_perturbation.py +1 -1
- helm/benchmark/augmentations/perturbation.py +8 -2
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/test_perturbation.py +11 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +20 -7
- helm/benchmark/metrics/basic_metrics.py +169 -664
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -110
- helm/benchmark/metrics/dry_run_metrics.py +2 -2
- helm/benchmark/metrics/efficiency_metrics.py +206 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +2 -2
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +2 -2
- helm/benchmark/metrics/test_classification_metrics.py +8 -5
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +74 -0
- helm/benchmark/model_metadata_registry.py +36 -0
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +26 -10
- helm/benchmark/presentation/schema.py +15 -40
- helm/benchmark/presentation/summarize.py +119 -79
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +1 -2
- helm/benchmark/presentation/test_summarize.py +3 -3
- helm/benchmark/run.py +54 -26
- helm/benchmark/run_expander.py +214 -16
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +162 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/classic_run_specs.py +1510 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +501 -0
- helm/benchmark/runner.py +51 -57
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +1 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +1 -1
- helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
- helm/benchmark/scenarios/scenario.py +4 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +6 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -2
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
- helm/benchmark/server.py +24 -1
- helm/benchmark/slurm_runner.py +70 -49
- helm/benchmark/static/benchmarking.js +1 -1
- helm/benchmark/static/schema_classic.yaml +258 -1066
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +2 -227
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vlm.yaml +576 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
- helm/benchmark/static_build/assets/index-d839df55.js +9 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_model_deployment_definition.py +14 -16
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -44
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +3 -9
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +203 -18
- helm/{proxy/clients → clients}/auto_client.py +59 -31
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +65 -7
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +4 -11
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +5 -5
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +5 -7
- helm/{proxy/clients → clients}/huggingface_client.py +43 -64
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
- helm/{proxy/clients → clients}/megatron_client.py +5 -5
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +302 -0
- helm/{proxy/clients → clients}/palmyra_client.py +6 -8
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +100 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +20 -8
- helm/{proxy/clients → clients}/together_client.py +12 -72
- helm/clients/vertexai_client.py +391 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vlm_client.py +104 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +53 -48
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +16 -4
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +1 -1
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +24 -1
- helm/common/key_value_store.py +35 -4
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/request.py +15 -17
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +1 -1
- helm/config/model_deployments.yaml +1069 -546
- helm/config/model_metadata.yaml +753 -31
- helm/config/tokenizer_configs.yaml +142 -43
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +8 -6
- helm/proxy/example_queries.py +29 -17
- helm/proxy/server.py +70 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +96 -16
- helm/proxy/services/service.py +30 -0
- helm/proxy/services/test_remote_service.py +4 -3
- helm/proxy/services/test_service.py +0 -12
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
helm/benchmark/run_expander.py
CHANGED
```diff
@@ -12,15 +12,16 @@ from helm.benchmark.model_metadata_registry import (
     FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
     LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
     ABLATION_MODEL_TAG,
+    TEXT_TO_IMAGE_MODEL_TAG,
     VISION_LANGUAGE_MODEL_TAG,
 )
 from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
-from helm.common.general import handle_module_not_found_error
 from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer
-from .runner import RunSpec
-from helm.benchmark.adaptation.adapter_spec import AdapterSpec, Substitution
+from .run_spec import RunSpec
+from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT, AdapterSpec, Substitution
 from .augmentations.perturbation import PerturbationSpec
 from .augmentations.data_augmenter import DataAugmenterSpec
+from helm.benchmark.scenarios.scenario import TEST_SPLIT, VALID_SPLIT
 
 
 class RunExpander(ABC):
@@ -223,12 +224,16 @@ class AddToStopRunExpander(RunExpander):
         self.value = value
 
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if self.value == "newline":
+            stop_sequence = "\n"
+        else:
+            stop_sequence = self.value
         return [
             replace(
                 run_spec,
                 name=run_spec.name,
                 adapter_spec=replace(
-                    run_spec.adapter_spec, stop_sequences=run_spec.adapter_spec.stop_sequences + [self.value]
+                    run_spec.adapter_spec, stop_sequences=run_spec.adapter_spec.stop_sequences + [stop_sequence]
                 ),
             ),
         ]
@@ -273,33 +278,37 @@ IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX = (
 )
 
 
-class AnthropicRunExpander(RunExpander):
+class AnthropicClaude2RunExpander(RunExpander):
     """
-    Custom prompt for Anthropic models.
+    Custom prompt for Anthropic Claude 1 and Claude 2 models.
     These models need more explicit instructions about following the format.
     """
 
     name = "anthropic"
 
+    # These strings must be added to the prompt in order to pass prompt validation,
+    # otherwise the Anthropic API will return an error.
+    # See: https://docs.anthropic.com/claude/reference/prompt-validation
+    HUMAN_PROMPT = "\n\nHuman:"
+    AI_PROMPT = "\n\nAssistant:"
+
     def __init__(self):
         pass
 
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
-        try:
-            import anthropic
-        except ModuleNotFoundError as e:
-            handle_module_not_found_error(e, ["anthropic"])
-
         return [
             replace(
                 run_spec,
                 name=run_spec.name,
                 adapter_spec=replace(
                     run_spec.adapter_spec,
-                    global_prefix=anthropic.HUMAN_PROMPT + " " + IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
+                    global_prefix=AnthropicClaude2RunExpander.HUMAN_PROMPT
+                    + " "
+                    + IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX
+                    + "\n\n",
                     global_suffix="\n\n"
                     + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
-                    + anthropic.AI_PROMPT
+                    + AnthropicClaude2RunExpander.AI_PROMPT
                     + " "
                     + run_spec.adapter_spec.output_prefix.strip(),
                 ),
@@ -307,13 +316,32 @@ class AnthropicRunExpander(RunExpander):
         ]
 
 
+class AnthropicClaude3RunExpander(RunExpander):
+    """Custom prompts for Anthropic Claude 3 models."""
+
+    name = "claude_3"
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
+            instructions = "Answer with only a single letter."
+            if run_spec.adapter_spec.instructions:
+                instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
+            return [
+                replace(
+                    run_spec,
+                    adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
+                ),
+            ]
+        return [run_spec]
+
+
 class OpenAIRunExpander(RunExpander):
     """
     Custom prompt for OpenAI models.
     These models need more explicit instructions about following the format.
     """
 
-    # TODO: Refactor out common logic between this and GoogleRunExpander.
+    # TODO: Refactor out common logic between this and GoogleRunExpander and MistralRunExpander.
 
     name = "openai"
 
@@ -346,7 +374,7 @@ class GoogleRunExpander(RunExpander):
     These models need more explicit instructions about following the format.
     """
 
-    # TODO: Refactor out common logic between this and OpenAIRunExpander.
+    # TODO: Refactor out common logic between this and OpenAIRunExpander and MistralRunExpander.
 
     name = "google"
 
@@ -370,6 +398,102 @@ class GoogleRunExpander(RunExpander):
         ]
 
 
+class MistralRunExpander(RunExpander):
+    """Custom prompt for Mistral models."""
+
+    # TODO: Refactor out common logic between this and GoogleRunExpander and OpenAIRunExpander.
+
+    name = "output_format_instructions"
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if run_spec.adapter_spec.method != ADAPT_GENERATION:
+            return [run_spec]
+
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(
+                    run_spec.adapter_spec,
+                    global_prefix=IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX + "\n\n",
+                    global_suffix="\n\n"
+                    + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
+                    + "\n"
+                    + run_spec.adapter_spec.output_prefix.strip(),
+                ),
+            ),
+        ]
+
+
+class IDEFICSInstructRunExpander(RunExpander):
+    """
+    Custom prompt for IDEFICS instruct models which require a specific format.
+    See https://huggingface.co/HuggingFaceM4/idefics-80b-instruct for more information.
+    """
+
+    name = "idefics_instruct"
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(
+                    run_spec.adapter_spec,
+                    input_prefix="User: ",
+                    input_suffix="<end_of_utterance>",
+                    output_prefix="\nAssistant: ",
+                    output_suffix="<end_of_utterance>",
+                    stop_sequences=["<end_of_utterance>"],
+                ),
+            ),
+        ]
+
+
+class LlavaRunExpander(RunExpander):
+    """
+    Custom prompt for Llava 1.5 models which should use a specific format.
+    See https://colab.research.google.com/drive/1qsl6cd2c8gGtEW1xV5io7S8NHh-Cp1TV?usp=sharing for more information.
+    """
+
+    name = "llava"
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(
+                    run_spec.adapter_spec,
+                    input_prefix="USER: <image>",
+                    input_suffix="",
+                    output_prefix="\nASSISTANT: ",
+                    output_suffix="",
+                ),
+            ),
+        ]
+
+
+class OpenFlamingoRunExpander(RunExpander):
+    """
+    Custom prompt for OpenFlamingo following: https://huggingface.co/openflamingo/OpenFlamingo-9B-vitl-mpt7b
+    """
+
+    name = "open_flamingo"
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(
+                    run_spec.adapter_spec,
+                    input_prefix=f"<|endofchunk|>{run_spec.adapter_spec.input_prefix}",
+                ),
+            ),
+        ]
+
+
 class FormatPromptRunExpander(RunExpander):
     """Adds a prefix and suffix to the prompt."""
 
@@ -415,6 +539,7 @@ class MaxTrainInstancesRunExpander(ReplaceValueRunExpander):
         "one": [1],
         "all": [0, 1, 2, 4, 8, 16],  # Cap at 16 due to limited context length
         "big_bench_few_shot_setting": [0, 1, 2, 3],  # Commonly used few-shot setting in BIG-bench
+        "heim_human_eval": [0, 1, 2, 4, 8],
     }
 
 
@@ -422,7 +547,12 @@ class MaxEvalInstancesRunExpander(ReplaceValueRunExpander):
     """For overriding the number of eval instances at the run level."""
 
     name = "max_eval_instances"
-    values_dict: Dict[str, List[Any]] = {"default": [1_000]}
+    values_dict: Dict[str, List[Any]] = {
+        "default": [1_000],
+        "heim_default": [100],
+        "heim_fid": [30_000],
+        "heim_art_styles": [17],
+    }
 
 
 class NumOutputsRunExpander(ReplaceValueRunExpander):
@@ -435,6 +565,15 @@ class NumOutputsRunExpander(ReplaceValueRunExpander):
     }
 
 
+class NumTrialRunExpander(ReplaceValueRunExpander):
+    """For getting different generations for the same requests."""
+
+    name = "num_trials"
+    values_dict = {
+        "heim_efficiency": [5],
+    }
+
+
 class ModelRunExpander(ReplaceValueRunExpander):
     """
     For specifying different models.
@@ -476,6 +615,7 @@ class ModelRunExpander(ReplaceValueRunExpander):
             "openai/text-davinci-003",
         ],
         "opinions_qa_ai21": ["ai21/j1-grande", "ai21/j1-jumbo", "ai21/j1-grande-v2-beta"],
+        "text_to_image": get_model_names_with_tag(TEXT_TO_IMAGE_MODEL_TAG),
        "vlm": get_model_names_with_tag(VISION_LANGUAGE_MODEL_TAG),
     }
 
@@ -500,6 +640,33 @@ class ModelDeploymentRunExpander(ReplaceValueRunExpander):
     values_dict: Dict[str, List[Any]] = {}
 
 
+class EvalSplitRunExpander(RunExpander):
+    """Sets the evaluation split.
+
+    By default, evaluation instances are drawn from both test and validation splits.
+    This run expander allows drawing evaluation instances from only the test split or
+    only the validation split."""
+
+    # NOTE: This does not subclass `ReplaceValueRunExpander` because we want the
+    # run expander name to be "eval_split", not "eval_splits".
+
+    name = "eval_split"
+
+    def __init__(self, value):
+        if value != TEST_SPLIT and value != VALID_SPLIT:
+            raise ValueError(f'Split must be "{TEST_SPLIT}" or "{VALID_SPLIT}", but got "{value}"')
+        self.split = value
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        return [
+            replace(
+                run_spec,
+                name=f"{run_spec.name}{',' if ':' in run_spec.name else ':'}eval_split={self.split}",
+                adapter_spec=replace(run_spec.adapter_spec, eval_splits=[self.split]),
+            )
+        ]
+
+
 ############################################################
 
 
@@ -688,6 +855,20 @@ def mandarin_to_cantonese() -> PerturbationSpec:
     )
 
 
+def translate(language_code: str) -> PerturbationSpec:
+    return PerturbationSpec(
+        class_name="helm.benchmark.augmentations.translate_perturbation.TranslatePerturbation",
+        args={"language_code": language_code},
+    )
+
+
+def suffix(text: str) -> PerturbationSpec:
+    return PerturbationSpec(
+        class_name="helm.benchmark.augmentations.suffix_perturbation.SuffixPerturbation",
+        args={"suffix": text},
+    )
+
+
 # Specifies the data augmentations that we're interested in trying out.
 # Concretely, this is a mapping from the name (which is specified in a conf
 # file or the CLI) to a list of options to try, where each option is a list of perturbations.
@@ -879,6 +1060,21 @@ PERTURBATION_SPECS_DICT: Dict[str, Dict[str, List[PerturbationSpec]]] = {
             mandarin_to_cantonese(),
         ]
     },
+    # Multilinguality
+    "chinese": {"chinese": [translate(language_code="zh-CN")]},
+    "hindi": {"hindi": [translate(language_code="hi")]},
+    "spanish": {"spanish": [translate(language_code="es")]},
+    # Styles
+    "art": {
+        "art": [
+            suffix("oil painting"),
+            suffix("watercolor"),
+            suffix("pencil sketch"),
+            suffix("animation"),
+            suffix("vector graphics"),
+            suffix("pixel art"),
+        ]
+    },
 }
 
 
@@ -1225,6 +1421,7 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     MaxTrainInstancesRunExpander,
     MaxEvalInstancesRunExpander,
     NumOutputsRunExpander,
+    NumTrialRunExpander,
     ModelRunExpander,
     ModelDeploymentRunExpander,
     DataAugmentationRunExpander,
@@ -1232,6 +1429,7 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     NumPromptTokensRunExpander,
     NumOutputTokensRunExpander,
     ChatMLRunExpander,
+    EvalSplitRunExpander,
 ]
 
 
```
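The new run expanders above all follow the same pattern: `expand` takes a `RunSpec` and returns one or more copies with a rewritten `AdapterSpec`, and callers flat-map expanders over the current list of run specs. Below is a minimal, self-contained sketch of that pattern; `ToySpec`, `ToyExpander`, and `AddStopExpander` are simplified stand-ins for illustration, not HELM's actual classes.

```python
# Minimal sketch of the run-expander pattern shown in the diff above (stand-in classes).
from dataclasses import dataclass, replace
from typing import List, Tuple


@dataclass(frozen=True)
class ToySpec:
    name: str
    stop_sequences: Tuple[str, ...] = ()


class ToyExpander:
    """Base class: takes one spec and returns one or more modified copies."""

    def expand(self, spec: ToySpec) -> List[ToySpec]:
        raise NotImplementedError


class AddStopExpander(ToyExpander):
    """Mirrors AddToStopRunExpander: maps the literal value "newline" to "\n"."""

    def __init__(self, value: str):
        self.value = value

    def expand(self, spec: ToySpec) -> List[ToySpec]:
        stop = "\n" if self.value == "newline" else self.value
        return [replace(spec, stop_sequences=spec.stop_sequences + (stop,))]


# Expanders are applied by flat-mapping over the current list of specs,
# the same way construct_run_specs applies RUN_EXPANDERS.
specs = [ToySpec(name="mmlu:subject=anatomy")]
for expander in [AddStopExpander("newline")]:
    specs = [child for parent in specs for child in expander.expand(parent)]
print(specs)
```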
helm/benchmark/run_spec.py
ADDED

```diff
@@ -0,0 +1,93 @@
+from dataclasses import dataclass, field
+import importlib
+import os
+import pkgutil
+from typing import Callable, Dict, Iterable, List, Optional, TypeVar
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.augmentations.data_augmenter import DataAugmenterSpec
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+
+
+@dataclass(frozen=True)
+class RunSpec:
+    """
+    Specifies how to do a single run, which gets a scenario, adapts it, and
+    computes a list of stats based on the defined metrics.
+    """
+
+    name: str
+    """Unique identifier of the RunSpec"""
+
+    scenario_spec: ScenarioSpec
+    """Which scenario"""
+
+    adapter_spec: AdapterSpec
+    """Specifies how to adapt an instance into a set of requests"""
+
+    metric_specs: List[MetricSpec]
+    """What to evaluate on"""
+
+    data_augmenter_spec: DataAugmenterSpec = DataAugmenterSpec()
+    """Data augmenter. The default `DataAugmenterSpec` does nothing."""
+
+    groups: List[str] = field(default_factory=list)
+    """Groups that this run spec belongs to (for aggregation)"""
+
+    annotators: Optional[List[AnnotatorSpec]] = None
+    """Annotators to use for this run spec"""
+
+    def __post_init__(self):
+        """
+        `self.name` is used as the name of the output folder for the `RunSpec`.
+        Clean up `self.name` by replacing any "/"'s with "_".
+        """
+        # TODO: Don't mutate name! clean this up before passing it into the constructor here
+        object.__setattr__(self, "name", self.name.replace(os.path.sep, "_"))
+
+
+RunSpecFunction = Callable[..., RunSpec]
+
+
+_REGISTERED_RUN_SPEC_FUNCTIONS: Dict[str, RunSpecFunction] = {}
+"""Dict of run spec function names to run spec functions."""
+
+
+F = TypeVar("F", bound=RunSpecFunction)
+
+
+def run_spec_function(name: str) -> Callable[[F], F]:
+    """Register the run spec function under the given name."""
+
+    def wrap(func: F) -> F:
+        if name in _REGISTERED_RUN_SPEC_FUNCTIONS:
+            raise ValueError(f"A run spec function with name {name} already exists")
+        _REGISTERED_RUN_SPEC_FUNCTIONS[name] = func
+        return func
+
+    return wrap
+
+
+# Copied from https://docs.python.org/3/library/pkgutil.html#pkgutil.iter_modules
+def _iter_namespace(ns_pkg) -> Iterable[pkgutil.ModuleInfo]:
+    # Specifying the second argument (prefix) to iter_modules makes the
+    # returned name an absolute name instead of a relative one. This allows
+    # import_module to work without having to do additional modification to
+    # the name.
+    return pkgutil.iter_modules(ns_pkg.__path__, ns_pkg.__name__ + ".")
+
+
+def discover_run_spec_functions() -> None:
+    """Discover and register all run spec functions under helm.benchmark.run_specs"""
+    import helm.benchmark.run_specs  # noqa
+
+    for finder, name, ispkg in _iter_namespace(helm.benchmark.run_specs):
+        importlib.import_module(name)
+
+
+def get_run_spec_function(name: str) -> Optional[RunSpecFunction]:
+    """Return the run spec function registered under the given name."""
+    discover_run_spec_functions()
+    return _REGISTERED_RUN_SPEC_FUNCTIONS.get(name)
```
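A hedged sketch of how the registration helpers defined in `run_spec.py` above are meant to be used: decorate a function that builds a `RunSpec` with `@run_spec_function(...)`, then look it up later with `get_run_spec_function`. The scenario class, metric class, and adapter settings below are illustrative placeholders, not values taken from this diff.

```python
from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.metrics.metric import MetricSpec
from helm.benchmark.scenarios.scenario import ScenarioSpec
from helm.benchmark.run_spec import RunSpec, run_spec_function, get_run_spec_function


@run_spec_function("my_scenario")  # registered under the name "my_scenario"
def get_my_scenario_run_spec() -> RunSpec:
    return RunSpec(
        name="my_scenario",
        # Hypothetical scenario and metric classes, used only for illustration.
        scenario_spec=ScenarioSpec(class_name="mypackage.scenarios.MyScenario", args={}),
        adapter_spec=AdapterSpec(),  # adapter fields left at their defaults here
        metric_specs=[MetricSpec(class_name="mypackage.metrics.MyMetric", args={})],
        groups=["my_scenario"],
    )


# get_run_spec_function() first calls discover_run_spec_functions(), which imports every
# module under helm.benchmark.run_specs so their @run_spec_function decorators execute,
# then returns the function registered under the given name (or None).
build = get_run_spec_function("my_scenario")
```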
helm/benchmark/run_spec_factory.py
ADDED

```diff
@@ -0,0 +1,162 @@
+import dataclasses
+from typing import List
+
+from helm.benchmark.adaptation.adapter_spec import (
+    ADAPT_GENERATION,
+    ADAPT_MULTIPLE_CHOICE_JOINT,
+    ADAPT_GENERATION_MULTIMODAL,
+)
+from helm.benchmark.model_deployment_registry import (
+    ModelDeployment,
+    get_default_model_deployment_for_model,
+    get_model_deployment,
+)
+from helm.benchmark.model_metadata_registry import (
+    ANTHROPIC_CLAUDE_1_MODEL_TAG,
+    ANTHROPIC_CLAUDE_2_MODEL_TAG,
+    BUGGY_TEMP_0_TAG,
+    CHATML_MODEL_TAG,
+    GOOGLE_GEMINI_MODEL_TAG,
+    IDEFICS_INSTRUCT_MODEL_TAG,
+    IDEFICS_MODEL_TAG,
+    LLAVA_MODEL_TAG,
+    OPEN_FLAMINGO_MODEL_TAG,
+    VISION_LANGUAGE_MODEL_TAG,
+    NLG_PREFIX_TAG,
+    NO_NEWLINES_TAG,
+    ModelMetadata,
+    get_model_metadata,
+)
+from helm.benchmark.run_expander import (
+    RUN_EXPANDERS,
+    AnthropicClaude2RunExpander,
+    ChatMLRunExpander,
+    GlobalPrefixRunExpander,
+    IDEFICSInstructRunExpander,
+    IncreaseTemperatureRunExpander,
+    IncreaseMaxTokensRunExpander,
+    LlavaRunExpander,
+    OpenFlamingoRunExpander,
+    StopRunExpander,
+)
+from helm.benchmark.run_spec import RunSpec, get_run_spec_function
+from helm.common.general import singleton
+from helm.common.object_spec import ObjectSpec
+
+
+def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
+    """
+    Takes a specification (name, args) and returns a list of `RunSpec`s.
+    """
+    # Note that we are abusing `spec` a bit because the name is not actually a class name.
+    name = spec.class_name
+    args = spec.args
+
+    run_spec_function = get_run_spec_function(name)
+    if run_spec_function is None:
+        raise ValueError(f"Unknown run spec name: {name}")
+
+    # Peel off the run expanders (e.g., model)
+    expanders = [RUN_EXPANDERS[key](value) for key, value in args.items() if key in RUN_EXPANDERS]  # type: ignore
+    args = dict((key, value) for key, value in args.items() if key not in RUN_EXPANDERS)
+
+    run_specs: List[RunSpec] = [run_spec_function(**args)]
+
+    # Apply expanders
+    for expander in expanders:
+        run_specs = [
+            child_run_spec for parent_run_spec in run_specs for child_run_spec in expander.expand(parent_run_spec)
+        ]
+
+    def alter_run_spec(run_spec: RunSpec) -> RunSpec:
+        if not run_spec.adapter_spec.model and not run_spec.adapter_spec.model_deployment:
+            raise ValueError("At least one of model_deployment and model must be specified")
+        elif not run_spec.adapter_spec.model and run_spec.adapter_spec.model_deployment:
+            # Infer model from model deployment
+            default_model_name = get_model_deployment(run_spec.adapter_spec.model_deployment).model_name
+            if not default_model_name:
+                default_model_name = run_spec.adapter_spec.model_deployment
+            run_spec = dataclasses.replace(
+                run_spec,
+                adapter_spec=dataclasses.replace(run_spec.adapter_spec, model=default_model_name),
+            )
+        elif run_spec.adapter_spec.model and not run_spec.adapter_spec.model_deployment:
+            # Infer model deployment from model
+            default_model_deployment = get_default_model_deployment_for_model(run_spec.adapter_spec.model)
+            if not default_model_deployment:
+                raise ValueError(
+                    f"Unknown model or no default model deployment found for model {run_spec.adapter_spec.model}"
+                )
+            run_spec = dataclasses.replace(
+                run_spec,
+                adapter_spec=dataclasses.replace(run_spec.adapter_spec, model_deployment=default_model_deployment),
+            )
+
+        # Both model and model_deployment should now be filled
+        assert run_spec.adapter_spec.model_deployment
+        assert run_spec.adapter_spec.model
+
+        model: ModelMetadata = get_model_metadata(run_spec.adapter_spec.model)
+        deployment: ModelDeployment = get_model_deployment(run_spec.adapter_spec.model_deployment)
+        if run_spec.adapter_spec.model != deployment.model_name:
+            raise ValueError(
+                f"Invalid RunSpec: selected model deployment '{run_spec.adapter_spec.model_deployment}'"
+                f"for model '{run_spec.adapter_spec.model}' but the model deployment is "
+                f"for a different model '{deployment.model_name}'"
+            )
+        # For models that strip newlines, when we're generating, we need to set
+        # the delimiter to be '###' so we stop properly.
+        if NO_NEWLINES_TAG in model.tags and run_spec.adapter_spec.method in (
+            ADAPT_GENERATION,
+            ADAPT_MULTIPLE_CHOICE_JOINT,
+        ):
+            stop_expander = StopRunExpander(value="hash")
+            run_spec = singleton(stop_expander.expand(run_spec))
+
+        if NLG_PREFIX_TAG in model.tags:
+            global_prefix_expander = GlobalPrefixRunExpander(value="nlg")
+            run_spec = singleton(global_prefix_expander.expand(run_spec))
+
+        if CHATML_MODEL_TAG in model.tags:
+            chatml_expander = ChatMLRunExpander()
+            run_spec = singleton(chatml_expander.expand(run_spec))
+
+        # Anthropic Claude 1 and 2 prompts
+        if ANTHROPIC_CLAUDE_1_MODEL_TAG in model.tags or ANTHROPIC_CLAUDE_2_MODEL_TAG in model.tags:
+            run_spec = singleton(AnthropicClaude2RunExpander().expand(run_spec))
+
+        # Google Gemini Vision returns an empty completion or throws an error if max_tokens is 1
+        if (
+            VISION_LANGUAGE_MODEL_TAG in model.tags
+            and GOOGLE_GEMINI_MODEL_TAG in model.tags
+            and run_spec.adapter_spec.max_tokens == 1
+        ):
+            run_spec = singleton(IncreaseMaxTokensRunExpander(value=1).expand(run_spec))
+
+        # IDEFICS special handling
+        if IDEFICS_MODEL_TAG in model.tags:
+            # IDEFICS requires more `max_tokens` to generate something reasonable for open-ended generation
+            if run_spec.adapter_spec.method == ADAPT_GENERATION_MULTIMODAL:
+                run_spec = singleton(IncreaseMaxTokensRunExpander(value=30).expand(run_spec))
+
+            if IDEFICS_INSTRUCT_MODEL_TAG in model.tags:
+                run_spec = singleton(IDEFICSInstructRunExpander().expand(run_spec))
+
+        # Llava
+        if LLAVA_MODEL_TAG in model.tags:
+            run_spec = singleton(LlavaRunExpander().expand(run_spec))
+
+        # OpenFlamingo
+        if OPEN_FLAMINGO_MODEL_TAG in model.tags:
+            run_spec = singleton(OpenFlamingoRunExpander().expand(run_spec))
+
+        # For multiple choice
+        if BUGGY_TEMP_0_TAG in model.tags and run_spec.adapter_spec.temperature == 0:
+            increase_temperature_expander = IncreaseTemperatureRunExpander(value=1e-4)
+            run_spec = singleton(increase_temperature_expander.expand(run_spec))
+
+        return run_spec
+
+    run_specs = [alter_run_spec(run_spec) for run_spec in run_specs]
+
+    return run_specs
```
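A hedged sketch of calling `construct_run_specs` from the factory above. `class_name` is reused as the run spec function name; any argument whose key matches a registered run expander (for example `model` or `eval_split`) is peeled off and applied as an expander, and the remaining arguments are passed to the run spec function. The run spec name `mmlu`, its `subject` argument, and the model name below are illustrative and depend on what is registered in a given installation.

```python
from helm.common.object_spec import ObjectSpec
from helm.benchmark.run_spec_factory import construct_run_specs

# "mmlu" and its arguments are illustrative: "model" would be consumed by ModelRunExpander,
# while "subject" would be forwarded to the run spec function as a keyword argument.
run_specs = construct_run_specs(
    ObjectSpec(class_name="mmlu", args={"subject": "anatomy", "model": "openai/gpt2"})
)
for run_spec in run_specs:
    # alter_run_spec() has filled in both model and model_deployment by this point.
    print(run_spec.name, run_spec.adapter_spec.model, run_spec.adapter_spec.model_deployment)
```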