crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +138 -31
- crfm_helm-0.5.1.dist-info/RECORD +654 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +31 -3
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/gender_perturbation.py +1 -1
- helm/benchmark/augmentations/perturbation.py +25 -3
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/test_perturbation.py +41 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +20 -7
- helm/benchmark/metrics/basic_metrics.py +169 -664
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -110
- helm/benchmark/metrics/dry_run_metrics.py +2 -2
- helm/benchmark/metrics/efficiency_metrics.py +213 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +392 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +2 -2
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +2 -2
- helm/benchmark/metrics/test_classification_metrics.py +8 -5
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +575 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +74 -0
- helm/benchmark/model_metadata_registry.py +41 -1
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +26 -10
- helm/benchmark/presentation/schema.py +15 -40
- helm/benchmark/presentation/summarize.py +119 -79
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +1 -2
- helm/benchmark/presentation/test_summarize.py +3 -3
- helm/benchmark/run.py +54 -26
- helm/benchmark/run_expander.py +205 -35
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +163 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/classic_run_specs.py +1510 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +757 -0
- helm/benchmark/runner.py +51 -57
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +1 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/legalbench_scenario.py +6 -2
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +1 -1
- helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
- helm/benchmark/scenarios/scenario.py +4 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +6 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +3 -4
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +5 -3
- helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
- helm/benchmark/server.py +24 -1
- helm/benchmark/slurm_runner.py +70 -49
- helm/benchmark/static/benchmarking.js +1 -1
- helm/benchmark/static/schema_classic.yaml +258 -1066
- helm/benchmark/static/schema_image2structure.yaml +304 -0
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +2 -227
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
- helm/benchmark/static/schema_vlm.yaml +823 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
- helm/benchmark/static_build/assets/index-878a1094.css +1 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -44
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +3 -9
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +233 -18
- helm/{proxy/clients → clients}/auto_client.py +59 -31
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +65 -7
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +4 -11
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +5 -5
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +5 -7
- helm/{proxy/clients → clients}/huggingface_client.py +43 -64
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
- helm/{proxy/clients → clients}/megatron_client.py +5 -5
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +301 -0
- helm/{proxy/clients → clients}/palmyra_client.py +6 -8
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +100 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +20 -8
- helm/{proxy/clients → clients}/together_client.py +104 -73
- helm/clients/vertexai_client.py +400 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
- helm/clients/vision_language/huggingface_vlm_client.py +111 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +54 -49
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +16 -4
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +1 -1
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +33 -3
- helm/common/key_value_store.py +35 -4
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/request.py +15 -17
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +1 -1
- helm/config/model_deployments.yaml +1159 -538
- helm/config/model_metadata.yaml +868 -41
- helm/config/tokenizer_configs.yaml +149 -43
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +8 -6
- helm/proxy/example_queries.py +29 -17
- helm/proxy/server.py +70 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +96 -16
- helm/proxy/services/service.py +30 -0
- helm/proxy/services/test_remote_service.py +4 -3
- helm/proxy/services/test_service.py +0 -12
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/test_model_deployment_definition.py +0 -92
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
helm/benchmark/run_expander.py
CHANGED
@@ -8,19 +8,22 @@ from helm.benchmark.model_metadata_registry import (
     get_all_code_models,
     get_all_models,
     get_all_text_models,
+    get_model_metadata,
     get_model_names_with_tag,
     FULL_FUNCTIONALITY_TEXT_MODEL_TAG,
     LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG,
     ABLATION_MODEL_TAG,
+    TEXT_TO_IMAGE_MODEL_TAG,
     VISION_LANGUAGE_MODEL_TAG,
+    INSTRUCTION_FOLLOWING_MODEL_TAG,
 )
 from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
-from helm.common.general import handle_module_not_found_error
 from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer
-from .
-from helm.benchmark.adaptation.adapter_spec import AdapterSpec, Substitution
+from .run_spec import RunSpec
+from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT, AdapterSpec, Substitution
 from .augmentations.perturbation import PerturbationSpec
 from .augmentations.data_augmenter import DataAugmenterSpec
+from helm.benchmark.scenarios.scenario import TEST_SPLIT, VALID_SPLIT
 
 
 class RunExpander(ABC):
@@ -223,12 +226,16 @@ class AddToStopRunExpander(RunExpander):
         self.value = value
 
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        if self.value == "newline":
+            stop_sequence = "\n"
+        else:
+            stop_sequence = self.value
         return [
             replace(
                 run_spec,
                 name=run_spec.name,
                 adapter_spec=replace(
-                    run_spec.adapter_spec, stop_sequences=run_spec.adapter_spec.stop_sequences + [
+                    run_spec.adapter_spec, stop_sequences=run_spec.adapter_spec.stop_sequences + [stop_sequence]
                 ),
             ),
         ]
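The new branch lets run entries pass the literal value "newline" and have it translated into an actual newline stop sequence. A minimal sketch of the resulting behavior, using a simplified stand-in for HELM's AdapterSpec (not the real class):

    from dataclasses import dataclass, field, replace
    from typing import List

    @dataclass(frozen=True)
    class AdapterSpec:  # simplified stand-in for helm's AdapterSpec
        stop_sequences: List[str] = field(default_factory=list)

    def add_to_stop(spec: AdapterSpec, value: str) -> AdapterSpec:
        # Mirrors the new branch: the literal argument "newline" becomes "\n".
        stop_sequence = "\n" if value == "newline" else value
        return replace(spec, stop_sequences=spec.stop_sequences + [stop_sequence])

    assert add_to_stop(AdapterSpec(), "newline").stop_sequences == ["\n"]
    assert add_to_stop(AdapterSpec(), "###").stop_sequences == ["###"]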
@@ -273,33 +280,37 @@ IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX = (
 )
 
 
-class
+class AnthropicClaude2RunExpander(RunExpander):
     """
-    Custom prompt for Anthropic models.
+    Custom prompt for Anthropic Claude 1 and Claude 2 models.
     These models need more explicit instructions about following the format.
     """
 
     name = "anthropic"
 
+    # These strings must be added to the prompt in order to pass prompt validation,
+    # otherwise the Anthropic API will return an error.
+    # See: https://docs.anthropic.com/claude/reference/prompt-validation
+    HUMAN_PROMPT = "\n\nHuman:"
+    AI_PROMPT = "\n\nAssistant:"
+
     def __init__(self):
         pass
 
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
-        try:
-            import anthropic
-        except ModuleNotFoundError as e:
-            handle_module_not_found_error(e, ["anthropic"])
-
         return [
             replace(
                 run_spec,
                 name=run_spec.name,
                 adapter_spec=replace(
                     run_spec.adapter_spec,
-                    global_prefix=
+                    global_prefix=AnthropicClaude2RunExpander.HUMAN_PROMPT
+                    + " "
+                    + IN_CONTEXT_LEARNING_INSTRUCTIONS_PREFIX
+                    + "\n\n",
                     global_suffix="\n\n"
                     + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
-                    +
+                    + AnthropicClaude2RunExpander.AI_PROMPT
                     + " "
                     + run_spec.adapter_spec.output_prefix.strip(),
                 ),
@@ -307,21 +318,66 @@ class AnthropicRunExpander(RunExpander):
     ]
 
 
-class
-    """
-    Custom prompt for OpenAI models.
-    These models need more explicit instructions about following the format.
-    """
+class AnthropicClaude3RunExpander(RunExpander):
+    """Custom prompts for Anthropic Claude 3 models."""
 
-
+    name = "claude_3"
 
-
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        # Remove all stop sequences that do not contain non-whitespace characters.
+        # This prevents the Anthropic API from returning the following error:
+        # "stop_sequences: each stop sequence must contain non-whitespace"
+        stop_sequences_with_non_whitespace = [
+            stop_sequence for stop_sequence in run_spec.adapter_spec.stop_sequences if stop_sequence.strip()
+        ]
+        run_spec = replace(
+            run_spec,
+            adapter_spec=replace(run_spec.adapter_spec, stop_sequences=stop_sequences_with_non_whitespace),
+        )
+        if run_spec.adapter_spec.method == ADAPT_MULTIPLE_CHOICE_JOINT:
+            instructions = "Answer with only a single letter."
+            if run_spec.adapter_spec.instructions:
+                instructions = f"{instructions}\n\n{run_spec.adapter_spec.instructions}"
+            return [
+                replace(
+                    run_spec,
+                    adapter_spec=replace(run_spec.adapter_spec, instructions=instructions),
+                ),
+            ]
+        return [run_spec]
 
-
-
+
+class FollowFormatInstructionsRunExpander(RunExpander):
+    """Adds more explicit instructions about following the format to prompts.
+
+    The argument controls which models will receive these instructions.
+    If "all", all models receive these instructions.
+    If "instruct", only instruction-following models receive these instructions.
+
+    Only supports the generation adaptation method. Raises an error if used on
+    a RunSpec that uses a different adaptation method.
+
+    Note: For legacy backwards compatibility reasons, despite the use of the word
+    "instructions" in this run expander's name, this run expander actually
+    modifies the global_prefix and the global_suffix of the AdapterSpec rather than
+    the instructions.
+    """
+
+    name = "follow_format_instructions"
+
+    def __init__(self, value: str):
+        if value != "all" and value != "instruct":
+            raise ValueError("Value of add_follow_the_format_instructions run expander must be 'all' or 'instruct'")
+        self.value = value
 
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
         if run_spec.adapter_spec.method != ADAPT_GENERATION:
+            raise Exception("follow_format_instructions run expander only supports the generation adaptation method")
+
+        if (
+            self.value == "instruct"
+            and INSTRUCTION_FOLLOWING_MODEL_TAG not in get_model_metadata(run_spec.adapter_spec.model).tags
+        ):
             return [run_spec]
 
         return [
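The stop-sequence filter in AnthropicClaude3RunExpander is a plain list comprehension; a quick standalone illustration of what it keeps and drops:

    # Whitespace-only stop sequences are removed; everything else is preserved.
    stop_sequences = ["\n", "   ", "###", "<eos>"]
    filtered = [s for s in stop_sequences if s.strip()]
    assert filtered == ["###", "<eos>"]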
@@ -340,31 +396,70 @@ class OpenAIRunExpander(RunExpander):
     ]
 
 
-class
+class IDEFICSInstructRunExpander(RunExpander):
     """
-    Custom prompt for
-
+    Custom prompt for IDEFICS instruct models which require a specific format.
+    See https://huggingface.co/HuggingFaceM4/idefics-80b-instruct for more information.
     """
 
-
+    name = "idefics_instruct"
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(
+                    run_spec.adapter_spec,
+                    input_prefix="User: ",
+                    input_suffix="<end_of_utterance>",
+                    output_prefix="\nAssistant: ",
+                    output_suffix="<end_of_utterance>",
+                    stop_sequences=["<end_of_utterance>"],
+                ),
+            ),
+        ]
 
-
+
+class LlavaRunExpander(RunExpander):
+    """
+    Custom prompt for Llava 1.5 models which should use a specific format.
+    See https://colab.research.google.com/drive/1qsl6cd2c8gGtEW1xV5io7S8NHh-Cp1TV?usp=sharing for more information.
+    """
+
+    name = "llava"
 
     def expand(self, run_spec: RunSpec) -> List[RunSpec]:
-
-
+        return [
+            replace(
+                run_spec,
+                name=run_spec.name,
+                adapter_spec=replace(
+                    run_spec.adapter_spec,
+                    input_prefix="USER: <image>",
+                    input_suffix="",
+                    output_prefix="\nASSISTANT: ",
+                    output_suffix="",
+                ),
+            ),
+        ]
+
+
+class OpenFlamingoRunExpander(RunExpander):
+    """
+    Custom prompt for OpenFlamingo following: https://huggingface.co/openflamingo/OpenFlamingo-9B-vitl-mpt7b
+    """
+
+    name = "open_flamingo"
 
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
         return [
             replace(
                 run_spec,
                 name=run_spec.name,
                 adapter_spec=replace(
                     run_spec.adapter_spec,
-
-                    global_suffix="\n\n"
-                    + IN_CONTEXT_LEARNING_INSTRUCTIONS_SUFFIX
-                    + "\n"
-                    + run_spec.adapter_spec.output_prefix.strip(),
+                    input_prefix=f"<|endofchunk|>{run_spec.adapter_spec.input_prefix}",
                ),
            ),
        ]
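For orientation, the prefixes and suffixes set by IDEFICSInstructRunExpander produce turns of roughly the following shape; this helper is illustrative only, since the actual prompt assembly happens in HELM's adapters:

    def idefics_instruct_turn(question: str) -> str:
        # "<end_of_utterance>" closes the user turn, matching the expander's input_suffix.
        return f"User: {question}<end_of_utterance>\nAssistant: "

    print(idefics_instruct_turn("What is shown in the image?"))
    # User: What is shown in the image?<end_of_utterance>
    # Assistant: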
@@ -415,6 +510,7 @@ class MaxTrainInstancesRunExpander(ReplaceValueRunExpander):
         "one": [1],
         "all": [0, 1, 2, 4, 8, 16],  # Cap at 16 due to limited context length
         "big_bench_few_shot_setting": [0, 1, 2, 3],  # Commonly used few-shot setting in BIG-bench
+        "vhelm": [0, 1, 2, 4, 8],
     }
 
 
@@ -422,7 +518,12 @@ class MaxEvalInstancesRunExpander(ReplaceValueRunExpander):
     """For overriding the number of eval instances at the run level."""
 
     name = "max_eval_instances"
-    values_dict: Dict[str, List[Any]] = {
+    values_dict: Dict[str, List[Any]] = {
+        "default": [1_000],
+        "heim_default": [100],
+        "heim_fid": [30_000],
+        "heim_art_styles": [17],
+    }
 
 
 class NumOutputsRunExpander(ReplaceValueRunExpander):
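values_dict maps symbolic names to concrete values, so a run entry can say max_eval_instances=heim_fid rather than hard-coding 30,000. A simplified sketch of the lookup done by ReplaceValueRunExpander (named values resolve through the dict, anything else passes through):

    values_dict = {
        "default": [1_000],
        "heim_default": [100],
        "heim_fid": [30_000],
        "heim_art_styles": [17],
    }

    def resolve(value):
        # Simplified: the real expander also handles raw values given on the CLI.
        return values_dict.get(value, [value])

    assert resolve("heim_fid") == [30_000]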
@@ -435,6 +536,15 @@ class NumOutputsRunExpander(ReplaceValueRunExpander):
     }
 
 
+class NumTrialRunExpander(ReplaceValueRunExpander):
+    """For getting different generations for the same requests."""
+
+    name = "num_trials"
+    values_dict = {
+        "heim_efficiency": [5],
+    }
+
+
 class ModelRunExpander(ReplaceValueRunExpander):
     """
     For specifying different models.
@@ -476,6 +586,7 @@ class ModelRunExpander(ReplaceValueRunExpander):
             "openai/text-davinci-003",
         ],
         "opinions_qa_ai21": ["ai21/j1-grande", "ai21/j1-jumbo", "ai21/j1-grande-v2-beta"],
+        "text_to_image": get_model_names_with_tag(TEXT_TO_IMAGE_MODEL_TAG),
         "vlm": get_model_names_with_tag(VISION_LANGUAGE_MODEL_TAG),
     }
@@ -500,6 +611,33 @@ class ModelDeploymentRunExpander(ReplaceValueRunExpander):
     values_dict: Dict[str, List[Any]] = {}
 
 
+class EvalSplitRunExpander(RunExpander):
+    """Sets the evaluation split.
+
+    By default, evaluation instances are drawn from both test and validation splits.
+    This run expander allows drawing evaluation instances from only the test split or
+    only the validation split."""
+
+    # NOTE: This does not subclass `ReplaceValueRunExpander` because we want the
+    # run expander name to be "eval_split", not "eval_splits".
+
+    name = "eval_split"
+
+    def __init__(self, value):
+        if value != TEST_SPLIT and value != VALID_SPLIT:
+            raise ValueError(f'Split must be "{TEST_SPLIT}" or "{VALID_SPLIT}", but got "{value}"')
+        self.split = value
+
+    def expand(self, run_spec: RunSpec) -> List[RunSpec]:
+        return [
+            replace(
+                run_spec,
+                name=f"{run_spec.name}{',' if ':' in run_spec.name else ':'}eval_split={self.split}",
+                adapter_spec=replace(run_spec.adapter_spec, eval_splits=[self.split]),
+            )
+        ]
+
+
 ############################################################
 
 
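The run-name suffix logic in EvalSplitRunExpander is compact enough to miss; a standalone sketch of the same expression (run entry names are illustrative):

    def with_eval_split(name: str, split: str) -> str:
        # ":" introduces the first run-spec argument; later arguments are comma-separated.
        return f"{name}{',' if ':' in name else ':'}eval_split={split}"

    assert with_eval_split("mmlu", "test") == "mmlu:eval_split=test"
    assert with_eval_split("mmlu:subject=anatomy", "test") == "mmlu:subject=anatomy,eval_split=test"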
@@ -688,6 +826,20 @@ def mandarin_to_cantonese() -> PerturbationSpec:
     )
 
 
+def translate(language_code: str) -> PerturbationSpec:
+    return PerturbationSpec(
+        class_name="helm.benchmark.augmentations.translate_perturbation.TranslatePerturbation",
+        args={"language_code": language_code},
+    )
+
+
+def suffix(text: str) -> PerturbationSpec:
+    return PerturbationSpec(
+        class_name="helm.benchmark.augmentations.suffix_perturbation.SuffixPerturbation",
+        args={"suffix": text},
+    )
+
+
 # Specifies the data augmentations that we're interested in trying out.
 # Concretely, this is a mapping from the name (which is specified in a conf
 # file or the CLI) to a list of options to try, where each option is a list of perturbations.
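Both helpers are thin wrappers that bind one argument into a PerturbationSpec. A sketch of what a call evaluates to, using a stand-in dataclass (the real PerturbationSpec lives in helm.benchmark.augmentations.perturbation):

    from dataclasses import dataclass
    from typing import Any, Dict

    @dataclass(frozen=True)
    class PerturbationSpec:  # stand-in for illustration only
        class_name: str
        args: Dict[str, Any]

    # suffix("oil painting") from the diff above evaluates to:
    spec = PerturbationSpec(
        class_name="helm.benchmark.augmentations.suffix_perturbation.SuffixPerturbation",
        args={"suffix": "oil painting"},
    )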
@@ -879,6 +1031,21 @@ PERTURBATION_SPECS_DICT: Dict[str, Dict[str, List[PerturbationSpec]]] = {
             mandarin_to_cantonese(),
         ]
     },
+    # Multilinguality
+    "chinese": {"chinese": [translate(language_code="zh-CN")]},
+    "hindi": {"hindi": [translate(language_code="hi")]},
+    "spanish": {"spanish": [translate(language_code="es")]},
+    # Styles
+    "art": {
+        "art": [
+            suffix("oil painting"),
+            suffix("watercolor"),
+            suffix("pencil sketch"),
+            suffix("animation"),
+            suffix("vector graphics"),
+            suffix("pixel art"),
+        ]
+    },
 }
 
 
@@ -1219,12 +1386,14 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     NewlineRunExpander,
     StopRunExpander,
     FormatPromptRunExpander,
+    FollowFormatInstructionsRunExpander,
     AddToStopRunExpander,
     GlobalPrefixRunExpander,
     NumTrainTrialsRunExpander,
     MaxTrainInstancesRunExpander,
     MaxEvalInstancesRunExpander,
     NumOutputsRunExpander,
+    NumTrialRunExpander,
     ModelRunExpander,
     ModelDeploymentRunExpander,
     DataAugmentationRunExpander,
@@ -1232,6 +1401,7 @@ RUN_EXPANDER_SUBCLASSES: List[Type[RunExpander]] = [
     NumPromptTokensRunExpander,
     NumOutputTokensRunExpander,
     ChatMLRunExpander,
+    EvalSplitRunExpander,
 ]
 
 

helm/benchmark/run_spec.py
ADDED
@@ -0,0 +1,93 @@
+from dataclasses import dataclass, field
+import importlib
+import os
+import pkgutil
+from typing import Callable, Dict, Iterable, List, Optional, TypeVar
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.augmentations.data_augmenter import DataAugmenterSpec
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+from helm.benchmark.annotation.annotator import AnnotatorSpec
+
+
+@dataclass(frozen=True)
+class RunSpec:
+    """
+    Specifies how to do a single run, which gets a scenario, adapts it, and
+    computes a list of stats based on the defined metrics.
+    """
+
+    name: str
+    """Unique identifier of the RunSpec"""
+
+    scenario_spec: ScenarioSpec
+    """Which scenario"""
+
+    adapter_spec: AdapterSpec
+    """Specifies how to adapt an instance into a set of requests"""
+
+    metric_specs: List[MetricSpec]
+    """What to evaluate on"""
+
+    data_augmenter_spec: DataAugmenterSpec = DataAugmenterSpec()
+    """Data augmenter. The default `DataAugmenterSpec` does nothing."""
+
+    groups: List[str] = field(default_factory=list)
+    """Groups that this run spec belongs to (for aggregation)"""
+
+    annotators: Optional[List[AnnotatorSpec]] = None
+    """Annotators to use for this run spec"""
+
+    def __post_init__(self):
+        """
+        `self.name` is used as the name of the output folder for the `RunSpec`.
+        Clean up `self.name` by replacing any "/"'s with "_".
+        """
+        # TODO: Don't mutate name! clean this up before passing it into the constructor here
+        object.__setattr__(self, "name", self.name.replace(os.path.sep, "_"))
+
+
+RunSpecFunction = Callable[..., RunSpec]
+
+
+_REGISTERED_RUN_SPEC_FUNCTIONS: Dict[str, RunSpecFunction] = {}
+"""Dict of run spec function names to run spec functions."""
+
+
+F = TypeVar("F", bound=RunSpecFunction)
+
+
+def run_spec_function(name: str) -> Callable[[F], F]:
+    """Register the run spec function under the given name."""
+
+    def wrap(func: F) -> F:
+        if name in _REGISTERED_RUN_SPEC_FUNCTIONS:
+            raise ValueError(f"A run spec function with name {name} already exists")
+        _REGISTERED_RUN_SPEC_FUNCTIONS[name] = func
+        return func
+
+    return wrap
+
+
+# Copied from https://docs.python.org/3/library/pkgutil.html#pkgutil.iter_modules
+def _iter_namespace(ns_pkg) -> Iterable[pkgutil.ModuleInfo]:
+    # Specifying the second argument (prefix) to iter_modules makes the
+    # returned name an absolute name instead of a relative one. This allows
+    # import_module to work without having to do additional modification to
+    # the name.
+    return pkgutil.iter_modules(ns_pkg.__path__, ns_pkg.__name__ + ".")
+
+
+def discover_run_spec_functions() -> None:
+    """Discover and register all run spec functions under helm.benchmark.run_specs"""
+    import helm.benchmark.run_specs  # noqa
+
+    for finder, name, ispkg in _iter_namespace(helm.benchmark.run_specs):
+        importlib.import_module(name)
+
+
+def get_run_spec_function(name: str) -> Optional[RunSpecFunction]:
+    """Return the run spec function registered under the given name."""
+    discover_run_spec_functions()
+    return _REGISTERED_RUN_SPEC_FUNCTIONS.get(name)
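A usage sketch of the new registry, assuming crfm-helm 0.5.1 is installed; the scenario name and function body here are hypothetical (real ones live under helm/benchmark/run_specs/):

    from helm.benchmark.run_spec import RunSpec, run_spec_function, get_run_spec_function

    @run_spec_function("my_scenario")  # hypothetical name
    def get_my_scenario_spec() -> RunSpec:
        raise NotImplementedError  # a real function builds and returns a RunSpec

    # Lookup first imports every module under helm.benchmark.run_specs
    # so that their @run_spec_function decorators execute.
    assert get_run_spec_function("my_scenario") is get_my_scenario_spec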

helm/benchmark/run_spec_factory.py
ADDED
@@ -0,0 +1,163 @@
+import dataclasses
+from typing import List
+
+from helm.benchmark.adaptation.adapter_spec import (
+    ADAPT_GENERATION,
+    ADAPT_MULTIPLE_CHOICE_JOINT,
+)
+from helm.benchmark.model_deployment_registry import (
+    ModelDeployment,
+    get_default_model_deployment_for_model,
+    get_model_deployment,
+)
+from helm.benchmark.model_metadata_registry import (
+    ANTHROPIC_CLAUDE_1_MODEL_TAG,
+    ANTHROPIC_CLAUDE_2_MODEL_TAG,
+    ANTHROPIC_CLAUDE_3_MODEL_TAG,
+    BUGGY_TEMP_0_TAG,
+    CHATML_MODEL_TAG,
+    GOOGLE_GEMINI_PRO_VISION_V1_TAG,
+    IDEFICS_INSTRUCT_MODEL_TAG,
+    LLAVA_MODEL_TAG,
+    OPEN_FLAMINGO_MODEL_TAG,
+    NLG_PREFIX_TAG,
+    NO_NEWLINES_TAG,
+    VISION_LANGUAGE_MODEL_TAG,
+    IDEFICS_MODEL_TAG,
+    ModelMetadata,
+    get_model_metadata,
+)
+from helm.benchmark.run_expander import (
+    RUN_EXPANDERS,
+    AnthropicClaude2RunExpander,
+    AnthropicClaude3RunExpander,
+    ChatMLRunExpander,
+    GlobalPrefixRunExpander,
+    IDEFICSInstructRunExpander,
+    IncreaseTemperatureRunExpander,
+    IncreaseMaxTokensRunExpander,
+    LlavaRunExpander,
+    OpenFlamingoRunExpander,
+    StopRunExpander,
+)
+from helm.benchmark.run_spec import RunSpec, get_run_spec_function
+from helm.common.general import singleton
+from helm.common.object_spec import ObjectSpec
+
+
+def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
+    """
+    Takes a specification (name, args) and returns a list of `RunSpec`s.
+    """
+    # Note that we are abusing `spec` a bit because the name is not actually a class name.
+    name = spec.class_name
+    args = spec.args
+
+    run_spec_function = get_run_spec_function(name)
+    if run_spec_function is None:
+        raise ValueError(f"Unknown run spec name: {name}")
+
+    # Peel off the run expanders (e.g., model)
+    expanders = [RUN_EXPANDERS[key](value) for key, value in args.items() if key in RUN_EXPANDERS]  # type: ignore
+    args = dict((key, value) for key, value in args.items() if key not in RUN_EXPANDERS)
+
+    run_specs: List[RunSpec] = [run_spec_function(**args)]
+
+    # Apply expanders
+    for expander in expanders:
+        run_specs = [
+            child_run_spec for parent_run_spec in run_specs for child_run_spec in expander.expand(parent_run_spec)
+        ]
+
+    def alter_run_spec(run_spec: RunSpec) -> RunSpec:
+        if not run_spec.adapter_spec.model and not run_spec.adapter_spec.model_deployment:
+            raise ValueError("At least one of model_deployment and model must be specified")
+        elif not run_spec.adapter_spec.model and run_spec.adapter_spec.model_deployment:
+            # Infer model from model deployment
+            default_model_name = get_model_deployment(run_spec.adapter_spec.model_deployment).model_name
+            if not default_model_name:
+                default_model_name = run_spec.adapter_spec.model_deployment
+            run_spec = dataclasses.replace(
+                run_spec,
+                adapter_spec=dataclasses.replace(run_spec.adapter_spec, model=default_model_name),
+            )
+        elif run_spec.adapter_spec.model and not run_spec.adapter_spec.model_deployment:
+            # Infer model deployment from model
+            default_model_deployment = get_default_model_deployment_for_model(run_spec.adapter_spec.model)
+            if not default_model_deployment:
+                raise ValueError(
+                    f"Unknown model or no default model deployment found for model {run_spec.adapter_spec.model}"
+                )
+            run_spec = dataclasses.replace(
+                run_spec,
+                adapter_spec=dataclasses.replace(run_spec.adapter_spec, model_deployment=default_model_deployment),
+            )
+
+        # Both model and model_deployment should now be filled
+        assert run_spec.adapter_spec.model_deployment
+        assert run_spec.adapter_spec.model
+
+        model: ModelMetadata = get_model_metadata(run_spec.adapter_spec.model)
+        deployment: ModelDeployment = get_model_deployment(run_spec.adapter_spec.model_deployment)
+        if run_spec.adapter_spec.model != deployment.model_name:
+            raise ValueError(
+                f"Invalid RunSpec: selected model deployment '{run_spec.adapter_spec.model_deployment}'"
+                f"for model '{run_spec.adapter_spec.model}' but the model deployment is "
+                f"for a different model '{deployment.model_name}'"
+            )
+        # For models that strip newlines, when we're generating, we need to set
+        # the delimiter to be '###' so we stop properly.
+        if NO_NEWLINES_TAG in model.tags and run_spec.adapter_spec.method in (
+            ADAPT_GENERATION,
+            ADAPT_MULTIPLE_CHOICE_JOINT,
+        ):
+            stop_expander = StopRunExpander(value="hash")
+            run_spec = singleton(stop_expander.expand(run_spec))
+
+        if NLG_PREFIX_TAG in model.tags:
+            global_prefix_expander = GlobalPrefixRunExpander(value="nlg")
+            run_spec = singleton(global_prefix_expander.expand(run_spec))
+
+        if CHATML_MODEL_TAG in model.tags:
+            chatml_expander = ChatMLRunExpander()
+            run_spec = singleton(chatml_expander.expand(run_spec))
+
+        # Anthropic Claude 1 and 2 prompts
+        if ANTHROPIC_CLAUDE_1_MODEL_TAG in model.tags or ANTHROPIC_CLAUDE_2_MODEL_TAG in model.tags:
+            run_spec = singleton(AnthropicClaude2RunExpander().expand(run_spec))
+
+        # Anthropic Claude 3
+        if ANTHROPIC_CLAUDE_3_MODEL_TAG in model.tags:
+            run_spec = singleton(AnthropicClaude3RunExpander().expand(run_spec))
+
+        # Google Gemini Vision v1.0 returns an empty completion or throws an error if max_tokens is 1
+        if (
+            VISION_LANGUAGE_MODEL_TAG in model.tags
+            and GOOGLE_GEMINI_PRO_VISION_V1_TAG in model.tags
+            and run_spec.adapter_spec.max_tokens == 1
+        ):
+            run_spec = singleton(IncreaseMaxTokensRunExpander(value=1).expand(run_spec))
+
+        # IDEFICS special handling
+        if IDEFICS_MODEL_TAG in model.tags:
+            if IDEFICS_INSTRUCT_MODEL_TAG in model.tags:
+                run_spec = singleton(IDEFICSInstructRunExpander().expand(run_spec))
+
+        # Llava
+        if LLAVA_MODEL_TAG in model.tags:
+            run_spec = singleton(LlavaRunExpander().expand(run_spec))
+
+        # OpenFlamingo
+        if OPEN_FLAMINGO_MODEL_TAG in model.tags:
+            run_spec = singleton(OpenFlamingoRunExpander().expand(run_spec))
+
+        # For multiple choice
+        if BUGGY_TEMP_0_TAG in model.tags and run_spec.adapter_spec.temperature == 0:
+            increase_temperature_expander = IncreaseTemperatureRunExpander(value=1e-4)
+            run_spec = singleton(increase_temperature_expander.expand(run_spec))
+
+        return run_spec
+
+    run_specs = [alter_run_spec(run_spec) for run_spec in run_specs]
+
+    return run_specs
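A sketch of how a run entry flows through the new factory, again assuming an installed crfm-helm 0.5.1; the entry values are illustrative:

    from helm.common.object_spec import ObjectSpec
    from helm.benchmark.run_spec_factory import construct_run_specs

    # "mmlu" resolves via get_run_spec_function; "model" is peeled off and handled
    # by ModelRunExpander; remaining args are passed to the run spec function.
    run_specs = construct_run_specs(
        ObjectSpec(class_name="mmlu", args={"subject": "anatomy", "model": "openai/gpt2"})
    )

alter_run_spec then fills in the model deployment and applies any model-specific expanders (Claude prompts, IDEFICS/Llava/OpenFlamingo formatting, and so on) before the RunSpecs reach the runner.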