crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
- crfm_helm-0.5.0.dist-info/RECORD +642 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +37 -2
- helm/benchmark/adaptation/adapters/adapter.py +4 -42
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/prompt.py +7 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/cleva_perturbation.py +7 -14
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
- helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +2 -2
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +0 -7
- helm/benchmark/augmentations/perturbation.py +20 -7
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +2 -2
- helm/benchmark/augmentations/test_perturbation.py +11 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +38 -0
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +37 -7
- helm/benchmark/metrics/basic_metrics.py +172 -641
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics.py +4 -3
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +6 -112
- helm/benchmark/metrics/dry_run_metrics.py +5 -3
- helm/benchmark/metrics/efficiency_metrics.py +206 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +5 -5
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +6 -7
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +8 -8
- helm/benchmark/metrics/test_classification_metrics.py +9 -6
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +164 -41
- helm/benchmark/model_metadata_registry.py +181 -35
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +50 -17
- helm/benchmark/presentation/schema.py +28 -46
- helm/benchmark/presentation/summarize.py +213 -96
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +14 -9
- helm/benchmark/presentation/test_summarize.py +5 -0
- helm/benchmark/run.py +66 -54
- helm/benchmark/run_expander.py +342 -31
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +162 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +501 -0
- helm/benchmark/runner.py +116 -69
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/cleva_scenario.py +43 -46
- helm/benchmark/scenarios/code_scenario.py +3 -2
- helm/benchmark/scenarios/commonsense_scenario.py +171 -191
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/legalbench_scenario.py +123 -0
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +3 -3
- helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
- helm/benchmark/scenarios/raft_scenario.py +2 -6
- helm/benchmark/scenarios/scenario.py +14 -2
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +22 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/the_pile_scenario.py +6 -7
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
- helm/benchmark/server.py +59 -2
- helm/benchmark/slurm_jobs.py +12 -0
- helm/benchmark/slurm_runner.py +79 -51
- helm/benchmark/static/benchmarking.js +3 -4
- helm/benchmark/static/contamination.yaml +1 -1
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +824 -0
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vlm.yaml +576 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
- helm/benchmark/static_build/assets/index-d839df55.js +9 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_model_deployment_definition.py +90 -0
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/tokenizer_config_registry.py +10 -14
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -35
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/no_decoding_window_service.py +32 -0
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +24 -269
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +5 -12
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +213 -24
- helm/clients/auto_client.py +215 -0
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +67 -55
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +6 -17
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +7 -8
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +6 -10
- helm/{proxy/clients → clients}/huggingface_client.py +134 -92
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
- helm/{proxy/clients → clients}/megatron_client.py +13 -7
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +302 -0
- helm/{proxy/clients → clients}/palmyra_client.py +15 -12
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +15 -15
- helm/clients/test_client.py +100 -0
- helm/clients/test_huggingface_client.py +70 -0
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +23 -12
- helm/{proxy/clients → clients}/together_client.py +18 -71
- helm/clients/vertexai_client.py +391 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vlm_client.py +104 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +24 -179
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/concurrency.py +32 -0
- helm/common/credentials_utils.py +28 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +29 -10
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +24 -1
- helm/common/key_value_store.py +113 -0
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +88 -0
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +2 -2
- helm/common/request.py +36 -27
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +6 -3
- helm/config/__init__.py +0 -0
- helm/config/model_deployments.yaml +1942 -0
- helm/config/model_metadata.yaml +2201 -0
- helm/config/tokenizer_configs.yaml +362 -0
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +13 -5
- helm/proxy/example_queries.py +29 -17
- helm/proxy/retry.py +8 -2
- helm/proxy/server.py +77 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +103 -20
- helm/proxy/services/service.py +34 -2
- helm/proxy/services/test_remote_service.py +7 -6
- helm/proxy/services/test_service.py +27 -18
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/py.typed +0 -0
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
- helm/tokenizers/auto_tokenizer.py +93 -0
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_anthropic_tokenizer.py +82 -0
- helm/tokenizers/test_huggingface_tokenizer.py +136 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/tokenizers/vertexai_tokenizer.py +97 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.3.0.dist-info/RECORD +0 -396
- helm/benchmark/vlm_run_specs.py +0 -71
- helm/benchmark/window_services/anthropic_window_service.py +0 -68
- helm/benchmark/window_services/bloom_window_service.py +0 -35
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/gptj_window_service.py +0 -38
- helm/benchmark/window_services/gptneox_window_service.py +0 -41
- helm/benchmark/window_services/http_model_window_service.py +0 -28
- helm/benchmark/window_services/huggingface_window_service.py +0 -59
- helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
- helm/benchmark/window_services/llama_window_service.py +0 -28
- helm/benchmark/window_services/luminous_window_service.py +0 -67
- helm/benchmark/window_services/megatron_window_service.py +0 -10
- helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
- helm/benchmark/window_services/openai_window_service.py +0 -13
- helm/benchmark/window_services/opt_window_service.py +0 -35
- helm/benchmark/window_services/palmyra_window_service.py +0 -45
- helm/benchmark/window_services/remote_window_service.py +0 -48
- helm/benchmark/window_services/santacoder_window_service.py +0 -27
- helm/benchmark/window_services/starcoder_window_service.py +0 -27
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/benchmark/window_services/wider_openai_window_service.py +0 -52
- helm/proxy/clients/aleph_alpha_client.py +0 -99
- helm/proxy/clients/auto_client.py +0 -461
- helm/proxy/clients/goose_ai_client.py +0 -100
- helm/proxy/clients/microsoft_client.py +0 -182
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/remote_model_registry.py +0 -28
- helm/proxy/clients/simple_client.py +0 -61
- helm/proxy/clients/test_anthropic_client.py +0 -63
- helm/proxy/clients/test_client.py +0 -31
- helm/proxy/clients/test_huggingface_client.py +0 -87
- helm/proxy/models.py +0 -963
- helm/proxy/test_models.py +0 -27
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
- helm/proxy/token_counters/test_openai_token_counter.py +0 -79
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
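The most sweeping structural change in this release is visible in the rename entries above: `helm/proxy/clients` and `helm/proxy/tokenizers` move to the top-level packages `helm/clients` and `helm/tokenizers`, and the hard-coded model registry `helm/proxy/models.py` gives way to the new configuration files under `helm/config/`. Below is a minimal migration sketch for downstream imports; the class names are taken from the renamed modules but should be checked against the version you actually install:

```python
# crfm-helm 0.3.0: clients and tokenizers lived under helm.proxy:
# from helm.proxy.clients.huggingface_client import HuggingFaceClient
# from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

# crfm-helm 0.5.0: the renames above promote them to top-level packages:
from helm.clients.huggingface_client import HuggingFaceClient
from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
```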
helm/benchmark/scenarios/vision_language/seed_bench_scenario.py ADDED
@@ -0,0 +1,129 @@
+import os.path
+from typing import Dict, List
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists
+
+
+class SEEDBenchScenario(Scenario):
+    """
+    SEED-Bench: Benchmarking Multimodal LLMs with Generative Comprehension
+
+    Based on powerful Large Language Models (LLMs), recent generative Multimodal
+    Large Language Models (MLLMs) have gained prominence as a pivotal research area.
+    In Seed-Bench, we address the evaluation of generative comprehension in MLLMs
+    as a preliminary step towards a comprehensive assessment of generative models.
+    SEED-Bench consists of 19K multiple choice questions with accurate human annotations
+    (x 6 larger than existing benchmarks), which spans 12 evaluation dimensions
+    including the comprehension of both the image and video modality. We select 9
+    evaluation aspects that take image as the input. In the benchmark,
+    Multiple-choice questions with groundtruth options derived from human
+    annotation enables an objective and efficient assessment of model performance,
+    eliminating the need for human or GPT intervention during evaluation. We employ
+    the multiple-choice metric for evaluating the performance of models.
+
+    @article{li2023seed,
+        title={Seed-bench: Benchmarking multimodal llms with generative comprehension},
+        author={Li, Bohao and Wang, Rui and Wang, Guangzhi and Ge, Yuying and Ge, Yixiao and Shan, Ying},
+        journal={arXiv preprint arXiv:2307.16125},
+        year={2023}
+    }
+
+    Paper: https://arxiv.org/abs/2307.16125
+    """
+
+    SEED_BENCH_HUGGINGFACE_DATASET_NAME: str = "lmms-lab/SEED-Bench"
+
+    SUBJECTS: Dict[str, int] = {
+        "scene-understanding": 1,
+        "instance-identity": 2,
+        "instance-attributes": 3,
+        "instance-location": 4,
+        "instances-counting": 5,
+        "spatial-relation": 6,
+        "instance-interaction": 7,
+        "visual-reasoning": 8,
+        "text-understanding": 9,
+    }
+
+    name = "seed_bench"
+    description = "Evaluate multimodal models on ([paper](https://arxiv.org/abs/2307.16125))."
+    tags = ["vision-language"]
+
+    def __init__(self, subject: str):
+        super().__init__()
+        assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
+        self._subject: str = subject
+
+    def get_subject_name(self, subject_name: str) -> str:
+        return "-".join(subject_name.lower().split())
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        images_path: str = os.path.join(output_path, "images")
+        ensure_directory_exists(images_path)
+
+        # There is only the test split in Unicorn benchmark
+        instances: List[Instance] = []
+        # Process the test set
+        # Two open-ended generation instances and
+        # one multi-choice generation instance per row
+        for row in tqdm(
+            load_dataset(
+                self.SEED_BENCH_HUGGINGFACE_DATASET_NAME,
+                split=TEST_SPLIT,
+                cache_dir=output_path,
+            )
+        ):
+            question_type_key: str = self.get_subject_name(self._subject)
+            if row["question_type_id"] != self.SUBJECTS[question_type_key]:
+                continue
+            question_id: str = row["question_id"]
+            # Download the image
+            # Save the image locally
+            image_path: str = os.path.join(images_path, f"{question_id}.png")
+            if not os.path.exists(image_path):
+                # some images are CMYK mode, convert to RGB.
+                row["image"][0].convert("RGB").save(image_path, "PNG", optimize=True)
+
+            # Add the references
+            references: List[Reference] = []
+            question: str = row["question"]
+            answer: str
+            content: List[MediaObject]
+            options: List[str] = [row["choice_a"], row["choice_b"], row["choice_c"], row["choice_d"]]
+            answer = row["answer"].strip()
+            # The given correct answer is a letter, but we need an index
+            correct_answer_index: int = ord(answer) - ord("A")
+            # The options are originally appended to the question
+
+            for i, option in enumerate(options):
+                reference: Reference
+                is_correct: bool = i == correct_answer_index
+                reference = Reference(Output(text=option), tags=[CORRECT_TAG] if is_correct else [])
+                references.append(reference)
+
+            content = [
+                MediaObject(location=image_path, content_type="image/png"),
+                MediaObject(text=question, content_type="text/plain"),
+            ]
+            instances.append(
+                Instance(
+                    Input(multimedia_content=MultimediaObject(content)),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
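For orientation, a minimal usage sketch based only on the scenario code above; the output path is arbitrary, and the first call downloads the `lmms-lab/SEED-Bench` dataset from HuggingFace:

```python
from helm.benchmark.scenarios.vision_language.seed_bench_scenario import SEEDBenchScenario

# Keeps only test-split rows whose question_type_id matches the chosen subject.
scenario = SEEDBenchScenario(subject="scene-understanding")
instances = scenario.get_instances(output_path="benchmark_output/scenarios/seed_bench")
# Each instance carries four answer options as references; exactly one is tagged CORRECT_TAG.
print(len(instances), instances[0].references)
```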
helm/benchmark/scenarios/vision_language/unicorn_scenario.py ADDED
@@ -0,0 +1,108 @@
+import os.path
+from typing import Dict, List
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+
+
+class UnicornScenario(Scenario):
+    """
+    How Many Unicorns are in this Image? A Safety Evaluation Benchmark of Vision LLMs
+
+    We shift our focus from evaluating standard performance to introducing a comprehensive safety evaluation
+    suite Unicorn, covering both out-of-distribution (OOD) generalization and adversarial robustness. For the OOD
+    evaluation, we present two novel VQA datasets --- OODCV-VQA and Sketchy-VQA, each with one variant, designed
+    to test model performance under challenging conditions. In the OOD scenario, questions are matched with
+    boolean or numerical answers, and we use exact match metrics for evaluation. When comparing OOD Sketchy-VQA
+    with its synthesized in-distribution counterpart, we found an average model output F1 drop of 8.9%,
+    highlighting the challenging nature of the OOD scenario in the Unicorn benchmark.
+
+    @article{tu2023unicorns,
+        title={How Many Unicorns Are in This Image? A Safety Evaluation Benchmark for Vision LLMs},
+        author={Tu, Haoqin and Cui, Chenhang and Wang, Zijun and Zhou, Yiyang and Zhao, Bingchen and Han,
+        Junlin and Zhou, Wangchunshu and Yao, Huaxiu and Xie, Cihang},
+        journal={arXiv preprint arXiv:2311.16101},
+        year={2023}
+    }
+
+    Paper: https://arxiv.org/abs/2311.16101
+    """
+
+    UNICORN_HUGGINGFACE_DATASET_NAME: str = "PahaII/unicorn"
+
+    IMAGE_URL: str = "https://huggingface.co/datasets/PahaII/unicorn/resolve/main/images/{image_path}?download=true"
+
+    SUBJECTS: List[str] = ["OODCV-VQA", "OODCV-Counterfactual", "Sketchy-VQA", "Sketchy-Challenging"]
+
+    IMG_TYPE: Dict[str, str] = {
+        "OODCV-VQA": "jpeg",
+        "OODCV-Counterfactual": "jpeg",
+        "Sketchy-VQA": "png",
+        "Sketchy-Challenging": "png",
+    }
+
+    name = "unicorn"
+    description = (
+        "Evaluate multimodal models on two out-of-distribution scenarios with four subjects"
+        " ([paper](https://arxiv.org/abs/2311.16101))."
+    )
+    tags = ["vision-language"]
+
+    def __init__(self, subject: str):
+        super().__init__()
+        assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
+        self._subject: str = subject
+        self._image_type: str = self.IMG_TYPE[self._subject]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        images_path: str = os.path.join(output_path, "images")
+        ensure_directory_exists(images_path)
+
+        # There is only the test split in Unicorn benchmark
+        instances: List[Instance] = []
+        question_data_files = {TEST_SPLIT: f"{self._subject}.json"}
+
+        # Process the test set
+        for row in tqdm(
+            load_dataset(
+                self.UNICORN_HUGGINGFACE_DATASET_NAME,
+                data_files=question_data_files,
+                split=TEST_SPLIT,
+                cache_dir=output_path,
+            )
+        ):
+            # Download the image
+            image_path: str = row["image_path"]
+            local_image_path: str = os.path.join(output_path, image_path)
+            ensure_file_downloaded(
+                source_url=self.IMAGE_URL.format(image_path=image_path),
+                target_path=local_image_path,
+                unpack=False,
+            )
+
+            content: List[MediaObject] = [
+                MediaObject(location=local_image_path, content_type=f"image/{self._image_type}"),
+                MediaObject(text=row["question"], content_type="text/plain"),
+            ]
+            answer: str = row["answer"]
+            instances.append(
+                Instance(
+                    Input(multimedia_content=MultimediaObject(content)),
+                    references=[Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
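A small sketch of how the new scenario is parameterized, using only the class constants defined above (note that `_image_type` is a private attribute, read here purely for illustration):

```python
from helm.benchmark.scenarios.vision_language.unicorn_scenario import UnicornScenario

for subject in UnicornScenario.SUBJECTS:
    scenario = UnicornScenario(subject=subject)
    # OODCV-* subjects ship JPEG images; Sketchy-* subjects ship PNG images.
    print(subject, scenario._image_type)
```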
helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py ADDED
@@ -0,0 +1,107 @@
+from typing import Dict, List, Set
+import json
+import os
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_file_downloaded
+
+
+class VizWizScenario(Scenario):
+    """
+    VizWiz is a real-world visual question answering dataset consisting of questions
+    asked by people who are blind. It originates from a natural visual question answering
+    setting where blind people each took an image and recorded a spoken question about it,
+    together with 10 crowdsourced answers per visual question.
+
+    Version as of January 1, 2020:
+
+    - 20,523 training image/question pairs
+    - 205,230 training answer/answer confidence pairs
+    - 4,319 validation image/question pairs
+    - 43,190 validation answer/answer confidence pairs
+
+    where answer confidences are one of {"yes", "maybe", "no"}.
+
+    Answers are publicly shared for the train and validation splits and hidden for the test split.
+
+    Paper: https://arxiv.org/abs/1802.08218
+    Website: https://vizwiz.org/tasks-and-datasets/vqa
+    """
+
+    # Annotations are not available for the test set
+    ANNOTATIONS_URL: str = "https://vizwiz.cs.colorado.edu/VizWiz_final/vqa_data/Annotations.zip"
+    SPLIT_TO_ANNOTATIONS_FILE: Dict[str, str] = {
+        TRAIN_SPLIT: "train.json",
+        VALID_SPLIT: "val.json",
+    }
+
+    SPLIT_TO_IMAGES: Dict[str, str] = {
+        TRAIN_SPLIT: "https://vizwiz.cs.colorado.edu/VizWiz_final/images/train.zip",
+        VALID_SPLIT: "https://vizwiz.cs.colorado.edu/VizWiz_final/images/val.zip",
+    }
+
+    name = "viz_wiz"
+    description = (
+        "Real-world VQA dataset consisting of questions asked by "
+        "people who are blind ([paper](https://arxiv.org/abs/1802.08218))."
+    )
+    tags = ["vision-language", "visual question answering"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the questions and annotations
+        annotations_path: str = os.path.join(output_path, "annotations")
+        ensure_file_downloaded(
+            source_url=self.ANNOTATIONS_URL,
+            target_path=annotations_path,
+            unpack=True,
+            unpack_type="unzip",
+        )
+
+        instances: List[Instance] = []
+        for split in [TRAIN_SPLIT, VALID_SPLIT]:
+            # Download the images for the split
+            images_path: str = os.path.join(output_path, split)
+            ensure_file_downloaded(
+                source_url=self.SPLIT_TO_IMAGES[split],
+                target_path=images_path,
+                unpack=True,
+                unpack_type="unzip",
+            )
+
+            annotations_split_path: str = os.path.join(annotations_path, self.SPLIT_TO_ANNOTATIONS_FILE[split])
+            with open(annotations_split_path) as f:
+                for image_annotation in json.load(f):
+                    image_path: str = os.path.join(images_path, image_annotation["image"])
+                    assert os.path.exists(image_path), f"Image {image_path} does not exist"
+
+                    content: List[MediaObject] = [
+                        MediaObject(location=image_path, content_type="image/jpeg"),
+                        MediaObject(text=image_annotation["question"], content_type="text/plain"),
+                    ]
+                    deduped_answers: Set[str] = {
+                        answer_json["answer"]
+                        for answer_json in image_annotation["answers"]
+                        if answer_json["answer_confidence"] == "yes"
+                    }
+
+                    instances.append(
+                        Instance(
+                            Input(multimedia_content=MultimediaObject(content)),
+                            references=[
+                                Reference(Output(text=answer), tags=[CORRECT_TAG]) for answer in deduped_answers
+                            ],
+                            split=split,
+                        )
+                    )
+
+        return instances
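The set comprehension above is the interesting detail: only answers whose crowdworker confidence is exactly `"yes"` become references, and duplicates collapse. A standalone sketch of that rule (the example annotation is made up):

```python
from typing import Any, Dict, Set

def confident_answers(image_annotation: Dict[str, Any]) -> Set[str]:
    # Mirrors the deduplication in VizWizScenario.get_instances.
    return {
        answer_json["answer"]
        for answer_json in image_annotation["answers"]
        if answer_json["answer_confidence"] == "yes"
    }

example = {"answers": [
    {"answer": "a red mug", "answer_confidence": "yes"},
    {"answer": "red mug", "answer_confidence": "maybe"},
    {"answer": "a red mug", "answer_confidence": "yes"},
]}
assert confident_answers(example) == {"a red mug"}
```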
helm/benchmark/scenarios/vision_language/vqa_scenario.py CHANGED
@@ -54,7 +54,7 @@ class VQAScenario(Scenario):
         TEST_SPLIT: "http://images.cocodataset.org/zips/test2015.zip",
     }
 
-    name = "
+    name = "vqa"
     description = "Open-ended questions about images ([paper](https://arxiv.org/abs/1612.00837))."
     tags = ["vision-language", "visual question answering"]
 
helm/benchmark/scenarios/wmt_14_scenario.py CHANGED
@@ -1,6 +1,6 @@
 from typing import List, Any
 from datasets import load_dataset
-from helm.common.hierarchical_logger import
+from helm.common.hierarchical_logger import htrack_block
 from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
 
 
|
@@ -59,26 +59,26 @@ class WMT14Scenario(Scenario):
|
|
|
59
59
|
return deduplicated_dataset
|
|
60
60
|
|
|
61
61
|
def get_instances(self, output_path: str) -> List[Instance]:
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
62
|
+
with htrack_block("Loading the HuggingFace dataset. The first time could take several minutes."):
|
|
63
|
+
subset_name = f"{self.source_language if self.source_language!='en' else self.target_language}-en"
|
|
64
|
+
hf_dataset: Any = load_dataset("wmt14", subset_name, trust_remote_code=True)
|
|
65
|
+
splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
|
|
66
66
|
|
|
67
67
|
instances: List[Instance] = []
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
68
|
+
with htrack_block("Generating instances"):
|
|
69
|
+
# Some training sets are too large, so we will only take a random subset of it.
|
|
70
|
+
hf_dataset["train"] = hf_dataset["train"].shuffle(seed=42)[:MAX_TRAIN_INSTANCES]
|
|
71
|
+
hf_dataset["train"]["translation"] = self._deduplicate(hf_dataset["train"]["translation"])
|
|
72
|
+
for example in hf_dataset["train"]["translation"]:
|
|
73
|
+
source_sentence: str = example[self.source_language]
|
|
74
|
+
target_sentence: str = example[self.target_language]
|
|
75
|
+
instances.append(
|
|
76
|
+
Instance(
|
|
77
|
+
input=Input(text=source_sentence),
|
|
78
|
+
references=[Reference(Output(text=target_sentence), tags=[CORRECT_TAG])],
|
|
79
|
+
split="train",
|
|
80
|
+
)
|
|
80
81
|
)
|
|
81
|
-
)
|
|
82
82
|
|
|
83
83
|
# No special handling needed for validation or test.
|
|
84
84
|
for split_name in ["validation", "test"]:
|
helm/benchmark/server.py CHANGED
@@ -5,10 +5,15 @@ Starts a local HTTP server to display benchmarking assets.
 
 import argparse
 import importlib_resources as resources
+import json
 from os import path
 import urllib
 
-from bottle import Bottle, static_file
+from bottle import Bottle, static_file, HTTPResponse
+import yaml
+
+from helm.benchmark.presentation.schema import SCHEMA_CLASSIC_YAML_FILENAME
+from helm.common.general import serialize_dates
 
 
 app = Bottle()
@@ -28,6 +33,35 @@ def serve_config():
     )
 
 
+# Shim for running helm-server for old suites from old version of helm-summarize
+# that do not contain schema.json.
+#
+# The HELM web frontend expects to find a schema.json at /benchmark_output/runs/<version>/schema.json
+# which is produced by the new version of helm-summarize but not the old version.
+# When serving a suite produced by the old version of helm-summarize, the schena.json will be missing.
+# This shim supports those suites by serving a schena.json that is dynamically computed from schema_classic.yaml
+#
+# We will remove this in a few months after most users have moved to the new version of helm-summarize.
+#
+# TODO(2024-03-01): Remove this.
+@app.get("/benchmark_output/<runs_or_releases:re:runs|releases>/<version>/schema.json")
+def server_schema(runs_or_releases, version):
+    relative_schema_path = path.join(runs_or_releases, version, "schema.json")
+    absolute_schema_path = path.join(app.config["helm.outputpath"], relative_schema_path)
+    if path.isfile(absolute_schema_path):
+        response = static_file(relative_schema_path, root=app.config["helm.outputpath"])
+    else:
+        # Suite does not contain schema.json
+        # Fall back to schema_classic.yaml from the static directory
+        classic_schema_path = path.join(app.config["helm.staticpath"], SCHEMA_CLASSIC_YAML_FILENAME)
+        with open(classic_schema_path, "r") as f:
+            response = HTTPResponse(json.dumps(yaml.safe_load(f), indent=2, default=serialize_dates))
+    response.set_header("Cache-Control", "no-cache, no-store, must-revalidate")
+    response.set_header("Expires", "0")
+    response.content_type = "application/json"
+    return response
+
+
 @app.get("/benchmark_output/<filename:path>")
 def serve_benchmark_output(filename):
     response = static_file(filename, root=app.config["helm.outputpath"])
@@ -36,6 +70,14 @@ def serve_benchmark_output(filename):
     return response
 
 
+@app.get("/cache/output/<filename:path>")
+def serve_cache_output(filename):
+    response = static_file(filename, root=app.config["helm.cacheoutputpath"])
+    response.set_header("Cache-Control", "no-cache, no-store, must-revalidate")
+    response.set_header("Expires", "0")
+    return response
+
+
 @app.get("/")
 @app.get("/<filename:path>")
 def serve_static(filename="index.html"):
@@ -53,6 +95,12 @@ def main():
         help="The location of the output path (filesystem path or URL)",
         default="benchmark_output",
     )
+    parser.add_argument(
+        "--cache-output-path",
+        type=str,
+        help="The location of the filesystem cache output folder (filesystem path or URL)",
+        default="prod_env/cache/output",
+    )
     parser.add_argument(
         "--suite",
         type=str,
@@ -65,6 +113,11 @@ def main():
         default=None,
         help="Experimental: The release to serve. If unset, don't serve a release, and serve the latest suite instead.",
     )
+    parser.add_argument(
+        "--jquery",
+        action="store_true",
+        help="Whether to serve the legacy jQuery frontend instead of the React frontend.",
+    )
     args = parser.parse_args()
 
     if args.suite and args.release:
@@ -73,7 +126,8 @@ def main():
     # Determine the location of the static directory.
     # This is a hack: it assumes that the static directory has a physical location,
     # which is not always the case (e.g. when using zipimport).
-
+    static_package_name = "helm.benchmark.static" if args.jquery else "helm.benchmark.static_build"
+    resource_path = resources.files(static_package_name).joinpath("index.html")
     with resources.as_file(resource_path) as resource_filename:
         static_path = str(resource_filename.parent)
 
@@ -83,16 +137,19 @@ def main():
         # Output path is a URL, so set the output path base URL in the frontend to that URL
         # so that the frontend reads from that URL directly.
         app.config["helm.outputpath"] = None
+        # TODO: figure out helm.cacheoutputpath
        app.config["helm.outputurl"] = args.output_path
     else:
         # Output path is a location on disk, so set the output path base URL to /benchmark_output
         # and then serve files from the location on disk at that URL.
         app.config["helm.outputpath"] = path.abspath(args.output_path)
+        app.config["helm.cacheoutputpath"] = path.abspath(args.cache_output_path)
         app.config["helm.outputurl"] = "benchmark_output"
 
     app.config["helm.suite"] = args.suite or "latest"
     app.config["helm.release"] = args.release
 
+    print(f"After the web server has started, go to http://localhost:{args.port} to view your website.\n")
     app.run(host="0.0.0.0", port=args.port)
 
 
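A hedged way to exercise the new schema shim, assuming `helm-server` is running locally (the port and suite version below are placeholders):

```python
import requests

# Served from schema.json when the suite contains one; otherwise dynamically
# converted from schema_classic.yaml by the shim above.
response = requests.get("http://localhost:8000/benchmark_output/runs/v1/schema.json")
response.raise_for_status()
print(response.headers.get("Content-Type"), len(response.json()))
```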
helm/benchmark/slurm_jobs.py CHANGED
@@ -2,6 +2,8 @@ import re
 import subprocess
 from typing import Mapping, Set, Union
 
+from retrying import retry
+
 from helm.common.optional_dependencies import handle_module_not_found_error
 
 try:
@@ -67,6 +69,11 @@ def submit_slurm_job(command: str, slurm_args: Mapping[str, Union[str, int]]) ->
     return slurm.sbatch(command)
 
 
+@retry(
+    wait_incrementing_start=5 * 1000,  # 5 seconds
+    wait_incrementing_increment=5 * 1000,  # 5 seconds
+    stop_max_attempt_number=5,
+)
 def get_slurm_job_state(job_id: int) -> str:
     """Get the state of a Slurm job."""
     try:
@@ -80,6 +87,11 @@ def get_slurm_job_state(job_id: int) -> str:
     return search_result.group(1)
 
 
+@retry(
+    wait_incrementing_start=5 * 1000,  # 5 seconds
+    wait_incrementing_increment=5 * 1000,  # 5 seconds
+    stop_max_attempt_number=5,
+)
 def cancel_slurm_job(job_id: int) -> None:
     """Cancel a Slurm job."""
     try: