crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +138 -31
- crfm_helm-0.5.1.dist-info/RECORD +654 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +31 -3
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/gender_perturbation.py +1 -1
- helm/benchmark/augmentations/perturbation.py +25 -3
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/test_perturbation.py +41 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +20 -7
- helm/benchmark/metrics/basic_metrics.py +169 -664
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -110
- helm/benchmark/metrics/dry_run_metrics.py +2 -2
- helm/benchmark/metrics/efficiency_metrics.py +213 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +392 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +2 -2
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +2 -2
- helm/benchmark/metrics/test_classification_metrics.py +8 -5
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +575 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +74 -0
- helm/benchmark/model_metadata_registry.py +41 -1
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +26 -10
- helm/benchmark/presentation/schema.py +15 -40
- helm/benchmark/presentation/summarize.py +119 -79
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +1 -2
- helm/benchmark/presentation/test_summarize.py +3 -3
- helm/benchmark/run.py +54 -26
- helm/benchmark/run_expander.py +205 -35
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +163 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/classic_run_specs.py +1510 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +757 -0
- helm/benchmark/runner.py +51 -57
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +1 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/legalbench_scenario.py +6 -2
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +1 -1
- helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
- helm/benchmark/scenarios/scenario.py +4 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +6 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +3 -4
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +5 -3
- helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
- helm/benchmark/server.py +24 -1
- helm/benchmark/slurm_runner.py +70 -49
- helm/benchmark/static/benchmarking.js +1 -1
- helm/benchmark/static/schema_classic.yaml +258 -1066
- helm/benchmark/static/schema_image2structure.yaml +304 -0
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +2 -227
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
- helm/benchmark/static/schema_vlm.yaml +823 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
- helm/benchmark/static_build/assets/index-878a1094.css +1 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -44
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +3 -9
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +233 -18
- helm/{proxy/clients → clients}/auto_client.py +59 -31
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +65 -7
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +4 -11
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +5 -5
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +5 -7
- helm/{proxy/clients → clients}/huggingface_client.py +43 -64
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
- helm/{proxy/clients → clients}/megatron_client.py +5 -5
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +301 -0
- helm/{proxy/clients → clients}/palmyra_client.py +6 -8
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +100 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +20 -8
- helm/{proxy/clients → clients}/together_client.py +104 -73
- helm/clients/vertexai_client.py +400 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
- helm/clients/vision_language/huggingface_vlm_client.py +111 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +54 -49
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +16 -4
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +1 -1
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +33 -3
- helm/common/key_value_store.py +35 -4
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/request.py +15 -17
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +1 -1
- helm/config/model_deployments.yaml +1159 -538
- helm/config/model_metadata.yaml +868 -41
- helm/config/tokenizer_configs.yaml +149 -43
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +8 -6
- helm/proxy/example_queries.py +29 -17
- helm/proxy/server.py +70 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +96 -16
- helm/proxy/services/service.py +30 -0
- helm/proxy/services/test_remote_service.py +4 -3
- helm/proxy/services/test_service.py +0 -12
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/test_model_deployment_definition.py +0 -92
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
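Note for downstream users: many modules move from `helm.proxy.clients` / `helm.proxy.tokenizers` to the new top-level `helm.clients` / `helm.tokenizers` packages (see the renames above), so imports need updating. A hedged before/after sketch; the class names are the ones these modules conventionally export and are assumptions here, not something stated in this diff:

```python
# 0.4.0 (old locations, removed in 0.5.1):
# from helm.proxy.clients.auto_client import AutoClient
# from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

# 0.5.1 (new locations, per the file renames listed above):
from helm.clients.auto_client import AutoClient  # assumed class name
from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer  # assumed class name
```

The new-file hunks below show a few of the added vision-language scenarios in full.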
helm/benchmark/scenarios/vision_language/mme_scenario.py (new file)

@@ -0,0 +1,145 @@
+import os.path
+from typing import List
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists
+
+
+class MMEScenario(Scenario):
+    """
+    MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models
+
+    Multimodal Large Language Model (MLLM) relies on the powerful LLM to perform
+    multimodal tasks, showing amazing emergent abilities in recent studies. However,
+    it is difficult for these case studies to fully reflect the performance of MLLM,
+    lacking a comprehensive evaluation. In MME, we fill in this blank, presenting
+    the first comprehensive MLLM Evaluation benchmark MME. It measures both perception
+    and cognition abilities on a total of 14 subtasks. In order to avoid data leakage
+    that may arise from direct use of public datasets for evaluation, the annotations
+    of instruction-answer pairs are all manually designed. The concise instruction design
+    allows us to fairly compare MLLMs, instead of struggling in prompt engineering.
+    Besides, with such an instruction, we can also easily carry out quantitative
+    statistics. We rephrase the answer type of MME to multiple-choice question-answering.
+    We use the multiple-choice metrics for 14 different evaluation tasks.
+
+    @article{fu2023mme,
+      title={MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models},
+      author={Fu, Chaoyou and Chen, Peixian and Shen, Yunhang and Qin, Yulei and
+      Zhang, Mengdan and Lin, Xu and Yang, Jinrui and Zheng, Xiawu and Li, Ke and
+      Sun, Xing and Wu, Yunsheng and Ji, Rongrong},
+      journal={arXiv preprint arXiv:2306.13394},
+      year={2023}
+    }
+
+    Paper: https://arxiv.org/abs/2306.13394
+    """
+
+    MME_HUGGINGFACE_DATASET_NAME: str = "lmms-lab/MME"
+
+    SUBJECTS: List[str] = [
+        "existence",
+        "scene",
+        "posters",
+        "color",
+        "OCR",
+        "position",
+        "celebrity",
+        "artwork",
+        "commonsense_reasoning",
+        "numerical_calculation",
+        "landmark",
+        "count",
+        "text_translation",
+        "code_reasoning",
+    ]
+
+    name = "mme"
+    description = "Evaluate multimodal models on ([paper](https://arxiv.org/abs/2306.13394))."
+    tags = ["vision-language"]
+    options: List[str] = ["Yes", "No"]
+
+    def __init__(self, subject: str):
+        super().__init__()
+        assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
+        self._subject: str = subject
+
+    def get_label_from_answer(self, answer: str):
+        label: str
+        if answer == "Yes":
+            label = "A"
+        elif answer == "No":
+            label = "B"
+        else:
+            raise NotImplementedError(f"Invalid answer: {answer}")
+        return label
+
+    def remove_question_suffix_for_mcqa(self, question: str):
+        return question.replace("Please answer yes or no.", "").strip()
+
+    def get_question_id(self, question_id: str):
+        return question_id.split(".")[0].replace("/", "-")
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        images_path: str = os.path.join(output_path, "images")
+        ensure_directory_exists(images_path)
+
+        # There is only the test split in Unicorn benchmark
+        instances: List[Instance] = []
+        # Process the test set
+        # Two open-ended generation instances and
+        # one multi-choice generation instance per row
+        for row in tqdm(
+            load_dataset(
+                self.MME_HUGGINGFACE_DATASET_NAME,
+                split=TEST_SPLIT,
+                cache_dir=output_path,
+            )
+        ):
+            if row["category"] != self._subject:
+                continue
+            question_id: str = self.get_question_id(row["question_id"])
+            # Save the image locally
+            image_path: str = os.path.join(images_path, f"{question_id}.png")
+            if not os.path.exists(image_path):
+                row["image"].save(image_path)
+
+            question: str = self.remove_question_suffix_for_mcqa(row["question"])
+            answer: str = row["answer"]
+            references: List[Reference] = []
+
+            answer = self.get_label_from_answer(answer)
+            # The given correct answer is a letter, but we need an index
+            correct_answer_index: int = ord(answer) - ord("A")
+            # The options are originally appended to the question
+
+            for i, option in enumerate(self.options):
+                reference: Reference
+                is_correct: bool = i == correct_answer_index
+                reference = Reference(Output(text=option), tags=[CORRECT_TAG] if is_correct else [])
+                references.append(reference)
+
+            content = [
+                MediaObject(location=image_path, content_type="image/png"),
+                MediaObject(text=question, content_type="text/plain"),
+            ]
+            instances.append(
+                Instance(
+                    Input(multimedia_content=MultimediaObject(content)),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
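For orientation, a minimal usage sketch of the scenario class added above (the output directory is a placeholder; loading the dataset requires the `datasets` package and network access):

```python
# Sketch only: build HELM instances for one MME subtask.
# "./scenario_output" is a placeholder path, not part of the diff.
from helm.benchmark.scenarios.vision_language.mme_scenario import MMEScenario

scenario = MMEScenario(subject="color")
instances = scenario.get_instances(output_path="./scenario_output")
# Each instance pairs an image + question with "Yes"/"No" references;
# CORRECT_TAG is set on the reference matching the dataset answer.
print(len(instances))
```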
helm/benchmark/scenarios/vision_language/mmmu_scenario.py (new file)

@@ -0,0 +1,187 @@
+import os.path
+from typing import Dict, List
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    VALID_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists
+
+
+class MMMUScenario(Scenario):
+    """
+    MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI
+
+    We introduce MMMU: a new benchmark designed to evaluate multimodal models on massive multi-discipline
+    tasks demanding college-level subject knowledge and deliberate reasoning. MMMU includes 11.5K meticulously
+    collected multimodal questions from college exams, quizzes, and textbooks, covering six core disciplines:
+    Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering.
+    These questions span 30 subjects and 183 subfields, comprising 30 highly heterogeneous image types, such
+    as charts, diagrams, maps, tables, music sheets, and chemical structures.
+
+    @article{yue2023mmmu,
+      title={MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI},
+      author={Xiang Yue and Yuansheng Ni and Kai Zhang and Tianyu Zheng and Ruoqi Liu and Ge Zhang and Samuel
+      Stevens and Dongfu Jiang and Weiming Ren and Yuxuan Sun and Cong Wei and Botao Yu and Ruibin Yuan and
+      Renliang Sun and Ming Yin and Boyuan Zheng and Zhenzhu Yang and Yibo Liu and Wenhao Huang and Huan Sun
+      and Yu Su and Wenhu Chen},
+      journal={arXiv preprint arXiv:2311.16502},
+      year={2023},
+    }
+
+    Paper: https://arxiv.org/abs/2311.16502
+    Website: https://mmmu-benchmark.github.io/
+    """
+
+    MMMU_HUGGINGFACE_DATASET_NAME: str = "MMMU/MMMU"
+    MAX_NUM_IMAGES: int = 7
+
+    SUBJECTS: List[str] = [
+        "Accounting",
+        "Agriculture",
+        "Architecture_and_Engineering",
+        "Art",
+        "Art_Theory",
+        "Basic_Medical_Science",
+        "Biology",
+        "Chemistry",
+        "Clinical_Medicine",
+        "Computer_Science",
+        "Design",
+        "Diagnostics_and_Laboratory_Medicine",
+        "Economics",
+        "Electronics",
+        "Energy_and_Power",
+        "Finance",
+        "Geography",
+        "History",
+        "Literature",
+        "Manage",
+        "Marketing",
+        "Materials",
+        "Math",
+        "Mechanical_Engineering",
+        "Music",
+        "Pharmacy",
+        "Physics",
+        "Psychology",
+        "Public_Health",
+        "Sociology",
+    ]
+
+    name = "mmmu"
+    description = (
+        "Evaluate multimodal models on massive multi-discipline tasks demanding college-level "
+        "subject knowledge and deliberate reasoning ([paper](https://arxiv.org/abs/2311.16502))."
+    )
+    tags = ["vision-language"]
+
+    def __init__(self, subject: str, question_type: str):
+        super().__init__()
+        assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
+        self._subject: str = subject
+
+        assert question_type in ["multiple-choice", "open"], f"Invalid question type: {question_type}"
+        self._question_type: str = question_type
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        images_path: str = os.path.join(output_path, "images", self._subject)
+        ensure_directory_exists(images_path)
+
+        instances: List[Instance] = []
+
+        # Process the validation set
+        # There seems to be a dev set, but it's unavailable through load_dataset.
+        # When loading "dev", we get error: ValueError: Unknown split "dev". Should be one of ['test', 'validation'].
+        # The test set doesn't have answers, since the MMMU competition/leaderboard uses the test set
+        for row in tqdm(
+            load_dataset(self.MMMU_HUGGINGFACE_DATASET_NAME, self._subject, split="validation", cache_dir=output_path)
+        ):
+            # Skip questions that aren't in the subject we're interested in
+            if row["question_type"] != self._question_type:
+                continue
+
+            question_id: str = row["id"]
+            if self._subject not in question_id:
+                continue
+
+            question_template: str = row["question"]
+            options: List[str] = eval(row["options"])  # Convert the string to list of options
+            answer: str = row["answer"]
+
+            # Create the question. Questions can have text and images interleaved
+            question_template_to_image_path: Dict[str, str] = {}
+            content: List[MediaObject] = []
+            for img_number in range(1, self.MAX_NUM_IMAGES):
+                image_id: str = f"image_{img_number}"
+                if row[image_id] is None:
+                    # At this point, there are no more images for this question
+                    break
+
+                # Save the image locally
+                image_path: str = os.path.join(images_path, f"{question_id}_{image_id}.png")
+                if not os.path.exists(image_path):
+                    row[image_id].save(image_path)
+
+                image_template_tag: str = f"<image {img_number}>"
+                question_template_to_image_path[image_template_tag] = image_path
+
+                # There are cases when the image is included, but it is not used either in the
+                # question template or in the answer options
+                if image_template_tag not in question_template:
+                    # The image is not in the question template
+                    continue
+
+                head, question_template = question_template.split(image_template_tag, 1)
+                if head:
+                    content.append(MediaObject(text=head, content_type="text/plain"))
+                content.append(MediaObject(location=image_path, content_type="image/png"))
+
+            # Add the rest of the question template
+            if question_template:
+                content.append(MediaObject(text=question_template, content_type="text/plain"))
+
+            # Add the references
+            references: List[Reference] = []
+            if self._question_type == "multiple-choice":
+                # The given correct answer is a letter, but we need an index
+                correct_answer_index: int = ord(answer) - ord("A")
+
+                for i, option in enumerate(options):
+                    reference: Reference
+                    is_correct: bool = i == correct_answer_index
+
+                    if option in question_template_to_image_path:
+                        # Image answer choices
+                        image_path = question_template_to_image_path[option]
+                        image_choice = MediaObject(location=image_path, content_type="image/png")
+                        reference = Reference(
+                            Output(multimedia_content=MultimediaObject([image_choice])),
+                            tags=[CORRECT_TAG] if is_correct else [],
+                        )
+                    else:
+                        # Text answer choices
+                        reference = Reference(Output(text=option), tags=[CORRECT_TAG] if is_correct else [])
+
+                    references.append(reference)
+            else:
+                references.append(Reference(Output(text=answer), tags=[CORRECT_TAG]))
+
+            instances.append(
+                Instance(
+                    Input(multimedia_content=MultimediaObject(content)),
+                    references=references,
+                    split=VALID_SPLIT,
+                )
+            )
+
+        return instances
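Similarly, a hedged sketch of how the MMMU scenario above can be instantiated (the output path is a placeholder; the HuggingFace dataset is downloaded on first use):

```python
# Sketch only: materialize the MMMU validation split for one subject.
from helm.benchmark.scenarios.vision_language.mmmu_scenario import MMMUScenario

scenario = MMMUScenario(subject="Math", question_type="multiple-choice")
instances = scenario.get_instances(output_path="./scenario_output")  # placeholder path
# Multiple-choice instances carry one Reference per option; image options
# become MultimediaObject references, text options plain-text references.
```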
helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py (new file)

@@ -0,0 +1,92 @@
+import json
+import os
+from collections import defaultdict
+from typing import Any, Dict, List
+
+from helm.common.general import ensure_file_downloaded
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+)
+
+
+class MSCOCOCaptioningScenario(Scenario):
+    """
+    Microsoft COCO (MS-COCO) is a large-scale object detection, segmentation, and captioning dataset.
+    It has 330K images, with over 200K of them labeled. We use the 2014 version of the dataset instead
+    of the 2017 version because of the larger validation set. According to https://cocodataset.org/#download,
+    the 2014 version has 83K images in the train split and 41K in the val split.
+
+    Each image also has five captions. For example, image #335111 has the following five captions:
+    1. a row of bikes on the sidewalk, 2 on the ground.
+    2. a couple of bikes laying on their sides on a sidewalk.
+    3. a person wearing a black coat with a hood stands on the street, near many bikes
+    4. a woman standing in front of a row of bicycles in front of a bus stop with two bikes knocked over
+    5. there are some bicycles laying on their sides
+
+    Paper: https://arxiv.org/abs/1405.0312
+    Website: https://cocodataset.org/#home
+    """
+
+    ANNOTATIONS_DOWNLOAD_URL: str = "http://images.cocodataset.org/annotations/annotations_trainval2014.zip"
+    SPLIT_DOWNLOAD_URL_TEMPLATE: str = "http://images.cocodataset.org/zips/{split}2014.zip"
+    COCO_SPLIT_TO_HELM_SPLIT: Dict[str, str] = {"train": TRAIN_SPLIT, "val": VALID_SPLIT}
+
+    name = "mscoco"
+    description = "Microsoft COCO: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312))."
+    tags = ["text-to-image", "image-to-text"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the annotations which contains the image IDs, filenames and captions
+        data_path: str = os.path.join(output_path, "data")
+        ensure_file_downloaded(source_url=self.ANNOTATIONS_DOWNLOAD_URL, target_path=data_path, unpack=True)
+
+        instances: List[Instance] = []
+        for coco_split, helm_split in self.COCO_SPLIT_TO_HELM_SPLIT.items():
+            # Download the images of the split
+            split_url: str = self.SPLIT_DOWNLOAD_URL_TEMPLATE.format(split=coco_split)
+            split_path: str = os.path.join(data_path, coco_split)
+            ensure_file_downloaded(source_url=split_url, target_path=split_path, unpack=True)
+
+            # Read the metadata for the split
+            metadata_path: str = os.path.join(data_path, f"captions_{coco_split}2014.json")
+            with open(metadata_path, "r") as f:
+                metadata: Dict[str, Any] = json.load(f)
+
+            # Get the path of each image
+            image_id_to_path: Dict[int, str] = {
+                image_metadata["id"]: os.path.join(split_path, image_metadata["file_name"])
+                for image_metadata in metadata["images"]
+            }
+
+            # Gather the five captions for each image
+            image_id_to_captions: Dict[int, List[str]] = defaultdict(list)
+            for annotation in metadata["annotations"]:
+                image_id_to_captions[annotation["image_id"]].append(annotation["caption"])
+
+            # Create instances
+            for image_id in image_id_to_path:
+                image_path: str = image_id_to_path[image_id]
+                captions: List[str] = image_id_to_captions[image_id]
+
+                content: List[MediaObject] = [
+                    MediaObject(location=image_path, content_type="image/jpeg"),
+                ]
+                instances.append(
+                    Instance(
+                        Input(multimedia_content=MultimediaObject(content)),
+                        references=[
+                            Reference(Output(text=caption.rstrip()), tags=[CORRECT_TAG]) for caption in captions
+                        ],
+                        split=helm_split,
+                    )
+                )
+
+        return instances
helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py (new file)

@@ -0,0 +1,117 @@
+import json
+import os
+from collections import defaultdict
+from typing import Any, Dict, List, Set
+
+from helm.common.general import ensure_file_downloaded
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+)
+
+
+class MSCOCOCategorizationScenario(Scenario):
+    """
+    Microsoft COCO (MS-COCO) is a large-scale object detection, segmentation, and captioning dataset.
+    It has 330K images, with over 200K of them labeled. We use the 2017 version of the dataset
+    for the categorization task.
+
+    Paper: https://arxiv.org/abs/1405.0312
+    Website: https://cocodataset.org/#home
+    """
+
+    ANNOTATIONS_DOWNLOAD_URL: str = "http://images.cocodataset.org/annotations/stuff_annotations_trainval2017.zip"
+    SPLIT_DOWNLOAD_URL_TEMPLATE: str = "http://images.cocodataset.org/zips/{split}2017.zip"
+    COCO_SPLIT_TO_HELM_SPLIT: Dict[str, str] = {"train": TRAIN_SPLIT, "val": VALID_SPLIT}
+
+    name = "mscoco"
+    description = "Microsoft COCO: Common Objects in Context ([paper](https://arxiv.org/abs/1405.0312))."
+    tags = ["text-to-image", "image-to-text"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the annotations which contains the image IDs, filenames and captions
+        data_path: str = os.path.join(output_path, "data_2017")
+        ensure_file_downloaded(source_url=self.ANNOTATIONS_DOWNLOAD_URL, target_path=data_path, unpack=True)
+
+        super_categories_to_categories: Dict[str, List[str]] = defaultdict(list)
+        category_id_to_category: Dict[int, str] = {}
+        category_id_to_super_category: Dict[int, str] = {}
+
+        instances: List[Instance] = []
+        for coco_split, helm_split in self.COCO_SPLIT_TO_HELM_SPLIT.items():
+            # Download the images of the split
+            split_url: str = self.SPLIT_DOWNLOAD_URL_TEMPLATE.format(split=coco_split)
+            split_path: str = os.path.join(data_path, coco_split)
+            ensure_file_downloaded(source_url=split_url, target_path=split_path, unpack=True)
+
+            # Read the metadata for the split
+            metadata_path: str = os.path.join(data_path, f"stuff_{coco_split}2017.json")
+            with open(metadata_path, "r") as f:
+                metadata: Dict[str, Any] = json.load(f)
+
+            for category_metadata in metadata["categories"]:
+                # Each metadata looks like this {'supercategory': 'textile', 'id': 92, 'name': 'banner'}
+                category_id: int = category_metadata["id"]
+                category: str = category_metadata["name"]
+                super_category: str = category_metadata["supercategory"]
+                super_categories_to_categories[super_category].append(category)
+                category_id_to_category[category_id] = category
+                category_id_to_super_category[category_id] = super_category
+
+            # Get the path of each image
+            image_id_to_path: Dict[int, str] = {
+                image_metadata["id"]: os.path.join(split_path, image_metadata["file_name"])
+                for image_metadata in metadata["images"]
+            }
+
+            # Gather the five captions for each image
+            image_id_to_category_ids: Dict[int, List[int]] = defaultdict(list)
+            for annotation in metadata["annotations"]:
+                image_id_to_category_ids[annotation["image_id"]].append(annotation["category_id"])
+
+            # Create instances
+            for image_id in image_id_to_path:
+                image_path: str = image_id_to_path[image_id]
+                assert os.path.exists(image_path), f"Image path {image_path} does not exist"
+                category_ids: List[int] = image_id_to_category_ids[image_id]
+
+                content: List[MediaObject] = [
+                    MediaObject(location=image_path, content_type="image/jpeg"),
+                ]
+                references: List[Reference] = []
+                correct_super_categories: Set[str] = set(
+                    category_id_to_super_category[category_id] for category_id in category_ids
+                )
+                # for category_id in category_ids:
+                #     category = category_id_to_category[category_id]
+                #     super_category = category_id_to_super_category[category_id]
+                #     references.extend(
+                #         [
+                #             Reference(Output(text=category), tags=[CORRECT_TAG]),
+                #             Reference(Output(text=super_category), tags=[CORRECT_TAG]),
+                #         ]
+                #     )
+                for super_category in super_categories_to_categories:
+                    references.append(
+                        Reference(
+                            Output(text=super_category),
+                            tags=[CORRECT_TAG] if super_category in correct_super_categories else [],
+                        )
+                    )
+
+                instances.append(
+                    Instance(
+                        Input(multimedia_content=MultimediaObject(content)),
+                        references=references,
+                        split=helm_split,
+                    )
+                )
+
+        return instances
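Both MS-COCO scenarios above expose the same `get_instances(output_path)` interface; a minimal sketch (placeholder path, and note the COCO image archives are large downloads):

```python
# Sketch only: captioning uses COCO 2014, categorization uses COCO 2017.
from helm.benchmark.scenarios.vision_language.mscoco_captioning_scenario import MSCOCOCaptioningScenario
from helm.benchmark.scenarios.vision_language.mscoco_categorization_scenario import MSCOCOCategorizationScenario

captioning = MSCOCOCaptioningScenario().get_instances(output_path="./scenario_output")  # placeholder path
categorization = MSCOCOCategorizationScenario().get_instances(output_path="./scenario_output")
# Captioning instances carry one CORRECT_TAG reference per caption (five per image);
# categorization instances list every supercategory, tagging those present in the image.
```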