crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm has been flagged as possibly problematic. Click here for more details.
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +138 -31
- crfm_helm-0.5.1.dist-info/RECORD +654 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +31 -3
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/gender_perturbation.py +1 -1
- helm/benchmark/augmentations/perturbation.py +25 -3
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/test_perturbation.py +41 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +20 -7
- helm/benchmark/metrics/basic_metrics.py +169 -664
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -110
- helm/benchmark/metrics/dry_run_metrics.py +2 -2
- helm/benchmark/metrics/efficiency_metrics.py +213 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +392 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +2 -2
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +2 -2
- helm/benchmark/metrics/test_classification_metrics.py +8 -5
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +575 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +74 -0
- helm/benchmark/model_metadata_registry.py +41 -1
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +26 -10
- helm/benchmark/presentation/schema.py +15 -40
- helm/benchmark/presentation/summarize.py +119 -79
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +1 -2
- helm/benchmark/presentation/test_summarize.py +3 -3
- helm/benchmark/run.py +54 -26
- helm/benchmark/run_expander.py +205 -35
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +163 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/classic_run_specs.py +1510 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +757 -0
- helm/benchmark/runner.py +51 -57
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +1 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/legalbench_scenario.py +6 -2
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +1 -1
- helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
- helm/benchmark/scenarios/scenario.py +4 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +6 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +3 -4
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +5 -3
- helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
- helm/benchmark/server.py +24 -1
- helm/benchmark/slurm_runner.py +70 -49
- helm/benchmark/static/benchmarking.js +1 -1
- helm/benchmark/static/schema_classic.yaml +258 -1066
- helm/benchmark/static/schema_image2structure.yaml +304 -0
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +2 -227
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
- helm/benchmark/static/schema_vlm.yaml +823 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
- helm/benchmark/static_build/assets/index-878a1094.css +1 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -44
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +3 -9
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +233 -18
- helm/{proxy/clients → clients}/auto_client.py +59 -31
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +65 -7
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +4 -11
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +5 -5
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +5 -7
- helm/{proxy/clients → clients}/huggingface_client.py +43 -64
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
- helm/{proxy/clients → clients}/megatron_client.py +5 -5
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +301 -0
- helm/{proxy/clients → clients}/palmyra_client.py +6 -8
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +100 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +20 -8
- helm/{proxy/clients → clients}/together_client.py +104 -73
- helm/clients/vertexai_client.py +400 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
- helm/clients/vision_language/huggingface_vlm_client.py +111 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +54 -49
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +16 -4
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +1 -1
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +33 -3
- helm/common/key_value_store.py +35 -4
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/request.py +15 -17
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +1 -1
- helm/config/model_deployments.yaml +1159 -538
- helm/config/model_metadata.yaml +868 -41
- helm/config/tokenizer_configs.yaml +149 -43
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +8 -6
- helm/proxy/example_queries.py +29 -17
- helm/proxy/server.py +70 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +96 -16
- helm/proxy/services/service.py +30 -0
- helm/proxy/services/test_remote_service.py +4 -3
- helm/proxy/services/test_service.py +0 -12
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/test_model_deployment_definition.py +0 -92
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
from typing import Any, Dict, List
|
|
4
|
+
|
|
5
|
+
from helm.benchmark.scenarios.scenario import (
|
|
6
|
+
ALL_SPLITS,
|
|
7
|
+
CORRECT_TAG,
|
|
8
|
+
VALID_SPLIT,
|
|
9
|
+
TEST_SPLIT,
|
|
10
|
+
Instance,
|
|
11
|
+
Input,
|
|
12
|
+
Output,
|
|
13
|
+
Reference,
|
|
14
|
+
Scenario,
|
|
15
|
+
)
|
|
16
|
+
from helm.common.media_object import MediaObject, MultimediaObject
|
|
17
|
+
from helm.common.general import ensure_file_downloaded
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class GQAScenario(Scenario):
    """
    Questions about real-world visual reasoning and compositional QA

    @misc{hudson2019gqa,
        title={GQA: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering},
        author={Drew A. Hudson and Christopher D. Manning},
        year={2019},
        eprint={1902.09506},
        archivePrefix={arXiv},
        primaryClass={cs.CL}
    }

    Paper: https://arxiv.org/abs/1902.09506
    Website: https://github.com/stanford-crfm/helm/issues/1951
    """

    QUESTIONS_URL: str = "https://downloads.cs.stanford.edu/nlp/data/gqa/questions1.2.zip"
    IMAGES_URL: str = "https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip"

    name = "gqa"
    description = (
        "Questions about real-world visual reasoning and compositional QA "
        "([paper](https://arxiv.org/abs/1902.09506))."
    )
    tags = ["vision-language", "reasoning"]

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download the GQA questions and images and build one `Instance` per question.

        Each instance pairs the question text with its image. Both the short answer and
        the full-sentence answer are kept as correct references.
        """
        questions_path: str = os.path.join(output_path, "questions")
        ensure_file_downloaded(
            source_url=self.QUESTIONS_URL, target_path=questions_path, unpack=True, unpack_type="unzip"
        )

        images_path: str = os.path.join(output_path, "images")
        ensure_file_downloaded(source_url=self.IMAGES_URL, target_path=images_path, unpack=True, unpack_type="unzip")

        instances: List[Instance] = []
        for helm_split in ALL_SPLITS:
            if helm_split == TEST_SPLIT:
                # The test split doesn't have annotations
                continue

            # GQA names its validation split "val"
            split: str = "val" if helm_split == VALID_SPLIT else helm_split

            # Read the questions from the JSON
            questions_split_path: str = os.path.join(questions_path, f"{split}_balanced_questions.json")
            with open(questions_split_path, "r") as questions_file:
                questions: Dict[str, Any] = json.load(questions_file)
                # The question IDs (the dict keys) are not used, so iterate over the values only
                for question_data in questions.values():
                    question: str = question_data["question"]
                    short_answer: str = question_data["answer"]
                    full_answer: str = question_data["fullAnswer"]

                    image_id: str = question_data["imageId"]
                    local_image_path: str = os.path.join(images_path, f"{image_id}.jpg")

                    content: List[MediaObject] = [
                        MediaObject(text=question, content_type="text/plain"),
                        MediaObject(location=local_image_path, content_type="image/jpeg"),
                    ]
                    instances.append(
                        Instance(
                            Input(multimedia_content=MultimediaObject(content)),
                            references=[
                                Reference(Output(text=short_answer), tags=[CORRECT_TAG]),
                                Reference(Output(text=full_answer), tags=[CORRECT_TAG]),
                            ],
                            split=helm_split,
                        )
                    )

        return instances
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import os.path
|
|
2
|
+
from typing import List
|
|
3
|
+
|
|
4
|
+
from datasets import load_dataset
|
|
5
|
+
from tqdm import tqdm
|
|
6
|
+
|
|
7
|
+
from helm.benchmark.scenarios.scenario import (
|
|
8
|
+
ALL_SPLITS,
|
|
9
|
+
CORRECT_TAG,
|
|
10
|
+
VALID_SPLIT,
|
|
11
|
+
Instance,
|
|
12
|
+
Input,
|
|
13
|
+
Output,
|
|
14
|
+
Reference,
|
|
15
|
+
Scenario,
|
|
16
|
+
)
|
|
17
|
+
from helm.common.media_object import MediaObject, MultimediaObject
|
|
18
|
+
from helm.common.general import ensure_directory_exists, ensure_file_downloaded
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class HatefulMemesScenario(Scenario):
    """
    The Hateful Memes Challenge and Dataset is a competition and open source dataset designed to
    measure progress in multimodal vision-and-language classification.

    Detecting hate speech in memes requires reasoning over the image and the overlaid text jointly:
    the same photo or the same words can be benign on their own yet hateful in combination, so the
    two modalities cannot be classified separately.

    @article{kiela2020hateful,
        title={The Hateful Memes Challenge: Detecting Hate Speech in Multimodal Memes},
        author={Kiela, Douwe and Firooz, Hamed and Mohan, Aravind and Goswami, Vedanuj and Singh,
        Amanpreet and Ringshia, Pratik and Testuggine, Davide},
        journal={arXiv preprint arXiv:2005.04790},
        year={2020}
    }

    Paper: https://arxiv.org/abs/2005.04790
    Website: https://ai.meta.com/tools/hatefulmemes/
    """

    IMAGE_URL: str = (
        "https://huggingface.co/datasets/neuralcatcher/hateful_memes/resolve/main/{image_path}?download=true"
    )

    # This question is from the example on the website
    QUESTION: str = "Is this meme mean?"

    name = "hateful_memes"
    description = "Identify if a meme is hateful or not ([paper](https://arxiv.org/abs/2005.04790))."
    tags = ["vision-language"]

    def get_instances(self, output_path: str) -> List[Instance]:
        """Build one yes/no `Instance` per meme, downloading each meme image on demand."""
        ensure_directory_exists(os.path.join(output_path, "img"))

        instances: List[Instance] = []
        for split in ALL_SPLITS:
            huggingface_split: str = "validation" if split == VALID_SPLIT else split
            examples = load_dataset(
                "neuralcatcher/hateful_memes",
                split=huggingface_split,
                cache_dir=output_path,
            )
            for example in tqdm(examples):
                # Fetch the meme image from the Hugging Face mirror
                meme_path: str = example["img"]
                downloaded_path: str = os.path.join(output_path, meme_path)
                ensure_file_downloaded(
                    source_url=self.IMAGE_URL.format(image_path=meme_path),
                    target_path=downloaded_path,
                    unpack=False,
                )

                # Some examples are missing images. Skip those for now
                missing: bool = not os.path.exists(downloaded_path) or os.path.getsize(downloaded_path) == 0
                if missing:
                    continue

                multimedia = MultimediaObject(
                    [
                        MediaObject(location=downloaded_path, content_type="image/jpeg"),
                        MediaObject(text=self.QUESTION, content_type="text/plain"),
                    ]
                )
                # label == 1 means hateful, label == 0 means benign; any other label
                # (e.g. an unlabeled split) marks neither reference as correct.
                yes_reference = Reference(Output(text="Yes"), tags=[CORRECT_TAG] if example["label"] == 1 else [])
                no_reference = Reference(Output(text="No"), tags=[CORRECT_TAG] if example["label"] == 0 else [])
                instances.append(
                    Instance(
                        Input(multimedia_content=multimedia),
                        references=[yes_reference, no_reference],
                        split=split,
                    )
                )

        return instances
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
from collections import Counter
|
|
2
|
+
from dataclasses import dataclass
|
|
3
|
+
from typing import Dict, List, Set
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
|
|
7
|
+
from helm.benchmark.scenarios.scenario import (
|
|
8
|
+
CORRECT_TAG,
|
|
9
|
+
ALL_SPLITS,
|
|
10
|
+
Instance,
|
|
11
|
+
Input,
|
|
12
|
+
Output,
|
|
13
|
+
Reference,
|
|
14
|
+
Scenario,
|
|
15
|
+
)
|
|
16
|
+
from helm.common.media_object import MediaObject, MultimediaObject
|
|
17
|
+
from helm.common.general import ensure_file_downloaded
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass(frozen=True)
class HEIMHumanEvalReference(Reference):
    """A `Reference` that also records how many human annotators chose this answer."""

    # The number of human annotators who gave this reference or answer.
    num_human_answered: int = 0
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class HEIMHumanEvalScenario(Scenario):
    """
    In [Holistic Evaluation of Text-To-Image Models (HEIM)](https://arxiv.org/abs/2311.04287), we evaluated 26
    state-of-the-art text-to-image models across 12 different aspects (e.g., toxicity mitigation, unbiasedness,
    originality, etc.). We used human annotators through AWS Mechanical Turk to evaluate the models for some of
    these aspects.
    This scenario contains the AI-generated images and human annotations for the following question types:

    1. Alignment
    2. Aesthetics
    3. Clear subject
    4. Originality
    5. Photorealism

    Citations:
    - HEIM: https://arxiv.org/abs/2311.04287
    - MS COCO: https://arxiv.org/abs/1405.0312
    """

    DATASET_DOWNLOAD_URL: str = (
        "https://worksheets.codalab.org/rest/bundles/0x502d646c366c4f1d8c4a2ccf163b958f/contents/blob/"
    )
    VALID_QUESTION_TYPES: Set[str] = {"alignment", "aesthetics", "clear_subject", "originality", "photorealism"}

    name = "heim_human_eval"
    description = (
        "Images generated by text-to-image models and human annotations for HEIM "
        "([paper](https://arxiv.org/abs/2311.04287))."
    )
    tags = ["vision-language", "visual question answering", "image evaluation"]

    def __init__(self, question_type: str):
        """
        Args:
            question_type: One of `VALID_QUESTION_TYPES` (e.g., "alignment").
        """
        super().__init__()
        assert question_type in self.VALID_QUESTION_TYPES, f"Invalid question type: {question_type}"
        self._question_type: str = question_type

    def get_instances(self, output_path: str) -> List[Instance]:
        """Download the HEIM human-evaluation bundle and build one `Instance` per annotated image.

        The reference matching the most common human answer (the mode) is tagged as correct;
        ties produce multiple correct references.
        """
        # Download the dataset
        output_path = os.path.join(output_path, "dataset")
        ensure_file_downloaded(
            source_url=self.DATASET_DOWNLOAD_URL, target_path=output_path, unpack=True, unpack_type="untar"
        )

        # Load the multiple-choice questions
        with open(os.path.join(output_path, "questions.json")) as questions_file:
            question_info: Dict = json.load(questions_file)[self._question_type]

        instances: List[Instance] = []
        for split in ALL_SPLITS:
            annotations_split_path: str = os.path.join(output_path, f"{self._question_type}_{split}.jsonl")
            with open(annotations_split_path) as f:
                # Read each line/example as a JSON object. Iterate the file directly
                # rather than calling f.readlines(), which would load the whole file into memory.
                for line in f:
                    image_annotation: Dict = json.loads(line)
                    image_path: str = os.path.join(output_path, image_annotation["image_path"])
                    assert os.path.exists(image_path), f"Image {image_path} does not exist"

                    # Get the most common human answer(s) for the question
                    human_answers: List[str] = [str(answer) for answer in image_annotation["human_annotations"]]
                    human_answers_to_counts = Counter(human_answers)
                    max_count: int = max(human_answers_to_counts.values())
                    modes: List[str] = [value for value, count in human_answers_to_counts.items() if count == max_count]

                    content: List[MediaObject] = [MediaObject(location=image_path, content_type="image/png")]
                    if "prompt" in image_annotation:
                        # Include the prompt in the content if it exists
                        prompt: str = image_annotation["prompt"]
                        content.append(MediaObject(text=f"Description: {prompt}", content_type="text/plain"))
                    content.append(MediaObject(text=question_info["question"], content_type="text/plain"))

                    references: List[Reference] = [
                        HEIMHumanEvalReference(
                            Output(text=answer),
                            # The mode is the most common human answer and the reference we mark as correct
                            tags=[CORRECT_TAG] if value in modes else [],
                            num_human_answered=human_answers_to_counts[value],
                        )
                        for value, answer in question_info["choices"].items()
                    ]
                    instances.append(
                        Instance(
                            Input(multimedia_content=MultimediaObject(content)),
                            references=references,
                            split=split,
                        )
                    )

        return instances
|
|
File without changes
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
from helm.benchmark.scenarios.scenario import (
|
|
5
|
+
CORRECT_TAG,
|
|
6
|
+
TEST_SPLIT,
|
|
7
|
+
Instance,
|
|
8
|
+
Input,
|
|
9
|
+
Output,
|
|
10
|
+
Reference,
|
|
11
|
+
Scenario,
|
|
12
|
+
)
|
|
13
|
+
from helm.common.media_object import MediaObject, MultimediaObject
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Chart2CSVScenario(Scenario):
    """
    Chart to CSV.

    Expects a pre-populated dataset at `output_path` with two sibling directories:
    - `charts/`: PNG images of charts
    - `groundtruth/`: pipe-separated (.psv) files with matching basenames
    """

    name = "chart2csv"
    description = "Convert a chart to CSV."
    tags = ["vision-language", "image2structure"]

    def get_instances(self, output_path: str) -> List[Instance]:
        """Pair each chart image with its ground-truth table, converted from PSV to CSV."""
        assert os.path.exists(output_path), f"Dataset does not exist at {output_path}"
        instances: List[Instance] = []

        charts_path: str = os.path.join(output_path, "charts")
        ground_truths_path: str = os.path.join(output_path, "groundtruth")

        # Sort for a deterministic instance order; os.listdir order is platform-dependent.
        for chart_file in sorted(os.listdir(charts_path)):
            if not chart_file.endswith(".png"):
                continue

            chart_path: str = os.path.join(charts_path, chart_file)
            # Swap only the extension. str.replace(".png", ".psv") would also rewrite
            # a ".png" occurring in the middle of the filename.
            ground_truth_file: str = os.path.splitext(chart_file)[0] + ".psv"
            ground_truth_path: str = os.path.join(ground_truths_path, ground_truth_file)
            assert os.path.exists(ground_truth_path), f"Ground truth does not exist at {ground_truth_path}"

            content: List[MediaObject] = [
                MediaObject(location=chart_path, content_type="image/png"),
            ]
            with open(ground_truth_path, "r") as file:
                # NOTE(review): naive PSV -> CSV conversion; assumes cell values contain
                # no "|" or "," characters themselves — confirm against the dataset.
                ground_truth: str = file.read().replace("|", ",")

            instances.append(
                Instance(
                    Input(multimedia_content=MultimediaObject(content)),
                    references=[Reference(Output(text=ground_truth), tags=[CORRECT_TAG])],
                    split=TEST_SPLIT,
                )
            )

        return instances
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
import os.path
|
|
2
|
+
from typing import List, Optional, Dict, Any
|
|
3
|
+
from abc import abstractmethod
|
|
4
|
+
|
|
5
|
+
from datasets import load_dataset
|
|
6
|
+
from tqdm import tqdm
|
|
7
|
+
|
|
8
|
+
from helm.benchmark.scenarios.scenario import (
|
|
9
|
+
CORRECT_TAG,
|
|
10
|
+
ASSET_NAME_TAG,
|
|
11
|
+
ASSET_PATH_TAG,
|
|
12
|
+
TEST_SPLIT,
|
|
13
|
+
VALID_SPLIT,
|
|
14
|
+
Instance,
|
|
15
|
+
Input,
|
|
16
|
+
Output,
|
|
17
|
+
Reference,
|
|
18
|
+
Scenario,
|
|
19
|
+
)
|
|
20
|
+
from helm.common.media_object import MediaObject, MultimediaObject
|
|
21
|
+
from helm.common.general import ensure_directory_exists
|
|
22
|
+
from helm.common.hierarchical_logger import hlog
|
|
23
|
+
|
|
24
|
+
# Sentinel value for row["structure"] marking a structure that was already processed
# and whose backing path no longer exists (see step 5.a.2 in get_instances).
PROCESSED: str = "processed"


class Image2StructureScenario(Scenario):
    """Abstract base scenario for image-to-structure tasks.

    Given an image of some rendered structure (e.g. LaTeX output or a music sheet), the
    model must produce the structure (code) that recreates it. Subclasses define
    `BASE_PROMPT`, `HUGGINGFACE_DATASET_NAME` and `SUBSETS`, and implement
    `compile_and_save` (and optionally `preprocess_row` / `build_prompt` / `finalize`).
    """

    # Set by subclasses:
    BASE_PROMPT: str  # instruction text shown alongside the image
    HUGGINGFACE_DATASET_NAME: str  # Hugging Face dataset to load rows from
    SUBSETS: List[str]  # valid values for the `subset` constructor argument

    name: str
    description: str
    tags = ["vision-language"]

    # Maps HELM split names to the corresponding Hugging Face split names.
    helm_split_to_huggingface_split = {
        TEST_SPLIT: "test",
        VALID_SPLIT: "validation",
    }

    def __init__(self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT):
        """
        Args:
            subset: Which subset of the dataset to load; must be in `SUBSETS`.
            recompile_prompt: If True, recompile each row's structure to regenerate the
                image instead of saving the dataset-provided image directly.
            split: HELM split to load (a key of `helm_split_to_huggingface_split`).
        """
        super().__init__()
        assert subset in self.SUBSETS, f"Invalid subset: {subset}"
        self._subset: str = subset
        self._recompile_prompt: bool = recompile_prompt
        self._split: str = split
        self._output_path: Optional[str] = None

    def preprocess_row(self, row: Dict[str, Any], assets_path: str) -> Dict[str, Any]:
        """Default preprocessing: drop the raw assets and record empty asset lists."""
        # By default, there are no assets
        del row["assets"]
        row["assets_paths"] = []
        row["assets_names"] = []
        return row

    def build_prompt(self, row: Dict[str, Any]) -> str:
        """Return the prompt text for a row; the default ignores the row entirely."""
        return self.BASE_PROMPT

    @abstractmethod
    def compile_and_save(self, structure: str, assets_path: str, destination_path: str) -> str:
        """Compile the prompt, should save the image and return the text extracted from the image"""
        pass

    def finalize(self, row: Dict[str, Any]) -> None:
        """Perform cleanup operations after the instance has been generated."""
        pass

    def get_instances(self, output_path: str) -> List[Instance]:
        """Get the instances for the scenario. The compile_and_save method should be implemented by the subclass.
        Additionally, the subclass should implement the preprocess_row method if any preprocessing is needed.

        For each instance, the following steps are performed:
        1. Preprocess the row
        2. Save the image locally
            - 2.a. If we don't want to recompile the prompt, save the image directly
            - 2.b. If we want to recompile the prompt, compile the structure and save the image
        3. Create the prompt
        4. Create the multimedia content
        5. Create the reference
        6. Finalize the Instance

        Args:
            output_path (str): The path where the instances will be saved

        Returns:
            List[Instance]: The list of instances
        """
        self._output_path = output_path
        images_path: str = os.path.join(output_path, "data/images", self._subset)
        assets_path: str = os.path.join(output_path, "data/assets", self._subset)
        ensure_directory_exists(images_path)
        ensure_directory_exists(assets_path)

        instances: List[Instance] = []

        # Process the desired set of instances
        for row in tqdm(
            load_dataset(
                self.HUGGINGFACE_DATASET_NAME,
                self._subset,
                split=self.helm_split_to_huggingface_split[self._split],
                cache_dir=output_path,
            )
        ):
            # UUIDs in the dataset may carry literal quote characters; strip them for file names.
            question_uuid: str = str(row["uuid"]).replace('"', "")
            # NOTE(review): [1:-1] strips one leading and one trailing character — presumably
            # the category values are quoted in the dataset; confirm against the dataset schema.
            if row["category"][1:-1] != self._subset:
                hlog(
                    f"Skipping instance {question_uuid} as it belong in category"
                    f" {row['category']} and not {self._subset}"
                )
                continue

            # Step 1: Preprocess the row
            row = self.preprocess_row(row, assets_path)

            # Step 2: Save the image locally (skipped if a previous run already saved it)
            image_path: str = os.path.join(images_path, f"{question_uuid}.png")
            if not os.path.exists(image_path):
                if not self._recompile_prompt:  # 2.a
                    row["image"].save(image_path)
                else:  # 2.b
                    if "structure" not in row:
                        raise ValueError("Cannot recompile prompt without structure")
                    structure: str = row["structure"]
                    text: str = self.compile_and_save(structure, assets_path, image_path)
                    row["text"] = text

            # Step 3: Create the prompt
            prompt: str = self.build_prompt(row)

            # Step 4: Create the multimedia content (prompt text followed by the image)
            image_object = MediaObject(location=image_path, content_type="image/png")
            content: List[MediaObject] = [
                MediaObject(text=prompt, content_type="text/plain"),
                image_object,
            ]

            # Step 5: Create the references
            # 5.a Create the reference containing the structure and the associated image.
            reference: Reference
            if "structure" in row:
                multimedia_object: MultimediaObject
                if os.path.exists(row["structure"]):
                    # 5.a.1 The structure is a path, therefore represent it as a multimedia object
                    # containing the files used to compile the structure (such as a repository
                    # containing the HTML, CSS, and JavaScript files used to generate a webpage)
                    multimedia_object = MultimediaObject(
                        [image_object, MediaObject(location=row["structure"], content_type="path/path")]
                    )
                elif row["structure"] == PROCESSED:
                    # 5.a.2 The structure has been processed and is no longer present in the row
                    # This can be the case if the structure is a base64 encoding of an archive that
                    # has been extracted to a temporary path and processed but the path is no longer
                    # existing (deleted after the processing is done)
                    multimedia_object = MultimediaObject([image_object])
                else:
                    # 5.a.3 The structure is not a path, therefore it is directly a valid string
                    # representing the structure (such as LaTeX code)
                    multimedia_object = MultimediaObject([image_object])
                reference = Reference(
                    output=Output(text=row["text"], multimedia_content=multimedia_object),
                    tags=[CORRECT_TAG],
                )
            else:
                if "text" in row:
                    reference = Reference(
                        output=Output(text=row["text"], multimedia_content=MultimediaObject([image_object])),
                        tags=[CORRECT_TAG],
                    )
                else:
                    # No structure and no text: the image alone is the reference.
                    reference = Reference(
                        output=Output(multimedia_content=MultimediaObject([image_object])), tags=[CORRECT_TAG]
                    )
            references: List[Reference] = [reference]

            # 5.b Create the reference containing the assets
            if len(row["assets_paths"]) > 0:
                assets_paths_reference: Reference = Reference(
                    output=Output(
                        text=", ".join(
                            row["assets_paths"]
                        ),  # TODO: This is for debugging purposes (to show in the frontend)
                        multimedia_content=MultimediaObject(
                            [
                                MediaObject(location=asset, content_type=f"image/{asset.split('.')[-1].lower()}")
                                for asset in row["assets_paths"]
                            ]
                        ),
                    ),
                    tags=[ASSET_PATH_TAG],
                )
                references.append(assets_paths_reference)
                assets_names_reference: Reference = Reference(
                    output=Output(
                        text=", ".join(
                            row["assets_names"]
                        ),  # TODO: This is for debugging purposes (to show in the frontend)
                        multimedia_content=MultimediaObject(
                            [MediaObject(text=asset, content_type="text/plain") for asset in row["assets_names"]]
                        ),
                    ),
                    tags=[ASSET_NAME_TAG],
                )
                references.append(assets_names_reference)

            # Step 6: Finalize the Instance
            self.finalize(row)
            instance = Instance(
                input=Input(multimedia_content=MultimediaObject(content)), references=references, split=self._split
            )
            instances.append(instance)

        assert len(instances) > 0, f"No instances found for subject {self._subset}"
        return instances
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from helm.benchmark.scenarios.scenario import VALID_SPLIT
|
|
2
|
+
from helm.benchmark.scenarios.vision_language.image2structure.utils_latex import (
|
|
3
|
+
latex_to_image,
|
|
4
|
+
strip_unnecessary_latex_parts,
|
|
5
|
+
)
|
|
6
|
+
from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import Image2StructureScenario
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class LatexScenario(Image2StructureScenario):
    """Evaluate multimodal models on LaTeX generation to recreate a provided image."""

    BASE_PROMPT = "Please provide the LaTeX code used to generate this image. Only generate the code relevant to what you see. Your code will be surrounded by all the imports necessary as well as the begin and end document delimiters."  # noqa: E501
    HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-latex"
    SUBSETS = ["equation", "table", "plot", "algorithm"]

    name = "image2latex"
    description = "Evaluate multimodal models on Latex generation to recreate a provided image"

    # NOTE: the previous `__init__` override only forwarded (subset, recompile_prompt, split)
    # to the parent with identical defaults, so it was removed; the inherited
    # `Image2StructureScenario.__init__` has the exact same signature and behavior.

    def compile_and_save(self, structure: str, assets_path: str, destination_path: str) -> str:
        """Compile `structure` (LaTeX code) to a cropped image saved at `destination_path`.

        Returns the compiled LaTeX code with unnecessary parts stripped.
        """
        image, infos = latex_to_image(structure, assets_path=assets_path, crop=True)
        image.save(destination_path)
        assert "latex_code" in infos
        text: str = strip_unnecessary_latex_parts(infos["latex_code"])
        return text
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from helm.benchmark.scenarios.scenario import VALID_SPLIT
|
|
2
|
+
from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import Image2StructureScenario
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class MusicSheetScenario(Image2StructureScenario):
    """Evaluate multimodal models on Lilypond generation to recreate a provided music-sheet image."""

    BASE_PROMPT = (
        "Please generate the Lilypond code to generate a music sheet that looks like this image as much as feasibly possible.\n"  # noqa: E501
        "This music sheet was created by me, and I would like to recreate it using Lilypond."
    )
    HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-musicsheet"
    SUBSETS = ["music"]

    name = "image2musicsheet"
    description = "Evaluate multimodal models on Lilypond generation to recreate a provided image"

    # NOTE: the previous `__init__` override only forwarded identical arguments/defaults to the
    # parent and was removed; the inherited `Image2StructureScenario.__init__` is equivalent.

    def compile_and_save(self, structure: str, assets_path: str, destination_path: str) -> str:
        """Not supported for this scenario: music sheets carry no compilable ground truth."""
        # NotImplementedError (still an Exception subclass, so existing handlers keep working)
        # is the idiomatic signal for an unsupported abstract-method implementation.
        raise NotImplementedError("Music sheets have no ground truth, compilation is not possible")