crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +134 -31
- crfm_helm-0.5.0.dist-info/RECORD +642 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +31 -3
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/gender_perturbation.py +1 -1
- helm/benchmark/augmentations/perturbation.py +8 -2
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/test_perturbation.py +11 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +20 -7
- helm/benchmark/metrics/basic_metrics.py +169 -664
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -110
- helm/benchmark/metrics/dry_run_metrics.py +2 -2
- helm/benchmark/metrics/efficiency_metrics.py +206 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +2 -2
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +2 -2
- helm/benchmark/metrics/test_classification_metrics.py +8 -5
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +74 -0
- helm/benchmark/model_metadata_registry.py +36 -0
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +26 -10
- helm/benchmark/presentation/schema.py +15 -40
- helm/benchmark/presentation/summarize.py +119 -79
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +1 -2
- helm/benchmark/presentation/test_summarize.py +3 -3
- helm/benchmark/run.py +54 -26
- helm/benchmark/run_expander.py +214 -16
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +162 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/classic_run_specs.py +1510 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +501 -0
- helm/benchmark/runner.py +51 -57
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +1 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +1 -1
- helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
- helm/benchmark/scenarios/scenario.py +4 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +6 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -2
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
- helm/benchmark/server.py +24 -1
- helm/benchmark/slurm_runner.py +70 -49
- helm/benchmark/static/benchmarking.js +1 -1
- helm/benchmark/static/schema_classic.yaml +258 -1066
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +2 -227
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vlm.yaml +576 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
- helm/benchmark/static_build/assets/index-d839df55.js +9 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_model_deployment_definition.py +14 -16
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -44
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +3 -9
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +203 -18
- helm/{proxy/clients → clients}/auto_client.py +59 -31
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +65 -7
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +4 -11
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +5 -5
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +5 -7
- helm/{proxy/clients → clients}/huggingface_client.py +43 -64
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
- helm/{proxy/clients → clients}/megatron_client.py +5 -5
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +302 -0
- helm/{proxy/clients → clients}/palmyra_client.py +6 -8
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +100 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +20 -8
- helm/{proxy/clients → clients}/together_client.py +12 -72
- helm/clients/vertexai_client.py +391 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vlm_client.py +104 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +53 -48
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +16 -4
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +1 -1
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +24 -1
- helm/common/key_value_store.py +35 -4
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/request.py +15 -17
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +1 -1
- helm/config/model_deployments.yaml +1069 -546
- helm/config/model_metadata.yaml +753 -31
- helm/config/tokenizer_configs.yaml +142 -43
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +8 -6
- helm/proxy/example_queries.py +29 -17
- helm/proxy/server.py +70 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +96 -16
- helm/proxy/services/service.py +30 -0
- helm/proxy/services/test_remote_service.py +4 -3
- helm/proxy/services/test_service.py +0 -12
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
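A large part of this release is a package reorganization that is visible in the listing above: model clients move from helm.proxy.clients to helm.clients, tokenizers move from helm.proxy.tokenizers to helm.tokenizers, and the monolithic helm/benchmark/run_specs.py is replaced by per-domain modules under helm/benchmark/run_specs/. The sketch below shows the kind of import update downstream code would need; it is illustrative only, and the class names OpenAIClient and HuggingFaceTokenizer are assumptions based on the file names rather than something this diff confirms.

# Hypothetical import migration for code that depends on crfm-helm.
# Before (0.4.0), clients and tokenizers lived under helm.proxy:
#   from helm.proxy.clients.openai_client import OpenAIClient                      # assumed class name
#   from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer   # assumed class name
# After (0.5.0), they are top-level packages:
from helm.clients.openai_client import OpenAIClient                      # assumed class name
from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer   # assumed class name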
helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py (new file)
@@ -0,0 +1,225 @@
+from typing import Dict, List, Any
+
+from helm.benchmark.scenarios.scenario import VALID_SPLIT
+from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import (
+    Image2StructureScenario,
+    PROCESSED,
+)
+from helm.benchmark.scenarios.vision_language.image2structure.webpage.jekyll_server import JekyllServer
+from helm.benchmark.scenarios.vision_language.image2structure.webpage.driver import (
+    save_random_screenshot,
+    ScreenshotOptions,
+)
+from helm.benchmark.scenarios.vision_language.image2structure.webpage.utils import convert_html_to_text
+from helm.common.general import ensure_directory_exists
+from helm.common.optional_dependencies import handle_module_not_found_error
+
+try:
+    from html2text import HTML2Text
+except ModuleNotFoundError as e:
+    handle_module_not_found_error(e, suggestions=["image2structure"])
+
+
+import base64
+import os
+import threading
+import tarfile
+import shutil
+import time
+import pickle
+
+
+def extract_repo(base64_encoding: str, repo_path: str) -> None:
+    # Decode the base64 string which corresponds to an archive
+    # and extract the files to the repo_path
+    try:
+        archive = base64.b64decode(base64_encoding)
+        # Write to .tar file
+        with open(os.path.join(repo_path, "repo.tar.gz"), "wb") as f:
+            f.write(archive)
+        # Extract
+        with tarfile.open(os.path.join(repo_path, "repo.tar.gz"), "r:gz") as tar:
+            tar.extractall(repo_path)
+    except Exception as e:
+        raise ValueError(f"Failed to decode and extract the base64 archive: {e}")
+
+
+def list_assets(repo_path: str, extensions: List[str]) -> List[str]:
+    asset_paths: List[str] = []
+    for root, _, files in os.walk(repo_path):
+        for file in files:
+            if file.split(".")[-1].lower() in extensions:
+                # Remove repo_path from the file path
+                # ignore paths that start with a dot or in _site
+                if not root.startswith(os.path.join(repo_path, "_site")) and not root.startswith(
+                    os.path.join(repo_path, ".")
+                ):
+                    asset_paths.append(os.path.relpath(os.path.join(root, file), repo_path))
+    return asset_paths
+
+
+def serve_and_take_screenshot(
+    repo_path: str,
+    destination_path: str,
+    screenshot_options: ScreenshotOptions = ScreenshotOptions(),
+    max_tries: int = 5,
+) -> Dict[str, Any]:
+    # Start the Jekyll server
+    # Select a unique port per thread
+    port: int = 4000 + int(threading.get_ident()) % 1000
+    server = JekyllServer(repo_path, port=port, verbose=False)
+    success: bool = server.start()
+    if not success:
+        # This runs on examples that are not expected to fail
+        server.stop()
+        raise ValueError(f"Jekyll server failed to start: {repo_path}")
+
+    # Take a screenshot of a random page
+    success = False
+    error: Exception
+    for _ in range(max_tries):
+        try:
+            infos: Dict[str, Any] = save_random_screenshot(destination_path, port=port, options=screenshot_options)
+            success = True
+            break
+        except Exception as e:
+            if "net::ERR_CONNECTION_REFUSED" in str(e):
+                error = e
+                server.stop()
+                time.sleep(0.5)
+                server.start()
+                time.sleep(0.5)
+            else:
+                # Do not retry
+                break
+    if not success:
+        raise ValueError(f"Failed to take a screenshot: {error}")
+
+    # Stop the server
+    server.stop()
+    time.sleep(0.1)
+
+    return infos
+
+
+class WebpageScenario(Image2StructureScenario):
+    BASE_PROMPT = (
+        "Please generate the source code to generate a webpage that looks like this image as much as feasibly possible.\n"  # noqa: E501
+        "You should output a json object associating each file name with its content.\n\n"
+        "Here is a simple example of the expected structure (that does not correspond to the image)."
+        " In this example, 3 files are created: index.html, style.css and script.js.\n"
+        "[\n"
+        " {\n"
+        ' "filename": "index.html",\n'
+        ' "content": "<!DOCTYPE html>\\n<html>\\n<head>\\n<title>Title of the document</title>\\n</head>\\n<body>\\n\\n<p>Content of the document......</p>\\n\\n</body>\\n</html>"\n'  # noqa: E501
+        " },\n"
+        " {\n"
+        ' "filename": "style.css",\n'
+        ' "content": "body {\\n background-color: lightblue;\\n}\\nh1 {\\n color: white;\\n text-align: center;\\n}"\n'  # noqa: E501
+        " },\n"
+        " {\n"
+        ' "filename": "script.js",\n'
+        ' "content": "document.getElementById(\\"demo\\").innerHTML = \\"Hello JavaScript!\\";"\n'
+        " }\n"
+        "]\n"
+        "You do not have to create files with the same names. Create as many files as you need, you can even use directories if necessary,"  # noqa: E501
+        " they will be created for you automatically. Try to write some realistic code keeping in mind that is should"
+        " look like the image as much as feasibly possible."
+    )
+
+    HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-webpage"
+    SUBSETS = ["css", "html", "javascript"]
+    MAX_TRIES: int = 5
+    ASSETS_EXTENSIONS: List[str] = ["png", "jpg", "jpeg", "gif", "svg", "webp", "ico", "bmp", "tiff"]
+
+    name = "image2webpage"
+    description = "Evaluate multimodal models on webpage generation to recreate a provided image"
+
+    def __init__(
+        self,
+        subset: str,
+        recompile_prompt: bool = True,
+        split: str = VALID_SPLIT,
+        screenshot_options: ScreenshotOptions = ScreenshotOptions(),
+    ):
+        super().__init__(subset, recompile_prompt, split)
+        self._screenshot_options = screenshot_options
+        self._html2text = HTML2Text()
+        self._html2text.ignore_links = True
+
+    def preprocess_row(self, row: Dict[str, Any], assets_path: str) -> Dict[str, Any]:
+        """Extract the base64 encoding of the repo from the row and return it."""
+        # No need to reprocess if the assets are already saved
+        assets_save_path: str = os.path.join(assets_path, str(row["uuid"].replace('"', "")))
+        if os.path.exists(assets_save_path):
+            try:
+                with open(os.path.join(assets_save_path, "assets_paths.pkl"), "rb") as f:
+                    row["assets_paths"] = pickle.load(f)
+                with open(os.path.join(assets_save_path, "assets_names.pkl"), "rb") as f:
+                    row["assets_names"] = pickle.load(f)
+                del row["assets"]
+                row["structure"] = PROCESSED
+                return row
+            except Exception:
+                # There was an issue when loading the assets, reprocess
+                shutil.rmtree(assets_save_path)
+        ensure_directory_exists(assets_save_path)
+
+        # Structure is a base64 encoding of the repo
+        if self._output_path is None:
+            raise ValueError("Output path not set")
+        repo_path = os.path.join(self._output_path, f"tmp{threading.get_ident()}_{self._subset}")
+        ensure_directory_exists(repo_path)
+
+        # Decode the base64 string which corresponds to an archive
+        # and extract the files to the repo_path
+        structure: str = row["structure"]
+        extract_repo(structure, repo_path)
+        row["structure"] = PROCESSED
+        row["repo_path"] = repo_path  # Stored for cleanup
+
+        # Process the assets
+        asset_paths: List[str] = list_assets(repo_path, self.ASSETS_EXTENSIONS)
+        del row["assets"]
+        row["assets_paths"] = []
+        row["assets_names"] = []
+        # Copy each asset to a unique persistent path
+        for i, asset_local_path in enumerate(asset_paths):
+            asset_name: str = asset_local_path
+            asset_dest_path = os.path.join(assets_save_path, f"{i}.{asset_local_path.split('.')[-1]}")
+            shutil.copyfile(os.path.join(row["repo_path"], asset_local_path), asset_dest_path)
+            row["assets_paths"].append(asset_dest_path)
+            row["assets_names"].append(asset_name)
+
+        # Save both assets_paths and assets_names as files than can be loaded
+        with open(os.path.join(assets_save_path, "assets_paths.pkl"), "wb") as f:
+            pickle.dump(row["assets_paths"], f)
+        with open(os.path.join(assets_save_path, "assets_names.pkl"), "wb") as f:
+            pickle.dump(row["assets_names"], f)
+
+        return row
+
+    def build_prompt(self, row: Dict[str, Any]) -> str:
+        prompt: str = self.BASE_PROMPT
+        assert "assets_paths" in row, "No assets paths in the row"
+        assert "assets_names" in row, "No assets names in the row"
+        assert len(row["assets_paths"]) == len(row["assets_names"])
+        if len(row["assets_names"]) > 0:
+            prompt += "\nYou have access to the following assets:\n"
+            for asset_local_path in row["assets_names"]:
+                prompt += f"- {asset_local_path}\n"
+        return prompt
+
+    def compile_and_save(self, structure: str, assets_path: str, destination_path: str) -> str:
+        # Structure is the path to the repo
+        # Serve and take screenshot
+        repo_path: str = structure
+        infos: Dict[str, Any] = serve_and_take_screenshot(repo_path, destination_path, self._screenshot_options)
+        text: str = convert_html_to_text(self._html2text, infos["html"])
+        return text
+
+    def finalize(self, row: Dict[str, Any]) -> None:
+        """Perform cleanup operations after the instance has been generated."""
+        if "repo_path" in row:
+            repo_path: str = row["repo_path"]
+            shutil.rmtree(repo_path)
helm/benchmark/scenarios/vision_language/mementos_scenario.py (new file)
@@ -0,0 +1,124 @@
+import os.path
+from typing import List
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    TRAIN_SPLIT,
+    Input,
+    Instance,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+from helm.common.media_object import MediaObject, MultimediaObject
+
+
+class MementosScenario(Scenario):
+    """
+    Mementos: A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences
+
+    We introduces Mementos, a new benchmark designed to assess MLLMs' sequential image reasoning abilities. Mementos
+    features 4,761 diverse image sequences with varying lengths.
+
+    @misc{wang2024mementos,
+        title={Mementos: A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences},
+        author={Xiyao Wang and Yuhang Zhou and Xiaoyu Liu and Hongjin Lu and Yuancheng Xu and Feihong He and Jaehong Yoon
+        and Taixi Lu and Gedas Bertasius and Mohit Bansal and Huaxiu Yao and Furong Huang},
+        year={2024},
+        eprint={2401.10529},
+        archivePrefix={arXiv},
+        primaryClass={cs.CV}
+    }
+
+    Paper: https://arxiv.org/abs/2401.10529
+    """
+
+    MEMENTOS_HUGGINGFACE_DATASET_NAME: str = "shenmishajing/unofficial_mementos_dataset"
+
+    IMAGE_URL: str = (
+        "https://huggingface.co/datasets/shenmishajing/unofficial_mementos_dataset/resolve/main/"
+        + "{subject}/{split}/{file_name}?download=true"
+    )
+
+    DATA_FILES: str = "{subject}/{split}/metadata.csv"
+
+    QUESTION_PROMPT: str = (
+        "Write a description for the given image sequence in a single paragraph, what is happening in this episode?"
+    )
+
+    SUBJECTS: List[str] = ["comics", "dailylife", "robotics"]
+
+    name = "mementos"
+    description = (
+        "A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences"
+        " ([paper](https://arxiv.org/abs/2401.10529))."
+    )
+    tags = ["vision-language"]
+
+    def __init__(self, subject: str):
+        super().__init__()
+        assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
+        self._subject: str = subject
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        subject_output_path: str = os.path.join(output_path, self._subject)
+        ensure_directory_exists(subject_output_path)
+
+        data_files = {
+            split: self.DATA_FILES.format(subject=self._subject, split=split) for split in [TRAIN_SPLIT, TEST_SPLIT]
+        }
+        instances: List[Instance] = []
+
+        for split in [TRAIN_SPLIT, TEST_SPLIT]:
+            cur_output_path = os.path.join(subject_output_path, split)
+            ensure_directory_exists(cur_output_path)
+
+            # Process the test set
+            for row in tqdm(
+                load_dataset(
+                    self.MEMENTOS_HUGGINGFACE_DATASET_NAME.format(subject=self._subject),
+                    data_files=data_files,
+                    split=split,
+                    cache_dir=cur_output_path,
+                )
+            ):
+                # Download the image
+                file_name: str = row["file_name"]
+                local_image_path: str = os.path.join(cur_output_path, file_name)
+                ensure_file_downloaded(
+                    source_url=self.IMAGE_URL.format(subject=self._subject, split=split, file_name=file_name),
+                    target_path=local_image_path,
+                    unpack=False,
+                )
+
+                content: List[MediaObject] = [
+                    MediaObject(location=local_image_path, content_type="image/png"),
+                    MediaObject(text=self.QUESTION_PROMPT, content_type="text/plain"),
+                ]
+                answer: str = row["description"]
+                instances.append(
+                    Instance(
+                        Input(multimedia_content=MultimediaObject(content)),
+                        references=[Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                        split=split,
+                    )
+                )
+
+        print()
+
+        return instances
+
+
+def main():
+    scenario = MementosScenario("robotics")
+    instances = scenario.get_instances("output")
+    print(instances)
+
+
+if __name__ == "__main__":
+    main()
helm/benchmark/scenarios/vision_language/mme_scenario.py (new file)
@@ -0,0 +1,145 @@
+import os.path
+from typing import List
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists
+
+
+class MMEScenario(Scenario):
+    """
+    MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models
+
+    Multimodal Large Language Model (MLLM) relies on the powerful LLM to perform
+    multimodal tasks, showing amazing emergent abilities in recent studies. However,
+    it is difficult for these case studies to fully reflect the performance of MLLM,
+    lacking a comprehensive evaluation. In MME, we fill in this blank, presenting
+    the first comprehensive MLLM Evaluation benchmark MME. It measures both perception
+    and cognition abilities on a total of 14 subtasks. In order to avoid data leakage
+    that may arise from direct use of public datasets for evaluation, the annotations
+    of instruction-answer pairs are all manually designed. The concise instruction design
+    allows us to fairly compare MLLMs, instead of struggling in prompt engineering.
+    Besides, with such an instruction, we can also easily carry out quantitative
+    statistics. We rephrase the answer type of MME to multiple-choice question-answering.
+    We use the multiple-choice metrics for 14 different evaluation tasks.
+
+    @article{fu2023mme,
+        title={MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language Models},
+        author={Fu, Chaoyou and Chen, Peixian and Shen, Yunhang and Qin, Yulei and
+        Zhang, Mengdan and Lin, Xu and Yang, Jinrui and Zheng, Xiawu and Li, Ke and
+        Sun, Xing and Wu, Yunsheng and Ji, Rongrong},
+        journal={arXiv preprint arXiv:2306.13394},
+        year={2023}
+    }
+
+    Paper: https://arxiv.org/abs/2306.13394
+    """
+
+    MME_HUGGINGFACE_DATASET_NAME: str = "lmms-lab/MME"
+
+    SUBJECTS: List[str] = [
+        "existence",
+        "scene",
+        "posters",
+        "color",
+        "OCR",
+        "position",
+        "celebrity",
+        "artwork",
+        "commonsense_reasoning",
+        "numerical_calculation",
+        "landmark",
+        "count",
+        "text_translation",
+        "code_reasoning",
+    ]
+
+    name = "mme"
+    description = "Evaluate multimodal models on ([paper](https://arxiv.org/abs/2306.13394))."
+    tags = ["vision-language"]
+    options: List[str] = ["Yes", "No"]
+
+    def __init__(self, subject: str):
+        super().__init__()
+        assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
+        self._subject: str = subject
+
+    def get_label_from_answer(self, answer: str):
+        label: str
+        if answer == "Yes":
+            label = "A"
+        elif answer == "No":
+            label = "B"
+        else:
+            raise NotImplementedError(f"Invalid answer: {answer}")
+        return label
+
+    def remove_question_suffix_for_mcqa(self, question: str):
+        return question.replace("Please answer yes or no.", "").strip()
+
+    def get_question_id(self, question_id: str):
+        return question_id.split(".")[0].replace("/", "-")
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        images_path: str = os.path.join(output_path, "images")
+        ensure_directory_exists(images_path)
+
+        # There is only the test split in Unicorn benchmark
+        instances: List[Instance] = []
+        # Process the test set
+        # Two open-ended generation instances and
+        # one multi-choice generation instance per row
+        for row in tqdm(
+            load_dataset(
+                self.MME_HUGGINGFACE_DATASET_NAME,
+                split=TEST_SPLIT,
+                cache_dir=output_path,
+            )
+        ):
+            if row["category"] != self._subject:
+                continue
+            question_id: str = self.get_question_id(row["question_id"])
+            # Save the image locally
+            image_path: str = os.path.join(images_path, f"{question_id}.png")
+            if not os.path.exists(image_path):
+                row["image"].save(image_path)
+
+            question: str = self.remove_question_suffix_for_mcqa(row["question"])
+            answer: str = row["answer"]
+            references: List[Reference] = []
+
+            answer = self.get_label_from_answer(answer)
+            # The given correct answer is a letter, but we need an index
+            correct_answer_index: int = ord(answer) - ord("A")
+            # The options are originally appended to the question
+
+            for i, option in enumerate(self.options):
+                reference: Reference
+                is_correct: bool = i == correct_answer_index
+                reference = Reference(Output(text=option), tags=[CORRECT_TAG] if is_correct else [])
+                references.append(reference)
+
+            content = [
+                MediaObject(location=image_path, content_type="image/png"),
+                MediaObject(text=question, content_type="text/plain"),
+            ]
+            instances.append(
+                Instance(
+                    Input(multimedia_content=MultimediaObject(content)),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
helm/benchmark/scenarios/vision_language/mmmu_scenario.py (new file)
@@ -0,0 +1,187 @@
+import os.path
+from typing import Dict, List
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    VALID_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists
+
+
+class MMMUScenario(Scenario):
+    """
+    MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI
+
+    We introduce MMMU: a new benchmark designed to evaluate multimodal models on massive multi-discipline
+    tasks demanding college-level subject knowledge and deliberate reasoning. MMMU includes 11.5K meticulously
+    collected multimodal questions from college exams, quizzes, and textbooks, covering six core disciplines:
+    Art & Design, Business, Science, Health & Medicine, Humanities & Social Science, and Tech & Engineering.
+    These questions span 30 subjects and 183 subfields, comprising 30 highly heterogeneous image types, such
+    as charts, diagrams, maps, tables, music sheets, and chemical structures.
+
+    @article{yue2023mmmu,
+        title={MMMU: A Massive Multi-discipline Multimodal Understanding and Reasoning Benchmark for Expert AGI},
+        author={Xiang Yue and Yuansheng Ni and Kai Zhang and Tianyu Zheng and Ruoqi Liu and Ge Zhang and Samuel
+        Stevens and Dongfu Jiang and Weiming Ren and Yuxuan Sun and Cong Wei and Botao Yu and Ruibin Yuan and
+        Renliang Sun and Ming Yin and Boyuan Zheng and Zhenzhu Yang and Yibo Liu and Wenhao Huang and Huan Sun
+        and Yu Su and Wenhu Chen},
+        journal={arXiv preprint arXiv:2311.16502},
+        year={2023},
+    }
+
+    Paper: https://arxiv.org/abs/2311.16502
+    Website: https://mmmu-benchmark.github.io/
+    """
+
+    MMMU_HUGGINGFACE_DATASET_NAME: str = "MMMU/MMMU"
+    MAX_NUM_IMAGES: int = 7
+
+    SUBJECTS: List[str] = [
+        "Accounting",
+        "Agriculture",
+        "Architecture_and_Engineering",
+        "Art",
+        "Art_Theory",
+        "Basic_Medical_Science",
+        "Biology",
+        "Chemistry",
+        "Clinical_Medicine",
+        "Computer_Science",
+        "Design",
+        "Diagnostics_and_Laboratory_Medicine",
+        "Economics",
+        "Electronics",
+        "Energy_and_Power",
+        "Finance",
+        "Geography",
+        "History",
+        "Literature",
+        "Manage",
+        "Marketing",
+        "Materials",
+        "Math",
+        "Mechanical_Engineering",
+        "Music",
+        "Pharmacy",
+        "Physics",
+        "Psychology",
+        "Public_Health",
+        "Sociology",
+    ]
+
+    name = "mmmu"
+    description = (
+        "Evaluate multimodal models on massive multi-discipline tasks demanding college-level "
+        "subject knowledge and deliberate reasoning ([paper](https://arxiv.org/abs/2311.16502))."
+    )
+    tags = ["vision-language"]
+
+    def __init__(self, subject: str, question_type: str):
+        super().__init__()
+        assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
+        self._subject: str = subject
+
+        assert question_type in ["multiple-choice", "open"], f"Invalid question type: {question_type}"
+        self._question_type: str = question_type
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        images_path: str = os.path.join(output_path, "images", self._subject)
+        ensure_directory_exists(images_path)
+
+        instances: List[Instance] = []
+
+        # Process the validation set
+        # There seems to be a dev set, but it's unavailable through load_dataset.
+        # When loading "dev", we get error: ValueError: Unknown split "dev". Should be one of ['test', 'validation'].
+        # The test set doesn't have answers, since the MMMU competition/leaderboard uses the test set
+        for row in tqdm(
+            load_dataset(self.MMMU_HUGGINGFACE_DATASET_NAME, self._subject, split="validation", cache_dir=output_path)
+        ):
+            # Skip questions that aren't in the subject we're interested in
+            if row["question_type"] != self._question_type:
+                continue
+
+            question_id: str = row["id"]
+            if self._subject not in question_id:
+                continue
+
+            question_template: str = row["question"]
+            options: List[str] = eval(row["options"])  # Convert the string to list of options
+            answer: str = row["answer"]
+
+            # Create the question. Questions can have text and images interleaved
+            question_template_to_image_path: Dict[str, str] = {}
+            content: List[MediaObject] = []
+            for img_number in range(1, self.MAX_NUM_IMAGES):
+                image_id: str = f"image_{img_number}"
+                if row[image_id] is None:
+                    # At this point, there are no more images for this question
+                    break
+
+                # Save the image locally
+                image_path: str = os.path.join(images_path, f"{question_id}_{image_id}.png")
+                if not os.path.exists(image_path):
+                    row[image_id].save(image_path)
+
+                image_template_tag: str = f"<image {img_number}>"
+                question_template_to_image_path[image_template_tag] = image_path
+
+                # There are cases when the image is included, but it is not used either in the
+                # question template or in the answer options
+                if image_template_tag not in question_template:
+                    # The image is not in the question template
+                    continue
+
+                head, question_template = question_template.split(image_template_tag, 1)
+                if head:
+                    content.append(MediaObject(text=head, content_type="text/plain"))
+                content.append(MediaObject(location=image_path, content_type="image/png"))
+
+            # Add the rest of the question template
+            if question_template:
+                content.append(MediaObject(text=question_template, content_type="text/plain"))
+
+            # Add the references
+            references: List[Reference] = []
+            if self._question_type == "multiple-choice":
+                # The given correct answer is a letter, but we need an index
+                correct_answer_index: int = ord(answer) - ord("A")
+
+                for i, option in enumerate(options):
+                    reference: Reference
+                    is_correct: bool = i == correct_answer_index
+
+                    if option in question_template_to_image_path:
+                        # Image answer choices
+                        image_path = question_template_to_image_path[option]
+                        image_choice = MediaObject(location=image_path, content_type="image/png")
+                        reference = Reference(
+                            Output(multimedia_content=MultimediaObject([image_choice])),
+                            tags=[CORRECT_TAG] if is_correct else [],
+                        )
+                    else:
+                        # Text answer choices
+                        reference = Reference(Output(text=option), tags=[CORRECT_TAG] if is_correct else [])
+
+                    references.append(reference)
+            else:
+                references.append(Reference(Output(text=answer), tags=[CORRECT_TAG]))
+
+            instances.append(
+                Instance(
+                    Input(multimedia_content=MultimediaObject(content)),
+                    references=references,
+                    split=VALID_SPLIT,
+                )
+            )
+
+        return instances