crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
- crfm_helm-0.5.0.dist-info/RECORD +642 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +37 -2
- helm/benchmark/adaptation/adapters/adapter.py +4 -42
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/prompt.py +7 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/cleva_perturbation.py +7 -14
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
- helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +2 -2
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +0 -7
- helm/benchmark/augmentations/perturbation.py +20 -7
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +2 -2
- helm/benchmark/augmentations/test_perturbation.py +11 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +38 -0
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +37 -7
- helm/benchmark/metrics/basic_metrics.py +172 -641
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics.py +4 -3
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +6 -112
- helm/benchmark/metrics/dry_run_metrics.py +5 -3
- helm/benchmark/metrics/efficiency_metrics.py +206 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +5 -5
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +6 -7
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +8 -8
- helm/benchmark/metrics/test_classification_metrics.py +9 -6
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +164 -41
- helm/benchmark/model_metadata_registry.py +181 -35
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +50 -17
- helm/benchmark/presentation/schema.py +28 -46
- helm/benchmark/presentation/summarize.py +213 -96
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +14 -9
- helm/benchmark/presentation/test_summarize.py +5 -0
- helm/benchmark/run.py +66 -54
- helm/benchmark/run_expander.py +342 -31
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +162 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +501 -0
- helm/benchmark/runner.py +116 -69
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/cleva_scenario.py +43 -46
- helm/benchmark/scenarios/code_scenario.py +3 -2
- helm/benchmark/scenarios/commonsense_scenario.py +171 -191
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/legalbench_scenario.py +123 -0
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +3 -3
- helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
- helm/benchmark/scenarios/raft_scenario.py +2 -6
- helm/benchmark/scenarios/scenario.py +14 -2
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +22 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/the_pile_scenario.py +6 -7
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
- helm/benchmark/server.py +59 -2
- helm/benchmark/slurm_jobs.py +12 -0
- helm/benchmark/slurm_runner.py +79 -51
- helm/benchmark/static/benchmarking.js +3 -4
- helm/benchmark/static/contamination.yaml +1 -1
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +824 -0
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vlm.yaml +576 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
- helm/benchmark/static_build/assets/index-d839df55.js +9 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_model_deployment_definition.py +90 -0
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/tokenizer_config_registry.py +10 -14
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -35
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/no_decoding_window_service.py +32 -0
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +24 -269
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +5 -12
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +213 -24
- helm/clients/auto_client.py +215 -0
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +67 -55
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +6 -17
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +7 -8
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +6 -10
- helm/{proxy/clients → clients}/huggingface_client.py +134 -92
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
- helm/{proxy/clients → clients}/megatron_client.py +13 -7
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +302 -0
- helm/{proxy/clients → clients}/palmyra_client.py +15 -12
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +15 -15
- helm/clients/test_client.py +100 -0
- helm/clients/test_huggingface_client.py +70 -0
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +23 -12
- helm/{proxy/clients → clients}/together_client.py +18 -71
- helm/clients/vertexai_client.py +391 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vlm_client.py +104 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +24 -179
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/concurrency.py +32 -0
- helm/common/credentials_utils.py +28 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +29 -10
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +24 -1
- helm/common/key_value_store.py +113 -0
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +88 -0
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +2 -2
- helm/common/request.py +36 -27
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +6 -3
- helm/config/__init__.py +0 -0
- helm/config/model_deployments.yaml +1942 -0
- helm/config/model_metadata.yaml +2201 -0
- helm/config/tokenizer_configs.yaml +362 -0
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +13 -5
- helm/proxy/example_queries.py +29 -17
- helm/proxy/retry.py +8 -2
- helm/proxy/server.py +77 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +103 -20
- helm/proxy/services/service.py +34 -2
- helm/proxy/services/test_remote_service.py +7 -6
- helm/proxy/services/test_service.py +27 -18
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/py.typed +0 -0
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
- helm/tokenizers/auto_tokenizer.py +93 -0
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_anthropic_tokenizer.py +82 -0
- helm/tokenizers/test_huggingface_tokenizer.py +136 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/tokenizers/vertexai_tokenizer.py +97 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.3.0.dist-info/RECORD +0 -396
- helm/benchmark/vlm_run_specs.py +0 -71
- helm/benchmark/window_services/anthropic_window_service.py +0 -68
- helm/benchmark/window_services/bloom_window_service.py +0 -35
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/gptj_window_service.py +0 -38
- helm/benchmark/window_services/gptneox_window_service.py +0 -41
- helm/benchmark/window_services/http_model_window_service.py +0 -28
- helm/benchmark/window_services/huggingface_window_service.py +0 -59
- helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
- helm/benchmark/window_services/llama_window_service.py +0 -28
- helm/benchmark/window_services/luminous_window_service.py +0 -67
- helm/benchmark/window_services/megatron_window_service.py +0 -10
- helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
- helm/benchmark/window_services/openai_window_service.py +0 -13
- helm/benchmark/window_services/opt_window_service.py +0 -35
- helm/benchmark/window_services/palmyra_window_service.py +0 -45
- helm/benchmark/window_services/remote_window_service.py +0 -48
- helm/benchmark/window_services/santacoder_window_service.py +0 -27
- helm/benchmark/window_services/starcoder_window_service.py +0 -27
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/benchmark/window_services/wider_openai_window_service.py +0 -52
- helm/proxy/clients/aleph_alpha_client.py +0 -99
- helm/proxy/clients/auto_client.py +0 -461
- helm/proxy/clients/goose_ai_client.py +0 -100
- helm/proxy/clients/microsoft_client.py +0 -182
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/remote_model_registry.py +0 -28
- helm/proxy/clients/simple_client.py +0 -61
- helm/proxy/clients/test_anthropic_client.py +0 -63
- helm/proxy/clients/test_client.py +0 -31
- helm/proxy/clients/test_huggingface_client.py +0 -87
- helm/proxy/models.py +0 -963
- helm/proxy/test_models.py +0 -27
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
- helm/proxy/token_counters/test_openai_token_counter.py +0 -79
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
from typing import Tuple, Dict, Any
|
|
2
|
+
|
|
3
|
+
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
4
|
+
|
|
5
|
+
try:
|
|
6
|
+
from selenium import webdriver
|
|
7
|
+
import selenium.common.exceptions
|
|
8
|
+
except ModuleNotFoundError as e:
|
|
9
|
+
handle_module_not_found_error(e, suggestions=["image2structure"])
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def init_driver(url: str, resolution: Tuple[int, int] = (1920, 1080)) -> webdriver.Chrome:
    """Initialize a headless Chrome WebDriver and load `url` in it.

    Args:
        url (str): The URL of the website. Usually "http://localhost:{port}".
        resolution (tuple[int, int], optional): The (width, height) passed to Chrome
            through `--window-size`. Defaults to (1920, 1080).

    Returns:
        webdriver.Chrome: The Chrome WebDriver, after `driver.get(url)` has returned.

    Raises:
        Any exception raised while creating the driver or loading the page. When the
        page load fails, the driver is quit before the exception propagates.
    """
    options = webdriver.ChromeOptions()
    options.add_argument(f"--window-size={resolution[0]},{resolution[1]}")
    options.add_argument("--headless")  # Optional: run in headless mode
    options.add_argument("--no-sandbox")  # Optional: for certain environments
    options.add_argument("--disable-dev-shm-usage")  # Optional: overcome limited resource problems
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
    except BaseException:
        # The caller never receives the driver when the page load fails, so nobody
        # else can quit it; without this, every failed load leaves a browser running.
        driver.quit()
        raise
    return driver
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def close_driver(driver: webdriver.Chrome):
    """Shut the WebDriver down by calling its `quit()` method.

    Args:
        driver (webdriver.Chrome): The Chrome WebDriver to quit.
    """
    driver.quit()
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class ScreenshotOptions:
    """Holder for the parameters used when taking a screenshot."""

    # The resolution of the screenshot
    resolution: Tuple[int, int] = (1920, 1080)

    # The delay between each action in milliseconds
    delay_between_each_action_ms: int = 1000
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def save_random_screenshot(path: str, port: int, options: ScreenshotOptions = ScreenshotOptions()) -> Dict[str, Any]:
    """Save a screenshot of the page served at http://localhost:{port}

    Args:
        path (str): The path to save the screenshot
        port (int): The port to use for the website.
        options (ScreenshotOptions, optional): The options to use for taking the screenshot.
            Defaults to ScreenshotOptions(). The default instance is shared between calls;
            this function only reads `options.resolution` from it.

    Returns:
        infos (Dict[str, Any]): Additional information about the screenshot.
            Contains the key "html" with the page source read from the driver.

    Raises:
        ValueError: If the path does not end with .png
        Exception: If the driver could not be initialized. The message embeds the text of
            the underlying error, which is also attached as `__cause__`.
    """
    if not path.endswith(".png"):
        raise ValueError("The path should end with .png")

    driver: webdriver.Chrome
    try:
        driver = init_driver(url=f"http://localhost:{port}", resolution=options.resolution)
    except selenium.common.exceptions.WebDriverException as e:
        raise Exception(f"Failed to initialize the driver: {e}") from e
    except Exception as e:
        raise Exception(f"An unknown error occurred while initializing the driver: {e}") from e

    try:
        # Extract the HTML of the page
        html = driver.page_source

        # Take a screenshot of the page
        driver.save_screenshot(path)
    finally:
        # Always release the browser, including when reading the page or saving the
        # screenshot fails; otherwise the driver would never be quit.
        close_driver(driver)

    return {"html": html}
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
import os
|
|
3
|
+
import signal
|
|
4
|
+
from typing import Optional
|
|
5
|
+
import time
|
|
6
|
+
import socket
|
|
7
|
+
import threading
|
|
8
|
+
|
|
9
|
+
from helm.common.hierarchical_logger import hlog
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class JekyllServer:
    """A class to start and stop a Jekyll server in a separate process.

    The server is launched with `bundle exec jekyll serve` from `repo_path`, in its own
    session so that the whole process group can be signalled when stopping.
    """

    def __init__(self, repo_path: str, port: int, verbose: bool = False):
        """Store the configuration; no process is started until `start()` is called.

        Args:
            repo_path (str): Directory of the Jekyll site to serve.
            port (int): Port passed to `jekyll serve` and written to `_config.yml`.
            verbose (bool, optional): Whether to log progress with `hlog`. Defaults to False.
        """
        self.repo_path: str = repo_path
        self.verbose: bool = verbose
        self.port: int = port
        self.process: Optional[subprocess.Popen] = None
        self.success: bool = False  # Shared flag to indicate if the server started successfully

    def __del__(self):
        """Stop the server and free the port when the object is garbage collected."""
        self.stop()
        if JekyllServer.is_port_in_use(self.port):
            if self.verbose:
                hlog(f"Port {self.port} is in use. Attempting to free it.")
            self.kill_process_using_port(self.port)
        if self.verbose:
            hlog("JekyllServer object deleted.")

    def setup_gemfile(self):
        """Make sure the repository has a Gemfile that mentions jekyll.

        If there is no Gemfile, `Gemfile.default` (located next to this module) is copied
        in. If a Gemfile exists without the word "jekyll", a jekyll gem line is appended.
        """
        import shutil  # Local import: this module does not import shutil at file level

        gemfile_path = os.path.join(self.repo_path, "Gemfile")

        # Check if Gemfile exists, if not, copy Gemfile.default to Gemfile
        if not os.path.exists(gemfile_path):
            default_gemfile_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "Gemfile.default")
            try:
                # Copy in-process rather than through `cp` in a shell, so paths that
                # contain spaces or shell metacharacters are handled correctly.
                shutil.copyfile(default_gemfile_path, gemfile_path)
            except OSError as e:
                # A failed copy used to go unnoticed; keep it non-fatal (start() will
                # report the failure) but make it visible.
                hlog(f"Failed to copy Gemfile.default to Gemfile: {e}")
                return
            if self.verbose:
                hlog("Copied Gemfile.default to Gemfile")
            return

        # Gemfile exists, check if it has the jekyll gem
        with open(gemfile_path) as file:
            contents = file.read()
        if "jekyll" in contents:
            # TODO: figure out if we need to do anything here
            return

        # Gemfile exists, but doesn't have jekyll gem
        with open(gemfile_path, "a") as file:
            # Start on a fresh line so the gem entry is not glued to the last line of a
            # Gemfile that has no trailing newline.
            if contents and not contents.endswith("\n"):
                file.write("\n")
            file.write('gem "jekyll", "~> 4.3.3"\n')
        if self.verbose:
            hlog("Added jekyll gem to Gemfile")

    def setup_config(self):
        """Make sure `_config.yml` exists and that its top-level `port` key is `self.port`.

        If there is no `_config.yml`, `_config.default.yml` (located next to this module)
        is copied in. A config without a `port` line is left without one; the port is
        also passed on the command line by `start()`.
        """
        import shutil  # Local import: this module does not import shutil at file level

        config_path = os.path.join(self.repo_path, "_config.yml")

        # Check if _config.yml exists, if not, copy _config.default.yml to _config.yml
        if not os.path.exists(config_path):
            default_config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "_config.default.yml")
            try:
                shutil.copyfile(default_config_path, config_path)
            except OSError as e:
                hlog(f"Failed to copy _config.default.yml to _config.yml: {e}")
            else:
                if self.verbose:
                    hlog("Copied _config.default.yml to _config.yml")

        # Search for line starting with "port:" and replace it with "port: <port>"
        with open(config_path, "r") as file:
            lines = file.readlines()
        with open(config_path, "w") as file:
            for line in lines:
                key, separator, _ = line.partition(":")
                # Compare the whole key so that other keys that merely begin with
                # "port" (for example "portfolio:") are not overwritten.
                if separator and key.rstrip() == "port":
                    file.write(f"port: {self.port}\n")
                else:
                    file.write(line)

    @staticmethod
    def is_port_in_use(port: int) -> bool:
        """Check if a port is in use on localhost."""
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            return s.connect_ex(("localhost", port)) == 0

    def kill_process_using_port(self, port: int):
        """Find and kill the process using the specified port."""
        command = f"lsof -ti:{port} | grep '[0-9]' | xargs -r kill -9"
        os.system(command)
        if self.verbose:
            hlog(f"Killed process using port {port}.")

    def stream_output(self, process: subprocess.Popen):
        """Read from stdout and stderr streams and hlog.

        Returns as soon as the "Server running..." line is seen on stdout
        (`self.success = True`), or when stdout is exhausted. In the latter case one
        line of stderr is read, and if it is non-empty `self.success` is set to False.
        """
        assert process.stdout is not None
        assert process.stderr is not None
        while True:
            output = process.stdout.readline()
            if not output:
                err = process.stderr.readline()
                if err:
                    decoded_line = err.decode("utf-8").strip()
                    if self.verbose:
                        hlog(f"\t> \033[91mStderr: {decoded_line}\033[0m")
                    self.success = False
                    break
                else:
                    # No more output
                    break
            else:
                decoded_line = output.decode("utf-8").strip()
                if self.verbose:
                    hlog(f"\t> Stdout: {decoded_line}")
                if "Server running... press ctrl-c to stop." in decoded_line:
                    self.success = True
                    break

    def _signal_process_group(self, sig: int) -> None:
        """Send `sig` to the server's process group; do nothing if it is already gone."""
        if self.process is None:
            return
        try:
            os.killpg(os.getpgid(self.process.pid), sig)
        except ProcessLookupError:
            pass

    def start(self, timeout: int = 30) -> bool:
        """Start the Jekyll server in a separate process and monitor the output.

        Args:
            timeout (int, optional): Seconds to wait for the "Server running" line.
                Defaults to 30.

        Returns:
            bool: True if the server reported that it is running, False otherwise.
        """
        if JekyllServer.is_port_in_use(self.port):
            if self.verbose:
                hlog(f"Port {self.port} is in use. Attempting to free it.")
            self.kill_process_using_port(self.port)

        self.setup_gemfile()
        self.setup_config()

        # Forget the outcome of any previous start(): stream_output() leaves the flag
        # untouched when the process exits without output, which would otherwise
        # report a stale success.
        self.success = False

        # Run from the repository via `cwd` instead of interpolating the path into a
        # `cd ... &&` shell string, which breaks on spaces and shell metacharacters.
        subprocess.run(
            "bundle install",
            shell=True,
            cwd=self.repo_path,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )

        command_serve = f"bundle exec jekyll serve --port {self.port}"
        self.process = subprocess.Popen(
            command_serve,
            shell=True,
            cwd=self.repo_path,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            # Same effect as preexec_fn=os.setsid (new session / process group), without
            # running Python code between fork and exec, which the subprocess docs flag
            # as unsafe when the application uses threads.
            start_new_session=True,
        )

        # Start thread to read output
        output_thread = threading.Thread(target=self.stream_output, args=(self.process,))
        output_thread.start()

        # Wait for the thread to complete or timeout
        output_thread.join(timeout=timeout)

        if output_thread.is_alive():
            # If the thread is still alive after the timeout, the server did not start
            # successfully within the timeout period
            hlog("Timeout reached without detecting server start.")
            # Stop the whole process group, not just the shell: a surviving child keeps
            # the stdout pipe open and the reader thread would never see EOF.
            self.stop()
            output_thread.join()  # Ensure the thread is cleaned up
            return False
        else:
            if self.verbose:
                if self.success:
                    hlog("Jekyll server started successfully.")
                else:
                    hlog("Jekyll server failed to start.")
            return self.success  # Return the success flag

    def stop(self, timeout=5):
        """Stop the Jekyll server and terminate the process with a timeout.

        Args:
            timeout (int, optional): Time to wait for the server to gracefully shut down. Defaults to 5 seconds.
        """
        if self.process:
            try:
                # Try to terminate the process group gracefully
                self._signal_process_group(signal.SIGTERM)
                self.process.terminate()

                # Wait up to `timeout` seconds for process to terminate
                for _ in range(timeout):
                    if self.process.poll() is not None:  # Process has terminated
                        break
                    time.sleep(1)  # Wait a bit before checking again
                else:
                    # If the process is still alive after the timeout, kill it
                    self._signal_process_group(signal.SIGKILL)
                    self.process.kill()
                    self.process.wait()  # Wait for process to be killed
                    if self.verbose:
                        hlog("Jekyll server forcefully stopped.")
            except Exception as e:
                if self.verbose:
                    hlog(f"Error stopping the Jekyll server: {e}")

            self.process = None
            if self.verbose:
                hlog("Jekyll server stopped.")
        elif self.verbose:
            hlog("Jekyll server is not running.")
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
4
|
+
|
|
5
|
+
try:
|
|
6
|
+
from html2text import HTML2Text
|
|
7
|
+
except ModuleNotFoundError as e:
|
|
8
|
+
handle_module_not_found_error(e, suggestions=["image2structure"])
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def convert_html_to_text(handler: HTML2Text, html: str) -> str:
    """Convert HTML to text and normalize its whitespace

    Args:
        handler (HTML2Text): The HTML2Text handler; its `handle` method does the conversion
        html (str): The HTML to convert

    Returns:
        str: The text, with runs of spaces/tabs collapsed to one space, no leading or
            trailing blanks on any line, no empty lines, and stripped at both ends
    """
    text: str = handler.handle(html)
    # Normalize every run of spaces and/or tabs to a single space. Both characters are
    # handled in one pass: collapsing spaces first and converting tabs afterwards would
    # leave several consecutive spaces wherever tabs and spaces were mixed.
    text = re.sub(r"[ \t]+", " ", text)
    # Remove leading and trailing spaces on each line
    text = re.sub(r"^[ \t]+|[ \t]+$", "", text, flags=re.MULTILINE)
    # Remove unnecessary whitespace - multiple empty lines and tabulations
    text = re.sub(r"\n\s*\n", "\n", text)

    return text.strip()
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
from typing import Dict, List, Any
|
|
2
|
+
|
|
3
|
+
from helm.benchmark.scenarios.scenario import VALID_SPLIT
|
|
4
|
+
from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import (
|
|
5
|
+
Image2StructureScenario,
|
|
6
|
+
PROCESSED,
|
|
7
|
+
)
|
|
8
|
+
from helm.benchmark.scenarios.vision_language.image2structure.webpage.jekyll_server import JekyllServer
|
|
9
|
+
from helm.benchmark.scenarios.vision_language.image2structure.webpage.driver import (
|
|
10
|
+
save_random_screenshot,
|
|
11
|
+
ScreenshotOptions,
|
|
12
|
+
)
|
|
13
|
+
from helm.benchmark.scenarios.vision_language.image2structure.webpage.utils import convert_html_to_text
|
|
14
|
+
from helm.common.general import ensure_directory_exists
|
|
15
|
+
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
16
|
+
|
|
17
|
+
try:
|
|
18
|
+
from html2text import HTML2Text
|
|
19
|
+
except ModuleNotFoundError as e:
|
|
20
|
+
handle_module_not_found_error(e, suggestions=["image2structure"])
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
import base64
|
|
24
|
+
import os
|
|
25
|
+
import threading
|
|
26
|
+
import tarfile
|
|
27
|
+
import shutil
|
|
28
|
+
import time
|
|
29
|
+
import pickle
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def extract_repo(base64_encoding: str, repo_path: str) -> None:
    """Decode a base64-encoded .tar.gz archive and extract it into `repo_path`.

    The decoded archive is written to `repo_path/repo.tar.gz` and left there after
    extraction.

    Args:
        base64_encoding (str): The base64 string that encodes the gzipped tar archive.
        repo_path (str): An existing directory that receives the archive and its contents.

    Raises:
        ValueError: If decoding, writing or extracting fails, or if an archive member
            (or the target of a link member) would end up outside `repo_path`.
    """
    try:
        archive = base64.b64decode(base64_encoding)
        # Write to .tar file
        archive_path = os.path.join(repo_path, "repo.tar.gz")
        with open(archive_path, "wb") as f:
            f.write(archive)
        # Extract
        extraction_root = os.path.realpath(repo_path)
        with tarfile.open(archive_path, "r:gz") as tar:
            # The archive comes from downloaded dataset rows, so refuse members that
            # would be written outside of repo_path (absolute names, "..", or links
            # that point outside) before extracting anything.
            for member in tar.getmembers():
                member_path = os.path.realpath(os.path.join(extraction_root, member.name))
                checked_paths = [member_path]
                if member.issym():
                    # Symbolic link targets are relative to the link's own directory
                    checked_paths.append(
                        os.path.realpath(os.path.join(os.path.dirname(member_path), member.linkname))
                    )
                elif member.islnk():
                    # Hard link targets are relative to the archive root
                    checked_paths.append(os.path.realpath(os.path.join(extraction_root, member.linkname)))
                for checked_path in checked_paths:
                    if os.path.commonpath([extraction_root, checked_path]) != extraction_root:
                        raise ValueError(f"Archive member escapes the extraction directory: {member.name}")
            tar.extractall(repo_path)
    except Exception as e:
        raise ValueError(f"Failed to decode and extract the base64 archive: {e}") from e
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def list_assets(repo_path: str, extensions: List[str]) -> List[str]:
    """Collect the files under `repo_path` whose extension is listed in `extensions`.

    The extension is the lowercased text after the last "." of the file name.
    Directories whose path starts with `repo_path/_site` or `repo_path/.` are skipped.

    Args:
        repo_path (str): The directory to walk.
        extensions (List[str]): The accepted extensions, lowercase and without the dot.

    Returns:
        List[str]: The matching file paths, relative to `repo_path`.
    """
    site_prefix = os.path.join(repo_path, "_site")
    hidden_prefix = os.path.join(repo_path, ".")

    found: List[str] = []
    for directory, _, filenames in os.walk(repo_path):
        # ignore paths that start with a dot or in _site
        if directory.startswith(site_prefix) or directory.startswith(hidden_prefix):
            continue
        for filename in filenames:
            suffix = filename.split(".")[-1].lower()
            if suffix not in extensions:
                continue
            # Remove repo_path from the file path
            absolute_path = os.path.join(directory, filename)
            found.append(os.path.relpath(absolute_path, repo_path))
    return found
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def serve_and_take_screenshot(
    repo_path: str,
    destination_path: str,
    screenshot_options: ScreenshotOptions = ScreenshotOptions(),
    max_tries: int = 5,
) -> Dict[str, Any]:
    """Serve the repository with Jekyll, screenshot the served page, and stop the server.

    Args:
        repo_path (str): The directory of the Jekyll repository to serve.
        destination_path (str): Where the screenshot is saved.
        screenshot_options (ScreenshotOptions, optional): The options forwarded to
            `save_random_screenshot`. Defaults to ScreenshotOptions().
        max_tries (int, optional): The maximum number of screenshot attempts. Only
            failures whose message contains "net::ERR_CONNECTION_REFUSED" are retried,
            after restarting the server. Defaults to 5.

    Returns:
        Dict[str, Any]: The infos returned by `save_random_screenshot`.

    Raises:
        ValueError: If the server fails to start or no screenshot could be taken.
    """
    # Start the Jekyll server
    # The port is derived from the thread identifier; two threads whose identifiers
    # are equal modulo 1000 would end up with the same port.
    port: int = 4000 + int(threading.get_ident()) % 1000
    server = JekyllServer(repo_path, port=port, verbose=False)
    success: bool = server.start()
    if not success:
        # This runs on examples that are not expected to fail
        server.stop()
        raise ValueError(f"Jekyll server failed to start: {repo_path}")

    # Take a screenshot of a random page
    success = False
    error: Any = None
    infos: Dict[str, Any] = {}
    try:
        for _ in range(max_tries):
            try:
                infos = save_random_screenshot(destination_path, port=port, options=screenshot_options)
                success = True
                break
            except Exception as e:
                # Remember every failure, not only the retried ones, so that the
                # error raised below can always report what went wrong.
                error = e
                if "net::ERR_CONNECTION_REFUSED" not in str(e):
                    # Do not retry
                    break
                server.stop()
                time.sleep(0.5)
                server.start()
                time.sleep(0.5)
        if not success:
            raise ValueError(f"Failed to take a screenshot: {error}") from error
    finally:
        # Stop the server on success and on failure alike
        server.stop()
    time.sleep(0.1)

    return infos
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class WebpageScenario(Image2StructureScenario):
    """Image2Structure scenario in which a webpage must be recreated from its screenshot.

    Each dataset row carries a base64-encoded archive of a Jekyll repository. The
    repository is extracted, its image assets are copied to a persistent location, and
    the repository is served locally to take the screenshot.
    """

    BASE_PROMPT = (
        "Please generate the source code to generate a webpage that looks like this image as much as feasibly possible.\n"  # noqa: E501
        "You should output a json object associating each file name with its content.\n\n"
        "Here is a simple example of the expected structure (that does not correspond to the image)."
        " In this example, 3 files are created: index.html, style.css and script.js.\n"
        "[\n"
        " {\n"
        ' "filename": "index.html",\n'
        ' "content": "<!DOCTYPE html>\\n<html>\\n<head>\\n<title>Title of the document</title>\\n</head>\\n<body>\\n\\n<p>Content of the document......</p>\\n\\n</body>\\n</html>"\n'  # noqa: E501
        " },\n"
        " {\n"
        ' "filename": "style.css",\n'
        ' "content": "body {\\n background-color: lightblue;\\n}\\nh1 {\\n color: white;\\n text-align: center;\\n}"\n'  # noqa: E501
        " },\n"
        " {\n"
        ' "filename": "script.js",\n'
        ' "content": "document.getElementById(\\"demo\\").innerHTML = \\"Hello JavaScript!\\";"\n'
        " }\n"
        "]\n"
        "You do not have to create files with the same names. Create as many files as you need, you can even use directories if necessary,"  # noqa: E501
        " they will be created for you automatically. Try to write some realistic code keeping in mind that is should"
        " look like the image as much as feasibly possible."
    )

    HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-webpage"
    SUBSETS = ["css", "html", "javascript"]
    MAX_TRIES: int = 5
    ASSETS_EXTENSIONS: List[str] = ["png", "jpg", "jpeg", "gif", "svg", "webp", "ico", "bmp", "tiff"]

    name = "image2webpage"
    description = "Evaluate multimodal models on webpage generation to recreate a provided image"

    def __init__(
        self,
        subset: str,
        recompile_prompt: bool = True,
        split: str = VALID_SPLIT,
        screenshot_options: ScreenshotOptions = ScreenshotOptions(),
    ):
        """Create the scenario.

        Args:
            subset (str): Forwarded to the parent constructor.
            recompile_prompt (bool, optional): Forwarded to the parent constructor.
            split (str, optional): Forwarded to the parent constructor. Defaults to VALID_SPLIT.
            screenshot_options (ScreenshotOptions, optional): The options used when
                taking screenshots. Defaults to ScreenshotOptions().
        """
        super().__init__(subset, recompile_prompt, split)
        self._screenshot_options = screenshot_options
        self._html2text = HTML2Text()
        self._html2text.ignore_links = True

    def preprocess_row(self, row: Dict[str, Any], assets_path: str) -> Dict[str, Any]:
        """Extract the repository of the row and persist its image assets.

        The row is modified in place: "assets" is removed, "structure" is replaced by
        PROCESSED, and "assets_paths" / "assets_names" are added. When the repository
        had to be extracted, "repo_path" is added as well so `finalize` can remove it.

        Args:
            row (Dict[str, Any]): The dataset row; reads "uuid", "structure" and "assets".
            assets_path (str): The directory under which the assets of each row are saved.

        Returns:
            Dict[str, Any]: The same row object, updated.

        Raises:
            ValueError: If the output path is not set or the archive cannot be extracted.
        """
        # No need to reprocess if the assets are already saved
        # Convert to str before stripping the quotes so a non-string uuid also works.
        assets_save_path: str = os.path.join(assets_path, str(row["uuid"]).replace('"', ""))
        if os.path.exists(assets_save_path):
            try:
                # NOTE(review): pickle.load can execute arbitrary code. These files are
                # written by this method below; do not point assets_path at a location
                # that untrusted parties can write to.
                with open(os.path.join(assets_save_path, "assets_paths.pkl"), "rb") as f:
                    row["assets_paths"] = pickle.load(f)
                with open(os.path.join(assets_save_path, "assets_names.pkl"), "rb") as f:
                    row["assets_names"] = pickle.load(f)
                del row["assets"]
                row["structure"] = PROCESSED
                return row
            except Exception:
                # There was an issue when loading the assets, reprocess
                shutil.rmtree(assets_save_path)
        ensure_directory_exists(assets_save_path)

        # Structure is a base64 encoding of the repo
        if self._output_path is None:
            raise ValueError("Output path not set")
        repo_path = os.path.join(self._output_path, f"tmp{threading.get_ident()}_{self._subset}")
        ensure_directory_exists(repo_path)

        # Decode the base64 string which corresponds to an archive
        # and extract the files to the repo_path
        structure: str = row["structure"]
        extract_repo(structure, repo_path)
        row["structure"] = PROCESSED
        row["repo_path"] = repo_path  # Stored for cleanup

        # Process the assets
        asset_paths: List[str] = list_assets(repo_path, self.ASSETS_EXTENSIONS)
        del row["assets"]
        row["assets_paths"] = []
        row["assets_names"] = []
        # Copy each asset to a unique persistent path
        for i, asset_local_path in enumerate(asset_paths):
            asset_name: str = asset_local_path
            asset_dest_path = os.path.join(assets_save_path, f"{i}.{asset_local_path.split('.')[-1]}")
            shutil.copyfile(os.path.join(row["repo_path"], asset_local_path), asset_dest_path)
            row["assets_paths"].append(asset_dest_path)
            row["assets_names"].append(asset_name)

        # Save both assets_paths and assets_names as files than can be loaded
        with open(os.path.join(assets_save_path, "assets_paths.pkl"), "wb") as f:
            pickle.dump(row["assets_paths"], f)
        with open(os.path.join(assets_save_path, "assets_names.pkl"), "wb") as f:
            pickle.dump(row["assets_names"], f)

        return row

    def build_prompt(self, row: Dict[str, Any]) -> str:
        """Return BASE_PROMPT, followed by the list of asset names when the row has any.

        Args:
            row (Dict[str, Any]): A preprocessed row with "assets_paths" and "assets_names".

        Returns:
            str: The prompt.
        """
        prompt: str = self.BASE_PROMPT
        assert "assets_paths" in row, "No assets paths in the row"
        assert "assets_names" in row, "No assets names in the row"
        assert len(row["assets_paths"]) == len(row["assets_names"])
        if len(row["assets_names"]) > 0:
            prompt += "\nYou have access to the following assets:\n"
            for asset_local_path in row["assets_names"]:
                prompt += f"- {asset_local_path}\n"
        return prompt

    def compile_and_save(self, structure: str, assets_path: str, destination_path: str) -> str:
        """Serve the repository, save a screenshot of it, and return the page text.

        Args:
            structure (str): The path to the repository to serve.
            assets_path (str): Not used by this implementation.
            destination_path (str): Where the screenshot is saved.

        Returns:
            str: The text obtained from the HTML of the served page.
        """
        # Structure is the path to the repo
        # Serve and take screenshot
        repo_path: str = structure
        infos: Dict[str, Any] = serve_and_take_screenshot(repo_path, destination_path, self._screenshot_options)
        text: str = convert_html_to_text(self._html2text, infos["html"])
        return text

    def finalize(self, row: Dict[str, Any]) -> None:
        """Perform cleanup operations after the instance has been generated."""
        if "repo_path" in row:
            repo_path: str = row["repo_path"]
            shutil.rmtree(repo_path)
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
import os.path
|
|
2
|
+
from typing import List
|
|
3
|
+
|
|
4
|
+
from datasets import load_dataset
|
|
5
|
+
from tqdm import tqdm
|
|
6
|
+
|
|
7
|
+
from helm.benchmark.scenarios.scenario import (
|
|
8
|
+
CORRECT_TAG,
|
|
9
|
+
TEST_SPLIT,
|
|
10
|
+
TRAIN_SPLIT,
|
|
11
|
+
Input,
|
|
12
|
+
Instance,
|
|
13
|
+
Output,
|
|
14
|
+
Reference,
|
|
15
|
+
Scenario,
|
|
16
|
+
)
|
|
17
|
+
from helm.common.general import ensure_directory_exists, ensure_file_downloaded
|
|
18
|
+
from helm.common.media_object import MediaObject, MultimediaObject
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class MementosScenario(Scenario):
    """
    Mementos: A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences

    Mementos is a benchmark designed to assess the sequential image reasoning abilities of MLLMs.
    It features 4,761 diverse image sequences with varying lengths.

    @misc{wang2024mementos,
    title={Mementos: A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences},
    author={Xiyao Wang and Yuhang Zhou and Xiaoyu Liu and Hongjin Lu and Yuancheng Xu and Feihong He and Jaehong Yoon
    and Taixi Lu and Gedas Bertasius and Mohit Bansal and Huaxiu Yao and Furong Huang},
    year={2024},
    eprint={2401.10529},
    archivePrefix={arXiv},
    primaryClass={cs.CV}
    }

    Paper: https://arxiv.org/abs/2401.10529
    """

    MEMENTOS_HUGGINGFACE_DATASET_NAME: str = "shenmishajing/unofficial_mementos_dataset"

    IMAGE_URL: str = (
        "https://huggingface.co/datasets/shenmishajing/unofficial_mementos_dataset/resolve/main/"
        + "{subject}/{split}/{file_name}?download=true"
    )

    DATA_FILES: str = "{subject}/{split}/metadata.csv"

    QUESTION_PROMPT: str = (
        "Write a description for the given image sequence in a single paragraph, what is happening in this episode?"
    )

    SUBJECTS: List[str] = ["comics", "dailylife", "robotics"]

    name = "mementos"
    description = (
        "A Comprehensive Benchmark for Multimodal Large Language Model Reasoning over Image Sequences"
        " ([paper](https://arxiv.org/abs/2401.10529))."
    )
    tags = ["vision-language"]

    def __init__(self, subject: str):
        """Create the scenario for one subject; `subject` must be one of SUBJECTS."""
        super().__init__()
        assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
        self._subject: str = subject

    def get_instances(self, output_path: str) -> List[Instance]:
        """Build one instance per metadata row of the train and test splits.

        For every row, the image named by "file_name" is downloaded next to the dataset
        cache, and the row's "description" becomes the single correct reference.

        Args:
            output_path (str): The directory under which a per-subject folder is created.

        Returns:
            List[Instance]: The instances of the train split followed by the test split.
        """
        splits: List[str] = [TRAIN_SPLIT, TEST_SPLIT]

        subject_output_path: str = os.path.join(output_path, self._subject)
        ensure_directory_exists(subject_output_path)

        data_files = {}
        for split_name in splits:
            data_files[split_name] = self.DATA_FILES.format(subject=self._subject, split=split_name)

        instances: List[Instance] = []
        for split in splits:
            cur_output_path = os.path.join(subject_output_path, split)
            ensure_directory_exists(cur_output_path)

            # Load the metadata rows of the current split
            dataset = load_dataset(
                self.MEMENTOS_HUGGINGFACE_DATASET_NAME.format(subject=self._subject),
                data_files=data_files,
                split=split,
                cache_dir=cur_output_path,
            )
            for row in tqdm(dataset):
                # Download the image
                file_name: str = row["file_name"]
                local_image_path: str = os.path.join(cur_output_path, file_name)
                image_url: str = self.IMAGE_URL.format(subject=self._subject, split=split, file_name=file_name)
                ensure_file_downloaded(source_url=image_url, target_path=local_image_path, unpack=False)

                # The image comes first, followed by the question
                image = MediaObject(location=local_image_path, content_type="image/png")
                question = MediaObject(text=self.QUESTION_PROMPT, content_type="text/plain")
                content: List[MediaObject] = [image, question]

                answer: str = row["description"]
                reference = Reference(Output(text=answer), tags=[CORRECT_TAG])
                instance = Instance(
                    Input(multimedia_content=MultimediaObject(content)),
                    references=[reference],
                    split=split,
                )
                instances.append(instance)

            print()

        return instances
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def main():
    """Build the instances of the "robotics" subject under ./output and print them."""
    robotics_scenario = MementosScenario("robotics")
    print(robotics_scenario.get_instances("output"))


if __name__ == "__main__":
    main()
|