crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
- crfm_helm-0.5.0.dist-info/RECORD +642 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +37 -2
- helm/benchmark/adaptation/adapters/adapter.py +4 -42
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/prompt.py +7 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/cleva_perturbation.py +7 -14
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
- helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +2 -2
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +0 -7
- helm/benchmark/augmentations/perturbation.py +20 -7
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +2 -2
- helm/benchmark/augmentations/test_perturbation.py +11 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +38 -0
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +37 -7
- helm/benchmark/metrics/basic_metrics.py +172 -641
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics.py +4 -3
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +6 -112
- helm/benchmark/metrics/dry_run_metrics.py +5 -3
- helm/benchmark/metrics/efficiency_metrics.py +206 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +5 -5
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +6 -7
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +8 -8
- helm/benchmark/metrics/test_classification_metrics.py +9 -6
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +164 -41
- helm/benchmark/model_metadata_registry.py +181 -35
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +50 -17
- helm/benchmark/presentation/schema.py +28 -46
- helm/benchmark/presentation/summarize.py +213 -96
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +14 -9
- helm/benchmark/presentation/test_summarize.py +5 -0
- helm/benchmark/run.py +66 -54
- helm/benchmark/run_expander.py +342 -31
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +162 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +501 -0
- helm/benchmark/runner.py +116 -69
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/cleva_scenario.py +43 -46
- helm/benchmark/scenarios/code_scenario.py +3 -2
- helm/benchmark/scenarios/commonsense_scenario.py +171 -191
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/legalbench_scenario.py +123 -0
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +3 -3
- helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
- helm/benchmark/scenarios/raft_scenario.py +2 -6
- helm/benchmark/scenarios/scenario.py +14 -2
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +22 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/the_pile_scenario.py +6 -7
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
- helm/benchmark/server.py +59 -2
- helm/benchmark/slurm_jobs.py +12 -0
- helm/benchmark/slurm_runner.py +79 -51
- helm/benchmark/static/benchmarking.js +3 -4
- helm/benchmark/static/contamination.yaml +1 -1
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +824 -0
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vlm.yaml +576 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
- helm/benchmark/static_build/assets/index-d839df55.js +9 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_model_deployment_definition.py +90 -0
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/tokenizer_config_registry.py +10 -14
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -35
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/no_decoding_window_service.py +32 -0
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +24 -269
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +5 -12
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +213 -24
- helm/clients/auto_client.py +215 -0
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +67 -55
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +6 -17
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +7 -8
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +6 -10
- helm/{proxy/clients → clients}/huggingface_client.py +134 -92
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
- helm/{proxy/clients → clients}/megatron_client.py +13 -7
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +302 -0
- helm/{proxy/clients → clients}/palmyra_client.py +15 -12
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +15 -15
- helm/clients/test_client.py +100 -0
- helm/clients/test_huggingface_client.py +70 -0
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +23 -12
- helm/{proxy/clients → clients}/together_client.py +18 -71
- helm/clients/vertexai_client.py +391 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vlm_client.py +104 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +24 -179
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/concurrency.py +32 -0
- helm/common/credentials_utils.py +28 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +29 -10
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +24 -1
- helm/common/key_value_store.py +113 -0
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +88 -0
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +2 -2
- helm/common/request.py +36 -27
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +6 -3
- helm/config/__init__.py +0 -0
- helm/config/model_deployments.yaml +1942 -0
- helm/config/model_metadata.yaml +2201 -0
- helm/config/tokenizer_configs.yaml +362 -0
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +13 -5
- helm/proxy/example_queries.py +29 -17
- helm/proxy/retry.py +8 -2
- helm/proxy/server.py +77 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +103 -20
- helm/proxy/services/service.py +34 -2
- helm/proxy/services/test_remote_service.py +7 -6
- helm/proxy/services/test_service.py +27 -18
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/py.typed +0 -0
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
- helm/tokenizers/auto_tokenizer.py +93 -0
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_anthropic_tokenizer.py +82 -0
- helm/tokenizers/test_huggingface_tokenizer.py +136 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/tokenizers/vertexai_tokenizer.py +97 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.3.0.dist-info/RECORD +0 -396
- helm/benchmark/vlm_run_specs.py +0 -71
- helm/benchmark/window_services/anthropic_window_service.py +0 -68
- helm/benchmark/window_services/bloom_window_service.py +0 -35
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/gptj_window_service.py +0 -38
- helm/benchmark/window_services/gptneox_window_service.py +0 -41
- helm/benchmark/window_services/http_model_window_service.py +0 -28
- helm/benchmark/window_services/huggingface_window_service.py +0 -59
- helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
- helm/benchmark/window_services/llama_window_service.py +0 -28
- helm/benchmark/window_services/luminous_window_service.py +0 -67
- helm/benchmark/window_services/megatron_window_service.py +0 -10
- helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
- helm/benchmark/window_services/openai_window_service.py +0 -13
- helm/benchmark/window_services/opt_window_service.py +0 -35
- helm/benchmark/window_services/palmyra_window_service.py +0 -45
- helm/benchmark/window_services/remote_window_service.py +0 -48
- helm/benchmark/window_services/santacoder_window_service.py +0 -27
- helm/benchmark/window_services/starcoder_window_service.py +0 -27
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/benchmark/window_services/wider_openai_window_service.py +0 -52
- helm/proxy/clients/aleph_alpha_client.py +0 -99
- helm/proxy/clients/auto_client.py +0 -461
- helm/proxy/clients/goose_ai_client.py +0 -100
- helm/proxy/clients/microsoft_client.py +0 -182
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/remote_model_registry.py +0 -28
- helm/proxy/clients/simple_client.py +0 -61
- helm/proxy/clients/test_anthropic_client.py +0 -63
- helm/proxy/clients/test_client.py +0 -31
- helm/proxy/clients/test_huggingface_client.py +0 -87
- helm/proxy/models.py +0 -963
- helm/proxy/test_models.py +0 -27
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
- helm/proxy/token_counters/test_openai_token_counter.py +0 -79
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
helm/benchmark/scenarios/test_simple_scenarios.py
@@ -0,0 +1,50 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.simple_scenarios import (
+    SimpleMCQAScenario,
+    SimpleShortAnswerQAScenario,
+    SimpleClassificationScenario,
+)
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_simple_mcqa_scenario():
+    scenario = SimpleMCQAScenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = scenario.get_instances(tmpdir)
+        assert len(instances) == 90
+        assert instances[0].input == Input(text="Is 10 even or odd?")
+        assert instances[0].references == [
+            Reference(output=Output(text="Even"), tags=[CORRECT_TAG]),
+            Reference(output=Output(text="Odd"), tags=[]),
+        ]
+        assert instances[0].split == "train"
+
+
+@pytest.mark.scenarios
+def test_simple_short_answer_qa_scenario():
+    scenario = SimpleShortAnswerQAScenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = scenario.get_instances(tmpdir)
+        assert len(instances) == 90
+        assert instances[0].input == Input(text="Is 10 even or odd?")
+        assert instances[0].references == [
+            Reference(output=Output(text="Even"), tags=[CORRECT_TAG]),
+        ]
+        assert instances[0].split == "train"
+
+
+@pytest.mark.scenarios
+def test_simple_classification_scenario():
+    scenario = SimpleClassificationScenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = scenario.get_instances(tmpdir)
+        assert len(instances) == 90
+        assert instances[0].input == Input(text="10")
+        assert instances[0].references == [
+            Reference(output=Output(text="Even"), tags=[CORRECT_TAG]),
+            Reference(output=Output(text="Odd"), tags=[]),
+        ]
+        assert instances[0].split == "train"
helm/benchmark/scenarios/thai_exam_scenario.py
@@ -0,0 +1,135 @@
+import os
+from typing import Dict, List
+import json
+
+from helm.common.general import ensure_file_downloaded
+from helm.common.hierarchical_logger import hlog
+from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+
+
+class ThaiExamScenario(Scenario):
+    """
+    ThaiExam, a benchmark comprising Thai multiple-choice examinations as follows:
+
+    ∙ ONET: The Ordinary National Educational Test (ONET) is an examination for students in Thailand.
+    We select the grade-12 ONET exam, which comprises 5 subjects, and each question has 5 choices.
+    These subjects are Thai, English, Mathematics, Social Studies, and Science,
+    amounting to a total of 170 questions and options.
+
+    ∙ IC: The Investment Consultant (IC) examination is a licensing test for investment professionals in Thailand.
+    Developed by the Stock Exchange of Thailand (SET), it features 4 choices per question.
+    We extracted questions for levels 1, 2, and 3, resulting in a total of 95 questions and options.
+
+    ∙ TGAT: The Thai General Aptitude Test (TGAT) is a national high school examination in Thailand.
+    It focuses on critical and logical thinking skills.
+    We collected a total of 90 questions and answers. The TGAT consists of four choices per question.
+
+    ∙ TPAT-1: The Thai Professional Aptitude Test 1 (TPAT-1) is a national high school examination in Thailand.
+    The exam assesses the professional skills that medical schools require of students.
+    This subset contains reasoning and medical ethics. We collected a total of 116 questions and answers.
+    The TPAT-1 consists of 5 choices per question.
+
+    ∙ A-Level: An academic knowledge assessment examination (Applied Knowledge Level)
+    that covers general foundational subjects taught in schools.
+    The content assessed in this examination aligns with the curriculum guidelines
+    and emphasizes the practical application of knowledge in daily life.
+    We collected a total of 175 questions and answers.
+
+    We created and used these exams to evaluate the performance of the Typhoon models (https://arxiv.org/abs/2312.13951).
+
+    Prompt models using the following format:
+
+    <input>  # train
+    A. <reference>
+    B. <reference>
+    C. <reference>
+    D. <reference>
+    E. <reference>
+    Answer: <A/B/C/D/E>
+
+    x N (N-shot)
+
+    <input>  # test
+    A. <reference1>
+    B. <reference2>
+    C. <reference3>
+    D. <reference4>
+    E. <reference5>
+    Answer:
+
+    For example:
+
+    ในระบบย่อยอาหารของมนุษย์ การดูดซึมสารอาหารส่วนใหญ่เกิดขึ้นที่อวัยวะใด?
+    A. ลำไส้เล็ก
+    B. ตับอ่อน
+    C. ลำไส้ใหญ่
+    D. กระเพาะอาหาร
+    E. หัวใจ
+    Answer: A
+
+    ข้อใดอธิบายเกี่ยวกับแรงไฟฟ้าได้ถูกต้อง?
+    A. เกิดได้โดยที่วัตถุไม่ต้องสัมผัสกัน
+    B. เป็นได้เฉพาะแรงผลักเท่านั้น
+    C. เป็นได้เฉพาะแรงดูดเท่านั้น
+    D. เป็นแรงต้านระหว่างวัตถุเท่านั้น
+    E. ถูกทุกข้อ
+    Answer:
+
+    Target: A
+    """
+
+    name = "thai_exam"
+    description = "ThaiExam benchmark comprising Thai multiple-choice examinations."
+    tags = ["knowledge", "multiple_choice"]
+
+    def __init__(self, exam: str):
+        super().__init__()
+        self.exam = exam
+
+    def download_thai_exam(self, path: str):
+        ensure_file_downloaded(
+            "https://storage.googleapis.com/thai_dataset/thai_exam.tar.gz",
+            target_path=path,
+            unpack=True,
+        )
+
+    def process_jsonl(self, jsonl_path: str, split: str) -> List[Instance]:
+        instances: List[Instance] = []
+        hlog(f"Reading {jsonl_path}")
+        with open(jsonl_path, "r") as f:
+            for line in f:
+                data = json.loads(line)
+                # Handle missing keys, since some subjects don't have all 5 choices
+                answers = [data[key] for key in ["a", "b", "c", "d", "e"] if key in data and data[key] != ""]
+                answers_dict = dict(zip(["A", "B", "C", "D", "E"], answers))
+
+                question, correct_answer = data["question"], answers_dict[data["answer"].upper()]
+
+                def answer_to_reference(answer: str) -> Reference:
+                    return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
+
+                instance = Instance(
+                    input=Input(text=question),
+                    references=list(map(answer_to_reference, answers)),
+                    split=split,
+                )
+                instances.append(instance)
+        return instances
+
+    def get_instances(self, output_path) -> List[Instance]:
+        data_path: str = os.path.join(output_path, "data")
+        self.download_thai_exam(data_path)
+
+        instances: List[Instance] = []
+        splits: Dict[str, str] = {
+            "train": TRAIN_SPLIT,
+            "test": TEST_SPLIT,
+        }
+        for split in splits:
+            jsonl_path: str = os.path.join(data_path, self.exam, f"{self.exam}_{split}.jsonl")
+            if not os.path.exists(jsonl_path):
+                hlog(f"{jsonl_path} doesn't exist, skipping")
+                continue
+            instances.extend(self.process_jsonl(jsonl_path, splits[split]))
+
+        return instances
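For orientation, `process_jsonl` above maps one JSONL record to one `Instance`. Here is a minimal sketch of that mapping with a made-up record; only the keys `question`, `a`-`e`, and `answer` are taken from the code above, and the values are hypothetical:

# Hypothetical ThaiExam-style record; real rows come from {exam}_{split}.jsonl.
data = {"question": "1 + 1 = ?", "a": "1", "b": "2", "c": "3", "answer": "b"}

# Same logic as process_jsonl: keep non-empty choices in order, label them A-E,
# then look up the correct answer by its letter.
answers = [data[key] for key in ["a", "b", "c", "d", "e"] if key in data and data[key] != ""]
answers_dict = dict(zip(["A", "B", "C", "D", "E"], answers))
correct_answer = answers_dict[data["answer"].upper()]  # "2": only this choice gets CORRECT_TAG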
helm/benchmark/scenarios/the_pile_scenario.py
@@ -50,14 +50,13 @@ class ThePileScenario(Scenario):
         self.subset = subset
 
     @htrack(None)
-    def load_and_cache_all_subsets(self, output_path):
-        data_path = os.path.join(output_path, "data")
+    def load_and_cache_all_subsets(self, data_jsonl, output_path):
         subsets: Dict[str, List] = {subset: [] for subset in self.pile_subsets}
 
         # Load all data into memory
         with htrack_block("Loading"):
-            hlog(f"Loading all data from {data_path}")
-            with open(data_path) as f:
+            hlog(f"Loading all data from {data_jsonl}")
+            with open(data_jsonl) as f:
                 data = [json.loads(line) for line in f]
 
         # Classify the documents by subset
@@ -76,10 +75,10 @@ class ThePileScenario(Scenario):
 
     def get_instances(self, output_path: str) -> List[Instance]:
        # Download the raw data
-        data_path = os.path.join(output_path, "data")
+        data_jsonl = os.path.join(output_path, "data")
         ensure_file_downloaded(
             source_url="https://the-eye.eu/public/AI/pile/test.jsonl.zst",
-            target_path=data_path,
+            target_path=data_jsonl,
             unpack=True,
         )
 
@@ -87,7 +86,7 @@ class ThePileScenario(Scenario):
 
         # If the target subset does not exist, load and cache all subsets to the directory
         if not os.path.exists(subset_path):
-            self.load_and_cache_all_subsets(output_path)
+            self.load_and_cache_all_subsets(data_jsonl, output_path)
 
         # Read all the instances
         instances = []
helm/benchmark/scenarios/unitxt_scenario.py
@@ -0,0 +1,56 @@
+from typing import List
+from datasets import load_dataset
+
+from helm.benchmark.scenarios.scenario import (
+    Output,
+    Reference,
+    Scenario,
+    Instance,
+    Input,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    VALID_SPLIT,
+)
+
+
+class UnitxtScenario(Scenario):
+    """Integration with Unitxt: https://unitxt.rtfd.io/"""
+
+    name = "unitxt"
+    description = "Unitxt Scenarios"
+    tags = ["unitxt"]
+
+    UNITXT_SPLIT_NAME_TO_HELM_SPLIT_NAME = {
+        "train": TRAIN_SPLIT,
+        "test": TEST_SPLIT,
+        "validation": VALID_SPLIT,
+    }
+
+    def __init__(self, **kwargs):
+        super().__init__()
+        self.kwargs = kwargs
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        dataset_name = ",".join(f"{key}={value}" for key, value in self.kwargs.items())
+        dataset = load_dataset("unitxt/data", dataset_name, trust_remote_code=True)
+
+        instances: List[Instance] = []
+
+        for unitxt_split_name, helm_split_name in UnitxtScenario.UNITXT_SPLIT_NAME_TO_HELM_SPLIT_NAME.items():
+            for index, row in enumerate(dataset[unitxt_split_name]):
+                references = [
+                    Reference(
+                        output=Output(text=reference_text),
+                        tags=[CORRECT_TAG],
+                    )
+                    for reference_text in row["references"]
+                ]
+                instance = Instance(
+                    id=f"{unitxt_split_name}{index}",
+                    input=Input(text=row["source"]),
+                    references=references,
+                    split=helm_split_name,
+                )
+                instances.append(instance)
+        return instances
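`UnitxtScenario` builds the Unitxt dataset name by joining its keyword arguments, so recipe parameters pass straight through to `load_dataset`. A minimal sketch; the card and template names here are hypothetical examples, not taken from this diff:

# Same joining logic as get_instances above, applied to hypothetical kwargs.
kwargs = {"card": "cards.sst2", "template": "templates.classification.multi_class.default"}
dataset_name = ",".join(f"{key}={value}" for key, value in kwargs.items())
# -> "card=cards.sst2,template=templates.classification.multi_class.default",
# which is then passed to load_dataset("unitxt/data", dataset_name, trust_remote_code=True).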
helm/benchmark/scenarios/verifiability_judgment_scenario.py
@@ -137,7 +137,9 @@ class VerifiabilityJudgementScenario(Scenario):
         for _, filesplit in split_to_filesplit.items():
             target_name = f"verifiability_judgments_{filesplit}.jsonl"
             target_path: str = os.path.join(data_path, target_name)
-            url: str = f"https://github.com/nelson-liu/evaluating-verifiability-in-generative-search-engines/raw/40bf37e3a4eca7d82515df2c800ec9605458d637/verifiability_judgments/{target_name}.gz"  # noqa: E501
+            url: str = (
+                f"https://github.com/nelson-liu/evaluating-verifiability-in-generative-search-engines/raw/40bf37e3a4eca7d82515df2c800ec9605458d637/verifiability_judgments/{target_name}.gz"  # noqa: E501
+            )
             ensure_file_downloaded(source_url=url, target_path=target_path)
             assert os.path.exists(target_path)
 
helm/benchmark/scenarios/vicuna_scenario.py
@@ -26,7 +26,7 @@ class VicunaScenario(Scenario):
             return self.category == "all" or raw["category"] == self.category
 
         # Download the raw data
-        source_url = "https://raw.githubusercontent.com/lm-sys/FastChat/
+        source_url = "https://raw.githubusercontent.com/lm-sys/FastChat/v0.2.5/fastchat/eval/table/question.jsonl"
         data_path: str = os.path.join(output_path, "vicuna_questions.jsonl")
 
         ensure_file_downloaded(
helm/benchmark/scenarios/vision_language/bingo_scenario.py
@@ -0,0 +1,103 @@
+import os.path
+from typing import List
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+
+
+class BingoScenario(Scenario):
+    """
+    Holistic Analysis of Hallucination in GPT-4V(ision): Bias and Interference Challenges
+
+    We introduce a new benchmark, namely, the Bias and Interference Challenges in Visual Language Models (Bingo).
+    This benchmark is designed to evaluate and shed light on the two common types of hallucinations in visual
+    language models: bias and interference. Here, bias refers to the model's tendency to hallucinate certain types
+    of responses, possibly due to imbalance in its training data. Interference pertains to scenarios where the
+    judgment of GPT-4V(ision) can be disrupted due to how the text prompt is phrased or how the input image is
+    presented. The benchmark consists of open-ended question-answer pairs, and we employ open-ended generation
+    metrics for evaluation. In the experiment, we identify a notable regional bias, whereby GPT-4V(ision) is
+    better at interpreting Western images or images with English writing compared to images from other countries
+    or containing text in other languages.
+
+
+    @article{cui2023holistic,
+        title={Holistic analysis of hallucination in gpt-4v (ision): Bias and interference challenges},
+        author={Cui, Chenhang and Zhou, Yiyang and Yang, Xinyu and Wu, Shirley and Zhang, Linjun and
+        Zou, James and Yao, Huaxiu},
+        journal={arXiv preprint arXiv:2311.03287},
+        year={2023}
+    }
+
+    Paper: https://arxiv.org/abs/2311.03287
+    """
+
+    BINGO_HUGGINGFACE_DATASET_NAME: str = "PahaII/Bingo"
+
+    IMAGE_URL: str = "https://huggingface.co/datasets/PahaII/Bingo/resolve/main/images/{image_path}?download=true"
+
+    SUBJECTS: List[str] = ["T2I", "I2I", "OCR", "Factual", "Region"]
+
+    name = "bingo"
+    description = (
+        "Evaluate multimodal models on biased and inference-challenging scenarios with five subjects"
+        " ([paper](https://arxiv.org/abs/2311.03287))."
+    )
+    tags = ["vision-language"]
+
+    def __init__(self, subject: str):
+        super().__init__()
+        assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
+        self._subject: str = subject
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        images_path: str = os.path.join(output_path, "images")
+        ensure_directory_exists(images_path)
+
+        # There is only the test split in the Bingo benchmark
+        instances: List[Instance] = []
+        question_data_files = {TEST_SPLIT: f"{self._subject}.json"}
+
+        # Process the test set
+        for row in tqdm(
+            load_dataset(
+                self.BINGO_HUGGINGFACE_DATASET_NAME,
+                data_files=question_data_files,
+                split=TEST_SPLIT,
+                cache_dir=output_path,
+            )
+        ):
+            # Download the image
+            image_path: str = row["image_path"]
+            local_image_path: str = os.path.join(output_path, image_path)
+            ensure_file_downloaded(
+                source_url=self.IMAGE_URL.format(image_path=image_path),
+                target_path=local_image_path,
+                unpack=False,
+            )
+
+            content: List[MediaObject] = [
+                MediaObject(location=local_image_path, content_type="image/png"),
+                MediaObject(text=row["question"], content_type="text/plain"),
+            ]
+            answer: str = row["answer"]
+            instances.append(
+                Instance(
+                    Input(multimedia_content=MultimediaObject(content)),
+                    references=[Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
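The constructor validates the subject eagerly, so a misconfigured run spec fails before any download starts. A small sketch using the subjects listed above:

# A sketch of subject validation; BingoScenario is defined in the new file above.
from helm.benchmark.scenarios.vision_language.bingo_scenario import BingoScenario

scenario = BingoScenario(subject="OCR")  # one of ["T2I", "I2I", "OCR", "Factual", "Region"]
# BingoScenario(subject="Audio") would fail the assert with "Invalid subject: Audio"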
helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py
@@ -0,0 +1,92 @@
+import os.path
+from typing import List
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    ALL_SPLITS,
+    CORRECT_TAG,
+    VALID_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_directory_exists, ensure_file_downloaded
+
+
+class HatefulMemesScenario(Scenario):
+    """
+    The Hateful Memes Challenge and Dataset is a competition and open source dataset designed to
+    measure progress in multimodal vision-and-language classification.
+
+    In order for AI to become a more effective tool for detecting hate speech, it must be able to understand
+    content the way people do: holistically. When viewing a meme, for example, we don’t think about the words
+    and photo independently of each other; we understand the combined meaning. This is extremely challenging
+    for machines, however, because it means they can’t analyze the text and the image separately. They must
+    combine these different modalities and understand how the meaning changes when they are presented together.
+
+    @article{kiela2020hateful,
+        title={The Hateful Memes Challenge: Detecting Hate Speech in Multimodal Memes},
+        author={Kiela, Douwe and Firooz, Hamed and Mohan, Aravind and Goswami, Vedanuj and Singh,
+        Amanpreet and Ringshia, Pratik and Testuggine, Davide},
+        journal={arXiv preprint arXiv:2005.04790},
+        year={2020}
+    }
+
+    Paper: https://arxiv.org/abs/2005.04790
+    Website: https://ai.meta.com/tools/hatefulmemes/
+    """
+
+    IMAGE_URL: str = (
+        "https://huggingface.co/datasets/neuralcatcher/hateful_memes/resolve/main/{image_path}?download=true"
+    )
+
+    # This question is from the example on the website
+    QUESTION: str = "Is this meme mean?"
+
+    name = "hateful_memes"
+    description = "Identify if a meme is hateful or not ([paper](https://arxiv.org/abs/2005.04790))."
+    tags = ["vision-language"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        ensure_directory_exists(os.path.join(output_path, "img"))
+
+        instances: List[Instance] = []
+        for split in ALL_SPLITS:
+            for row in tqdm(
+                load_dataset(
+                    "neuralcatcher/hateful_memes",
+                    split="validation" if split == VALID_SPLIT else split,
+                    cache_dir=output_path,
+                )
+            ):
+                # Download the meme
+                image_path: str = row["img"]
+                local_image_path: str = os.path.join(output_path, image_path)
+                ensure_file_downloaded(
+                    source_url=self.IMAGE_URL.format(image_path=image_path),
+                    target_path=local_image_path,
+                    unpack=False,
+                )
+                # Some examples are missing images. Skip those for now
+                if not os.path.exists(local_image_path) or os.path.getsize(local_image_path) == 0:
+                    continue
+
+                content: List[MediaObject] = [
+                    MediaObject(location=local_image_path, content_type="image/jpeg"),
+                    MediaObject(text=self.QUESTION, content_type="text/plain"),
+                ]
+                answer: str = "Yes" if row["label"] == 1 else "No"
+                instances.append(
+                    Instance(
+                        Input(multimedia_content=MultimediaObject(content)),
+                        references=[Reference(Output(text=answer), tags=[CORRECT_TAG])],
+                        split=split,
+                    )
+                )
+
+        return instances
helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py
@@ -0,0 +1,113 @@
+from collections import Counter
+from dataclasses import dataclass
+from typing import Dict, List, Set
+import json
+import os
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    ALL_SPLITS,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_file_downloaded
+
+
+@dataclass(frozen=True)
+class HEIMHumanEvalReference(Reference):
+    # The number of human annotators who gave this reference or answer.
+    num_human_answered: int = 0
+
+
+class HEIMHumanEvalScenario(Scenario):
+    """
+    In [Holistic Evaluation of Text-To-Image Models (HEIM)](https://arxiv.org/abs/2311.04287), we evaluated 26
+    state-of-the-art text-to-image models across 12 different aspects (e.g., toxicity mitigation, unbiasedness,
+    originality, etc.). We used human annotators through AWS Mechanical Turk to evaluate the models for some of
+    these aspects (see image below).
+    This scenario contains the AI-generated images and human annotations for the following question types:
+
+    1. Alignment
+    2. Aesthetics
+    3. Clear subject
+    4. Originality
+    5. Photorealism
+
+    Citations:
+    - HEIM: https://arxiv.org/abs/2311.04287
+    - MS COCO: https://arxiv.org/abs/1405.0312
+    """
+
+    DATASET_DOWNLOAD_URL: str = (
+        "https://worksheets.codalab.org/rest/bundles/0x502d646c366c4f1d8c4a2ccf163b958f/contents/blob/"
+    )
+    VALID_QUESTION_TYPES: Set[str] = {"alignment", "aesthetics", "clear_subject", "originality", "photorealism"}
+
+    name = "heim_human_eval"
+    description = (
+        "Images generated by text-to-image models and human annotations for HEIM "
+        "([paper](https://arxiv.org/abs/2311.04287))."
+    )
+    tags = ["vision-language", "visual question answering", "image evaluation"]
+
+    def __init__(self, question_type: str):
+        super().__init__()
+        assert question_type in self.VALID_QUESTION_TYPES, f"Invalid question type: {question_type}"
+        self._question_type: str = question_type
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the dataset
+        output_path = os.path.join(output_path, "dataset")
+        ensure_file_downloaded(
+            source_url=self.DATASET_DOWNLOAD_URL, target_path=output_path, unpack=True, unpack_type="untar"
+        )
+
+        # Load the multiple-choice questions
+        with open(os.path.join(output_path, "questions.json")) as questions_file:
+            question_info: Dict = json.load(questions_file)[self._question_type]
+
+        instances: List[Instance] = []
+        for split in ALL_SPLITS:
+            annotations_split_path: str = os.path.join(output_path, f"{self._question_type}_{split}.jsonl")
+            with open(annotations_split_path) as f:
+                # Read each line/example as a JSON object
+                for line in f.readlines():
+                    image_annotation: Dict = json.loads(line)
+                    image_path: str = os.path.join(output_path, image_annotation["image_path"])
+                    assert os.path.exists(image_path), f"Image {image_path} does not exist"
+
+                    # Get the most common human answer(s) for the question
+                    human_answers: List[str] = [str(answer) for answer in image_annotation["human_annotations"]]
+                    human_answers_to_counts = Counter(human_answers)
+                    max_count: int = max(human_answers_to_counts.values())
+                    modes: List[str] = [value for value, count in human_answers_to_counts.items() if count == max_count]
+
+                    content: List[MediaObject] = [MediaObject(location=image_path, content_type="image/png")]
+                    if "prompt" in image_annotation:
+                        # Include the prompt in the content if it exists
+                        prompt: str = image_annotation["prompt"]
+                        content.append(MediaObject(text=f"Description: {prompt}", content_type="text/plain"))
+                    content.append(MediaObject(text=question_info["question"], content_type="text/plain"))
+
+                    references: List[Reference] = [
+                        HEIMHumanEvalReference(
+                            Output(text=answer),
+                            # The mode is the most common human answer and the reference we mark as correct
+                            tags=[CORRECT_TAG] if value in modes else [],
+                            num_human_answered=human_answers_to_counts[value],
+                        )
+                        for value, answer in question_info["choices"].items()
+                    ]
+                    instances.append(
+                        Instance(
+                            Input(multimedia_content=MultimediaObject(content)),
+                            references=references,
+                            split=split,
+                        )
+                    )
+
+        return instances
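A detail worth noting in the hunk above: every most-frequent human answer counts as a mode, so on a tie more than one reference carries `CORRECT_TAG`. The counting logic in isolation, with hypothetical annotator votes:

from collections import Counter

human_answers = ["1", "2", "2", "1"]  # hypothetical annotations
human_answers_to_counts = Counter(human_answers)
max_count = max(human_answers_to_counts.values())
modes = [value for value, count in human_answers_to_counts.items() if count == max_count]
# modes == ["1", "2"]: on a tie, both answer choices are tagged CORRECT_TAG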
helm/benchmark/scenarios/vision_language/image2structure/__init__.py
File without changes
helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py
@@ -0,0 +1,55 @@
+from typing import List
+import os
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+
+
+class Chart2CSVScenario(Scenario):
+    """
+    Chart to CSV
+    """
+
+    name = "chart2csv"
+    description = "Convert a chart to CSV."
+    tags = ["vision-language", "image2structure"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        assert os.path.exists(output_path), f"Dataset does not exist at {output_path}"
+        instances: List[Instance] = []
+
+        charts_path: str = os.path.join(output_path, "charts")
+        ground_truths_path: str = os.path.join(output_path, "groundtruth")
+
+        for chart_file in os.listdir(charts_path):
+            if not chart_file.endswith(".png"):
+                continue
+
+            chart_path: str = os.path.join(charts_path, chart_file)
+            ground_truth_file: str = chart_file.replace(".png", ".psv")
+            ground_truth_path: str = os.path.join(ground_truths_path, ground_truth_file)
+            assert os.path.exists(ground_truth_path), f"Ground truth does not exist at {ground_truth_path}"
+
+            content: List[MediaObject] = [
+                MediaObject(location=chart_path, content_type="image/png"),
+            ]
+            with open(ground_truth_path, "r") as file:
+                ground_truth: str = file.read().replace("|", ",")
+
+            instances.append(
+                Instance(
+                    Input(multimedia_content=MultimediaObject(content)),
+                    references=[Reference(Output(text=ground_truth), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
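The ground-truth `.psv` files are pipe-separated; the scenario converts them to CSV with a plain character replacement, which assumes cell values themselves never contain `|`. A sketch with hypothetical file contents:

psv_text = "year|sales\n2020|10\n2021|12"  # hypothetical .psv contents
csv_text = psv_text.replace("|", ",")      # -> "year,sales\n2020,10\n2021,12"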