crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +138 -31
- crfm_helm-0.5.1.dist-info/RECORD +654 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +31 -3
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/gender_perturbation.py +1 -1
- helm/benchmark/augmentations/perturbation.py +25 -3
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/test_perturbation.py +41 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +20 -7
- helm/benchmark/metrics/basic_metrics.py +169 -664
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -110
- helm/benchmark/metrics/dry_run_metrics.py +2 -2
- helm/benchmark/metrics/efficiency_metrics.py +213 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +392 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +2 -2
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +2 -2
- helm/benchmark/metrics/test_classification_metrics.py +8 -5
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +575 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +74 -0
- helm/benchmark/model_metadata_registry.py +41 -1
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +26 -10
- helm/benchmark/presentation/schema.py +15 -40
- helm/benchmark/presentation/summarize.py +119 -79
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +1 -2
- helm/benchmark/presentation/test_summarize.py +3 -3
- helm/benchmark/run.py +54 -26
- helm/benchmark/run_expander.py +205 -35
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +163 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/classic_run_specs.py +1510 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +757 -0
- helm/benchmark/runner.py +51 -57
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +1 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/legalbench_scenario.py +6 -2
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +1 -1
- helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
- helm/benchmark/scenarios/scenario.py +4 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +6 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +3 -4
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +5 -3
- helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
- helm/benchmark/server.py +24 -1
- helm/benchmark/slurm_runner.py +70 -49
- helm/benchmark/static/benchmarking.js +1 -1
- helm/benchmark/static/schema_classic.yaml +258 -1066
- helm/benchmark/static/schema_image2structure.yaml +304 -0
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +2 -227
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
- helm/benchmark/static/schema_vlm.yaml +823 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
- helm/benchmark/static_build/assets/index-878a1094.css +1 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -44
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +3 -9
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +233 -18
- helm/{proxy/clients → clients}/auto_client.py +59 -31
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +65 -7
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +4 -11
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +5 -5
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +5 -7
- helm/{proxy/clients → clients}/huggingface_client.py +43 -64
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
- helm/{proxy/clients → clients}/megatron_client.py +5 -5
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +301 -0
- helm/{proxy/clients → clients}/palmyra_client.py +6 -8
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +100 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +20 -8
- helm/{proxy/clients → clients}/together_client.py +104 -73
- helm/clients/vertexai_client.py +400 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
- helm/clients/vision_language/huggingface_vlm_client.py +111 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +54 -49
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +16 -4
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +1 -1
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +33 -3
- helm/common/key_value_store.py +35 -4
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/request.py +15 -17
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +1 -1
- helm/config/model_deployments.yaml +1159 -538
- helm/config/model_metadata.yaml +868 -41
- helm/config/tokenizer_configs.yaml +149 -43
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +8 -6
- helm/proxy/example_queries.py +29 -17
- helm/proxy/server.py +70 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +96 -16
- helm/proxy/services/service.py +30 -0
- helm/proxy/services/test_remote_service.py +4 -3
- helm/proxy/services/test_service.py +0 -12
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/test_model_deployment_definition.py +0 -92
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0

helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py
ADDED
@@ -0,0 +1,94 @@
+import csv
+import os
+from typing import List
+
+from helm.common.general import ensure_file_downloaded
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
+
+
+class PartiPromptsScenario(Scenario):
+    """
+    PartiPrompts (P2) is a set of 1600 diverse English prompts that allow us to more comprehensively
+    evaluate and test the limits of text-to-image synthesis models.
+
+    Each prompt in the P2 benchmark is associated with two labels:
+    1. Category: indicating a broad group that a prompt belongs to
+    2. Challenge: highlighting an aspect which makes a prompt difficult
+
+    Categories:
+    - Abstract: Descriptions that represent abstract concepts, including single words and simple numbers.
+    - World Knowledge: Descriptions focused on objects and places that exist in the real world.
+    - People: Descriptions where the primary participants are human beings (but not specific individuals,
+      living or dead).
+    - Animals: Descriptions in which the primary participants are animals.
+    - Illustrations: Descriptions of images that involve specific types of graphical representations,
+      including geometrical objects, diagrams, and symbols.
+    - Artifacts: Descriptions that represent abstract concepts, including single words and simple numbers.
+    - Food & Beverage: Descriptions of things animals, especially human beings, eat or drink.
+    - Vehicles: Descriptions where the focus is on man-made devices for transportation.
+    - Arts: Descriptions of existing paintings or intended to produce novel images in the format of a painting.
+    - Indoor Scenes: Descriptions about objects and participants that occur indoors.
+    - Outdoor Scenes: Descriptions about objects and participants that occur outdoors.
+    - Produce & Plants: Descriptions focused on plants or their products (fruits, vegetables, seeds, etc).
+
+    Challenges:
+    - Simple Detail: Descriptions that include only simple or high-level details.
+    - Fine-grained Detail: Descriptions that include very detailed specifications of attributes or
+      actions of entities or objects in a scene.
+    - Complex: Descriptions that include many fine-grained, interacting details or relationships between multiple
+      participants.
+    - Quantity: Descriptions that specify particular counts of occurrences of subjects in a scene.
+    - Style & Format: Descriptions that specifically focus on the visual manner in which a subject or scene
+      must be depicted.
+    - Properties & Positioning: Descriptions that target precise assignment of properties to entities or
+      objects (often in the context of multiple entities or objects), and/or the
+      relative spatial arrangement of entities and objects with respect to one
+      another or landmarks in the scene.
+    - Linguistic Structures: Long and/or abstract words or complex syntactic structures or semantic
+      ambiguities.
+    - Writing & Symbols: Descriptions that require words or symbols to be accurately represented
+      in the context of the visual scene.
+    - Imagination: Descriptions that include participants or interactions that are not, or are generally unlikely
+      to be, found in the modern day world.
+    - Basic: Descriptions about a single subject or concept with little to no detail or embellishment.
+    - Perspective: Descriptions that specify particular viewpoints or positioning of the subjects in a scene.
+
+    Paper: https://arxiv.org/abs/2206.10789
+    Website: https://parti.research.google/
+    """
+
+    DATASET_DOWNLOAD_URL: str = "https://raw.githubusercontent.com/google-research/parti/main/PartiPrompts.tsv"
+    ALL_CATEGORY: str = "all"
+
+    name = "parti_prompts"
+    description = (
+        "PartiPrompts (P2) is a set of 1600 diverse English prompts that allow to more comprehensively "
+        "evaluate and test the limits of text-to-image synthesis models ([paper](https://arxiv.org/abs/2206.10789))."
+    )
+    tags = ["text-to-image"]
+
+    def __init__(self, category: str):
+        super().__init__()
+        self.category: str = category
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        prompts_path: str = os.path.join(output_path, "prompts.tsv")
+        ensure_file_downloaded(source_url=self.DATASET_DOWNLOAD_URL, target_path=prompts_path)
+
+        instances: List[Instance] = []
+        with open(prompts_path) as f:
+            tsv_reader = csv.reader(f, delimiter="\t")
+            for i, row in enumerate(tsv_reader):
+                if i == 0:
+                    # Skip the header
+                    continue
+
+                prompt: str = row[0]
+                category: str = row[1]
+
+                # P2 does not have reference images
+                instance = Instance(Input(text=prompt), references=[], split=TEST_SPLIT)
+                if category.startswith(self.category) or self.category == self.ALL_CATEGORY:
+                    instances.append(instance)
+
+        return instances

helm/benchmark/scenarios/image_generation/radiology_scenario.py
ADDED
@@ -0,0 +1,42 @@
+from typing import List
+
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
+
+
+class RadiologyScenario(Scenario):
+    """
+    From "What Does DALL-E 2 Know About Radiology?", DALL-E 2 fails to generate realistic
+    X-ray, CT, MRI and ultrasound images. For example, generated images of X-rays have
+    smaller bones missing, extra joints added, etc.
+
+    Prompts are in the following format:
+
+    “An x-ray of {head, chest, shoulder, abdomen, pelvis, hand, knee, ankle}” => 8 prompts
+    “An MRI of the {heart, liver, kidney}” => 3 prompts
+    “A CT of the {heart, liver, kidney}” => 3 prompts
+    “Ultrasound of the {heart, liver, kidney}” => 3 prompts
+
+    Paper: https://arxiv.org/abs/2209.13696
+    """
+
+    X_RAY_ANATOMICAL_AREA: List[str] = ["head", "chest", "shoulder", "abdomen", "pelvis", "hand", "knee", "ankle"]
+    ORGANS: List[str] = ["heart", "liver", "kidney"]
+
+    name = "radiology"
+    description = "Prompts to generate radiological images ([paper](https://arxiv.org/abs/2209.13696))."
+    tags = ["text-to-image", "knowledge"]
+
+    def get_instances(self, _) -> List[Instance]:
+        prompts: List[str] = []
+
+        for anatomical_area in self.X_RAY_ANATOMICAL_AREA:
+            prompts.append(f"An x-ray of {anatomical_area}")
+
+        for organ in self.ORGANS:
+            prompts.append(f"An MRI of the {organ}")
+            prompts.append(f"A CT of the {organ}")
+            prompts.append(f"Ultrasound of the {organ}")
+
+        assert len(prompts) == 17, "Invalid number of prompts"
+        # There are no reference images
+        return [Instance(Input(text=prompt), references=[], split=TEST_SPLIT) for prompt in prompts]

helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py
ADDED
@@ -0,0 +1,52 @@
+from typing import List, Set
+import csv
+import os
+
+from helm.common.general import ensure_file_downloaded
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
+
+
+class RelationalUnderstandingScenario(Scenario):
+    """
+    From Testing Relational Understanding in Text-Guided Image Generation, based on existing cognitive,
+    linguistic, and developmental literature, the authors created a set of 15 relations (8 physical,
+    7 agentic) and a set of 12 entities (6 objects, 6 agents). The physical relations were: in, on,
+    under, covering, near, occluded by, hanging over, and tied to. The agentic relations were: pushing,
+    pulling, touching, hitting, kicking, helping, and hindering. The objects were: box, cylinder,
+    blanket, bowl, teacup, and knife. The agents were: man, woman, child, robot, monkey, and iguana.
+
+    The authors created 5 different prompts for each relation, by randomly sampling two entities five
+    times, resulting in 75 distinct basic relation prompts (e.g., a monkey touching an iguana). With
+    these prompts, the authors showed that DALL-E 2 suffers from a significant lack of commonsense
+    reasoning in the form of relational understanding.
+
+    Paper: https://arxiv.org/abs/2208.00005
+    Website: https://osf.io/sm68h
+    """
+
+    name = "relational_understanding"
+    description = (
+        "Consists of 75 basic relation prompts that tests commonsense reasoning "
+        "([paper](https://arxiv.org/abs/2208.00005))."
+    )
+    tags = ["text-to-image", "reasoning"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path: str = os.path.join(output_path, "choice_data.csv")
+        ensure_file_downloaded(source_url="https://osf.io/download/tb3a4", target_path=data_path)
+
+        instances: List[Instance] = []
+        seen_prompts: Set[str] = set()
+        with open(data_path) as csv_file:
+            csv_reader = csv.reader(csv_file, delimiter=",")
+            for i, row in enumerate(csv_reader):
+                if i == 0:
+                    # Skip the header
+                    continue
+
+                prompt: str = row[1]
+                if prompt not in seen_prompts:
+                    instances.append(Instance(Input(text=prompt), references=[], split=TEST_SPLIT))
+                    seen_prompts.add(prompt)
+
+        return instances
helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py
ADDED
@@ -0,0 +1,124 @@
+from typing import List
+
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
+
+
+class TIMEMostSignificantHistoricalFigures(Scenario):
+    """
+    People from TIME's "The 100 Most Significant Figures in History" list.
+
+    https://ideas.time.com/2013/12/10/whos-biggest-the-100-most-significant-figures-in-history/
+    """
+
+    HISTORICAL_FIGURES: List[str] = [
+        "Jesus",
+        "Napoleon Bonaparte",
+        "Muhammad",
+        "William Shakespeare",
+        "Abraham Lincoln",
+        "George Washington",
+        "Adolf Hitler",
+        "Aristotle",
+        "Alexander the Great",
+        "Thomas Jefferson",
+        "Henry VIII of England",
+        "Charles Darwin",
+        "Elizabeth I of England",
+        "Karl Marx",
+        "Julius Caesar",
+        "Queen Victoria",
+        "Martin Luther",
+        "Joseph Stalin",
+        "Albert Einstein",
+        "Christopher Columbus",
+        "Isaac Newton",
+        "Charlemagne",
+        "Theodore Roosevelt",
+        "Wolfgang Amadeus Mozart",
+        "Plato",
+        "Louis XIV of France",
+        "Ludwig van Beethoven",
+        "Ulysses S.Grant",
+        "Leonardo da Vinci",
+        "Augustus",
+        "Carl Linnaeus",
+        "Ronald Reagan",
+        "Charles Dickens",
+        "Paul the Apostle",
+        "Benjamin Franklin",
+        # "George W.Bush",
+        "Winston Churchill",
+        "Genghis Khan",
+        "Charles I of England",
+        "Thomas Edison",
+        "James I of England",
+        "Friedrich Nietzsche",
+        "Franklin D.Roosevelt",
+        "Sigmund Freud",
+        "Alexander Hamilton",
+        "Mohandas Karamchand Gandhi",
+        "Woodrow Wilson",
+        "Johann Sebastian Bach",
+        "Galileo Galilei",
+        "Oliver Cromwell",
+        "James Madison",
+        "Gautama Buddha",
+        "Mark Twain",
+        "Edgar Allan Poe",
+        "Joseph Smith, Jr.",
+        "Adam Smith",
+        "David, King of Israel",
+        "George III of the United Kingdom",
+        "Immanuel Kant",
+        "James Cook",
+        "John Adams",
+        "Richard Wagner",
+        "Pyotr Ilyich Tchaikovsky",
+        "Voltaire",
+        "Saint Peter",
+        "Andrew Jackson",
+        "Constantine the Great",
+        "Socrates",
+        "Elvis Presley",
+        "William the Conqueror",
+        "John F.Kennedy",
+        "Augustine of Hippo",
+        "Vincent van Gogh",
+        "Nicolaus Copernicus",
+        "Vladimir Lenin",
+        "Robert E.Lee",
+        "Oscar Wilde",
+        "Charles II of England",
+        "Cicero",
+        "Jean-Jacques Rousseau",
+        "Francis Bacon",
+        "Richard Nixon",
+        "Louis XVI of France",
+        "Charles V, Holy Roman Emperor",
+        "King Arthur",
+        "Michelangelo",
+        "Philip II of Spain",
+        "Johann Wolfgang von Goethe",
+        "Ali, founder of Sufism",
+        "Thomas Aquinas",
+        "Pope John Paul II",
+        "René Descartes",
+        "Nikola Tesla",
+        "Harry S.Truman",
+        "Joan of Arc",
+        "Dante Alighieri",
+        "Otto von Bismarck",
+        "Grover Cleveland",
+        "John Calvin",
+        "John Locke",
+    ]
+
+    name = "time_most_significant_historical_figures"
+    description = 'People from TIME\'s "The 100 Most Significant Figures in History" list.'
+    tags = ["text-to-image", "knowledge"]
+
+    def get_instances(self, _) -> List[Instance]:
+        return [
+            Instance(Input(text=historical_figure), references=[], split=TEST_SPLIT)
+            for historical_figure in self.HISTORICAL_FIGURES
+        ]

helm/benchmark/scenarios/image_generation/winoground_scenario.py
ADDED
@@ -0,0 +1,62 @@
+from typing import List
+import os
+
+from datasets import load_dataset
+
+from helm.common.general import get_file_name
+from helm.common.images_utils import copy_image
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Output, Reference, CORRECT_TAG, TEST_SPLIT
+
+
+class WinogroundScenario(Scenario):
+    """
+    Winoground is a novel task and dataset for evaluating the ability of vision and language models
+    to conduct visio-linguistic compositional reasoning. Given two images and two captions, the
+    goal is to match them correctly—but crucially, both captions contain a completely identical set
+    of words/morphemes, only in a different order. The dataset was carefully hand-curated by
+    expert annotators and is labeled with a rich set of fine-grained tags to assist in analyzing
+    model performance.
+
+    Users must agree to share their contact information before downloading the dataset from
+    Hugging Face. Either agree to the terms and set HUGGING_FACE_ACCESS_TOKEN to an access token
+    of a valid Hugging Face account or have the dataset pre-downloaded at the Hugging Face cache
+    (default path: ~/.cache/huggingface/datasets).
+
+    Paper: https://arxiv.org/abs/2204.03162
+    Website: https://huggingface.co/datasets/facebook/winoground
+    """
+
+    name = "winoground"
+    description = (
+        "Winoground is a novel task and dataset for evaluating the ability of vision and language models "
+        "to conduct visio-linguistic compositional reasoning "
+        "([paper](https://arxiv.org/abs/2204.03162))."
+    )
+    tags = ["text-to-image", "image-to-text", "visual_reasoning"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        auth_token: str = os.environ.get("HUGGING_FACE_ACCESS_TOKEN", "")
+
+        instances: List[Instance] = []
+        for row in load_dataset("facebook/winoground", split="test", use_auth_token=auth_token):
+            # Use the first example of the pair for now (index 0)
+            caption: str = row["caption_0"]
+            image_path: str = row["image_0"].filename
+
+            # Create a copy of the image in the benchmark output folder for metrics computation
+            image_copy_path: str = os.path.join(output_path, get_file_name(image_path))
+            if not os.path.exists(image_copy_path):
+                copy_image(image_path, image_copy_path)
+            content: MultimediaObject = MultimediaObject(
+                [MediaObject(content_type="image/png", location=image_copy_path)]
+            )
+
+            instances.append(
+                Instance(
+                    input=Input(text=caption),
+                    references=[Reference(Output(multimedia_content=content), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+            )
+        return instances

helm/benchmark/scenarios/imdb_scenario.py
@@ -107,7 +107,6 @@ class IMDBScenario(Scenario):
         contrast_map = {}
 
         for orig_line, contrast_line in zip(orig_and_contrast_inputs[0], orig_and_contrast_inputs[1]):
-
             orig_label_name, orig_context = orig_line.strip().split("\t")
             orig_label = label_name_to_id[orig_label_name]
 
helm/benchmark/scenarios/legalbench_scenario.py
@@ -96,8 +96,12 @@ class LegalBenchScenario(Scenario):
 
         # Download data from Huggingface. LegalBench provides splits for samples to
         # be used for prompt construction and for testing.
-        train_dataset = datasets.load_dataset(
-
+        train_dataset = datasets.load_dataset(
+            "nguha/legalbench", self.subset, trust_remote_code=True, cache_dir=cache_dir, split="train"
+        )
+        test_dataset = datasets.load_dataset(
+            "nguha/legalbench", self.subset, trust_remote_code=True, cache_dir=cache_dir, split="test"
+        )
         assert isinstance(train_dataset, datasets.Dataset)
         assert isinstance(test_dataset, datasets.Dataset)
 
helm/benchmark/scenarios/live_qa_scenario.py
ADDED
@@ -0,0 +1,94 @@
+import os
+from typing import List
+from xml.etree.ElementTree import Element
+import xml.etree.ElementTree as ET
+
+from helm.common.general import ensure_file_downloaded
+from .scenario import CORRECT_TAG, TEST_SPLIT, Input, Instance, Output, Reference, Scenario
+
+
+class LiveQAScenario(Scenario):
+    """
+    TREC-2017 LiveQA: Medical Question Answering Task
+
+    The LiveQA'17 medical task focuses on consumer health question answering.
+    Please refer to the original paper for more information about the constructed datasets and the LiveQA Track:
+    https://trec.nist.gov/pubs/trec26/papers/Overview-QA.pdf
+
+    Paper citation:
+
+        @inproceedings{LiveMedQA2017,
+            author = {Asma {Ben Abacha} and Eugene Agichtein and Yuval Pinter and Dina Demner{-}Fushman},
+            title = {Overview of the Medical Question Answering Task at TREC 2017 LiveQA},
+            booktitle = {TREC 2017},
+            year = {2017}
+        }
+    """
+
+    SOURCE_REPO_URL = "https://raw.githubusercontent.com/abachaa/LiveQA_MedicalTask_TREC2017/master/TestDataset/"
+    FILENAME = "TREC-2017-LiveQA-Medical-Test-Questions-w-summaries.xml"
+
+    name = "live_qa"
+    description = "TREC-2017 LiveQA: Medical Question Answering Task"
+    tags = ["knowledge", "generation", "question_answering", "biomedical"]
+
+    def download_liveqa(self, path: str):
+        """Download the XML file containing the questions & reference answers"""
+        ensure_file_downloaded(
+            source_url=os.path.join(self.SOURCE_REPO_URL, self.FILENAME),
+            target_path=os.path.join(path, self.FILENAME),
+            unpack=False,
+        )
+
+    @staticmethod
+    def remove_whitespace(s: str) -> str:
+        """Just remove all whitespace from a string"""
+        return " ".join(s.strip().split())
+
+    @staticmethod
+    def _extract_question_id(element: Element):
+        return element.attrib["qid"]
+
+    @classmethod
+    def _extract_question(cls, element: Element) -> str:
+        """Given an XML Element representing a question, extract just the question as text"""
+        return cls.remove_whitespace(element.find("NLM-Summary").text)  # type: ignore
+
+    @classmethod
+    def _extract_answers(cls, element: Element) -> List[str]:
+        """Given an XML Element representing a question, extract the reference answers"""
+        answers = []
+        for answer in element.iter("ANSWER"):
+            answers.append(cls.remove_whitespace(answer.text))  # type: ignore
+
+        return answers
+
+    def process_xml(self, base_path: str) -> List[Instance]:
+        """Parse the XMLs into question-answer(s) pairs"""
+        xml_path = os.path.join(base_path, self.FILENAME)
+        tree = ET.parse(xml_path)
+        root = tree.getroot()
+
+        instances = []
+        for question_root in root:
+            # get the actual question and question ID
+            id = self._extract_question_id(question_root)
+            question = Input(self._extract_question(question_root))
+
+            # parse out the reference answers
+            answers = self._extract_answers(question_root)
+            references = [Reference(Output(answer), tags=[CORRECT_TAG]) for answer in answers]
+
+            # stitch it all together
+            instances.append(Instance(question, references, split=TEST_SPLIT, id=id))
+
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """entrypoint to creating this scenario's instances"""
+        # get the dataset
+        self.download_liveqa(output_path)
+
+        # get the instances by parsing the XML
+        instances = self.process_xml(output_path)
+        return instances