crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +134 -31
- crfm_helm-0.5.0.dist-info/RECORD +642 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +31 -3
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/gender_perturbation.py +1 -1
- helm/benchmark/augmentations/perturbation.py +8 -2
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/test_perturbation.py +11 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +20 -7
- helm/benchmark/metrics/basic_metrics.py +169 -664
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -110
- helm/benchmark/metrics/dry_run_metrics.py +2 -2
- helm/benchmark/metrics/efficiency_metrics.py +206 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +2 -2
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +2 -2
- helm/benchmark/metrics/test_classification_metrics.py +8 -5
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +74 -0
- helm/benchmark/model_metadata_registry.py +36 -0
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +26 -10
- helm/benchmark/presentation/schema.py +15 -40
- helm/benchmark/presentation/summarize.py +119 -79
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +1 -2
- helm/benchmark/presentation/test_summarize.py +3 -3
- helm/benchmark/run.py +54 -26
- helm/benchmark/run_expander.py +214 -16
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +162 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/classic_run_specs.py +1510 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +501 -0
- helm/benchmark/runner.py +51 -57
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +1 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +1 -1
- helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
- helm/benchmark/scenarios/scenario.py +4 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +6 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -2
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
- helm/benchmark/server.py +24 -1
- helm/benchmark/slurm_runner.py +70 -49
- helm/benchmark/static/benchmarking.js +1 -1
- helm/benchmark/static/schema_classic.yaml +258 -1066
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +2 -227
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vlm.yaml +576 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
- helm/benchmark/static_build/assets/index-d839df55.js +9 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_model_deployment_definition.py +14 -16
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -44
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +3 -9
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +203 -18
- helm/{proxy/clients → clients}/auto_client.py +59 -31
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +65 -7
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +4 -11
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +5 -5
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +5 -7
- helm/{proxy/clients → clients}/huggingface_client.py +43 -64
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
- helm/{proxy/clients → clients}/megatron_client.py +5 -5
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +302 -0
- helm/{proxy/clients → clients}/palmyra_client.py +6 -8
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +100 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +20 -8
- helm/{proxy/clients → clients}/together_client.py +12 -72
- helm/clients/vertexai_client.py +391 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vlm_client.py +104 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +53 -48
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +16 -4
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +1 -1
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +24 -1
- helm/common/key_value_store.py +35 -4
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/request.py +15 -17
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +1 -1
- helm/config/model_deployments.yaml +1069 -546
- helm/config/model_metadata.yaml +753 -31
- helm/config/tokenizer_configs.yaml +142 -43
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +8 -6
- helm/proxy/example_queries.py +29 -17
- helm/proxy/server.py +70 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +96 -16
- helm/proxy/services/service.py +30 -0
- helm/proxy/services/test_remote_service.py +4 -3
- helm/proxy/services/test_service.py +0 -12
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
helm/benchmark/metrics/metric.py
CHANGED
```diff
@@ -1,7 +1,7 @@
-from abc import ABC
+from abc import ABC, abstractmethod
 from dataclasses import dataclass, replace
 from collections import defaultdict
-from typing import List, Dict, Tuple, Optional, Iterable
+from typing import List, Dict, Tuple, Optional, Iterable
 
 from helm.common.object_spec import ObjectSpec, create_object
 from helm.common.general import singleton, parallel_map
@@ -67,35 +67,36 @@ class Processor:
     eval_cache_path: str
     adapter_spec: AdapterSpec
 
-    def process(self,
+    def process(self, generation_states: List[RequestState]) -> List[Stat]:
         instance_stats: List[Stat] = []
 
         # Evaluate generated request_state
-        generation_states
-
-
-
-
-            )
-        )
-
-        # Evaluate the references
-        references_states = request_state_set.references_states
-        if len(references_states) != 0:
-            instance_stats.extend(
-                self.metric.evaluate_references(
-                    self.adapter_spec, references_states, self.metric_service, self.eval_cache_path
-                )
+        if len(generation_states) == 0:
+            return instance_stats
+        instance_stats.extend(
+            self.metric.evaluate_generation(
+                self.adapter_spec, singleton(generation_states), self.metric_service, self.eval_cache_path
             )
+        )
 
         # Add instance-related context (e.g., split, perturbation) to the metrics
         for i, stat in enumerate(instance_stats):
-            instance_stats[i] = add_context(stat, MetricContext.from_instance(
+            instance_stats[i] = add_context(stat, MetricContext.from_instance(generation_states[0].instance))
 
         return instance_stats
 
 
-class
+class MetricInterface(ABC):
+    """Interface for all Metrics."""
+
+    @abstractmethod
+    def evaluate(
+        self, scenario_state: ScenarioState, metric_service: MetricService, eval_cache_path: str, parallelism: int
+    ) -> MetricResult:
+        pass
+
+
+class Metric(MetricInterface, ABC):
     """
     A `Metric` takes the results of execution and produces `Stat`s for a
     scenario.
@@ -116,8 +117,11 @@ class Metric(ABC):
         Any logic that doesn't decompose along instances should go here, such
         as robustness.
         """
-
-
+        assert scenario_state.adapter_spec.method != ADAPT_LANGUAGE_MODELING, (
+            "Metric no longer knows how to handle the language modeling adapter. "
+            + "All run_specs with that adapter should use LanguageModelingMetric. "
+            + "If you are seeing this issue, please file a Github issue."
+        )
 
         adapter_spec = scenario_state.adapter_spec
         global_stats: Dict[MetricName, Stat] = {}
@@ -125,18 +129,9 @@ class Metric(ABC):
 
         for train_trial_index in range(adapter_spec.num_train_trials):
             # Construct inputs
-
+            generation_state_sets: List[List[RequestState]] = []
             for instance in scenario_state.instances:
-
-                references_states = []
-                for reference_index in range(len(instance.references)):
-                    references_states.extend(
-                        scenario_state.get_request_states(train_trial_index, instance, reference_index)
-                    )
-                request_state_set = RequestStateSet(
-                    instance=instance, generation_states=generation_states, references_states=references_states
-                )
-                request_state_sets.append(request_state_set)
+                generation_state_sets.append(scenario_state.get_request_states(train_trial_index, instance, None))
 
             # Do it!
             processor = Processor(
@@ -147,7 +142,7 @@ class Metric(ABC):
             )
             results: List[List[Stat]] = parallel_map(
                 processor.process,
-
+                generation_state_sets,
                 parallelism=parallelism,
             )
 
@@ -194,22 +189,10 @@ class Metric(ABC):
                 for stat in self.derive_per_instance_stats(instance_dict):
                     merge_stat(trial_stats, add_context(stat, context))
 
-            # Compute statistics that depend on all the `RequestStates` (e.g., bias metrics).
-            # Aggregate request states and call evaluate_instances in case the metric needs it.
-            grouped_request_states: Dict[MetricContext, List[RequestState]] = defaultdict(list)
-            for instance in scenario_state.instances:
-                # TODO: do we need to support reference_index that is not None?
-                grouped_request_states[MetricContext.from_instance(instance)].extend(
-                    scenario_state.get_request_states(train_trial_index, instance, None)
-                )
-            for context, request_states in grouped_request_states.items():
-                for stat in self.evaluate_instances(request_states):
-                    merge_stat(trial_stats, add_context(stat, context))
-
             # Compute worst-case metrics.
             # This is here since we want these stats for all metrics and they
             # aggregate across contexts (perturbations).
-            worst_case_stats =
+            worst_case_stats = compute_worst_case_metrics(dict(zip(scenario_state.instances, results)))
             for stat in worst_case_stats:
                 merge_stat(trial_stats, stat)
 
@@ -222,6 +205,7 @@ class Metric(ABC):
         # Wrap aggregated and per-instance stats in a MetricResult.
         return MetricResult(list(global_stats.values()), all_per_instance_stats)
 
+    @abstractmethod
     def evaluate_generation(
         self,
         adapter_spec: AdapterSpec,
@@ -230,21 +214,7 @@ class Metric(ABC):
         eval_cache_path: str,
     ) -> List[Stat]:
         """Evaluate free-form generation. Override me!"""
-
-
-    def evaluate_references(
-        self,
-        adapter_spec: AdapterSpec,
-        reference_request_states: List[RequestState],
-        metric_service: MetricService,
-        eval_cache_path: str,
-    ) -> List[Stat]:
-        """Evaluate the references. Override me!"""
-        return []
-
-    def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
-        """Evaluate all request states directly. Use only if nothing else works. Override me!"""
-        return []
+        pass
 
     def derive_stats(self, stats_dict: Dict[MetricName, Stat]) -> List[Stat]:
         """Derive stats based on existing stats, e.g., for perplexity. Override me!"""
@@ -254,119 +224,70 @@ class Metric(ABC):
         """Derive stats based on existing per-instance stats, e.g., for calibration. Override me!"""
         return []
 
-    def evaluate_language_modeling(
-        self, scenario_state: ScenarioState, metric_service: MetricService, eval_cache_path: str
-    ) -> MetricResult:
-        global_stats: Dict[MetricName, Stat] = {}
-        # The first and only trial
-        trial_stats: Dict[MetricName, Stat] = {}
-        # Per-instance stats
-        all_per_instance_stats: List[PerInstanceStats] = []
-        instance_ids_per_context: Dict[MetricContext, Set[str]] = defaultdict(set)
-
-        for request_state in scenario_state.request_states:
-            # Evaluate request_state
-            request_stats = self.evaluate_generation(
-                scenario_state.adapter_spec, request_state, metric_service, eval_cache_path
-            )
-
-            # Add instance-related context (e.g., split, perturbation) to the metrics
-            for i, stat in enumerate(request_stats):
-                context = MetricContext.from_instance(request_state.instance)
-                request_stats[i] = add_context(stat, context)
-                assert request_state.instance.id is not None
-                instance_ids_per_context[context].add(request_state.instance.id)
-
-            # Use trial index of 0 here since we run only one trial for LM
-            assert request_state.instance.id is not None
-            all_per_instance_stats.append(
-                PerInstanceStats(request_state.instance.id, request_state.instance.perturbation, 0, request_stats)
-            )
-
-            for stat in request_stats:
-                merge_stat(trial_stats, stat)
-
-        # group stats according to the context (e.g., split, perturbation) and call derive_stats on each grouping
-        grouped_trial_stats: Dict[MetricContext, Dict[MetricName, Stat]] = defaultdict(dict)
-        for metric_name, stat in trial_stats.items():
-            grouped_trial_stats[MetricContext.from_metric_name(metric_name)][metric_name] = stat  # group by context
-
-        for context, stats_dict in grouped_trial_stats.items():
-            for stat in self.derive_stats(stats_dict):
-                merge_stat(trial_stats, add_context(stat, context))
-            # keep track of how many instances are in each subset
-            num_instances_stat = Stat(MetricName("num_instances")).add(len(instance_ids_per_context[context]))
-            merge_stat(trial_stats, add_context(num_instances_stat, context))
 
-
-
-
-
-
-
-    For each instance, we compute the worst case perfomance between each perturbation and the non-perturbed input
-    (perturbation=None). This allows us to reason about the invariances of a model as opposed to just looking
-    at its performance on perturbed inputs. We also compute the worst case performance across all robustness-related
-    and fairness-related perturbations (including the original input in both).
+def compute_worst_case_metrics(per_instance_stats: Dict[Instance, List[Stat]]) -> List[Stat]:
+    """
+    For each instance, we compute the worst case perfomance between each perturbation and the non-perturbed input
+    (perturbation=None). This allows us to reason about the invariances of a model as opposed to just looking
+    at its performance on perturbed inputs. We also compute the worst case performance across all robustness-related
+    and fairness-related perturbations (including the original input in both).
 
-
-
+    For each such worst-case metric, we record a `before_` metric that aggregates the performance on the
+    non-perturbed version of the corresponding inputs.
 
-    … (removed lines 315-367 are not legible in this view)
-            merge_stat(derived_stats_dict, Stat(replace(stat.name, perturbation=worst)).add(stat.min))
-        return list(derived_stats_dict.values())
+    We return the aggregate metrics across instances. Note that none of these metrics make a lot of sense if the
+    original, un-perturbed version of an Instance is not included in a scenario (i.e., we want
+    `include_original=True`).
+    """
+    # Collect statistics per input-metric pair across perturbations
+    per_instance_perturbation_stats: Dict[Tuple[MetricName, str], List[Stat]] = defaultdict(list)
+    for instance, stats in per_instance_stats.items():
+        for stat in stats:
+            assert instance.id is not None
+            # Group all perturbations for a specific metric name together
+            per_instance_perturbation_stats[(replace(stat.name, perturbation=None), instance.id)].append(stat)
+
+    # Compute worst perturbation stats
+    derived_stats_dict: Dict[MetricName, Stat] = {}
+    for (metric_name, instance_id), stats in per_instance_perturbation_stats.items():
+        original_stat: Optional[Stat] = None
+        robustness_stat = Stat(
+            replace(metric_name, perturbation=PerturbationDescription(name="robustness", robustness=True))
+        )
+        fairness_stat = Stat(replace(metric_name, perturbation=PerturbationDescription(name="fairness", fairness=True)))
+        individual_perturbation_stats: Dict[PerturbationDescription, Stat] = {}
+
+        for stat in stats:  # go through all the perturbations of the instance and merge relevant stats
+            perturbation = stat.name.perturbation
+            if perturbation is None:
+                assert (
+                    original_stat is None
+                ), f"For {metric_name} got both {original_stat} and {stat}"  # we should only have one original stat
+                original_stat = stat
+            else:
+                if perturbation.robustness:
+                    robustness_stat.merge(stat)
+                if perturbation.fairness:
+                    fairness_stat.merge(stat)
+                assert perturbation not in individual_perturbation_stats, perturbation
+                individual_perturbation_stats[perturbation] = Stat(stat.name).merge(stat)  # copy
+
+        for stat in [robustness_stat, fairness_stat, *individual_perturbation_stats.values()]:
+            perturbation = stat.name.perturbation
+            assert perturbation is not None
+
+            if original_stat is not None:
+                stat.merge(original_stat)
+                if perturbation.name not in ["robustness", "fairness"]:
+                    before = replace(perturbation, computed_on=PERTURBATION_ORIGINAL)
+                    merge_stat(derived_stats_dict, Stat(replace(stat.name, perturbation=before)).merge(original_stat))
+
+            # keep the minimum performance for each input
+            worst = replace(perturbation, computed_on=PERTURBATION_WORST)
+            if stat.count > 0:
+                # TODO: take stat.max if lower_is_better = True
+                merge_stat(derived_stats_dict, Stat(replace(stat.name, perturbation=worst)).add(stat.min))
+    return list(derived_stats_dict.values())
 
 
 class MetricSpec(ObjectSpec):
```
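With `evaluate_generation` now marked `@abstractmethod`, every `Metric` subclass must provide it; the old `evaluate_references`, `evaluate_instances`, and `evaluate_language_modeling` hooks move to the new `ReferenceMetric`, `EvaluateInstancesMetric`, and `LanguageModelingMetric` classes introduced elsewhere in this release. A minimal sketch of a custom generation metric against the 0.5.0 interface; the class name and scoring logic below are illustrative and not part of the package:

```python
# Illustrative sketch only: the evaluate_generation signature follows the diff above,
# but ExactPrefixMetric itself is not part of crfm-helm.
from typing import List

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.metric import Metric
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.statistic import Stat


class ExactPrefixMetric(Metric):
    """Scores 1.0 when the completion starts with the first reference text (toy example)."""

    def evaluate_generation(
        self,
        adapter_spec: AdapterSpec,
        request_state: RequestState,
        metric_service: MetricService,
        eval_cache_path: str,
    ) -> List[Stat]:
        assert request_state.result is not None
        completion: str = request_state.result.completions[0].text
        references = request_state.instance.references
        target: str = references[0].output.text if references else ""
        score = float(completion.strip().startswith(target.strip()))
        return [Stat(MetricName("exact_prefix")).add(score)]
```

Because `Metric.evaluate` still drives per-trial aggregation and the worst-case perturbation stats (now via the module-level `compute_worst_case_metrics`), a subclass normally only supplies `evaluate_generation` and, optionally, `derive_stats` / `derive_per_instance_stats`.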
helm/benchmark/metrics/metric_service.py
CHANGED
```diff
@@ -2,9 +2,13 @@ from typing import Optional
 
 from helm.common.authentication import Authentication
 from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult
+from helm.common.file_upload_request import FileUploadResult, FileUploadRequest
+from helm.common.nudity_check_request import NudityCheckRequest, NudityCheckResult
+from helm.common.clip_score_request import CLIPScoreRequest, CLIPScoreResult
 from helm.common.perspective_api_request import PerspectiveAPIRequest, PerspectiveAPIRequestResult
 from helm.benchmark.window_services.tokenizer_service import TokenizerService
 from helm.proxy.services.service import Service
+from helm.common.cache import Cache
 
 
 class MetricService(TokenizerService):
@@ -15,8 +19,20 @@ class MetricService(TokenizerService):
     def __init__(self, service: Service, auth: Authentication):
         super().__init__(service, auth)
 
+    def check_nudity(self, request: NudityCheckRequest) -> NudityCheckResult:
+        return self._service.check_nudity(self._auth, request)
+
+    def compute_clip_score(self, request: CLIPScoreRequest) -> CLIPScoreResult:
+        return self._service.compute_clip_score(self._auth, request)
+
+    def upload(self, request: FileUploadRequest) -> FileUploadResult:
+        return self._service.upload(self._auth, request)
+
     def get_toxicity_scores(self, request: PerspectiveAPIRequest) -> PerspectiveAPIRequestResult:
         return self._service.get_toxicity_scores(self._auth, request)
 
     def make_critique_request(self, request: CritiqueRequest) -> Optional[CritiqueRequestResult]:
         return self._service.make_critique_request(self._auth, request)
+
+    def get_cache(self, shard_name: str) -> Cache:
+        return Cache(self._service.get_cache_config(shard_name))
```
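The new `MetricService` methods expose proxy-side capabilities (nudity checks, CLIP score, file upload, and shard-level caches) to metric code. A small usage sketch for the `get_cache` helper, assuming a `MetricService` is already in hand; the shard name and the helper function below are made up for illustration:

```python
# Hypothetical usage of MetricService.get_cache added in 0.5.0; only the method
# signature comes from the diff above.
from helm.common.cache import Cache
from helm.benchmark.metrics.metric_service import MetricService


def get_my_metric_cache(metric_service: MetricService) -> Cache:
    # Each metric can ask the service for a Cache bound to its own shard,
    # e.g. to memoize expensive intermediate computations across runs.
    return metric_service.get_cache("my_metric")  # "my_metric" is an illustrative shard name
```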
helm/benchmark/metrics/paraphrase_generation_metrics.py
CHANGED
```diff
@@ -1,13 +1,13 @@
 from typing import List
 
 from helm.benchmark.adaptation.request_state import RequestState
-from .
+from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
 from .metric_name import MetricName
 from .statistic import Stat
 from nltk.translate.bleu_score import corpus_bleu
 
 
-class CLEVAParaphraseGenerationMetric(
+class CLEVAParaphraseGenerationMetric(EvaluateInstancesMetric):
     """
     Compute the Chinese iBLEU score for Paraphrase Generation scenarios of CLEVA benchmark.
     This implementation allows variable number of references (i.e., golds).
@@ -21,8 +21,7 @@ class CLEVAParaphraseGenerationMetric(Metric):
     def __init__(self, alpha: float = 0.8):  # calculate iBLEU_0.8 by default
         self.alpha = alpha
 
-    def evaluate_instances(self, request_states: List[RequestState]) -> List[Stat]:
-
+    def evaluate_instances(self, request_states: List[RequestState], eval_cache_path: str) -> List[Stat]:
         inputs: List = []
         preds: List = []
         golds: List[List[str]] = []
```
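`CLEVAParaphraseGenerationMetric` now extends the new `EvaluateInstancesMetric` base class (added as `helm/benchmark/metrics/evaluate_instances_metric.py` in the file list above), and `evaluate_instances` gains an `eval_cache_path` parameter. A sketch of what a subclass looks like against that signature; the class and statistic below are invented for illustration:

```python
# Illustrative EvaluateInstancesMetric subclass; only the evaluate_instances
# signature is taken from the diff above.
from typing import List

from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.statistic import Stat


class RequestStateCountMetric(EvaluateInstancesMetric):
    """Emits one corpus-level statistic over all request states (toy example)."""

    def evaluate_instances(self, request_states: List[RequestState], eval_cache_path: str) -> List[Stat]:
        return [Stat(MetricName("num_request_states")).add(len(request_states))]
```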
helm/benchmark/metrics/ranking_metrics.py
CHANGED
```diff
@@ -4,11 +4,11 @@ from typing import Callable, Dict, List, Tuple, Optional
 from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_RANKING_BINARY
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.metrics.reference_metric import ReferenceMetric
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.benchmark.scenarios.scenario import unpack_tag, CORRECT_TAG, Reference
 from helm.common.request import RequestResult
 from helm.common.general import assert_present, binarize_dict
-from .metric import Metric
 from .metric_name import MetricName
 from .metric_service import MetricService
 from .statistic import Stat
@@ -58,7 +58,7 @@ class RankingObject:
     model_relevance: Optional[int] = None
 
 
-class RankingMetric(
+class RankingMetric(ReferenceMetric):
     """Ranking metric."""
 
     """ Methods supported by this metric.
```
helm/benchmark/metrics/reference_metric.py
ADDED
```diff
@@ -0,0 +1,148 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import List, Dict
+from helm.benchmark.metrics.metric import (
+    MetricInterface,
+    MetricResult,
+    PerInstanceStats,
+    add_context,
+    compute_worst_case_metrics,
+)
+
+from helm.common.general import parallel_map
+from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_LANGUAGE_MODELING
+from helm.benchmark.adaptation.scenario_state import ScenarioState
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from .metric_name import MetricName, MetricContext
+from .metric_service import MetricService
+from .statistic import Stat, merge_stat
+
+
+@dataclass(frozen=True)
+class Processor:
+    """Evaluates an instance."""
+
+    # TODO: not ideal that we have circular dependencies; subclasses of Metric
+    # should override the Processor rather than the Metric.
+    metric: "ReferenceMetric"
+    metric_service: MetricService
+    eval_cache_path: str
+    adapter_spec: AdapterSpec
+
+    def process(self, references_states: List[RequestState]) -> List[Stat]:
+        instance_stats: List[Stat] = []
+
+        # Evaluate the references
+        if len(references_states) == 0:
+            return instance_stats
+        instance_stats.extend(
+            self.metric.evaluate_references(
+                self.adapter_spec, references_states, self.metric_service, self.eval_cache_path
+            )
+        )
+
+        # Add instance-related context (e.g., split, perturbation) to the metrics
+        for i, stat in enumerate(instance_stats):
+            instance_stats[i] = add_context(stat, MetricContext.from_instance(references_states[0].instance))
+
+        return instance_stats
+
+
+class ReferenceMetric(MetricInterface, ABC):
+    """
+    A `Metric` takes the results of execution and produces `Stat`s for a
+    scenario.
+
+    Note: `Metric` actually right now is a bit of misnomer because it produces many
+    `Stat`s, that might be distinct but are computed together. Eventually we
+    might move to a world where there is one (or very few metrics that are domain-independent).
+    """
+
+    def evaluate(
+        self, scenario_state: ScenarioState, metric_service: MetricService, eval_cache_path: str, parallelism: int
+    ) -> MetricResult:
+        """
+        Main entry point for a `Metric`. This function groups the single
+        list of `RequestState` by training trial and instance, and invokes
+        other functions to process those. This should serve most purposes.
+
+        Any logic that doesn't decompose along instances should go here, such
+        as robustness.
+        """
+        assert scenario_state.adapter_spec.method != ADAPT_LANGUAGE_MODELING, (
+            "Metric no longer knows how to handle the language modeling adapter. "
+            + "All run_specs with that adapter should use LanguageModelingMetric. "
+            + "If you are seeing this issue, please file a Github issue."
+        )
+
+        adapter_spec = scenario_state.adapter_spec
+        global_stats: Dict[MetricName, Stat] = {}
+        all_per_instance_stats: List[PerInstanceStats] = []
+
+        for train_trial_index in range(adapter_spec.num_train_trials):
+            # Construct inputs
+            request_state_sets: List[List[RequestState]] = []
+            for instance in scenario_state.instances:
+                references_states = []
+                for reference_index in range(len(instance.references)):
+                    references_states.extend(
+                        scenario_state.get_request_states(train_trial_index, instance, reference_index)
+                    )
+                request_state_sets.append(references_states)
+
+            # Do it!
+            processor = Processor(
+                metric=self,
+                metric_service=metric_service,
+                eval_cache_path=eval_cache_path,
+                adapter_spec=scenario_state.adapter_spec,
+            )
+            results: List[List[Stat]] = parallel_map(
+                processor.process,
+                request_state_sets,
+                parallelism=parallelism,
+            )
+
+            # Compute per-instance stats
+            per_instance_stats: List[PerInstanceStats] = []
+            for instance, stats in zip(scenario_state.instances, results):
+                assert instance.id is not None, f"id was none for instance: {instance}"
+                # Sometimes a metric (e.g., BiasMetric) doesn't produce any statistics
+                if len(stats) > 0:
+                    per_instance_stats.append(
+                        PerInstanceStats(instance.id, instance.perturbation, train_trial_index, stats)
+                    )
+
+            # Aggregate these stats
+            trial_stats: Dict[MetricName, Stat] = {}  # Statistics just for this trial
+            for instance_stats in results:
+                for stat in instance_stats:
+                    merge_stat(trial_stats, stat)
+
+            # Compute worst-case metrics.
+            # This is here since we want these stats for all metrics and they
+            # aggregate across contexts (perturbations).
+            worst_case_stats = compute_worst_case_metrics(dict(zip(scenario_state.instances, results)))
+            for stat in worst_case_stats:
+                merge_stat(trial_stats, stat)
+
+            # We take the mean value for each trial.
+            for stat in trial_stats.values():
+                merge_stat(global_stats, stat.take_mean())
+
+            all_per_instance_stats.extend(per_instance_stats)
+
+        # Wrap aggregated and per-instance stats in a MetricResult.
+        return MetricResult(list(global_stats.values()), all_per_instance_stats)
+
+    @abstractmethod
+    def evaluate_references(
+        self,
+        adapter_spec: AdapterSpec,
+        reference_request_states: List[RequestState],
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        """Evaluate the references. Override me!"""
+        pass
```
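`ReferenceMetric` mirrors `Metric.evaluate` but iterates over the per-reference request states and leaves `evaluate_references` abstract; `RankingMetric` above now derives from it. A minimal subclass sketch against the signature shown in the new file; the class and statistic are hypothetical:

```python
# Hypothetical ReferenceMetric subclass; only the evaluate_references signature
# comes from reference_metric.py above.
from typing import List

from helm.benchmark.adaptation.adapter_spec import AdapterSpec
from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.metrics.metric_name import MetricName
from helm.benchmark.metrics.metric_service import MetricService
from helm.benchmark.metrics.reference_metric import ReferenceMetric
from helm.benchmark.metrics.statistic import Stat


class ReferenceCountMetric(ReferenceMetric):
    """Records how many reference request states each instance produced (toy example)."""

    def evaluate_references(
        self,
        adapter_spec: AdapterSpec,
        reference_request_states: List[RequestState],
        metric_service: MetricService,
        eval_cache_path: str,
    ) -> List[Stat]:
        return [Stat(MetricName("num_reference_request_states")).add(len(reference_request_states))]
```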
helm/benchmark/metrics/summac/model_summac.py
CHANGED
```diff
@@ -50,7 +50,6 @@ class SummaCImager:
     def __init__(
         self, model_name="mnli", granularity="paragraph", use_cache=True, max_doc_sents=100, device="cuda", **kwargs
     ):
-
         self.grans = granularity.split("-")
 
         assert (
@@ -154,7 +153,6 @@ class SummaCImager:
                 for j in range(N_gen)
             ]
             for batch in utils_misc.batcher(dataset, batch_size=20):
-
                 if self.model_name == "decomp":
                     batch_evids, batch_conts, batch_neuts = [], [], []
                     batch_json = [{"premise": d["premise"], "hypothesis": d["hypothesis"]} for d in batch]
```
helm/benchmark/metrics/summarization_metrics.py
CHANGED
```diff
@@ -12,13 +12,13 @@ from collections import defaultdict
 from helm.benchmark.adaptation.scenario_state import ScenarioState
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.metrics.evaluate_reference_metrics import get_rouge_function
 from helm.common.hierarchical_logger import hlog
 from helm.common.general import ensure_file_downloaded
 from helm.common.optional_dependencies import handle_module_not_found_error
 from .metric import Metric, MetricResult
 from .metric_name import MetricName
 from .metric_service import MetricService
-from .basic_metrics import get_rouge_function
 from .statistic import Stat
 from .summac.model_summac import SummaCZS
 from bert_score import BERTScorer
@@ -58,7 +58,7 @@ class SummarizationMetric(Metric):
         try:
             from summ_eval.data_stats_metric import DataStatsMetric
         except ModuleNotFoundError as e:
-            handle_module_not_found_error(e, ["
+            handle_module_not_found_error(e, ["summarization"])
 
         self.data_stats_metric = DataStatsMetric()
         self.task: str = task
```