crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +134 -31
- crfm_helm-0.5.0.dist-info/RECORD +642 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +31 -3
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/gender_perturbation.py +1 -1
- helm/benchmark/augmentations/perturbation.py +8 -2
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/test_perturbation.py +11 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +20 -7
- helm/benchmark/metrics/basic_metrics.py +169 -664
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -110
- helm/benchmark/metrics/dry_run_metrics.py +2 -2
- helm/benchmark/metrics/efficiency_metrics.py +206 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +2 -2
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +2 -2
- helm/benchmark/metrics/test_classification_metrics.py +8 -5
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +74 -0
- helm/benchmark/model_metadata_registry.py +36 -0
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +26 -10
- helm/benchmark/presentation/schema.py +15 -40
- helm/benchmark/presentation/summarize.py +119 -79
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +1 -2
- helm/benchmark/presentation/test_summarize.py +3 -3
- helm/benchmark/run.py +54 -26
- helm/benchmark/run_expander.py +214 -16
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +162 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/classic_run_specs.py +1510 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +501 -0
- helm/benchmark/runner.py +51 -57
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +1 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +1 -1
- helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
- helm/benchmark/scenarios/scenario.py +4 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +6 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -2
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
- helm/benchmark/server.py +24 -1
- helm/benchmark/slurm_runner.py +70 -49
- helm/benchmark/static/benchmarking.js +1 -1
- helm/benchmark/static/schema_classic.yaml +258 -1066
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +2 -227
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vlm.yaml +576 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
- helm/benchmark/static_build/assets/index-d839df55.js +9 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_model_deployment_definition.py +14 -16
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -44
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +3 -9
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +203 -18
- helm/{proxy/clients → clients}/auto_client.py +59 -31
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +65 -7
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +4 -11
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +5 -5
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +5 -7
- helm/{proxy/clients → clients}/huggingface_client.py +43 -64
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
- helm/{proxy/clients → clients}/megatron_client.py +5 -5
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +302 -0
- helm/{proxy/clients → clients}/palmyra_client.py +6 -8
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +100 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +20 -8
- helm/{proxy/clients → clients}/together_client.py +12 -72
- helm/clients/vertexai_client.py +391 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vlm_client.py +104 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +53 -48
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +16 -4
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +1 -1
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +24 -1
- helm/common/key_value_store.py +35 -4
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/request.py +15 -17
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +1 -1
- helm/config/model_deployments.yaml +1069 -546
- helm/config/model_metadata.yaml +753 -31
- helm/config/tokenizer_configs.yaml +142 -43
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +8 -6
- helm/proxy/example_queries.py +29 -17
- helm/proxy/server.py +70 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +96 -16
- helm/proxy/services/service.py +30 -0
- helm/proxy/services/test_remote_service.py +4 -3
- helm/proxy/services/test_service.py +0 -12
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
|
@@ -22,11 +22,17 @@ CHATML_MODEL_TAG: str = "CHATML_MODEL_TAG"
|
|
|
22
22
|
# OpenAI Chat format
|
|
23
23
|
OPENAI_CHATGPT_MODEL_TAG: str = "OPENAI_CHATGPT_MODEL_TAG"
|
|
24
24
|
|
|
25
|
+
# Mistral instruction-following format
|
|
26
|
+
MISTRAL_MODEL_TAG: str = "MISTRAL_MODEL_TAG"
|
|
27
|
+
|
|
25
28
|
# For Anthropic models
|
|
26
29
|
ANTHROPIC_CLAUDE_1_MODEL_TAG: str = "ANTHROPIC_CLAUDE_1_MODEL_TAG"
|
|
27
30
|
ANTHROPIC_CLAUDE_2_MODEL_TAG: str = "ANTHROPIC_CLAUDE_2_MODEL_TAG"
|
|
31
|
+
ANTHROPIC_CLAUDE_3_MODEL_TAG: str = "ANTHROPIC_CLAUDE_3_MODEL_TAG"
|
|
28
32
|
|
|
29
33
|
GOOGLE_PALM_2_MODEL_TAG: str = "GOOGLE_PALM_2_MODEL_TAG"
|
|
34
|
+
GOOGLE_GEMINI_MODEL_TAG: str = "GOOGLE_GEMINI_MODEL_TAG"
|
|
35
|
+
GOOGLE_GEMMA_INSTRUCT_MODEL_TAG: str = "GOOGLE_GEMMA_INSTRUCT_MODEL_TAG"
|
|
30
36
|
|
|
31
37
|
# Models which emit garbage tokens when temperature=0.
|
|
32
38
|
BUGGY_TEMP_0_TAG: str = "BUGGY_TEMP_0_TAG"
|
|
@@ -46,12 +52,27 @@ NLG_PREFIX_TAG: str = "NLG_PREFIX_TAG"
|
|
|
46
52
|
# Some models can follow instructions.
|
|
47
53
|
INSTRUCTION_FOLLOWING_MODEL_TAG: str = "INSTRUCTION_FOLLOWING_MODEL_TAG"
|
|
48
54
|
|
|
55
|
+
# For text-to-image models
|
|
56
|
+
TEXT_TO_IMAGE_MODEL_TAG: str = "TEXT_TO_IMAGE_MODEL_TAG"
|
|
57
|
+
|
|
49
58
|
# For Vision-language models (VLMs)
|
|
50
59
|
VISION_LANGUAGE_MODEL_TAG: str = "VISION_LANGUAGE_MODEL_TAG"
|
|
60
|
+
# IDEFICS requires a special prompt format (see `IDEFICSInstructRunExpander`)
|
|
61
|
+
IDEFICS_INSTRUCT_MODEL_TAG: str = "IDEFICS_INSTRUCT_MODEL_TAG"
|
|
62
|
+
IDEFICS_MODEL_TAG: str = "IDEFICS_MODEL_TAG"
|
|
63
|
+
# Llava should use a special prompt format (see `LlavaRunExpander`)
|
|
64
|
+
LLAVA_MODEL_TAG: str = "LLAVA_MODEL_TAG"
|
|
65
|
+
# OpenFlamingo has a special prompt format (see `OpenFlamingoRunExpander`)
|
|
66
|
+
OPEN_FLAMINGO_MODEL_TAG: str = "OPEN_FLAMINGO_MODEL_TAG"
|
|
67
|
+
# Some VLMs do not support multiple images in the prompt
|
|
68
|
+
LIMITED_FUNCTIONALITY_VLM_TAG: str = "LIMITED_FUNCTIONALITY_VLM_TAG"
|
|
69
|
+
FULL_FUNCTIONALITY_VLM_TAG: str = "FULL_FUNCTIONALITY_VLM_TAG"
|
|
51
70
|
|
|
52
71
|
|
|
53
72
|
# Frozen is set to false as the model_deployment_registry.py file
|
|
54
73
|
# might populate the deployment_names field.
|
|
74
|
+
|
|
75
|
+
|
|
55
76
|
@dataclass(frozen=False)
|
|
56
77
|
class ModelMetadata:
|
|
57
78
|
name: str
|
|
@@ -153,6 +174,11 @@ def get_model_names_with_tag(tag: str) -> List[str]:
|
|
|
153
174
|
return [model.name for model in ALL_MODELS_METADATA if tag in model.tags]
|
|
154
175
|
|
|
155
176
|
|
|
177
|
+
def model_has_tag(model_name: str, tag: str) -> bool:
|
|
178
|
+
"""Return True if the model has the given tag. False otherwise."""
|
|
179
|
+
return tag in get_model_metadata(model_name).tags
|
|
180
|
+
|
|
181
|
+
|
|
156
182
|
def get_all_text_models() -> List[str]:
|
|
157
183
|
"""Return all model names of text models."""
|
|
158
184
|
return get_model_names_with_tag(TEXT_MODEL_TAG)
|
|
@@ -168,6 +194,16 @@ def get_all_instruction_following_models() -> List[str]:
|
|
|
168
194
|
return get_model_names_with_tag(INSTRUCTION_FOLLOWING_MODEL_TAG)
|
|
169
195
|
|
|
170
196
|
|
|
197
|
+
def is_text_to_image_model(model_name: str) -> bool:
|
|
198
|
+
"""Returns True if the model is a text-to-image model. False otherwise."""
|
|
199
|
+
return model_has_tag(model_name, TEXT_TO_IMAGE_MODEL_TAG)
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def is_vlm(model_name: str) -> bool:
|
|
203
|
+
"""Returns True if the model is a vision-language model (VLM). False otherwise."""
|
|
204
|
+
return model_has_tag(model_name, VISION_LANGUAGE_MODEL_TAG)
|
|
205
|
+
|
|
206
|
+
|
|
171
207
|
def get_unknown_model_metadata(helm_model_name: str) -> ModelMetadata:
|
|
172
208
|
"""Return placeholder ModelMetadata for an unknown model."""
|
|
173
209
|
return ModelMetadata(
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
import signal
|
|
2
|
+
import threading
|
|
3
|
+
import traceback
|
|
4
|
+
from typing import List
|
|
5
|
+
import os
|
|
6
|
+
import time
|
|
7
|
+
import torch
|
|
8
|
+
import torch.multiprocessing as multiprocessing
|
|
9
|
+
from concurrent.futures import ProcessPoolExecutor as Pool
|
|
10
|
+
from tqdm import tqdm
|
|
11
|
+
|
|
12
|
+
from helm.benchmark.config_registry import (
|
|
13
|
+
register_configs_from_directory,
|
|
14
|
+
register_builtin_configs_from_helm_package,
|
|
15
|
+
)
|
|
16
|
+
from helm.benchmark.executor import ExecutionSpec
|
|
17
|
+
from helm.benchmark.runner import Runner, RunSpec, RunnerError
|
|
18
|
+
from helm.common.hierarchical_logger import hlog, htrack_block
|
|
19
|
+
from helm.benchmark.runner_config_registry import RUNNER_CONFIG
|
|
20
|
+
|
|
21
|
+
_MAX_CONCURRENT_WORKERS_ENV_NAME = "HELM_MAX_CONCURRENT_WORKERS"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# From
|
|
25
|
+
# https://stackoverflow.com/questions/71300294/how-to-terminate-pythons-processpoolexecutor-when-parent-process-dies
|
|
26
|
+
def start_thread_to_terminate_when_parent_process_dies(ppid):
|
|
27
|
+
pid = os.getpid()
|
|
28
|
+
|
|
29
|
+
def f():
|
|
30
|
+
while True:
|
|
31
|
+
try:
|
|
32
|
+
os.kill(ppid, 0)
|
|
33
|
+
except OSError:
|
|
34
|
+
os.kill(pid, signal.SIGTERM)
|
|
35
|
+
time.sleep(1)
|
|
36
|
+
|
|
37
|
+
thread = threading.Thread(target=f, daemon=True)
|
|
38
|
+
thread.start()
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def initialize_worker(gpu_id: int):
|
|
42
|
+
hlog(f"Worker {gpu_id} initializing")
|
|
43
|
+
|
|
44
|
+
# Wait for 0.1 seconds to ensure all workers are initialized with different CUDA_VISIBLE_DEVICES
|
|
45
|
+
time.sleep(0.1)
|
|
46
|
+
|
|
47
|
+
# Pin GPU to worker process
|
|
48
|
+
os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
|
|
49
|
+
|
|
50
|
+
# Necessary for code_metrics in humaneval to work properly
|
|
51
|
+
multiprocessing.set_start_method("fork", force=True)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class MultiGPURunner(Runner):
|
|
55
|
+
"""Runner that runs the entire benchmark on multiple GPUs.
|
|
56
|
+
|
|
57
|
+
This is a thin wrapper around `Runner` that runs the entire benchmark on
|
|
58
|
+
multiple GPUs using `multiprocessing`.
|
|
59
|
+
|
|
60
|
+
Note that this runner will load multiple models into memory at the same
|
|
61
|
+
time if your running configuration specifies that, similar to the `Runner`
|
|
62
|
+
class. `SlurmRunner` on the other hand will load at most one model on a
|
|
63
|
+
GPU"""
|
|
64
|
+
|
|
65
|
+
def __init__(
|
|
66
|
+
self,
|
|
67
|
+
execution_spec: ExecutionSpec,
|
|
68
|
+
output_path: str,
|
|
69
|
+
suite: str,
|
|
70
|
+
skip_instances: bool,
|
|
71
|
+
cache_instances: bool,
|
|
72
|
+
cache_instances_only: bool,
|
|
73
|
+
skip_completed_runs: bool,
|
|
74
|
+
exit_on_error: bool,
|
|
75
|
+
):
|
|
76
|
+
super().__init__(
|
|
77
|
+
execution_spec=execution_spec,
|
|
78
|
+
output_path=output_path,
|
|
79
|
+
suite=suite,
|
|
80
|
+
skip_instances=skip_instances,
|
|
81
|
+
cache_instances=cache_instances,
|
|
82
|
+
cache_instances_only=cache_instances_only,
|
|
83
|
+
skip_completed_runs=skip_completed_runs,
|
|
84
|
+
exit_on_error=exit_on_error,
|
|
85
|
+
)
|
|
86
|
+
# Configure max concurrent worker jobs from the environment variable.
|
|
87
|
+
env_max_concurrent_workers = os.getenv(_MAX_CONCURRENT_WORKERS_ENV_NAME)
|
|
88
|
+
self.max_concurrent_workers = (
|
|
89
|
+
int(env_max_concurrent_workers)
|
|
90
|
+
if env_max_concurrent_workers
|
|
91
|
+
else (
|
|
92
|
+
RUNNER_CONFIG.helm_max_concurrent_workers
|
|
93
|
+
if RUNNER_CONFIG.helm_max_concurrent_workers > 0
|
|
94
|
+
else torch.cuda.device_count()
|
|
95
|
+
)
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
def safe_run_one(self, run_spec: RunSpec):
|
|
99
|
+
register_builtin_configs_from_helm_package()
|
|
100
|
+
if self.executor.execution_spec.local_path is not None:
|
|
101
|
+
register_configs_from_directory(self.executor.execution_spec.local_path)
|
|
102
|
+
|
|
103
|
+
try:
|
|
104
|
+
with htrack_block(f"Running {run_spec.name}"):
|
|
105
|
+
self.run_one(run_spec)
|
|
106
|
+
except Exception as e:
|
|
107
|
+
hlog(f"Error when running {run_spec.name}:\n{traceback.format_exc()}")
|
|
108
|
+
return e
|
|
109
|
+
|
|
110
|
+
def run_all(self, run_specs: List[RunSpec]):
|
|
111
|
+
"""Run the entire benchmark on multiple GPUs."""
|
|
112
|
+
|
|
113
|
+
# Set the start method to forkserver to avoid issues with CUDA.
|
|
114
|
+
multiprocessing.set_start_method("forkserver")
|
|
115
|
+
|
|
116
|
+
with Pool(
|
|
117
|
+
max_workers=self.max_concurrent_workers,
|
|
118
|
+
initializer=start_thread_to_terminate_when_parent_process_dies,
|
|
119
|
+
initargs=(os.getpid(),),
|
|
120
|
+
) as pool:
|
|
121
|
+
# Pin GPUs to each worker process
|
|
122
|
+
pool.map(initialize_worker, [i for i in range(self.max_concurrent_workers)])
|
|
123
|
+
|
|
124
|
+
# Run all queued tasks
|
|
125
|
+
error_msgs = list(tqdm(pool.map(self.safe_run_one, run_specs), total=len(run_specs), disable=None))
|
|
126
|
+
|
|
127
|
+
# Raise exception for failed runs, if any.
|
|
128
|
+
failed_run_names = [
|
|
129
|
+
run_spec.name for error_msg, run_spec in zip(error_msgs, run_specs) if error_msg is not None
|
|
130
|
+
]
|
|
131
|
+
if failed_run_names:
|
|
132
|
+
failed_runs_str = ", ".join([f'"{run_name}"' for run_name in failed_run_names])
|
|
133
|
+
raise RunnerError(f"Failed runs: [{failed_runs_str}]")
|
|
@@ -10,9 +10,10 @@ from typing import List, Dict, Optional, Any, Callable, Union, Mapping, Tuple, S
|
|
|
10
10
|
import numpy as np
|
|
11
11
|
from scipy.stats import pearsonr
|
|
12
12
|
|
|
13
|
+
from helm.benchmark.config_registry import register_builtin_configs_from_helm_package
|
|
13
14
|
from helm.common.hierarchical_logger import hlog
|
|
14
15
|
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
15
|
-
from helm.benchmark.
|
|
16
|
+
from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA
|
|
16
17
|
from helm.benchmark.presentation.summarize import AGGREGATE_WIN_RATE_COLUMN
|
|
17
18
|
|
|
18
19
|
try:
|
|
@@ -133,9 +134,6 @@ class Plotter:
|
|
|
133
134
|
self.plot_format = plot_format
|
|
134
135
|
self._tables_cache: Dict[str, Dict[str, Table]] = {}
|
|
135
136
|
|
|
136
|
-
schema = read_schema(SCHEMA_CLASSIC_YAML_FILENAME)
|
|
137
|
-
self.model_metadata = {model_field.display_name: model_field for model_field in schema.models}
|
|
138
|
-
|
|
139
137
|
def get_group_tables(self, group_name: str) -> Dict[str, Table]:
|
|
140
138
|
"""Reads and parses group tables. Uses _tables_cache to avoid reprocessing the same table multiple times."""
|
|
141
139
|
if group_name in self._tables_cache:
|
|
@@ -338,14 +336,14 @@ class Plotter:
|
|
|
338
336
|
|
|
339
337
|
def get_model_release_date(model_name: str) -> Optional[date]:
|
|
340
338
|
"""Maps a model name to the month of model release."""
|
|
341
|
-
release_date =
|
|
339
|
+
release_date = MODEL_NAME_TO_MODEL_METADATA[model_name].release_date
|
|
342
340
|
if release_date is None:
|
|
343
341
|
return None
|
|
344
342
|
return release_date.replace(day=1)
|
|
345
343
|
|
|
346
344
|
def get_model_size(model_name: str) -> Optional[int]:
|
|
347
345
|
"""Maps a model name to the number of parameters, rounding to the nearest leading digit."""
|
|
348
|
-
size =
|
|
346
|
+
size = MODEL_NAME_TO_MODEL_METADATA[model_name].num_parameters
|
|
349
347
|
if size is None:
|
|
350
348
|
return None
|
|
351
349
|
grain = 10 ** (len(str(size)) - 1)
|
|
@@ -401,7 +399,9 @@ class Plotter:
|
|
|
401
399
|
|
|
402
400
|
for i, access_level in enumerate(access_levels):
|
|
403
401
|
model_indices: List[int] = [
|
|
404
|
-
idx
|
|
402
|
+
idx
|
|
403
|
+
for idx, model in enumerate(table.adapters)
|
|
404
|
+
if MODEL_NAME_TO_MODEL_METADATA[model].access == access_level
|
|
405
405
|
]
|
|
406
406
|
best_model_index = model_indices[table.mean_win_rates[model_indices].argmax()]
|
|
407
407
|
|
|
@@ -611,6 +611,7 @@ def main():
|
|
|
611
611
|
parser.add_argument("--suite", type=str, help="Name of the suite that we are plotting", required=True)
|
|
612
612
|
parser.add_argument("--plot-format", help="Format for saving plots", default="png", choices=["png", "pdf"])
|
|
613
613
|
args = parser.parse_args()
|
|
614
|
+
register_builtin_configs_from_helm_package()
|
|
614
615
|
base_path = os.path.join(args.output_path, "runs", args.suite)
|
|
615
616
|
if not os.path.exists(os.path.join(base_path, "groups")):
|
|
616
617
|
hlog(f"ERROR: Could not find `groups` directory under {base_path}. Did you run `summarize.py` first?")
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
from collections import OrderedDict, defaultdict
|
|
2
2
|
from dataclasses import dataclass
|
|
3
3
|
import os
|
|
4
|
-
from typing import Dict, Iterable, List, Optional, Set, Tuple
|
|
4
|
+
from typing import Dict, Iterable, List, Optional, Set, Tuple, Any
|
|
5
5
|
|
|
6
|
-
from helm.benchmark.adaptation.
|
|
6
|
+
from helm.benchmark.adaptation.adapter_spec import (
|
|
7
7
|
ADAPT_MULTIPLE_CHOICE_SEPARATE_METHODS,
|
|
8
8
|
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
|
|
9
9
|
)
|
|
@@ -12,11 +12,13 @@ from helm.benchmark.adaptation.request_state import RequestState
|
|
|
12
12
|
from helm.benchmark.adaptation.scenario_state import ScenarioState
|
|
13
13
|
from helm.benchmark.augmentations.perturbation_description import PerturbationDescription
|
|
14
14
|
from helm.benchmark.metrics.metric import PerInstanceStats
|
|
15
|
+
from helm.common.multimodal_request_utils import gather_generated_image_locations
|
|
15
16
|
from helm.benchmark.presentation.schema import Schema
|
|
16
|
-
from helm.benchmark.
|
|
17
|
+
from helm.benchmark.run_spec import RunSpec
|
|
17
18
|
from helm.benchmark.scenarios.scenario import Instance
|
|
18
19
|
from helm.common.general import write
|
|
19
20
|
from helm.common.hierarchical_logger import hlog, htrack
|
|
21
|
+
from helm.common.images_utils import encode_base64
|
|
20
22
|
from helm.common.request import Request
|
|
21
23
|
from helm.common.codec import from_json, to_json
|
|
22
24
|
|
|
@@ -43,6 +45,9 @@ class DisplayPrediction:
|
|
|
43
45
|
truncated_predicted_text: Optional[str]
|
|
44
46
|
"""The truncated prediction text, if truncation is required by the Adapter method."""
|
|
45
47
|
|
|
48
|
+
base64_images: Optional[List[str]]
|
|
49
|
+
"""Images in base64."""
|
|
50
|
+
|
|
46
51
|
mapped_output: Optional[str]
|
|
47
52
|
"""The mapped output, if an output mapping exists and the prediction can be mapped"""
|
|
48
53
|
|
|
@@ -52,6 +57,8 @@ class DisplayPrediction:
|
|
|
52
57
|
stats: Dict[str, float]
|
|
53
58
|
"""Statistics computed from the predicted output"""
|
|
54
59
|
|
|
60
|
+
annotations: Optional[Dict[str, Any]]
|
|
61
|
+
|
|
55
62
|
|
|
56
63
|
@dataclass(frozen=True)
|
|
57
64
|
class DisplayRequest:
|
|
@@ -73,7 +80,7 @@ class DisplayRequest:
|
|
|
73
80
|
"""The actual Request to display in the web frontend.
|
|
74
81
|
|
|
75
82
|
There can be multiple requests per trial. The displayed request should be the
|
|
76
|
-
most relevant request e.g. the request for the chosen
|
|
83
|
+
most relevant request e.g. the request for the chosen choice for multiple choice questions."""
|
|
77
84
|
|
|
78
85
|
|
|
79
86
|
def _read_scenario_state(scenario_state_path: str) -> ScenarioState:
|
|
@@ -102,8 +109,7 @@ def _truncate_predicted_text(
|
|
|
102
109
|
tokens = request_state.result.completions[0].tokens
|
|
103
110
|
if tokens:
|
|
104
111
|
first_token = tokens[0]
|
|
105
|
-
|
|
106
|
-
prefix = first_token.text
|
|
112
|
+
prefix = first_token.text
|
|
107
113
|
if prefix:
|
|
108
114
|
predicted_text = predicted_text
|
|
109
115
|
prefix = prefix
|
|
@@ -126,7 +132,7 @@ def _get_metric_names_for_group(run_group_name: str, schema: Schema) -> Set[str]
|
|
|
126
132
|
if metric_group is None:
|
|
127
133
|
continue
|
|
128
134
|
for metric_name_matcher in metric_group.metrics:
|
|
129
|
-
if metric_name_matcher.perturbation_name:
|
|
135
|
+
if metric_name_matcher.perturbation_name and metric_name_matcher.perturbation_name != "__all__":
|
|
130
136
|
continue
|
|
131
137
|
result.add(metric_name_matcher.substitute(run_group.environment).name)
|
|
132
138
|
return result
|
|
@@ -256,9 +262,17 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
|
|
|
256
262
|
mapped_output = (
|
|
257
263
|
request_state.output_mapping.get(predicted_text.strip()) if request_state.output_mapping else None
|
|
258
264
|
)
|
|
259
|
-
instance_id_to_instance[
|
|
260
|
-
|
|
261
|
-
|
|
265
|
+
instance_id_to_instance[(request_state.instance.id, request_state.instance.perturbation)] = (
|
|
266
|
+
request_state.instance
|
|
267
|
+
)
|
|
268
|
+
|
|
269
|
+
# Process images and include if they exist
|
|
270
|
+
images: List[str] = [
|
|
271
|
+
encode_base64(image_location)
|
|
272
|
+
for image_location in gather_generated_image_locations(request_state.result)
|
|
273
|
+
if os.path.exists(image_location)
|
|
274
|
+
]
|
|
275
|
+
|
|
262
276
|
predictions.append(
|
|
263
277
|
DisplayPrediction(
|
|
264
278
|
instance_id=request_state.instance.id,
|
|
@@ -266,9 +280,11 @@ def write_run_display_json(run_path: str, run_spec: RunSpec, schema: Schema, ski
|
|
|
266
280
|
train_trial_index=request_state.train_trial_index,
|
|
267
281
|
predicted_text=predicted_text,
|
|
268
282
|
truncated_predicted_text=_truncate_predicted_text(predicted_text, request_state, run_spec.adapter_spec),
|
|
283
|
+
base64_images=images,
|
|
269
284
|
mapped_output=mapped_output,
|
|
270
285
|
reference_index=request_state.reference_index,
|
|
271
286
|
stats=trial_stats,
|
|
287
|
+
annotations=request_state.annotations,
|
|
272
288
|
)
|
|
273
289
|
)
|
|
274
290
|
requests.append(
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
from dataclasses import dataclass, field
|
|
2
|
-
from datetime import date
|
|
3
2
|
from typing import List, Optional, Dict
|
|
4
3
|
import dacite
|
|
5
4
|
import mako.template
|
|
@@ -46,34 +45,6 @@ class Field:
|
|
|
46
45
|
return name
|
|
47
46
|
|
|
48
47
|
|
|
49
|
-
# Note: also see Model from `models.py`.
|
|
50
|
-
@dataclass(frozen=True)
|
|
51
|
-
class ModelField(Field):
|
|
52
|
-
# Organization that originally created the model (e.g. "EleutherAI")
|
|
53
|
-
# Note that this may be different from group or the prefix of the model `name`
|
|
54
|
-
# ("together" in "together/gpt-j-6b") as the hosting organization
|
|
55
|
-
# may be different from the creator organization. We also capitalize
|
|
56
|
-
# this field properly to later display in the UI.
|
|
57
|
-
# TODO: in the future, we want to cleanup the naming in the following ways:
|
|
58
|
-
# - make the creator_organization an identifier with a separate display name
|
|
59
|
-
# - have a convention like <hosting_organization><creator_organization>/<model_name>
|
|
60
|
-
creator_organization: Optional[str] = None
|
|
61
|
-
|
|
62
|
-
# How this model is available (e.g., limited)
|
|
63
|
-
access: Optional[str] = None
|
|
64
|
-
|
|
65
|
-
# Whether we have yet to evaluate this model
|
|
66
|
-
todo: bool = False
|
|
67
|
-
|
|
68
|
-
# When was the model released
|
|
69
|
-
release_date: Optional[date] = None
|
|
70
|
-
|
|
71
|
-
# The number of parameters
|
|
72
|
-
# This should be a string as the number of parameters is usually a round number (175B),
|
|
73
|
-
# but we set it as an int for plotting purposes.
|
|
74
|
-
num_parameters: Optional[int] = None
|
|
75
|
-
|
|
76
|
-
|
|
77
48
|
@dataclass(frozen=True)
|
|
78
49
|
class MetricNameMatcher:
|
|
79
50
|
"""
|
|
@@ -100,7 +71,7 @@ class MetricNameMatcher:
|
|
|
100
71
|
if self.name != metric_name.name:
|
|
101
72
|
return False
|
|
102
73
|
|
|
103
|
-
if self.split != metric_name.split:
|
|
74
|
+
if self.split != "__all__" and self.split != metric_name.split:
|
|
104
75
|
return False
|
|
105
76
|
|
|
106
77
|
# Optional
|
|
@@ -121,9 +92,11 @@ class MetricNameMatcher:
|
|
|
121
92
|
return MetricNameMatcher(
|
|
122
93
|
name=mako.template.Template(self.name).render(**environment),
|
|
123
94
|
split=mako.template.Template(self.split).render(**environment),
|
|
124
|
-
perturbation_name=
|
|
125
|
-
|
|
126
|
-
|
|
95
|
+
perturbation_name=(
|
|
96
|
+
mako.template.Template(self.perturbation_name).render(**environment)
|
|
97
|
+
if self.perturbation_name is not None
|
|
98
|
+
else None
|
|
99
|
+
),
|
|
127
100
|
)
|
|
128
101
|
|
|
129
102
|
|
|
@@ -135,6 +108,9 @@ class MetricGroup(Field):
|
|
|
135
108
|
|
|
136
109
|
metrics: List[MetricNameMatcher] = field(default_factory=list)
|
|
137
110
|
|
|
111
|
+
hide_win_rates: Optional[bool] = None
|
|
112
|
+
"""If set to true, do not compute win rates."""
|
|
113
|
+
|
|
138
114
|
|
|
139
115
|
BY_METRIC = "by_metric"
|
|
140
116
|
BY_GROUP = "by_group"
|
|
@@ -222,9 +198,6 @@ class RunGroup(Field):
|
|
|
222
198
|
class Schema:
|
|
223
199
|
"""Specifies information about what to display on the frontend."""
|
|
224
200
|
|
|
225
|
-
# Models
|
|
226
|
-
models: List[ModelField]
|
|
227
|
-
|
|
228
201
|
# Adapter fields (e.g., temperature)
|
|
229
202
|
adapter: List[Field]
|
|
230
203
|
|
|
@@ -241,17 +214,19 @@ class Schema:
|
|
|
241
214
|
run_groups: List[RunGroup]
|
|
242
215
|
|
|
243
216
|
def __post_init__(self):
|
|
244
|
-
self.name_to_model = {model.name: model for model in self.models}
|
|
245
217
|
self.name_to_metric = {metric.name: metric for metric in self.metrics}
|
|
246
218
|
self.name_to_perturbation = {perturbation.name: perturbation for perturbation in self.perturbations}
|
|
247
219
|
self.name_to_metric_group = {metric_group.name: metric_group for metric_group in self.metric_groups}
|
|
248
220
|
self.name_to_run_group = {run_group.name: run_group for run_group in self.run_groups}
|
|
249
221
|
|
|
250
222
|
|
|
251
|
-
def
|
|
223
|
+
def get_default_schema_path() -> str:
|
|
224
|
+
return resources.files(SCHEMA_YAML_PACKAGE).joinpath(SCHEMA_CLASSIC_YAML_FILENAME)
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def read_schema(schema_path: str) -> Schema:
|
|
252
228
|
# TODO: merge in model metadata from `model_metadata.yaml`
|
|
253
|
-
schema_path = resources.files(SCHEMA_YAML_PACKAGE).joinpath(filename)
|
|
254
229
|
hlog(f"Reading schema file {schema_path}...")
|
|
255
|
-
with
|
|
230
|
+
with open(schema_path, "r") as f:
|
|
256
231
|
raw = yaml.safe_load(f)
|
|
257
232
|
return dacite.from_dict(Schema, raw)
|