crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +138 -31
- crfm_helm-0.5.1.dist-info/RECORD +654 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +31 -3
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/gender_perturbation.py +1 -1
- helm/benchmark/augmentations/perturbation.py +25 -3
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/test_perturbation.py +41 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +20 -7
- helm/benchmark/metrics/basic_metrics.py +169 -664
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -110
- helm/benchmark/metrics/dry_run_metrics.py +2 -2
- helm/benchmark/metrics/efficiency_metrics.py +213 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +392 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +2 -2
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +2 -2
- helm/benchmark/metrics/test_classification_metrics.py +8 -5
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +575 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +74 -0
- helm/benchmark/model_metadata_registry.py +41 -1
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +26 -10
- helm/benchmark/presentation/schema.py +15 -40
- helm/benchmark/presentation/summarize.py +119 -79
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +1 -2
- helm/benchmark/presentation/test_summarize.py +3 -3
- helm/benchmark/run.py +54 -26
- helm/benchmark/run_expander.py +205 -35
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +163 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/classic_run_specs.py +1510 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +757 -0
- helm/benchmark/runner.py +51 -57
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +1 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/legalbench_scenario.py +6 -2
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +1 -1
- helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
- helm/benchmark/scenarios/scenario.py +4 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +6 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +3 -4
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +5 -3
- helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
- helm/benchmark/server.py +24 -1
- helm/benchmark/slurm_runner.py +70 -49
- helm/benchmark/static/benchmarking.js +1 -1
- helm/benchmark/static/schema_classic.yaml +258 -1066
- helm/benchmark/static/schema_image2structure.yaml +304 -0
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +2 -227
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
- helm/benchmark/static/schema_vlm.yaml +823 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
- helm/benchmark/static_build/assets/index-878a1094.css +1 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -44
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +3 -9
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +233 -18
- helm/{proxy/clients → clients}/auto_client.py +59 -31
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +65 -7
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +4 -11
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +5 -5
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +5 -7
- helm/{proxy/clients → clients}/huggingface_client.py +43 -64
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
- helm/{proxy/clients → clients}/megatron_client.py +5 -5
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +301 -0
- helm/{proxy/clients → clients}/palmyra_client.py +6 -8
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +100 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +20 -8
- helm/{proxy/clients → clients}/together_client.py +104 -73
- helm/clients/vertexai_client.py +400 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
- helm/clients/vision_language/huggingface_vlm_client.py +111 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +54 -49
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +16 -4
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +1 -1
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +33 -3
- helm/common/key_value_store.py +35 -4
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/request.py +15 -17
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +1 -1
- helm/config/model_deployments.yaml +1159 -538
- helm/config/model_metadata.yaml +868 -41
- helm/config/tokenizer_configs.yaml +149 -43
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +8 -6
- helm/proxy/example_queries.py +29 -17
- helm/proxy/server.py +70 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +96 -16
- helm/proxy/services/service.py +30 -0
- helm/proxy/services/test_remote_service.py +4 -3
- helm/proxy/services/test_service.py +0 -12
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/test_model_deployment_definition.py +0 -92
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
import os.path
|
|
2
|
+
from typing import Dict, List
|
|
3
|
+
|
|
4
|
+
from datasets import load_dataset
|
|
5
|
+
from tqdm import tqdm
|
|
6
|
+
|
|
7
|
+
from helm.benchmark.scenarios.scenario import (
|
|
8
|
+
CORRECT_TAG,
|
|
9
|
+
TEST_SPLIT,
|
|
10
|
+
Instance,
|
|
11
|
+
Input,
|
|
12
|
+
Output,
|
|
13
|
+
Reference,
|
|
14
|
+
Scenario,
|
|
15
|
+
)
|
|
16
|
+
from helm.common.media_object import MediaObject, MultimediaObject
|
|
17
|
+
from helm.common.general import ensure_directory_exists, ensure_file_downloaded
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class UnicornScenario(Scenario):
|
|
21
|
+
"""
|
|
22
|
+
How Many Unicorns are in this Image? A Safety Evaluation Benchmark of Vision LLMs
|
|
23
|
+
|
|
24
|
+
We shift our focus from evaluating standard performance to introducing a comprehensive safety evaluation
|
|
25
|
+
suite Unicorn, covering both out-of-distribution (OOD) generalization and adversarial robustness. For the OOD
|
|
26
|
+
evaluation, we present two novel VQA datasets --- OODCV-VQA and Sketchy-VQA, each with one variant, designed
|
|
27
|
+
to test model performance under challenging conditions. In the OOD scenario, questions are matched with
|
|
28
|
+
boolean or numerical answers, and we use exact match metrics for evaluation. When comparing OOD Sketchy-VQA
|
|
29
|
+
with its synthesized in-distribution counterpart, we found an average model output F1 drop of 8.9%,
|
|
30
|
+
highlighting the challenging nature of the OOD scenario in the Unicorn benchmark.
|
|
31
|
+
|
|
32
|
+
@article{tu2023unicorns,
|
|
33
|
+
title={How Many Unicorns Are in This Image? A Safety Evaluation Benchmark for Vision LLMs},
|
|
34
|
+
author={Tu, Haoqin and Cui, Chenhang and Wang, Zijun and Zhou, Yiyang and Zhao, Bingchen and Han,
|
|
35
|
+
Junlin and Zhou, Wangchunshu and Yao, Huaxiu and Xie, Cihang},
|
|
36
|
+
journal={arXiv preprint arXiv:2311.16101},
|
|
37
|
+
year={2023}
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
Paper: https://arxiv.org/abs/2311.16101
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
UNICORN_HUGGINGFACE_DATASET_NAME: str = "PahaII/unicorn"
|
|
44
|
+
|
|
45
|
+
IMAGE_URL: str = "https://huggingface.co/datasets/PahaII/unicorn/resolve/main/images/{image_path}?download=true"
|
|
46
|
+
|
|
47
|
+
SUBJECTS: List[str] = ["OODCV-VQA", "OODCV-Counterfactual", "Sketchy-VQA", "Sketchy-Challenging"]
|
|
48
|
+
|
|
49
|
+
IMG_TYPE: Dict[str, str] = {
|
|
50
|
+
"OODCV-VQA": "jpeg",
|
|
51
|
+
"OODCV-Counterfactual": "jpeg",
|
|
52
|
+
"Sketchy-VQA": "png",
|
|
53
|
+
"Sketchy-Challenging": "png",
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
name = "unicorn"
|
|
57
|
+
description = (
|
|
58
|
+
"Evaluate multimodal models on two out-of-distribution scenarios with four subjects"
|
|
59
|
+
" ([paper](https://arxiv.org/abs/2311.16101))."
|
|
60
|
+
)
|
|
61
|
+
tags = ["vision-language"]
|
|
62
|
+
|
|
63
|
+
def __init__(self, subject: str):
|
|
64
|
+
super().__init__()
|
|
65
|
+
assert subject in self.SUBJECTS, f"Invalid subject: {subject}"
|
|
66
|
+
self._subject: str = subject
|
|
67
|
+
self._image_type: str = self.IMG_TYPE[self._subject]
|
|
68
|
+
|
|
69
|
+
def get_instances(self, output_path: str) -> List[Instance]:
|
|
70
|
+
images_path: str = os.path.join(output_path, "images")
|
|
71
|
+
ensure_directory_exists(images_path)
|
|
72
|
+
|
|
73
|
+
# There is only the test split in Unicorn benchmark
|
|
74
|
+
instances: List[Instance] = []
|
|
75
|
+
question_data_files = {TEST_SPLIT: f"{self._subject}.json"}
|
|
76
|
+
|
|
77
|
+
# Process the test set
|
|
78
|
+
for row in tqdm(
|
|
79
|
+
load_dataset(
|
|
80
|
+
self.UNICORN_HUGGINGFACE_DATASET_NAME,
|
|
81
|
+
data_files=question_data_files,
|
|
82
|
+
split=TEST_SPLIT,
|
|
83
|
+
cache_dir=output_path,
|
|
84
|
+
)
|
|
85
|
+
):
|
|
86
|
+
# Download the image
|
|
87
|
+
image_path: str = row["image_path"]
|
|
88
|
+
local_image_path: str = os.path.join(output_path, image_path)
|
|
89
|
+
ensure_file_downloaded(
|
|
90
|
+
source_url=self.IMAGE_URL.format(image_path=image_path),
|
|
91
|
+
target_path=local_image_path,
|
|
92
|
+
unpack=False,
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
content: List[MediaObject] = [
|
|
96
|
+
MediaObject(location=local_image_path, content_type=f"image/{self._image_type}"),
|
|
97
|
+
MediaObject(text=row["question"], content_type="text/plain"),
|
|
98
|
+
]
|
|
99
|
+
answer: str = row["answer"]
|
|
100
|
+
instances.append(
|
|
101
|
+
Instance(
|
|
102
|
+
Input(multimedia_content=MultimediaObject(content)),
|
|
103
|
+
references=[Reference(Output(text=answer), tags=[CORRECT_TAG])],
|
|
104
|
+
split=TEST_SPLIT,
|
|
105
|
+
)
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
return instances
|
|
@@ -13,13 +13,13 @@ from helm.benchmark.scenarios.scenario import (
|
|
|
13
13
|
Scenario,
|
|
14
14
|
)
|
|
15
15
|
from helm.common.media_object import MediaObject, MultimediaObject
|
|
16
|
-
from helm.common.general import
|
|
16
|
+
from helm.common.general import ensure_file_downloaded
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
class VizWizScenario(Scenario):
|
|
20
20
|
"""
|
|
21
|
-
VizWiz is a real-world visual question answering dataset consisting of questions
|
|
22
|
-
|
|
21
|
+
VizWiz is a real-world visual question answering dataset consisting of questions asked by people who are
|
|
22
|
+
visually impaired. It originates from a natural visual question answering
|
|
23
23
|
setting where blind people each took an image and recorded a spoken question about it,
|
|
24
24
|
together with 10 crowdsourced answers per visual question.
|
|
25
25
|
|
|
@@ -60,7 +60,6 @@ class VizWizScenario(Scenario):
|
|
|
60
60
|
def get_instances(self, output_path: str) -> List[Instance]:
|
|
61
61
|
# Download the questions and annotations
|
|
62
62
|
annotations_path: str = os.path.join(output_path, "annotations")
|
|
63
|
-
ensure_directory_exists(annotations_path)
|
|
64
63
|
ensure_file_downloaded(
|
|
65
64
|
source_url=self.ANNOTATIONS_URL,
|
|
66
65
|
target_path=annotations_path,
|
|
@@ -54,7 +54,7 @@ class VQAScenario(Scenario):
|
|
|
54
54
|
TEST_SPLIT: "http://images.cocodataset.org/zips/test2015.zip",
|
|
55
55
|
}
|
|
56
56
|
|
|
57
|
-
name = "
|
|
57
|
+
name = "vqa"
|
|
58
58
|
description = "Open-ended questions about images ([paper](https://arxiv.org/abs/1612.00837))."
|
|
59
59
|
tags = ["vision-language", "visual question answering"]
|
|
60
60
|
|
|
@@ -111,11 +111,13 @@ class VQAScenario(Scenario):
|
|
|
111
111
|
MediaObject(location=image_path, content_type="image/jpeg"),
|
|
112
112
|
MediaObject(text=question_json["question"], content_type="text/plain"),
|
|
113
113
|
]
|
|
114
|
-
|
|
115
114
|
instances.append(
|
|
116
115
|
Instance(
|
|
117
116
|
Input(multimedia_content=MultimediaObject(content)),
|
|
118
|
-
references=[
|
|
117
|
+
references=[
|
|
118
|
+
Reference(Output(text=answer_json["answer"]), tags=[CORRECT_TAG])
|
|
119
|
+
for answer_json in answers_json["answers"]
|
|
120
|
+
],
|
|
119
121
|
split=split,
|
|
120
122
|
)
|
|
121
123
|
)
|
|
@@ -61,7 +61,7 @@ class WMT14Scenario(Scenario):
|
|
|
61
61
|
def get_instances(self, output_path: str) -> List[Instance]:
|
|
62
62
|
with htrack_block("Loading the HuggingFace dataset. The first time could take several minutes."):
|
|
63
63
|
subset_name = f"{self.source_language if self.source_language!='en' else self.target_language}-en"
|
|
64
|
-
hf_dataset: Any = load_dataset("wmt14", subset_name)
|
|
64
|
+
hf_dataset: Any = load_dataset("wmt14", subset_name, trust_remote_code=True)
|
|
65
65
|
splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
|
|
66
66
|
|
|
67
67
|
instances: List[Instance] = []
|
helm/benchmark/server.py
CHANGED
|
@@ -70,6 +70,14 @@ def serve_benchmark_output(filename):
|
|
|
70
70
|
return response
|
|
71
71
|
|
|
72
72
|
|
|
73
|
+
@app.get("/cache/output/<filename:path>")
|
|
74
|
+
def serve_cache_output(filename):
|
|
75
|
+
response = static_file(filename, root=app.config["helm.cacheoutputpath"])
|
|
76
|
+
response.set_header("Cache-Control", "no-cache, no-store, must-revalidate")
|
|
77
|
+
response.set_header("Expires", "0")
|
|
78
|
+
return response
|
|
79
|
+
|
|
80
|
+
|
|
73
81
|
@app.get("/")
|
|
74
82
|
@app.get("/<filename:path>")
|
|
75
83
|
def serve_static(filename="index.html"):
|
|
@@ -87,6 +95,12 @@ def main():
|
|
|
87
95
|
help="The location of the output path (filesystem path or URL)",
|
|
88
96
|
default="benchmark_output",
|
|
89
97
|
)
|
|
98
|
+
parser.add_argument(
|
|
99
|
+
"--cache-output-path",
|
|
100
|
+
type=str,
|
|
101
|
+
help="The location of the filesystem cache output folder (filesystem path or URL)",
|
|
102
|
+
default="prod_env/cache/output",
|
|
103
|
+
)
|
|
90
104
|
parser.add_argument(
|
|
91
105
|
"--suite",
|
|
92
106
|
type=str,
|
|
@@ -99,6 +113,11 @@ def main():
|
|
|
99
113
|
default=None,
|
|
100
114
|
help="Experimental: The release to serve. If unset, don't serve a release, and serve the latest suite instead.",
|
|
101
115
|
)
|
|
116
|
+
parser.add_argument(
|
|
117
|
+
"--jquery",
|
|
118
|
+
action="store_true",
|
|
119
|
+
help="Whether to serve the legacy jQuery frontend instead of the React frontend.",
|
|
120
|
+
)
|
|
102
121
|
args = parser.parse_args()
|
|
103
122
|
|
|
104
123
|
if args.suite and args.release:
|
|
@@ -107,7 +126,8 @@ def main():
|
|
|
107
126
|
# Determine the location of the static directory.
|
|
108
127
|
# This is a hack: it assumes that the static directory has a physical location,
|
|
109
128
|
# which is not always the case (e.g. when using zipimport).
|
|
110
|
-
|
|
129
|
+
static_package_name = "helm.benchmark.static" if args.jquery else "helm.benchmark.static_build"
|
|
130
|
+
resource_path = resources.files(static_package_name).joinpath("index.html")
|
|
111
131
|
with resources.as_file(resource_path) as resource_filename:
|
|
112
132
|
static_path = str(resource_filename.parent)
|
|
113
133
|
|
|
@@ -117,16 +137,19 @@ def main():
|
|
|
117
137
|
# Output path is a URL, so set the output path base URL in the frontend to that URL
|
|
118
138
|
# so that the frontend reads from that URL directly.
|
|
119
139
|
app.config["helm.outputpath"] = None
|
|
140
|
+
# TODO: figure out helm.cacheoutputpath
|
|
120
141
|
app.config["helm.outputurl"] = args.output_path
|
|
121
142
|
else:
|
|
122
143
|
# Output path is a location on disk, so set the output path base URL to /benchmark_output
|
|
123
144
|
# and then serve files from the location on disk at that URL.
|
|
124
145
|
app.config["helm.outputpath"] = path.abspath(args.output_path)
|
|
146
|
+
app.config["helm.cacheoutputpath"] = path.abspath(args.cache_output_path)
|
|
125
147
|
app.config["helm.outputurl"] = "benchmark_output"
|
|
126
148
|
|
|
127
149
|
app.config["helm.suite"] = args.suite or "latest"
|
|
128
150
|
app.config["helm.release"] = args.release
|
|
129
151
|
|
|
152
|
+
print(f"After the web server has started, go to http://localhost:{args.port} to view your website.\n")
|
|
130
153
|
app.run(host="0.0.0.0", port=args.port)
|
|
131
154
|
|
|
132
155
|
|
helm/benchmark/slurm_runner.py
CHANGED
|
@@ -10,6 +10,10 @@ import sys
|
|
|
10
10
|
|
|
11
11
|
from helm.common.codec import from_json, to_json
|
|
12
12
|
from helm.common.general import write
|
|
13
|
+
from helm.benchmark.config_registry import (
|
|
14
|
+
register_configs_from_directory,
|
|
15
|
+
register_builtin_configs_from_helm_package,
|
|
16
|
+
)
|
|
13
17
|
from helm.benchmark.executor import ExecutionSpec
|
|
14
18
|
from helm.benchmark.runner import Runner, RunSpec, RunnerError
|
|
15
19
|
from helm.benchmark.slurm_jobs import (
|
|
@@ -24,10 +28,11 @@ from helm.benchmark.slurm_jobs import (
|
|
|
24
28
|
from helm.common.general import ensure_directory_exists
|
|
25
29
|
from helm.common.hierarchical_logger import hlog, htrack_block
|
|
26
30
|
|
|
31
|
+
from helm.benchmark.runner_config_registry import RUNNER_CONFIG
|
|
27
32
|
|
|
28
|
-
_DEFAULT_MAX_CONCURRENT_WORKER_SLURM_JOBS = 8
|
|
29
33
|
_MAX_CONCURRENT_WORKER_SLURM_JOBS_ENV_NAME = "HELM_MAX_CONCURRENT_WORKER_SLURM_JOBS"
|
|
30
34
|
_SLURM_NODE_NAMES_ENV_NAME = "HELM_SLURM_NODE_NAMES"
|
|
35
|
+
_DEFAULT_MAX_CONCURRENT_WORKER_SLURM = 8
|
|
31
36
|
|
|
32
37
|
|
|
33
38
|
@dataclass
|
|
@@ -89,12 +94,15 @@ class SlurmRunner(Runner):
|
|
|
89
94
|
self.slurm_runner_spec_path = os.path.join(self.slurm_base_dir, "slurm_runner_spec.json")
|
|
90
95
|
|
|
91
96
|
# Configure max concurrent worker Slurm jobs from the environment variable.
|
|
92
|
-
# TODO: Read from a configuration file instead
|
|
93
97
|
env_max_concurrent_worker_slurm_jobs = os.getenv(_MAX_CONCURRENT_WORKER_SLURM_JOBS_ENV_NAME)
|
|
94
98
|
self.max_concurrent_worker_slurm_jobs = (
|
|
95
99
|
int(env_max_concurrent_worker_slurm_jobs)
|
|
96
100
|
if env_max_concurrent_worker_slurm_jobs
|
|
97
|
-
else
|
|
101
|
+
else (
|
|
102
|
+
RUNNER_CONFIG.helm_max_concurrent_workers
|
|
103
|
+
if RUNNER_CONFIG.helm_max_concurrent_workers > 0
|
|
104
|
+
else _DEFAULT_MAX_CONCURRENT_WORKER_SLURM
|
|
105
|
+
)
|
|
98
106
|
)
|
|
99
107
|
|
|
100
108
|
def run_all(self, run_specs: List[RunSpec]):
|
|
@@ -222,8 +230,7 @@ class SlurmRunner(Runner):
|
|
|
222
230
|
break
|
|
223
231
|
|
|
224
232
|
# Refresh every minute
|
|
225
|
-
|
|
226
|
-
time.sleep(60)
|
|
233
|
+
time.sleep(RUNNER_CONFIG.slurm_monitor_interval)
|
|
227
234
|
finally:
|
|
228
235
|
# Cleanup by cancelling all jobs during program termination or if an exception is raised.
|
|
229
236
|
cancel_all_jobs()
|
|
@@ -261,34 +268,48 @@ class SlurmRunner(Runner):
|
|
|
261
268
|
run_spec_path,
|
|
262
269
|
]
|
|
263
270
|
)
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
271
|
+
if RUNNER_CONFIG.slurm_args is None:
|
|
272
|
+
raw_slurm_args: Dict[str, str] = {
|
|
273
|
+
"account": "nlp",
|
|
274
|
+
"cpus_per_task": "4",
|
|
275
|
+
"mem": "32G",
|
|
276
|
+
"gres": "gpu:0",
|
|
277
|
+
"open_mode": "append",
|
|
278
|
+
"partition": "john",
|
|
279
|
+
"time": "14-0", # Deadline of 14 days
|
|
280
|
+
"mail_type": "FAIL",
|
|
281
|
+
"job_name": run_name,
|
|
282
|
+
"output": log_path,
|
|
283
|
+
"chdir": os.getcwd(),
|
|
284
|
+
}
|
|
285
|
+
# TODO: Move resource requirements into RunSpec.
|
|
286
|
+
slurm_node_names = os.getenv(_SLURM_NODE_NAMES_ENV_NAME)
|
|
287
|
+
if run_spec.name.startswith("msmarco:"):
|
|
288
|
+
raw_slurm_args["mem"] = "64G"
|
|
289
|
+
if "device=cuda" in run_spec.name:
|
|
290
|
+
raw_slurm_args["gres"] = "gpu:1"
|
|
291
|
+
raw_slurm_args["partition"] = "jag-hi"
|
|
292
|
+
if "model=huggingface" in run_spec.name:
|
|
293
|
+
raw_slurm_args["gres"] = "gpu:1"
|
|
294
|
+
raw_slurm_args["partition"] = "sphinx"
|
|
295
|
+
if not slurm_node_names or "sphinx" not in slurm_node_names:
|
|
296
|
+
raise Exception(
|
|
297
|
+
f"Environment variable {_SLURM_NODE_NAMES_ENV_NAME} must be set to sphinx node names"
|
|
298
|
+
)
|
|
299
|
+
if slurm_node_names:
|
|
300
|
+
raw_slurm_args["nodelist"] = slurm_node_names
|
|
301
|
+
|
|
302
|
+
else:
|
|
303
|
+
raw_slurm_args = RUNNER_CONFIG.slurm_args
|
|
304
|
+
|
|
305
|
+
dynamic_slurm_args = {
|
|
306
|
+
"job_name": run_name,
|
|
307
|
+
"output": log_path,
|
|
308
|
+
"chdir": os.getcwd(),
|
|
309
|
+
}
|
|
310
|
+
|
|
311
|
+
# User should not set these manually, overwrite them if necessary
|
|
312
|
+
raw_slurm_args.update(dynamic_slurm_args)
|
|
292
313
|
|
|
293
314
|
slurm_args: Dict[str, str] = {key: shlex.quote(value) for key, value in raw_slurm_args.items()}
|
|
294
315
|
# Uncomment this to get notification emails from Slurm for Slurm worker jobs.
|
|
@@ -300,27 +321,15 @@ class SlurmRunner(Runner):
|
|
|
300
321
|
return slurm_job_id
|
|
301
322
|
|
|
302
323
|
|
|
303
|
-
def run_as_worker(slurm_runner_spec_path: str, run_spec_path: str):
|
|
304
|
-
"""Deserialize SlurmRunner and RunSpec from the given files, then run the RunSpec with the SlurmRunner.
|
|
305
|
-
|
|
306
|
-
Used by the worker Slurm jobs only."""
|
|
307
|
-
with open(slurm_runner_spec_path, "r") as f:
|
|
308
|
-
slurm_runner_spec = from_json(f.read(), SlurmRunnerSpec)
|
|
309
|
-
with open(run_spec_path, "r") as f:
|
|
310
|
-
run_spec = from_json(f.read(), RunSpec)
|
|
311
|
-
slurm_runner = SlurmRunner(**slurm_runner_spec.to_kwargs())
|
|
312
|
-
slurm_runner.run_one(run_spec)
|
|
313
|
-
|
|
314
|
-
|
|
315
324
|
def main():
|
|
316
325
|
"""Entry point for the SlurmRunner's worker Slurm jobs that run a single RunSpec.
|
|
317
326
|
|
|
318
327
|
This entry point should only be used by SlurmRunner. Users should use `helm-run` instead.
|
|
319
328
|
SlurmRunner has to use this entry point instead of helm-run because there is no way to
|
|
320
329
|
specify the worker Slurm job parameters through `helm-run`. In particular, there is no way
|
|
321
|
-
to run a specific `RunSpec` using the `--run-
|
|
322
|
-
`run-
|
|
323
|
-
convert a `RunSpec` into a `
|
|
330
|
+
to run a specific `RunSpec` using the `--run-entries` parameter of `helm-run`, because the
|
|
331
|
+
`run-entries` argument contains `RunEntry` description (not `RunSpec`s), and there is no way to
|
|
332
|
+
convert a `RunSpec` into a `RunEntry` description."""
|
|
324
333
|
parser = argparse.ArgumentParser()
|
|
325
334
|
parser.add_argument(
|
|
326
335
|
"--slurm-runner-spec-path",
|
|
@@ -335,7 +344,19 @@ def main():
|
|
|
335
344
|
required=True,
|
|
336
345
|
)
|
|
337
346
|
args = parser.parse_args()
|
|
338
|
-
|
|
347
|
+
|
|
348
|
+
# Deserialize SlurmRunner and RunSpec from the given files, then run the RunSpec with the SlurmRunner.
|
|
349
|
+
with open(args.slurm_runner_spec_path, "r") as f:
|
|
350
|
+
slurm_runner_spec = from_json(f.read(), SlurmRunnerSpec)
|
|
351
|
+
with open(args.run_spec_path, "r") as f:
|
|
352
|
+
run_spec = from_json(f.read(), RunSpec)
|
|
353
|
+
|
|
354
|
+
register_builtin_configs_from_helm_package()
|
|
355
|
+
if slurm_runner_spec.execution_spec.local_path is not None:
|
|
356
|
+
register_configs_from_directory(slurm_runner_spec.execution_spec.local_path)
|
|
357
|
+
|
|
358
|
+
slurm_runner = SlurmRunner(**slurm_runner_spec.to_kwargs())
|
|
359
|
+
slurm_runner.run_one(run_spec)
|
|
339
360
|
|
|
340
361
|
|
|
341
362
|
if __name__ == "__main__":
|