crfm-helm 0.3.0-py3-none-any.whl → 0.5.0-py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
- crfm_helm-0.5.0.dist-info/RECORD +642 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +37 -2
- helm/benchmark/adaptation/adapters/adapter.py +4 -42
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/prompt.py +7 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/cleva_perturbation.py +7 -14
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
- helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +2 -2
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +0 -7
- helm/benchmark/augmentations/perturbation.py +20 -7
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +2 -2
- helm/benchmark/augmentations/test_perturbation.py +11 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +38 -0
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +37 -7
- helm/benchmark/metrics/basic_metrics.py +172 -641
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics.py +4 -3
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +6 -112
- helm/benchmark/metrics/dry_run_metrics.py +5 -3
- helm/benchmark/metrics/efficiency_metrics.py +206 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +5 -5
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +6 -7
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +8 -8
- helm/benchmark/metrics/test_classification_metrics.py +9 -6
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +164 -41
- helm/benchmark/model_metadata_registry.py +181 -35
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +50 -17
- helm/benchmark/presentation/schema.py +28 -46
- helm/benchmark/presentation/summarize.py +213 -96
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +14 -9
- helm/benchmark/presentation/test_summarize.py +5 -0
- helm/benchmark/run.py +66 -54
- helm/benchmark/run_expander.py +342 -31
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +162 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +501 -0
- helm/benchmark/runner.py +116 -69
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/cleva_scenario.py +43 -46
- helm/benchmark/scenarios/code_scenario.py +3 -2
- helm/benchmark/scenarios/commonsense_scenario.py +171 -191
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/legalbench_scenario.py +123 -0
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +3 -3
- helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
- helm/benchmark/scenarios/raft_scenario.py +2 -6
- helm/benchmark/scenarios/scenario.py +14 -2
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +22 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/the_pile_scenario.py +6 -7
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
- helm/benchmark/server.py +59 -2
- helm/benchmark/slurm_jobs.py +12 -0
- helm/benchmark/slurm_runner.py +79 -51
- helm/benchmark/static/benchmarking.js +3 -4
- helm/benchmark/static/contamination.yaml +1 -1
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +824 -0
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vlm.yaml +576 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
- helm/benchmark/static_build/assets/index-d839df55.js +9 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_model_deployment_definition.py +90 -0
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/tokenizer_config_registry.py +10 -14
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -35
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/no_decoding_window_service.py +32 -0
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +24 -269
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +5 -12
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +213 -24
- helm/clients/auto_client.py +215 -0
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +67 -55
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +6 -17
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +7 -8
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +6 -10
- helm/{proxy/clients → clients}/huggingface_client.py +134 -92
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
- helm/{proxy/clients → clients}/megatron_client.py +13 -7
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +302 -0
- helm/{proxy/clients → clients}/palmyra_client.py +15 -12
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +15 -15
- helm/clients/test_client.py +100 -0
- helm/clients/test_huggingface_client.py +70 -0
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +23 -12
- helm/{proxy/clients → clients}/together_client.py +18 -71
- helm/clients/vertexai_client.py +391 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vlm_client.py +104 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +24 -179
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/concurrency.py +32 -0
- helm/common/credentials_utils.py +28 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +29 -10
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +24 -1
- helm/common/key_value_store.py +113 -0
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +88 -0
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +2 -2
- helm/common/request.py +36 -27
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +6 -3
- helm/config/__init__.py +0 -0
- helm/config/model_deployments.yaml +1942 -0
- helm/config/model_metadata.yaml +2201 -0
- helm/config/tokenizer_configs.yaml +362 -0
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +13 -5
- helm/proxy/example_queries.py +29 -17
- helm/proxy/retry.py +8 -2
- helm/proxy/server.py +77 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +103 -20
- helm/proxy/services/service.py +34 -2
- helm/proxy/services/test_remote_service.py +7 -6
- helm/proxy/services/test_service.py +27 -18
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/py.typed +0 -0
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
- helm/tokenizers/auto_tokenizer.py +93 -0
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_anthropic_tokenizer.py +82 -0
- helm/tokenizers/test_huggingface_tokenizer.py +136 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/tokenizers/vertexai_tokenizer.py +97 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.3.0.dist-info/RECORD +0 -396
- helm/benchmark/vlm_run_specs.py +0 -71
- helm/benchmark/window_services/anthropic_window_service.py +0 -68
- helm/benchmark/window_services/bloom_window_service.py +0 -35
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/gptj_window_service.py +0 -38
- helm/benchmark/window_services/gptneox_window_service.py +0 -41
- helm/benchmark/window_services/http_model_window_service.py +0 -28
- helm/benchmark/window_services/huggingface_window_service.py +0 -59
- helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
- helm/benchmark/window_services/llama_window_service.py +0 -28
- helm/benchmark/window_services/luminous_window_service.py +0 -67
- helm/benchmark/window_services/megatron_window_service.py +0 -10
- helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
- helm/benchmark/window_services/openai_window_service.py +0 -13
- helm/benchmark/window_services/opt_window_service.py +0 -35
- helm/benchmark/window_services/palmyra_window_service.py +0 -45
- helm/benchmark/window_services/remote_window_service.py +0 -48
- helm/benchmark/window_services/santacoder_window_service.py +0 -27
- helm/benchmark/window_services/starcoder_window_service.py +0 -27
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/benchmark/window_services/wider_openai_window_service.py +0 -52
- helm/proxy/clients/aleph_alpha_client.py +0 -99
- helm/proxy/clients/auto_client.py +0 -461
- helm/proxy/clients/goose_ai_client.py +0 -100
- helm/proxy/clients/microsoft_client.py +0 -182
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/remote_model_registry.py +0 -28
- helm/proxy/clients/simple_client.py +0 -61
- helm/proxy/clients/test_anthropic_client.py +0 -63
- helm/proxy/clients/test_client.py +0 -31
- helm/proxy/clients/test_huggingface_client.py +0 -87
- helm/proxy/models.py +0 -963
- helm/proxy/test_models.py +0 -27
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
- helm/proxy/token_counters/test_openai_token_counter.py +0 -79
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
helm/benchmark/runner.py
CHANGED

@@ -6,70 +6,69 @@ import traceback
 import typing
 from collections import Counter
 import dataclasses
-from dataclasses import dataclass, field
 from typing import Any, Dict, List
+import numpy as np

 from tqdm import tqdm

+from helm.benchmark.adaptation.request_state import RequestState
 from helm.common.general import ensure_directory_exists, write, asdict_without_nones
 from helm.common.hierarchical_logger import hlog, htrack_block
 from helm.common.cache import cache_stats
-from .… [13 relative-import lines removed (old lines 17-29); their bodies are truncated in the source rendering]
+from helm.benchmark.scenarios.scenario import (
+    EVAL_SPLITS,
+    TRAIN_SPLIT,
+    Scenario,
+    create_scenario,
+    Instance,
+    get_scenario_cache_path,
+    with_instance_ids,
+)
+from helm.benchmark.adaptation.adapters.adapter import Adapter
+from helm.benchmark.adaptation.adapters.adapter_factory import AdapterFactory
+from helm.benchmark.adaptation.scenario_state import ScenarioState
+from helm.benchmark.run_spec import RunSpec
+from helm.benchmark.data_preprocessor import DataPreprocessor
+from helm.benchmark.executor import ExecutionSpec, Executor
+from helm.benchmark.annotation_executor import AnnotationExecutionSpec, AnnotationExecutor
+from helm.benchmark.metrics.dry_run_metrics import DryRunMetric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats, create_metric, Stat
+from helm.benchmark.window_services.tokenizer_service import TokenizerService


 LATEST_SYMLINK: str = "latest"
+_BENCHMARK_OUTPUT_PATH: str = "benchmark_output"
+_CACHED_MODELS_FOLDER: str = "models"


-… [old lines 35-72 removed: the module's `RunnerError` class and the `RunSpec` dataclass; surviving fragments include the docstring line "computes a list of stats based on the defined metrics.", the fields `name: str` ("Unique identifier of the RunSpec"), `adapter_spec: AdapterSpec` ("Specifies how to adapt an instance into a set of requests"), and `data_augmenter_spec: DataAugmenterSpec = DataAugmenterSpec()`, plus a `__post_init__` marked "TODO: Don't mutate name!" that ended in `object.__setattr__(self, "name", self.name.replace(os.path.sep, "_"))`. `RunSpec` now lives in helm/benchmark/run_spec.py.]
+def get_benchmark_output_path() -> str:
+    """Get the benchmark output path.
+
+    Many run spec functions need to know the benchmark output path,
+    but there is no way to pass it via the run spec function,
+    so instead the run spec function should read this global variable."""
+    return _BENCHMARK_OUTPUT_PATH
+
+
+def get_cached_models_path() -> str:
+    """Get the cached models path within the benchmark output path."""
+    path: str = os.path.join(get_benchmark_output_path(), _CACHED_MODELS_FOLDER)
+    ensure_directory_exists(path)
+    return path
+
+
+def set_benchmark_output_path(benchmark_output_path: str) -> None:
+    """Set the benchmark output path."""
+    global _BENCHMARK_OUTPUT_PATH
+    _BENCHMARK_OUTPUT_PATH = benchmark_output_path
+
+
+class RunnerError(Exception):
+    """Error that happens in the Runner."""
+
+    pass


 def remove_stats_nans(stats: List[Stat]) -> List[Stat]:
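The new module-level getter/setter pair replaces threading the output path through every run spec function: callers set it once at startup, and downstream code reads the global. A minimal usage sketch; the helper at the end is hypothetical, for illustration only:

import os

from helm.benchmark.runner import get_benchmark_output_path, set_benchmark_output_path

# Done once at startup, e.g. from a CLI --output-path flag:
set_benchmark_output_path("my_benchmark_output")

# Later, any run spec function can read the path without it being passed in:
def my_scenario_cache_dir() -> str:  # hypothetical helper
    return os.path.join(get_benchmark_output_path(), "scenarios", "my_scenario")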
@@ -103,6 +102,40 @@ def remove_per_instance_stats_nans(per_instance_stats_list: List[PerInstanceStats]):
     return result


+def downsample_eval_instances(
+    instances: List[Instance], max_eval_instances: int, eval_splits: List[str]
+) -> List[Instance]:
+    """
+    Get the instances necessary for this run:
+    Train instances (split=train): keep all (if any) for in-context learning
+    Eval instances (split=valid or test): keep at most `max_eval_instances` specified in `AdapterSpec` by sampling
+    Return the resulting train and eval instances.
+    """
+    all_train_instances: List[Instance] = [instance for instance in instances if instance.split == TRAIN_SPLIT]
+
+    all_eval_instances: List[Instance] = [instance for instance in instances if instance.split in eval_splits]
+    if len(all_eval_instances) > max_eval_instances:
+        # The random sampling includes instances monotonically.
+        np.random.seed(0)
+        selected_eval_instances = list(
+            np.random.choice(
+                all_eval_instances,  # type: ignore
+                max_eval_instances,
+                replace=False,
+            )
+        )
+    else:
+        selected_eval_instances = all_eval_instances
+
+    hlog(
+        f"{len(instances)} instances, "
+        f"{len(all_train_instances)} train instances, "
+        f"{len(selected_eval_instances)}/{len(all_eval_instances)} eval instances"
+    )
+
+    return all_train_instances + selected_eval_instances
+
+
 class Runner:
     """
     The main entry point for running the entire benchmark. Mostly just

@@ -121,6 +154,15 @@ class Runner:
         exit_on_error: bool,
     ):
         self.executor = Executor(execution_spec)
+        self.annotator_executor = AnnotationExecutor(
+            AnnotationExecutionSpec(
+                local_path=execution_spec.local_path if execution_spec.local_path is not None else "",
+                parallelism=execution_spec.parallelism,
+                dry_run=execution_spec.dry_run,
+                sqlite_cache_backend_config=execution_spec.sqlite_cache_backend_config,
+                mongo_cache_backend_config=execution_spec.mongo_cache_backend_config,
+            )
+        )
         self.dry_run: bool = execution_spec.dry_run
         self.tokenizer_service = TokenizerService(self.executor.service, execution_spec.auth)
         self.metric_service = MetricService(self.executor.service, execution_spec.auth)

@@ -131,9 +173,8 @@ class Runner:
         self.exit_on_error: bool = exit_on_error

         ensure_directory_exists(output_path)
-… [two truncated lines (old 134-135) that set up `self.scenarios_path`]
-        ensure_directory_exists(self.scenarios_path)
+        self.output_path = output_path
+
         # Decide where to save input instances
         self.instances_path: str = os.path.join(output_path, "scenario_instances")
         ensure_directory_exists(self.instances_path)

@@ -145,11 +186,13 @@ class Runner:
         self.eval_cache_path: str = os.path.join(self.runs_path, "eval_cache")
         ensure_directory_exists(self.eval_cache_path)

-    def … [truncated; the old `_is_run_completed` took the run spec itself]
+    def _get_run_path(self, run_spec: RunSpec) -> str:
+        return os.path.join(self.runs_path, run_spec.name)
+
+    def _is_run_completed(self, run_path: str):
         """Return whether the run was previously completed.

         A run is completed if all of the expected output files exist."""
-        run_path: str = os.path.join(self.runs_path, run_spec.name)
         if not os.path.isdir(run_path):
             return False
         output_paths = [

@@ -182,31 +225,21 @@ class Runner:
             raise RunnerError(f"Failed runs: [{failed_runs_str}]")

     def run_one(self, run_spec: RunSpec):
+        run_path: str = self._get_run_path(run_spec)
+        if self.skip_completed_runs and self._is_run_completed(run_path):
+            hlog(f"Skipping run {run_spec.name} because run is completed and all output files exist.")
+            return
+        ensure_directory_exists(run_path)
+
         # Load the scenario
         scenario: Scenario = create_scenario(run_spec.scenario_spec)

-        # This `output_path` will be used when `Adapter` calls `Scenario.get_instances`.
-        scenario_output_path = os.path.join(self.scenarios_path, scenario.name)
-        ensure_directory_exists(scenario_output_path)
-
         # This 'output_path' will be used when the model's input instances are saved.
         args_str = ",".join([f"{k}={v}" for k, v in sorted(run_spec.scenario_spec.args.items())])
         scenario_name_with_args = f"{scenario.name}:{args_str}" if args_str else f"{scenario.name}"
         input_instances_output_path = os.path.join(self.instances_path, scenario_name_with_args)
         input_instances_file_path = os.path.join(input_instances_output_path, "input_instances.json")

-        run_path: str = os.path.join(self.runs_path, run_spec.name)
-        ensure_directory_exists(run_path)
-
-        if self.skip_completed_runs and self._is_run_completed(run_spec):
-            # If scenario_state.json exists, assume that all other output files exist
-            # because scenario_state.json is the last output file to be written.
-            hlog(f"Skipping run {run_spec.name} because run is completed and all output files exist.")
-            return
-
-        # Fetch and initialize the Adapter based on the `AdapterSpec`.
-        adapter: Adapter = AdapterFactory.get_adapter(run_spec.adapter_spec, self.tokenizer_service)
-
         instances: List[Instance]
         if self.skip_instances:
             instances = []

@@ -217,6 +250,7 @@ class Runner:
             instances = [dacite.from_dict(Instance, instance) for instance in json_instances]
         else:
             # Create the instances of the scenario
+            scenario_output_path = get_scenario_cache_path(self.output_path, scenario.name)
             with htrack_block("scenario.get_instances"):
                 instances = scenario.get_instances(scenario_output_path)
         if self.cache_instances and not os.path.exists(input_instances_file_path):

@@ -230,10 +264,14 @@ class Runner:
             return  # Exit after saving the instances.

         # Give each instance a unique ID
-… [one truncated line (old 233)]
+        if any([instance.id is None for instance in instances]):
+            instances = with_instance_ids(instances)

         # Get the instances necessary for this run.
-… [one truncated line (old 236)]
+        max_eval_instances = run_spec.adapter_spec.max_eval_instances
+        eval_splits = run_spec.adapter_spec.eval_splits or EVAL_SPLITS
+        if max_eval_instances is not None:
+            instances = downsample_eval_instances(instances, max_eval_instances, eval_splits)

         # Data preprocessing
         instances = DataPreprocessor(run_spec.data_augmenter_spec).preprocess(

@@ -241,15 +279,24 @@ class Runner:
         )

         # Adapt (convert to requests)
-… [one truncated line (old 244)]
+        adapter: Adapter = AdapterFactory.get_adapter(run_spec.adapter_spec, self.tokenizer_service)
+        request_states: List[RequestState] = adapter.adapt(instances, self.executor.execution_spec.parallelism)
+        scenario_state: ScenarioState = ScenarioState(
+            adapter_spec=run_spec.adapter_spec,
+            request_states=request_states,
+            annotator_specs=run_spec.annotators,
+        )

         # Execute (fill up results)
         scenario_state = self.executor.execute(scenario_state)

+        # Annotate (post-process the results)
+        scenario_state = self.annotator_executor.execute(scenario_state)
+
         # Apply the metrics
         # When performing a dry run, only estimate the number of tokens instead
         # of calculating the metrics.
-        metrics: List[… [truncated]
+        metrics: List[MetricInterface] = (
             [DryRunMetric()] if self.dry_run else [create_metric(metric_spec) for metric_spec in run_spec.metric_specs]
         )
         stats: List[Stat] = []
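`downsample_eval_instances` keeps every train instance (needed for in-context learning) and samples the eval instances with a fixed seed, so repeated runs of the same spec evaluate the same subset. A standalone sketch of that behavior, using (text, split) tuples in place of HELM `Instance` objects:

from typing import List, Tuple

import numpy as np

TRAIN_SPLIT = "train"
EVAL_SPLITS = ["valid", "test"]

def downsample(instances: List[Tuple[str, str]], max_eval: int) -> List[Tuple[str, str]]:
    """Keep all train items; sample at most `max_eval` eval items with a fixed seed."""
    train = [x for x in instances if x[1] == TRAIN_SPLIT]
    evals = [x for x in instances if x[1] in EVAL_SPLITS]
    if len(evals) > max_eval:
        np.random.seed(0)  # same seed on every call, so the same subset is chosen on every run
        chosen = np.random.choice(len(evals), max_eval, replace=False)
        evals = [evals[i] for i in sorted(chosen)]
    return train + evals

items = [(f"question {i}", "test") for i in range(100)] + [("demo", "train")]
assert downsample(items, 10) == downsample(items, 10)  # deterministic across calls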
helm/benchmark/runner_config_registry.py
ADDED

@@ -0,0 +1,21 @@
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+import dacite
+import yaml
+
+
+@dataclass
+class RunnerConfigSpec:
+    helm_max_concurrent_workers: int = -1
+    slurm_monitor_interval: int = 60
+    slurm_args: Optional[Dict[str, Any]] = None
+
+
+RUNNER_CONFIG = RunnerConfigSpec()
+
+
+def register_runner_config_from_path(dir_path: str) -> None:
+    global RUNNER_CONFIG
+    with open(dir_path, "r") as f:
+        raw = yaml.safe_load(f)
+    RUNNER_CONFIG = dacite.from_dict(RunnerConfigSpec, raw)
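A sketch of how the new registry is plausibly consumed; the YAML file name and values below are illustrative assumptions, not from the package. Note that `register_runner_config_from_path` rebinds the module-level `RUNNER_CONFIG`, so read it through the module rather than via a `from ... import RUNNER_CONFIG` done before registration:

# runner_config.yaml (hypothetical contents):
#   helm_max_concurrent_workers: 8
#   slurm_monitor_interval: 30
import helm.benchmark.runner_config_registry as runner_config_registry

runner_config_registry.register_runner_config_from_path("runner_config.yaml")
print(runner_config_registry.RUNNER_CONFIG.helm_max_concurrent_workers)  # 8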
helm/benchmark/scenarios/bbq_scenario.py
CHANGED

@@ -123,7 +123,7 @@ class BBQScenario(Scenario):
         # trying to book a cab on Uber.", "question": "Who was not comfortable using the phone?", "ans0":
         # "The grandfather", "ans1": "The grandson", "ans2": "Can't be determined", "label": 2}

-        for … [old loop header truncated in the source rendering]
+        for idx, (loaded_line, category) in enumerate(loaded_lines):
             is_negative: bool = loaded_line["question_polarity"] == "neg"
             is_ambiguous: bool = loaded_line["context_condition"] == "ambig"
             context: str = loaded_line["context"]
helm/benchmark/scenarios/bold_scenario.py
CHANGED

@@ -98,7 +98,7 @@ class BOLDScenario(Scenario):
         # with https://raw.githubusercontent.com/amazon-
         # research/bold/main/prompts/political_ideology_prompt.json as an example:
         # {"left-wing": {"left-wing_politics": }}
-        for … [old loop header truncated in the source rendering]
+        for prompt_category, prompt_topic in prompt_topics_dict.items():
             for prompt_text in prompt_topic.values():
                 prompt_text = prompt_text[0]  # stored as a list containing a single string
                 prompt_tuples.append((prompt_category, prompt_text))

@@ -108,7 +108,7 @@ class BOLDScenario(Scenario):

         split_sizes = {"train": len(instances) - DEFAULT_TEST_SIZE, "test": DEFAULT_TEST_SIZE}

-        for … [old loop header truncated in the source rendering]
+        for idx, prompt_tuple in enumerate(prompt_tuples):
             prompt_category, prompt_text = prompt_tuple
             curr_split = TRAIN_SPLIT
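In both scenarios the rewritten loop headers surface a position via `enumerate`, which downstream code can use to assign splits (the BOLD hunk sets `curr_split` right after computing `split_sizes`). A compact sketch of that pattern with made-up data; the threshold logic is an assumption inferred from the visible `split_sizes` dict, not confirmed by the diff:

TRAIN_SPLIT, TEST_SPLIT = "train", "test"
DEFAULT_TEST_SIZE = 3

prompt_tuples = [("politics", f"prompt {i}") for i in range(10)]
split_sizes = {"train": len(prompt_tuples) - DEFAULT_TEST_SIZE, "test": DEFAULT_TEST_SIZE}

for idx, (prompt_category, prompt_text) in enumerate(prompt_tuples):
    # First `train`-many items land in the train split, the rest in test.
    curr_split = TRAIN_SPLIT if idx < split_sizes["train"] else TEST_SPLIT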
helm/benchmark/scenarios/cleva_scenario.py
CHANGED

@@ -10,14 +10,29 @@ from helm.benchmark.adaptation.adapters.adapter_factory import (
     ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
     ADAPT_GENERATION,
 )
-from helm.… [line truncated in the source rendering]
+from helm.benchmark.runner import get_benchmark_output_path
+from helm.common.general import (
+    assert_is_str,
+    assert_is_str_list,
+    ensure_file_downloaded,
+    ensure_directory_exists,
+)
 from helm.common.hierarchical_logger import hlog
-from .scenario import … [line truncated in the source rendering]
+from .scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+    get_scenario_cache_path,
+)
 from .code_scenario import CodeReference, CodeInstance


 CLEVA_DATA_URL = "http://39.108.215.175/data"
-CLEVA_DATA_PATH = "benchmark_output/scenarios/cleva"


 @dataclass(frozen=True)

@@ -69,26 +84,17 @@ class Converter:
         """Convert a data point in CLEVA format to a HELM instance according to a given CLEVA prompt template."""
         transformed_data = self._apply_all(copy.deepcopy(data), templates)

-        prompt… [line truncated in the source rendering]
-        assert isinstance(prompt, str)
+        prompt = assert_is_str(transformed_data["input"])
         if "choices" in transformed_data:
             # This is a multiple-choice task
-            choices… [line truncated in the source rendering]
-            # Gurantee `choices` must be `List[str]`
-            assert isinstance(choices, list)
-            for c in choices:
-                assert isinstance(c, str)
+            choices = assert_is_str_list(transformed_data["choices"])
             references: List[Reference] = [
                 Reference(Output(text=text), tags=[CORRECT_TAG] if idx in transformed_data["label"] else [])
                 for idx, text in enumerate(choices)
             ]
         else:
             # This is a generation task
-            correct_answer… [line truncated in the source rendering]
-            # Gurantee `label` must be `List[str]`
-            assert isinstance(correct_answer, list)
-            for a in correct_answer:
-                assert isinstance(a, str)
+            correct_answer = assert_is_str_list(transformed_data["label"])
             references = [Reference(Output(text=answer), tags=[CORRECT_TAG]) for answer in correct_answer]

         instance = Instance(

@@ -109,15 +115,12 @@ class Converter:
         to a HELM CodeInstance according to a given CLEVA prompt template.
         """

-… [line truncated in the source rendering]
-        data["prompt"] = templates["input"].format(**data)
-        assert isinstance(data["prompt"], str)
-        assert isinstance(data["canonical_solution"], str)
+        data["prompt"] = assert_is_str(templates["input"]).format(**data)
         instance = CodeInstance(
-            input=Input(text=data["prompt"]),
+            input=Input(text=assert_is_str(data["prompt"])),
             references=[
                 CodeReference(
-                    output=Output(text=data["canonical_solution"]),
+                    output=Output(text=assert_is_str(data["canonical_solution"])),
                     test_cases=data,
                     tags=[CORRECT_TAG],
                 )

@@ -211,27 +214,18 @@ class Converter:
                 transformed_data[k] = self._apply(data[k], template, **data)

         # We then merge all other fields into the `input`
-… [line truncated in the source rendering]
-        data["input"] = templates["input"].format(**transformed_data)
+        data["input"] = assert_is_str(templates["input"]).format(**transformed_data)
         if "choices" in data:
             # We take the corresponding choices and apply the `label` template
             # Note: we do not allow `label` template to access other fields in multi-choice tasks
             # Overwrite `choices` to the actual continuations
-            choices… [line truncated in the source rendering]
-            # Gurantee `choices` must be `List[str]`
-            assert isinstance(choices, list)
-            for c in choices:
-                assert isinstance(c, str)
+            choices = assert_is_str_list(data["choices"])
             data["choices"] = [self._apply(c, templates.get("label", None), label=c) for c in choices]
         else:
             # For generation tasks, we allow it to access to other stringified fields
             kwargs = transformed_data
             del kwargs["label"]
-            labels… [line truncated in the source rendering]
-            # Gurantee `label` must be `List[str]`
-            assert isinstance(labels, list)
-            for label in labels:
-                assert isinstance(label, str)
+            labels = assert_is_str_list(data["label"])
             data["label"] = [self._apply(x, templates.get("label", None), **kwargs, label=x) for x in labels]
         return data

@@ -402,7 +396,10 @@ class CLEVAScenario(Scenario):
         self.subtask = subtask
         self.version = version
         self.converter = Converter()
-… [line truncated in the source rendering]
+        scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), CLEVAScenario.name)
+        self.prompt_template, _ = CLEVAScenario.get_prompt_setting(
+            self.task, subtask, version, prompt_id, scenario_cache_path
+        )

     @property
     @abstractmethod

@@ -410,14 +407,14 @@ class CLEVAScenario(Scenario):
         pass

     @classmethod
-    def download_dataset(cls, task: str, version: str):
+    def download_dataset(cls, task: str, version: str, cache_dir: str):
         source_url: str = CLEVA_DATA_URL + f"/{version}/{task}.zip"
-        target_dir: str = os.path.join(… [line truncated in the source rendering]
+        target_dir: str = os.path.join(cache_dir, "data", version)
         ensure_directory_exists(target_dir)
         ensure_file_downloaded(source_url=source_url, target_path=os.path.join(target_dir, task), unpack=True)

-    def load_dataset(self) -> Dict[str, List[Dict[str, Any]]]:
-        data_dir: str = os.path.join(… [line truncated in the source rendering]
+    def load_dataset(self, cache_dir: str) -> Dict[str, List[Dict[str, Any]]]:
+        data_dir: str = os.path.join(cache_dir, "data", self.version, self.task)
         if self.subtask:
             data_dir = os.path.join(data_dir, self.subtask)

@@ -434,8 +431,8 @@ class CLEVAScenario(Scenario):
         return dataset

     @staticmethod
-    def load_prompt_templates(task: str, subtask: Optional[str], version: str) -> List[Dict[str, Any]]:
-        prompt_dir: str = os.path.join(… [line truncated in the source rendering]
+    def load_prompt_templates(task: str, subtask: Optional[str], version: str, cache_dir: str) -> List[Dict[str, Any]]:
+        prompt_dir: str = os.path.join(cache_dir, "data", version, task)
         if subtask:
             prompt_dir = os.path.join(prompt_dir, subtask)
         file_path = os.path.join(prompt_dir, "prompts.json")

@@ -448,7 +445,7 @@ class CLEVAScenario(Scenario):

     def get_instances(self, output_path: str) -> List[Instance]:
         # Download the raw data
-        dataset = self.load_dataset()
+        dataset = self.load_dataset(output_path)

         # Read all the instances
         instances: List[Instance] = []

@@ -465,9 +462,9 @@ class CLEVAScenario(Scenario):

     @classmethod
     def get_prompt_setting(
-        cls, task: str, subtask: Optional[str], version: str, prompt_id: int
+        cls, task: str, subtask: Optional[str], version: str, prompt_id: int, output_path: str
     ) -> Tuple[Dict[str, Any], PromptSetting]:
-        prompt_templates = cls.load_prompt_templates(task, subtask, version)
+        prompt_templates = cls.load_prompt_templates(task, subtask, version, output_path)
         if prompt_id >= len(prompt_templates):
             raise ValueError(
                 f"You want to use prompt template with prompt_id {prompt_id}, but there is only"

@@ -519,10 +516,10 @@ class CLEVAScenario(Scenario):

     @classmethod
     def load_inference_parameters(
-        cls, task: str, subtask: Optional[str], version: str, prompt_id: int
+        cls, task: str, subtask: Optional[str], version: str, prompt_id: int, cache_dir: str
     ) -> Dict[str, Any]:
         # We use a dict instead of dataclass to store hyperparameters such that we can set different default values
-        params_dir: str = os.path.join(… [line truncated in the source rendering]
+        params_dir: str = os.path.join(cache_dir, "data", version, task)
         if subtask:
             params_dir = os.path.join(params_dir, subtask)
         file_path = os.path.join(params_dir, "infer_params.json")

@@ -932,7 +929,7 @@ class CLEVADialogueGenerationScenario(CLEVAScenario):

     def get_instances(self, output_path: str) -> List[Instance]:
         # Download the raw data
-        dataset = self.load_dataset()
+        dataset = self.load_dataset(output_path)

         # Read all the instances
         instances: List[Instance] = []
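The repeated isinstance-assert blocks are collapsed into the `assert_is_str` / `assert_is_str_list` helpers imported from `helm.common.general`. A minimal sketch of such narrowing helpers, with bodies inferred from the asserts they replace (the actual implementations may differ):

from typing import Any, List

def assert_is_str(value: Any) -> str:
    """Narrow an arbitrary value to `str`, failing loudly otherwise."""
    assert isinstance(value, str), f"Expected str, got {type(value).__name__}"
    return value

def assert_is_str_list(value: Any) -> List[str]:
    """Narrow an arbitrary value to `List[str]`."""
    assert isinstance(value, list), f"Expected list, got {type(value).__name__}"
    for item in value:
        assert isinstance(item, str), f"Expected str item, got {type(item).__name__}"
    return value

# One call now replaces a five-line isinstance block from the old code:
choices = assert_is_str_list(["choice A", "choice B"])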
helm/benchmark/scenarios/code_scenario.py
CHANGED

@@ -48,6 +48,7 @@ APPS is a benchmark for code generation from natural language specifications.
 Each instance has 1) a problem description with examples (as what you get in
 programming competitions), 2) coding solutions, 3) test cases.
 """
+
 import io
 import json
 import os

@@ -139,8 +140,8 @@ def _read_and_preprocess_apps(target_path: str) -> List[CodeInstance]:
     # only if the version of Python has a default limit.
     #
     # See: https://docs.python.org/3/library/stdtypes.html#int-max-str-digits
-    if hasattr(sys, "set_int_max_str_digits"):
-        sys.set_int_max_str_digits(100000)
+    if hasattr(sys, "set_int_max_str_digits"):
+        sys.set_int_max_str_digits(100000)

     SINGLE_STR_LIMIT = 150000  # From original codebase.
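The guarded call raises CPython's integer-to-string conversion limit, a denial-of-service mitigation (default 4300 digits) added in recent CPython releases; APPS test cases can contain integers far larger than that. A small illustration of why the guard matters:

import sys

# Older interpreters have no limit and no setter, hence the hasattr guard.
if hasattr(sys, "set_int_max_str_digits"):
    sys.set_int_max_str_digits(100000)

big = 10 ** 20000      # constructing the int is always fine
text = str(big)        # without the raised limit, limited interpreters raise ValueError here
print(len(text))       # 20001 digits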