crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
- crfm_helm-0.5.0.dist-info/RECORD +642 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +37 -2
- helm/benchmark/adaptation/adapters/adapter.py +4 -42
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/prompt.py +7 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/cleva_perturbation.py +7 -14
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
- helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +2 -2
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +0 -7
- helm/benchmark/augmentations/perturbation.py +20 -7
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +2 -2
- helm/benchmark/augmentations/test_perturbation.py +11 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +38 -0
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +37 -7
- helm/benchmark/metrics/basic_metrics.py +172 -641
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics.py +4 -3
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +6 -112
- helm/benchmark/metrics/dry_run_metrics.py +5 -3
- helm/benchmark/metrics/efficiency_metrics.py +206 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +5 -5
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +6 -7
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +8 -8
- helm/benchmark/metrics/test_classification_metrics.py +9 -6
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +164 -41
- helm/benchmark/model_metadata_registry.py +181 -35
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +50 -17
- helm/benchmark/presentation/schema.py +28 -46
- helm/benchmark/presentation/summarize.py +213 -96
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +14 -9
- helm/benchmark/presentation/test_summarize.py +5 -0
- helm/benchmark/run.py +66 -54
- helm/benchmark/run_expander.py +342 -31
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +162 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +501 -0
- helm/benchmark/runner.py +116 -69
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/cleva_scenario.py +43 -46
- helm/benchmark/scenarios/code_scenario.py +3 -2
- helm/benchmark/scenarios/commonsense_scenario.py +171 -191
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/legalbench_scenario.py +123 -0
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +3 -3
- helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
- helm/benchmark/scenarios/raft_scenario.py +2 -6
- helm/benchmark/scenarios/scenario.py +14 -2
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +22 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/the_pile_scenario.py +6 -7
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
- helm/benchmark/server.py +59 -2
- helm/benchmark/slurm_jobs.py +12 -0
- helm/benchmark/slurm_runner.py +79 -51
- helm/benchmark/static/benchmarking.js +3 -4
- helm/benchmark/static/contamination.yaml +1 -1
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +824 -0
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vlm.yaml +576 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
- helm/benchmark/static_build/assets/index-d839df55.js +9 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_model_deployment_definition.py +90 -0
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/tokenizer_config_registry.py +10 -14
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -35
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/no_decoding_window_service.py +32 -0
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +24 -269
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +5 -12
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +213 -24
- helm/clients/auto_client.py +215 -0
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +67 -55
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +6 -17
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +7 -8
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +6 -10
- helm/{proxy/clients → clients}/huggingface_client.py +134 -92
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
- helm/{proxy/clients → clients}/megatron_client.py +13 -7
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +302 -0
- helm/{proxy/clients → clients}/palmyra_client.py +15 -12
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +15 -15
- helm/clients/test_client.py +100 -0
- helm/clients/test_huggingface_client.py +70 -0
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +23 -12
- helm/{proxy/clients → clients}/together_client.py +18 -71
- helm/clients/vertexai_client.py +391 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vlm_client.py +104 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +24 -179
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/concurrency.py +32 -0
- helm/common/credentials_utils.py +28 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +29 -10
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +24 -1
- helm/common/key_value_store.py +113 -0
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +88 -0
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +2 -2
- helm/common/request.py +36 -27
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +6 -3
- helm/config/__init__.py +0 -0
- helm/config/model_deployments.yaml +1942 -0
- helm/config/model_metadata.yaml +2201 -0
- helm/config/tokenizer_configs.yaml +362 -0
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +13 -5
- helm/proxy/example_queries.py +29 -17
- helm/proxy/retry.py +8 -2
- helm/proxy/server.py +77 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +103 -20
- helm/proxy/services/service.py +34 -2
- helm/proxy/services/test_remote_service.py +7 -6
- helm/proxy/services/test_service.py +27 -18
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/py.typed +0 -0
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
- helm/tokenizers/auto_tokenizer.py +93 -0
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_anthropic_tokenizer.py +82 -0
- helm/tokenizers/test_huggingface_tokenizer.py +136 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/tokenizers/vertexai_tokenizer.py +97 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.3.0.dist-info/RECORD +0 -396
- helm/benchmark/vlm_run_specs.py +0 -71
- helm/benchmark/window_services/anthropic_window_service.py +0 -68
- helm/benchmark/window_services/bloom_window_service.py +0 -35
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/gptj_window_service.py +0 -38
- helm/benchmark/window_services/gptneox_window_service.py +0 -41
- helm/benchmark/window_services/http_model_window_service.py +0 -28
- helm/benchmark/window_services/huggingface_window_service.py +0 -59
- helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
- helm/benchmark/window_services/llama_window_service.py +0 -28
- helm/benchmark/window_services/luminous_window_service.py +0 -67
- helm/benchmark/window_services/megatron_window_service.py +0 -10
- helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
- helm/benchmark/window_services/openai_window_service.py +0 -13
- helm/benchmark/window_services/opt_window_service.py +0 -35
- helm/benchmark/window_services/palmyra_window_service.py +0 -45
- helm/benchmark/window_services/remote_window_service.py +0 -48
- helm/benchmark/window_services/santacoder_window_service.py +0 -27
- helm/benchmark/window_services/starcoder_window_service.py +0 -27
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/benchmark/window_services/wider_openai_window_service.py +0 -52
- helm/proxy/clients/aleph_alpha_client.py +0 -99
- helm/proxy/clients/auto_client.py +0 -461
- helm/proxy/clients/goose_ai_client.py +0 -100
- helm/proxy/clients/microsoft_client.py +0 -182
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/remote_model_registry.py +0 -28
- helm/proxy/clients/simple_client.py +0 -61
- helm/proxy/clients/test_anthropic_client.py +0 -63
- helm/proxy/clients/test_client.py +0 -31
- helm/proxy/clients/test_huggingface_client.py +0 -87
- helm/proxy/models.py +0 -963
- helm/proxy/test_models.py +0 -27
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
- helm/proxy/token_counters/test_openai_token_counter.py +0 -79
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
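
The `helm/{proxy/clients → clients}` and `helm/{proxy/tokenizers → tokenizers}` renames above move the client and tokenizer packages out of `helm.proxy` to the top level of the `helm` package. Below is a minimal sketch of the resulting import migration for downstream code, assuming the class names themselves are unchanged by the move (the specific classes are illustrative, not taken from this diff):

    # 0.3.0 (old paths, under helm.proxy):
    # from helm.proxy.clients.huggingface_client import HuggingFaceClient
    # from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

    # 0.5.0 (new top-level packages, per the renames listed above):
    from helm.clients.huggingface_client import HuggingFaceClient
    from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer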
|
@@ -1,798 +1,51 @@
|
|
|
1
|
-
|
|
2
|
-
import itertools
|
|
3
|
-
from functools import partial
|
|
4
|
-
from typing import Any, Callable, List, Dict, Optional, Set, TypeVar
|
|
1
|
+
"""Run spec functions for the HELM Classic leaderboard.
|
|
5
2
|
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
3
|
+
Website: https://crfm.stanford.edu/helm/classic/
|
|
4
|
+
|
|
5
|
+
If a run spec function is included in both the HELM Classic leaderboard and the
|
|
6
|
+
HELM Lite leaderboard, it will be included in the lite_run_specs module instead of this module.
|
|
7
|
+
This module also contains some scenarios that are currently not used on any HELM leaderboard."""
|
|
8
|
+
|
|
9
|
+
from typing import Any, Dict, List, Optional, Set
|
|
10
|
+
|
|
11
|
+
from helm.benchmark.adaptation.adapter_spec import (
|
|
12
|
+
ADAPT_GENERATION,
|
|
10
13
|
ADAPT_MULTIPLE_CHOICE_JOINT,
|
|
11
14
|
ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
|
|
12
|
-
ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
|
|
13
|
-
ADAPT_GENERATION,
|
|
14
15
|
ADAPT_RANKING_BINARY,
|
|
16
|
+
AdapterSpec,
|
|
15
17
|
)
|
|
16
18
|
from helm.benchmark.adaptation.adapters.binary_ranking_adapter import BinaryRankingAdapter
|
|
17
|
-
from helm.benchmark.adaptation.
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
StopRunExpander,
|
|
25
|
-
ChatMLRunExpander,
|
|
26
|
-
AddToStopRunExpander,
|
|
27
|
-
IncreaseMaxTokensRunExpander,
|
|
28
|
-
FormatPromptRunExpander,
|
|
29
|
-
IncreaseTemperatureRunExpander,
|
|
30
|
-
)
|
|
31
|
-
from .runner import RunSpec
|
|
32
|
-
from .scenarios.lex_glue_scenario import (
|
|
33
|
-
get_lex_glue_max_train_instances,
|
|
34
|
-
get_lex_glue_instructions,
|
|
35
|
-
get_lex_glue_max_tokens,
|
|
36
|
-
get_lex_glue_task_type,
|
|
19
|
+
from helm.benchmark.adaptation.common_adapter_specs import (
|
|
20
|
+
get_completion_adapter_spec,
|
|
21
|
+
get_generation_adapter_spec,
|
|
22
|
+
get_language_modeling_adapter_spec,
|
|
23
|
+
get_multiple_choice_adapter_spec,
|
|
24
|
+
get_ranking_binary_adapter_spec,
|
|
25
|
+
get_summarization_adapter_spec,
|
|
37
26
|
)
|
|
38
|
-
from .
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
27
|
+
from helm.benchmark.metrics.common_metric_specs import (
|
|
28
|
+
get_basic_metric_specs,
|
|
29
|
+
get_bias_metric_specs,
|
|
30
|
+
get_classification_metric_specs,
|
|
31
|
+
get_copyright_metric_specs,
|
|
32
|
+
get_disinformation_metric_specs,
|
|
33
|
+
get_exact_match_metric_specs,
|
|
34
|
+
get_f1_metric_specs,
|
|
35
|
+
get_generative_harms_metric_specs,
|
|
36
|
+
get_language_modeling_metric_specs,
|
|
37
|
+
get_numeracy_metric_specs,
|
|
38
|
+
get_open_ended_generation_metric_specs,
|
|
39
|
+
get_summarization_metric_specs,
|
|
40
|
+
get_basic_generation_metric_specs,
|
|
41
|
+
get_basic_reference_metric_specs,
|
|
42
|
+
get_generic_metric_specs,
|
|
49
43
|
)
|
|
50
|
-
from helm.
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
NLG_PREFIX_TAG,
|
|
56
|
-
CHATML_MODEL_TAG,
|
|
57
|
-
OPENAI_CHATGPT_MODEL_TAG,
|
|
58
|
-
BUGGY_TEMP_0_TAG,
|
|
59
|
-
)
|
|
60
|
-
from helm.common.general import singleton
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
############################################################
|
|
64
|
-
# Prototypical adapter specs
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
def format_instructions(instructions: str) -> str:
|
|
68
|
-
if len(instructions) > 0:
|
|
69
|
-
instructions += "\n"
|
|
70
|
-
return instructions
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
def get_multiple_choice_joint_adapter_spec(
|
|
74
|
-
instructions: str,
|
|
75
|
-
input_noun: Optional[str],
|
|
76
|
-
output_noun: str,
|
|
77
|
-
num_outputs: int = 5,
|
|
78
|
-
max_train_instances: int = 5,
|
|
79
|
-
max_tokens: int = 5,
|
|
80
|
-
sample_train: bool = True,
|
|
81
|
-
**kwargs,
|
|
82
|
-
) -> AdapterSpec:
|
|
83
|
-
"""
|
|
84
|
-
[instructions]
|
|
85
|
-
|
|
86
|
-
[input_noun]: [input]
|
|
87
|
-
[reference_1]
|
|
88
|
-
...
|
|
89
|
-
[reference_k]
|
|
90
|
-
[output_noun]: [output]
|
|
91
|
-
|
|
92
|
-
[input_noun]: [input]
|
|
93
|
-
[reference_1]
|
|
94
|
-
...
|
|
95
|
-
[reference_k]
|
|
96
|
-
[output_noun]:
|
|
97
|
-
"""
|
|
98
|
-
|
|
99
|
-
return AdapterSpec(
|
|
100
|
-
method=ADAPT_MULTIPLE_CHOICE_JOINT,
|
|
101
|
-
instructions=format_instructions(instructions),
|
|
102
|
-
input_prefix=f"{input_noun}: " if input_noun is not None else "",
|
|
103
|
-
input_suffix="\n" if input_noun is not None else "",
|
|
104
|
-
output_prefix=f"{output_noun}: ",
|
|
105
|
-
output_suffix="\n",
|
|
106
|
-
max_train_instances=max_train_instances,
|
|
107
|
-
num_outputs=num_outputs,
|
|
108
|
-
max_tokens=max_tokens,
|
|
109
|
-
temperature=0.0,
|
|
110
|
-
stop_sequences=["\n"],
|
|
111
|
-
sample_train=sample_train,
|
|
112
|
-
**kwargs,
|
|
113
|
-
)
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
def get_multiple_choice_separate_adapter_spec(method: str, empty_input: bool = False) -> AdapterSpec:
|
|
117
|
-
"""
|
|
118
|
-
[input] [reference_i]
|
|
119
|
-
or
|
|
120
|
-
[reference_i]
|
|
121
|
-
"""
|
|
122
|
-
assert method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}
|
|
123
|
-
|
|
124
|
-
return AdapterSpec(
|
|
125
|
-
method=method,
|
|
126
|
-
instructions="",
|
|
127
|
-
input_prefix="",
|
|
128
|
-
input_suffix="",
|
|
129
|
-
output_prefix=" " if not empty_input else "",
|
|
130
|
-
output_suffix="",
|
|
131
|
-
# Separate is basically language modeling, so can't easily use in-context examples
|
|
132
|
-
max_train_instances=0,
|
|
133
|
-
num_outputs=1,
|
|
134
|
-
max_tokens=0,
|
|
135
|
-
temperature=0.0,
|
|
136
|
-
)
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
def get_multiple_choice_adapter_spec(
|
|
140
|
-
method: str,
|
|
141
|
-
instructions: str,
|
|
142
|
-
input_noun: Optional[str],
|
|
143
|
-
output_noun: str,
|
|
144
|
-
max_train_instances: int = 5,
|
|
145
|
-
num_outputs: int = 5,
|
|
146
|
-
max_tokens: int = 1,
|
|
147
|
-
empty_input: bool = False,
|
|
148
|
-
sample_train: bool = True,
|
|
149
|
-
**kwargs,
|
|
150
|
-
):
|
|
151
|
-
"""
|
|
152
|
-
Toggle between joint and separate adapters.
|
|
153
|
-
"""
|
|
154
|
-
if method == ADAPT_MULTIPLE_CHOICE_JOINT:
|
|
155
|
-
return get_multiple_choice_joint_adapter_spec(
|
|
156
|
-
instructions,
|
|
157
|
-
input_noun,
|
|
158
|
-
output_noun,
|
|
159
|
-
max_train_instances=max_train_instances,
|
|
160
|
-
num_outputs=num_outputs,
|
|
161
|
-
max_tokens=max_tokens,
|
|
162
|
-
sample_train=sample_train,
|
|
163
|
-
**kwargs,
|
|
164
|
-
)
|
|
165
|
-
elif method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}:
|
|
166
|
-
return get_multiple_choice_separate_adapter_spec(method, empty_input)
|
|
167
|
-
else:
|
|
168
|
-
raise ValueError(f"Invalid adaptation method: {method}")
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
def get_ranking_binary_adapter_spec(
|
|
172
|
-
instructions: str = "",
|
|
173
|
-
document_noun: str = "Passage",
|
|
174
|
-
query_noun: str = "Query",
|
|
175
|
-
output_prefix: str = "Does the passage answer the query?",
|
|
176
|
-
output_noun: str = "Answer",
|
|
177
|
-
max_train_instances: int = 4,
|
|
178
|
-
num_outputs: int = 1,
|
|
179
|
-
num_train_trials: int = 1,
|
|
180
|
-
temperature: float = 0.0,
|
|
181
|
-
max_tokens: int = 5,
|
|
182
|
-
**kwargs,
|
|
183
|
-
) -> AdapterSpec:
|
|
184
|
-
"""
|
|
185
|
-
[instructions]
|
|
186
|
-
|
|
187
|
-
[object_noun]: [object]
|
|
188
|
-
[query_noun]: [query]
|
|
189
|
-
[prompt_noun]: [prompt_content]
|
|
190
|
-
[output_noun]: [output]
|
|
191
|
-
|
|
192
|
-
...
|
|
193
|
-
|
|
194
|
-
[object_noun]: [object]
|
|
195
|
-
[query_noun]: [query]
|
|
196
|
-
[prompt_noun]: [prompt_content]
|
|
197
|
-
[output_noun]: [output]
|
|
198
|
-
|
|
199
|
-
[object_noun]: [object]
|
|
200
|
-
[query_noun]: [query]
|
|
201
|
-
[prompt_noun]: [prompt_content]
|
|
202
|
-
[output_noun]: [output]
|
|
203
|
-
"""
|
|
204
|
-
msg = (
|
|
205
|
-
"There must be an even number of in-context examples to ensure that"
|
|
206
|
-
"an equal number of positive and negative examples are included."
|
|
207
|
-
)
|
|
208
|
-
assert max_train_instances % 2 == 0, msg
|
|
209
|
-
max_train_instances = int(max_train_instances / 2)
|
|
210
|
-
|
|
211
|
-
return AdapterSpec(
|
|
212
|
-
method=ADAPT_RANKING_BINARY,
|
|
213
|
-
instructions=format_instructions(instructions),
|
|
214
|
-
input_prefix=f"{query_noun}: ",
|
|
215
|
-
input_suffix="\n",
|
|
216
|
-
reference_prefix=f"{document_noun}: ",
|
|
217
|
-
reference_suffix="\n",
|
|
218
|
-
output_prefix=f"{output_prefix}\n{output_noun}: ",
|
|
219
|
-
max_train_instances=max_train_instances,
|
|
220
|
-
num_outputs=num_outputs,
|
|
221
|
-
num_train_trials=num_train_trials,
|
|
222
|
-
temperature=temperature,
|
|
223
|
-
max_tokens=max_tokens,
|
|
224
|
-
**kwargs,
|
|
225
|
-
)
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
def get_completion_adapter_spec(
|
|
229
|
-
instructions: str = "",
|
|
230
|
-
input_prefix: str = "",
|
|
231
|
-
output_prefix: str = "",
|
|
232
|
-
output_suffix: str = "",
|
|
233
|
-
max_train_instances: int = 0,
|
|
234
|
-
temperature: float = 0.0,
|
|
235
|
-
num_outputs: int = 1,
|
|
236
|
-
max_tokens: int = 100,
|
|
237
|
-
stop_sequences: Optional[List] = None, # default value of `stop_sequences` is no stop sequence,
|
|
238
|
-
**kwargs,
|
|
239
|
-
) -> AdapterSpec:
|
|
240
|
-
"""
|
|
241
|
-
[input][output_prefix][output][output_suffix]
|
|
242
|
-
|
|
243
|
-
[input][output_prefix]
|
|
244
|
-
"""
|
|
245
|
-
if stop_sequences is None:
|
|
246
|
-
stop_sequences = []
|
|
247
|
-
|
|
248
|
-
return AdapterSpec(
|
|
249
|
-
method=ADAPT_GENERATION,
|
|
250
|
-
instructions=format_instructions(instructions),
|
|
251
|
-
input_prefix=input_prefix,
|
|
252
|
-
input_suffix="",
|
|
253
|
-
output_prefix=output_prefix,
|
|
254
|
-
output_suffix=output_suffix,
|
|
255
|
-
max_train_instances=max_train_instances,
|
|
256
|
-
temperature=temperature,
|
|
257
|
-
num_outputs=num_outputs,
|
|
258
|
-
max_tokens=max_tokens,
|
|
259
|
-
stop_sequences=stop_sequences,
|
|
260
|
-
**kwargs,
|
|
261
|
-
)
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
def get_generation_adapter_spec(
|
|
265
|
-
instructions: str = "",
|
|
266
|
-
input_noun: Optional[str] = None,
|
|
267
|
-
newline_after_input_noun: bool = False,
|
|
268
|
-
output_noun: Optional[str] = None,
|
|
269
|
-
newline_after_output_noun: bool = False,
|
|
270
|
-
max_train_instances: int = 5,
|
|
271
|
-
num_outputs: int = 1,
|
|
272
|
-
max_tokens: int = 5,
|
|
273
|
-
stop_sequences: Optional[List] = None, # default value of `stop_sequences` is ["\n"]
|
|
274
|
-
temperature: float = 0.0,
|
|
275
|
-
multi_label: bool = False,
|
|
276
|
-
) -> AdapterSpec:
|
|
277
|
-
"""
|
|
278
|
-
[instructions]
|
|
279
|
-
|
|
280
|
-
[input_noun]: [input]
|
|
281
|
-
[output_noun]: [output]
|
|
282
|
-
|
|
283
|
-
[input_noun]: [input]
|
|
284
|
-
[output_noun]:
|
|
285
|
-
"""
|
|
286
|
-
|
|
287
|
-
def format_prefix(noun: Optional[str], append_new_line: bool) -> str:
|
|
288
|
-
"""
|
|
289
|
-
When `append_new_line` is False:
|
|
290
|
-
[input_noun]: [input]
|
|
291
|
-
|
|
292
|
-
When `append_new_line` is True:
|
|
293
|
-
[input_noun]:
|
|
294
|
-
[input]
|
|
295
|
-
"""
|
|
296
|
-
prefix: str = f"{noun}:" if noun is not None else ""
|
|
297
|
-
if len(prefix) > 0:
|
|
298
|
-
prefix += "\n" if append_new_line else " "
|
|
299
|
-
return prefix
|
|
300
|
-
|
|
301
|
-
if stop_sequences is None:
|
|
302
|
-
stop_sequences = ["\n"]
|
|
303
|
-
|
|
304
|
-
return AdapterSpec(
|
|
305
|
-
method=ADAPT_GENERATION,
|
|
306
|
-
instructions=format_instructions(instructions),
|
|
307
|
-
input_prefix=format_prefix(input_noun, append_new_line=newline_after_input_noun),
|
|
308
|
-
input_suffix="\n",
|
|
309
|
-
output_prefix=format_prefix(output_noun, append_new_line=newline_after_output_noun),
|
|
310
|
-
output_suffix="\n",
|
|
311
|
-
max_train_instances=max_train_instances,
|
|
312
|
-
num_outputs=num_outputs,
|
|
313
|
-
max_tokens=max_tokens,
|
|
314
|
-
temperature=temperature,
|
|
315
|
-
stop_sequences=stop_sequences,
|
|
316
|
-
multi_label=multi_label,
|
|
317
|
-
)
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
def get_instruct_adapter_spec(
|
|
321
|
-
num_outputs: int = 1,
|
|
322
|
-
max_tokens: int = 512,
|
|
323
|
-
temperature: float = 0.7,
|
|
324
|
-
) -> AdapterSpec:
|
|
325
|
-
"""
|
|
326
|
-
Zero-shot instruction-following.
|
|
327
|
-
"""
|
|
328
|
-
return AdapterSpec(
|
|
329
|
-
method=ADAPT_GENERATION,
|
|
330
|
-
instructions="",
|
|
331
|
-
input_prefix="",
|
|
332
|
-
input_suffix="\n",
|
|
333
|
-
output_prefix="",
|
|
334
|
-
output_suffix="",
|
|
335
|
-
max_train_instances=0,
|
|
336
|
-
num_outputs=num_outputs,
|
|
337
|
-
max_tokens=max_tokens,
|
|
338
|
-
temperature=temperature,
|
|
339
|
-
stop_sequences=[],
|
|
340
|
-
)
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
def get_language_modeling_adapter_spec() -> AdapterSpec:
|
|
344
|
-
"""
|
|
345
|
-
Used for language modeling.
|
|
346
|
-
"""
|
|
347
|
-
return AdapterSpec(
|
|
348
|
-
method=ADAPT_LANGUAGE_MODELING,
|
|
349
|
-
instructions="",
|
|
350
|
-
input_prefix="",
|
|
351
|
-
input_suffix="",
|
|
352
|
-
output_prefix="",
|
|
353
|
-
output_suffix="",
|
|
354
|
-
max_train_instances=0,
|
|
355
|
-
num_outputs=1,
|
|
356
|
-
max_tokens=0,
|
|
357
|
-
temperature=0.0,
|
|
358
|
-
)
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
def get_summarization_adapter_spec(num_sents: Optional[int], max_train_instances: int = 5, **kwargs) -> AdapterSpec:
|
|
362
|
-
"""
|
|
363
|
-
Used for summarization.
|
|
364
|
-
"""
|
|
365
|
-
|
|
366
|
-
if num_sents == 1:
|
|
367
|
-
out_pref = "Summarize the above article in 1 sentence.\n"
|
|
368
|
-
elif num_sents is None:
|
|
369
|
-
out_pref = "Summarize the above article.\n"
|
|
370
|
-
else:
|
|
371
|
-
out_pref = f"Summarize the above article in {num_sents} sentences.\n"
|
|
372
|
-
|
|
373
|
-
return AdapterSpec(
|
|
374
|
-
method=ADAPT_GENERATION,
|
|
375
|
-
instructions="",
|
|
376
|
-
input_prefix="###\nArticle: ",
|
|
377
|
-
input_suffix="\n\n",
|
|
378
|
-
output_prefix=out_pref,
|
|
379
|
-
output_suffix="\n",
|
|
380
|
-
max_train_instances=max_train_instances,
|
|
381
|
-
num_outputs=1,
|
|
382
|
-
stop_sequences=["###"], # Separator between few-shot instances.
|
|
383
|
-
**kwargs,
|
|
384
|
-
)
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
def get_machine_translation_adapter_spec(
|
|
388
|
-
source_language, target_language, max_train_instances, **kwargs
|
|
389
|
-
) -> AdapterSpec:
|
|
390
|
-
"""
|
|
391
|
-
Used for machine translation.
|
|
392
|
-
"""
|
|
393
|
-
return AdapterSpec(
|
|
394
|
-
method=ADAPT_GENERATION,
|
|
395
|
-
instructions=f"Translate {source_language} to {target_language}:",
|
|
396
|
-
input_prefix="",
|
|
397
|
-
input_suffix=" = ",
|
|
398
|
-
output_prefix="",
|
|
399
|
-
output_suffix="\n",
|
|
400
|
-
max_train_instances=max_train_instances,
|
|
401
|
-
num_outputs=1,
|
|
402
|
-
stop_sequences=["\n\n"],
|
|
403
|
-
temperature=0.0,
|
|
404
|
-
**kwargs,
|
|
405
|
-
)
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
############################################################
|
|
409
|
-
# Examples of scenario and adapter specs
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
def get_scenario_spec1() -> ScenarioSpec:
|
|
413
|
-
return ScenarioSpec(
|
|
414
|
-
class_name="helm.benchmark.scenarios.simple_scenarios.Simple1Scenario",
|
|
415
|
-
args={"num_input_tokens": 5, "vocab_size": 20, "num_train_instances": 10, "num_test_instances": 10},
|
|
416
|
-
)
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
def get_scenario_spec_tiny():
|
|
420
|
-
return ScenarioSpec(
|
|
421
|
-
class_name="helm.benchmark.scenarios.simple_scenarios.Simple1Scenario",
|
|
422
|
-
args={"num_input_tokens": 5, "vocab_size": 20, "num_train_instances": 2, "num_test_instances": 2},
|
|
423
|
-
)
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
def get_adapter_spec1() -> AdapterSpec:
|
|
427
|
-
return AdapterSpec(
|
|
428
|
-
method=ADAPT_GENERATION,
|
|
429
|
-
instructions="Please solve the following problem.\n",
|
|
430
|
-
max_train_instances=5,
|
|
431
|
-
max_eval_instances=10,
|
|
432
|
-
num_outputs=3,
|
|
433
|
-
num_train_trials=3,
|
|
434
|
-
model="simple/model1",
|
|
435
|
-
temperature=1,
|
|
436
|
-
stop_sequences=["."],
|
|
437
|
-
)
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
############################################################
|
|
441
|
-
# Metrics
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
def get_basic_metric_specs(names: List[str]) -> List[MetricSpec]:
|
|
445
|
-
return [MetricSpec(class_name="helm.benchmark.metrics.basic_metrics.BasicMetric", args={"names": names})]
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
def get_exact_match_metric_specs() -> List[MetricSpec]:
|
|
449
|
-
return get_basic_metric_specs(
|
|
450
|
-
["exact_match", "quasi_exact_match", "prefix_exact_match", "quasi_prefix_exact_match"]
|
|
451
|
-
)
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
def get_f1_metric_specs() -> List[MetricSpec]:
|
|
455
|
-
return get_basic_metric_specs(["exact_match", "quasi_exact_match", "f1_score"])
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
def get_classification_metric_specs(delimiter: Optional[str] = None) -> List[MetricSpec]:
|
|
459
|
-
return [
|
|
460
|
-
MetricSpec(
|
|
461
|
-
class_name="helm.benchmark.metrics.classification_metrics.ClassificationMetric",
|
|
462
|
-
args={"delimiter": delimiter},
|
|
463
|
-
)
|
|
464
|
-
]
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
def get_multiple_choice_classification_metric_specs() -> List[MetricSpec]:
|
|
468
|
-
return [
|
|
469
|
-
MetricSpec(
|
|
470
|
-
class_name="helm.benchmark.metrics.classification_metrics.MultipleChoiceClassificationMetric", args={}
|
|
471
|
-
)
|
|
472
|
-
]
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
def get_bbq_metric_specs() -> List[MetricSpec]:
|
|
476
|
-
return [
|
|
477
|
-
MetricSpec(class_name="helm.benchmark.metrics.bbq_metrics.BBQMetric", args={})
|
|
478
|
-
] + get_exact_match_metric_specs()
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
def get_msmarco_metric_specs(track: str, rank: Optional[int] = None) -> List[MetricSpec]:
|
|
482
|
-
# Names of the measures we want to compute.
|
|
483
|
-
measure_names = MSMARCOScenario.MEASURE_NAMES[track]
|
|
484
|
-
multiple_relevance_values = set(MSMARCOScenario.GOLD_RELATIONS[track]) != {1}
|
|
485
|
-
|
|
486
|
-
return [
|
|
487
|
-
MetricSpec(
|
|
488
|
-
class_name="helm.benchmark.metrics.ranking_metrics.RankingMetric",
|
|
489
|
-
args={
|
|
490
|
-
"method": ADAPT_RANKING_BINARY,
|
|
491
|
-
"measure_names": measure_names,
|
|
492
|
-
"correct_output": BinaryRankingAdapter.RANKING_CORRECT_LABEL,
|
|
493
|
-
"wrong_output": BinaryRankingAdapter.RANKING_WRONG_LABEL,
|
|
494
|
-
"rank": rank,
|
|
495
|
-
"multiple_relevance_values": multiple_relevance_values,
|
|
496
|
-
},
|
|
497
|
-
),
|
|
498
|
-
] + get_basic_metric_specs(names=[])
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
def get_toxicity_metric_specs() -> List[MetricSpec]:
|
|
502
|
-
return [
|
|
503
|
-
MetricSpec(class_name="helm.benchmark.metrics.toxicity_metrics.ToxicityMetric", args={}),
|
|
504
|
-
]
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
def get_bias_metric_specs() -> List[MetricSpec]:
|
|
508
|
-
demographic_categories = ["race", "gender"]
|
|
509
|
-
target_categories = ["adjective", "profession"]
|
|
510
|
-
cross_dem_target = itertools.product(demographic_categories, target_categories)
|
|
511
|
-
|
|
512
|
-
return [
|
|
513
|
-
MetricSpec(
|
|
514
|
-
class_name="helm.benchmark.metrics.bias_metrics.BiasMetric",
|
|
515
|
-
args={"mode": "associations", "demographic_category": dem, "target_category": tgt},
|
|
516
|
-
)
|
|
517
|
-
for dem, tgt in cross_dem_target
|
|
518
|
-
] + [
|
|
519
|
-
MetricSpec(
|
|
520
|
-
class_name="helm.benchmark.metrics.bias_metrics.BiasMetric",
|
|
521
|
-
args={"mode": "representation", "demographic_category": dem},
|
|
522
|
-
)
|
|
523
|
-
for dem in demographic_categories
|
|
524
|
-
]
|
|
525
|
-
|
|
526
|
-
|
|
-def get_generative_harms_metric_specs(include_basic_metrics: bool = False) -> List[MetricSpec]:
-    return (
-        get_bias_metric_specs()
-        + get_toxicity_metric_specs()
-        + (get_basic_metric_specs([]) if include_basic_metrics else [])
-    )
-
-
-def get_summarization_metric_specs(args: Dict[str, Any]) -> List[MetricSpec]:
-    return [
-        MetricSpec(class_name="helm.benchmark.metrics.summarization_metrics.SummarizationMetric", args=args)
-    ] + get_basic_metric_specs([])
-
-
-def get_summarization_critique_metric_specs(num_respondents: int) -> List[MetricSpec]:
-    return [
-        MetricSpec(
-            class_name="helm.benchmark.metrics.summarization_critique_metrics.SummarizationCritiqueMetric",
-            args={"num_respondents": num_respondents},
-        )
-    ]
-
-
-def get_srn_metric_specs() -> List[MetricSpec]:
-    return get_basic_metric_specs(["f1_set_match", "iou_set_match", "exact_set_match"])
-
-
-def get_numeracy_metric_specs(run_solver: bool = False) -> List[MetricSpec]:
-    metric_specs: List[MetricSpec] = get_basic_metric_specs(
-        ["exact_match", "quasi_exact_match", "absolute_value_difference"]
-    )
-
-    # The solvers are slow to run so make them skippable
-    if run_solver:
-        metric_specs += [
-            MetricSpec(class_name="helm.benchmark.metrics.numeracy_metrics.DistanceMetric", args={}),
-        ]
-    return metric_specs
-
-
-def get_math_metric_specs(use_chain_of_thought: bool = True) -> List[MetricSpec]:
-    return get_basic_metric_specs(["math_equiv_chain_of_thought" if use_chain_of_thought else "math_equiv"])
-
-
-def get_copyright_metric_specs(args: Optional[Dict] = None) -> List[MetricSpec]:
-    if args is None:
-        args = {}
-    return [
-        MetricSpec(
-            class_name="helm.benchmark.metrics.copyright_metrics.BasicCopyrightMetric",
-            args={**args, "name": "longest_common_prefix_length"},
-        ),
-        MetricSpec(
-            class_name="helm.benchmark.metrics.copyright_metrics.BasicCopyrightMetric",
-            args={**args, "name": "edit_distance"},
-        ),
-        MetricSpec(
-            class_name="helm.benchmark.metrics.copyright_metrics.BasicCopyrightMetric",
-            args={**args, "name": "edit_similarity"},
-        ),
-    ] + get_basic_metric_specs([])
-
-
-def get_disinformation_metric_specs(args: Optional[Dict] = None) -> List[MetricSpec]:
-    if args is None:
-        args = {}
-    return [
-        MetricSpec(
-            class_name="helm.benchmark.metrics.disinformation_metrics.DisinformationHumanEvalMetrics", args={**args}
-        ),
-        MetricSpec(
-            class_name="helm.benchmark.metrics.disinformation_metrics.DisinformationMetric", args={"name": "self_bleu"}
-        ),
-        MetricSpec(
-            class_name="helm.benchmark.metrics.disinformation_metrics.DisinformationMetric",
-            args={"name": "monte_carlo_entropy"},
-        ),
-    ] + get_basic_metric_specs([])
-
-
-def get_code_metric_specs(dataset: str, timeout: float) -> List[MetricSpec]:
-    if dataset == "humaneval":
-        return get_basic_metric_specs(["code_eval_acc", "pass"])
-    else:  # APPS.
-        args: Dict[str, Any] = {"names": ["test_avg", "strict_acc"], "timeout": timeout}
-        return [MetricSpec(class_name="helm.benchmark.metrics.code_metrics.APPSMetric", args=args)]
-
-
-def get_open_ended_generation_metric_specs() -> List[MetricSpec]:
-    return get_basic_metric_specs(["exact_match", "quasi_exact_match", "f1_score", "rouge_l", "bleu_1", "bleu_4"])
-
-
-def get_machine_translation_metric_specs() -> List[MetricSpec]:
-    return [
-        MetricSpec(class_name="helm.benchmark.metrics.machine_translation_metrics.MachineTranslationMetric", args={})
-    ] + get_basic_metric_specs([])
-
-
-def get_cleva_machine_translation_metric_specs() -> List[MetricSpec]:
-    return [
-        MetricSpec(
-            class_name="helm.benchmark.metrics.machine_translation_metrics.CLEVAMachineTranslationMetric", args={}
-        )
-    ] + get_basic_metric_specs([])
-
-
-def get_cleva_paraphrase_generation_metric_specs(alpha: float = 0.8) -> List[MetricSpec]:
-    return [
-        MetricSpec(
-            class_name="helm.benchmark.metrics.paraphrase_generation_metrics.CLEVAParaphraseGenerationMetric",
-            args={"alpha": alpha},  # calculate iBLEU_0.8 by default
-        )
-    ] + get_basic_metric_specs([])
-
-
-def get_verifiability_judgment_metric_specs() -> List[MetricSpec]:
-    return get_basic_metric_specs(["exact_match", "quasi_exact_match"])
-
-
-def get_instruction_following_critique_metric_specs(num_respondents: int) -> List[MetricSpec]:
-    return [
-        MetricSpec(
-            class_name="helm.benchmark.metrics.instruction_following_critique_metrics.InstructionFollowingCritiqueMetric",  # noqa E501
-            args={"num_respondents": num_respondents},
-        )
-    ]
-
-
-def get_cleva_topk_accuracy_metric_specs(k: int = 1, cut_off: int = 5) -> List[MetricSpec]:
-    return [
-        MetricSpec(
-            class_name="helm.benchmark.metrics.cleva_accuracy_metrics.CLEVATopKAccuracyMetric",
-            args={"k": k, "cut_off": cut_off},
-        )
-    ]
-
-
-def get_cleva_bias_metric_specs() -> List[MetricSpec]:
-    demographic_categories = ["race", "gender"]
-    target_categories = ["adjective", "profession"]
-    cross_dem_target = itertools.product(demographic_categories, target_categories)
-
-    return [
-        MetricSpec(
-            class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVABiasMetric",
-            args={"mode": "associations", "demographic_category": dem, "target_category": tgt},
-        )
-        for dem, tgt in cross_dem_target
-    ] + [
-        MetricSpec(
-            class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVABiasMetric",
-            args={"mode": "representation", "demographic_category": dem},
-        )
-        for dem in demographic_categories
-    ]
-
-
-def get_cleva_toxicity_metric_specs() -> List[MetricSpec]:
-    return [
-        MetricSpec(class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVAToxicityMetric", args={}),
-    ]
-
-
-def get_cleva_generative_harms_metric_specs(include_basic_metrics: bool = False) -> List[MetricSpec]:
-    return (
-        get_cleva_bias_metric_specs()
-        + get_cleva_toxicity_metric_specs()
-        + (get_basic_metric_specs([]) if include_basic_metrics else [])
-    )
-
-
-def get_cleva_copyright_metric_spec(args: Optional[Dict] = None) -> List[MetricSpec]:
-    if args is None:
-        args = {}
-    return [
-        MetricSpec(
-            class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVACopyrightMetric",
-            args={**args, "name": "longest_common_prefix_length"},
-        ),
-        MetricSpec(
-            class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVACopyrightMetric",
-            args={**args, "name": "edit_distance"},
-        ),
-        MetricSpec(
-            class_name="helm.benchmark.metrics.cleva_harms_metrics.CLEVACopyrightMetric",
-            args={**args, "name": "edit_similarity"},
-        ),
-    ]
-
-
-def get_cleva_generative_task_metric_spec(task: str, subtask: Optional[str], **kwargs) -> List[MetricSpec]:
-    CLEVA_GEN_TASK_TO_METRIC: Dict[str, Callable] = {
-        "opinion_mining:opinion_target_extraction": get_exact_match_metric_specs,
-        "paraphrase_generation": get_cleva_paraphrase_generation_metric_specs,
-        "closed_book_question_answering:generative_question_answering": get_exact_match_metric_specs,
-        "conceptual_generalization": get_cleva_topk_accuracy_metric_specs,
-        "translation:en2zh": get_cleva_machine_translation_metric_specs,
-        "translation:zh2en": get_cleva_machine_translation_metric_specs,
-        "mathematical_calculation:add": get_exact_match_metric_specs,
-        "mathematical_calculation:sub": get_exact_match_metric_specs,
-        "mathematical_calculation:mul": get_exact_match_metric_specs,
-        "inductive_reasoning:add": get_exact_match_metric_specs,
-        "inductive_reasoning:sub": get_exact_match_metric_specs,
-        "inductive_reasoning:mul": get_exact_match_metric_specs,
-        "reasoning_primitive:dyck_language": get_exact_match_metric_specs,
-        "reasoning_primitive:pattern_induction": get_exact_match_metric_specs,
-        "reasoning_primitive:pattern_matching": get_exact_match_metric_specs,
-        "reasoning_primitive:variable_sub": get_exact_match_metric_specs,
-        "subject_knowledge:art": get_exact_match_metric_specs,
-        "subject_knowledge:biomedicine": get_exact_match_metric_specs,
-        "subject_knowledge:chemistry": get_exact_match_metric_specs,
-        "subject_knowledge:computer_science": get_exact_match_metric_specs,
-        "subject_knowledge:economics": get_exact_match_metric_specs,
-        "subject_knowledge:geography": get_exact_match_metric_specs,
-        "subject_knowledge:history": get_exact_match_metric_specs,
-        "subject_knowledge:law": get_exact_match_metric_specs,
-        "subject_knowledge:literature": get_exact_match_metric_specs,
-        "subject_knowledge:math": get_exact_match_metric_specs,
-        "subject_knowledge:other_general": get_exact_match_metric_specs,
-        "subject_knowledge:philosophy": get_exact_match_metric_specs,
-        "subject_knowledge:physics": get_exact_match_metric_specs,
-        "subject_knowledge:politics": get_exact_match_metric_specs,
-        "summarization:dialogue_summarization": partial(get_basic_metric_specs, ["chinese_rouge_2"]),
-        "pinyin_transliteration:pinyin2zh": partial(get_basic_metric_specs, ["chinese_bleu_1"]),
-        "pinyin_transliteration:zh2pinyin": partial(get_basic_metric_specs, ["chinese_bleu_1"]),
-        "dialogue_generation:task_oriented": partial(get_basic_metric_specs, ["chinese_bleu_1"]),
-        "data_to_text_generation": partial(get_basic_metric_specs, ["chinese_bleu_1"]),
-        "mathematical_reasoning:math_word_problem": partial(get_basic_metric_specs, ["cleva_math_result_match"]),
-    }
-
-    key: str = task
-    if subtask is not None:
-        key += ":" + subtask
-    return CLEVA_GEN_TASK_TO_METRIC[key](**kwargs)
-
-
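The removed `CLEVA_GEN_TASK_TO_METRIC` table above dispatches a `task[:subtask]` key to a metric-spec factory, using `functools.partial` to pre-bind arguments where a factory needs them. A minimal sketch of the same dispatch idiom (the factory names below are illustrative stand-ins, not HELM APIs):

```python
from functools import partial
from typing import Callable, Dict, List, Optional


def exact_match_specs() -> List[str]:
    return ["exact_match", "quasi_exact_match"]


def basic_specs(names: List[str]) -> List[str]:
    return list(names)


TASK_TO_METRIC: Dict[str, Callable[..., List[str]]] = {
    "translation:en2zh": exact_match_specs,
    # partial() pre-binds the metric names, so every table entry can be called uniformly.
    "pinyin_transliteration:pinyin2zh": partial(basic_specs, ["chinese_bleu_1"]),
}


def metric_specs_for(task: str, subtask: Optional[str] = None, **kwargs) -> List[str]:
    key = task if subtask is None else f"{task}:{subtask}"
    return TASK_TO_METRIC[key](**kwargs)


print(metric_specs_for("pinyin_transliteration", "pinyin2zh"))  # ['chinese_bleu_1']
```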
-############################################################
-# Run specs
-
-
-CANONICAL_RUN_SPEC_FUNCS: Dict[str, Callable[..., RunSpec]] = {}
-"""Dict of run spec function names to run spec functions."""
-
-
-F = TypeVar("F", bound=Callable[..., RunSpec])
-
-
-def run_spec_function(name: str) -> Callable[[F], F]:
-    """Register the run spec function under the given name."""
-
-    def wrap(func: F) -> F:
-        if name in CANONICAL_RUN_SPEC_FUNCS:
-            raise ValueError(f"A run spec function with name {name} already exists")
-        CANONICAL_RUN_SPEC_FUNCS[name] = func
-        return func
-
-    return wrap
-
-
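The registry removed here pairs a module-level dict with a decorator factory that rejects duplicate names; in 0.5.0 the same decorator is instead imported from `helm.benchmark.run_spec` (see the added imports later in this hunk). A self-contained sketch of the idiom:

```python
from typing import Callable, Dict, TypeVar

REGISTRY: Dict[str, Callable[..., object]] = {}

F = TypeVar("F", bound=Callable[..., object])


def register(name: str) -> Callable[[F], F]:
    """Decorator factory: files the decorated function under `name`, refusing duplicates."""

    def wrap(func: F) -> F:
        if name in REGISTRY:
            raise ValueError(f"A function named {name!r} is already registered")
        REGISTRY[name] = func
        return func

    return wrap


@register("simple1")
def simple1() -> str:
    return "a run spec for debugging"


print(REGISTRY["simple1"]())  # looked up by name at run time
```

Registering at decoration time means the lookup table is complete as soon as the module is imported, which is what lets a run-entry string like `simple1` be resolved to a function without any explicit wiring.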
-@run_spec_function("simple1")
-def get_simple1_spec() -> RunSpec:
-    """A run spec for debugging."""
-    return RunSpec(
-        name="simple1",
-        scenario_spec=get_scenario_spec1(),
-        adapter_spec=get_adapter_spec1(),
-        metric_specs=get_basic_metric_specs([]),
-        groups=[],
-    )
+from helm.benchmark.metrics.metric import MetricSpec
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.runner import get_benchmark_output_path
+from helm.benchmark.scenarios.scenario import ScenarioSpec, get_scenario_cache_path
+from helm.common.hierarchical_logger import hlog, htrack


 @run_spec_function("bbq")
@@ -806,7 +59,9 @@ def get_bbq_spec(subject: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> Run
         input_noun="Passage",
         output_noun="Answer",
     )
-    metric_specs =
+    metric_specs = [
+        MetricSpec(class_name="helm.benchmark.metrics.bbq_metrics.BBQMetric", args={})
+    ] + get_exact_match_metric_specs()

     return RunSpec(
         name=f"bbq:subject={subject},method={method}",
@@ -819,6 +74,8 @@ def get_bbq_spec(subject: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> Run

 @run_spec_function("msmarco")
 def get_msmarco_spec(track: str, valid_topk: Optional[int] = None) -> RunSpec:
+    from helm.benchmark.scenarios.msmarco_scenario import MSMARCOScenario
+
     valid_topk = None if valid_topk is None else int(valid_topk)
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.msmarco_scenario.MSMARCOScenario",
@@ -827,11 +84,33 @@ def get_msmarco_spec(track: str, valid_topk: Optional[int] = None) -> RunSpec:

     adapter_spec: AdapterSpec = get_ranking_binary_adapter_spec(max_train_instances=4, stop_sequences=["\n"])

+    # Names of the measures we want to compute.
+    measure_names = MSMARCOScenario.MEASURE_NAMES[track]
+    multiple_relevance_values = set(MSMARCOScenario.GOLD_RELATIONS[track]) != {1}
+
+    metric_specs = (
+        [
+            MetricSpec(
+                class_name="helm.benchmark.metrics.ranking_metrics.RankingMetric",
+                args={
+                    "method": ADAPT_RANKING_BINARY,
+                    "measure_names": measure_names,
+                    "correct_output": BinaryRankingAdapter.RANKING_CORRECT_LABEL,
+                    "wrong_output": BinaryRankingAdapter.RANKING_WRONG_LABEL,
+                    "rank": valid_topk,
+                    "multiple_relevance_values": multiple_relevance_values,
+                },
+            ),
+        ]
+        + get_basic_reference_metric_specs()
+        + get_generic_metric_specs()
+    )
+
     return RunSpec(
         name=f"msmarco:track={track},valid_topk={valid_topk}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=
+        metric_specs=metric_specs,
         groups=[f"msmarco_{track}"],
     )

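A recurring change in this file: scenario modules are now imported inside the run-spec function (as with `MSMARCOScenario` above) rather than at module top level, so that merely importing the run-spec module does not pull in every scenario's dependencies. A sketch of the pattern, with a standard-library module standing in for a heavyweight scenario module:

```python
def get_example_spec(track: str) -> dict:
    # Deferred import: the (potentially heavy) module is loaded only when this
    # run-spec function is actually called, not when this file is imported.
    import json  # stand-in for a scenario module with heavy dependencies

    measure_names = {"regular": ["RR@10"], "trec": ["NDCG@10"]}[track]
    return {"track": track, "measures": json.dumps(measure_names)}


print(get_example_spec("trec"))
```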
@@ -905,28 +184,6 @@ def get_custom_mcqa_spec(
     )


-@run_spec_function("mmlu")
-def get_mmlu_spec(subject: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.mmlu_scenario.MMLUScenario", args={"subject": subject}
-    )
-
-    adapter_spec = get_multiple_choice_adapter_spec(
-        method=method,
-        instructions=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}.",
-        input_noun="Question",
-        output_noun="Answer",
-    )
-
-    return RunSpec(
-        name=f"mmlu:subject={subject},method={method}",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=get_exact_match_metric_specs(),
-        groups=["mmlu"],
-    )
-
-
 @run_spec_function("interactive_qa_mmlu")
 def get_interactive_qa_mmlu_spec(subject: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
@@ -975,29 +232,6 @@ def get_wikifact_spec(k: str, subject: str) -> RunSpec:
     )


-@run_spec_function("commonsense")
-def get_commonsense_spec(dataset: str, method: str) -> RunSpec:
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.commonsense_scenario.CommonSenseScenario",
-        args={"dataset": dataset},
-    )
-
-    adapter_spec = get_multiple_choice_adapter_spec(
-        method=method,
-        instructions="The following are multiple choice questions (with answers) about common sense.",
-        input_noun="Question",
-        output_noun="Answer",
-    )
-
-    return RunSpec(
-        name=f"commonsense:dataset={dataset},method={method}",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=get_exact_match_metric_specs(),
-        groups=[dataset],
-    )
-
-
 @run_spec_function("quac")
 def get_quac_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.quac_scenario.QuACScenario", args={})
@@ -1060,7 +294,7 @@ def get_twitter_aae_spec(demographic: str) -> RunSpec:
         name=f"twitter_aae:demographic={demographic}",
         scenario_spec=scenario_spec,
         adapter_spec=get_language_modeling_adapter_spec(),
-        metric_specs=
+        metric_specs=get_language_modeling_metric_specs([]),
         groups=["twitter_aae", f"twitter_aae_{demographic}"],
     )

@@ -1088,7 +322,9 @@ def get_real_toxicity_prompts_spec() -> RunSpec:
         name="real_toxicity_prompts",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_generative_harms_metric_specs(
+        metric_specs=get_generative_harms_metric_specs(
+            include_basic_metrics=True, include_generative_harms_metrics=True
+        ),
         groups=["real_toxicity_prompts"],
     )

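The call site above implies that `get_generative_harms_metric_specs` gained a second keyword, `include_generative_harms_metrics`, alongside `include_basic_metrics`; the new signature itself is not shown in this diff. A plausible sketch, with all helper bodies as labeled stand-ins:

```python
from typing import List


# Hypothetical stand-ins for the factory helpers referenced elsewhere in this file.
def get_bias_metric_specs() -> List[str]:
    return ["bias"]


def get_toxicity_metric_specs() -> List[str]:
    return ["toxicity"]


def get_basic_metric_specs(names: List[str]) -> List[str]:
    return ["basic"] + names


def get_generative_harms_metric_specs(
    include_basic_metrics: bool = False,
    include_generative_harms_metrics: bool = False,  # assumed new flag, inferred from the call site
) -> List[str]:
    specs: List[str] = []
    if include_generative_harms_metrics:
        specs += get_bias_metric_specs() + get_toxicity_metric_specs()
    if include_basic_metrics:
        specs += get_basic_metric_specs([])
    return specs


print(get_generative_harms_metric_specs(include_basic_metrics=True, include_generative_harms_metrics=True))
```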
@@ -1108,46 +344,28 @@ def get_synthetic_reasoning_natural_spec(difficulty: str) -> RunSpec:
         max_train_instances=3,  # limited by the context length
         max_tokens=20,
     )
+    srn_metric_specs = get_basic_metric_specs(["f1_set_match", "iou_set_match", "exact_set_match"])

     return RunSpec(
         name=f"synthetic_reasoning_natural:difficulty={difficulty}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=
+        metric_specs=srn_metric_specs + get_generative_harms_metric_specs(),
         groups=["synthetic_reasoning", "synthetic_reasoning_natural"],
     )


-@run_spec_function("gsm")
-def get_gsm_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.gsm_scenario.GSM8KScenario", args={})
-
-    # Create AdapterSpec based on the GSM8K paper: https://arxiv.org/pdf/2110.14168.pdf
-    adapter_spec = get_generation_adapter_spec(
-        input_noun="Q",
-        output_noun="A",
-        max_train_instances=5,  # Due to limited context and long example length
-        max_tokens=400,  # The paper uses 400 tokens as the max sample length
-        stop_sequences=["\n\n"],  # Since answer may contain newlines, we use two as SEP
-    )
-
-    return RunSpec(
-        name="gsm",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=get_basic_metric_specs(["exact_match_indicator"]) + get_generative_harms_metric_specs(),
-        groups=["gsm"],
-    )
-
-
 @run_spec_function("raft")
 def get_raft_spec(subset: str) -> RunSpec:
+    from helm.benchmark.scenarios.raft_scenario import RAFTScenario, get_raft_instructions
+
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.raft_scenario.RAFTScenario", args={"subset": subset}
     )

+    scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), RAFTScenario.name)
     adapter_spec = get_generation_adapter_spec(
-        instructions=get_raft_instructions(subset),
+        instructions=get_raft_instructions(subset, scenario_cache_path),
         input_noun=None,
         output_noun="Label",
         max_tokens=30,  # at most ~50 characters per label
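Another recurring 0.5.0 change, visible in the RAFT hunk above: helpers that previously took a bare output path now receive a per-scenario cache directory computed from the benchmark output path and the scenario's `name`. Only the two function names come from the diff; the path layout below is an assumption for illustration:

```python
import os


def get_benchmark_output_path() -> str:
    # Stand-in: HELM resolves this from its run configuration.
    return "benchmark_output"


def get_scenario_cache_path(output_path: str, scenario_name: str) -> str:
    """Return (and create) a cache directory scoped to a single scenario."""
    path = os.path.join(output_path, "scenarios", scenario_name)
    os.makedirs(path, exist_ok=True)
    return path


print(get_scenario_cache_path(get_benchmark_output_path(), "raft"))
# e.g. benchmark_output/scenarios/raft (layout assumed, not confirmed by the diff)
```

Scoping the cache per scenario keeps downloads like the RAFT instructions from colliding across scenarios that share one output directory.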
@@ -1166,9 +384,10 @@ def get_raft_spec(subset: str) -> RunSpec:
 def get_numeracy_spec(
     relation_type: str = "linear", mode: str = "function", seed: str = "0", run_solver: str = "False"
 ) -> RunSpec:
-    from .scenarios.numeracy_scenario import get_numeracy_adapter_spec, RELTYPE_INFO
+    from helm.benchmark.scenarios.numeracy_scenario import get_numeracy_adapter_spec, RELTYPE_INFO

-
+    run_solver_bool: bool = True if run_solver == "True" else False
+    del run_solver
     random_seed = int(seed)
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.numeracy_scenario.NumeracyScenario",
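Run-spec arguments arrive as strings from the run-entry syntax, so booleans must be coerced explicitly; the hunk above compares against the literal `"True"`, and the removed `math` spec below does the same. A small sketch of that coercion, in a stricter variant that rejects typos instead of silently treating them as `False`:

```python
def parse_bool(value: str) -> bool:
    """Coerce a run-entry string like "True"/"False" to a bool, rejecting anything else."""
    if value == "True":
        return True
    if value == "False":
        return False
    raise ValueError(f"Expected 'True' or 'False', got {value!r}")


run_solver = "False"            # arrives as a string, e.g. from numeracy:run_solver=False
run_solver_bool = parse_bool(run_solver)
del run_solver                  # shadowing guard, as in the diff
print(run_solver_bool)          # False
```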
@@ -1208,72 +427,11 @@ def get_numeracy_spec(
         name=f"numeracy:relation_type={relation_type},mode={mode}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=get_numeracy_metric_specs(
+        metric_specs=get_numeracy_metric_specs(run_solver_bool),
         groups=["numeracy"],
     )


-@run_spec_function("math")
-def get_math_spec(
-    subject: str,
-    level: str,
-    use_official_examples: str = "False",
-    use_chain_of_thought: str = "False",
-) -> RunSpec:
-    use_official_examples: bool = use_official_examples == "True"  # type: ignore
-    use_chain_of_thought: bool = use_chain_of_thought == "True"  # type: ignore
-    if use_chain_of_thought:
-        assert not use_official_examples, "Cannot use official examples when use_chain_of_thought is True."
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.math_scenario.MATHScenario",
-        args={
-            "subject": subject,
-            "level": level,
-            "use_official_examples": use_official_examples,
-            "use_chain_of_thought": use_chain_of_thought,
-        },
-    )
-
-    if use_chain_of_thought:  # Include the solution in the output as per https://arxiv.org/abs/2201.11903
-        output_prefix = "Answer: "  # Don't include LaTeX '$' delimiters
-        output_suffix = "\n"
-        instance_prefix = "###\n"  # Don't include LaTeX '$' delimiters
-        max_tokens = 400  # Increase the number of tokens to generate
-        stop_sequences = ["###"]  # Break at the next instance; extraneous output will be stripped out
-        groups = ["math_chain_of_thought"]
-    else:
-        output_prefix = "Answer: $"
-        output_suffix = "$\n"
-        instance_prefix = "###\n"
-        max_tokens = 20
-        stop_sequences = ["$"]  # Break at the nearest LaTeX closing delimiter
-        groups = ["math_regular"]
-
-    adapter_spec = AdapterSpec(
-        method=ADAPT_GENERATION,
-        instructions="Given a mathematics problem, determine the answer. Simplify your answer as much as possible.\n",
-        max_train_instances=8,
-        num_outputs=1,
-        temperature=0.0,
-        stop_sequences=stop_sequences,
-        max_tokens=max_tokens,
-        input_prefix="Problem: ",
-        input_suffix="\n",
-        output_prefix=output_prefix,
-        output_suffix=output_suffix,
-        instance_prefix=instance_prefix,
-    )
-
-    return RunSpec(
-        name=f"math:subject={subject},level={level},"
-        f"use_official_examples={use_official_examples},use_chain_of_thought={use_chain_of_thought}",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=get_math_metric_specs(use_chain_of_thought) + get_generative_harms_metric_specs(),  # type: ignore
-        groups=groups,
-    )
-
-
 @run_spec_function("boolq")
 def get_boolq_spec(only_contrast=False) -> RunSpec:
     scenario_spec = ScenarioSpec(
@@ -1358,6 +516,8 @@ def get_copyright_spec(
     normalize_by_prefix_length=True,
     normalize_newline_space_tab=False,
 ) -> RunSpec:
+    from helm.benchmark.scenarios.copyright_scenario import datatag2hash_code
+
     scenario_spec = ScenarioSpec(
         class_name="helm.benchmark.scenarios.copyright_scenario.CopyrightScenario", args=dict(datatag=datatag)
     )
@@ -1470,36 +630,21 @@ def get_code_spec(dataset: str, timeout=3) -> RunSpec:
         max_tokens=600,
     )

+    if dataset == "humaneval":
+        code_metric_specs = get_basic_metric_specs(["code_eval_acc", "pass"])
+    else:  # APPS.
+        args: Dict[str, Any] = {"names": ["test_avg", "strict_acc"], "timeout": timeout}
+        code_metric_specs = [MetricSpec(class_name="helm.benchmark.metrics.code_metrics.APPSMetric", args=args)]
+
     return RunSpec(
         name=f"code:dataset={dataset}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=
+        metric_specs=code_metric_specs + get_generative_harms_metric_specs(),
         groups=[f"code_{dataset}"],
     )


-@run_spec_function("natural_qa")
-def get_natural_qa_spec(mode: str) -> RunSpec:
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario", args={"mode": mode}
-    )
-
-    adapter_spec = get_generation_adapter_spec(
-        input_noun="Question" if mode == "closedbook" else None,
-        output_noun="Answer",
-        max_tokens=300,  # answers are at most 65 words
-    )
-
-    return RunSpec(
-        name=f"natural_qa:mode={mode}",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=get_f1_metric_specs() + get_generative_harms_metric_specs(),
-        groups=[f"natural_qa_{mode}"],
-    )
-
-
 @run_spec_function("the_pile")
 def get_the_pile_spec(subset: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
@@ -1510,7 +655,7 @@ def get_the_pile_spec(subset: str) -> RunSpec:
         name=f"the_pile:subset={subset}",
         scenario_spec=scenario_spec,
         adapter_spec=get_language_modeling_adapter_spec(),
-        metric_specs=
+        metric_specs=get_language_modeling_metric_specs([]),
         groups=["the_pile"],
     )

@@ -1523,32 +668,11 @@ def get_ice_spec(**kwargs) -> RunSpec:
         name="ice" + (":" if len(kwargs) > 0 else "") + ",".join(f"{k}={v}" for k, v in sorted(kwargs.items())),
         scenario_spec=scenario_spec,
         adapter_spec=get_language_modeling_adapter_spec(),
-        metric_specs=
+        metric_specs=get_language_modeling_metric_specs([]),
         groups=["ice"],
     )


-@run_spec_function("narrative_qa")
-def get_narrativeqa_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario", args={}
-    )
-
-    adapter_spec = get_generation_adapter_spec(
-        input_noun="Passage",
-        output_noun="Answer",
-        max_tokens=100,  # max 30 words
-    )
-
-    return RunSpec(
-        name="narrative_qa",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=get_open_ended_generation_metric_specs() + get_generative_harms_metric_specs(),
-        groups=["narrative_qa"],
-    )
-
-
 @run_spec_function("synthetic_efficiency")
 def get_synthetic_efficiency_spec(
     num_prompt_tokens: Optional[int] = None,
@@ -1570,7 +694,9 @@ def get_synthetic_efficiency_spec(
         name=f"synthetic_efficiency:random={random}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=
+        metric_specs=get_basic_generation_metric_specs(["exact_match"])
+        + get_generic_metric_specs()
+        + get_generative_harms_metric_specs(),
         groups=["synthetic_efficiency"],
     )

@@ -1609,7 +735,7 @@ def get_wikitext_103_spec() -> RunSpec:
         name="wikitext_103",
         scenario_spec=scenario_spec,
         adapter_spec=get_language_modeling_adapter_spec(),
-        metric_specs=
+        metric_specs=get_language_modeling_metric_specs([]),
         groups=["wikitext_103"],
     )

@@ -1757,7 +883,9 @@ def get_dyck_language_spec(num_parenthesis_pairs: int) -> RunSpec:
         name=f"dyck_language_np={int(num_parenthesis_pairs)}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=
+        metric_specs=get_basic_generation_metric_specs(["exact_match_indicator"])
+        + get_generic_metric_specs()
+        + get_generative_harms_metric_specs(),
         groups=["dyck_language"],
     )

@@ -1827,6 +955,8 @@ def get_entity_data_imputation_spec(dataset: str) -> RunSpec:
 @htrack("Extracting adaptation parameters from the BIG-bench task definition and building the RunSpec")
 @run_spec_function("big_bench")
 def get_big_bench_spec(task: str, subtask: str) -> RunSpec:
+    from helm.benchmark.scenarios.big_bench_scenario import BIGBenchScenario
+
     def get_adaptation_method(big_bench_metrics: List[str]) -> str:
         """
         From BIG-bench, "there are three types of BIG-bench JSON tasks - generative and scoring
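`get_adaptation_method`, whose body falls outside these hunks, chooses the adaptation strategy from the task's declared `metrics` list. A plausible sketch of such a selector, assuming BIG-bench's `multiple_choice_grade` metric is what marks a task as multiple-choice; that mapping is an assumption, not confirmed by this diff:

```python
ADAPT_GENERATION = "generation"
ADAPT_MULTIPLE_CHOICE_JOINT = "multiple_choice_joint"


def get_adaptation_method(big_bench_metrics: list) -> str:
    # Assumption: tasks scored with BIG-bench's "multiple_choice_grade" are
    # multiple-choice; everything else is treated as free-form generation.
    if "multiple_choice_grade" in big_bench_metrics:
        return ADAPT_MULTIPLE_CHOICE_JOINT
    return ADAPT_GENERATION


print(get_adaptation_method(["multiple_choice_grade"]))  # multiple_choice_joint
print(get_adaptation_method(["exact_str_match"]))        # generation
```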
@@ -1871,16 +1001,14 @@ def get_big_bench_spec(task: str, subtask: str) -> RunSpec:
     )

     # Get BIG-bench task definition.
-
-
-    big_bench_task: Dict = BIGBenchScenario.download_and_get_task(output_path, task, subtask)
+    scenario_cache_path = get_scenario_cache_path(get_benchmark_output_path(), BIGBenchScenario.name)
+    big_bench_task: Dict = BIGBenchScenario.download_and_get_task(scenario_cache_path, task, subtask)

     # The JSON schema for BIG-bench can be found here:
     # https://github.com/google/BIG-bench/blob/main/docs/doc.md#json-schema.
     # "metrics" is a required field. The default values were populated using the link above.
     adapter_spec = AdapterSpec(
         method=get_adaptation_method(big_bench_task["metrics"]),
-        model="openai/text-curie-001",  # Can override with the `ModelRunExpander`.
         max_train_instances=5,  # Can override with the `MaxTrainInstancesRunExpander`.
         num_outputs=1,  # Can override with the `NumOutputsRunExpander`.
         # From "Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models",
@@ -1907,9 +1035,8 @@ def get_big_bench_spec(task: str, subtask: str) -> RunSpec:
         name=run_spec_name,
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        # TODO add generative harms when applicable
         metric_specs=get_metric_specs(big_bench_task["metrics"]),
-        groups=["
+        groups=[f"big_bench_{task}"],
     )

@@ -1991,7 +1118,7 @@ def get_med_mcqa_spec() -> RunSpec:
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=["
+        groups=["med_mcqa"],
     )

@@ -2017,48 +1144,86 @@ def get_med_paragraph_simplification_spec() -> RunSpec:
     )


-@run_spec_function("
-def
-    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.
+@run_spec_function("pubmed_qa")
+def get_pubmed_qa_spec() -> RunSpec:
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.pubmed_qa_scenario.PubMedQAScenario", args={})

     adapter_spec = get_multiple_choice_adapter_spec(
         method=ADAPT_MULTIPLE_CHOICE_JOINT,
-        instructions="
+        instructions="Answer A for yes, B for no or C for maybe.",
         input_noun="Question",
         output_noun="Answer",
     )

     return RunSpec(
-        name="
+        name="pubmed_qa",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
         metric_specs=get_exact_match_metric_specs(),
-        groups=["
+        groups=["pubmed_qa"],
     )


-@run_spec_function("
-def
-
+@run_spec_function("live_qa")
+def get_live_qa_spec() -> RunSpec:
+    from helm.common.gpu_utils import get_torch_device_name

-
-
-
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.live_qa_scenario.LiveQAScenario")
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="Please answer the following consumer health question.",
         input_noun="Question",
         output_noun="Answer",
+        max_train_instances=0,
+        max_tokens=512,
     )

     return RunSpec(
-        name="
+        name="live_qa",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=
-
+        metric_specs=get_summarization_metric_specs(
+            {"task": "live_qa", "device": get_torch_device_name()},
+        ),
+        groups=["live_qa"],
+    )
+
+
+@run_spec_function("medication_qa")
+def get_medication_qa_spec() -> RunSpec:
+    from helm.common.gpu_utils import get_torch_device_name
+
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.medication_qa_scenario.MedicationQAScenario")
+
+    adapter_spec = get_generation_adapter_spec(
+        instructions="Please answer the following consumer health question.",
+        input_noun="Question",
+        output_noun="Answer",
+        max_train_instances=0,
+        max_tokens=512,
+    )
+
+    return RunSpec(
+        name="medication_qa",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_summarization_metric_specs(
+            {"task": "medication_qa", "device": get_torch_device_name()},
+        ),
+        groups=["medication_qa"],
     )


 @run_spec_function("lextreme")
 def get_lextreme_spec(subset: str) -> RunSpec:
+    from helm.benchmark.scenarios.lextreme_scenario import (
+        get_lextreme_instructions,
+        get_lextreme_max_train_instances,
+        get_lextreme_max_tokens,
+        TaskType,
+        get_lextreme_task_type,
+    )
+
     task_type = get_lextreme_task_type(subset)

     scenario_spec = ScenarioSpec(
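The new `live_qa` and `medication_qa` specs above configure identical zero-shot generation adapters (same instructions, nouns, `max_train_instances=0`, `max_tokens=512`) and differ only in scenario class, run name, and group. If more such scenarios were added, the shared configuration could be factored out; a hypothetical sketch, not code from the package:

```python
from typing import Any, Dict


def consumer_health_adapter_args() -> Dict[str, Any]:
    """Shared zero-shot generation settings used by both live_qa and medication_qa."""
    return dict(
        instructions="Please answer the following consumer health question.",
        input_noun="Question",
        output_noun="Answer",
        max_train_instances=0,  # zero-shot
        max_tokens=512,
    )


for name in ("live_qa", "medication_qa"):
    spec = {"name": name, "adapter": consumer_health_adapter_args(), "groups": [name]}
    print(spec["name"], spec["adapter"]["max_tokens"])
```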
@@ -2075,7 +1240,7 @@ def get_lextreme_spec(subset: str) -> RunSpec:
         multi_label=(task_type == TaskType.MLTC),
     )

-    metric_specs =
+    metric_specs = get_basic_generation_metric_specs([]) + get_generic_metric_specs()
     if task_type == TaskType.MLTC:
         metric_specs += get_classification_metric_specs(delimiter=", ")
     elif task_type == TaskType.SLTC:
@@ -2092,6 +1257,14 @@ def get_lextreme_spec(subset: str) -> RunSpec:

 @run_spec_function("lex_glue")
 def get_lex_glue_spec(subset: str) -> RunSpec:
+    from helm.benchmark.scenarios.lex_glue_scenario import (
+        get_lex_glue_instructions,
+        get_lex_glue_max_tokens,
+        get_lex_glue_max_train_instances,
+        get_lex_glue_task_type,
+    )
+    from helm.benchmark.scenarios.lextreme_scenario import TaskType
+
     task_type = get_lex_glue_task_type(subset)

     scenario_spec = ScenarioSpec(
@@ -2108,7 +1281,7 @@ def get_lex_glue_spec(subset: str) -> RunSpec:
         multi_label=(task_type == TaskType.MLTC),
     )

-    metric_specs =
+    metric_specs = get_basic_generation_metric_specs([]) + get_generic_metric_specs()
     if task_type == TaskType.MLTC:
         metric_specs += get_classification_metric_specs(delimiter=", ")
     elif task_type == TaskType.SLTC:
@@ -2207,92 +1380,6 @@ def get_eurlexsum_legal_summarization_spec(temperature: float = 0.3, device: str
     )


-@run_spec_function("wmt_14")
-def get_wmt_14_spec(language_pair: str, max_train_instances: int = 1) -> RunSpec:
-    FULL_LANGUAGE_NAMES = {
-        "cs": "Czech",
-        "de": "German",
-        "fr": "French",
-        "hi": "Hindi",
-        "ru": "Russian",
-        "en": "English",
-    }
-    source_language, target_language = language_pair.split("-")
-
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.wmt_14_scenario.WMT14Scenario",
-        args={"source_language": source_language, "target_language": target_language},
-    )
-
-    adapter_spec = get_machine_translation_adapter_spec(
-        source_language=FULL_LANGUAGE_NAMES[source_language],
-        target_language=FULL_LANGUAGE_NAMES[target_language],
-        max_train_instances=max_train_instances,
-    )
-
-    return RunSpec(
-        name=f"wmt_14:language_pair={language_pair}",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=get_machine_translation_metric_specs(),
-        groups=["wmt_14"],
-    )
-
-
-@run_spec_function("self_instruct")
-def get_self_instruct_spec(num_respondents: int) -> RunSpec:
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.self_instruct_scenario.SelfInstructScenario",
-        args={},
-    )
-
-    adapter_spec = get_instruct_adapter_spec()
-
-    return RunSpec(
-        name="self_instruct",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=get_instruction_following_critique_metric_specs(num_respondents),
-        groups=["self_instruct"],
-    )
-
-
-@run_spec_function("vicuna")
-def get_vicuna_spec(num_respondents: int, category: str = "all") -> RunSpec:
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.vicuna_scenario.VicunaScenario",
-        args={"category": category},
-    )
-
-    adapter_spec = get_instruct_adapter_spec()
-
-    return RunSpec(
-        name=f"vicuna:category={category}",  # TODO: add args
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=get_instruction_following_critique_metric_specs(num_respondents),
-        groups=["vicuna"],
-    )
-
-
-@run_spec_function("grammar")
-def get_grammar_spec(num_respondents: int, path: str, tags: str) -> RunSpec:
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.grammar_scenario.GrammarScenario",
-        args={"path": path, "tags": tags},
-    )
-
-    adapter_spec = get_instruct_adapter_spec()
-
-    return RunSpec(
-        name=f"grammar:path={path},tags={tags}",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=get_instruction_following_critique_metric_specs(num_respondents),
-        groups=["grammar"],
-    )
-
-
 @run_spec_function("verifiability_judgment")
 def get_verifiability_judgment_spec() -> RunSpec:
     scenario_spec = ScenarioSpec(
@@ -2315,7 +1402,7 @@ def get_verifiability_judgment_spec() -> RunSpec:
         name="verifiability_judgment",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=
+        metric_specs=get_basic_metric_specs(["exact_match", "quasi_exact_match"]),
         groups=["verifiability_judgment"],
     )

@@ -2355,269 +1442,69 @@ def get_opinions_qa_spec(
     )


-@run_spec_function("
-def
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.open_assistant_scenario.OpenAssistantScenario",
-        args={"language": language},
-    )
-
-    adapter_spec = get_instruct_adapter_spec()
-
-    return RunSpec(
-        name=f"open_assistant:language={language}",
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=get_instruction_following_critique_metric_specs(num_respondents),
-        groups=["open_assistant"],
-    )
-
-
-@run_spec_function("koala")
-def get_koala_spec(num_respondents: int) -> RunSpec:
+@run_spec_function("lm_entry")
+def get_lm_entry_spec(task: str, method: str = ADAPT_GENERATION) -> RunSpec:
     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.
-        args={},
+        class_name="helm.benchmark.scenarios.lm_entry_scenario.LMEntryScenario",
+        args={"task": task},
     )
+    adapter_spec: AdapterSpec
+    metric_specs: List[MetricSpec]

-
+    if method == ADAPT_MULTIPLE_CHOICE_JOINT:
+        if task in ["first_letter", "last_letter", "first_word", "last_word", "word_before", "word_after"]:
+            raise ValueError(f"Task {task} cannot be cast to multiple choice.")
+
+        adapter_spec = get_multiple_choice_adapter_spec(
+            method=method,
+            instructions="Answer the following multiple choice question with a single letter",
+            input_noun="Question",
+            output_noun="\nAnswer",
+        )
+        metric_specs = get_exact_match_metric_specs()
+    elif method == ADAPT_GENERATION:
+        adapter_spec = get_generation_adapter_spec(
+            instructions="Answer the following question in one word.",
+            input_noun="Q",
+            output_noun="\nA",
+            # Shouldn't use any stop sequences because the task is zero-shot and thus we
+            # don't expect the model to magically figure out the output format.
+            stop_sequences=[],
+            # Set max_tokens to save tokens. The answer is a word so 10 tokens should suffice.
+            max_tokens=10,
+        )
+        # It makes no sense to include non-quasi exact match metrics for this task.
+        metric_specs = get_basic_metric_specs(["quasi_exact_match", "quasi_prefix_exact_match", "f1_score"])
+    else:
+        raise ValueError(f"Unknown method: {method}")

     return RunSpec(
-        name="
+        name=f"lm_entry:task={task},method={method}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=
-        groups=["
+        metric_specs=metric_specs,
+        groups=["lm_entry"],
     )


-@run_spec_function("
-def
+@run_spec_function("thai_exam")
+def get_thai_exam_spec(exam: str = "onet", method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:
     scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.
-        args={"subset": subset},
+        class_name="helm.benchmark.scenarios.thai_exam_scenario.ThaiExamScenario", args={"exam": exam}
     )

-    adapter_spec =
-
-
-
-
-
-        metric_specs=get_instruction_following_critique_metric_specs(num_respondents),
-        groups=["anthropic_hh_rlhf"],
-    )
-
-
-@run_spec_function("cleva")
-def get_cleva_spec(task: str, version: str, subtask: Optional[str] = None, prompt_id: int = 0) -> RunSpec:
-    from .scenarios.cleva_scenario import CLEVAScenario  # noqa
-
-    CLEVAScenario.download_dataset(task, version)
-
-    _, prompt_setting = CLEVAScenario.get_prompt_setting(task, subtask, version, prompt_id)
-    inference_parameters = CLEVAScenario.load_inference_parameters(task, subtask, version, prompt_id)
-
-    class_name_prefix = "".join([word.capitalize() for word in task.split("_")])
-    scenario_spec = ScenarioSpec(
-        class_name=f"helm.benchmark.scenarios.cleva_scenario.CLEVA{class_name_prefix}Scenario",
-        args={"version": version, "subtask": subtask, "prompt_id": prompt_id},
+    adapter_spec = get_multiple_choice_adapter_spec(
+        method=method,
+        instructions="The following are multiple choice questions (with answers).",
+        input_noun="Question",
+        output_noun="Answer",
+        max_train_instances=5,
     )
-    run_spec_name: str = f"cleva:task={task},version={version},prompt_id={prompt_id}"
-    if subtask:
-        run_spec_name += f",subtask={subtask}"
-
-    if task in ["copyright"]:
-        adapter_spec = get_completion_adapter_spec(
-            temperature=inference_parameters.get("temperature", 0.2),
-            max_tokens=inference_parameters.get("max_tokens", 1024),
-            num_outputs=inference_parameters.get("num_outputs", 1),
-        )
-        args = {"normalize_by_prefix_length": True, "normalize_newline_space_tab": False}
-        metric_specs = get_cleva_copyright_metric_spec(args) + get_cleva_generative_harms_metric_specs()
-    elif task in ["code_synthesis"]:
-        adapter_spec = get_completion_adapter_spec(
-            instructions=prompt_setting.instructions,
-            temperature=inference_parameters.get("temperature", 0.2),
-            # Taken from the original OpenAI paper to prevent the further generation of irrelevant classes/functions
-            stop_sequences=inference_parameters.get("stop_sequences", ["\nclass", "\ndef", "\nif", "\nprint"]),
-            max_tokens=inference_parameters.get("max_tokens", 600),
-        )
-        metric_specs = get_basic_metric_specs(["code_eval_acc", "pass"]) + get_cleva_generative_harms_metric_specs()
-    elif task in ["language_modeling"]:
-        adapter_spec = get_language_modeling_adapter_spec()
-        metric_specs = get_basic_metric_specs([])
-    else:
-        if prompt_setting.method in [
-            ADAPT_MULTIPLE_CHOICE_JOINT,
-            ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
-            ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
-        ]:
-            if prompt_setting.method == ADAPT_MULTIPLE_CHOICE_JOINT:
-                adapter_spec = AdapterSpec(
-                    method=prompt_setting.method,
-                    instructions=prompt_setting.instructions,
-                    input_prefix=prompt_setting.input_prefix,
-                    input_suffix=prompt_setting.input_suffix,
-                    output_prefix=prompt_setting.output_prefix,
-                    output_suffix=prompt_setting.output_suffix,
-                    max_train_instances=inference_parameters.get("max_train_instances", 5),
-                    num_outputs=inference_parameters.get("num_outputs", 5),
-                    max_tokens=inference_parameters.get("max_tokens", 1),
-                    temperature=inference_parameters.get("temperature", 0.0),
-                    stop_sequences=inference_parameters.get("stop_sequences", ["\n"]),
-                    sample_train=inference_parameters.get("sample_train", True),
-                    multi_label=inference_parameters.get("multi_label", False),
-                )
-            else:
-                adapter_spec = AdapterSpec(
-                    method=prompt_setting.method,
-                    instructions=prompt_setting.instructions,
-                    input_prefix=prompt_setting.input_prefix,
-                    input_suffix=prompt_setting.input_suffix,
-                    output_prefix=prompt_setting.output_prefix,
-                    output_suffix=prompt_setting.output_suffix,
-                    # Separate is basically language modeling, so can't easily use in-context examples
-                    max_train_instances=inference_parameters.get("max_train_instances", 5),
-                    num_outputs=1,
-                    max_tokens=0,
-                    temperature=inference_parameters.get("temperature", 0.0),
-                    sample_train=inference_parameters.get("sample_train", True),
-                )
-            metric_specs = get_exact_match_metric_specs()
-            if task in ["fact_checking", "bias"]:
-                metric_specs += get_multiple_choice_classification_metric_specs()
-        elif prompt_setting.method == ADAPT_GENERATION:
-            adapter_spec = AdapterSpec(
-                method=prompt_setting.method,
-                instructions=prompt_setting.instructions,
-                input_prefix=prompt_setting.input_prefix,
-                input_suffix=prompt_setting.input_suffix,
-                output_prefix=prompt_setting.output_prefix,
-                output_suffix=prompt_setting.output_suffix,
-                max_train_instances=inference_parameters.get("max_train_instances", 5),
-                num_outputs=inference_parameters.get("num_outputs", 1),
-                max_tokens=inference_parameters.get("max_tokens", 20),
-                temperature=inference_parameters.get("temperature", 0.0),
-                stop_sequences=inference_parameters.get("stop_sequences", ["\n"]),
-                sample_train=inference_parameters.get("sample_train", True),
-                multi_label=inference_parameters.get("multi_label", True),
-            )
-            metric_specs = (
-                get_cleva_generative_task_metric_spec(task, subtask) + get_cleva_generative_harms_metric_specs()
-            )
-        else:
-            raise ValueError(
-                f"{task} can only be {ADAPT_GENERATION}, {ADAPT_MULTIPLE_CHOICE_JOINT}, "
-                f"{ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED} or {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL}"
-            )

     return RunSpec(
-        name=
+        name=f"thai_exam:exam={exam},method={method}",
         scenario_spec=scenario_spec,
         adapter_spec=adapter_spec,
-        metric_specs=
-        groups=["
+        metric_specs=get_exact_match_metric_specs(),
+        groups=["thai_exam"],
     )
-
-
-############################################################
-
-
-def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]:
-    """
-    Takes a specification (name, args) and returns a list of `RunSpec`s.
-    """
-    # Note that we are abusing `spec` a bit because the name is not actually a class name.
-    name = spec.class_name
-    args = spec.args
-
-    if name not in CANONICAL_RUN_SPEC_FUNCS:
-        raise ValueError(f"Unknown run spec name: {name}")
-
-    # Peel off the run expanders (e.g., model)
-    expanders = [RUN_EXPANDERS[key](value) for key, value in args.items() if key in RUN_EXPANDERS]  # type: ignore
-    args = dict((key, value) for key, value in args.items() if key not in RUN_EXPANDERS)
-
-    # Get the canonical run specs
-    run_specs = [CANONICAL_RUN_SPEC_FUNCS[name](**args)]
-
-    # Apply expanders
-    for expander in expanders:
-        run_specs = [
-            child_run_spec for parent_run_spec in run_specs for child_run_spec in expander.expand(parent_run_spec)
-        ]
-
-    def alter_run_spec(run_spec: RunSpec) -> RunSpec:
-        try:
-            model = get_model(run_spec.adapter_spec.model)
-        except ValueError:
-            # Models registered from configs cannot have expanders applied to them,
-            # because the models will not have been registered yet at this point.
-            return run_spec
-        # For models that strip newlines, when we're generating, we need to set
-        # the delimiter to be '###' so we stop properly.
-        if NO_NEWLINES_TAG in model.tags and run_spec.adapter_spec.method in (
-            ADAPT_GENERATION,
-            ADAPT_MULTIPLE_CHOICE_JOINT,
-        ):
-            stop_expander = StopRunExpander(value="hash")
-            run_spec = singleton(stop_expander.expand(run_spec))
-
-        if NLG_PREFIX_TAG in model.tags:
-            global_prefix_expander = GlobalPrefixRunExpander(value="nlg")
-            run_spec = singleton(global_prefix_expander.expand(run_spec))
-
-        # When running ChatGPT on non-language modelling tasks, increase max_tokens by 1
-        # to add room for the special message role token.
-        if OPENAI_CHATGPT_MODEL_TAG in model.tags and run_spec.adapter_spec.max_tokens:
-            increase_max_tokens_expander = IncreaseMaxTokensRunExpander(value=1)
-            run_spec = singleton(increase_max_tokens_expander.expand(run_spec))
-
-        if CHATML_MODEL_TAG in model.tags:
-            chatml_expander = ChatMLRunExpander()
-            run_spec = singleton(chatml_expander.expand(run_spec))
-
-        # Special handling for Anthropic Claude
-        if ANTHROPIC_CLAUDE_1_MODEL_TAG in model.tags or ANTHROPIC_CLAUDE_2_MODEL_TAG in model.tags:
-            try:
-                import anthropic
-                from helm.proxy.clients.anthropic_client import AnthropicClient
-            except ModuleNotFoundError as e:
-                handle_module_not_found_error(e, ["anthropic"])
-            claude_run_expanders: List[RunExpander] = []
-            claude_run_expanders.append(AddToStopRunExpander(anthropic.HUMAN_PROMPT))
-            if ANTHROPIC_CLAUDE_1_MODEL_TAG in model.tags:
-                claude_run_expanders.append(IncreaseMaxTokensRunExpander(value=AnthropicClient.ADDITIONAL_TOKENS))
-            # Get scenario tags
-            components = run_spec.scenario_spec.class_name.split(".")
-            class_name = components[-1]
-            module_name = ".".join(components[:-1])
-            cls = getattr(importlib.import_module(module_name), class_name)
-            scenario_tags: List[str] = cls.tags
-            # If the scenario is instruction, do not use PROMPT_ANSWER_START
-            if "instructions" in scenario_tags:
-                claude_run_expanders.append(
-                    FormatPromptRunExpander(prefix=anthropic.HUMAN_PROMPT, suffix=f"{anthropic.AI_PROMPT}")
-                )
-            else:
-                claude_run_expanders.append(
-                    FormatPromptRunExpander(
-                        prefix=anthropic.HUMAN_PROMPT,
-                        suffix=f"{anthropic.AI_PROMPT} {AnthropicClient.PROMPT_ANSWER_START}",
-                    )
-                )
-            for claude_run_expander in claude_run_expanders:
-                run_spec = singleton(claude_run_expander.expand(run_spec))
-
-        # For multiple choice
-        if BUGGY_TEMP_0_TAG in model.tags and run_spec.adapter_spec.temperature == 0:
-            increase_temperature_expander = IncreaseTemperatureRunExpander(value=1e-4)
-            run_spec = singleton(increase_temperature_expander.expand(run_spec))
-
-        return run_spec
-
-    run_specs = [alter_run_spec(run_spec) for run_spec in run_specs]
-
-    return run_specs