crfm-helm 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/METADATA +138 -31
- crfm_helm-0.5.1.dist-info/RECORD +654 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +31 -3
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/gender_perturbation.py +1 -1
- helm/benchmark/augmentations/perturbation.py +25 -3
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/test_perturbation.py +41 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +20 -7
- helm/benchmark/metrics/basic_metrics.py +169 -664
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -110
- helm/benchmark/metrics/dry_run_metrics.py +2 -2
- helm/benchmark/metrics/efficiency_metrics.py +213 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +392 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +2 -2
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +2 -2
- helm/benchmark/metrics/test_classification_metrics.py +8 -5
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +575 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +74 -0
- helm/benchmark/model_metadata_registry.py +41 -1
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +26 -10
- helm/benchmark/presentation/schema.py +15 -40
- helm/benchmark/presentation/summarize.py +119 -79
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +1 -2
- helm/benchmark/presentation/test_summarize.py +3 -3
- helm/benchmark/run.py +54 -26
- helm/benchmark/run_expander.py +205 -35
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +163 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/classic_run_specs.py +1510 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +757 -0
- helm/benchmark/runner.py +51 -57
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +1 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/legalbench_scenario.py +6 -2
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +1 -1
- helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
- helm/benchmark/scenarios/scenario.py +4 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +6 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +246 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +3 -4
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +5 -3
- helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
- helm/benchmark/server.py +24 -1
- helm/benchmark/slurm_runner.py +70 -49
- helm/benchmark/static/benchmarking.js +1 -1
- helm/benchmark/static/schema_classic.yaml +258 -1066
- helm/benchmark/static/schema_image2structure.yaml +304 -0
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +2 -227
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +164 -0
- helm/benchmark/static/schema_vlm.yaml +823 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-737eef9e.js +10 -0
- helm/benchmark/static_build/assets/index-878a1094.css +1 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -44
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +3 -9
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +233 -18
- helm/{proxy/clients → clients}/auto_client.py +59 -31
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +65 -7
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +4 -11
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +5 -5
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +5 -7
- helm/{proxy/clients → clients}/huggingface_client.py +43 -64
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
- helm/{proxy/clients → clients}/megatron_client.py +5 -5
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +301 -0
- helm/{proxy/clients → clients}/palmyra_client.py +6 -8
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +100 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +20 -8
- helm/{proxy/clients → clients}/together_client.py +104 -73
- helm/clients/vertexai_client.py +400 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
- helm/clients/vision_language/huggingface_vlm_client.py +111 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +54 -49
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +16 -4
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +1 -1
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +33 -3
- helm/common/key_value_store.py +35 -4
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/request.py +15 -17
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +1 -1
- helm/config/model_deployments.yaml +1159 -538
- helm/config/model_metadata.yaml +868 -41
- helm/config/tokenizer_configs.yaml +149 -43
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +8 -6
- helm/proxy/example_queries.py +29 -17
- helm/proxy/server.py +70 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +96 -16
- helm/proxy/services/service.py +30 -0
- helm/proxy/services/test_remote_service.py +4 -3
- helm/proxy/services/test_service.py +0 -12
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/test_model_deployment_definition.py +0 -92
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.1.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py

@@ -1,10 +1,23 @@
 # mypy: check_untyped_defs = False
+from typing import List, Set
 from helm.benchmark.scenarios.scenario import TEST_SPLIT, TRAIN_SPLIT, Instance, Input, Output, Reference, CORRECT_TAG
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
 from .adapter_factory import AdapterFactory, ADAPT_MULTIPLE_CHOICE_JOINT
 from .test_adapter import TestAdapter
 
 
+def _make_instance(
+    text: str, reference_texts: List[str], correct_references: Set[int], is_eval: bool = False
+) -> Instance:
+    references = []
+    for i, reference_text in enumerate(reference_texts):
+        tags = [CORRECT_TAG] if i in correct_references else []
+        references.append(Reference(Output(text=reference_text), tags=tags))
+
+    split = TEST_SPLIT if is_eval else TRAIN_SPLIT
+    return Instance(Input(text=text), references=references, split=split)
+
+
 class TestMultipleChoiceJointAdapter(TestAdapter):
     def test_sample_examples(self):
         adapter_spec = AdapterSpec(
@@ -53,6 +66,47 @@ class TestMultipleChoiceJointAdapter(TestAdapter):
         examples = adapter.sample_examples(all_train_instances, seed=0)
         assert len(examples) == 3
 
+    def test_sample_examples_unique_labels(self):
+        """This is a demonstration of behavior reported in issue #2224."""
+        adapter_spec = AdapterSpec(
+            method=ADAPT_MULTIPLE_CHOICE_JOINT, model="openai/ada", model_deployment="openai/ada", max_train_instances=3
+        )
+        adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service)
+        all_train_instances = [
+            # Three with 0 being correct.
+            _make_instance("one", ["0", "1"], correct_references={0}),
+            _make_instance("two", ["2", "3"], correct_references={0}),
+            _make_instance("three", ["4", "5"], correct_references={0}),
+            # Two with 1 being correct.
+            _make_instance("four", ["6", "7"], correct_references={1}),
+            _make_instance("five", ["8", "9"], correct_references={1}),
+        ]
+        eval_instance = _make_instance("eval", ["10", "11"], correct_references={1}, is_eval=True)
+        request_states = adapter.adapt(all_train_instances + [eval_instance], parallelism=1)
+        assert len(request_states) == 1
+        # In every case, we are showing that model that Output should be "A".
+        assert request_states[0].request.prompt == (
+            "Input: three\n"
+            "A. 4\n"
+            "B. 5\n"
+            "Output: A\n"
+            "\n"
+            "Input: two\n"
+            "A. 2\n"
+            "B. 3\n"
+            "Output: A\n"
+            "\n"
+            "Input: one\n"
+            "A. 0\n"
+            "B. 1\n"
+            "Output: A\n"
+            "\n"
+            "Input: eval\n"
+            "A. 10\n"
+            "B. 11\n"
+            "Output:"
+        )
+
     def test_multiple_correct_reference(self):
         adapter_spec = AdapterSpec(
             method=ADAPT_MULTIPLE_CHOICE_JOINT,
@@ -91,9 +145,9 @@ class TestMultipleChoiceJointAdapter(TestAdapter):
             ],
             split=TEST_SPLIT,
         )
-
-        assert len(
-        assert
+        request_states = adapter.adapt(train_instances + [eval_instance], parallelism=1)
+        assert len(request_states) == 1
+        assert request_states[0].request.prompt == (
             "Input: Second reference is correct\n"
             "A. First\n"
             "B. Second\n"
@@ -150,9 +204,9 @@ class TestMultipleChoiceJointAdapter(TestAdapter):
             ],
             split=TEST_SPLIT,
         )
-
-        assert len(
-        assert
+        request_states = adapter.adapt(train_instances + [eval_instance], parallelism=1)
+        assert len(request_states) == 1
+        assert request_states[0].request.prompt == (
             "Input: Second reference is correct\n"
             "A. First\n"
             "B. Second\n"
helm/benchmark/adaptation/common_adapter_specs.py

@@ -0,0 +1,376 @@
+from typing import List, Optional
+
+from helm.benchmark.adaptation.adapter_spec import (
+    ADAPT_GENERATION,
+    ADAPT_LANGUAGE_MODELING,
+    ADAPT_MULTIPLE_CHOICE_JOINT,
+    ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED,
+    ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL,
+    ADAPT_RANKING_BINARY,
+    AdapterSpec,
+)
+
+
+def format_instructions(instructions: str) -> str:
+    if len(instructions) > 0:
+        instructions += "\n"
+    return instructions
+
+
+def get_multiple_choice_joint_adapter_spec(
+    instructions: str,
+    input_noun: Optional[str],
+    output_noun: str,
+    num_outputs: int = 5,
+    max_train_instances: int = 5,
+    max_tokens: int = 5,
+    sample_train: bool = True,
+    **kwargs,
+) -> AdapterSpec:
+    """
+    [instructions]
+
+    [input_noun]: [input]
+    [reference_1]
+    ...
+    [reference_k]
+    [output_noun]: [output]
+
+    [input_noun]: [input]
+    [reference_1]
+    ...
+    [reference_k]
+    [output_noun]:
+    """
+
+    return AdapterSpec(
+        method=ADAPT_MULTIPLE_CHOICE_JOINT,
+        instructions=format_instructions(instructions),
+        input_prefix=f"{input_noun}: " if input_noun is not None else "",
+        input_suffix="\n" if input_noun is not None else "",
+        output_prefix=f"{output_noun}: ",
+        output_suffix="\n",
+        max_train_instances=max_train_instances,
+        num_outputs=num_outputs,
+        max_tokens=max_tokens,
+        temperature=0.0,
+        stop_sequences=["\n"],
+        sample_train=sample_train,
+        **kwargs,
+    )
+
+
+def get_multiple_choice_separate_adapter_spec(method: str, empty_input: bool = False) -> AdapterSpec:
+    """
+    [input] [reference_i]
+    or
+    [reference_i]
+    """
+    assert method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}
+
+    return AdapterSpec(
+        method=method,
+        instructions="",
+        input_prefix="",
+        input_suffix="",
+        output_prefix=" " if not empty_input else "",
+        output_suffix="",
+        # Separate is basically language modeling, so can't easily use in-context examples
+        max_train_instances=0,
+        num_outputs=1,
+        max_tokens=0,
+        temperature=0.0,
+    )
+
+
+def get_multiple_choice_adapter_spec(
+    method: str,
+    instructions: str,
+    input_noun: Optional[str],
+    output_noun: str,
+    max_train_instances: int = 5,
+    num_outputs: int = 5,
+    max_tokens: int = 1,
+    empty_input: bool = False,
+    sample_train: bool = True,
+    **kwargs,
+):
+    """
+    Toggle between joint and separate adapters.
+    """
+    if method == ADAPT_MULTIPLE_CHOICE_JOINT:
+        return get_multiple_choice_joint_adapter_spec(
+            instructions,
+            input_noun,
+            output_noun,
+            max_train_instances=max_train_instances,
+            num_outputs=num_outputs,
+            max_tokens=max_tokens,
+            sample_train=sample_train,
+            **kwargs,
+        )
+    elif method in {ADAPT_MULTIPLE_CHOICE_SEPARATE_ORIGINAL, ADAPT_MULTIPLE_CHOICE_SEPARATE_CALIBRATED}:
+        return get_multiple_choice_separate_adapter_spec(method, empty_input)
+    else:
+        raise ValueError(f"Invalid adaptation method: {method}")
+
+
+def get_ranking_binary_adapter_spec(
+    instructions: str = "",
+    document_noun: str = "Passage",
+    query_noun: str = "Query",
+    output_prefix: str = "Does the passage answer the query?",
+    output_noun: str = "Answer",
+    max_train_instances: int = 4,
+    num_outputs: int = 1,
+    num_train_trials: int = 1,
+    temperature: float = 0.0,
+    max_tokens: int = 5,
+    **kwargs,
+) -> AdapterSpec:
+    """
+    [instructions]
+
+    [object_noun]: [object]
+    [query_noun]: [query]
+    [prompt_noun]: [prompt_content]
+    [output_noun]: [output]
+
+    ...
+
+    [object_noun]: [object]
+    [query_noun]: [query]
+    [prompt_noun]: [prompt_content]
+    [output_noun]: [output]
+
+    [object_noun]: [object]
+    [query_noun]: [query]
+    [prompt_noun]: [prompt_content]
+    [output_noun]: [output]
+    """
+    msg = (
+        "There must be an even number of in-context examples to ensure that"
+        "an equal number of positive and negative examples are included."
+    )
+    assert max_train_instances % 2 == 0, msg
+    max_train_instances = int(max_train_instances / 2)
+
+    return AdapterSpec(
+        method=ADAPT_RANKING_BINARY,
+        instructions=format_instructions(instructions),
+        input_prefix=f"{query_noun}: ",
+        input_suffix="\n",
+        reference_prefix=f"{document_noun}: ",
+        reference_suffix="\n",
+        output_prefix=f"{output_prefix}\n{output_noun}: ",
+        max_train_instances=max_train_instances,
+        num_outputs=num_outputs,
+        num_train_trials=num_train_trials,
+        temperature=temperature,
+        max_tokens=max_tokens,
+        **kwargs,
+    )
+
+
+def get_completion_adapter_spec(
+    instructions: str = "",
+    input_prefix: str = "",
+    output_prefix: str = "",
+    output_suffix: str = "",
+    max_train_instances: int = 0,
+    temperature: float = 0.0,
+    num_outputs: int = 1,
+    max_tokens: int = 100,
+    stop_sequences: Optional[List] = None,  # default value of `stop_sequences` is no stop sequence,
+    **kwargs,
+) -> AdapterSpec:
+    """
+    [input][output_prefix][output][output_suffix]
+
+    [input][output_prefix]
+    """
+    if stop_sequences is None:
+        stop_sequences = []
+
+    return AdapterSpec(
+        method=ADAPT_GENERATION,
+        instructions=format_instructions(instructions),
+        input_prefix=input_prefix,
+        input_suffix="",
+        output_prefix=output_prefix,
+        output_suffix=output_suffix,
+        max_train_instances=max_train_instances,
+        temperature=temperature,
+        num_outputs=num_outputs,
+        max_tokens=max_tokens,
+        stop_sequences=stop_sequences,
+        **kwargs,
+    )
+
+
+def get_generation_adapter_spec(
+    instructions: str = "",
+    input_noun: Optional[str] = None,
+    newline_after_input_noun: bool = False,
+    output_noun: Optional[str] = None,
+    newline_after_output_noun: bool = False,
+    max_train_instances: int = 5,
+    num_outputs: int = 1,
+    max_tokens: int = 5,
+    stop_sequences: Optional[List] = None,  # default value of `stop_sequences` is ["\n"]
+    temperature: float = 0.0,
+    multi_label: bool = False,
+) -> AdapterSpec:
+    """
+    [instructions]
+
+    [input_noun]: [input]
+    [output_noun]: [output]
+
+    [input_noun]: [input]
+    [output_noun]:
+    """
+
+    def format_prefix(noun: Optional[str], append_new_line: bool) -> str:
+        """
+        When `append_new_line` is False:
+        [input_noun]: [input]
+
+        When `append_new_line` is True:
+        [input_noun]:
+        [input]
+        """
+        prefix: str = f"{noun}:" if noun is not None else ""
+        if len(prefix) > 0:
+            prefix += "\n" if append_new_line else " "
+        return prefix
+
+    if stop_sequences is None:
+        stop_sequences = ["\n"]
+
+    return AdapterSpec(
+        method=ADAPT_GENERATION,
+        instructions=format_instructions(instructions),
+        input_prefix=format_prefix(input_noun, append_new_line=newline_after_input_noun),
+        input_suffix="\n",
+        output_prefix=format_prefix(output_noun, append_new_line=newline_after_output_noun),
+        output_suffix="\n",
+        max_train_instances=max_train_instances,
+        num_outputs=num_outputs,
+        max_tokens=max_tokens,
+        temperature=temperature,
+        stop_sequences=stop_sequences,
+        multi_label=multi_label,
+    )
+
+
+def get_instruct_adapter_spec(
+    num_outputs: int = 1,
+    max_tokens: int = 512,
+    temperature: float = 0.7,
+) -> AdapterSpec:
+    """
+    Zero-shot instruction-following.
+    """
+    return AdapterSpec(
+        method=ADAPT_GENERATION,
+        instructions="",
+        input_prefix="",
+        input_suffix="\n",
+        output_prefix="",
+        output_suffix="",
+        max_train_instances=0,
+        num_outputs=num_outputs,
+        max_tokens=max_tokens,
+        temperature=temperature,
+        stop_sequences=[],
+    )
+
+
+def get_few_shot_instruct_adapter_spec(
+    num_outputs: int = 1,
+    max_tokens: int = 512,
+    temperature: float = 0.7,
+    max_train_instances: int = 0,
+) -> AdapterSpec:
+    """
+    Few-shot instruction-following.
+    """
+    return AdapterSpec(
+        method=ADAPT_GENERATION,
+        instructions="",
+        input_prefix="",
+        input_suffix="\n",
+        output_prefix="",
+        output_suffix="",
+        max_train_instances=max_train_instances,
+        num_outputs=num_outputs,
+        max_tokens=max_tokens,
+        temperature=temperature,
+        stop_sequences=[],
+    )
+
+
+def get_language_modeling_adapter_spec() -> AdapterSpec:
+    """
+    Used for language modeling.
+    """
+    return AdapterSpec(
+        method=ADAPT_LANGUAGE_MODELING,
+        instructions="",
+        input_prefix="",
+        input_suffix="",
+        output_prefix="",
+        output_suffix="",
+        max_train_instances=0,
+        num_outputs=1,
+        max_tokens=0,
+        temperature=0.0,
+    )
+
+
+def get_summarization_adapter_spec(num_sents: Optional[int], max_train_instances: int = 5, **kwargs) -> AdapterSpec:
+    """
+    Used for summarization.
+    """
+
+    if num_sents == 1:
+        out_pref = "Summarize the above article in 1 sentence.\n"
+    elif num_sents is None:
+        out_pref = "Summarize the above article.\n"
+    else:
+        out_pref = f"Summarize the above article in {num_sents} sentences.\n"
+
+    return AdapterSpec(
+        method=ADAPT_GENERATION,
+        instructions="",
+        input_prefix="###\nArticle: ",
+        input_suffix="\n\n",
+        output_prefix=out_pref,
+        output_suffix="\n",
+        max_train_instances=max_train_instances,
+        num_outputs=1,
+        stop_sequences=["###"],  # Separator between few-shot instances.
+        **kwargs,
+    )
+
+
+def get_machine_translation_adapter_spec(
+    source_language, target_language, max_train_instances, **kwargs
+) -> AdapterSpec:
+    """
+    Used for machine translation.
+    """
+    return AdapterSpec(
+        method=ADAPT_GENERATION,
+        instructions=f"Translate the following sentences from {source_language} to {target_language}.",
+        input_prefix=f"{source_language}: ",
+        input_suffix="\n",
+        output_prefix=f"{target_language}: ",
+        output_suffix="\n",
+        max_train_instances=max_train_instances,
+        num_outputs=1,
+        stop_sequences=["\n\n"],
+        temperature=0.0,
+        **kwargs,
+    )
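For orientation only (not part of the diff): a minimal sketch of how the helpers added above might be called when building adapter specs for a run spec. The import path follows the new file's location in the listing, the arguments mirror the signatures in the hunk, and the instruction and noun strings are made up for illustration.

from helm.benchmark.adaptation.adapter_spec import ADAPT_MULTIPLE_CHOICE_JOINT
from helm.benchmark.adaptation.common_adapter_specs import (
    get_generation_adapter_spec,
    get_multiple_choice_adapter_spec,
)

# Question-answering style generation: "Question: ...\nAnswer: ..." with the default "\n" stop sequence.
qa_adapter_spec = get_generation_adapter_spec(input_noun="Question", output_noun="Answer", max_tokens=50)

# Joint multiple choice: references rendered as "A. ... / B. ..." with the answer letter as the target.
mc_adapter_spec = get_multiple_choice_adapter_spec(
    method=ADAPT_MULTIPLE_CHOICE_JOINT,
    instructions="Answer with the letter of the correct choice.",
    input_noun="Question",
    output_noun="Answer",
)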
helm/benchmark/adaptation/request_state.py

@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import Optional, Dict, List
+from typing import Optional, Dict, List, Any
 
 from helm.benchmark.scenarios.scenario import Instance
 from helm.common.general import indent_lines, format_text_lines, serialize
@@ -45,6 +45,11 @@ class RequestState:
     num_conditioning_tokens: int = 0
     """The number of initial tokens that will be ignored when computing language modeling metrics"""
 
+    annotations: Optional[Dict[str, Any]] = None
+    """Output of some post-processing step that is needed for the metric to understand the request
+    Should match the annotator's name to an Annotation (usually a list of dictionaries for each completion)
+    Example: parsing, rendering an image based on the text completion, etc."""
+
     def __post_init__(self):
         if self.request_mode:
             assert self.request_mode in ["original", "calibration"], f"Invalid request_mode: {self.request_mode}"
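As context for the new `annotations` field (a sketch, not part of the diff): per the docstring above, it maps an annotator's name to that annotator's output, usually one entry per completion, so a metric might read it roughly as follows. The helper name is hypothetical; the "dummy" key matches the DummyAnnotator introduced further down.

from typing import Any, Dict

from helm.benchmark.adaptation.request_state import RequestState


def first_completion_annotation(request_state: RequestState) -> Dict[str, Any]:
    # Hypothetical helper: return the annotation attached to the first completion, if any.
    if not request_state.annotations:
        return {}
    per_completion = request_state.annotations.get("dummy", [])
    return per_completion[0] if per_completion else {}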
helm/benchmark/adaptation/scenario_state.py

@@ -3,8 +3,9 @@ from dataclasses import dataclass
 from typing import List, Dict, Tuple, Optional
 
 from helm.benchmark.scenarios.scenario import Instance
-from .adapter_spec import AdapterSpec
-from .request_state import RequestState
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.annotation.annotator import AnnotatorSpec
 
 
 @dataclass
@@ -21,6 +22,9 @@ class ScenarioState:
     # List of `RequestState`s that were produced by adaptation (and execution)
     request_states: List[RequestState]
 
+    # Annotations to use for this run spec
+    annotator_specs: Optional[List[AnnotatorSpec]] = None
+
     def __post_init__(self):
         # Create derived indices based on `request_states` so it's easier for
         # the `Metric` later to access them. Two things are produced:
helm/benchmark/annotation/annotator.py

@@ -0,0 +1,43 @@
+from typing import Dict, List, Any
+from abc import abstractmethod, ABC
+from dataclasses import dataclass
+
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.common.object_spec import ObjectSpec
+
+
+class Annotator(ABC):
+    """Annotator is an abstract class for annotating a request state. Annotators are used to add additional
+    information to a request state that is needed for a metric to understand the request. This could be
+    parsing, rendering an image based on the text completion, etc."""
+
+    name: str
+    """Name of the annotator. Should be filled in by the subclass."""
+
+    @abstractmethod
+    def annotate(self, request_state: RequestState) -> Any:
+        """Fills the annotations field of the request state with additional information
+        that are implementation specific."""
+        pass
+
+
+@dataclass(frozen=True)
+class AnnotatorSpec(ObjectSpec):
+    """Specifies how to create an `Annotator`.
+    The user should only specify the class name.
+    The arguments will be filled in by the `AnnotatorFactory`.
+    """
+
+    pass
+
+
+class DummyAnnotator(Annotator):
+    """A dummy annotator that does nothing."""
+
+    name = "dummy"
+
+    def annotate(self, request_state: RequestState) -> List[Dict[str, Any]]:
+        if request_state.result is None:
+            raise ValueError("Annotation requires a result")
+        annotation_values: List[str] = [completion.text.upper() for completion in request_state.result.completions]
+        return [{"all_caps": value} for value in annotation_values]
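For orientation (not part of the diff): a minimal sketch of a custom annotator written against the interface above. The class and its `name` are hypothetical; the return shape mirrors DummyAnnotator, one dictionary per completion.

from typing import Any, Dict, List

from helm.benchmark.adaptation.request_state import RequestState
from helm.benchmark.annotation.annotator import Annotator


class WordCountAnnotator(Annotator):
    """Hypothetical annotator that attaches the word count of each completion."""

    name = "word_count"

    def annotate(self, request_state: RequestState) -> List[Dict[str, Any]]:
        if request_state.result is None:
            raise ValueError("Annotation requires a result")
        return [{"word_count": len(completion.text.split())} for completion in request_state.result.completions]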
helm/benchmark/annotation/annotator_factory.py

@@ -0,0 +1,61 @@
+import os
+from typing import Any, Dict, Mapping, Optional
+
+from helm.common.credentials_utils import provide_api_key
+from helm.common.cache_backend_config import CacheBackendConfig, CacheConfig
+from helm.common.hierarchical_logger import hlog
+from helm.common.object_spec import create_object, inject_object_spec_args
+from helm.benchmark.annotation.annotator import Annotator, AnnotatorSpec
+
+
+class AnnotatorFactory:
+    """Factory for creating annotators."""
+
+    def __init__(
+        self, credentials: Mapping[str, Any], file_storage_path: str, cache_backend_config: CacheBackendConfig
+    ):
+        self.credentials = credentials
+        self.file_storage_path = file_storage_path
+        self.cache_backend_config = cache_backend_config
+        hlog(f"AnnotatorFactory: file_storage_path = {file_storage_path}")
+        hlog(f"AnnotatorFactory: cache_backend_config = {cache_backend_config}")
+
+        # Cache for annotators
+        # This is used to prevent duplicate creation of annotators
+        # It is especially important as annotation is a multi-threaded
+        # process and creating a new annotator for each request can cause
+        # race conditions.
+        self.annotators: Dict[str, Annotator] = {}
+
+    def get_annotator(self, annotator_spec: AnnotatorSpec) -> Annotator:
+        """Return a annotator based on the name."""
+        # First try to find the annotator in the cache
+        assert annotator_spec.args is None or annotator_spec.args == {}
+        annotator_name: str = annotator_spec.class_name.split(".")[-1].lower().replace("annotator", "")
+        annotator: Optional[Annotator] = self.annotators.get(annotator_name)
+        if annotator is not None:
+            return annotator
+
+        # Otherwise, create the client
+        cache_config: CacheConfig = self.cache_backend_config.get_cache_config(annotator_name)
+        annotator_spec = inject_object_spec_args(
+            annotator_spec,
+            constant_bindings={
+                "cache_config": cache_config,
+            },
+            provider_bindings={
+                "api_key": lambda: provide_api_key(self.credentials, annotator_name),
+                "file_storage_path": lambda: self._get_file_storage_path(annotator_name),
+            },
+        )
+        annotator = create_object(annotator_spec)
+
+        # Cache the client
+        self.annotators[annotator_name] = annotator
+
+        return annotator
+
+    def _get_file_storage_path(self, annotator_name: str) -> str:
+        # Returns the path to use for a local file cache for the given annotator
+        local_file_cache_path: str = os.path.join(self.file_storage_path, "output", annotator_name)
+        return local_file_cache_path