crfm-helm 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +134 -31
- crfm_helm-0.5.0.dist-info/RECORD +642 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +31 -3
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +20 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +2 -3
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +32 -8
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +7 -19
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +60 -6
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/gender_perturbation.py +1 -1
- helm/benchmark/augmentations/perturbation.py +8 -2
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/test_perturbation.py +11 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +20 -7
- helm/benchmark/metrics/basic_metrics.py +169 -664
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -110
- helm/benchmark/metrics/dry_run_metrics.py +2 -2
- helm/benchmark/metrics/efficiency_metrics.py +206 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +2 -2
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +2 -2
- helm/benchmark/metrics/test_classification_metrics.py +8 -5
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +5 -1
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +10 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +74 -0
- helm/benchmark/model_metadata_registry.py +36 -0
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +26 -10
- helm/benchmark/presentation/schema.py +15 -40
- helm/benchmark/presentation/summarize.py +119 -79
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +1 -2
- helm/benchmark/presentation/test_summarize.py +3 -3
- helm/benchmark/run.py +54 -26
- helm/benchmark/run_expander.py +214 -16
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +162 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/classic_run_specs.py +1510 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +501 -0
- helm/benchmark/runner.py +51 -57
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +1 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +1 -1
- helm/benchmark/scenarios/opinions_qa_scenario.py +0 -4
- helm/benchmark/scenarios/scenario.py +4 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +6 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +1 -2
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +1 -1
- helm/benchmark/server.py +24 -1
- helm/benchmark/slurm_runner.py +70 -49
- helm/benchmark/static/benchmarking.js +1 -1
- helm/benchmark/static/schema_classic.yaml +258 -1066
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +2 -227
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vlm.yaml +576 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
- helm/benchmark/static_build/assets/index-d839df55.js +9 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_model_deployment_definition.py +14 -16
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -44
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +3 -9
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +203 -18
- helm/{proxy/clients → clients}/auto_client.py +59 -31
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +65 -7
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +4 -11
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +5 -5
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +5 -7
- helm/{proxy/clients → clients}/huggingface_client.py +43 -64
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +4 -4
- helm/{proxy/clients → clients}/megatron_client.py +5 -5
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +302 -0
- helm/{proxy/clients → clients}/palmyra_client.py +6 -8
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +100 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +15 -16
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +20 -8
- helm/{proxy/clients → clients}/together_client.py +12 -72
- helm/clients/vertexai_client.py +391 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vlm_client.py +104 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +53 -48
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +16 -4
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +1 -1
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +24 -1
- helm/common/key_value_store.py +35 -4
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/request.py +15 -17
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +1 -1
- helm/config/model_deployments.yaml +1069 -546
- helm/config/model_metadata.yaml +753 -31
- helm/config/tokenizer_configs.yaml +142 -43
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +8 -6
- helm/proxy/example_queries.py +29 -17
- helm/proxy/server.py +70 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +96 -16
- helm/proxy/services/service.py +30 -0
- helm/proxy/services/test_remote_service.py +4 -3
- helm/proxy/services/test_service.py +0 -12
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +6 -9
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +7 -26
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +3 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
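Note on the renames above: every module under helm/proxy/clients and helm/proxy/tokenizers now lives under helm/clients and helm/tokenizers, so downstream imports need a one-line update. A minimal sketch of the migration (the HuggingFace modules are just examples pulled from the rename list, and the class names are assumed unchanged by the move; any other moved module follows the same pattern):

# 0.4.0 import paths (now removed):
#   from helm.proxy.clients.huggingface_client import HuggingFaceClient
#   from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
# 0.5.0 import paths, per the renames listed above:
from helm.clients.huggingface_client import HuggingFaceClient
from helm.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer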
helm/benchmark/vlm_run_specs.py
DELETED
@@ -1,97 +0,0 @@
-from typing import List, Optional
-
-from .adaptation.adapter_spec import AdapterSpec
-from .adaptation.adapters.adapter_factory import ADAPT_GENERATION_MULTIMODAL
-from .metrics.metric import MetricSpec
-from .run_specs import run_spec_function, get_exact_match_metric_specs
-from .runner import RunSpec
-from .scenarios.scenario import ScenarioSpec
-
-
-############################################################
-# Prototypical adapter specs for VLM evaluation
-
-
-def get_vlm_generation_adapter_spec(
-    instructions: str = "",
-    input_prefix: str = "",
-    input_suffix: str = "",
-    output_prefix: str = "",
-    output_suffix: str = "",
-    max_tokens: int = 100,
-    stop_sequences: Optional[List[str]] = None,
-) -> AdapterSpec:
-    return AdapterSpec(
-        method=ADAPT_GENERATION_MULTIMODAL,
-        global_prefix="",
-        instructions=instructions,
-        input_prefix=input_prefix,
-        input_suffix=input_suffix,
-        output_prefix=output_prefix,
-        output_suffix=output_suffix,
-        instance_prefix="\n",
-        # We focus on zero-shot evaluation for now as most open VLMs only support a single image input
-        max_train_instances=0,
-        num_outputs=1,
-        max_tokens=max_tokens,
-        stop_sequences=stop_sequences if stop_sequences is not None else [],
-        random=None,
-    )
-
-
-############################################################
-# VHELM run specs
-
-
-@run_spec_function("viz_wiz")
-def get_viz_wiz_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.vision_language.viz_wiz_scenario.VizWizScenario", args={}
-    )
-
-    # TODO: finalize the adapter spec parameters once we add more models
-    adapter_spec: AdapterSpec = get_vlm_generation_adapter_spec(
-        input_prefix="User: ",
-        input_suffix="<end_of_utterance>",
-        output_prefix="\nAssistant: ",
-        output_suffix="<end_of_utterance>",
-        stop_sequences=["<end_of_utterance>"],
-    )
-
-    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
-
-    run_spec_name: str = "viz_wiz"
-    return RunSpec(
-        name=run_spec_name,
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=metric_specs,
-        groups=[run_spec_name],
-    )
-
-
-@run_spec_function("vqa")
-def get_vqa_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.vision_language.vqa_scenario.VQAScenario", args={}
-    )
-
-    # TODO: finalize the adapter spec parameters once we add more models
-    adapter_spec: AdapterSpec = get_vlm_generation_adapter_spec(
-        input_prefix="User: ",
-        input_suffix="<end_of_utterance>",
-        output_prefix="\nAssistant: ",
-        output_suffix="<end_of_utterance>",
-        stop_sequences=["<end_of_utterance>"],
-    )
-
-    metric_specs: List[MetricSpec] = get_exact_match_metric_specs()
-
-    run_spec_name: str = "vqa"
-    return RunSpec(
-        name=run_spec_name,
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=metric_specs,
-        groups=[run_spec_name],
-    )
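The VLM run specs deleted here appear to be superseded by helm/benchmark/run_specs/vlm_run_specs.py (+501 lines in the listing above). For reference, the adapter settings in this deleted code imply a zero-shot prompt of roughly the following shape (a sketch only; the actual assembly happens in the multimodal generation adapter):

# Sketch: how the deleted viz_wiz/vqa prefixes and suffixes fit together.
prompt = (
    "User: "              # input_prefix
    "<question about the image>"
    "<end_of_utterance>"  # input_suffix
    "\nAssistant: "       # output_prefix
)
# Decoding runs zero-shot (max_train_instances=0) for up to max_tokens=100 tokens
# and stops at the "<end_of_utterance>" stop sequence.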
helm/benchmark/window_services/flan_t5_window_service.py
DELETED
@@ -1,29 +0,0 @@
-from .encoder_decoder_window_service import EncoderDecoderWindowService
-from .tokenizer_service import TokenizerService
-
-
-class FlanT5WindowService(EncoderDecoderWindowService):
-    def __init__(self, service: TokenizerService):
-        super().__init__(service)
-
-    @property
-    def max_sequence_length(self) -> int:
-        """Return the max sequence length."""
-        # We subtract 1 to account for <extra_id_0> that gets appended to prompts.
-        return 512 - 1
-
-    @property
-    def end_of_text_token(self) -> str:
-        """The end of text token."""
-        return "</s>"
-
-    @property
-    def tokenizer_name(self) -> str:
-        """Name of the tokenizer to use when sending a request."""
-        return "google/flan-t5-xxl"
-
-    @property
-    def prefix_token(self) -> str:
-        """The prefix token is the same as the end of text token."""
-        # echo=True is not supported
-        return ""
helm/benchmark/window_services/gpt2_window_service.py
DELETED
@@ -1,32 +0,0 @@
-from .local_window_service import LocalWindowService
-from .tokenizer_service import TokenizerService
-
-
-class GPT2WindowService(LocalWindowService):
-    def __init__(self, service: TokenizerService):
-        super().__init__(service)
-
-    @property
-    def max_sequence_length(self) -> int:
-        """Return the max sequence length of this tokenizer."""
-        return 1024
-
-    @property
-    def max_request_length(self) -> int:
-        """Return the max request length of GPT-2."""
-        return self.max_sequence_length + 1
-
-    @property
-    def end_of_text_token(self) -> str:
-        """The end of text token."""
-        return "<|endoftext|>"
-
-    @property
-    def tokenizer_name(self) -> str:
-        """Name of the tokenizer to use when sending a request."""
-        return "huggingface/gpt2"
-
-    @property
-    def prefix_token(self) -> str:
-        """The prefix token for models that uses the GPT-2 tokenizer is the end of text token."""
-        return self.end_of_text_token
helm/benchmark/window_services/huggingface_window_service.py
DELETED
@@ -1,60 +0,0 @@
-from typing import Optional
-from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer
-from .local_window_service import LocalWindowService
-from .tokenizer_service import TokenizerService
-
-
-class HuggingFaceWindowService(LocalWindowService):
-    def __init__(
-        self,
-        service: TokenizerService,
-        tokenizer_name: str,
-        pretrained_model_name_or_path: Optional[str] = None,
-        max_sequence_length: Optional[int] = None,
-        max_request_length: Optional[int] = None,
-        end_of_text_token: Optional[str] = None,
-        prefix_token: Optional[str] = None,
-        **kwargs
-    ):
-        super().__init__(service)
-        self._tokenizer_name = tokenizer_name
-        # Override max_sequence_length, max_request_length, end_of_text_token
-        # and prefix_token if provided as an argument.
-        # Otherwise, auto-infer them from the Hugging Face tokenizer.
-        #
-        # Note that many Hugging Face tokenizers have incorrect sequence lengths,
-        # so it is recommended to set this manually.
-        with HuggingFaceTokenizer.get_tokenizer(
-            helm_tokenizer_name=tokenizer_name,
-            pretrained_model_name_or_path=pretrained_model_name_or_path or tokenizer_name,
-            **kwargs,
-        ) as tokenizer:
-            self._max_sequence_length = max_sequence_length or tokenizer.model_max_length
-            self._max_request_length = max_request_length or self._max_sequence_length
-            self._end_of_text_token = end_of_text_token or tokenizer.eos_token or ""
-            self._prefix_token = prefix_token or tokenizer.bos_token or ""
-
-    @property
-    def tokenizer_name(self) -> str:
-        """Name of the tokenizer to use when sending a request."""
-        return self._tokenizer_name
-
-    @property
-    def max_sequence_length(self) -> int:
-        """Return the max sequence length of this tokenizer."""
-        return self._max_sequence_length
-
-    @property
-    def max_request_length(self) -> int:
-        """Return the max request length of this tokenizer."""
-        return self._max_request_length
-
-    @property
-    def end_of_text_token(self) -> str:
-        """The end of text token."""
-        return self._end_of_text_token
-
-    @property
-    def prefix_token(self) -> str:
-        """The prefix token."""
-        return self._prefix_token
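The constructor above resolves each window parameter from its explicit argument first and only falls back to the Hugging Face tokenizer's own metadata. A hypothetical instantiation (values illustrative, not from this diff):

# Illustrative only: explicit arguments win; omitted ones are read off the HF tokenizer.
# service: a TokenizerService provided by the surrounding harness.
window_service = HuggingFaceWindowService(
    service,
    tokenizer_name="huggingface/gpt2",
    max_sequence_length=1024,  # set manually, since HF model_max_length is often wrong
)
# end_of_text_token falls back to tokenizer.eos_token, prefix_token to tokenizer.bos_token.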
helm/benchmark/window_services/t0pp_window_service.py
DELETED
@@ -1,35 +0,0 @@
-from .encoder_decoder_window_service import EncoderDecoderWindowService
-from .tokenizer_service import TokenizerService
-
-
-class T0ppWindowService(EncoderDecoderWindowService):
-    def __init__(self, service: TokenizerService):
-        super().__init__(service)
-
-    @property
-    def max_sequence_length(self) -> int:
-        """Return the max sequence length."""
-        # From https://arxiv.org/pdf/2110.08207.pdf, "we truncate input and target sequences to 1024 and 256 tokens,
-        # respectively. Following Raffel et al. (2020), we use packing to combine multiple training examples into
-        # a single sequence to reach the maximum sequence length."
-        return 1024
-
-    @property
-    def max_output_length(self) -> int:
-        return 256
-
-    @property
-    def end_of_text_token(self) -> str:
-        """The end of text token."""
-        return "</s>"
-
-    @property
-    def tokenizer_name(self) -> str:
-        """Name of the tokenizer to use when sending a request."""
-        return "bigscience/T0pp"
-
-    @property
-    def prefix_token(self) -> str:
-        """The prefix token is the same as the end of text token."""
-        # echo=True is not supported
-        return ""
helm/benchmark/window_services/t511b_window_service.py
DELETED
@@ -1,30 +0,0 @@
-from .encoder_decoder_window_service import EncoderDecoderWindowService
-from .tokenizer_service import TokenizerService
-
-
-class T511bWindowService(EncoderDecoderWindowService):
-    def __init__(self, service: TokenizerService):
-        super().__init__(service)
-
-    @property
-    def max_sequence_length(self) -> int:
-        """Return the max sequence length."""
-        # From https://arxiv.org/pdf/1910.10683.pdf, "we use a maximum sequence length of 512".
-        # We subtract 1 to account for <extra_id_0> that gets appended to prompts.
-        return 512 - 1
-
-    @property
-    def end_of_text_token(self) -> str:
-        """The end of text token."""
-        return "</s>"
-
-    @property
-    def tokenizer_name(self) -> str:
-        """Name of the tokenizer to use when sending a request."""
-        return "google/t5-11b"
-
-    @property
-    def prefix_token(self) -> str:
-        """The prefix token is the same as the end of text token."""
-        # echo=True is not supported
-        return ""
helm/benchmark/window_services/test_mt_nlg_window_service.py
DELETED
@@ -1,48 +0,0 @@
-import shutil
-import tempfile
-
-from .test_utils import get_tokenizer_service, TEST_PROMPT, GPT2_TEST_TOKENS, GPT2_TEST_TOKEN_IDS
-from .tokenizer_service import TokenizerService
-from .window_service_factory import WindowServiceFactory
-
-
-class TestMTNLGWindowService:
-    def setup_method(self):
-        self.path: str = tempfile.mkdtemp()
-        service: TokenizerService = get_tokenizer_service(self.path)
-        self.window_service = WindowServiceFactory.get_window_service("microsoft/TNLGv2_7B", service)
-
-    def teardown_method(self, method):
-        shutil.rmtree(self.path)
-
-    def test_max_request_length(self):
-        assert self.window_service.max_request_length == 2048
-
-    def test_encode(self):
-        assert self.window_service.encode(TEST_PROMPT).token_values == GPT2_TEST_TOKEN_IDS
-
-    def test_decode(self):
-        assert self.window_service.decode(self.window_service.encode(TEST_PROMPT).tokens) == TEST_PROMPT
-
-    def test_tokenize(self):
-        assert self.window_service.tokenize(TEST_PROMPT) == GPT2_TEST_TOKENS
-
-    def test_fits_within_context_window(self):
-        # Should fit in the context window since we subtracted the number of tokens of the test prompt
-        # from the max request length of 2048
-        assert self.window_service.fits_within_context_window(TEST_PROMPT, 2048 - 51)
-        # Should not fit within the max request length because we're expecting one more extra token in the completion
-        assert not self.window_service.fits_within_context_window(TEST_PROMPT, 2048 - 51 + 1)
-
-    def test_truncate_from_right(self):
-        # Create a prompt that exceed max context length: 51 * 41 = 2091 tokens
-        long_prompt: str = TEST_PROMPT * 41
-        assert not self.window_service.fits_within_context_window(long_prompt)
-
-        # Truncate and ensure it fits within the context window
-        truncated_long_prompt: str = self.window_service.truncate_from_right(long_prompt)
-        assert self.window_service.get_num_tokens(truncated_long_prompt) == 2048
-        assert self.window_service.fits_within_context_window(truncated_long_prompt)
-
-    def test_tokenize_and_count(self):
-        assert self.window_service.get_num_tokens(TEST_PROMPT) == 51
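The two fits_within_context_window assertions above are pure arithmetic: TEST_PROMPT is 51 GPT-2 tokens and the request budget is 2048, so the largest completion that still fits alongside the prompt is 2048 - 51 = 1997 tokens. Spelled out (all numbers appear in the test itself):

# The removed test's budget check, made explicit.
max_request_length = 2048
prompt_tokens = 51                                   # get_num_tokens(TEST_PROMPT)
assert prompt_tokens + 1997 == max_request_length    # 2048 - 51: exactly fits
assert prompt_tokens + 1998 > max_request_length     # 2048 - 51 + 1: one token over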
helm/benchmark/window_services/ul2_window_service.py
DELETED
@@ -1,30 +0,0 @@
-from .encoder_decoder_window_service import EncoderDecoderWindowService
-from .tokenizer_service import TokenizerService
-
-
-class UL2WindowService(EncoderDecoderWindowService):
-    def __init__(self, service: TokenizerService):
-        super().__init__(service)
-
-    @property
-    def max_sequence_length(self) -> int:
-        """Return the max sequence length."""
-        # From https://arxiv.org/pdf/2205.05131.pdf, "the sequence length is set to 512/512 for inputs and targets".
-        # We subtract 1 to account for <extra_id_0> that gets appended to prompts.
-        return 512 - 1
-
-    @property
-    def end_of_text_token(self) -> str:
-        """The end of text token."""
-        return "</s>"
-
-    @property
-    def tokenizer_name(self) -> str:
-        """Name of the tokenizer to use when sending a request."""
-        return "google/ul2"
-
-    @property
-    def prefix_token(self) -> str:
-        """The prefix token is the same as the end of text token."""
-        # echo=True is not supported
-        return ""
helm/benchmark/window_services/wider_ai21_window_service.py
DELETED
@@ -1,24 +0,0 @@
-from .ai21_window_service import AI21WindowService
-
-
-class WiderAI21WindowService(AI21WindowService):
-    @property
-    def max_sequence_length(self) -> int:
-        """
-        Return the max sequence length of the larger AI21 Jurassic-2 models.
-
-        The AI21 server automatically prepends a token to every prompt,
-        so the actual max sequence length is 8192 - 1 = 8191.
-        """
-        return 8191
-
-
-class AI21Jurassic2JumboWindowService(AI21WindowService):
-    @property
-    def max_sequence_length(self) -> int:
-        """
-        Return the max sequence length of the AI21 Jurassic-2 Jumbo.
-
-        AI21 has recommended using a sequence length of 6000 tokens to avoid OOMs.
-        """
-        return 6000
helm/common/cache_utils.py
DELETED
@@ -1,14 +0,0 @@
-"""Functions used for caching."""
-
-import os
-
-from helm.common.cache import CacheConfig, MongoCacheConfig, SqliteCacheConfig
-
-
-def build_cache_config(cache_path: str, mongo_uri: str, organization: str) -> CacheConfig:
-    if mongo_uri:
-        return MongoCacheConfig(mongo_uri, collection_name=organization)
-
-    client_cache_path: str = os.path.join(cache_path, f"{organization}.sqlite")
-    # TODO: Allow setting CacheConfig.follower_cache_path from a command line flag.
-    return SqliteCacheConfig(client_cache_path)
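Per the listing above, helm/common/cache_backend_config.py (+47) appears to take over this helper's role in 0.5.0. For reference, the removed function dispatched on mongo_uri as follows (the paths and organization name are hypothetical values; the return values follow directly from the code above):

# Hypothetical calls against the removed helper.
build_cache_config("prod_env/cache", mongo_uri="", organization="openai")
# -> SqliteCacheConfig("prod_env/cache/openai.sqlite")
build_cache_config("prod_env/cache", mongo_uri="mongodb://localhost:27017/crfm", organization="openai")
# -> MongoCacheConfig("mongodb://localhost:27017/crfm", collection_name="openai")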
helm/proxy/clients/aleph_alpha_client.py
DELETED
@@ -1,95 +0,0 @@
-import json
-import requests
-from typing import Any, Dict, List
-
-from helm.common.cache import CacheConfig
-from helm.common.request import wrap_request_time, Request, RequestResult, Sequence, Token
-from .client import CachingClient, truncate_sequence
-
-
-class AlephAlphaClient(CachingClient):
-    COMPLETION_ENDPOINT: str = "complete"
-
-    def __init__(self, api_key: str, cache_config: CacheConfig):
-        super().__init__(cache_config=cache_config)
-        self.api_key: str = api_key
-
-    def _send_request(self, endpoint: str, raw_request: Dict[str, Any]) -> Dict[str, Any]:
-        response = requests.request(
-            method="POST",
-            url=f"https://api.aleph-alpha.com/{endpoint}",
-            headers={
-                "Content-Type": "application/json",
-                "Accept": "application/json",
-                "Authorization": f"Bearer {self.api_key}",
-            },
-            data=json.dumps(raw_request),
-            # Setting the nice flag prevents intensive benchmarking runs from saturating Aleph Alpha's API queues
-            params=json.dumps({"nice": True}),
-        )
-        result = json.loads(response.text)
-        assert "error" not in result, f"Request failed with error: {result['error']}"
-        return result
-
-    def make_request(self, request: Request) -> RequestResult:
-        """Make a request following https://docs.aleph-alpha.com/api/complete."""
-        raw_request = {
-            "model": request.model_engine,
-            "prompt": request.prompt,
-            "maximum_tokens": request.max_tokens,
-            "temperature": request.temperature,
-            "top_k": request.top_k_per_token,
-            "top_p": request.top_p,
-            "presence_penalty": request.presence_penalty,
-            "frequency_penalty": request.frequency_penalty,
-            "n": request.num_completions,
-            "stop_sequences": request.stop_sequences,
-            "log_probs": request.top_k_per_token,
-            "echo": request.echo_prompt,
-            "tokens": True,  # Setting to True returns individual tokens of the completion
-        }
-
-        try:
-
-            def do_it():
-                result = self._send_request(AlephAlphaClient.COMPLETION_ENDPOINT, raw_request)
-                assert "completions" in result, f"Invalid response: {result}"
-                return result
-
-            response, cached = self.cache.get(raw_request, wrap_request_time(do_it))
-        except (requests.exceptions.RequestException, AssertionError) as e:
-            error: str = f"AlephAlphaClient error: {e}"
-            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
-
-        completions: List[Sequence] = []
-        for completion in response["completions"]:
-            sequence_logprob: float = 0
-            tokens: List[Token] = []
-
-            # `completion_tokens` is the list of selected tokens.
-            for i, token in enumerate(completion.get("completion_tokens", [])):
-                # Get the top K logprobs for the ith token
-                top_logprobs: Dict[str, float] = completion["log_probs"][i]
-                # Use the selected token value to get the logprob
-                logprob: float = top_logprobs[token]
-                sequence_logprob += logprob
-                tokens.append(
-                    Token(
-                        text=token,
-                        logprob=logprob,
-                        top_logprobs=top_logprobs,
-                    )
-                )
-
-            sequence: Sequence = Sequence(text=completion["completion"], logprob=sequence_logprob, tokens=tokens)
-            sequence = truncate_sequence(sequence, request)
-            completions.append(sequence)
-
-        return RequestResult(
-            success=True,
-            cached=cached,
-            request_time=response["request_time"],
-            request_datetime=response["request_datetime"],
-            completions=completions,
-            embedding=[],
-        )
helm/proxy/clients/goose_ai_client.py
DELETED
@@ -1,99 +0,0 @@
-from typing import List, Optional
-
-import openai as gooseai
-
-from helm.common.cache import CacheConfig
-from helm.common.request import (
-    wrap_request_time,
-    EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
-    Request,
-    RequestResult,
-    Sequence,
-    Token,
-)
-from .client import CachingClient, truncate_sequence
-from .openai_client import ORIGINAL_COMPLETION_ATTRIBUTES
-
-
-class GooseAIClient(CachingClient):
-    """
-    GooseAI API Client
-    - How to use the API: https://goose.ai/docs/api
-    - Supported models: https://goose.ai/docs/models
-    """
-
-    def __init__(self, api_key: str, cache_config: CacheConfig, org_id: Optional[str] = None):
-        super().__init__(cache_config=cache_config)
-        self.org_id: Optional[str] = org_id
-        self.api_key: str = api_key
-        self.api_base: str = "https://api.goose.ai/v1"
-
-    def make_request(self, request: Request) -> RequestResult:
-        """
-        Request parameters for GooseAI API documented here: https://goose.ai/docs/api/completions
-        The only OpenAI API parameter not supported is `best_of`.
-        """
-        # Embedding not supported for this model
-        if request.embedding:
-            return EMBEDDING_UNAVAILABLE_REQUEST_RESULT
-
-        raw_request = {
-            "engine": request.model_engine,
-            "prompt": request.prompt,
-            "temperature": request.temperature,
-            "n": request.num_completions,
-            "max_tokens": request.max_tokens,
-            "logprobs": request.top_k_per_token,
-            "stop": request.stop_sequences or None,  # API doesn't like empty list
-            "top_p": request.top_p,
-            "presence_penalty": request.presence_penalty,
-            "frequency_penalty": request.frequency_penalty,
-            "echo": request.echo_prompt,
-        }
-
-        try:
-
-            def do_it():
-                # Following https://beta.openai.com/docs/api-reference/authentication
-                # `organization` can be set to None.
-                gooseai.organization = self.org_id
-                gooseai.api_key = self.api_key
-                gooseai.api_base = self.api_base
-                gooseai.api_resources.completion.Completion.__bases__ = ORIGINAL_COMPLETION_ATTRIBUTES
-                return gooseai.Completion.create(**raw_request)
-
-            cache_key = CachingClient.make_cache_key(raw_request, request)
-            response, cached = self.cache.get(cache_key, wrap_request_time(do_it))
-        except gooseai.error.OpenAIError as e:
-            error: str = f"OpenAI (GooseAI API) error: {e}"
-            return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[])
-
-        completions: List[Sequence] = []
-        for raw_completion in response["choices"]:
-            sequence_logprob = 0
-            tokens: List[Token] = []
-
-            raw_data = raw_completion["logprobs"]
-            for text, logprob, top_logprobs in zip(
-                raw_data["tokens"], raw_data["token_logprobs"], raw_data["top_logprobs"]
-            ):
-                tokens.append(Token(text=text, logprob=logprob or 0, top_logprobs=dict(top_logprobs or {})))
-                sequence_logprob += logprob or 0
-
-            completion = Sequence(
-                text=raw_completion["text"],
-                logprob=sequence_logprob,
-                tokens=tokens,
-                finish_reason={"reason": raw_completion["finish_reason"]},
-            )
-            completion = truncate_sequence(completion, request)
-            completions.append(completion)
-
-        return RequestResult(
-            success=True,
-            cached=cached,
-            request_time=response["request_time"],
-            request_datetime=response.get("request_datetime"),
-            completions=completions,
-            embedding=[],
-        )
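The "logprob or 0" guards in the loop above exist because OpenAI-style responses can report None for the logprob of a leading echoed token. A micro-example of the accumulation (values hypothetical):

# Hypothetical logprob list: None for the first echoed token, then real values.
token_logprobs = [None, -0.5, -1.25]
sequence_logprob = sum(lp or 0 for lp in token_logprobs)  # == -1.75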