crfm-helm 0.3.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/METADATA +144 -36
- crfm_helm-0.5.0.dist-info/RECORD +642 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +37 -2
- helm/benchmark/adaptation/adapters/adapter.py +4 -42
- helm/benchmark/adaptation/adapters/adapter_factory.py +24 -27
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -0
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +21 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +12 -5
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +104 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +5 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/test_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +59 -14
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +40 -5
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +78 -10
- helm/benchmark/adaptation/common_adapter_specs.py +376 -0
- helm/benchmark/adaptation/prompt.py +7 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/annotator.py +43 -0
- helm/benchmark/annotation/annotator_factory.py +61 -0
- helm/benchmark/annotation/image2structure/image_compiler_annotator.py +88 -0
- helm/benchmark/annotation/image2structure/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +84 -0
- helm/benchmark/annotation/image2structure/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation_executor.py +124 -0
- helm/benchmark/augmentations/cleva_perturbation.py +7 -14
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +3 -3
- helm/benchmark/augmentations/contrast_sets_perturbation.py +0 -3
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +2 -2
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +2 -2
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +0 -7
- helm/benchmark/augmentations/perturbation.py +20 -7
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +2 -2
- helm/benchmark/augmentations/test_perturbation.py +11 -7
- helm/benchmark/augmentations/translate_perturbation.py +30 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +38 -0
- helm/benchmark/executor.py +46 -16
- helm/benchmark/huggingface_registration.py +37 -7
- helm/benchmark/metrics/basic_metrics.py +172 -641
- helm/benchmark/metrics/bbq_metrics.py +3 -4
- helm/benchmark/metrics/bias_metrics.py +6 -6
- helm/benchmark/metrics/classification_metrics.py +11 -8
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/code_metrics.py +4 -3
- helm/benchmark/metrics/code_metrics_helper.py +0 -2
- helm/benchmark/metrics/common_metric_specs.py +167 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +6 -112
- helm/benchmark/metrics/dry_run_metrics.py +5 -3
- helm/benchmark/metrics/efficiency_metrics.py +206 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +376 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +73 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +18 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +3 -1
- helm/benchmark/metrics/language_modeling_metrics.py +99 -0
- helm/benchmark/metrics/machine_translation_metrics.py +5 -5
- helm/benchmark/metrics/metric.py +93 -172
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +16 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +3 -4
- helm/benchmark/metrics/ranking_metrics.py +6 -7
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/summac/model_summac.py +0 -2
- helm/benchmark/metrics/summarization_metrics.py +8 -8
- helm/benchmark/metrics/test_classification_metrics.py +9 -6
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +30 -0
- helm/benchmark/metrics/test_metric.py +2 -2
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +13 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +2 -0
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +9 -2
- helm/benchmark/metrics/toxicity_metrics.py +1 -1
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/unitxt_metrics.py +81 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +341 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +450 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/model_deployment_registry.py +164 -41
- helm/benchmark/model_metadata_registry.py +181 -35
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +8 -7
- helm/benchmark/presentation/run_display.py +50 -17
- helm/benchmark/presentation/schema.py +28 -46
- helm/benchmark/presentation/summarize.py +213 -96
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_run_entry.py +14 -9
- helm/benchmark/presentation/test_summarize.py +5 -0
- helm/benchmark/run.py +66 -54
- helm/benchmark/run_expander.py +342 -31
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +162 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/{run_specs.py → run_specs/classic_run_specs.py} +217 -1330
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +314 -0
- helm/benchmark/run_specs/heim_run_specs.py +623 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +42 -0
- helm/benchmark/run_specs/vlm_run_specs.py +501 -0
- helm/benchmark/runner.py +116 -69
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +2 -2
- helm/benchmark/scenarios/cleva_scenario.py +43 -46
- helm/benchmark/scenarios/code_scenario.py +3 -2
- helm/benchmark/scenarios/commonsense_scenario.py +171 -191
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +169 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +121 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +77 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +324 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +204 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +559 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +67 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +78 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +0 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +1 -1
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_scenario.py +0 -1
- helm/benchmark/scenarios/legalbench_scenario.py +123 -0
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +4 -2
- helm/benchmark/scenarios/math_scenario.py +19 -2
- helm/benchmark/scenarios/medication_qa_scenario.py +60 -0
- helm/benchmark/scenarios/numeracy_scenario.py +3 -3
- helm/benchmark/scenarios/opinions_qa_scenario.py +6 -10
- helm/benchmark/scenarios/raft_scenario.py +2 -6
- helm/benchmark/scenarios/scenario.py +14 -2
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/test_math_scenario.py +22 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +135 -0
- helm/benchmark/scenarios/the_pile_scenario.py +6 -7
- helm/benchmark/scenarios/unitxt_scenario.py +56 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +3 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2structure/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +214 -0
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +25 -0
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +20 -0
- helm/benchmark/scenarios/vision_language/image2structure/utils_latex.py +347 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +145 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +129 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +107 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +1 -1
- helm/benchmark/scenarios/wmt_14_scenario.py +18 -18
- helm/benchmark/server.py +59 -2
- helm/benchmark/slurm_jobs.py +12 -0
- helm/benchmark/slurm_runner.py +79 -51
- helm/benchmark/static/benchmarking.js +3 -4
- helm/benchmark/static/contamination.yaml +1 -1
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/json-urls.js +4 -0
- helm/benchmark/static/{schema.yaml → schema_classic.yaml} +346 -930
- helm/benchmark/static/schema_instruction_following.yaml +210 -0
- helm/benchmark/static/schema_lite.yaml +824 -0
- helm/benchmark/static/schema_mmlu.yaml +1507 -0
- helm/benchmark/static/schema_unitxt.yaml +428 -0
- helm/benchmark/static/schema_vlm.yaml +576 -0
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/crfm-logo-74391ab8.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-3e5e3aa4.png +0 -0
- helm/benchmark/static_build/assets/helm-logo-simple-2ed5400b.png +0 -0
- helm/benchmark/static_build/assets/helmhero-28e90f4d.png +0 -0
- helm/benchmark/static_build/assets/index-5088afcb.css +1 -0
- helm/benchmark/static_build/assets/index-d839df55.js +9 -0
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/react-d4a0b69b.js +85 -0
- helm/benchmark/static_build/assets/recharts-6d337683.js +97 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tremor-54a99cc4.js +10 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +20 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_model_deployment_definition.py +90 -0
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/tokenizer_config_registry.py +10 -14
- helm/benchmark/window_services/ai21_window_service.py +22 -33
- helm/benchmark/window_services/cohere_window_service.py +1 -63
- helm/benchmark/window_services/default_window_service.py +2 -35
- helm/benchmark/window_services/encoder_decoder_window_service.py +0 -11
- helm/benchmark/window_services/ice_window_service.py +0 -34
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +15 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +21 -4
- helm/benchmark/window_services/no_decoding_window_service.py +32 -0
- helm/benchmark/window_services/test_anthropic_window_service.py +2 -1
- helm/benchmark/window_services/test_bloom_window_service.py +2 -1
- helm/benchmark/window_services/test_cohere_window_service.py +2 -1
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +2 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +2 -1
- helm/benchmark/window_services/test_gptj_window_service.py +3 -2
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -2
- helm/benchmark/window_services/test_ice_window_service.py +2 -1
- helm/benchmark/window_services/test_openai_window_service.py +2 -1
- helm/benchmark/window_services/test_opt_window_service.py +3 -2
- helm/benchmark/window_services/test_palmyra_window_service.py +2 -1
- helm/benchmark/window_services/test_t0pp_window_service.py +2 -1
- helm/benchmark/window_services/test_t511b_window_service.py +2 -1
- helm/benchmark/window_services/test_ul2_window_service.py +2 -1
- helm/benchmark/window_services/test_utils.py +3 -2
- helm/benchmark/window_services/test_yalm_window_service.py +2 -1
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +24 -269
- helm/benchmark/window_services/yalm_window_service.py +0 -27
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +5 -12
- helm/clients/aleph_alpha_client.py +112 -0
- helm/{proxy/clients → clients}/anthropic_client.py +213 -24
- helm/clients/auto_client.py +215 -0
- helm/clients/bedrock_client.py +128 -0
- helm/clients/bedrock_utils.py +72 -0
- helm/{proxy/clients → clients}/client.py +67 -55
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +6 -17
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +7 -8
- helm/clients/google_translate_client.py +35 -0
- helm/{proxy/clients → clients}/http_model_client.py +6 -10
- helm/{proxy/clients → clients}/huggingface_client.py +134 -92
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +78 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +98 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +96 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +141 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +191 -0
- helm/clients/image_generation/dalle2_client.py +192 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +190 -0
- helm/clients/image_generation/deep_floyd_client.py +78 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +249 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +86 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +115 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +111 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +7 -5
- helm/{proxy/clients → clients}/megatron_client.py +13 -7
- helm/clients/mistral_client.py +134 -0
- helm/clients/moderation_api_client.py +109 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +302 -0
- helm/{proxy/clients → clients}/palmyra_client.py +15 -12
- helm/{proxy/clients → clients}/perspective_api_client.py +7 -8
- helm/clients/simple_client.py +64 -0
- helm/{proxy/clients → clients}/test_auto_client.py +15 -15
- helm/clients/test_client.py +100 -0
- helm/clients/test_huggingface_client.py +70 -0
- helm/clients/test_simple_client.py +19 -0
- helm/{proxy/clients → clients}/test_together_client.py +23 -12
- helm/{proxy/clients → clients}/together_client.py +18 -71
- helm/clients/vertexai_client.py +391 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vlm_client.py +104 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +59 -52
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/qwen_vlm_client.py +171 -0
- helm/clients/vllm_client.py +46 -0
- helm/common/cache.py +24 -179
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/concurrency.py +32 -0
- helm/common/credentials_utils.py +28 -0
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +29 -10
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +24 -1
- helm/common/key_value_store.py +113 -0
- helm/common/media_object.py +13 -0
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +88 -0
- helm/common/multimodal_request_utils.py +31 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +2 -2
- helm/common/request.py +36 -27
- helm/common/test_general.py +6 -0
- helm/common/tokenization_request.py +6 -3
- helm/config/__init__.py +0 -0
- helm/config/model_deployments.yaml +1942 -0
- helm/config/model_metadata.yaml +2201 -0
- helm/config/tokenizer_configs.yaml +362 -0
- helm/proxy/accounts.py +31 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/model_critique_client.py +13 -5
- helm/proxy/example_queries.py +29 -17
- helm/proxy/retry.py +8 -2
- helm/proxy/server.py +77 -5
- helm/proxy/services/remote_service.py +31 -0
- helm/proxy/services/server_service.py +103 -20
- helm/proxy/services/service.py +34 -2
- helm/proxy/services/test_remote_service.py +7 -6
- helm/proxy/services/test_service.py +27 -18
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/py.typed +0 -0
- helm/tokenizers/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/ai21_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/anthropic_tokenizer.py +17 -11
- helm/tokenizers/auto_tokenizer.py +93 -0
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +8 -2
- helm/{proxy/tokenizers → tokenizers}/cohere_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +3 -3
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +56 -60
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_anthropic_tokenizer.py +82 -0
- helm/tokenizers/test_huggingface_tokenizer.py +136 -0
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/tokenizers/vertexai_tokenizer.py +97 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +5 -3
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.3.0.dist-info/RECORD +0 -396
- helm/benchmark/vlm_run_specs.py +0 -71
- helm/benchmark/window_services/anthropic_window_service.py +0 -68
- helm/benchmark/window_services/bloom_window_service.py +0 -35
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/gptj_window_service.py +0 -38
- helm/benchmark/window_services/gptneox_window_service.py +0 -41
- helm/benchmark/window_services/http_model_window_service.py +0 -28
- helm/benchmark/window_services/huggingface_window_service.py +0 -59
- helm/benchmark/window_services/lit_gpt_window_service.py +0 -27
- helm/benchmark/window_services/llama_window_service.py +0 -28
- helm/benchmark/window_services/luminous_window_service.py +0 -67
- helm/benchmark/window_services/megatron_window_service.py +0 -10
- helm/benchmark/window_services/mt_nlg_window_service.py +0 -27
- helm/benchmark/window_services/openai_window_service.py +0 -13
- helm/benchmark/window_services/opt_window_service.py +0 -35
- helm/benchmark/window_services/palmyra_window_service.py +0 -45
- helm/benchmark/window_services/remote_window_service.py +0 -48
- helm/benchmark/window_services/santacoder_window_service.py +0 -27
- helm/benchmark/window_services/starcoder_window_service.py +0 -27
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/benchmark/window_services/wider_openai_window_service.py +0 -52
- helm/proxy/clients/aleph_alpha_client.py +0 -99
- helm/proxy/clients/auto_client.py +0 -461
- helm/proxy/clients/goose_ai_client.py +0 -100
- helm/proxy/clients/microsoft_client.py +0 -182
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/remote_model_registry.py +0 -28
- helm/proxy/clients/simple_client.py +0 -61
- helm/proxy/clients/test_anthropic_client.py +0 -63
- helm/proxy/clients/test_client.py +0 -31
- helm/proxy/clients/test_huggingface_client.py +0 -87
- helm/proxy/models.py +0 -963
- helm/proxy/test_models.py +0 -27
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -86
- helm/proxy/token_counters/test_openai_token_counter.py +0 -79
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_huggingface_tokenizer.py +0 -56
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/LICENSE +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.3.0.dist-info → crfm_helm-0.5.0.dist-info}/top_level.txt +0 -0
- /helm/{proxy/clients → benchmark/annotation}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/annotation/image2structure}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_ice_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/tokenizer.py +0 -0
- /helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +0 -0
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import os
|
|
2
|
-
from typing import Any, Dict, Optional
|
|
2
|
+
from typing import Any, Dict, Optional, cast
|
|
3
3
|
from threading import Lock
|
|
4
4
|
from helm.common.cache import CacheConfig
|
|
5
|
+
from helm.common.concurrency import ThreadSafeWrapper
|
|
5
6
|
|
|
6
7
|
from transformers import AutoTokenizer, PreTrainedTokenizerBase
|
|
7
8
|
|
|
@@ -10,46 +11,36 @@ from .caching_tokenizer import CachingTokenizer
|
|
|
10
11
|
from .tokenizer import cleanup_tokens
|
|
11
12
|
|
|
12
13
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
"google/t5-11b": "t5-11b",
|
|
16
|
-
"huggingface/gpt2": "gpt2",
|
|
17
|
-
"huggingface/santacoder": "bigcode/santacoder",
|
|
18
|
-
"huggingface/starcoder": "bigcode/starcoder",
|
|
19
|
-
}
|
|
20
|
-
"""Mapping of some HELM model names to Hugging Face pretrained model name."""
|
|
14
|
+
WrappedPreTrainedTokenizer = ThreadSafeWrapper[PreTrainedTokenizerBase]
|
|
15
|
+
"""Thread safe wrapper around Hugging Face PreTrainedTokenizerBase.
|
|
21
16
|
|
|
17
|
+
Hugging Face PreTrainedTokenizerBase is thread-hostile and using it from multiple threads
|
|
18
|
+
simultaneously can result in an "Already borrowed" error (#1421). This wrapper ensures
|
|
19
|
+
that a lock is held when using the PreTrainedTokenizerBase.
|
|
22
20
|
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
21
|
+
Example usage:
|
|
22
|
+
|
|
23
|
+
with wrapped_tokenizer as tokenizer:
|
|
24
|
+
tokenizer.encode("...")
|
|
25
|
+
"""
|
|
27
26
|
|
|
28
27
|
|
|
29
28
|
class HuggingFaceTokenizer(CachingTokenizer):
|
|
30
|
-
_tokenizers: Dict[str,
|
|
29
|
+
_tokenizers: Dict[str, WrappedPreTrainedTokenizer] = {}
|
|
31
30
|
_tokenizers_lock: Lock = Lock()
|
|
32
31
|
|
|
33
|
-
def __init__(
|
|
34
|
-
self,
|
|
35
|
-
cache_config: CacheConfig,
|
|
36
|
-
pretrained_model_name_or_path: Optional[str] = None,
|
|
37
|
-
revision: Optional[str] = None,
|
|
38
|
-
):
|
|
32
|
+
def __init__(self, cache_config: CacheConfig, pretrained_model_name_or_path: Optional[str] = None, **kwargs):
|
|
39
33
|
super().__init__(cache_config=cache_config)
|
|
40
34
|
self._pretrained_model_name_or_path = pretrained_model_name_or_path
|
|
41
|
-
self.
|
|
35
|
+
self._kwargs = kwargs
|
|
42
36
|
|
|
43
37
|
@staticmethod
|
|
44
|
-
def create_tokenizer(pretrained_model_name_or_path: str,
|
|
38
|
+
def create_tokenizer(pretrained_model_name_or_path: str, **kwargs) -> WrappedPreTrainedTokenizer:
|
|
45
39
|
"""Loads tokenizer using files from disk if they exist. Otherwise, downloads from HuggingFace."""
|
|
46
40
|
# To avoid deadlocks when using HuggingFace tokenizers with multiple processes
|
|
47
41
|
# TODO: Figure out if we actually need this.
|
|
48
42
|
os.environ["TOKENIZERS_PARALLELISM"] = "False"
|
|
49
43
|
|
|
50
|
-
tokenizer_kwargs = {}
|
|
51
|
-
if revision is not None:
|
|
52
|
-
tokenizer_kwargs["revision"] = revision
|
|
53
44
|
try:
|
|
54
45
|
# From the Hugging Face documentation, "local_files_only(defaults to False) —
|
|
55
46
|
# Whether or not to only look at local files".
|
|
@@ -60,19 +51,23 @@ class HuggingFaceTokenizer(CachingTokenizer):
|
|
|
60
51
|
# From https://huggingface.co/course/chapter6/3, "slow tokenizers are those written in Python inside
|
|
61
52
|
# the Hugging Face Transformers library, while the fast versions are the ones provided by Hugging Face
|
|
62
53
|
# Tokenizers, which are written in Rust." So, use the "fast" version of the tokenizers if available.
|
|
63
|
-
return
|
|
64
|
-
|
|
54
|
+
return WrappedPreTrainedTokenizer(
|
|
55
|
+
AutoTokenizer.from_pretrained(
|
|
56
|
+
pretrained_model_name_or_path, local_files_only=True, use_fast=True, **kwargs
|
|
57
|
+
)
|
|
65
58
|
)
|
|
66
59
|
except OSError:
|
|
67
60
|
hlog(f"Local files do not exist for HuggingFace tokenizer: {pretrained_model_name_or_path}. Downloading...")
|
|
68
|
-
return
|
|
69
|
-
|
|
61
|
+
return WrappedPreTrainedTokenizer(
|
|
62
|
+
AutoTokenizer.from_pretrained(
|
|
63
|
+
pretrained_model_name_or_path, local_files_only=False, use_fast=True, **kwargs
|
|
64
|
+
)
|
|
70
65
|
)
|
|
71
66
|
|
|
72
67
|
@staticmethod
|
|
73
68
|
def get_tokenizer(
|
|
74
|
-
helm_tokenizer_name: str, pretrained_model_name_or_path: str,
|
|
75
|
-
) ->
|
|
69
|
+
helm_tokenizer_name: str, pretrained_model_name_or_path: str, **kwargs
|
|
70
|
+
) -> WrappedPreTrainedTokenizer:
|
|
76
71
|
"""
|
|
77
72
|
Checks if the desired tokenizer is cached. Creates the tokenizer if it's not cached.
|
|
78
73
|
Returns the tokenizer.
|
|
@@ -80,42 +75,39 @@ class HuggingFaceTokenizer(CachingTokenizer):
|
|
|
80
75
|
with HuggingFaceTokenizer._tokenizers_lock:
|
|
81
76
|
if helm_tokenizer_name not in HuggingFaceTokenizer._tokenizers:
|
|
82
77
|
with htrack_block(
|
|
83
|
-
f"Loading {pretrained_model_name_or_path} (
|
|
78
|
+
f"Loading {pretrained_model_name_or_path} (kwargs={kwargs}) "
|
|
84
79
|
f"for HELM tokenizer {helm_tokenizer_name} with Hugging Face Transformers"
|
|
85
80
|
):
|
|
86
81
|
# Keep the tokenizer in memory, so we don't recreate it for future requests
|
|
87
82
|
HuggingFaceTokenizer._tokenizers[helm_tokenizer_name] = HuggingFaceTokenizer.create_tokenizer(
|
|
88
|
-
pretrained_model_name_or_path,
|
|
83
|
+
pretrained_model_name_or_path, **kwargs
|
|
89
84
|
)
|
|
90
85
|
return HuggingFaceTokenizer._tokenizers[helm_tokenizer_name]
|
|
91
86
|
|
|
92
|
-
def _get_tokenizer_for_request(self, request: Dict[str, Any]) ->
|
|
87
|
+
def _get_tokenizer_for_request(self, request: Dict[str, Any]) -> WrappedPreTrainedTokenizer:
|
|
93
88
|
"""Method used in both _tokenize_do_it and _decode_do_it to get the tokenizer."""
|
|
94
|
-
pretrained_model_name_or_path
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
pretrained_model_name_or_path = resolve_alias(request["tokenizer"])
|
|
99
|
-
_tokenizer = HuggingFaceTokenizer.get_tokenizer(
|
|
89
|
+
pretrained_model_name_or_path = (
|
|
90
|
+
self._pretrained_model_name_or_path if self._pretrained_model_name_or_path else request["tokenizer"]
|
|
91
|
+
)
|
|
92
|
+
return HuggingFaceTokenizer.get_tokenizer(
|
|
100
93
|
helm_tokenizer_name=request["tokenizer"],
|
|
101
94
|
pretrained_model_name_or_path=pretrained_model_name_or_path,
|
|
102
|
-
|
|
95
|
+
**self._kwargs,
|
|
103
96
|
)
|
|
104
|
-
return _tokenizer
|
|
105
97
|
|
|
106
98
|
def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
|
|
107
|
-
_tokenizer = self._get_tokenizer_for_request(request)
|
|
108
|
-
|
|
109
99
|
if request["encode"]:
|
|
110
100
|
if request["truncation"]:
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
101
|
+
with self._get_tokenizer_for_request(request) as tokenizer:
|
|
102
|
+
tokens = tokenizer.encode(
|
|
103
|
+
request["text"],
|
|
104
|
+
truncation=request["truncation"],
|
|
105
|
+
max_length=request["max_length"],
|
|
106
|
+
add_special_tokens=False,
|
|
107
|
+
)
|
|
117
108
|
else:
|
|
118
|
-
|
|
109
|
+
with self._get_tokenizer_for_request(request) as tokenizer:
|
|
110
|
+
tokens = tokenizer.encode(request["text"], add_special_tokens=False)
|
|
119
111
|
else:
|
|
120
112
|
if "gpt" in request["tokenizer"] or request["tokenizer"] in [
|
|
121
113
|
"bigscience/bloom",
|
|
@@ -126,9 +118,10 @@ class HuggingFaceTokenizer(CachingTokenizer):
|
|
|
126
118
|
# convert_tokens_to_string method. We prefer to use this method instead
|
|
127
119
|
# of the hacky cleanup_tokens method below as it might handle cases
|
|
128
120
|
# we haven't thought of in cleanup_tokens.
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
121
|
+
with self._get_tokenizer_for_request(request) as tokenizer:
|
|
122
|
+
tokens = [
|
|
123
|
+
tokenizer.convert_tokens_to_string([token]) for token in tokenizer.tokenize(request["text"])
|
|
124
|
+
]
|
|
132
125
|
else:
|
|
133
126
|
# Tokenizes the text and returns the tokens as a list of strings,
|
|
134
127
|
# not a list of token objects (otherwise "Hello world" would be"
|
|
@@ -138,14 +131,17 @@ class HuggingFaceTokenizer(CachingTokenizer):
|
|
|
138
131
|
# But this replaces all the "▁" characters by "", which is not what we want.
|
|
139
132
|
# This would be problematic as tokenize(" Hello", encode=False) would return ["Hello"]
|
|
140
133
|
# Just like tokenize("Hello", encode=False) would return ["Hello"].
|
|
141
|
-
|
|
134
|
+
with self._get_tokenizer_for_request(request) as tokenizer:
|
|
135
|
+
tokens = tokenizer.tokenize(request["text"])
|
|
136
|
+
# Some tokenizers (e.g. Qwen/Qwen-7B) return the tokens as bytes, so we have to decode them to strings.
|
|
137
|
+
if tokens and type(tokens[0]) == bytes:
|
|
138
|
+
tokens = [cast(bytes, token).decode(errors="ignore") for token in tokens]
|
|
142
139
|
tokens = cleanup_tokens(tokens, request["tokenizer"])
|
|
143
140
|
return {"tokens": tokens}
|
|
144
141
|
|
|
145
142
|
def _decode_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
)
|
|
143
|
+
with self._get_tokenizer_for_request(request) as tokenizer:
|
|
144
|
+
text = tokenizer.decode(
|
|
145
|
+
request["tokens"], clean_up_tokenization_spaces=request["clean_up_tokenization_spaces"]
|
|
146
|
+
)
|
|
151
147
|
return {"text": text}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from helm.common.tokenization_request import (
|
|
2
|
+
DecodeRequest,
|
|
3
|
+
DecodeRequestResult,
|
|
4
|
+
TokenizationRequest,
|
|
5
|
+
TokenizationRequestResult,
|
|
6
|
+
TokenizationToken,
|
|
7
|
+
)
|
|
8
|
+
from helm.tokenizers.tokenizer import Tokenizer
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class SimpleTokenizer(Tokenizer):
|
|
12
|
+
"""Simple tokenizer for tutorials and for debugging."""
|
|
13
|
+
|
|
14
|
+
def tokenize(self, request: TokenizationRequest) -> TokenizationRequestResult:
|
|
15
|
+
if request.encode:
|
|
16
|
+
return TokenizationRequestResult(
|
|
17
|
+
success=True,
|
|
18
|
+
cached=False,
|
|
19
|
+
tokens=[TokenizationToken(ord(character)) for character in request.text],
|
|
20
|
+
text=request.text,
|
|
21
|
+
)
|
|
22
|
+
else:
|
|
23
|
+
return TokenizationRequestResult(
|
|
24
|
+
success=True,
|
|
25
|
+
cached=False,
|
|
26
|
+
tokens=[TokenizationToken(character) for character in request.text],
|
|
27
|
+
text=request.text,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
def decode(self, request: DecodeRequest) -> DecodeRequestResult:
|
|
31
|
+
return DecodeRequestResult(
|
|
32
|
+
success=True, cached=False, text="".join([chr(code_point) for code_point in request.tokens])
|
|
33
|
+
)
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import tempfile
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
from helm.common.cache import SqliteCacheConfig
|
|
6
|
+
from helm.common.general import parallel_map
|
|
7
|
+
from helm.common.tokenization_request import (
|
|
8
|
+
DecodeRequest,
|
|
9
|
+
DecodeRequestResult,
|
|
10
|
+
TokenizationRequest,
|
|
11
|
+
TokenizationRequestResult,
|
|
12
|
+
)
|
|
13
|
+
from helm.tokenizers.anthropic_tokenizer import AnthropicTokenizer
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class TestAnthropicTokenizer:
|
|
17
|
+
TEST_PROMPT: str = "I am a computer scientist."
|
|
18
|
+
TEST_ENCODED: List[int] = [45, 1413, 269, 6797, 22228, 18]
|
|
19
|
+
TEST_TOKENS: List[str] = ["I", " am", " a", " computer", " scientist", "."]
|
|
20
|
+
|
|
21
|
+
def setup_method(self, method):
|
|
22
|
+
cache_file = tempfile.NamedTemporaryFile(delete=False)
|
|
23
|
+
self.cache_path: str = cache_file.name
|
|
24
|
+
self.tokenizer = AnthropicTokenizer(SqliteCacheConfig(self.cache_path))
|
|
25
|
+
|
|
26
|
+
def teardown_method(self, method):
|
|
27
|
+
os.remove(self.cache_path)
|
|
28
|
+
|
|
29
|
+
def test_tokenize(self):
|
|
30
|
+
request = TokenizationRequest(text=self.TEST_PROMPT, tokenizer="anthropic/claude")
|
|
31
|
+
result: TokenizationRequestResult = self.tokenizer.tokenize(request)
|
|
32
|
+
assert not result.cached, "First time making the tokenize request. Result should not be cached"
|
|
33
|
+
assert result.raw_tokens == self.TEST_TOKENS
|
|
34
|
+
result = self.tokenizer.tokenize(request)
|
|
35
|
+
assert result.cached, "Result should be cached"
|
|
36
|
+
assert result.raw_tokens == self.TEST_TOKENS
|
|
37
|
+
|
|
38
|
+
def test_encode(self):
|
|
39
|
+
request = TokenizationRequest(
|
|
40
|
+
text=self.TEST_PROMPT, tokenizer="anthropic/claude", encode=True, truncation=True, max_length=1
|
|
41
|
+
)
|
|
42
|
+
result: TokenizationRequestResult = self.tokenizer.tokenize(request)
|
|
43
|
+
assert not result.cached, "First time making the tokenize request. Result should not be cached"
|
|
44
|
+
assert result.raw_tokens == [self.TEST_ENCODED[0]]
|
|
45
|
+
result = self.tokenizer.tokenize(request)
|
|
46
|
+
assert result.cached, "Result should be cached"
|
|
47
|
+
assert result.raw_tokens == [self.TEST_ENCODED[0]]
|
|
48
|
+
|
|
49
|
+
request = TokenizationRequest(
|
|
50
|
+
text=self.TEST_PROMPT, tokenizer="anthropic/claude", encode=True, truncation=True, max_length=1024
|
|
51
|
+
)
|
|
52
|
+
result = self.tokenizer.tokenize(request)
|
|
53
|
+
assert not result.cached, "First time making this particular request. Result should not be cached"
|
|
54
|
+
assert result.raw_tokens == self.TEST_ENCODED
|
|
55
|
+
|
|
56
|
+
def test_decode(self):
|
|
57
|
+
request = DecodeRequest(tokens=self.TEST_ENCODED, tokenizer="anthropic/claude")
|
|
58
|
+
result: DecodeRequestResult = self.tokenizer.decode(request)
|
|
59
|
+
assert not result.cached, "First time making the decode request. Result should not be cached"
|
|
60
|
+
assert result.text == self.TEST_PROMPT
|
|
61
|
+
result = self.tokenizer.decode(request)
|
|
62
|
+
assert result.cached, "Result should be cached"
|
|
63
|
+
assert result.text == self.TEST_PROMPT
|
|
64
|
+
|
|
65
|
+
def test_already_borrowed(self):
|
|
66
|
+
"""Test workaround of the "Already borrowed" bug (#1421) caused by the thread-hostile Anthropic tokenizer,
|
|
67
|
+
which is a thin wrapper around a Hugging Face FastTokenizer"""
|
|
68
|
+
|
|
69
|
+
def make_tokenize_request(seed: int) -> None:
|
|
70
|
+
request_length = 10
|
|
71
|
+
truncation = bool(seed % 2)
|
|
72
|
+
self.tokenizer.tokenize(
|
|
73
|
+
# The truncation parameter requires setting a flag on the Rust FastTokenizer.
|
|
74
|
+
# Concurrent requests cause concurrent mutations, which results in a Rust concurrency error.
|
|
75
|
+
TokenizationRequest(
|
|
76
|
+
text=str(seed) * request_length, tokenizer="anthropic/claude", encode=True, truncation=truncation
|
|
77
|
+
)
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
num_requests = 100
|
|
81
|
+
# Should not raise "Already borrowed" error
|
|
82
|
+
parallel_map(make_tokenize_request, list(range(num_requests)), parallelism=8)
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import tempfile
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
from helm.common.cache import SqliteCacheConfig
|
|
6
|
+
from helm.common.general import parallel_map, singleton
|
|
7
|
+
from helm.common.tokenization_request import (
|
|
8
|
+
DecodeRequest,
|
|
9
|
+
DecodeRequestResult,
|
|
10
|
+
TokenizationRequest,
|
|
11
|
+
TokenizationRequestResult,
|
|
12
|
+
)
|
|
13
|
+
from .huggingface_tokenizer import HuggingFaceTokenizer
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class TestHuggingFaceGPT2Tokenizer:
|
|
17
|
+
def setup_method(self, method):
|
|
18
|
+
cache_file = tempfile.NamedTemporaryFile(delete=False)
|
|
19
|
+
self.cache_path: str = cache_file.name
|
|
20
|
+
self.tokenizer = HuggingFaceTokenizer(SqliteCacheConfig(self.cache_path))
|
|
21
|
+
|
|
22
|
+
def teardown_method(self, method):
|
|
23
|
+
os.remove(self.cache_path)
|
|
24
|
+
|
|
25
|
+
def test_tokenize(self):
|
|
26
|
+
request = TokenizationRequest(text="I am a computer scientist.", tokenizer="huggingface/gpt2")
|
|
27
|
+
result: TokenizationRequestResult = self.tokenizer.tokenize(request)
|
|
28
|
+
assert not result.cached, "First time making the tokenize request. Result should not be cached"
|
|
29
|
+
result = self.tokenizer.tokenize(request)
|
|
30
|
+
assert result.cached, "Result should be cached"
|
|
31
|
+
assert result.raw_tokens == ["I", " am", " a", " computer", " scientist", "."]
|
|
32
|
+
|
|
33
|
+
def test_encode(self):
|
|
34
|
+
request = TokenizationRequest(
|
|
35
|
+
text="I am a computer scientist.", tokenizer="huggingface/gpt2", encode=True, truncation=True, max_length=1
|
|
36
|
+
)
|
|
37
|
+
result: TokenizationRequestResult = self.tokenizer.tokenize(request)
|
|
38
|
+
assert not result.cached, "First time making the tokenize request. Result should not be cached"
|
|
39
|
+
result = self.tokenizer.tokenize(request)
|
|
40
|
+
assert result.cached, "Result should be cached"
|
|
41
|
+
assert result.raw_tokens == [40]
|
|
42
|
+
|
|
43
|
+
request = TokenizationRequest(
|
|
44
|
+
text="I am a computer scientist.",
|
|
45
|
+
tokenizer="huggingface/gpt2",
|
|
46
|
+
encode=True,
|
|
47
|
+
truncation=True,
|
|
48
|
+
max_length=1024,
|
|
49
|
+
)
|
|
50
|
+
result = self.tokenizer.tokenize(request)
|
|
51
|
+
assert not result.cached, "First time making this particular request. Result should not be cached"
|
|
52
|
+
assert result.raw_tokens == [40, 716, 257, 3644, 11444, 13]
|
|
53
|
+
|
|
54
|
+
def test_decode(self):
|
|
55
|
+
request = DecodeRequest(tokens=[40, 716, 257, 3644, 11444, 13], tokenizer="huggingface/gpt2")
|
|
56
|
+
result: DecodeRequestResult = self.tokenizer.decode(request)
|
|
57
|
+
assert not result.cached, "First time making the decode request. Result should not be cached"
|
|
58
|
+
result = self.tokenizer.decode(request)
|
|
59
|
+
assert result.cached, "Result should be cached"
|
|
60
|
+
assert result.text == "I am a computer scientist."
|
|
61
|
+
|
|
62
|
+
def test_already_borrowed(self):
|
|
63
|
+
"""Test workaround of the "Already borrowed" bug (#1421) caused by the thread-hostile Hugging Face tokenizer"""
|
|
64
|
+
|
|
65
|
+
def make_tokenize_request(seed: int) -> None:
|
|
66
|
+
request_length = 10
|
|
67
|
+
truncation = bool(seed % 2)
|
|
68
|
+
self.tokenizer.tokenize(
|
|
69
|
+
# The truncation parameter requires setting a flag on the Rust FastTokenizer.
|
|
70
|
+
# Concurrent requests cause concurrent mutations, which results in a Rust concurrency error.
|
|
71
|
+
TokenizationRequest(
|
|
72
|
+
text=str(seed) * request_length, tokenizer="huggingface/gpt2", encode=True, truncation=truncation
|
|
73
|
+
)
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
num_requests = 100
|
|
77
|
+
# Should not raise "Already borrowed" error
|
|
78
|
+
parallel_map(make_tokenize_request, list(range(num_requests)), parallelism=8)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class TestHuggingFaceTokenizer:
|
|
82
|
+
# The following prompt has 51 tokens according to the GPT-2 tokenizer
|
|
83
|
+
TEST_PROMPT: str = (
|
|
84
|
+
"The Center for Research on Foundation Models (CRFM) is "
|
|
85
|
+
"an interdisciplinary initiative born out of the Stanford "
|
|
86
|
+
"Institute for Human-Centered Artificial Intelligence (HAI) "
|
|
87
|
+
"that aims to make fundamental advances in the study, development, "
|
|
88
|
+
"and deployment of foundation models."
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
@staticmethod
|
|
92
|
+
def verify_get_tokenizer(
|
|
93
|
+
tokenizer_name: str, expected_num_tokens: int, pretrained_model_name_or_path: Optional[str] = None
|
|
94
|
+
):
|
|
95
|
+
wrapped_tokenizer = HuggingFaceTokenizer.get_tokenizer(
|
|
96
|
+
helm_tokenizer_name=tokenizer_name,
|
|
97
|
+
pretrained_model_name_or_path=pretrained_model_name_or_path or tokenizer_name,
|
|
98
|
+
)
|
|
99
|
+
assert tokenizer_name in HuggingFaceTokenizer._tokenizers, "Tokenizer should be cached"
|
|
100
|
+
with wrapped_tokenizer as tokenizer:
|
|
101
|
+
assert len(tokenizer.encode(TestHuggingFaceTokenizer.TEST_PROMPT)) == expected_num_tokens
|
|
102
|
+
|
|
103
|
+
def test_get_tokenizer_gpt2(self):
|
|
104
|
+
TestHuggingFaceTokenizer.verify_get_tokenizer("huggingface/gpt2", 51, pretrained_model_name_or_path="gpt2")
|
|
105
|
+
|
|
106
|
+
def test_get_tokenizer_gptj(self):
|
|
107
|
+
TestHuggingFaceTokenizer.verify_get_tokenizer("EleutherAI/gpt-j-6B", 51)
|
|
108
|
+
|
|
109
|
+
def test_get_tokenizer_gptneox(self):
|
|
110
|
+
TestHuggingFaceTokenizer.verify_get_tokenizer("EleutherAI/gpt-neox-20b", 52)
|
|
111
|
+
|
|
112
|
+
def test_get_tokenizer_bloom(self):
|
|
113
|
+
TestHuggingFaceTokenizer.verify_get_tokenizer("bigscience/bloom", 51)
|
|
114
|
+
|
|
115
|
+
def test_get_tokenizer_t0pp(self):
|
|
116
|
+
TestHuggingFaceTokenizer.verify_get_tokenizer("bigscience/T0pp", 58)
|
|
117
|
+
|
|
118
|
+
def test_get_tokenizer_t511b(self):
|
|
119
|
+
TestHuggingFaceTokenizer.verify_get_tokenizer("google/t5-11b", 58, pretrained_model_name_or_path="t5-11b")
|
|
120
|
+
|
|
121
|
+
def test_get_tokenizer_ul2(self):
|
|
122
|
+
TestHuggingFaceTokenizer.verify_get_tokenizer("google/ul2", 58)
|
|
123
|
+
|
|
124
|
+
def test_get_santacoder(self):
|
|
125
|
+
TestHuggingFaceTokenizer.verify_get_tokenizer("bigcode/santacoder", 62)
|
|
126
|
+
|
|
127
|
+
def test_get_clip_tokenizer(self):
|
|
128
|
+
TestHuggingFaceTokenizer.verify_get_tokenizer("openai/clip-vit-large-patch14", 50)
|
|
129
|
+
|
|
130
|
+
def test_gpt2_tokenize_eos(self):
|
|
131
|
+
eos_token: str = "<|endoftext|>"
|
|
132
|
+
wrapped_tokenizer = HuggingFaceTokenizer.get_tokenizer("huggingface/gpt2", pretrained_model_name_or_path="gpt2")
|
|
133
|
+
with wrapped_tokenizer as tokenizer:
|
|
134
|
+
token_ids = tokenizer.encode(eos_token)
|
|
135
|
+
assert singleton(token_ids) == 50256
|
|
136
|
+
assert tokenizer.decode(token_ids) == eos_token
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from helm.common.tokenization_request import (
|
|
2
|
+
DecodeRequest,
|
|
3
|
+
TokenizationRequest,
|
|
4
|
+
TokenizationToken,
|
|
5
|
+
)
|
|
6
|
+
from helm.tokenizers.simple_tokenizer import SimpleTokenizer
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def test_simple_tokenizer_tokenize():
|
|
10
|
+
tokenizer = SimpleTokenizer()
|
|
11
|
+
request = TokenizationRequest(tokenizer="simple/tokenizer1", text="otter 🦦")
|
|
12
|
+
result = tokenizer.tokenize(request)
|
|
13
|
+
assert result.success
|
|
14
|
+
assert not result.cached
|
|
15
|
+
assert result.tokens == [TokenizationToken(token) for token in ["o", "t", "t", "e", "r", " ", "🦦"]]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def test_simple_tokenizer_encode():
|
|
19
|
+
tokenizer = SimpleTokenizer()
|
|
20
|
+
request = TokenizationRequest(tokenizer="simple/tokenizer1", text="otter 🦦", encode=True)
|
|
21
|
+
result = tokenizer.tokenize(request)
|
|
22
|
+
assert result.success
|
|
23
|
+
assert not result.cached
|
|
24
|
+
assert result.tokens == [TokenizationToken(token) for token in [111, 116, 116, 101, 114, 32, 129446]]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def test_simple_tokenizer_decode():
|
|
28
|
+
tokenizer = SimpleTokenizer()
|
|
29
|
+
request = DecodeRequest(tokenizer="simple/tokenizer1", tokens=[111, 116, 116, 101, 114, 32, 129446])
|
|
30
|
+
result = tokenizer.decode(request)
|
|
31
|
+
assert result.success
|
|
32
|
+
assert not result.cached
|
|
33
|
+
assert result.text == "otter 🦦"
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
import dataclasses
|
|
3
|
+
import requests
|
|
4
|
+
from typing import Any, Dict, List, Union, Optional
|
|
5
|
+
|
|
6
|
+
from helm.common.cache import CacheConfig
|
|
7
|
+
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
8
|
+
from helm.common.tokenization_request import (
|
|
9
|
+
TokenizationRequest,
|
|
10
|
+
TokenizationToken,
|
|
11
|
+
)
|
|
12
|
+
from helm.tokenizers.caching_tokenizer import CachingTokenizer
|
|
13
|
+
from helm.proxy.retry import NonRetriableException
|
|
14
|
+
|
|
15
|
+
try:
|
|
16
|
+
import google.auth
|
|
17
|
+
import google.auth.transport.requests
|
|
18
|
+
from google.auth.exceptions import DefaultCredentialsError
|
|
19
|
+
except ModuleNotFoundError as e:
|
|
20
|
+
handle_module_not_found_error(e, ["google"])
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class VertexAIAuthenticationException(NonRetriableException):
|
|
24
|
+
pass
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class VertexAITokenizer(CachingTokenizer):
|
|
28
|
+
"""Google Vertex AI API for tokenization.
|
|
29
|
+
|
|
30
|
+
Doc: https://cloud.google.com/vertex-ai/docs/generative-ai/compute-token"""
|
|
31
|
+
|
|
32
|
+
def __init__(self, project_id: Optional[str], location: Optional[str], cache_config: CacheConfig) -> None:
|
|
33
|
+
super().__init__(cache_config)
|
|
34
|
+
if not project_id:
|
|
35
|
+
raise VertexAIAuthenticationException("credentials.conf is missing googleProjectId")
|
|
36
|
+
if not location:
|
|
37
|
+
raise VertexAIAuthenticationException("credentials.conf is missing googleLocation")
|
|
38
|
+
self.project_id = project_id
|
|
39
|
+
self.location = location
|
|
40
|
+
try:
|
|
41
|
+
creds, _ = google.auth.default(quota_project_id=self.project_id)
|
|
42
|
+
auth_req = google.auth.transport.requests.Request()
|
|
43
|
+
creds.refresh(auth_req)
|
|
44
|
+
except DefaultCredentialsError as e:
|
|
45
|
+
raise VertexAIAuthenticationException(
|
|
46
|
+
"Log in using `gcloud auth application-default login` to use the Google Vertex tokenizer API"
|
|
47
|
+
) from e
|
|
48
|
+
self.access_token = creds.token
|
|
49
|
+
|
|
50
|
+
def _tokenization_request_to_cache_key(self, request: TokenizationRequest) -> Dict[str, Any]:
|
|
51
|
+
cache_key = dataclasses.asdict(request)
|
|
52
|
+
# Delete encode because the Google Vertex AI API simultaneously gives string and integer tokens.
|
|
53
|
+
del cache_key["encode"]
|
|
54
|
+
return cache_key
|
|
55
|
+
|
|
56
|
+
def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
|
|
57
|
+
text: str = request["text"]
|
|
58
|
+
tokenizer_name = request["tokenizer"].split("/", maxsplit=1)[1]
|
|
59
|
+
url = (
|
|
60
|
+
f"https://{self.location}-aiplatform.googleapis.com/v1/projects/{self.project_id}/"
|
|
61
|
+
f"locations/{self.location}/publishers/google/models/{tokenizer_name}:computeTokens"
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
headers = {"Authorization": f"Bearer {self.access_token}"}
|
|
65
|
+
body = {
|
|
66
|
+
"instances": [{"prompt": text}],
|
|
67
|
+
}
|
|
68
|
+
response = requests.post(url, headers=headers, json=body)
|
|
69
|
+
response.raise_for_status()
|
|
70
|
+
return response.json()
|
|
71
|
+
|
|
72
|
+
def _tokenization_raw_response_to_tokens(
|
|
73
|
+
self, response: Dict[str, Any], request: TokenizationRequest
|
|
74
|
+
) -> List[TokenizationToken]:
|
|
75
|
+
tokens: List[Union[int, str]]
|
|
76
|
+
response_instance = response["tokensInfo"][0]
|
|
77
|
+
if not response_instance:
|
|
78
|
+
# Response was empty
|
|
79
|
+
tokens = []
|
|
80
|
+
else:
|
|
81
|
+
if request.encode:
|
|
82
|
+
tokens = [int(token) for token in response_instance["tokenIds"]]
|
|
83
|
+
else:
|
|
84
|
+
# errors="ignore" is needed because the tokenizer is not guaranteed to tokenize on
|
|
85
|
+
# the boundary of UTF-8 characters. The tokenization boundary can be within the bytes of
|
|
86
|
+
# a UTF-8 character.
|
|
87
|
+
#
|
|
88
|
+
# TODO(#2141): Come up with a more correct way of doing this.
|
|
89
|
+
tokens = [
|
|
90
|
+
base64.decodebytes(token.encode()).decode("utf-8", errors="ignore")
|
|
91
|
+
for token in response_instance["tokens"]
|
|
92
|
+
]
|
|
93
|
+
return [TokenizationToken(token) for token in tokens]
|
|
94
|
+
|
|
95
|
+
def _decode_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
|
|
96
|
+
# Defined for mypy but decode() already raises NotImplementedError
|
|
97
|
+
raise NotImplementedError("The Google Vertex AI API does not support decoding.")
|
|
@@ -21,9 +21,11 @@ class YaLMTokenizer(CachingTokenizer):
|
|
|
21
21
|
# This is a problem because then tokenize(" Hello", encode=False) == tokenize("Hello", encode=False)
|
|
22
22
|
# That is why we manually replace "▁" with a space.
|
|
23
23
|
return {
|
|
24
|
-
"tokens":
|
|
25
|
-
|
|
26
|
-
|
|
24
|
+
"tokens": (
|
|
25
|
+
token_ids
|
|
26
|
+
if request["encode"]
|
|
27
|
+
else cleanup_tokens(self._tokenizer.convert_ids_to_tokens(token_ids), request["tokenizer"])
|
|
28
|
+
)
|
|
27
29
|
}
|
|
28
30
|
|
|
29
31
|
def _decode_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
|
|
File without changes
|
|
Binary file
|
|
@@ -16,7 +16,7 @@ adapted from https://github.com/yandex/YaLM-100B/blob/main/megatron_lm/megatron/
|
|
|
16
16
|
"""
|
|
17
17
|
|
|
18
18
|
|
|
19
|
-
YALM_TOKENIZER_PACKAGE: str = "helm.
|
|
19
|
+
YALM_TOKENIZER_PACKAGE: str = "helm.tokenizers.yalm_tokenizer_data"
|
|
20
20
|
YALM_TOKENIZER_VOCAB_FILENAME: str = "voc_100b.sp"
|
|
21
21
|
|
|
22
22
|
|