keras_hub_nightly-0.15.0.dev20240823171555-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keras_hub/__init__.py +52 -0
- keras_hub/api/__init__.py +27 -0
- keras_hub/api/layers/__init__.py +47 -0
- keras_hub/api/metrics/__init__.py +24 -0
- keras_hub/api/models/__init__.py +249 -0
- keras_hub/api/samplers/__init__.py +29 -0
- keras_hub/api/tokenizers/__init__.py +35 -0
- keras_hub/src/__init__.py +13 -0
- keras_hub/src/api_export.py +53 -0
- keras_hub/src/layers/__init__.py +13 -0
- keras_hub/src/layers/modeling/__init__.py +13 -0
- keras_hub/src/layers/modeling/alibi_bias.py +143 -0
- keras_hub/src/layers/modeling/cached_multi_head_attention.py +137 -0
- keras_hub/src/layers/modeling/f_net_encoder.py +200 -0
- keras_hub/src/layers/modeling/masked_lm_head.py +239 -0
- keras_hub/src/layers/modeling/position_embedding.py +123 -0
- keras_hub/src/layers/modeling/reversible_embedding.py +311 -0
- keras_hub/src/layers/modeling/rotary_embedding.py +169 -0
- keras_hub/src/layers/modeling/sine_position_encoding.py +108 -0
- keras_hub/src/layers/modeling/token_and_position_embedding.py +150 -0
- keras_hub/src/layers/modeling/transformer_decoder.py +496 -0
- keras_hub/src/layers/modeling/transformer_encoder.py +262 -0
- keras_hub/src/layers/modeling/transformer_layer_utils.py +106 -0
- keras_hub/src/layers/preprocessing/__init__.py +13 -0
- keras_hub/src/layers/preprocessing/masked_lm_mask_generator.py +220 -0
- keras_hub/src/layers/preprocessing/multi_segment_packer.py +319 -0
- keras_hub/src/layers/preprocessing/preprocessing_layer.py +62 -0
- keras_hub/src/layers/preprocessing/random_deletion.py +271 -0
- keras_hub/src/layers/preprocessing/random_swap.py +267 -0
- keras_hub/src/layers/preprocessing/start_end_packer.py +219 -0
- keras_hub/src/metrics/__init__.py +13 -0
- keras_hub/src/metrics/bleu.py +394 -0
- keras_hub/src/metrics/edit_distance.py +197 -0
- keras_hub/src/metrics/perplexity.py +181 -0
- keras_hub/src/metrics/rouge_base.py +204 -0
- keras_hub/src/metrics/rouge_l.py +97 -0
- keras_hub/src/metrics/rouge_n.py +125 -0
- keras_hub/src/models/__init__.py +13 -0
- keras_hub/src/models/albert/__init__.py +20 -0
- keras_hub/src/models/albert/albert_backbone.py +267 -0
- keras_hub/src/models/albert/albert_classifier.py +202 -0
- keras_hub/src/models/albert/albert_masked_lm.py +129 -0
- keras_hub/src/models/albert/albert_masked_lm_preprocessor.py +194 -0
- keras_hub/src/models/albert/albert_preprocessor.py +206 -0
- keras_hub/src/models/albert/albert_presets.py +70 -0
- keras_hub/src/models/albert/albert_tokenizer.py +119 -0
- keras_hub/src/models/backbone.py +311 -0
- keras_hub/src/models/bart/__init__.py +20 -0
- keras_hub/src/models/bart/bart_backbone.py +261 -0
- keras_hub/src/models/bart/bart_preprocessor.py +276 -0
- keras_hub/src/models/bart/bart_presets.py +74 -0
- keras_hub/src/models/bart/bart_seq_2_seq_lm.py +490 -0
- keras_hub/src/models/bart/bart_seq_2_seq_lm_preprocessor.py +262 -0
- keras_hub/src/models/bart/bart_tokenizer.py +124 -0
- keras_hub/src/models/bert/__init__.py +23 -0
- keras_hub/src/models/bert/bert_backbone.py +227 -0
- keras_hub/src/models/bert/bert_classifier.py +183 -0
- keras_hub/src/models/bert/bert_masked_lm.py +131 -0
- keras_hub/src/models/bert/bert_masked_lm_preprocessor.py +198 -0
- keras_hub/src/models/bert/bert_preprocessor.py +184 -0
- keras_hub/src/models/bert/bert_presets.py +147 -0
- keras_hub/src/models/bert/bert_tokenizer.py +112 -0
- keras_hub/src/models/bloom/__init__.py +20 -0
- keras_hub/src/models/bloom/bloom_attention.py +186 -0
- keras_hub/src/models/bloom/bloom_backbone.py +173 -0
- keras_hub/src/models/bloom/bloom_causal_lm.py +298 -0
- keras_hub/src/models/bloom/bloom_causal_lm_preprocessor.py +176 -0
- keras_hub/src/models/bloom/bloom_decoder.py +206 -0
- keras_hub/src/models/bloom/bloom_preprocessor.py +185 -0
- keras_hub/src/models/bloom/bloom_presets.py +121 -0
- keras_hub/src/models/bloom/bloom_tokenizer.py +116 -0
- keras_hub/src/models/causal_lm.py +383 -0
- keras_hub/src/models/classifier.py +109 -0
- keras_hub/src/models/csp_darknet/__init__.py +13 -0
- keras_hub/src/models/csp_darknet/csp_darknet_backbone.py +410 -0
- keras_hub/src/models/csp_darknet/csp_darknet_image_classifier.py +133 -0
- keras_hub/src/models/deberta_v3/__init__.py +24 -0
- keras_hub/src/models/deberta_v3/deberta_v3_backbone.py +210 -0
- keras_hub/src/models/deberta_v3/deberta_v3_classifier.py +228 -0
- keras_hub/src/models/deberta_v3/deberta_v3_masked_lm.py +135 -0
- keras_hub/src/models/deberta_v3/deberta_v3_masked_lm_preprocessor.py +191 -0
- keras_hub/src/models/deberta_v3/deberta_v3_preprocessor.py +206 -0
- keras_hub/src/models/deberta_v3/deberta_v3_presets.py +82 -0
- keras_hub/src/models/deberta_v3/deberta_v3_tokenizer.py +155 -0
- keras_hub/src/models/deberta_v3/disentangled_attention_encoder.py +227 -0
- keras_hub/src/models/deberta_v3/disentangled_self_attention.py +412 -0
- keras_hub/src/models/deberta_v3/relative_embedding.py +94 -0
- keras_hub/src/models/densenet/__init__.py +13 -0
- keras_hub/src/models/densenet/densenet_backbone.py +210 -0
- keras_hub/src/models/densenet/densenet_image_classifier.py +131 -0
- keras_hub/src/models/distil_bert/__init__.py +26 -0
- keras_hub/src/models/distil_bert/distil_bert_backbone.py +187 -0
- keras_hub/src/models/distil_bert/distil_bert_classifier.py +208 -0
- keras_hub/src/models/distil_bert/distil_bert_masked_lm.py +137 -0
- keras_hub/src/models/distil_bert/distil_bert_masked_lm_preprocessor.py +194 -0
- keras_hub/src/models/distil_bert/distil_bert_preprocessor.py +175 -0
- keras_hub/src/models/distil_bert/distil_bert_presets.py +57 -0
- keras_hub/src/models/distil_bert/distil_bert_tokenizer.py +114 -0
- keras_hub/src/models/electra/__init__.py +20 -0
- keras_hub/src/models/electra/electra_backbone.py +247 -0
- keras_hub/src/models/electra/electra_preprocessor.py +154 -0
- keras_hub/src/models/electra/electra_presets.py +95 -0
- keras_hub/src/models/electra/electra_tokenizer.py +104 -0
- keras_hub/src/models/f_net/__init__.py +20 -0
- keras_hub/src/models/f_net/f_net_backbone.py +236 -0
- keras_hub/src/models/f_net/f_net_classifier.py +154 -0
- keras_hub/src/models/f_net/f_net_masked_lm.py +132 -0
- keras_hub/src/models/f_net/f_net_masked_lm_preprocessor.py +196 -0
- keras_hub/src/models/f_net/f_net_preprocessor.py +177 -0
- keras_hub/src/models/f_net/f_net_presets.py +43 -0
- keras_hub/src/models/f_net/f_net_tokenizer.py +95 -0
- keras_hub/src/models/falcon/__init__.py +20 -0
- keras_hub/src/models/falcon/falcon_attention.py +156 -0
- keras_hub/src/models/falcon/falcon_backbone.py +164 -0
- keras_hub/src/models/falcon/falcon_causal_lm.py +291 -0
- keras_hub/src/models/falcon/falcon_causal_lm_preprocessor.py +173 -0
- keras_hub/src/models/falcon/falcon_preprocessor.py +187 -0
- keras_hub/src/models/falcon/falcon_presets.py +30 -0
- keras_hub/src/models/falcon/falcon_tokenizer.py +110 -0
- keras_hub/src/models/falcon/falcon_transformer_decoder.py +255 -0
- keras_hub/src/models/feature_pyramid_backbone.py +73 -0
- keras_hub/src/models/gemma/__init__.py +20 -0
- keras_hub/src/models/gemma/gemma_attention.py +250 -0
- keras_hub/src/models/gemma/gemma_backbone.py +316 -0
- keras_hub/src/models/gemma/gemma_causal_lm.py +448 -0
- keras_hub/src/models/gemma/gemma_causal_lm_preprocessor.py +167 -0
- keras_hub/src/models/gemma/gemma_decoder_block.py +241 -0
- keras_hub/src/models/gemma/gemma_preprocessor.py +191 -0
- keras_hub/src/models/gemma/gemma_presets.py +248 -0
- keras_hub/src/models/gemma/gemma_tokenizer.py +103 -0
- keras_hub/src/models/gemma/rms_normalization.py +40 -0
- keras_hub/src/models/gpt2/__init__.py +20 -0
- keras_hub/src/models/gpt2/gpt2_backbone.py +199 -0
- keras_hub/src/models/gpt2/gpt2_causal_lm.py +437 -0
- keras_hub/src/models/gpt2/gpt2_causal_lm_preprocessor.py +173 -0
- keras_hub/src/models/gpt2/gpt2_preprocessor.py +187 -0
- keras_hub/src/models/gpt2/gpt2_presets.py +82 -0
- keras_hub/src/models/gpt2/gpt2_tokenizer.py +110 -0
- keras_hub/src/models/gpt_neo_x/__init__.py +13 -0
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_attention.py +251 -0
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_backbone.py +175 -0
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm.py +201 -0
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm_preprocessor.py +141 -0
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_decoder.py +258 -0
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_preprocessor.py +145 -0
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_tokenizer.py +88 -0
- keras_hub/src/models/image_classifier.py +90 -0
- keras_hub/src/models/llama/__init__.py +20 -0
- keras_hub/src/models/llama/llama_attention.py +225 -0
- keras_hub/src/models/llama/llama_backbone.py +188 -0
- keras_hub/src/models/llama/llama_causal_lm.py +327 -0
- keras_hub/src/models/llama/llama_causal_lm_preprocessor.py +170 -0
- keras_hub/src/models/llama/llama_decoder.py +246 -0
- keras_hub/src/models/llama/llama_layernorm.py +48 -0
- keras_hub/src/models/llama/llama_preprocessor.py +189 -0
- keras_hub/src/models/llama/llama_presets.py +80 -0
- keras_hub/src/models/llama/llama_tokenizer.py +84 -0
- keras_hub/src/models/llama3/__init__.py +20 -0
- keras_hub/src/models/llama3/llama3_backbone.py +84 -0
- keras_hub/src/models/llama3/llama3_causal_lm.py +46 -0
- keras_hub/src/models/llama3/llama3_causal_lm_preprocessor.py +173 -0
- keras_hub/src/models/llama3/llama3_preprocessor.py +21 -0
- keras_hub/src/models/llama3/llama3_presets.py +69 -0
- keras_hub/src/models/llama3/llama3_tokenizer.py +63 -0
- keras_hub/src/models/masked_lm.py +101 -0
- keras_hub/src/models/mistral/__init__.py +20 -0
- keras_hub/src/models/mistral/mistral_attention.py +238 -0
- keras_hub/src/models/mistral/mistral_backbone.py +203 -0
- keras_hub/src/models/mistral/mistral_causal_lm.py +328 -0
- keras_hub/src/models/mistral/mistral_causal_lm_preprocessor.py +175 -0
- keras_hub/src/models/mistral/mistral_layer_norm.py +48 -0
- keras_hub/src/models/mistral/mistral_preprocessor.py +190 -0
- keras_hub/src/models/mistral/mistral_presets.py +48 -0
- keras_hub/src/models/mistral/mistral_tokenizer.py +82 -0
- keras_hub/src/models/mistral/mistral_transformer_decoder.py +265 -0
- keras_hub/src/models/mix_transformer/__init__.py +13 -0
- keras_hub/src/models/mix_transformer/mix_transformer_backbone.py +181 -0
- keras_hub/src/models/mix_transformer/mix_transformer_classifier.py +133 -0
- keras_hub/src/models/mix_transformer/mix_transformer_layers.py +300 -0
- keras_hub/src/models/opt/__init__.py +20 -0
- keras_hub/src/models/opt/opt_backbone.py +173 -0
- keras_hub/src/models/opt/opt_causal_lm.py +301 -0
- keras_hub/src/models/opt/opt_causal_lm_preprocessor.py +177 -0
- keras_hub/src/models/opt/opt_preprocessor.py +188 -0
- keras_hub/src/models/opt/opt_presets.py +72 -0
- keras_hub/src/models/opt/opt_tokenizer.py +116 -0
- keras_hub/src/models/pali_gemma/__init__.py +23 -0
- keras_hub/src/models/pali_gemma/pali_gemma_backbone.py +277 -0
- keras_hub/src/models/pali_gemma/pali_gemma_causal_lm.py +313 -0
- keras_hub/src/models/pali_gemma/pali_gemma_causal_lm_preprocessor.py +147 -0
- keras_hub/src/models/pali_gemma/pali_gemma_decoder_block.py +160 -0
- keras_hub/src/models/pali_gemma/pali_gemma_presets.py +78 -0
- keras_hub/src/models/pali_gemma/pali_gemma_tokenizer.py +79 -0
- keras_hub/src/models/pali_gemma/pali_gemma_vit.py +566 -0
- keras_hub/src/models/phi3/__init__.py +20 -0
- keras_hub/src/models/phi3/phi3_attention.py +260 -0
- keras_hub/src/models/phi3/phi3_backbone.py +224 -0
- keras_hub/src/models/phi3/phi3_causal_lm.py +218 -0
- keras_hub/src/models/phi3/phi3_causal_lm_preprocessor.py +173 -0
- keras_hub/src/models/phi3/phi3_decoder.py +260 -0
- keras_hub/src/models/phi3/phi3_layernorm.py +48 -0
- keras_hub/src/models/phi3/phi3_preprocessor.py +190 -0
- keras_hub/src/models/phi3/phi3_presets.py +50 -0
- keras_hub/src/models/phi3/phi3_rotary_embedding.py +137 -0
- keras_hub/src/models/phi3/phi3_tokenizer.py +94 -0
- keras_hub/src/models/preprocessor.py +207 -0
- keras_hub/src/models/resnet/__init__.py +13 -0
- keras_hub/src/models/resnet/resnet_backbone.py +612 -0
- keras_hub/src/models/resnet/resnet_image_classifier.py +136 -0
- keras_hub/src/models/roberta/__init__.py +20 -0
- keras_hub/src/models/roberta/roberta_backbone.py +184 -0
- keras_hub/src/models/roberta/roberta_classifier.py +209 -0
- keras_hub/src/models/roberta/roberta_masked_lm.py +136 -0
- keras_hub/src/models/roberta/roberta_masked_lm_preprocessor.py +198 -0
- keras_hub/src/models/roberta/roberta_preprocessor.py +192 -0
- keras_hub/src/models/roberta/roberta_presets.py +43 -0
- keras_hub/src/models/roberta/roberta_tokenizer.py +132 -0
- keras_hub/src/models/seq_2_seq_lm.py +54 -0
- keras_hub/src/models/t5/__init__.py +20 -0
- keras_hub/src/models/t5/t5_backbone.py +261 -0
- keras_hub/src/models/t5/t5_layer_norm.py +35 -0
- keras_hub/src/models/t5/t5_multi_head_attention.py +324 -0
- keras_hub/src/models/t5/t5_presets.py +95 -0
- keras_hub/src/models/t5/t5_tokenizer.py +100 -0
- keras_hub/src/models/t5/t5_transformer_layer.py +178 -0
- keras_hub/src/models/task.py +419 -0
- keras_hub/src/models/vgg/__init__.py +13 -0
- keras_hub/src/models/vgg/vgg_backbone.py +158 -0
- keras_hub/src/models/vgg/vgg_image_classifier.py +124 -0
- keras_hub/src/models/vit_det/__init__.py +13 -0
- keras_hub/src/models/vit_det/vit_det_backbone.py +204 -0
- keras_hub/src/models/vit_det/vit_layers.py +565 -0
- keras_hub/src/models/whisper/__init__.py +20 -0
- keras_hub/src/models/whisper/whisper_audio_feature_extractor.py +260 -0
- keras_hub/src/models/whisper/whisper_backbone.py +305 -0
- keras_hub/src/models/whisper/whisper_cached_multi_head_attention.py +153 -0
- keras_hub/src/models/whisper/whisper_decoder.py +141 -0
- keras_hub/src/models/whisper/whisper_encoder.py +106 -0
- keras_hub/src/models/whisper/whisper_preprocessor.py +326 -0
- keras_hub/src/models/whisper/whisper_presets.py +148 -0
- keras_hub/src/models/whisper/whisper_tokenizer.py +163 -0
- keras_hub/src/models/xlm_roberta/__init__.py +26 -0
- keras_hub/src/models/xlm_roberta/xlm_roberta_backbone.py +81 -0
- keras_hub/src/models/xlm_roberta/xlm_roberta_classifier.py +225 -0
- keras_hub/src/models/xlm_roberta/xlm_roberta_masked_lm.py +141 -0
- keras_hub/src/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py +195 -0
- keras_hub/src/models/xlm_roberta/xlm_roberta_preprocessor.py +205 -0
- keras_hub/src/models/xlm_roberta/xlm_roberta_presets.py +43 -0
- keras_hub/src/models/xlm_roberta/xlm_roberta_tokenizer.py +191 -0
- keras_hub/src/models/xlnet/__init__.py +13 -0
- keras_hub/src/models/xlnet/relative_attention.py +459 -0
- keras_hub/src/models/xlnet/xlnet_backbone.py +222 -0
- keras_hub/src/models/xlnet/xlnet_content_and_query_embedding.py +133 -0
- keras_hub/src/models/xlnet/xlnet_encoder.py +378 -0
- keras_hub/src/samplers/__init__.py +13 -0
- keras_hub/src/samplers/beam_sampler.py +207 -0
- keras_hub/src/samplers/contrastive_sampler.py +231 -0
- keras_hub/src/samplers/greedy_sampler.py +50 -0
- keras_hub/src/samplers/random_sampler.py +77 -0
- keras_hub/src/samplers/sampler.py +237 -0
- keras_hub/src/samplers/serialization.py +97 -0
- keras_hub/src/samplers/top_k_sampler.py +92 -0
- keras_hub/src/samplers/top_p_sampler.py +113 -0
- keras_hub/src/tests/__init__.py +13 -0
- keras_hub/src/tests/test_case.py +608 -0
- keras_hub/src/tokenizers/__init__.py +13 -0
- keras_hub/src/tokenizers/byte_pair_tokenizer.py +638 -0
- keras_hub/src/tokenizers/byte_tokenizer.py +299 -0
- keras_hub/src/tokenizers/sentence_piece_tokenizer.py +267 -0
- keras_hub/src/tokenizers/sentence_piece_tokenizer_trainer.py +150 -0
- keras_hub/src/tokenizers/tokenizer.py +235 -0
- keras_hub/src/tokenizers/unicode_codepoint_tokenizer.py +355 -0
- keras_hub/src/tokenizers/word_piece_tokenizer.py +544 -0
- keras_hub/src/tokenizers/word_piece_tokenizer_trainer.py +176 -0
- keras_hub/src/utils/__init__.py +13 -0
- keras_hub/src/utils/keras_utils.py +130 -0
- keras_hub/src/utils/pipeline_model.py +293 -0
- keras_hub/src/utils/preset_utils.py +621 -0
- keras_hub/src/utils/python_utils.py +21 -0
- keras_hub/src/utils/tensor_utils.py +206 -0
- keras_hub/src/utils/timm/__init__.py +13 -0
- keras_hub/src/utils/timm/convert.py +37 -0
- keras_hub/src/utils/timm/convert_resnet.py +171 -0
- keras_hub/src/utils/transformers/__init__.py +13 -0
- keras_hub/src/utils/transformers/convert.py +101 -0
- keras_hub/src/utils/transformers/convert_bert.py +173 -0
- keras_hub/src/utils/transformers/convert_distilbert.py +184 -0
- keras_hub/src/utils/transformers/convert_gemma.py +187 -0
- keras_hub/src/utils/transformers/convert_gpt2.py +186 -0
- keras_hub/src/utils/transformers/convert_llama3.py +136 -0
- keras_hub/src/utils/transformers/convert_pali_gemma.py +303 -0
- keras_hub/src/utils/transformers/safetensor_utils.py +97 -0
- keras_hub/src/version_utils.py +23 -0
- keras_hub_nightly-0.15.0.dev20240823171555.dist-info/METADATA +34 -0
- keras_hub_nightly-0.15.0.dev20240823171555.dist-info/RECORD +297 -0
- keras_hub_nightly-0.15.0.dev20240823171555.dist-info/WHEEL +5 -0
- keras_hub_nightly-0.15.0.dev20240823171555.dist-info/top_level.txt +1 -0
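The file listing above maps directly onto the `keras_hub` import namespace. As a minimal sketch (not part of the diff, and assuming a standard pip environment and that `version_utils.py` exposes the usual `keras_hub.version` export), the nightly build can be installed and its version string checked:

# Shell step, shown here as a comment:
#   pip install keras-hub-nightly==0.15.0.dev20240823171555
import keras_hub

# `keras_hub.version()` is assumed to be the export from
# keras_hub/src/version_utils.py listed above.
print(keras_hub.version())  # expected: "0.15.0.dev20240823171555"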
keras_hub/src/tokenizers/unicode_codepoint_tokenizer.py
@@ -0,0 +1,355 @@
+# Copyright 2024 The KerasHub Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.tokenizers import tokenizer
+from keras_hub.src.utils.tensor_utils import convert_to_ragged_batch
+from keras_hub.src.utils.tensor_utils import is_int_dtype
+
+try:
+    import tensorflow as tf
+    import tensorflow_text as tf_text
+except ImportError:
+    tf = None
+    tf_text = None
+
+
+@keras_hub_export("keras_hub.tokenizers.UnicodeCodepointTokenizer")
+class UnicodeCodepointTokenizer(tokenizer.Tokenizer):
+    """A unicode character tokenizer layer.
+
+    This tokenizer is a vocabulary-free tokenizer which tokenizes text as
+    unicode character codepoints.
+
+    Tokenizer outputs can either be padded and truncated with a
+    `sequence_length` argument, or left un-truncated. The exact output will
+    depend on the rank of the input tensors.
+
+    If input is a batch of strings (rank > 0):
+    By default, the layer will output a `tf.RaggedTensor` where the last
+    dimension of the output is ragged. If `sequence_length` is set, the layer
+    will output a dense `tf.Tensor` where all inputs have been padded or
+    truncated to `sequence_length`.
+
+    If input is a scalar string (rank == 0):
+    By default, the layer will output a dense `tf.Tensor` with static shape
+    `[None]`. If `sequence_length` is set, the output will be
+    a dense `tf.Tensor` of shape `[sequence_length]`.
+
+    The output dtype can be controlled via the `dtype` argument, which should
+    be an integer type ("int16", "int32", etc.).
+
+    Args:
+        lowercase: If `True`, the input text will be lowercased before
+            tokenization.
+        sequence_length: If set, the output will be converted to a dense
+            tensor and padded/trimmed so all outputs are of `sequence_length`.
+        normalization_form: One of the following string values (None, 'NFC',
+            'NFKC', 'NFD', 'NFKD'). If set, will normalize unicode to the
+            given form before tokenizing.
+        errors: One of ('replace', 'ignore', 'strict'). Specifies the
+            `detokenize()` behavior when an invalid codepoint is encountered.
+            The value of `'strict'` will cause the tokenizer to produce a
+            `InvalidArgument` error on any invalid input formatting. A value of
+            `'replace'` will cause the tokenizer to replace any invalid
+            formatting in the input with the `replacement_char` codepoint.
+            A value of `'ignore'` will cause the tokenizer to skip any invalid
+            formatting in the input and produce no corresponding output
+            character.
+        replacement_char: The unicode codepoint to use in place of invalid
+            codepoints. Defaults to `65533` (U+FFFD).
+        input_encoding: One of ("UTF-8", "UTF-16-BE", or "UTF-32-BE").
+            The encoding of the input text. Defaults to `"UTF-8"`.
+        output_encoding: One of ("UTF-8", "UTF-16-BE", or "UTF-32-BE").
+            The encoding of the output text. Defaults to `"UTF-8"`.
+        vocabulary_size: If set, limits the vocabulary to `vocabulary_size`
+            by clamping all codepoints to the range [0, vocabulary_size).
+            Effectively this will make the `vocabulary_size - 1` id the
+            OOV value.
+
+    Examples:
+
+    Basic Usage.
+    >>> inputs = "Unicode Tokenizer"
+    >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer()
+    >>> outputs = tokenizer(inputs)
+    >>> np.array(outputs)
+    array([117, 110, 105,  99, 111, 100, 101,  32, 116, 111, 107, 101, 110,
+           105, 122, 101, 114], dtype=int32)
+
+    Ragged outputs.
+    >>> inputs = ["पुस्तक", "کتاب"]
+    >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer()
+    >>> seq1, seq2 = tokenizer(inputs)
+    >>> np.array(seq1)
+    array([2346, 2369, 2360, 2381, 2340, 2325], dtype=int32)
+    >>> np.array(seq2)
+    array([1705, 1578, 1575, 1576], dtype=int32)
+
+    Dense outputs.
+    >>> inputs = ["पुस्तक", "کتاب"]
+    >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer(
+    ...     sequence_length=8)
+    >>> seq1, seq2 = tokenizer(inputs)
+    >>> np.array(seq1)
+    array([2346, 2369, 2360, 2381, 2340, 2325,    0,    0], dtype=int32)
+    >>> np.array(seq2)
+    array([1705, 1578, 1575, 1576,    0,    0,    0,    0], dtype=int32)
+
+    Tokenize, then batch for ragged outputs.
+    >>> inputs = ["Book", "पुस्तक", "کتاب"]
+    >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer()
+    >>> ds = tf.data.Dataset.from_tensor_slices(inputs)
+    >>> ds = ds.map(tokenizer)
+    >>> ds = ds.apply(tf.data.experimental.dense_to_ragged_batch(3))
+    >>> ds.take(1).get_single_element()
+    <tf.RaggedTensor [[98, 111, 111, 107],
+        [2346, 2369, 2360, 2381, 2340, 2325],
+        [1705, 1578, 1575, 1576]]>
+
+    Batch, then tokenize for ragged outputs.
+    >>> inputs = ["Book", "पुस्तक", "کتاب"]
+    >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer()
+    >>> ds = tf.data.Dataset.from_tensor_slices(inputs)
+    >>> ds = ds.batch(3).map(tokenizer)
+    >>> ds.take(1).get_single_element()
+    <tf.RaggedTensor [[98, 111, 111, 107],
+        [2346, 2369, 2360, 2381, 2340, 2325],
+        [1705, 1578, 1575, 1576]]>
+
+    Tokenize, then batch for dense outputs (`sequence_length` provided).
+    >>> inputs = ["Book", "पुस्तक", "کتاب"]
+    >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer(
+    ...     sequence_length=5)
+    >>> ds = tf.data.Dataset.from_tensor_slices(inputs)
+    >>> ds = ds.map(tokenizer)
+    >>> ds = ds.apply(tf.data.experimental.dense_to_ragged_batch(3))
+    >>> ds.take(1).get_single_element()
+    <tf.Tensor: shape=(3, 5), dtype=int32, numpy=
+    array([[  98,  111,  111,  107,    0],
+           [2346, 2369, 2360, 2381, 2340],
+           [1705, 1578, 1575, 1576,    0]], dtype=int32)>
+
+    Batch, then tokenize for dense outputs (`sequence_length` provided).
+    >>> inputs = ["Book", "पुस्तक", "کتاب"]
+    >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer(
+    ...     sequence_length=5)
+    >>> ds = tf.data.Dataset.from_tensor_slices(inputs)
+    >>> ds = ds.batch(3).map(tokenizer)
+    >>> ds.take(1).get_single_element()
+    <tf.Tensor: shape=(3, 5), dtype=int32, numpy=
+    array([[  98,  111,  111,  107,    0],
+           [2346, 2369, 2360, 2381, 2340],
+           [1705, 1578, 1575, 1576,    0]], dtype=int32)>
+
+    Tokenization with truncation.
+    >>> inputs = ["I Like to Travel a Lot", "मैं किताबें पढ़ना पसंद करता हूं"]
+    >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer(
+    ...     sequence_length=5)
+    >>> outputs = tokenizer(inputs)
+    >>> np.array(outputs)
+    array([[ 105,   32,  108,  105,  107],
+           [2350, 2376, 2306,   32, 2325]], dtype=int32)
+
+    Tokenization with vocabulary_size.
+    >>> latin_ext_cutoff = 592
+    >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer(
+    ...     vocabulary_size=latin_ext_cutoff)
+    >>> outputs = tokenizer("¿Cómo estás?")
+    >>> np.array(outputs)
+    array([191,  99, 243, 109, 111,  32, 101, 115, 116, 225, 115,  63],
+          dtype=int32)
+    >>> outputs = tokenizer("आप कैसे हैं")
+    >>> np.array(outputs)
+    array([591, 591,  32, 591, 591, 591, 591,  32, 591, 591, 591],
+          dtype=int32)
+
+    Detokenization.
+    >>> inputs = tf.constant([110, 105, 110, 106,  97], dtype="int32")
+    >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer()
+    >>> outputs = tokenizer.detokenize(inputs)
+    >>> np.array(outputs).astype("U")
+    array('ninja', dtype='<U5')
+
+    Detokenization with padding.
+    >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer(
+    ...     sequence_length=7)
+    >>> dataset = tf.data.Dataset.from_tensor_slices(["a b c", "b c", "a"])
+    >>> dataset = dataset.map(tokenizer)
+    >>> dataset.take(1).get_single_element()
+    <tf.Tensor: shape=(7,), dtype=int32,
+    numpy=array([97, 32, 98, 32, 99,  0,  0], dtype=int32)>
+    >>> detokunbatched = dataset.map(tokenizer.detokenize)
+    >>> detokunbatched.take(1).get_single_element()
+    <tf.Tensor: shape=(), dtype=string, numpy=b'a b c'>
+
+    Detokenization with invalid bytes.
+    >>> inputs = tf.constant([110, 105, 10000000, 110, 106, 97])
+    >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer(
+    ...     errors="replace", replacement_char=88)
+    >>> outputs = tokenizer.detokenize(inputs)
+    >>> np.array(outputs).astype("U")
+    array('niXnja', dtype='<U6')
+    """
+
+    def __init__(
+        self,
+        sequence_length=None,
+        lowercase=True,
+        normalization_form=None,
+        errors="replace",
+        replacement_char=65533,
+        input_encoding="UTF-8",
+        output_encoding="UTF-8",
+        vocabulary_size=None,
+        dtype="int32",
+        **kwargs,
+    ) -> None:
+        if not is_int_dtype(dtype):
+            raise ValueError(
+                "Output dtype must be an integer type. "
+                f"Received: dtype={dtype}"
+            )
+
+        # Check normalization_form.
+        if normalization_form not in [None, "NFC", "NFKC", "NFD", "NFKD"]:
+            raise ValueError(
+                '`normalization_form` must be one of None, "NFC", "NFKC", '
+                '"NFD", "NFKD". Received: normalization_form='
+                f"{normalization_form}"
+            )
+
+        # Check errors.
+        if errors not in ["strict", "replace", "ignore"]:
+            raise ValueError(
+                '`errors` must be one of "strict", "replace", "ignore". '
+                f"Received: errors={errors}"
+            )
+
+        # Check normalization_form matches input_encoding.
+        if normalization_form:
+            if input_encoding != "UTF-8":
+                raise ValueError(
+                    "Normalization forms are only supported for input "
+                    'encoding "UTF-8".'
+                )
+
+        super().__init__(dtype=dtype, **kwargs)
+
+        self.sequence_length = sequence_length
+        self.lowercase = lowercase
+        self.normalization_form = normalization_form
+        self.errors = errors
+        self.replacement_char = replacement_char
+        self.input_encoding = input_encoding
+        self.output_encoding = output_encoding
+        self._vocabulary_size = vocabulary_size
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "sequence_length": self.sequence_length,
+                "lowercase": self.lowercase,
+                "normalization_form": self.normalization_form,
+                "errors": self.errors,
+                "replacement_char": self.replacement_char,
+                "input_encoding": self.input_encoding,
+                "output_encoding": self.output_encoding,
+                "vocabulary_size": self._vocabulary_size,
+            }
+        )
+        return config
+
+    def vocabulary_size(self):
+        """Get the size of the tokenizer vocabulary. None implies no
+        vocabulary size was provided."""
+        return self._vocabulary_size
+
+    def get_vocabulary(self):
+        vocab = {}
+        for i in range(self.vocabulary_size()):
+            vocab[chr(i)] = i
+        return vocab
+
+    def tokenize(self, inputs):
+        if not isinstance(inputs, (tf.Tensor, tf.RaggedTensor)):
+            inputs = tf.convert_to_tensor(inputs)
+
+        scalar_input = inputs.shape.rank == 0
+        if scalar_input:
+            inputs = tf.expand_dims(inputs, 0)
+
+        # Optionally lowercase the text.
+        if self.lowercase:
+            inputs = tf_text.case_fold_utf8(inputs)
+
+        # Optionally normalize the text to a given form.
+        if self.normalization_form:
+            inputs = tf_text.normalize_utf8(inputs, self.normalization_form)
+
+        tokens = tf.strings.unicode_decode(
+            inputs,
+            errors=self.errors,
+            replacement_char=self.replacement_char,
+            input_encoding=self.input_encoding,
+        )
+        tokens = tf.cast(tokens, self.compute_dtype)
+
+        if self.sequence_length:
+            output_shape = tokens.shape.as_list()
+            output_shape[-1] = self.sequence_length
+            tokens = tokens.to_tensor(shape=output_shape)
+
+        if scalar_input:
+            tokens = tf.squeeze(tokens, 0)
+
+        # Optionally clamp the output codepoint values to be in the
+        # range [0, vocabulary_size).
+        if self._vocabulary_size:
+            tokens = tf.clip_by_value(tokens, 0, self._vocabulary_size - 1)
+
+        return tokens
+
+    def detokenize(self, inputs):
+        inputs, unbatched, _ = convert_to_ragged_batch(inputs)
+        inputs = tf.ragged.boolean_mask(inputs, tf.not_equal(inputs, 0))
+        outputs = tf.strings.unicode_encode(
+            inputs,
+            errors=self.errors,
+            replacement_char=self.replacement_char,
+            output_encoding=self.output_encoding,
+        )
+        if unbatched:
+            outputs = tf.squeeze(outputs, 0)
+        return outputs
+
+    def id_to_token(self, id):
+        """Convert an integer id to a string token."""
+        if id >= self.vocabulary_size() or id < 0:
+            raise ValueError(
+                f"`id` must be in range [0, {self.vocabulary_size() - 1}]. "
+                f"Received: {id}"
+            )
+        return chr(id)
+
+    def token_to_id(self, token):
+        """Convert a string token to an integer id."""
+        id = ord(token)
+        if id >= self.vocabulary_size():
+            raise ValueError(
+                f"Token {token} is not supported by `UnicodeCodepointTokenizer`."
+            )
+        return id