keras-hub-nightly 0.15.0.dev20240823171555 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keras_hub/__init__.py +52 -0
- keras_hub/api/__init__.py +27 -0
- keras_hub/api/layers/__init__.py +47 -0
- keras_hub/api/metrics/__init__.py +24 -0
- keras_hub/api/models/__init__.py +249 -0
- keras_hub/api/samplers/__init__.py +29 -0
- keras_hub/api/tokenizers/__init__.py +35 -0
- keras_hub/src/__init__.py +13 -0
- keras_hub/src/api_export.py +53 -0
- keras_hub/src/layers/__init__.py +13 -0
- keras_hub/src/layers/modeling/__init__.py +13 -0
- keras_hub/src/layers/modeling/alibi_bias.py +143 -0
- keras_hub/src/layers/modeling/cached_multi_head_attention.py +137 -0
- keras_hub/src/layers/modeling/f_net_encoder.py +200 -0
- keras_hub/src/layers/modeling/masked_lm_head.py +239 -0
- keras_hub/src/layers/modeling/position_embedding.py +123 -0
- keras_hub/src/layers/modeling/reversible_embedding.py +311 -0
- keras_hub/src/layers/modeling/rotary_embedding.py +169 -0
- keras_hub/src/layers/modeling/sine_position_encoding.py +108 -0
- keras_hub/src/layers/modeling/token_and_position_embedding.py +150 -0
- keras_hub/src/layers/modeling/transformer_decoder.py +496 -0
- keras_hub/src/layers/modeling/transformer_encoder.py +262 -0
- keras_hub/src/layers/modeling/transformer_layer_utils.py +106 -0
- keras_hub/src/layers/preprocessing/__init__.py +13 -0
- keras_hub/src/layers/preprocessing/masked_lm_mask_generator.py +220 -0
- keras_hub/src/layers/preprocessing/multi_segment_packer.py +319 -0
- keras_hub/src/layers/preprocessing/preprocessing_layer.py +62 -0
- keras_hub/src/layers/preprocessing/random_deletion.py +271 -0
- keras_hub/src/layers/preprocessing/random_swap.py +267 -0
- keras_hub/src/layers/preprocessing/start_end_packer.py +219 -0
- keras_hub/src/metrics/__init__.py +13 -0
- keras_hub/src/metrics/bleu.py +394 -0
- keras_hub/src/metrics/edit_distance.py +197 -0
- keras_hub/src/metrics/perplexity.py +181 -0
- keras_hub/src/metrics/rouge_base.py +204 -0
- keras_hub/src/metrics/rouge_l.py +97 -0
- keras_hub/src/metrics/rouge_n.py +125 -0
- keras_hub/src/models/__init__.py +13 -0
- keras_hub/src/models/albert/__init__.py +20 -0
- keras_hub/src/models/albert/albert_backbone.py +267 -0
- keras_hub/src/models/albert/albert_classifier.py +202 -0
- keras_hub/src/models/albert/albert_masked_lm.py +129 -0
- keras_hub/src/models/albert/albert_masked_lm_preprocessor.py +194 -0
- keras_hub/src/models/albert/albert_preprocessor.py +206 -0
- keras_hub/src/models/albert/albert_presets.py +70 -0
- keras_hub/src/models/albert/albert_tokenizer.py +119 -0
- keras_hub/src/models/backbone.py +311 -0
- keras_hub/src/models/bart/__init__.py +20 -0
- keras_hub/src/models/bart/bart_backbone.py +261 -0
- keras_hub/src/models/bart/bart_preprocessor.py +276 -0
- keras_hub/src/models/bart/bart_presets.py +74 -0
- keras_hub/src/models/bart/bart_seq_2_seq_lm.py +490 -0
- keras_hub/src/models/bart/bart_seq_2_seq_lm_preprocessor.py +262 -0
- keras_hub/src/models/bart/bart_tokenizer.py +124 -0
- keras_hub/src/models/bert/__init__.py +23 -0
- keras_hub/src/models/bert/bert_backbone.py +227 -0
- keras_hub/src/models/bert/bert_classifier.py +183 -0
- keras_hub/src/models/bert/bert_masked_lm.py +131 -0
- keras_hub/src/models/bert/bert_masked_lm_preprocessor.py +198 -0
- keras_hub/src/models/bert/bert_preprocessor.py +184 -0
- keras_hub/src/models/bert/bert_presets.py +147 -0
- keras_hub/src/models/bert/bert_tokenizer.py +112 -0
- keras_hub/src/models/bloom/__init__.py +20 -0
- keras_hub/src/models/bloom/bloom_attention.py +186 -0
- keras_hub/src/models/bloom/bloom_backbone.py +173 -0
- keras_hub/src/models/bloom/bloom_causal_lm.py +298 -0
- keras_hub/src/models/bloom/bloom_causal_lm_preprocessor.py +176 -0
- keras_hub/src/models/bloom/bloom_decoder.py +206 -0
- keras_hub/src/models/bloom/bloom_preprocessor.py +185 -0
- keras_hub/src/models/bloom/bloom_presets.py +121 -0
- keras_hub/src/models/bloom/bloom_tokenizer.py +116 -0
- keras_hub/src/models/causal_lm.py +383 -0
- keras_hub/src/models/classifier.py +109 -0
- keras_hub/src/models/csp_darknet/__init__.py +13 -0
- keras_hub/src/models/csp_darknet/csp_darknet_backbone.py +410 -0
- keras_hub/src/models/csp_darknet/csp_darknet_image_classifier.py +133 -0
- keras_hub/src/models/deberta_v3/__init__.py +24 -0
- keras_hub/src/models/deberta_v3/deberta_v3_backbone.py +210 -0
- keras_hub/src/models/deberta_v3/deberta_v3_classifier.py +228 -0
- keras_hub/src/models/deberta_v3/deberta_v3_masked_lm.py +135 -0
- keras_hub/src/models/deberta_v3/deberta_v3_masked_lm_preprocessor.py +191 -0
- keras_hub/src/models/deberta_v3/deberta_v3_preprocessor.py +206 -0
- keras_hub/src/models/deberta_v3/deberta_v3_presets.py +82 -0
- keras_hub/src/models/deberta_v3/deberta_v3_tokenizer.py +155 -0
- keras_hub/src/models/deberta_v3/disentangled_attention_encoder.py +227 -0
- keras_hub/src/models/deberta_v3/disentangled_self_attention.py +412 -0
- keras_hub/src/models/deberta_v3/relative_embedding.py +94 -0
- keras_hub/src/models/densenet/__init__.py +13 -0
- keras_hub/src/models/densenet/densenet_backbone.py +210 -0
- keras_hub/src/models/densenet/densenet_image_classifier.py +131 -0
- keras_hub/src/models/distil_bert/__init__.py +26 -0
- keras_hub/src/models/distil_bert/distil_bert_backbone.py +187 -0
- keras_hub/src/models/distil_bert/distil_bert_classifier.py +208 -0
- keras_hub/src/models/distil_bert/distil_bert_masked_lm.py +137 -0
- keras_hub/src/models/distil_bert/distil_bert_masked_lm_preprocessor.py +194 -0
- keras_hub/src/models/distil_bert/distil_bert_preprocessor.py +175 -0
- keras_hub/src/models/distil_bert/distil_bert_presets.py +57 -0
- keras_hub/src/models/distil_bert/distil_bert_tokenizer.py +114 -0
- keras_hub/src/models/electra/__init__.py +20 -0
- keras_hub/src/models/electra/electra_backbone.py +247 -0
- keras_hub/src/models/electra/electra_preprocessor.py +154 -0
- keras_hub/src/models/electra/electra_presets.py +95 -0
- keras_hub/src/models/electra/electra_tokenizer.py +104 -0
- keras_hub/src/models/f_net/__init__.py +20 -0
- keras_hub/src/models/f_net/f_net_backbone.py +236 -0
- keras_hub/src/models/f_net/f_net_classifier.py +154 -0
- keras_hub/src/models/f_net/f_net_masked_lm.py +132 -0
- keras_hub/src/models/f_net/f_net_masked_lm_preprocessor.py +196 -0
- keras_hub/src/models/f_net/f_net_preprocessor.py +177 -0
- keras_hub/src/models/f_net/f_net_presets.py +43 -0
- keras_hub/src/models/f_net/f_net_tokenizer.py +95 -0
- keras_hub/src/models/falcon/__init__.py +20 -0
- keras_hub/src/models/falcon/falcon_attention.py +156 -0
- keras_hub/src/models/falcon/falcon_backbone.py +164 -0
- keras_hub/src/models/falcon/falcon_causal_lm.py +291 -0
- keras_hub/src/models/falcon/falcon_causal_lm_preprocessor.py +173 -0
- keras_hub/src/models/falcon/falcon_preprocessor.py +187 -0
- keras_hub/src/models/falcon/falcon_presets.py +30 -0
- keras_hub/src/models/falcon/falcon_tokenizer.py +110 -0
- keras_hub/src/models/falcon/falcon_transformer_decoder.py +255 -0
- keras_hub/src/models/feature_pyramid_backbone.py +73 -0
- keras_hub/src/models/gemma/__init__.py +20 -0
- keras_hub/src/models/gemma/gemma_attention.py +250 -0
- keras_hub/src/models/gemma/gemma_backbone.py +316 -0
- keras_hub/src/models/gemma/gemma_causal_lm.py +448 -0
- keras_hub/src/models/gemma/gemma_causal_lm_preprocessor.py +167 -0
- keras_hub/src/models/gemma/gemma_decoder_block.py +241 -0
- keras_hub/src/models/gemma/gemma_preprocessor.py +191 -0
- keras_hub/src/models/gemma/gemma_presets.py +248 -0
- keras_hub/src/models/gemma/gemma_tokenizer.py +103 -0
- keras_hub/src/models/gemma/rms_normalization.py +40 -0
- keras_hub/src/models/gpt2/__init__.py +20 -0
- keras_hub/src/models/gpt2/gpt2_backbone.py +199 -0
- keras_hub/src/models/gpt2/gpt2_causal_lm.py +437 -0
- keras_hub/src/models/gpt2/gpt2_causal_lm_preprocessor.py +173 -0
- keras_hub/src/models/gpt2/gpt2_preprocessor.py +187 -0
- keras_hub/src/models/gpt2/gpt2_presets.py +82 -0
- keras_hub/src/models/gpt2/gpt2_tokenizer.py +110 -0
- keras_hub/src/models/gpt_neo_x/__init__.py +13 -0
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_attention.py +251 -0
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_backbone.py +175 -0
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm.py +201 -0
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm_preprocessor.py +141 -0
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_decoder.py +258 -0
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_preprocessor.py +145 -0
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_tokenizer.py +88 -0
- keras_hub/src/models/image_classifier.py +90 -0
- keras_hub/src/models/llama/__init__.py +20 -0
- keras_hub/src/models/llama/llama_attention.py +225 -0
- keras_hub/src/models/llama/llama_backbone.py +188 -0
- keras_hub/src/models/llama/llama_causal_lm.py +327 -0
- keras_hub/src/models/llama/llama_causal_lm_preprocessor.py +170 -0
- keras_hub/src/models/llama/llama_decoder.py +246 -0
- keras_hub/src/models/llama/llama_layernorm.py +48 -0
- keras_hub/src/models/llama/llama_preprocessor.py +189 -0
- keras_hub/src/models/llama/llama_presets.py +80 -0
- keras_hub/src/models/llama/llama_tokenizer.py +84 -0
- keras_hub/src/models/llama3/__init__.py +20 -0
- keras_hub/src/models/llama3/llama3_backbone.py +84 -0
- keras_hub/src/models/llama3/llama3_causal_lm.py +46 -0
- keras_hub/src/models/llama3/llama3_causal_lm_preprocessor.py +173 -0
- keras_hub/src/models/llama3/llama3_preprocessor.py +21 -0
- keras_hub/src/models/llama3/llama3_presets.py +69 -0
- keras_hub/src/models/llama3/llama3_tokenizer.py +63 -0
- keras_hub/src/models/masked_lm.py +101 -0
- keras_hub/src/models/mistral/__init__.py +20 -0
- keras_hub/src/models/mistral/mistral_attention.py +238 -0
- keras_hub/src/models/mistral/mistral_backbone.py +203 -0
- keras_hub/src/models/mistral/mistral_causal_lm.py +328 -0
- keras_hub/src/models/mistral/mistral_causal_lm_preprocessor.py +175 -0
- keras_hub/src/models/mistral/mistral_layer_norm.py +48 -0
- keras_hub/src/models/mistral/mistral_preprocessor.py +190 -0
- keras_hub/src/models/mistral/mistral_presets.py +48 -0
- keras_hub/src/models/mistral/mistral_tokenizer.py +82 -0
- keras_hub/src/models/mistral/mistral_transformer_decoder.py +265 -0
- keras_hub/src/models/mix_transformer/__init__.py +13 -0
- keras_hub/src/models/mix_transformer/mix_transformer_backbone.py +181 -0
- keras_hub/src/models/mix_transformer/mix_transformer_classifier.py +133 -0
- keras_hub/src/models/mix_transformer/mix_transformer_layers.py +300 -0
- keras_hub/src/models/opt/__init__.py +20 -0
- keras_hub/src/models/opt/opt_backbone.py +173 -0
- keras_hub/src/models/opt/opt_causal_lm.py +301 -0
- keras_hub/src/models/opt/opt_causal_lm_preprocessor.py +177 -0
- keras_hub/src/models/opt/opt_preprocessor.py +188 -0
- keras_hub/src/models/opt/opt_presets.py +72 -0
- keras_hub/src/models/opt/opt_tokenizer.py +116 -0
- keras_hub/src/models/pali_gemma/__init__.py +23 -0
- keras_hub/src/models/pali_gemma/pali_gemma_backbone.py +277 -0
- keras_hub/src/models/pali_gemma/pali_gemma_causal_lm.py +313 -0
- keras_hub/src/models/pali_gemma/pali_gemma_causal_lm_preprocessor.py +147 -0
- keras_hub/src/models/pali_gemma/pali_gemma_decoder_block.py +160 -0
- keras_hub/src/models/pali_gemma/pali_gemma_presets.py +78 -0
- keras_hub/src/models/pali_gemma/pali_gemma_tokenizer.py +79 -0
- keras_hub/src/models/pali_gemma/pali_gemma_vit.py +566 -0
- keras_hub/src/models/phi3/__init__.py +20 -0
- keras_hub/src/models/phi3/phi3_attention.py +260 -0
- keras_hub/src/models/phi3/phi3_backbone.py +224 -0
- keras_hub/src/models/phi3/phi3_causal_lm.py +218 -0
- keras_hub/src/models/phi3/phi3_causal_lm_preprocessor.py +173 -0
- keras_hub/src/models/phi3/phi3_decoder.py +260 -0
- keras_hub/src/models/phi3/phi3_layernorm.py +48 -0
- keras_hub/src/models/phi3/phi3_preprocessor.py +190 -0
- keras_hub/src/models/phi3/phi3_presets.py +50 -0
- keras_hub/src/models/phi3/phi3_rotary_embedding.py +137 -0
- keras_hub/src/models/phi3/phi3_tokenizer.py +94 -0
- keras_hub/src/models/preprocessor.py +207 -0
- keras_hub/src/models/resnet/__init__.py +13 -0
- keras_hub/src/models/resnet/resnet_backbone.py +612 -0
- keras_hub/src/models/resnet/resnet_image_classifier.py +136 -0
- keras_hub/src/models/roberta/__init__.py +20 -0
- keras_hub/src/models/roberta/roberta_backbone.py +184 -0
- keras_hub/src/models/roberta/roberta_classifier.py +209 -0
- keras_hub/src/models/roberta/roberta_masked_lm.py +136 -0
- keras_hub/src/models/roberta/roberta_masked_lm_preprocessor.py +198 -0
- keras_hub/src/models/roberta/roberta_preprocessor.py +192 -0
- keras_hub/src/models/roberta/roberta_presets.py +43 -0
- keras_hub/src/models/roberta/roberta_tokenizer.py +132 -0
- keras_hub/src/models/seq_2_seq_lm.py +54 -0
- keras_hub/src/models/t5/__init__.py +20 -0
- keras_hub/src/models/t5/t5_backbone.py +261 -0
- keras_hub/src/models/t5/t5_layer_norm.py +35 -0
- keras_hub/src/models/t5/t5_multi_head_attention.py +324 -0
- keras_hub/src/models/t5/t5_presets.py +95 -0
- keras_hub/src/models/t5/t5_tokenizer.py +100 -0
- keras_hub/src/models/t5/t5_transformer_layer.py +178 -0
- keras_hub/src/models/task.py +419 -0
- keras_hub/src/models/vgg/__init__.py +13 -0
- keras_hub/src/models/vgg/vgg_backbone.py +158 -0
- keras_hub/src/models/vgg/vgg_image_classifier.py +124 -0
- keras_hub/src/models/vit_det/__init__.py +13 -0
- keras_hub/src/models/vit_det/vit_det_backbone.py +204 -0
- keras_hub/src/models/vit_det/vit_layers.py +565 -0
- keras_hub/src/models/whisper/__init__.py +20 -0
- keras_hub/src/models/whisper/whisper_audio_feature_extractor.py +260 -0
- keras_hub/src/models/whisper/whisper_backbone.py +305 -0
- keras_hub/src/models/whisper/whisper_cached_multi_head_attention.py +153 -0
- keras_hub/src/models/whisper/whisper_decoder.py +141 -0
- keras_hub/src/models/whisper/whisper_encoder.py +106 -0
- keras_hub/src/models/whisper/whisper_preprocessor.py +326 -0
- keras_hub/src/models/whisper/whisper_presets.py +148 -0
- keras_hub/src/models/whisper/whisper_tokenizer.py +163 -0
- keras_hub/src/models/xlm_roberta/__init__.py +26 -0
- keras_hub/src/models/xlm_roberta/xlm_roberta_backbone.py +81 -0
- keras_hub/src/models/xlm_roberta/xlm_roberta_classifier.py +225 -0
- keras_hub/src/models/xlm_roberta/xlm_roberta_masked_lm.py +141 -0
- keras_hub/src/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py +195 -0
- keras_hub/src/models/xlm_roberta/xlm_roberta_preprocessor.py +205 -0
- keras_hub/src/models/xlm_roberta/xlm_roberta_presets.py +43 -0
- keras_hub/src/models/xlm_roberta/xlm_roberta_tokenizer.py +191 -0
- keras_hub/src/models/xlnet/__init__.py +13 -0
- keras_hub/src/models/xlnet/relative_attention.py +459 -0
- keras_hub/src/models/xlnet/xlnet_backbone.py +222 -0
- keras_hub/src/models/xlnet/xlnet_content_and_query_embedding.py +133 -0
- keras_hub/src/models/xlnet/xlnet_encoder.py +378 -0
- keras_hub/src/samplers/__init__.py +13 -0
- keras_hub/src/samplers/beam_sampler.py +207 -0
- keras_hub/src/samplers/contrastive_sampler.py +231 -0
- keras_hub/src/samplers/greedy_sampler.py +50 -0
- keras_hub/src/samplers/random_sampler.py +77 -0
- keras_hub/src/samplers/sampler.py +237 -0
- keras_hub/src/samplers/serialization.py +97 -0
- keras_hub/src/samplers/top_k_sampler.py +92 -0
- keras_hub/src/samplers/top_p_sampler.py +113 -0
- keras_hub/src/tests/__init__.py +13 -0
- keras_hub/src/tests/test_case.py +608 -0
- keras_hub/src/tokenizers/__init__.py +13 -0
- keras_hub/src/tokenizers/byte_pair_tokenizer.py +638 -0
- keras_hub/src/tokenizers/byte_tokenizer.py +299 -0
- keras_hub/src/tokenizers/sentence_piece_tokenizer.py +267 -0
- keras_hub/src/tokenizers/sentence_piece_tokenizer_trainer.py +150 -0
- keras_hub/src/tokenizers/tokenizer.py +235 -0
- keras_hub/src/tokenizers/unicode_codepoint_tokenizer.py +355 -0
- keras_hub/src/tokenizers/word_piece_tokenizer.py +544 -0
- keras_hub/src/tokenizers/word_piece_tokenizer_trainer.py +176 -0
- keras_hub/src/utils/__init__.py +13 -0
- keras_hub/src/utils/keras_utils.py +130 -0
- keras_hub/src/utils/pipeline_model.py +293 -0
- keras_hub/src/utils/preset_utils.py +621 -0
- keras_hub/src/utils/python_utils.py +21 -0
- keras_hub/src/utils/tensor_utils.py +206 -0
- keras_hub/src/utils/timm/__init__.py +13 -0
- keras_hub/src/utils/timm/convert.py +37 -0
- keras_hub/src/utils/timm/convert_resnet.py +171 -0
- keras_hub/src/utils/transformers/__init__.py +13 -0
- keras_hub/src/utils/transformers/convert.py +101 -0
- keras_hub/src/utils/transformers/convert_bert.py +173 -0
- keras_hub/src/utils/transformers/convert_distilbert.py +184 -0
- keras_hub/src/utils/transformers/convert_gemma.py +187 -0
- keras_hub/src/utils/transformers/convert_gpt2.py +186 -0
- keras_hub/src/utils/transformers/convert_llama3.py +136 -0
- keras_hub/src/utils/transformers/convert_pali_gemma.py +303 -0
- keras_hub/src/utils/transformers/safetensor_utils.py +97 -0
- keras_hub/src/version_utils.py +23 -0
- keras_hub_nightly-0.15.0.dev20240823171555.dist-info/METADATA +34 -0
- keras_hub_nightly-0.15.0.dev20240823171555.dist-info/RECORD +297 -0
- keras_hub_nightly-0.15.0.dev20240823171555.dist-info/WHEEL +5 -0
- keras_hub_nightly-0.15.0.dev20240823171555.dist-info/top_level.txt +1 -0
keras_hub/src/tokenizers/byte_tokenizer.py
@@ -0,0 +1,299 @@
# Copyright 2024 The KerasHub Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np

try:
    import tensorflow as tf
except ImportError:
    raise ImportError(
        "To use `keras_hub`, please install TensorFlow: `pip install tensorflow`. "
        "The TensorFlow package is required for data preprocessing with any backend."
    )

from keras_hub.src.api_export import keras_hub_export
from keras_hub.src.tokenizers import tokenizer
from keras_hub.src.utils.tensor_utils import convert_to_ragged_batch
from keras_hub.src.utils.tensor_utils import is_int_dtype

try:
    import tensorflow_text as tf_text
except ImportError:
    tf_text = None


@keras_hub_export("keras_hub.tokenizers.ByteTokenizer")
class ByteTokenizer(tokenizer.Tokenizer):
    """Raw byte tokenizer.

    This tokenizer is a vocabulary-free tokenizer which will tokenize text
    as raw bytes from [0, 256).

    Tokenizer outputs can either be padded and truncated with a
    `sequence_length` argument, or left un-truncated. The exact output will
    depend on the rank of the input tensors.

    If input is a batch of strings:
    By default, the layer will output a `tf.RaggedTensor` where the last
    dimension of the output is ragged. If `sequence_length` is set, the layer
    will output a dense `tf.Tensor` where all inputs have been padded or
    truncated to `sequence_length`.

    If input is a scalar string:
    There are two cases here. If `sequence_length` is set, the output will be
    a dense `tf.Tensor` of shape `[sequence_length]`. Otherwise, the output will
    be a dense `tf.Tensor` of shape `[None]`.

    The output dtype can be controlled via the
    `dtype` argument, which should be an integer type
    ("int16", "int32", etc.).

    Args:
        lowercase: boolean. If True, the input text will be converted to
            lowercase before tokenization.
        sequence_length: int. If set, the output will be converted to a dense
            tensor and padded/trimmed so all outputs are of sequence_length.
        normalization_form: string. One of the following values: (None, "NFC",
            "NFKC", "NFD", "NFKD"). If set, every UTF-8 string in the input
            tensor text will be normalized to the given form before tokenizing.
        errors: One of ('strict', 'replace', 'ignore'). Specifies the
            `detokenize()` behavior when an invalid byte sequence is encountered.
            The value of `'strict'` will cause the operation to produce an
            `InvalidArgument` error on any invalid input formatting. A value of
            `'replace'` will cause the tokenizer to replace any invalid
            formatting in the input with the `replacement_char` codepoint.
            A value of `'ignore'` will cause the tokenizer to skip any invalid
            formatting in the input and produce no corresponding output
            character.
        replacement_char: int. The replacement character to
            use when an invalid byte sequence is encountered and when `errors`
            is set to "replace" (same behaviour as
            https://www.tensorflow.org/api_docs/python/tf/strings/unicode_transcode).
            Defaults to `65533` (U+FFFD).

    Examples:

    Basic usage.
    >>> tokenizer = keras_hub.tokenizers.ByteTokenizer()
    >>> outputs = tokenizer("hello")
    >>> np.array(outputs)
    array([104, 101, 108, 108, 111], dtype=int32)

    Ragged outputs.
    >>> inputs = ["hello", "hi"]
    >>> tokenizer = keras_hub.tokenizers.ByteTokenizer()
    >>> seq1, seq2 = tokenizer(inputs)
    >>> np.array(seq1)
    array([104, 101, 108, 108, 111], dtype=int32)
    >>> np.array(seq2)
    array([104, 105], dtype=int32)

    Dense outputs.
    >>> inputs = ["hello", "hi"]
    >>> tokenizer = keras_hub.tokenizers.ByteTokenizer(sequence_length=8)
    >>> seq1, seq2 = tokenizer(inputs)
    >>> np.array(seq1)
    array([104, 101, 108, 108, 111, 0, 0, 0], dtype=int32)
    >>> np.array(seq2)
    array([104, 105, 0, 0, 0, 0, 0, 0], dtype=int32)

    Tokenize, then batch for ragged outputs.
    >>> tokenizer = keras_hub.tokenizers.ByteTokenizer()
    >>> ds = tf.data.Dataset.from_tensor_slices(["hello", "fun"])
    >>> ds = ds.map(tokenizer)
    >>> ds = ds.apply(tf.data.experimental.dense_to_ragged_batch(2))
    >>> ds.take(1).get_single_element()
    <tf.RaggedTensor [[104, 101, 108, 108, 111], [102, 117, 110]]>

    Batch, then tokenize for ragged outputs.
    >>> tokenizer = keras_hub.tokenizers.ByteTokenizer()
    >>> ds = tf.data.Dataset.from_tensor_slices(["hello", "fun"])
    >>> ds = ds.batch(2).map(tokenizer)
    >>> ds.take(1).get_single_element()
    <tf.RaggedTensor [[104, 101, 108, 108, 111], [102, 117, 110]]>

    Tokenize, then batch for dense outputs (`sequence_length` provided).
    >>> tokenizer = keras_hub.tokenizers.ByteTokenizer(sequence_length=5)
    >>> ds = tf.data.Dataset.from_tensor_slices(["hello", "fun"])
    >>> ds = ds.map(tokenizer)
    >>> ds = ds.apply(tf.data.experimental.dense_to_ragged_batch(2))
    >>> ds.take(1).get_single_element()
    <tf.Tensor: shape=(2, 5), dtype=int32, numpy=
    array([[104, 101, 108, 108, 111],
           [102, 117, 110,   0,   0]], dtype=int32)>

    Batch, then tokenize for dense outputs (`sequence_length` provided).
    >>> tokenizer = keras_hub.tokenizers.ByteTokenizer(sequence_length=5)
    >>> ds = tf.data.Dataset.from_tensor_slices(["hello", "fun"])
    >>> ds = ds.batch(2).map(tokenizer)
    >>> ds.take(1).get_single_element()
    <tf.Tensor: shape=(2, 5), dtype=int32, numpy=
    array([[104, 101, 108, 108, 111],
           [102, 117, 110,   0,   0]], dtype=int32)>

    Detokenization.
    >>> inputs = [104, 101, 108, 108, 111]
    >>> tokenizer = keras_hub.tokenizers.ByteTokenizer()
    >>> outputs = tokenizer.detokenize(inputs)
    >>> np.array(outputs).astype("U")
    array('hello', dtype='<U5')

    Detokenization with invalid bytes.
    >>> # The 255 below is invalid utf-8.
    >>> inputs = [104, 101, 255, 108, 108, 111]
    >>> tokenizer = keras_hub.tokenizers.ByteTokenizer(
    ...     errors="replace", replacement_char=88)
    >>> outputs = tokenizer.detokenize(inputs)
    >>> np.array(outputs).astype("U")
    array('heXllo', dtype='<U6')
    """

    def __init__(
        self,
        lowercase=True,
        sequence_length=None,
        normalization_form=None,
        errors="replace",
        replacement_char=65533,
        dtype="int32",
        **kwargs,
    ):
        if not is_int_dtype(dtype):
            raise ValueError(
                "Output dtype must be an integer type. "
                f"Received: dtype={dtype}"
            )

        # Check normalization_form.
        if normalization_form not in (None, "NFC", "NFKC", "NFD", "NFKD"):
            raise ValueError(
                '`normalization_form` must be one of None, "NFC", "NFKC", '
                '"NFD", "NFKD". Received: normalization_form='
                f"{normalization_form}"
            )

        # Check errors.
        if errors not in ("strict", "replace", "ignore"):
            raise ValueError(
                '`errors` must be one of "strict", "replace", "ignore" '
                f"Received: errors={errors}"
            )

        super().__init__(dtype=dtype, **kwargs)

        self.lowercase = lowercase
        self.sequence_length = sequence_length
        self.normalization_form = normalization_form
        self.errors = errors
        self.replacement_char = replacement_char

        self._char_lst = tf.constant(
            [i.tobytes() for i in np.arange(256, dtype=np.uint8)]
        )

    def vocabulary_size(self):
        """Get the integer size of the tokenizer vocabulary."""
        return 256

    def get_vocabulary(self):
        vocab = {}
        for i in range(self.vocabulary_size()):
            vocab[chr(i)] = i
        return vocab

    def tokenize(self, inputs):
        if not isinstance(inputs, (tf.Tensor, tf.RaggedTensor)):
            inputs = tf.convert_to_tensor(inputs)

        scalar_input = inputs.shape.rank == 0
        if scalar_input:
            inputs = tf.expand_dims(inputs, 0)

        # Optional: Lowercase the input.
        if self.lowercase:
            inputs = tf_text.case_fold_utf8(inputs)

        # Optional: Normalize unicode.
        if self.normalization_form is not None:
            inputs = tf_text.normalize_utf8(inputs, self.normalization_form)

        # Tokenize input strings.
        tokens = tf.strings.bytes_split(inputs)
        tokens = tf.squeeze(
            tf.ragged.map_flat_values(tf.io.decode_raw, tokens, tf.uint8), -1
        )
        tokens = tf.cast(tokens, self.compute_dtype)

        # Convert to a dense output if `sequence_length` is set.
        if self.sequence_length:
            output_shape = tokens.shape.as_list()
            output_shape[-1] = self.sequence_length
            tokens = tokens.to_tensor(shape=output_shape)

        if scalar_input:
            tokens = tf.squeeze(tokens, 0)
        return tokens

    def detokenize(self, inputs):
        inputs, unbatched, _ = convert_to_ragged_batch(inputs)
        # Remove trailing padding tokens, so that trailing "\x00" bytes don't
        # show up in the detokenized output.
        inputs = tf.ragged.boolean_mask(inputs, tf.not_equal(inputs, 0))

        outputs = tf.strings.reduce_join(
            tf.gather(self._char_lst, inputs), axis=-1
        )

        # Handle errors if an invalid byte sequence is encountered.
        outputs = tf.strings.unicode_transcode(
            outputs,
            "UTF-8",
            "UTF-8",
            errors=self.errors,
            replacement_char=self.replacement_char,
        )
        if unbatched:
            outputs = tf.squeeze(outputs, 0)
        return outputs

    def id_to_token(self, id):
        """Convert an integer id to a string token."""
        if id >= self.vocabulary_size() or id < 0:
            raise ValueError(
                f"`id` must be in range [0, {self.vocabulary_size() - 1}]. "
                f"Received: {id}"
            )
        return chr(id)

    def token_to_id(self, token):
        """Convert a string token to an integer id."""
        id = ord(token)
        if id >= self.vocabulary_size():
            raise ValueError(
                f"Token {token} is not supported by `ByteTokenizer`."
            )
        return id

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "lowercase": self.lowercase,
                "sequence_length": self.sequence_length,
                "normalization_form": self.normalization_form,
                "errors": self.errors,
                "replacement_char": self.replacement_char,
            }
        )
        return config
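The `ByteTokenizer` above is vocabulary-free, so it can be tried without any assets. Below is a minimal sketch, assuming this nightly wheel is installed (`pip install keras-hub-nightly`) along with `tensorflow` and `tensorflow-text`; it pads a small batch to a fixed length and round-trips it through `detokenize()`, which masks out trailing zero padding as noted in the code above. The printed outputs are what the docstring examples suggest, not verified against this exact build.

```python
import numpy as np

import keras_hub

# Fixed-length byte tokenization: pad/truncate every input to 8 token ids.
tokenizer = keras_hub.tokenizers.ByteTokenizer(sequence_length=8)
token_ids = tokenizer(["hello", "hi"])
print(np.array(token_ids))
# Expected (per the docstring): [[104 101 108 108 111 0 0 0], [104 105 0 0 0 0 0 0]]

# Round trip: trailing 0 padding is removed before the bytes are joined.
print(np.array(tokenizer.detokenize(token_ids)).astype("U"))
# Expected: ['hello' 'hi']
```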
keras_hub/src/tokenizers/sentence_piece_tokenizer.py
@@ -0,0 +1,267 @@
# Copyright 2024 The KerasHub Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import base64
import binascii
import os

import keras

try:
    import tensorflow as tf
except ImportError:
    raise ImportError(
        "To use `keras_hub`, please install TensorFlow: `pip install tensorflow`. "
        "The TensorFlow package is required for data preprocessing with any backend."
    )

from keras_hub.src.api_export import keras_hub_export
from keras_hub.src.tokenizers import tokenizer
from keras_hub.src.utils.tensor_utils import convert_to_ragged_batch
from keras_hub.src.utils.tensor_utils import is_int_dtype
from keras_hub.src.utils.tensor_utils import is_string_dtype
from keras_hub.src.utils.tensor_utils import tensor_to_list

try:
    import tensorflow_text as tf_text
except ImportError:
    tf_text = None


VOCAB_FILENAME = "vocabulary.spm"


@keras_hub_export("keras_hub.tokenizers.SentencePieceTokenizer")
class SentencePieceTokenizer(tokenizer.Tokenizer):
    """A SentencePiece tokenizer layer.

    This layer provides an implementation of SentencePiece tokenization
    as described in the [SentencePiece paper](https://arxiv.org/abs/1808.06226)
    and the [SentencePiece package](https://pypi.org/project/sentencepiece/).
    The tokenization will run entirely within the TensorFlow graph, and can
    be saved inside a `keras.Model`.

    By default, the layer will output a `tf.RaggedTensor` where the last
    dimension of the output is ragged after whitespace splitting and sub-word
    tokenizing. If `sequence_length` is set, the layer will output a dense
    `tf.Tensor` where all inputs have been padded or truncated to
    `sequence_length`. The output dtype can be controlled via the `dtype`
    argument, which should be either an integer or string type.

    Args:
        proto: Either a `string` path to a SentencePiece proto file, or a
            `bytes` object with a serialized SentencePiece proto. See the
            [SentencePiece repository](https://github.com/google/sentencepiece)
            for more details on the format.
        sequence_length: If set, the output will be converted to a dense
            tensor and padded/trimmed so all outputs are of `sequence_length`.

    References:
        - [Kudo and Richardson, 2018](https://arxiv.org/abs/1808.06226)

    Examples:

    From bytes.
    ```python
    def train_sentence_piece_bytes(ds, size):
        bytes_io = io.BytesIO()
        sentencepiece.SentencePieceTrainer.train(
            sentence_iterator=ds.as_numpy_iterator(),
            model_writer=bytes_io,
            vocab_size=size,
        )
        return bytes_io.getvalue()

    # Train a sentencepiece proto.
    ds = tf.data.Dataset.from_tensor_slices(["the quick brown fox."])
    proto = train_sentence_piece_bytes(ds, 20)
    # Tokenize inputs.
    tokenizer = keras_hub.tokenizers.SentencePieceTokenizer(proto=proto)
    ds = ds.map(tokenizer)
    ```

    From a file.
    ```python
    def train_sentence_piece_file(ds, path, size):
        with open(path, "wb") as model_file:
            sentencepiece.SentencePieceTrainer.train(
                sentence_iterator=ds.as_numpy_iterator(),
                model_writer=model_file,
                vocab_size=size,
            )

    # Train a sentencepiece proto.
    ds = tf.data.Dataset.from_tensor_slices(["the quick brown fox."])
    proto = train_sentence_piece_file(ds, "model.spm", 20)
    # Tokenize inputs.
    tokenizer = keras_hub.tokenizers.SentencePieceTokenizer(proto="model.spm")
    ds = ds.map(tokenizer)
    ```
    """

    def __init__(
        self,
        proto=None,
        sequence_length=None,
        dtype="int32",
        **kwargs,
    ) -> None:
        if not is_int_dtype(dtype) and not is_string_dtype(dtype):
            raise ValueError(
                "Output dtype must be an integer type or a string. "
                f"Received: dtype={dtype}"
            )

        super().__init__(dtype=dtype, **kwargs)

        self.proto = None
        self.sequence_length = sequence_length
        self.set_proto(proto)
        self.file_assets = [VOCAB_FILENAME]

    def save_assets(self, dir_path):
        path = os.path.join(dir_path, VOCAB_FILENAME)
        with open(path, "wb") as file:
            file.write(self.proto)

    def load_assets(self, dir_path):
        path = os.path.join(dir_path, VOCAB_FILENAME)
        self.set_proto(path)

    def set_proto(self, proto):
        if proto is None:
            self.proto = None
            self._sentence_piece = None
            return

        if isinstance(proto, str):
            # A string could be either a filepath, or a base64 encoded byte
            # array (which we need for serialization). We will heuristically
            # try to distinguish, by checking if a string is longer than
            # 2048 characters and contains only valid base64 characters.
            is_base64 = False
            if len(proto) > 2048:
                try:
                    proto_bytes = base64.b64decode(proto, validate=True)
                    is_base64 = True
                except binascii.Error:
                    pass
            if not is_base64:
                proto_bytes = open(proto, "rb").read()
        elif isinstance(proto, bytes):
            proto_bytes = proto
        else:
            raise ValueError(
                "SentencePiece `proto` argument should be either a `string` "
                f"filepath or a `bytes` sequence. "
                f"Received unknown type: {type(proto)}"
            )

        self._sentence_piece = tf_text.SentencepieceTokenizer(
            model=proto_bytes,
            out_type=self.compute_dtype,
        )
        # Keras cannot serialize a bytestring, so we base64 encode the model
        # byte array as a string for saving.
        self.proto = proto_bytes

    def vocabulary_size(self):
        """Get the integer size of the tokenizer vocabulary."""
        self._check_vocabulary()
        return int(self._sentence_piece.vocab_size().numpy())

    def get_vocabulary(self):
        """Get the tokenizer vocabulary."""
        self._check_vocabulary()
        return tensor_to_list(
            self._sentence_piece.id_to_string(
                tf.range(int(self._sentence_piece.vocab_size().numpy()))
            )
        )

    def id_to_token(self, id):
        """Convert an integer id to a string token."""
        self._check_vocabulary()
        if id >= self.vocabulary_size() or id < 0:
            raise ValueError(
                f"`id` must be in range [0, {self.vocabulary_size() - 1}]. "
                f"Received: {id}"
            )
        return tensor_to_list(self._sentence_piece.id_to_string(id))

    def token_to_id(self, token):
        """Convert a string token to an integer id."""
        self._check_vocabulary()
        return int(self._sentence_piece.string_to_id(token).numpy())

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "proto": None,  # Save vocabulary via an asset!
                "sequence_length": self.sequence_length,
            }
        )
        return config

    def _check_vocabulary(self):
        if self.proto is None:
            raise ValueError(
                "No vocabulary has been set for SentencePieceTokenizer. Make "
                "sure to pass a `proto` argument when creating the layer."
            )

    def tokenize(self, inputs):
        self._check_vocabulary()
        if not isinstance(inputs, (tf.Tensor, tf.RaggedTensor)):
            inputs = tf.convert_to_tensor(inputs)
        scalar_input = inputs.shape.rank == 0
        if scalar_input:
            inputs = tf.expand_dims(inputs, 0)

        if self._sentence_piece is None:
            raise ValueError(
                "No vocabulary has been set for SentencePieceTokenizer. Make "
                "sure to pass a `proto` argument when creating the layer."
            )

        tokens = self._sentence_piece.tokenize(inputs)

        # Convert to a dense output if `sequence_length` is set.
        if self.sequence_length:
            output_shape = tokens.shape.as_list()
            output_shape[-1] = self.sequence_length
            tokens = tokens.to_tensor(shape=output_shape)

        # Convert to a dense output if input was a scalar.
        if scalar_input:
            tokens = tf.squeeze(tokens, 0)
            tf.ensure_shape(tokens, shape=[self.sequence_length])

        return tokens

    def detokenize(self, inputs):
        self._check_vocabulary()
        inputs, unbatched, _ = convert_to_ragged_batch(inputs)
        # tf-text sentencepiece does not handle int64.
        inputs = tf.cast(inputs, "int32")
        outputs = self._sentence_piece.detokenize(inputs)
        if unbatched:
            outputs = tf.squeeze(outputs, 0)
        return outputs

    def compute_output_spec(self, input_spec):
        return keras.KerasTensor(
            input_spec.shape + (self.sequence_length,), dtype=self.compute_dtype
        )
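As an end-to-end check of the `SentencePieceTokenizer` above, the sketch below mirrors the "From bytes" docstring example: it trains a tiny proto in memory with the `sentencepiece` package and passes the serialized bytes straight to the layer, which is the same path `set_proto()` takes for a `bytes` input. The one-sentence corpus and `vocab_size=20` are purely illustrative, and the sketch assumes `sentencepiece`, `tensorflow`, and `tensorflow-text` are installed alongside this nightly wheel.

```python
import io

import sentencepiece

import keras_hub

# Train a tiny SentencePiece model in memory and keep the serialized proto.
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
    sentence_iterator=iter(["the quick brown fox jumped."]),
    model_writer=bytes_io,
    vocab_size=20,
)

# `proto` accepts the raw bytes directly (or a filepath to a saved model).
tokenizer = keras_hub.tokenizers.SentencePieceTokenizer(proto=bytes_io.getvalue())
token_ids = tokenizer("the quick brown fox jumped.")
print(token_ids)                        # 1-D tensor of sub-word ids.
print(tokenizer.detokenize(token_ids))  # Round trip back to text.
```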