PyPI - keras-hub-nightly - Versions diffs - 0.15.0.dev20240823171555__py3-none-any.whl - Mend

keras-hub-nightly 0.15.0.dev20240823171555__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (297) hide show

keras_hub/__init__.py +52 -0
keras_hub/api/__init__.py +27 -0
keras_hub/api/layers/__init__.py +47 -0
keras_hub/api/metrics/__init__.py +24 -0
keras_hub/api/models/__init__.py +249 -0
keras_hub/api/samplers/__init__.py +29 -0
keras_hub/api/tokenizers/__init__.py +35 -0
keras_hub/src/__init__.py +13 -0
keras_hub/src/api_export.py +53 -0
keras_hub/src/layers/__init__.py +13 -0
keras_hub/src/layers/modeling/__init__.py +13 -0
keras_hub/src/layers/modeling/alibi_bias.py +143 -0
keras_hub/src/layers/modeling/cached_multi_head_attention.py +137 -0
keras_hub/src/layers/modeling/f_net_encoder.py +200 -0
keras_hub/src/layers/modeling/masked_lm_head.py +239 -0
keras_hub/src/layers/modeling/position_embedding.py +123 -0
keras_hub/src/layers/modeling/reversible_embedding.py +311 -0
keras_hub/src/layers/modeling/rotary_embedding.py +169 -0
keras_hub/src/layers/modeling/sine_position_encoding.py +108 -0
keras_hub/src/layers/modeling/token_and_position_embedding.py +150 -0
keras_hub/src/layers/modeling/transformer_decoder.py +496 -0
keras_hub/src/layers/modeling/transformer_encoder.py +262 -0
keras_hub/src/layers/modeling/transformer_layer_utils.py +106 -0
keras_hub/src/layers/preprocessing/__init__.py +13 -0
keras_hub/src/layers/preprocessing/masked_lm_mask_generator.py +220 -0
keras_hub/src/layers/preprocessing/multi_segment_packer.py +319 -0
keras_hub/src/layers/preprocessing/preprocessing_layer.py +62 -0
keras_hub/src/layers/preprocessing/random_deletion.py +271 -0
keras_hub/src/layers/preprocessing/random_swap.py +267 -0
keras_hub/src/layers/preprocessing/start_end_packer.py +219 -0
keras_hub/src/metrics/__init__.py +13 -0
keras_hub/src/metrics/bleu.py +394 -0
keras_hub/src/metrics/edit_distance.py +197 -0
keras_hub/src/metrics/perplexity.py +181 -0
keras_hub/src/metrics/rouge_base.py +204 -0
keras_hub/src/metrics/rouge_l.py +97 -0
keras_hub/src/metrics/rouge_n.py +125 -0
keras_hub/src/models/__init__.py +13 -0
keras_hub/src/models/albert/__init__.py +20 -0
keras_hub/src/models/albert/albert_backbone.py +267 -0
keras_hub/src/models/albert/albert_classifier.py +202 -0
keras_hub/src/models/albert/albert_masked_lm.py +129 -0
keras_hub/src/models/albert/albert_masked_lm_preprocessor.py +194 -0
keras_hub/src/models/albert/albert_preprocessor.py +206 -0
keras_hub/src/models/albert/albert_presets.py +70 -0
keras_hub/src/models/albert/albert_tokenizer.py +119 -0
keras_hub/src/models/backbone.py +311 -0
keras_hub/src/models/bart/__init__.py +20 -0
keras_hub/src/models/bart/bart_backbone.py +261 -0
keras_hub/src/models/bart/bart_preprocessor.py +276 -0
keras_hub/src/models/bart/bart_presets.py +74 -0
keras_hub/src/models/bart/bart_seq_2_seq_lm.py +490 -0
keras_hub/src/models/bart/bart_seq_2_seq_lm_preprocessor.py +262 -0
keras_hub/src/models/bart/bart_tokenizer.py +124 -0
keras_hub/src/models/bert/__init__.py +23 -0
keras_hub/src/models/bert/bert_backbone.py +227 -0
keras_hub/src/models/bert/bert_classifier.py +183 -0
keras_hub/src/models/bert/bert_masked_lm.py +131 -0
keras_hub/src/models/bert/bert_masked_lm_preprocessor.py +198 -0
keras_hub/src/models/bert/bert_preprocessor.py +184 -0
keras_hub/src/models/bert/bert_presets.py +147 -0
keras_hub/src/models/bert/bert_tokenizer.py +112 -0
keras_hub/src/models/bloom/__init__.py +20 -0
keras_hub/src/models/bloom/bloom_attention.py +186 -0
keras_hub/src/models/bloom/bloom_backbone.py +173 -0
keras_hub/src/models/bloom/bloom_causal_lm.py +298 -0
keras_hub/src/models/bloom/bloom_causal_lm_preprocessor.py +176 -0
keras_hub/src/models/bloom/bloom_decoder.py +206 -0
keras_hub/src/models/bloom/bloom_preprocessor.py +185 -0
keras_hub/src/models/bloom/bloom_presets.py +121 -0
keras_hub/src/models/bloom/bloom_tokenizer.py +116 -0
keras_hub/src/models/causal_lm.py +383 -0
keras_hub/src/models/classifier.py +109 -0
keras_hub/src/models/csp_darknet/__init__.py +13 -0
keras_hub/src/models/csp_darknet/csp_darknet_backbone.py +410 -0
keras_hub/src/models/csp_darknet/csp_darknet_image_classifier.py +133 -0
keras_hub/src/models/deberta_v3/__init__.py +24 -0
keras_hub/src/models/deberta_v3/deberta_v3_backbone.py +210 -0
keras_hub/src/models/deberta_v3/deberta_v3_classifier.py +228 -0
keras_hub/src/models/deberta_v3/deberta_v3_masked_lm.py +135 -0
keras_hub/src/models/deberta_v3/deberta_v3_masked_lm_preprocessor.py +191 -0
keras_hub/src/models/deberta_v3/deberta_v3_preprocessor.py +206 -0
keras_hub/src/models/deberta_v3/deberta_v3_presets.py +82 -0
keras_hub/src/models/deberta_v3/deberta_v3_tokenizer.py +155 -0
keras_hub/src/models/deberta_v3/disentangled_attention_encoder.py +227 -0
keras_hub/src/models/deberta_v3/disentangled_self_attention.py +412 -0
keras_hub/src/models/deberta_v3/relative_embedding.py +94 -0
keras_hub/src/models/densenet/__init__.py +13 -0
keras_hub/src/models/densenet/densenet_backbone.py +210 -0
keras_hub/src/models/densenet/densenet_image_classifier.py +131 -0
keras_hub/src/models/distil_bert/__init__.py +26 -0
keras_hub/src/models/distil_bert/distil_bert_backbone.py +187 -0
keras_hub/src/models/distil_bert/distil_bert_classifier.py +208 -0
keras_hub/src/models/distil_bert/distil_bert_masked_lm.py +137 -0
keras_hub/src/models/distil_bert/distil_bert_masked_lm_preprocessor.py +194 -0
keras_hub/src/models/distil_bert/distil_bert_preprocessor.py +175 -0
keras_hub/src/models/distil_bert/distil_bert_presets.py +57 -0
keras_hub/src/models/distil_bert/distil_bert_tokenizer.py +114 -0
keras_hub/src/models/electra/__init__.py +20 -0
keras_hub/src/models/electra/electra_backbone.py +247 -0
keras_hub/src/models/electra/electra_preprocessor.py +154 -0
keras_hub/src/models/electra/electra_presets.py +95 -0
keras_hub/src/models/electra/electra_tokenizer.py +104 -0
keras_hub/src/models/f_net/__init__.py +20 -0
keras_hub/src/models/f_net/f_net_backbone.py +236 -0
keras_hub/src/models/f_net/f_net_classifier.py +154 -0
keras_hub/src/models/f_net/f_net_masked_lm.py +132 -0
keras_hub/src/models/f_net/f_net_masked_lm_preprocessor.py +196 -0
keras_hub/src/models/f_net/f_net_preprocessor.py +177 -0
keras_hub/src/models/f_net/f_net_presets.py +43 -0
keras_hub/src/models/f_net/f_net_tokenizer.py +95 -0
keras_hub/src/models/falcon/__init__.py +20 -0
keras_hub/src/models/falcon/falcon_attention.py +156 -0
keras_hub/src/models/falcon/falcon_backbone.py +164 -0
keras_hub/src/models/falcon/falcon_causal_lm.py +291 -0
keras_hub/src/models/falcon/falcon_causal_lm_preprocessor.py +173 -0
keras_hub/src/models/falcon/falcon_preprocessor.py +187 -0
keras_hub/src/models/falcon/falcon_presets.py +30 -0
keras_hub/src/models/falcon/falcon_tokenizer.py +110 -0
keras_hub/src/models/falcon/falcon_transformer_decoder.py +255 -0
keras_hub/src/models/feature_pyramid_backbone.py +73 -0
keras_hub/src/models/gemma/__init__.py +20 -0
keras_hub/src/models/gemma/gemma_attention.py +250 -0
keras_hub/src/models/gemma/gemma_backbone.py +316 -0
keras_hub/src/models/gemma/gemma_causal_lm.py +448 -0
keras_hub/src/models/gemma/gemma_causal_lm_preprocessor.py +167 -0
keras_hub/src/models/gemma/gemma_decoder_block.py +241 -0
keras_hub/src/models/gemma/gemma_preprocessor.py +191 -0
keras_hub/src/models/gemma/gemma_presets.py +248 -0
keras_hub/src/models/gemma/gemma_tokenizer.py +103 -0
keras_hub/src/models/gemma/rms_normalization.py +40 -0
keras_hub/src/models/gpt2/__init__.py +20 -0
keras_hub/src/models/gpt2/gpt2_backbone.py +199 -0
keras_hub/src/models/gpt2/gpt2_causal_lm.py +437 -0
keras_hub/src/models/gpt2/gpt2_causal_lm_preprocessor.py +173 -0
keras_hub/src/models/gpt2/gpt2_preprocessor.py +187 -0
keras_hub/src/models/gpt2/gpt2_presets.py +82 -0
keras_hub/src/models/gpt2/gpt2_tokenizer.py +110 -0
keras_hub/src/models/gpt_neo_x/__init__.py +13 -0
keras_hub/src/models/gpt_neo_x/gpt_neo_x_attention.py +251 -0
keras_hub/src/models/gpt_neo_x/gpt_neo_x_backbone.py +175 -0
keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm.py +201 -0
keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm_preprocessor.py +141 -0
keras_hub/src/models/gpt_neo_x/gpt_neo_x_decoder.py +258 -0
keras_hub/src/models/gpt_neo_x/gpt_neo_x_preprocessor.py +145 -0
keras_hub/src/models/gpt_neo_x/gpt_neo_x_tokenizer.py +88 -0
keras_hub/src/models/image_classifier.py +90 -0
keras_hub/src/models/llama/__init__.py +20 -0
keras_hub/src/models/llama/llama_attention.py +225 -0
keras_hub/src/models/llama/llama_backbone.py +188 -0
keras_hub/src/models/llama/llama_causal_lm.py +327 -0
keras_hub/src/models/llama/llama_causal_lm_preprocessor.py +170 -0
keras_hub/src/models/llama/llama_decoder.py +246 -0
keras_hub/src/models/llama/llama_layernorm.py +48 -0
keras_hub/src/models/llama/llama_preprocessor.py +189 -0
keras_hub/src/models/llama/llama_presets.py +80 -0
keras_hub/src/models/llama/llama_tokenizer.py +84 -0
keras_hub/src/models/llama3/__init__.py +20 -0
keras_hub/src/models/llama3/llama3_backbone.py +84 -0
keras_hub/src/models/llama3/llama3_causal_lm.py +46 -0
keras_hub/src/models/llama3/llama3_causal_lm_preprocessor.py +173 -0
keras_hub/src/models/llama3/llama3_preprocessor.py +21 -0
keras_hub/src/models/llama3/llama3_presets.py +69 -0
keras_hub/src/models/llama3/llama3_tokenizer.py +63 -0
keras_hub/src/models/masked_lm.py +101 -0
keras_hub/src/models/mistral/__init__.py +20 -0
keras_hub/src/models/mistral/mistral_attention.py +238 -0
keras_hub/src/models/mistral/mistral_backbone.py +203 -0
keras_hub/src/models/mistral/mistral_causal_lm.py +328 -0
keras_hub/src/models/mistral/mistral_causal_lm_preprocessor.py +175 -0
keras_hub/src/models/mistral/mistral_layer_norm.py +48 -0
keras_hub/src/models/mistral/mistral_preprocessor.py +190 -0
keras_hub/src/models/mistral/mistral_presets.py +48 -0
keras_hub/src/models/mistral/mistral_tokenizer.py +82 -0
keras_hub/src/models/mistral/mistral_transformer_decoder.py +265 -0
keras_hub/src/models/mix_transformer/__init__.py +13 -0
keras_hub/src/models/mix_transformer/mix_transformer_backbone.py +181 -0
keras_hub/src/models/mix_transformer/mix_transformer_classifier.py +133 -0
keras_hub/src/models/mix_transformer/mix_transformer_layers.py +300 -0
keras_hub/src/models/opt/__init__.py +20 -0
keras_hub/src/models/opt/opt_backbone.py +173 -0
keras_hub/src/models/opt/opt_causal_lm.py +301 -0
keras_hub/src/models/opt/opt_causal_lm_preprocessor.py +177 -0
keras_hub/src/models/opt/opt_preprocessor.py +188 -0
keras_hub/src/models/opt/opt_presets.py +72 -0
keras_hub/src/models/opt/opt_tokenizer.py +116 -0
keras_hub/src/models/pali_gemma/__init__.py +23 -0
keras_hub/src/models/pali_gemma/pali_gemma_backbone.py +277 -0
keras_hub/src/models/pali_gemma/pali_gemma_causal_lm.py +313 -0
keras_hub/src/models/pali_gemma/pali_gemma_causal_lm_preprocessor.py +147 -0
keras_hub/src/models/pali_gemma/pali_gemma_decoder_block.py +160 -0
keras_hub/src/models/pali_gemma/pali_gemma_presets.py +78 -0
keras_hub/src/models/pali_gemma/pali_gemma_tokenizer.py +79 -0
keras_hub/src/models/pali_gemma/pali_gemma_vit.py +566 -0
keras_hub/src/models/phi3/__init__.py +20 -0
keras_hub/src/models/phi3/phi3_attention.py +260 -0
keras_hub/src/models/phi3/phi3_backbone.py +224 -0
keras_hub/src/models/phi3/phi3_causal_lm.py +218 -0
keras_hub/src/models/phi3/phi3_causal_lm_preprocessor.py +173 -0
keras_hub/src/models/phi3/phi3_decoder.py +260 -0
keras_hub/src/models/phi3/phi3_layernorm.py +48 -0
keras_hub/src/models/phi3/phi3_preprocessor.py +190 -0
keras_hub/src/models/phi3/phi3_presets.py +50 -0
keras_hub/src/models/phi3/phi3_rotary_embedding.py +137 -0
keras_hub/src/models/phi3/phi3_tokenizer.py +94 -0
keras_hub/src/models/preprocessor.py +207 -0
keras_hub/src/models/resnet/__init__.py +13 -0
keras_hub/src/models/resnet/resnet_backbone.py +612 -0
keras_hub/src/models/resnet/resnet_image_classifier.py +136 -0
keras_hub/src/models/roberta/__init__.py +20 -0
keras_hub/src/models/roberta/roberta_backbone.py +184 -0
keras_hub/src/models/roberta/roberta_classifier.py +209 -0
keras_hub/src/models/roberta/roberta_masked_lm.py +136 -0
keras_hub/src/models/roberta/roberta_masked_lm_preprocessor.py +198 -0
keras_hub/src/models/roberta/roberta_preprocessor.py +192 -0
keras_hub/src/models/roberta/roberta_presets.py +43 -0
keras_hub/src/models/roberta/roberta_tokenizer.py +132 -0
keras_hub/src/models/seq_2_seq_lm.py +54 -0
keras_hub/src/models/t5/__init__.py +20 -0
keras_hub/src/models/t5/t5_backbone.py +261 -0
keras_hub/src/models/t5/t5_layer_norm.py +35 -0
keras_hub/src/models/t5/t5_multi_head_attention.py +324 -0
keras_hub/src/models/t5/t5_presets.py +95 -0
keras_hub/src/models/t5/t5_tokenizer.py +100 -0
keras_hub/src/models/t5/t5_transformer_layer.py +178 -0
keras_hub/src/models/task.py +419 -0
keras_hub/src/models/vgg/__init__.py +13 -0
keras_hub/src/models/vgg/vgg_backbone.py +158 -0
keras_hub/src/models/vgg/vgg_image_classifier.py +124 -0
keras_hub/src/models/vit_det/__init__.py +13 -0
keras_hub/src/models/vit_det/vit_det_backbone.py +204 -0
keras_hub/src/models/vit_det/vit_layers.py +565 -0
keras_hub/src/models/whisper/__init__.py +20 -0
keras_hub/src/models/whisper/whisper_audio_feature_extractor.py +260 -0
keras_hub/src/models/whisper/whisper_backbone.py +305 -0
keras_hub/src/models/whisper/whisper_cached_multi_head_attention.py +153 -0
keras_hub/src/models/whisper/whisper_decoder.py +141 -0
keras_hub/src/models/whisper/whisper_encoder.py +106 -0
keras_hub/src/models/whisper/whisper_preprocessor.py +326 -0
keras_hub/src/models/whisper/whisper_presets.py +148 -0
keras_hub/src/models/whisper/whisper_tokenizer.py +163 -0
keras_hub/src/models/xlm_roberta/__init__.py +26 -0
keras_hub/src/models/xlm_roberta/xlm_roberta_backbone.py +81 -0
keras_hub/src/models/xlm_roberta/xlm_roberta_classifier.py +225 -0
keras_hub/src/models/xlm_roberta/xlm_roberta_masked_lm.py +141 -0
keras_hub/src/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py +195 -0
keras_hub/src/models/xlm_roberta/xlm_roberta_preprocessor.py +205 -0
keras_hub/src/models/xlm_roberta/xlm_roberta_presets.py +43 -0
keras_hub/src/models/xlm_roberta/xlm_roberta_tokenizer.py +191 -0
keras_hub/src/models/xlnet/__init__.py +13 -0
keras_hub/src/models/xlnet/relative_attention.py +459 -0
keras_hub/src/models/xlnet/xlnet_backbone.py +222 -0
keras_hub/src/models/xlnet/xlnet_content_and_query_embedding.py +133 -0
keras_hub/src/models/xlnet/xlnet_encoder.py +378 -0
keras_hub/src/samplers/__init__.py +13 -0
keras_hub/src/samplers/beam_sampler.py +207 -0
keras_hub/src/samplers/contrastive_sampler.py +231 -0
keras_hub/src/samplers/greedy_sampler.py +50 -0
keras_hub/src/samplers/random_sampler.py +77 -0
keras_hub/src/samplers/sampler.py +237 -0
keras_hub/src/samplers/serialization.py +97 -0
keras_hub/src/samplers/top_k_sampler.py +92 -0
keras_hub/src/samplers/top_p_sampler.py +113 -0
keras_hub/src/tests/__init__.py +13 -0
keras_hub/src/tests/test_case.py +608 -0
keras_hub/src/tokenizers/__init__.py +13 -0
keras_hub/src/tokenizers/byte_pair_tokenizer.py +638 -0
keras_hub/src/tokenizers/byte_tokenizer.py +299 -0
keras_hub/src/tokenizers/sentence_piece_tokenizer.py +267 -0
keras_hub/src/tokenizers/sentence_piece_tokenizer_trainer.py +150 -0
keras_hub/src/tokenizers/tokenizer.py +235 -0
keras_hub/src/tokenizers/unicode_codepoint_tokenizer.py +355 -0
keras_hub/src/tokenizers/word_piece_tokenizer.py +544 -0
keras_hub/src/tokenizers/word_piece_tokenizer_trainer.py +176 -0
keras_hub/src/utils/__init__.py +13 -0
keras_hub/src/utils/keras_utils.py +130 -0
keras_hub/src/utils/pipeline_model.py +293 -0
keras_hub/src/utils/preset_utils.py +621 -0
keras_hub/src/utils/python_utils.py +21 -0
keras_hub/src/utils/tensor_utils.py +206 -0
keras_hub/src/utils/timm/__init__.py +13 -0
keras_hub/src/utils/timm/convert.py +37 -0
keras_hub/src/utils/timm/convert_resnet.py +171 -0
keras_hub/src/utils/transformers/__init__.py +13 -0
keras_hub/src/utils/transformers/convert.py +101 -0
keras_hub/src/utils/transformers/convert_bert.py +173 -0
keras_hub/src/utils/transformers/convert_distilbert.py +184 -0
keras_hub/src/utils/transformers/convert_gemma.py +187 -0
keras_hub/src/utils/transformers/convert_gpt2.py +186 -0
keras_hub/src/utils/transformers/convert_llama3.py +136 -0
keras_hub/src/utils/transformers/convert_pali_gemma.py +303 -0
keras_hub/src/utils/transformers/safetensor_utils.py +97 -0
keras_hub/src/version_utils.py +23 -0
keras_hub_nightly-0.15.0.dev20240823171555.dist-info/METADATA +34 -0
keras_hub_nightly-0.15.0.dev20240823171555.dist-info/RECORD +297 -0
keras_hub_nightly-0.15.0.dev20240823171555.dist-info/WHEEL +5 -0
keras_hub_nightly-0.15.0.dev20240823171555.dist-info/top_level.txt +1 -0

keras_hub/src/tokenizers/word_piece_tokenizer_trainer.py ADDED Viewed

@@ -0,0 +1,176 @@
+# Copyright 2024 The KerasHub Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.tokenizers.word_piece_tokenizer import pretokenize
+try:
+    import tensorflow as tf
+    from tensorflow_text.tools.wordpiece_vocab import (
+        wordpiece_tokenizer_learner_lib as learner,
+    )
+except ImportError:
+    tf = None
+    learner = None
+@keras_hub_export("keras_hub.tokenizers.compute_word_piece_vocabulary")
+def compute_word_piece_vocabulary(
+    data,
+    vocabulary_size,
+    vocabulary_output_file=None,
+    lowercase=False,
+    strip_accents=False,
+    split=True,
+    split_on_cjk=True,
+    suffix_indicator="##",
+    reserved_tokens=["[PAD]", "[CLS]", "[SEP]", "[UNK]", "[MASK]"],
+):
+    r"""A utility to train a WordPiece vocabulary.
+    Trains a WordPiece vocabulary from an input dataset or a list of filenames.
+    For custom data loading and pretokenization (`split=False`), the input
+    `data` should be a `tf.data.Dataset`. If `data` is a list of filenames,
+    the file format is required to be plain text files, and the text would be
+    read in line by line during training.
+    Args:
+        data: A `tf.data.Dataset`, or a list of filenames.
+        vocabulary_size: int. The maximum size of a vocabulary to be trained.
+        vocabulary_output_file: str. The location to write a
+            vocabulary file. defaults to `None`.
+        lowercase: bool. If `True`, the input text will be
+            lowercased before tokenization. Defaults to `False`.
+        strip_accents: bool. If `True`, all accent marks will
+            be removed from text before tokenization. Defaults to `False`.
+        split: bool. If `True`, input will be split on
+            whitespace and punctuation marks, and all punctuation marks will be
+            kept as tokens. If `False`, input should be split ("pre-tokenized")
+            before calling the tokenizer, and passed as a dense or ragged tensor
+            of whole words. `split` is required to be `True` when `data` is a
+            list of filenames. Defaults to `True`.
+        split_on_cjk: bool. If `True`, input will be split
+            on CJK characters, i.e., Chinese, Japanese, Korean and Vietnamese
+            characters (https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)).
+            Note that this is applicable only when `split` is `True`.
+            Defaults to `True`.
+        suffix_indicator: str. The characters prepended to a
+            WordPiece to indicate that it is a suffix to another subword.
+            E.g. `"##ing"`. Defaults to `"##"`.
+        reserved_tokens: list of strings. A list of tokens that must be included in the vocabulary.
+    Returns:
+        Returns a list of vocabulary terms.
+    Examples:
+    Basic Usage (from Dataset).
+    >>> inputs = tf.data.Dataset.from_tensor_slices(["bat sat pat mat rat"])
+    >>> vocab = compute_word_piece_vocabulary(inputs, 13)
+    >>> vocab
+    ['[PAD]', '[CLS]', '[SEP]', '[UNK]', '[MASK]', 'a', 'b', 'm', 'p', 'r', 's', 't', '##at']
+    >>> tokenizer = keras_hub.tokenizers.WordPieceTokenizer(vocabulary=vocab, oov_token="[UNK]")
+    >>> outputs = inputs.map(tokenizer.tokenize)
+    >>> for x in outputs:
+    ...     print(x)
+    tf.Tensor([ 6 12 10 12  8 12  7 12  9 12], shape=(10,), dtype=int32)
+    Basic Usage (from filenames).
+    ```python
+    with open("test.txt", "w+") as f:
+        f.write("bat sat pat mat rat\n")
+    inputs = ["test.txt"]
+    vocab = keras_hub.tokenizers.compute_word_piece_vocabulary(inputs, 13)
+    ```
+    Custom Split Usage (from Dataset).
+    >>> def normalize_and_split(x):
+    ...     "Strip punctuation and split on whitespace."
+    ...     x = tf.strings.regex_replace(x, r"\p{P}", "")
+    ...     return tf.strings.split(x)
+    >>> inputs = tf.data.Dataset.from_tensor_slices(["bat sat: pat mat rat.\n"])
+    >>> split_inputs = inputs.map(normalize_and_split)
+    >>> vocab = compute_word_piece_vocabulary(
+    ...     split_inputs, 13, split=False,
+    ... )
+    >>> vocab
+    ['[PAD]', '[CLS]', '[SEP]', '[UNK]', '[MASK]', 'a', 'b', 'm', 'p', 'r', 's', 't', '##at']
+    >>> tokenizer = keras_hub.tokenizers.WordPieceTokenizer(vocabulary=vocab)
+    >>> inputs.map(tokenizer.tokenize)
+    Custom Split Usage (from filenames).
+    ```python
+    def normalize_and_split(x):
+        "Strip punctuation and split on whitespace."
+        x = tf.strings.regex_replace(x, r"\p{P}", "")
+        return tf.strings.split(x)
+    with open("test.txt", "w+") as f:
+        f.write("bat sat: pat mat rat.\n")
+    inputs = tf.data.TextLineDataset(["test.txt"])
+    split_inputs = inputs.map(normalize_and_split)
+    vocab = keras_hub.tokenizers.compute_word_piece_vocabulary(
+        split_inputs, 13, split=False
+    )
+    tokenizer = keras_hub.tokenizers.WordPieceTokenizer(vocabulary=vocab)
+    inputs.map(tokenizer.tokenize)
+    ```
+    """
+    # Read data files.
+    if not isinstance(data, (list, tf.data.Dataset)):
+        raise ValueError(
+            "The `data` argument must be either `tf.data.Dataset` or `list`. "
+            f"Received: {type(data)}."
+        )
+    if isinstance(data, list):
+        # Processing list of file paths.
+        if not split:
+            raise ValueError(
+                "When learning a vocab from files, `split` must be `True`. "
+                "To compute a vocabulary with custom split rules, load your "
+                "data as a dataset, split it, and pass it to "
+                "`compute_word_piece_vocabulary()` with split=False."
+            )
+        path_ds = tf.data.Dataset.from_tensor_slices(data)
+        # Uses map to read filepaths.
+        data = path_ds.map(
+            lambda path: tf.io.read_file(path),
+            num_parallel_calls=tf.data.AUTOTUNE,
+        )
+    words_data = data.map(
+        lambda text: pretokenize(
+            text, lowercase, strip_accents, split, split_on_cjk
+        ),
+        num_parallel_calls=tf.data.AUTOTUNE,
+    )
+    word_counts = learner.count_words(words_data)
+    # Train tokenizer.
+    vocab = learner.learn(
+        word_counts,
+        vocab_size=vocabulary_size,
+        reserved_tokens=reserved_tokens,
+        include_joiner_token=True,
+        joiner=suffix_indicator,
+    )
+    if len(vocab) > vocabulary_size:
+        vocab = vocab[:vocabulary_size]
+    if vocabulary_output_file is not None:
+        vocab_text = "".join([line + "\n" for line in vocab])
+        # Write vocab to file.
+        with open(vocabulary_output_file, "w", encoding="utf-8") as vocab_file:
+            vocab_file.write(vocab_text)
+    else:
+        return vocab

keras_hub/src/utils/__init__.py ADDED Viewed

@@ -0,0 +1,13 @@
+# Copyright 2024 The KerasHub Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

keras_hub/src/utils/keras_utils.py ADDED Viewed

@@ -0,0 +1,130 @@
+# Copyright 2024 The KerasHub Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import keras
+from absl import logging
+from packaging.version import parse
+from keras_hub.src.utils.tensor_utils import is_tensor_type
+try:
+    import tensorflow as tf
+except ImportError:
+    tf = None
+def clone_initializer(initializer):
+    """Clones an initializer to ensure a new seed.
+    As of tensorflow 2.10, we need to clone user passed initializers when
+    invoking them twice to avoid creating the same randomized initialization.
+    """
+    # If we get a string or dict, just return as we cannot and should not clone.
+    if not isinstance(initializer, keras.initializers.Initializer):
+        return initializer
+    config = initializer.get_config()
+    return initializer.__class__.from_config(config)
+def convert_inputs_to_list_of_tensor_segments(x):
+    """Converts user inputs to a list of a tensor segments.
+    For models and layers which accept lists of string tensors to pack together,
+    this method converts user inputs to a uniform format in a way that can be
+    considered canonical for the library.
+    We handle the following:
+    - A single string will be converted to a tensor and wrapped in a list.
+    - A list of strings will be converted to a tensor and wrapped in a list.
+    - A single tensor will be wrapped in a list.
+    - A list of tensors will be passed through unaltered.
+    All other inputs will result in an error. This effectively means that users
+    who would like to pack multiple segments together should convert those
+    segments to tensors before calling the layer. This removes any ambiguity
+    in the input for those cases.
+    """
+    # Check the input type.
+    is_string = isinstance(x, (str, bytes))
+    is_tensor = is_tensor_type(x)
+    is_string_list = (
+        isinstance(x, (list, tuple)) and x and isinstance(x[0], (str, bytes))
+    )
+    is_tensor_list = isinstance(x, (list, tuple)) and x and is_tensor_type(x[0])
+    if is_string or is_string_list:
+        # Automatically convert raw strings or string lists to tensors.
+        # Wrap this input as a single (possibly batched) segment.
+        x = [tf.convert_to_tensor(x)]
+    elif is_tensor:
+        # Automatically wrap a single tensor as a single segment.
+        x = [x]
+    elif is_tensor_list:
+        # Pass lists of tensors though unaltered.
+        x = x
+    else:
+        # Error for all other input.
+        raise ValueError(
+            f"Unsupported input for `x`. `x` should be a string, a list of "
+            "strings, or a list of tensors. If passing multiple segments "
+            "which should packed together, please convert your inputs to a "
+            f"list of tensors. Received `x={x}`"
+        )
+    return x
+def print_msg(message, line_break=True):
+    """Print the message to absl logging or stdout."""
+    # Copied from core Keras.
+    if keras.utils.is_interactive_logging_enabled():
+        if line_break:
+            sys.stdout.write(message + "\n")
+        else:
+            sys.stdout.write(message)
+        sys.stdout.flush()
+    else:
+        logging.info(message)
+@keras.saving.register_keras_serializable(package="keras_hub")
+def gelu_approximate(x):
+    """Apply `keras.activations.gelu` to `x` with `approximate=True`.
+
+    Registered as a Keras serializable under the `keras_hub` package.
+    """
+    return keras.activations.gelu(x, approximate=True)
+def has_quantization_support():
+    return False if parse(keras.version()) < parse("3.4.0") else True
+def assert_quantization_support():
+    if not has_quantization_support():
+        raise ValueError(
+            "Quantization API requires Keras >= 3.4.0 to function "
+            f"correctly. Received: '{keras.version()}'"
+        )
+def standardize_data_format(data_format):
+    if data_format is None:
+        return keras.config.image_data_format()
+    data_format = str(data_format).lower()
+    if data_format not in {"channels_first", "channels_last"}:
+        raise ValueError(
+            "The `data_format` argument must be one of "
+            "{'channels_first', 'channels_last'}. "
+            f"Received: data_format={data_format}"
+        )
+    return data_format

keras_hub/src/utils/pipeline_model.py ADDED Viewed

@@ -0,0 +1,293 @@
+# Copyright 2024 The KerasHub Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import functools
+import math
+import keras
+from keras import ops
+from keras import tree
+from keras_hub.src.utils.tensor_utils import is_tensor_type
+try:
+    import tensorflow as tf
+except ImportError:
+    tf = None
+def _convert_inputs_to_dataset(
+    x=None,
+    y=None,
+    sample_weight=None,
+    batch_size=None,
+):
+    """Convert inputs to a `tf.data.Dataset`.
+    This is a stand in for the `TensorLikeDataAdapter` in core Keras.
+    """
+    if isinstance(x, tf.data.Dataset):
+        if y is not None:
+            raise ValueError(
+                "When `x` is a `tf.data.Dataset`, please do not provide "
+                f"`y`. Received: `type(y)={type(y)}`."
+            )
+        if sample_weight is not None:
+            raise ValueError(
+                "When `x` is a `tf.data.Dataset`, please do not provide "
+                "`sample_weight`. Received: "
+                f"`type(sample_weight)={type(sample_weight)}`."
+            )
+        if batch_size is not None:
+            raise ValueError(
+                "When `x` is a `tf.data.Dataset`, please do not provide "
+                "`batch_size`. Received: "
+                f"`type(batch_size)={type(batch_size)}`."
+            )
+        return x
+    inputs = keras.utils.pack_x_y_sample_weight(x, y, sample_weight)
+    try:
+        def convert(x):
+            if isinstance(x, (tf.Tensor, tf.RaggedTensor)):
+                return x
+            if hasattr(x, "__array__"):
+                return ops.convert_to_numpy(x)
+            return x
+        inputs = tree.map_structure(convert, inputs)
+        ds = tf.data.Dataset.from_tensor_slices(inputs)
+    except ValueError as e:
+        # If our inputs are unbatched, re-raise with a more friendly error
+        # message the default from tf.data. We expect this to come up with
+        # some frequency, so it's important to have a good sign post here.
+        if "only supported for rank >= 1" in str(e):
+            raise ValueError(
+                "`x`, `y`, and `sample_weight` must have a batch dimension "
+                "when calling `fit()`, `evaluate()`, and `predict()`. Received "
+                "an input with rank 0. Please add an outer dimension to your "
+                "input, e.g., wrap it in a list."
+            ) from e
+        raise e
+    return ds.batch(batch_size or 32)
+def _train_validation_split(arrays, validation_split):
+    """Split arrays into train and validation subsets in deterministic order.
+    This is copied directly from core Keras.
+    """
+    def _can_split(t):
+        return is_tensor_type(t) or t is None
+    flat_arrays = tree.flatten(arrays)
+    unsplitable = [type(t) for t in flat_arrays if not _can_split(t)]
+    if unsplitable:
+        raise ValueError(
+            "`validation_split` is only supported for Tensors or NumPy "
+            "arrays, found following types in the input: {}".format(unsplitable)
+        )
+    if all(t is None for t in flat_arrays):
+        return arrays, arrays
+    first_non_none = None
+    for t in flat_arrays:
+        if t is not None:
+            first_non_none = t
+            break
+    # Assumes all arrays have the same batch shape or are `None`.
+    batch_dim = int(first_non_none.shape[0])
+    split_at = int(math.floor(batch_dim * (1.0 - validation_split)))
+    if split_at == 0 or split_at == batch_dim:
+        raise ValueError(
+            "Training data contains {batch_dim} samples, which is not "
+            "sufficient to split it into a validation and training set as "
+            "specified by `validation_split={validation_split}`. Either "
+            "provide more data, or a different value for the "
+            "`validation_split` argument.".format(
+                batch_dim=batch_dim, validation_split=validation_split
+            )
+        )
+    def _split(t, start, end):
+        if t is None:
+            return t
+        return t[start:end]
+    train_arrays = tree.map_structure(
+        functools.partial(_split, start=0, end=split_at), arrays
+    )
+    val_arrays = tree.map_structure(
+        functools.partial(_split, start=split_at, end=batch_dim), arrays
+    )
+    return train_arrays, val_arrays
+@keras.saving.register_keras_serializable(package="keras_hub")
+class PipelineModel(keras.Model):
+    """A model which allows automatically applying preprocessing."""
+    def __init__(self, *args, **kwargs):
+        """Reset the base class, then forward all arguments to the parent."""
+        # Workaround for https://github.com/keras-team/keras/issues/17270
+        # Reset any attempt to overwrite this class's base class, so that
+        # this class can continue to be used for functional and
+        # non-functional models. This must happen before `super().__init__`.
+        PipelineModel.__bases__ = (keras.Model,)
+        super().__init__(*args, **kwargs)
+    def preprocess_samples(self, x, y=None, sample_weight=None):
+        """An overridable function which preprocesses entire samples."""
+        return keras.utils.pack_x_y_sample_weight(x, y, sample_weight)
+    # ========================================================================
+    # Below are overrides to keras.Model methods to apply the functions above.
+    # ========================================================================
+    def fit(
+        self,
+        x=None,
+        y=None,
+        batch_size=None,
+        sample_weight=None,
+        validation_data=None,
+        validation_split=None,
+        **kwargs,
+    ):
+        if validation_split and validation_data is None:
+            (x, y, sample_weight), validation_data = _train_validation_split(
+                (x, y, sample_weight), validation_split=validation_split
+            )
+        x = _convert_inputs_to_dataset(x, y, sample_weight, batch_size)
+        x = x.map(
+            self.preprocess_samples, num_parallel_calls=tf.data.AUTOTUNE
+        ).prefetch(tf.data.AUTOTUNE)
+        if validation_data is not None:
+            if not isinstance(validation_data, tf.data.Dataset):
+                (vx, vy, vsw) = keras.utils.unpack_x_y_sample_weight(
+                    validation_data
+                )
+                validation_data = _convert_inputs_to_dataset(
+                    vx, vy, vsw, batch_size
+                )
+        return super().fit(
+            x=x,
+            y=None,
+            batch_size=None,
+            sample_weight=None,
+            validation_data=validation_data,
+            **kwargs,
+        )
+    def evaluate(
+        self,
+        x=None,
+        y=None,
+        batch_size=None,
+        sample_weight=None,
+        **kwargs,
+    ):
+        # During `fit()`, `keras.Model` attempts to cache the validation
+        # dataset and ignores the values for `x`, `y`, and `sample_weight`.
+        # We don't want that behavior here, as the validation dataset still
+        # needs preprocessing.
+        kwargs.pop("_use_cached_eval_dataset", None)
+        x = _convert_inputs_to_dataset(x, y, sample_weight, batch_size)
+        x = x.map(
+            self.preprocess_samples, num_parallel_calls=tf.data.AUTOTUNE
+        ).prefetch(tf.data.AUTOTUNE)
+        return super().evaluate(
+            x=x,
+            y=None,
+            batch_size=None,
+            **kwargs,
+        )
+    def predict(
+        self,
+        x=None,
+        batch_size=None,
+        **kwargs,
+    ):
+        x = _convert_inputs_to_dataset(x, None, None, batch_size)
+        x = x.map(
+            self.preprocess_samples, num_parallel_calls=tf.data.AUTOTUNE
+        ).prefetch(tf.data.AUTOTUNE)
+        return super().predict(
+            x=x,
+            batch_size=None,
+            **kwargs,
+        )
+    def train_on_batch(
+        self,
+        x,
+        y=None,
+        sample_weight=None,
+        **kwargs,
+    ):
+        data = self.preprocess_samples(x, y, sample_weight)
+        x, y, sample_weight = keras.utils.unpack_x_y_sample_weight(data)
+        x = ops.convert_to_tensor(x)
+        if y is not None:
+            y = ops.convert_to_tensor(y)
+        if sample_weight is not None:
+            sample_weight = ops.convert_to_tensor(sample_weight)
+        return super().train_on_batch(
+            x=x,
+            y=y,
+            sample_weight=sample_weight,
+            **kwargs,
+        )
+    def test_on_batch(
+        self,
+        x,
+        y=None,
+        sample_weight=None,
+        **kwargs,
+    ):
+        data = self.preprocess_samples(x, y, sample_weight)
+        x, y, sample_weight = keras.utils.unpack_x_y_sample_weight(data)
+        x = ops.convert_to_tensor(x)
+        if y is not None:
+            y = ops.convert_to_tensor(y)
+        if sample_weight is not None:
+            sample_weight = ops.convert_to_tensor(sample_weight)
+        return super().test_on_batch(
+            x=x,
+            y=y,
+            sample_weight=sample_weight,
+            **kwargs,
+        )
+    def predict_on_batch(
+        self,
+        x,
+        **kwargs,
+    ):
+        data = self.preprocess_samples(x)
+        x, _, _ = keras.utils.unpack_x_y_sample_weight(data)
+        x = ops.convert_to_tensor(x)
+        return super().predict_on_batch(
+            x=x,
+            **kwargs,
+        )