keras-hub-nightly 0.15.0.dev20240823171555__py3-none-any.whl → 0.16.0.dev2024092017__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keras_hub/__init__.py +0 -6
- keras_hub/api/__init__.py +2 -0
- keras_hub/api/bounding_box/__init__.py +36 -0
- keras_hub/api/layers/__init__.py +14 -0
- keras_hub/api/models/__init__.py +97 -48
- keras_hub/api/tokenizers/__init__.py +30 -0
- keras_hub/api/utils/__init__.py +22 -0
- keras_hub/src/api_export.py +15 -9
- keras_hub/src/bounding_box/__init__.py +13 -0
- keras_hub/src/bounding_box/converters.py +529 -0
- keras_hub/src/bounding_box/formats.py +162 -0
- keras_hub/src/bounding_box/iou.py +263 -0
- keras_hub/src/bounding_box/to_dense.py +95 -0
- keras_hub/src/bounding_box/to_ragged.py +99 -0
- keras_hub/src/bounding_box/utils.py +194 -0
- keras_hub/src/bounding_box/validate_format.py +99 -0
- keras_hub/src/layers/preprocessing/audio_converter.py +121 -0
- keras_hub/src/layers/preprocessing/image_converter.py +130 -0
- keras_hub/src/layers/preprocessing/masked_lm_mask_generator.py +2 -0
- keras_hub/src/layers/preprocessing/multi_segment_packer.py +9 -8
- keras_hub/src/layers/preprocessing/preprocessing_layer.py +2 -29
- keras_hub/src/layers/preprocessing/random_deletion.py +33 -31
- keras_hub/src/layers/preprocessing/random_swap.py +33 -31
- keras_hub/src/layers/preprocessing/resizing_image_converter.py +101 -0
- keras_hub/src/layers/preprocessing/start_end_packer.py +3 -2
- keras_hub/src/models/albert/__init__.py +1 -2
- keras_hub/src/models/albert/albert_masked_lm_preprocessor.py +6 -86
- keras_hub/src/models/albert/{albert_classifier.py → albert_text_classifier.py} +34 -10
- keras_hub/src/models/albert/{albert_preprocessor.py → albert_text_classifier_preprocessor.py} +14 -70
- keras_hub/src/models/albert/albert_tokenizer.py +17 -36
- keras_hub/src/models/backbone.py +12 -34
- keras_hub/src/models/bart/__init__.py +1 -2
- keras_hub/src/models/bart/bart_seq_2_seq_lm_preprocessor.py +21 -148
- keras_hub/src/models/bart/bart_tokenizer.py +12 -39
- keras_hub/src/models/bert/__init__.py +1 -5
- keras_hub/src/models/bert/bert_masked_lm_preprocessor.py +6 -87
- keras_hub/src/models/bert/bert_presets.py +1 -4
- keras_hub/src/models/bert/{bert_classifier.py → bert_text_classifier.py} +19 -12
- keras_hub/src/models/bert/{bert_preprocessor.py → bert_text_classifier_preprocessor.py} +14 -70
- keras_hub/src/models/bert/bert_tokenizer.py +17 -35
- keras_hub/src/models/bloom/__init__.py +1 -2
- keras_hub/src/models/bloom/bloom_causal_lm_preprocessor.py +6 -91
- keras_hub/src/models/bloom/bloom_tokenizer.py +12 -41
- keras_hub/src/models/causal_lm.py +10 -29
- keras_hub/src/models/causal_lm_preprocessor.py +195 -0
- keras_hub/src/models/csp_darknet/csp_darknet_backbone.py +54 -15
- keras_hub/src/models/deberta_v3/__init__.py +1 -4
- keras_hub/src/models/deberta_v3/deberta_v3_masked_lm_preprocessor.py +14 -77
- keras_hub/src/models/deberta_v3/{deberta_v3_classifier.py → deberta_v3_text_classifier.py} +16 -11
- keras_hub/src/models/deberta_v3/{deberta_v3_preprocessor.py → deberta_v3_text_classifier_preprocessor.py} +23 -64
- keras_hub/src/models/deberta_v3/deberta_v3_tokenizer.py +30 -25
- keras_hub/src/models/densenet/densenet_backbone.py +46 -22
- keras_hub/src/models/distil_bert/__init__.py +1 -4
- keras_hub/src/models/distil_bert/distil_bert_masked_lm_preprocessor.py +14 -76
- keras_hub/src/models/distil_bert/{distil_bert_classifier.py → distil_bert_text_classifier.py} +17 -12
- keras_hub/src/models/distil_bert/{distil_bert_preprocessor.py → distil_bert_text_classifier_preprocessor.py} +23 -63
- keras_hub/src/models/distil_bert/distil_bert_tokenizer.py +19 -35
- keras_hub/src/models/efficientnet/__init__.py +13 -0
- keras_hub/src/models/efficientnet/efficientnet_backbone.py +569 -0
- keras_hub/src/models/efficientnet/fusedmbconv.py +229 -0
- keras_hub/src/models/efficientnet/mbconv.py +238 -0
- keras_hub/src/models/electra/__init__.py +1 -2
- keras_hub/src/models/electra/electra_tokenizer.py +17 -32
- keras_hub/src/models/f_net/__init__.py +1 -2
- keras_hub/src/models/f_net/f_net_masked_lm_preprocessor.py +12 -78
- keras_hub/src/models/f_net/{f_net_classifier.py → f_net_text_classifier.py} +17 -10
- keras_hub/src/models/f_net/{f_net_preprocessor.py → f_net_text_classifier_preprocessor.py} +19 -63
- keras_hub/src/models/f_net/f_net_tokenizer.py +17 -35
- keras_hub/src/models/falcon/__init__.py +1 -2
- keras_hub/src/models/falcon/falcon_causal_lm_preprocessor.py +6 -89
- keras_hub/src/models/falcon/falcon_tokenizer.py +12 -35
- keras_hub/src/models/gemma/__init__.py +1 -2
- keras_hub/src/models/gemma/gemma_causal_lm_preprocessor.py +6 -90
- keras_hub/src/models/gemma/gemma_decoder_block.py +1 -1
- keras_hub/src/models/gemma/gemma_tokenizer.py +12 -23
- keras_hub/src/models/gpt2/__init__.py +1 -2
- keras_hub/src/models/gpt2/gpt2_causal_lm_preprocessor.py +6 -89
- keras_hub/src/models/gpt2/gpt2_preprocessor.py +12 -90
- keras_hub/src/models/gpt2/gpt2_tokenizer.py +12 -34
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm_preprocessor.py +6 -91
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_tokenizer.py +12 -34
- keras_hub/src/models/image_classifier.py +0 -5
- keras_hub/src/models/image_classifier_preprocessor.py +83 -0
- keras_hub/src/models/llama/__init__.py +1 -2
- keras_hub/src/models/llama/llama_causal_lm_preprocessor.py +6 -85
- keras_hub/src/models/llama/llama_tokenizer.py +12 -25
- keras_hub/src/models/llama3/__init__.py +1 -2
- keras_hub/src/models/llama3/llama3_causal_lm_preprocessor.py +6 -89
- keras_hub/src/models/llama3/llama3_tokenizer.py +12 -33
- keras_hub/src/models/masked_lm.py +0 -2
- keras_hub/src/models/masked_lm_preprocessor.py +156 -0
- keras_hub/src/models/mistral/__init__.py +1 -2
- keras_hub/src/models/mistral/mistral_causal_lm_preprocessor.py +6 -91
- keras_hub/src/models/mistral/mistral_tokenizer.py +12 -23
- keras_hub/src/models/mix_transformer/mix_transformer_backbone.py +2 -2
- keras_hub/src/models/mobilenet/__init__.py +13 -0
- keras_hub/src/models/mobilenet/mobilenet_backbone.py +530 -0
- keras_hub/src/models/mobilenet/mobilenet_image_classifier.py +114 -0
- keras_hub/src/models/opt/__init__.py +1 -2
- keras_hub/src/models/opt/opt_causal_lm_preprocessor.py +6 -93
- keras_hub/src/models/opt/opt_tokenizer.py +12 -41
- keras_hub/src/models/pali_gemma/__init__.py +1 -4
- keras_hub/src/models/pali_gemma/pali_gemma_causal_lm_preprocessor.py +28 -28
- keras_hub/src/models/pali_gemma/pali_gemma_image_converter.py +25 -0
- keras_hub/src/models/pali_gemma/pali_gemma_presets.py +5 -5
- keras_hub/src/models/pali_gemma/pali_gemma_tokenizer.py +11 -3
- keras_hub/src/models/phi3/__init__.py +1 -2
- keras_hub/src/models/phi3/phi3_causal_lm.py +3 -9
- keras_hub/src/models/phi3/phi3_causal_lm_preprocessor.py +6 -89
- keras_hub/src/models/phi3/phi3_tokenizer.py +12 -36
- keras_hub/src/models/preprocessor.py +72 -83
- keras_hub/src/models/resnet/__init__.py +6 -0
- keras_hub/src/models/resnet/resnet_backbone.py +390 -42
- keras_hub/src/models/resnet/resnet_image_classifier.py +33 -6
- keras_hub/src/models/resnet/resnet_image_classifier_preprocessor.py +28 -0
- keras_hub/src/models/{llama3/llama3_preprocessor.py → resnet/resnet_image_converter.py} +7 -5
- keras_hub/src/models/resnet/resnet_presets.py +95 -0
- keras_hub/src/models/retinanet/__init__.py +13 -0
- keras_hub/src/models/retinanet/anchor_generator.py +175 -0
- keras_hub/src/models/retinanet/box_matcher.py +259 -0
- keras_hub/src/models/retinanet/non_max_supression.py +578 -0
- keras_hub/src/models/roberta/__init__.py +1 -2
- keras_hub/src/models/roberta/roberta_masked_lm_preprocessor.py +22 -74
- keras_hub/src/models/roberta/{roberta_classifier.py → roberta_text_classifier.py} +16 -11
- keras_hub/src/models/roberta/{roberta_preprocessor.py → roberta_text_classifier_preprocessor.py} +21 -53
- keras_hub/src/models/roberta/roberta_tokenizer.py +13 -52
- keras_hub/src/models/seq_2_seq_lm_preprocessor.py +269 -0
- keras_hub/src/models/stable_diffusion_v3/__init__.py +13 -0
- keras_hub/src/models/stable_diffusion_v3/clip_encoder_block.py +103 -0
- keras_hub/src/models/stable_diffusion_v3/clip_preprocessor.py +93 -0
- keras_hub/src/models/stable_diffusion_v3/clip_text_encoder.py +149 -0
- keras_hub/src/models/stable_diffusion_v3/clip_tokenizer.py +167 -0
- keras_hub/src/models/stable_diffusion_v3/mmdit.py +427 -0
- keras_hub/src/models/stable_diffusion_v3/mmdit_block.py +317 -0
- keras_hub/src/models/stable_diffusion_v3/t5_xxl_preprocessor.py +74 -0
- keras_hub/src/models/stable_diffusion_v3/t5_xxl_text_encoder.py +155 -0
- keras_hub/src/models/stable_diffusion_v3/vae_attention.py +126 -0
- keras_hub/src/models/stable_diffusion_v3/vae_image_decoder.py +186 -0
- keras_hub/src/models/t5/__init__.py +1 -2
- keras_hub/src/models/t5/t5_tokenizer.py +13 -23
- keras_hub/src/models/task.py +71 -116
- keras_hub/src/models/{classifier.py → text_classifier.py} +19 -13
- keras_hub/src/models/text_classifier_preprocessor.py +138 -0
- keras_hub/src/models/whisper/__init__.py +1 -2
- keras_hub/src/models/whisper/{whisper_audio_feature_extractor.py → whisper_audio_converter.py} +20 -18
- keras_hub/src/models/whisper/whisper_backbone.py +0 -3
- keras_hub/src/models/whisper/whisper_presets.py +10 -10
- keras_hub/src/models/whisper/whisper_tokenizer.py +20 -16
- keras_hub/src/models/xlm_roberta/__init__.py +1 -4
- keras_hub/src/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py +26 -72
- keras_hub/src/models/xlm_roberta/{xlm_roberta_classifier.py → xlm_roberta_text_classifier.py} +16 -11
- keras_hub/src/models/xlm_roberta/{xlm_roberta_preprocessor.py → xlm_roberta_text_classifier_preprocessor.py} +26 -53
- keras_hub/src/models/xlm_roberta/xlm_roberta_tokenizer.py +25 -10
- keras_hub/src/tests/test_case.py +46 -0
- keras_hub/src/tokenizers/byte_pair_tokenizer.py +30 -17
- keras_hub/src/tokenizers/byte_tokenizer.py +14 -15
- keras_hub/src/tokenizers/sentence_piece_tokenizer.py +20 -7
- keras_hub/src/tokenizers/tokenizer.py +67 -32
- keras_hub/src/tokenizers/unicode_codepoint_tokenizer.py +14 -15
- keras_hub/src/tokenizers/word_piece_tokenizer.py +34 -47
- keras_hub/src/utils/imagenet/__init__.py +13 -0
- keras_hub/src/utils/imagenet/imagenet_utils.py +1067 -0
- keras_hub/src/utils/keras_utils.py +0 -50
- keras_hub/src/utils/preset_utils.py +230 -68
- keras_hub/src/utils/tensor_utils.py +187 -69
- keras_hub/src/utils/timm/convert_resnet.py +19 -16
- keras_hub/src/utils/timm/preset_loader.py +66 -0
- keras_hub/src/utils/transformers/convert_albert.py +193 -0
- keras_hub/src/utils/transformers/convert_bart.py +373 -0
- keras_hub/src/utils/transformers/convert_bert.py +7 -17
- keras_hub/src/utils/transformers/convert_distilbert.py +10 -20
- keras_hub/src/utils/transformers/convert_gemma.py +5 -19
- keras_hub/src/utils/transformers/convert_gpt2.py +5 -18
- keras_hub/src/utils/transformers/convert_llama3.py +7 -18
- keras_hub/src/utils/transformers/convert_mistral.py +129 -0
- keras_hub/src/utils/transformers/convert_pali_gemma.py +7 -29
- keras_hub/src/utils/transformers/preset_loader.py +77 -0
- keras_hub/src/utils/transformers/safetensor_utils.py +2 -2
- keras_hub/src/version_utils.py +1 -1
- keras_hub_nightly-0.16.0.dev2024092017.dist-info/METADATA +202 -0
- keras_hub_nightly-0.16.0.dev2024092017.dist-info/RECORD +334 -0
- {keras_hub_nightly-0.15.0.dev20240823171555.dist-info → keras_hub_nightly-0.16.0.dev2024092017.dist-info}/WHEEL +1 -1
- keras_hub/src/models/bart/bart_preprocessor.py +0 -276
- keras_hub/src/models/bloom/bloom_preprocessor.py +0 -185
- keras_hub/src/models/electra/electra_preprocessor.py +0 -154
- keras_hub/src/models/falcon/falcon_preprocessor.py +0 -187
- keras_hub/src/models/gemma/gemma_preprocessor.py +0 -191
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_preprocessor.py +0 -145
- keras_hub/src/models/llama/llama_preprocessor.py +0 -189
- keras_hub/src/models/mistral/mistral_preprocessor.py +0 -190
- keras_hub/src/models/opt/opt_preprocessor.py +0 -188
- keras_hub/src/models/phi3/phi3_preprocessor.py +0 -190
- keras_hub/src/models/whisper/whisper_preprocessor.py +0 -326
- keras_hub/src/utils/timm/convert.py +0 -37
- keras_hub/src/utils/transformers/convert.py +0 -101
- keras_hub_nightly-0.15.0.dev20240823171555.dist-info/METADATA +0 -34
- keras_hub_nightly-0.15.0.dev20240823171555.dist-info/RECORD +0 -297
- {keras_hub_nightly-0.15.0.dev20240823171555.dist-info → keras_hub_nightly-0.16.0.dev2024092017.dist-info}/top_level.txt +0 -0
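Two renames run through the whole list above: task `*_classifier` modules become `*_text_classifier`, and the monolithic per-model `*_preprocessor.py` files are deleted in favor of task-scoped preprocessors (`*_causal_lm_preprocessor`, `*_masked_lm_preprocessor`, `*_text_classifier_preprocessor`). A hedged migration sketch, assuming the exported class names track the file renames (the class names below are inferred, not taken from this diff):

import keras_hub

# 0.15.x spelling: keras_hub.models.BertClassifier
# 0.16.x spelling implied by bert_classifier.py -> bert_text_classifier.py:
classifier = keras_hub.models.BertTextClassifier.from_preset(
    "bert_base_en",
    num_classes=2,
)
classifier.predict(["What an amazing movie!"])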
keras_hub/src/tokenizers/tokenizer.py

@@ -19,16 +19,14 @@ from keras_hub.src.layers.preprocessing.preprocessing_layer import (
 )
 from keras_hub.src.utils.preset_utils import TOKENIZER_ASSET_DIR
 from keras_hub.src.utils.preset_utils import TOKENIZER_CONFIG_FILE
-from keras_hub.src.utils.preset_utils import …
-from keras_hub.src.utils.preset_utils import …
+from keras_hub.src.utils.preset_utils import builtin_presets
+from keras_hub.src.utils.preset_utils import find_subclass
 from keras_hub.src.utils.preset_utils import get_file
-from keras_hub.src.utils.preset_utils import …
-from keras_hub.src.utils.preset_utils import list_subclasses
-from keras_hub.src.utils.preset_utils import load_serialized_object
+from keras_hub.src.utils.preset_utils import get_preset_loader
 from keras_hub.src.utils.preset_utils import save_serialized_object
 from keras_hub.src.utils.preset_utils import save_tokenizer_assets
 from keras_hub.src.utils.python_utils import classproperty
-from keras_hub.src.utils.…
+from keras_hub.src.utils.tensor_utils import preprocessing_function


 @keras_hub_export(

@@ -79,6 +77,8 @@ class Tokenizer(PreprocessingLayer):
     ```
     """

+    backbone_cls = None
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.file_assets = None

@@ -138,6 +138,55 @@ class Tokenizer(PreprocessingLayer):
                 f"{self.__class__.__name__}."
             )

+    @property
+    def special_tokens(self):
+        """List all built-in special tokens for the tokenizer."""
+        if not hasattr(self, "_special_token_attrs"):
+            return []
+        tokens = set(getattr(self, a) for a in self._special_token_attrs)
+        return list(tokens)
+
+    @property
+    def special_token_ids(self):
+        """List all built-in special token ids for the tokenizer."""
+        if not hasattr(self, "_special_token_attrs"):
+            return []
+        ids = set(getattr(self, f"{a}_id") for a in self._special_token_attrs)
+        if None in ids:
+            raise ValueError(
+                "Cannot access `special_token_ids` before a vocabulary has "
+                "been set on the tokenizer."
+            )
+        return list(ids)
+
+    def _add_special_token(self, token, name):
+        if not hasattr(self, "_special_token_attrs"):
+            self._special_token_attrs = []
+        self._special_token_attrs.append(name)
+        setattr(self, name, token)
+        try:
+            id = self.token_to_id(token)
+        except (ValueError, AttributeError):
+            id = None
+        setattr(self, f"{name}_id", id)
+
+    def _update_special_token_ids(self):
+        if not hasattr(self, "_special_token_attrs"):
+            return
+        vocabulary = self.get_vocabulary()
+        for attr in set(self._special_token_attrs):
+            token = getattr(self, attr)
+            if token not in vocabulary:
+                classname = self.__class__.__name__
+                raise ValueError(
+                    f"Cannot find special token `'{token}'` in the provided "
+                    f"vocabulary for `{classname}`. Please ensure `'{token}'` "
+                    "is in the provided vocabulary when creating the Tokenizer."
+                )
+        for attr in self._special_token_attrs:
+            token = getattr(self, attr)
+            setattr(self, f"{attr}_id", self.token_to_id(token))
+
     def save_to_preset(self, preset_dir):
         """Save tokenizer to a preset directory.
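The `_add_special_token` / `_update_special_token_ids` pair added above replaces per-subclass special-token plumbing (compare the shrinking `*_tokenizer.py` files in the list). A minimal sketch of the intended usage, with a hypothetical subclass supplying the `token_to_id` and `get_vocabulary` that the hooks rely on:

from keras_hub.src.tokenizers.tokenizer import Tokenizer


class MyTokenizer(Tokenizer):
    """Hypothetical tokenizer with a plain list vocabulary."""

    def __init__(self, vocabulary=None, **kwargs):
        super().__init__(**kwargs)
        self._vocabulary = vocabulary
        # Sets `self.pad_token`, records "pad_token" in
        # `_special_token_attrs`, and resolves `self.pad_token_id`
        # (None if no vocabulary is available yet).
        self._add_special_token("<pad>", "pad_token")

    def get_vocabulary(self):
        return self._vocabulary

    def token_to_id(self, token):
        # `.index` raises ValueError for unknown tokens, which
        # `_add_special_token` catches and maps to a None id.
        return self._vocabulary.index(token)


tok = MyTokenizer(vocabulary=["<pad>", "a", "b"])
tok.special_tokens      # -> ['<pad>']
tok.special_token_ids   # -> [0]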
keras_hub/src/tokenizers/tokenizer.py (continued)

@@ -151,6 +200,7 @@ class Tokenizer(PreprocessingLayer):
         )
         save_tokenizer_assets(self, preset_dir)

+    @preprocessing_function
     def call(self, inputs, *args, training=None, **kwargs):
         return self.tokenize(inputs, *args, **kwargs)

@@ -165,11 +215,8 @@ class Tokenizer(PreprocessingLayer):

     @classproperty
     def presets(cls):
-        """List built-in presets for a `…
-        …
-        for subclass in list_subclasses(cls):
-            presets.update(subclass.presets)
-        return presets
+        """List built-in presets for a `Tokenizer` subclass."""
+        return builtin_presets(cls)

     @classmethod
     def from_preset(

@@ -180,10 +227,10 @@ class Tokenizer(PreprocessingLayer):
         """Instantiate a `keras_hub.models.Tokenizer` from a model preset.

         A preset is a directory of configs, weights and other file assets used
-        to save and load a pre-trained model. The `preset` can be passed as
+        to save and load a pre-trained model. The `preset` can be passed as
         one of:

-        1. a built…
+        1. a built-in preset identifier like `'bert_base_en'`
         2. a Kaggle Models handle like `'kaggle://user/bert/keras/bert_base_en'`
         3. a Hugging Face handle like `'hf://user/bert_base_en'`
         4. a path to a local preset directory like `'./bert_base_en'`

@@ -198,7 +245,7 @@ class Tokenizer(PreprocessingLayer):
             will be inferred from the config in the preset directory.

         Args:
-            preset: string. A built…
+            preset: string. A built-in preset identifier, a Kaggle Models
                 handle, a Hugging Face handle, or a path to a local directory.
             load_weights: bool. If `True`, the weights will be loaded into the
                 model architecture. If `False`, the weights will be randomly

@@ -207,7 +254,7 @@ class Tokenizer(PreprocessingLayer):
         Examples:
         ```python
         # Load a preset tokenizer.
-        tokenizer = keras_hub.…
+        tokenizer = keras_hub.tokenizer.Tokenizer.from_preset("bert_base_en")

         # Tokenize some input.
         tokenizer("The quick brown fox tripped.")

@@ -216,20 +263,8 @@ class Tokenizer(PreprocessingLayer):
         tokenizer.detokenize([5, 6, 7, 8, 9])
         ```
         """
-        …
-        …
-        …
-        …
-        …
-            preset, config_file=TOKENIZER_CONFIG_FILE
-        )
-        if not issubclass(preset_cls, cls):
-            raise ValueError(
-                f"Preset has type `{preset_cls.__name__}` which is not a "
-                f"a subclass of calling class `{cls.__name__}`. Call "
-                f"`from_preset` directly on `{preset_cls.__name__}` instead."
-            )
-
-        tokenizer = load_serialized_object(preset, TOKENIZER_CONFIG_FILE)
-        tokenizer.load_preset_assets(preset)
-        return tokenizer
+        loader = get_preset_loader(preset)
+        backbone_cls = loader.check_backbone_class()
+        if cls.backbone_cls != backbone_cls:
+            cls = find_subclass(preset, cls, backbone_cls)
+        return loader.load_tokenizer(cls, **kwargs)
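Net effect of the `from_preset` rewrite: rather than deserializing `tokenizer.json` and type-checking the result, the method asks a format-specific loader for the tokenizer and, when called on a base class, re-routes to the subclass whose `backbone_cls` matches the preset (see the new `timm/preset_loader.py` and `transformers/preset_loader.py` in the list above). A usage sketch, assuming the `keras_hub.tokenizers.Tokenizer` export path (the updated docstring spells it `keras_hub.tokenizer.Tokenizer`):

import keras_hub

# A base-class call now resolves to the matching subclass (e.g. the BERT
# tokenizer) via the backbone class recorded in the preset.
tokenizer = keras_hub.tokenizers.Tokenizer.from_preset("bert_base_en")
tokenizer("The quick brown fox tripped.")
tokenizer.detokenize([5, 6, 7, 8, 9])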
keras_hub/src/tokenizers/unicode_codepoint_tokenizer.py

@@ -17,6 +17,7 @@ from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.tokenizers import tokenizer
 from keras_hub.src.utils.tensor_utils import convert_to_ragged_batch
 from keras_hub.src.utils.tensor_utils import is_int_dtype
+from keras_hub.src.utils.tensor_utils import preprocessing_function

 try:
     import tensorflow as tf

@@ -94,9 +95,9 @@ class UnicodeCodepointTokenizer(tokenizer.Tokenizer):
     >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer()
     >>> seq1, seq2 = tokenizer(inputs)
     >>> np.array(seq1)
-    array([2346, 2369, 2360, 2381, 2340, 2325]…
+    array([2346, 2369, 2360, 2381, 2340, 2325])
     >>> np.array(seq2)
-    array([1705, 1578, 1575, 1576]…
+    array([1705, 1578, 1575, 1576])

     Dense outputs.
     >>> inputs = ["पुस्तक", "کتاب"]

@@ -179,9 +180,8 @@ class UnicodeCodepointTokenizer(tokenizer.Tokenizer):
     Detokenization.
     >>> inputs = tf.constant([110, 105, 110, 106, 97], dtype="int32")
     >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer()
-    >>> …
-    …
-    array('ninja', dtype='<U5')
+    >>> tokenizer.detokenize(inputs)
+    'ninja'

     Detokenization with padding.
     >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer(

@@ -199,9 +199,8 @@ class UnicodeCodepointTokenizer(tokenizer.Tokenizer):
     >>> inputs = tf.constant([110, 105, 10000000, 110, 106, 97])
     >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer(
     ...     errors="replace", replacement_char=88)
-    >>> …
-    …
-    array('niXnja', dtype='<U6')
+    >>> tokenizer.detokenize(inputs)
+    'niXnja'
     """

     def __init__(

@@ -256,6 +255,7 @@ class UnicodeCodepointTokenizer(tokenizer.Tokenizer):
         self.input_encoding = input_encoding
         self.output_encoding = output_encoding
         self._vocabulary_size = vocabulary_size
+        self._update_special_token_ids()

     def get_config(self):
         config = super().get_config()

@@ -284,12 +284,10 @@ class UnicodeCodepointTokenizer(tokenizer.Tokenizer):
             vocab[chr(i)] = i
         return vocab

+    @preprocessing_function
     def tokenize(self, inputs):
-        …
-        …
-        …
-        scalar_input = inputs.shape.rank == 0
-        if scalar_input:
+        unbatched = inputs.shape.rank == 0
+        if unbatched:
             inputs = tf.expand_dims(inputs, 0)

         # Optionally lowercase the text

@@ -313,7 +311,7 @@ class UnicodeCodepointTokenizer(tokenizer.Tokenizer):
             output_shape[-1] = self.sequence_length
             tokens = tokens.to_tensor(shape=output_shape)

-        if …
+        if unbatched:
             tokens = tf.squeeze(tokens, 0)

         # Optionally clamps the output code point values to be in the

@@ -323,8 +321,9 @@ class UnicodeCodepointTokenizer(tokenizer.Tokenizer):

         return tokens

+    @preprocessing_function
     def detokenize(self, inputs):
-        inputs, unbatched, …
+        inputs, unbatched, rectangular = convert_to_ragged_batch(inputs)
         inputs = tf.ragged.boolean_mask(inputs, tf.not_equal(inputs, 0))
         outputs = tf.strings.unicode_encode(
             inputs,
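Two behavior notes fall out of these hunks: `tokenize` and `detokenize` are now wrapped in the `preprocessing_function` decorator, and the updated doctests show `detokenize` returning a plain Python string for unbatched input where it previously showed a NumPy array repr. A round trip matching the doctests:

import keras_hub

tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer()
# Unbatched input is expanded to a batch internally, then squeezed back.
tokens = tokenizer.tokenize("ninja")
# Per the updated doctest this returns 'ninja' (a str, not an array repr).
tokenizer.detokenize(tokens)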
keras_hub/src/tokenizers/word_piece_tokenizer.py

@@ -23,6 +23,7 @@ from keras_hub.src.tokenizers import tokenizer
 from keras_hub.src.utils.tensor_utils import convert_to_ragged_batch
 from keras_hub.src.utils.tensor_utils import is_int_dtype
 from keras_hub.src.utils.tensor_utils import is_string_dtype
+from keras_hub.src.utils.tensor_utils import preprocessing_function

 try:
     import tensorflow as tf

@@ -166,7 +167,7 @@ def pretokenize(
     if special_tokens_pattern is not None:
         # the idea here is to pass the special tokens regex to the split
         # function as delimiter regex pattern, so the input will be splitted
-        # by them, but also the function will treat each …
+        # by them, but also the function will treat each one of them as one
         # entity that shouldn't be splitted even if they have other
         # delimiter regex pattern inside them. then pass the special tokens
         # regex also as keep delimiter regex pattern, so they will

@@ -263,12 +264,6 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
         oov_token: str. The string value to substitute for
             an unknown token. It must be included in the vocab.
             Defaults to `"[UNK]"`.
-        special_tokens: list. A list of special tokens. when
-            `special_tokens_in_strings` is set to `True`, the tokenizer will map
-            every special token in the input strings to its id, even if these
-            special tokens contain characters that should be splitted before
-            tokenization such as punctuation. `special_tokens` must be included
-            in `vocabulary`.
         special_tokens_in_strings: bool. A bool to indicate if the tokenizer
             should expect special tokens in input strings that should be
             tokenized and mapped correctly to their ids. Defaults to False.

@@ -310,9 +305,8 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
     ...     lowercase=True,
     ...     dtype="string",
     ... )
-    >>> …
-    …
-    array(['the', 'qu', '##ick', 'br', '##own', 'fox', '.'], dtype='<U5')
+    >>> tokenizer(inputs)
+    ['the', 'qu', '##ick', 'br', '##own', 'fox', '.']

     Detokenization.
     >>> vocab = ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox", "."]

@@ -321,9 +315,8 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
     ...     vocabulary=vocab,
     ...     lowercase=True,
     ... )
-    >>> …
-    …
-    array('the quick brown fox .', dtype='<U21')
+    >>> tokenizer.detokenize(tokenizer.tokenize(inputs))
+    'the quick brown fox .'

     Custom splitting.
     >>> vocab = ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox", "."]

@@ -335,9 +328,8 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
     ...     dtype='string',
     ... )
     >>> split_inputs = tf.strings.split(inputs, sep="$")
-    >>> …
-    …
-    array(['the', 'qu', '##ick', 'br', '##own', 'fox'], dtype='<U5')
+    >>> tokenizer(split_inputs)
+    ['the', 'qu', '##ick', 'br', '##own', 'fox']
     """

     def __init__(

@@ -372,19 +364,9 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
         self.split_on_cjk = split_on_cjk
         self.suffix_indicator = suffix_indicator
         self.oov_token = oov_token
-        self.…
-        self.…
-
-        # the idea here is to pass the special tokens regex to the
-        # split function as delimiter regex pattern, so the input will
-        # be splitted by them, but also the function will treat each on
-        # of them as one entity that shouldn't be splitted even if they
-        # have other delimiter regex pattern inside them. then pass the
-        # special tokens regex also as keep delimiter regex
-        # pattern, so they will not be removed.
-        self._special_tokens_pattern = get_special_tokens_pattern(
-            self.special_tokens
-        )
+        self._init_special_tokens = special_tokens
+        self.special_tokens_in_strings = special_tokens_in_strings
+
         self.set_vocabulary(vocabulary)
         self.file_assets = [VOCAB_FILENAME]

@@ -426,16 +408,6 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
                 "the `oov_token` argument when creating the tokenizer."
             )

-        # Check for special tokens in the vocabulary
-        if self.special_tokens is not None:
-            for token in self.special_tokens:
-                if token not in self.vocabulary:
-                    raise ValueError(
-                        f"Cannot find token `'{token}'` in the provided "
-                        f"`vocabulary`. Please provide `'{token}'` in your "
-                        "`vocabulary` or use a pretrained `vocabulary` name."
-                    )
-
         self._fast_word_piece = tf_text.FastWordpieceTokenizer(
             vocab=self.vocabulary,
             token_out_type=self.compute_dtype,

@@ -444,6 +416,7 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
             no_pretokenization=True,
             support_detokenization=True,
         )
+        self._update_special_token_ids()

     def get_vocabulary(self):
         """Get the tokenizer vocabulary as a list of strings tokens."""

@@ -484,7 +457,8 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
                 "split": self.split,
                 "suffix_indicator": self.suffix_indicator,
                 "oov_token": self.oov_token,
-                "special_tokens": self.…
+                "special_tokens": self._init_special_tokens,
+                "special_tokens_in_strings": self.special_tokens_in_strings,
             }
         )
         return config

@@ -496,19 +470,31 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
                 "to pass a `vocabulary` argument when creating the layer."
             )

+    @preprocessing_function
     def tokenize(self, inputs):
         self._check_vocabulary()
-        …
-        …
-        …
-        …
+        inputs = tf.convert_to_tensor(inputs)
+        unbatched = inputs.shape.rank == 0
+        pattern = None
+        if self.split and self.special_tokens_in_strings:
+            # the idea here is to pass the special tokens regex to the
+            # split function as delimiter regex pattern, so the input will
+            # be splitted by them, but also the function will treat each one
+            # of them as one entity that shouldn't be splitted even if they
+            # have other delimiter regex pattern inside them. then pass the
+            # special tokens regex also as keep delimiter regex
+            # pattern, so they will not be removed.
+            special_tokens = self.special_tokens
+            if self._init_special_tokens:
+                special_tokens += self._init_special_tokens
+            pattern = get_special_tokens_pattern(special_tokens)
         inputs = pretokenize(
             inputs,
             self.lowercase,
             self.strip_accents,
             self.split,
             self.split_on_cjk,
-            …
+            pattern,
         )

         # Apply WordPiece and coerce shape for outputs.

@@ -524,15 +510,16 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
             output_shape[-1] = self.sequence_length
             tokens = tokens.to_tensor(shape=output_shape)
         # Convert to a dense output if input in scalar
-        if …
+        if unbatched:
             tokens = tf.squeeze(tokens, 0)
             tf.ensure_shape(tokens, shape=[self.sequence_length])

         return tokens

+    @preprocessing_function
     def detokenize(self, inputs):
         self._check_vocabulary()
-        inputs, unbatched, …
+        inputs, unbatched, rectangular = convert_to_ragged_batch(inputs)
         outputs = self._fast_word_piece.detokenize(inputs)
         if unbatched:
             outputs = tf.squeeze(outputs, 0)
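Because the special-token split pattern is now assembled inside `tokenize()`, it can merge subclass-registered `special_tokens` (the new base-class property) with any `special_tokens` passed at construction, kept as `_init_special_tokens`. A sketch reusing the doctest vocabulary; note `special_tokens` remains a constructor argument even though its docstring entry was dropped (it is still serialized in `get_config`):

import keras_hub

vocab = ["[UNK]", "[CLS]", "the", "qu", "##ick", "br", "##own", "fox", "."]
tokenizer = keras_hub.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,
    lowercase=True,
    special_tokens=["[CLS]"],
    special_tokens_in_strings=True,
    dtype="string",
)
# "[CLS]" is kept as a single unit instead of being split on punctuation.
tokenizer("[CLS] the quick brown fox.")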
New 13-line file (an Apache license header; several `__init__.py` additions with exactly this shape appear in the list above):

@@ -0,0 +1,13 @@
+# Copyright 2024 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.