keras-hub-nightly 0.15.0.dev20240823171555__py3-none-any.whl → 0.16.0.dev2024092017__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. keras_hub/__init__.py +0 -6
  2. keras_hub/api/__init__.py +2 -0
  3. keras_hub/api/bounding_box/__init__.py +36 -0
  4. keras_hub/api/layers/__init__.py +14 -0
  5. keras_hub/api/models/__init__.py +97 -48
  6. keras_hub/api/tokenizers/__init__.py +30 -0
  7. keras_hub/api/utils/__init__.py +22 -0
  8. keras_hub/src/api_export.py +15 -9
  9. keras_hub/src/bounding_box/__init__.py +13 -0
  10. keras_hub/src/bounding_box/converters.py +529 -0
  11. keras_hub/src/bounding_box/formats.py +162 -0
  12. keras_hub/src/bounding_box/iou.py +263 -0
  13. keras_hub/src/bounding_box/to_dense.py +95 -0
  14. keras_hub/src/bounding_box/to_ragged.py +99 -0
  15. keras_hub/src/bounding_box/utils.py +194 -0
  16. keras_hub/src/bounding_box/validate_format.py +99 -0
  17. keras_hub/src/layers/preprocessing/audio_converter.py +121 -0
  18. keras_hub/src/layers/preprocessing/image_converter.py +130 -0
  19. keras_hub/src/layers/preprocessing/masked_lm_mask_generator.py +2 -0
  20. keras_hub/src/layers/preprocessing/multi_segment_packer.py +9 -8
  21. keras_hub/src/layers/preprocessing/preprocessing_layer.py +2 -29
  22. keras_hub/src/layers/preprocessing/random_deletion.py +33 -31
  23. keras_hub/src/layers/preprocessing/random_swap.py +33 -31
  24. keras_hub/src/layers/preprocessing/resizing_image_converter.py +101 -0
  25. keras_hub/src/layers/preprocessing/start_end_packer.py +3 -2
  26. keras_hub/src/models/albert/__init__.py +1 -2
  27. keras_hub/src/models/albert/albert_masked_lm_preprocessor.py +6 -86
  28. keras_hub/src/models/albert/{albert_classifier.py → albert_text_classifier.py} +34 -10
  29. keras_hub/src/models/albert/{albert_preprocessor.py → albert_text_classifier_preprocessor.py} +14 -70
  30. keras_hub/src/models/albert/albert_tokenizer.py +17 -36
  31. keras_hub/src/models/backbone.py +12 -34
  32. keras_hub/src/models/bart/__init__.py +1 -2
  33. keras_hub/src/models/bart/bart_seq_2_seq_lm_preprocessor.py +21 -148
  34. keras_hub/src/models/bart/bart_tokenizer.py +12 -39
  35. keras_hub/src/models/bert/__init__.py +1 -5
  36. keras_hub/src/models/bert/bert_masked_lm_preprocessor.py +6 -87
  37. keras_hub/src/models/bert/bert_presets.py +1 -4
  38. keras_hub/src/models/bert/{bert_classifier.py → bert_text_classifier.py} +19 -12
  39. keras_hub/src/models/bert/{bert_preprocessor.py → bert_text_classifier_preprocessor.py} +14 -70
  40. keras_hub/src/models/bert/bert_tokenizer.py +17 -35
  41. keras_hub/src/models/bloom/__init__.py +1 -2
  42. keras_hub/src/models/bloom/bloom_causal_lm_preprocessor.py +6 -91
  43. keras_hub/src/models/bloom/bloom_tokenizer.py +12 -41
  44. keras_hub/src/models/causal_lm.py +10 -29
  45. keras_hub/src/models/causal_lm_preprocessor.py +195 -0
  46. keras_hub/src/models/csp_darknet/csp_darknet_backbone.py +54 -15
  47. keras_hub/src/models/deberta_v3/__init__.py +1 -4
  48. keras_hub/src/models/deberta_v3/deberta_v3_masked_lm_preprocessor.py +14 -77
  49. keras_hub/src/models/deberta_v3/{deberta_v3_classifier.py → deberta_v3_text_classifier.py} +16 -11
  50. keras_hub/src/models/deberta_v3/{deberta_v3_preprocessor.py → deberta_v3_text_classifier_preprocessor.py} +23 -64
  51. keras_hub/src/models/deberta_v3/deberta_v3_tokenizer.py +30 -25
  52. keras_hub/src/models/densenet/densenet_backbone.py +46 -22
  53. keras_hub/src/models/distil_bert/__init__.py +1 -4
  54. keras_hub/src/models/distil_bert/distil_bert_masked_lm_preprocessor.py +14 -76
  55. keras_hub/src/models/distil_bert/{distil_bert_classifier.py → distil_bert_text_classifier.py} +17 -12
  56. keras_hub/src/models/distil_bert/{distil_bert_preprocessor.py → distil_bert_text_classifier_preprocessor.py} +23 -63
  57. keras_hub/src/models/distil_bert/distil_bert_tokenizer.py +19 -35
  58. keras_hub/src/models/efficientnet/__init__.py +13 -0
  59. keras_hub/src/models/efficientnet/efficientnet_backbone.py +569 -0
  60. keras_hub/src/models/efficientnet/fusedmbconv.py +229 -0
  61. keras_hub/src/models/efficientnet/mbconv.py +238 -0
  62. keras_hub/src/models/electra/__init__.py +1 -2
  63. keras_hub/src/models/electra/electra_tokenizer.py +17 -32
  64. keras_hub/src/models/f_net/__init__.py +1 -2
  65. keras_hub/src/models/f_net/f_net_masked_lm_preprocessor.py +12 -78
  66. keras_hub/src/models/f_net/{f_net_classifier.py → f_net_text_classifier.py} +17 -10
  67. keras_hub/src/models/f_net/{f_net_preprocessor.py → f_net_text_classifier_preprocessor.py} +19 -63
  68. keras_hub/src/models/f_net/f_net_tokenizer.py +17 -35
  69. keras_hub/src/models/falcon/__init__.py +1 -2
  70. keras_hub/src/models/falcon/falcon_causal_lm_preprocessor.py +6 -89
  71. keras_hub/src/models/falcon/falcon_tokenizer.py +12 -35
  72. keras_hub/src/models/gemma/__init__.py +1 -2
  73. keras_hub/src/models/gemma/gemma_causal_lm_preprocessor.py +6 -90
  74. keras_hub/src/models/gemma/gemma_decoder_block.py +1 -1
  75. keras_hub/src/models/gemma/gemma_tokenizer.py +12 -23
  76. keras_hub/src/models/gpt2/__init__.py +1 -2
  77. keras_hub/src/models/gpt2/gpt2_causal_lm_preprocessor.py +6 -89
  78. keras_hub/src/models/gpt2/gpt2_preprocessor.py +12 -90
  79. keras_hub/src/models/gpt2/gpt2_tokenizer.py +12 -34
  80. keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm_preprocessor.py +6 -91
  81. keras_hub/src/models/gpt_neo_x/gpt_neo_x_tokenizer.py +12 -34
  82. keras_hub/src/models/image_classifier.py +0 -5
  83. keras_hub/src/models/image_classifier_preprocessor.py +83 -0
  84. keras_hub/src/models/llama/__init__.py +1 -2
  85. keras_hub/src/models/llama/llama_causal_lm_preprocessor.py +6 -85
  86. keras_hub/src/models/llama/llama_tokenizer.py +12 -25
  87. keras_hub/src/models/llama3/__init__.py +1 -2
  88. keras_hub/src/models/llama3/llama3_causal_lm_preprocessor.py +6 -89
  89. keras_hub/src/models/llama3/llama3_tokenizer.py +12 -33
  90. keras_hub/src/models/masked_lm.py +0 -2
  91. keras_hub/src/models/masked_lm_preprocessor.py +156 -0
  92. keras_hub/src/models/mistral/__init__.py +1 -2
  93. keras_hub/src/models/mistral/mistral_causal_lm_preprocessor.py +6 -91
  94. keras_hub/src/models/mistral/mistral_tokenizer.py +12 -23
  95. keras_hub/src/models/mix_transformer/mix_transformer_backbone.py +2 -2
  96. keras_hub/src/models/mobilenet/__init__.py +13 -0
  97. keras_hub/src/models/mobilenet/mobilenet_backbone.py +530 -0
  98. keras_hub/src/models/mobilenet/mobilenet_image_classifier.py +114 -0
  99. keras_hub/src/models/opt/__init__.py +1 -2
  100. keras_hub/src/models/opt/opt_causal_lm_preprocessor.py +6 -93
  101. keras_hub/src/models/opt/opt_tokenizer.py +12 -41
  102. keras_hub/src/models/pali_gemma/__init__.py +1 -4
  103. keras_hub/src/models/pali_gemma/pali_gemma_causal_lm_preprocessor.py +28 -28
  104. keras_hub/src/models/pali_gemma/pali_gemma_image_converter.py +25 -0
  105. keras_hub/src/models/pali_gemma/pali_gemma_presets.py +5 -5
  106. keras_hub/src/models/pali_gemma/pali_gemma_tokenizer.py +11 -3
  107. keras_hub/src/models/phi3/__init__.py +1 -2
  108. keras_hub/src/models/phi3/phi3_causal_lm.py +3 -9
  109. keras_hub/src/models/phi3/phi3_causal_lm_preprocessor.py +6 -89
  110. keras_hub/src/models/phi3/phi3_tokenizer.py +12 -36
  111. keras_hub/src/models/preprocessor.py +72 -83
  112. keras_hub/src/models/resnet/__init__.py +6 -0
  113. keras_hub/src/models/resnet/resnet_backbone.py +390 -42
  114. keras_hub/src/models/resnet/resnet_image_classifier.py +33 -6
  115. keras_hub/src/models/resnet/resnet_image_classifier_preprocessor.py +28 -0
  116. keras_hub/src/models/{llama3/llama3_preprocessor.py → resnet/resnet_image_converter.py} +7 -5
  117. keras_hub/src/models/resnet/resnet_presets.py +95 -0
  118. keras_hub/src/models/retinanet/__init__.py +13 -0
  119. keras_hub/src/models/retinanet/anchor_generator.py +175 -0
  120. keras_hub/src/models/retinanet/box_matcher.py +259 -0
  121. keras_hub/src/models/retinanet/non_max_supression.py +578 -0
  122. keras_hub/src/models/roberta/__init__.py +1 -2
  123. keras_hub/src/models/roberta/roberta_masked_lm_preprocessor.py +22 -74
  124. keras_hub/src/models/roberta/{roberta_classifier.py → roberta_text_classifier.py} +16 -11
  125. keras_hub/src/models/roberta/{roberta_preprocessor.py → roberta_text_classifier_preprocessor.py} +21 -53
  126. keras_hub/src/models/roberta/roberta_tokenizer.py +13 -52
  127. keras_hub/src/models/seq_2_seq_lm_preprocessor.py +269 -0
  128. keras_hub/src/models/stable_diffusion_v3/__init__.py +13 -0
  129. keras_hub/src/models/stable_diffusion_v3/clip_encoder_block.py +103 -0
  130. keras_hub/src/models/stable_diffusion_v3/clip_preprocessor.py +93 -0
  131. keras_hub/src/models/stable_diffusion_v3/clip_text_encoder.py +149 -0
  132. keras_hub/src/models/stable_diffusion_v3/clip_tokenizer.py +167 -0
  133. keras_hub/src/models/stable_diffusion_v3/mmdit.py +427 -0
  134. keras_hub/src/models/stable_diffusion_v3/mmdit_block.py +317 -0
  135. keras_hub/src/models/stable_diffusion_v3/t5_xxl_preprocessor.py +74 -0
  136. keras_hub/src/models/stable_diffusion_v3/t5_xxl_text_encoder.py +155 -0
  137. keras_hub/src/models/stable_diffusion_v3/vae_attention.py +126 -0
  138. keras_hub/src/models/stable_diffusion_v3/vae_image_decoder.py +186 -0
  139. keras_hub/src/models/t5/__init__.py +1 -2
  140. keras_hub/src/models/t5/t5_tokenizer.py +13 -23
  141. keras_hub/src/models/task.py +71 -116
  142. keras_hub/src/models/{classifier.py → text_classifier.py} +19 -13
  143. keras_hub/src/models/text_classifier_preprocessor.py +138 -0
  144. keras_hub/src/models/whisper/__init__.py +1 -2
  145. keras_hub/src/models/whisper/{whisper_audio_feature_extractor.py → whisper_audio_converter.py} +20 -18
  146. keras_hub/src/models/whisper/whisper_backbone.py +0 -3
  147. keras_hub/src/models/whisper/whisper_presets.py +10 -10
  148. keras_hub/src/models/whisper/whisper_tokenizer.py +20 -16
  149. keras_hub/src/models/xlm_roberta/__init__.py +1 -4
  150. keras_hub/src/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py +26 -72
  151. keras_hub/src/models/xlm_roberta/{xlm_roberta_classifier.py → xlm_roberta_text_classifier.py} +16 -11
  152. keras_hub/src/models/xlm_roberta/{xlm_roberta_preprocessor.py → xlm_roberta_text_classifier_preprocessor.py} +26 -53
  153. keras_hub/src/models/xlm_roberta/xlm_roberta_tokenizer.py +25 -10
  154. keras_hub/src/tests/test_case.py +46 -0
  155. keras_hub/src/tokenizers/byte_pair_tokenizer.py +30 -17
  156. keras_hub/src/tokenizers/byte_tokenizer.py +14 -15
  157. keras_hub/src/tokenizers/sentence_piece_tokenizer.py +20 -7
  158. keras_hub/src/tokenizers/tokenizer.py +67 -32
  159. keras_hub/src/tokenizers/unicode_codepoint_tokenizer.py +14 -15
  160. keras_hub/src/tokenizers/word_piece_tokenizer.py +34 -47
  161. keras_hub/src/utils/imagenet/__init__.py +13 -0
  162. keras_hub/src/utils/imagenet/imagenet_utils.py +1067 -0
  163. keras_hub/src/utils/keras_utils.py +0 -50
  164. keras_hub/src/utils/preset_utils.py +230 -68
  165. keras_hub/src/utils/tensor_utils.py +187 -69
  166. keras_hub/src/utils/timm/convert_resnet.py +19 -16
  167. keras_hub/src/utils/timm/preset_loader.py +66 -0
  168. keras_hub/src/utils/transformers/convert_albert.py +193 -0
  169. keras_hub/src/utils/transformers/convert_bart.py +373 -0
  170. keras_hub/src/utils/transformers/convert_bert.py +7 -17
  171. keras_hub/src/utils/transformers/convert_distilbert.py +10 -20
  172. keras_hub/src/utils/transformers/convert_gemma.py +5 -19
  173. keras_hub/src/utils/transformers/convert_gpt2.py +5 -18
  174. keras_hub/src/utils/transformers/convert_llama3.py +7 -18
  175. keras_hub/src/utils/transformers/convert_mistral.py +129 -0
  176. keras_hub/src/utils/transformers/convert_pali_gemma.py +7 -29
  177. keras_hub/src/utils/transformers/preset_loader.py +77 -0
  178. keras_hub/src/utils/transformers/safetensor_utils.py +2 -2
  179. keras_hub/src/version_utils.py +1 -1
  180. keras_hub_nightly-0.16.0.dev2024092017.dist-info/METADATA +202 -0
  181. keras_hub_nightly-0.16.0.dev2024092017.dist-info/RECORD +334 -0
  182. {keras_hub_nightly-0.15.0.dev20240823171555.dist-info → keras_hub_nightly-0.16.0.dev2024092017.dist-info}/WHEEL +1 -1
  183. keras_hub/src/models/bart/bart_preprocessor.py +0 -276
  184. keras_hub/src/models/bloom/bloom_preprocessor.py +0 -185
  185. keras_hub/src/models/electra/electra_preprocessor.py +0 -154
  186. keras_hub/src/models/falcon/falcon_preprocessor.py +0 -187
  187. keras_hub/src/models/gemma/gemma_preprocessor.py +0 -191
  188. keras_hub/src/models/gpt_neo_x/gpt_neo_x_preprocessor.py +0 -145
  189. keras_hub/src/models/llama/llama_preprocessor.py +0 -189
  190. keras_hub/src/models/mistral/mistral_preprocessor.py +0 -190
  191. keras_hub/src/models/opt/opt_preprocessor.py +0 -188
  192. keras_hub/src/models/phi3/phi3_preprocessor.py +0 -190
  193. keras_hub/src/models/whisper/whisper_preprocessor.py +0 -326
  194. keras_hub/src/utils/timm/convert.py +0 -37
  195. keras_hub/src/utils/transformers/convert.py +0 -101
  196. keras_hub_nightly-0.15.0.dev20240823171555.dist-info/METADATA +0 -34
  197. keras_hub_nightly-0.15.0.dev20240823171555.dist-info/RECORD +0 -297
  198. {keras_hub_nightly-0.15.0.dev20240823171555.dist-info → keras_hub_nightly-0.16.0.dev2024092017.dist-info}/top_level.txt +0 -0
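Judging from the renames in the list above (`classifier.py → text_classifier.py`, `bert_classifier.py → bert_text_classifier.py`, and so on), the classification task classes appear to move from `*Classifier` to `*TextClassifier` names. A hedged usage sketch, with the class name inferred from the new file names rather than confirmed by this diff:

```python
import keras_hub

# `BertTextClassifier` is inferred from the bert_text_classifier.py rename
# above; "bert_base_en" follows the preset naming used in the Tokenizer
# docstrings later in this diff. Both are assumptions, not confirmed here.
classifier = keras_hub.models.BertTextClassifier.from_preset(
    "bert_base_en",
    num_classes=2,
)
classifier.predict(["What an amazing movie!", "A total waste of my time."])
```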
keras_hub/src/tokenizers/tokenizer.py

@@ -19,16 +19,14 @@ from keras_hub.src.layers.preprocessing.preprocessing_layer import (
  )
  from keras_hub.src.utils.preset_utils import TOKENIZER_ASSET_DIR
  from keras_hub.src.utils.preset_utils import TOKENIZER_CONFIG_FILE
- from keras_hub.src.utils.preset_utils import check_config_class
- from keras_hub.src.utils.preset_utils import check_format
+ from keras_hub.src.utils.preset_utils import builtin_presets
+ from keras_hub.src.utils.preset_utils import find_subclass
  from keras_hub.src.utils.preset_utils import get_file
- from keras_hub.src.utils.preset_utils import list_presets
- from keras_hub.src.utils.preset_utils import list_subclasses
- from keras_hub.src.utils.preset_utils import load_serialized_object
+ from keras_hub.src.utils.preset_utils import get_preset_loader
  from keras_hub.src.utils.preset_utils import save_serialized_object
  from keras_hub.src.utils.preset_utils import save_tokenizer_assets
  from keras_hub.src.utils.python_utils import classproperty
- from keras_hub.src.utils.transformers.convert import load_transformers_tokenizer
+ from keras_hub.src.utils.tensor_utils import preprocessing_function


  @keras_hub_export(
@@ -79,6 +77,8 @@ class Tokenizer(PreprocessingLayer):
      ```
      """

+     backbone_cls = None
+
      def __init__(self, *args, **kwargs):
          super().__init__(*args, **kwargs)
          self.file_assets = None
@@ -138,6 +138,55 @@ class Tokenizer(PreprocessingLayer):
              f"{self.__class__.__name__}."
          )

+     @property
+     def special_tokens(self):
+         """List all built-in special tokens for the tokenizer."""
+         if not hasattr(self, "_special_token_attrs"):
+             return []
+         tokens = set(getattr(self, a) for a in self._special_token_attrs)
+         return list(tokens)
+
+     @property
+     def special_token_ids(self):
+         """List all built-in special token ids for the tokenizer."""
+         if not hasattr(self, "_special_token_attrs"):
+             return []
+         ids = set(getattr(self, f"{a}_id") for a in self._special_token_attrs)
+         if None in ids:
+             raise ValueError(
+                 "Cannot access `special_token_ids` before a vocabulary has "
+                 "been set on the tokenizer."
+             )
+         return list(ids)
+
+     def _add_special_token(self, token, name):
+         if not hasattr(self, "_special_token_attrs"):
+             self._special_token_attrs = []
+         self._special_token_attrs.append(name)
+         setattr(self, name, token)
+         try:
+             id = self.token_to_id(token)
+         except (ValueError, AttributeError):
+             id = None
+         setattr(self, f"{name}_id", id)
+
+     def _update_special_token_ids(self):
+         if not hasattr(self, "_special_token_attrs"):
+             return
+         vocabulary = self.get_vocabulary()
+         for attr in set(self._special_token_attrs):
+             token = getattr(self, attr)
+             if token not in vocabulary:
+                 classname = self.__class__.__name__
+                 raise ValueError(
+                     f"Cannot find special token `'{token}'` in the provided "
+                     f"vocabulary for `{classname}`. Please ensure `'{token}'` "
+                     "is in the provided vocabulary when creating the Tokenizer."
+                 )
+         for attr in self._special_token_attrs:
+             token = getattr(self, attr)
+             setattr(self, f"{attr}_id", self.token_to_id(token))
+
      def save_to_preset(self, preset_dir):
          """Save tokenizer to a preset directory.

@@ -151,6 +200,7 @@ class Tokenizer(PreprocessingLayer):
          )
          save_tokenizer_assets(self, preset_dir)

+     @preprocessing_function
      def call(self, inputs, *args, training=None, **kwargs):
          return self.tokenize(inputs, *args, **kwargs)

@@ -165,11 +215,8 @@ class Tokenizer(PreprocessingLayer):

      @classproperty
      def presets(cls):
-         """List built-in presets for a `Task` subclass."""
-         presets = list_presets(cls)
-         for subclass in list_subclasses(cls):
-             presets.update(subclass.presets)
-         return presets
+         """List built-in presets for a `Tokenizer` subclass."""
+         return builtin_presets(cls)

      @classmethod
      def from_preset(
@@ -180,10 +227,10 @@ class Tokenizer(PreprocessingLayer):
          """Instantiate a `keras_hub.models.Tokenizer` from a model preset.

          A preset is a directory of configs, weights and other file assets used
-         to save and load a pre-trained model. The `preset` can be passed as a
+         to save and load a pre-trained model. The `preset` can be passed as
          one of:

-         1. a built in preset identifier like `'bert_base_en'`
+         1. a built-in preset identifier like `'bert_base_en'`
          2. a Kaggle Models handle like `'kaggle://user/bert/keras/bert_base_en'`
          3. a Hugging Face handle like `'hf://user/bert_base_en'`
          4. a path to a local preset directory like `'./bert_base_en'`
@@ -198,7 +245,7 @@ class Tokenizer(PreprocessingLayer):
          will be inferred from the config in the preset directory.

          Args:
-             preset: string. A built in preset identifier, a Kaggle Models
+             preset: string. A built-in preset identifier, a Kaggle Models
                  handle, a Hugging Face handle, or a path to a local directory.
              load_weights: bool. If `True`, the weights will be loaded into the
                  model architecture. If `False`, the weights will be randomly
@@ -207,7 +254,7 @@ class Tokenizer(PreprocessingLayer):
          Examples:
          ```python
          # Load a preset tokenizer.
-         tokenizer = keras_hub.tokenizerTokenizer.from_preset("bert_base_en")
+         tokenizer = keras_hub.tokenizer.Tokenizer.from_preset("bert_base_en")

          # Tokenize some input.
          tokenizer("The quick brown fox tripped.")
@@ -216,20 +263,8 @@ class Tokenizer(PreprocessingLayer):
          tokenizer.detokenize([5, 6, 7, 8, 9])
          ```
          """
-         format = check_format(preset)
-         if format == "transformers":
-             return load_transformers_tokenizer(cls, preset)
-
-         preset_cls = check_config_class(
-             preset, config_file=TOKENIZER_CONFIG_FILE
-         )
-         if not issubclass(preset_cls, cls):
-             raise ValueError(
-                 f"Preset has type `{preset_cls.__name__}` which is not a "
-                 f"a subclass of calling class `{cls.__name__}`. Call "
-                 f"`from_preset` directly on `{preset_cls.__name__}` instead."
-             )
-
-         tokenizer = load_serialized_object(preset, TOKENIZER_CONFIG_FILE)
-         tokenizer.load_preset_assets(preset)
-         return tokenizer
+         loader = get_preset_loader(preset)
+         backbone_cls = loader.check_backbone_class()
+         if cls.backbone_cls != backbone_cls:
+             cls = find_subclass(preset, cls, backbone_cls)
+         return loader.load_tokenizer(cls, **kwargs)
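The hunks above add generic special-token bookkeeping to the base `Tokenizer`. A minimal sketch of how a subclass might use it, assuming the new nightly is installed; `MyTokenizer` and its tiny vocabulary are made up for illustration, while `_add_special_token`, `special_tokens`, `special_token_ids`, and `_update_special_token_ids` come from the diff:

```python
import keras_hub


class MyTokenizer(keras_hub.tokenizers.WordPieceTokenizer):
    def __init__(self, vocabulary=None, **kwargs):
        # Register tokens before the vocabulary exists; their `*_id`
        # attributes stay `None` until `set_vocabulary()` runs
        # `_update_special_token_ids()`.
        self._add_special_token("[CLS]", "cls_token")
        self._add_special_token("[SEP]", "sep_token")
        super().__init__(vocabulary=vocabulary, **kwargs)


vocab = ["[UNK]", "[CLS]", "[SEP]", "the", "quick", "brown", "fox"]
tokenizer = MyTokenizer(vocabulary=vocab)
print(tokenizer.cls_token, tokenizer.cls_token_id)  # [CLS] 1 with this vocab
print(tokenizer.special_tokens)     # set-backed, so order is not guaranteed
print(tokenizer.special_token_ids)  # e.g. [1, 2]
```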
keras_hub/src/tokenizers/unicode_codepoint_tokenizer.py

@@ -17,6 +17,7 @@ from keras_hub.src.api_export import keras_hub_export
  from keras_hub.src.tokenizers import tokenizer
  from keras_hub.src.utils.tensor_utils import convert_to_ragged_batch
  from keras_hub.src.utils.tensor_utils import is_int_dtype
+ from keras_hub.src.utils.tensor_utils import preprocessing_function

  try:
      import tensorflow as tf
@@ -94,9 +95,9 @@ class UnicodeCodepointTokenizer(tokenizer.Tokenizer):
      >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer()
      >>> seq1, seq2 = tokenizer(inputs)
      >>> np.array(seq1)
-     array([2346, 2369, 2360, 2381, 2340, 2325], dtype=int32)
+     array([2346, 2369, 2360, 2381, 2340, 2325])
      >>> np.array(seq2)
-     array([1705, 1578, 1575, 1576], dtype=int32)
+     array([1705, 1578, 1575, 1576])

      Dense outputs.
      >>> inputs = ["पुस्तक", "کتاب"]
@@ -179,9 +180,8 @@ class UnicodeCodepointTokenizer(tokenizer.Tokenizer):
      Detokenization.
      >>> inputs = tf.constant([110, 105, 110, 106, 97], dtype="int32")
      >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer()
-     >>> outputs = tokenizer.detokenize(inputs)
-     >>> np.array(outputs).astype("U")
-     array('ninja', dtype='<U5')
+     >>> tokenizer.detokenize(inputs)
+     'ninja'

      Detokenization with padding.
      >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer(
@@ -199,9 +199,8 @@ class UnicodeCodepointTokenizer(tokenizer.Tokenizer):
      >>> inputs = tf.constant([110, 105, 10000000, 110, 106, 97])
      >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer(
      ... errors="replace", replacement_char=88)
-     >>> outputs = tokenizer.detokenize(inputs)
-     >>> np.array(outputs).astype("U")
-     array('niXnja', dtype='<U6')
+     >>> tokenizer.detokenize(inputs)
+     'niXnja'
      """

      def __init__(
@@ -256,6 +255,7 @@ class UnicodeCodepointTokenizer(tokenizer.Tokenizer):
          self.input_encoding = input_encoding
          self.output_encoding = output_encoding
          self._vocabulary_size = vocabulary_size
+         self._update_special_token_ids()

      def get_config(self):
          config = super().get_config()
@@ -284,12 +284,10 @@ class UnicodeCodepointTokenizer(tokenizer.Tokenizer):
              vocab[chr(i)] = i
          return vocab

+     @preprocessing_function
      def tokenize(self, inputs):
-         if not isinstance(inputs, (tf.Tensor, tf.RaggedTensor)):
-             inputs = tf.convert_to_tensor(inputs)
-
-         scalar_input = inputs.shape.rank == 0
-         if scalar_input:
+         unbatched = inputs.shape.rank == 0
+         if unbatched:
              inputs = tf.expand_dims(inputs, 0)

          # Optionally lowercase the text
@@ -313,7 +311,7 @@ class UnicodeCodepointTokenizer(tokenizer.Tokenizer):
              output_shape[-1] = self.sequence_length
              tokens = tokens.to_tensor(shape=output_shape)

-         if scalar_input:
+         if unbatched:
              tokens = tf.squeeze(tokens, 0)

          # Optionally clamps the output code point values to be in the
@@ -323,8 +321,9 @@ class UnicodeCodepointTokenizer(tokenizer.Tokenizer):

          return tokens

+     @preprocessing_function
      def detokenize(self, inputs):
-         inputs, unbatched, _ = convert_to_ragged_batch(inputs)
+         inputs, unbatched, rectangular = convert_to_ragged_batch(inputs)
          inputs = tf.ragged.boolean_mask(inputs, tf.not_equal(inputs, 0))
          outputs = tf.strings.unicode_encode(
              inputs,
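A short usage sketch of the behavior reflected in the updated `UnicodeCodepointTokenizer` docstrings above: with `@preprocessing_function` applied to `tokenize` and `detokenize`, plain Python inputs appear to come back as plain Python values rather than TensorFlow tensors (the exact return values below are assumed from the doctest changes, not re-verified here):

```python
import keras_hub

tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer()
codepoints = tokenizer("ninja")          # e.g. [110, 105, 110, 106, 97]
print(tokenizer.detokenize(codepoints))  # 'ninja'
```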
keras_hub/src/tokenizers/word_piece_tokenizer.py

@@ -23,6 +23,7 @@ from keras_hub.src.tokenizers import tokenizer
  from keras_hub.src.utils.tensor_utils import convert_to_ragged_batch
  from keras_hub.src.utils.tensor_utils import is_int_dtype
  from keras_hub.src.utils.tensor_utils import is_string_dtype
+ from keras_hub.src.utils.tensor_utils import preprocessing_function

  try:
      import tensorflow as tf
@@ -166,7 +167,7 @@ def pretokenize(
      if special_tokens_pattern is not None:
          # the idea here is to pass the special tokens regex to the split
          # function as delimiter regex pattern, so the input will be splitted
-         # by them, but also the function will treat each on of them as one
+         # by them, but also the function will treat each one of them as one
          # entity that shouldn't be splitted even if they have other
          # delimiter regex pattern inside them. then pass the special tokens
          # regex also as keep delimiter regex pattern, so they will
@@ -263,12 +264,6 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
          oov_token: str. The string value to substitute for
              an unknown token. It must be included in the vocab.
              Defaults to `"[UNK]"`.
-         special_tokens: list. A list of special tokens. when
-             `special_tokens_in_strings` is set to `True`, the tokenizer will map
-             every special token in the input strings to its id, even if these
-             special tokens contain characters that should be splitted before
-             tokenization such as punctuation. `special_tokens` must be included
-             in `vocabulary`.
          special_tokens_in_strings: bool. A bool to indicate if the tokenizer
              should expect special tokens in input strings that should be
              tokenized and mapped correctly to their ids. Defaults to False.
@@ -310,9 +305,8 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
      ... lowercase=True,
      ... dtype="string",
      ... )
-     >>> outputs = tokenizer(inputs)
-     >>> np.array(outputs).astype("U")
-     array(['the', 'qu', '##ick', 'br', '##own', 'fox', '.'], dtype='<U5')
+     >>> tokenizer(inputs)
+     ['the', 'qu', '##ick', 'br', '##own', 'fox', '.']

      Detokenization.
      >>> vocab = ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox", "."]
@@ -321,9 +315,8 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
      ... vocabulary=vocab,
      ... lowercase=True,
      ... )
-     >>> outputs = tokenizer.detokenize(tokenizer.tokenize(inputs))
-     >>> np.array(outputs).astype("U")
-     array('the quick brown fox .', dtype='<U21')
+     >>> tokenizer.detokenize(tokenizer.tokenize(inputs))
+     'the quick brown fox .'

      Custom splitting.
      >>> vocab = ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox", "."]
@@ -335,9 +328,8 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
      ... dtype='string',
      ... )
      >>> split_inputs = tf.strings.split(inputs, sep="$")
-     >>> outputs = tokenizer(split_inputs)
-     >>> np.array(outputs).astype("U")
-     array(['the', 'qu', '##ick', 'br', '##own', 'fox'], dtype='<U5')
+     >>> tokenizer(split_inputs)
+     ['the', 'qu', '##ick', 'br', '##own', 'fox']
      """

      def __init__(
@@ -372,19 +364,9 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
          self.split_on_cjk = split_on_cjk
          self.suffix_indicator = suffix_indicator
          self.oov_token = oov_token
-         self.special_tokens = special_tokens
-         self._special_tokens_pattern = None
-         if self.split and special_tokens_in_strings:
-             # the idea here is to pass the special tokens regex to the
-             # split function as delimiter regex pattern, so the input will
-             # be splitted by them, but also the function will treat each on
-             # of them as one entity that shouldn't be splitted even if they
-             # have other delimiter regex pattern inside them. then pass the
-             # special tokens regex also as keep delimiter regex
-             # pattern, so they will not be removed.
-             self._special_tokens_pattern = get_special_tokens_pattern(
-                 self.special_tokens
-             )
+         self._init_special_tokens = special_tokens
+         self.special_tokens_in_strings = special_tokens_in_strings
+
          self.set_vocabulary(vocabulary)
          self.file_assets = [VOCAB_FILENAME]

@@ -426,16 +408,6 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
                  "the `oov_token` argument when creating the tokenizer."
              )

-         # Check for special tokens in the vocabulary
-         if self.special_tokens is not None:
-             for token in self.special_tokens:
-                 if token not in self.vocabulary:
-                     raise ValueError(
-                         f"Cannot find token `'{token}'` in the provided "
-                         f"`vocabulary`. Please provide `'{token}'` in your "
-                         "`vocabulary` or use a pretrained `vocabulary` name."
-                     )
-
          self._fast_word_piece = tf_text.FastWordpieceTokenizer(
              vocab=self.vocabulary,
              token_out_type=self.compute_dtype,
@@ -444,6 +416,7 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
              no_pretokenization=True,
              support_detokenization=True,
          )
+         self._update_special_token_ids()

      def get_vocabulary(self):
          """Get the tokenizer vocabulary as a list of strings tokens."""
@@ -484,7 +457,8 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
                  "split": self.split,
                  "suffix_indicator": self.suffix_indicator,
                  "oov_token": self.oov_token,
-                 "special_tokens": self.special_tokens,
+                 "special_tokens": self._init_special_tokens,
+                 "special_tokens_in_strings": self.special_tokens_in_strings,
              }
          )
          return config
@@ -496,19 +470,31 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
                  "to pass a `vocabulary` argument when creating the layer."
              )

+     @preprocessing_function
      def tokenize(self, inputs):
          self._check_vocabulary()
-         if not isinstance(inputs, (tf.Tensor, tf.RaggedTensor)):
-             inputs = tf.convert_to_tensor(inputs)
-
-         scalar_input = inputs.shape.rank == 0
+         inputs = tf.convert_to_tensor(inputs)
+         unbatched = inputs.shape.rank == 0
+         pattern = None
+         if self.split and self.special_tokens_in_strings:
+             # the idea here is to pass the special tokens regex to the
+             # split function as delimiter regex pattern, so the input will
+             # be splitted by them, but also the function will treat each one
+             # of them as one entity that shouldn't be splitted even if they
+             # have other delimiter regex pattern inside them. then pass the
+             # special tokens regex also as keep delimiter regex
+             # pattern, so they will not be removed.
+             special_tokens = self.special_tokens
+             if self._init_special_tokens:
+                 special_tokens += self._init_special_tokens
+             pattern = get_special_tokens_pattern(special_tokens)
          inputs = pretokenize(
              inputs,
              self.lowercase,
              self.strip_accents,
              self.split,
              self.split_on_cjk,
-             self._special_tokens_pattern,
+             pattern,
          )

          # Apply WordPiece and coerce shape for outputs.
@@ -524,15 +510,16 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
              output_shape[-1] = self.sequence_length
              tokens = tokens.to_tensor(shape=output_shape)
          # Convert to a dense output if input in scalar
-         if scalar_input:
+         if unbatched:
              tokens = tf.squeeze(tokens, 0)
              tf.ensure_shape(tokens, shape=[self.sequence_length])

          return tokens

+     @preprocessing_function
      def detokenize(self, inputs):
          self._check_vocabulary()
-         inputs, unbatched, _ = convert_to_ragged_batch(inputs)
+         inputs, unbatched, rectangular = convert_to_ragged_batch(inputs)
          outputs = self._fast_word_piece.detokenize(inputs)
          if unbatched:
              outputs = tf.squeeze(outputs, 0)
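A hedged sketch of the reworked `WordPieceTokenizer` options shown above: `special_tokens_in_strings` is now stored on the layer and serialized by `get_config()`, and the special-token split pattern is built inside `tokenize()` rather than in `__init__()`. The vocabulary and tokens below are made up for illustration:

```python
import keras_hub

vocab = ["[UNK]", "[CLS]", "[SEP]", "the", "qu", "##ick", "br", "##own", "fox", "."]
tokenizer = keras_hub.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,
    special_tokens=["[CLS]", "[SEP]"],
    special_tokens_in_strings=True,
    dtype="string",
)
# "[CLS]" and "[SEP]" are kept as single tokens instead of being split on
# punctuation before WordPiece runs.
print(tokenizer("[CLS] the quick brown fox . [SEP]"))
# Both options now round-trip through the layer config.
config = tokenizer.get_config()
print(config["special_tokens"], config["special_tokens_in_strings"])
```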
@@ -0,0 +1,13 @@
+ # Copyright 2024 The KerasNLP Authors
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     https://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.