keras-hub-nightly 0.15.0.dev20240823171555__py3-none-any.whl → 0.16.0.dev2024092017__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. keras_hub/__init__.py +0 -6
  2. keras_hub/api/__init__.py +2 -0
  3. keras_hub/api/bounding_box/__init__.py +36 -0
  4. keras_hub/api/layers/__init__.py +14 -0
  5. keras_hub/api/models/__init__.py +97 -48
  6. keras_hub/api/tokenizers/__init__.py +30 -0
  7. keras_hub/api/utils/__init__.py +22 -0
  8. keras_hub/src/api_export.py +15 -9
  9. keras_hub/src/bounding_box/__init__.py +13 -0
  10. keras_hub/src/bounding_box/converters.py +529 -0
  11. keras_hub/src/bounding_box/formats.py +162 -0
  12. keras_hub/src/bounding_box/iou.py +263 -0
  13. keras_hub/src/bounding_box/to_dense.py +95 -0
  14. keras_hub/src/bounding_box/to_ragged.py +99 -0
  15. keras_hub/src/bounding_box/utils.py +194 -0
  16. keras_hub/src/bounding_box/validate_format.py +99 -0
  17. keras_hub/src/layers/preprocessing/audio_converter.py +121 -0
  18. keras_hub/src/layers/preprocessing/image_converter.py +130 -0
  19. keras_hub/src/layers/preprocessing/masked_lm_mask_generator.py +2 -0
  20. keras_hub/src/layers/preprocessing/multi_segment_packer.py +9 -8
  21. keras_hub/src/layers/preprocessing/preprocessing_layer.py +2 -29
  22. keras_hub/src/layers/preprocessing/random_deletion.py +33 -31
  23. keras_hub/src/layers/preprocessing/random_swap.py +33 -31
  24. keras_hub/src/layers/preprocessing/resizing_image_converter.py +101 -0
  25. keras_hub/src/layers/preprocessing/start_end_packer.py +3 -2
  26. keras_hub/src/models/albert/__init__.py +1 -2
  27. keras_hub/src/models/albert/albert_masked_lm_preprocessor.py +6 -86
  28. keras_hub/src/models/albert/{albert_classifier.py → albert_text_classifier.py} +34 -10
  29. keras_hub/src/models/albert/{albert_preprocessor.py → albert_text_classifier_preprocessor.py} +14 -70
  30. keras_hub/src/models/albert/albert_tokenizer.py +17 -36
  31. keras_hub/src/models/backbone.py +12 -34
  32. keras_hub/src/models/bart/__init__.py +1 -2
  33. keras_hub/src/models/bart/bart_seq_2_seq_lm_preprocessor.py +21 -148
  34. keras_hub/src/models/bart/bart_tokenizer.py +12 -39
  35. keras_hub/src/models/bert/__init__.py +1 -5
  36. keras_hub/src/models/bert/bert_masked_lm_preprocessor.py +6 -87
  37. keras_hub/src/models/bert/bert_presets.py +1 -4
  38. keras_hub/src/models/bert/{bert_classifier.py → bert_text_classifier.py} +19 -12
  39. keras_hub/src/models/bert/{bert_preprocessor.py → bert_text_classifier_preprocessor.py} +14 -70
  40. keras_hub/src/models/bert/bert_tokenizer.py +17 -35
  41. keras_hub/src/models/bloom/__init__.py +1 -2
  42. keras_hub/src/models/bloom/bloom_causal_lm_preprocessor.py +6 -91
  43. keras_hub/src/models/bloom/bloom_tokenizer.py +12 -41
  44. keras_hub/src/models/causal_lm.py +10 -29
  45. keras_hub/src/models/causal_lm_preprocessor.py +195 -0
  46. keras_hub/src/models/csp_darknet/csp_darknet_backbone.py +54 -15
  47. keras_hub/src/models/deberta_v3/__init__.py +1 -4
  48. keras_hub/src/models/deberta_v3/deberta_v3_masked_lm_preprocessor.py +14 -77
  49. keras_hub/src/models/deberta_v3/{deberta_v3_classifier.py → deberta_v3_text_classifier.py} +16 -11
  50. keras_hub/src/models/deberta_v3/{deberta_v3_preprocessor.py → deberta_v3_text_classifier_preprocessor.py} +23 -64
  51. keras_hub/src/models/deberta_v3/deberta_v3_tokenizer.py +30 -25
  52. keras_hub/src/models/densenet/densenet_backbone.py +46 -22
  53. keras_hub/src/models/distil_bert/__init__.py +1 -4
  54. keras_hub/src/models/distil_bert/distil_bert_masked_lm_preprocessor.py +14 -76
  55. keras_hub/src/models/distil_bert/{distil_bert_classifier.py → distil_bert_text_classifier.py} +17 -12
  56. keras_hub/src/models/distil_bert/{distil_bert_preprocessor.py → distil_bert_text_classifier_preprocessor.py} +23 -63
  57. keras_hub/src/models/distil_bert/distil_bert_tokenizer.py +19 -35
  58. keras_hub/src/models/efficientnet/__init__.py +13 -0
  59. keras_hub/src/models/efficientnet/efficientnet_backbone.py +569 -0
  60. keras_hub/src/models/efficientnet/fusedmbconv.py +229 -0
  61. keras_hub/src/models/efficientnet/mbconv.py +238 -0
  62. keras_hub/src/models/electra/__init__.py +1 -2
  63. keras_hub/src/models/electra/electra_tokenizer.py +17 -32
  64. keras_hub/src/models/f_net/__init__.py +1 -2
  65. keras_hub/src/models/f_net/f_net_masked_lm_preprocessor.py +12 -78
  66. keras_hub/src/models/f_net/{f_net_classifier.py → f_net_text_classifier.py} +17 -10
  67. keras_hub/src/models/f_net/{f_net_preprocessor.py → f_net_text_classifier_preprocessor.py} +19 -63
  68. keras_hub/src/models/f_net/f_net_tokenizer.py +17 -35
  69. keras_hub/src/models/falcon/__init__.py +1 -2
  70. keras_hub/src/models/falcon/falcon_causal_lm_preprocessor.py +6 -89
  71. keras_hub/src/models/falcon/falcon_tokenizer.py +12 -35
  72. keras_hub/src/models/gemma/__init__.py +1 -2
  73. keras_hub/src/models/gemma/gemma_causal_lm_preprocessor.py +6 -90
  74. keras_hub/src/models/gemma/gemma_decoder_block.py +1 -1
  75. keras_hub/src/models/gemma/gemma_tokenizer.py +12 -23
  76. keras_hub/src/models/gpt2/__init__.py +1 -2
  77. keras_hub/src/models/gpt2/gpt2_causal_lm_preprocessor.py +6 -89
  78. keras_hub/src/models/gpt2/gpt2_preprocessor.py +12 -90
  79. keras_hub/src/models/gpt2/gpt2_tokenizer.py +12 -34
  80. keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm_preprocessor.py +6 -91
  81. keras_hub/src/models/gpt_neo_x/gpt_neo_x_tokenizer.py +12 -34
  82. keras_hub/src/models/image_classifier.py +0 -5
  83. keras_hub/src/models/image_classifier_preprocessor.py +83 -0
  84. keras_hub/src/models/llama/__init__.py +1 -2
  85. keras_hub/src/models/llama/llama_causal_lm_preprocessor.py +6 -85
  86. keras_hub/src/models/llama/llama_tokenizer.py +12 -25
  87. keras_hub/src/models/llama3/__init__.py +1 -2
  88. keras_hub/src/models/llama3/llama3_causal_lm_preprocessor.py +6 -89
  89. keras_hub/src/models/llama3/llama3_tokenizer.py +12 -33
  90. keras_hub/src/models/masked_lm.py +0 -2
  91. keras_hub/src/models/masked_lm_preprocessor.py +156 -0
  92. keras_hub/src/models/mistral/__init__.py +1 -2
  93. keras_hub/src/models/mistral/mistral_causal_lm_preprocessor.py +6 -91
  94. keras_hub/src/models/mistral/mistral_tokenizer.py +12 -23
  95. keras_hub/src/models/mix_transformer/mix_transformer_backbone.py +2 -2
  96. keras_hub/src/models/mobilenet/__init__.py +13 -0
  97. keras_hub/src/models/mobilenet/mobilenet_backbone.py +530 -0
  98. keras_hub/src/models/mobilenet/mobilenet_image_classifier.py +114 -0
  99. keras_hub/src/models/opt/__init__.py +1 -2
  100. keras_hub/src/models/opt/opt_causal_lm_preprocessor.py +6 -93
  101. keras_hub/src/models/opt/opt_tokenizer.py +12 -41
  102. keras_hub/src/models/pali_gemma/__init__.py +1 -4
  103. keras_hub/src/models/pali_gemma/pali_gemma_causal_lm_preprocessor.py +28 -28
  104. keras_hub/src/models/pali_gemma/pali_gemma_image_converter.py +25 -0
  105. keras_hub/src/models/pali_gemma/pali_gemma_presets.py +5 -5
  106. keras_hub/src/models/pali_gemma/pali_gemma_tokenizer.py +11 -3
  107. keras_hub/src/models/phi3/__init__.py +1 -2
  108. keras_hub/src/models/phi3/phi3_causal_lm.py +3 -9
  109. keras_hub/src/models/phi3/phi3_causal_lm_preprocessor.py +6 -89
  110. keras_hub/src/models/phi3/phi3_tokenizer.py +12 -36
  111. keras_hub/src/models/preprocessor.py +72 -83
  112. keras_hub/src/models/resnet/__init__.py +6 -0
  113. keras_hub/src/models/resnet/resnet_backbone.py +390 -42
  114. keras_hub/src/models/resnet/resnet_image_classifier.py +33 -6
  115. keras_hub/src/models/resnet/resnet_image_classifier_preprocessor.py +28 -0
  116. keras_hub/src/models/{llama3/llama3_preprocessor.py → resnet/resnet_image_converter.py} +7 -5
  117. keras_hub/src/models/resnet/resnet_presets.py +95 -0
  118. keras_hub/src/models/retinanet/__init__.py +13 -0
  119. keras_hub/src/models/retinanet/anchor_generator.py +175 -0
  120. keras_hub/src/models/retinanet/box_matcher.py +259 -0
  121. keras_hub/src/models/retinanet/non_max_supression.py +578 -0
  122. keras_hub/src/models/roberta/__init__.py +1 -2
  123. keras_hub/src/models/roberta/roberta_masked_lm_preprocessor.py +22 -74
  124. keras_hub/src/models/roberta/{roberta_classifier.py → roberta_text_classifier.py} +16 -11
  125. keras_hub/src/models/roberta/{roberta_preprocessor.py → roberta_text_classifier_preprocessor.py} +21 -53
  126. keras_hub/src/models/roberta/roberta_tokenizer.py +13 -52
  127. keras_hub/src/models/seq_2_seq_lm_preprocessor.py +269 -0
  128. keras_hub/src/models/stable_diffusion_v3/__init__.py +13 -0
  129. keras_hub/src/models/stable_diffusion_v3/clip_encoder_block.py +103 -0
  130. keras_hub/src/models/stable_diffusion_v3/clip_preprocessor.py +93 -0
  131. keras_hub/src/models/stable_diffusion_v3/clip_text_encoder.py +149 -0
  132. keras_hub/src/models/stable_diffusion_v3/clip_tokenizer.py +167 -0
  133. keras_hub/src/models/stable_diffusion_v3/mmdit.py +427 -0
  134. keras_hub/src/models/stable_diffusion_v3/mmdit_block.py +317 -0
  135. keras_hub/src/models/stable_diffusion_v3/t5_xxl_preprocessor.py +74 -0
  136. keras_hub/src/models/stable_diffusion_v3/t5_xxl_text_encoder.py +155 -0
  137. keras_hub/src/models/stable_diffusion_v3/vae_attention.py +126 -0
  138. keras_hub/src/models/stable_diffusion_v3/vae_image_decoder.py +186 -0
  139. keras_hub/src/models/t5/__init__.py +1 -2
  140. keras_hub/src/models/t5/t5_tokenizer.py +13 -23
  141. keras_hub/src/models/task.py +71 -116
  142. keras_hub/src/models/{classifier.py → text_classifier.py} +19 -13
  143. keras_hub/src/models/text_classifier_preprocessor.py +138 -0
  144. keras_hub/src/models/whisper/__init__.py +1 -2
  145. keras_hub/src/models/whisper/{whisper_audio_feature_extractor.py → whisper_audio_converter.py} +20 -18
  146. keras_hub/src/models/whisper/whisper_backbone.py +0 -3
  147. keras_hub/src/models/whisper/whisper_presets.py +10 -10
  148. keras_hub/src/models/whisper/whisper_tokenizer.py +20 -16
  149. keras_hub/src/models/xlm_roberta/__init__.py +1 -4
  150. keras_hub/src/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py +26 -72
  151. keras_hub/src/models/xlm_roberta/{xlm_roberta_classifier.py → xlm_roberta_text_classifier.py} +16 -11
  152. keras_hub/src/models/xlm_roberta/{xlm_roberta_preprocessor.py → xlm_roberta_text_classifier_preprocessor.py} +26 -53
  153. keras_hub/src/models/xlm_roberta/xlm_roberta_tokenizer.py +25 -10
  154. keras_hub/src/tests/test_case.py +46 -0
  155. keras_hub/src/tokenizers/byte_pair_tokenizer.py +30 -17
  156. keras_hub/src/tokenizers/byte_tokenizer.py +14 -15
  157. keras_hub/src/tokenizers/sentence_piece_tokenizer.py +20 -7
  158. keras_hub/src/tokenizers/tokenizer.py +67 -32
  159. keras_hub/src/tokenizers/unicode_codepoint_tokenizer.py +14 -15
  160. keras_hub/src/tokenizers/word_piece_tokenizer.py +34 -47
  161. keras_hub/src/utils/imagenet/__init__.py +13 -0
  162. keras_hub/src/utils/imagenet/imagenet_utils.py +1067 -0
  163. keras_hub/src/utils/keras_utils.py +0 -50
  164. keras_hub/src/utils/preset_utils.py +230 -68
  165. keras_hub/src/utils/tensor_utils.py +187 -69
  166. keras_hub/src/utils/timm/convert_resnet.py +19 -16
  167. keras_hub/src/utils/timm/preset_loader.py +66 -0
  168. keras_hub/src/utils/transformers/convert_albert.py +193 -0
  169. keras_hub/src/utils/transformers/convert_bart.py +373 -0
  170. keras_hub/src/utils/transformers/convert_bert.py +7 -17
  171. keras_hub/src/utils/transformers/convert_distilbert.py +10 -20
  172. keras_hub/src/utils/transformers/convert_gemma.py +5 -19
  173. keras_hub/src/utils/transformers/convert_gpt2.py +5 -18
  174. keras_hub/src/utils/transformers/convert_llama3.py +7 -18
  175. keras_hub/src/utils/transformers/convert_mistral.py +129 -0
  176. keras_hub/src/utils/transformers/convert_pali_gemma.py +7 -29
  177. keras_hub/src/utils/transformers/preset_loader.py +77 -0
  178. keras_hub/src/utils/transformers/safetensor_utils.py +2 -2
  179. keras_hub/src/version_utils.py +1 -1
  180. keras_hub_nightly-0.16.0.dev2024092017.dist-info/METADATA +202 -0
  181. keras_hub_nightly-0.16.0.dev2024092017.dist-info/RECORD +334 -0
  182. {keras_hub_nightly-0.15.0.dev20240823171555.dist-info → keras_hub_nightly-0.16.0.dev2024092017.dist-info}/WHEEL +1 -1
  183. keras_hub/src/models/bart/bart_preprocessor.py +0 -276
  184. keras_hub/src/models/bloom/bloom_preprocessor.py +0 -185
  185. keras_hub/src/models/electra/electra_preprocessor.py +0 -154
  186. keras_hub/src/models/falcon/falcon_preprocessor.py +0 -187
  187. keras_hub/src/models/gemma/gemma_preprocessor.py +0 -191
  188. keras_hub/src/models/gpt_neo_x/gpt_neo_x_preprocessor.py +0 -145
  189. keras_hub/src/models/llama/llama_preprocessor.py +0 -189
  190. keras_hub/src/models/mistral/mistral_preprocessor.py +0 -190
  191. keras_hub/src/models/opt/opt_preprocessor.py +0 -188
  192. keras_hub/src/models/phi3/phi3_preprocessor.py +0 -190
  193. keras_hub/src/models/whisper/whisper_preprocessor.py +0 -326
  194. keras_hub/src/utils/timm/convert.py +0 -37
  195. keras_hub/src/utils/transformers/convert.py +0 -101
  196. keras_hub_nightly-0.15.0.dev20240823171555.dist-info/METADATA +0 -34
  197. keras_hub_nightly-0.15.0.dev20240823171555.dist-info/RECORD +0 -297
  198. {keras_hub_nightly-0.15.0.dev20240823171555.dist-info → keras_hub_nightly-0.16.0.dev2024092017.dist-info}/top_level.txt +0 -0

keras_hub/src/models/xlm_roberta/{xlm_roberta_preprocessor.py → xlm_roberta_text_classifier_preprocessor.py}

@@ -19,17 +19,25 @@ from keras_hub.src.api_export import keras_hub_export
  from keras_hub.src.layers.preprocessing.multi_segment_packer import (
      MultiSegmentPacker,
  )
- from keras_hub.src.models.preprocessor import Preprocessor
+ from keras_hub.src.models.text_classifier_preprocessor import (
+     TextClassifierPreprocessor,
+ )
+ from keras_hub.src.models.xlm_roberta.xlm_roberta_backbone import (
+     XLMRobertaBackbone,
+ )
  from keras_hub.src.models.xlm_roberta.xlm_roberta_tokenizer import (
      XLMRobertaTokenizer,
  )
- from keras_hub.src.utils.keras_utils import (
-     convert_inputs_to_list_of_tensor_segments,
- )
+ from keras_hub.src.utils.tensor_utils import preprocessing_function


- @keras_hub_export("keras_hub.models.XLMRobertaPreprocessor")
- class XLMRobertaPreprocessor(Preprocessor):
+ @keras_hub_export(
+     [
+         "keras_hub.models.XLMRobertaTextClassifierPreprocessor",
+         "keras_hub.models.XLMRobertaPreprocessor",
+     ]
+ )
+ class XLMRobertaTextClassifierPreprocessor(TextClassifierPreprocessor):
      """An XLM-RoBERTa preprocessing layer which tokenizes and packs inputs.

      This preprocessing layer will do three things:
@@ -73,7 +81,7 @@ class XLMRobertaPreprocessor(Preprocessor):

      Directly calling the layer on data.
      ```python
-     preprocessor = keras_hub.models.XLMRobertaPreprocessor.from_preset(
+     preprocessor = keras_hub.models.TextClassifierPreprocessor.from_preset(
          "xlm_roberta_base_multi"
      )
@@ -107,13 +115,15 @@ class XLMRobertaPreprocessor(Preprocessor):
      )
      proto = train_sentencepiece(ds, vocab_size=10)
      tokenizer = keras_hub.models.XLMRobertaTokenizer(proto=proto)
-     preprocessor = keras_hub.models.XLMRobertaPreprocessor(tokenizer)
+     preprocessor = keras_hub.models.XLMRobertaTextClassifierPreprocessor(
+         tokenizer
+     )
      preprocessor("The quick brown fox jumped.")
      ```

      Mapping with `tf.data.Dataset`.
      ```python
-     preprocessor = keras_hub.models.XLMRobertaPreprocessor.from_preset(
+     preprocessor = keras_hub.models.TextClassifierPreprocessor.from_preset(
          "xlm_roberta_base_multi"
      )
@@ -144,25 +154,11 @@ class XLMRobertaPreprocessor(Preprocessor):
      ```
      """

+     backbone_cls = XLMRobertaBackbone
      tokenizer_cls = XLMRobertaTokenizer

-     def __init__(
-         self,
-         tokenizer,
-         sequence_length=512,
-         truncate="round_robin",
-         **kwargs,
-     ):
-         super().__init__(**kwargs)
-
-         self.tokenizer = tokenizer
-         self.packer = None
-         self.truncate = truncate
-         self.sequence_length = sequence_length
-
      def build(self, input_shape):
-         # Defer packer creation to `build()` so that we can be sure tokenizer
-         # assets have loaded when restoring a saved model.
+         # Roberta is doubles up the sep token, so we override build.
          self.packer = MultiSegmentPacker(
              start_value=self.tokenizer.start_token_id,
              end_value=self.tokenizer.end_token_id,
@@ -173,33 +169,10 @@ class XLMRobertaPreprocessor(Preprocessor):
          )
          self.built = True

-     def get_config(self):
-         config = super().get_config()
-         config.update(
-             {
-                 "sequence_length": self.sequence_length,
-                 "truncate": self.truncate,
-             }
-         )
-         return config
-
+     @preprocessing_function
      def call(self, x, y=None, sample_weight=None):
-         x = convert_inputs_to_list_of_tensor_segments(x)
-         x = [self.tokenizer(segment) for segment in x]
-         token_ids, _ = self.packer(x)
-         x = {
-             "token_ids": token_ids,
-             "padding_mask": token_ids != self.tokenizer.pad_token_id,
-         }
+         output = super().call(x, y=y, sample_weight=sample_weight)
+         x, y, sample_weight = keras.utils.unpack_x_y_sample_weight(output)
+         # Backbone has no segment ID input.
+         del x["segment_ids"]
          return keras.utils.pack_x_y_sample_weight(x, y, sample_weight)
-
-     @property
-     def sequence_length(self):
-         """The padded length of model input sequences."""
-         return self._sequence_length
-
-     @sequence_length.setter
-     def sequence_length(self, value):
-         self._sequence_length = value
-         if self.packer is not None:
-             self.packer.sequence_length = value
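
Taken together, these hunks fold the standalone `XLMRobertaPreprocessor` into the new `TextClassifierPreprocessor` hierarchy while keeping the old export name as an alias. A minimal usage sketch, assuming the `xlm_roberta_base_multi` preset referenced in the docstring above is available locally:

```python
import keras_hub

# Either exported name resolves to the same class after this change.
preprocessor = keras_hub.models.XLMRobertaTextClassifierPreprocessor.from_preset(
    "xlm_roberta_base_multi"
)
x = preprocessor(["The quick brown fox jumped."])
# The overridden `call()` deletes "segment_ids", since the XLM-RoBERTa
# backbone has no segment ID input.
print(sorted(x.keys()))  # expected: ['padding_mask', 'token_ids']
```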

keras_hub/src/models/xlm_roberta/xlm_roberta_tokenizer.py

@@ -14,6 +14,9 @@


  from keras_hub.src.api_export import keras_hub_export
+ from keras_hub.src.models.xlm_roberta.xlm_roberta_backbone import (
+     XLMRobertaBackbone,
+ )
  from keras_hub.src.tokenizers.sentence_piece_tokenizer import (
      SentencePieceTokenizer,
  )
@@ -25,7 +28,12 @@ except ImportError:
      tf = None


- @keras_hub_export("keras_hub.models.XLMRobertaTokenizer")
+ @keras_hub_export(
+     [
+         "keras_hub.tokenizers.XLMRobertaTokenizer",
+         "keras_hub.models.XLMRobertaTokenizer",
+     ]
+ )
  class XLMRobertaTokenizer(SentencePieceTokenizer):
      """An XLM-RoBERTa tokenizer using SentencePiece subword segmentation.

@@ -89,17 +97,24 @@ class XLMRobertaTokenizer(SentencePieceTokenizer):
      ```
      """

+     backbone_cls = XLMRobertaBackbone
+
      def __init__(self, proto, **kwargs):
-         # List of special tokens.
-         self._vocabulary_prefix = ["<s>", "<pad>", "</s>", "<unk>"]
+         # Handle special tokens manually, as the tokenizer maps these tokens in
+         # a way that is not reflected in the vocabulary.
+         self.start_token, self.start_token_id = "<s>", 0
+         self.pad_token, self.pad_token_id = "<pad>", 1
+         self.end_token, self.end_token_id = "</s>", 2
+         self.unk_token, self.unk_token_id = "<unk>", 3
+         super().__init__(proto=proto, **kwargs)

-         # IDs of special tokens.
-         self.start_token_id = 0  # <s>
-         self.pad_token_id = 1  # <pad>
-         self.end_token_id = 2  # </s>
-         self.unk_token_id = 3  # <unk>
+     @property
+     def special_tokens(self):
+         return ["<s>", "<pad>", "</s>", "<unk>"]

-         super().__init__(proto=proto, **kwargs)
+     @property
+     def special_token_ids(self):
+         return [0, 1, 2, 3]

      def set_proto(self, proto):
          super().set_proto(proto)
@@ -162,7 +177,7 @@ class XLMRobertaTokenizer(SentencePieceTokenizer):

          # Correct `unk_token_id` (0 -> 3). Note that we do not correct
          # `start_token_id` and `end_token_id`; they are dealt with in
-         # `XLMRobertaPreprocessor`.
+         # `XLMRobertaTextClassifierPreprocessor`.
          tokens = tf.where(tf.equal(tokens, 0), self.unk_token_id - 1, tokens)

          # Shift the tokens IDs right by one.
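
The tokenizer now carries a `backbone_cls` pointer and exposes its special tokens as attributes and properties rather than a private `_vocabulary_prefix` list. A short sketch of the resulting API, assuming the same `xlm_roberta_base_multi` preset as above:

```python
import keras_hub

tokenizer = keras_hub.tokenizers.XLMRobertaTokenizer.from_preset(
    "xlm_roberta_base_multi"
)
# IDs are pinned in `__init__` exactly as shown in the hunk above.
assert tokenizer.start_token_id == 0  # "<s>"
assert tokenizer.pad_token_id == 1    # "<pad>"
assert tokenizer.end_token_id == 2    # "</s>"
assert tokenizer.unk_token_id == 3    # "<unk>"
assert tokenizer.special_tokens == ["<s>", "<pad>", "</s>", "<unk>"]
assert tokenizer.special_token_ids == [0, 1, 2, 3]
```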

keras_hub/src/tests/test_case.py

@@ -18,6 +18,7 @@ import pathlib
  import re

  import keras
+ import numpy as np
  import tensorflow as tf
  from absl.testing import parameterized
  from keras import ops
@@ -465,6 +466,8 @@ class TestCase(tf.test.TestCase, parameterized.TestCase):
          init_kwargs,
          input_data,
          expected_output_shape,
+         expected_pyramid_output_keys=None,
+         expected_pyramid_image_sizes=None,
          variable_length_data=None,
          run_mixed_precision_check=True,
          run_quantization_check=True,
@@ -492,6 +495,26 @@ class TestCase(tf.test.TestCase, parameterized.TestCase):
              run_quantization_check=run_quantization_check,
          )

+         if expected_pyramid_output_keys:
+             backbone = cls(**init_kwargs)
+             model = keras.models.Model(
+                 backbone.inputs, backbone.pyramid_outputs
+             )
+             output_data = model(input_data)
+
+             self.assertIsInstance(output_data, dict)
+             self.assertEqual(
+                 list(output_data.keys()), list(backbone.pyramid_outputs.keys())
+             )
+             self.assertEqual(
+                 list(output_data.keys()), expected_pyramid_output_keys
+             )
+             # check height and width of each level.
+             for i, (k, v) in enumerate(output_data.items()):
+                 self.assertEqual(
+                     tuple(v.shape[1:3]), expected_pyramid_image_sizes[i]
+                 )
+
          # Check data_format. We assume that `input_data` is in "channels_last"
          # format.
          if run_data_format_check and can_run_data_format_check:
@@ -501,6 +524,12 @@ class TestCase(tf.test.TestCase, parameterized.TestCase):
                  input_data = ops.transpose(input_data, axes=(2, 0, 1))
              elif len(input_data_shape) == 4:
                  input_data = ops.transpose(input_data, axes=(0, 3, 1, 2))
+             if len(expected_output_shape) == 3:
+                 x = expected_output_shape
+                 expected_output_shape = (x[0], x[2], x[1])
+             elif len(expected_output_shape) == 4:
+                 x = expected_output_shape
+                 expected_output_shape = (x[0], x[3], x[1], x[2])
              if "image_shape" in init_kwargs:
                  init_kwargs = init_kwargs.copy()
                  init_kwargs["image_shape"] = tuple(
@@ -557,6 +586,10 @@ class TestCase(tf.test.TestCase, parameterized.TestCase):
          task.preprocessor = None
          task.fit(ds.map(preprocessor))
          task.preprocessor = preprocessor
+         # Turn off default compilation, should error during `fit()`.
+         task = cls(**init_kwargs, compile=False)
+         with self.assertRaisesRegex(ValueError, "You must call `compile"):
+             task.fit(ds)

      def run_preset_test(
          self,
@@ -567,6 +600,7 @@ class TestCase(tf.test.TestCase, parameterized.TestCase):
          expected_output=None,
          expected_output_shape=None,
          expected_partial_output=None,
+         expected_labels=None,
      ):
          """Run instantiation and a forward pass for a preset."""
          with self.assertRaises(Exception):
@@ -604,5 +638,17 @@ class TestCase(tf.test.TestCase, parameterized.TestCase):

              tree.map_structure(compare, output, expected_partial_output)

+         if expected_labels is not None:
+             output = ops.argmax(output, axis=-1)
+             self.assertAllEqual(output, expected_labels)
+
      def get_test_data_dir(self):
          return str(pathlib.Path(__file__).parent / "test_data")
+
+     def load_test_image(self, target_size=None):
+         # From https://commons.wikimedia.org/wiki/File:California_quail.jpg
+         path = os.path.join(self.get_test_data_dir(), "test_image.jpg")
+         img = keras.utils.load_img(
+             path, target_size=target_size, keep_aspect_ratio=True
+         )
+         return np.array(img)
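
The new keyword arguments let vision backbone tests assert on the feature pyramid in one call. A hypothetical test sketch, assuming these parameters belong to the vision backbone test helper the hunks modify; the backbone class, shapes, and pyramid keys below are placeholders, not part of the diff:

```python
import numpy as np

def test_pyramid_outputs(self):
    # Placeholder values throughout; only the two new keyword arguments are
    # taken from the diff above.
    self.run_vision_backbone_test(
        cls=SomeImageBackbone,  # hypothetical backbone class
        init_kwargs={"image_shape": (32, 32, 3)},
        input_data=np.ones((2, 32, 32, 3), dtype="float32"),
        expected_output_shape=(2, 1, 1, 256),
        expected_pyramid_output_keys=["P2", "P3", "P4", "P5"],
        expected_pyramid_image_sizes=[(8, 8), (4, 4), (2, 2), (1, 1)],
    )
```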

keras_hub/src/tokenizers/byte_pair_tokenizer.py

@@ -31,6 +31,7 @@ from keras_hub.src.tokenizers import tokenizer
  from keras_hub.src.utils.tensor_utils import convert_to_ragged_batch
  from keras_hub.src.utils.tensor_utils import is_int_dtype
  from keras_hub.src.utils.tensor_utils import is_string_dtype
+ from keras_hub.src.utils.tensor_utils import preprocessing_function

  try:
      import tensorflow as tf
@@ -63,12 +64,17 @@ def create_alts_for_unsplittable_tokens(unsplittable_tokens):
      # Create alternates for all special tokens that will be not split during
      # tokenization.
      alts = []
-     prefix = "Ĵ"
-     # Trim out splitters.
-     replace_pattern = r"'|\s+|[^\p{L}\p{N}]+"
-     for token in unsplittable_tokens:
-         token = re.sub(replace_pattern, "", token)
-         alts.append(prefix + token)
+     for index in range(len(unsplittable_tokens)):
+         # Map unsplittable tokens to ĴA, ĴB, ĴC, etc. Which we assume will be
+         # a very uncommon string in any input data. We can't use a literal
+         # numeric counter here because we will split on all numbers. Ĵ is a
+         # random character we chose as it is likely to be unique.
+         prefix = "Ĵ"
+         digits = [int(d) for d in str(index)]
+         # Make numbers to uppercase characters so our token is still
+         # unsplittable.
+         suffix = "".join([chr(ord("A") + d) for d in digits])
+         alts.append(prefix + suffix)
      return alts
@@ -252,9 +258,9 @@ class BytePairTokenizer(tokenizer.Tokenizer):
      array([1, 2], dtype=int32)
      >>> seq1, seq2 = tokenizer(["butterfly", "butter"])
      >>> np.array(seq1)
-     array([1, 2], dtype=int32)
+     array([1, 2])
      >>> np.array(seq2)
-     array([1], dtype=int32)
+     array([1])
      >>> tokenizer = keras_hub.tokenizers.BytePairTokenizer(
      ...     vocab, merge, sequence_length=2)
      >>> seq1, seq2 = tokenizer(["butterfly", "butter"])
@@ -268,8 +274,7 @@ class BytePairTokenizer(tokenizer.Tokenizer):
      >>> merge = ["b u", "t t", "e r", "bu tt", "butt er", "f l", "fl y"]
      >>> tokenizer = keras_hub.tokenizers.BytePairTokenizer(vocab, merge)
      >>> tokenizer.detokenize([[1, 2]])
-     <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'butterfly'],
-     dtype=object)>
+     ['butterfly']
      """

@@ -291,6 +296,8 @@ class BytePairTokenizer(tokenizer.Tokenizer):
          super().__init__(dtype=dtype, **kwargs)
          self.sequence_length = sequence_length
          self.add_prefix_space = add_prefix_space
+         if unsplittable_tokens is None:
+             unsplittable_tokens = self.special_tokens
          self.unsplittable_tokens = unsplittable_tokens
          self.file_assets = [VOCAB_FILENAME, MERGES_FILENAME]
@@ -385,6 +392,7 @@ class BytePairTokenizer(tokenizer.Tokenizer):
              list(range(len(self.merges))),
              default=self.merge_ranks_lookup_default,
          )
+         self._update_special_token_ids()

      def get_vocabulary(self):
          """Get the tokenizer vocabulary as a list of strings tokens."""
@@ -526,17 +534,21 @@ class BytePairTokenizer(tokenizer.Tokenizer):
                  "layer."
              )

+     @preprocessing_function
      def tokenize(self, inputs):
          self._check_vocabulary()
-         if not isinstance(inputs, (tf.Tensor, tf.RaggedTensor)):
-             inputs = tf.convert_to_tensor(inputs)
-
          if self.add_prefix_space:
              inputs = tf.strings.join([" ", inputs])

-         scalar_input = inputs.shape.rank == 0
-         if scalar_input:
+         inputs = tf.convert_to_tensor(inputs)
+         unbatched = inputs.shape.rank == 0
+         if unbatched:
              inputs = tf.expand_dims(inputs, 0)
+         if inputs.shape.rank > 1:
+             raise ValueError(
+                 "`tokenize()` inputs should be a string, list of strings, or "
+                 f"string tensor with rank < 2. Received: {inputs}"
+             )

          raw_tokens = split_strings_for_bpe(inputs, self.unsplittable_tokens)
          token_row_splits = raw_tokens.row_splits
@@ -581,15 +593,16 @@ class BytePairTokenizer(tokenizer.Tokenizer):
              tokens = tokens.to_tensor(shape=output_shape)

          # Convert to a dense output if input in scalar
-         if scalar_input:
+         if unbatched:
              tokens = tf.squeeze(tokens, 0)
              tf.ensure_shape(tokens, shape=[self.sequence_length])

          return tokens

+     @preprocessing_function
      def detokenize(self, inputs):
          self._check_vocabulary()
-         inputs, unbatched, _ = convert_to_ragged_batch(inputs)
+         inputs, unbatched, rectangular = convert_to_ragged_batch(inputs)
          inputs = tf.cast(inputs, self.dtype)
          unicode_text = tf.strings.reduce_join(
              self.id_to_token_map.lookup(inputs), axis=-1
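
The rewritten helper above replaces regex-stripped aliases with positional ones; the added comment explains that the token index is spelled in digits remapped to uppercase letters, so the alias itself never gets split on numerals. A standalone sketch of that mapping (plain Python, independent of the tokenizer):

```python
def alt_for_index(index):
    # Spell the index in digits, then map each digit to an uppercase letter
    # (0 -> "A", 1 -> "B", ...) so the alias contains no numerals.
    digits = [int(d) for d in str(index)]
    return "Ĵ" + "".join(chr(ord("A") + d) for d in digits)

print(alt_for_index(0))   # ĴA
print(alt_for_index(1))   # ĴB
print(alt_for_index(12))  # ĴBC
```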

keras_hub/src/tokenizers/byte_tokenizer.py

@@ -26,6 +26,7 @@ from keras_hub.src.api_export import keras_hub_export
  from keras_hub.src.tokenizers import tokenizer
  from keras_hub.src.utils.tensor_utils import convert_to_ragged_batch
  from keras_hub.src.utils.tensor_utils import is_int_dtype
+ from keras_hub.src.utils.tensor_utils import preprocessing_function

  try:
      import tensorflow_text as tf_text
@@ -95,9 +96,9 @@ class ByteTokenizer(tokenizer.Tokenizer):
      >>> tokenizer = keras_hub.tokenizers.ByteTokenizer()
      >>> seq1, seq2 = tokenizer(inputs)
      >>> np.array(seq1)
-     array([104, 101, 108, 108, 111], dtype=int32)
+     array([104, 101, 108, 108, 111])
      >>> np.array(seq2)
-     array([104, 105], dtype=int32)
+     array([104, 105])

      Dense outputs.
      >>> inputs = ["hello", "hi"]
@@ -145,18 +146,16 @@ class ByteTokenizer(tokenizer.Tokenizer):
      Detokenization.
      >>> inputs = [104, 101, 108, 108, 111]
      >>> tokenizer = keras_hub.tokenizers.ByteTokenizer()
-     >>> outputs = tokenizer.detokenize(inputs)
-     >>> np.array(outputs).astype("U")
-     array('hello', dtype='<U5')
+     >>> tokenizer.detokenize(inputs)
+     'hello'

      Detokenization with invalid bytes.
      >>> # The 255 below is invalid utf-8.
      >>> inputs = [104, 101, 255, 108, 108, 111]
      >>> tokenizer = keras_hub.tokenizers.ByteTokenizer(
      ...     errors="replace", replacement_char=88)
-     >>> outputs = tokenizer.detokenize(inputs)
-     >>> np.array(outputs).astype("U")
-     array('heXllo', dtype='<U6')
+     >>> tokenizer.detokenize(inputs)
+     'heXllo'
      """

      def __init__(
@@ -201,6 +200,7 @@ class ByteTokenizer(tokenizer.Tokenizer):
          self._char_lst = tf.constant(
              [i.tobytes() for i in np.arange(256, dtype=np.uint8)]
          )
+         self._update_special_token_ids()

      def vocabulary_size(self):
          """Get the integer size of the tokenizer vocabulary."""
@@ -212,12 +212,10 @@ class ByteTokenizer(tokenizer.Tokenizer):
              vocab[chr(i)] = i
          return vocab

+     @preprocessing_function
      def tokenize(self, inputs):
-         if not isinstance(inputs, (tf.Tensor, tf.RaggedTensor)):
-             inputs = tf.convert_to_tensor(inputs)
-
-         scalar_input = inputs.shape.rank == 0
-         if scalar_input:
+         unbatched = inputs.shape.rank == 0
+         if unbatched:
              inputs = tf.expand_dims(inputs, 0)

          # Optional: Lowercase the input.
@@ -241,12 +239,13 @@ class ByteTokenizer(tokenizer.Tokenizer):
              output_shape[-1] = self.sequence_length
              tokens = tokens.to_tensor(shape=output_shape)

-         if scalar_input:
+         if unbatched:
              tokens = tf.squeeze(tokens, 0)
          return tokens

+     @preprocessing_function
      def detokenize(self, inputs):
-         inputs, unbatched, _ = convert_to_ragged_batch(inputs)
+         inputs, unbatched, rectangular = convert_to_ragged_batch(inputs)
          # Remove trailing padding tokens, so that trailing "\x00" bytes don't
          # show up in the detokenized output.
          inputs = tf.ragged.boolean_mask(inputs, tf.not_equal(inputs, 0))
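
The doctest changes above show `detokenize()` now returning plain Python strings rather than string tensors (both methods gain the `@preprocessing_function` decorator). A quick round-trip sketch:

```python
import keras_hub

tokenizer = keras_hub.tokenizers.ByteTokenizer()
ids = tokenizer.tokenize("hello")  # UTF-8 byte ids
text = tokenizer.detokenize(ids)
print(text)  # 'hello', a Python string rather than a tf.Tensor
```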

keras_hub/src/tokenizers/sentence_piece_tokenizer.py

@@ -31,6 +31,7 @@ from keras_hub.src.tokenizers import tokenizer
  from keras_hub.src.utils.tensor_utils import convert_to_ragged_batch
  from keras_hub.src.utils.tensor_utils import is_int_dtype
  from keras_hub.src.utils.tensor_utils import is_string_dtype
+ from keras_hub.src.utils.tensor_utils import preprocessing_function
  from keras_hub.src.utils.tensor_utils import tensor_to_list

  try:
@@ -66,6 +67,9 @@ class SentencePieceTokenizer(tokenizer.Tokenizer):
              for more details on the format.
          sequence_length: If set, the output will be converted to a dense
              tensor and padded/trimmed so all outputs are of `sequence_length`.
+         add_bos: Add beginning of sentence token to the result.
+         add_eos: Add end of sentence token to the result. Token is always
+             truncated if output is longer than specified `sequence_length`.

      References:
          - [Kudo and Richardson, 2018](https://arxiv.org/abs/1808.06226)
@@ -115,6 +119,8 @@ class SentencePieceTokenizer(tokenizer.Tokenizer):
          proto=None,
          sequence_length=None,
          dtype="int32",
+         add_bos=False,
+         add_eos=False,
          **kwargs,
      ) -> None:
          if not is_int_dtype(dtype) and not is_string_dtype(dtype):
@@ -127,6 +133,8 @@ class SentencePieceTokenizer(tokenizer.Tokenizer):

          self.proto = None
          self.sequence_length = sequence_length
+         self.add_bos = add_bos
+         self.add_eos = add_eos
          self.set_proto(proto)
          self.file_assets = [VOCAB_FILENAME]
@@ -171,10 +179,13 @@ class SentencePieceTokenizer(tokenizer.Tokenizer):
          self._sentence_piece = tf_text.SentencepieceTokenizer(
              model=proto_bytes,
              out_type=self.compute_dtype,
+             add_bos=self.add_bos,
+             add_eos=self.add_eos,
          )
          # Keras cannot serialize a bytestring, so we base64 encode the model
          # byte array as a string for saving.
          self.proto = proto_bytes
+         self._update_special_token_ids()

      def vocabulary_size(self):
          """Get the integer size of the tokenizer vocabulary."""
@@ -211,6 +222,8 @@ class SentencePieceTokenizer(tokenizer.Tokenizer):
              {
                  "proto": None,  # Save vocabulary via an asset!
                  "sequence_length": self.sequence_length,
+                 "add_bos": self.add_bos,
+                 "add_eos": self.add_eos,
              }
          )
          return config
@@ -222,12 +235,12 @@ class SentencePieceTokenizer(tokenizer.Tokenizer):
                  "sure to pass a `proto` argument when creating the layer."
              )

+     @preprocessing_function
      def tokenize(self, inputs):
          self._check_vocabulary()
-         if not isinstance(inputs, (tf.Tensor, tf.RaggedTensor)):
-             inputs = tf.convert_to_tensor(inputs)
-         scalar_input = inputs.shape.rank == 0
-         if scalar_input:
+         inputs = tf.convert_to_tensor(inputs)
+         unbatched = inputs.shape.rank == 0
+         if unbatched:
              inputs = tf.expand_dims(inputs, 0)

          if self._sentence_piece is None:
@@ -245,15 +258,15 @@ class SentencePieceTokenizer(tokenizer.Tokenizer):
              tokens = tokens.to_tensor(shape=output_shape)

          # Convert to a dense output if input was a scalar.
-         if scalar_input:
+         if unbatched:
              tokens = tf.squeeze(tokens, 0)
              tf.ensure_shape(tokens, shape=[self.sequence_length])
-
          return tokens

+     @preprocessing_function
      def detokenize(self, inputs):
          self._check_vocabulary()
-         inputs, unbatched, _ = convert_to_ragged_batch(inputs)
+         inputs, unbatched, rectangular = convert_to_ragged_batch(inputs)
          # tf-text sentencepiece does not handle int64.
          inputs = tf.cast(inputs, "int32")
          outputs = self._sentence_piece.detokenize(inputs)
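
Finally, `SentencePieceTokenizer` gains `add_bos`/`add_eos` flags that are passed straight to `tf_text.SentencepieceTokenizer` and serialized via `get_config()`. A small sketch of the new flags; the `proto` path below is a placeholder for any trained SentencePiece model file:

```python
import keras_hub

tokenizer = keras_hub.tokenizers.SentencePieceTokenizer(
    proto="tokenizer.spm",  # placeholder path to a SentencePiece model
    add_bos=True,
    add_eos=True,
)
ids = tokenizer.tokenize("The quick brown fox jumped.")
# `ids` now begins with the BOS id and ends with the EOS id; both new flags
# round-trip through `get_config()` as shown in the hunk above.
config = tokenizer.get_config()
assert config["add_bos"] and config["add_eos"]
```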