keras-hub-nightly 0.15.0.dev20240823171555__py3-none-any.whl → 0.16.0.dev2024092017__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keras_hub/__init__.py +0 -6
- keras_hub/api/__init__.py +2 -0
- keras_hub/api/bounding_box/__init__.py +36 -0
- keras_hub/api/layers/__init__.py +14 -0
- keras_hub/api/models/__init__.py +97 -48
- keras_hub/api/tokenizers/__init__.py +30 -0
- keras_hub/api/utils/__init__.py +22 -0
- keras_hub/src/api_export.py +15 -9
- keras_hub/src/bounding_box/__init__.py +13 -0
- keras_hub/src/bounding_box/converters.py +529 -0
- keras_hub/src/bounding_box/formats.py +162 -0
- keras_hub/src/bounding_box/iou.py +263 -0
- keras_hub/src/bounding_box/to_dense.py +95 -0
- keras_hub/src/bounding_box/to_ragged.py +99 -0
- keras_hub/src/bounding_box/utils.py +194 -0
- keras_hub/src/bounding_box/validate_format.py +99 -0
- keras_hub/src/layers/preprocessing/audio_converter.py +121 -0
- keras_hub/src/layers/preprocessing/image_converter.py +130 -0
- keras_hub/src/layers/preprocessing/masked_lm_mask_generator.py +2 -0
- keras_hub/src/layers/preprocessing/multi_segment_packer.py +9 -8
- keras_hub/src/layers/preprocessing/preprocessing_layer.py +2 -29
- keras_hub/src/layers/preprocessing/random_deletion.py +33 -31
- keras_hub/src/layers/preprocessing/random_swap.py +33 -31
- keras_hub/src/layers/preprocessing/resizing_image_converter.py +101 -0
- keras_hub/src/layers/preprocessing/start_end_packer.py +3 -2
- keras_hub/src/models/albert/__init__.py +1 -2
- keras_hub/src/models/albert/albert_masked_lm_preprocessor.py +6 -86
- keras_hub/src/models/albert/{albert_classifier.py → albert_text_classifier.py} +34 -10
- keras_hub/src/models/albert/{albert_preprocessor.py → albert_text_classifier_preprocessor.py} +14 -70
- keras_hub/src/models/albert/albert_tokenizer.py +17 -36
- keras_hub/src/models/backbone.py +12 -34
- keras_hub/src/models/bart/__init__.py +1 -2
- keras_hub/src/models/bart/bart_seq_2_seq_lm_preprocessor.py +21 -148
- keras_hub/src/models/bart/bart_tokenizer.py +12 -39
- keras_hub/src/models/bert/__init__.py +1 -5
- keras_hub/src/models/bert/bert_masked_lm_preprocessor.py +6 -87
- keras_hub/src/models/bert/bert_presets.py +1 -4
- keras_hub/src/models/bert/{bert_classifier.py → bert_text_classifier.py} +19 -12
- keras_hub/src/models/bert/{bert_preprocessor.py → bert_text_classifier_preprocessor.py} +14 -70
- keras_hub/src/models/bert/bert_tokenizer.py +17 -35
- keras_hub/src/models/bloom/__init__.py +1 -2
- keras_hub/src/models/bloom/bloom_causal_lm_preprocessor.py +6 -91
- keras_hub/src/models/bloom/bloom_tokenizer.py +12 -41
- keras_hub/src/models/causal_lm.py +10 -29
- keras_hub/src/models/causal_lm_preprocessor.py +195 -0
- keras_hub/src/models/csp_darknet/csp_darknet_backbone.py +54 -15
- keras_hub/src/models/deberta_v3/__init__.py +1 -4
- keras_hub/src/models/deberta_v3/deberta_v3_masked_lm_preprocessor.py +14 -77
- keras_hub/src/models/deberta_v3/{deberta_v3_classifier.py → deberta_v3_text_classifier.py} +16 -11
- keras_hub/src/models/deberta_v3/{deberta_v3_preprocessor.py → deberta_v3_text_classifier_preprocessor.py} +23 -64
- keras_hub/src/models/deberta_v3/deberta_v3_tokenizer.py +30 -25
- keras_hub/src/models/densenet/densenet_backbone.py +46 -22
- keras_hub/src/models/distil_bert/__init__.py +1 -4
- keras_hub/src/models/distil_bert/distil_bert_masked_lm_preprocessor.py +14 -76
- keras_hub/src/models/distil_bert/{distil_bert_classifier.py → distil_bert_text_classifier.py} +17 -12
- keras_hub/src/models/distil_bert/{distil_bert_preprocessor.py → distil_bert_text_classifier_preprocessor.py} +23 -63
- keras_hub/src/models/distil_bert/distil_bert_tokenizer.py +19 -35
- keras_hub/src/models/efficientnet/__init__.py +13 -0
- keras_hub/src/models/efficientnet/efficientnet_backbone.py +569 -0
- keras_hub/src/models/efficientnet/fusedmbconv.py +229 -0
- keras_hub/src/models/efficientnet/mbconv.py +238 -0
- keras_hub/src/models/electra/__init__.py +1 -2
- keras_hub/src/models/electra/electra_tokenizer.py +17 -32
- keras_hub/src/models/f_net/__init__.py +1 -2
- keras_hub/src/models/f_net/f_net_masked_lm_preprocessor.py +12 -78
- keras_hub/src/models/f_net/{f_net_classifier.py → f_net_text_classifier.py} +17 -10
- keras_hub/src/models/f_net/{f_net_preprocessor.py → f_net_text_classifier_preprocessor.py} +19 -63
- keras_hub/src/models/f_net/f_net_tokenizer.py +17 -35
- keras_hub/src/models/falcon/__init__.py +1 -2
- keras_hub/src/models/falcon/falcon_causal_lm_preprocessor.py +6 -89
- keras_hub/src/models/falcon/falcon_tokenizer.py +12 -35
- keras_hub/src/models/gemma/__init__.py +1 -2
- keras_hub/src/models/gemma/gemma_causal_lm_preprocessor.py +6 -90
- keras_hub/src/models/gemma/gemma_decoder_block.py +1 -1
- keras_hub/src/models/gemma/gemma_tokenizer.py +12 -23
- keras_hub/src/models/gpt2/__init__.py +1 -2
- keras_hub/src/models/gpt2/gpt2_causal_lm_preprocessor.py +6 -89
- keras_hub/src/models/gpt2/gpt2_preprocessor.py +12 -90
- keras_hub/src/models/gpt2/gpt2_tokenizer.py +12 -34
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm_preprocessor.py +6 -91
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_tokenizer.py +12 -34
- keras_hub/src/models/image_classifier.py +0 -5
- keras_hub/src/models/image_classifier_preprocessor.py +83 -0
- keras_hub/src/models/llama/__init__.py +1 -2
- keras_hub/src/models/llama/llama_causal_lm_preprocessor.py +6 -85
- keras_hub/src/models/llama/llama_tokenizer.py +12 -25
- keras_hub/src/models/llama3/__init__.py +1 -2
- keras_hub/src/models/llama3/llama3_causal_lm_preprocessor.py +6 -89
- keras_hub/src/models/llama3/llama3_tokenizer.py +12 -33
- keras_hub/src/models/masked_lm.py +0 -2
- keras_hub/src/models/masked_lm_preprocessor.py +156 -0
- keras_hub/src/models/mistral/__init__.py +1 -2
- keras_hub/src/models/mistral/mistral_causal_lm_preprocessor.py +6 -91
- keras_hub/src/models/mistral/mistral_tokenizer.py +12 -23
- keras_hub/src/models/mix_transformer/mix_transformer_backbone.py +2 -2
- keras_hub/src/models/mobilenet/__init__.py +13 -0
- keras_hub/src/models/mobilenet/mobilenet_backbone.py +530 -0
- keras_hub/src/models/mobilenet/mobilenet_image_classifier.py +114 -0
- keras_hub/src/models/opt/__init__.py +1 -2
- keras_hub/src/models/opt/opt_causal_lm_preprocessor.py +6 -93
- keras_hub/src/models/opt/opt_tokenizer.py +12 -41
- keras_hub/src/models/pali_gemma/__init__.py +1 -4
- keras_hub/src/models/pali_gemma/pali_gemma_causal_lm_preprocessor.py +28 -28
- keras_hub/src/models/pali_gemma/pali_gemma_image_converter.py +25 -0
- keras_hub/src/models/pali_gemma/pali_gemma_presets.py +5 -5
- keras_hub/src/models/pali_gemma/pali_gemma_tokenizer.py +11 -3
- keras_hub/src/models/phi3/__init__.py +1 -2
- keras_hub/src/models/phi3/phi3_causal_lm.py +3 -9
- keras_hub/src/models/phi3/phi3_causal_lm_preprocessor.py +6 -89
- keras_hub/src/models/phi3/phi3_tokenizer.py +12 -36
- keras_hub/src/models/preprocessor.py +72 -83
- keras_hub/src/models/resnet/__init__.py +6 -0
- keras_hub/src/models/resnet/resnet_backbone.py +390 -42
- keras_hub/src/models/resnet/resnet_image_classifier.py +33 -6
- keras_hub/src/models/resnet/resnet_image_classifier_preprocessor.py +28 -0
- keras_hub/src/models/{llama3/llama3_preprocessor.py → resnet/resnet_image_converter.py} +7 -5
- keras_hub/src/models/resnet/resnet_presets.py +95 -0
- keras_hub/src/models/retinanet/__init__.py +13 -0
- keras_hub/src/models/retinanet/anchor_generator.py +175 -0
- keras_hub/src/models/retinanet/box_matcher.py +259 -0
- keras_hub/src/models/retinanet/non_max_supression.py +578 -0
- keras_hub/src/models/roberta/__init__.py +1 -2
- keras_hub/src/models/roberta/roberta_masked_lm_preprocessor.py +22 -74
- keras_hub/src/models/roberta/{roberta_classifier.py → roberta_text_classifier.py} +16 -11
- keras_hub/src/models/roberta/{roberta_preprocessor.py → roberta_text_classifier_preprocessor.py} +21 -53
- keras_hub/src/models/roberta/roberta_tokenizer.py +13 -52
- keras_hub/src/models/seq_2_seq_lm_preprocessor.py +269 -0
- keras_hub/src/models/stable_diffusion_v3/__init__.py +13 -0
- keras_hub/src/models/stable_diffusion_v3/clip_encoder_block.py +103 -0
- keras_hub/src/models/stable_diffusion_v3/clip_preprocessor.py +93 -0
- keras_hub/src/models/stable_diffusion_v3/clip_text_encoder.py +149 -0
- keras_hub/src/models/stable_diffusion_v3/clip_tokenizer.py +167 -0
- keras_hub/src/models/stable_diffusion_v3/mmdit.py +427 -0
- keras_hub/src/models/stable_diffusion_v3/mmdit_block.py +317 -0
- keras_hub/src/models/stable_diffusion_v3/t5_xxl_preprocessor.py +74 -0
- keras_hub/src/models/stable_diffusion_v3/t5_xxl_text_encoder.py +155 -0
- keras_hub/src/models/stable_diffusion_v3/vae_attention.py +126 -0
- keras_hub/src/models/stable_diffusion_v3/vae_image_decoder.py +186 -0
- keras_hub/src/models/t5/__init__.py +1 -2
- keras_hub/src/models/t5/t5_tokenizer.py +13 -23
- keras_hub/src/models/task.py +71 -116
- keras_hub/src/models/{classifier.py → text_classifier.py} +19 -13
- keras_hub/src/models/text_classifier_preprocessor.py +138 -0
- keras_hub/src/models/whisper/__init__.py +1 -2
- keras_hub/src/models/whisper/{whisper_audio_feature_extractor.py → whisper_audio_converter.py} +20 -18
- keras_hub/src/models/whisper/whisper_backbone.py +0 -3
- keras_hub/src/models/whisper/whisper_presets.py +10 -10
- keras_hub/src/models/whisper/whisper_tokenizer.py +20 -16
- keras_hub/src/models/xlm_roberta/__init__.py +1 -4
- keras_hub/src/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py +26 -72
- keras_hub/src/models/xlm_roberta/{xlm_roberta_classifier.py → xlm_roberta_text_classifier.py} +16 -11
- keras_hub/src/models/xlm_roberta/{xlm_roberta_preprocessor.py → xlm_roberta_text_classifier_preprocessor.py} +26 -53
- keras_hub/src/models/xlm_roberta/xlm_roberta_tokenizer.py +25 -10
- keras_hub/src/tests/test_case.py +46 -0
- keras_hub/src/tokenizers/byte_pair_tokenizer.py +30 -17
- keras_hub/src/tokenizers/byte_tokenizer.py +14 -15
- keras_hub/src/tokenizers/sentence_piece_tokenizer.py +20 -7
- keras_hub/src/tokenizers/tokenizer.py +67 -32
- keras_hub/src/tokenizers/unicode_codepoint_tokenizer.py +14 -15
- keras_hub/src/tokenizers/word_piece_tokenizer.py +34 -47
- keras_hub/src/utils/imagenet/__init__.py +13 -0
- keras_hub/src/utils/imagenet/imagenet_utils.py +1067 -0
- keras_hub/src/utils/keras_utils.py +0 -50
- keras_hub/src/utils/preset_utils.py +230 -68
- keras_hub/src/utils/tensor_utils.py +187 -69
- keras_hub/src/utils/timm/convert_resnet.py +19 -16
- keras_hub/src/utils/timm/preset_loader.py +66 -0
- keras_hub/src/utils/transformers/convert_albert.py +193 -0
- keras_hub/src/utils/transformers/convert_bart.py +373 -0
- keras_hub/src/utils/transformers/convert_bert.py +7 -17
- keras_hub/src/utils/transformers/convert_distilbert.py +10 -20
- keras_hub/src/utils/transformers/convert_gemma.py +5 -19
- keras_hub/src/utils/transformers/convert_gpt2.py +5 -18
- keras_hub/src/utils/transformers/convert_llama3.py +7 -18
- keras_hub/src/utils/transformers/convert_mistral.py +129 -0
- keras_hub/src/utils/transformers/convert_pali_gemma.py +7 -29
- keras_hub/src/utils/transformers/preset_loader.py +77 -0
- keras_hub/src/utils/transformers/safetensor_utils.py +2 -2
- keras_hub/src/version_utils.py +1 -1
- keras_hub_nightly-0.16.0.dev2024092017.dist-info/METADATA +202 -0
- keras_hub_nightly-0.16.0.dev2024092017.dist-info/RECORD +334 -0
- {keras_hub_nightly-0.15.0.dev20240823171555.dist-info → keras_hub_nightly-0.16.0.dev2024092017.dist-info}/WHEEL +1 -1
- keras_hub/src/models/bart/bart_preprocessor.py +0 -276
- keras_hub/src/models/bloom/bloom_preprocessor.py +0 -185
- keras_hub/src/models/electra/electra_preprocessor.py +0 -154
- keras_hub/src/models/falcon/falcon_preprocessor.py +0 -187
- keras_hub/src/models/gemma/gemma_preprocessor.py +0 -191
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_preprocessor.py +0 -145
- keras_hub/src/models/llama/llama_preprocessor.py +0 -189
- keras_hub/src/models/mistral/mistral_preprocessor.py +0 -190
- keras_hub/src/models/opt/opt_preprocessor.py +0 -188
- keras_hub/src/models/phi3/phi3_preprocessor.py +0 -190
- keras_hub/src/models/whisper/whisper_preprocessor.py +0 -326
- keras_hub/src/utils/timm/convert.py +0 -37
- keras_hub/src/utils/transformers/convert.py +0 -101
- keras_hub_nightly-0.15.0.dev20240823171555.dist-info/METADATA +0 -34
- keras_hub_nightly-0.15.0.dev20240823171555.dist-info/RECORD +0 -297
- {keras_hub_nightly-0.15.0.dev20240823171555.dist-info → keras_hub_nightly-0.16.0.dev2024092017.dist-info}/top_level.txt +0 -0
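Two renames run through the whole list above: task `*_classifier` modules become `*_text_classifier`, and the monolithic per-model `*_preprocessor.py` files are deleted in favor of task-scoped preprocessors (`*_causal_lm_preprocessor`, `*_masked_lm_preprocessor`, `*_text_classifier_preprocessor`). A hedged migration sketch, assuming the exported class names track the file renames (the class names below are inferred, not taken from this diff):

import keras_hub

# 0.15.x spelling: keras_hub.models.BertClassifier
# 0.16.x spelling implied by bert_classifier.py -> bert_text_classifier.py:
classifier = keras_hub.models.BertTextClassifier.from_preset(
    "bert_base_en",
    num_classes=2,
)
classifier.predict(["What an amazing movie!"])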
keras_hub/src/tokenizers/tokenizer.py

@@ -19,16 +19,14 @@ from keras_hub.src.layers.preprocessing.preprocessing_layer import (
 )
 from keras_hub.src.utils.preset_utils import TOKENIZER_ASSET_DIR
 from keras_hub.src.utils.preset_utils import TOKENIZER_CONFIG_FILE
-from keras_hub.src.utils.preset_utils import …
-from keras_hub.src.utils.preset_utils import …
+from keras_hub.src.utils.preset_utils import builtin_presets
+from keras_hub.src.utils.preset_utils import find_subclass
 from keras_hub.src.utils.preset_utils import get_file
-from keras_hub.src.utils.preset_utils import …
-from keras_hub.src.utils.preset_utils import list_subclasses
-from keras_hub.src.utils.preset_utils import load_serialized_object
+from keras_hub.src.utils.preset_utils import get_preset_loader
 from keras_hub.src.utils.preset_utils import save_serialized_object
 from keras_hub.src.utils.preset_utils import save_tokenizer_assets
 from keras_hub.src.utils.python_utils import classproperty
-from keras_hub.src.utils.…
+from keras_hub.src.utils.tensor_utils import preprocessing_function


 @keras_hub_export(

@@ -79,6 +77,8 @@ class Tokenizer(PreprocessingLayer):
     ```
     """

+    backbone_cls = None
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.file_assets = None

@@ -138,6 +138,55 @@ class Tokenizer(PreprocessingLayer):
                 f"{self.__class__.__name__}."
             )

+    @property
+    def special_tokens(self):
+        """List all built-in special tokens for the tokenizer."""
+        if not hasattr(self, "_special_token_attrs"):
+            return []
+        tokens = set(getattr(self, a) for a in self._special_token_attrs)
+        return list(tokens)
+
+    @property
+    def special_token_ids(self):
+        """List all built-in special token ids for the tokenizer."""
+        if not hasattr(self, "_special_token_attrs"):
+            return []
+        ids = set(getattr(self, f"{a}_id") for a in self._special_token_attrs)
+        if None in ids:
+            raise ValueError(
+                "Cannot access `special_token_ids` before a vocabulary has "
+                "been set on the tokenizer."
+            )
+        return list(ids)
+
+    def _add_special_token(self, token, name):
+        if not hasattr(self, "_special_token_attrs"):
+            self._special_token_attrs = []
+        self._special_token_attrs.append(name)
+        setattr(self, name, token)
+        try:
+            id = self.token_to_id(token)
+        except (ValueError, AttributeError):
+            id = None
+        setattr(self, f"{name}_id", id)
+
+    def _update_special_token_ids(self):
+        if not hasattr(self, "_special_token_attrs"):
+            return
+        vocabulary = self.get_vocabulary()
+        for attr in set(self._special_token_attrs):
+            token = getattr(self, attr)
+            if token not in vocabulary:
+                classname = self.__class__.__name__
+                raise ValueError(
+                    f"Cannot find special token `'{token}'` in the provided "
+                    f"vocabulary for `{classname}`. Please ensure `'{token}'` "
+                    "is in the provided vocabulary when creating the Tokenizer."
+                )
+        for attr in self._special_token_attrs:
+            token = getattr(self, attr)
+            setattr(self, f"{attr}_id", self.token_to_id(token))
+
     def save_to_preset(self, preset_dir):
         """Save tokenizer to a preset directory.
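The `_add_special_token` / `_update_special_token_ids` pair added above replaces per-subclass special-token plumbing (compare the shrinking `*_tokenizer.py` files in the list). A minimal sketch of the intended usage, with a hypothetical subclass supplying the `token_to_id` and `get_vocabulary` that the hooks rely on:

from keras_hub.src.tokenizers.tokenizer import Tokenizer


class MyTokenizer(Tokenizer):
    """Hypothetical tokenizer with a plain list vocabulary."""

    def __init__(self, vocabulary=None, **kwargs):
        super().__init__(**kwargs)
        self._vocabulary = vocabulary
        # Sets `self.pad_token`, records "pad_token" in
        # `_special_token_attrs`, and resolves `self.pad_token_id`
        # (None if no vocabulary is available yet).
        self._add_special_token("<pad>", "pad_token")

    def get_vocabulary(self):
        return self._vocabulary

    def token_to_id(self, token):
        # `.index` raises ValueError for unknown tokens, which
        # `_add_special_token` catches and maps to a None id.
        return self._vocabulary.index(token)


tok = MyTokenizer(vocabulary=["<pad>", "a", "b"])
tok.special_tokens      # -> ['<pad>']
tok.special_token_ids   # -> [0]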
keras_hub/src/tokenizers/tokenizer.py (continued)

@@ -151,6 +200,7 @@ class Tokenizer(PreprocessingLayer):
         )
         save_tokenizer_assets(self, preset_dir)

+    @preprocessing_function
     def call(self, inputs, *args, training=None, **kwargs):
         return self.tokenize(inputs, *args, **kwargs)

@@ -165,11 +215,8 @@ class Tokenizer(PreprocessingLayer):

     @classproperty
     def presets(cls):
-        """List built-in presets for a `…
-        …
-        for subclass in list_subclasses(cls):
-            presets.update(subclass.presets)
-        return presets
+        """List built-in presets for a `Tokenizer` subclass."""
+        return builtin_presets(cls)

     @classmethod
     def from_preset(

@@ -180,10 +227,10 @@ class Tokenizer(PreprocessingLayer):
         """Instantiate a `keras_hub.models.Tokenizer` from a model preset.

         A preset is a directory of configs, weights and other file assets used
-        to save and load a pre-trained model. The `preset` can be passed as
+        to save and load a pre-trained model. The `preset` can be passed as
         one of:

-        1. a built…
+        1. a built-in preset identifier like `'bert_base_en'`
         2. a Kaggle Models handle like `'kaggle://user/bert/keras/bert_base_en'`
         3. a Hugging Face handle like `'hf://user/bert_base_en'`
         4. a path to a local preset directory like `'./bert_base_en'`

@@ -198,7 +245,7 @@ class Tokenizer(PreprocessingLayer):
             will be inferred from the config in the preset directory.

         Args:
-            preset: string. A built…
+            preset: string. A built-in preset identifier, a Kaggle Models
                 handle, a Hugging Face handle, or a path to a local directory.
             load_weights: bool. If `True`, the weights will be loaded into the
                 model architecture. If `False`, the weights will be randomly

@@ -207,7 +254,7 @@ class Tokenizer(PreprocessingLayer):
         Examples:
         ```python
         # Load a preset tokenizer.
-        tokenizer = keras_hub.…
+        tokenizer = keras_hub.tokenizer.Tokenizer.from_preset("bert_base_en")

         # Tokenize some input.
         tokenizer("The quick brown fox tripped.")

@@ -216,20 +263,8 @@ class Tokenizer(PreprocessingLayer):
         tokenizer.detokenize([5, 6, 7, 8, 9])
         ```
         """
-        …
-        …
-        …
-        …
-        …
-            preset, config_file=TOKENIZER_CONFIG_FILE
-        )
-        if not issubclass(preset_cls, cls):
-            raise ValueError(
-                f"Preset has type `{preset_cls.__name__}` which is not a "
-                f"a subclass of calling class `{cls.__name__}`. Call "
-                f"`from_preset` directly on `{preset_cls.__name__}` instead."
-            )
-
-        tokenizer = load_serialized_object(preset, TOKENIZER_CONFIG_FILE)
-        tokenizer.load_preset_assets(preset)
-        return tokenizer
+        loader = get_preset_loader(preset)
+        backbone_cls = loader.check_backbone_class()
+        if cls.backbone_cls != backbone_cls:
+            cls = find_subclass(preset, cls, backbone_cls)
+        return loader.load_tokenizer(cls, **kwargs)
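Net effect of the `from_preset` rewrite: rather than deserializing `tokenizer.json` and type-checking the result, the method asks a format-specific loader for the tokenizer and, when called on a base class, re-routes to the subclass whose `backbone_cls` matches the preset (see the new `timm/preset_loader.py` and `transformers/preset_loader.py` in the list above). A usage sketch, assuming the `keras_hub.tokenizers.Tokenizer` export path (the updated docstring spells it `keras_hub.tokenizer.Tokenizer`):

import keras_hub

# A base-class call now resolves to the matching subclass (e.g. the BERT
# tokenizer) via the backbone class recorded in the preset.
tokenizer = keras_hub.tokenizers.Tokenizer.from_preset("bert_base_en")
tokenizer("The quick brown fox tripped.")
tokenizer.detokenize([5, 6, 7, 8, 9])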
keras_hub/src/tokenizers/unicode_codepoint_tokenizer.py

@@ -17,6 +17,7 @@ from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.tokenizers import tokenizer
 from keras_hub.src.utils.tensor_utils import convert_to_ragged_batch
 from keras_hub.src.utils.tensor_utils import is_int_dtype
+from keras_hub.src.utils.tensor_utils import preprocessing_function

 try:
     import tensorflow as tf

@@ -94,9 +95,9 @@ class UnicodeCodepointTokenizer(tokenizer.Tokenizer):
     >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer()
     >>> seq1, seq2 = tokenizer(inputs)
     >>> np.array(seq1)
-    array([2346, 2369, 2360, 2381, 2340, 2325]…
+    array([2346, 2369, 2360, 2381, 2340, 2325])
     >>> np.array(seq2)
-    array([1705, 1578, 1575, 1576]…
+    array([1705, 1578, 1575, 1576])

     Dense outputs.
     >>> inputs = ["पुस्तक", "کتاب"]

@@ -179,9 +180,8 @@ class UnicodeCodepointTokenizer(tokenizer.Tokenizer):
     Detokenization.
     >>> inputs = tf.constant([110, 105, 110, 106, 97], dtype="int32")
     >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer()
-    >>> …
-    …
-    array('ninja', dtype='<U5')
+    >>> tokenizer.detokenize(inputs)
+    'ninja'

     Detokenization with padding.
     >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer(

@@ -199,9 +199,8 @@ class UnicodeCodepointTokenizer(tokenizer.Tokenizer):
     >>> inputs = tf.constant([110, 105, 10000000, 110, 106, 97])
     >>> tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer(
     ...     errors="replace", replacement_char=88)
-    >>> …
-    …
-    array('niXnja', dtype='<U6')
+    >>> tokenizer.detokenize(inputs)
+    'niXnja'
     """

     def __init__(

@@ -256,6 +255,7 @@ class UnicodeCodepointTokenizer(tokenizer.Tokenizer):
         self.input_encoding = input_encoding
         self.output_encoding = output_encoding
         self._vocabulary_size = vocabulary_size
+        self._update_special_token_ids()

     def get_config(self):
         config = super().get_config()

@@ -284,12 +284,10 @@ class UnicodeCodepointTokenizer(tokenizer.Tokenizer):
             vocab[chr(i)] = i
         return vocab

+    @preprocessing_function
     def tokenize(self, inputs):
-        …
-        …
-        …
-        scalar_input = inputs.shape.rank == 0
-        if scalar_input:
+        unbatched = inputs.shape.rank == 0
+        if unbatched:
             inputs = tf.expand_dims(inputs, 0)

         # Optionally lowercase the text

@@ -313,7 +311,7 @@ class UnicodeCodepointTokenizer(tokenizer.Tokenizer):
             output_shape[-1] = self.sequence_length
             tokens = tokens.to_tensor(shape=output_shape)

-        if …
+        if unbatched:
             tokens = tf.squeeze(tokens, 0)

         # Optionally clamps the output code point values to be in the

@@ -323,8 +321,9 @@ class UnicodeCodepointTokenizer(tokenizer.Tokenizer):

         return tokens

+    @preprocessing_function
     def detokenize(self, inputs):
-        inputs, unbatched, …
+        inputs, unbatched, rectangular = convert_to_ragged_batch(inputs)
         inputs = tf.ragged.boolean_mask(inputs, tf.not_equal(inputs, 0))
         outputs = tf.strings.unicode_encode(
             inputs,
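Two behavior notes fall out of these hunks: `tokenize` and `detokenize` are now wrapped in the `preprocessing_function` decorator, and the updated doctests show `detokenize` returning a plain Python string for unbatched input where it previously showed a NumPy array repr. A round trip matching the doctests:

import keras_hub

tokenizer = keras_hub.tokenizers.UnicodeCodepointTokenizer()
# Unbatched input is expanded to a batch internally, then squeezed back.
tokens = tokenizer.tokenize("ninja")
# Per the updated doctest this returns 'ninja' (a str, not an array repr).
tokenizer.detokenize(tokens)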
keras_hub/src/tokenizers/word_piece_tokenizer.py

@@ -23,6 +23,7 @@ from keras_hub.src.tokenizers import tokenizer
 from keras_hub.src.utils.tensor_utils import convert_to_ragged_batch
 from keras_hub.src.utils.tensor_utils import is_int_dtype
 from keras_hub.src.utils.tensor_utils import is_string_dtype
+from keras_hub.src.utils.tensor_utils import preprocessing_function

 try:
     import tensorflow as tf

@@ -166,7 +167,7 @@ def pretokenize(
     if special_tokens_pattern is not None:
         # the idea here is to pass the special tokens regex to the split
         # function as delimiter regex pattern, so the input will be splitted
-        # by them, but also the function will treat each …
+        # by them, but also the function will treat each one of them as one
         # entity that shouldn't be splitted even if they have other
         # delimiter regex pattern inside them. then pass the special tokens
         # regex also as keep delimiter regex pattern, so they will

@@ -263,12 +264,6 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
         oov_token: str. The string value to substitute for
             an unknown token. It must be included in the vocab.
             Defaults to `"[UNK]"`.
-        special_tokens: list. A list of special tokens. when
-            `special_tokens_in_strings` is set to `True`, the tokenizer will map
-            every special token in the input strings to its id, even if these
-            special tokens contain characters that should be splitted before
-            tokenization such as punctuation. `special_tokens` must be included
-            in `vocabulary`.
         special_tokens_in_strings: bool. A bool to indicate if the tokenizer
             should expect special tokens in input strings that should be
             tokenized and mapped correctly to their ids. Defaults to False.

@@ -310,9 +305,8 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
     ...     lowercase=True,
     ...     dtype="string",
     ... )
-    >>> …
-    …
-    array(['the', 'qu', '##ick', 'br', '##own', 'fox', '.'], dtype='<U5')
+    >>> tokenizer(inputs)
+    ['the', 'qu', '##ick', 'br', '##own', 'fox', '.']

     Detokenization.
     >>> vocab = ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox", "."]

@@ -321,9 +315,8 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
     ...     vocabulary=vocab,
     ...     lowercase=True,
     ... )
-    >>> …
-    …
-    array('the quick brown fox .', dtype='<U21')
+    >>> tokenizer.detokenize(tokenizer.tokenize(inputs))
+    'the quick brown fox .'

     Custom splitting.
     >>> vocab = ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox", "."]

@@ -335,9 +328,8 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
     ...     dtype='string',
     ... )
     >>> split_inputs = tf.strings.split(inputs, sep="$")
-    >>> …
-    …
-    array(['the', 'qu', '##ick', 'br', '##own', 'fox'], dtype='<U5')
+    >>> tokenizer(split_inputs)
+    ['the', 'qu', '##ick', 'br', '##own', 'fox']
     """

     def __init__(

@@ -372,19 +364,9 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
         self.split_on_cjk = split_on_cjk
         self.suffix_indicator = suffix_indicator
         self.oov_token = oov_token
-        self.…
-        self.…
-
-        # the idea here is to pass the special tokens regex to the
-        # split function as delimiter regex pattern, so the input will
-        # be splitted by them, but also the function will treat each on
-        # of them as one entity that shouldn't be splitted even if they
-        # have other delimiter regex pattern inside them. then pass the
-        # special tokens regex also as keep delimiter regex
-        # pattern, so they will not be removed.
-        self._special_tokens_pattern = get_special_tokens_pattern(
-            self.special_tokens
-        )
+        self._init_special_tokens = special_tokens
+        self.special_tokens_in_strings = special_tokens_in_strings
+
         self.set_vocabulary(vocabulary)
         self.file_assets = [VOCAB_FILENAME]

@@ -426,16 +408,6 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
                 "the `oov_token` argument when creating the tokenizer."
             )

-        # Check for special tokens in the vocabulary
-        if self.special_tokens is not None:
-            for token in self.special_tokens:
-                if token not in self.vocabulary:
-                    raise ValueError(
-                        f"Cannot find token `'{token}'` in the provided "
-                        f"`vocabulary`. Please provide `'{token}'` in your "
-                        "`vocabulary` or use a pretrained `vocabulary` name."
-                    )
-
         self._fast_word_piece = tf_text.FastWordpieceTokenizer(
             vocab=self.vocabulary,
             token_out_type=self.compute_dtype,

@@ -444,6 +416,7 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
             no_pretokenization=True,
             support_detokenization=True,
         )
+        self._update_special_token_ids()

     def get_vocabulary(self):
         """Get the tokenizer vocabulary as a list of strings tokens."""

@@ -484,7 +457,8 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
                 "split": self.split,
                 "suffix_indicator": self.suffix_indicator,
                 "oov_token": self.oov_token,
-                "special_tokens": self.…
+                "special_tokens": self._init_special_tokens,
+                "special_tokens_in_strings": self.special_tokens_in_strings,
             }
         )
         return config

@@ -496,19 +470,31 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
                 "to pass a `vocabulary` argument when creating the layer."
             )

+    @preprocessing_function
     def tokenize(self, inputs):
         self._check_vocabulary()
-        …
-        …
-        …
-        …
+        inputs = tf.convert_to_tensor(inputs)
+        unbatched = inputs.shape.rank == 0
+        pattern = None
+        if self.split and self.special_tokens_in_strings:
+            # the idea here is to pass the special tokens regex to the
+            # split function as delimiter regex pattern, so the input will
+            # be splitted by them, but also the function will treat each one
+            # of them as one entity that shouldn't be splitted even if they
+            # have other delimiter regex pattern inside them. then pass the
+            # special tokens regex also as keep delimiter regex
+            # pattern, so they will not be removed.
+            special_tokens = self.special_tokens
+            if self._init_special_tokens:
+                special_tokens += self._init_special_tokens
+            pattern = get_special_tokens_pattern(special_tokens)
         inputs = pretokenize(
             inputs,
             self.lowercase,
             self.strip_accents,
             self.split,
             self.split_on_cjk,
-            …
+            pattern,
         )

         # Apply WordPiece and coerce shape for outputs.

@@ -524,15 +510,16 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
             output_shape[-1] = self.sequence_length
             tokens = tokens.to_tensor(shape=output_shape)
         # Convert to a dense output if input in scalar
-        if …
+        if unbatched:
             tokens = tf.squeeze(tokens, 0)
             tf.ensure_shape(tokens, shape=[self.sequence_length])

         return tokens

+    @preprocessing_function
     def detokenize(self, inputs):
         self._check_vocabulary()
-        inputs, unbatched, …
+        inputs, unbatched, rectangular = convert_to_ragged_batch(inputs)
         outputs = self._fast_word_piece.detokenize(inputs)
         if unbatched:
             outputs = tf.squeeze(outputs, 0)
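Because the special-token split pattern is now assembled inside `tokenize()`, it can merge subclass-registered `special_tokens` (the new base-class property) with any `special_tokens` passed at construction, kept as `_init_special_tokens`. A sketch reusing the doctest vocabulary; note `special_tokens` remains a constructor argument even though its docstring entry was dropped (it is still serialized in `get_config`):

import keras_hub

vocab = ["[UNK]", "[CLS]", "the", "qu", "##ick", "br", "##own", "fox", "."]
tokenizer = keras_hub.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,
    lowercase=True,
    special_tokens=["[CLS]"],
    special_tokens_in_strings=True,
    dtype="string",
)
# "[CLS]" is kept as a single unit instead of being split on punctuation.
tokenizer("[CLS] the quick brown fox.")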
New 13-line file (an Apache license header; several `__init__.py` additions with exactly this shape appear in the list above):

@@ -0,0 +1,13 @@
+# Copyright 2024 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.