keras-hub-nightly 0.15.0.dev20240823171555__py3-none-any.whl → 0.16.0.dev2024092017__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keras_hub/__init__.py +0 -6
- keras_hub/api/__init__.py +2 -0
- keras_hub/api/bounding_box/__init__.py +36 -0
- keras_hub/api/layers/__init__.py +14 -0
- keras_hub/api/models/__init__.py +97 -48
- keras_hub/api/tokenizers/__init__.py +30 -0
- keras_hub/api/utils/__init__.py +22 -0
- keras_hub/src/api_export.py +15 -9
- keras_hub/src/bounding_box/__init__.py +13 -0
- keras_hub/src/bounding_box/converters.py +529 -0
- keras_hub/src/bounding_box/formats.py +162 -0
- keras_hub/src/bounding_box/iou.py +263 -0
- keras_hub/src/bounding_box/to_dense.py +95 -0
- keras_hub/src/bounding_box/to_ragged.py +99 -0
- keras_hub/src/bounding_box/utils.py +194 -0
- keras_hub/src/bounding_box/validate_format.py +99 -0
- keras_hub/src/layers/preprocessing/audio_converter.py +121 -0
- keras_hub/src/layers/preprocessing/image_converter.py +130 -0
- keras_hub/src/layers/preprocessing/masked_lm_mask_generator.py +2 -0
- keras_hub/src/layers/preprocessing/multi_segment_packer.py +9 -8
- keras_hub/src/layers/preprocessing/preprocessing_layer.py +2 -29
- keras_hub/src/layers/preprocessing/random_deletion.py +33 -31
- keras_hub/src/layers/preprocessing/random_swap.py +33 -31
- keras_hub/src/layers/preprocessing/resizing_image_converter.py +101 -0
- keras_hub/src/layers/preprocessing/start_end_packer.py +3 -2
- keras_hub/src/models/albert/__init__.py +1 -2
- keras_hub/src/models/albert/albert_masked_lm_preprocessor.py +6 -86
- keras_hub/src/models/albert/{albert_classifier.py → albert_text_classifier.py} +34 -10
- keras_hub/src/models/albert/{albert_preprocessor.py → albert_text_classifier_preprocessor.py} +14 -70
- keras_hub/src/models/albert/albert_tokenizer.py +17 -36
- keras_hub/src/models/backbone.py +12 -34
- keras_hub/src/models/bart/__init__.py +1 -2
- keras_hub/src/models/bart/bart_seq_2_seq_lm_preprocessor.py +21 -148
- keras_hub/src/models/bart/bart_tokenizer.py +12 -39
- keras_hub/src/models/bert/__init__.py +1 -5
- keras_hub/src/models/bert/bert_masked_lm_preprocessor.py +6 -87
- keras_hub/src/models/bert/bert_presets.py +1 -4
- keras_hub/src/models/bert/{bert_classifier.py → bert_text_classifier.py} +19 -12
- keras_hub/src/models/bert/{bert_preprocessor.py → bert_text_classifier_preprocessor.py} +14 -70
- keras_hub/src/models/bert/bert_tokenizer.py +17 -35
- keras_hub/src/models/bloom/__init__.py +1 -2
- keras_hub/src/models/bloom/bloom_causal_lm_preprocessor.py +6 -91
- keras_hub/src/models/bloom/bloom_tokenizer.py +12 -41
- keras_hub/src/models/causal_lm.py +10 -29
- keras_hub/src/models/causal_lm_preprocessor.py +195 -0
- keras_hub/src/models/csp_darknet/csp_darknet_backbone.py +54 -15
- keras_hub/src/models/deberta_v3/__init__.py +1 -4
- keras_hub/src/models/deberta_v3/deberta_v3_masked_lm_preprocessor.py +14 -77
- keras_hub/src/models/deberta_v3/{deberta_v3_classifier.py → deberta_v3_text_classifier.py} +16 -11
- keras_hub/src/models/deberta_v3/{deberta_v3_preprocessor.py → deberta_v3_text_classifier_preprocessor.py} +23 -64
- keras_hub/src/models/deberta_v3/deberta_v3_tokenizer.py +30 -25
- keras_hub/src/models/densenet/densenet_backbone.py +46 -22
- keras_hub/src/models/distil_bert/__init__.py +1 -4
- keras_hub/src/models/distil_bert/distil_bert_masked_lm_preprocessor.py +14 -76
- keras_hub/src/models/distil_bert/{distil_bert_classifier.py → distil_bert_text_classifier.py} +17 -12
- keras_hub/src/models/distil_bert/{distil_bert_preprocessor.py → distil_bert_text_classifier_preprocessor.py} +23 -63
- keras_hub/src/models/distil_bert/distil_bert_tokenizer.py +19 -35
- keras_hub/src/models/efficientnet/__init__.py +13 -0
- keras_hub/src/models/efficientnet/efficientnet_backbone.py +569 -0
- keras_hub/src/models/efficientnet/fusedmbconv.py +229 -0
- keras_hub/src/models/efficientnet/mbconv.py +238 -0
- keras_hub/src/models/electra/__init__.py +1 -2
- keras_hub/src/models/electra/electra_tokenizer.py +17 -32
- keras_hub/src/models/f_net/__init__.py +1 -2
- keras_hub/src/models/f_net/f_net_masked_lm_preprocessor.py +12 -78
- keras_hub/src/models/f_net/{f_net_classifier.py → f_net_text_classifier.py} +17 -10
- keras_hub/src/models/f_net/{f_net_preprocessor.py → f_net_text_classifier_preprocessor.py} +19 -63
- keras_hub/src/models/f_net/f_net_tokenizer.py +17 -35
- keras_hub/src/models/falcon/__init__.py +1 -2
- keras_hub/src/models/falcon/falcon_causal_lm_preprocessor.py +6 -89
- keras_hub/src/models/falcon/falcon_tokenizer.py +12 -35
- keras_hub/src/models/gemma/__init__.py +1 -2
- keras_hub/src/models/gemma/gemma_causal_lm_preprocessor.py +6 -90
- keras_hub/src/models/gemma/gemma_decoder_block.py +1 -1
- keras_hub/src/models/gemma/gemma_tokenizer.py +12 -23
- keras_hub/src/models/gpt2/__init__.py +1 -2
- keras_hub/src/models/gpt2/gpt2_causal_lm_preprocessor.py +6 -89
- keras_hub/src/models/gpt2/gpt2_preprocessor.py +12 -90
- keras_hub/src/models/gpt2/gpt2_tokenizer.py +12 -34
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm_preprocessor.py +6 -91
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_tokenizer.py +12 -34
- keras_hub/src/models/image_classifier.py +0 -5
- keras_hub/src/models/image_classifier_preprocessor.py +83 -0
- keras_hub/src/models/llama/__init__.py +1 -2
- keras_hub/src/models/llama/llama_causal_lm_preprocessor.py +6 -85
- keras_hub/src/models/llama/llama_tokenizer.py +12 -25
- keras_hub/src/models/llama3/__init__.py +1 -2
- keras_hub/src/models/llama3/llama3_causal_lm_preprocessor.py +6 -89
- keras_hub/src/models/llama3/llama3_tokenizer.py +12 -33
- keras_hub/src/models/masked_lm.py +0 -2
- keras_hub/src/models/masked_lm_preprocessor.py +156 -0
- keras_hub/src/models/mistral/__init__.py +1 -2
- keras_hub/src/models/mistral/mistral_causal_lm_preprocessor.py +6 -91
- keras_hub/src/models/mistral/mistral_tokenizer.py +12 -23
- keras_hub/src/models/mix_transformer/mix_transformer_backbone.py +2 -2
- keras_hub/src/models/mobilenet/__init__.py +13 -0
- keras_hub/src/models/mobilenet/mobilenet_backbone.py +530 -0
- keras_hub/src/models/mobilenet/mobilenet_image_classifier.py +114 -0
- keras_hub/src/models/opt/__init__.py +1 -2
- keras_hub/src/models/opt/opt_causal_lm_preprocessor.py +6 -93
- keras_hub/src/models/opt/opt_tokenizer.py +12 -41
- keras_hub/src/models/pali_gemma/__init__.py +1 -4
- keras_hub/src/models/pali_gemma/pali_gemma_causal_lm_preprocessor.py +28 -28
- keras_hub/src/models/pali_gemma/pali_gemma_image_converter.py +25 -0
- keras_hub/src/models/pali_gemma/pali_gemma_presets.py +5 -5
- keras_hub/src/models/pali_gemma/pali_gemma_tokenizer.py +11 -3
- keras_hub/src/models/phi3/__init__.py +1 -2
- keras_hub/src/models/phi3/phi3_causal_lm.py +3 -9
- keras_hub/src/models/phi3/phi3_causal_lm_preprocessor.py +6 -89
- keras_hub/src/models/phi3/phi3_tokenizer.py +12 -36
- keras_hub/src/models/preprocessor.py +72 -83
- keras_hub/src/models/resnet/__init__.py +6 -0
- keras_hub/src/models/resnet/resnet_backbone.py +390 -42
- keras_hub/src/models/resnet/resnet_image_classifier.py +33 -6
- keras_hub/src/models/resnet/resnet_image_classifier_preprocessor.py +28 -0
- keras_hub/src/models/{llama3/llama3_preprocessor.py → resnet/resnet_image_converter.py} +7 -5
- keras_hub/src/models/resnet/resnet_presets.py +95 -0
- keras_hub/src/models/retinanet/__init__.py +13 -0
- keras_hub/src/models/retinanet/anchor_generator.py +175 -0
- keras_hub/src/models/retinanet/box_matcher.py +259 -0
- keras_hub/src/models/retinanet/non_max_supression.py +578 -0
- keras_hub/src/models/roberta/__init__.py +1 -2
- keras_hub/src/models/roberta/roberta_masked_lm_preprocessor.py +22 -74
- keras_hub/src/models/roberta/{roberta_classifier.py → roberta_text_classifier.py} +16 -11
- keras_hub/src/models/roberta/{roberta_preprocessor.py → roberta_text_classifier_preprocessor.py} +21 -53
- keras_hub/src/models/roberta/roberta_tokenizer.py +13 -52
- keras_hub/src/models/seq_2_seq_lm_preprocessor.py +269 -0
- keras_hub/src/models/stable_diffusion_v3/__init__.py +13 -0
- keras_hub/src/models/stable_diffusion_v3/clip_encoder_block.py +103 -0
- keras_hub/src/models/stable_diffusion_v3/clip_preprocessor.py +93 -0
- keras_hub/src/models/stable_diffusion_v3/clip_text_encoder.py +149 -0
- keras_hub/src/models/stable_diffusion_v3/clip_tokenizer.py +167 -0
- keras_hub/src/models/stable_diffusion_v3/mmdit.py +427 -0
- keras_hub/src/models/stable_diffusion_v3/mmdit_block.py +317 -0
- keras_hub/src/models/stable_diffusion_v3/t5_xxl_preprocessor.py +74 -0
- keras_hub/src/models/stable_diffusion_v3/t5_xxl_text_encoder.py +155 -0
- keras_hub/src/models/stable_diffusion_v3/vae_attention.py +126 -0
- keras_hub/src/models/stable_diffusion_v3/vae_image_decoder.py +186 -0
- keras_hub/src/models/t5/__init__.py +1 -2
- keras_hub/src/models/t5/t5_tokenizer.py +13 -23
- keras_hub/src/models/task.py +71 -116
- keras_hub/src/models/{classifier.py → text_classifier.py} +19 -13
- keras_hub/src/models/text_classifier_preprocessor.py +138 -0
- keras_hub/src/models/whisper/__init__.py +1 -2
- keras_hub/src/models/whisper/{whisper_audio_feature_extractor.py → whisper_audio_converter.py} +20 -18
- keras_hub/src/models/whisper/whisper_backbone.py +0 -3
- keras_hub/src/models/whisper/whisper_presets.py +10 -10
- keras_hub/src/models/whisper/whisper_tokenizer.py +20 -16
- keras_hub/src/models/xlm_roberta/__init__.py +1 -4
- keras_hub/src/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py +26 -72
- keras_hub/src/models/xlm_roberta/{xlm_roberta_classifier.py → xlm_roberta_text_classifier.py} +16 -11
- keras_hub/src/models/xlm_roberta/{xlm_roberta_preprocessor.py → xlm_roberta_text_classifier_preprocessor.py} +26 -53
- keras_hub/src/models/xlm_roberta/xlm_roberta_tokenizer.py +25 -10
- keras_hub/src/tests/test_case.py +46 -0
- keras_hub/src/tokenizers/byte_pair_tokenizer.py +30 -17
- keras_hub/src/tokenizers/byte_tokenizer.py +14 -15
- keras_hub/src/tokenizers/sentence_piece_tokenizer.py +20 -7
- keras_hub/src/tokenizers/tokenizer.py +67 -32
- keras_hub/src/tokenizers/unicode_codepoint_tokenizer.py +14 -15
- keras_hub/src/tokenizers/word_piece_tokenizer.py +34 -47
- keras_hub/src/utils/imagenet/__init__.py +13 -0
- keras_hub/src/utils/imagenet/imagenet_utils.py +1067 -0
- keras_hub/src/utils/keras_utils.py +0 -50
- keras_hub/src/utils/preset_utils.py +230 -68
- keras_hub/src/utils/tensor_utils.py +187 -69
- keras_hub/src/utils/timm/convert_resnet.py +19 -16
- keras_hub/src/utils/timm/preset_loader.py +66 -0
- keras_hub/src/utils/transformers/convert_albert.py +193 -0
- keras_hub/src/utils/transformers/convert_bart.py +373 -0
- keras_hub/src/utils/transformers/convert_bert.py +7 -17
- keras_hub/src/utils/transformers/convert_distilbert.py +10 -20
- keras_hub/src/utils/transformers/convert_gemma.py +5 -19
- keras_hub/src/utils/transformers/convert_gpt2.py +5 -18
- keras_hub/src/utils/transformers/convert_llama3.py +7 -18
- keras_hub/src/utils/transformers/convert_mistral.py +129 -0
- keras_hub/src/utils/transformers/convert_pali_gemma.py +7 -29
- keras_hub/src/utils/transformers/preset_loader.py +77 -0
- keras_hub/src/utils/transformers/safetensor_utils.py +2 -2
- keras_hub/src/version_utils.py +1 -1
- keras_hub_nightly-0.16.0.dev2024092017.dist-info/METADATA +202 -0
- keras_hub_nightly-0.16.0.dev2024092017.dist-info/RECORD +334 -0
- {keras_hub_nightly-0.15.0.dev20240823171555.dist-info → keras_hub_nightly-0.16.0.dev2024092017.dist-info}/WHEEL +1 -1
- keras_hub/src/models/bart/bart_preprocessor.py +0 -276
- keras_hub/src/models/bloom/bloom_preprocessor.py +0 -185
- keras_hub/src/models/electra/electra_preprocessor.py +0 -154
- keras_hub/src/models/falcon/falcon_preprocessor.py +0 -187
- keras_hub/src/models/gemma/gemma_preprocessor.py +0 -191
- keras_hub/src/models/gpt_neo_x/gpt_neo_x_preprocessor.py +0 -145
- keras_hub/src/models/llama/llama_preprocessor.py +0 -189
- keras_hub/src/models/mistral/mistral_preprocessor.py +0 -190
- keras_hub/src/models/opt/opt_preprocessor.py +0 -188
- keras_hub/src/models/phi3/phi3_preprocessor.py +0 -190
- keras_hub/src/models/whisper/whisper_preprocessor.py +0 -326
- keras_hub/src/utils/timm/convert.py +0 -37
- keras_hub/src/utils/transformers/convert.py +0 -101
- keras_hub_nightly-0.15.0.dev20240823171555.dist-info/METADATA +0 -34
- keras_hub_nightly-0.15.0.dev20240823171555.dist-info/RECORD +0 -297
- {keras_hub_nightly-0.15.0.dev20240823171555.dist-info → keras_hub_nightly-0.16.0.dev2024092017.dist-info}/top_level.txt +0 -0
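The renames in the file list above follow one pattern: task classes named `*Classifier` become `*TextClassifier`, their preprocessors become `*TextClassifierPreprocessor`, and the standalone per-model `*_preprocessor.py` modules are removed in favor of task-specific preprocessors. A minimal sketch of what that looks like in user code, assuming the new class names are exported under `keras_hub.models` the same way the old ones were (the BERT class and preset name here are illustrative, not taken from this diff):

```python
import keras_hub

# 0.15.x-style name (module was bert_classifier.py):
# classifier = keras_hub.models.BertClassifier.from_preset("bert_base_en")

# 0.16.x-style name, matching the renamed module bert_text_classifier.py:
classifier = keras_hub.models.BertTextClassifier.from_preset("bert_base_en")
classifier.predict(["What an amazing movie!"])
```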
keras_hub/src/models/xlm_roberta/{xlm_roberta_preprocessor.py → xlm_roberta_text_classifier_preprocessor.py}
RENAMED
@@ -19,17 +19,25 @@ from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.layers.preprocessing.multi_segment_packer import (
     MultiSegmentPacker,
 )
-from keras_hub.src.models.…
+from keras_hub.src.models.text_classifier_preprocessor import (
+    TextClassifierPreprocessor,
+)
+from keras_hub.src.models.xlm_roberta.xlm_roberta_backbone import (
+    XLMRobertaBackbone,
+)
 from keras_hub.src.models.xlm_roberta.xlm_roberta_tokenizer import (
     XLMRobertaTokenizer,
 )
-from keras_hub.src.utils.…
-    convert_inputs_to_list_of_tensor_segments,
-)
+from keras_hub.src.utils.tensor_utils import preprocessing_function


-@keras_hub_export(…
-class XLMRobertaPreprocessor(Preprocessor):
+@keras_hub_export(
+    [
+        "keras_hub.models.XLMRobertaTextClassifierPreprocessor",
+        "keras_hub.models.XLMRobertaPreprocessor",
+    ]
+)
+class XLMRobertaTextClassifierPreprocessor(TextClassifierPreprocessor):
     """An XLM-RoBERTa preprocessing layer which tokenizes and packs inputs.

     This preprocessing layer will do three things:
@@ -73,7 +81,7 @@ class XLMRobertaPreprocessor(Preprocessor):

     Directly calling the layer on data.
     ```python
-    preprocessor = keras_hub.models.…
+    preprocessor = keras_hub.models.TextClassifierPreprocessor.from_preset(
         "xlm_roberta_base_multi"
     )

@@ -107,13 +115,15 @@ class XLMRobertaPreprocessor(Preprocessor):
     )
     proto = train_sentencepiece(ds, vocab_size=10)
     tokenizer = keras_hub.models.XLMRobertaTokenizer(proto=proto)
-    preprocessor = keras_hub.models.…
+    preprocessor = keras_hub.models.XLMRobertaTextClassifierPreprocessor(
+        tokenizer
+    )
     preprocessor("The quick brown fox jumped.")
     ```

     Mapping with `tf.data.Dataset`.
     ```python
-    preprocessor = keras_hub.models.…
+    preprocessor = keras_hub.models.TextClassifierPreprocessor.from_preset(
         "xlm_roberta_base_multi"
     )

@@ -144,25 +154,11 @@ class XLMRobertaPreprocessor(Preprocessor):
     ```
     """

+    backbone_cls = XLMRobertaBackbone
     tokenizer_cls = XLMRobertaTokenizer

-    def __init__(
-        self,
-        tokenizer,
-        sequence_length=512,
-        truncate="round_robin",
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-
-        self.tokenizer = tokenizer
-        self.packer = None
-        self.truncate = truncate
-        self.sequence_length = sequence_length
-
     def build(self, input_shape):
-        # …
-        # assets have loaded when restoring a saved model.
+        # Roberta is doubles up the sep token, so we override build.
         self.packer = MultiSegmentPacker(
             start_value=self.tokenizer.start_token_id,
             end_value=self.tokenizer.end_token_id,
@@ -173,33 +169,10 @@ class XLMRobertaPreprocessor(Preprocessor):
         )
         self.built = True

-    def get_config(self):
-        config = super().get_config()
-        config.update(
-            {
-                "sequence_length": self.sequence_length,
-                "truncate": self.truncate,
-            }
-        )
-        return config
-
+    @preprocessing_function
     def call(self, x, y=None, sample_weight=None):
-        …
-        x = …
-        …
-        x = {
-            "token_ids": token_ids,
-            "padding_mask": token_ids != self.tokenizer.pad_token_id,
-        }
+        output = super().call(x, y=y, sample_weight=sample_weight)
+        x, y, sample_weight = keras.utils.unpack_x_y_sample_weight(output)
+        # Backbone has no segment ID input.
+        del x["segment_ids"]
         return keras.utils.pack_x_y_sample_weight(x, y, sample_weight)
-
-    @property
-    def sequence_length(self):
-        """The padded length of model input sequences."""
-        return self._sequence_length
-
-    @sequence_length.setter
-    def sequence_length(self, value):
-        self._sequence_length = value
-        if self.packer is not None:
-            self.packer.sequence_length = value
keras_hub/src/models/xlm_roberta/xlm_roberta_tokenizer.py
CHANGED
@@ -14,6 +14,9 @@


 from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.models.xlm_roberta.xlm_roberta_backbone import (
+    XLMRobertaBackbone,
+)
 from keras_hub.src.tokenizers.sentence_piece_tokenizer import (
     SentencePieceTokenizer,
 )
@@ -25,7 +28,12 @@ except ImportError:
     tf = None


-@keras_hub_export(…
+@keras_hub_export(
+    [
+        "keras_hub.tokenizers.XLMRobertaTokenizer",
+        "keras_hub.models.XLMRobertaTokenizer",
+    ]
+)
 class XLMRobertaTokenizer(SentencePieceTokenizer):
     """An XLM-RoBERTa tokenizer using SentencePiece subword segmentation.

@@ -89,17 +97,24 @@ class XLMRobertaTokenizer(SentencePieceTokenizer):
     ```
     """

+    backbone_cls = XLMRobertaBackbone
+
     def __init__(self, proto, **kwargs):
-        # …
-        …
+        # Handle special tokens manually, as the tokenizer maps these tokens in
+        # a way that is not reflected in the vocabulary.
+        self.start_token, self.start_token_id = "<s>", 0
+        self.pad_token, self.pad_token_id = "<pad>", 1
+        self.end_token, self.end_token_id = "</s>", 2
+        self.unk_token, self.unk_token_id = "<unk>", 3
+        super().__init__(proto=proto, **kwargs)

-        …
-        …
-        …
-        self.end_token_id = 2  # </s>
-        self.unk_token_id = 3  # <unk>
+    @property
+    def special_tokens(self):
+        return ["<s>", "<pad>", "</s>", "<unk>"]

-        …
+    @property
+    def special_token_ids(self):
+        return [0, 1, 2, 3]

     def set_proto(self, proto):
         super().set_proto(proto)
@@ -162,7 +177,7 @@ class XLMRobertaTokenizer(SentencePieceTokenizer):

         # Correct `unk_token_id` (0 -> 3). Note that we do not correct
         # `start_token_id` and `end_token_id`; they are dealt with in
-        # `…`.
+        # `XLMRobertaTextClassifierPreprocessor`.
         tokens = tf.where(tf.equal(tokens, 0), self.unk_token_id - 1, tokens)

         # Shift the tokens IDs right by one.
keras_hub/src/tests/test_case.py
CHANGED
@@ -18,6 +18,7 @@ import pathlib
 import re

 import keras
+import numpy as np
 import tensorflow as tf
 from absl.testing import parameterized
 from keras import ops
@@ -465,6 +466,8 @@ class TestCase(tf.test.TestCase, parameterized.TestCase):
         init_kwargs,
         input_data,
         expected_output_shape,
+        expected_pyramid_output_keys=None,
+        expected_pyramid_image_sizes=None,
         variable_length_data=None,
         run_mixed_precision_check=True,
         run_quantization_check=True,
@@ -492,6 +495,26 @@ class TestCase(tf.test.TestCase, parameterized.TestCase):
             run_quantization_check=run_quantization_check,
         )

+        if expected_pyramid_output_keys:
+            backbone = cls(**init_kwargs)
+            model = keras.models.Model(
+                backbone.inputs, backbone.pyramid_outputs
+            )
+            output_data = model(input_data)
+
+            self.assertIsInstance(output_data, dict)
+            self.assertEqual(
+                list(output_data.keys()), list(backbone.pyramid_outputs.keys())
+            )
+            self.assertEqual(
+                list(output_data.keys()), expected_pyramid_output_keys
+            )
+            # check height and width of each level.
+            for i, (k, v) in enumerate(output_data.items()):
+                self.assertEqual(
+                    tuple(v.shape[1:3]), expected_pyramid_image_sizes[i]
+                )
+
         # Check data_format. We assume that `input_data` is in "channels_last"
         # format.
         if run_data_format_check and can_run_data_format_check:
@@ -501,6 +524,12 @@ class TestCase(tf.test.TestCase, parameterized.TestCase):
                 input_data = ops.transpose(input_data, axes=(2, 0, 1))
             elif len(input_data_shape) == 4:
                 input_data = ops.transpose(input_data, axes=(0, 3, 1, 2))
+            if len(expected_output_shape) == 3:
+                x = expected_output_shape
+                expected_output_shape = (x[0], x[2], x[1])
+            elif len(expected_output_shape) == 4:
+                x = expected_output_shape
+                expected_output_shape = (x[0], x[3], x[1], x[2])
             if "image_shape" in init_kwargs:
                 init_kwargs = init_kwargs.copy()
                 init_kwargs["image_shape"] = tuple(
@@ -557,6 +586,10 @@ class TestCase(tf.test.TestCase, parameterized.TestCase):
         task.preprocessor = None
         task.fit(ds.map(preprocessor))
         task.preprocessor = preprocessor
+        # Turn off default compilation, should error during `fit()`.
+        task = cls(**init_kwargs, compile=False)
+        with self.assertRaisesRegex(ValueError, "You must call `compile"):
+            task.fit(ds)

     def run_preset_test(
         self,
@@ -567,6 +600,7 @@ class TestCase(tf.test.TestCase, parameterized.TestCase):
         expected_output=None,
         expected_output_shape=None,
         expected_partial_output=None,
+        expected_labels=None,
     ):
         """Run instantiation and a forward pass for a preset."""
         with self.assertRaises(Exception):
@@ -604,5 +638,17 @@ class TestCase(tf.test.TestCase, parameterized.TestCase):

             tree.map_structure(compare, output, expected_partial_output)

+        if expected_labels is not None:
+            output = ops.argmax(output, axis=-1)
+            self.assertAllEqual(output, expected_labels)
+
     def get_test_data_dir(self):
         return str(pathlib.Path(__file__).parent / "test_data")
+
+    def load_test_image(self, target_size=None):
+        # From https://commons.wikimedia.org/wiki/File:California_quail.jpg
+        path = os.path.join(self.get_test_data_dir(), "test_image.jpg")
+        img = keras.utils.load_img(
+            path, target_size=target_size, keep_aspect_ratio=True
+        )
+        return np.array(img)
keras_hub/src/tokenizers/byte_pair_tokenizer.py
CHANGED
@@ -31,6 +31,7 @@ from keras_hub.src.tokenizers import tokenizer
 from keras_hub.src.utils.tensor_utils import convert_to_ragged_batch
 from keras_hub.src.utils.tensor_utils import is_int_dtype
 from keras_hub.src.utils.tensor_utils import is_string_dtype
+from keras_hub.src.utils.tensor_utils import preprocessing_function

 try:
     import tensorflow as tf
@@ -63,12 +64,17 @@ def create_alts_for_unsplittable_tokens(unsplittable_tokens):
     # Create alternates for all special tokens that will be not split during
     # tokenization.
     alts = []
-    …
-    …
-    …
-    …
-    …
-    …
+    for index in range(len(unsplittable_tokens)):
+        # Map unsplittable tokens to ĴA, ĴB, ĴC, etc. Which we assume will be
+        # a very uncommon string in any input data. We can't use a literal
+        # numeric counter here because we will split on all numbers. Ĵ is a
+        # random character we chose as it is likely to be unique.
+        prefix = "Ĵ"
+        digits = [int(d) for d in str(index)]
+        # Make numbers to uppercase characters so our token is still
+        # unsplittable.
+        suffix = "".join([chr(ord("A") + d) for d in digits])
+        alts.append(prefix + suffix)
     return alts


@@ -252,9 +258,9 @@ class BytePairTokenizer(tokenizer.Tokenizer):
     array([1, 2], dtype=int32)
     >>> seq1, seq2 = tokenizer(["butterfly", "butter"])
     >>> np.array(seq1)
-    array([1, 2]…
+    array([1, 2])
     >>> np.array(seq2)
-    array([1]…
+    array([1])
     >>> tokenizer = keras_hub.tokenizers.BytePairTokenizer(
     ...     vocab, merge, sequence_length=2)
     >>> seq1, seq2 = tokenizer(["butterfly", "butter"])
@@ -268,8 +274,7 @@ class BytePairTokenizer(tokenizer.Tokenizer):
     >>> merge = ["b u", "t t", "e r", "bu tt", "butt er", "f l", "fl y"]
     >>> tokenizer = keras_hub.tokenizers.BytePairTokenizer(vocab, merge)
     >>> tokenizer.detokenize([[1, 2]])
-    …
-    … dtype=object)>
+    ['butterfly']
     """

     def __init__(
@@ -291,6 +296,8 @@ class BytePairTokenizer(tokenizer.Tokenizer):
         super().__init__(dtype=dtype, **kwargs)
         self.sequence_length = sequence_length
         self.add_prefix_space = add_prefix_space
+        if unsplittable_tokens is None:
+            unsplittable_tokens = self.special_tokens
         self.unsplittable_tokens = unsplittable_tokens
         self.file_assets = [VOCAB_FILENAME, MERGES_FILENAME]

@@ -385,6 +392,7 @@ class BytePairTokenizer(tokenizer.Tokenizer):
             list(range(len(self.merges))),
             default=self.merge_ranks_lookup_default,
         )
+        self._update_special_token_ids()

     def get_vocabulary(self):
         """Get the tokenizer vocabulary as a list of strings tokens."""
@@ -526,17 +534,21 @@ class BytePairTokenizer(tokenizer.Tokenizer):
                 "layer."
             )

+    @preprocessing_function
     def tokenize(self, inputs):
         self._check_vocabulary()
-        if not isinstance(inputs, (tf.Tensor, tf.RaggedTensor)):
-            inputs = tf.convert_to_tensor(inputs)
-
         if self.add_prefix_space:
             inputs = tf.strings.join([" ", inputs])

-        …
-        …
+        inputs = tf.convert_to_tensor(inputs)
+        unbatched = inputs.shape.rank == 0
+        if unbatched:
             inputs = tf.expand_dims(inputs, 0)
+        if inputs.shape.rank > 1:
+            raise ValueError(
+                "`tokenize()` inputs should be a string, list of strings, or "
+                f"string tensor with rank < 2. Received: {inputs}"
+            )

         raw_tokens = split_strings_for_bpe(inputs, self.unsplittable_tokens)
         token_row_splits = raw_tokens.row_splits
@@ -581,15 +593,16 @@ class BytePairTokenizer(tokenizer.Tokenizer):
             tokens = tokens.to_tensor(shape=output_shape)

         # Convert to a dense output if input in scalar
-        if …
+        if unbatched:
             tokens = tf.squeeze(tokens, 0)
             tf.ensure_shape(tokens, shape=[self.sequence_length])

         return tokens

+    @preprocessing_function
     def detokenize(self, inputs):
         self._check_vocabulary()
-        inputs, unbatched, …
+        inputs, unbatched, rectangular = convert_to_ragged_batch(inputs)
         inputs = tf.cast(inputs, self.dtype)
         unicode_text = tf.strings.reduce_join(
             self.id_to_token_map.lookup(inputs), axis=-1
keras_hub/src/tokenizers/byte_tokenizer.py
CHANGED
@@ -26,6 +26,7 @@ from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.tokenizers import tokenizer
 from keras_hub.src.utils.tensor_utils import convert_to_ragged_batch
 from keras_hub.src.utils.tensor_utils import is_int_dtype
+from keras_hub.src.utils.tensor_utils import preprocessing_function

 try:
     import tensorflow_text as tf_text
@@ -95,9 +96,9 @@ class ByteTokenizer(tokenizer.Tokenizer):
     >>> tokenizer = keras_hub.tokenizers.ByteTokenizer()
     >>> seq1, seq2 = tokenizer(inputs)
     >>> np.array(seq1)
-    array([104, 101, 108, 108, 111]…
+    array([104, 101, 108, 108, 111])
     >>> np.array(seq2)
-    array([104, 105]…
+    array([104, 105])

     Dense outputs.
     >>> inputs = ["hello", "hi"]
@@ -145,18 +146,16 @@ class ByteTokenizer(tokenizer.Tokenizer):
     Detokenization.
     >>> inputs = [104, 101, 108, 108, 111]
     >>> tokenizer = keras_hub.tokenizers.ByteTokenizer()
-    >>> …
-    …
-    array('hello', dtype='<U5')
+    >>> tokenizer.detokenize(inputs)
+    'hello'

     Detokenization with invalid bytes.
     >>> # The 255 below is invalid utf-8.
     >>> inputs = [104, 101, 255, 108, 108, 111]
     >>> tokenizer = keras_hub.tokenizers.ByteTokenizer(
     ...     errors="replace", replacement_char=88)
-    >>> …
-    …
-    array('heXllo', dtype='<U6')
+    >>> tokenizer.detokenize(inputs)
+    'heXllo'
     """

     def __init__(
@@ -201,6 +200,7 @@ class ByteTokenizer(tokenizer.Tokenizer):
         self._char_lst = tf.constant(
             [i.tobytes() for i in np.arange(256, dtype=np.uint8)]
         )
+        self._update_special_token_ids()

     def vocabulary_size(self):
         """Get the integer size of the tokenizer vocabulary."""
@@ -212,12 +212,10 @@ class ByteTokenizer(tokenizer.Tokenizer):
             vocab[chr(i)] = i
         return vocab

+    @preprocessing_function
     def tokenize(self, inputs):
-        …
-        …
-        …
-        scalar_input = inputs.shape.rank == 0
-        if scalar_input:
+        unbatched = inputs.shape.rank == 0
+        if unbatched:
             inputs = tf.expand_dims(inputs, 0)

         # Optional: Lowercase the input.
@@ -241,12 +239,13 @@ class ByteTokenizer(tokenizer.Tokenizer):
             output_shape[-1] = self.sequence_length
             tokens = tokens.to_tensor(shape=output_shape)

-        if …
+        if unbatched:
             tokens = tf.squeeze(tokens, 0)
         return tokens

+    @preprocessing_function
     def detokenize(self, inputs):
-        inputs, unbatched, …
+        inputs, unbatched, rectangular = convert_to_ragged_batch(inputs)
         # Remove trailing padding tokens, so that trailing "\x00" bytes don't
         # show up in the detokenized output.
         inputs = tf.ragged.boolean_mask(inputs, tf.not_equal(inputs, 0))
keras_hub/src/tokenizers/sentence_piece_tokenizer.py
CHANGED
@@ -31,6 +31,7 @@ from keras_hub.src.tokenizers import tokenizer
 from keras_hub.src.utils.tensor_utils import convert_to_ragged_batch
 from keras_hub.src.utils.tensor_utils import is_int_dtype
 from keras_hub.src.utils.tensor_utils import is_string_dtype
+from keras_hub.src.utils.tensor_utils import preprocessing_function
 from keras_hub.src.utils.tensor_utils import tensor_to_list

 try:
@@ -66,6 +67,9 @@ class SentencePieceTokenizer(tokenizer.Tokenizer):
             for more details on the format.
         sequence_length: If set, the output will be converted to a dense
             tensor and padded/trimmed so all outputs are of `sequence_length`.
+        add_bos: Add beginning of sentence token to the result.
+        add_eos: Add end of sentence token to the result. Token is always
+            truncated if output is longer than specified `sequence_length`.

     References:
         - [Kudo and Richardson, 2018](https://arxiv.org/abs/1808.06226)
@@ -115,6 +119,8 @@ class SentencePieceTokenizer(tokenizer.Tokenizer):
         proto=None,
         sequence_length=None,
         dtype="int32",
+        add_bos=False,
+        add_eos=False,
         **kwargs,
     ) -> None:
         if not is_int_dtype(dtype) and not is_string_dtype(dtype):
@@ -127,6 +133,8 @@ class SentencePieceTokenizer(tokenizer.Tokenizer):

         self.proto = None
         self.sequence_length = sequence_length
+        self.add_bos = add_bos
+        self.add_eos = add_eos
         self.set_proto(proto)
         self.file_assets = [VOCAB_FILENAME]

@@ -171,10 +179,13 @@ class SentencePieceTokenizer(tokenizer.Tokenizer):
         self._sentence_piece = tf_text.SentencepieceTokenizer(
             model=proto_bytes,
             out_type=self.compute_dtype,
+            add_bos=self.add_bos,
+            add_eos=self.add_eos,
         )
         # Keras cannot serialize a bytestring, so we base64 encode the model
         # byte array as a string for saving.
         self.proto = proto_bytes
+        self._update_special_token_ids()

     def vocabulary_size(self):
         """Get the integer size of the tokenizer vocabulary."""
@@ -211,6 +222,8 @@ class SentencePieceTokenizer(tokenizer.Tokenizer):
             {
                 "proto": None,  # Save vocabulary via an asset!
                 "sequence_length": self.sequence_length,
+                "add_bos": self.add_bos,
+                "add_eos": self.add_eos,
             }
         )
         return config
@@ -222,12 +235,12 @@ class SentencePieceTokenizer(tokenizer.Tokenizer):
                 "sure to pass a `proto` argument when creating the layer."
             )

+    @preprocessing_function
     def tokenize(self, inputs):
         self._check_vocabulary()
-        …
-        …
-        …
-        if scalar_input:
+        inputs = tf.convert_to_tensor(inputs)
+        unbatched = inputs.shape.rank == 0
+        if unbatched:
             inputs = tf.expand_dims(inputs, 0)

         if self._sentence_piece is None:
@@ -245,15 +258,15 @@ class SentencePieceTokenizer(tokenizer.Tokenizer):
             tokens = tokens.to_tensor(shape=output_shape)

         # Convert to a dense output if input was a scalar.
-        if …
+        if unbatched:
             tokens = tf.squeeze(tokens, 0)
             tf.ensure_shape(tokens, shape=[self.sequence_length])
-
         return tokens

+    @preprocessing_function
     def detokenize(self, inputs):
         self._check_vocabulary()
-        inputs, unbatched, …
+        inputs, unbatched, rectangular = convert_to_ragged_batch(inputs)
         # tf-text sentencepiece does not handle int64.
         inputs = tf.cast(inputs, "int32")
         outputs = self._sentence_piece.detokenize(inputs)