keras-hub-nightly 0.22.0.dev202507150421__py3-none-any.whl → 0.22.0.dev202507170424__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keras_hub/layers/__init__.py +3 -0
- keras_hub/models/__init__.py +3 -0
- keras_hub/src/models/clip/clip_backbone.py +3 -102
- keras_hub/src/models/clip/clip_layers.py +295 -0
- keras_hub/src/models/clip/clip_preprocessor.py +57 -48
- keras_hub/src/models/clip/clip_text_encoder.py +2 -2
- keras_hub/src/models/clip/clip_vision_encoder.py +3 -3
- keras_hub/src/models/dinov2/__init__.py +5 -0
- keras_hub/src/models/dinov2/dinov2_backbone.py +228 -0
- keras_hub/src/models/dinov2/dinov2_image_converter.py +8 -0
- keras_hub/src/models/dinov2/dinov2_layers.py +886 -0
- keras_hub/src/models/dinov2/dinov2_presets.py +4 -0
- keras_hub/src/models/flux/flux_text_to_image_preprocessor.py +6 -2
- keras_hub/src/models/hgnetv2/__init__.py +5 -0
- keras_hub/src/models/hgnetv2/hgnetv2_presets.py +5 -5
- keras_hub/src/models/stable_diffusion_3/flow_match_euler_discrete_scheduler.py +16 -7
- keras_hub/src/models/stable_diffusion_3/mmdit.py +61 -4
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py +23 -32
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_image_to_image.py +1 -0
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_inpaint.py +1 -0
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image.py +1 -0
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image_preprocessor.py +6 -2
- keras_hub/src/utils/preset_utils.py +4 -1
- keras_hub/src/utils/transformers/convert_dinov2.py +180 -0
- keras_hub/src/utils/transformers/export/gemma.py +89 -0
- keras_hub/src/utils/transformers/export/hf_exporter.py +98 -0
- keras_hub/src/utils/transformers/preset_loader.py +4 -1
- keras_hub/src/version.py +1 -1
- {keras_hub_nightly-0.22.0.dev202507150421.dist-info → keras_hub_nightly-0.22.0.dev202507170424.dist-info}/METADATA +1 -1
- {keras_hub_nightly-0.22.0.dev202507150421.dist-info → keras_hub_nightly-0.22.0.dev202507170424.dist-info}/RECORD +32 -25
- keras_hub/src/models/clip/clip_encoder_block.py +0 -111
- keras_hub/src/models/clip/clip_vision_embedding.py +0 -101
- {keras_hub_nightly-0.22.0.dev202507150421.dist-info → keras_hub_nightly-0.22.0.dev202507170424.dist-info}/WHEEL +0 -0
- {keras_hub_nightly-0.22.0.dev202507150421.dist-info → keras_hub_nightly-0.22.0.dev202507170424.dist-info}/top_level.txt +0 -0
keras_hub/layers/__init__.py
CHANGED
@@ -84,6 +84,9 @@ from keras_hub.src.models.deit.deit_image_converter import (
 from keras_hub.src.models.densenet.densenet_image_converter import (
     DenseNetImageConverter as DenseNetImageConverter,
 )
+from keras_hub.src.models.dinov2.dinov2_image_converter import (
+    DINOV2ImageConverter as DINOV2ImageConverter,
+)
 from keras_hub.src.models.efficientnet.efficientnet_image_converter import (
     EfficientNetImageConverter as EfficientNetImageConverter,
 )
keras_hub/models/__init__.py
CHANGED
@@ -157,6 +157,9 @@ from keras_hub.src.models.densenet.densenet_image_classifier import (
 from keras_hub.src.models.densenet.densenet_image_classifier_preprocessor import (
     DenseNetImageClassifierPreprocessor as DenseNetImageClassifierPreprocessor,
 )
+from keras_hub.src.models.dinov2.dinov2_backbone import (
+    DINOV2Backbone as DINOV2Backbone,
+)
 from keras_hub.src.models.distil_bert.distil_bert_backbone import (
     DistilBertBackbone as DistilBertBackbone,
 )
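The two `__init__.py` hunks above re-export the new DINOv2 symbols into the public `keras_hub.layers` and `keras_hub.models` namespaces. A minimal sketch of what that exposes (only the re-exported names are taken from this diff; constructor arguments are not shown because they are not part of it):

```python
import keras_hub

# Both symbols become importable from the public namespaces in this release.
print(keras_hub.models.DINOV2Backbone)
print(keras_hub.layers.DINOV2ImageConverter)
```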
keras_hub/src/models/clip/clip_backbone.py
CHANGED
@@ -1,109 +1,10 @@
-import math
-
 from keras import layers
-from keras import ops
 
 from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.models.backbone import Backbone
-
-
-class CLIPVisionPooler(layers.Layer):
-    """The vision pooler layer of CLIP.
-
-    `CLIPVisionPooler` will extracts the first token (index `0`) from the
-    sequence of the vision embeddings as the pooled outputs.
-
-    Call arguments:
-        vision_embeddings: A tensor of shape
-            `(batch_size, sequence_length, hidden_dim)`.
-    """
-
-    def call(self, vision_embeddings):
-        return vision_embeddings[:, 0, :]
-
-    def compute_output_shape(self, input_shape):
-        return (input_shape[0], input_shape[-1])
-
-
-class CLIPTextPooler(layers.Layer):
-    """The text pooler layer of CLIP.
-
-    `CLIPTextPooler` extracts the text embeddings at the positions of EOS tokens
-    as the pooled outputs.
-
-    Call arguments:
-        text_embeddings: A tensor of shape
-            `(batch_size, sequence_length, hidden_dim)`.
-        token_ids: A tensor of shape `(batch_size, max_tokens)`, used to
-            identify the positions of EOS tokens.
-    """
-
-    def call(self, text_embeddings, token_ids):
-        # `keepdims` is not supported in `keras<=3.1`.
-        eos_index = ops.argmax(token_ids, axis=-1)
-        eos_index = ops.expand_dims(eos_index, axis=-1)
-        eos_index = ops.expand_dims(eos_index, axis=-1)
-        pooled_outputs = ops.take_along_axis(text_embeddings, eos_index, axis=1)
-        return ops.squeeze(pooled_outputs, axis=1)
-
-    def compute_output_shape(self, input_shape):
-        return (input_shape[0], input_shape[-1])
-
-
-class CLIPHead(layers.Layer):
-    """The head layer of CLIP.
-
-    `CLIPHead` takes `vision_embedding` and `text_embedding` as inputs to
-    compute the corresponding logits. Both embeddings are L2 normalized and used
-    to compute pairwise cosine similarity. The resulting logits are then scaled
-    by a learnable `logit_scale` parameter.
-
-    Call arguments:
-        vision_embedding: A tensor of shape `(batch_size, hidden_dim)`.
-        text_embedding: A tensor of shape `(batch_size, hidden_dim)`.
-    """
-
-    def build(self, input_shape):
-        self.logit_scale = self.add_weight(
-            shape=(),
-            initializer=lambda *a, **kw: math.log(1 / 0.07),
-            trainable=True,
-            dtype=self.variable_dtype,
-            name="logit_scale",
-        )
-
-    def call(self, vision_embedding, text_embedding):
-        normalized_vision_embedding = ops.sqrt(
-            ops.sum(ops.power(vision_embedding, 2), axis=-1, keepdims=True)
-        )
-        normalized_text_embedding = ops.sqrt(
-            ops.sum(ops.power(text_embedding, 2), axis=-1, keepdims=True)
-        )
-        vision_embedding = vision_embedding / normalized_vision_embedding
-        text_embedding = text_embedding / normalized_text_embedding
-        logit_scale = ops.exp(self.logit_scale)
-        text_logits = (
-            ops.matmul(
-                text_embedding,
-                ops.transpose(vision_embedding),
-            )
-            * logit_scale
-        )
-        vision_logits = ops.transpose(text_logits)
-        return vision_logits, text_logits
-
-    def compute_output_shape(
-        self, vision_embedding_shape, text_embedding_shape
-    ):
-        vision_logits_shape = (
-            vision_embedding_shape[0],
-            text_embedding_shape[0],
-        )
-        text_logits_shape = (
-            text_embedding_shape[0],
-            vision_embedding_shape[0],
-        )
-        return vision_logits_shape, text_logits_shape
+from keras_hub.src.models.clip.clip_layers import CLIPHead
+from keras_hub.src.models.clip.clip_layers import CLIPTextPooler
+from keras_hub.src.models.clip.clip_layers import CLIPVisionPooler
 
 
 @keras_hub_export("keras_hub.models.CLIPBackbone")
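The hunk above is a pure relocation: the pooler and head layers formerly defined in `clip_backbone.py` now come from the new `clip_layers.py` module, shown in full below. For code that reached into these internal `src` paths, the change amounts to an import swap; a sketch, assuming the old classes were imported directly from the backbone module:

```python
# Before (0.22.0.dev202507150421), the layers were defined in clip_backbone.py:
# from keras_hub.src.models.clip.clip_backbone import CLIPHead

# After (0.22.0.dev202507170424), they live in clip_layers.py:
from keras_hub.src.models.clip.clip_layers import CLIPHead
from keras_hub.src.models.clip.clip_layers import CLIPTextPooler
from keras_hub.src.models.clip.clip_layers import CLIPVisionPooler
```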
keras_hub/src/models/clip/clip_layers.py
ADDED
@@ -0,0 +1,295 @@
+import math
+
+from keras import layers
+from keras import ops
+
+from keras_hub.src.utils.keras_utils import standardize_data_format
+
+
+def quick_gelu(x):
+    return x * ops.sigmoid(1.702 * x)
+
+
+class CLIPVisionEmbedding(layers.Layer):
+    def __init__(
+        self,
+        hidden_dim,
+        patch_size,
+        image_size,
+        data_format=None,
+        dtype=None,
+        **kwargs,
+    ):
+        super().__init__(dtype=dtype, **kwargs)
+        self.hidden_dim = int(hidden_dim)
+        self.patch_size = int(patch_size)
+        self.image_size = int(image_size)
+        data_format = standardize_data_format(data_format)
+        self.data_format = data_format
+        num_patches = (image_size // patch_size) ** 2
+        self.num_positions = num_patches + 1
+
+        self.patch_embedding = layers.Conv2D(
+            hidden_dim,
+            kernel_size=patch_size,
+            strides=patch_size,
+            data_format=data_format,
+            use_bias=False,
+            dtype=dtype,
+            name="patch_embedding",
+        )
+        self.position_embedding = layers.Embedding(
+            num_patches + 1, hidden_dim, dtype=dtype, name="position_embedding"
+        )
+
+    def build(self, input_shape):
+        self.class_embedding = self.add_weight(
+            shape=(self.hidden_dim,),
+            initializer="random_normal",
+            dtype=self.variable_dtype,
+            name="class_embedding",
+        )
+        self.position_ids = self.add_weight(
+            shape=(1, self.num_positions),
+            initializer="zeros",
+            # Let the backend determine the int dtype. For example, tf
+            # requires int64 for correct device placement, whereas jax and torch
+            # don't.
+            dtype=int,
+            trainable=False,
+            name="position_ids",
+        )
+        self.patch_embedding.build(input_shape)
+        self.position_embedding.build(self.position_ids.shape)
+
+    def call(self, inputs, training=None):
+        x = inputs
+        batch_size = ops.shape(x)[0]
+        patch_embeddings = self.patch_embedding(x, training=training)
+        if self.data_format == "channels_last":
+            patch_embeddings = ops.reshape(
+                patch_embeddings, (batch_size, -1, self.hidden_dim)
+            )
+        else:
+            patch_embeddings = ops.reshape(
+                patch_embeddings, (batch_size, self.hidden_dim, -1)
+            )
+            patch_embeddings = ops.transpose(patch_embeddings, (0, 2, 1))
+        class_embeddings = ops.expand_dims(self.class_embedding, axis=(0, 1))
+        class_embeddings = ops.tile(class_embeddings, (batch_size, 1, 1))
+        position_embeddings = self.position_embedding(self.position_ids)
+        embeddings = ops.concatenate(
+            [class_embeddings, patch_embeddings], axis=1
+        )
+        return ops.add(embeddings, position_embeddings)
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "hidden_dim": self.hidden_dim,
+                "patch_size": self.patch_size,
+                "image_size": self.image_size,
+            }
+        )
+        return config
+
+    def compute_output_shape(self, input_shape):
+        output_shape = [input_shape[0], None, self.hidden_dim]
+        if self.data_format == "channels_last":
+            if input_shape[1] is not None and input_shape[2] is not None:
+                patch_num = input_shape[1] // self.patch_size
+                output_shape[1] = patch_num**2 + 1
+        else:
+            if input_shape[2] is not None and input_shape[3] is not None:
+                patch_num = input_shape[2] // self.patch_size
+                output_shape[1] = patch_num**2 + 1
+        return output_shape
+
+
+class CLIPEncoderLayer(layers.Layer):
+    def __init__(
+        self,
+        hidden_dim,
+        num_heads,
+        intermediate_dim,
+        intermediate_activation="quick_gelu",
+        use_causal_mask=True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if hidden_dim % num_heads != 0:
+            raise ValueError(
+                "`hidden_dim` must be divisible by `num_heads`. "
+                f"Received: hidden_dim={hidden_dim}, num_heads={num_heads}"
+            )
+        self.hidden_dim = hidden_dim
+        self.num_heads = num_heads
+        self.intermediate_dim = intermediate_dim
+        self.intermediate_activation = intermediate_activation
+        self.use_causal_mask = use_causal_mask
+
+        if intermediate_activation == "quick_gelu":
+            intermediate_activation = quick_gelu
+
+        self.layer_norm_1 = layers.LayerNormalization(
+            epsilon=1e-5, dtype=self.dtype_policy, name="layer_norm_1"
+        )
+        self.attention = layers.MultiHeadAttention(
+            num_heads,
+            hidden_dim // num_heads,
+            dtype=self.dtype_policy,
+            name="attention",
+        )
+        self.layer_norm_2 = layers.LayerNormalization(
+            epsilon=1e-5, dtype=self.dtype_policy, name="layer_norm_2"
+        )
+        self.dense_1 = layers.Dense(
+            self.intermediate_dim, dtype=self.dtype_policy, name="dense_1"
+        )
+        self.activation = layers.Activation(
+            intermediate_activation, dtype=self.dtype_policy, name="activation"
+        )
+        self.dense_2 = layers.Dense(
+            self.hidden_dim, dtype=self.dtype_policy, name="dense_2"
+        )
+
+    def build(self, input_shape):
+        self.layer_norm_1.build(input_shape)
+        self.attention.build(input_shape, input_shape, input_shape)
+        self.layer_norm_2.build(input_shape)
+        self.dense_1.build(input_shape)
+        input_shape = self.dense_1.compute_output_shape(input_shape)
+        self.dense_2.build(input_shape)
+
+    def compute_output_shape(self, inputs_shape):
+        outputs_shape = list(inputs_shape)
+        outputs_shape[-1] = self.hidden_dim
+        return outputs_shape
+
+    def call(self, x, training=None):
+        residual = x
+        x = self.layer_norm_1(x)
+        x = self.attention(
+            x, x, x, training=training, use_causal_mask=self.use_causal_mask
+        )
+        x = ops.add(residual, x)
+
+        residual = x
+        x = self.dense_1(self.layer_norm_2(residual))
+        x = self.activation(x)
+        x = self.dense_2(x)
+        x = ops.add(residual, x)
+        return x
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "hidden_dim": self.hidden_dim,
+                "num_heads": self.num_heads,
+                "intermediate_dim": self.intermediate_dim,
+                "intermediate_activation": self.intermediate_activation,
+                "use_causal_mask": self.use_causal_mask,
+            }
+        )
+        return config
+
+
+class CLIPVisionPooler(layers.Layer):
+    """The vision pooler layer of CLIP.
+
+    `CLIPVisionPooler` will extracts the first token (index `0`) from the
+    sequence of the vision embeddings as the pooled outputs.
+
+    Call arguments:
+        vision_embeddings: A tensor of shape
+            `(batch_size, sequence_length, hidden_dim)`.
+    """
+
+    def call(self, vision_embeddings):
+        return vision_embeddings[:, 0, :]
+
+    def compute_output_shape(self, input_shape):
+        return (input_shape[0], input_shape[-1])
+
+
+class CLIPTextPooler(layers.Layer):
+    """The text pooler layer of CLIP.
+
+    `CLIPTextPooler` extracts the text embeddings at the positions of EOS tokens
+    as the pooled outputs.
+
+    Call arguments:
+        text_embeddings: A tensor of shape
+            `(batch_size, sequence_length, hidden_dim)`.
+        token_ids: A tensor of shape `(batch_size, max_tokens)`, used to
+            identify the positions of EOS tokens.
+    """
+
+    def call(self, text_embeddings, token_ids):
+        # `keepdims` is not supported in `keras<=3.1`.
+        eos_index = ops.argmax(token_ids, axis=-1)
+        eos_index = ops.expand_dims(eos_index, axis=-1)
+        eos_index = ops.expand_dims(eos_index, axis=-1)
+        pooled_outputs = ops.take_along_axis(text_embeddings, eos_index, axis=1)
+        return ops.squeeze(pooled_outputs, axis=1)
+
+    def compute_output_shape(self, input_shape):
+        return (input_shape[0], input_shape[-1])
+
+
+class CLIPHead(layers.Layer):
+    """The head layer of CLIP.
+
+    `CLIPHead` takes `vision_embedding` and `text_embedding` as inputs to
+    compute the corresponding logits. Both embeddings are L2 normalized and used
+    to compute pairwise cosine similarity. The resulting logits are then scaled
+    by a learnable `logit_scale` parameter.
+
+    Call arguments:
+        vision_embedding: A tensor of shape `(batch_size, hidden_dim)`.
+        text_embedding: A tensor of shape `(batch_size, hidden_dim)`.
+    """
+
+    def build(self, input_shape):
+        self.logit_scale = self.add_weight(
+            shape=(),
+            initializer=lambda *a, **kw: math.log(1 / 0.07),
+            trainable=True,
+            dtype=self.variable_dtype,
+            name="logit_scale",
+        )
+
+    def call(self, vision_embedding, text_embedding):
+        normalized_vision_embedding = ops.sqrt(
+            ops.sum(ops.power(vision_embedding, 2), axis=-1, keepdims=True)
+        )
+        normalized_text_embedding = ops.sqrt(
+            ops.sum(ops.power(text_embedding, 2), axis=-1, keepdims=True)
+        )
+        vision_embedding = vision_embedding / normalized_vision_embedding
+        text_embedding = text_embedding / normalized_text_embedding
+        logit_scale = ops.exp(self.logit_scale)
+        text_logits = (
+            ops.matmul(
+                text_embedding,
+                ops.transpose(vision_embedding),
+            )
+            * logit_scale
+        )
+        vision_logits = ops.transpose(text_logits)
+        return vision_logits, text_logits
+
+    def compute_output_shape(
+        self, vision_embedding_shape, text_embedding_shape
+    ):
+        vision_logits_shape = (
+            vision_embedding_shape[0],
+            text_embedding_shape[0],
+        )
+        text_logits_shape = (
+            text_embedding_shape[0],
+            vision_embedding_shape[0],
+        )
+        return vision_logits_shape, text_logits_shape
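As a quick sanity check of the consolidated module, a minimal sketch wiring the vision-side layers together with toy dimensions (the layer signatures are taken from the file above; the internal `keras_hub.src...` import path, the toy sizes, and the `use_causal_mask=False` setting are assumptions for illustration only):

```python
import numpy as np

from keras_hub.src.models.clip.clip_layers import CLIPEncoderLayer
from keras_hub.src.models.clip.clip_layers import CLIPVisionEmbedding
from keras_hub.src.models.clip.clip_layers import CLIPVisionPooler

# Toy configuration; real CLIP presets use much larger dimensions.
embedding = CLIPVisionEmbedding(hidden_dim=64, patch_size=16, image_size=64)
encoder_layer = CLIPEncoderLayer(
    hidden_dim=64,
    num_heads=4,
    intermediate_dim=128,
    use_causal_mask=False,  # Assumed bidirectional attention for the vision tower.
)
pooler = CLIPVisionPooler()

images = np.ones((2, 64, 64, 3), dtype="float32")  # channels_last inputs
x = embedding(images)  # (2, (64 // 16) ** 2 + 1, 64): class token + patch tokens
x = encoder_layer(x)  # (2, 17, 64)
pooled = pooler(x)  # (2, 64): the class-token embedding
```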
keras_hub/src/models/clip/clip_preprocessor.py
CHANGED
@@ -2,8 +2,10 @@ import keras
 
 from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.layers.preprocessing.start_end_packer import StartEndPacker
+from keras_hub.src.models.causal_lm_preprocessor import CausalLMPreprocessor
+from keras_hub.src.models.clip.clip_backbone import CLIPBackbone
+from keras_hub.src.models.clip.clip_image_converter import CLIPImageConverter
 from keras_hub.src.models.clip.clip_tokenizer import CLIPTokenizer
-from keras_hub.src.models.preprocessor import Preprocessor
 from keras_hub.src.utils.tensor_utils import preprocessing_function
 
 try:
@@ -13,32 +15,18 @@ except ImportError:
 
 
 @keras_hub_export("keras_hub.models.CLIPPreprocessor")
-class CLIPPreprocessor(Preprocessor):
-    """CLIP
+class CLIPPreprocessor(CausalLMPreprocessor):
+    """CLIP preprocessor.
 
     This preprocessing layer will do 2 things:
 
-
-
-
-    This layer can be used directly with `tf.data.Dataset.map` to preprocess
-    string data in the `(x, y, sample_weight)` format used by
-    `keras.Model.fit`.
-
-    The call method of this layer accepts three arguments, `x`, `y`, and
-    `sample_weight`. `x` can be a python string or tensor representing a single
-    segment, a list of python strings representing a batch of single segments,
-    or a list of tensors representing multiple segments to be packed together.
-    `y` and `sample_weight` are both optional, can have any format, and will be
-    passed through unaltered.
-
-    `CLIPPreprocessor` forces the input to have only one segment, as CLIP is
-    mainly used for generation tasks. For tasks having multi-segment inputs
-    like "glue/mnli", please use a model designed for classification purposes
-    such as BERT or RoBERTa.
+    This preprocessing layer is meant for use with
+    `keras_hub.models.CLIPBackbone`. By default, it will take in batches of
+    strings and images, and return token ids and resized images.
 
     Args:
         tokenizer: A `keras_hub.models.CLIPTokenizer` instance.
+        image_converter: A `keras_hub.models.CLIPImageConverter` instance.
        sequence_length: The length of the packed inputs.
         add_start_token: If `True`, the preprocessor will prepend the tokenizer
             start token to each input sequence.
@@ -47,32 +35,62 @@ class CLIPPreprocessor(Preprocessor):
         to_lower: bool. Whether to lower the inputs.
 
     Call arguments:
-        x: A
-
-
+        x: A dict with `"prompts"` and `"images"` keys, where `"prompts"` is
+            `tf.Tensor` or list of python strings and `"images"` are the image
+            tensors.
+        y: Label data. Should always be `None` since SigLIP doesn't need the
+            label to calculate the loss.
+        sample_weight: Label weights.
         sequence_length: Pass to override the configured `sequence_length` of
             the layer.
-    """
 
-
+    Examples:
+    ```python
+    # Load the preprocessor from a preset.
+    preprocessor = keras_hub.models.CLIPPreprocessor.from_preset(
+        "clip_vit_base_patch16"
+    )
+
+    # Tokenize the sentence and preprocess the image.
+    preprocessor(
+        {
+            "prompts": "The quick brown fox jumped.",
+            "images": np.ones(shape=(123, 123, 3)),
+        }
+    )
+
+    # Tokenize a batch of sentences and preprocess a batch of images.
+    preprocessor(
+        {
+            "prompts": ["The quick brown fox jumped.", "The fox slept."],
+            "images": np.ones(shape=(2, 123, 123, 3)),
+        }
+    )
+    ```
+    """
 
+    backbone_cls = CLIPBackbone
     tokenizer_cls = CLIPTokenizer
+    image_converter_cls = CLIPImageConverter
 
     def __init__(
         self,
         tokenizer,
+        image_converter=None,
         sequence_length=77,
         add_start_token=True,
         add_end_token=True,
         to_lower=True,
         **kwargs,
     ):
-        super().__init__(
-
-
-
-
-
+        super().__init__(
+            tokenizer=tokenizer,
+            sequence_length=sequence_length,
+            add_start_token=add_start_token,
+            add_end_token=add_end_token,
+            **kwargs,
+        )
+        self.image_converter = image_converter
         self.to_lower = to_lower
 
     def build(self, input_shape):
@@ -96,10 +114,14 @@ class CLIPPreprocessor(Preprocessor):
         sequence_length=None,
     ):
         sequence_length = sequence_length or self.sequence_length
+        images, prompts = x["images"], x["prompts"]
         if self.to_lower:
-
+            prompts = tf.strings.lower(prompts)
+        prompts = self.tokenizer(prompts)
+        if images is not None and self.image_converter:
+            images = self.image_converter(images)
         token_ids, padding_mask = self.packer(
-
+            prompts,
             sequence_length=sequence_length,
             add_start_value=self.add_start_token,
             add_end_value=self.add_end_token,
@@ -107,6 +129,7 @@ class CLIPPreprocessor(Preprocessor):
         x = {
             "token_ids": token_ids,
             "padding_mask": padding_mask,
+            "images": images,
         }
         return keras.utils.pack_x_y_sample_weight(x, y, sample_weight)
 
@@ -114,21 +137,7 @@ class CLIPPreprocessor(Preprocessor):
         config = super().get_config()
         config.update(
             {
-                "sequence_length": self.sequence_length,
-                "add_start_token": self.add_start_token,
-                "add_end_token": self.add_end_token,
                 "to_lower": self.to_lower,
             }
         )
         return config
-
-    @property
-    def sequence_length(self):
-        """The padded length of model input sequences."""
-        return self._sequence_length
-
-    @sequence_length.setter
-    def sequence_length(self, value):
-        self._sequence_length = value
-        if self.packer is not None:
-            self.packer.sequence_length = value
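Since `CLIPPreprocessor` now subclasses `CausalLMPreprocessor` and consumes `{"prompts", "images"}` dictionaries, it can still be mapped over a `tf.data` pipeline. A sketch, assuming the `"clip_vit_base_patch16"` preset referenced in the docstring above is available and using illustrative shapes:

```python
import numpy as np
import tensorflow as tf

import keras_hub

preprocessor = keras_hub.models.CLIPPreprocessor.from_preset(
    "clip_vit_base_patch16"
)

features = {
    "prompts": ["a photo of a cat", "a photo of a dog"],
    "images": np.ones((2, 224, 224, 3), dtype="float32"),
}
ds = tf.data.Dataset.from_tensor_slices(features).batch(2)
# Each mapped batch carries "token_ids", "padding_mask" and "images",
# matching the keys added in the hunks above.
ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
```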
keras_hub/src/models/clip/clip_text_encoder.py
CHANGED
@@ -5,7 +5,7 @@ from keras_hub.src.layers.modeling.token_and_position_embedding import (
     TokenAndPositionEmbedding,
 )
 from keras_hub.src.models.backbone import Backbone
-from keras_hub.src.models.clip.clip_encoder_block import CLIPEncoderBlock
+from keras_hub.src.models.clip.clip_layers import CLIPEncoderLayer
 
 
 @keras_hub_export("keras_hub.models.CLIPTextEncoder")
@@ -71,7 +71,7 @@ class CLIPTextEncoder(Backbone):
             name=f"{prefix}embedding",
         )
         self.encoder_layers = [
-            CLIPEncoderBlock(
+            CLIPEncoderLayer(
                 hidden_dim,
                 num_heads,
                 intermediate_dim,
keras_hub/src/models/clip/clip_vision_encoder.py
CHANGED
@@ -2,8 +2,8 @@ from keras import layers
 
 from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.models.backbone import Backbone
-from keras_hub.src.models.clip.clip_encoder_block import CLIPEncoderBlock
-from keras_hub.src.models.clip.clip_vision_embedding import CLIPVisionEmbedding
+from keras_hub.src.models.clip.clip_layers import CLIPEncoderLayer
+from keras_hub.src.models.clip.clip_layers import CLIPVisionEmbedding
 from keras_hub.src.utils.keras_utils import standardize_data_format
 
 
@@ -91,7 +91,7 @@ class CLIPVisionEncoder(Backbone):
             epsilon=1e-5, dtype=dtype, name=f"{prefix}pre_layer_norm"
         )
         self.encoder_layers = [
-            CLIPEncoderBlock(
+            CLIPEncoderLayer(
                 hidden_dim,
                 num_heads,
                 intermediate_dim,