keras-hub-nightly 0.22.0.dev202507150421__py3-none-any.whl → 0.22.0.dev202507170424__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only.
- keras_hub/layers/__init__.py +3 -0
- keras_hub/models/__init__.py +3 -0
- keras_hub/src/models/clip/clip_backbone.py +3 -102
- keras_hub/src/models/clip/clip_layers.py +295 -0
- keras_hub/src/models/clip/clip_preprocessor.py +57 -48
- keras_hub/src/models/clip/clip_text_encoder.py +2 -2
- keras_hub/src/models/clip/clip_vision_encoder.py +3 -3
- keras_hub/src/models/dinov2/__init__.py +5 -0
- keras_hub/src/models/dinov2/dinov2_backbone.py +228 -0
- keras_hub/src/models/dinov2/dinov2_image_converter.py +8 -0
- keras_hub/src/models/dinov2/dinov2_layers.py +886 -0
- keras_hub/src/models/dinov2/dinov2_presets.py +4 -0
- keras_hub/src/models/flux/flux_text_to_image_preprocessor.py +6 -2
- keras_hub/src/models/hgnetv2/__init__.py +5 -0
- keras_hub/src/models/hgnetv2/hgnetv2_presets.py +5 -5
- keras_hub/src/models/stable_diffusion_3/flow_match_euler_discrete_scheduler.py +16 -7
- keras_hub/src/models/stable_diffusion_3/mmdit.py +61 -4
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py +23 -32
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_image_to_image.py +1 -0
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_inpaint.py +1 -0
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image.py +1 -0
- keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image_preprocessor.py +6 -2
- keras_hub/src/utils/preset_utils.py +4 -1
- keras_hub/src/utils/transformers/convert_dinov2.py +180 -0
- keras_hub/src/utils/transformers/export/gemma.py +89 -0
- keras_hub/src/utils/transformers/export/hf_exporter.py +98 -0
- keras_hub/src/utils/transformers/preset_loader.py +4 -1
- keras_hub/src/version.py +1 -1
- {keras_hub_nightly-0.22.0.dev202507150421.dist-info → keras_hub_nightly-0.22.0.dev202507170424.dist-info}/METADATA +1 -1
- {keras_hub_nightly-0.22.0.dev202507150421.dist-info → keras_hub_nightly-0.22.0.dev202507170424.dist-info}/RECORD +32 -25
- keras_hub/src/models/clip/clip_encoder_block.py +0 -111
- keras_hub/src/models/clip/clip_vision_embedding.py +0 -101
- {keras_hub_nightly-0.22.0.dev202507150421.dist-info → keras_hub_nightly-0.22.0.dev202507170424.dist-info}/WHEEL +0 -0
- {keras_hub_nightly-0.22.0.dev202507150421.dist-info → keras_hub_nightly-0.22.0.dev202507170424.dist-info}/top_level.txt +0 -0
keras_hub/src/models/dinov2/dinov2_backbone.py
@@ -0,0 +1,228 @@
+from keras import layers
+
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.models.backbone import Backbone
+from keras_hub.src.models.dinov2.dinov2_layers import DINOV2Embedding
+from keras_hub.src.models.dinov2.dinov2_layers import DINOV2Encoder
+from keras_hub.src.utils.keras_utils import standardize_data_format
+
+
+@keras_hub_export("keras_hub.models.DINOV2Backbone")
+class DINOV2Backbone(Backbone):
+    """DINOV2 core network with hyperparameters.
+
+    DINOV2 offers a powerful, generalist visual backbone learned entirely from
+    unlabeled images, as described in [DINOv2: Learning Robust Visual Features
+    without Supervision](https://arxiv.org/abs/2304.07193).
+
+    The default constructor gives a fully customizable, randomly initialized
+    DINOV2 model with any number of layers, heads, and embedding dimensions. To
+    load preset architectures and weights, use the `from_preset` constructor.
+
+    Note that this backbone supports interpolation of the position embeddings
+    to the input image shape. This is useful when the input image shape is
+    different from the shape used to train the position embeddings. The
+    `position_embedding_shape` argument is used to specify the original shape
+    used to train the position embeddings.
+
+    Args:
+        patch_size: int. The size of each square patch in the input image.
+        num_layers: int. The number of transformer layers.
+        hidden_dim: int. The size of the transformer hidden state at the end
+            of each transformer layer.
+        num_heads: int. The number of attention heads for each transformer.
+        intermediate_dim: int. The output dimension of the first Dense layer in
+            a two-layer feedforward network for each transformer.
+        layer_scale_init_value: float. The initial value for the layer scale in
+            the transformer layers. Defaults to `1.0`.
+        num_register_tokens: int. The number of register tokens to use in the
+            embedding layer. Defaults to `0`.
+        use_mask_token: bool. Whether to use a mask token in the embedding
+            layer. Defaults to `True`.
+        use_swiglu_ffn: bool. Whether to use SwiGLU activation in the MLP
+            layers. Defaults to `False`.
+        dropout_rate: float. The dropout rate to use. Defaults to `0.0`.
+        drop_path_rate: float. The drop path rate to use. Defaults to `0.0`.
+        image_shape: tuple. The input shape without the batch size. Defaults to
+            `(224, 224, 3)`.
+        position_embedding_shape: tuple. The original shape used to train the
+            position embeddings. This is used to interpolate the position
+            embeddings to the actual input shape. Defaults to `(518, 518, 3)`.
+        antialias_in_interpolation: bool. Whether to use antialiasing in the
+            interpolation of the position embeddings. Defaults to `False`.
+        data_format: `None` or str. If specified, either `"channels_last"` or
+            `"channels_first"`. The ordering of the dimensions in the
+            inputs. `"channels_last"` corresponds to inputs with shape
+            `(batch_size, height, width, channels)`
+            while `"channels_first"` corresponds to inputs with shape
+            `(batch_size, channels, height, width)`. It defaults to the
+            `image_data_format` value found in your Keras config file at
+            `~/.keras/keras.json`. If you never set it, then it will be
+            `"channels_last"`.
+        dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use
+            for the model's computations and weights. Note that some
+            computations, such as softmax and layer normalization, will always
+            be done in float32 precision regardless of dtype.
+
+    Example:
+    ```python
+    # Pretrained DINOV2 model.
+    input_data = {
+        "images": np.ones(shape=(1, 518, 518, 3), dtype="float32"),
+    }
+    model = keras_hub.models.DINOV2Backbone.from_preset(
+        "dinov2_base"
+    )
+    model(input_data)
+
+    # Pretrained DINOV2 model with custom image shape.
+    input_data = {
+        "images": np.ones(shape=(1, 224, 224, 3), dtype="float32"),
+    }
+    model = keras_hub.models.DINOV2Backbone.from_preset(
+        "dinov2_base", image_shape=(224, 224, 3)
+    )
+    model(input_data)
+
+    # Randomly initialized DINOV2 model with custom config.
+    model = keras_hub.models.DINOV2Backbone(
+        patch_size=14,
+        num_layers=2,
+        hidden_dim=32,
+        num_heads=2,
+        intermediate_dim=128,
+        image_shape=(224, 224, 3),
+        position_embedding_shape=(518, 518),
+    )
+    model(input_data)
+    ```
+    """
+
+    def __init__(
+        self,
+        patch_size,
+        num_layers,
+        hidden_dim,
+        num_heads,
+        intermediate_dim,
+        layer_scale_init_value=1.0,
+        num_register_tokens=0,
+        use_mask_token=True,
+        use_swiglu_ffn=False,
+        dropout_rate=0.0,
+        drop_path_rate=0.0,
+        image_shape=(224, 224, 3),
+        position_embedding_shape=(518, 518, 3),
+        antialias_in_interpolation=False,
+        data_format=None,
+        dtype=None,
+        name=None,
+        **kwargs,
+    ):
+        data_format = standardize_data_format(data_format)
+        if data_format == "channels_last":
+            height, width = image_shape[0], image_shape[1]
+            position_embedding_height, position_embedding_width = (
+                position_embedding_shape[0],
+                position_embedding_shape[1],
+            )
+        else:
+            height, width = image_shape[1], image_shape[2]
+            position_embedding_height, position_embedding_width = (
+                position_embedding_shape[1],
+                position_embedding_shape[2],
+            )
+        if height != width:
+            raise ValueError(
+                "`DINOV2Backbone` expects the height and width to be the "
+                f"same in `image_shape`. Received: image_shape={image_shape}"
+            )
+
+        # `prefix` is used to prevent duplicate names when utilizing multiple
+        # DINOV2Backbone encoders within a single model.
+        prefix = str(name) + "_" if name is not None else ""
+
+        # === Layers ===
+        self.embeddings = DINOV2Embedding(
+            hidden_dim=hidden_dim,
+            patch_size=patch_size,
+            image_shape=(height, width),
+            num_register_tokens=num_register_tokens,
+            use_mask_token=use_mask_token,
+            dropout_rate=dropout_rate,
+            position_embedding_shape=(
+                position_embedding_height,
+                position_embedding_width,
+            ),
+            antialias_in_interpolation=antialias_in_interpolation,
+            data_format=data_format,
+            dtype=dtype,
+            name=f"{prefix}embeddings",
+        )
+        self.encoder = DINOV2Encoder(
+            num_layers=num_layers,
+            hidden_dim=hidden_dim,
+            num_heads=num_heads,
+            intermediate_dim=intermediate_dim,
+            layer_scale_init_value=layer_scale_init_value,
+            use_swiglu_ffn=use_swiglu_ffn,
+            dropout_rate=dropout_rate,
+            drop_path_rate=drop_path_rate,
+            dtype=dtype,
+            name=f"{prefix}encoder",
+        )
+        self.layernorm = layers.LayerNormalization(
+            epsilon=1e-6, dtype=dtype, name=f"{prefix}layernorm"
+        )
+
+        # === Functional Model ===
+        image_input = layers.Input(shape=image_shape, name="images")
+        x = self.embeddings(image_input)
+        x = self.encoder(x)
+        x = self.layernorm(x)
+        outputs = x
+        super().__init__(
+            inputs={"images": image_input},
+            outputs=outputs,
+            dtype=dtype,
+            name=name,
+            **kwargs,
+        )
+
+        # === Config ===
+        self.patch_size = int(patch_size)
+        self.num_layers = int(num_layers)
+        self.hidden_dim = int(hidden_dim)
+        self.num_heads = int(num_heads)
+        self.intermediate_dim = int(intermediate_dim)
+        self.layer_scale_init_value = float(layer_scale_init_value)
+        self.num_register_tokens = int(num_register_tokens)
+        self.use_mask_token = bool(use_mask_token)
+        self.use_swiglu_ffn = bool(use_swiglu_ffn)
+        self.dropout_rate = float(dropout_rate)
+        self.drop_path_rate = float(drop_path_rate)
+        self.image_shape = image_shape
+        self.position_embedding_shape = position_embedding_shape
+        self.antialias_in_interpolation = bool(antialias_in_interpolation)
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "patch_size": self.patch_size,
+                "num_layers": self.num_layers,
+                "hidden_dim": self.hidden_dim,
+                "num_heads": self.num_heads,
+                "intermediate_dim": self.intermediate_dim,
+                "layer_scale_init_value": self.layer_scale_init_value,
+                "num_register_tokens": self.num_register_tokens,
+                "use_mask_token": self.use_mask_token,
+                "use_swiglu_ffn": self.use_swiglu_ffn,
+                "dropout_rate": self.dropout_rate,
+                "drop_path_rate": self.drop_path_rate,
+                "image_shape": self.image_shape,
+                "position_embedding_shape": self.position_embedding_shape,
+                "antialias_in_interpolation": self.antialias_in_interpolation,
+            }
+        )
+        return config
keras_hub/src/models/dinov2/dinov2_image_converter.py
@@ -0,0 +1,8 @@
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.layers.preprocessing.image_converter import ImageConverter
+from keras_hub.src.models.dinov2.dinov2_backbone import DINOV2Backbone
+
+
+@keras_hub_export("keras_hub.layers.DINOV2ImageConverter")
+class DINOV2ImageConverter(ImageConverter):
+    backbone_cls = DINOV2Backbone
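The converter subclass is a registration shim: all resizing and rescaling behavior comes from the shared `ImageConverter` base class, and setting `backbone_cls` lets `from_preset` resolve the right preprocessing for DINOV2 checkpoints. A rough sketch of standalone use, assuming the base class accepts `image_size` and `scale` arguments as other KerasHub converters do; the values below are hypothetical, since the real preset configuration lives in `dinov2_presets.py`, which this diff does not show:

```python
import numpy as np
import keras_hub

# Hypothetical settings; actual preset values may differ.
converter = keras_hub.layers.DINOV2ImageConverter(
    image_size=(518, 518),
    scale=1.0 / 255.0,
)
raw = np.random.uniform(0, 255, size=(2, 640, 480, 3)).astype("float32")
batch = converter(raw)  # resized to (2, 518, 518, 3) and rescaled to [0, 1]
```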