keras-hub-nightly 0.19.0.dev202503010353__py3-none-any.whl → 0.19.0.dev202503030351__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -70,6 +70,9 @@ from keras_hub.src.models.sam.sam_prompt_encoder import SAMPromptEncoder
  from keras_hub.src.models.segformer.segformer_image_converter import (
      SegFormerImageConverter,
  )
+ from keras_hub.src.models.siglip.siglip_image_converter import (
+     SigLIPImageConverter,
+ )
  from keras_hub.src.models.vgg.vgg_image_converter import VGGImageConverter
  from keras_hub.src.models.vit.vit_image_converter import ViTImageConverter
  from keras_hub.src.models.whisper.whisper_audio_converter import (
@@ -312,6 +312,13 @@ from keras_hub.src.models.segformer.segformer_image_segmenter_preprocessor impor
  )
  from keras_hub.src.models.seq_2_seq_lm import Seq2SeqLM
  from keras_hub.src.models.seq_2_seq_lm_preprocessor import Seq2SeqLMPreprocessor
+ from keras_hub.src.models.siglip.siglip_backbone import SigLIPBackbone
+ from keras_hub.src.models.siglip.siglip_preprocessor import SigLIPPreprocessor
+ from keras_hub.src.models.siglip.siglip_text_encoder import SigLIPTextEncoder
+ from keras_hub.src.models.siglip.siglip_tokenizer import SigLIPTokenizer
+ from keras_hub.src.models.siglip.siglip_vision_encoder import (
+     SigLIPVisionEncoder,
+ )
  from keras_hub.src.models.stable_diffusion_3.stable_diffusion_3_backbone import (
      StableDiffusion3Backbone,
  )
@@ -30,6 +30,7 @@ from keras_hub.src.models.pali_gemma.pali_gemma_tokenizer import (
  )
  from keras_hub.src.models.phi3.phi3_tokenizer import Phi3Tokenizer
  from keras_hub.src.models.roberta.roberta_tokenizer import RobertaTokenizer
+ from keras_hub.src.models.siglip.siglip_tokenizer import SigLIPTokenizer
  from keras_hub.src.models.t5.t5_tokenizer import T5Tokenizer
  from keras_hub.src.models.whisper.whisper_tokenizer import WhisperTokenizer
  from keras_hub.src.models.xlm_roberta.xlm_roberta_tokenizer import (
@@ -1,6 +1,7 @@
  import math

  import keras
+ import ml_dtypes
  import numpy as np
  from keras import ops

@@ -18,6 +19,95 @@ from keras_hub.src.utils.tensor_utils import check_bounding_box_support
  from keras_hub.src.utils.tensor_utils import preprocessing_function


+ # TODO: Use `keras.layers.Resizing` once `antialias` is configurable.
+ # https://github.com/keras-team/keras/pull/20972
+ def _saturate_cast(x, dtype, backend_module):
+     def get_dtype_min_max(dtype):
+         if "bool" == dtype:
+             dtype_min = 0
+             dtype_max = 1
+         elif "int" in dtype:
+             dtype_min = ml_dtypes.iinfo(dtype).min
+             dtype_max = ml_dtypes.iinfo(dtype).max
+         else:
+             dtype_min = ml_dtypes.finfo(dtype).min
+             dtype_max = ml_dtypes.finfo(dtype).max
+         return dtype_min, dtype_max
+
+     dtype = keras.backend.standardize_dtype(dtype)
+     in_dtype = keras.backend.standardize_dtype(x.dtype)
+     in_min, in_max = get_dtype_min_max(in_dtype)
+     out_min, out_max = get_dtype_min_max(dtype)
+
+     min_limit = np.maximum(in_min, out_min).astype(in_dtype)
+     if min_limit < out_min:
+         min_limit = np.nextafter(min_limit, 0, dtype=in_dtype)
+     max_limit = np.minimum(in_max, out_max).astype(in_dtype)
+     if max_limit > out_max:
+         max_limit = np.nextafter(max_limit, 0, dtype=in_dtype)
+
+     x = backend_module.numpy.clip(x, min_limit, max_limit)
+     return backend_module.cast(x, dtype)
+
+
+ class ResizingAntialiasConfigurable(keras.layers.Resizing):
+     """A preprocessing layer which resizes images.
+
+     This class is the same as `keras.layers.Resizing` but exposes `antialias` as
+     a configurable parameter.
+     """
+
+     def __init__(
+         self,
+         height,
+         width,
+         interpolation="bilinear",
+         antialias=False,
+         crop_to_aspect_ratio=False,
+         pad_to_aspect_ratio=False,
+         fill_mode="constant",
+         fill_value=0.0,
+         data_format=None,
+         **kwargs,
+     ):
+         super().__init__(
+             height=height,
+             width=width,
+             interpolation=interpolation,
+             crop_to_aspect_ratio=crop_to_aspect_ratio,
+             pad_to_aspect_ratio=pad_to_aspect_ratio,
+             fill_mode=fill_mode,
+             fill_value=fill_value,
+             data_format=data_format,
+             **kwargs,
+         )
+         self.antialias = bool(antialias)
+
+     def transform_images(self, images, transformation=None, training=True):
+         size = (self.height, self.width)
+         resized = self.backend.image.resize(
+             images,
+             size=size,
+             interpolation=self.interpolation,
+             antialias=self.antialias,  # Added.
+             data_format=self.data_format,
+             crop_to_aspect_ratio=self.crop_to_aspect_ratio,
+             pad_to_aspect_ratio=self.pad_to_aspect_ratio,
+             fill_mode=self.fill_mode,
+             fill_value=self.fill_value,
+         )
+         if resized.dtype == images.dtype:
+             return resized
+         if keras.backend.is_int_dtype(images.dtype):
+             resized = self.backend.numpy.round(resized)
+         return _saturate_cast(resized, images.dtype, self.backend)
+
+     def get_config(self):
+         config = super().get_config()
+         config.update({"antialias": self.antialias})
+         return config
+
+
  @keras_hub_export("keras_hub.layers.ImageConverter")
  class ImageConverter(PreprocessingLayer):
      """Preprocess raw images into model ready inputs.
@@ -65,6 +155,8 @@ class ImageConverter(PreprocessingLayer):
          interpolation: String, the interpolation method.
              Supports `"bilinear"`, `"nearest"`, `"bicubic"`,
              `"lanczos3"`, `"lanczos5"`. Defaults to `"bilinear"`.
+         antialias: Whether to use an antialiasing filter when downsampling an
+             image. Defaults to `False`.
          bounding_box_format: A string specifying the format of the bounding
              boxes, one of `"xyxy"`, `"rel_xyxy"`, `"xywh"`, `"center_xywh"`,
              `"yxyx"`, `"rel_yxyx"`. Specifies the format of the bounding boxes
@@ -107,6 +199,7 @@ class ImageConverter(PreprocessingLayer):
          crop_to_aspect_ratio=True,
          pad_to_aspect_ratio=False,
          interpolation="bilinear",
+         antialias=False,
          bounding_box_format="yxyx",
          data_format=None,
          **kwargs,
@@ -132,12 +225,13 @@ class ImageConverter(PreprocessingLayer):
          resizing_kwargs = {}
          if check_bounding_box_support():
              resizing_kwargs["bounding_box_format"] = bounding_box_format
-         self.resizing = keras.layers.Resizing(
+         self.resizing = ResizingAntialiasConfigurable(
              height=image_size[0] if image_size else None,
              width=image_size[1] if image_size else None,
              crop_to_aspect_ratio=crop_to_aspect_ratio,
              pad_to_aspect_ratio=pad_to_aspect_ratio,
              interpolation=interpolation,
+             antialias=antialias,
              data_format=data_format,
              dtype=self.dtype_policy,
              name="resizing",
@@ -148,6 +242,7 @@ class ImageConverter(PreprocessingLayer):
          self.crop_to_aspect_ratio = crop_to_aspect_ratio
          self.pad_to_aspect_ratio = pad_to_aspect_ratio
          self.interpolation = interpolation
+         self.antialias = antialias
          self.bounding_box_format = bounding_box_format
          self.data_format = standardize_data_format(data_format)

@@ -211,6 +306,7 @@ class ImageConverter(PreprocessingLayer):
                "scale": self.scale,
                "offset": self.offset,
                "interpolation": self.interpolation,
+               "antialias": self.antialias,
                "crop_to_aspect_ratio": self.crop_to_aspect_ratio,
                "pad_to_aspect_ratio": self.pad_to_aspect_ratio,
                "bounding_box_format": self.bounding_box_format,
@@ -0,0 +1,5 @@
+ from keras_hub.src.models.siglip.siglip_backbone import SigLIPBackbone
+ from keras_hub.src.models.siglip.siglip_presets import backbone_presets
+ from keras_hub.src.utils.preset_utils import register_presets
+
+ register_presets(backbone_presets, SigLIPBackbone)
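This new module wires the SigLIP preset metadata to `SigLIPBackbone`. Assuming the usual keras-hub registry behavior, registration is what lets preset names resolve even from the base class, which dispatches to the registered subclass:

```python
import keras_hub

# Resolves through the preset registry and returns a `SigLIPBackbone`.
model = keras_hub.models.Backbone.from_preset("siglip_base_patch16_224")
```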
@@ -0,0 +1,230 @@
+ import keras
+ from keras import layers
+ from keras import ops
+
+ from keras_hub.src.api_export import keras_hub_export
+ from keras_hub.src.models.backbone import Backbone
+ from keras_hub.src.models.siglip.siglip_layers import SigLIPHead
+ from keras_hub.src.models.siglip.siglip_loss import SigLIPLoss
+
+
+ @keras_hub_export("keras_hub.models.SigLIPBackbone")
+ class SigLIPBackbone(Backbone):
+     """SigLIP core network with hyperparameters.
+
+     This backbone implements the base architecture for the Sigmoid loss in
+     the Language-Image Pre-training (SigLIP) model. Unlike standard
+     contrastive learning with softmax normalization, the sigmoid loss
+     operates solely on image-text pairs and does not require a global view
+     of the pairwise similarities for normalization. It includes vision and
+     text encoders. This backbone outputs the final logit scores
+     corresponding to each image and token input.
+
+     The default constructor gives a fully customizable, randomly
+     initialized SigLIP model with any number of layers, heads, and
+     embedding dimensions. To load preset architectures and weights, use
+     the `from_preset` constructor.
+
+     Args:
+         vision_encoder: The SigLIP vision encoder for encoding the input
+             images.
+         text_encoder: The SigLIP text encoder for encoding the input
+             tokens.
+         dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to
+             use for the model's computations and weights. Note that some
+             computations, such as softmax and layer normalization, will
+             always be done in float32 precision regardless of dtype.
+
+     Example:
+     ```python
+     input_data = {
+         "images": np.ones(shape=(1, 224, 224, 3), dtype="float32"),
+         "token_ids": np.ones(shape=(1, 64), dtype="int32"),
+     }
+
+     # Pretrained SigLIP model.
+     model = keras_hub.models.SigLIPBackbone.from_preset(
+         "siglip_base_patch16_224"
+     )
+     model(input_data)
+
+     # Randomly initialized SigLIP model with custom config.
+     vision_encoder = keras_hub.models.SigLIPVisionEncoder(
+         patch_size=32,
+         hidden_dim=768,
+         num_layers=8,
+         num_heads=8,
+         intermediate_dim=2048,
+         image_shape=(384, 384, 3),
+     )
+     text_encoder = keras_hub.models.SigLIPTextEncoder(
+         vocabulary_size=32000,
+         embedding_dim=768,
+         hidden_dim=768,
+         num_layers=8,
+         num_heads=8,
+         intermediate_dim=2048,
+     )
+     model = keras_hub.models.SigLIPBackbone(
+         vision_encoder=vision_encoder,
+         text_encoder=text_encoder,
+     )
+     model(input_data)
+     ```
+     """
+
+     def __init__(
+         self,
+         vision_encoder,
+         text_encoder,
+         dtype=None,
+         **kwargs,
+     ):
+         # === Layers ===
+         self.vision_encoder = vision_encoder
+         self.text_encoder = text_encoder
+         self.siglip_head = SigLIPHead(dtype=dtype, name="siglip_head")
+
+         # === Functional Model ===
+         image_input = layers.Input(
+             shape=self.vision_encoder.image_shape, name="images"
+         )
+         token_id_input = layers.Input(
+             shape=(None,), dtype="int32", name="token_ids"
+         )
+         vision_embeddings = self.get_vision_embeddings(image_input)
+         text_embeddings = self.get_text_embeddings(token_id_input)
+         vision_logits, text_logits = self.siglip_head(
+             vision_embeddings, text_embeddings
+         )
+
+         super().__init__(
+             inputs={
+                 "images": image_input,
+                 "token_ids": token_id_input,
+             },
+             outputs={
+                 "vision_logits": vision_logits,
+                 "text_logits": text_logits,
+             },
+             dtype=dtype,
+             **kwargs,
+         )
+
+     def compute_loss(
+         self, x, y=None, y_pred=None, sample_weight=None, **kwargs
+     ):
+         outputs = self(x)
+         text_logits = outputs["text_logits"]
+         batch_size = ops.shape(text_logits)[0]
+         eye = ops.eye(batch_size, dtype=text_logits.dtype)
+         # Targets are `1` for matching image-text pairs (the diagonal) and
+         # `-1` for all other pairs.
+         m1_diag1 = -ops.ones_like(text_logits) + 2 * eye
+         return super().compute_loss(
+             x=x,
+             y=m1_diag1,
+             y_pred=text_logits,
+             sample_weight=sample_weight,
+             **kwargs,
+         )
+
+     def compile(
+         self,
+         optimizer="auto",
+         loss="auto",
+         metrics=None,
+         **kwargs,
+     ):
+         """Configures the `SigLIPBackbone` model for training.
+
+         `SigLIPBackbone` extends the default compilation signature of
+         `keras.Model.compile` with defaults for `optimizer` and `loss`. To
+         override these defaults, pass any value to these arguments during
+         compilation.
+
+         Args:
+             optimizer: `"auto"`, an optimizer name, or a `keras.Optimizer`
+                 instance. Defaults to `"auto"`, which uses the default
+                 optimizer for `SigLIPBackbone`. See `keras.Model.compile`
+                 and `keras.optimizers` for more info on possible
+                 `optimizer` values.
+             loss: `"auto"`, a loss name, or a `keras.losses.Loss` instance.
+                 Defaults to `"auto"`, in which case the default loss
+                 computation of `SigLIPBackbone` will be applied. See
+                 `keras.Model.compile` and `keras.losses` for more info on
+                 possible `loss` values.
+             metrics: a list of metrics to be evaluated by the model during
+                 training and testing. Defaults to `None`. See
+                 `keras.Model.compile` and `keras.metrics` for more info on
+                 possible `metrics` values.
+             **kwargs: See `keras.Model.compile` for a full list of
+                 arguments supported by the compile method.
+         """
+         if optimizer == "auto":
+             # Use AdamW instead of the ScalingViT-Adafactor optimizer
+             # mentioned in the paper:
+             # https://arxiv.org/abs/2303.15343 - C. Robustness of SigLIP
+             # results.
+             optimizer = keras.optimizers.AdamW(1e-3, weight_decay=1e-4)
+         if loss == "auto":
+             loss = SigLIPLoss()
+         if metrics == "auto":
+             metrics = [keras.metrics.Accuracy()]
+         super().compile(
+             optimizer=optimizer,
+             loss=loss,
+             metrics=metrics,
+             **kwargs,
+         )
+
+     def get_vision_embeddings(self, images):
+         """Get the embeddings from the vision encoder.
+
+         Args:
+             images: The input tensor for the vision encoder.
+
+         Returns:
+             The output embeddings obtained by applying the projection
+             layer to the pooled output of the vision encoder.
+         """
+         return self.vision_encoder({"images": images})
+
+     def get_text_embeddings(self, token_ids):
+         """Get the embeddings from the text encoder.
+
+         Args:
+             token_ids: The input int tensor for the text encoder.
+
+         Returns:
+             The output embeddings obtained by applying the projection
+             layer to the pooled output of the text encoder.
+         """
+         return self.text_encoder({"token_ids": token_ids})
+
+     def get_config(self):
+         config = super().get_config()
+         config.update(
+             {
+                 "vision_encoder": layers.serialize(self.vision_encoder),
+                 "text_encoder": layers.serialize(self.text_encoder),
+             }
+         )
+         return config
+
+     @classmethod
+     def from_config(cls, config, custom_objects=None):
+         config = config.copy()
+
+         # Propagate `dtype` to submodels if needed.
+         if "dtype" in config and config["dtype"] is not None:
+             dtype_config = config["dtype"]
+             if "dtype" not in config["vision_encoder"]["config"]:
+                 config["vision_encoder"]["config"]["dtype"] = dtype_config
+             if "dtype" not in config["text_encoder"]["config"]:
+                 config["text_encoder"]["config"]["dtype"] = dtype_config
+
+         # We expect submodels to be instantiated.
+         config["vision_encoder"] = layers.deserialize(
+             config["vision_encoder"], custom_objects=custom_objects
+         )
+         config["text_encoder"] = layers.deserialize(
+             config["text_encoder"], custom_objects=custom_objects
+         )
+         return cls(**config)
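For context on `compute_loss` above: the target matrix `m1_diag1` is `+1` on the diagonal (matching image-text pairs) and `-1` everywhere else, which is the label layout of the paper's sigmoid loss, `L = -(1/|B|) * sum_ij log(sigmoid(z_ij * (t * x_i . y_j + b)))`. A NumPy sketch of that computation, under the assumption that `SigLIPLoss` follows the paper's Eq. 1 (this helper is illustrative, not the package's implementation):

```python
import numpy as np


def sigmoid_loss_np(logits):
    # `logits` is the (batch, batch) matrix of scaled pairwise
    # similarities t * (x_i . y_j) + b produced by the SigLIP head.
    n = logits.shape[0]
    z = -np.ones_like(logits) + 2.0 * np.eye(n)  # +1 diagonal, -1 elsewhere
    # -log(sigmoid(a)) == log(1 + exp(-a)), summed over all pairs and
    # averaged over the batch.
    return np.sum(np.log1p(np.exp(-z * logits))) / n


rng = np.random.default_rng(0)
logits = rng.normal(size=(4, 4)).astype("float32")
print(sigmoid_loss_np(logits))
```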
@@ -0,0 +1,8 @@
+ from keras_hub.src.api_export import keras_hub_export
+ from keras_hub.src.layers.preprocessing.image_converter import ImageConverter
+ from keras_hub.src.models.siglip.siglip_backbone import SigLIPBackbone
+
+
+ @keras_hub_export("keras_hub.layers.SigLIPImageConverter")
+ class SigLIPImageConverter(ImageConverter):
+     backbone_cls = SigLIPBackbone
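Registering the converter against `SigLIPBackbone` ties SigLIP's image preprocessing to its checkpoints, so both can load from the same preset. A hedged sketch (preset name reused from the backbone docstring; the exact resize and rescale settings ship with the preset itself):

```python
import numpy as np

import keras_hub

# Loads the resize/rescale configuration stored with the preset.
converter = keras_hub.layers.ImageConverter.from_preset(
    "siglip_base_patch16_224"
)
images = np.random.uniform(0, 255, size=(1, 512, 512, 3)).astype("float32")
model_inputs = converter(images)  # Resized to the preset's image size.
```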