keras-hub-nightly 0.19.0.dev202503020350__py3-none-any.whl → 0.19.0.dev202503040351__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
keras_hub/src/models/siglip/siglip_preprocessor.py
@@ -0,0 +1,162 @@
+ import keras
+
+ from keras_hub.src.api_export import keras_hub_export
+ from keras_hub.src.layers.preprocessing.start_end_packer import StartEndPacker
+ from keras_hub.src.models.causal_lm_preprocessor import CausalLMPreprocessor
+ from keras_hub.src.models.siglip.siglip_backbone import SigLIPBackbone
+ from keras_hub.src.models.siglip.siglip_image_converter import (
+     SigLIPImageConverter,
+ )
+ from keras_hub.src.models.siglip.siglip_tokenizer import SigLIPTokenizer
+ from keras_hub.src.utils.tensor_utils import preprocessing_function
+
+ try:
+     import tensorflow as tf
+ except ImportError:
+     tf = None
+
+
+ @keras_hub_export("keras_hub.models.SigLIPPreprocessor")
+ class SigLIPPreprocessor(CausalLMPreprocessor):
+     """SigLIP preprocessor.
+
+     This preprocessing layer is meant for use with
+     `keras_hub.models.SigLIPBackbone`. By default, it will take in batches of
+     strings and images, and return token ids and resized images.
+
+     Args:
+         tokenizer: A `keras_hub.models.SigLIPTokenizer` instance.
+         image_converter: A `keras_hub.models.SigLIPImageConverter` instance.
+         sequence_length: The length of the packed inputs.
+         add_start_token: If `True`, the preprocessor will prepend the
+             tokenizer start token to each input sequence. Defaults to
+             `False`.
+         add_end_token: If `True`, the preprocessor will append the tokenizer
+             end token to each input sequence. Defaults to `True`.
+         canonicalize_text: If `True`, the input strings will be canonicalized
+             (converted to lowercase, with punctuation removed and whitespace
+             stripped). Defaults to `True`.
+
+     Call arguments:
+         x: A dict with `"prompts"` and `"images"` keys, where `"prompts"` is
+             a `tf.Tensor` or a list of Python strings and `"images"` are the
+             image tensors.
+         y: Label data. Should always be `None` since SigLIP doesn't need the
+             label to calculate the loss.
+         sample_weight: Label weights.
+         sequence_length: Pass to override the configured `sequence_length` of
+             the layer.
+
+     Examples:
+     ```python
+     # Load the preprocessor from a preset.
+     preprocessor = keras_hub.models.SigLIPPreprocessor.from_preset(
+         "siglip_base_patch16_224"
+     )
+
+     # Tokenize the sentence and preprocess the image.
+     preprocessor(
+         {
+             "prompts": "The quick brown fox jumped.",
+             "images": np.ones(shape=(123, 123, 3)),
+         }
+     )
+
+     # Tokenize a batch of sentences and preprocess a batch of images.
+     preprocessor(
+         {
+             "prompts": ["The quick brown fox jumped.", "The fox slept."],
+             "images": np.ones(shape=(2, 123, 123, 3)),
+         }
+     )
+     ```
+     """
+
+     backbone_cls = SigLIPBackbone
+     tokenizer_cls = SigLIPTokenizer
+     image_converter_cls = SigLIPImageConverter
+
+     def __init__(
+         self,
+         tokenizer,
+         image_converter=None,
+         sequence_length=64,
+         add_start_token=False,
+         add_end_token=True,
+         canonicalize_text=True,
+         **kwargs,
+     ):
+         super().__init__(
+             tokenizer=tokenizer,
+             sequence_length=sequence_length,
+             add_start_token=add_start_token,
+             add_end_token=add_end_token,
+             **kwargs,
+         )
+         self.image_converter = image_converter
+         self.canonicalize_text = bool(canonicalize_text)
+
+     def build(self, input_shape):
+         # Defer packer creation to `build()` so that we can be sure tokenizer
+         # assets have loaded when restoring a saved model.
+         self.packer = StartEndPacker(
+             end_value=self.tokenizer.end_token_id,
+             pad_value=self.tokenizer.pad_token_id,
+             sequence_length=self.sequence_length,
+             return_padding_mask=True,
+         )
+         self.built = True
+
+     def canonicalize_inputs(self, inputs):
+         # Ref: https://github.com/google-research/big_vision/blob/main/big_vision/evaluators/proj/image_text/prompt_engineering.py
+         inputs = tf.convert_to_tensor(inputs)
+         # Lowercase.
+         inputs = tf.strings.lower(inputs)
+         # Remove punctuation.
+         inputs = tf.strings.regex_replace(
+             inputs,
+             (
+                 r"!|\"|#|\$|%|&|\\|'|\(|\)|\*|\+|,|-|\.|/|:|;|<|=|>|\?|@|\[|\\|"
+                 r"\]|\^|_|`|{|\||}|~"
+             ),
+             "",
+         )
+         # Collapse whitespace runs and strip the ends.
+         inputs = tf.strings.regex_replace(inputs, r"\s+", " ")
+         inputs = tf.strings.strip(inputs)
+         return inputs
+
+     @preprocessing_function
+     def call(
+         self,
+         x,
+         y=None,
+         sample_weight=None,
+         sequence_length=None,
+     ):
+         sequence_length = sequence_length or self.sequence_length
+         images, prompts = x["images"], x["prompts"]
+         if self.canonicalize_text:
+             prompts = self.canonicalize_inputs(prompts)
+         prompts = self.tokenizer(prompts)
+         if self.image_converter:
+             images = self.image_converter(images)
+         token_ids, padding_mask = self.packer(
+             prompts,
+             sequence_length=sequence_length,
+             add_start_value=self.add_start_token,
+             add_end_value=self.add_end_token,
+         )
+         x = {
+             "token_ids": token_ids,
+             "padding_mask": padding_mask,
+             "images": images,
+         }
+         return keras.utils.pack_x_y_sample_weight(x, y, sample_weight)
+
+     def get_config(self):
+         config = super().get_config()
+         config.update(
+             {
+                 "canonicalize_text": self.canonicalize_text,
+             }
+         )
+         return config
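The canonicalization in `canonicalize_inputs` mirrors the big_vision prompt-engineering reference linked in the code: lowercase, strip ASCII punctuation, collapse whitespace runs, then trim. A minimal standalone sketch of the same transform, reusing the pattern from the hunk above (illustrative only, not part of the package; assumes TensorFlow is installed):

```python
import tensorflow as tf

# Same punctuation alternation as in `SigLIPPreprocessor.canonicalize_inputs`.
PUNCTUATION = (
    r"!|\"|#|\$|%|&|\\|'|\(|\)|\*|\+|,|-|\.|/|:|;|<|=|>|\?|@|\[|\\|"
    r"\]|\^|_|`|{|\||}|~"
)


def canonicalize(texts):
    # Lowercase, drop punctuation, collapse whitespace runs, strip the ends.
    x = tf.strings.lower(tf.convert_to_tensor(texts))
    x = tf.strings.regex_replace(x, PUNCTUATION, "")
    x = tf.strings.regex_replace(x, r"\s+", " ")
    return tf.strings.strip(x)


print(canonicalize(["The QUICK, brown fox jumped!"]))
# tf.Tensor([b'the quick brown fox jumped'], shape=(1,), dtype=string)
```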
keras_hub/src/models/siglip/siglip_presets.py
@@ -0,0 +1,128 @@
+ """SigLIP model preset configurations."""
+
+ # Metadata for loading pretrained model weights.
+ backbone_presets = {
+     "siglip_base_patch16_224": {
+         "metadata": {
+             "description": (
+                 "200 million parameter, image size 224, pre-trained on WebLi."
+             ),
+             "params": 203156230,
+             "official_name": "SigLIP",
+             "path": "siglip",
+             "model_card": "https://www.kaggle.com/models/kerashub/siglip",
+         },
+         "kaggle_handle": "kaggle://kerashub/siglip/keras/siglip_base_patch16_224/2",
+     },
+     "siglip_base_patch16_256": {
+         "metadata": {
+             "description": (
+                 "200 million parameter, image size 256, pre-trained on WebLi."
+             ),
+             "params": 203202370,
+             "official_name": "SigLIP",
+             "path": "siglip",
+             "model_card": "https://www.kaggle.com/models/kerashub/siglip",
+         },
+         "kaggle_handle": "kaggle://kerashub/siglip/keras/siglip_base_patch16_256/1",
+     },
+     "siglip_base_patch16_384": {
+         "metadata": {
+             "description": (
+                 "200 million parameter, image size 384, pre-trained on WebLi."
+             ),
+             "params": 203448450,
+             "official_name": "SigLIP",
+             "path": "siglip",
+             "model_card": "https://www.kaggle.com/models/kerashub/siglip",
+         },
+         "kaggle_handle": "kaggle://kerashub/siglip/keras/siglip_base_patch16_384/1",
+     },
+     "siglip_base_patch16_512": {
+         "metadata": {
+             "description": (
+                 "200 million parameter, image size 512, pre-trained on WebLi."
+             ),
+             "params": 203792962,
+             "official_name": "SigLIP",
+             "path": "siglip",
+             "model_card": "https://www.kaggle.com/models/kerashub/siglip",
+         },
+         "kaggle_handle": "kaggle://kerashub/siglip/keras/siglip_base_patch16_512/1",
+     },
+     "siglip_large_patch16_256": {
+         "metadata": {
+             "description": (
+                 "652 million parameter, image size 256, pre-trained on WebLi."
+             ),
+             "params": 652151106,
+             "official_name": "SigLIP",
+             "path": "siglip",
+             "model_card": "https://www.kaggle.com/models/kerashub/siglip",
+         },
+         "kaggle_handle": "kaggle://kerashub/siglip/keras/siglip_large_patch16_256/1",
+     },
+     "siglip_large_patch16_384": {
+         "metadata": {
+             "description": (
+                 "652 million parameter, image size 384, pre-trained on WebLi."
+             ),
+             "params": 652479106,
+             "official_name": "SigLIP",
+             "path": "siglip",
+             "model_card": "https://www.kaggle.com/models/kerashub/siglip",
+         },
+         "kaggle_handle": "kaggle://kerashub/siglip/keras/siglip_large_patch16_384/1",
+     },
+     "siglip_so400m_patch14_224": {
+         "metadata": {
+             "description": (
+                 "877 million parameter, image size 224, "
+                 "shape-optimized version, pre-trained on WebLi."
+             ),
+             "params": 877360578,
+             "official_name": "SigLIP",
+             "path": "siglip",
+             "model_card": "https://www.kaggle.com/models/kerashub/siglip",
+         },
+         "kaggle_handle": "kaggle://kerashub/siglip/keras/siglip_so400m_patch14_224/2",
+     },
+     "siglip_so400m_patch14_384": {
+         "metadata": {
+             "description": (
+                 "877 million parameter, image size 384, "
+                 "shape-optimized version, pre-trained on WebLi."
+             ),
+             "params": 877961291,
+             "official_name": "SigLIP",
+             "path": "siglip",
+             "model_card": "https://www.kaggle.com/models/kerashub/siglip",
+         },
+         "kaggle_handle": "kaggle://kerashub/siglip/keras/siglip_so400m_patch14_384/1",
+     },
+     "siglip_so400m_patch16_256_i18n": {
+         "metadata": {
+             "description": (
+                 "1.1 billion parameter, image size 256, "
+                 "shape-optimized version, pre-trained on WebLi."
+             ),
+             "params": 1128759282,
+             "official_name": "SigLIP",
+             "path": "siglip",
+             "model_card": "https://www.kaggle.com/models/kerashub/siglip",
+         },
+         "kaggle_handle": "kaggle://kerashub/siglip/keras/siglip_so400m_patch16_256_i18n/1",
+     },
+     "siglip_base_patch16_256_multilingual": {
+         "metadata": {
+             "description": (
+                 "370 million parameter, image size 256, pre-trained on WebLi."
+             ),
+             "params": 370626370,
+             "official_name": "SigLIP",
+             "path": "siglip",
+             "model_card": "https://www.kaggle.com/models/kerashub/siglip",
+         },
+         "kaggle_handle": "kaggle://kerashub/siglip/keras/siglip_base_patch16_256_multilingual/1",
+     },
+ }
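Each entry above maps a preset name to a versioned Kaggle handle; models resolve these through the standard keras-hub `from_preset()` factory. A minimal usage sketch (assumes network access so the weights can be downloaded on first use):

```python
import keras_hub

# Resolves the name via `backbone_presets` above and downloads the weights
# from the registered Kaggle handle.
preprocessor = keras_hub.models.SigLIPPreprocessor.from_preset(
    "siglip_base_patch16_224"
)
backbone = keras_hub.models.SigLIPBackbone.from_preset(
    "siglip_base_patch16_224"
)
```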
keras_hub/src/models/siglip/siglip_text_encoder.py
@@ -0,0 +1,134 @@
+ from keras import initializers
+ from keras import layers
+
+ from keras_hub.src.api_export import keras_hub_export
+ from keras_hub.src.models.backbone import Backbone
+ from keras_hub.src.models.siglip.siglip_layers import SigLIPEncoderLayer
+ from keras_hub.src.models.siglip.siglip_layers import SigLIPTextEmbedding
+
+
+ @keras_hub_export("keras_hub.models.SigLIPTextEncoder")
+ class SigLIPTextEncoder(Backbone):
+     """SigLIP text core network with hyperparameters.
+
+     Args:
+         vocabulary_size: int. The size of the token vocabulary.
+         embedding_dim: int. The output dimension of the embedding layer.
+         hidden_dim: int. The size of the transformer hidden state at the end
+             of each transformer layer.
+         num_layers: int. The number of transformer layers.
+         num_heads: int. The number of attention heads for each transformer.
+         intermediate_dim: int. The output dimension of the first Dense layer
+             in a two-layer feedforward network for each transformer.
+         intermediate_activation: activation function. The activation that is
+             used for the first Dense layer in a two-layer feedforward network
+             for each transformer. Defaults to `"gelu_approximate"`.
+         layer_norm_epsilon: float. The epsilon for the layer normalization.
+             Defaults to `1e-6`.
+         max_sequence_length: int. The maximum sequence length that this
+             encoder can consume. Defaults to `64`.
+         dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to
+             use for the model's computations and weights. Note that some
+             computations, such as softmax and layer normalization, will
+             always be done in float32 precision regardless of dtype.
+     """
+
+     def __init__(
+         self,
+         vocabulary_size,
+         embedding_dim,
+         hidden_dim,
+         num_layers,
+         num_heads,
+         intermediate_dim,
+         intermediate_activation="gelu_approximate",
+         layer_norm_epsilon=1e-6,
+         max_sequence_length=64,
+         dtype=None,
+         name=None,
+         **kwargs,
+     ):
+         # `prefix` is used to prevent duplicate names when utilizing multiple
+         # SigLIP encoders within a single model.
+         prefix = str(name) + "_" if name is not None else ""
+
+         # === Layers ===
+         self.embedding = SigLIPTextEmbedding(
+             vocabulary_size=vocabulary_size,
+             sequence_length=max_sequence_length,
+             embedding_dim=embedding_dim,
+             dtype=dtype,
+             name=f"{prefix}embedding",
+         )
+         self.encoder_layers = [
+             SigLIPEncoderLayer(
+                 hidden_dim,
+                 num_heads,
+                 intermediate_dim,
+                 intermediate_activation,
+                 layer_norm_epsilon=layer_norm_epsilon,
+                 dtype=dtype,
+                 name=f"{prefix}encoder_block_{i}",
+             )
+             for i in range(num_layers)
+         ]
+         self.post_layer_norm = layers.LayerNormalization(
+             epsilon=layer_norm_epsilon,
+             dtype=dtype,
+             name=f"{prefix}post_layer_norm",
+         )
+         self.head = layers.Dense(
+             hidden_dim,
+             kernel_initializer=initializers.LecunNormal(),
+             dtype=dtype,
+             name=f"{prefix}head",
+         )
+
+         # === Functional Model ===
+         token_id_input = layers.Input(
+             shape=(None,), dtype="int32", name="token_ids"
+         )
+         x = self.embedding(token_id_input)
+         for block in self.encoder_layers:
+             x = block(x)
+         x = self.post_layer_norm(x)
+
+         # Assuming "sticky" EOS tokenization, the last token is always EOS.
+         x = x[:, -1, :]
+         x = self.head(x)
+         outputs = x
+         super().__init__(
+             inputs={"token_ids": token_id_input},
+             outputs=outputs,
+             dtype=dtype,
+             name=name,
+             **kwargs,
+         )
+
+         # === Config ===
+         self.vocabulary_size = vocabulary_size
+         self.embedding_dim = embedding_dim
+         self.hidden_dim = hidden_dim
+         self.num_layers = num_layers
+         self.num_heads = num_heads
+         self.intermediate_dim = intermediate_dim
+         self.intermediate_activation = intermediate_activation
+         self.layer_norm_epsilon = layer_norm_epsilon
+         self.max_sequence_length = max_sequence_length
+
+     def get_config(self):
+         config = super().get_config()
+         config.update(
+             {
+                 "vocabulary_size": self.vocabulary_size,
+                 "embedding_dim": self.embedding_dim,
+                 "hidden_dim": self.hidden_dim,
+                 "num_layers": self.num_layers,
+                 "num_heads": self.num_heads,
+                 "intermediate_dim": self.intermediate_dim,
+                 "intermediate_activation": self.intermediate_activation,
+                 "layer_norm_epsilon": self.layer_norm_epsilon,
+                 "max_sequence_length": self.max_sequence_length,
+             }
+         )
+         return config
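Since `SigLIPTextEncoder` is a functional `Backbone` keyed on `"token_ids"`, it can be instantiated directly to sanity-check shapes. A sketch with tiny, made-up hyperparameters (real configurations come from the presets):

```python
import numpy as np
import keras_hub

# Toy sizes for illustration only.
encoder = keras_hub.models.SigLIPTextEncoder(
    vocabulary_size=64,
    embedding_dim=8,
    hidden_dim=8,
    num_layers=2,
    num_heads=2,
    intermediate_dim=16,
    max_sequence_length=16,
)
token_ids = np.random.randint(0, 64, size=(2, 16))
# Pools the last ("sticky" EOS) position and projects it with the dense head.
features = encoder({"token_ids": token_ids})  # shape: (2, 8)
```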
keras_hub/src/models/siglip/siglip_tokenizer.py
@@ -0,0 +1,77 @@
+ from keras_hub.src.api_export import keras_hub_export
+ from keras_hub.src.models.siglip.siglip_backbone import SigLIPBackbone
+ from keras_hub.src.tokenizers.sentence_piece_tokenizer import (
+     SentencePieceTokenizer,
+ )
+
+
+ @keras_hub_export(
+     [
+         "keras_hub.tokenizers.SigLIPTokenizer",
+         "keras_hub.models.SigLIPTokenizer",
+     ]
+ )
+ class SigLIPTokenizer(SentencePieceTokenizer):
+     """SigLIP tokenizer layer based on SentencePiece.
+
+     This tokenizer class will tokenize raw strings into integer sequences and
+     is based on `keras_hub.tokenizers.SentencePieceTokenizer`. Unlike the
+     underlying tokenizer, it will check for all special tokens needed by
+     SigLIP models and provides a `from_preset()` method to automatically
+     download a matching vocabulary for a SigLIP preset.
+
+     If input is a batch of strings (rank > 0), the layer will output a
+     `tf.RaggedTensor` where the last dimension of the output is ragged.
+
+     If input is a scalar string (rank == 0), the layer will output a dense
+     `tf.Tensor` with static shape `[None]`.
+
+     Args:
+         proto: Either a `string` path to a SentencePiece proto file, or a
+             `bytes` object with a serialized SentencePiece proto. See the
+             [SentencePiece repository](https://github.com/google/sentencepiece)
+             for more details on the format.
+
+     Examples:
+
+     ```python
+     # Unbatched input.
+     tokenizer = keras_hub.models.SigLIPTokenizer.from_preset(
+         "siglip_base_patch16_224"
+     )
+     tokenizer("The quick brown fox jumped.")
+
+     # Batched input.
+     tokenizer(["The quick brown fox jumped.", "The fox slept."])
+
+     # Detokenization.
+     tokenizer.detokenize(tokenizer("The quick brown fox jumped."))
+
+     # Custom vocabulary.
+     bytes_io = io.BytesIO()
+     ds = tf.data.Dataset.from_tensor_slices(["The quick brown fox jumped."])
+     sentencepiece.SentencePieceTrainer.train(
+         sentence_iterator=ds.as_numpy_iterator(),
+         model_writer=bytes_io,
+         vocab_size=8,
+         model_type="WORD",
+         pad_id=0,
+         bos_id=1,
+         eos_id=2,
+         unk_id=3,
+         unk_piece="<unk>",
+     )
+     tokenizer = keras_hub.models.SigLIPTokenizer(
+         proto=bytes_io.getvalue(),
+     )
+     tokenizer("The quick brown fox jumped.")
+     ```
+     """
+
+     backbone_cls = SigLIPBackbone
+
+     def __init__(self, proto, **kwargs):
+         self._add_special_token("<unk>", "unknown_token")
+         self._add_special_token("</s>", "end_token")
+         self._add_special_token("</s>", "pad_token")
+         super().__init__(proto=proto, **kwargs)
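Note that `</s>` is registered as both the end and pad token, so sequences packed by the preprocessor's `StartEndPacker` are padded with EOS. That is what makes the text encoder's "sticky EOS" pooling of the last position safe. An illustration with made-up token ids (assuming `</s>` maps to id 2):

```python
from keras_hub.src.layers.preprocessing.start_end_packer import StartEndPacker

packer = StartEndPacker(
    end_value=2, pad_value=2, sequence_length=6, return_padding_mask=True
)
token_ids, padding_mask = packer([[5, 7, 9]])
# token_ids -> [[5, 7, 9, 2, 2, 2]]: the last position always holds EOS.
```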
keras_hub/src/models/siglip/siglip_vision_encoder.py
@@ -0,0 +1,151 @@
+ from keras import layers
+
+ from keras_hub.src.api_export import keras_hub_export
+ from keras_hub.src.models.backbone import Backbone
+ from keras_hub.src.models.siglip.siglip_layers import SigLIPEncoderLayer
+ from keras_hub.src.models.siglip.siglip_layers import (
+     SigLIPMultiHeadAttentionPooling,
+ )
+ from keras_hub.src.models.siglip.siglip_layers import SigLIPVisionEmbedding
+ from keras_hub.src.utils.keras_utils import standardize_data_format
+
+
+ @keras_hub_export("keras_hub.models.SigLIPVisionEncoder")
+ class SigLIPVisionEncoder(Backbone):
+     """SigLIP vision core network with hyperparameters.
+
+     Args:
+         patch_size: int. The size of each square patch in the input image.
+         hidden_dim: int. The size of the transformer hidden state at the end
+             of each transformer layer.
+         num_layers: int. The number of transformer layers.
+         num_heads: int. The number of attention heads for each transformer.
+         intermediate_dim: int. The output dimension of the first Dense layer
+             in a two-layer feedforward network for each transformer.
+         intermediate_activation: activation function. The activation that is
+             used for the first Dense layer in a two-layer feedforward network
+             for each transformer. Defaults to `"gelu_approximate"`.
+         layer_norm_epsilon: float. The epsilon for the layer normalization.
+             Defaults to `1e-6`.
+         image_shape: tuple. The input shape without the batch size. Defaults
+             to `(224, 224, 3)`.
+         data_format: `None` or str. If specified, either `"channels_last"` or
+             `"channels_first"`. The ordering of the dimensions in the inputs.
+             `"channels_last"` corresponds to inputs with shape
+             `(batch_size, height, width, channels)` while `"channels_first"`
+             corresponds to inputs with shape
+             `(batch_size, channels, height, width)`. It defaults to the
+             `image_data_format` value found in your Keras config file at
+             `~/.keras/keras.json`. If you never set it, then it will be
+             `"channels_last"`.
+         dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to
+             use for the model's computations and weights. Note that some
+             computations, such as softmax and layer normalization, will
+             always be done in float32 precision regardless of dtype.
+     """
+
+     def __init__(
+         self,
+         patch_size,
+         hidden_dim,
+         num_layers,
+         num_heads,
+         intermediate_dim,
+         intermediate_activation="gelu_approximate",
+         layer_norm_epsilon=1e-6,
+         image_shape=(224, 224, 3),
+         data_format=None,
+         dtype=None,
+         name=None,
+         **kwargs,
+     ):
+         data_format = standardize_data_format(data_format)
+         if data_format == "channels_last":
+             height, width = image_shape[0], image_shape[1]
+         else:
+             height, width = image_shape[1], image_shape[2]
+         if height != width:
+             raise ValueError(
+                 "`SigLIPVisionEncoder` expects the height and width to be "
+                 f"the same in `image_shape`. Received: "
+                 f"image_shape={image_shape}"
+             )
+
+         # `prefix` is used to prevent duplicate names when utilizing multiple
+         # SigLIP encoders within a single model.
+         prefix = str(name) + "_" if name is not None else ""
+
+         # === Layers ===
+         self.embedding = SigLIPVisionEmbedding(
+             hidden_dim=hidden_dim,
+             patch_size=patch_size,
+             image_size=height,
+             data_format=data_format,
+             dtype=dtype,
+             name=f"{prefix}embedding",
+         )
+         self.encoder_layers = [
+             SigLIPEncoderLayer(
+                 hidden_dim,
+                 num_heads,
+                 intermediate_dim,
+                 intermediate_activation,
+                 layer_norm_epsilon=layer_norm_epsilon,
+                 dtype=dtype,
+                 name=f"{prefix}encoder_block_{i}",
+             )
+             for i in range(num_layers)
+         ]
+         self.post_layer_norm = layers.LayerNormalization(
+             epsilon=layer_norm_epsilon,
+             dtype=dtype,
+             name=f"{prefix}post_layer_norm",
+         )
+         self.head = SigLIPMultiHeadAttentionPooling(
+             hidden_dim,
+             intermediate_dim,
+             num_heads,
+             intermediate_activation,
+             layer_norm_epsilon,
+             dtype=dtype,
+             name=f"{prefix}head",
+         )
+
+         # === Functional Model ===
+         image_input = layers.Input(shape=image_shape, name="images")
+         x = self.embedding(image_input)
+         for block in self.encoder_layers:
+             x = block(x)
+         x = self.post_layer_norm(x)
+         x = self.head(x)
+         outputs = x
+         super().__init__(
+             inputs={"images": image_input},
+             outputs=outputs,
+             dtype=dtype,
+             name=name,
+             **kwargs,
+         )
+
+         # === Config ===
+         self.patch_size = patch_size
+         self.hidden_dim = hidden_dim
+         self.num_layers = num_layers
+         self.num_heads = num_heads
+         self.intermediate_dim = intermediate_dim
+         self.intermediate_activation = intermediate_activation
+         self.layer_norm_epsilon = layer_norm_epsilon
+         self.image_shape = image_shape
+
+     def get_config(self):
+         config = super().get_config()
+         config.update(
+             {
+                 "patch_size": self.patch_size,
+                 "hidden_dim": self.hidden_dim,
+                 "num_layers": self.num_layers,
+                 "num_heads": self.num_heads,
+                 "intermediate_dim": self.intermediate_dim,
+                 "intermediate_activation": self.intermediate_activation,
+                 "layer_norm_epsilon": self.layer_norm_epsilon,
+                 "image_shape": self.image_shape,
+             }
+         )
+         return config
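As with the text encoder, the vision tower can be exercised stand-alone; the attention-pooling head collapses the patch sequence into one vector per image. A sketch with tiny, made-up sizes (real presets use e.g. `patch_size=16` and `image_shape=(224, 224, 3)`):

```python
import numpy as np
import keras_hub

# Toy sizes for illustration only; 32x32 images in 4x4 patches = 64 tokens.
encoder = keras_hub.models.SigLIPVisionEncoder(
    patch_size=4,
    hidden_dim=8,
    num_layers=2,
    num_heads=2,
    intermediate_dim=16,
    image_shape=(32, 32, 3),
)
images = np.random.uniform(size=(2, 32, 32, 3)).astype("float32")
features = encoder({"images": images})  # attention-pooled, shape: (2, 8)
```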
keras_hub/src/version_utils.py
@@ -1,7 +1,7 @@
  from keras_hub.src.api_export import keras_hub_export
 
  # Unique source of truth for the version number.
- __version__ = "0.19.0.dev202503020350"
+ __version__ = "0.19.0.dev202503040351"
 
 
  @keras_hub_export("keras_hub.version")
keras_hub_nightly-0.19.0.dev202503040351.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: keras-hub-nightly
- Version: 0.19.0.dev202503020350
+ Version: 0.19.0.dev202503040351
  Summary: Industry-strength Natural Language Processing extensions for Keras.
  Home-page: https://github.com/keras-team/keras-hub
  Author: Keras team