keras-hub-nightly 0.19.0.dev202503020350__py3-none-any.whl → 0.19.0.dev202503040351__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in that public registry.
- keras_hub/api/layers/__init__.py +3 -0
- keras_hub/api/models/__init__.py +7 -0
- keras_hub/api/tokenizers/__init__.py +1 -0
- keras_hub/src/layers/preprocessing/image_converter.py +97 -1
- keras_hub/src/models/pali_gemma/pali_gemma_vit.py +5 -1
- keras_hub/src/models/siglip/__init__.py +5 -0
- keras_hub/src/models/siglip/siglip_backbone.py +230 -0
- keras_hub/src/models/siglip/siglip_image_converter.py +8 -0
- keras_hub/src/models/siglip/siglip_layers.py +555 -0
- keras_hub/src/models/siglip/siglip_loss.py +35 -0
- keras_hub/src/models/siglip/siglip_preprocessor.py +162 -0
- keras_hub/src/models/siglip/siglip_presets.py +128 -0
- keras_hub/src/models/siglip/siglip_text_encoder.py +134 -0
- keras_hub/src/models/siglip/siglip_tokenizer.py +77 -0
- keras_hub/src/models/siglip/siglip_vision_encoder.py +151 -0
- keras_hub/src/version_utils.py +1 -1
- {keras_hub_nightly-0.19.0.dev202503020350.dist-info → keras_hub_nightly-0.19.0.dev202503040351.dist-info}/METADATA +1 -1
- {keras_hub_nightly-0.19.0.dev202503020350.dist-info → keras_hub_nightly-0.19.0.dev202503040351.dist-info}/RECORD +20 -10
- {keras_hub_nightly-0.19.0.dev202503020350.dist-info → keras_hub_nightly-0.19.0.dev202503040351.dist-info}/WHEEL +0 -0
- {keras_hub_nightly-0.19.0.dev202503020350.dist-info → keras_hub_nightly-0.19.0.dev202503040351.dist-info}/top_level.txt +0 -0
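Taken together, these files add a complete SigLIP model family (backbone, vision and text encoders, tokenizer, image converter, preprocessor, loss, and presets) and wire it into the public `keras_hub` API. As a quick orientation before the per-file diffs below, here is a minimal sketch of the new surface, assuming this nightly build is installed; the preset name is taken from the docstrings and preset table further down:

```python
import numpy as np
import keras_hub

# The new preprocessor turns raw text/image pairs into model-ready tensors.
preprocessor = keras_hub.models.SigLIPPreprocessor.from_preset(
    "siglip_base_patch16_224"
)
inputs = preprocessor(
    {
        "prompts": ["a photo of a cat"],
        "images": np.ones(shape=(1, 224, 224, 3)),
    }
)
# `inputs` now holds "token_ids", "padding_mask", and "images".
```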
keras_hub/src/models/siglip/siglip_preprocessor.py
ADDED
@@ -0,0 +1,162 @@
+import keras
+
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.layers.preprocessing.start_end_packer import StartEndPacker
+from keras_hub.src.models.causal_lm_preprocessor import CausalLMPreprocessor
+from keras_hub.src.models.siglip.siglip_backbone import SigLIPBackbone
+from keras_hub.src.models.siglip.siglip_image_converter import (
+    SigLIPImageConverter,
+)
+from keras_hub.src.models.siglip.siglip_tokenizer import SigLIPTokenizer
+from keras_hub.src.utils.tensor_utils import preprocessing_function
+
+try:
+    import tensorflow as tf
+except ImportError:
+    tf = None
+
+
+@keras_hub_export("keras_hub.models.SigLIPPreprocessor")
+class SigLIPPreprocessor(CausalLMPreprocessor):
+    """SigLIP preprocessor.
+
+    This preprocessing layer is meant for use with
+    `keras_hub.models.SigLIPBackbone`. By default, it will take in batches of
+    strings and images, and return token ids and resized images.
+
+    Args:
+        tokenizer: A `keras_hub.models.SigLIPTokenizer` instance.
+        image_converter: A `keras_hub.models.SigLIPImageConverter` instance.
+        sequence_length: The length of the packed inputs.
+        add_start_token: If `True`, the preprocessor will prepend the tokenizer
+            start token to each input sequence. Defaults to `False`.
+        add_end_token: If `True`, the preprocessor will append the tokenizer
+            end token to each input sequence. Defaults to `True`.
+        canonicalize_text: If `True`, the input strings will be canonicalized
+            (converted to lowercase, punctuation removed, and stripped).
+
+    Call arguments:
+        x: A dict with `"prompts"` and `"images"` keys, where `"prompts"` is
+            a `tf.Tensor` or list of Python strings and `"images"` are the
+            image tensors.
+        y: Label data. Should always be `None` since SigLIP doesn't need the
+            label to calculate the loss.
+        sample_weight: Label weights.
+        sequence_length: Pass to override the configured `sequence_length` of
+            the layer.
+
+    Examples:
+    ```python
+    # Load the preprocessor from a preset.
+    preprocessor = keras_hub.models.SigLIPPreprocessor.from_preset(
+        "siglip_base_patch16_224"
+    )
+
+    # Tokenize the sentence and preprocess the image.
+    preprocessor(
+        {
+            "prompts": "The quick brown fox jumped.",
+            "images": np.ones(shape=(123, 123, 3)),
+        }
+    )
+
+    # Tokenize a batch of sentences and preprocess a batch of images.
+    preprocessor(
+        {
+            "prompts": ["The quick brown fox jumped.", "The fox slept."],
+            "images": np.ones(shape=(2, 123, 123, 3)),
+        }
+    )
+    ```
+    """
+
+    backbone_cls = SigLIPBackbone
+    tokenizer_cls = SigLIPTokenizer
+    image_converter_cls = SigLIPImageConverter
+
+    def __init__(
+        self,
+        tokenizer,
+        image_converter=None,
+        sequence_length=64,
+        add_start_token=False,
+        add_end_token=True,
+        canonicalize_text=True,
+        **kwargs,
+    ):
+        super().__init__(
+            tokenizer=tokenizer,
+            sequence_length=sequence_length,
+            add_start_token=add_start_token,
+            add_end_token=add_end_token,
+            **kwargs,
+        )
+        self.image_converter = image_converter
+        self.canonicalize_text = bool(canonicalize_text)
+
+    def build(self, input_shape):
+        # Defer packer creation to `build()` so that we can be sure tokenizer
+        # assets have loaded when restoring a saved model.
+        self.packer = StartEndPacker(
+            end_value=self.tokenizer.end_token_id,
+            pad_value=self.tokenizer.pad_token_id,
+            sequence_length=self.sequence_length,
+            return_padding_mask=True,
+        )
+        self.built = True
+
+    def canonicalize_inputs(self, inputs):
+        # Ref: https://github.com/google-research/big_vision/blob/main/big_vision/evaluators/proj/image_text/prompt_engineering.py
+        inputs = tf.convert_to_tensor(inputs)
+        # Do lower case.
+        inputs = tf.strings.lower(inputs)
+        # Remove punctuation.
+        inputs = tf.strings.regex_replace(
+            inputs,
+            (
+                r"!|\"|#|\$|%|&|\\|'|\(|\)|\*|\+|,|-|\.|/|:|;|<|=|>|\?|@|\[|\\|"
+                r"\]|\^|_|`|{|\||}|~"
+            ),
+            "",
+        )
+        inputs = tf.strings.regex_replace(inputs, r"\s+", " ")
+        inputs = tf.strings.strip(inputs)
+        return inputs
+
+    @preprocessing_function
+    def call(
+        self,
+        x,
+        y=None,
+        sample_weight=None,
+        sequence_length=None,
+    ):
+        sequence_length = sequence_length or self.sequence_length
+        images, prompts = x["images"], x["prompts"]
+        if self.canonicalize_text:
+            prompts = self.canonicalize_inputs(prompts)
+        prompts = self.tokenizer(prompts)
+        if self.image_converter:
+            images = self.image_converter(images)
+        token_ids, padding_mask = self.packer(
+            prompts,
+            sequence_length=sequence_length,
+            add_start_value=self.add_start_token,
+            add_end_value=self.add_end_token,
+        )
+        # Bundle the packed token ids and preprocessed images as model inputs.
+        x = {
+            "token_ids": token_ids,
+            "padding_mask": padding_mask,
+            "images": images,
+        }
+        return keras.utils.pack_x_y_sample_weight(x, y, sample_weight)
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "canonicalize_text": self.canonicalize_text,
+            }
+        )
+        return config
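The `canonicalize_inputs` step above mirrors Big Vision's prompt engineering: lowercase, strip punctuation, collapse whitespace, then trim. Here is a standalone sketch of the same three `tf.strings` transformations on a hypothetical prompt, to show the effect:

```python
import tensorflow as tf

prompt = tf.constant(["  The QUICK, brown (fox)!!  "])
prompt = tf.strings.lower(prompt)  # "  the quick, brown (fox)!!  "
prompt = tf.strings.regex_replace(
    prompt,
    r"!|\"|#|\$|%|&|\\|'|\(|\)|\*|\+|,|-|\.|/|:|;|<|=|>|\?|@|\[|\\|\]|\^|_|`|{|\||}|~",
    "",
)  # "  the quick brown fox  "
prompt = tf.strings.regex_replace(prompt, r"\s+", " ")  # collapse whitespace
prompt = tf.strings.strip(prompt)  # -> [b"the quick brown fox"]
```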
keras_hub/src/models/siglip/siglip_presets.py
ADDED
@@ -0,0 +1,128 @@
+"""SigLIP model preset configurations."""
+
+# Metadata for loading pretrained model weights.
+backbone_presets = {
+    "siglip_base_patch16_224": {
+        "metadata": {
+            "description": (
+                "200 million parameter, image size 224, pre-trained on WebLi."
+            ),
+            "params": 203156230,
+            "official_name": "SigLIP",
+            "path": "siglip",
+            "model_card": "https://www.kaggle.com/models/kerashub/siglip",
+        },
+        "kaggle_handle": "kaggle://kerashub/siglip/keras/siglip_base_patch16_224/2",
+    },
+    "siglip_base_patch16_256": {
+        "metadata": {
+            "description": (
+                "200 million parameter, image size 256, pre-trained on WebLi."
+            ),
+            "params": 203202370,
+            "official_name": "SigLIP",
+            "path": "siglip",
+            "model_card": "https://www.kaggle.com/models/kerashub/siglip",
+        },
+        "kaggle_handle": "kaggle://kerashub/siglip/keras/siglip_base_patch16_256/1",
+    },
+    "siglip_base_patch16_384": {
+        "metadata": {
+            "description": (
+                "200 million parameter, image size 384, pre-trained on WebLi."
+            ),
+            "params": 203448450,
+            "official_name": "SigLIP",
+            "path": "siglip",
+            "model_card": "https://www.kaggle.com/models/kerashub/siglip",
+        },
+        "kaggle_handle": "kaggle://kerashub/siglip/keras/siglip_base_patch16_384/1",
+    },
+    "siglip_base_patch16_512": {
+        "metadata": {
+            "description": (
+                "200 million parameter, image size 512, pre-trained on WebLi."
+            ),
+            "params": 203792962,
+            "official_name": "SigLIP",
+            "path": "siglip",
+            "model_card": "https://www.kaggle.com/models/kerashub/siglip",
+        },
+        "kaggle_handle": "kaggle://kerashub/siglip/keras/siglip_base_patch16_512/1",
+    },
+    "siglip_large_patch16_256": {
+        "metadata": {
+            "description": (
+                "652 million parameter, image size 256, pre-trained on WebLi."
+            ),
+            "params": 652151106,
+            "official_name": "SigLIP",
+            "path": "siglip",
+            "model_card": "https://www.kaggle.com/models/kerashub/siglip",
+        },
+        "kaggle_handle": "kaggle://kerashub/siglip/keras/siglip_large_patch16_256/1",
+    },
+    "siglip_large_patch16_384": {
+        "metadata": {
+            "description": (
+                "652 million parameter, image size 384, pre-trained on WebLi."
+            ),
+            "params": 652479106,
+            "official_name": "SigLIP",
+            "path": "siglip",
+            "model_card": "https://www.kaggle.com/models/kerashub/siglip",
+        },
+        "kaggle_handle": "kaggle://kerashub/siglip/keras/siglip_large_patch16_384/1",
+    },
+    "siglip_so400m_patch14_224": {
+        "metadata": {
+            "description": (
+                "877 million parameter, image size 224, "
+                "shape-optimized version, pre-trained on WebLi."
+            ),
+            "params": 877360578,
+            "official_name": "SigLIP",
+            "path": "siglip",
+            "model_card": "https://www.kaggle.com/models/kerashub/siglip",
+        },
+        "kaggle_handle": "kaggle://kerashub/siglip/keras/siglip_so400m_patch14_224/2",
+    },
+    "siglip_so400m_patch14_384": {
+        "metadata": {
+            "description": (
+                "877 million parameter, image size 384, "
+                "shape-optimized version, pre-trained on WebLi."
+            ),
+            "params": 877961291,
+            "official_name": "SigLIP",
+            "path": "siglip",
+            "model_card": "https://www.kaggle.com/models/kerashub/siglip",
+        },
+        "kaggle_handle": "kaggle://kerashub/siglip/keras/siglip_so400m_patch14_384/1",
+    },
+    "siglip_so400m_patch16_256_i18n": {
+        "metadata": {
+            "description": (
+                "1.1 billion parameter, image size 256, "
+                "shape-optimized version, pre-trained on WebLi."
+            ),
+            "params": 1128759282,
+            "official_name": "SigLIP",
+            "path": "siglip",
+            "model_card": "https://www.kaggle.com/models/kerashub/siglip",
+        },
+        "kaggle_handle": "kaggle://kerashub/siglip/keras/siglip_so400m_patch16_256_i18n/1",
+    },
+    "siglip_base_patch16_256_multilingual": {
+        "metadata": {
+            "description": (
+                "370 million parameter, image size 256, pre-trained on WebLi."
+            ),
+            "params": 370626370,
+            "official_name": "SigLIP",
+            "path": "siglip",
+            "model_card": "https://www.kaggle.com/models/kerashub/siglip",
+        },
+        "kaggle_handle": "kaggle://kerashub/siglip/keras/siglip_base_patch16_256_multilingual/1",
+    },
+}
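Since every entry shares the same metadata shape, the table is easy to inspect programmatically. A small sketch (note that `backbone_presets` is module-internal; end users would normally go through `from_preset` with one of the names above instead):

```python
from keras_hub.src.models.siglip.siglip_presets import backbone_presets

# Print each preset's parameter count alongside its Kaggle handle.
for name, preset in backbone_presets.items():
    meta = preset["metadata"]
    print(f"{name}: {meta['params'] / 1e6:.0f}M params -> {preset['kaggle_handle']}")
```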
keras_hub/src/models/siglip/siglip_text_encoder.py
ADDED
@@ -0,0 +1,134 @@
+from keras import initializers
+from keras import layers
+
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.models.backbone import Backbone
+from keras_hub.src.models.siglip.siglip_layers import SigLIPEncoderLayer
+from keras_hub.src.models.siglip.siglip_layers import SigLIPTextEmbedding
+
+
+@keras_hub_export("keras_hub.models.SigLIPTextEncoder")
+class SigLIPTextEncoder(Backbone):
+    """SigLIP text core network with hyperparameters.
+
+    Args:
+        vocabulary_size: int. The size of the token vocabulary.
+        embedding_dim: int. The output dimension of the embedding layer.
+        hidden_dim: int. The size of the transformer hidden state at the end
+            of each transformer layer.
+        num_layers: int. The number of transformer layers.
+        num_heads: int. The number of attention heads for each transformer.
+        intermediate_dim: int. The output dimension of the first Dense layer
+            in a two-layer feedforward network for each transformer.
+        intermediate_activation: activation function. The activation that is
+            used for the first Dense layer in a two-layer feedforward network
+            for each transformer. Defaults to `"gelu_approximate"`.
+        layer_norm_epsilon: float. The epsilon for the layer normalization.
+            Defaults to `1e-6`.
+        max_sequence_length: int. The maximum sequence length that this
+            encoder can consume. Defaults to `64`.
+        dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to
+            use for the model's computations and weights. Note that some
+            computations, such as softmax and layer normalization, will
+            always be done in float32 precision regardless of dtype.
+    """
+
+    def __init__(
+        self,
+        vocabulary_size,
+        embedding_dim,
+        hidden_dim,
+        num_layers,
+        num_heads,
+        intermediate_dim,
+        intermediate_activation="gelu_approximate",
+        layer_norm_epsilon=1e-6,
+        max_sequence_length=64,
+        dtype=None,
+        name=None,
+        **kwargs,
+    ):
+        # `prefix` is used to prevent duplicate names when utilizing multiple
+        # SigLIP encoders within a single model.
+        prefix = str(name) + "_" if name is not None else ""
+
+        # === Layers ===
+        self.embedding = SigLIPTextEmbedding(
+            vocabulary_size=vocabulary_size,
+            sequence_length=max_sequence_length,
+            embedding_dim=embedding_dim,
+            dtype=dtype,
+            name=f"{prefix}embedding",
+        )
+        self.encoder_layers = [
+            SigLIPEncoderLayer(
+                hidden_dim,
+                num_heads,
+                intermediate_dim,
+                intermediate_activation,
+                layer_norm_epsilon=layer_norm_epsilon,
+                dtype=dtype,
+                name=f"{prefix}encoder_block_{i}",
+            )
+            for i in range(num_layers)
+        ]
+        self.post_layer_norm = layers.LayerNormalization(
+            epsilon=layer_norm_epsilon,
+            dtype=dtype,
+            name=f"{prefix}post_layer_norm",
+        )
+        self.head = layers.Dense(
+            hidden_dim,
+            kernel_initializer=initializers.LecunNormal(),
+            dtype=dtype,
+            name=f"{prefix}head",
+        )
+
+        # === Functional Model ===
+        token_id_input = layers.Input(
+            shape=(None,), dtype="int32", name="token_ids"
+        )
+        x = self.embedding(token_id_input)
+        for block in self.encoder_layers:
+            x = block(x)
+        x = self.post_layer_norm(x)
+
+        # Assuming "sticky" EOS tokenization, the last token is always EOS.
+        x = x[:, -1, :]
+        x = self.head(x)
+        outputs = x
+        super().__init__(
+            inputs={"token_ids": token_id_input},
+            outputs=outputs,
+            dtype=dtype,
+            name=name,
+            **kwargs,
+        )
+
+        # === Config ===
+        self.vocabulary_size = vocabulary_size
+        self.embedding_dim = embedding_dim
+        self.hidden_dim = hidden_dim
+        self.num_layers = num_layers
+        self.num_heads = num_heads
+        self.intermediate_dim = intermediate_dim
+        self.intermediate_activation = intermediate_activation
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.max_sequence_length = max_sequence_length
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "vocabulary_size": self.vocabulary_size,
+                "embedding_dim": self.embedding_dim,
+                "hidden_dim": self.hidden_dim,
+                "num_layers": self.num_layers,
+                "num_heads": self.num_heads,
+                "intermediate_dim": self.intermediate_dim,
+                "intermediate_activation": self.intermediate_activation,
+                "layer_norm_epsilon": self.layer_norm_epsilon,
+                "max_sequence_length": self.max_sequence_length,
+            }
+        )
+        return config
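A minimal instantiation sketch for the encoder above, using deliberately tiny, hypothetical hyperparameters (the real presets are far larger). The pooled output is the final-position hidden state projected through the Dense head, so its expected width is `hidden_dim`:

```python
import numpy as np
from keras_hub.src.models.siglip.siglip_text_encoder import SigLIPTextEncoder

encoder = SigLIPTextEncoder(
    vocabulary_size=1000,   # hypothetical toy values, for illustration only
    embedding_dim=64,
    hidden_dim=64,
    num_layers=2,
    num_heads=2,
    intermediate_dim=128,
    max_sequence_length=16,
)
token_ids = np.random.randint(0, 1000, size=(2, 16)).astype("int32")
pooled = encoder({"token_ids": token_ids})
# Expected shape: (2, 64) — the last-token state passed through the head.
```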
keras_hub/src/models/siglip/siglip_tokenizer.py
ADDED
@@ -0,0 +1,77 @@
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.models.siglip.siglip_backbone import SigLIPBackbone
+from keras_hub.src.tokenizers.sentence_piece_tokenizer import (
+    SentencePieceTokenizer,
+)
+
+
+@keras_hub_export(
+    [
+        "keras_hub.tokenizers.SigLIPTokenizer",
+        "keras_hub.models.SigLIPTokenizer",
+    ]
+)
+class SigLIPTokenizer(SentencePieceTokenizer):
+    """SigLIP tokenizer layer based on SentencePiece.
+
+    This tokenizer class will tokenize raw strings into integer sequences and
+    is based on `keras_hub.tokenizers.SentencePieceTokenizer`. Unlike the
+    underlying tokenizer, it will check for all special tokens needed by
+    SigLIP models and provides a `from_preset()` method to automatically
+    download a matching vocabulary for a SigLIP preset.
+
+    If input is a batch of strings (rank > 0), the layer will output a
+    `tf.RaggedTensor` where the last dimension of the output is ragged.
+
+    If input is a scalar string (rank == 0), the layer will output a dense
+    `tf.Tensor` with static shape `[None]`.
+
+    Args:
+        proto: Either a `string` path to a SentencePiece proto file, or a
+            `bytes` object with a serialized SentencePiece proto. See the
+            [SentencePiece repository](https://github.com/google/sentencepiece)
+            for more details on the format.
+
+    Examples:
+
+    ```python
+    # Unbatched input.
+    tokenizer = keras_hub.models.SigLIPTokenizer.from_preset(
+        "siglip_base_patch16_224"
+    )
+    tokenizer("The quick brown fox jumped.")
+
+    # Batched input.
+    tokenizer(["The quick brown fox jumped.", "The fox slept."])
+
+    # Detokenization.
+    tokenizer.detokenize(tokenizer("The quick brown fox jumped."))
+
+    # Custom vocabulary.
+    bytes_io = io.BytesIO()
+    ds = tf.data.Dataset.from_tensor_slices(["The quick brown fox jumped."])
+    sentencepiece.SentencePieceTrainer.train(
+        sentence_iterator=ds.as_numpy_iterator(),
+        model_writer=bytes_io,
+        vocab_size=8,
+        model_type="WORD",
+        pad_id=0,
+        bos_id=1,
+        eos_id=2,
+        unk_id=3,
+        unk_piece="<unk>",
+    )
+    tokenizer = keras_hub.models.SigLIPTokenizer(
+        proto=bytes_io.getvalue(),
+    )
+    tokenizer("The quick brown fox jumped.")
+    ```
+    """
+
+    backbone_cls = SigLIPBackbone
+
+    def __init__(self, proto, **kwargs):
+        self._add_special_token("<unk>", "unknown_token")
+        self._add_special_token("</s>", "end_token")
+        self._add_special_token("</s>", "pad_token")
+        super().__init__(proto=proto, **kwargs)
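Note that `end_token` and `pad_token` are both mapped to `</s>`. This is what makes the text encoder's "sticky EOS" assumption above hold: after the preprocessor's `StartEndPacker` appends the end token and pads, the final position of every packed sequence carries the `</s>` id, whether the prompt was short or filled the full length. A toy illustration with a hypothetical token id:

```python
EOS = 2  # hypothetical </s> id; in SigLIP, end_token_id == pad_token_id
packed_short = [101, 7, 9, EOS, EOS, EOS]  # short prompt, padded with </s>
packed_full = [101, 7, 9, 14, 23, EOS]     # full-length prompt, ends at </s>
# Either way, position -1 is </s>, so `x[:, -1, :]` always reads the EOS state.
assert packed_short[-1] == packed_full[-1] == EOS
```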
keras_hub/src/models/siglip/siglip_vision_encoder.py
ADDED
@@ -0,0 +1,151 @@
+from keras import layers
+
+from keras_hub.src.api_export import keras_hub_export
+from keras_hub.src.models.backbone import Backbone
+from keras_hub.src.models.siglip.siglip_layers import SigLIPEncoderLayer
+from keras_hub.src.models.siglip.siglip_layers import (
+    SigLIPMultiHeadAttentionPooling,
+)
+from keras_hub.src.models.siglip.siglip_layers import SigLIPVisionEmbedding
+from keras_hub.src.utils.keras_utils import standardize_data_format
+
+
+@keras_hub_export("keras_hub.models.SigLIPVisionEncoder")
+class SigLIPVisionEncoder(Backbone):
+    """SigLIP vision core network with hyperparameters.
+
+    Args:
+        patch_size: int. The size of each square patch in the input image.
+        hidden_dim: int. The size of the transformer hidden state at the end
+            of each transformer layer.
+        num_layers: int. The number of transformer layers.
+        num_heads: int. The number of attention heads for each transformer.
+        intermediate_dim: int. The output dimension of the first Dense layer
+            in a two-layer feedforward network for each transformer.
+        intermediate_activation: activation function. The activation that is
+            used for the first Dense layer in a two-layer feedforward network
+            for each transformer. Defaults to `"gelu_approximate"`.
+        layer_norm_epsilon: float. The epsilon for the layer normalization.
+            Defaults to `1e-6`.
+        image_shape: tuple. The input shape without the batch size. Defaults
+            to `(224, 224, 3)`.
+        data_format: `None` or str. If specified, either `"channels_last"` or
+            `"channels_first"`. The ordering of the dimensions in the inputs.
+            `"channels_last"` corresponds to inputs with shape
+            `(batch_size, height, width, channels)` while `"channels_first"`
+            corresponds to inputs with shape
+            `(batch_size, channels, height, width)`. It defaults to the
+            `image_data_format` value found in your Keras config file at
+            `~/.keras/keras.json`. If you never set it, then it will be
+            `"channels_last"`.
+        dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to
+            use for the model's computations and weights. Note that some
+            computations, such as softmax and layer normalization, will
+            always be done in float32 precision regardless of dtype.
+    """
+
+    def __init__(
+        self,
+        patch_size,
+        hidden_dim,
+        num_layers,
+        num_heads,
+        intermediate_dim,
+        intermediate_activation="gelu_approximate",
+        layer_norm_epsilon=1e-6,
+        image_shape=(224, 224, 3),
+        data_format=None,
+        dtype=None,
+        name=None,
+        **kwargs,
+    ):
+        data_format = standardize_data_format(data_format)
+        if data_format == "channels_last":
+            height, width = image_shape[0], image_shape[1]
+        else:
+            height, width = image_shape[1], image_shape[2]
+        if height != width:
+            raise ValueError(
+                "`SigLIPVisionEncoder` expects the height and width to be the "
+                f"same in `image_shape`. Received: image_shape={image_shape}"
+            )
+
+        # `prefix` is used to prevent duplicate names when utilizing multiple
+        # SigLIP encoders within a single model.
+        prefix = str(name) + "_" if name is not None else ""
+
+        # === Layers ===
+        self.embedding = SigLIPVisionEmbedding(
+            hidden_dim=hidden_dim,
+            patch_size=patch_size,
+            image_size=height,
+            data_format=data_format,
+            dtype=dtype,
+            name=f"{prefix}embedding",
+        )
+        self.encoder_layers = [
+            SigLIPEncoderLayer(
+                hidden_dim,
+                num_heads,
+                intermediate_dim,
+                intermediate_activation,
+                layer_norm_epsilon=layer_norm_epsilon,
+                dtype=dtype,
+                name=f"{prefix}encoder_block_{i}",
+            )
+            for i in range(num_layers)
+        ]
+        self.post_layer_norm = layers.LayerNormalization(
+            epsilon=1e-6, dtype=dtype, name=f"{prefix}post_layer_norm"
+        )
+        self.head = SigLIPMultiHeadAttentionPooling(
+            hidden_dim,
+            intermediate_dim,
+            num_heads,
+            intermediate_activation,
+            layer_norm_epsilon,
+            dtype=dtype,
+            name=f"{prefix}head",
+        )
+
+        # === Functional Model ===
+        image_input = layers.Input(shape=image_shape, name="images")
+        x = self.embedding(image_input)
+        for block in self.encoder_layers:
+            x = block(x)
+        x = self.post_layer_norm(x)
+        x = self.head(x)
+        outputs = x
+        super().__init__(
+            inputs={"images": image_input},
+            outputs=outputs,
+            dtype=dtype,
+            name=name,
+            **kwargs,
+        )
+
+        # === Config ===
+        self.patch_size = patch_size
+        self.hidden_dim = hidden_dim
+        self.num_layers = num_layers
+        self.num_heads = num_heads
+        self.intermediate_dim = intermediate_dim
+        self.intermediate_activation = intermediate_activation
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.image_shape = image_shape
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "patch_size": self.patch_size,
+                "hidden_dim": self.hidden_dim,
+                "num_layers": self.num_layers,
+                "num_heads": self.num_heads,
+                "intermediate_dim": self.intermediate_dim,
+                "intermediate_activation": self.intermediate_activation,
+                "layer_norm_epsilon": self.layer_norm_epsilon,
+                "image_shape": self.image_shape,
+            }
+        )
+        return config
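A minimal instantiation sketch for the vision encoder, again with tiny hypothetical hyperparameters chosen only for illustration. With `patch_size=16` and a 32×32 input, the image is cut into (32 / 16)² = 4 patches before the transformer stack, and the attention-pooling head reduces the patch sequence to a single pooled vector per image:

```python
import numpy as np
from keras_hub.src.models.siglip.siglip_vision_encoder import (
    SigLIPVisionEncoder,
)

encoder = SigLIPVisionEncoder(
    patch_size=16,          # hypothetical toy values, for illustration only
    hidden_dim=64,
    num_layers=2,
    num_heads=2,
    intermediate_dim=128,
    image_shape=(32, 32, 3),
)
images = np.ones((2, 32, 32, 3), dtype="float32")
pooled = encoder({"images": images})  # one pooled vector per image
```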
keras_hub/src/version_utils.py
CHANGED

{keras_hub_nightly-0.19.0.dev202503020350.dist-info → keras_hub_nightly-0.19.0.dev202503040351.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: keras-hub-nightly
-Version: 0.19.0.dev202503020350
+Version: 0.19.0.dev202503040351
 Summary: Industry-strength Natural Language Processing extensions for Keras.
 Home-page: https://github.com/keras-team/keras-hub
 Author: Keras team