keras-hub-nightly 0.23.0.dev202508260411__py3-none-any.whl → 0.23.0.dev202508280418__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- keras_hub/layers/__init__.py +6 -0
- keras_hub/models/__init__.py +21 -0
- keras_hub/src/layers/modeling/position_embedding.py +21 -6
- keras_hub/src/layers/modeling/rotary_embedding.py +16 -6
- keras_hub/src/layers/modeling/sine_position_encoding.py +21 -8
- keras_hub/src/layers/modeling/token_and_position_embedding.py +2 -1
- keras_hub/src/models/backbone.py +10 -15
- keras_hub/src/models/d_fine/__init__.py +0 -0
- keras_hub/src/models/d_fine/d_fine_attention.py +461 -0
- keras_hub/src/models/d_fine/d_fine_backbone.py +891 -0
- keras_hub/src/models/d_fine/d_fine_decoder.py +944 -0
- keras_hub/src/models/d_fine/d_fine_encoder.py +365 -0
- keras_hub/src/models/d_fine/d_fine_hybrid_encoder.py +642 -0
- keras_hub/src/models/d_fine/d_fine_image_converter.py +8 -0
- keras_hub/src/models/d_fine/d_fine_layers.py +1828 -0
- keras_hub/src/models/d_fine/d_fine_loss.py +938 -0
- keras_hub/src/models/d_fine/d_fine_object_detector.py +875 -0
- keras_hub/src/models/d_fine/d_fine_object_detector_preprocessor.py +14 -0
- keras_hub/src/models/d_fine/d_fine_presets.py +2 -0
- keras_hub/src/models/d_fine/d_fine_utils.py +827 -0
- keras_hub/src/models/hgnetv2/hgnetv2_backbone.py +4 -1
- keras_hub/src/models/hgnetv2/hgnetv2_encoder.py +3 -2
- keras_hub/src/models/hgnetv2/hgnetv2_layers.py +27 -11
- keras_hub/src/models/parseq/__init__.py +0 -0
- keras_hub/src/models/parseq/parseq_backbone.py +134 -0
- keras_hub/src/models/parseq/parseq_causal_lm.py +466 -0
- keras_hub/src/models/parseq/parseq_causal_lm_preprocessor.py +168 -0
- keras_hub/src/models/parseq/parseq_decoder.py +418 -0
- keras_hub/src/models/parseq/parseq_image_converter.py +8 -0
- keras_hub/src/models/parseq/parseq_tokenizer.py +221 -0
- keras_hub/src/tests/test_case.py +37 -1
- keras_hub/src/utils/preset_utils.py +49 -0
- keras_hub/src/utils/tensor_utils.py +23 -1
- keras_hub/src/utils/transformers/convert_vit.py +4 -1
- keras_hub/src/version.py +1 -1
- keras_hub/tokenizers/__init__.py +3 -0
- {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/METADATA +1 -1
- {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/RECORD +40 -20
- {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/WHEEL +0 -0
- {keras_hub_nightly-0.23.0.dev202508260411.dist-info → keras_hub_nightly-0.23.0.dev202508280418.dist-info}/top_level.txt +0 -0
@@ -157,7 +157,10 @@ class HGNetV2Backbone(Backbone):
|
|
157
157
|
if stage_name in self.out_features
|
158
158
|
}
|
159
159
|
super().__init__(
|
160
|
-
inputs=pixel_values,
|
160
|
+
inputs=pixel_values,
|
161
|
+
outputs=feature_maps_output,
|
162
|
+
dtype=dtype,
|
163
|
+
**kwargs,
|
161
164
|
)
|
162
165
|
|
163
166
|
# === Config ===
|
@@ -56,9 +56,10 @@ class HGNetV2Encoder(keras.layers.Layer):
|
|
56
56
|
use_learnable_affine_block,
|
57
57
|
data_format=None,
|
58
58
|
channel_axis=None,
|
59
|
+
dtype=None,
|
59
60
|
**kwargs,
|
60
61
|
):
|
61
|
-
super().__init__(**kwargs)
|
62
|
+
super().__init__(dtype=dtype, **kwargs)
|
62
63
|
self.stage_in_channels = stage_in_channels
|
63
64
|
self.stage_mid_channels = stage_mid_channels
|
64
65
|
self.stage_out_channels = stage_out_channels
|
@@ -90,7 +91,7 @@ class HGNetV2Encoder(keras.layers.Layer):
|
|
90
91
|
name=f"{self.name}_stage_{stage_idx}"
|
91
92
|
if self.name
|
92
93
|
else f"stage_{stage_idx}",
|
93
|
-
dtype=
|
94
|
+
dtype=dtype,
|
94
95
|
)
|
95
96
|
self.stages_list.append(stage_layer)
|
96
97
|
|
@@ -17,8 +17,8 @@ class HGNetV2LearnableAffineBlock(keras.layers.Layer):
|
|
17
17
|
**kwargs: Additional keyword arguments passed to the parent class.
|
18
18
|
"""
|
19
19
|
|
20
|
-
def __init__(self, scale_value=1.0, bias_value=0.0, **kwargs):
|
21
|
-
super().__init__(**kwargs)
|
20
|
+
def __init__(self, scale_value=1.0, bias_value=0.0, dtype=None, **kwargs):
|
21
|
+
super().__init__(dtype=dtype, **kwargs)
|
22
22
|
self.scale_value = scale_value
|
23
23
|
self.bias_value = bias_value
|
24
24
|
|
@@ -87,9 +87,10 @@ class HGNetV2ConvLayer(keras.layers.Layer):
|
|
87
87
|
use_learnable_affine_block=False,
|
88
88
|
data_format=None,
|
89
89
|
channel_axis=None,
|
90
|
+
dtype=None,
|
90
91
|
**kwargs,
|
91
92
|
):
|
92
|
-
super().__init__(**kwargs)
|
93
|
+
super().__init__(dtype=dtype, **kwargs)
|
93
94
|
self.in_channels = in_channels
|
94
95
|
self.out_channels = out_channels
|
95
96
|
self.kernel_size = kernel_size
|
@@ -104,6 +105,7 @@ class HGNetV2ConvLayer(keras.layers.Layer):
|
|
104
105
|
padding=((pad, pad), (pad, pad)),
|
105
106
|
data_format=self.data_format,
|
106
107
|
name=f"{self.name}_pad" if self.name else None,
|
108
|
+
dtype=self.dtype_policy,
|
107
109
|
)
|
108
110
|
self.convolution = keras.layers.Conv2D(
|
109
111
|
filters=self.out_channels,
|
@@ -156,7 +158,8 @@ class HGNetV2ConvLayer(keras.layers.Layer):
|
|
156
158
|
)
|
157
159
|
else:
|
158
160
|
self.lab = keras.layers.Identity(
|
159
|
-
name=f"{self.name}_identity_lab" if self.name else None
|
161
|
+
name=f"{self.name}_identity_lab" if self.name else None,
|
162
|
+
dtype=self.dtype_policy,
|
160
163
|
)
|
161
164
|
|
162
165
|
def build(self, input_shape):
|
@@ -230,9 +233,10 @@ class HGNetV2ConvLayerLight(keras.layers.Layer):
|
|
230
233
|
use_learnable_affine_block=False,
|
231
234
|
data_format=None,
|
232
235
|
channel_axis=None,
|
236
|
+
dtype=None,
|
233
237
|
**kwargs,
|
234
238
|
):
|
235
|
-
super().__init__(**kwargs)
|
239
|
+
super().__init__(dtype=dtype, **kwargs)
|
236
240
|
self.in_channels = in_channels
|
237
241
|
self.out_channels = out_channels
|
238
242
|
self.kernel_size = kernel_size
|
@@ -327,9 +331,10 @@ class HGNetV2Embeddings(keras.layers.Layer):
|
|
327
331
|
use_learnable_affine_block,
|
328
332
|
data_format=None,
|
329
333
|
channel_axis=None,
|
334
|
+
dtype=None,
|
330
335
|
**kwargs,
|
331
336
|
):
|
332
|
-
super().__init__(**kwargs)
|
337
|
+
super().__init__(dtype=dtype, **kwargs)
|
333
338
|
self.stem_channels = stem_channels
|
334
339
|
self.hidden_act = hidden_act
|
335
340
|
self.use_learnable_affine_block = use_learnable_affine_block
|
@@ -352,6 +357,7 @@ class HGNetV2Embeddings(keras.layers.Layer):
|
|
352
357
|
padding=((0, 1), (0, 1)),
|
353
358
|
data_format=self.data_format,
|
354
359
|
name=f"{self.name}_padding1" if self.name else "padding1",
|
360
|
+
dtype=self.dtype_policy,
|
355
361
|
)
|
356
362
|
self.stem2a_layer = HGNetV2ConvLayer(
|
357
363
|
in_channels=self.stem_channels[1],
|
@@ -370,6 +376,7 @@ class HGNetV2Embeddings(keras.layers.Layer):
|
|
370
376
|
padding=((0, 1), (0, 1)),
|
371
377
|
data_format=self.data_format,
|
372
378
|
name=f"{self.name}_padding2" if self.name else "padding2",
|
379
|
+
dtype=self.dtype_policy,
|
373
380
|
)
|
374
381
|
self.stem2b_layer = HGNetV2ConvLayer(
|
375
382
|
in_channels=self.stem_channels[1] // 2,
|
@@ -390,10 +397,12 @@ class HGNetV2Embeddings(keras.layers.Layer):
|
|
390
397
|
padding="valid",
|
391
398
|
data_format=self.data_format,
|
392
399
|
name=f"{self.name}_pool" if self.name else "pool",
|
400
|
+
dtype=self.dtype_policy,
|
393
401
|
)
|
394
402
|
self.concatenate_layer = keras.layers.Concatenate(
|
395
403
|
axis=self.channel_axis,
|
396
404
|
name=f"{self.name}_concat" if self.name else "concat",
|
405
|
+
dtype=self.dtype_policy,
|
397
406
|
)
|
398
407
|
self.stem3_layer = HGNetV2ConvLayer(
|
399
408
|
in_channels=self.stem_channels[1] * 2,
|
@@ -550,9 +559,10 @@ class HGNetV2BasicLayer(keras.layers.Layer):
|
|
550
559
|
use_learnable_affine_block=False,
|
551
560
|
data_format=None,
|
552
561
|
channel_axis=None,
|
562
|
+
dtype=None,
|
553
563
|
**kwargs,
|
554
564
|
):
|
555
|
-
super().__init__(**kwargs)
|
565
|
+
super().__init__(dtype=dtype, **kwargs)
|
556
566
|
self.in_channels_arg = in_channels
|
557
567
|
self.middle_channels = middle_channels
|
558
568
|
self.out_channels = out_channels
|
@@ -635,23 +645,27 @@ class HGNetV2BasicLayer(keras.layers.Layer):
|
|
635
645
|
self.drop_path_rate,
|
636
646
|
noise_shape=(None, 1, 1, 1),
|
637
647
|
name=f"{self.name}_drop_path" if self.name else "drop_path",
|
648
|
+
dtype=self.dtype_policy,
|
638
649
|
)
|
639
650
|
else:
|
640
651
|
self.drop_path_layer = keras.layers.Identity(
|
641
652
|
name=f"{self.name}_identity_drop_path"
|
642
653
|
if self.name
|
643
|
-
else "identity_drop_path"
|
654
|
+
else "identity_drop_path",
|
655
|
+
dtype=self.dtype_policy,
|
644
656
|
)
|
645
657
|
|
646
658
|
self.concatenate_layer = keras.layers.Concatenate(
|
647
659
|
axis=self.channel_axis,
|
648
660
|
name=f"{self.name}_concat" if self.name else "concat",
|
661
|
+
dtype=self.dtype_policy,
|
649
662
|
)
|
650
663
|
if self.residual:
|
651
664
|
self.add_layer = keras.layers.Add(
|
652
665
|
name=f"{self.name}_add_residual"
|
653
666
|
if self.name
|
654
|
-
else "add_residual"
|
667
|
+
else "add_residual",
|
668
|
+
dtype=self.dtype_policy,
|
655
669
|
)
|
656
670
|
|
657
671
|
def build(self, input_shape):
|
@@ -794,9 +808,10 @@ class HGNetV2Stage(keras.layers.Layer):
|
|
794
808
|
drop_path: float = 0.0,
|
795
809
|
data_format=None,
|
796
810
|
channel_axis=None,
|
811
|
+
dtype=None,
|
797
812
|
**kwargs,
|
798
813
|
):
|
799
|
-
super().__init__(**kwargs)
|
814
|
+
super().__init__(dtype=dtype, **kwargs)
|
800
815
|
self.stage_in_channels = stage_in_channels
|
801
816
|
self.stage_mid_channels = stage_mid_channels
|
802
817
|
self.stage_out_channels = stage_out_channels
|
@@ -842,7 +857,8 @@ class HGNetV2Stage(keras.layers.Layer):
|
|
842
857
|
self.downsample_layer = keras.layers.Identity(
|
843
858
|
name=f"{self.name}_identity_downsample"
|
844
859
|
if self.name
|
845
|
-
else "identity_downsample"
|
860
|
+
else "identity_downsample",
|
861
|
+
dtype=self.dtype_policy,
|
846
862
|
)
|
847
863
|
|
848
864
|
self.blocks_list = []
|
File without changes
|
@@ -0,0 +1,134 @@
|
|
1
|
+
import keras
|
2
|
+
|
3
|
+
from keras_hub.src.api_export import keras_hub_export
|
4
|
+
from keras_hub.src.models.backbone import Backbone
|
5
|
+
from keras_hub.src.models.parseq.parseq_decoder import PARSeqDecoder
|
6
|
+
|
7
|
+
|
8
|
+
@keras_hub_export("keras_hub.models.PARSeqBackbone")
class PARSeqBackbone(Backbone):
    """Scene Text Recognition with PARSeq.

    Performs OCR in natural scenes using the PARSeq model described in [Scene
    Text Recognition with Permuted Autoregressive Sequence Models](
    https://arxiv.org/abs/2207.06966). PARSeq is a ViT-based model that allows
    iterative decoding by performing an autoregressive decoding phase, followed
    by a refinement phase.

    The functional model takes a dict with the keys `"images"`, `"token_ids"`
    and `"padding_mask"`. The image encoder output is fed to the decoder as
    memory, and the decoder output is projected by a dense head to logits of
    size `vocabulary_size - 2`.

    Args:
        image_encoder: keras.Model. The image encoder model.
        vocabulary_size: int. The size of the vocabulary.
        max_label_length: int. The maximum length of the label sequence.
        decoder_hidden_dim: int. The dimension of the decoder hidden layers.
        num_decoder_layers: int. The number of decoder layers.
        num_decoder_heads: int. The number of attention heads in the decoder.
        decoder_mlp_dim: int. The dimension of the decoder MLP hidden layer.
        dropout_rate: float. The dropout rate for the decoder network.
            Defaults to `0.1`.
        attention_dropout: float. The dropout rate for the attention weights.
            Defaults to `0.1`.
        dtype: `None`, str, or `keras.mixed_precision.DTypePolicy`. The
            dtype to use for the computations and weights.
        **kwargs: Additional keyword arguments passed to the base
            `keras.Model` constructor.
    """

    def __init__(
        self,
        image_encoder,
        vocabulary_size,
        max_label_length,
        decoder_hidden_dim,
        num_decoder_layers,
        num_decoder_heads,
        decoder_mlp_dim,
        dropout_rate=0.1,
        attention_dropout=0.1,
        dtype=None,
        **kwargs,
    ):
        # === Layers ===
        self.image_encoder = image_encoder
        self.decoder = PARSeqDecoder(
            vocabulary_size=vocabulary_size,
            max_label_length=max_label_length,
            num_layers=num_decoder_layers,
            num_heads=num_decoder_heads,
            hidden_dim=decoder_hidden_dim,
            mlp_dim=decoder_mlp_dim,
            dropout_rate=dropout_rate,
            attention_dropout=attention_dropout,
            name="decoder",
            dtype=dtype,
        )
        # NOTE: the head is intentionally left unnamed, exactly as before, so
        # that the auto-generated layer name (and saved weight paths) do not
        # change.
        self.head = keras.layers.Dense(
            vocabulary_size - 2,  # We don't predict <bos> nor <pad>
            dtype=dtype,
        )

        # === Functional Model ===
        # Reuse the encoder's own symbolic input as the "images" input.
        image_input = self.image_encoder.input

        token_id_input = keras.Input(
            shape=(None,), dtype="int32", name="token_ids"
        )
        padding_mask_input = keras.Input(
            shape=(None,), dtype="int32", name="padding_mask"
        )

        memory = self.image_encoder(image_input)
        target_out = self.decoder(
            token_id_input, memory, padding_mask=padding_mask_input
        )
        logits = self.head(target_out)

        # === Config ===
        self.vocabulary_size = vocabulary_size
        self.max_label_length = max_label_length
        self.decoder_hidden_dim = decoder_hidden_dim
        self.num_decoder_layers = num_decoder_layers
        self.num_decoder_heads = num_decoder_heads
        self.decoder_mlp_dim = decoder_mlp_dim
        self.dropout_rate = dropout_rate
        self.attention_dropout = attention_dropout

        super().__init__(
            inputs={
                "images": image_input,
                "token_ids": token_id_input,
                "padding_mask": padding_mask_input,
            },
            outputs=logits,
            dtype=dtype,
            **kwargs,
        )

    def get_config(self):
        """Return the constructor config, with the image encoder serialized."""
        config = super().get_config()
        config.update(
            {
                "image_encoder": keras.layers.serialize(self.image_encoder),
                "vocabulary_size": self.vocabulary_size,
                "max_label_length": self.max_label_length,
                "decoder_hidden_dim": self.decoder_hidden_dim,
                "num_decoder_layers": self.num_decoder_layers,
                "num_decoder_heads": self.num_decoder_heads,
                "decoder_mlp_dim": self.decoder_mlp_dim,
                "dropout_rate": self.dropout_rate,
                "attention_dropout": self.attention_dropout,
            }
        )

        return config

    @classmethod
    def from_config(cls, config):
        """Build a backbone from a config produced by `get_config()`.

        Args:
            config: dict. Must contain an `"image_encoder"` entry, either in
                serialized (dict) form or as an already-built layer.

        Returns:
            The instance created by the parent class `from_config()`.

        Raises:
            KeyError: If `config` has no `"image_encoder"` entry.
        """
        # Work on a shallow copy so the caller's dict is left untouched and
        # can safely be passed to `from_config()` again.
        config = dict(config)
        image_encoder = config["image_encoder"]
        # Only deserialize when the entry is still in serialized form; an
        # already-instantiated encoder is passed through unchanged.
        if isinstance(image_encoder, dict):
            image_encoder = keras.layers.deserialize(image_encoder)
        config["image_encoder"] = image_encoder

        return super().from_config(config)
|