mttf-1.1.16-py3-none-any.whl → mttf-1.1.17-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mt/tf/init.py +2 -2
- mt/tf/keras_applications/__init__.py +5 -0
- mt/tf/version.py +1 -1
- {mttf-1.1.16.dist-info → mttf-1.1.17.dist-info}/METADATA +1 -1
- {mttf-1.1.16.dist-info → mttf-1.1.17.dist-info}/RECORD +14 -16
- mt/tf/keras_applications/mobilenet_v3_split.py +0 -555
- mt/tf/keras_applications/mobilevit.py +0 -323
- {mttf-1.1.16.data → mttf-1.1.17.data}/scripts/dmt_pipi.sh +0 -0
- {mttf-1.1.16.data → mttf-1.1.17.data}/scripts/dmt_twineu.sh +0 -0
- {mttf-1.1.16.data → mttf-1.1.17.data}/scripts/pipi.sh +0 -0
- {mttf-1.1.16.data → mttf-1.1.17.data}/scripts/wml_nexus.py +0 -0
- {mttf-1.1.16.data → mttf-1.1.17.data}/scripts/wml_pipi.sh +0 -0
- {mttf-1.1.16.data → mttf-1.1.17.data}/scripts/wml_twineu.sh +0 -0
- {mttf-1.1.16.dist-info → mttf-1.1.17.dist-info}/WHEEL +0 -0
- {mttf-1.1.16.dist-info → mttf-1.1.17.dist-info}/licenses/LICENSE +0 -0
- {mttf-1.1.16.dist-info → mttf-1.1.17.dist-info}/top_level.txt +0 -0
mt/tf/init.py
CHANGED
@@ -20,7 +20,7 @@ def init():
     )

     # add mobilenet_v3_split module
-    from .
+    from mt.keras.applications import mobilenet_v3_split, mobilevit

     setattr(tensorflow.keras.applications, "mobilenet_v3_split", mobilenet_v3_split)
     setattr(tensorflow.keras.applications, "mobilevit", mobilevit)
@@ -32,7 +32,7 @@ def init():
         mobilenet_v3_split.MobileNetV3Split,
     )

-    from .
+    from mt.keras.layers import Identical, Upsize2D, Downsize2D

     setattr(tensorflow.keras.layers, "Identical", Identical)
     setattr(tensorflow.keras.layers, "Upsize2D", Upsize2D)
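In effect, init() stops importing the bundled mt/tf/keras_applications copies (deleted below) and pulls the same submodules from the shared mt.keras.applications package before monkey-patching them into the TensorFlow Keras namespaces. A minimal, hedged sketch of what that patching gives a caller — assuming init() has run (for instance at import time, if the package wires it that way) and that the relocated modules keep the public API of the deleted copies shown later in this diff:

import mt.tf  # assumption: importing mt.tf executes init() from mt/tf/init.py
import tensorflow

# Application submodules attached by the first hunk's setattr() calls:
mnv3_split = tensorflow.keras.applications.mobilenet_v3_split
mvit = tensorflow.keras.applications.mobilevit

# Layers attached by the second hunk (Downsize2D is imported alongside the
# two setattr() lines visible in the context):
Identical = tensorflow.keras.layers.Identical
Upsize2D = tensorflow.keras.layers.Upsize2D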
{mttf-1.1.16.dist-info → mttf-1.1.17.dist-info}/RECORD
CHANGED
@@ -14,26 +14,24 @@ mt/keras_src/layers_src/simple_mha.py,sha256=B0iBytzJL9J_B3D-9YgWtLFXlvLc1CA43Nb
 mt/keras_src/layers_src/utils.py,sha256=cvOikMkSSCJ9Si-eIfp_hD86AQ-PxbI5xtIfp9bcW8E,2806
 mt/keras_src/layers_src/var_regularizer.py,sha256=yJwNCtetTT-a4Y38PXEUCyl3OQQzIxT9ybj9jP1r25A,1104
 mt/tf/__init__.py,sha256=M8xiJNdrAUJZgiZTOQOdfkehjO-CYzGpoxh5HVGBkms,338
-mt/tf/init.py,sha256=
+mt/tf/init.py,sha256=bGAE4YaEBJhEti6ilPyLW-HBcoUxXhyHv_tMemAWraM,1298
 mt/tf/mttf_version.py,sha256=ha53i-H9pE-crufFttUECgXHwPvam07zMKzApUts1Gs,206
 mt/tf/utils.py,sha256=wau2vhPoPHu2cDxlc2lc9fxrndOXPdq2DNG4em5OOMI,1025
-mt/tf/version.py,sha256=
-mt/tf/keras_applications/__init__.py,sha256=
-mt/tf/keras_applications/mobilenet_v3_split.py,sha256=1oPB3EX3k3c7iju9Ksuw9xyv32-mOKPs3uy2Mk5tLd8,19716
-mt/tf/keras_applications/mobilevit.py,sha256=VsKB_U-f9jwUEjpd0eq-YXL4rDsuAbKQ0yIzkbMfLzw,9949
+mt/tf/version.py,sha256=wlfHaYbRkJgBo8YIMUY5_Ql_ajzPwcICFgNLqZIU-5c,207
+mt/tf/keras_applications/__init__.py,sha256=m-A1rHGGLQgHX9690ENWXZkrU0vqfsJkZXcjIG3CLM0,142
 mt/tf/keras_layers/__init__.py,sha256=NsuFD-kSuy6cVV3Kl7ab95tw4g7x4Igv3cF-Ky3VuCo,124
 mt/tfc/__init__.py,sha256=XFnHJOPip-pT0MzUWGJ07GnNUJOhXluXLLULCY3Miac,9919
 mt/tfg/__init__.py,sha256=6Ly2QImAyQTsg_ZszuAuK_L2n56v89Cix9yYmMVk0CM,304
 mt/tfp/__init__.py,sha256=AQkGCkmDRwswEt3qoOSpxe-fZekx78sHHBs2ZVz33gc,383
 mt/tfp/real_nvp.py,sha256=U9EmkXGqFcvtS2yeh5_RgbKlVKKlGFGklAb7Voyazz4,4440
-mttf-1.1.
-mttf-1.1.
-mttf-1.1.
-mttf-1.1.
-mttf-1.1.
-mttf-1.1.
-mttf-1.1.
-mttf-1.1.
-mttf-1.1.
-mttf-1.1.
-mttf-1.1.
+mttf-1.1.17.data/scripts/dmt_pipi.sh,sha256=NNsj4P332unHMqU4mAFjU9PQvxQ8TK5XQ42LC29IZY8,510
+mttf-1.1.17.data/scripts/dmt_twineu.sh,sha256=KZhcYwuCW0c36tWcOgCe7uxJmS08rz-J6YNY76Exy4M,193
+mttf-1.1.17.data/scripts/pipi.sh,sha256=kdo96bdaKq2QIa52Z4XFSiGPcbDm09SAU9cju6I2Lxo,289
+mttf-1.1.17.data/scripts/wml_nexus.py,sha256=kW0ju8_kdXc4jOjhdzKiMsFuO1MNpHmu87skrhu9SEg,1492
+mttf-1.1.17.data/scripts/wml_pipi.sh,sha256=CuidIcbuxyXSBNQqYRhCcSC8QbBaSGnQX0KAIFaIvKA,499
+mttf-1.1.17.data/scripts/wml_twineu.sh,sha256=av1JLN765oOWC5LPkv2eSWIVof26y60601tMGkuYdb8,180
+mttf-1.1.17.dist-info/licenses/LICENSE,sha256=e_JtcszdGZ2ZGfjcymTGrcxFj_9XPicZOVtnsrPvruk,1070
+mttf-1.1.17.dist-info/METADATA,sha256=uT-UBbGAPZbPF2fUeD73tFit_w1Wguzhnjs3JcFoTEs,568
+mttf-1.1.17.dist-info/WHEEL,sha256=wXxTzcEDnjrTwFYjLPcsW_7_XihufBwmpiBeiXNBGEA,91
+mttf-1.1.17.dist-info/top_level.txt,sha256=WcqGFu9cV7iMZg09iam8eNxUvGpLSKKF2Iubf6SJVOo,3
+mttf-1.1.17.dist-info/RECORD,,
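Each RECORD row above follows the standard wheel RECORD format: a comma-separated path, hash, and size, where the hash is the urlsafe-base64-encoded SHA-256 digest of the file with trailing '=' padding stripped (the RECORD file itself is listed with empty hash and size fields). A small self-contained sketch of reproducing such a hash; the example path is illustrative:

import base64
import hashlib

def record_hash(path: str) -> str:
    """Compute a wheel-RECORD style hash entry for a file."""
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    # urlsafe base64, '=' padding stripped, per the RECORD format
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# e.g. record_hash("mt/tf/version.py") on an unpacked 1.1.17 wheel should
# reproduce "sha256=wlfHaYbRkJgBo8YIMUY5_Ql_ajzPwcICFgNLqZIU-5c" from above.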
mt/tf/keras_applications/mobilenet_v3_split.py
DELETED
@@ -1,555 +0,0 @@
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-# pylint: disable=invalid-name
-# pylint: disable=missing-function-docstring
-"""MobileNet v3 models split into 5 submodels.
-
-The MobileNetV3 model is split into 5 parts:
-
-- The input parser block that downsamples once (:func:`MobileNetV3Parser`).
-- Blocks 0 to 3, each of which downsamples once (:func:`MobileNetV3LargeBlock`
-  or :func:`MobileNetV3SmallBlock`). As of 2023/05/15, block 4 may also exist
-  for MobileNetV3Large.
-- The mixer block that turns the downsampled grid into a (1,1,feat_dim) batch
-  (:func:`MobileNetV3Mixer`).
-- Optionally the output block that may or may not contain the classification head
-  (:func:`MobileNetV3Output`).
-
-Input arguments follow those of MobileNetV3. One can also use :func:`MobileNetV3Split` to create
-a model of submodels that is theoretically equivalent to the original MobileNetV3 model. However,
-no pre-trained weights exist.
-"""
-
-
-from mt import tp, tfc
-
-
-__all__ = [
-    "MobileNetV3Input",
-    "MobileNetV3Parser",
-    "MobileNetV3Block",
-    "MobileNetV3Mixer",
-    "MobileNetV3Output",
-    "MobileNetV3Split",
-]
-
-
-try:
-    from tensorflow.keras.applications.mobilenet_v3 import (
-        relu,
-        hard_swish,
-        _depth,
-        _inverted_res_block,
-    )
-except ImportError:
-    try:
-        from keras.applications.mobilenet_v3 import (
-            relu,
-            hard_swish,
-            _depth,
-            _inverted_res_block,
-        )
-    except ImportError:
-        from keras.src.applications.mobilenet_v3 import (
-            relu,
-            hard_swish,
-            _depth,
-            _inverted_res_block,
-        )
-
-try:
-    from keras import backend
-except ImportError:
-    from tensorflow.python.keras import backend
-
-try:
-    from keras import models
-except ImportError:
-    from tensorflow.python.keras import models
-
-
-try:
-    from keras.layers import VersionAwareLayers
-
-    layers = VersionAwareLayers()
-except ImportError:
-    try:
-        from keras import layers
-    except ImportError:
-        from tensorflow.python.keras.layers import VersionAwareLayers
-
-        layers = VersionAwareLayers()
-
-try:
-    from keras.utils import data_utils, layer_utils
-except ImportError:
-    from tensorflow.python.keras.utils import data_utils, layer_utils
-
-from tensorflow.python.platform import tf_logging as logging
-
-
-def MobileNetV3Input(
-    input_shape=None,
-):
-    """Prepares a MobileNetV3 input layer."""
-
-    # If input_shape is None and input_tensor is None using standard shape
-    if input_shape is None:
-        input_shape = (None, None, 3)
-
-    if backend.image_data_format() == "channels_last":
-        row_axis, col_axis = (0, 1)
-    else:
-        row_axis, col_axis = (1, 2)
-    rows = input_shape[row_axis]
-    cols = input_shape[col_axis]
-    if rows and cols and (rows < 32 or cols < 32):
-        raise tfc.ModelSyntaxError(
-            f"Input size must be at least 32x32; got `input_shape={input_shape}`"
-        )
-
-    img_input = layers.Input(shape=input_shape)
-    return img_input
-
-
-def MobileNetV3Parser(
-    img_input,
-    model_type: str = "Large",  # only 'Small' or 'Large' are accepted
-    minimalistic=False,
-):
-    """Prepares a MobileNetV3 parser block."""
-
-    channel_axis = 1 if backend.image_data_format() == "channels_first" else -1
-
-    if minimalistic:
-        activation = relu
-    else:
-        activation = hard_swish
-
-    x = img_input
-    x = layers.Rescaling(scale=1.0 / 127.5, offset=-1.0)(x)
-    x = layers.Conv2D(
-        16, kernel_size=3, strides=(2, 2), padding="same", use_bias=False, name="Conv"
-    )(x)
-    x = layers.BatchNormalization(
-        axis=channel_axis, epsilon=1e-3, momentum=0.999, name="Conv/BatchNorm"
-    )(x)
-    x = activation(x)
-
-    # Create model.
-    model = models.Model(img_input, x, name=f"MobileNetV3{model_type}Parser")
-
-    return model
-
-
-def MobileNetV3SmallBlock(
-    block_id: int,  # only 0 to 3 are accepted here
-    input_tensor,  # input tensor for the block
-    alpha=1.0,
-    minimalistic=False,
-):
-    """Prepares a MobileNetV3Small downsampling block."""
-
-    def depth(d):
-        return _depth(d * alpha)
-
-    if minimalistic:
-        kernel = 3
-        activation = relu
-        se_ratio = None
-    else:
-        kernel = 5
-        activation = hard_swish
-        se_ratio = 0.25
-
-    x = input_tensor
-    if block_id == 0:
-        x = _inverted_res_block(x, 1, depth(16), 3, 2, se_ratio, relu, 0)
-    elif block_id == 1:
-        x = _inverted_res_block(x, 72.0 / 16, depth(24), 3, 2, None, relu, 1)
-        x = _inverted_res_block(x, 88.0 / 24, depth(24), 3, 1, None, relu, 2)
-    elif block_id == 2:
-        x = _inverted_res_block(x, 4, depth(40), kernel, 2, se_ratio, activation, 3)
-        x = _inverted_res_block(x, 6, depth(40), kernel, 1, se_ratio, activation, 4)
-        x = _inverted_res_block(x, 6, depth(40), kernel, 1, se_ratio, activation, 5)
-        x = _inverted_res_block(x, 3, depth(48), kernel, 1, se_ratio, activation, 6)
-        x = _inverted_res_block(x, 3, depth(48), kernel, 1, se_ratio, activation, 7)
-    else:
-        x = _inverted_res_block(x, 6, depth(96), kernel, 2, se_ratio, activation, 8)
-        x = _inverted_res_block(x, 6, depth(96), kernel, 1, se_ratio, activation, 9)
-        x = _inverted_res_block(x, 6, depth(96), kernel, 1, se_ratio, activation, 10)
-
-    # Create model.
-    model = models.Model(input_tensor, x, name=f"MobileNetV3SmallBlock{block_id}")
-
-    return model
-
-
-def MobileNetV3LargeBlock(
-    block_id: int,  # only 0 to 4 are accepted here. 4 is only available as of 2023/05/15
-    input_tensor,  # input tensor for the block
-    alpha=1.0,
-    minimalistic=False,
-):
-    """Prepares a MobileNetV3Large downsampling block."""
-
-    def depth(d):
-        return _depth(d * alpha)
-
-    if minimalistic:
-        kernel = 3
-        activation = relu
-        se_ratio = None
-    else:
-        kernel = 5
-        activation = hard_swish
-        se_ratio = 0.25
-
-    x = input_tensor
-    if block_id == 0:
-        x = _inverted_res_block(x, 1, depth(16), 3, 1, None, relu, 0)
-        x = _inverted_res_block(x, 4, depth(24), 3, 2, None, relu, 1)
-        x = _inverted_res_block(x, 3, depth(24), 3, 1, None, relu, 2)
-    elif block_id == 1:
-        x = _inverted_res_block(x, 3, depth(40), kernel, 2, se_ratio, relu, 3)
-        x = _inverted_res_block(x, 3, depth(40), kernel, 1, se_ratio, relu, 4)
-        x = _inverted_res_block(x, 3, depth(40), kernel, 1, se_ratio, relu, 5)
-    elif block_id == 2:
-        x = _inverted_res_block(x, 6, depth(80), 3, 2, None, activation, 6)
-        x = _inverted_res_block(x, 2.5, depth(80), 3, 1, None, activation, 7)
-        x = _inverted_res_block(x, 2.3, depth(80), 3, 1, None, activation, 8)
-        x = _inverted_res_block(x, 2.3, depth(80), 3, 1, None, activation, 9)
-        x = _inverted_res_block(x, 6, depth(112), 3, 1, se_ratio, activation, 10)
-        x = _inverted_res_block(x, 6, depth(112), 3, 1, se_ratio, activation, 11)
-    elif block_id == 3:
-        x = _inverted_res_block(x, 6, depth(160), kernel, 2, se_ratio, activation, 12)
-        x = _inverted_res_block(x, 6, depth(160), kernel, 1, se_ratio, activation, 13)
-        x = _inverted_res_block(x, 6, depth(160), kernel, 1, se_ratio, activation, 14)
-    else:
-        x = _inverted_res_block(x, 6, depth(320), kernel, 2, se_ratio, activation, 15)
-        x = _inverted_res_block(x, 6, depth(320), kernel, 1, se_ratio, activation, 16)
-        x = _inverted_res_block(x, 6, depth(320), kernel, 1, se_ratio, activation, 17)
-
-    # Create model.
-    model = models.Model(input_tensor, x, name=f"MobileNetV3LargeBlock{block_id}")
-
-    return model
-
-
-def MobileNetV3Mixer(
-    input_tensor,
-    params: tfc.MobileNetV3MixerParams,
-    last_point_ch,
-    alpha=1.0,
-    model_type: str = "Large",  # only 'Small' or 'Large' are accepted
-    minimalistic=False,
-):
-    """Prepares a MobileNetV3 mixer block."""
-
-    x = input_tensor
-    channel_axis = 1 if backend.image_data_format() == "channels_first" else -1
-
-    if params.variant == "mobilenet":
-
-        if minimalistic:
-            kernel = 3
-            activation = relu
-            se_ratio = None
-        else:
-            kernel = 5
-            activation = hard_swish
-            se_ratio = 0.25
-
-        last_conv_ch = _depth(backend.int_shape(x)[channel_axis] * 6)
-
-        # if the width multiplier is greater than 1 we
-        # increase the number of output channels
-        if alpha > 1.0:
-            last_point_ch = _depth(last_point_ch * alpha)
-        x = layers.Conv2D(
-            last_conv_ch, kernel_size=1, padding="same", use_bias=False, name="Conv_1"
-        )(x)
-        x = layers.BatchNormalization(
-            axis=channel_axis, epsilon=1e-3, momentum=0.999, name="Conv_1/BatchNorm"
-        )(x)
-        x = activation(x)
-        x = layers.GlobalAveragePooling2D()(x)
-        if channel_axis == 1:
-            x = layers.Reshape((last_conv_ch, 1, 1))(x)
-        else:
-            x = layers.Reshape((1, 1, last_conv_ch))(x)
-        x = layers.Conv2D(
-            last_point_ch, kernel_size=1, padding="same", use_bias=True, name="Conv_2"
-        )(x)
-        x = activation(x)
-    elif params.variant == "maxpool":
-        x = layers.GlobalMaxPool2D(x)
-    elif params.variant == "mhapool":
-        if backend.image_data_format() == "channels_first":
-            raise tfc.ModelSyntaxError(
-                "Mixer variant 'mhapool' requires channels_last image data format."
-            )
-
-        mhapool_params = params.mhapool_cascade_params
-        if not isinstance(mhapool_params, tfc.MHAPool2DCascadeParams):
-            raise tfc.ModelSyntaxError(
-                "Parameter 'params.mhapool_cascade_params' is not of type "
-                "mt.tfc.MHAPool2DCascadeParams. Got: {}.".format(type(mhapool_params))
-            )
-
-        from ..keras_layers import MHAPool2D
-
-        n_heads = mhapool_params.n_heads
-        k = 0
-        outputs = []
-        while True:
-            h = x.shape[1]
-            w = x.shape[2]
-
-            if h <= 1 and w <= 1:
-                break
-
-            c = x.shape[3]
-            key_dim = (c + n_heads - 1) // n_heads
-            value_dim = int(key_dim * mhapool_params.expansion_factor)
-            k += 1
-            block_name = f"MHAPool2DCascade_block{k}"
-            if k > mhapool_params.max_num_pooling_layers:  # GlobalMaxPool2D
-                x = layers.GlobalMaxPooling2D(
-                    keepdims=True, name=block_name + "/GlobalMaxPool"
-                )(x)
-            else:  # MHAPool2D
-                x = layers.LayerNormalization()(x)
-                if h <= 2 and w <= 2:
-                    activation = mhapool_params.final_activation
-                else:
-                    activation = mhapool_params.activation
-                x = MHAPool2D(
-                    n_heads,
-                    key_dim,
-                    value_dim=value_dim,
-                    pooling=mhapool_params.pooling,
-                    dropout=mhapool_params.dropout,
-                    name=block_name + "/MHAPool",
-                )(x)
-
-            if mhapool_params.output_all:
-                outputs.append(x)
-            else:
-                outputs = [x]
-    else:
-        raise tfc.ModelSyntaxError(
-            "Unknown mixer variant: '{}'.".format(params.variant)
-        )
-
-    # Create model.
-    model = models.Model(
-        input_tensor, outputs, name="MobileNetV3{}Mixer".format(model_type)
-    )
-
-    return model
-
-
-def MobileNetV3Output(
-    input_tensor,
-    model_type: str = "Large",  # only 'Small' or 'Large' are accepted
-    include_top=True,
-    classes=1000,
-    pooling=None,
-    dropout_rate=0.2,
-    classifier_activation="softmax",
-):
-    """Prepares a MobileNetV3 output block."""
-
-    x = input_tensor
-    if include_top:
-        if dropout_rate > 0:
-            x = layers.Dropout(dropout_rate)(x)
-        x = layers.Conv2D(classes, kernel_size=1, padding="same", name="Logits")(x)
-        x = layers.Flatten()(x)
-        x = layers.Activation(activation=classifier_activation, name="Predictions")(x)
-    else:
-        if pooling == "avg":
-            x = layers.GlobalAveragePooling2D(name="avg_pool")(x)
-        elif pooling == "max":
-            x = layers.GlobalMaxPooling2D(name="max_pool")(x)
-        else:
-            return None
-
-    # Create model.
-    model = models.Model(input_tensor, x, name=f"MobileNetV3{model_type}Output")
-
-    return model
-
-
-def MobileNetV3Split(
-    input_shape=None,
-    alpha: float = 1.0,
-    model_type: str = "Large",
-    max_n_blocks: int = 6,
-    minimalistic: bool = False,
-    mixer_params: tp.Optional[tfc.MobileNetV3MixerParams] = None,
-    include_top: bool = True,
-    pooling=None,
-    classes: int = 1000,
-    dropout_rate: float = 0.2,
-    classifier_activation="softmax",
-    output_all: bool = False,
-    name: tp.Optional[str] = None,
-):
-    """Prepares a model of submodels which is equivalent to a MobileNetV3 model.
-
-    Parameters
-    ----------
-    input_shape : tuple
-        Optional shape tuple, to be specified if you would like to use a model with an input image
-        resolution that is not (224, 224, 3). It should have exactly 3 input channels
-        (224, 224, 3). You can also omit this option if you would like to infer input_shape from an
-        input_tensor. If you choose to include both input_tensor and input_shape then input_shape
-        will be used if they match; if the shapes do not match then we will throw an error. E.g.
-        `(160, 160, 3)` would be one valid value.
-    alpha : float
-        controls the width of the network. This is known as the depth multiplier in the MobileNetV3
-        paper, but the name is kept for consistency with MobileNetV1 in Keras.
-        - If `alpha` < 1.0, proportionally decreases the number
-            of filters in each layer.
-        - If `alpha` > 1.0, proportionally increases the number
-            of filters in each layer.
-        - If `alpha` = 1, default number of filters from the paper
-            are used at each layer.
-        the mobilenetv3 alpha value
-    model_type : {'Small', 'Large'}
-        whether it is the small variant or the large variant
-    max_n_blocks : int
-        the maximum number of blocks in the backbone. It is further constrained by the actual
-        maximum number of blocks that the variant can implement.
-    minimalistic : bool
-        In addition to large and small models, this module also contains so-called minimalistic
-        models; these models have the same per-layer dimensions characteristic as MobileNetV3,
-        however they do not utilize any of the advanced blocks (squeeze-and-excite units,
-        hard-swish, and 5x5 convolutions). While these models are less efficient on CPU, they
-        are much more performant on GPU/DSP.
-    mixer_params : mt.tfc.MobileNetV3MixerParams, optional
-        parameters for defining the mixer block
-    include_top : bool, default True
-        whether to include the fully-connected layer at the top of the network. Only valid if
-        `mixer_params` is not null.
-    pooling : str, optional
-        Optional pooling mode for feature extraction when `include_top` is False and
-        `mixer_params` is not null.
-        - `None` means that the output of the model will be the 4D tensor output of the last
-            convolutional block.
-        - `avg` means that global average pooling will be applied to the output of the last
-            convolutional block, and thus the output of the model will be a 2D tensor.
-        - `max` means that global max pooling will be applied.
-    classes : int, optional
-        Optional number of classes to classify images into, only to be specified if `mixer_params`
-        is not null and `include_top` is True.
-    dropout_rate : float
-        fraction of the input units to drop on the last layer. Only to be specified if
-        `mixer_params` is not null and `include_top` is True.
-    classifier_activation : object
-        A `str` or callable. The activation function to use on the "top" layer. Ignored unless
-        `mixer_params` is not null and `include_top` is True. Set `classifier_activation=None` to
-        return the logits of the "top" layer. When loading pretrained weights,
-        `classifier_activation` can only be `None` or `"softmax"`.
-    output_all : bool
-        If True, the model returns the output tensor of every submodel other than the input layer.
-        Otherwise, it returns the output tensor of the last submodel.
-    name : str, optional
-        model name, if any. Defaults to 'MobileNetV3LargeSplit' or 'MobileNetV3SmallSplit'.
-
-    Returns
-    -------
-    tensorflow.keras.Model
-        the output MobileNetV3 model split into 5 submodels
-    """
-
-    input_layer = MobileNetV3Input(input_shape=input_shape)
-    input_block = MobileNetV3Parser(
-        input_layer,
-        model_type=model_type,
-        minimalistic=minimalistic,
-    )
-    x = input_block(input_layer)
-    outputs = [x]
-
-    num_blocks = 5 if model_type == "Large" else 4
-    if num_blocks > max_n_blocks:
-        num_blocks = max_n_blocks
-    for i in range(num_blocks):
-        if model_type == "Large":
-            block = MobileNetV3LargeBlock(i, x, alpha=alpha, minimalistic=minimalistic)
-        else:
-            block = MobileNetV3SmallBlock(i, x, alpha=alpha, minimalistic=minimalistic)
-        x = block(x)
-        if output_all:
-            outputs.append(x)
-        else:
-            outputs = [x]
-
-    if mixer_params is not None:
-        if not isinstance(mixer_params, tfc.MobileNetV3MixerParams):
-            raise tfc.ModelSyntaxError(
-                "Argument 'mixer_params' is not an instance of "
-                "mt.tfc.MobileNetV3MixerParams. Got: {}.".format(type(mixer_params))
-            )
-
-        if model_type == "Large":
-            last_point_ch = 1280
-        else:
-            last_point_ch = 1024
-        mixer_block = MobileNetV3Mixer(
-            x,
-            mixer_params,
-            last_point_ch,
-            alpha=alpha,
-            model_type=model_type,
-            minimalistic=minimalistic,
-        )
-        x = mixer_block(x)
-        if output_all:
-            if isinstance(x, (list, tuple)):
-                outputs.extend(x)
-            else:
-                outputs.append(x)
-        else:
-            if isinstance(x, (list, tuple)):
-                outputs = [x[-1]]
-            else:
-                outputs = [x]
-
-        output_block = MobileNetV3Output(
-            x,
-            model_type=model_type,
-            include_top=include_top,
-            classes=classes,
-            pooling=pooling,
-            dropout_rate=dropout_rate,
-            classifier_activation=classifier_activation,
-        )
-        if output_block is not None:
-            x = output_block(x)
-            if output_all:
-                outputs.append(x)
-            else:
-                outputs = [x]
-
-    # Create model.
-    if name is None:
-        name = f"MobilenetV3{model_type}Split"
-    model = models.Model(input_layer, outputs, name=name)
-
-    return model
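Since MobileNetV3Split() is the removed module's main entry point, a short usage sketch may help; it follows the docstring above, with illustrative argument values. Under 1.1.16 the import path is the one shown; from 1.1.17 the equivalent module is expected under mt.keras.applications instead.

from mt.tf.keras_applications.mobilenet_v3_split import MobileNetV3Split

# Backbone only: with mixer_params=None the mixer and output blocks are
# skipped, and output_all=True returns one tensor per submodel.
model = MobileNetV3Split(
    input_shape=(224, 224, 3),
    model_type="Small",   # or "Large"
    mixer_params=None,
    output_all=True,
)
model.summary()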
mt/tf/keras_applications/mobilevit.py
DELETED
@@ -1,323 +0,0 @@
-# pylint: disable=invalid-name
-# pylint: disable=missing-function-docstring
-"""MobileViT model.
-
-Most of the code here has been adapted from the following
-`Keras tutorial <https://keras.io/examples/vision/mobilevit/>`_. Please refer
-to the `MobileViT ICLR2022 paper <https://arxiv.org/abs/2110.02178>`_ for more details.
-
-The paper authors' code is `here <https://github.com/apple/ml-cvnets>`_.
-"""
-
-
-from mt import tp, tfc, tf
-
-
-try:
-    from tensorflow.keras.applications.mobilenet_v3 import _inverted_res_block
-except ImportError:
-    try:
-        from keras.applications.mobilenet_v3 import _inverted_res_block
-    except ImportError:
-        try:
-            from keras.src.applications.mobilenet_v3 import _inverted_res_block
-        except ImportError:
-            from .mobilenet_v3_split import _inverted_res_block
-
-
-try:
-    import keras
-    from keras import backend
-    from keras import models
-    from keras.layers import VersionAwareLayers
-except ImportError:
-    try:
-        from tensorflow import keras
-        from tensorflow.keras import backend
-        from tensorflow.keras import models
-        from tensorflow.keras.layers import VersionAwareLayers
-    except ImportError:
-        from tensorflow.python import keras
-        from tensorflow.python.keras import backend
-        from tensorflow.python.keras import models
-        from tensorflow.python.keras.layers import VersionAwareLayers
-
-
-from .mobilenet_v3_split import MobileNetV3Input
-
-
-layers = VersionAwareLayers()
-
-
-def conv_block(x, filters=16, kernel_size=3, strides=2):
-    conv_layer = layers.Conv2D(
-        filters, kernel_size, strides=strides, activation=tf.nn.swish, padding="same"
-    )
-    return conv_layer(x)
-
-
-# Reference: https://git.io/JKgtC
-
-
-def inverted_residual_block(
-    x, expanded_channels, output_channels, strides=1, block_id=0
-):
-    if block_id == 0:
-        raise NotImplementedError(
-            "Zero block id for _inverted_res_block() is not implemented in MobileViT."
-        )
-
-    channel_axis = 1 if backend.image_data_format() == "channels_first" else -1
-    infilters = backend.int_shape(x)[channel_axis]
-
-    m = _inverted_res_block(
-        x,
-        expanded_channels // infilters,  # expansion
-        output_channels,  # filters
-        3,  # kernel_size
-        strides,  # stride
-        0,  # se_ratio
-        tf.nn.swish,  # activation
-        block_id,
-    )
-
-    return m
-
-
-# Reference:
-# https://keras.io/examples/vision/image_classification_with_vision_transformer/
-
-
-def mlp(x, hidden_units, dropout_rate):
-    for units in hidden_units:
-        x = layers.Dense(units, activation=tf.nn.swish)(x)
-        x = layers.Dropout(dropout_rate)(x)
-    return x
-
-
-def transformer_block(x, transformer_layers, projection_dim, num_heads=2):
-    for _ in range(transformer_layers):
-        # Layer normalization 1.
-        x1 = layers.LayerNormalization(epsilon=1e-6)(x)
-        # Create a multi-head attention layer.
-        attention_output = layers.MultiHeadAttention(
-            num_heads=num_heads, key_dim=projection_dim, dropout=0.1
-        )(x1, x1)
-        # Skip connection 1.
-        x2 = layers.Add()([attention_output, x])
-        # Layer normalization 2.
-        x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
-        # MLP.
-        x3 = mlp(
-            x3,
-            hidden_units=[x.shape[-1] * 2, x.shape[-1]],
-            dropout_rate=0.1,
-        )
-        # Skip connection 2.
-        x = layers.Add()([x3, x2])
-
-    return x
-
-
-def mobilevit_block(x, num_blocks, projection_dim, strides=1):
-    cell_size = 2  # 2x2 for the Transformer block
-
-    # Local projection with convolutions.
-    local_features = conv_block(x, filters=projection_dim, strides=strides)
-    local_features = conv_block(
-        local_features, filters=projection_dim, kernel_size=1, strides=strides
-    )
-
-    if x.shape[1] % cell_size != 0:
-        raise tfc.ModelSyntaxError(
-            "Input tensor must have height divisible by {}. Got {}.".format(
-                cell_size, x.shape
-            )
-        )
-
-    if x.shape[2] % cell_size != 0:
-        raise tfc.ModelSyntaxError(
-            "Input tensor must have width divisible by {}. Got {}.".format(
-                cell_size, x.shape
-            )
-        )
-
-    # Unfold into patches and then pass through Transformers.
-    z = local_features  # (B,H,W,C)
-    z = layers.Reshape(
-        (
-            z.shape[1] // cell_size,
-            cell_size,
-            z.shape[2] // cell_size,
-            cell_size,
-            projection_dim,
-        )
-    )(
-        z
-    )  # (B,H/P,P,W/P,P,C)
-    z = tf.transpose(z, perm=[0, 2, 4, 1, 3, 5])  # (B,P,P,H/P,W/P,C)
-    non_overlapping_patches = layers.Reshape(
-        (cell_size * cell_size, z.shape[3] * z.shape[4], projection_dim)
-    )(
-        z
-    )  # (B,P*P,H*W/(P*P),C)
-    global_features = transformer_block(
-        non_overlapping_patches, num_blocks, projection_dim
-    )
-
-    # Fold into conv-like feature-maps.
-    z = layers.Reshape(
-        (
-            cell_size,
-            cell_size,
-            x.shape[1] // cell_size,
-            x.shape[2] // cell_size,
-            projection_dim,
-        )
-    )(
-        global_features
-    )  # (B,P,P,H/P,W/P,C)
-    z = tf.transpose(z, perm=[0, 3, 1, 4, 2, 5])  # (B,H/P,P,W/P,P,C)
-    folded_feature_map = layers.Reshape((x.shape[1], x.shape[2], projection_dim))(z)
-
-    # Apply point-wise conv -> concatenate with the input features.
-    folded_feature_map = conv_block(
-        folded_feature_map, filters=x.shape[-1], kernel_size=1, strides=strides
-    )
-    local_global_features = layers.Concatenate(axis=-1)([x, folded_feature_map])
-
-    # Fuse the local and global features using a convolution layer.
-    local_global_features = conv_block(
-        local_global_features, filters=projection_dim, strides=strides
-    )
-
-    return local_global_features
-
-
-def create_mobilevit(
-    input_shape=None,
-    model_type: str = "XXS",
-    output_all: bool = False,
-    name: tp.Optional[str] = None,
-):
-    """Prepares a MobileViT model.
-
-    Parameters
-    ----------
-    input_shape : tuple
-        Optional shape tuple, to be specified if you would like to use a model with an input image
-        resolution that is not (224, 224, 3). It should have exactly 3 input channels
-        (224, 224, 3). You can also omit this option if you would like to infer input_shape from an
-        input_tensor. If you choose to include both input_tensor and input_shape then input_shape
-        will be used if they match; if the shapes do not match then we will throw an error. E.g.
-        `(160, 160, 3)` would be one valid value.
-    model_type : {'XXS', 'XS', 'S'}
-        one of the 3 variants introduced in the paper
-    output_all : bool
-        If True, the model returns the output tensor of every block before down-sampling, other
-        than the input layer. Otherwise, it returns the output tensor of the last block.
-    name : str, optional
-        model name, if any. Defaults to 'MobileViT<model_type>'.
-
-    Returns
-    -------
-    tensorflow.keras.Model
-        the output MobileViT model
-    """
-
-    model_type_id = ["XXS", "XS", "S"].index(model_type)
-
-    expansion_factor = 2 if model_type_id == 0 else 4
-
-    inputs = MobileNetV3Input(input_shape=input_shape)
-    x = layers.Rescaling(scale=1.0 / 255)(inputs)
-
-    # Initial conv-stem -> MV2 block.
-    x = conv_block(x, filters=16)
-    x = inverted_residual_block(
-        x,
-        expanded_channels=16 * expansion_factor,
-        output_channels=16 if model_type_id == 0 else 32,
-        block_id=1,
-    )
-    outputs = [x]
-
-    # Downsampling with MV2 block.
-    output_channels = [24, 48, 64][model_type_id]
-    x = inverted_residual_block(
-        x,
-        expanded_channels=16 * expansion_factor,
-        output_channels=output_channels,
-        strides=2,
-        block_id=2,
-    )
-    x = inverted_residual_block(
-        x,
-        expanded_channels=24 * expansion_factor,
-        output_channels=output_channels,
-        block_id=3,
-    )
-    x = inverted_residual_block(
-        x,
-        expanded_channels=24 * expansion_factor,
-        output_channels=output_channels,
-        block_id=4,
-    )
-    if output_all:
-        outputs.append(x)
-    else:
-        outputs = [x]
-
-    # First MV2 -> MobileViT block.
-    output_channels = [48, 64, 96][model_type_id]
-    projection_dim = [64, 96, 144][model_type_id]
-    x = inverted_residual_block(
-        x,
-        expanded_channels=48 * expansion_factor,
-        output_channels=output_channels,
-        strides=2,
-        block_id=5,
-    )
-    x = mobilevit_block(x, num_blocks=2, projection_dim=projection_dim)
-    if output_all:
-        outputs.append(x)
-    else:
-        outputs = [x]
-
-    # Second MV2 -> MobileViT block.
-    output_channels = [64, 80, 128][model_type_id]
-    projection_dim = [80, 120, 192][model_type_id]
-    x = inverted_residual_block(
-        x,
-        expanded_channels=64 * expansion_factor,
-        output_channels=output_channels,
-        strides=2,
-        block_id=6,
-    )
-    x = mobilevit_block(x, num_blocks=4, projection_dim=projection_dim)
-    if output_all:
-        outputs.append(x)
-    else:
-        outputs = [x]
-
-    # Third MV2 -> MobileViT block.
-    output_channels = [80, 96, 160][model_type_id]
-    projection_dim = [96, 144, 240][model_type_id]
-    x = inverted_residual_block(
-        x,
-        expanded_channels=80 * expansion_factor,
-        output_channels=output_channels,
-        strides=2,
-        block_id=7,
-    )
-    x = mobilevit_block(x, num_blocks=3, projection_dim=projection_dim)
-    filters = [320, 384, 640][model_type_id]
-    x = conv_block(x, filters=filters, kernel_size=1, strides=1)
-    if output_all:
-        outputs.append(x)
-    else:
-        outputs = [x]
-
-    if name is None:
-        name = "MobileViT{}".format(model_type)
-    return keras.Model(inputs, outputs, name=name)
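Likewise, a hedged usage sketch for the removed MobileViT builder. The 256x256 input is illustrative and follows the Keras tutorial credited in the docstring; it keeps every stage's feature map divisible by the 2x2 cell size that mobilevit_block() enforces. From 1.1.17 the module is expected under mt.keras.applications.mobilevit instead.

from mt.tf.keras_applications.mobilevit import create_mobilevit

# model_type is one of "XXS", "XS", "S"; 256 halves cleanly through every
# stride-2 stage (256 -> 128 -> 64 -> 32 -> 16 -> 8), satisfying the checks.
model = create_mobilevit(input_shape=(256, 256, 3), model_type="XXS")
model.summary()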
The remaining 9 files ({mttf-1.1.16.data → mttf-1.1.17.data}/scripts/* and the dist-info WHEEL, licenses/LICENSE and top_level.txt) are renamed for the new version but otherwise unchanged.