onnxruntime_extensions 0.14.0__cp313-cp313-macosx_11_0_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. onnxruntime_extensions/__init__.py +82 -0
  2. onnxruntime_extensions/_cuops.py +564 -0
  3. onnxruntime_extensions/_extensions_pydll.cpython-313-darwin.so +0 -0
  4. onnxruntime_extensions/_extensions_pydll.pyi +45 -0
  5. onnxruntime_extensions/_hf_cvt.py +331 -0
  6. onnxruntime_extensions/_ocos.py +133 -0
  7. onnxruntime_extensions/_ortapi2.py +274 -0
  8. onnxruntime_extensions/_torch_cvt.py +231 -0
  9. onnxruntime_extensions/_version.py +2 -0
  10. onnxruntime_extensions/cmd.py +66 -0
  11. onnxruntime_extensions/cvt.py +306 -0
  12. onnxruntime_extensions/onnxprocess/__init__.py +12 -0
  13. onnxruntime_extensions/onnxprocess/_builder.py +53 -0
  14. onnxruntime_extensions/onnxprocess/_onnx_ops.py +1507 -0
  15. onnxruntime_extensions/onnxprocess/_session.py +355 -0
  16. onnxruntime_extensions/onnxprocess/_tensor.py +628 -0
  17. onnxruntime_extensions/onnxprocess/torch_wrapper.py +31 -0
  18. onnxruntime_extensions/pnp/__init__.py +13 -0
  19. onnxruntime_extensions/pnp/_base.py +124 -0
  20. onnxruntime_extensions/pnp/_imagenet.py +65 -0
  21. onnxruntime_extensions/pnp/_nlp.py +148 -0
  22. onnxruntime_extensions/pnp/_onnx_ops.py +1544 -0
  23. onnxruntime_extensions/pnp/_torchext.py +310 -0
  24. onnxruntime_extensions/pnp/_unifier.py +45 -0
  25. onnxruntime_extensions/pnp/_utils.py +302 -0
  26. onnxruntime_extensions/pp_api.py +83 -0
  27. onnxruntime_extensions/tools/__init__.py +0 -0
  28. onnxruntime_extensions/tools/add_HuggingFace_CLIPImageProcessor_to_model.py +171 -0
  29. onnxruntime_extensions/tools/add_pre_post_processing_to_model.py +535 -0
  30. onnxruntime_extensions/tools/pre_post_processing/__init__.py +4 -0
  31. onnxruntime_extensions/tools/pre_post_processing/pre_post_processor.py +395 -0
  32. onnxruntime_extensions/tools/pre_post_processing/step.py +227 -0
  33. onnxruntime_extensions/tools/pre_post_processing/steps/__init__.py +6 -0
  34. onnxruntime_extensions/tools/pre_post_processing/steps/general.py +366 -0
  35. onnxruntime_extensions/tools/pre_post_processing/steps/nlp.py +344 -0
  36. onnxruntime_extensions/tools/pre_post_processing/steps/vision.py +1157 -0
  37. onnxruntime_extensions/tools/pre_post_processing/utils.py +139 -0
  38. onnxruntime_extensions/util.py +186 -0
  39. onnxruntime_extensions-0.14.0.dist-info/LICENSE +21 -0
  40. onnxruntime_extensions-0.14.0.dist-info/METADATA +102 -0
  41. onnxruntime_extensions-0.14.0.dist-info/RECORD +43 -0
  42. onnxruntime_extensions-0.14.0.dist-info/WHEEL +6 -0
  43. onnxruntime_extensions-0.14.0.dist-info/top_level.txt +1 -0
onnxruntime_extensions/tools/pre_post_processing/steps/vision.py
@@ -0,0 +1,1157 @@
1
+ # Copyright (c) Microsoft Corporation. All rights reserved.
2
+ # Licensed under the MIT License.
3
+
4
+ import onnx
5
+ import numpy as np
6
+
7
+ from typing import List, Optional, Tuple, Union
8
+ from ..step import Step
9
+ from .general import Transpose
10
+
11
+ #
12
+ # Image conversion
13
+ #
14
+
15
+
16
+ class ConvertImageToBGR(Step):
17
+ """
18
+ Convert the bytes of an image by decoding to BGR ordered uint8 values.
19
+ Supported input formats: jpg, png
20
+ Input shape: {num_encoded_bytes}
21
+ Output shape: {input_image_height, input_image_width, 3}
22
+ """
23
+
24
+ def __init__(self, name: Optional[str] = None):
25
+ """
26
+ Args:
27
+ name: Optional name of step. Defaults to 'ConvertImageToBGR'
28
+
29
+ NOTE: Input image format is inferred and does not need to be specified.
30
+ """
31
+ super().__init__(["image"], ["bgr_data"], name)
32
+
33
+ def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
34
+ input_type_str, input_shape_str = self._get_input_type_and_shape_strs(graph, 0)
35
+ assert input_type_str == "uint8"
36
+ output_shape_str = f"to_bgr_ppp_{self.step_num}_h, to_bgr_ppp_{self.step_num}_w, 3"
37
+
38
+ converter_graph = onnx.parser.parse_graph(
39
+ f"""\
40
+ image_to_bgr (uint8[{input_shape_str}] {self.input_names[0]})
41
+ => (uint8[{output_shape_str}] {self.output_names[0]})
42
+ {{
43
+ {self.output_names[0]} = com.microsoft.extensions.DecodeImage({self.input_names[0]})
44
+ }}
45
+ """
46
+ )
47
+
48
+ return converter_graph
49
+
50
+
51
+ class ConvertBGRToImage(Step):
52
+ """
53
+ Convert BGR ordered uint8 data into an encoded image.
54
+ Supported output formats: jpg, png
55
+ Input shape: {input_image_height, input_image_width, 3}
56
+ Output shape: {num_encoded_bytes}
57
+ """
58
+
59
+ def __init__(self, image_format: str = "jpg", name: Optional[str] = None):
60
+ """
61
+ Args:
62
+ image_format: Format to encode to. jpg and png are supported.
63
+ name: Optional step name. Defaults to 'ConvertBGRToImage'
64
+ """
65
+ super().__init__(["bgr_data"], ["image"], name)
66
+ assert image_format == "jpg" or image_format == "png"
67
+ self._format = image_format
68
+
69
+ def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
70
+ input_type_str, input_shape_str = self._get_input_type_and_shape_strs(graph, 0)
71
+ assert input_type_str == "uint8"
72
+ output_shape_str = f"to_image_ppp_{self.step_num}_num_bytes"
73
+
74
+ converter_graph = onnx.parser.parse_graph(
75
+ f"""\
76
+ bgr_to_image (uint8[{input_shape_str}] {self.input_names[0]})
77
+ => (uint8[{output_shape_str}] {self.output_names[0]})
78
+ {{
79
+ {self.output_names[0]} = com.microsoft.extensions.EncodeImage ({self.input_names[0]})
80
+ }}
81
+ """
82
+ )
83
+
84
+ # as this is a custom op we have to add the attribute for `format` directly to the node.
85
+ # parse_graph doesn't have a schema for the operator and fails attempting to validate the attribute.
86
+ format_attr = converter_graph.node[0].attribute.add()
87
+ format_attr.name = "format"
88
+ format_attr.type = onnx.AttributeProto.AttributeType.STRING
89
+ format_attr.s = bytes(self._format, "utf-8")
90
+
91
+ return converter_graph
92
+
93
+
94
+ class PixelsToYCbCr(Step):
95
+ """
96
+ Convert RGB or BGR pixel data to YCbCr format.
97
+ Input shape: {height, width, 3}
98
+ Output shape is the same.
99
+ Output data is float, but rounded and clipped to the range 0..255 as per the spec for YCbCr conversion.
100
+ """
101
+
102
+ def __init__(self, layout: str = "BGR", name: Optional[str] = None):
103
+ """
104
+ Args:
105
+ layout: Input data layout. Can be 'BGR' or 'RGB'
106
+ name: Optional step name. Defaults to 'PixelsToYCbCr'
107
+ """
108
+ super().__init__(["pixels"], ["Y", "Cb", "Cr"], name)
109
+ assert layout == "RGB" or layout == "BGR"
110
+ self._layout = layout
111
+
112
+ def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
113
+ input_type_str, input_shape_str = self._get_input_type_and_shape_strs(graph, 0)
114
+ # input should be uint8 data HWC
115
+ input_dims = input_shape_str.split(",")
116
+ assert input_type_str == "uint8" and len(input_dims) == 3 and input_dims[2] == "3"
117
+
118
+ # https://en.wikipedia.org/wiki/YCbCr
119
+ # exact weights from https://www.itu.int/rec/T-REC-T.871-201105-I/en
120
+ rgb_weights = np.array([[0.299, 0.587, 0.114],
121
+ [-0.299 / 1.772, -0.587 / 1.772, 0.500],
122
+ [0.500, -0.587 / 1.402, -0.114 / 1.402]],
123
+ dtype=np.float32) # fmt: skip
124
+
125
+ bias = [0.0, 128.0, 128.0]
126
+
127
+ if self._layout == "RGB":
128
+ weights = rgb_weights
129
+ else:
130
+ weights = rgb_weights[:, ::-1] # reverse the order of the last dim for BGR input
131
+
132
+ # Weights are transposed for usage in matmul.
133
+ weights_shape = "3, 3"
134
+ weights = ",".join([str(w) for w in weights.T.flatten()])
135
+
136
+ bias_shape = "3"
137
+ bias = ",".join([str(b) for b in bias])
138
+
139
+ # each output is {h, w}. TBD if input is CHW or HWC though. Once we figure that out we could copy values from
140
+ # the input shape
141
+ output_shape_str = f"YCbCr_ppp_{self.step_num}_h, YCbCr_ppp_{self.step_num}_w"
142
+ assert input_type_str == "uint8"
143
+
144
+ split_attr = "axis = -1"
145
+ if onnx_opset >= 18:
146
+ # Split now requires the number of outputs to be specified even though that can be easily inferred...
147
+ split_attr += ", num_outputs = 3"
148
+
149
+ # convert to float for MatMul
150
+ # apply weights and bias
151
+ # round and clip so it's in the range 0..255
152
+ # split into channels. shape will be {h, w, 1}
153
+ # remove the trailing '1' so output is {h, w}
154
+ converter_graph = onnx.parser.parse_graph(
155
+ f"""\
156
+ pixels_to_YCbCr (uint8[{input_shape_str}] {self.input_names[0]})
157
+ => (float[{output_shape_str}] {self.output_names[0]},
158
+ float[{output_shape_str}] {self.output_names[1]},
159
+ float[{output_shape_str}] {self.output_names[2]})
160
+ {{
161
+ kWeights = Constant <value = float[{weights_shape}] {{{weights}}}> ()
162
+ kBias = Constant <value = float[{bias_shape}] {{{bias}}}> ()
163
+ i64_neg1 = Constant <value = int64[1] {{-1}}> ()
164
+ f_0 = Constant <value = float[1] {{0.0}}> ()
165
+ f_255 = Constant <value = float[1] {{255.0}}> ()
166
+
167
+ f_pixels = Cast <to = 1> ({self.input_names[0]})
168
+ f_weighted = MatMul(f_pixels, kWeights)
169
+ f_biased = Add(f_weighted, kBias)
170
+ f_rounded = Round(f_biased)
171
+ f_clipped = Clip (f_rounded, f_0, f_255)
172
+ split_Y, split_Cb, split_Cr = Split <{split_attr}>(f_clipped)
173
+ {self.output_names[0]} = Squeeze (split_Y, i64_neg1)
174
+ {self.output_names[1]} = Squeeze (split_Cb, i64_neg1)
175
+ {self.output_names[2]} = Squeeze (split_Cr, i64_neg1)
176
+ }}
177
+ """
178
+ )
179
+
180
+ return converter_graph
181
+
182
+
183
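For reference, the ITU-T T.871 arithmetic that PixelsToYCbCr embeds as graph constants can be reproduced directly in numpy. This is an illustrative sketch only (not part of the package), assuming RGB pixel layout; for the step's default BGR layout the weight columns are simply reversed:

    import numpy as np

    # Same ITU-T T.871 weights and bias the step bakes into its Constant nodes (RGB column order).
    rgb_weights = np.array([[0.299, 0.587, 0.114],
                            [-0.299 / 1.772, -0.587 / 1.772, 0.500],
                            [0.500, -0.587 / 1.402, -0.114 / 1.402]], dtype=np.float32)
    bias = np.array([0.0, 128.0, 128.0], dtype=np.float32)

    rgb_pixels = np.array([[255.0, 0.0, 0.0],     # pure red
                           [0.0, 255.0, 0.0]],    # pure green
                          dtype=np.float32)

    # MatMul with the transposed weights, add bias, round and clip, as the parsed graph does.
    ycbcr = np.clip(np.round(rgb_pixels @ rgb_weights.T + bias), 0.0, 255.0)
    print(ycbcr)   # red comes out as roughly [76, 85, 255]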
+ class YCbCrToPixels(Step):
184
+ """
185
+ Convert YCbCr input to RGB or BGR.
186
+
187
+ Input data can be uint8 or float but all inputs must use the same type.
188
+ Input shape: {height, width, 3}
189
+ Output shape is the same.
190
+ """
191
+
192
+ def __init__(self, layout: str = "BGR", name: Optional[str] = None):
193
+ """
194
+ Args:
195
+ layout: Output layout. Can be 'BGR' or 'RGB'
196
+ name: Optional step name. Defaults to 'YCbCrToPixels'
197
+ """
198
+ super().__init__(["Y", "Cb", "Cr"], ["bgr_data"], name)
199
+ assert layout == "RGB" or layout == "BGR"
200
+ self._layout = layout
201
+
202
+ def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
203
+ input_type_str0, input_shape_str0 = self._get_input_type_and_shape_strs(graph, 0)
204
+ input_type_str1, input_shape_str1 = self._get_input_type_and_shape_strs(graph, 1)
205
+ input_type_str2, input_shape_str2 = self._get_input_type_and_shape_strs(graph, 2)
206
+ assert (input_type_str0 == "uint8" and input_type_str1 == "uint8" and input_type_str2 == "uint8") or (
207
+ input_type_str0 == "float" and input_type_str1 == "float" and input_type_str2 == "float"
208
+ )
209
+
210
+ assert (
211
+ len(input_shape_str0.split(",")) == 2
212
+ and len(input_shape_str1.split(",")) == 2
213
+ and len(input_shape_str2.split(",")) == 2
214
+ )
215
+
216
+ output_shape_str = f"{input_shape_str0}, 3"
217
+
218
+ # fmt: off
219
+ # https://en.wikipedia.org/wiki/YCbCr
220
+ # exact weights from https://www.itu.int/rec/T-REC-T.871-201105-I/en
221
+ ycbcr_to_rgb_weights = np.array([[1, 0, 1.402],
222
+ [1, -0.114*1.772/0.587, -0.299*1.402/0.587],
223
+ [1, 1.772, 0]],
224
+ dtype=np.float32)
225
+ # fmt: on
226
+
227
+ # reverse first dim of weights for output to be bgr
228
+ ycbcr_to_bgr_weights = ycbcr_to_rgb_weights[::-1, :]
229
+
230
+ weights = ycbcr_to_bgr_weights if self._layout == "BGR" else ycbcr_to_rgb_weights
231
+ bias = [0.0, 128.0, 128.0]
232
+
233
+ weights_shape = "3, 3"
234
+ # transpose weights for use in matmul
235
+ weights = ",".join([str(w) for w in weights.T.flatten()])
236
+
237
+ bias_shape = "3"
238
+ bias = ",".join([str(b) for b in bias])
239
+
240
+ # unsqueeze the {h, w} inputs to add channels dim. new shape is {h, w, 1}
241
+ # merge Y, Cb, Cr data on the new channel axis
242
+ # convert to float to apply weights etc.
243
+ # remove bias
244
+ # apply weights
245
+ # round and clip to 0..255
246
+ # convert to uint8.
247
+ converter_graph = onnx.parser.parse_graph(
248
+ f"""\
249
+ YCbCr_to_RGB ({input_type_str0}[{input_shape_str0}] {self.input_names[0]},
250
+ {input_type_str1}[{input_shape_str1}] {self.input_names[1]},
251
+ {input_type_str2}[{input_shape_str2}] {self.input_names[2]})
252
+ => (uint8[{output_shape_str}] {self.output_names[0]})
253
+ {{
254
+ kWeights = Constant <value = float[{weights_shape}] {{{weights}}}> ()
255
+ kBias = Constant <value = float[{bias_shape}] {{{bias}}}> ()
256
+ f_0 = Constant <value = float[1] {{0.0}}> ()
257
+ f_255 = Constant <value = float[1] {{255.0}}> ()
258
+ i64_neg1 = Constant <value = int64[1] {{-1}}> ()
259
+
260
+ Y1 = Unsqueeze({self.input_names[0]}, i64_neg1)
261
+ Cb1 = Unsqueeze({self.input_names[1]}, i64_neg1)
262
+ Cr1 = Unsqueeze({self.input_names[2]}, i64_neg1)
263
+ YCbCr = Concat <axis = -1> (Y1, Cb1, Cr1)
264
+ f_YCbCr = Cast <to = 1> (YCbCr)
265
+ f_unbiased = Sub (f_YCbCr, kBias)
266
+ f_pixels = MatMul (f_unbiased, kWeights)
267
+ f_rounded = Round (f_pixels)
268
+ clipped = Clip (f_rounded, f_0, f_255)
269
+ {self.output_names[0]} = Cast <to = {onnx.TensorProto.UINT8}> (clipped)
270
+ }}
271
+ """
272
+ )
273
+
274
+ return converter_graph
275
+
276
+
277
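The inverse mapping used by YCbCrToPixels can be sketched the same way; because both directions round and clip, the round trip is only approximate. Illustrative numpy only, not part of the package:

    import numpy as np

    bias = np.array([0.0, 128.0, 128.0], dtype=np.float32)
    ycbcr_to_rgb = np.array([[1.0, 0.0, 1.402],
                             [1.0, -0.114 * 1.772 / 0.587, -0.299 * 1.402 / 0.587],
                             [1.0, 1.772, 0.0]], dtype=np.float32)

    ycbcr = np.array([[76.0, 85.0, 255.0]], dtype=np.float32)   # roughly "red" from the forward step
    rgb = np.clip(np.round((ycbcr - bias) @ ycbcr_to_rgb.T), 0.0, 255.0).astype(np.uint8)
    print(rgb)   # close to [255, 0, 0]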
+ #
278
+ # Pre-processing
279
+ #
280
+ class Resize(Step):
281
+ """
282
+ Resize input data. Aspect ratio is maintained.
283
+ e.g. if image is 1200 x 600 and 300 x 300 is requested the result will be 600 x 300
284
+ """
285
+
286
+ def __init__(self, resize_to: Union[int, Tuple[int, int]], layout: str = "HWC",
287
+ policy: str = "not_smaller", name: Optional[str] = None):
288
+ """
289
+ Args:
290
+ resize_to: Target size. Can be a single value or a tuple with (target_height, target_width).
291
+ The aspect ratio will be maintained and neither height nor width in the result will be smaller
292
+ than the requested value.
293
+ layout: Input layout. 'NCHW', 'NHWC', 'CHW', 'HWC' and 'HW' are supported.
294
+ policy: not_smaller (default)
295
+ the sizes are adjusted so that no extent of the output is larger than the specified size,
296
+ while keeping the original aspect ratio
297
+ not_larger
298
+ the sizes are adjusted so that no extent of the output is smaller than the specified size,
299
+ while keeping the original aspect ratio.
300
+ Please refer to https://github.com/onnx/onnx/blob/main/docs/Operators.md#Resize for more details.
301
+ name: Optional name. Defaults to 'Resize'
302
+ """
303
+ super().__init__(["image"], ["resized_image"], name)
304
+ if isinstance(resize_to, int):
305
+ self._height = self._width = resize_to
306
+ else:
307
+ assert isinstance(resize_to, tuple)
308
+ self._height, self._width = resize_to
309
+
310
+ self._layout = layout
311
+ self.policy_ = policy
312
+
313
+ def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
314
+ input_type_str, input_shape_str = self._get_input_type_and_shape_strs(graph, 0)
315
+ dims = input_shape_str.split(",")
316
+
317
+ # adjust for layout
318
+ # resize will use the largest ratio so both sides won't necessarily match the requested height and width.
319
+ # use symbolic names for the output dims as we have to provide values. prefix the names to try and
320
+ # avoid any clashes.
321
+ add_batch_dim = False
322
+
323
+ if self._layout == "NHWC":
324
+ assert len(dims) == 4
325
+ split_str = "n, h, w, c"
326
+ sizes_str = "n, h2, w2, c"
327
+ output_shape_str = f"{dims[0]}, resize_ppp_{self.step_num}_h, resize_ppp_{self.step_num}_w, {dims[-1]}"
328
+ elif self._layout == "NCHW":
329
+ assert len(dims) == 4
330
+ split_str = "n, c, h, w"
331
+ sizes_str = "n, c, h2, w2"
332
+ output_shape_str = f"{dims[0]}, {dims[1]}, resize_ppp_{self.step_num}_h, resize_ppp_{self.step_num}_w"
333
+ elif self._layout == "HWC":
334
+ assert len(dims) == 3
335
+ add_batch_dim = True
336
+ split_str = "h, w, c"
337
+ sizes_str = "h2, w2, c"
338
+ output_shape_str = f"resize_ppp_{self.step_num}_h, resize_ppp_{self.step_num}_w, {dims[-1]}"
339
+ elif self._layout == "CHW":
340
+ assert len(dims) == 3
341
+ add_batch_dim = True
342
+ split_str = "c, h, w"
343
+ sizes_str = "c, h2, w2"
344
+ output_shape_str = f"{dims[0]}, resize_ppp_{self.step_num}_h, resize_ppp_{self.step_num}_w"
345
+ elif self._layout == "HW":
346
+ assert len(dims) == 2
347
+ split_str = "h, w"
348
+ sizes_str = "h2, w2"
349
+ output_shape_str = f"resize_ppp_{self.step_num}_h, resize_ppp_{self.step_num}_w"
350
+ else:
351
+ raise ValueError(f"Unsupported layout of {self._layout}")
352
+
353
+ # TODO: Make this configurable. Matching PIL resize for now.
354
+ resize_attributes = 'mode = "linear", nearest_mode = "floor"'
355
+ if onnx_opset >= 18:
356
+ # Resize matches PIL better if antialiasing is used, but that isn't available until ONNX opset 18.
357
+ # Allow this to be used with older opsets as well.
358
+ resize_attributes += ', antialias = 1'
359
+
360
+ u64_1_str = ""
361
+
362
+ # Rank 3 input uses trilinear interpolation, so if input is HWC or CHW we need to add a temporary batch dim
363
+ # to make it rank 4, which will result in Resize using the desired bilinear interpolation.
364
+ if add_batch_dim:
365
+ u64_1_str = "u64_1 = Constant <value = int64[1] {1}> ()"
366
+ sizes_str = "u64_1, " + sizes_str
367
+ resize_str = \
368
+ f"""\
369
+ axes = Constant <value = int64[1] {{{0}}}> ()
370
+ unsqueezed = Unsqueeze ({self.input_names[0]}, axes)
371
+ resized = Resize <{resize_attributes}> (unsqueezed, , , sizes_resize)
372
+ {self.output_names[0]} = Squeeze (resized, axes)
373
+ """
374
+ else:
375
+ resize_str = \
376
+ f"{self.output_names[0]} = Resize <{resize_attributes}> ({self.input_names[0]}, , , sizes_resize)"
377
+
378
+ split_input_shape_attr = "axis = 0"
379
+ split_new_sizes_attr = "axis = 0"
380
+ if onnx_opset >= 18:
381
+ # Split now requires the number of outputs to be specified even though that can be easily inferred...
382
+ split_input_shape_attr += f", num_outputs = {len(dims)}"
383
+ split_new_sizes_attr += ", num_outputs = 2"
384
+
385
+ # Resize-18 has the attribute "not_larger/not_smaller" to specify the resize policy, however
386
+ # we want to support older opsets as well.
387
+ assert self.policy_ in ["not_smaller", "not_larger"], \
388
+ f"Unsupported resize policy of {self.policy_}, must be 'not_smaller' or 'not_larger'"
389
+
390
+ ratio_resize_func = "ReduceMax"
391
+ if self.policy_ == "not_larger":
392
+ ratio_resize_func = "ReduceMin"
393
+
394
+ resize_graph = onnx.parser.parse_graph(
395
+ f"""\
396
+ resize ({input_type_str}[{input_shape_str}] {self.input_names[0]}) =>
397
+ ({input_type_str}[{output_shape_str}] {self.output_names[0]})
398
+ {{
399
+ target_size = Constant <value = float[2] {{{float(self._height)}, {float(self._width)}}}> ()
400
+ image_shape = Shape ({self.input_names[0]})
401
+ {split_str} = Split <{split_input_shape_attr}> (image_shape)
402
+ hw = Concat <axis = 0> (h, w)
403
+ f_hw = Cast <to = 1> (hw)
404
+ ratios = Div (target_size, f_hw)
405
+ ratio_resize = {ratio_resize_func} (ratios)
406
+ f_hw2_exact = Mul (f_hw, ratio_resize)
407
+ f_hw2_round = Round (f_hw2_exact)
408
+ hw2 = Cast <to = 7> (f_hw2_round)
409
+ h2, w2 = Split <{split_new_sizes_attr}> (hw2)
410
+ {u64_1_str}
411
+ sizes_resize = Concat <axis = 0> ({sizes_str})
412
+ {resize_str}
413
+ }}
414
+ """
415
+ )
416
+
417
+ return resize_graph
418
+
419
+
420
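The size arithmetic in the Resize graph (ratios, ReduceMax/ReduceMin, round) is easy to sanity-check outside ONNX. A minimal numpy sketch of the docstring's 1200 x 600 example, illustrative only:

    import numpy as np

    h, w = 1200, 600                                     # original image (from the docstring example)
    target = np.array([300.0, 300.0], dtype=np.float32)  # requested (height, width)

    ratios = target / np.array([h, w], dtype=np.float32)
    ratio = ratios.max()        # 'not_smaller' -> ReduceMax; use ratios.min() for 'not_larger'
    new_hw = np.round(np.array([h, w]) * ratio).astype(np.int64)
    print(new_hw)               # [600 300]: neither side is smaller than the requested 300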
+ class CenterCrop(Step):
421
+ """
422
+ Crop the input to the requested dimensions, with the crop being centered.
423
+ Currently only HWC input is handled.
424
+ """
425
+
426
+ def __init__(self, height: int, width: int, name: Optional[str] = None):
427
+ """
428
+ Args:
429
+ height: Height of area to crop.
430
+ width: Width of area to crop.
431
+ name: Optional step name. Defaults to 'CenterCrop'
432
+ """
433
+ super().__init__(["image"], ["cropped_image"], name)
434
+ self._height = height
435
+ self._width = width
436
+
437
+ def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
438
+ input_type_str, input_shape_str = self._get_input_type_and_shape_strs(graph, 0)
439
+ dims = input_shape_str.split(",")
440
+ output_shape_str = f"{self._height}, {self._width}, {dims[-1]}"
441
+
442
+ crop_graph = onnx.parser.parse_graph(
443
+ f"""\
444
+ crop ({input_type_str}[{input_shape_str}] {self.input_names[0]})
445
+ => ({input_type_str}[{output_shape_str}] {self.output_names[0]})
446
+ {{
447
+ target_crop = Constant <value = int64[2] {{{self._height}, {self._width}}}> ()
448
+ i64_2 = Constant <value = int64[1] {{2}}> ()
449
+ axes = Constant <value = int64[2] {{0, 1}}> ()
450
+ x_shape = Shape ({self.input_names[0]})
451
+ hw = Gather (x_shape, axes)
452
+ hw_diff = Sub (hw, target_crop)
453
+ start_xy = Div (hw_diff, i64_2)
454
+ end_xy = Add (start_xy, target_crop)
455
+ {self.output_names[0]} = Slice ({self.input_names[0]}, start_xy, end_xy, axes)
456
+ }}
457
+ """
458
+ )
459
+
460
+ return crop_graph
461
+
462
+
463
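The crop window is derived exactly as in the graph above (Shape, Sub, Div, Add, Slice). A short numpy equivalent with hypothetical sizes:

    import numpy as np

    image = np.arange(10 * 8 * 3, dtype=np.uint8).reshape(10, 8, 3)   # HWC input
    target = np.array([6, 4])                                         # requested (height, width)

    hw = np.array(image.shape[:2])
    start = (hw - target) // 2            # centre the crop
    end = start + target
    cropped = image[start[0]:end[0], start[1]:end[1], :]
    assert cropped.shape == (6, 4, 3)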
+ class Normalize(Step):
464
+ """
465
+ Normalize input data on a per-channel basis.
466
+ `x -> (x - mean) / stddev`
467
+ Output is float with same shape as input.
468
+ """
469
+
470
+ def __init__(self, normalization_values: List[Tuple[float, float]], layout: str = "CHW", name: Optional[str] = None):
471
+ """
472
+ Args:
473
+ normalization_values: Tuple with (mean, stddev). One entry per channel.
474
+ If single entry is provided it will be used for all channels.
475
+ layout: Input layout. Can be 'CHW' or 'HWC'
476
+ name: Optional step name. Defaults to 'Normalize'
477
+ """
478
+ super().__init__(["data"], ["normalized_data"], name)
479
+
480
+ # duplicate for each channel if needed
481
+ if len(normalization_values) == 1:
482
+ normalization_values *= 3
483
+
484
+ assert len(normalization_values) == 3
485
+ self._normalization_values = normalization_values
486
+ assert layout == "HWC" or layout == "CHW"
487
+ self._hwc_layout = True if layout == "HWC" else False
488
+
489
+ def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
490
+ mean0 = self._normalization_values[0][0]
491
+ mean1 = self._normalization_values[1][0]
492
+ mean2 = self._normalization_values[2][0]
493
+ stddev0 = self._normalization_values[0][1]
494
+ stddev1 = self._normalization_values[1][1]
495
+ stddev2 = self._normalization_values[2][1]
496
+
497
+ input_type_str, input_shape_str = self._get_input_type_and_shape_strs(graph, 0)
498
+ values_shape = "3" if self._hwc_layout else "3, 1, 1"
499
+
500
+ normalize_graph = onnx.parser.parse_graph(
501
+ f"""\
502
+ normalize ({input_type_str}[{input_shape_str}] {self.input_names[0]})
503
+ => (float[{input_shape_str}] {self.output_names[0]})
504
+ {{
505
+ kMean = Constant <value = float[{values_shape}] {{{mean0}, {mean1}, {mean2}}}> ()
506
+ kStddev = Constant <value = float[{values_shape}] {{{stddev0}, {stddev1}, {stddev2}}}> ()
507
+ f_input = Cast <to = 1> ({self.input_names[0]})
508
+ f_sub_mean = Sub (f_input, kMean)
509
+ {self.output_names[0]} = Div (f_sub_mean, kStddev)
510
+ }}
511
+ """
512
+ )
513
+
514
+ onnx.checker.check_graph(normalize_graph)
515
+ return normalize_graph
516
+
517
+
518
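The per-channel arithmetic and the two broadcast shapes the step chooses ({3} for HWC, {3, 1, 1} for CHW) can be mirrored in numpy. The (mean, stddev) pairs below are hypothetical placeholders, not values shipped with the package:

    import numpy as np

    norm_values = [(0.485 * 255, 0.229 * 255), (0.456 * 255, 0.224 * 255), (0.406 * 255, 0.225 * 255)]
    mean = np.array([m for m, _ in norm_values], dtype=np.float32)
    stddev = np.array([s for _, s in norm_values], dtype=np.float32)

    hwc = np.random.randint(0, 256, size=(224, 224, 3)).astype(np.float32)
    normalized_hwc = (hwc - mean) / stddev                   # HWC: constants broadcast with shape {3}

    chw = np.transpose(hwc, (2, 0, 1))
    normalized_chw = (chw - mean.reshape(3, 1, 1)) / stddev.reshape(3, 1, 1)   # CHW: shape {3, 1, 1}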
+ #
519
+ # Utilities
520
+ #
521
+ class ImageBytesToFloat(Step):
522
+ """
523
+ Convert uint8 or float values in range 0..255 to floating point values in range 0..1
524
+ """
525
+
526
+ def __init__(self, rescale_factor: float = 1/255, name: Optional[str] = None):
527
+ """
528
+ Args:
529
+ rescale_factor: Value the input is multiplied by after conversion to float. Defaults to 1/255 so 0..255 input maps to 0..1.
+ name: Optional step name. Defaults to 'ImageBytesToFloat'
530
+ """
531
+ super().__init__(["data"], ["float_data"], name)
532
+ self.rescale_factor_ = rescale_factor
533
+
534
+ def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
535
+ input_type_str, input_shape_str = self._get_input_type_and_shape_strs(graph, 0)
536
+ if input_type_str == "uint8":
537
+ optional_cast = f"""\
538
+ input_f = Cast <to = 1> ({self.input_names[0]})
539
+ """
540
+ else:
541
+ # no-op that optimizer will remove
542
+ optional_cast = f"input_f = Identity ({self.input_names[0]})"
543
+
544
+ byte_to_float_graph = onnx.parser.parse_graph(
545
+ f"""\
546
+ byte_to_float ({input_type_str}[{input_shape_str}] {self.input_names[0]})
547
+ => (float[{input_shape_str}] {self.output_names[0]})
548
+ {{
549
+ f_scale = Constant <value = float[1] {{{self.rescale_factor_}}}>()
550
+
551
+ {optional_cast}
552
+ {self.output_names[0]} = Mul(input_f, f_scale)
553
+ }}
554
+ """
555
+ )
556
+
557
+ onnx.checker.check_graph(byte_to_float_graph)
558
+ return byte_to_float_graph
559
+
560
+
561
+ class FloatToImageBytes(Step):
562
+ """
563
+ Convert floating point values to uint8 values in the range 0..255.
564
+ Typically this reverses ImageBytesToFloat by converting input data in the range 0..1, but an optional multiplier
565
+ can be specified if the input data has a different range.
566
+ Values will be rounded prior to clipping and conversion to uint8.
567
+ """
568
+
569
+ def __init__(self, multiplier: float = 255.0, name: Optional[str] = None):
570
+ """
571
+ Args:
572
+ multiplier: Optional multiplier. Currently, the expected values are 255 (input data is in range 0..1), or
573
+ 1 (input data is in range 0..255).
574
+ name: Optional step name. Defaults to 'FloatToImageBytes'
575
+ """
576
+ super().__init__(["float_data"], ["pixel_data"], name)
577
+ self._multiplier = multiplier
578
+
579
+ def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
580
+ input_type_str, input_shape_str = self._get_input_type_and_shape_strs(graph, 0)
581
+ assert input_type_str == "float"
582
+
583
+ if self._multiplier == 1.0:
584
+ scale_input = ''
585
+ scaled_input_name = self.input_names[0]
586
+ else:
587
+ scale_input = \
588
+ f"""\
589
+ f_multiplier = Constant <value = float[1] {{{self._multiplier}}}> ()
590
+ scaled_input = Mul ({self.input_names[0]}, f_multiplier)
591
+ """
592
+ scaled_input_name = 'scaled_input'
593
+
594
+ float_to_byte_graphs = onnx.parser.parse_graph(
595
+ f"""\
596
+ float_to_type (float[{input_shape_str}] {self.input_names[0]})
597
+ => (uint8[{input_shape_str}] {self.output_names[0]})
598
+ {{
599
+ f_0 = Constant <value = float[1] {{0.0}}> ()
600
+ f_255 = Constant <value = float[1] {{255.0}}>()
601
+
602
+ {scale_input}
603
+ rounded = Round ({scaled_input_name})
604
+ clipped = Clip (rounded, f_0, f_255)
605
+ {self.output_names[0]} = Cast <to = {onnx.TensorProto.UINT8}> (clipped)
606
+ }}
607
+ """
608
+ )
609
+
610
+ onnx.checker.check_graph(float_to_byte_graphs)
611
+ return float_to_byte_graphs
612
+
613
+
614
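Taken together, ImageBytesToFloat and FloatToImageBytes form an approximate round trip (scale, then scale back with round/clip/cast). A numpy sketch assuming the default rescale_factor (1/255) and multiplier (255):

    import numpy as np

    pixels = np.array([0, 1, 127, 254, 255], dtype=np.uint8)

    floats = pixels.astype(np.float32) * np.float32(1.0 / 255.0)               # ImageBytesToFloat
    restored = np.clip(np.round(floats * 255.0), 0.0, 255.0).astype(np.uint8)  # FloatToImageBytes
    assert np.array_equal(restored, pixels)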
+ class ChannelsLastToChannelsFirst(Transpose):
615
+ """
616
+ Convert channels last data to channels first.
617
+ Input can be NHWC or HWC.
618
+ """
619
+
620
+ def __init__(self, has_batch_dim: bool = False, name: Optional[str] = None):
621
+ """
622
+ Args:
623
+ has_batch_dim: Set to True if the input has a batch dimension (i.e. is NHWC)
624
+ name: Optional step name. Defaults to 'ChannelsLastToChannelsFirst'
625
+ """
626
+ perms = [0, 3, 1, 2] if has_batch_dim else [2, 0, 1]
627
+ super().__init__(perms, name)
628
+
629
+
630
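The two permutations this step passes to Transpose correspond to the following numpy transposes (illustrative only):

    import numpy as np

    hwc = np.zeros((224, 224, 3), dtype=np.float32)
    chw = np.transpose(hwc, (2, 0, 1))          # perms when has_batch_dim=False
    assert chw.shape == (3, 224, 224)

    nhwc = np.zeros((1, 224, 224, 3), dtype=np.float32)
    nchw = np.transpose(nhwc, (0, 3, 1, 2))     # perms when has_batch_dim=True
    assert nchw.shape == (1, 3, 224, 224)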
+ class DrawBoundingBoxes(Step):
631
+ """
632
+ Draw boxes on a BGR image at the given positions. The image is channels-last (HWC) and BGR ordered.
633
+ Input shape: <uint8_t>{height, width, 3<BGR>}
634
+ boxes: <float>{num_boxes, 6<x, y, x/w, y/h, score, class>}
635
+ The coordinates are absolute pixel values in the image; how they are interpreted is determined by `mode`.
636
+ Supported modes for representing the box coordinates are [XYXY, XYWH, CENTER_XYWH].
637
+ Please refer to the following link for more details. https://keras.io/api/keras_cv/bounding_box/formats/
638
+ **score** is the confidence of the box (object score * class probability) and **class** is the class of the box.
639
+
640
+ Output shape: <uint8_t>{height, width, 3<BGR>}
641
+ """
642
+
643
+ def __init__(self, mode: str = "XYXY", thickness: int = 4, num_classes: int = 10,
644
+ colour_by_classes=False, name: Optional[str] = None):
645
+ """
646
+ Args:
647
+ mode: The mode of the boxes,
648
+ "XYXY" (xmin ymin xmax ymax) All values in the XYXY format should be absolute pixel values.
649
+ "XYWH" (xmin ymin width height)
650
+ "CENTER_XYWH" (x_center, y_center, width, height)
651
+ All values in the CENTER_XYWH format should be absolute pixel values.
652
+
653
+
654
+ thickness: Thickness of the box edge
655
+ num_classes: Number of classes/colours to use
656
+ We support 10 predefined colours; classes beyond the first 10 are not drawn.
657
+ colors are [Red, Yellow, Lime, Cyan, Blue, Magenta, Orange, Maroon, Green, Navy]
658
+ and are used in that order. i.e. result with best score will use red.
659
+ colour_by_classes: Colour boxes by classes or by score.
660
+ If `True` we use a colour for each unique class, with all results from the top
661
+ `num_classes` classes displayed. A colour is only used for a single class.
662
+ If `False`, we draw boxes for the top `num_classes` results. A colour is used
663
+ for a single result, regardless of class.
664
+ name: Optional name of step. Defaults to 'DrawBoundingBoxes'
665
+ """
666
+ super().__init__(["image", "boxes"], ["image_out"], name)
667
+ self.thickness_ = thickness
668
+ self.num_classes_ = num_classes
669
+ self.colour_by_classes_ = colour_by_classes
670
+ self.mode_ = mode
671
+
672
+ def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
673
+ input0_type_str, input0_shape_str = self._get_input_type_and_shape_strs(graph, 0)
674
+ input1_type_str, input1_shape_str = self._get_input_type_and_shape_strs(graph, 1)
675
+ assert input0_type_str == "uint8" and input1_type_str == "float"
676
+
677
+ assert str(input1_shape_str.split(",")[-1]) == "6"
678
+
679
+
680
+ output_shape_str = input0_shape_str
681
+ converter_graph = onnx.parser.parse_graph(
682
+ f"""\
683
+ bounding_box (uint8[{input0_shape_str}] {self.input_names[0]}, float[{input1_shape_str}] {self.input_names[1]})
684
+ => (uint8[{output_shape_str}] {self.output_names[0]})
685
+ {{
686
+ {self.output_names[0]} = com.microsoft.extensions.DrawBoundingBoxes({self.input_names[0]}, {self.input_names[1]})
687
+ }}
688
+ """
689
+ )
690
+ op_attr = ["thickness", "num_classes", "colour_by_classes","mode"]
691
+ token_model_attr = []
692
+ token_model_attr.append(onnx.helper.make_attribute(op_attr[0], self.thickness_))
693
+ token_model_attr.append(onnx.helper.make_attribute(op_attr[1], self.num_classes_))
694
+ token_model_attr.append(onnx.helper.make_attribute(op_attr[2], int(self.colour_by_classes_)))
695
+ token_model_attr.append(onnx.helper.make_attribute(op_attr[3], self.mode_))
696
+ converter_graph.node[0].attribute.extend(token_model_attr)
697
+
698
+ return converter_graph
699
+
700
+
701
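For reference, the expected `boxes` input layout, using two hypothetical detections, and how the XYXY and XYWH modes relate to each other:

    import numpy as np

    # mode="XYXY": x_min, y_min, x_max, y_max, score, class
    boxes_xyxy = np.array([[ 40.0,  50.0, 200.0, 300.0, 0.92, 0.0],
                           [210.0,  80.0, 380.0, 240.0, 0.75, 2.0]], dtype=np.float32)

    # The same detections expressed for mode="XYWH": x_min, y_min, width, height
    boxes_xywh = boxes_xyxy.copy()
    boxes_xywh[:, 2:4] = boxes_xyxy[:, 2:4] - boxes_xyxy[:, 0:2]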
+ class LetterBox(Step):
702
+ """
703
+ Image is channel last and ordered by BGR.
704
+ Mainly used in object detection; it typically follows a Resize operation.
705
+ This step either adds a border or crops the image to satisfy the network input size.
706
+ ----- bbbbbbbbb
707
+ |img| --- > bb-----bb
708
+ ----- bb|img|bb
709
+ bb-----bb
710
+ bbbbbbbbb
711
+ If target_shape is smaller than the original image, the image is center-cropped.
712
+ In that case the padding values are negative and the Pad op performs the cropping.
713
+
714
+ Input shape: <uint8_t>{height, width, 3<BGR>}
715
+ target_shape: <uint8_t>{out_height, out_width, 3<BGR>}
716
+ layout: HWC or CHW are supported
717
+ Output shape: specified by target_shape
718
+ """
719
+
720
+ def __init__(self, target_shape: Union[int, Tuple[int, int]], fill_value=0, layout: str = "HWC",
721
+ name: Optional[str] = None):
722
+ """
723
+ Args:
724
+ target_shape: the size of the output image
725
+ fill_value: a constant value used to fill the border
+ layout: Input layout. 'HWC' and 'CHW' are supported
726
+ name: Optional name of step. Defaults to 'LetterBox'
727
+ """
728
+ super().__init__(["image"], ["image_pad"], name)
729
+
730
+ self.target_shape_ = target_shape
731
+ self.fill_value_ = fill_value
732
+
733
+ if layout != "HWC" and layout != "CHW":
734
+ raise ValueError("Invalid layout. Only HWC and CHW are supported")
735
+
736
+ self.layout_ = layout
737
+
738
+ def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
739
+ input0_type_str, input0_shape_str = self._get_input_type_and_shape_strs(graph, 0)
740
+ assert len(input0_shape_str.split(',')) == 3, "expected HWC or CHW input"
741
+
742
+ target_shape = f"{self.target_shape_[0]}, {self.target_shape_[1]}"
743
+
744
+ if self.layout_ == "HWC":
745
+ target_shape_str = f"{target_shape}, 3"
746
+ split_input_shape_output = "h, w, c"
747
+ concat_input_order = "half_pad_hw, i64_0, remainder_pad_hw, i64_0"
748
+ else:
749
+ target_shape_str = f"3, {target_shape}"
750
+ split_input_shape_output = "c, h, w"
751
+ concat_input_order = "i64_0, half_pad_hw, i64_0, remainder_pad_hw"
752
+
753
+ split_input_shape_attr = "axis = 0"
754
+ if onnx_opset >= 18:
755
+ # Split now requires the number of outputs to be specified even though that can be easily inferred...
756
+ split_input_shape_attr += f", num_outputs = 3"
757
+
758
+ graph_text = (
759
+ f"""\
760
+ LetterBox (uint8[{input0_shape_str}] {self.input_names[0]})
761
+ => (uint8[{target_shape_str}] {self.output_names[0]})
762
+ {{
763
+ target_size = Constant <value = int64[2] {{{(self.target_shape_[0])}, {(self.target_shape_[1])}}}> ()
764
+ i64_2 = Constant <value = int64[1] {{2}}>()
765
+ i64_0 = Constant <value = int64[1] {{0}}>()
766
+ const_val = Constant <value = uint8[1] {{{self.fill_value_}}}> ()
767
+ image_shape = Shape ({self.input_names[0]})
768
+ {split_input_shape_output} = Split <{split_input_shape_attr}> (image_shape)
769
+ hw = Concat <axis = 0> (h, w)
770
+ pad_hw = Sub (target_size, hw)
771
+ half_pad_hw = Div (pad_hw, i64_2)
772
+ remainder_pad_hw = Sub (pad_hw, half_pad_hw)
773
+ pad_value = Concat <axis = 0> ({concat_input_order})
774
+ {self.output_names[0]} = Pad({self.input_names[0]}, pad_value, const_val)
775
+ }}
776
+ """
777
+ )
778
+
779
+ converter_graph = onnx.parser.parse_graph(graph_text)
780
+
781
+ return converter_graph
782
+
783
+
784
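The padding amounts are derived exactly as in the graph (Sub, Div, Sub, Concat). A numpy sketch of the positive-padding case with hypothetical sizes; note np.pad cannot express the negative-padding (cropping) case that the Pad op handles:

    import numpy as np

    target_hw = np.array([640, 640])     # requested (height, width)
    image_hw = np.array([600, 300])      # e.g. the output of a preceding Resize step

    pad = target_hw - image_hw           # total padding per dimension
    half = pad // 2                      # border before the image (top / left)
    remainder = pad - half               # border after the image (bottom / right)

    image = np.zeros((600, 300, 3), dtype=np.uint8)
    padded = np.pad(image, ((half[0], remainder[0]), (half[1], remainder[1]), (0, 0)), constant_values=0)
    assert padded.shape == (640, 640, 3)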
+ class SplitOutBoxAndScoreWithConf(Step):
785
+ r"""
786
+ Split the output of the model into boxes and scores, applying the object confidence score.
787
+ Input shape: <float>{num_boxes, <4 box co-ords, conf score, num_classes>}
788
+ Output shape: <float>{num_boxes, 4}, <float>{num_boxes, num_classes}
789
+ |x1,x2,x3,x4, obj_conf, cls_1, ... cls_num|
790
+ /\
791
+ / \
792
+ |x1,x2,x3,x4| |cls_1, ... cls_num|*obj_conf
793
+ """
794
+
795
+ def __init__(self, num_classes: int, name: Optional[str] = None):
796
+ """
797
+ Args:
798
+ num_classes: number of classes
799
+ name: Optional name of step. Defaults to 'SplitOutBoxAndScoreWithConf'
800
+ """
801
+
802
+ super().__init__(["box_conf_scores"], ["boxes", "scores"], name)
803
+ self.num_classes_ = num_classes
804
+
805
+ def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
806
+ input0_type_str, input0_shape_str = self._get_input_type_and_shape_strs(graph, 0)
807
+
808
+ input_shape_list = input0_shape_str.split(',')
809
+ assert len(input_shape_list) == 2, " expected [num_boxes, 5+num_classes]"
810
+
811
+ target_shape_str_0 = f"{input_shape_list[0]}, 4"
812
+ target_shape_str_1 = f"{input_shape_list[0]}, _{self._step_num}_class"
813
+
814
+ converter_graph = onnx.parser.parse_graph(
815
+ f"""\
816
+ SplitOutBoxConfidenceAndScore (float[{input0_shape_str}] {self.input_names[0]})
817
+ => (float[{target_shape_str_0}] {self.output_names[0]},
818
+ float[{target_shape_str_1}] {self.output_names[1]})
819
+ {{
820
+ split_sizes = Constant <value = int64[3] {{4, 1, {self.num_classes_}}}>()
821
+ {self.output_names[0]}, conf, orig_scores = Split <axis=-1>({self.input_names[0]}, split_sizes)
822
+
823
+ scores_with_conf = Mul(orig_scores, conf)
824
+ {self.output_names[1]} = Identity (scores_with_conf)
825
+ }}
826
+ """
827
+ )
828
+ return converter_graph
829
+
830
+
831
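A numpy sketch of the split-and-weight logic, using one hypothetical detection row with 3 classes:

    import numpy as np

    num_classes = 3
    # One raw detection row: x, y, w, h, object confidence, then per-class scores
    raw = np.array([[100.0, 120.0, 50.0, 80.0, 0.9, 0.2, 0.7, 0.1]], dtype=np.float32)

    boxes, conf, class_scores = np.split(raw, [4, 5], axis=-1)
    scores = class_scores * conf          # class scores weighted by the object confidence
    assert boxes.shape == (1, 4) and scores.shape == (1, num_classes)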
+ class SelectBestBoundingBoxesByNMS(Step):
832
+ """
833
+ Non-maximum suppression (NMS) is used to select the best bounding boxes.
834
+ Input:
835
+ boxes: float[num_boxes, 4]
836
+ scores: float[num_boxes, num_classes]
837
+ masks: float[num_boxes, mask_data]. optional
838
+
839
+ Output:
840
+ nms_out: float[_few_num_boxes, <box+score+class+mask_data>]
841
+ """
842
+
843
+ def __init__(self,
844
+ iou_threshold: Optional[float] = 0.5,
845
+ score_threshold: Optional[float] = 0.67,
846
+ max_boxes_per_class: Optional[int] = 100,
847
+ max_detections: Optional[int] = None,
848
+ has_mask_data: Optional[bool] = False, name: Optional[str] = None):
849
+ """
850
+ Args: Please refer to https://github.com/onnx/onnx/blob/main/docs/Operators.md#NonMaxSuppression
851
+ for more details about the parameters.
852
+ iou_threshold: same as NonMaxSuppression op, intersection/union of boxes
853
+ score_threshold: If this box's score is lower than score_threshold, it will be removed.
854
+ max_boxes_per_class: max number of boxes to be selected per class
855
+ max_detections: maximum number of boxes in total. Applied as the last step of processing if specified.
856
+ has_mask_data: set to True if the input includes a third 'masks' input with mask/key point data per box.
+ name: Optional name of step. Defaults to 'SelectBestBoundingBoxesByNMS'
857
+ """
858
+ inputs = ["boxes", "scores"]
859
+ if has_mask_data:
860
+ inputs.append("masks")
861
+
862
+ super().__init__(inputs, ["nms_out"], name)
863
+
864
+ self.iou_threshold_ = iou_threshold
865
+ self.score_threshold_ = score_threshold
866
+ self.max_boxes_per_class_ = max_boxes_per_class
867
+ self.max_detections_ = max_detections
868
+
869
+ def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
870
+ input0_type_str, input0_shape_str = self._get_input_type_and_shape_strs(graph, 0)
871
+ input1_type_str, input1_shape_str = self._get_input_type_and_shape_strs(graph, 1)
872
+
873
+ input0_shape_list = input0_shape_str.split(',')
874
+ assert len(input0_shape_list) == 2, " expected [num_boxes, 4]"
875
+
876
+ has_mask_input = len(self.input_names) == 3
877
+
878
+ input_2 = ""
879
+ mask_i = ""
880
+ mask_select = ""
881
+ concat_for_output = "boxes_select, score_select, class_select"
882
+ output_size_str = "6"
883
+ # reduce_score picks the class with the best score for the selected box
884
+ reduce_score = '(score_select_nm, i64_neg1)' if onnx_opset >= 18 else '<axes=[-1]>(score_select_nm)'
885
+
886
+ if has_mask_input:
887
+ input2_type_str, input2_shape_str = self._get_input_type_and_shape_strs(graph, 2)
888
+ input_2 = f", float[{input2_shape_str}] {self.input_names[2]}"
889
+ mask_i = f"masks_i = Identity({self.input_names[2]})"
890
+ mask_select = "mask_select = Gather <axis=0>(masks_i, box_idxs)"
891
+ concat_for_output += ", mask_select"
892
+
893
+ mask_size_str = input2_shape_str.split(",")[-1]
894
+ if mask_size_str.isnumeric():
895
+ output_size_str = str(6 + int(mask_size_str))
896
+ else:
897
+ output_size_str = f"_step{self._step_num}_6_+_mask_size"
898
+
899
+ if self.max_detections_:
900
+ # squeeze scores from [num_results, 1] to [num_results]
901
+ # use TopK to find the best scores for the selected boxes, but only if the number of results is
902
+ # greater than max_detections, and there are results (otherwise calling TopK is invalid).
903
+ # We sort the selected indices to maintain the original ordering for consistency when TopK isn't required
904
+ apply_max_detections = \
905
+ f"""
906
+ max_detections = Constant <value = int64[1] {{{self.max_detections_}}}>()
907
+ num_results = Shape(scores)
908
+ num_results_less_than_max = Less(num_results, max_detections)
909
+ k = Where(num_results_less_than_max, num_results, max_detections)
910
+ have_results = Greater(k, i64_0)
911
+ final_results = If<
912
+ then_branch=then_graph() =>
913
+ (float[_{self._step_num}_selected_boxes, {output_size_str}] then_output)
914
+ {{
915
+ topk_scores, topk_i = TopK<axis = 0>(scores, k)
916
+ # use Unique to sort. no onnx op seems to provide that directly.
917
+ sorted_topk_i = Unique<sorted=1>(topk_i)
918
+ then_output = Gather<axis = 0>(merged_results, sorted_topk_i)
919
+ }},
920
+ else_branch=else_graph() =>
921
+ (float[_{self._step_num}_selected_boxes, {output_size_str}] else_output)
922
+ {{
923
+ else_output = Identity(merged_results)
924
+ }}>
925
+ (have_results)
926
+ """
927
+
928
+ else:
929
+ apply_max_detections = "final_results = Identity(merged_results)"
930
+
931
+ graph_text = \
932
+ f"""
933
+ SelectBestBoundingBoxesByNMS (float[{input0_shape_str}] {self.input_names[0]},
934
+ float[{input1_shape_str}] {self.input_names[1]}
935
+ {input_2})
936
+ => (float[_{self._step_num}_selected_boxes, {output_size_str}] {self.output_names[0]})
937
+ {{
938
+ i64_neg1 = Constant <value = int64[1] {{-1}}>()
939
+ i64_0 = Constant <value = int64[1] {{0}}>()
940
+ i64_1 = Constant <value = int64[1] {{1}}>()
941
+ i64_2 = Constant <value = int64[1] {{2}}>()
942
+ i64_1_2 = Constant <value = int64[2] {{1, 2}}>()
943
+ max_per_class = Constant <value = int64[1] {{{self.max_boxes_per_class_}}}>()
944
+ iou_th = Constant <value = float[1] {{{self.iou_threshold_}}}>()
945
+ score_th = Constant <value = float[1] {{{self.score_threshold_}}}>()
946
+
947
+ boxes_i = Identity({self.input_names[0]})
948
+ scores_i = Identity({self.input_names[1]})
949
+ {mask_i}
950
+
951
+ scores_c_b = Transpose<perm=[1,0]>(scores_i)
952
+ batch_boxes = Unsqueeze(boxes_i, i64_0)
953
+ batch_scores = Unsqueeze(scores_c_b, i64_0)
954
+
955
+ # NMS returns [num_selected_boxes, 3] where each entry is [batch, class idx, box idx]
956
+ nmsbox = NonMaxSuppression<center_point_box=1>(batch_boxes, batch_scores, max_per_class,
957
+ iou_th, score_th)
958
+
959
+ # extract class values
960
+ nms_classes = Gather<axis=-1>(nmsbox, i64_1)
961
+ class_select = Cast<to = 1>(nms_classes)
962
+
963
+ # extract box indexes and select box info using them.
964
+ nms_boxes = Gather<axis=-1>(nmsbox, i64_2)
965
+ box_idxs = Squeeze(nms_boxes, i64_neg1)
966
+ boxes_select = Gather<axis=0>(boxes_i, box_idxs)
967
+
968
+ # scores_c_b is [classes, boxes]
969
+ # box_class_idxs is [selected_boxes, 2] where the 2 values are class idx, box idx
970
+ class_box_idxs = Gather<axis=-1>(nmsbox, i64_1_2)
971
+ scores = GatherND(scores_c_b, class_box_idxs)
972
+ score_select = Unsqueeze(scores, i64_neg1)
973
+
974
+ {mask_select}
975
+
976
+ merged_results = Concat <axis = -1> ({concat_for_output})
977
+
978
+ {apply_max_detections}
979
+
980
+ {self.output_names[0]} = Identity(final_results)
981
+ }}
982
+ """
983
+
984
+ converter_graph = onnx.parser.parse_graph(graph_text)
985
+
986
+ return converter_graph
987
+
988
+
989
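For reference, how a consumer would typically slice the nms_out layout described above; the rows are hypothetical:

    import numpy as np

    # Each row: 4 box values, score, class (mask/key point data would follow when present)
    nms_out = np.array([[320.0, 240.0, 100.0, 80.0, 0.88, 1.0],
                        [ 80.0,  60.0,  40.0, 30.0, 0.72, 0.0]], dtype=np.float32)

    boxes = nms_out[:, :4]
    scores = nms_out[:, 4]
    classes = nms_out[:, 5].astype(np.int64)
    mask_data = nms_out[:, 6:]    # empty here; populated when has_mask_data=True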
+ class ScaleNMSBoundingBoxesAndKeyPoints(Step):
990
+ """
991
+ Scale bounding box and key point coordinates (from the optional mask data) back to the original image.
992
+
993
+ Input image goes through Resize and LetterBox steps during pre-processing (in that order), and the output of this
994
+ is what the original model runs against.
995
+ To display the predictions on the original image we need to apply the reverse size changes to the co-ordinates
996
+ of the bounding boxes.
997
+
998
+ nms_step_output inner dimension has 4 values for the bounding box, 1 for the score, 1 for the selected class,
999
+ and the remainder (if any) is the mask data.
1000
+
1001
+ The mask data has values for a fixed number of key points. Each key point has an x and y value, and optionally a
1002
+ confidence value.
1003
+
1004
+ input:
1005
+ nms_step_output: output of SelectBestBoundingBoxesByNMS Step, shape [num_boxes, 6+]
1006
+ original_image: original image decoded from jpg/png, <uint8_t>[H, W, 3] or [3, H, W]
1007
+ resized_image: output from Resize pre-processing Step, <uint8_t>[H1, W1, 3] or [3, H1, W1]
1008
+ letter_boxed_image: output from LetterBox pre-processing Step, <uint8_t>[H2, W2, 3] or [3, H2, W2]
1009
+ num_key_points: number of key points in each mask data entry, if present. optional.
1010
+
1011
+ output:
1012
+ nms_output_with_scaled_boxes_and_keypoints: input data with boxes and key points scaled to original image.
1013
+ """
1014
+
1015
+ def __init__(self, num_key_points: Optional[int] = 0, layout: Optional[str] = "HWC", name: Optional[str] = None):
1016
+ """
1017
+ Args:
1018
+ num_key_points: Number of key points in mask data. Only required if input has optional mask data.
1019
+ layout: HWC or CHW. Used to determine where to read the H and W value from the input image shapes.
1020
+ MUST be the same for all 3 input images.
1021
+
1022
+ name: Optional name of step. Defaults to 'ScaleNMSBoundingBoxesAndKeyPoints'
1023
+ """
1024
+ super().__init__(["nms_step_output", "original_image", "resized_image", "letter_boxed_image"],
1025
+ ["nms_output_with_scaled_boxes_and_keypoints"], name)
1026
+ self._num_key_points = num_key_points
1027
+
1028
+ if layout != "HWC" and layout != "CHW":
1029
+ raise ValueError("Invalid layout. Only HWC and CHW are supported")
1030
+
1031
+ self.layout_ = layout
1032
+
1033
+ def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
1034
+ graph_input_params = []
1035
+
1036
+ for idx, input_name in enumerate(self.input_names):
1037
+ input_type_str, input_shape_str = self._get_input_type_and_shape_strs(graph, idx)
1038
+ graph_input_params.append(f"{input_type_str}[{input_shape_str}] {input_name}")
1039
+
1040
+ graph_input_params = ', '.join(graph_input_params)
1041
+
1042
+ if self.layout_ == "HWC":
1043
+ orig_image_h_w_c = "oh, ow, oc"
1044
+ scaled_image_h_w_c = "sh, sw, sc"
1045
+ letterboxed_image_h_w_c = "lh, lw, lc"
1046
+ else:
1047
+ orig_image_h_w_c = "oc, oh, ow"
1048
+ scaled_image_h_w_c = "sc, sh, sw"
1049
+ letterboxed_image_h_w_c = "lc, lh, lw"
1050
+
1051
+ def split_num_outputs(num_outputs: int):
1052
+ split_input_shape_attr = ''
1053
+ if onnx_opset >= 18:
1054
+ split_input_shape_attr = f", num_outputs = {num_outputs}"
1055
+ return split_input_shape_attr
1056
+
1057
+ nms_output_type_str, nms_output_shape_str = self._get_input_type_and_shape_strs(graph, 0)
1058
+ nms_output_shape = nms_output_shape_str.split(',')
1059
+ data_size_per_result = nms_output_shape[-1]
1060
+ if not data_size_per_result.isnumeric():
1061
+ # this should be known when adding pre-processing
1062
+ raise ValueError("Shape of input must have numeric value for the mask data size")
1063
+
1064
+ data_num_splits = 3 # splits of nms data into box[:2], box[2:4] , score+class, [mask]
1065
+ data_split_sizes = "2, 2, 2" # sizes of the splits
1066
+ score_class_masks = "score_class" # output name/s for trailing output/s from Split
1067
+ keypoint_processing = "" # operators to process the keypoints
1068
+ scaled_keypoints = "" # optional output from keypoint scaling
1069
+
1070
+ data_size = int(data_size_per_result)
1071
+ if data_size > 6:
1072
+ # we have mask data to split out
1073
+ data_num_splits = 4
1074
+ keypoint_data_size = data_size - 6
1075
+ data_split_sizes += f", {keypoint_data_size}"
1076
+ score_class_masks = "score_class, masks"
1077
+ scaled_keypoints = ", scaled_keypoints"
1078
+
1079
+ values_per_keypoint = int(keypoint_data_size / self._num_key_points)
1080
+ reshape_keypoints_to = ",".join([str(self._num_key_points), str(values_per_keypoint)])
1081
+
1082
+ if keypoint_data_size > 2:
1083
+ # split into xy and conf
1084
+ keypoints_xy_and_conf_from_keypoints = \
1085
+ f"""
1086
+ keypoints_split_sizes = Constant <value = int64[2] {{2, {values_per_keypoint - 2}}}>()
1087
+ keypoints_xy, conf = Split <axis = -1>(keypoints, keypoints_split_sizes)
1088
+ """
1089
+ # need to re-combine after scaling
1090
+ scaled_keypoints_and_conf = "scaled_keypoints_and_conf = Concat <axis=-1>(scaled_keypoints_xy, conf)"
1091
+
1092
+ else:
1093
+ # use the keypoint data as-is as we don't have 'conf' data to split out
1094
+ keypoints_xy_and_conf_from_keypoints = "keypoints_xy = Identity(keypoints)"
1095
+ scaled_keypoints_and_conf = "scaled_keypoints_and_conf = Identity(scaled_keypoints_xy)"
1096
+
1097
+ keypoint_processing = \
1098
+ f"""
1099
+ reshape_keypoints_to = Constant <value = int64[2] {{{reshape_keypoints_to}}}>()
1100
+ input_shape = Shape ({self.input_names[0]})
1101
+
1102
+ i64_0 = Constant <value = int64[1] {{0}}>()
1103
+ num_boxes = Gather <axis=0>(input_shape, i64_0)
1104
+ reshape_masks_to = Concat<axis=-1> (num_boxes, reshape_keypoints_to)
1105
+ keypoints = Reshape(masks, reshape_masks_to)
1106
+
1107
+ {keypoints_xy_and_conf_from_keypoints}
1108
+
1109
+ offset_keypoints_xy = Sub (keypoints_xy, f_half_pad_wh)
1110
+ scaled_keypoints_xy = Mul (offset_keypoints_xy, ratios)
1111
+
1112
+ {scaled_keypoints_and_conf}
1113
+
1114
+ orig_shape = Shape(masks)
1115
+ scaled_keypoints = Reshape(scaled_keypoints_and_conf, orig_shape)
1116
+ """
1117
+
1118
+ graph_text = \
1119
+ f"""\
1120
+ ScaleNMSBoundingBoxesAndKeyPoints
1121
+ ({graph_input_params}) => ({nms_output_type_str}[{nms_output_shape_str}] {self.output_names[0]})
1122
+ {{
1123
+ i64_2 = Constant <value = int64[1] {{2}}>()
1124
+ data_split_sizes = Constant <value = int64[{data_num_splits}] {{{data_split_sizes}}}>()
1125
+
1126
+ boxes_xy, boxes_wh_or_xy, {score_class_masks} = Split <axis=-1>({self.input_names[0]}, data_split_sizes)
1127
+
1128
+ ori_shape = Shape ({self.input_names[1]})
1129
+ scaled_shape = Shape ({self.input_names[2]})
1130
+ lettered_shape = Shape ({self.input_names[3]})
1131
+ {orig_image_h_w_c} = Split <axis = 0 {split_num_outputs(3)}> (ori_shape)
1132
+ {scaled_image_h_w_c} = Split <axis = 0 {split_num_outputs(3)}> (scaled_shape)
1133
+ {letterboxed_image_h_w_c} = Split <axis = 0 {split_num_outputs(3)}> (lettered_shape)
1134
+ swh = Concat <axis = -1> (sw,sh)
1135
+ lwh = Concat <axis = -1> (lw,lh)
1136
+
1137
+ f_oh = Cast <to = 1> (oh)
1138
+ f_sh = Cast <to = 1> (sh)
1139
+ ratios = Div (f_oh, f_sh)
1140
+
1141
+ pad_wh = Sub (lwh, swh)
1142
+ half_pad_wh = Div (pad_wh, i64_2)
1143
+ f_half_pad_wh = Cast <to = 1> (half_pad_wh)
1144
+
1145
+ offset_boxes_xy = Sub (boxes_xy, f_half_pad_wh)
1146
+ restored_boxes = Concat <axis=-1> (offset_boxes_xy, boxes_wh_or_xy)
1147
+ scaled_boxes = Mul (restored_boxes, ratios)
1148
+
1149
+ {keypoint_processing}
1150
+
1151
+ {self.output_names[0]} = Concat <axis=-1> (scaled_boxes, score_class {scaled_keypoints})
1152
+ }}
1153
+ """
1154
+
1155
+ converter_graph = onnx.parser.parse_graph(graph_text)
1156
+
1157
+ return converter_graph
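A numpy sketch of the reverse-letterbox-then-reverse-resize arithmetic this graph performs, with hypothetical image sizes and one hypothetical detection (coordinates in letter-boxed space):

    import numpy as np

    orig_hw = np.array([720.0, 1280.0])        # original image (H, W)
    resized_hw = np.array([360.0, 640.0])      # after Resize (aspect ratio preserved)
    lettered_hw = np.array([384.0, 640.0])     # after LetterBox

    ratio = orig_hw[0] / resized_hw[0]                          # 2.0; width uses the same ratio
    half_pad_wh = (lettered_hw[::-1] - resized_hw[::-1]) // 2   # (W, H) padding added by LetterBox -> [0, 12]

    det = np.array([320.0, 200.0, 50.0, 40.0, 0.9, 1.0])        # centre x, centre y, w, h, score, class
    scaled_xy = (det[:2] - half_pad_wh) * ratio                 # remove the padding offset, undo the resize
    scaled_wh = det[2:4] * ratio
    print(scaled_xy, scaled_wh)                                 # [640. 376.] [100.  80.]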