onnxruntime_extensions-0.14.0-cp313-cp313-macosx_11_0_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. onnxruntime_extensions/__init__.py +82 -0
  2. onnxruntime_extensions/_cuops.py +564 -0
  3. onnxruntime_extensions/_extensions_pydll.cpython-313-darwin.so +0 -0
  4. onnxruntime_extensions/_extensions_pydll.pyi +45 -0
  5. onnxruntime_extensions/_hf_cvt.py +331 -0
  6. onnxruntime_extensions/_ocos.py +133 -0
  7. onnxruntime_extensions/_ortapi2.py +274 -0
  8. onnxruntime_extensions/_torch_cvt.py +231 -0
  9. onnxruntime_extensions/_version.py +2 -0
  10. onnxruntime_extensions/cmd.py +66 -0
  11. onnxruntime_extensions/cvt.py +306 -0
  12. onnxruntime_extensions/onnxprocess/__init__.py +12 -0
  13. onnxruntime_extensions/onnxprocess/_builder.py +53 -0
  14. onnxruntime_extensions/onnxprocess/_onnx_ops.py +1507 -0
  15. onnxruntime_extensions/onnxprocess/_session.py +355 -0
  16. onnxruntime_extensions/onnxprocess/_tensor.py +628 -0
  17. onnxruntime_extensions/onnxprocess/torch_wrapper.py +31 -0
  18. onnxruntime_extensions/pnp/__init__.py +13 -0
  19. onnxruntime_extensions/pnp/_base.py +124 -0
  20. onnxruntime_extensions/pnp/_imagenet.py +65 -0
  21. onnxruntime_extensions/pnp/_nlp.py +148 -0
  22. onnxruntime_extensions/pnp/_onnx_ops.py +1544 -0
  23. onnxruntime_extensions/pnp/_torchext.py +310 -0
  24. onnxruntime_extensions/pnp/_unifier.py +45 -0
  25. onnxruntime_extensions/pnp/_utils.py +302 -0
  26. onnxruntime_extensions/pp_api.py +83 -0
  27. onnxruntime_extensions/tools/__init__.py +0 -0
  28. onnxruntime_extensions/tools/add_HuggingFace_CLIPImageProcessor_to_model.py +171 -0
  29. onnxruntime_extensions/tools/add_pre_post_processing_to_model.py +535 -0
  30. onnxruntime_extensions/tools/pre_post_processing/__init__.py +4 -0
  31. onnxruntime_extensions/tools/pre_post_processing/pre_post_processor.py +395 -0
  32. onnxruntime_extensions/tools/pre_post_processing/step.py +227 -0
  33. onnxruntime_extensions/tools/pre_post_processing/steps/__init__.py +6 -0
  34. onnxruntime_extensions/tools/pre_post_processing/steps/general.py +366 -0
  35. onnxruntime_extensions/tools/pre_post_processing/steps/nlp.py +344 -0
  36. onnxruntime_extensions/tools/pre_post_processing/steps/vision.py +1157 -0
  37. onnxruntime_extensions/tools/pre_post_processing/utils.py +139 -0
  38. onnxruntime_extensions/util.py +186 -0
  39. onnxruntime_extensions-0.14.0.dist-info/LICENSE +21 -0
  40. onnxruntime_extensions-0.14.0.dist-info/METADATA +102 -0
  41. onnxruntime_extensions-0.14.0.dist-info/RECORD +43 -0
  42. onnxruntime_extensions-0.14.0.dist-info/WHEEL +6 -0
  43. onnxruntime_extensions-0.14.0.dist-info/top_level.txt +1 -0
onnxruntime_extensions/tools/pre_post_processing/steps/nlp.py
@@ -0,0 +1,344 @@
+ # Copyright (c) Microsoft Corporation. All rights reserved.
+ # Licensed under the MIT License.
+
+ import onnx
+ from collections import OrderedDict
+ from pathlib import Path
+
+ from typing import Optional, Union, Dict
+ from ..step import Step
+
+
+ class TokenizerParam(object):
+     def __init__(self, vocab_or_file: Union[Path, dict], **kwargs):
+         self.vocab_or_file = vocab_or_file
+         self.tweaked_bos_id = 1
+         self.strip_accents = 0
+         self.do_lower_case = 0
+         self.is_sentence_pair = 0
+         self.__assigned_with_kwargs(**kwargs)
+
+     def __assigned_with_kwargs(self, **kwargs):
+         for key in self.__dict__.keys():
+             if key in kwargs and kwargs.get(key) is not None:
+                 setattr(self, key, kwargs[key])
+
+
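For reference, `TokenizerParam` only honours keyword arguments whose names match attributes already set in `__init__`; everything else is silently dropped. A minimal sketch (the vocab path is hypothetical):

    # vocab_or_file may be a {token: id} dict or a path to a vocab file
    param = TokenizerParam(
        vocab_or_file="vocab.json",
        do_lower_case=1,   # 0/1 rather than False/True; the values feed C++ kernels
        strip_accents=1,
    )
    # __assigned_with_kwargs only copies keys that already exist on the instance,
    # so unknown kwargs are ignored without an error:
    param2 = TokenizerParam({"[PAD]": 0}, not_an_attr=123)  # no error; attribute not set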
+ class SentencePieceTokenizer(Step):
+     def __init__(
+         self,
+         tokenizer_param: TokenizerParam,
+         nbest_size=0,
+         alpha=1.0,
+         reverse=False,
+         add_bos=False,
+         add_eos=False,
+         name: Optional[str] = None,
+     ):
+         """
+         Brief:
+             The SentencePieceTokenizer operator is defined with six inputs, but only the text input
+             is required from the user; "nbest_size", "alpha", "add_bos", "add_eos" and "reverse"
+             are optional.
+         Args:
+             tokenizer_param: essential information for building the tokenizer.
+                 You can create a TokenizerParam object like:
+                     tokenizer_param = TokenizerParam(vocab_or_file=tokenizer.vocab_file,
+                                                      tweaked_bos_id=tokenizer.bos_token_id)
+
+             nbest_size: int, optional (default = 0)
+             alpha: float, optional (default = 1.0)
+             reverse: bool, optional (default = False)
+             add_bos: bool, optional (default = False)
+             add_eos: bool, optional (default = False)
+                 See the detailed explanation of these arguments at
+                 https://www.tensorflow.org/text/api_docs/python/text/SentencepieceTokenizer#args
+
+             name: Optional name of step. Defaults to 'SentencePieceTokenizer'
+         """
+         super().__init__(
+             ["input_text", "nbest_size", "alpha", "add_bos", "add_eos", "reverse"], ["input_ids", "attention_mask"], name
+         )
+         self._tokenizer_param = tokenizer_param
+         # Python bool values (True/False) are not supported in the C++ kernel, so 0/1 represent bools
+         self._optional_kwargs = dict(
+             nbest_size=nbest_size, alpha=alpha, add_bos=int(add_bos), add_eos=int(add_eos), reverse=int(reverse)
+         )
+
+     def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
+         # input text
+         input_type_str0, input_shape_str0 = self._get_input_type_and_shape_strs(graph, 0)
+         input_shape_0 = input_shape_str0.split(",")
+         # Ideally we would support batched input where each entry can have a different length.
+         # However, the SentencePieceTokenizer implementation is not batch-aware: inputs are
+         # flattened to 1D inside the sentence-piece kernel.
+         assert input_type_str0 == "string"
+
+         # Hack: some models tweak bos_id to 0, but the model file still carries 1 as the
+         # default value. This is a temporary workaround that will be removed in the future.
+         tweak_bos_id = False
+         if self._tokenizer_param.tweaked_bos_id != 1 and self._optional_kwargs["add_bos"]:
+             self._optional_kwargs["add_bos"] = 0
+             tweak_bos_id = True
+
+         batch_dim = input_shape_0[0] if len(input_shape_0) > 1 else "1"
+         prefix_ = f'step_{self.step_num}'
+         output_shape_str = f"{batch_dim}, {prefix_}__num_ids"
+
+         def build_input_declare():
+             input_base = f"{input_type_str0}[{input_shape_str0}] {self.input_names[0]}"
+             return input_base
+
+         def build_call_para():
+             para_base = ["input_with_batch"]
+             para_base.append("i64_nbest_size")
+             para_base.append("f32_alpha")
+             para_base.append("bool_add_bos")
+             para_base.append("bool_add_eos")
+             para_base.append("bool_reverse")
+             return ",".join(para_base)
+
+         def build_forward_declare():
+             # default values for nbest_size, alpha, add_bos, add_eos, reverse
+             declare_base = [
+                 f"i64_nbest_size = Constant <value = int64[1] {{{self._optional_kwargs['nbest_size']}}}> ()",
+                 f"f32_alpha = Constant <value = float[1] {{ {self._optional_kwargs['alpha']} }}> ()",
+                 f"bool_add_bos = Constant <value = bool[1] {{{self._optional_kwargs['add_bos']}}}> ()",
+                 f"bool_add_eos = Constant <value = bool[1] {{{self._optional_kwargs['add_eos']}}}> ()",
+                 f"bool_reverse = Constant <value = bool[1] {{{self._optional_kwargs['reverse']}}}> ()",
+             ]
+
+             return "\n".join(declare_base)
+
+         # TODO: the Camembert and XLMRoberta tokenizers have a different bos_token_id (0)
+         # from the default value (1). For now we hack around it.
+         def hack_bos_id():
+             if tweak_bos_id:
+                 return f'''
+                 k_start = Constant <value = int32[1] {{{self._tokenizer_param.tweaked_bos_id}}}> ()
+                 input_ids_concat02 = Concat <axis = 0> (k_start, token)
+                 input_ids_bdim = Unsqueeze(input_ids_concat02, i64_0)
+                 '''
+             else:
+                 return '''
+                 input_ids_bdim = Unsqueeze(token, i64_0)
+                 '''
+
+         def build_unsqueeze():
+             if len(input_shape_0) == 1:
+                 return f"""
+                 input_with_batch = Unsqueeze({self.input_names[0]}, i64_0)
+                 """
+             else:
+                 return f"""
+                 input_with_batch = Identity({self.input_names[0]})
+                 """
+
+         converter_graph = onnx.parser.parse_graph(
+             f"""\
+             SentencePieceTokenizer ({build_input_declare()})
+                 => (int64[{output_shape_str}] {self.output_names[0]}, int64[{output_shape_str}] {self.output_names[1]})
+             {{
+                 {build_forward_declare()}
+                 i64_neg1 = Constant <value = int64[1] {{-1}}> ()
+                 i64_0 = Constant <value = int64[1] {{0}}> ()
+                 {build_unsqueeze()}
+                 token, idx = com.microsoft.extensions.SentencepieceTokenizer ({build_call_para()})
+                 {hack_bos_id()}
+                 {self.output_names[0]} = Cast <to = 7> (input_ids_bdim)
+                 attention_mask_bool = Greater({self.output_names[0]}, i64_neg1)
+                 {self.output_names[1]} = Cast <to = 7> (attention_mask_bool)
+             }}
+             """
+         )
+
+         with open(self._tokenizer_param.vocab_or_file, "rb") as f:
+             content = f.read()
+
+         token_model_attr = onnx.helper.make_attribute("model", content)
+         node_idx = next(i for i, v in enumerate(converter_graph.node) if v.op_type == "SentencepieceTokenizer")
+         converter_graph.node[node_idx].attribute.append(token_model_attr)
+
+         return converter_graph
+
+
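With the default arguments, no bos tweak and a 1-D string input named input_text, the f-string above expands to roughly the following ONNX text-syntax graph (dimension names assume step_num == 1; whitespace condensed):

    SentencePieceTokenizer (string[1] input_text)
        => (int64[1, step_1__num_ids] input_ids, int64[1, step_1__num_ids] attention_mask)
    {
        i64_nbest_size = Constant <value = int64[1] {0}> ()
        f32_alpha = Constant <value = float[1] { 1.0 }> ()
        bool_add_bos = Constant <value = bool[1] {0}> ()
        bool_add_eos = Constant <value = bool[1] {0}> ()
        bool_reverse = Constant <value = bool[1] {0}> ()
        i64_neg1 = Constant <value = int64[1] {-1}> ()
        i64_0 = Constant <value = int64[1] {0}> ()
        input_with_batch = Unsqueeze(input_text, i64_0)
        token, idx = com.microsoft.extensions.SentencepieceTokenizer (input_with_batch, i64_nbest_size, f32_alpha, bool_add_bos, bool_add_eos, bool_reverse)
        input_ids_bdim = Unsqueeze(token, i64_0)
        input_ids = Cast <to = 7> (input_ids_bdim)
        attention_mask_bool = Greater(input_ids, i64_neg1)
        attention_mask = Cast <to = 7> (attention_mask_bool)
    }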
+ def _vocab_to_dict(vocab_or_file: Union[Dict[str, int], Path, str]):
+     if isinstance(vocab_or_file, (Path, str)):
+         # read from file
+         import json
+         with open(vocab_or_file, "r") as f:
+             vocab = json.load(f)
+     else:
+         vocab = vocab_or_file
+
+     ordered_vocab = OrderedDict(sorted(vocab.items(), key=lambda item: int(item[1])))
+
+     vocab = "\n".join(ordered_vocab.keys())
+     return dict(vocab_file=vocab)
+
+
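Since `_vocab_to_dict` sorts tokens by id and joins them with newlines, the resulting `vocab_file` attribute is exactly the newline-separated vocabulary the tokenizer kernels expect:

    attrs = _vocab_to_dict({"world": 2, "[CLS]": 0, "hello": 1})
    print(attrs["vocab_file"])
    # [CLS]
    # hello
    # world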
+ class BertTokenizer(Step):
+     def __init__(self, tokenizer_param: TokenizerParam, need_token_type_ids_output: bool = False, name: Optional[str] = None):
+         """
+         Brief: This step converts the input text into input_ids, attention_mask and token_type_ids.
+             It supports an input of a single string for classification models, or two strings for QA models.
+         Args:
+             tokenizer_param: essential information for building the tokenizer.
+                 You can create a TokenizerParam like this:
+                     tokenizer_param = TokenizerParam(vocab_or_file=tokenizer.vocab,  # a dict or a file path
+                                                      strip_accents=1,  # optional
+                                                      do_lower_case=1)  # optional
+
+             need_token_type_ids_output: the last output, `token_type_ids`, is not required by some
+                 BERT-based models (e.g. DistilBERT). Set this to True to include it in the graph
+                 for this step.
+
+             name: Optional name of step. Defaults to 'BertTokenizer'
+         """
+         outputs = ["input_ids", "attention_mask"]
+         if need_token_type_ids_output:
+             outputs.append("token_type_ids")
+         super().__init__(["input_text"], outputs, name)
+         self._tokenizer_param = tokenizer_param
+
+     def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
+         input_type_str0, input_shape_str0 = self._get_input_type_and_shape_strs(graph, 0)
+
+         input_shape_0 = input_shape_str0.split(",")
+         prefix_ = f'step_{self.step_num}'
+         # only batch size 1 is supported until the tokenizer op supports batch size > 1
+         batch_dim = input_shape_0[0] if len(input_shape_0) > 1 else "1"
+         output_shape_str = f"{batch_dim}, _{prefix_}__num_ids"
+         assert input_type_str0 == "string"
+
+         onnx_tokenizer_impl = "HfBertTokenizer" if self._tokenizer_param.is_sentence_pair else "BertTokenizer"
+
+         def build_output_declare():
+             output_base = []
+             for out in self.output_names:
+                 output_base.append(f"int64[{output_shape_str}] {out}")
+
+             return ",".join(output_base)
+
+         def get_tokenizer_ret():
+             if onnx_tokenizer_impl == "HfBertTokenizer":
+                 return ",".join(self.output_names)
+             # BertTokenizer and HfBertTokenizer have different output orders
+             return "ids,types,mask"
+
+         def build_output_imp():
+             if onnx_tokenizer_impl == "HfBertTokenizer":
+                 return ""
+
+             # the BertTokenizer outputs have different dimensions, so add the batch dim
+             # with Unsqueeze, and reorder (ids, types, mask) to match self.output_names
+             ret_vars = get_tokenizer_ret().split(",")
+             ret_vars[1], ret_vars[2] = ret_vars[2], ret_vars[1]
+             output_str = []
+
+             for idx, out in enumerate(self.output_names):
+                 output_str.append(f"{out} = Unsqueeze({ret_vars[idx]}, i64_0)")
+
+             return "\n".join(output_str)
+
+         def build_input_declare():
+             inputs = f"{input_type_str0}[{input_shape_str0}] {self.input_names[0]}"
+             return inputs
+
+         def build_unsqueeze():
+             if len(input_shape_0) == 1:
+                 return f"""
+                 input_with_batch = Unsqueeze({self.input_names[0]}, i64_0)
+                 """
+             else:
+                 return f"""
+                 input_with_batch = Identity({self.input_names[0]})
+                 """
+
+         converter_graph = onnx.parser.parse_graph(
+             f"""\
+             {onnx_tokenizer_impl} ({build_input_declare()})
+                 => ({build_output_declare()})
+             {{
+                 i64_0 = Constant <value = int64[1] {{0}}> ()
+                 {build_unsqueeze()}
+                 {get_tokenizer_ret()} = com.microsoft.extensions.{onnx_tokenizer_impl} (input_with_batch)
+                 {build_output_imp()}
+             }}
+             """
+         )
+
+         bert_tokenizer_param = self._tokenizer_param
+         token_model_attr = []
+
+         attrs = _vocab_to_dict(bert_tokenizer_param.vocab_or_file)
+         attrs["strip_accents"] = bert_tokenizer_param.strip_accents
+         attrs["do_lower_case"] = bert_tokenizer_param.do_lower_case
+
+         for attr in attrs:
+             token_model_attr.append(onnx.helper.make_attribute(attr, attrs[attr]))
+
+         node_idx = next(i for i, v in enumerate(converter_graph.node) if v.op_type == onnx_tokenizer_impl)
+         converter_graph.node[node_idx].attribute.extend(token_model_attr)
+
+         return converter_graph
+
+
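A minimal usage sketch, assuming the PrePostProcessor and create_named_value helpers from the enclosing pre_post_processing package; the model and vocab file names are hypothetical:

    import onnx
    from onnxruntime_extensions.tools.pre_post_processing import PrePostProcessor, create_named_value
    from onnxruntime_extensions.tools.pre_post_processing.steps import BertTokenizer, TokenizerParam

    model = onnx.load("classifier.onnx")
    tokenizer_param = TokenizerParam(vocab_or_file="vocab.json",  # {token: id} mapping
                                     do_lower_case=1)

    # single 1-D string input; the step adds the batch dimension itself
    inputs = [create_named_value("input_text", onnx.TensorProto.STRING, [1])]
    pipeline = PrePostProcessor(inputs, onnx_opset=16)
    pipeline.add_pre_processing([BertTokenizer(tokenizer_param)])
    new_model = pipeline.run(model)
    onnx.save_model(new_model, "classifier.with_tokenizer.onnx")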
+ class BertTokenizerQADecoder(Step):
+     def __init__(self, tokenizer_param: TokenizerParam, name: Optional[str] = None):
+         """
+         Brief:
+             Decode the input_ids back into text.
+         Args:
+             tokenizer_param: essential information for building the tokenizer.
+                 You can create a TokenizerParam object like:
+                     tokenizer_param = TokenizerParam(vocab_or_file=tokenizer.vocab)  # a dict or a file path
+             name: Optional name of step. Defaults to 'BertTokenizerQADecoder'
+         """
+         super().__init__(
+             ["start_logits", "end_logits", "input_ids"], ["text"], name)
+         self._tokenizer_param = tokenizer_param
+
+     def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
+         def build_input_declare():
+             inputs = []
+             for idx, inp in enumerate(self.input_names):
+                 input_type_str_x, input_shape_str_x = self._get_input_type_and_shape_strs(graph, idx)
+                 inputs.append(f"{input_type_str_x}[{input_shape_str_x}] {inp}")
+             return ",".join(inputs)
+
+         # a unique name for the output shape
+         prefix_ = f'step_{self.step_num}'
+         output_shape_str = f"_{prefix_}_any_len"
+         converter_graph = onnx.parser.parse_graph(
+             f"""\
+             tokenizer_decoder ({build_input_declare()})
+                 => (string[{output_shape_str}] {self.output_names[0]})
+             {{
+                 i64_em = Constant <value = int64[0] {{}}> ()
+                 i64_1 = Constant <value = int64[1] {{1}}> ()
+                 i64_0 = Constant <value = int64[1] {{0}}> ()
+                 i64_neg1 = Constant <value = int64[1] {{-1}}> ()
+
+                 s_position = ArgMax<axis = -1, keepdims = 0>({self.input_names[0]})
+                 e_position = ArgMax<axis = -1, keepdims = 0>({self.input_names[1]})
+                 ee_position = Add(e_position, i64_1)
+                 u_i64_neg1 = Unsqueeze(i64_neg1, i64_0)
+                 slice_ids = Slice({self.input_names[2]}, s_position, ee_position, i64_neg1)
+                 {self.output_names[0]} = com.microsoft.extensions.BertTokenizerDecoder (slice_ids, i64_em)
+             }}
+             """
+         )
+
+         attrs = _vocab_to_dict(self._tokenizer_param.vocab_or_file)
+         token_model_attr = []
+         for attr in attrs:
+             token_model_attr.append(onnx.helper.make_attribute(attr, attrs[attr]))
+
+         node_idx = next(i for i, v in enumerate(converter_graph.node) if v.op_type == "BertTokenizerDecoder")
+         converter_graph.node[node_idx].attribute.extend(token_model_attr)
+
+         return converter_graph
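The ArgMax/Slice logic above mirrors this NumPy sketch (batch of one; the actual id-to-text decoding happens inside the BertTokenizerDecoder kernel):

    import numpy as np

    def qa_span(start_logits, end_logits, input_ids):
        s = int(np.argmax(start_logits, axis=-1))    # s_position
        e = int(np.argmax(end_logits, axis=-1)) + 1  # ee_position; the end is exclusive, hence Add(e_position, i64_1)
        return input_ids[..., s:e]                   # slice_ids fed to BertTokenizerDecoder

    ids = qa_span(np.array([0.1, 2.0, 0.3]),
                  np.array([0.0, 0.2, 1.5]),
                  np.array([101, 7592, 102]))
    print(ids)  # [7592  102]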