onnxruntime_extensions 0.12.0__cp39-cp39-win_amd64.whl → 0.14.0__cp39-cp39-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/onnxruntime_extensions/_cuops.py
+++ b/onnxruntime_extensions/_cuops.py
@@ -364,8 +364,15 @@ class SentencepieceDecoder(CustomOp):
     @classmethod
     def get_inputs(cls):
         return [
-            cls.io_def("ids", onnx.TensorProto.INT64, [None])
+            cls.io_def("ids", onnx.TensorProto.INT64, [None]),
+            cls.io_def('fairseq', onnx_proto.TensorProto.BOOL, [None])
         ]
+
+    @classmethod
+    def input_default_values(cls):
+        return {
+            'fairseq': [False]
+        }
 
     @classmethod
     def get_outputs(cls):
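The decoder gains an optional `fairseq` input, and the new `input_default_values` hook declares its `[False]` default so the graph builder can fold it into an initializer (see the `post_processing` hunk below). A minimal sketch of that folding pattern, written here as an illustration rather than a quote of the library code:

```python
import numpy as np
import onnx
from onnx import helper, numpy_helper

def fold_default_input(graph: onnx.GraphProto, name: str, value) -> None:
    """Bake a default-valued graph input into an initializer (illustrative)."""
    info = next((i for i in graph.input if i.name == name), None)
    if info is None:
        raise ValueError(f"input '{name}' not found in graph")
    np_dtype = helper.tensor_dtype_to_np_dtype(info.type.tensor_type.elem_type)
    graph.initializer.append(numpy_helper.from_array(np.asarray(value, np_dtype), name))
    # the input is now redundant; remove it so callers need not feed it
    remaining = [i for i in graph.input if i.name != name]
    del graph.input[:]
    graph.input.extend(remaining)
```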
@@ -491,6 +498,16 @@ class StftNorm(CustomOp):
         ]
 
 
+class HfJsonTokenizer(CustomOp):
+    @classmethod
+    def get_inputs(cls):
+        return [cls.io_def('str', onnx_proto.TensorProto.STRING, ['N'])]
+
+    @classmethod
+    def get_outputs(cls):
+        return [cls.io_def("ids", onnx.TensorProto.INT64, ['N', None])]
+
+
 # TODO: have a C++ impl.
 def _argsort_op(x, dim):
     d = numpy.argsort(x, dim)
@@ -544,4 +561,4 @@ class SingleOpGraph:
 
     @staticmethod
    def get_op_class(op_type):
-        return globals()[op_type]
+        return globals()[op_type]
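Because `SingleOpGraph.get_op_class` resolves op names through `globals()`, the new `HfJsonTokenizer` can be built by name like any other custom op. A hedged sketch of that call pattern (the import path and local file names are assumptions; the `cvt.py` hunks below do the real wiring):

```python
# Illustrative only: build the one-op tokenizer graph by name, passing the
# HF tokenizer JSON files as string attributes (mirrors the schema_v2 path).
from onnxruntime_extensions import SingleOpGraph, make_onnx_model  # import path assumed

with open("tokenizer.json", encoding="utf-8") as f:          # hypothetical local files
    vocab_str = f.read()
with open("tokenizer_config.json", encoding="utf-8") as f:
    config_str = f.read()

g = SingleOpGraph.build_graph("HfJsonTokenizer",
                              tokenizer_vocab=vocab_str,
                              tokenizer_config=config_str)
model = make_onnx_model(g)
```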
--- a/onnxruntime_extensions/_hf_cvt.py
+++ b/onnxruntime_extensions/_hf_cvt.py
@@ -48,8 +48,9 @@ class HFTokenizerConverter(CustomOpConverter):
             model_dir = hf_tokenizer.name_or_path
         else:
             model_dir = os.path.dirname(vocab_file)
-        tokenizer_json = json.load(
-            open(os.path.join(model_dir, tokenizer_file), "r", encoding="utf-8"))
+        f = open(os.path.join(model_dir, tokenizer_file), "r", encoding="utf-8")
+        tokenizer_json = json.load(f)
+        f.close()
         # get vocab object from json file
         vocab = tokenizer_json.get("model", {}).get("vocab", {})
         sorted_merges = tokenizer_json.get("model", {}).get("merges", [])
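The rewrite closes the file handle that the old one-liner leaked, which matters on Windows where open handles block file deletion. A `with` block would be the more idiomatic equivalent; this is a suggested sketch, not the shipped code:

```python
import json
import os

def _load_tokenizer_json(model_dir: str, tokenizer_file: str) -> dict:
    # hypothetical helper: same result, but the handle is released even if
    # json.load raises
    with open(os.path.join(model_dir, tokenizer_file), "r", encoding="utf-8") as f:
        return json.load(f)
```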
@@ -167,7 +168,8 @@ class HFTokenizerConverter(CustomOpConverter):
 TokenOpParam = namedtuple("TokenOpParam",
                           ["pre_op", "pre_attribute_cvt",
                            "post_op", "post_attribute_cvt",
-                           "default_inputs"],
+                           "default_encoder_inputs",
+                           "default_decoder_inputs"],
                           defaults=(None, None, None, None, None))
 
 # Some tokenizers can be added by this table
@@ -175,35 +177,36 @@ TokenOpParam = namedtuple("TokenOpParam",
 # @formatter:off
 _PROCESSOR_DICT = {
     "BertTokenizer": TokenOpParam('BertTokenizer', HFTokenizerConverter.bert_tokenizer,
-                                  'BertDecoder', HFTokenizerConverter.bpe_decoder, None),
+                                  'BertDecoder', HFTokenizerConverter.bpe_decoder, None, None),
     "DistilBertTokenizer": TokenOpParam('BertTokenizer', HFTokenizerConverter.bert_tokenizer,
-                                        'BertDecoder', HFTokenizerConverter.bpe_decoder, None),
+                                        'BertDecoder', HFTokenizerConverter.bpe_decoder, None, None),
     "GPT2Tokenizer": TokenOpParam('GPT2Tokenizer', HFTokenizerConverter.bpe_tokenizer,
-                                  'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
+                                  'BpeDecoder', HFTokenizerConverter.bpe_decoder, None, None),
     "CodeGenTokenizer": TokenOpParam('GPT2Tokenizer', HFTokenizerConverter.bpe_tokenizer,
-                                     'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
+                                     'BpeDecoder', HFTokenizerConverter.bpe_decoder, None, None),
     "CLIPTokenizer": TokenOpParam('CLIPTokenizer', HFTokenizerConverter.clip_tokenizer,
-                                  'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
+                                  'BpeDecoder', HFTokenizerConverter.bpe_decoder, None, None),
     "RobertaTokenizer": TokenOpParam('RobertaTokenizer', HFTokenizerConverter.roberta_tokenizer,
-                                     'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
+                                     'BpeDecoder', HFTokenizerConverter.bpe_decoder, None, None),
     "BartTokenizer": TokenOpParam('RobertaTokenizer', HFTokenizerConverter.roberta_tokenizer,
-                                  'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
+                                  'BpeDecoder', HFTokenizerConverter.bpe_decoder, None, None),
     "LayoutLMv3Tokenizer": TokenOpParam('RobertaTokenizer', HFTokenizerConverter.roberta_tokenizer,
-                                        'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
+                                        'BpeDecoder', HFTokenizerConverter.bpe_decoder, None, None),
     "LongformerTokenizer": TokenOpParam('RobertaTokenizer', HFTokenizerConverter.roberta_tokenizer,
-                                        'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
+                                        'BpeDecoder', HFTokenizerConverter.bpe_decoder, None, None),
     "LEDTokenizer": TokenOpParam('RobertaTokenizer', HFTokenizerConverter.roberta_tokenizer,
-                                 'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
+                                 'BpeDecoder', HFTokenizerConverter.bpe_decoder, None, None),
     "MvpTokenizer": TokenOpParam('RobertaTokenizer', HFTokenizerConverter.roberta_tokenizer,
-                                 'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
+                                 'BpeDecoder', HFTokenizerConverter.bpe_decoder, None, None),
     "T5Tokenizer": TokenOpParam('SentencepieceTokenizer', HFTokenizerConverter.spm_tokenizer,
                                 'SentencepieceDecoder', HFTokenizerConverter.spm_decoder,
-                                default_inputs={'add_eos': [True]}),
+                                default_encoder_inputs={'add_eos': [True]}, default_decoder_inputs=None),
     "LlamaTokenizer": TokenOpParam('SpmTokenizer', HFTokenizerConverter.bpe_tokenizer,
-                                   'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
+                                   'BpeDecoder', HFTokenizerConverter.bpe_decoder, None, None),
     "XLMRobertaTokenizer": TokenOpParam('SentencepieceTokenizer', HFTokenizerConverter.spm_tokenizer,
                                         'SentencepieceDecoder', HFTokenizerConverter.spm_decoder,
-                                        default_inputs={'add_bos': [True], 'add_eos': [True], 'fairseq': [True]}),
+                                        default_encoder_inputs={'add_bos': [True], 'add_eos': [True], 'fairseq': [True]},
+                                        default_decoder_inputs={'fairseq': [True]}),
 }
 # @formatter:on
 
@@ -245,8 +248,8 @@ class HFTokenizerOnnxGraph:
 
         # add default_inputs into initializers to simplify the model input
         n_inputs = len(default_inputs)
-        if self.cvt_quadruple.default_inputs is not None:
-            default_inputs.update(self.cvt_quadruple.default_inputs)
+        if self.cvt_quadruple.default_encoder_inputs is not None:
+            default_inputs.update(self.cvt_quadruple.default_encoder_inputs)
         if len(default_inputs) != n_inputs:
             raise ValueError(
                 "Op: {} does not have the inputs from its TokenOpParam.".format(_cvt_op))
@@ -286,7 +289,43 @@ class HFTokenizerOnnxGraph:
         return g
 
     def post_processing(self, **kwargs):
+        with_default_inputs = kwargs.pop("WITH_DEFAULT_INPUTS", True)
+
         _cvt_op = self.cvt_quadruple.post_op
         _cvt_func = self.cvt_quadruple.post_attribute_cvt
         cvt = partial(_cvt_func, self.cvt_obj)
-        return SingleOpGraph.build_graph(_cvt_op, cvt=cvt, **kwargs)
+        g = SingleOpGraph.build_graph(_cvt_op, cvt=cvt, **kwargs)
+
+        default_inputs = {}
+        if with_default_inputs:
+            op_class = SingleOpGraph.get_op_class(_cvt_op)
+            default_inputs = op_class.input_default_values()
+            if default_inputs is None:
+                encoder_inputs = self.cvt_quadruple.default_encoder_inputs
+                if encoder_inputs is not None and encoder_inputs["fairseq"]:
+                    default_inputs = {}  # need to set to empty dict to call .update later
+                else:
+                    return g
+
+        # add default_inputs into initializers to simplify the model input
+        if self.cvt_quadruple.default_decoder_inputs is not None:
+            default_inputs.update(self.cvt_quadruple.default_decoder_inputs)
+
+        new_initializers = []
+
+        for k, v in default_inputs.items():
+            input_value_info = next((i for i in g.input if i.name == k), None)
+            if input_value_info is None:
+                raise ValueError(
+                    "The input {} is not found in the graph".format(k))
+
+            np_dtype = onnx.helper.tensor_dtype_to_np_dtype(
+                input_value_info.type.tensor_type.elem_type)
+            value = nparray(v, np_dtype)
+            new_initializers.append(onnx.numpy_helper.from_array(value, k))
+        g.initializer.extend(new_initializers)
+        new_inputs = [i for i in g.input if i.name not in default_inputs]
+        g.ClearField("input")
+        g.input.extend(new_inputs)
+
+        return g
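With this change, `post_processing` folds the decoder's default inputs (for example XLM-R's `fairseq=[True]`) into graph initializers unless the caller opts out via the `WITH_DEFAULT_INPUTS` kwarg. A hedged usage sketch; the model id and the exact import path are assumptions:

```python
from transformers import AutoTokenizer
from onnxruntime_extensions.cvt import HFTokenizerOnnxGraph, make_onnx_model

hf_tok = AutoTokenizer.from_pretrained("xlm-roberta-base")  # assumed model id
cvt = HFTokenizerOnnxGraph(hf_tok)

post_g = cvt.post_processing()                                # defaults baked in
post_g_open = cvt.post_processing(WITH_DEFAULT_INPUTS=False)  # 'fairseq' stays a graph input
decoder_model = make_onnx_model(post_g)
```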
--- a/onnxruntime_extensions/_version.py
+++ b/onnxruntime_extensions/_version.py
@@ -1,2 +1,2 @@
 # Generated by setup.py, DON'T MANUALLY UPDATE IT!
-__version__ = "0.12.0"
+__version__ = "0.14.0"
--- a/onnxruntime_extensions/cvt.py
+++ b/onnxruntime_extensions/cvt.py
@@ -12,6 +12,24 @@ from typing import Union
 from ._hf_cvt import HFTokenizerConverter, HFTokenizerOnnxGraph  # noqa
 from ._ortapi2 import make_onnx_model, SingleOpGraph
 
+import os
+import numpy as np
+import tempfile
+import shutil
+
+# edit environment variables to avoid protobuf version mismatch
+os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+
+from transformers.convert_slow_tokenizer import SpmConverter  # noqa: E402
+from transformers import AutoTokenizer  # noqa: E402
+from tokenizers import decoders, normalizers, pre_tokenizers, Regex  # noqa: E402
+
+
+OrtxTokenizer = None
+try:
+    from onnxruntime_extensions.pp_api import Tokenizer as OrtxTokenizer
+except ImportError:
+    pass
 
 _is_torch_available = False
 try:
@@ -24,11 +42,150 @@ except ImportError:
 
 _PRE_POST_PAIR = {'TrieTokenizer': "TrieDetokenizer"}
 
+
+def _get_prepend_scheme(add_prefix_space: bool, original_tokenizer) -> str:
+    if add_prefix_space:
+        prepend_scheme = "always"
+        if not getattr(original_tokenizer, "legacy", True):
+            prepend_scheme = "first"
+    else:
+        prepend_scheme = "never"
+    return prepend_scheme
+
+
+class Baichuan2Converter(SpmConverter):
+    handle_byte_fallback = True
+
+    def __init__(self, original_tokenizer):
+        super().__init__(original_tokenizer)
+        original_tokenizer.add_prefix_space = False
+
+    def vocab(self, proto):
+        vocab = [
+            (self.original_tokenizer.convert_ids_to_tokens(0), 0.0),
+            (self.original_tokenizer.convert_ids_to_tokens(1), 0.0),
+            (self.original_tokenizer.convert_ids_to_tokens(2), 0.0),
+        ]
+        vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
+        return vocab
+
+    def unk_id(self, proto):
+        unk_id = 0
+        return unk_id
+
+    def decoder(self, replacement, add_prefix_space):
+        sequence = [
+            decoders.Replace("▁", " "),
+            decoders.ByteFallback(),
+            decoders.Fuse(),
+        ]
+        if add_prefix_space:
+            sequence += [decoders.Strip(content=" ", left=1)]
+        return decoders.Sequence(sequence)
+
+    def normalizer(self, proto):
+        if getattr(self.original_tokenizer, "legacy", True):
+            sequence = []
+            if getattr(self.original_tokenizer, "add_prefix_space", True):
+                sequence += [normalizers.Prepend(prepend="▁")]
+            sequence += [normalizers.Replace(pattern=" ", content="▁")]
+            return normalizers.Sequence(sequence)
+        return None  # non-legacy, no normalizer
+
+    def pre_tokenizer(self, replacement, add_prefix_space):
+        if not getattr(self.original_tokenizer, "legacy", True):  # non-legacy, we need a replace
+            prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer)
+            return pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme, split=False)
+        else:
+            return super().pre_tokenizer(replacement, add_prefix_space)
+
+
+class ChatGlmConverter(SpmConverter):
+    def normalizer(self, proto):
+        precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
+        _normalizers = [
+            normalizers.Strip(left=False, right=True),  # stripping is important
+            normalizers.Replace(Regex(" {2,}"), "▁"),
+        ]
+        return normalizers.Sequence([normalizers.Precompiled(precompiled_charsmap)] + _normalizers)
+
+    def pre_tokenizer(self, replacement, add_prefix_space):
+        prepend_scheme = "always"
+        if hasattr(self.original_tokenizer, "legacy") and not self.original_tokenizer.legacy:
+            prepend_scheme = "first"
+        return pre_tokenizers.Metaspace(
+            replacement=replacement, add_prefix_space=add_prefix_space, prepend_scheme=prepend_scheme
+        )
+
+
+JSON_TOKEN_CONVERTERS = {
+    "BaichuanTokenizer": Baichuan2Converter,
+    "ChatGLMTokenizer": ChatGlmConverter,
+}
+
+
+# Save tokenizer JSON files using HuggingFace AutoTokenizer
+def convert_tokenizer(model_path, output_dir):
+    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    if output_dir is None:
+        if os.path.isdir(model_path):
+            output_dir = model_path
+        else:
+            # create a temporary directory
+            output_dir = tempfile.mkdtemp()
+            tokenizer.save_pretrained(output_dir)
+    json_path = os.path.join(output_dir, "tokenizer.json")
+
+    if type(tokenizer).__name__ in JSON_TOKEN_CONVERTERS:
+        GenericSpmConverter = JSON_TOKEN_CONVERTERS[type(tokenizer).__name__]
+
+    converted = GenericSpmConverter(tokenizer).converted()
+    converted.save(json_path)
+    print(f"**Tokenizer saved to {json_path}")
+    return output_dir
+
+
+# Validate tokenizer files downloaded from memory
+def validate_tokenizer(model_path, output_dir):
+    test_sentence = "I like walking my cute dog\n and\x17 then, 生活的真谛是 \t\t\t\t \n\n61"
+    if OrtxTokenizer is None:
+        print("onnxruntime_extensions package was built with C API enabled, skipping tokenization test")
+    ortx_tokenizer = OrtxTokenizer(output_dir)
+    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
+    expected_ids = tokenizer(test_sentence, return_tensors="np")["input_ids"]
+    ortx_ids = np.asarray(ortx_tokenizer.tokenize(test_sentence))
+    assert np.array_equal(expected_ids[0], ortx_ids), f"Tokenization mismatch: {expected_ids[0]} != {ortx_ids}"
+    print("Tokenization test passed")
+
+
+# Download tokenizer JSON files from memory
+def download_tokenizer(tokenizer_dir, output_dir):
+    try:
+        from transformers.utils import cached_file
+
+        resolved_full_file = cached_file(tokenizer_dir, "tokenizer.json")
+        resolved_config_file = cached_file(tokenizer_dir, "tokenizer_config.json")
+    except ImportError:
+        raise ValueError(f"Directory '{tokenizer_dir}' not found and transformers is not available")
+    if not os.path.exists(resolved_full_file):
+        raise FileNotFoundError(f"Downloaded HF file '{resolved_full_file}' cannot be found")
+    if os.path.dirname(resolved_full_file) != os.path.dirname(resolved_config_file):
+        raise FileNotFoundError(
+            f"Downloaded HF files '{resolved_full_file}' " f"and '{resolved_config_file}' are not in the same directory"
+        )
+
+    if output_dir is None or len(output_dir) == 0:
+        output_dir = os.path.dirname(resolved_full_file)
+        print(f"Using {output_dir} as output directory")
+        return output_dir
+    else:
+        # copy the files to the output directory
+        shutil.copy(resolved_full_file, output_dir)
+        shutil.copy(resolved_config_file, output_dir)
+        return output_dir
+
 
 def gen_processing_models(processor: Union[str, object],
                           pre_kwargs: dict = None,
                           post_kwargs: dict = None,
                           opset: int = None,
+                          schema_v2: bool = False,
                           **kwargs):
     """
     Generate the pre- and post-processing ONNX model, basing on the name or HF class.
@@ -47,6 +204,9 @@ def gen_processing_models(processor: Union[str, object],
         Keyword arguments for generating the post-processing model
     opset: int
         the target opset version of the model
+    schema_v2: bool
+        the flag for using embedded tokenizer files; this option leverages the blob-loading functionality
+        which loads HF tokenizers from memory rather than using the tokenizer files in HF JSON format.
     kwargs:
         The additional arguments for generating models
 
@@ -58,11 +218,42 @@ def gen_processing_models(processor: Union[str, object],
     if pre_kwargs is None and post_kwargs is None:
         raise ValueError(
             "Either pre_kwargs or post_kwargs should be provided. None means no processing graph output.")
-    if isinstance(processor, str):
+
+    # If true, we get the tokenizer JSON files by either downloading from cache or using HuggingFace AutoTokenizer
+    # to convert them, and then create an ONNX model with the JSON files as strings in the model attributes (attrs).
+    if schema_v2:
+        model_name = processor if isinstance(processor, str) else type(processor).__name__
+
+        converted_tokenizer = {"Baichuan2", "chatglm"}
+        need_convert = False
+        for token in converted_tokenizer:
+            if model_name.find(token) != -1:
+                need_convert = True
+                break
+
+        if need_convert:
+            model_dir = convert_tokenizer(model_name)
+            validate_tokenizer(model_name, None)
+        else:
+            model_dir = download_tokenizer(model_name, None)
+
+        # Load the content of tokenizer.json into a string
+        with open(f"{model_dir}/tokenizer.json", "r", encoding="utf-8") as f:
+            tokenizer_vocab = f.read()
+
+        # Load the content of tokenizer_config.json into a string
+        with open(f"{model_dir}/tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_config = f.read()
+
+        # Create an ONNX model with these JSON file strings in attrs
         g_pre, g_post = (None, None)
-        if pre_kwargs:
-            g_pre = SingleOpGraph.build_graph(processor, **pre_kwargs)
-        if post_kwargs:
+        if pre_kwargs is not None:
+            # Add tokenizer_vocab and tokenizer_config to the kwargs
+            # so they are added to attrs in build_graph
+            pre_kwargs['tokenizer_vocab'] = tokenizer_vocab
+            pre_kwargs['tokenizer_config'] = tokenizer_config
+            g_pre = SingleOpGraph.build_graph("HfJsonTokenizer", **pre_kwargs)
+        if post_kwargs is not None:
             if pre_kwargs is None:
                 cls_name = processor
             else:
@@ -70,27 +261,46 @@ def gen_processing_models(processor: Union[str, object],
                     raise RuntimeError(
                         f"Cannot locate the post processing operator name from {processor}")
                 cls_name = _PRE_POST_PAIR[processor]
+            # Add tokenizer_vocab and tokenizer_config to the kwargs
+            # so they are added to attrs in build_graph
+            post_kwargs['tokenizer_vocab'] = tokenizer_vocab
+            post_kwargs['tokenizer_config'] = tokenizer_config
             g_post = SingleOpGraph.build_graph(cls_name, **post_kwargs)
         return make_onnx_model(g_pre) if g_pre else None, make_onnx_model(g_post) if g_post else None
-
-    cls_name = type(processor).__name__
-    if cls_name == "WhisperProcessor":
-        if WhisperDataProcGraph is None:
-            raise ValueError(
-                "The Whisper processor needs torch.onnx support, please install pytorch 2.0 and above")
-        _converter = WhisperDataProcGraph(processor, opset=opset, **kwargs)
-        pre_m = _converter.pre_processing(
-            **pre_kwargs) if pre_kwargs is not None else None
-        post_m = _converter.post_processing(
-            **post_kwargs) if post_kwargs is not None else None
-        return pre_m, post_m
-    elif HFTokenizerOnnxGraph.is_supported(processor):
-        _converter = HFTokenizerOnnxGraph(processor)
-        pre_g = _converter.pre_processing(
-            **pre_kwargs) if pre_kwargs is not None else None
-        post_g = _converter.post_processing(
-            **post_kwargs) if post_kwargs is not None else None
-        return make_onnx_model(pre_g) if pre_g else None, \
-            make_onnx_model(post_g) if post_g else None
     else:
-        raise ValueError(f"Unsupported processor/tokenizer: {cls_name}")
+        if isinstance(processor, str):
+            g_pre, g_post = (None, None)
+            if pre_kwargs:
+                g_pre = SingleOpGraph.build_graph(processor, **pre_kwargs)
+            if post_kwargs:
+                if pre_kwargs is None:
+                    cls_name = processor
+                else:
+                    if processor not in _PRE_POST_PAIR:
+                        raise RuntimeError(
+                            f"Cannot locate the post processing operator name from {processor}")
+                    cls_name = _PRE_POST_PAIR[processor]
+                g_post = SingleOpGraph.build_graph(cls_name, **post_kwargs)
+            return make_onnx_model(g_pre) if g_pre else None, make_onnx_model(g_post) if g_post else None
+
+        cls_name = type(processor).__name__
+        if cls_name == "WhisperProcessor":
+            if WhisperDataProcGraph is None:
+                raise ValueError(
+                    "The Whisper processor needs torch.onnx support, please install pytorch 2.0 and above")
+            _converter = WhisperDataProcGraph(processor, opset=opset, **kwargs)
+            pre_m = _converter.pre_processing(
+                **pre_kwargs) if pre_kwargs is not None else None
+            post_m = _converter.post_processing(
+                **post_kwargs) if post_kwargs is not None else None
+            return pre_m, post_m
+        elif HFTokenizerOnnxGraph.is_supported(processor):
+            _converter = HFTokenizerOnnxGraph(processor)
+            pre_g = _converter.pre_processing(
+                **pre_kwargs) if pre_kwargs is not None else None
+            post_g = _converter.post_processing(
+                **post_kwargs) if post_kwargs is not None else None
+            return make_onnx_model(pre_g) if pre_g else None, \
+                make_onnx_model(post_g) if post_g else None
+        else:
+            raise ValueError(f"Unsupported processor/tokenizer: {cls_name}")
--- a/onnxruntime_extensions/pp_api.py
+++ b/onnxruntime_extensions/pp_api.py
@@ -3,11 +3,81 @@
 # license information.
 ###############################################################################
 
+import os
 from . import _extensions_pydll as _C
-if not hasattr(_C, "create_processor"):
-    raise ImportError("onnxruntime_extensions is not built with pre-processing API")
+if not hasattr(_C, "delete_object"):
+    raise ImportError(
+        "onnxruntime_extensions is not built with pre-processing C API\n"
+        "To enable it, please build the package with --ortx-user-option=pp_api")
 
 create_processor = _C.create_processor
 load_images = _C.load_images
 image_pre_process = _C.image_pre_process
 tensor_result_get_at = _C.tensor_result_get_at
+
+create_tokenizer = _C.create_tokenizer
+batch_tokenize = _C.batch_tokenize
+batch_detokenize = _C.batch_detokenize
+
+delete_object = _C.delete_object
+
+
+class Tokenizer:
+    def __init__(self, tokenizer_dir):
+        self.tokenizer = None
+        if os.path.isdir(tokenizer_dir):
+            self.tokenizer = create_tokenizer(tokenizer_dir)
+        else:
+            try:
+                from transformers.utils import cached_file
+                resolved_full_file = cached_file(
+                    tokenizer_dir, "tokenizer.json")
+                resolved_config_file = cached_file(
+                    tokenizer_dir, "tokenizer_config.json")
+            except ImportError:
+                raise ValueError(
+                    f"Directory '{tokenizer_dir}' not found and transformers is not available")
+            if not os.path.exists(resolved_full_file):
+                raise FileNotFoundError(
+                    f"Downloaded HF file '{resolved_full_file}' cannot be found")
+            if (os.path.dirname(resolved_full_file) != os.path.dirname(resolved_config_file)):
+                raise FileNotFoundError(
+                    f"Downloaded HF files '{resolved_full_file}' "
+                    f"and '{resolved_config_file}' are not in the same directory")
+
+            tokenizer_dir = os.path.dirname(resolved_full_file)
+            self.tokenizer = create_tokenizer(tokenizer_dir)
+
+    def tokenize(self, text):
+        if isinstance(text, (list, tuple)):
+            return batch_tokenize(self.tokenizer, text)
+        return batch_tokenize(self.tokenizer, [text])[0]
+
+    def detokenize(self, tokens):
+        return batch_detokenize(self.tokenizer, [tokens])
+
+    def __del__(self):
+        if delete_object and self.tokenizer:
+            delete_object(self.tokenizer)
+        self.tokenizer = None
+
+
+class ImageProcessor:
+    def __init__(self, processor_json):
+        self.processor = create_processor(processor_json)
+
+    def pre_process(self, images):
+        if isinstance(images, str):
+            images = [images]
+        if isinstance(images, list):
+            images = load_images(images)
+        return image_pre_process(self.processor, images)
+
+    @staticmethod
+    def to_numpy(result, idx):
+        return tensor_result_get_at(result, idx)
+
+    def __del__(self):
+        if delete_object and self.processor:
+            delete_object(self.processor)
+        self.processor = None
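A hedged usage sketch of the new `pp_api` surface; it requires a build with `--ortx-user-option=pp_api`, and the paths here are illustrative (a non-directory argument is resolved through the HF cache as shown above):

```python
from onnxruntime_extensions.pp_api import Tokenizer, ImageProcessor

tok = Tokenizer("path/to/tokenizer_dir")  # or an HF repo id if transformers is installed
ids = tok.tokenize("hello world")         # one string in, one id sequence out
text = tok.detokenize(ids)                # round-trip back to text

proc = ImageProcessor("processor_config.json")   # hypothetical processor config
result = proc.pre_process(["image1.png"])        # file paths are loaded for you
first_tensor = ImageProcessor.to_numpy(result, 0)
```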
--- a/onnxruntime_extensions/tools/add_pre_post_processing_to_model.py
+++ b/onnxruntime_extensions/tools/add_pre_post_processing_to_model.py
@@ -163,7 +163,8 @@ def superresolution(model_file: Path, output_file: Path, output_format: str, onn
 
 
 def yolo_detection(model_file: Path, output_file: Path, output_format: str = 'jpg',
-                   onnx_opset: int = 16, num_classes: int = 80, input_shape: List[int] = None):
+                   onnx_opset: int = 16, num_classes: int = 80, input_shape: List[int] = None,
+                   output_as_image: bool = True):
     """
     SSD-like model and Faster-RCNN-like model are including NMS inside already, You can find it from onnx model zoo.
 
@@ -185,6 +186,7 @@ def yolo_detection(model_file: Path, output_file: Path, output_format: str = 'jp
     :param onnx_opset: The opset version of onnx model, default(16).
     :param num_classes: The number of classes, default(80).
     :param input_shape: The shape of input image (height,width), default will be asked from model input.
+    :param output_as_image: The flag that means that the model should have the image with boxes instead of the coordinates of the boxess
     """
     model = onnx.load(str(model_file.resolve(strict=True)))
     inputs = [create_named_value("image", onnx.TensorProto.UINT8, ["num_bytes"])]
@@ -284,19 +286,23 @@ Because we need to execute the model to determine the output shape in order to a
             utils.IoMapEntry("Resize", producer_idx=0, consumer_idx=2),
             utils.IoMapEntry("LetterBox", producer_idx=0, consumer_idx=3),
         ]),
-        # DrawBoundingBoxes on the original image
-        # Model imported from pytorch has CENTER_XYWH format
-        # two mode for how to color box,
-        # 1. colour_by_classes=True, (colour_by_classes), 2. colour_by_classes=False,(colour_by_confidence)
-        (DrawBoundingBoxes(mode='CENTER_XYWH', num_classes=num_classes, colour_by_classes=True),
-         [
-            utils.IoMapEntry("ConvertImageToBGR", producer_idx=0, consumer_idx=0),
-            utils.IoMapEntry("ScaleBoundingBoxes", producer_idx=0, consumer_idx=1),
-        ]),
-        # Encode to jpg/png
-        ConvertBGRToImage(image_format=output_format),
     ]
 
+    if output_as_image:
+        post_processing_steps += [
+            # DrawBoundingBoxes on the original image
+            # Model imported from pytorch has CENTER_XYWH format
+            # two mode for how to color box,
+            # 1. colour_by_classes=True, (colour_by_classes), 2. colour_by_classes=False,(colour_by_confidence)
+            (DrawBoundingBoxes(mode='CENTER_XYWH', num_classes=num_classes, colour_by_classes=True),
+             [
+                utils.IoMapEntry("ConvertImageToBGR", producer_idx=0, consumer_idx=0),
+                utils.IoMapEntry("ScaleBoundingBoxes", producer_idx=0, consumer_idx=1),
+            ]),
+            # Encode to jpg/png
+            ConvertBGRToImage(image_format=output_format),
+        ]
+
     pipeline.add_post_processing(post_processing_steps)
 
     new_model = pipeline.run(model)
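A hedged sketch of the new option in use; the file names are illustrative. With `output_as_image=False` the drawing and encoding steps are skipped, so the model ends at the scaled box data from `ScaleBoundingBoxes` rather than an encoded jpg/png:

```python
from pathlib import Path
from onnxruntime_extensions.tools.add_pre_post_processing_to_model import yolo_detection

yolo_detection(Path("yolov8n.onnx"),          # hypothetical input model
               Path("yolov8n.with_pp.onnx"),  # output model path
               output_format="jpg",
               onnx_opset=16,
               output_as_image=False)         # keep coordinates, skip image encoding
```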
--- a/onnxruntime_extensions-0.12.0.dist-info/METADATA
+++ b/onnxruntime_extensions-0.14.0.dist-info/METADATA
@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.2
 Name: onnxruntime_extensions
-Version: 0.12.0
+Version: 0.14.0
 Summary: ONNXRuntime Extensions
 Home-page: https://github.com/microsoft/onnxruntime-extensions
 Author: Microsoft Corporation
@@ -18,6 +18,14 @@ Classifier: Programming Language :: Python :: Implementation :: CPython
 Classifier: License :: OSI Approved :: MIT License
 Description-Content-Type: text/markdown
 License-File: LICENSE
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: license
+Dynamic: summary
 
 # ONNXRuntime-Extensions
 
@@ -25,29 +33,17 @@ License-File: LICENSE
 
 ## What's ONNXRuntime-Extensions
 
-Introduction: ONNXRuntime-Extensions is a library that extends the capability of the ONNX models and inference with ONNX Runtime, via ONNX Runtime Custom Operator ABIs. It includes a set of [ONNX Runtime Custom Operator](https://onnxruntime.ai/docs/reference/operators/add-custom-op.html) to support the common pre- and post-processing operators for vision, text, and nlp models. And it supports multiple languages and platforms, like Python on Windows/Linux/macOS, some mobile platforms like Android and iOS, and Web-Assembly etc. The basic workflow is to enhance a ONNX model firstly and then do the model inference with ONNX Runtime and ONNXRuntime-Extensions package.
+Introduction: ONNXRuntime-Extensions is a C/C++ library that extends the capability of the ONNX models and inference with ONNX Runtime, via ONNX Runtime Custom Operator ABIs. It includes a set of [ONNX Runtime Custom Operator](https://onnxruntime.ai/docs/reference/operators/add-custom-op.html) to support the common pre- and post-processing operators for vision, text, and nlp models. And it supports multiple languages and platforms, like Python on Windows/Linux/macOS, some mobile platforms like Android and iOS, and Web-Assembly etc. The basic workflow is to enhance a ONNX model firstly and then do the model inference with ONNX Runtime and ONNXRuntime-Extensions package.
 
 
 ## Quickstart
+The library can be utilized as either a C/C++ library or other advance language packages like Python, Java, C#, etc. To build it as a shared library, you can use the `build.bat` or `build.sh` scripts located in the root folder. The CMake build definition is available in the `CMakeLists.txt` file and can be modified by appending options to `build.bat` or `build.sh`, such as `build.bat -DOCOS_BUILD_SHARED_LIB=OFF`. For more details, please refer to the [C API documentation](./docs/c_api.md).
 
 ### **Python installation**
 ```bash
 pip install onnxruntime-extensions
 ````
-
-
-### **Nightly Build**
-
-#### <strong>on Windows</strong>
-```cmd
-pip install --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ onnxruntime-extensions
-```
-Please ensure that you have met the prerequisites of onnxruntime-extensions (e.g., onnx and onnxruntime) in your Python environment.
-#### <strong>on Linux/macOS</strong>
-Please make sure the compiler toolkit like gcc(later than g++ 8.0) or clang are installed before the following command
-```bash
-python -m pip install git+https://github.com/microsoft/onnxruntime-extensions.git
-```
+The nightly build is also available for the latest features, please refer to [nightly build](./docs/development.md#nightly-build)
 
 
 ## Usage
--- a/onnxruntime_extensions-0.12.0.dist-info/RECORD
+++ b/onnxruntime_extensions-0.14.0.dist-info/RECORD
@@ -1,15 +1,15 @@
 onnxruntime_extensions/__init__.py,sha256=GMnMIHJ-uqvJGPn5fpCZOi7OG16kFVpfOTTO88kYJWY,2387
-onnxruntime_extensions/_cuops.py,sha256=SUD2NhEWHeMem8ylCtCGBKutSuZQs4WMj1ke65-52vA,16193
-onnxruntime_extensions/_extensions_pydll.cp39-win_amd64.pyd,sha256=IVAfa-rN8jRjKNbd3v1_agIsU7lCFBLinSXi03RAfQc,3323392
+onnxruntime_extensions/_cuops.py,sha256=W4hmBNoNvFk84V4UAUpltGNFjzcf0fju3iCeuatqXHE,16661
+onnxruntime_extensions/_extensions_pydll.cp39-win_amd64.pyd,sha256=0BtiZfy7lditfJwj-cOFbyntPzDZcERTbsHiVD7tiko,1900032
 onnxruntime_extensions/_extensions_pydll.pyi,sha256=mYXkqNaCgAbs161RDKgDjxIX9vWdYdVPDC-0X9cieco,1070
-onnxruntime_extensions/_hf_cvt.py,sha256=HJwpcdc02aYV9qgAYkrtSYbkargYi0xTqf7Ye60D84A,14062
+onnxruntime_extensions/_hf_cvt.py,sha256=7-nV40_lCydWHBMXUkfe3oaJSI7l0SDQdLT92yZG2oc,15945
 onnxruntime_extensions/_ocos.py,sha256=OlDOlCH_vWFOBkjbp6Pujgw6rgk8Fd3_2Mi5ev1eeS0,4193
 onnxruntime_extensions/_ortapi2.py,sha256=Tfrf9fQMQ0e7Wa4R8s4SHdwMNBdmj33wH3y5vMkVVQE,9951
 onnxruntime_extensions/_torch_cvt.py,sha256=hGOiw24QuFpK_3CLjg8Fs2GD_cCdM049xcJxkHVRbAk,10185
-onnxruntime_extensions/_version.py,sha256=gjGFlfLnfAn2djJsgIststV0PRkvMcSSr51ENp79FDA,76
+onnxruntime_extensions/_version.py,sha256=u5KwYLG4_oeOTmNuRw2dLiPJ5hByZa12xh0VGidbJMU,76
 onnxruntime_extensions/cmd.py,sha256=eIiNNY0ohbUCPgmr9RwOfi0Gzw7nWL17i625L-ZKezI,2428
-onnxruntime_extensions/cvt.py,sha256=XMz0CZXBJQ9IwnixjzJwz-utKyu9HREIEUCviZg6v8A,3977
-onnxruntime_extensions/pp_api.py,sha256=-Qty5kyN0stBft6vecPucGnjQLZXQd_8PzaCvcQM6ys,571
+onnxruntime_extensions/cvt.py,sha256=2cPsKj4weGDveV36mtoQ9yVUfjtqmFNUpFghrsppXOg,13409
+onnxruntime_extensions/pp_api.py,sha256=Fk1iEMPwcnr84V9ALhr-zuMPNi_fyIMPTrKPeOQooZs,3157
 onnxruntime_extensions/util.py,sha256=KxNFY0-5CG1i9HADcCc4V33PNukTO46Os_KIL8pj-l8,7394
 onnxruntime_extensions/onnxprocess/__init__.py,sha256=BnveHXnu2nTQNbCLeZujZgZwO9A3yWFbQGTDthCFbIc,534
 onnxruntime_extensions/onnxprocess/_builder.py,sha256=L_afKeE7Wc4mWJ47eVXQ2stvmal_37QVTQZgKmt0ZK8,1844
@@ -27,7 +27,7 @@ onnxruntime_extensions/pnp/_unifier.py,sha256=FPQYL1Z6f1Tv2qRsnhW_is9k7-GmCYhf6Z
 onnxruntime_extensions/pnp/_utils.py,sha256=xBh7-_VstgqXlhBaQ_6E5GV6341ywCRQsrJZZZtYaCc,13061
 onnxruntime_extensions/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 onnxruntime_extensions/tools/add_HuggingFace_CLIPImageProcessor_to_model.py,sha256=iNGAd9Ym0iKDQkXdWdka-R3S47TT3hMTihdGXg0uHL0,6786
-onnxruntime_extensions/tools/add_pre_post_processing_to_model.py,sha256=yNXxriAqPqRQzEi7fcK4CCcyCnhubU-wiO2d2rGIOxw,23996
+onnxruntime_extensions/tools/add_pre_post_processing_to_model.py,sha256=M2dSO2FdLo1Hs0GDVqYmKxmWDj7BsKCiyhpqxmCdDWg,24301
 onnxruntime_extensions/tools/pre_post_processing/__init__.py,sha256=YKxCtG2McBExYYmcf1tbqDquqIS1iTs4iPx86MBcfRo,125
 onnxruntime_extensions/tools/pre_post_processing/pre_post_processor.py,sha256=lnQ4TUKkZ-TvVC8U_ov3Nsz9gzES0ktnmD-DPTzutPA,19635
 onnxruntime_extensions/tools/pre_post_processing/step.py,sha256=SYFxtrDmXyFpnnlPl4c49Yg1THFZvh5Y9NwuvquHTVg,9394
@@ -36,8 +36,8 @@ onnxruntime_extensions/tools/pre_post_processing/steps/__init__.py,sha256=pdVRZB
 onnxruntime_extensions/tools/pre_post_processing/steps/general.py,sha256=fF_XVFSKOCu482Sqjp-nVPbs-ZVGpPal2ekbO1gUO_4,13781
 onnxruntime_extensions/tools/pre_post_processing/steps/nlp.py,sha256=ZCxRNxqfANplxCe0I-6BfHziM1jDYJsNQKbHdM3Y1I0,15173
 onnxruntime_extensions/tools/pre_post_processing/steps/vision.py,sha256=BM6CGylOSu4l6UarPfW0I2tgkJDa1Q-gYz__CxZle-k,53183
-onnxruntime_extensions-0.12.0.dist-info/LICENSE,sha256=mQaUD2Gx8LUz-n2ZuvVReLKAj74RPqUd-_rYVyzNXys,1162
-onnxruntime_extensions-0.12.0.dist-info/METADATA,sha256=ulMfUUUKIosCAUeqozJAJ5LQbpnQsU6HvsvbA-_tTXA,4452
-onnxruntime_extensions-0.12.0.dist-info/WHEEL,sha256=4qHc_4HH-JGeVqXiOhLz9XdiROeRsdeB9MhbJiO4SkE,99
-onnxruntime_extensions-0.12.0.dist-info/top_level.txt,sha256=XyAgQDKyXsf6_0MJb58kRdHwigpTn7A7kl9diBEjs8M,23
-onnxruntime_extensions-0.12.0.dist-info/RECORD,,
+onnxruntime_extensions-0.14.0.dist-info/LICENSE,sha256=mQaUD2Gx8LUz-n2ZuvVReLKAj74RPqUd-_rYVyzNXys,1162
+onnxruntime_extensions-0.14.0.dist-info/METADATA,sha256=udEUfhboh5qT4rtnEBg94FkkPmjjAYRJx9rWfg5ZrJg,4657
+onnxruntime_extensions-0.14.0.dist-info/WHEEL,sha256=agy-BJge3afXwWznUXANATmKFW4eqelqRR0uf608A_0,99
+onnxruntime_extensions-0.14.0.dist-info/top_level.txt,sha256=XyAgQDKyXsf6_0MJb58kRdHwigpTn7A7kl9diBEjs8M,23
+onnxruntime_extensions-0.14.0.dist-info/RECORD,,
--- a/onnxruntime_extensions-0.12.0.dist-info/WHEEL
+++ b/onnxruntime_extensions-0.14.0.dist-info/WHEEL
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (72.2.0)
+Generator: setuptools (75.8.0)
 Root-Is-Purelib: false
 Tag: cp39-cp39-win_amd64