onnxruntime_extensions 0.13.0__cp39-cp39-win_amd64.whl → 0.14.0__cp39-cp39-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- onnxruntime_extensions/_cuops.py +19 -2
- onnxruntime_extensions/_extensions_pydll.cp39-win_amd64.pyd +0 -0
- onnxruntime_extensions/_hf_cvt.py +56 -18
- onnxruntime_extensions/_version.py +1 -1
- onnxruntime_extensions/cvt.py +235 -25
- onnxruntime_extensions/pp_api.py +5 -3
- onnxruntime_extensions/tools/add_pre_post_processing_to_model.py +18 -12
- {onnxruntime_extensions-0.13.0.dist-info → onnxruntime_extensions-0.14.0.dist-info}/METADATA +10 -2
- {onnxruntime_extensions-0.13.0.dist-info → onnxruntime_extensions-0.14.0.dist-info}/RECORD +12 -12
- {onnxruntime_extensions-0.13.0.dist-info → onnxruntime_extensions-0.14.0.dist-info}/WHEEL +1 -1
- {onnxruntime_extensions-0.13.0.dist-info → onnxruntime_extensions-0.14.0.dist-info}/LICENSE +0 -0
- {onnxruntime_extensions-0.13.0.dist-info → onnxruntime_extensions-0.14.0.dist-info}/top_level.txt +0 -0
onnxruntime_extensions/_cuops.py
CHANGED
|
@@ -364,8 +364,15 @@ class SentencepieceDecoder(CustomOp):
|
|
|
364
364
|
@classmethod
|
|
365
365
|
def get_inputs(cls):
|
|
366
366
|
return [
|
|
367
|
-
cls.io_def("ids", onnx.TensorProto.INT64, [None])
|
|
367
|
+
cls.io_def("ids", onnx.TensorProto.INT64, [None]),
|
|
368
|
+
cls.io_def('fairseq', onnx_proto.TensorProto.BOOL, [None])
|
|
368
369
|
]
|
|
370
|
+
|
|
371
|
+
@classmethod
|
|
372
|
+
def input_default_values(cls):
|
|
373
|
+
return {
|
|
374
|
+
'fairseq': [False]
|
|
375
|
+
}
|
|
369
376
|
|
|
370
377
|
@classmethod
|
|
371
378
|
def get_outputs(cls):
|
|
@@ -491,6 +498,16 @@ class StftNorm(CustomOp):
|
|
|
491
498
|
]
|
|
492
499
|
|
|
493
500
|
|
|
501
|
+
class HfJsonTokenizer(CustomOp):
|
|
502
|
+
@classmethod
|
|
503
|
+
def get_inputs(cls):
|
|
504
|
+
return [cls.io_def('str', onnx_proto.TensorProto.STRING, ['N'])]
|
|
505
|
+
|
|
506
|
+
@classmethod
|
|
507
|
+
def get_outputs(cls):
|
|
508
|
+
return [cls.io_def("ids", onnx.TensorProto.INT64, ['N', None])]
|
|
509
|
+
|
|
510
|
+
|
|
494
511
|
# TODO: have a C++ impl.
|
|
495
512
|
def _argsort_op(x, dim):
|
|
496
513
|
d = numpy.argsort(x, dim)
|
|
@@ -544,4 +561,4 @@ class SingleOpGraph:
|
|
|
544
561
|
|
|
545
562
|
@staticmethod
|
|
546
563
|
def get_op_class(op_type):
|
|
547
|
-
return globals()[op_type]
|
|
564
|
+
return globals()[op_type]
|
|
onnxruntime_extensions/_extensions_pydll.cp39-win_amd64.pyd
CHANGED
|
Binary file
|
|
onnxruntime_extensions/_hf_cvt.py
CHANGED
|
@@ -168,7 +168,8 @@ class HFTokenizerConverter(CustomOpConverter):
|
|
|
168
168
|
TokenOpParam = namedtuple("TokenOpParam",
|
|
169
169
|
["pre_op", "pre_attribute_cvt",
|
|
170
170
|
"post_op", "post_attribute_cvt",
|
|
171
|
-
"
|
|
171
|
+
"default_encoder_inputs",
|
|
172
|
+
"default_decoder_inputs"],
|
|
172
173
|
defaults=(None, None, None, None, None))
|
|
173
174
|
|
|
174
175
|
# Some tokenizers can be added by this table
|
|
@@ -176,35 +177,36 @@ TokenOpParam = namedtuple("TokenOpParam",
|
|
|
176
177
|
# @formatter:off
|
|
177
178
|
_PROCESSOR_DICT = {
|
|
178
179
|
"BertTokenizer": TokenOpParam('BertTokenizer', HFTokenizerConverter.bert_tokenizer,
|
|
179
|
-
'BertDecoder', HFTokenizerConverter.bpe_decoder, None),
|
|
180
|
+
'BertDecoder', HFTokenizerConverter.bpe_decoder, None, None),
|
|
180
181
|
"DistilBertTokenizer": TokenOpParam('BertTokenizer', HFTokenizerConverter.bert_tokenizer,
|
|
181
|
-
'BertDecoder', HFTokenizerConverter.bpe_decoder, None),
|
|
182
|
+
'BertDecoder', HFTokenizerConverter.bpe_decoder, None, None),
|
|
182
183
|
"GPT2Tokenizer": TokenOpParam('GPT2Tokenizer', HFTokenizerConverter.bpe_tokenizer,
|
|
183
|
-
'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
|
|
184
|
+
'BpeDecoder', HFTokenizerConverter.bpe_decoder, None, None),
|
|
184
185
|
"CodeGenTokenizer": TokenOpParam('GPT2Tokenizer', HFTokenizerConverter.bpe_tokenizer,
|
|
185
|
-
'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
|
|
186
|
+
'BpeDecoder', HFTokenizerConverter.bpe_decoder, None, None),
|
|
186
187
|
"CLIPTokenizer": TokenOpParam('CLIPTokenizer', HFTokenizerConverter.clip_tokenizer,
|
|
187
|
-
'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
|
|
188
|
+
'BpeDecoder', HFTokenizerConverter.bpe_decoder, None, None),
|
|
188
189
|
"RobertaTokenizer": TokenOpParam('RobertaTokenizer', HFTokenizerConverter.roberta_tokenizer,
|
|
189
|
-
'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
|
|
190
|
+
'BpeDecoder', HFTokenizerConverter.bpe_decoder, None, None),
|
|
190
191
|
"BartTokenizer": TokenOpParam('RobertaTokenizer', HFTokenizerConverter.roberta_tokenizer,
|
|
191
|
-
'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
|
|
192
|
+
'BpeDecoder', HFTokenizerConverter.bpe_decoder, None, None),
|
|
192
193
|
"LayoutLMv3Tokenizer": TokenOpParam('RobertaTokenizer', HFTokenizerConverter.roberta_tokenizer,
|
|
193
|
-
'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
|
|
194
|
+
'BpeDecoder', HFTokenizerConverter.bpe_decoder, None, None),
|
|
194
195
|
"LongformerTokenizer": TokenOpParam('RobertaTokenizer', HFTokenizerConverter.roberta_tokenizer,
|
|
195
|
-
'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
|
|
196
|
+
'BpeDecoder', HFTokenizerConverter.bpe_decoder, None, None),
|
|
196
197
|
"LEDTokenizer": TokenOpParam('RobertaTokenizer', HFTokenizerConverter.roberta_tokenizer,
|
|
197
|
-
'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
|
|
198
|
+
'BpeDecoder', HFTokenizerConverter.bpe_decoder, None, None),
|
|
198
199
|
"MvpTokenizer": TokenOpParam('RobertaTokenizer', HFTokenizerConverter.roberta_tokenizer,
|
|
199
|
-
'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
|
|
200
|
+
'BpeDecoder', HFTokenizerConverter.bpe_decoder, None, None),
|
|
200
201
|
"T5Tokenizer": TokenOpParam('SentencepieceTokenizer', HFTokenizerConverter.spm_tokenizer,
|
|
201
202
|
'SentencepieceDecoder', HFTokenizerConverter.spm_decoder,
|
|
202
|
-
|
|
203
|
+
default_encoder_inputs={'add_eos': [True]}, default_decoder_inputs=None),
|
|
203
204
|
"LlamaTokenizer": TokenOpParam('SpmTokenizer', HFTokenizerConverter.bpe_tokenizer,
|
|
204
|
-
'BpeDecoder', HFTokenizerConverter.bpe_decoder, None),
|
|
205
|
+
'BpeDecoder', HFTokenizerConverter.bpe_decoder, None, None),
|
|
205
206
|
"XLMRobertaTokenizer": TokenOpParam('SentencepieceTokenizer', HFTokenizerConverter.spm_tokenizer,
|
|
206
207
|
'SentencepieceDecoder', HFTokenizerConverter.spm_decoder,
|
|
207
|
-
|
|
208
|
+
default_encoder_inputs={'add_bos': [True], 'add_eos': [True], 'fairseq': [True]},
|
|
209
|
+
default_decoder_inputs={'fairseq': [True]}),
|
|
208
210
|
}
|
|
209
211
|
# @formatter:on
|
|
210
212
|
|
|
@@ -246,8 +248,8 @@ class HFTokenizerOnnxGraph:
|
|
|
246
248
|
|
|
247
249
|
# add default_inputs into initializers to simplify the model input
|
|
248
250
|
n_inputs = len(default_inputs)
|
|
249
|
-
if self.cvt_quadruple.
|
|
250
|
-
default_inputs.update(self.cvt_quadruple.
|
|
251
|
+
if self.cvt_quadruple.default_encoder_inputs is not None:
|
|
252
|
+
default_inputs.update(self.cvt_quadruple.default_encoder_inputs)
|
|
251
253
|
if len(default_inputs) != n_inputs:
|
|
252
254
|
raise ValueError(
|
|
253
255
|
"Op: {} does not have the inputs from its TokenOpParam.".format(_cvt_op))
|
|
@@ -287,7 +289,43 @@ class HFTokenizerOnnxGraph:
|
|
|
287
289
|
return g
|
|
288
290
|
|
|
289
291
|
def post_processing(self, **kwargs):
|
|
292
|
+
with_default_inputs = kwargs.pop("WITH_DEFAULT_INPUTS", True)
|
|
293
|
+
|
|
290
294
|
_cvt_op = self.cvt_quadruple.post_op
|
|
291
295
|
_cvt_func = self.cvt_quadruple.post_attribute_cvt
|
|
292
296
|
cvt = partial(_cvt_func, self.cvt_obj)
|
|
293
|
-
|
|
297
|
+
g = SingleOpGraph.build_graph(_cvt_op, cvt=cvt, **kwargs)
|
|
298
|
+
|
|
299
|
+
default_inputs = {}
|
|
300
|
+
if with_default_inputs:
|
|
301
|
+
op_class = SingleOpGraph.get_op_class(_cvt_op)
|
|
302
|
+
default_inputs = op_class.input_default_values()
|
|
303
|
+
if default_inputs is None:
|
|
304
|
+
encoder_inputs = self.cvt_quadruple.default_encoder_inputs
|
|
305
|
+
if encoder_inputs is not None and encoder_inputs["fairseq"]:
|
|
306
|
+
default_inputs = {} # need to set to empty dict to call .update later
|
|
307
|
+
else:
|
|
308
|
+
return g
|
|
309
|
+
|
|
310
|
+
# add default_inputs into initializers to simplify the model input
|
|
311
|
+
if self.cvt_quadruple.default_decoder_inputs is not None:
|
|
312
|
+
default_inputs.update(self.cvt_quadruple.default_decoder_inputs)
|
|
313
|
+
|
|
314
|
+
new_initializers = []
|
|
315
|
+
|
|
316
|
+
for k, v in default_inputs.items():
|
|
317
|
+
input_value_info = next((i for i in g.input if i.name == k), None)
|
|
318
|
+
if input_value_info is None:
|
|
319
|
+
raise ValueError(
|
|
320
|
+
"The input {} is not found in the graph".format(k))
|
|
321
|
+
|
|
322
|
+
np_dtype = onnx.helper.tensor_dtype_to_np_dtype(
|
|
323
|
+
input_value_info.type.tensor_type.elem_type)
|
|
324
|
+
value = nparray(v, np_dtype)
|
|
325
|
+
new_initializers.append(onnx.numpy_helper.from_array(value, k))
|
|
326
|
+
g.initializer.extend(new_initializers)
|
|
327
|
+
new_inputs = [i for i in g.input if i.name not in default_inputs]
|
|
328
|
+
g.ClearField("input")
|
|
329
|
+
g.input.extend(new_inputs)
|
|
330
|
+
|
|
331
|
+
return g
|
|
onnxruntime_extensions/_version.py
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
# Generated by setup.py, DON'T MANUALLY UPDATE IT!
|
|
2
|
-
__version__ = "0.13.0"
|
|
2
|
+
__version__ = "0.14.0"
|
onnxruntime_extensions/cvt.py
CHANGED
|
@@ -12,6 +12,24 @@ from typing import Union
|
|
|
12
12
|
from ._hf_cvt import HFTokenizerConverter, HFTokenizerOnnxGraph # noqa
|
|
13
13
|
from ._ortapi2 import make_onnx_model, SingleOpGraph
|
|
14
14
|
|
|
15
|
+
import os
|
|
16
|
+
import numpy as np
|
|
17
|
+
import tempfile
|
|
18
|
+
import shutil
|
|
19
|
+
|
|
20
|
+
# edit environment variables to avoid protobuf version mismatch
|
|
21
|
+
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
|
|
22
|
+
|
|
23
|
+
from transformers.convert_slow_tokenizer import SpmConverter # noqa: E402
|
|
24
|
+
from transformers import AutoTokenizer # noqa: E402
|
|
25
|
+
from tokenizers import decoders, normalizers, pre_tokenizers, Regex # noqa: E402
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
OrtxTokenizer = None
|
|
29
|
+
try:
|
|
30
|
+
from onnxruntime_extensions.pp_api import Tokenizer as OrtxTokenizer
|
|
31
|
+
except ImportError:
|
|
32
|
+
pass
|
|
15
33
|
|
|
16
34
|
_is_torch_available = False
|
|
17
35
|
try:
|
|
@@ -24,11 +42,150 @@ except ImportError:
|
|
|
24
42
|
|
|
25
43
|
_PRE_POST_PAIR = {'TrieTokenizer': "TrieDetokenizer"}
|
|
26
44
|
|
|
45
|
+
def _get_prepend_scheme(add_prefix_space: bool, original_tokenizer) -> str:
|
|
46
|
+
if add_prefix_space:
|
|
47
|
+
prepend_scheme = "always"
|
|
48
|
+
if not getattr(original_tokenizer, "legacy", True):
|
|
49
|
+
prepend_scheme = "first"
|
|
50
|
+
else:
|
|
51
|
+
prepend_scheme = "never"
|
|
52
|
+
return prepend_scheme
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class Baichuan2Converter(SpmConverter):
|
|
56
|
+
handle_byte_fallback = True
|
|
57
|
+
|
|
58
|
+
def __init__(self, original_tokenizer):
|
|
59
|
+
super().__init__(original_tokenizer)
|
|
60
|
+
original_tokenizer.add_prefix_space = False
|
|
61
|
+
|
|
62
|
+
def vocab(self, proto):
|
|
63
|
+
vocab = [
|
|
64
|
+
(self.original_tokenizer.convert_ids_to_tokens(0), 0.0),
|
|
65
|
+
(self.original_tokenizer.convert_ids_to_tokens(1), 0.0),
|
|
66
|
+
(self.original_tokenizer.convert_ids_to_tokens(2), 0.0),
|
|
67
|
+
]
|
|
68
|
+
vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
|
|
69
|
+
return vocab
|
|
70
|
+
|
|
71
|
+
def unk_id(self, proto):
|
|
72
|
+
unk_id = 0
|
|
73
|
+
return unk_id
|
|
74
|
+
|
|
75
|
+
def decoder(self, replacement, add_prefix_space):
|
|
76
|
+
sequence = [
|
|
77
|
+
decoders.Replace("▁", " "),
|
|
78
|
+
decoders.ByteFallback(),
|
|
79
|
+
decoders.Fuse(),
|
|
80
|
+
]
|
|
81
|
+
if add_prefix_space:
|
|
82
|
+
sequence += [decoders.Strip(content=" ", left=1)]
|
|
83
|
+
return decoders.Sequence(sequence)
|
|
84
|
+
|
|
85
|
+
def normalizer(self, proto):
|
|
86
|
+
if getattr(self.original_tokenizer, "legacy", True):
|
|
87
|
+
sequence = []
|
|
88
|
+
if getattr(self.original_tokenizer, "add_prefix_space", True):
|
|
89
|
+
sequence += [normalizers.Prepend(prepend="▁")]
|
|
90
|
+
sequence += [normalizers.Replace(pattern=" ", content="▁")]
|
|
91
|
+
return normalizers.Sequence(sequence)
|
|
92
|
+
return None # non-legacy, no normalizer
|
|
93
|
+
|
|
94
|
+
def pre_tokenizer(self, replacement, add_prefix_space):
|
|
95
|
+
if not getattr(self.original_tokenizer, "legacy", True): # non-legacy, we need a replace
|
|
96
|
+
prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer)
|
|
97
|
+
return pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme, split=False)
|
|
98
|
+
else:
|
|
99
|
+
return super().pre_tokenizer(replacement, add_prefix_space)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class ChatGlmConverter(SpmConverter):
|
|
103
|
+
def normalizer(self, proto):
|
|
104
|
+
precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
|
|
105
|
+
_normalizers = [
|
|
106
|
+
normalizers.Strip(left=False, right=True), # stripping is important
|
|
107
|
+
normalizers.Replace(Regex(" {2,}"), "▁"),
|
|
108
|
+
]
|
|
109
|
+
return normalizers.Sequence([normalizers.Precompiled(precompiled_charsmap)] + _normalizers)
|
|
110
|
+
|
|
111
|
+
def pre_tokenizer(self, replacement, add_prefix_space):
|
|
112
|
+
prepend_scheme = "always"
|
|
113
|
+
if hasattr(self.original_tokenizer, "legacy") and not self.original_tokenizer.legacy:
|
|
114
|
+
prepend_scheme = "first"
|
|
115
|
+
return pre_tokenizers.Metaspace(
|
|
116
|
+
replacement=replacement, add_prefix_space=add_prefix_space, prepend_scheme=prepend_scheme
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
JSON_TOKEN_CONVERTERS = {
|
|
121
|
+
"BaichuanTokenizer": Baichuan2Converter,
|
|
122
|
+
"ChatGLMTokenizer": ChatGlmConverter,
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
# Save tokenizer JSON files using HuggingFace AutoTokenizer
|
|
126
|
+
def convert_tokenizer(model_path, output_dir):
|
|
127
|
+
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
|
128
|
+
if output_dir is None:
|
|
129
|
+
if os.path.isdir(model_path):
|
|
130
|
+
output_dir = model_path
|
|
131
|
+
else:
|
|
132
|
+
# create a temporary directory
|
|
133
|
+
output_dir = tempfile.mkdtemp()
|
|
134
|
+
tokenizer.save_pretrained(output_dir)
|
|
135
|
+
json_path = os.path.join(output_dir, "tokenizer.json")
|
|
136
|
+
|
|
137
|
+
if type(tokenizer).__name__ in JSON_TOKEN_CONVERTERS:
|
|
138
|
+
GenericSpmConverter = JSON_TOKEN_CONVERTERS[type(tokenizer).__name__]
|
|
139
|
+
|
|
140
|
+
converted = GenericSpmConverter(tokenizer).converted()
|
|
141
|
+
converted.save(json_path)
|
|
142
|
+
print(f"**Tokenizer saved to {json_path}")
|
|
143
|
+
return output_dir
|
|
144
|
+
|
|
145
|
+
# Validate tokenizer files downloaded from memory
|
|
146
|
+
def validate_tokenizer(model_path, output_dir):
|
|
147
|
+
test_sentence = "I like walking my cute dog\n and\x17 then, 生活的真谛是 \t\t\t\t \n\n61"
|
|
148
|
+
if OrtxTokenizer is None:
|
|
149
|
+
print("onnxruntime_extensions package was built with C API enabled, skipping tokenization test")
|
|
150
|
+
ortx_tokenizer = OrtxTokenizer(output_dir)
|
|
151
|
+
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
|
|
152
|
+
expected_ids = tokenizer(test_sentence, return_tensors="np")["input_ids"]
|
|
153
|
+
ortx_ids = np.asarray(ortx_tokenizer.tokenize(test_sentence))
|
|
154
|
+
assert np.array_equal(expected_ids[0], ortx_ids), f"Tokenization mismatch: {expected_ids[0]} != {ortx_ids}"
|
|
155
|
+
print("Tokenization test passed")
|
|
156
|
+
|
|
157
|
+
# Download tokenizer JSON files from memory
|
|
158
|
+
def download_tokenizer(tokenizer_dir, output_dir):
|
|
159
|
+
try:
|
|
160
|
+
from transformers.utils import cached_file
|
|
161
|
+
|
|
162
|
+
resolved_full_file = cached_file(tokenizer_dir, "tokenizer.json")
|
|
163
|
+
resolved_config_file = cached_file(tokenizer_dir, "tokenizer_config.json")
|
|
164
|
+
except ImportError:
|
|
165
|
+
raise ValueError(f"Directory '{tokenizer_dir}' not found and transformers is not available")
|
|
166
|
+
if not os.path.exists(resolved_full_file):
|
|
167
|
+
raise FileNotFoundError(f"Downloaded HF file '{resolved_full_file}' cannot be found")
|
|
168
|
+
if os.path.dirname(resolved_full_file) != os.path.dirname(resolved_config_file):
|
|
169
|
+
raise FileNotFoundError(
|
|
170
|
+
f"Downloaded HF files '{resolved_full_file}' " f"and '{resolved_config_file}' are not in the same directory"
|
|
171
|
+
)
|
|
172
|
+
|
|
173
|
+
if output_dir is None or len(output_dir) == 0:
|
|
174
|
+
output_dir = os.path.dirname(resolved_full_file)
|
|
175
|
+
print(f"Using {output_dir} as output directory")
|
|
176
|
+
return output_dir
|
|
177
|
+
else:
|
|
178
|
+
# copy the files to the output directory
|
|
179
|
+
shutil.copy(resolved_full_file, output_dir)
|
|
180
|
+
shutil.copy(resolved_config_file, output_dir)
|
|
181
|
+
return output_dir
|
|
182
|
+
|
|
27
183
|
|
|
28
184
|
def gen_processing_models(processor: Union[str, object],
|
|
29
185
|
pre_kwargs: dict = None,
|
|
30
186
|
post_kwargs: dict = None,
|
|
31
187
|
opset: int = None,
|
|
188
|
+
schema_v2: bool = False,
|
|
32
189
|
**kwargs):
|
|
33
190
|
"""
|
|
34
191
|
Generate the pre- and post-processing ONNX model, basing on the name or HF class.
|
|
@@ -47,6 +204,9 @@ def gen_processing_models(processor: Union[str, object],
|
|
|
47
204
|
Keyword arguments for generating the post-processing model
|
|
48
205
|
opset: int
|
|
49
206
|
the target opset version of the model
|
|
207
|
+
schema_v2: bool
|
|
208
|
+
the flag for using embedded tokenizer files; this option leverages the blob-loading functionality
|
|
209
|
+
which loads HF tokenizers from memory rather than using the tokenizer files in HF JSON format.
|
|
50
210
|
kwargs:
|
|
51
211
|
The additional arguments for generating models
|
|
52
212
|
|
|
@@ -58,11 +218,42 @@ def gen_processing_models(processor: Union[str, object],
|
|
|
58
218
|
if pre_kwargs is None and post_kwargs is None:
|
|
59
219
|
raise ValueError(
|
|
60
220
|
"Either pre_kwargs or post_kwargs should be provided. None means no processing graph output.")
|
|
61
|
-
|
|
221
|
+
|
|
222
|
+
# If true, we get the tokenizer JSON files by either downloading from cache or using HuggingFace AutoTokenizer
|
|
223
|
+
# to convert them, and then create an ONNX model with the JSON files as strings in the model attributes (attrs).
|
|
224
|
+
if schema_v2:
|
|
225
|
+
model_name = processor if isinstance(processor, str) else type(processor).__name__
|
|
226
|
+
|
|
227
|
+
converted_tokenizer = {"Baichuan2", "chatglm"}
|
|
228
|
+
need_convert = False
|
|
229
|
+
for token in converted_tokenizer:
|
|
230
|
+
if model_name.find(token) != -1:
|
|
231
|
+
need_convert = True
|
|
232
|
+
break
|
|
233
|
+
|
|
234
|
+
if need_convert:
|
|
235
|
+
model_dir = convert_tokenizer(model_name)
|
|
236
|
+
validate_tokenizer(model_name, None)
|
|
237
|
+
else:
|
|
238
|
+
model_dir = download_tokenizer(model_name, None)
|
|
239
|
+
|
|
240
|
+
# Load the content of tokenizer.json into a string
|
|
241
|
+
with open(f"{model_dir}/tokenizer.json", "r", encoding="utf-8") as f:
|
|
242
|
+
tokenizer_vocab = f.read()
|
|
243
|
+
|
|
244
|
+
# Load the content of tokenizer_config.json into a string
|
|
245
|
+
with open(f"{model_dir}/tokenizer_config.json", "r", encoding="utf-8") as f:
|
|
246
|
+
tokenizer_config = f.read()
|
|
247
|
+
|
|
248
|
+
# Create an ONNX model with these JSON file strings in attrs
|
|
62
249
|
g_pre, g_post = (None, None)
|
|
63
|
-
if pre_kwargs:
|
|
64
|
-
|
|
65
|
-
|
|
250
|
+
if pre_kwargs is not None:
|
|
251
|
+
# Add tokenizer_vocab and tokenizer_config to the kwargs
|
|
252
|
+
# so they are added to attrs in build_graph
|
|
253
|
+
pre_kwargs['tokenizer_vocab'] = tokenizer_vocab
|
|
254
|
+
pre_kwargs['tokenizer_config'] = tokenizer_config
|
|
255
|
+
g_pre = SingleOpGraph.build_graph("HfJsonTokenizer", **pre_kwargs)
|
|
256
|
+
if post_kwargs is not None:
|
|
66
257
|
if pre_kwargs is None:
|
|
67
258
|
cls_name = processor
|
|
68
259
|
else:
|
|
@@ -70,27 +261,46 @@ def gen_processing_models(processor: Union[str, object],
|
|
|
70
261
|
raise RuntimeError(
|
|
71
262
|
f"Cannot locate the post processing operator name from {processor}")
|
|
72
263
|
cls_name = _PRE_POST_PAIR[processor]
|
|
264
|
+
# Add tokenizer_vocab and tokenizer_config to the kwargs
|
|
265
|
+
# so they are added to attrs in build_graph
|
|
266
|
+
post_kwargs['tokenizer_vocab'] = tokenizer_vocab
|
|
267
|
+
post_kwargs['tokenizer_config'] = tokenizer_config
|
|
73
268
|
g_post = SingleOpGraph.build_graph(cls_name, **post_kwargs)
|
|
74
269
|
return make_onnx_model(g_pre) if g_pre else None, make_onnx_model(g_post) if g_post else None
|
|
75
|
-
|
|
76
|
-
cls_name = type(processor).__name__
|
|
77
|
-
if cls_name == "WhisperProcessor":
|
|
78
|
-
if WhisperDataProcGraph is None:
|
|
79
|
-
raise ValueError(
|
|
80
|
-
"The Whisper processor needs torch.onnx support, please install pytorch 2.0 and above")
|
|
81
|
-
_converter = WhisperDataProcGraph(processor, opset=opset, **kwargs)
|
|
82
|
-
pre_m = _converter.pre_processing(
|
|
83
|
-
**pre_kwargs) if pre_kwargs is not None else None
|
|
84
|
-
post_m = _converter.post_processing(
|
|
85
|
-
**post_kwargs) if post_kwargs is not None else None
|
|
86
|
-
return pre_m, post_m
|
|
87
|
-
elif HFTokenizerOnnxGraph.is_supported(processor):
|
|
88
|
-
_converter = HFTokenizerOnnxGraph(processor)
|
|
89
|
-
pre_g = _converter.pre_processing(
|
|
90
|
-
**pre_kwargs) if pre_kwargs is not None else None
|
|
91
|
-
post_g = _converter.post_processing(
|
|
92
|
-
**post_kwargs) if post_kwargs is not None else None
|
|
93
|
-
return make_onnx_model(pre_g) if pre_g else None, \
|
|
94
|
-
make_onnx_model(post_g) if post_g else None
|
|
95
270
|
else:
|
|
96
|
-
|
|
271
|
+
if isinstance(processor, str):
|
|
272
|
+
g_pre, g_post = (None, None)
|
|
273
|
+
if pre_kwargs:
|
|
274
|
+
g_pre = SingleOpGraph.build_graph(processor, **pre_kwargs)
|
|
275
|
+
if post_kwargs:
|
|
276
|
+
if pre_kwargs is None:
|
|
277
|
+
cls_name = processor
|
|
278
|
+
else:
|
|
279
|
+
if processor not in _PRE_POST_PAIR:
|
|
280
|
+
raise RuntimeError(
|
|
281
|
+
f"Cannot locate the post processing operator name from {processor}")
|
|
282
|
+
cls_name = _PRE_POST_PAIR[processor]
|
|
283
|
+
g_post = SingleOpGraph.build_graph(cls_name, **post_kwargs)
|
|
284
|
+
return make_onnx_model(g_pre) if g_pre else None, make_onnx_model(g_post) if g_post else None
|
|
285
|
+
|
|
286
|
+
cls_name = type(processor).__name__
|
|
287
|
+
if cls_name == "WhisperProcessor":
|
|
288
|
+
if WhisperDataProcGraph is None:
|
|
289
|
+
raise ValueError(
|
|
290
|
+
"The Whisper processor needs torch.onnx support, please install pytorch 2.0 and above")
|
|
291
|
+
_converter = WhisperDataProcGraph(processor, opset=opset, **kwargs)
|
|
292
|
+
pre_m = _converter.pre_processing(
|
|
293
|
+
**pre_kwargs) if pre_kwargs is not None else None
|
|
294
|
+
post_m = _converter.post_processing(
|
|
295
|
+
**post_kwargs) if post_kwargs is not None else None
|
|
296
|
+
return pre_m, post_m
|
|
297
|
+
elif HFTokenizerOnnxGraph.is_supported(processor):
|
|
298
|
+
_converter = HFTokenizerOnnxGraph(processor)
|
|
299
|
+
pre_g = _converter.pre_processing(
|
|
300
|
+
**pre_kwargs) if pre_kwargs is not None else None
|
|
301
|
+
post_g = _converter.post_processing(
|
|
302
|
+
**post_kwargs) if post_kwargs is not None else None
|
|
303
|
+
return make_onnx_model(pre_g) if pre_g else None, \
|
|
304
|
+
make_onnx_model(post_g) if post_g else None
|
|
305
|
+
else:
|
|
306
|
+
raise ValueError(f"Unsupported processor/tokenizer: {cls_name}")
|
onnxruntime_extensions/pp_api.py
CHANGED
|
@@ -49,10 +49,12 @@ class Tokenizer:
|
|
|
49
49
|
self.tokenizer = create_tokenizer(tokenizer_dir)
|
|
50
50
|
|
|
51
51
|
def tokenize(self, text):
|
|
52
|
+
if isinstance(text, (list, tuple)):
|
|
53
|
+
return batch_tokenize(self.tokenizer, text)
|
|
52
54
|
return batch_tokenize(self.tokenizer, [text])[0]
|
|
53
55
|
|
|
54
56
|
def detokenize(self, tokens):
|
|
55
|
-
return batch_detokenize(self.tokenizer, [tokens])
|
|
57
|
+
return batch_detokenize(self.tokenizer, [tokens])
|
|
56
58
|
|
|
57
59
|
def __del__(self):
|
|
58
60
|
if delete_object and self.tokenizer:
|
|
@@ -72,8 +74,8 @@ class ImageProcessor:
|
|
|
72
74
|
return image_pre_process(self.processor, images)
|
|
73
75
|
|
|
74
76
|
@staticmethod
|
|
75
|
-
def to_numpy(result):
|
|
76
|
-
return tensor_result_get_at(result,
|
|
77
|
+
def to_numpy(result, idx):
|
|
78
|
+
return tensor_result_get_at(result, idx)
|
|
77
79
|
|
|
78
80
|
def __del__(self):
|
|
79
81
|
if delete_object and self.processor:
|
|
onnxruntime_extensions/tools/add_pre_post_processing_to_model.py
CHANGED
|
@@ -163,7 +163,8 @@ def superresolution(model_file: Path, output_file: Path, output_format: str, onn
|
|
|
163
163
|
|
|
164
164
|
|
|
165
165
|
def yolo_detection(model_file: Path, output_file: Path, output_format: str = 'jpg',
|
|
166
|
-
onnx_opset: int = 16, num_classes: int = 80, input_shape: List[int] = None
|
|
166
|
+
onnx_opset: int = 16, num_classes: int = 80, input_shape: List[int] = None,
|
|
167
|
+
output_as_image: bool = True):
|
|
167
168
|
"""
|
|
168
169
|
SSD-like model and Faster-RCNN-like model are including NMS inside already, You can find it from onnx model zoo.
|
|
169
170
|
|
|
@@ -185,6 +186,7 @@ def yolo_detection(model_file: Path, output_file: Path, output_format: str = 'jp
|
|
|
185
186
|
:param onnx_opset: The opset version of onnx model, default(16).
|
|
186
187
|
:param num_classes: The number of classes, default(80).
|
|
187
188
|
:param input_shape: The shape of input image (height,width), default will be asked from model input.
|
|
189
|
+
:param output_as_image: The flag that means that the model should have the image with boxes instead of the coordinates of the boxess
|
|
188
190
|
"""
|
|
189
191
|
model = onnx.load(str(model_file.resolve(strict=True)))
|
|
190
192
|
inputs = [create_named_value("image", onnx.TensorProto.UINT8, ["num_bytes"])]
|
|
@@ -284,19 +286,23 @@ Because we need to execute the model to determine the output shape in order to a
|
|
|
284
286
|
utils.IoMapEntry("Resize", producer_idx=0, consumer_idx=2),
|
|
285
287
|
utils.IoMapEntry("LetterBox", producer_idx=0, consumer_idx=3),
|
|
286
288
|
]),
|
|
287
|
-
# DrawBoundingBoxes on the original image
|
|
288
|
-
# Model imported from pytorch has CENTER_XYWH format
|
|
289
|
-
# two mode for how to color box,
|
|
290
|
-
# 1. colour_by_classes=True, (colour_by_classes), 2. colour_by_classes=False,(colour_by_confidence)
|
|
291
|
-
(DrawBoundingBoxes(mode='CENTER_XYWH', num_classes=num_classes, colour_by_classes=True),
|
|
292
|
-
[
|
|
293
|
-
utils.IoMapEntry("ConvertImageToBGR", producer_idx=0, consumer_idx=0),
|
|
294
|
-
utils.IoMapEntry("ScaleBoundingBoxes", producer_idx=0, consumer_idx=1),
|
|
295
|
-
]),
|
|
296
|
-
# Encode to jpg/png
|
|
297
|
-
ConvertBGRToImage(image_format=output_format),
|
|
298
289
|
]
|
|
299
290
|
|
|
291
|
+
if output_as_image:
|
|
292
|
+
post_processing_steps += [
|
|
293
|
+
# DrawBoundingBoxes on the original image
|
|
294
|
+
# Model imported from pytorch has CENTER_XYWH format
|
|
295
|
+
# two mode for how to color box,
|
|
296
|
+
# 1. colour_by_classes=True, (colour_by_classes), 2. colour_by_classes=False,(colour_by_confidence)
|
|
297
|
+
(DrawBoundingBoxes(mode='CENTER_XYWH', num_classes=num_classes, colour_by_classes=True),
|
|
298
|
+
[
|
|
299
|
+
utils.IoMapEntry("ConvertImageToBGR", producer_idx=0, consumer_idx=0),
|
|
300
|
+
utils.IoMapEntry("ScaleBoundingBoxes", producer_idx=0, consumer_idx=1),
|
|
301
|
+
]),
|
|
302
|
+
# Encode to jpg/png
|
|
303
|
+
ConvertBGRToImage(image_format=output_format),
|
|
304
|
+
]
|
|
305
|
+
|
|
300
306
|
pipeline.add_post_processing(post_processing_steps)
|
|
301
307
|
|
|
302
308
|
new_model = pipeline.run(model)
|
{onnxruntime_extensions-0.13.0.dist-info → onnxruntime_extensions-0.14.0.dist-info}/METADATA
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
2
|
Name: onnxruntime_extensions
|
|
3
|
-
Version: 0.13.0
|
|
3
|
+
Version: 0.14.0
|
|
4
4
|
Summary: ONNXRuntime Extensions
|
|
5
5
|
Home-page: https://github.com/microsoft/onnxruntime-extensions
|
|
6
6
|
Author: Microsoft Corporation
|
|
@@ -18,6 +18,14 @@ Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
|
18
18
|
Classifier: License :: OSI Approved :: MIT License
|
|
19
19
|
Description-Content-Type: text/markdown
|
|
20
20
|
License-File: LICENSE
|
|
21
|
+
Dynamic: author
|
|
22
|
+
Dynamic: author-email
|
|
23
|
+
Dynamic: classifier
|
|
24
|
+
Dynamic: description
|
|
25
|
+
Dynamic: description-content-type
|
|
26
|
+
Dynamic: home-page
|
|
27
|
+
Dynamic: license
|
|
28
|
+
Dynamic: summary
|
|
21
29
|
|
|
22
30
|
# ONNXRuntime-Extensions
|
|
23
31
|
|
|
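{onnxruntime_extensions-0.13.0.dist-info → onnxruntime_extensions-0.14.0.dist-info}/RECORD
RENAMED
|
@@ -1,15 +1,15 @@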
|
|
|
1
1
|
onnxruntime_extensions/__init__.py,sha256=GMnMIHJ-uqvJGPn5fpCZOi7OG16kFVpfOTTO88kYJWY,2387
|
|
2
|
-
onnxruntime_extensions/_cuops.py,sha256=
|
|
3
|
-
onnxruntime_extensions/_extensions_pydll.cp39-win_amd64.pyd,sha256=
|
|
2
|
+
onnxruntime_extensions/_cuops.py,sha256=W4hmBNoNvFk84V4UAUpltGNFjzcf0fju3iCeuatqXHE,16661
|
|
3
|
+
onnxruntime_extensions/_extensions_pydll.cp39-win_amd64.pyd,sha256=0BtiZfy7lditfJwj-cOFbyntPzDZcERTbsHiVD7tiko,1900032
|
|
4
4
|
onnxruntime_extensions/_extensions_pydll.pyi,sha256=mYXkqNaCgAbs161RDKgDjxIX9vWdYdVPDC-0X9cieco,1070
|
|
5
|
-
onnxruntime_extensions/_hf_cvt.py,sha256=
|
|
5
|
+
onnxruntime_extensions/_hf_cvt.py,sha256=7-nV40_lCydWHBMXUkfe3oaJSI7l0SDQdLT92yZG2oc,15945
|
|
6
6
|
onnxruntime_extensions/_ocos.py,sha256=OlDOlCH_vWFOBkjbp6Pujgw6rgk8Fd3_2Mi5ev1eeS0,4193
|
|
7
7
|
onnxruntime_extensions/_ortapi2.py,sha256=Tfrf9fQMQ0e7Wa4R8s4SHdwMNBdmj33wH3y5vMkVVQE,9951
|
|
8
8
|
onnxruntime_extensions/_torch_cvt.py,sha256=hGOiw24QuFpK_3CLjg8Fs2GD_cCdM049xcJxkHVRbAk,10185
|
|
9
|
-
onnxruntime_extensions/_version.py,sha256=
|
|
9
|
+
onnxruntime_extensions/_version.py,sha256=u5KwYLG4_oeOTmNuRw2dLiPJ5hByZa12xh0VGidbJMU,76
|
|
10
10
|
onnxruntime_extensions/cmd.py,sha256=eIiNNY0ohbUCPgmr9RwOfi0Gzw7nWL17i625L-ZKezI,2428
|
|
11
|
-
onnxruntime_extensions/cvt.py,sha256=
|
|
12
|
-
onnxruntime_extensions/pp_api.py,sha256=
|
|
11
|
+
onnxruntime_extensions/cvt.py,sha256=2cPsKj4weGDveV36mtoQ9yVUfjtqmFNUpFghrsppXOg,13409
|
|
12
|
+
onnxruntime_extensions/pp_api.py,sha256=Fk1iEMPwcnr84V9ALhr-zuMPNi_fyIMPTrKPeOQooZs,3157
|
|
13
13
|
onnxruntime_extensions/util.py,sha256=KxNFY0-5CG1i9HADcCc4V33PNukTO46Os_KIL8pj-l8,7394
|
|
14
14
|
onnxruntime_extensions/onnxprocess/__init__.py,sha256=BnveHXnu2nTQNbCLeZujZgZwO9A3yWFbQGTDthCFbIc,534
|
|
15
15
|
onnxruntime_extensions/onnxprocess/_builder.py,sha256=L_afKeE7Wc4mWJ47eVXQ2stvmal_37QVTQZgKmt0ZK8,1844
|
|
@@ -27,7 +27,7 @@ onnxruntime_extensions/pnp/_unifier.py,sha256=FPQYL1Z6f1Tv2qRsnhW_is9k7-GmCYhf6Z
|
|
|
27
27
|
onnxruntime_extensions/pnp/_utils.py,sha256=xBh7-_VstgqXlhBaQ_6E5GV6341ywCRQsrJZZZtYaCc,13061
|
|
28
28
|
onnxruntime_extensions/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
29
29
|
onnxruntime_extensions/tools/add_HuggingFace_CLIPImageProcessor_to_model.py,sha256=iNGAd9Ym0iKDQkXdWdka-R3S47TT3hMTihdGXg0uHL0,6786
|
|
30
|
-
onnxruntime_extensions/tools/add_pre_post_processing_to_model.py,sha256=
|
|
30
|
+
onnxruntime_extensions/tools/add_pre_post_processing_to_model.py,sha256=M2dSO2FdLo1Hs0GDVqYmKxmWDj7BsKCiyhpqxmCdDWg,24301
|
|
31
31
|
onnxruntime_extensions/tools/pre_post_processing/__init__.py,sha256=YKxCtG2McBExYYmcf1tbqDquqIS1iTs4iPx86MBcfRo,125
|
|
32
32
|
onnxruntime_extensions/tools/pre_post_processing/pre_post_processor.py,sha256=lnQ4TUKkZ-TvVC8U_ov3Nsz9gzES0ktnmD-DPTzutPA,19635
|
|
33
33
|
onnxruntime_extensions/tools/pre_post_processing/step.py,sha256=SYFxtrDmXyFpnnlPl4c49Yg1THFZvh5Y9NwuvquHTVg,9394
|
|
@@ -36,8 +36,8 @@ onnxruntime_extensions/tools/pre_post_processing/steps/__init__.py,sha256=pdVRZB
|
|
|
36
36
|
onnxruntime_extensions/tools/pre_post_processing/steps/general.py,sha256=fF_XVFSKOCu482Sqjp-nVPbs-ZVGpPal2ekbO1gUO_4,13781
|
|
37
37
|
onnxruntime_extensions/tools/pre_post_processing/steps/nlp.py,sha256=ZCxRNxqfANplxCe0I-6BfHziM1jDYJsNQKbHdM3Y1I0,15173
|
|
38
38
|
onnxruntime_extensions/tools/pre_post_processing/steps/vision.py,sha256=BM6CGylOSu4l6UarPfW0I2tgkJDa1Q-gYz__CxZle-k,53183
|
|
39
|
-
onnxruntime_extensions-0.
|
|
40
|
-
onnxruntime_extensions-0.
|
|
41
|
-
onnxruntime_extensions-0.
|
|
42
|
-
onnxruntime_extensions-0.
|
|
43
|
-
onnxruntime_extensions-0.
|
|
39
|
+
onnxruntime_extensions-0.14.0.dist-info/LICENSE,sha256=mQaUD2Gx8LUz-n2ZuvVReLKAj74RPqUd-_rYVyzNXys,1162
|
|
40
|
+
onnxruntime_extensions-0.14.0.dist-info/METADATA,sha256=udEUfhboh5qT4rtnEBg94FkkPmjjAYRJx9rWfg5ZrJg,4657
|
|
41
|
+
onnxruntime_extensions-0.14.0.dist-info/WHEEL,sha256=agy-BJge3afXwWznUXANATmKFW4eqelqRR0uf608A_0,99
|
|
42
|
+
onnxruntime_extensions-0.14.0.dist-info/top_level.txt,sha256=XyAgQDKyXsf6_0MJb58kRdHwigpTn7A7kl9diBEjs8M,23
|
|
43
|
+
onnxruntime_extensions-0.14.0.dist-info/RECORD,,
|
|
File without changes
|
{onnxruntime_extensions-0.13.0.dist-info → onnxruntime_extensions-0.14.0.dist-info}/top_level.txt
RENAMED
|
File without changes
|