imb 1.0.0-py3-none-any.whl → 1.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imb/__init__.py +0 -1
- imb/base.py +129 -0
- imb/inference_clients/__init__.py +0 -2
- imb/onnx.py +99 -0
- imb/triton.py +460 -0
- imb-1.0.1.dist-info/METADATA +105 -0
- imb-1.0.1.dist-info/RECORD +13 -0
- imb-1.0.0.dist-info/METADATA +0 -30
- imb-1.0.0.dist-info/RECORD +0 -10
- {imb-1.0.0.dist-info → imb-1.0.1.dist-info}/LICENSE +0 -0
- {imb-1.0.0.dist-info → imb-1.0.1.dist-info}/WHEEL +0 -0
- {imb-1.0.0.dist-info → imb-1.0.1.dist-info}/top_level.txt +0 -0
imb/__init__.py
CHANGED
@@ -1 +0,0 @@
-from .inference_clients import OnnxClient, TritonClient
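Since `imb/__init__.py` is emptied in 1.0.1, the package-level re-exports from 1.0.0 appear to be gone. A minimal sketch of how the clients would be imported against the new flat module layout added below (paths inferred from the files in this diff, not from upstream docs):

```
# 1.0.0 exposed the clients at package level:
#   from imb import OnnxClient, TritonClient
# with the emptied __init__.py in 1.0.1, imports presumably come from the new flat modules:
from imb.onnx import OnnxClient
from imb.triton import TritonClient
```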
imb/base.py
ADDED
@@ -0,0 +1,129 @@
+from typing import Dict, List, Optional, Tuple, Union
+import numpy as np
+import time
+import os
+
+
+class BaseClient:
+    def __init__(self, *args, **kwargs):
+        self.show_fps: bool = os.environ.get('SHOW_FPS') in {'yes', 'true'}
+        self.model_name = ''
+        self.fixed_batch = False
+        self.max_batch_size = 1
+        self.is_async = False
+        self.return_dict = True
+
+        self.inputs_names: List[str] = []
+        self.inputs_shapes: List[tuple] = []
+        self.np_inputs_dtypes: List[np.dtype] = []
+
+        self.outputs_names: List[str] = []
+        self.outputs_shapes: List[tuple] = []
+        self.np_outputs_dtypes: List[np.dtype] = []
+
+        self.sample_inputs: Optional[List[np.ndarray]] = None
+
+    def _load_model_params(self, *args, **kwargs):
+        raise NotImplementedError
+
+    def _create_input_sample(self):
+        if self.sample_inputs is not None:
+            # validate sample inputs
+            for sample_array, config_input_shape in zip(self.sample_inputs, self.inputs_shapes):
+                for i, (s_dim, t_dim) in enumerate(zip(sample_array.shape, config_input_shape)):
+                    if i == 0:
+                        if self.fixed_batch:
+                            assert s_dim == t_dim, \
+                                f'model support fixed batch size {t_dim}, \
+                                    sample_inputs has batch size {s_dim}'
+                        else:
+                            assert s_dim <= t_dim, \
+                                f'model support max batch size {t_dim}, \
+                                    sample_inputs has batch size {s_dim}'
+                        continue
+                    assert ((t_dim != -1) and (int(s_dim) == int(t_dim))) or t_dim == -1, \
+                        f'incorrect shape in sample_inputs {sample_array.shape}, must be {config_input_shape}'
+        else:
+            has_dynamic_shapes = any(
+                -1 in config_input_shape for config_input_shape in self.inputs_shapes
+            )
+            if has_dynamic_shapes:
+                return
+            self.sample_inputs = []
+            for config_input_shape, np_input_format in zip(self.inputs_shapes, self.np_inputs_dtypes):
+                self.sample_inputs.append(
+                    np.ones(config_input_shape).astype(np_input_format)
+                )
+
+    def _create_batches(self, *inputs_data: np.ndarray) -> Tuple[Dict[str, List[np.ndarray]], List[int]]:
+        inputs_batches = dict()
+        paddings = []
+        for input_data, np_format, input_name in zip(inputs_data, self.np_inputs_dtypes, self.inputs_names):
+            input_data = input_data.astype(np_format)
+            input_batches, input_paddings = self._split_on_batches(input_data)
+            if paddings == []:
+                paddings = input_paddings
+            inputs_batches[input_name] = input_batches
+        return inputs_batches, paddings
+
+    def log(self, text, warn=False, err=False):
+        text = f'Model ({self.model_name}) - {text}'
+        if err:
+            print('error', text)
+        elif warn:
+            print('warning', text)
+        else:
+            print('debug', text)
+
+    def warmup_model(self):
+        if self.sample_inputs is None:
+            print('Model was not warmed up, because sample_inputs didn\'t set or shape is dynamic and cannot auto generate')
+            return
+        exception = None
+        for _ in range(5):
+            try:
+                _ = self.__call__(*self.sample_inputs)
+                exception = None
+            except Exception as e:
+                print(f'{e} while warmup, repeat inference...')
+                exception = e
+                time.sleep(2)
+        if exception is not None:
+            raise exception
+
+    def pad_batch(self, batch: np.ndarray):
+        padding_size = self.max_batch_size - batch.shape[0]
+        if padding_size > 0:
+            pad = np.zeros([padding_size, *batch.shape[1:]], dtype=batch.dtype)
+            batch = np.concatenate((batch, pad), axis=0)
+        return batch, padding_size
+
+    def _split_on_batches(self, input_data: np.ndarray):
+        batches = []
+        paddings = []
+        for i in range(0, len(input_data), self.max_batch_size):
+            batch = input_data[i:i+self.max_batch_size]
+            batches.append(batch)
+            paddings.append(0)
+
+        if self.fixed_batch:
+            batches[-1], paddings[-1] = self.pad_batch(batches[-1])
+
+        return batches, paddings
+
+    def forward(self, *input_data):
+        raise NotImplementedError
+
+    def async_forward(self, *input_data):
+        raise NotImplementedError
+
+    def __call__(self, *args, **kwargs) -> Union[Dict[str, np.ndarray], List[np.ndarray]]:
+        t1 = time.time()
+        forward_func = self.async_forward if self.is_async else self.forward
+        output = forward_func(*args, **kwargs)
+        if self.return_dict is False:
+            output = [output[output_name] for output_name in self.outputs_names]
+        t2 = time.time()
+        if self.show_fps:
+            self.log(f'Model: {self.model_name} fps {int(len(args[0])/(t2-t1))}')
+        return output
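To make the batching helpers in `base.py` above concrete: `_split_on_batches` chunks the leading axis by `max_batch_size`, and with `fixed_batch` enabled only the last chunk is zero-padded, with the padding size recorded so outputs can be trimmed afterwards. A small sketch of that behaviour (the attribute values here are illustrative, not the defaults):

```
import numpy as np
from imb.base import BaseClient

client = BaseClient()
client.max_batch_size = 4   # illustrative; the default is 1
client.fixed_batch = True   # illustrative; the default is False

batches, paddings = client._split_on_batches(np.ones((10, 3), dtype=np.float32))
print([b.shape[0] for b in batches])  # [4, 4, 4] -> last chunk padded from 2 rows to 4
print(paddings)                       # [0, 0, 2] -> 2 padded rows to strip from the last output
```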
imb/onnx.py
ADDED
@@ -0,0 +1,99 @@
+from collections import defaultdict
+from typing import Dict, List
+from .base import BaseClient
+import onnxruntime as rt
+import numpy as np
+
+
+class OnnxClient(BaseClient):
+    def __init__(self, model_path: str,
+                 model_name: str,
+                 providers: List[str] = ['CUDAExecutionProvider', 'CPUExecutionProvider'],
+                 max_batch_size: int = 1,
+                 return_dict: bool = True,
+                 fixed_batch: bool = False,
+                 warmup: bool = False
+                 ):
+        super().__init__()
+        self.model_name = model_name
+        self.model_path = model_path
+        self.providers = providers
+        self.return_dict = return_dict
+        self.max_batch_size = max_batch_size
+        self.fixed_batch = fixed_batch
+
+        self._load_model_params(max_batch_size)
+
+        self.sample_inputs = [np.zeros((*shape,), dtype=dtype) for shape, dtype in zip(self.inputs_shapes, self.np_inputs_dtypes)]
+
+        if warmup:
+            self.warmup_model()
+
+    def _load_model_params(self, max_batch_size: int = 1):
+        """
+        Load model parameters from onnx model
+
+        Args:
+            max_batch_size (int, optional): max batch size. Defaults to 1.
+
+        Raises:
+            ValueError: not support dynamic batch
+        """
+        sess_options = rt.SessionOptions()
+        self.onnx_model = rt.InferenceSession(
+            self.model_path,
+            providers=self.providers,
+            sess_options=sess_options
+        )
+
+        model_inputs = self.onnx_model.get_inputs()
+        data_dtype = np.float16 if 'float16' in model_inputs[0].type else np.float32
+        self.inputs_names = [model_inputs[i].name for i in range(len(model_inputs))]
+        self.np_inputs_dtypes = [data_dtype for _ in range(len(self.inputs_names))]
+        self.inputs_shapes = [model_inputs[i].shape for i in range(len(model_inputs))]
+        for i_input, shape in enumerate(self.inputs_shapes):
+            new_shape = []
+            for i_dim, value in enumerate(shape):
+                if isinstance(value, int):
+                    if i_dim == 0:
+                        self.max_batch_size = value
+                        self.log(f'set batch size {value} from model metadata')
+                    new_shape.append(value)
+                elif isinstance(value, str) and 'batch' in value:
+                    new_shape.append(max_batch_size)
+                    self.log(f'set batch size {value} from user settings')
+                else:
+                    raise ValueError(f'not support value {value} in input shape {shape}')
+            self.inputs_shapes[i_input] = new_shape
+
+        model_outputs = self.onnx_model.get_outputs()
+        self.outputs_names = [model_outputs[i].name for i in range(len(model_outputs))]
+        self.np_outputs_dtypes = [data_dtype for _ in range(len(self.outputs_names))]
+
+    def forward(self, *inputs_data: np.ndarray) -> Dict[str, np.ndarray]:
+        inputs_batches, batches_paddings = self._create_batches(*inputs_data)
+
+        result = defaultdict(list)
+        count_batches = len(next(iter(inputs_batches.values())))
+
+        for i_batch in range(count_batches):
+            batch = dict()
+            for input_name, np_dtype in zip(self.inputs_names, self.np_inputs_dtypes):
+                batch[input_name] = inputs_batches[input_name][i_batch].astype(np_dtype)
+
+            batch_result = self.onnx_model.run(self.outputs_names, batch)
+            batch_result = {
+                self.outputs_names[i]: batch_result[i].astype(self.np_outputs_dtypes[i])
+                for i in range(len(self.outputs_names))
+            }
+
+            padding_size = batches_paddings[i_batch]
+            for output_name, output_value in batch_result.items():
+                result[output_name].append(
+                    output_value if padding_size == 0 else output_value[:-padding_size]
+                )
+
+        for output_name, output_values in result.items():
+            result[output_name] = np.concatenate(output_values)
+
+        return result
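One detail of `OnnxClient._load_model_params` worth calling out when reading the diff: symbolic input dimensions are accepted only when their name contains 'batch' (they are replaced by the requested `max_batch_size`); any other free dimension raises `ValueError`. A hypothetical sketch, assuming a local `model.onnx` whose input is declared as `['batch_size', 3, 224, 224]` (the path and dim names are illustrative):

```
from imb.onnx import OnnxClient

client = OnnxClient(model_path='model.onnx', model_name='demo', max_batch_size=8)
print(client.inputs_shapes)  # [[8, 3, 224, 224]] -> 'batch_size' resolved to max_batch_size

# a model declared as ['batch_size', 3, 'height', 'width'] would instead fail at init
# with ValueError('not support value height in input shape ...').
```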
imb/triton.py
ADDED
@@ -0,0 +1,460 @@
+from collections import defaultdict
+from typing import Any, Dict, List, Literal, Optional, Tuple
+import tritonclient.http as httpclient
+import tritonclient.grpc as grpcclient
+import tritonclient.utils.cuda_shared_memory as cudashm
+from google.protobuf.json_format import MessageToJson
+from tritonclient import utils
+from .base import BaseClient
+import numpy as np
+import json
+import time
+
+
+class TritonClient(BaseClient):
+    def __init__(self, url: str,
+                 model_name: str,
+                 max_batch_size: int = 0,
+                 sample_inputs: Optional[List[np.ndarray]] = None,
+                 timeout: int = 10,
+                 resend_count: int = 10,
+                 fixed_batch: bool = True,
+                 is_async: bool = False,
+                 cuda_shm: bool = False,
+                 max_shm_regions: int = 2,
+                 scheme: Literal["http", "grpc"] = "http",
+                 return_dict: bool = True,
+                 warmup: bool = False
+                 ):
+        super().__init__()
+        self.model_name = model_name
+        self.scheme = scheme
+        self.client_module = httpclient if scheme == "http" else grpcclient
+        self.url = url
+        self.is_async = is_async
+        self.cuda_shm = cuda_shm
+        self.triton_timeout = timeout
+        self.resend_count = resend_count
+        self.max_shm_regions = max_shm_regions
+        self.return_dict = return_dict
+
+        self.triton_client = None
+        self._init_triton()
+
+        self.triton_inputs_dtypes = None
+        self.np_inputs_dtypes = None
+
+        self.inputs_shapes = None
+        self.fixed_batch = fixed_batch
+
+        self.inputs_names = None
+        self.outputs_names = None
+
+        self.sample_inputs = sample_inputs
+
+        self._load_model_params(max_batch_size)
+        self._create_input_sample()
+        if warmup:
+            self.warmup_model()
+
+        self.input_shm_handles = [None for _ in range(len(self.inputs_names))]
+        self.output_shm_handles = [None for _ in range(len(self.outputs_names))]
+
+        if self.cuda_shm:
+            assert is_async == False and fixed_batch == True
+            self._fill_output_dynamic_axis()
+            self._create_input_output_shm_handles()
+            self._register_cuda_shm_regions()
+
+    def io_summary(self):
+        return {
+            "model_name": self.model_name,
+            "url": self.url,
+            "scheme": self.scheme,
+
+            "inputs_shapes": self.inputs_shapes,
+            "inputs_names": self.inputs_names,
+            "triton_inputs_dtypes": self.triton_inputs_dtypes,
+            "np_inputs_dtypes": self.np_inputs_dtypes,
+
+            "outputs_shapes": self.outputs_shapes,
+            "outputs_names": self.outputs_names,
+            "triton_outputs_dtypes": self.triton_outputs_dtypes,
+            "np_outputs_dtypes": self.np_outputs_dtypes,
+
+            "fixed_batch": self.fixed_batch,
+            "async": self.is_async,
+            "cuda_shm": self.cuda_shm,
+            "max_shm_regions": self.max_shm_regions,
+        }
+
+    def _init_triton(self):
+        if self.triton_client is not None:
+            # reinit
+            self.triton_client.close()
+            time.sleep(3)
+
+        self.triton_client = self.client_module.InferenceServerClient(
+            url=self.url,
+            verbose=False,
+            ssl=False,
+            network_timeout=self.triton_timeout,
+            connection_timeout=self.triton_timeout
+        )
+
+    def _load_model_params(self, user_max_batch_size: int) -> None:
+        """
+        Load the model config from Triton Inference Server and update the class attributes.
+
+        Args:
+            user_max_batch_size (int): max_batch_size defined by user
+        """
+        if self.scheme == "grpc":
+            config = self.triton_client.get_model_config(self.model_name, as_json=True)
+            config = config["config"]
+        else:
+            config = self.triton_client.get_model_config(self.model_name)
+
+        self.triton_inputs_dtypes, self.np_inputs_dtypes, \
+            self.inputs_shapes, self.inputs_names \
+                = self._parse_io_params(config['input'])
+
+        self.triton_outputs_dtypes, self.np_outputs_dtypes, \
+            self.outputs_shapes, self.outputs_names \
+                = self._parse_io_params(config['output'])
+
+        not_support_dynamic_batch = config['max_batch_size'] == 0
+        if not_support_dynamic_batch:
+            # use batch size from config
+            self.max_batch_size = config['input'][0]['dims'][0]
+            self.fixed_batch = True
+        else:
+            # user can decrease max_batch_size from config
+            if user_max_batch_size > 0:
+                self.max_batch_size = min(config['max_batch_size'], user_max_batch_size)
+            else:
+                self.max_batch_size = config['max_batch_size']
+            # in config's shape has no batch size
+            self.inputs_shapes = self._insert_batch_size_to_shapes(
+                self.inputs_shapes, self.max_batch_size
+            )
+            self.outputs_shapes = self._insert_batch_size_to_shapes(
+                self.outputs_shapes, self.max_batch_size
+            )
+
+    def _fill_output_dynamic_axis(self) -> None:
+        """
+        Fill real values in the dynamic axis of the output shapes.
+        """
+        has_dynamic_shapes = any(
+            -1 in output_shape for output_shape in self.outputs_shapes
+        )
+        if has_dynamic_shapes:
+            start_cuda_shm_flag = self.cuda_shm
+            self.cuda_shm = False
+            outputs = self.forward(*self.sample_inputs)
+            self.outputs_shapes = [
+                list(outputs[output_name].shape) for output_name in self.outputs_names
+            ]
+            self.cuda_shm = start_cuda_shm_flag
+
+    @staticmethod
+    def _parse_io_params(io_params: List[Dict]) -> Tuple[List[str], List[np.dtype], List[List[int]], List[str]]:
+        """
+        Parse the input/output parameters from the model config.
+
+        Args:
+            io_params (List[Dict]): The input/output parameters.
+
+        Returns:
+            Tuple[List[str], List[np.dtype], List[List[int]], List[str]]: The input/output dtypes, shapes, and names.
+        """
+        triton_dtypes = []
+        np_dtypes = []
+        shapes = []
+        names = []
+        for params in io_params:
+            triton_dtypes.append(params['data_type'].replace('TYPE_', ''))
+            np_dtypes.append(utils.triton_to_np_dtype(triton_dtypes[-1]))
+            shapes.append(params['dims'])
+            names.append(params['name'])
+
+        return triton_dtypes, np_dtypes, shapes, names
+
+    @staticmethod
+    def _insert_batch_size_to_shapes(shapes: List[List], insert_batch_size: int) -> List[List[int]]:
+        """
+        Insert the batch size to the shapes.
+
+        Args:
+            shapes (List[List]): Shapes from config
+            insert_batch_size (int): Value for insert batch size to shape
+
+        Returns:
+            List[List[int]]: Result shape
+        """
+        return [[insert_batch_size] + shape for shape in shapes]
+
+    def _generate_shm_name(self, ioname: str) -> str:
+        """
+        Generate shared region name
+
+        Args:
+            ioname (str): Input/output name
+
+        Returns:
+            str: Shared region name
+        """
+        return f'{self.model_name}_{ioname}_{time.time()}'
+
+    def _get_old_regions_names(self, regions_statuses: list, new_triton_shm_name: str) -> List[str]:
+        """
+        Get old regions names for unregister
+
+        Args:
+            regions_statuses (list): response of get_cuda_shared_memory_status from triton
+            new_triton_shm_name (str): name of new region
+
+        Returns:
+            List[str]: old regions names for unregister
+        """
+        i_sep = len(new_triton_shm_name) - new_triton_shm_name[::-1].index('_') - 1
+        region_name = new_triton_shm_name[:i_sep]
+        registrated_regions = [
+            (region['name'], float(region['name'][i_sep+1:]))
+            for region in regions_statuses if region['name'].startswith(region_name)
+        ]
+        registrated_regions.sort(key=lambda x: x[1])
+        count_old_regions = len(registrated_regions) - self.max_shm_regions + 1
+        old_regions = []
+        if count_old_regions > 0:
+            old_regions = [name for name, _ in registrated_regions[:count_old_regions]]
+        return old_regions
+
+    def _register_cuda_shm_regions(self):
+        """
+        Register CUDA shared memory regions in Triton
+        """
+        if self.scheme == "grpc":
+            regions_statuses = self.triton_client.get_cuda_shared_memory_status(as_json=True)['regions']
+        else:
+            regions_statuses = self.triton_client.get_cuda_shared_memory_status()
+
+        for shm_handle in self.input_shm_handles + self.output_shm_handles:
+            old_regions_names = self._get_old_regions_names(regions_statuses, shm_handle._triton_shm_name)
+            for old_region_name in old_regions_names:
+                self.triton_client.unregister_cuda_shared_memory(old_region_name)
+            self.triton_client.register_cuda_shared_memory(
+                shm_handle._triton_shm_name, cudashm.get_raw_handle(shm_handle), 0, shm_handle._byte_size
+            )
+
+    def _create_cuda_shm_handle(self, shape: List[int], dtype: np.dtype, name: str) -> Any:
+        """
+        Create CUDA shared memory handle
+
+        Args:
+            shape (List[int]): Shape of cuda shared memory region
+            dtype (np.dtype): Data type of input/output data
+            name (str): Input/output name
+
+        Returns:
+            Any: CUDA shared memory handle
+        """
+        byte_size = int(np.prod(shape) * np.dtype(dtype).itemsize)
+        shm_name = self._generate_shm_name(name)
+        return cudashm.create_shared_memory_region(shm_name, byte_size, 0)
+
+    def _create_cuda_shm_handles_for_io(self, shapes: List[List[int]],
+                                        dtypes: List[np.dtype],
+                                        names: List[str]) -> List[Any]:
+        """
+        Create CUDA shared memory handles for inputs or outputs
+
+        Args:
+            shapes (List[List[int]]): Shapes of cuda shared memory regions
+            dtypes (List[np.dtype]): Data types of input/output data
+            names (List[str]): Input/output names
+
+        Returns:
+            List[Any]: CUDA shared memory handles
+        """
+        return [self._create_cuda_shm_handle(shape, dtype, name)
+                for shape, dtype, name in zip(shapes, dtypes, names)]
+
+    def _create_input_output_shm_handles(self) -> None:
+        """
+        Create CUDA shared memory handles for inputs and outputs
+        """
+        self.input_shm_handles = self._create_cuda_shm_handles_for_io(
+            self.inputs_shapes, self.np_inputs_dtypes, self.inputs_names
+        )
+        self.output_shm_handles = self._create_cuda_shm_handles_for_io(
+            self.outputs_shapes, self.np_outputs_dtypes, self.outputs_names
+        )
+
+    def _create_triton_input(self, input_data: np.ndarray, input_name: str,
+                             config_input_format: str, shm_handle = None) -> Any:
+        """
+        Create triton InferInput
+
+        Args:
+            input_data (np.ndarray): data for send to model
+            input_name (str): name of input
+            config_input_format (str): triton input format
+            shm_handle (_type_, optional): CUDA shared memory handle. Defaults to None.
+
+        Returns:
+            Any: triton InferInput for sending request
+        """
+        infer_input = self.client_module.InferInput(input_name, input_data.shape, config_input_format)
+        if self.cuda_shm:
+            cudashm.set_shared_memory_region(shm_handle, [input_data])
+            infer_input.set_shared_memory(shm_handle._triton_shm_name, shm_handle._byte_size)
+        else:
+            infer_input.set_data_from_numpy(input_data)
+        return infer_input
+
+    def _create_triton_output(self, output_name: str, binary: bool = True, shm_handle = None) -> Any:
+        """
+        Create triton InferRequestedOutput
+
+        Args:
+            output_name (str): output name
+            binary (bool, optional): Whether the output is binary. Defaults to True.
+            shm_handle (_type_, optional): CUDA shared memory handle. Defaults to None.
+
+        Returns:
+            Any: triton InferRequestedOutput for receiving response
+        """
+        if self.scheme == "grpc":
+            infer_output = self.client_module.InferRequestedOutput(output_name)
+        else:
+            infer_output = self.client_module.InferRequestedOutput(output_name, binary_data=binary)
+        if self.cuda_shm:
+            infer_output.set_shared_memory(shm_handle._triton_shm_name, shm_handle._byte_size)
+        return infer_output
+
+    def _postprocess_triton_result(self, triton_response: Any, padding_size: int) -> Dict[str, np.ndarray]:
+        """
+        Postprocess triton response.
+
+        Args:
+            triton_response (Any): triton response
+            padding_size (int): padding size for unpad output data
+
+        Returns:
+            Dict[str, np.ndarray]: dict of output name and output data
+        """
+        result = dict()
+        for output_name, shm_op_handle in zip(self.outputs_names, self.output_shm_handles):
+            if self.cuda_shm:
+                if self.scheme == "grpc":
+                    # output = triton_response.get_output(output_name, as_json=True) # WARN: bug in tritonclient library, return None
+                    output = json.loads(MessageToJson(triton_response.get_output(output_name)))
+                else:
+                    output = triton_response.get_output(output_name)
+                result[output_name] = cudashm.get_contents_as_numpy(
+                    shm_op_handle,
+                    utils.triton_to_np_dtype(output["datatype"]),
+                    output["shape"],
+                )
+            else:
+                result[output_name] = triton_response.as_numpy(output_name)
+
+            if padding_size != 0:
+                result[output_name] = result[output_name][:-padding_size]
+
+        return result
+
+    def forward(self, *inputs_data: np.ndarray) -> Dict[str, np.ndarray]:
+        assert len(inputs_data) == len(self.inputs_names), 'inputs number is not equal to model inputs'
+        inputs_batches, batches_paddings = self._create_batches(*inputs_data)
+
+        result = defaultdict(list)
+        count_batches = len(next(iter(inputs_batches.values())))
+
+        for i_batch in range(count_batches):
+            triton_inputs = []
+            for input_name, config_input_format, shm_ip_handle in \
+                    zip(self.inputs_names, self.triton_inputs_dtypes, self.input_shm_handles):
+                triton_input = self._create_triton_input(
+                    inputs_batches[input_name][i_batch], input_name, config_input_format, shm_ip_handle
+                )
+                triton_inputs.append(triton_input)
+
+            triton_outputs = []
+            for output_name, shm_op_handle in zip(self.outputs_names, self.output_shm_handles):
+                triton_output = self._create_triton_output(
+                    output_name, binary=True, shm_handle=shm_op_handle
+                )
+                triton_outputs.append(triton_output)
+
+            triton_response = self.triton_client.infer(
+                model_name=self.model_name,
+                inputs=triton_inputs,
+                outputs=triton_outputs
+            )
+
+            batch_result = self._postprocess_triton_result(triton_response, batches_paddings[i_batch])
+
+            for output_name, output_value in batch_result.items():
+                result[output_name].append(output_value)
+
+        for output_name, output_values in result.items():
+            result[output_name] = np.concatenate(output_values)
+
+        return result
+
+    def send_async_requests(self, inputs_batches: Dict):
+        count_batches = len(next(iter(inputs_batches.values())))
+
+        triton_response_handles = []
+
+        for i_batch in range(count_batches):
+            triton_inputs = []
+            for input_name, config_input_format, shm_ip_handle in \
+                    zip(self.inputs_names, self.triton_inputs_dtypes, self.input_shm_handles):
+                triton_input = self._create_triton_input(
+                    inputs_batches[input_name][i_batch], input_name, config_input_format, shm_ip_handle
+                )
+                triton_inputs.append(triton_input)
+
+            triton_outputs = []
+            for output_name, shm_op_handle in zip(self.outputs_names, self.output_shm_handles):
+                triton_output = self._create_triton_output(
+                    output_name, binary=True, shm_handle=shm_op_handle
+                )
+                triton_outputs.append(triton_output)
+
+            triton_response_handle = self.triton_client.async_infer(
+                model_name=self.model_name,
+                inputs=triton_inputs,
+                outputs=triton_outputs
+            )
+            triton_response_handles.append(triton_response_handle)
+
+        return triton_response_handles
+
+    def get_async_results(self, triton_response_handles, batches_paddings):
+        result = defaultdict(list)
+        for i_batch, triton_response_handle in enumerate(triton_response_handles):
+            triton_response = triton_response_handle.get_result()
+            batch_result = self._postprocess_triton_result(triton_response, batches_paddings[i_batch])
+
+            for output_name, output_value in batch_result.items():
+                result[output_name].append(output_value)
+
+        for output_name, output_values in result.items():
+            result[output_name] = np.concatenate(output_values)
+
+        return result
+
+    def async_forward(self, *inputs_data: np.ndarray):
+        assert len(inputs_data) == len(self.inputs_names), 'inputs number is not equal to model inputs'
+        inputs_batches, batches_paddings = self._create_batches(*inputs_data)
+
+        triton_response_handles = self.send_async_requests(inputs_batches)
+
+        result = self.get_async_results(triton_response_handles, batches_paddings)
+
+        return result
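For readers puzzling over the CUDA shared-memory bookkeeping in `triton.py`: regions are named `<model>_<io name>_<timestamp>`, and `_get_old_regions_names` keeps only the newest `max_shm_regions` regions per input/output, returning the rest for unregistration. A rough sketch of that policy in isolation (no live Triton server; `types.SimpleNamespace` stands in for the client instance, and importing `imb.triton` assumes `tritonclient[all]` with its CUDA shared-memory extension is available):

```
import types
from imb.triton import TritonClient

fake_self = types.SimpleNamespace(max_shm_regions=2)
statuses = [{'name': 'arcface_input_100.0'}, {'name': 'arcface_input_200.0'}]

old = TritonClient._get_old_regions_names(fake_self, statuses, 'arcface_input_300.0')
print(old)  # ['arcface_input_100.0'] -> oldest region is dropped before the new one is registered
```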
imb-1.0.1.dist-info/METADATA
ADDED
@@ -0,0 +1,105 @@
+Metadata-Version: 2.2
+Name: imb
+Version: 1.0.1
+Summary: Python library for run inference of deep learning models in different backends
+Home-page: https://github.com/TheConstant3/InferenceMultiBackend
+Author: p-constant
+Author-email: nikshorop@gmail.com
+Classifier: Programming Language :: Python :: 3.8
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: numpy
+Provides-Extra: triton
+Requires-Dist: tritonclient[all]>=2.38.0; extra == "triton"
+Provides-Extra: onnxcpu
+Requires-Dist: onnxruntime>=1.16.0; extra == "onnxcpu"
+Provides-Extra: onnxgpu
+Requires-Dist: onnxruntime-gpu>=1.16.0; extra == "onnxgpu"
+Provides-Extra: all
+Requires-Dist: tritonclient[all]>=2.38.0; extra == "all"
+Requires-Dist: onnxruntime>=1.16.0; extra == "all"
+Requires-Dist: onnxruntime-gpu>=1.16.0; extra == "all"
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
+
+# InferenceMultiBackend
+
+Python library for run inference of deep learning models in different backends
+
+## Installation
+
+For use triton inference client:
+```pip install imb[triton]```
+
+For use onnxruntime-gpu client:
+```pip install imb[onnxgpu]```
+
+For use onnxruntime client:
+```pip install imb[onnxcpu]```
+
+For support all implemented clients:
+```pip install imb[all]```
+
+## Usage
+
+OnnxClient usage example
+```
+onnx_client = OnnxClient(
+    model_path='model.onnx',
+    model_name='any name',
+    providers=['CUDAExecutionProvider', 'CPUExecutionProvider'],
+    max_batch_size=16,
+    return_dict=True,
+    fixed_batch=True,
+    warmup=True
+)
+# if model has fixed input size (except batch size) then sample_inputs will be created
+sample_inputs = onnx_client.sample_inputs
+print('inputs shapes', [o.shape for o in sample_inputs])
+outputs = onnx_client(*sample_inputs)
+print('outputs shapes', [(o_name, o_value.shape) for o_name, o_value in outputs.items()])
+```
+
+TritonClient usage example
+```
+triton_client = TritonClient(
+    url='localhost:8000',
+    model_name='arcface',
+    max_batch_size=16,
+    timeout=10,
+    resend_count=10,
+    fixed_batch=True,
+    is_async=False,
+    cuda_shm=False,
+    max_shm_regions=2,
+    scheme='http',
+    return_dict=True,
+    warmup=False
+)
+# if model has fixed input size (except batch size) then sample_inputs will be created
+sample_inputs = triton_client.sample_inputs
+print('inputs shapes', [o.shape for o in sample_inputs])
+outputs = triton_client(*sample_inputs)
+print('outputs shapes', [(o_name, o_value.shape) for o_name, o_value in outputs.items()])
+```
+
+## Notes
+
+max_batch_size - maximum batch size for inference. If input data larger that max_batch_size, then input data will be splitted to several batches.
+
+fixed_batch - if fixed batch is True, then each batch will have fixed size (padding the smallest batch to max_batch_size).
+
+warmup - if True, model will run several calls on sample_inputs while initialization.
+
+return_dict - if True, __call__ return dict {'output_name1': output_value1, ...}, else [output_value1, ...]
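To make the `return_dict` note above concrete, a small sketch continuing the README's `onnx_client` example (the output name `embedding` is purely illustrative; real keys come from the model):

```
# return_dict=True (default): outputs keyed by output name
outputs = onnx_client(*sample_inputs)          # e.g. {'embedding': array(...), ...}

# return_dict=False: a plain list ordered like onnx_client.outputs_names
onnx_client.return_dict = False
first_output, *rest = onnx_client(*sample_inputs)
```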
imb-1.0.1.dist-info/RECORD
ADDED
@@ -0,0 +1,13 @@
+imb/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+imb/base.py,sha256=oBmiTu4rHgzED5kCxKPvS9e3PhI229Pj5lxuPm7ep6M,5189
+imb/onnx.py,sha256=g3vQBJPeln0YUOQ1X9RjZce8AAi-7SXntLyevOZZdG8,4100
+imb/triton.py,sha256=hdnCtDjoRAl_Ss49_ayvW3-VhsYcY2MbNqh3ax6y-18,18629
+imb/inference_clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+imb/inference_clients/base.py,sha256=oBmiTu4rHgzED5kCxKPvS9e3PhI229Pj5lxuPm7ep6M,5189
+imb/inference_clients/onnx.py,sha256=g3vQBJPeln0YUOQ1X9RjZce8AAi-7SXntLyevOZZdG8,4100
+imb/inference_clients/triton.py,sha256=hdnCtDjoRAl_Ss49_ayvW3-VhsYcY2MbNqh3ax6y-18,18629
+imb-1.0.1.dist-info/LICENSE,sha256=pAZXnNE2dxxwXFIduGyn1gpvPefJtUYOYZOi3yeGG94,1068
+imb-1.0.1.dist-info/METADATA,sha256=5x0Xa-Gbg8D8e0mNMzKZB4l27xwNlKfyUXzDwQfkJUA,3240
+imb-1.0.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+imb-1.0.1.dist-info/top_level.txt,sha256=kY8Fp1i_MzTZhuoVhVexG762D8HBd-THfX_lfw4EZmY,4
+imb-1.0.1.dist-info/RECORD,,
imb-1.0.0.dist-info/METADATA
DELETED
@@ -1,30 +0,0 @@
-Metadata-Version: 2.2
-Name: imb
-Version: 1.0.0
-Summary: Python library for run inference of deep learning models in different backends
-Home-page: https://github.com/TheConstant3/InferenceMultiBackend
-Author: p-constant
-Author-email: nikshorop@gmail.com
-Classifier: Programming Language :: Python :: 3.8
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Operating System :: OS Independent
-Requires-Python: >=3.8
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: onnxruntime-gpu>=1.16.0
-Requires-Dist: tritonclient[all]>=2.38.0
-Requires-Dist: numpy>=1.19.4
-Dynamic: author
-Dynamic: author-email
-Dynamic: classifier
-Dynamic: description
-Dynamic: description-content-type
-Dynamic: home-page
-Dynamic: requires-dist
-Dynamic: requires-python
-Dynamic: summary
-
-# InferenceMultiBackend
-
-Python library for run inference of deep learning models in different backends
-
imb-1.0.0.dist-info/RECORD
DELETED
@@ -1,10 +0,0 @@
-imb/__init__.py,sha256=8XoaonMp09UWmynubLMIu2bln41iKgIdWj-wxgsQjnk,55
-imb/inference_clients/__init__.py,sha256=Glv4yD0QdtZmCOiYFbILSl90VhxdwvPoH9gFczHlVFk,61
-imb/inference_clients/base.py,sha256=oBmiTu4rHgzED5kCxKPvS9e3PhI229Pj5lxuPm7ep6M,5189
-imb/inference_clients/onnx.py,sha256=g3vQBJPeln0YUOQ1X9RjZce8AAi-7SXntLyevOZZdG8,4100
-imb/inference_clients/triton.py,sha256=hdnCtDjoRAl_Ss49_ayvW3-VhsYcY2MbNqh3ax6y-18,18629
-imb-1.0.0.dist-info/LICENSE,sha256=pAZXnNE2dxxwXFIduGyn1gpvPefJtUYOYZOi3yeGG94,1068
-imb-1.0.0.dist-info/METADATA,sha256=NZcJPx91mzPg4Zo9FZxlMQE4c6zB2s_yPVhhRVxPBzM,898
-imb-1.0.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-imb-1.0.0.dist-info/top_level.txt,sha256=kY8Fp1i_MzTZhuoVhVexG762D8HBd-THfX_lfw4EZmY,4
-imb-1.0.0.dist-info/RECORD,,
{imb-1.0.0.dist-info → imb-1.0.1.dist-info}/LICENSE
File without changes
{imb-1.0.0.dist-info → imb-1.0.1.dist-info}/WHEEL
File without changes
{imb-1.0.0.dist-info → imb-1.0.1.dist-info}/top_level.txt
File without changes