imb-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
imb/__init__.py ADDED
@@ -0,0 +1 @@
+ from .inference_clients import OnnxClient, TritonClient
imb/inference_clients/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .onnx import OnnxClient
+ from .triton import TritonClient
imb/inference_clients/base.py ADDED
@@ -0,0 +1,129 @@
+ from typing import Dict, List, Optional, Tuple, Union
+ import numpy as np
+ import time
+ import os
+
+
+ class BaseClient:
+     def __init__(self, *args, **kwargs):
+         self.show_fps: bool = os.environ.get('SHOW_FPS') in {'yes', 'true'}
+         self.model_name = ''
+         self.fixed_batch = False
+         self.max_batch_size = 1
+         self.is_async = False
+         self.return_dict = True
+
+         self.inputs_names: List[str] = []
+         self.inputs_shapes: List[tuple] = []
+         self.np_inputs_dtypes: List[np.dtype] = []
+
+         self.outputs_names: List[str] = []
+         self.outputs_shapes: List[tuple] = []
+         self.np_outputs_dtypes: List[np.dtype] = []
+
+         self.sample_inputs: Optional[List[np.ndarray]] = None
+
+     def _load_model_params(self, *args, **kwargs):
+         raise NotImplementedError
+
+     def _create_input_sample(self):
+         if self.sample_inputs is not None:
+             # validate user-provided sample inputs against the model's input shapes
+             for sample_array, config_input_shape in zip(self.sample_inputs, self.inputs_shapes):
+                 for i, (s_dim, t_dim) in enumerate(zip(sample_array.shape, config_input_shape)):
+                     if i == 0:
+                         if self.fixed_batch:
+                             assert s_dim == t_dim, \
+                                 f'model supports fixed batch size {t_dim}, ' \
+                                 f'sample_inputs has batch size {s_dim}'
+                         else:
+                             assert s_dim <= t_dim, \
+                                 f'model supports max batch size {t_dim}, ' \
+                                 f'sample_inputs has batch size {s_dim}'
+                         continue
+                     assert t_dim == -1 or int(s_dim) == int(t_dim), \
+                         f'incorrect shape in sample_inputs {sample_array.shape}, must be {config_input_shape}'
+         else:
+             has_dynamic_shapes = any(
+                 -1 in config_input_shape for config_input_shape in self.inputs_shapes
+             )
+             if has_dynamic_shapes:
+                 return
+             self.sample_inputs = []
+             for config_input_shape, np_input_format in zip(self.inputs_shapes, self.np_inputs_dtypes):
+                 self.sample_inputs.append(
+                     np.ones(config_input_shape).astype(np_input_format)
+                 )
+
+     def _create_batches(self, *inputs_data: np.ndarray) -> Tuple[Dict[str, List[np.ndarray]], List[int]]:
+         inputs_batches = dict()
+         paddings = []
+         for input_data, np_format, input_name in zip(inputs_data, self.np_inputs_dtypes, self.inputs_names):
+             input_data = input_data.astype(np_format)
+             input_batches, input_paddings = self._split_on_batches(input_data)
+             if paddings == []:
+                 paddings = input_paddings
+             inputs_batches[input_name] = input_batches
+         return inputs_batches, paddings
+
+     def log(self, text, warn=False, err=False):
+         text = f'Model ({self.model_name}) - {text}'
+         if err:
+             print('error', text)
+         elif warn:
+             print('warning', text)
+         else:
+             print('debug', text)
+
+     def warmup_model(self):
+         if self.sample_inputs is None:
+             print('Model was not warmed up: sample_inputs was not set, or the shape is dynamic and a sample cannot be auto-generated')
+             return
+         exception = None
+         for _ in range(5):
+             try:
+                 _ = self.__call__(*self.sample_inputs)
+                 exception = None
+             except Exception as e:
+                 print(f'{e} during warmup, retrying inference...')
+                 exception = e
+                 time.sleep(2)
+         if exception is not None:
+             raise exception
+
+     def pad_batch(self, batch: np.ndarray):
+         padding_size = self.max_batch_size - batch.shape[0]
+         if padding_size > 0:
+             pad = np.zeros([padding_size, *batch.shape[1:]], dtype=batch.dtype)
+             batch = np.concatenate((batch, pad), axis=0)
+         return batch, padding_size
+
+     def _split_on_batches(self, input_data: np.ndarray):
+         batches = []
+         paddings = []
+         for i in range(0, len(input_data), self.max_batch_size):
+             batch = input_data[i:i + self.max_batch_size]
+             batches.append(batch)
+             paddings.append(0)
+
+         if self.fixed_batch:
+             batches[-1], paddings[-1] = self.pad_batch(batches[-1])
+
+         return batches, paddings
+
+     def forward(self, *input_data):
+         raise NotImplementedError
+
+     def async_forward(self, *input_data):
+         raise NotImplementedError
+
+     def __call__(self, *args, **kwargs) -> Union[Dict[str, np.ndarray], List[np.ndarray]]:
+         t1 = time.time()
+         forward_func = self.async_forward if self.is_async else self.forward
+         output = forward_func(*args, **kwargs)
+         if self.return_dict is False:
+             output = [output[output_name] for output_name in self.outputs_names]
+         t2 = time.time()
+         if self.show_fps:
+             self.log(f'Model: {self.model_name} fps {int(len(args[0]) / (t2 - t1))}')
+         return output
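
The batching flow above (`_create_batches` splits inputs into batches, `pad_batch` zero-pads the last one when `fixed_batch` is set, and `__call__` dispatches to `forward`) is easiest to see with a toy subclass. The `EchoClient` below is a hypothetical sketch for illustration only, not part of the package; it doubles its input in place of real inference.

```python
import numpy as np
from imb.inference_clients.base import BaseClient


class EchoClient(BaseClient):
    """Hypothetical subclass showing how __call__ drives batching and un-padding."""

    def __init__(self):
        super().__init__()
        self.model_name = 'echo'
        self.max_batch_size = 4
        self.fixed_batch = True                 # last batch is zero-padded to 4 rows
        self.inputs_names = ['input']
        self.np_inputs_dtypes = [np.float32]
        self.outputs_names = ['output']

    def forward(self, *inputs_data):
        batches, paddings = self._create_batches(*inputs_data)
        outputs = []
        for batch, padding in zip(batches['input'], paddings):
            out = batch * 2                      # stand-in for real inference
            outputs.append(out if padding == 0 else out[:-padding])
        return {'output': np.concatenate(outputs)}


client = EchoClient()
result = client(np.ones((6, 3), dtype=np.float32))  # split into batches of 4 and 2 (padded)
print(result['output'].shape)                        # (6, 3)
```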
imb/inference_clients/onnx.py ADDED
@@ -0,0 +1,99 @@
+ from collections import defaultdict
+ from typing import Dict, List
+ from .base import BaseClient
+ import onnxruntime as rt
+ import numpy as np
+
+
+ class OnnxClient(BaseClient):
+     def __init__(self, model_path: str,
+                  model_name: str,
+                  providers: List[str] = ['CUDAExecutionProvider', 'CPUExecutionProvider'],
+                  max_batch_size: int = 1,
+                  return_dict: bool = True,
+                  fixed_batch: bool = False,
+                  warmup: bool = False
+                  ):
+         super().__init__()
+         self.model_name = model_name
+         self.model_path = model_path
+         self.providers = providers
+         self.return_dict = return_dict
+         self.max_batch_size = max_batch_size
+         self.fixed_batch = fixed_batch
+
+         self._load_model_params(max_batch_size)
+
+         self.sample_inputs = [np.zeros((*shape,), dtype=dtype) for shape, dtype in zip(self.inputs_shapes, self.np_inputs_dtypes)]
+
+         if warmup:
+             self.warmup_model()
+
+     def _load_model_params(self, max_batch_size: int = 1):
+         """
+         Load model parameters from the ONNX model.
+
+         Args:
+             max_batch_size (int, optional): max batch size. Defaults to 1.
+
+         Raises:
+             ValueError: unsupported dynamic axis in the input shape
+         """
+         sess_options = rt.SessionOptions()
+         self.onnx_model = rt.InferenceSession(
+             self.model_path,
+             providers=self.providers,
+             sess_options=sess_options
+         )
+
+         model_inputs = self.onnx_model.get_inputs()
+         data_dtype = np.float16 if 'float16' in model_inputs[0].type else np.float32
+         self.inputs_names = [model_inputs[i].name for i in range(len(model_inputs))]
+         self.np_inputs_dtypes = [data_dtype for _ in range(len(self.inputs_names))]
+         self.inputs_shapes = [model_inputs[i].shape for i in range(len(model_inputs))]
+         for i_input, shape in enumerate(self.inputs_shapes):
+             new_shape = []
+             for i_dim, value in enumerate(shape):
+                 if isinstance(value, int):
+                     if i_dim == 0:
+                         self.max_batch_size = value
+                         self.log(f'set batch size {value} from model metadata')
+                     new_shape.append(value)
+                 elif isinstance(value, str) and 'batch' in value:
+                     new_shape.append(max_batch_size)
+                     self.log(f'set batch size {max_batch_size} from user settings')
+                 else:
+                     raise ValueError(f'unsupported value {value} in input shape {shape}')
+             self.inputs_shapes[i_input] = new_shape
+
+         model_outputs = self.onnx_model.get_outputs()
+         self.outputs_names = [model_outputs[i].name for i in range(len(model_outputs))]
+         self.np_outputs_dtypes = [data_dtype for _ in range(len(self.outputs_names))]
+
+     def forward(self, *inputs_data: np.ndarray) -> Dict[str, np.ndarray]:
+         inputs_batches, batches_paddings = self._create_batches(*inputs_data)
+
+         result = defaultdict(list)
+         count_batches = len(next(iter(inputs_batches.values())))
+
+         for i_batch in range(count_batches):
+             batch = dict()
+             for input_name, np_dtype in zip(self.inputs_names, self.np_inputs_dtypes):
+                 batch[input_name] = inputs_batches[input_name][i_batch].astype(np_dtype)
+
+             batch_result = self.onnx_model.run(self.outputs_names, batch)
+             batch_result = {
+                 self.outputs_names[i]: batch_result[i].astype(self.np_outputs_dtypes[i])
+                 for i in range(len(self.outputs_names))
+             }
+
+             padding_size = batches_paddings[i_batch]
+             for output_name, output_value in batch_result.items():
+                 result[output_name].append(
+                     output_value if padding_size == 0 else output_value[:-padding_size]
+                 )
+
+         for output_name, output_values in result.items():
+             result[output_name] = np.concatenate(output_values)
+
+         return result
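
For reference, a minimal usage sketch of the `OnnxClient` defined above. The model path, model name, provider list and input shape are placeholders; substitute your own exported ONNX model and its real input layout.

```python
import numpy as np
from imb import OnnxClient

# 'model.onnx', 'my_model' and the (N, 3, 224, 224) layout are assumptions for this sketch
client = OnnxClient(
    model_path='model.onnx',
    model_name='my_model',
    providers=['CPUExecutionProvider'],   # CPU-only here; the default also tries CUDA
    max_batch_size=8,                     # applied when the ONNX batch axis is dynamic (name contains 'batch')
    return_dict=True,
    fixed_batch=False,
    warmup=True,
)

images = np.random.rand(3, 3, 224, 224).astype(np.float32)
outputs = client(images)                  # dict: output name -> np.ndarray
for name, value in outputs.items():
    print(name, value.shape)
```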
imb/inference_clients/triton.py ADDED
@@ -0,0 +1,460 @@
+ from collections import defaultdict
+ from typing import Any, Dict, List, Literal, Optional, Tuple
+ import tritonclient.http as httpclient
+ import tritonclient.grpc as grpcclient
+ import tritonclient.utils.cuda_shared_memory as cudashm
+ from google.protobuf.json_format import MessageToJson
+ from tritonclient import utils
+ from .base import BaseClient
+ import numpy as np
+ import json
+ import time
+
+
+ class TritonClient(BaseClient):
+     def __init__(self, url: str,
+                  model_name: str,
+                  max_batch_size: int = 0,
+                  sample_inputs: Optional[List[np.ndarray]] = None,
+                  timeout: int = 10,
+                  resend_count: int = 10,
+                  fixed_batch: bool = True,
+                  is_async: bool = False,
+                  cuda_shm: bool = False,
+                  max_shm_regions: int = 2,
+                  scheme: Literal["http", "grpc"] = "http",
+                  return_dict: bool = True,
+                  warmup: bool = False
+                  ):
+         super().__init__()
+         self.model_name = model_name
+         self.scheme = scheme
+         self.client_module = httpclient if scheme == "http" else grpcclient
+         self.url = url
+         self.is_async = is_async
+         self.cuda_shm = cuda_shm
+         self.triton_timeout = timeout
+         self.resend_count = resend_count
+         self.max_shm_regions = max_shm_regions
+         self.return_dict = return_dict
+
+         self.triton_client = None
+         self._init_triton()
+
+         self.triton_inputs_dtypes = None
+         self.np_inputs_dtypes = None
+
+         self.inputs_shapes = None
+         self.fixed_batch = fixed_batch
+
+         self.inputs_names = None
+         self.outputs_names = None
+
+         self.sample_inputs = sample_inputs
+
+         self._load_model_params(max_batch_size)
+         self._create_input_sample()
+         if warmup:
+             self.warmup_model()
+
+         self.input_shm_handles = [None for _ in range(len(self.inputs_names))]
+         self.output_shm_handles = [None for _ in range(len(self.outputs_names))]
+
+         if self.cuda_shm:
+             assert not is_async and fixed_batch, 'cuda_shm requires fixed_batch=True and is_async=False'
+             self._fill_output_dynamic_axis()
+             self._create_input_output_shm_handles()
+             self._register_cuda_shm_regions()
+
+     def io_summary(self):
+         return {
+             "model_name": self.model_name,
+             "url": self.url,
+             "scheme": self.scheme,
+
+             "inputs_shapes": self.inputs_shapes,
+             "inputs_names": self.inputs_names,
+             "triton_inputs_dtypes": self.triton_inputs_dtypes,
+             "np_inputs_dtypes": self.np_inputs_dtypes,
+
+             "outputs_shapes": self.outputs_shapes,
+             "outputs_names": self.outputs_names,
+             "triton_outputs_dtypes": self.triton_outputs_dtypes,
+             "np_outputs_dtypes": self.np_outputs_dtypes,
+
+             "fixed_batch": self.fixed_batch,
+             "async": self.is_async,
+             "cuda_shm": self.cuda_shm,
+             "max_shm_regions": self.max_shm_regions,
+         }
+
+     def _init_triton(self):
+         if self.triton_client is not None:
+             # reinit: close the previous connection before creating a new one
+             self.triton_client.close()
+             time.sleep(3)
+
+         # network_timeout/connection_timeout are accepted by the http client only
+         client_kwargs = dict(url=self.url, verbose=False, ssl=False)
+         if self.scheme == "http":
+             client_kwargs['network_timeout'] = self.triton_timeout
+             client_kwargs['connection_timeout'] = self.triton_timeout
+         self.triton_client = self.client_module.InferenceServerClient(**client_kwargs)
+
+     def _load_model_params(self, user_max_batch_size: int) -> None:
+         """
+         Load the model config from Triton Inference Server and update the class attributes.
+
+         Args:
+             user_max_batch_size (int): max_batch_size defined by the user
+         """
+         if self.scheme == "grpc":
+             config = self.triton_client.get_model_config(self.model_name, as_json=True)
+             config = config["config"]
+         else:
+             config = self.triton_client.get_model_config(self.model_name)
+
+         self.triton_inputs_dtypes, self.np_inputs_dtypes, \
+             self.inputs_shapes, self.inputs_names \
+             = self._parse_io_params(config['input'])
+
+         self.triton_outputs_dtypes, self.np_outputs_dtypes, \
+             self.outputs_shapes, self.outputs_names \
+             = self._parse_io_params(config['output'])
+
+         not_support_dynamic_batch = config['max_batch_size'] == 0
+         if not_support_dynamic_batch:
+             # model config does not declare batching; take the batch size from the first input's first dim
+             self.max_batch_size = config['input'][0]['dims'][0]
+             self.fixed_batch = True
+         else:
+             # the user may only decrease max_batch_size relative to the config
+             if user_max_batch_size > 0:
+                 self.max_batch_size = min(config['max_batch_size'], user_max_batch_size)
+             else:
+                 self.max_batch_size = config['max_batch_size']
+             # shapes in the config do not include the batch dimension
+             self.inputs_shapes = self._insert_batch_size_to_shapes(
+                 self.inputs_shapes, self.max_batch_size
+             )
+             self.outputs_shapes = self._insert_batch_size_to_shapes(
+                 self.outputs_shapes, self.max_batch_size
+             )
+
+     def _fill_output_dynamic_axis(self) -> None:
+         """
+         Fill real values into the dynamic axes of the output shapes.
+         """
+         has_dynamic_shapes = any(
+             -1 in output_shape for output_shape in self.outputs_shapes
+         )
+         if has_dynamic_shapes:
+             start_cuda_shm_flag = self.cuda_shm
+             self.cuda_shm = False
+             outputs = self.forward(*self.sample_inputs)
+             self.outputs_shapes = [
+                 list(outputs[output_name].shape) for output_name in self.outputs_names
+             ]
+             self.cuda_shm = start_cuda_shm_flag
+
+     @staticmethod
+     def _parse_io_params(io_params: List[Dict]) -> Tuple[List[str], List[np.dtype], List[List[int]], List[str]]:
+         """
+         Parse the input/output parameters from the model config.
+
+         Args:
+             io_params (List[Dict]): The input/output parameters.
+
+         Returns:
+             Tuple[List[str], List[np.dtype], List[List[int]], List[str]]: The input/output dtypes, shapes, and names.
+         """
+         triton_dtypes = []
+         np_dtypes = []
+         shapes = []
+         names = []
+         for params in io_params:
+             triton_dtypes.append(params['data_type'].replace('TYPE_', ''))
+             np_dtypes.append(utils.triton_to_np_dtype(triton_dtypes[-1]))
+             shapes.append(params['dims'])
+             names.append(params['name'])
+
+         return triton_dtypes, np_dtypes, shapes, names
+
+     @staticmethod
+     def _insert_batch_size_to_shapes(shapes: List[List], insert_batch_size: int) -> List[List[int]]:
+         """
+         Insert the batch size into the shapes.
+
+         Args:
+             shapes (List[List]): Shapes from the config
+             insert_batch_size (int): Batch size value to prepend to each shape
+
+         Returns:
+             List[List[int]]: Resulting shapes
+         """
+         return [[insert_batch_size] + shape for shape in shapes]
+
+     def _generate_shm_name(self, ioname: str) -> str:
+         """
+         Generate a shared memory region name.
+
+         Args:
+             ioname (str): Input/output name
+
+         Returns:
+             str: Shared memory region name
+         """
+         return f'{self.model_name}_{ioname}_{time.time()}'
+
+     def _get_old_regions_names(self, regions_statuses: list, new_triton_shm_name: str) -> List[str]:
+         """
+         Get the names of old regions to unregister.
+
+         Args:
+             regions_statuses (list): response of get_cuda_shared_memory_status from Triton
+             new_triton_shm_name (str): name of the new region
+
+         Returns:
+             List[str]: names of old regions to unregister
+         """
+         i_sep = len(new_triton_shm_name) - new_triton_shm_name[::-1].index('_') - 1
+         region_name = new_triton_shm_name[:i_sep]
+         registered_regions = [
+             (region['name'], float(region['name'][i_sep + 1:]))
+             for region in regions_statuses if region['name'].startswith(region_name)
+         ]
+         registered_regions.sort(key=lambda x: x[1])
+         count_old_regions = len(registered_regions) - self.max_shm_regions + 1
+         old_regions = []
+         if count_old_regions > 0:
+             old_regions = [name for name, _ in registered_regions[:count_old_regions]]
+         return old_regions
+
+     def _register_cuda_shm_regions(self):
+         """
+         Register CUDA shared memory regions in Triton.
+         """
+         if self.scheme == "grpc":
+             regions_statuses = self.triton_client.get_cuda_shared_memory_status(as_json=True)['regions']
+         else:
+             regions_statuses = self.triton_client.get_cuda_shared_memory_status()
+
+         for shm_handle in self.input_shm_handles + self.output_shm_handles:
+             old_regions_names = self._get_old_regions_names(regions_statuses, shm_handle._triton_shm_name)
+             for old_region_name in old_regions_names:
+                 self.triton_client.unregister_cuda_shared_memory(old_region_name)
+             self.triton_client.register_cuda_shared_memory(
+                 shm_handle._triton_shm_name, cudashm.get_raw_handle(shm_handle), 0, shm_handle._byte_size
+             )
+
+     def _create_cuda_shm_handle(self, shape: List[int], dtype: np.dtype, name: str) -> Any:
+         """
+         Create a CUDA shared memory handle.
+
+         Args:
+             shape (List[int]): Shape of the CUDA shared memory region
+             dtype (np.dtype): Data type of the input/output data
+             name (str): Input/output name
+
+         Returns:
+             Any: CUDA shared memory handle
+         """
+         byte_size = int(np.prod(shape) * np.dtype(dtype).itemsize)
+         shm_name = self._generate_shm_name(name)
+         return cudashm.create_shared_memory_region(shm_name, byte_size, 0)
+
+     def _create_cuda_shm_handles_for_io(self, shapes: List[List[int]],
+                                         dtypes: List[np.dtype],
+                                         names: List[str]) -> List[Any]:
+         """
+         Create CUDA shared memory handles for inputs or outputs.
+
+         Args:
+             shapes (List[List[int]]): Shapes of the CUDA shared memory regions
+             dtypes (List[np.dtype]): Data types of the input/output data
+             names (List[str]): Input/output names
+
+         Returns:
+             List[Any]: CUDA shared memory handles
+         """
+         return [self._create_cuda_shm_handle(shape, dtype, name)
+                 for shape, dtype, name in zip(shapes, dtypes, names)]
+
+     def _create_input_output_shm_handles(self) -> None:
+         """
+         Create CUDA shared memory handles for inputs and outputs.
+         """
+         self.input_shm_handles = self._create_cuda_shm_handles_for_io(
+             self.inputs_shapes, self.np_inputs_dtypes, self.inputs_names
+         )
+         self.output_shm_handles = self._create_cuda_shm_handles_for_io(
+             self.outputs_shapes, self.np_outputs_dtypes, self.outputs_names
+         )
+
+     def _create_triton_input(self, input_data: np.ndarray, input_name: str,
+                              config_input_format: str, shm_handle=None) -> Any:
+         """
+         Create a Triton InferInput.
+
+         Args:
+             input_data (np.ndarray): data to send to the model
+             input_name (str): name of the input
+             config_input_format (str): Triton input datatype
+             shm_handle (Any, optional): CUDA shared memory handle. Defaults to None.
+
+         Returns:
+             Any: Triton InferInput for sending the request
+         """
+         infer_input = self.client_module.InferInput(input_name, input_data.shape, config_input_format)
+         if self.cuda_shm:
+             cudashm.set_shared_memory_region(shm_handle, [input_data])
+             infer_input.set_shared_memory(shm_handle._triton_shm_name, shm_handle._byte_size)
+         else:
+             infer_input.set_data_from_numpy(input_data)
+         return infer_input
+
+     def _create_triton_output(self, output_name: str, binary: bool = True, shm_handle=None) -> Any:
+         """
+         Create a Triton InferRequestedOutput.
+
+         Args:
+             output_name (str): output name
+             binary (bool, optional): whether the output is binary. Defaults to True.
+             shm_handle (Any, optional): CUDA shared memory handle. Defaults to None.
+
+         Returns:
+             Any: Triton InferRequestedOutput for receiving the response
+         """
+         if self.scheme == "grpc":
+             infer_output = self.client_module.InferRequestedOutput(output_name)
+         else:
+             infer_output = self.client_module.InferRequestedOutput(output_name, binary_data=binary)
+         if self.cuda_shm:
+             infer_output.set_shared_memory(shm_handle._triton_shm_name, shm_handle._byte_size)
+         return infer_output
+
+     def _postprocess_triton_result(self, triton_response: Any, padding_size: int) -> Dict[str, np.ndarray]:
+         """
+         Postprocess the Triton response.
+
+         Args:
+             triton_response (Any): Triton response
+             padding_size (int): padding size used to un-pad the output data
+
+         Returns:
+             Dict[str, np.ndarray]: dict of output name to output data
+         """
+         result = dict()
+         for output_name, shm_op_handle in zip(self.outputs_names, self.output_shm_handles):
+             if self.cuda_shm:
+                 if self.scheme == "grpc":
+                     # WARN: get_output(output_name, as_json=True) returns None due to a tritonclient bug, so convert manually
+                     output = json.loads(MessageToJson(triton_response.get_output(output_name)))
+                 else:
+                     output = triton_response.get_output(output_name)
+                 result[output_name] = cudashm.get_contents_as_numpy(
+                     shm_op_handle,
+                     utils.triton_to_np_dtype(output["datatype"]),
+                     output["shape"],
+                 )
+             else:
+                 result[output_name] = triton_response.as_numpy(output_name)
+
+             if padding_size != 0:
+                 result[output_name] = result[output_name][:-padding_size]
+
+         return result
+
+     def forward(self, *inputs_data: np.ndarray) -> Dict[str, np.ndarray]:
+         assert len(inputs_data) == len(self.inputs_names), 'number of inputs does not match the model inputs'
+         inputs_batches, batches_paddings = self._create_batches(*inputs_data)
+
+         result = defaultdict(list)
+         count_batches = len(next(iter(inputs_batches.values())))
+
+         for i_batch in range(count_batches):
+             triton_inputs = []
+             for input_name, config_input_format, shm_ip_handle in \
+                     zip(self.inputs_names, self.triton_inputs_dtypes, self.input_shm_handles):
+                 triton_input = self._create_triton_input(
+                     inputs_batches[input_name][i_batch], input_name, config_input_format, shm_ip_handle
+                 )
+                 triton_inputs.append(triton_input)
+
+             triton_outputs = []
+             for output_name, shm_op_handle in zip(self.outputs_names, self.output_shm_handles):
+                 triton_output = self._create_triton_output(
+                     output_name, binary=True, shm_handle=shm_op_handle
+                 )
+                 triton_outputs.append(triton_output)
+
+             triton_response = self.triton_client.infer(
+                 model_name=self.model_name,
+                 inputs=triton_inputs,
+                 outputs=triton_outputs
+             )
+
+             batch_result = self._postprocess_triton_result(triton_response, batches_paddings[i_batch])
+
+             for output_name, output_value in batch_result.items():
+                 result[output_name].append(output_value)
+
+         for output_name, output_values in result.items():
+             result[output_name] = np.concatenate(output_values)
+
+         return result
+
+     def send_async_requests(self, inputs_batches: Dict):
+         count_batches = len(next(iter(inputs_batches.values())))
+
+         triton_response_handles = []
+
+         for i_batch in range(count_batches):
+             triton_inputs = []
+             for input_name, config_input_format, shm_ip_handle in \
+                     zip(self.inputs_names, self.triton_inputs_dtypes, self.input_shm_handles):
+                 triton_input = self._create_triton_input(
+                     inputs_batches[input_name][i_batch], input_name, config_input_format, shm_ip_handle
+                 )
+                 triton_inputs.append(triton_input)
+
+             triton_outputs = []
+             for output_name, shm_op_handle in zip(self.outputs_names, self.output_shm_handles):
+                 triton_output = self._create_triton_output(
+                     output_name, binary=True, shm_handle=shm_op_handle
+                 )
+                 triton_outputs.append(triton_output)
+
+             triton_response_handle = self.triton_client.async_infer(
+                 model_name=self.model_name,
+                 inputs=triton_inputs,
+                 outputs=triton_outputs
+             )
+             triton_response_handles.append(triton_response_handle)
+
+         return triton_response_handles
+
+     def get_async_results(self, triton_response_handles, batches_paddings):
+         result = defaultdict(list)
+         for i_batch, triton_response_handle in enumerate(triton_response_handles):
+             triton_response = triton_response_handle.get_result()
+             batch_result = self._postprocess_triton_result(triton_response, batches_paddings[i_batch])
+
+             for output_name, output_value in batch_result.items():
+                 result[output_name].append(output_value)
+
+         for output_name, output_values in result.items():
+             result[output_name] = np.concatenate(output_values)
+
+         return result
+
+     def async_forward(self, *inputs_data: np.ndarray):
+         assert len(inputs_data) == len(self.inputs_names), 'number of inputs does not match the model inputs'
+         inputs_batches, batches_paddings = self._create_batches(*inputs_data)
+
+         triton_response_handles = self.send_async_requests(inputs_batches)
+
+         result = self.get_async_results(triton_response_handles, batches_paddings)
+
+         return result
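
Similarly, a minimal usage sketch of the `TritonClient` defined above, assuming a Triton server is reachable at a placeholder address (`localhost:8000`) and serves a hypothetical model named `my_model` with an image-like input.

```python
import numpy as np
from imb import TritonClient

# url, model name and input shape are placeholders; point this at your own Triton deployment
client = TritonClient(
    url='localhost:8000',
    model_name='my_model',
    scheme='http',
    max_batch_size=8,       # capped by max_batch_size from the model config
    fixed_batch=True,       # short batches are zero-padded; outputs are un-padded automatically
    cuda_shm=False,
    return_dict=True,
    warmup=False,
)

print(client.io_summary())                 # inspect the parsed input/output metadata
images = np.random.rand(3, 3, 224, 224).astype(np.float32)
outputs = client(images)                   # dict: output name -> np.ndarray
```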
imb-1.0.0.dist-info/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) [year] [fullname]
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
imb-1.0.0.dist-info/METADATA ADDED
@@ -0,0 +1,30 @@
+ Metadata-Version: 2.2
+ Name: imb
+ Version: 1.0.0
+ Summary: Python library for running inference of deep learning models on different backends
+ Home-page: https://github.com/TheConstant3/InferenceMultiBackend
+ Author: p-constant
+ Author-email: nikshorop@gmail.com
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: onnxruntime-gpu>=1.16.0
+ Requires-Dist: tritonclient[all]>=2.38.0
+ Requires-Dist: numpy>=1.19.4
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ # InferenceMultiBackend
+
+ Python library for running inference of deep learning models on different backends
+
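
Both clients are re-exported from the package root (see `imb/__init__.py` above) and share the `BaseClient.__call__` interface, so a call site can stay backend-agnostic. A small hedged sketch:

```python
import numpy as np
from imb import OnnxClient, TritonClient  # re-exported in imb/__init__.py

def run_inference(client, batch: np.ndarray) -> dict:
    # works with either backend: __call__ handles batching, padding and un-padding
    return client(batch)
```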
imb-1.0.0.dist-info/RECORD ADDED
@@ -0,0 +1,10 @@
+ imb/__init__.py,sha256=8XoaonMp09UWmynubLMIu2bln41iKgIdWj-wxgsQjnk,55
+ imb/inference_clients/__init__.py,sha256=Glv4yD0QdtZmCOiYFbILSl90VhxdwvPoH9gFczHlVFk,61
+ imb/inference_clients/base.py,sha256=oBmiTu4rHgzED5kCxKPvS9e3PhI229Pj5lxuPm7ep6M,5189
+ imb/inference_clients/onnx.py,sha256=g3vQBJPeln0YUOQ1X9RjZce8AAi-7SXntLyevOZZdG8,4100
+ imb/inference_clients/triton.py,sha256=hdnCtDjoRAl_Ss49_ayvW3-VhsYcY2MbNqh3ax6y-18,18629
+ imb-1.0.0.dist-info/LICENSE,sha256=pAZXnNE2dxxwXFIduGyn1gpvPefJtUYOYZOi3yeGG94,1068
+ imb-1.0.0.dist-info/METADATA,sha256=NZcJPx91mzPg4Zo9FZxlMQE4c6zB2s_yPVhhRVxPBzM,898
+ imb-1.0.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+ imb-1.0.0.dist-info/top_level.txt,sha256=kY8Fp1i_MzTZhuoVhVexG762D8HBd-THfX_lfw4EZmY,4
+ imb-1.0.0.dist-info/RECORD,,
imb-1.0.0.dist-info/WHEEL ADDED
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (75.8.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
imb-1.0.0.dist-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ imb