imb 1.0.1__tar.gz → 1.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {imb-1.0.1 → imb-1.0.3}/PKG-INFO +14 -20
- {imb-1.0.1 → imb-1.0.3}/README.md +8 -0
- {imb-1.0.1 → imb-1.0.3}/imb/triton.py +122 -69
- {imb-1.0.1 → imb-1.0.3}/imb.egg-info/PKG-INFO +14 -20
- {imb-1.0.1 → imb-1.0.3}/setup.py +1 -1
- {imb-1.0.1 → imb-1.0.3}/LICENSE +0 -0
- {imb-1.0.1 → imb-1.0.3}/imb/__init__.py +0 -0
- {imb-1.0.1 → imb-1.0.3}/imb/base.py +0 -0
- {imb-1.0.1 → imb-1.0.3}/imb/onnx.py +0 -0
- {imb-1.0.1 → imb-1.0.3}/imb.egg-info/SOURCES.txt +0 -0
- {imb-1.0.1 → imb-1.0.3}/imb.egg-info/dependency_links.txt +0 -0
- {imb-1.0.1 → imb-1.0.3}/imb.egg-info/requires.txt +0 -0
- {imb-1.0.1 → imb-1.0.3}/imb.egg-info/top_level.txt +0 -0
- {imb-1.0.1 → imb-1.0.3}/setup.cfg +0 -0
{imb-1.0.1 → imb-1.0.3}/PKG-INFO
RENAMED
@@ -1,37 +1,22 @@
-Metadata-Version: 2.
+Metadata-Version: 2.1
 Name: imb
-Version: 1.0.1
+Version: 1.0.3
 Summary: Python library for run inference of deep learning models in different backends
 Home-page: https://github.com/TheConstant3/InferenceMultiBackend
 Author: p-constant
 Author-email: nikshorop@gmail.com
+License: UNKNOWN
+Platform: UNKNOWN
 Classifier: Programming Language :: Python :: 3.8
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: numpy
 Provides-Extra: triton
-Requires-Dist: tritonclient[all]>=2.38.0; extra == "triton"
 Provides-Extra: onnxcpu
-Requires-Dist: onnxruntime>=1.16.0; extra == "onnxcpu"
 Provides-Extra: onnxgpu
-Requires-Dist: onnxruntime-gpu>=1.16.0; extra == "onnxgpu"
 Provides-Extra: all
-
-Requires-Dist: onnxruntime>=1.16.0; extra == "all"
-Requires-Dist: onnxruntime-gpu>=1.16.0; extra == "all"
-Dynamic: author
-Dynamic: author-email
-Dynamic: classifier
-Dynamic: description
-Dynamic: description-content-type
-Dynamic: home-page
-Dynamic: provides-extra
-Dynamic: requires-dist
-Dynamic: requires-python
-Dynamic: summary
+License-File: LICENSE
 
 # InferenceMultiBackend
 
@@ -55,6 +40,8 @@ For support all implemented clients:
 
 OnnxClient usage example
 ```
+from imb.onnx import OnnxClient
+
 onnx_client = OnnxClient(
     model_path='model.onnx',
     model_name='any name',
@@ -64,15 +51,19 @@ onnx_client = OnnxClient(
     fixed_batch=True,
     warmup=True
 )
+
 # if model has fixed input size (except batch size) then sample_inputs will be created
 sample_inputs = onnx_client.sample_inputs
 print('inputs shapes', [o.shape for o in sample_inputs])
+
 outputs = onnx_client(*sample_inputs)
 print('outputs shapes', [(o_name, o_value.shape) for o_name, o_value in outputs.items()])
 ```
 
 TritonClient usage example
 ```
+from imb.triton import TritonClient
+
 triton_client = TritonClient(
     url='localhost:8000',
     model_name='arcface',
@@ -87,9 +78,11 @@ triton_client = TritonClient(
     return_dict=True,
     warmup=False
 )
+
 # if model has fixed input size (except batch size) then sample_inputs will be created
 sample_inputs = triton_client.sample_inputs
 print('inputs shapes', [o.shape for o in sample_inputs])
+
 outputs = triton_client(*sample_inputs)
 print('outputs shapes', [(o_name, o_value.shape) for o_name, o_value in outputs.items()])
 ```
@@ -103,3 +96,4 @@ fixed_batch - if fixed batch is True, then each batch will have fixed size (padd
 warmup - if True, model will run several calls on sample_inputs while initialization.
 
 return_dict - if True, __call__ return dict {'output_name1': output_value1, ...}, else [output_value1, ...]
+
{imb-1.0.1 → imb-1.0.3}/README.md
RENAMED
@@ -20,6 +20,8 @@ For support all implemented clients:
 
 OnnxClient usage example
 ```
+from imb.onnx import OnnxClient
+
 onnx_client = OnnxClient(
     model_path='model.onnx',
     model_name='any name',
@@ -29,15 +31,19 @@ onnx_client = OnnxClient(
     fixed_batch=True,
     warmup=True
 )
+
 # if model has fixed input size (except batch size) then sample_inputs will be created
 sample_inputs = onnx_client.sample_inputs
 print('inputs shapes', [o.shape for o in sample_inputs])
+
 outputs = onnx_client(*sample_inputs)
 print('outputs shapes', [(o_name, o_value.shape) for o_name, o_value in outputs.items()])
 ```
 
 TritonClient usage example
 ```
+from imb.triton import TritonClient
+
 triton_client = TritonClient(
     url='localhost:8000',
     model_name='arcface',
@@ -52,9 +58,11 @@ triton_client = TritonClient(
     return_dict=True,
     warmup=False
 )
+
 # if model has fixed input size (except batch size) then sample_inputs will be created
 sample_inputs = triton_client.sample_inputs
 print('inputs shapes', [o.shape for o in sample_inputs])
+
 outputs = triton_client(*sample_inputs)
 print('outputs shapes', [(o_name, o_value.shape) for o_name, o_value in outputs.items()])
 ```
{imb-1.0.1 → imb-1.0.3}/imb/triton.py
RENAMED
@@ -3,6 +3,7 @@ from typing import Any, Dict, List, Literal, Optional, Tuple
 import tritonclient.http as httpclient
 import tritonclient.grpc as grpcclient
 import tritonclient.utils.cuda_shared_memory as cudashm
+import tritonclient.utils.shared_memory as shm
 from google.protobuf.json_format import MessageToJson
 from tritonclient import utils
 from .base import BaseClient
@@ -11,30 +12,53 @@ import json
 import time
 
 
+class ShmHandlerWrapper:
+    def __init__(self, handler: Any, name: str, size: int):
+        self.handler = handler
+        self.name = name
+        self.size = size
+
+
+
 class TritonClient(BaseClient):
     def __init__(self, url: str,
                  model_name: str,
                  max_batch_size: int = 0,
                  sample_inputs: Optional[List[np.ndarray]] = None,
-                 timeout: int = 10,
-                 resend_count: int = 10,
                  fixed_batch: bool = True,
                  is_async: bool = False,
-
-
+                 use_cuda_shm: bool = False,
+                 use_system_shm: bool = False,
+                 max_shm_regions: int = 0,
                  scheme: Literal["http", "grpc"] = "http",
                  return_dict: bool = True,
                  warmup: bool = False
                  ):
+        """_summary_
+
+        Args:
+            url (str): url of the triton server
+            model_name (str): name of the model endpoint
+            max_batch_size (int, optional): max batch size. Defaults to 0 (get value from triton config).
+            sample_inputs (Optional[List[np.ndarray]], optional): samples for warmup. Defaults to None (zeros array).
+            fixed_batch (bool, optional): use fixed batch size, using padding for smaller batch. Defaults to True.
+            is_async (bool, optional): async inference. Defaults to False.
+            use_cuda_shm (bool, optional): use cuda shared memory. Defaults to False.
+            use_system_shm (bool, optional): use system shared memory. Defaults to False.
+            max_shm_regions (int, optional): max clients for shared memory. Will unregister old regions. Defaults to 0.
+            scheme (Literal["http", "grpc"], optional): scheme for triton client. Defaults to "http".
+            return_dict (bool, optional): return dict or list of values. Defaults to True.
+            warmup (bool, optional): warmup model. Defaults to False.
+        """
         super().__init__()
+        assert not (use_cuda_shm and use_system_shm), 'shm and cuda_shm are mutually exclusive'
         self.model_name = model_name
         self.scheme = scheme
         self.client_module = httpclient if scheme == "http" else grpcclient
         self.url = url
         self.is_async = is_async
-        self.
-        self.
-        self.resend_count = resend_count
+        self.use_cuda_shm = use_cuda_shm
+        self.use_system_shm = use_system_shm
         self.max_shm_regions = max_shm_regions
         self.return_dict = return_dict
 
@@ -57,14 +81,16 @@ class TritonClient(BaseClient):
         if warmup:
             self.warmup_model()
 
-        self.
-
+        self.input_shm_handlers: List[Optional[ShmHandlerWrapper]] = \
+            [None for _ in range(len(self.inputs_names))]
+        self.output_shm_handlers: List[Optional[ShmHandlerWrapper]] = \
+            [None for _ in range(len(self.outputs_names))]
 
-        if self.
+        if self.use_cuda_shm or self.use_system_shm:
             assert is_async == False and fixed_batch == True
             self._fill_output_dynamic_axis()
             self._create_input_output_shm_handles()
-            self.
+            self._register_shm_regions()
 
     def io_summary(self):
         return {
@@ -84,7 +110,8 @@ class TritonClient(BaseClient):
 
             "fixed_batch": self.fixed_batch,
             "async": self.is_async,
-            "cuda_shm": self.
+            "cuda_shm": self.use_cuda_shm,
+            "shm": self.use_system_shm,
             "max_shm_regions": self.max_shm_regions,
         }
 
@@ -97,9 +124,7 @@ class TritonClient(BaseClient):
         self.triton_client = self.client_module.InferenceServerClient(
             url=self.url,
             verbose=False,
-            ssl=False,
-            network_timeout=self.triton_timeout,
-            connection_timeout=self.triton_timeout
+            ssl=False
         )
 
     def _load_model_params(self, user_max_batch_size: int) -> None:
@@ -150,13 +175,16 @@ class TritonClient(BaseClient):
             -1 in output_shape for output_shape in self.outputs_shapes
         )
         if has_dynamic_shapes:
-            start_cuda_shm_flag = self.
-
+            start_cuda_shm_flag = self.use_cuda_shm
+            start_system_shm_flag = self.use_system_shm
+            self.use_cuda_shm = False
+            self.use_system_shm = False
             outputs = self.forward(*self.sample_inputs)
             self.outputs_shapes = [
                 list(outputs[output_name].shape) for output_name in self.outputs_names
             ]
-            self.
+            self.use_cuda_shm = start_cuda_shm_flag
+            self.use_system_shm = start_system_shm_flag
 
     @staticmethod
     def _parse_io_params(io_params: List[Dict]) -> Tuple[List[str], List[np.dtype], List[List[int]], List[str]]:
@@ -212,12 +240,14 @@ class TritonClient(BaseClient):
         Get old regions names for unregister
 
        Args:
-            regions_statuses (list): responce of
+            regions_statuses (list): responce of get_shared_memory_status from triton
            new_triton_shm_name (str): name of new region
 
        Returns:
            List[str]: old regions names for unregister
        """
+        if self.max_shm_regions < 1:
+            return []
         i_sep = len(new_triton_shm_name) - new_triton_shm_name[::-1].index('_') - 1
         region_name = new_triton_shm_name[:i_sep]
         registrated_regions = [
@@ -231,44 +261,35 @@ class TritonClient(BaseClient):
         old_regions = [name for name, _ in registrated_regions[:count_old_regions]]
         return old_regions
 
-    def
-        """
-        Register CUDA shared memory regions in Triton
+    def _create_shm_handle(self, shape: List[int], dtype: np.dtype, name: str) -> ShmHandlerWrapper:
         """
-
-            regions_statuses = self.triton_client.get_cuda_shared_memory_status(as_json=True)['regions']
-        else:
-            regions_statuses = self.triton_client.get_cuda_shared_memory_status()
-
-        for shm_handle in self.input_shm_handles + self.output_shm_handles:
-            old_regions_names = self._get_old_regions_names(regions_statuses, shm_handle._triton_shm_name)
-            for old_region_name in old_regions_names:
-                self.triton_client.unregister_cuda_shared_memory(old_region_name)
-            self.triton_client.register_cuda_shared_memory(
-                shm_handle._triton_shm_name, cudashm.get_raw_handle(shm_handle), 0, shm_handle._byte_size
-            )
-
-    def _create_cuda_shm_handle(self, shape: List[int], dtype: np.dtype, name: str) -> Any:
-        """
-        Create CUDA shared memory handle
+        Create shared memory handle
 
        Args:
-            shape (List[int]): Shape of
+            shape (List[int]): Shape of shared memory region
            dtype (np.dtype): Data type of input/output data
            name (str): Input/output name
 
        Returns:
-            Any:
+            Any: shared memory handle
        """
         byte_size = int(np.prod(shape) * np.dtype(dtype).itemsize)
         shm_name = self._generate_shm_name(name)
-
+        if self.use_cuda_shm:
+            shm_handle = cudashm.create_shared_memory_region(
+                shm_name, byte_size, 0
+            )
+        else:
+            shm_handle = shm.create_shared_memory_region(
+                shm_name, shm_name, byte_size
+            )
+        return ShmHandlerWrapper(shm_handle, shm_name, byte_size)
 
-    def
+    def _create_shm_handles_for_io(self, shapes: List[List[int]],
                                    dtypes: List[np.dtype],
-                                   names: List[str]) -> List[
+                                   names: List[str]) -> List[ShmHandlerWrapper]:
        """
-        Create
+        Create shared memory handles for inputs or outputs
 
        Args:
            shapes (List[List[int]]): Shapes of cuda shared memory regions
@@ -276,24 +297,24 @@ class TritonClient(BaseClient):
            names (List[str]): Input/output names
 
        Returns:
-            List[
+            List[ShmHandlerWrapper]: shared memory handles
        """
-        return [self.
+        return [self._create_shm_handle(shape, dtype, name)
                 for shape, dtype, name in zip(shapes, dtypes, names)]
 
     def _create_input_output_shm_handles(self) -> None:
        """
-        Create
+        Create shared memory handles for inputs and outputs
        """
-        self.
+        self.input_shm_handlers = self._create_shm_handles_for_io(
            self.inputs_shapes, self.np_inputs_dtypes, self.inputs_names
        )
-        self.
+        self.output_shm_handlers = self._create_shm_handles_for_io(
            self.outputs_shapes, self.np_outputs_dtypes, self.outputs_names
        )
 
     def _create_triton_input(self, input_data: np.ndarray, input_name: str,
-                             config_input_format: str,
+                             config_input_format: str, shm_handler: Optional[ShmHandlerWrapper] = None) -> Any:
        """
        Create triton InferInput
 
@@ -301,27 +322,28 @@ class TritonClient(BaseClient):
            input_data (np.ndarray): data for send to model
            input_name (str): name of input
            config_input_format (str): triton input format
-
+            shm_handler (ShmHandlerWrapper, optional): shared memory handler. Defaults to None.
 
        Returns:
            Any: triton InferInput for sending request
        """
         infer_input = self.client_module.InferInput(input_name, input_data.shape, config_input_format)
-        if self.
-            cudashm.
-
+        if self.use_cuda_shm or self.use_system_shm:
+            shm_utils = cudashm if self.use_cuda_shm else shm
+            shm_utils.set_shared_memory_region(shm_handler.handler, [input_data])
+            infer_input.set_shared_memory(shm_handler.name, shm_handler.size)
         else:
             infer_input.set_data_from_numpy(input_data)
         return infer_input
 
-    def _create_triton_output(self, output_name: str, binary: bool = True,
+    def _create_triton_output(self, output_name: str, binary: bool = True, shm_handler: Optional[ShmHandlerWrapper] = None) -> Any:
        """
        Create triton InferRequestedOutput
 
        Args:
            output_name (str): output name
            binary (bool, optional): Whether the output is binary. Defaults to True.
-
+            shm_handler (ShmHandlerWrapper, optional): shared memory handler. Defaults to None.
 
        Returns:
            Any: triton InferRequestedOutput for receiving response
@@ -330,10 +352,39 @@ class TritonClient(BaseClient):
             infer_output = self.client_module.InferRequestedOutput(output_name)
         else:
             infer_output = self.client_module.InferRequestedOutput(output_name, binary_data=binary)
-        if self.
-            infer_output.set_shared_memory(
+        if self.use_cuda_shm or self.use_system_shm:
+            infer_output.set_shared_memory(shm_handler.name, shm_handler.size)
         return infer_output
 
+    def _register_shm_regions(self):
+        """
+        Register shared memory regions in Triton
+        """
+        get_shared_memory_status = self.triton_client.get_cuda_shared_memory_status \
+            if self.use_cuda_shm else self.triton_client.get_system_shared_memory_status
+
+        unregister_shared_memory = self.triton_client.unregister_cuda_shared_memory \
+            if self.use_cuda_shm else self.triton_client.unregister_system_shared_memory
+
+        if self.scheme == "grpc":
+            regions_statuses = get_shared_memory_status(as_json=True)['regions']
+        else:
+            regions_statuses = get_shared_memory_status()
+
+        for shm_handler in self.input_shm_handlers + self.output_shm_handlers:
+            old_regions_names = self._get_old_regions_names(regions_statuses, shm_handler.name)
+            for old_region_name in old_regions_names:
+                unregister_shared_memory(old_region_name)
+
+            if self.use_cuda_shm:
+                self.triton_client.register_cuda_shared_memory(
+                    shm_handler.name, cudashm.get_raw_handle(shm_handler.handler), 0, shm_handler.size
+                )
+            else:
+                self.triton_client.register_system_shared_memory(
+                    shm_handler.name, shm_handler.name, shm_handler.size
+                )
+
     def _postprocess_triton_result(self, triton_response: Any, padding_size: int) -> Dict[str, np.ndarray]:
         """
         Postprocess triton response.
@@ -346,15 +397,17 @@ class TritonClient(BaseClient):
            Dict[str, np.ndarray]: dict of output name and output data
        """
         result = dict()
-        for output_name, shm_op_handle in zip(self.outputs_names, self.
-            if self.
+        for output_name, shm_op_handle in zip(self.outputs_names, self.output_shm_handlers):
+            if self.use_cuda_shm or self.use_system_shm:
                 if self.scheme == "grpc":
                     # output = triton_response.get_output(output_name, as_json=True) # WARN: bug in tritonclient library, return None
                     output = json.loads(MessageToJson(triton_response.get_output(output_name)))
                 else:
                     output = triton_response.get_output(output_name)
-
-
+
+                shm_utils = shm if self.use_system_shm else cudashm
+                result[output_name] = shm_utils.get_contents_as_numpy(
+                    shm_op_handle.handler,
                     utils.triton_to_np_dtype(output["datatype"]),
                     output["shape"],
                 )
@@ -375,17 +428,17 @@ class TritonClient(BaseClient):
 
         for i_batch in range(count_batches):
             triton_inputs = []
-            for input_name, config_input_format,
-                    zip(self.inputs_names, self.triton_inputs_dtypes, self.
+            for input_name, config_input_format, shm_ip_handler in \
+                    zip(self.inputs_names, self.triton_inputs_dtypes, self.input_shm_handlers):
                 triton_input = self._create_triton_input(
-                    inputs_batches[input_name][i_batch], input_name, config_input_format,
+                    inputs_batches[input_name][i_batch], input_name, config_input_format, shm_ip_handler
                 )
                 triton_inputs.append(triton_input)
 
             triton_outputs = []
-            for output_name,
+            for output_name, shm_op_handlers in zip(self.outputs_names, self.output_shm_handlers):
                 triton_output = self._create_triton_output(
-                    output_name, binary=True,
+                    output_name, binary=True, shm_handler=shm_op_handlers
                 )
                 triton_outputs.append(triton_output)
 
@@ -413,14 +466,14 @@ class TritonClient(BaseClient):
         for i_batch in range(count_batches):
             triton_inputs = []
             for input_name, config_input_format, shm_ip_handle in \
-                    zip(self.inputs_names, self.triton_inputs_dtypes, self.
+                    zip(self.inputs_names, self.triton_inputs_dtypes, self.input_shm_handlers):
                 triton_input = self._create_triton_input(
                     inputs_batches[input_name][i_batch], input_name, config_input_format, shm_ip_handle
                 )
                 triton_inputs.append(triton_input)
 
             triton_outputs = []
-            for output_name, shm_op_handle in zip(self.outputs_names, self.
+            for output_name, shm_op_handle in zip(self.outputs_names, self.output_shm_handlers):
                 triton_output = self._create_triton_output(
                     output_name, binary=True, shm_handle=shm_op_handle
                 )
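The main functional change in this release is the optional system/CUDA shared-memory path shown above. The snippet below is a rough usage sketch based only on the constructor signature and assertions visible in this diff: use_cuda_shm and use_system_shm are mutually exclusive, shared memory requires fixed_batch=True and is_async=False, and max_shm_regions controls pruning of stale regions. The url, model_name, and max_shm_regions values are illustrative, not taken from the package's documentation.
```
# Sketch only: exercises the new shared-memory options added in imb 1.0.3.
from imb.triton import TritonClient

triton_client = TritonClient(
    url='localhost:8000',      # illustrative, reused from the README example
    model_name='arcface',      # illustrative model endpoint
    use_system_shm=True,       # new in 1.0.3: system shared memory
    use_cuda_shm=False,        # mutually exclusive with use_system_shm
    max_shm_regions=2,         # illustrative: unregister older shm regions
    fixed_batch=True,          # required when shared memory is enabled
    is_async=False,            # required when shared memory is enabled
    scheme='http',
    return_dict=True,
    warmup=False
)

# sample_inputs is created when the model has fixed input sizes
outputs = triton_client(*triton_client.sample_inputs)
print('outputs shapes', [(o_name, o_value.shape) for o_name, o_value in outputs.items()])
```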
{imb-1.0.1 → imb-1.0.3}/imb.egg-info/PKG-INFO
RENAMED
@@ -1,37 +1,22 @@
-Metadata-Version: 2.
+Metadata-Version: 2.1
 Name: imb
-Version: 1.0.1
+Version: 1.0.3
 Summary: Python library for run inference of deep learning models in different backends
 Home-page: https://github.com/TheConstant3/InferenceMultiBackend
 Author: p-constant
 Author-email: nikshorop@gmail.com
+License: UNKNOWN
+Platform: UNKNOWN
 Classifier: Programming Language :: Python :: 3.8
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: numpy
 Provides-Extra: triton
-Requires-Dist: tritonclient[all]>=2.38.0; extra == "triton"
 Provides-Extra: onnxcpu
-Requires-Dist: onnxruntime>=1.16.0; extra == "onnxcpu"
 Provides-Extra: onnxgpu
-Requires-Dist: onnxruntime-gpu>=1.16.0; extra == "onnxgpu"
 Provides-Extra: all
-
-Requires-Dist: onnxruntime>=1.16.0; extra == "all"
-Requires-Dist: onnxruntime-gpu>=1.16.0; extra == "all"
-Dynamic: author
-Dynamic: author-email
-Dynamic: classifier
-Dynamic: description
-Dynamic: description-content-type
-Dynamic: home-page
-Dynamic: provides-extra
-Dynamic: requires-dist
-Dynamic: requires-python
-Dynamic: summary
+License-File: LICENSE
 
 # InferenceMultiBackend
 
@@ -55,6 +40,8 @@ For support all implemented clients:
 
 OnnxClient usage example
 ```
+from imb.onnx import OnnxClient
+
 onnx_client = OnnxClient(
     model_path='model.onnx',
     model_name='any name',
@@ -64,15 +51,19 @@ onnx_client = OnnxClient(
     fixed_batch=True,
     warmup=True
 )
+
 # if model has fixed input size (except batch size) then sample_inputs will be created
 sample_inputs = onnx_client.sample_inputs
 print('inputs shapes', [o.shape for o in sample_inputs])
+
 outputs = onnx_client(*sample_inputs)
 print('outputs shapes', [(o_name, o_value.shape) for o_name, o_value in outputs.items()])
 ```
 
 TritonClient usage example
 ```
+from imb.triton import TritonClient
+
 triton_client = TritonClient(
     url='localhost:8000',
     model_name='arcface',
@@ -87,9 +78,11 @@ triton_client = TritonClient(
     return_dict=True,
     warmup=False
 )
+
 # if model has fixed input size (except batch size) then sample_inputs will be created
 sample_inputs = triton_client.sample_inputs
 print('inputs shapes', [o.shape for o in sample_inputs])
+
 outputs = triton_client(*sample_inputs)
 print('outputs shapes', [(o_name, o_value.shape) for o_name, o_value in outputs.items()])
 ```
@@ -103,3 +96,4 @@ fixed_batch - if fixed batch is True, then each batch will have fixed size (padd
 warmup - if True, model will run several calls on sample_inputs while initialization.
 
 return_dict - if True, __call__ return dict {'output_name1': output_value1, ...}, else [output_value1, ...]
+
{imb-1.0.1 → imb-1.0.3}/setup.py
RENAMED
@@ -19,7 +19,7 @@ extras_require["all"] = list(chain(extras_require.values()))
 
 setup(
     name='imb',
-    version='1.0.1',
+    version='1.0.3',
     author='p-constant',
     author_email='nikshorop@gmail.com',
     description='Python library for run inference of deep learning models in different backends',
{imb-1.0.1 → imb-1.0.3}/LICENSE
RENAMED
File without changes
{imb-1.0.1 → imb-1.0.3}/imb/__init__.py
RENAMED
File without changes
{imb-1.0.1 → imb-1.0.3}/imb/base.py
RENAMED
File without changes
{imb-1.0.1 → imb-1.0.3}/imb/onnx.py
RENAMED
File without changes
{imb-1.0.1 → imb-1.0.3}/imb.egg-info/SOURCES.txt
RENAMED
File without changes
{imb-1.0.1 → imb-1.0.3}/imb.egg-info/dependency_links.txt
RENAMED
File without changes
{imb-1.0.1 → imb-1.0.3}/imb.egg-info/requires.txt
RENAMED
File without changes
{imb-1.0.1 → imb-1.0.3}/imb.egg-info/top_level.txt
RENAMED
File without changes
{imb-1.0.1 → imb-1.0.3}/setup.cfg
RENAMED
File without changes