imb 1.0.0-py3-none-any.whl → 1.0.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imb/__init__.py +0 -1
- imb/{inference_clients/triton.py → triton.py} +123 -62
- imb-1.0.2.dist-info/METADATA +113 -0
- imb-1.0.2.dist-info/RECORD +9 -0
- imb/inference_clients/__init__.py +0 -2
- imb-1.0.0.dist-info/METADATA +0 -30
- imb-1.0.0.dist-info/RECORD +0 -10
- /imb/{inference_clients/base.py → base.py} +0 -0
- /imb/{inference_clients/onnx.py → onnx.py} +0 -0
- {imb-1.0.0.dist-info → imb-1.0.2.dist-info}/LICENSE +0 -0
- {imb-1.0.0.dist-info → imb-1.0.2.dist-info}/WHEEL +0 -0
- {imb-1.0.0.dist-info → imb-1.0.2.dist-info}/top_level.txt +0 -0
imb/__init__.py CHANGED
@@ -1 +0,0 @@
-from .inference_clients import OnnxClient, TritonClient
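With 1.0.2 the client modules move out of the `imb.inference_clients` package (see the renames in the file list above) and `imb/__init__.py` no longer re-exports them, so import paths change. A minimal migration sketch, based on the removed re-export above and the import paths used in the 1.0.2 README further down:

```python
# imb 1.0.0 layout: clients were re-exported by the package __init__
# from imb.inference_clients import OnnxClient, TritonClient

# imb 1.0.2 layout: import directly from the flat per-backend modules
from imb.onnx import OnnxClient
from imb.triton import TritonClient
```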
imb/{inference_clients/triton.py → triton.py} CHANGED
@@ -3,6 +3,7 @@ from typing import Any, Dict, List, Literal, Optional, Tuple
 import tritonclient.http as httpclient
 import tritonclient.grpc as grpcclient
 import tritonclient.utils.cuda_shared_memory as cudashm
+import tritonclient.utils.shared_memory as shm
 from google.protobuf.json_format import MessageToJson
 from tritonclient import utils
 from .base import BaseClient
@@ -11,6 +12,14 @@ import json
 import time


+class ShmHandlerWrapper:
+    def __init__(self, handler: Any, name: str, size: int):
+        self.handler = handler
+        self.name = name
+        self.size = size
+
+
+
 class TritonClient(BaseClient):
     def __init__(self, url: str,
                  model_name: str,
@@ -20,19 +29,40 @@ class TritonClient(BaseClient):
                  resend_count: int = 10,
                  fixed_batch: bool = True,
                  is_async: bool = False,
-
-
+                 use_cuda_shm: bool = False,
+                 use_system_shm: bool = False,
+                 max_shm_regions: int = 0,
                  scheme: Literal["http", "grpc"] = "http",
                  return_dict: bool = True,
                  warmup: bool = False
                  ):
+        """_summary_
+
+        Args:
+            url (str): url of the triton server
+            model_name (str): name of the model endpoint
+            max_batch_size (int, optional): max batch size. Defaults to 0 (get value from triton config).
+            sample_inputs (Optional[List[np.ndarray]], optional): samples for warmup. Defaults to None (zeros array).
+            timeout (int, optional): triton client timeout. Defaults to 10.
+            resend_count (int, optional): triton client resend count. Defaults to 10.
+            fixed_batch (bool, optional): use fixed batch size, using padding for smaller batch. Defaults to True.
+            is_async (bool, optional): async inference. Defaults to False.
+            use_cuda_shm (bool, optional): use cuda shared memory. Defaults to False.
+            use_system_shm (bool, optional): use system shared memory. Defaults to False.
+            max_shm_regions (int, optional): max clients for shared memory. Will unregister old regions. Defaults to 0.
+            scheme (Literal["http", "grpc"], optional): scheme for triton client. Defaults to "http".
+            return_dict (bool, optional): return dict or list of values. Defaults to True.
+            warmup (bool, optional): warmup model. Defaults to False.
+        """
         super().__init__()
+        assert not (use_cuda_shm and use_system_shm), 'shm and cuda_shm are mutually exclusive'
         self.model_name = model_name
         self.scheme = scheme
         self.client_module = httpclient if scheme == "http" else grpcclient
         self.url = url
         self.is_async = is_async
-        self.
+        self.use_cuda_shm = use_cuda_shm
+        self.use_system_shm = use_system_shm
         self.triton_timeout = timeout
         self.resend_count = resend_count
         self.max_shm_regions = max_shm_regions
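The new `use_cuda_shm` / `use_system_shm` / `max_shm_regions` arguments documented in the hunk above enable shared-memory transport between the client and the Triton server (system shared memory only makes sense when both run on the same host). A hedged construction sketch, with placeholder URL and model name, and flags chosen to satisfy the constraints asserted in the constructor (the two shm flags are mutually exclusive, and the shared-memory path expects `fixed_batch=True` with `is_async=False`):

```python
from imb.triton import TritonClient

client = TritonClient(
    url='localhost:8000',      # placeholder Triton endpoint
    model_name='my_model',     # placeholder model endpoint name
    max_batch_size=16,
    fixed_batch=True,          # required when a shm flag is set
    is_async=False,            # required when a shm flag is set
    use_system_shm=True,       # new in 1.0.2; mutually exclusive with use_cuda_shm
    max_shm_regions=2,         # older regions beyond this count are unregistered
    scheme='http',
    return_dict=True,
)

# per the docstring, sample_inputs default to zero arrays built from the model config
outputs = client(*client.sample_inputs)
```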
@@ -57,14 +87,16 @@ class TritonClient(BaseClient):
         if warmup:
             self.warmup_model()

-        self.
-
+        self.input_shm_handlers: List[Optional[ShmHandlerWrapper]] = \
+            [None for _ in range(len(self.inputs_names))]
+        self.output_shm_handlers: List[Optional[ShmHandlerWrapper]] = \
+            [None for _ in range(len(self.outputs_names))]

-        if self.
+        if self.use_cuda_shm or self.use_system_shm:
             assert is_async == False and fixed_batch == True
             self._fill_output_dynamic_axis()
             self._create_input_output_shm_handles()
-            self.
+            self._register_shm_regions()

     def io_summary(self):
         return {
@@ -84,7 +116,8 @@ class TritonClient(BaseClient):

             "fixed_batch": self.fixed_batch,
             "async": self.is_async,
-            "cuda_shm": self.
+            "cuda_shm": self.use_cuda_shm,
+            "shm": self.use_system_shm,
             "max_shm_regions": self.max_shm_regions,
         }

@@ -150,13 +183,16 @@ class TritonClient(BaseClient):
             -1 in output_shape for output_shape in self.outputs_shapes
         )
         if has_dynamic_shapes:
-            start_cuda_shm_flag = self.
-
+            start_cuda_shm_flag = self.use_cuda_shm
+            start_system_shm_flag = self.use_system_shm
+            self.use_cuda_shm = False
+            self.use_system_shm = False
             outputs = self.forward(*self.sample_inputs)
             self.outputs_shapes = [
                 list(outputs[output_name].shape) for output_name in self.outputs_names
             ]
-            self.
+            self.use_cuda_shm = start_cuda_shm_flag
+            self.use_system_shm = start_system_shm_flag

     @staticmethod
     def _parse_io_params(io_params: List[Dict]) -> Tuple[List[str], List[np.dtype], List[List[int]], List[str]]:
@@ -212,12 +248,14 @@ class TritonClient(BaseClient):
         Get old regions names for unregister

         Args:
-            regions_statuses (list): responce of
+            regions_statuses (list): responce of get_shared_memory_status from triton
            new_triton_shm_name (str): name of new region

         Returns:
             List[str]: old regions names for unregister
         """
+        if self.max_shm_regions < 1:
+            return []
         i_sep = len(new_triton_shm_name) - new_triton_shm_name[::-1].index('_') - 1
         region_name = new_triton_shm_name[:i_sep]
         registrated_regions = [
@@ -231,44 +269,35 @@ class TritonClient(BaseClient):
         old_regions = [name for name, _ in registrated_regions[:count_old_regions]]
         return old_regions

-    def
-        """
-        Register CUDA shared memory regions in Triton
+    def _create_shm_handle(self, shape: List[int], dtype: np.dtype, name: str) -> ShmHandlerWrapper:
         """
-
-            regions_statuses = self.triton_client.get_cuda_shared_memory_status(as_json=True)['regions']
-        else:
-            regions_statuses = self.triton_client.get_cuda_shared_memory_status()
-
-        for shm_handle in self.input_shm_handles + self.output_shm_handles:
-            old_regions_names = self._get_old_regions_names(regions_statuses, shm_handle._triton_shm_name)
-            for old_region_name in old_regions_names:
-                self.triton_client.unregister_cuda_shared_memory(old_region_name)
-            self.triton_client.register_cuda_shared_memory(
-                shm_handle._triton_shm_name, cudashm.get_raw_handle(shm_handle), 0, shm_handle._byte_size
-            )
-
-    def _create_cuda_shm_handle(self, shape: List[int], dtype: np.dtype, name: str) -> Any:
-        """
-        Create CUDA shared memory handle
+        Create shared memory handle

         Args:
-            shape (List[int]): Shape of
+            shape (List[int]): Shape of shared memory region
             dtype (np.dtype): Data type of input/output data
             name (str): Input/output name

         Returns:
-            Any:
+            Any: shared memory handle
         """
         byte_size = int(np.prod(shape) * np.dtype(dtype).itemsize)
         shm_name = self._generate_shm_name(name)
-
+        if self.use_cuda_shm:
+            shm_handle = cudashm.create_shared_memory_region(
+                shm_name, byte_size, 0
+            )
+        else:
+            shm_handle = shm.create_shared_memory_region(
+                shm_name, shm_name, byte_size
+            )
+        return ShmHandlerWrapper(shm_handle, shm_name, byte_size)

-    def
+    def _create_shm_handles_for_io(self, shapes: List[List[int]],
                                    dtypes: List[np.dtype],
-                                   names: List[str]) -> List[
+                                   names: List[str]) -> List[ShmHandlerWrapper]:
         """
-        Create
+        Create shared memory handles for inputs or outputs

         Args:
             shapes (List[List[int]]): Shapes of cuda shared memory regions
@@ -276,24 +305,24 @@ class TritonClient(BaseClient):
             names (List[str]): Input/output names

         Returns:
-            List[
+            List[ShmHandlerWrapper]: shared memory handles
         """
-        return [self.
+        return [self._create_shm_handle(shape, dtype, name)
                 for shape, dtype, name in zip(shapes, dtypes, names)]

     def _create_input_output_shm_handles(self) -> None:
         """
-        Create
+        Create shared memory handles for inputs and outputs
         """
-        self.
+        self.input_shm_handlers = self._create_shm_handles_for_io(
             self.inputs_shapes, self.np_inputs_dtypes, self.inputs_names
         )
-        self.
+        self.output_shm_handlers = self._create_shm_handles_for_io(
             self.outputs_shapes, self.np_outputs_dtypes, self.outputs_names
         )

     def _create_triton_input(self, input_data: np.ndarray, input_name: str,
-                             config_input_format: str,
+                             config_input_format: str, shm_handler: Optional[ShmHandlerWrapper] = None) -> Any:
         """
         Create triton InferInput

@@ -301,27 +330,28 @@ class TritonClient(BaseClient):
             input_data (np.ndarray): data for send to model
             input_name (str): name of input
             config_input_format (str): triton input format
-
+            shm_handler (ShmHandlerWrapper, optional): shared memory handler. Defaults to None.

         Returns:
             Any: triton InferInput for sending request
         """
         infer_input = self.client_module.InferInput(input_name, input_data.shape, config_input_format)
-        if self.
-            cudashm.
-
+        if self.use_cuda_shm or self.use_system_shm:
+            shm_utils = cudashm if self.use_cuda_shm else shm
+            shm_utils.set_shared_memory_region(shm_handler.handler, [input_data])
+            infer_input.set_shared_memory(shm_handler.name, shm_handler.size)
         else:
             infer_input.set_data_from_numpy(input_data)
         return infer_input

-    def _create_triton_output(self, output_name: str, binary: bool = True,
+    def _create_triton_output(self, output_name: str, binary: bool = True, shm_handler: Optional[ShmHandlerWrapper] = None) -> Any:
         """
         Create triton InferRequestedOutput

         Args:
             output_name (str): output name
             binary (bool, optional): Whether the output is binary. Defaults to True.
-
+            shm_handler (ShmHandlerWrapper, optional): shared memory handler. Defaults to None.

         Returns:
             Any: triton InferRequestedOutput for receiving response
@@ -330,10 +360,39 @@ class TritonClient(BaseClient):
             infer_output = self.client_module.InferRequestedOutput(output_name)
         else:
             infer_output = self.client_module.InferRequestedOutput(output_name, binary_data=binary)
-        if self.
-            infer_output.set_shared_memory(
+        if self.use_cuda_shm or self.use_system_shm:
+            infer_output.set_shared_memory(shm_handler.name, shm_handler.size)
         return infer_output

+    def _register_shm_regions(self):
+        """
+        Register shared memory regions in Triton
+        """
+        get_shared_memory_status = self.triton_client.get_cuda_shared_memory_status \
+            if self.use_cuda_shm else self.triton_client.get_system_shared_memory_status
+
+        unregister_shared_memory = self.triton_client.unregister_cuda_shared_memory \
+            if self.use_cuda_shm else self.triton_client.unregister_system_shared_memory
+
+        if self.scheme == "grpc":
+            regions_statuses = get_shared_memory_status(as_json=True)['regions']
+        else:
+            regions_statuses = get_shared_memory_status()
+
+        for shm_handler in self.input_shm_handlers + self.output_shm_handlers:
+            old_regions_names = self._get_old_regions_names(regions_statuses, shm_handler.name)
+            for old_region_name in old_regions_names:
+                unregister_shared_memory(old_region_name)
+
+            if self.use_cuda_shm:
+                self.triton_client.register_cuda_shared_memory(
+                    shm_handler.name, cudashm.get_raw_handle(shm_handler.handler), 0, shm_handler.size
+                )
+            else:
+                self.triton_client.register_system_shared_memory(
+                    shm_handler.name, shm_handler.name, shm_handler.size
+                )
+
     def _postprocess_triton_result(self, triton_response: Any, padding_size: int) -> Dict[str, np.ndarray]:
         """
         Postprocess triton response.
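For orientation, here is a hedged sketch of the raw `tritonclient` system shared-memory workflow that `_register_shm_regions`, `_create_triton_input`, and the shm branch of `_postprocess_triton_result` wrap. The region name, input name, shape, and datatype below are illustrative and not part of the package's API:

```python
import numpy as np
import tritonclient.http as httpclient
import tritonclient.utils.shared_memory as shm

# Illustrative values; a real model defines its own input name, shape and datatype.
client = httpclient.InferenceServerClient(url='localhost:8000')
data = np.zeros((1, 3, 224, 224), dtype=np.float32)
byte_size = data.nbytes

# Create a system shared-memory region and register it with the server.
handle = shm.create_shared_memory_region('input_region', 'input_region', byte_size)
client.unregister_system_shared_memory()      # optional: clear previously registered regions
client.register_system_shared_memory('input_region', 'input_region', byte_size)

# Copy the input into the region and point the InferInput at it instead of raw bytes.
shm.set_shared_memory_region(handle, [data])
infer_input = httpclient.InferInput('input', list(data.shape), 'FP32')
infer_input.set_shared_memory('input_region', byte_size)

# ...after client.infer(...), outputs placed in shared memory are read back with
# shm.get_contents_as_numpy(...); finally unregister and destroy the region:
client.unregister_system_shared_memory('input_region')
shm.destroy_shared_memory_region(handle)
```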
@@ -346,15 +405,17 @@ class TritonClient(BaseClient):
             Dict[str, np.ndarray]: dict of output name and output data
         """
         result = dict()
-        for output_name, shm_op_handle in zip(self.outputs_names, self.
-            if self.
+        for output_name, shm_op_handle in zip(self.outputs_names, self.output_shm_handlers):
+            if self.use_cuda_shm or self.use_system_shm:
                 if self.scheme == "grpc":
                     # output = triton_response.get_output(output_name, as_json=True) # WARN: bug in tritonclient library, return None
                     output = json.loads(MessageToJson(triton_response.get_output(output_name)))
                 else:
                     output = triton_response.get_output(output_name)
-
-
+
+                shm_utils = shm if self.use_system_shm else cudashm
+                result[output_name] = shm_utils.get_contents_as_numpy(
+                    shm_op_handle.handler,
                     utils.triton_to_np_dtype(output["datatype"]),
                     output["shape"],
                 )
@@ -375,17 +436,17 @@ class TritonClient(BaseClient):

         for i_batch in range(count_batches):
             triton_inputs = []
-            for input_name, config_input_format,
-                    zip(self.inputs_names, self.triton_inputs_dtypes, self.
+            for input_name, config_input_format, shm_ip_handler in \
+                    zip(self.inputs_names, self.triton_inputs_dtypes, self.input_shm_handlers):
                 triton_input = self._create_triton_input(
-                    inputs_batches[input_name][i_batch], input_name, config_input_format,
+                    inputs_batches[input_name][i_batch], input_name, config_input_format, shm_ip_handler
                 )
                 triton_inputs.append(triton_input)

             triton_outputs = []
-            for output_name,
+            for output_name, shm_op_handlers in zip(self.outputs_names, self.output_shm_handlers):
                 triton_output = self._create_triton_output(
-                    output_name, binary=True,
+                    output_name, binary=True, shm_handler=shm_op_handlers
                 )
                 triton_outputs.append(triton_output)

@@ -413,14 +474,14 @@ class TritonClient(BaseClient):
         for i_batch in range(count_batches):
             triton_inputs = []
             for input_name, config_input_format, shm_ip_handle in \
-                    zip(self.inputs_names, self.triton_inputs_dtypes, self.
+                    zip(self.inputs_names, self.triton_inputs_dtypes, self.input_shm_handlers):
                 triton_input = self._create_triton_input(
                     inputs_batches[input_name][i_batch], input_name, config_input_format, shm_ip_handle
                 )
                 triton_inputs.append(triton_input)

             triton_outputs = []
-            for output_name, shm_op_handle in zip(self.outputs_names, self.
+            for output_name, shm_op_handle in zip(self.outputs_names, self.output_shm_handlers):
                 triton_output = self._create_triton_output(
                     output_name, binary=True, shm_handle=shm_op_handle
                 )
imb-1.0.2.dist-info/METADATA ADDED
@@ -0,0 +1,113 @@
+Metadata-Version: 2.2
+Name: imb
+Version: 1.0.2
+Summary: Python library for run inference of deep learning models in different backends
+Home-page: https://github.com/TheConstant3/InferenceMultiBackend
+Author: p-constant
+Author-email: nikshorop@gmail.com
+Classifier: Programming Language :: Python :: 3.8
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: numpy
+Provides-Extra: triton
+Requires-Dist: tritonclient[all]>=2.38.0; extra == "triton"
+Provides-Extra: onnxcpu
+Requires-Dist: onnxruntime>=1.16.0; extra == "onnxcpu"
+Provides-Extra: onnxgpu
+Requires-Dist: onnxruntime-gpu>=1.16.0; extra == "onnxgpu"
+Provides-Extra: all
+Requires-Dist: tritonclient[all]>=2.38.0; extra == "all"
+Requires-Dist: onnxruntime>=1.16.0; extra == "all"
+Requires-Dist: onnxruntime-gpu>=1.16.0; extra == "all"
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: provides-extra
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
+
+# InferenceMultiBackend
+
+Python library for run inference of deep learning models in different backends
+
+## Installation
+
+For use triton inference client:
+```pip install imb[triton]```
+
+For use onnxruntime-gpu client:
+```pip install imb[onnxgpu]```
+
+For use onnxruntime client:
+```pip install imb[onnxcpu]```
+
+For support all implemented clients:
+```pip install imb[all]```
+
+## Usage
+
+OnnxClient usage example
+```
+from imb.onnx import OnnxClient
+
+onnx_client = OnnxClient(
+    model_path='model.onnx',
+    model_name='any name',
+    providers=['CUDAExecutionProvider', 'CPUExecutionProvider'],
+    max_batch_size=16,
+    return_dict=True,
+    fixed_batch=True,
+    warmup=True
+)
+
+# if model has fixed input size (except batch size) then sample_inputs will be created
+sample_inputs = onnx_client.sample_inputs
+print('inputs shapes', [o.shape for o in sample_inputs])
+
+outputs = onnx_client(*sample_inputs)
+print('outputs shapes', [(o_name, o_value.shape) for o_name, o_value in outputs.items()])
+```
+
+TritonClient usage example
+```
+from imb.triton import TritonClient
+
+triton_client = TritonClient(
+    url='localhost:8000',
+    model_name='arcface',
+    max_batch_size=16,
+    timeout=10,
+    resend_count=10,
+    fixed_batch=True,
+    is_async=False,
+    cuda_shm=False,
+    max_shm_regions=2,
+    scheme='http',
+    return_dict=True,
+    warmup=False
+)
+
+# if model has fixed input size (except batch size) then sample_inputs will be created
+sample_inputs = triton_client.sample_inputs
+print('inputs shapes', [o.shape for o in sample_inputs])
+
+outputs = triton_client(*sample_inputs)
+print('outputs shapes', [(o_name, o_value.shape) for o_name, o_value in outputs.items()])
+```
+
+## Notes
+
+max_batch_size - maximum batch size for inference. If input data larger that max_batch_size, then input data will be splitted to several batches.
+
+fixed_batch - if fixed batch is True, then each batch will have fixed size (padding the smallest batch to max_batch_size).
+
+warmup - if True, model will run several calls on sample_inputs while initialization.
+
+return_dict - if True, __call__ return dict {'output_name1': output_value1, ...}, else [output_value1, ...]
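To illustrate the `return_dict` note above, a hedged sketch of the two result shapes (constructor arguments other than `return_dict` are omitted for brevity; in practice they would be set as in the OnnxClient example above):

```python
from imb.onnx import OnnxClient

# return_dict=True: __call__ returns {'output_name1': output_value1, ...}
client = OnnxClient(model_path='model.onnx', model_name='any name', return_dict=True)
outputs = client(*client.sample_inputs)
for name, value in outputs.items():
    print(name, value.shape)

# return_dict=False: __call__ returns [output_value1, ...]
client = OnnxClient(model_path='model.onnx', model_name='any name', return_dict=False)
outputs = client(*client.sample_inputs)
for value in outputs:
    print(value.shape)
```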
imb-1.0.2.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
+imb/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+imb/base.py,sha256=oBmiTu4rHgzED5kCxKPvS9e3PhI229Pj5lxuPm7ep6M,5189
+imb/onnx.py,sha256=g3vQBJPeln0YUOQ1X9RjZce8AAi-7SXntLyevOZZdG8,4100
+imb/triton.py,sha256=92d3tvCniWGSnC1UyjkQ5OcXgSbsBnX6T2hoewLal0k,21796
+imb-1.0.2.dist-info/LICENSE,sha256=pAZXnNE2dxxwXFIduGyn1gpvPefJtUYOYZOi3yeGG94,1068
+imb-1.0.2.dist-info/METADATA,sha256=lEzhVDdcdNHZeECQPisnQcZDjueOFP8zuhVTDh4Vi3s,3314
+imb-1.0.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+imb-1.0.2.dist-info/top_level.txt,sha256=kY8Fp1i_MzTZhuoVhVexG762D8HBd-THfX_lfw4EZmY,4
+imb-1.0.2.dist-info/RECORD,,
imb-1.0.0.dist-info/METADATA DELETED
@@ -1,30 +0,0 @@
-Metadata-Version: 2.2
-Name: imb
-Version: 1.0.0
-Summary: Python library for run inference of deep learning models in different backends
-Home-page: https://github.com/TheConstant3/InferenceMultiBackend
-Author: p-constant
-Author-email: nikshorop@gmail.com
-Classifier: Programming Language :: Python :: 3.8
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Operating System :: OS Independent
-Requires-Python: >=3.8
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: onnxruntime-gpu>=1.16.0
-Requires-Dist: tritonclient[all]>=2.38.0
-Requires-Dist: numpy>=1.19.4
-Dynamic: author
-Dynamic: author-email
-Dynamic: classifier
-Dynamic: description
-Dynamic: description-content-type
-Dynamic: home-page
-Dynamic: requires-dist
-Dynamic: requires-python
-Dynamic: summary
-
-# InferenceMultiBackend
-
-Python library for run inference of deep learning models in different backends
-
imb-1.0.0.dist-info/RECORD DELETED
@@ -1,10 +0,0 @@
-imb/__init__.py,sha256=8XoaonMp09UWmynubLMIu2bln41iKgIdWj-wxgsQjnk,55
-imb/inference_clients/__init__.py,sha256=Glv4yD0QdtZmCOiYFbILSl90VhxdwvPoH9gFczHlVFk,61
-imb/inference_clients/base.py,sha256=oBmiTu4rHgzED5kCxKPvS9e3PhI229Pj5lxuPm7ep6M,5189
-imb/inference_clients/onnx.py,sha256=g3vQBJPeln0YUOQ1X9RjZce8AAi-7SXntLyevOZZdG8,4100
-imb/inference_clients/triton.py,sha256=hdnCtDjoRAl_Ss49_ayvW3-VhsYcY2MbNqh3ax6y-18,18629
-imb-1.0.0.dist-info/LICENSE,sha256=pAZXnNE2dxxwXFIduGyn1gpvPefJtUYOYZOi3yeGG94,1068
-imb-1.0.0.dist-info/METADATA,sha256=NZcJPx91mzPg4Zo9FZxlMQE4c6zB2s_yPVhhRVxPBzM,898
-imb-1.0.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-imb-1.0.0.dist-info/top_level.txt,sha256=kY8Fp1i_MzTZhuoVhVexG762D8HBd-THfX_lfw4EZmY,4
-imb-1.0.0.dist-info/RECORD,,
Files renamed without content changes:
- imb/{inference_clients/base.py → base.py}
- imb/{inference_clients/onnx.py → onnx.py}
- {imb-1.0.0.dist-info → imb-1.0.2.dist-info}/LICENSE
- {imb-1.0.0.dist-info → imb-1.0.2.dist-info}/WHEEL
- {imb-1.0.0.dist-info → imb-1.0.2.dist-info}/top_level.txt