imb-1.0.0-py3-none-any.whl → imb-1.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- imb/__init__.py +0 -1
- imb/base.py +129 -0
- imb/inference_clients/__init__.py +0 -2
- imb/onnx.py +99 -0
- imb/triton.py +460 -0
- imb-1.0.1.dist-info/METADATA +105 -0
- imb-1.0.1.dist-info/RECORD +13 -0
- imb-1.0.0.dist-info/METADATA +0 -30
- imb-1.0.0.dist-info/RECORD +0 -10
- {imb-1.0.0.dist-info → imb-1.0.1.dist-info}/LICENSE +0 -0
- {imb-1.0.0.dist-info → imb-1.0.1.dist-info}/WHEEL +0 -0
- {imb-1.0.0.dist-info → imb-1.0.1.dist-info}/top_level.txt +0 -0
    
imb/__init__.py
CHANGED

@@ -1 +0,0 @@
-from .inference_clients import OnnxClient, TritonClient
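With this re-export removed and nothing added in its place in the 1.0.1 __init__.py (+0 -1), the clients appear to be importable only from the new top-level modules added below. A hedged sketch of the presumed import paths:

    # Presumed imports for imb 1.0.1, based on the new imb/onnx.py and
    # imb/triton.py modules in this release; 1.0.0 re-exported both names
    # from the package root via imb/__init__.py.
    from imb.onnx import OnnxClient
    from imb.triton import TritonClient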
    
imb/base.py
ADDED

@@ -0,0 +1,129 @@
+from typing import Dict, List, Optional, Tuple, Union
+import numpy as np
+import time
+import os
+
+
+class BaseClient:
+    def __init__(self, *args, **kwargs):
+        self.show_fps: bool = os.environ.get('SHOW_FPS') in {'yes', 'true'}
+        self.model_name = ''
+        self.fixed_batch = False
+        self.max_batch_size = 1
+        self.is_async = False
+        self.return_dict = True
+
+        self.inputs_names: List[str] = []
+        self.inputs_shapes: List[tuple] = []
+        self.np_inputs_dtypes: List[np.dtype] = []
+
+        self.outputs_names: List[str] = []
+        self.outputs_shapes: List[tuple] = []
+        self.np_outputs_dtypes: List[np.dtype] = []
+
+        self.sample_inputs: Optional[List[np.ndarray]] = None
+
+    def _load_model_params(self, *args, **kwargs):
+        raise NotImplementedError
+
+    def _create_input_sample(self):
+        if self.sample_inputs is not None:
+            # validate sample inputs
+            for sample_array, config_input_shape in zip(self.sample_inputs, self.inputs_shapes):
+                for i, (s_dim, t_dim) in enumerate(zip(sample_array.shape, config_input_shape)):
+                    if i == 0:
+                        if self.fixed_batch:
+                            assert s_dim == t_dim, \
+                                f'model support fixed batch size {t_dim}, \
+                                    sample_inputs has batch size {s_dim}'
+                        else:
+                            assert s_dim <= t_dim, \
+                                f'model support max batch size {t_dim}, \
+                                    sample_inputs has batch size {s_dim}'
+                        continue
+                    assert ((t_dim != -1) and (int(s_dim) == int(t_dim))) or t_dim == -1, \
+                        f'incorrect shape in sample_inputs {sample_array.shape}, must be {config_input_shape}'
+        else:
+            has_dynamic_shapes = any(
+                -1 in config_input_shape for config_input_shape in self.inputs_shapes
+            )
+            if has_dynamic_shapes:
+                return
+            self.sample_inputs = []
+            for config_input_shape, np_input_format in zip(self.inputs_shapes, self.np_inputs_dtypes):
+                self.sample_inputs.append(
+                    np.ones(config_input_shape).astype(np_input_format)
+                )
+
+    def _create_batches(self, *inputs_data: np.ndarray) -> Tuple[Dict[str, List[np.ndarray]], List[int]]:
+        inputs_batches = dict()
+        paddings = []
+        for input_data, np_format, input_name in zip(inputs_data, self.np_inputs_dtypes, self.inputs_names):
+            input_data = input_data.astype(np_format)
+            input_batches, input_paddings = self._split_on_batches(input_data)
+            if paddings == []:
+                paddings = input_paddings
+            inputs_batches[input_name] = input_batches
+        return inputs_batches, paddings
+
+    def log(self, text, warn=False, err=False):
+        text = f'Model ({self.model_name}) - {text}'
+        if err:
+            print('error', text)
+        elif warn:
+            print('warning', text)
+        else:
+            print('debug', text)
+
+    def warmup_model(self):
+        if self.sample_inputs is None:
+            print('Model was not warmed up, because sample_inputs didn\'t set or shape is dynamic and cannot auto generate')
+            return
+        exception = None
+        for _ in range(5):
+            try:
+                _ = self.__call__(*self.sample_inputs)
+                exception = None
+            except Exception as e:
+                print(f'{e} while warmup, repeat inference...')
+                exception = e
+                time.sleep(2)
+        if exception is not None:
+            raise exception
+
+    def pad_batch(self, batch: np.ndarray):
+        padding_size = self.max_batch_size - batch.shape[0]
+        if padding_size > 0:
+            pad = np.zeros([padding_size, *batch.shape[1:]], dtype=batch.dtype)
+            batch = np.concatenate((batch, pad), axis=0)
+        return batch, padding_size
+
+    def _split_on_batches(self, input_data: np.ndarray):
+        batches = []
+        paddings = []
+        for i in range(0, len(input_data), self.max_batch_size):
+            batch = input_data[i:i+self.max_batch_size]
+            batches.append(batch)
+            paddings.append(0)
+
+        if self.fixed_batch:
+            batches[-1], paddings[-1] = self.pad_batch(batches[-1])
+
+        return batches, paddings
+
+    def forward(self, *input_data):
+        raise NotImplementedError
+
+    def async_forward(self, *input_data):
+        raise NotImplementedError
+
+    def __call__(self, *args, **kwargs) -> Union[Dict[str, np.ndarray], List[np.ndarray]]:
+        t1 = time.time()
+        forward_func = self.async_forward if self.is_async else self.forward
+        output = forward_func(*args, **kwargs)
+        if self.return_dict is False:
+            output = [output[output_name] for output_name in self.outputs_names]
+        t2 = time.time()
+        if self.show_fps:
+            self.log(f'Model: {self.model_name} fps {int(len(args[0])/(t2-t1))}')
+        return output
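BaseClient itself is abstract: a subclass supplies the model metadata (inputs_names, inputs_shapes, np_inputs_dtypes, outputs_names) and a forward implementation, while the base class handles batching, padding, warmup, and FPS logging. A minimal sketch of that contract, using a hypothetical identity "model" rather than a real inference backend:

    # Minimal sketch of the BaseClient contract; IdentityClient and its
    # shapes are hypothetical and stand in for a real backend client.
    import numpy as np
    from imb.base import BaseClient

    class IdentityClient(BaseClient):
        def __init__(self):
            super().__init__()
            self.model_name = 'identity'
            self.inputs_names = ['input']
            self.inputs_shapes = [(1, 3)]
            self.np_inputs_dtypes = [np.float32]
            self.outputs_names = ['output']
            self.np_outputs_dtypes = [np.float32]
            self._create_input_sample()  # builds sample_inputs so warmup_model() can run

        def forward(self, *inputs_data):
            # The base class splits inputs into max_batch_size chunks and tracks padding.
            inputs_batches, paddings = self._create_batches(*inputs_data)
            batches = inputs_batches['input']
            out = np.concatenate(
                [b if p == 0 else b[:-p] for b, p in zip(batches, paddings)]
            )
            return {'output': out}

    client = IdentityClient()
    print(client(np.ones((2, 3), dtype=np.float32))['output'].shape)  # (2, 3)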
    
imb/onnx.py
ADDED

@@ -0,0 +1,99 @@
+from collections import defaultdict
+from typing import Dict, List
+from .base import BaseClient
+import onnxruntime as rt
+import numpy as np
+
+
+class OnnxClient(BaseClient):
+    def __init__(self, model_path: str,
+                 model_name: str,
+                 providers: List[str] = ['CUDAExecutionProvider', 'CPUExecutionProvider'],
+                 max_batch_size: int = 1,
+                 return_dict: bool = True,
+                 fixed_batch: bool = False,
+                 warmup: bool = False
+                 ):
+        super().__init__()
+        self.model_name = model_name
+        self.model_path = model_path
+        self.providers = providers
+        self.return_dict = return_dict
+        self.max_batch_size = max_batch_size
+        self.fixed_batch = fixed_batch
+
+        self._load_model_params(max_batch_size)
+
+        self.sample_inputs = [np.zeros((*shape,), dtype=dtype) for shape, dtype in zip(self.inputs_shapes, self.np_inputs_dtypes)]
+
+        if warmup:
+            self.warmup_model()
+
+    def _load_model_params(self, max_batch_size: int = 1):
+        """
+        Load model parameters from onnx model
+
+        Args:
+            max_batch_size (int, optional): max batch size. Defaults to 1.
+
+        Raises:
+            ValueError: not support dynamic batch
+        """
+        sess_options = rt.SessionOptions()
+        self.onnx_model = rt.InferenceSession(
+            self.model_path,
+            providers=self.providers,
+            sess_options=sess_options
+        )
+
+        model_inputs = self.onnx_model.get_inputs()
+        data_dtype = np.float16 if 'float16' in model_inputs[0].type else np.float32
+        self.inputs_names = [model_inputs[i].name for i in range(len(model_inputs))]
+        self.np_inputs_dtypes = [data_dtype for _ in range(len(self.inputs_names))]
+        self.inputs_shapes = [model_inputs[i].shape for i in range(len(model_inputs))]
+        for i_input, shape in enumerate(self.inputs_shapes):
+            new_shape = []
+            for i_dim, value in enumerate(shape):
+                if isinstance(value, int):
+                    if i_dim == 0:
+                        self.max_batch_size = value
+                        self.log(f'set batch size {value} from model metadata')
+                    new_shape.append(value)
+                elif isinstance(value, str) and 'batch' in value:
+                    new_shape.append(max_batch_size)
+                    self.log(f'set batch size {value} from user settings')
+                else:
+                    raise ValueError(f'not support value {value} in input shape {shape}')
+            self.inputs_shapes[i_input] = new_shape
+
+        model_outputs = self.onnx_model.get_outputs()
+        self.outputs_names = [model_outputs[i].name for i in range(len(model_outputs))]
+        self.np_outputs_dtypes = [data_dtype for _ in range(len(self.outputs_names))]
+
+    def forward(self, *inputs_data: np.ndarray) -> Dict[str, np.ndarray]:
+        inputs_batches, batches_paddings = self._create_batches(*inputs_data)
+
+        result = defaultdict(list)
+        count_batches = len(next(iter(inputs_batches.values())))
+
+        for i_batch in range(count_batches):
+            batch = dict()
+            for input_name, np_dtype in zip(self.inputs_names, self.np_inputs_dtypes):
+                batch[input_name] = inputs_batches[input_name][i_batch].astype(np_dtype)
+
+            batch_result = self.onnx_model.run(self.outputs_names, batch)
+            batch_result = {
+                self.outputs_names[i]: batch_result[i].astype(self.np_outputs_dtypes[i])
+                for i in range(len(self.outputs_names))
+            }
+
+            padding_size = batches_paddings[i_batch]
+            for output_name, output_value in batch_result.items():
+                result[output_name].append(
+                    output_value if padding_size == 0 else output_value[:-padding_size]
+                )
+
+        for output_name, output_values in result.items():
+            result[output_name] = np.concatenate(output_values)
+
+        return result
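A hedged usage sketch for the OnnxClient above; the model path and model name are hypothetical placeholders, and the input array is sized from the shapes the client reads out of the ONNX model:

    # Hypothetical usage of imb.onnx.OnnxClient; 'model.onnx' and
    # 'my_onnx_model' are placeholders, not files shipped with the package.
    import numpy as np
    from imb.onnx import OnnxClient

    client = OnnxClient(
        model_path='model.onnx',
        model_name='my_onnx_model',
        providers=['CPUExecutionProvider'],
        max_batch_size=8,
        return_dict=True,
        fixed_batch=False,
        warmup=False,
    )
    # One positional array per model input; the first axis is the batch axis.
    batch = np.random.rand(3, *client.inputs_shapes[0][1:]).astype(np.float32)
    outputs = client(batch)
    for name, value in outputs.items():
        print(name, value.shape)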
    
        imb/triton.py
    ADDED
    
    | @@ -0,0 +1,460 @@ | |
| 1 | 
            +
            from collections import defaultdict
         | 
| 2 | 
            +
            from typing import Any, Dict, List, Literal, Optional, Tuple
         | 
| 3 | 
            +
            import tritonclient.http as httpclient
         | 
| 4 | 
            +
            import tritonclient.grpc as grpcclient
         | 
| 5 | 
            +
            import tritonclient.utils.cuda_shared_memory as cudashm
         | 
| 6 | 
            +
            from google.protobuf.json_format import MessageToJson
         | 
| 7 | 
            +
            from tritonclient import utils
         | 
| 8 | 
            +
            from .base import BaseClient
         | 
| 9 | 
            +
            import numpy as np
         | 
| 10 | 
            +
            import json
         | 
| 11 | 
            +
            import time
         | 
| 12 | 
            +
             | 
| 13 | 
            +
             | 
| 14 | 
            +
            class TritonClient(BaseClient):
         | 
| 15 | 
            +
                def __init__(self, url: str,
         | 
| 16 | 
            +
                             model_name: str,
         | 
| 17 | 
            +
                             max_batch_size: int = 0,
         | 
| 18 | 
            +
                             sample_inputs: Optional[List[np.ndarray]] = None,
         | 
| 19 | 
            +
                             timeout: int = 10,
         | 
| 20 | 
            +
                             resend_count: int = 10,
         | 
| 21 | 
            +
                             fixed_batch: bool = True,
         | 
| 22 | 
            +
                             is_async: bool = False,
         | 
| 23 | 
            +
                             cuda_shm: bool = False,
         | 
| 24 | 
            +
                             max_shm_regions: int = 2,
         | 
| 25 | 
            +
                             scheme: Literal["http", "grpc"] = "http",
         | 
| 26 | 
            +
                             return_dict: bool = True,
         | 
| 27 | 
            +
                             warmup: bool = False
         | 
| 28 | 
            +
                             ):
         | 
| 29 | 
            +
                    super().__init__()
         | 
| 30 | 
            +
                    self.model_name = model_name
         | 
| 31 | 
            +
                    self.scheme = scheme
         | 
| 32 | 
            +
                    self.client_module = httpclient if scheme == "http" else grpcclient
         | 
| 33 | 
            +
                    self.url = url
         | 
| 34 | 
            +
                    self.is_async = is_async
         | 
| 35 | 
            +
                    self.cuda_shm = cuda_shm
         | 
| 36 | 
            +
                    self.triton_timeout = timeout
         | 
| 37 | 
            +
                    self.resend_count = resend_count
         | 
| 38 | 
            +
                    self.max_shm_regions = max_shm_regions
         | 
| 39 | 
            +
                    self.return_dict = return_dict
         | 
| 40 | 
            +
             | 
| 41 | 
            +
                    self.triton_client = None
         | 
| 42 | 
            +
                    self._init_triton()
         | 
| 43 | 
            +
             | 
| 44 | 
            +
                    self.triton_inputs_dtypes = None
         | 
| 45 | 
            +
                    self.np_inputs_dtypes = None
         | 
| 46 | 
            +
                    
         | 
| 47 | 
            +
                    self.inputs_shapes = None
         | 
| 48 | 
            +
                    self.fixed_batch = fixed_batch
         | 
| 49 | 
            +
                    
         | 
| 50 | 
            +
                    self.inputs_names = None
         | 
| 51 | 
            +
                    self.outputs_names = None
         | 
| 52 | 
            +
                    
         | 
| 53 | 
            +
                    self.sample_inputs = sample_inputs       
         | 
| 54 | 
            +
                    
         | 
| 55 | 
            +
                    self._load_model_params(max_batch_size) 
         | 
| 56 | 
            +
                    self._create_input_sample()
         | 
| 57 | 
            +
                    if warmup:
         | 
| 58 | 
            +
                        self.warmup_model()
         | 
| 59 | 
            +
             | 
| 60 | 
            +
                    self.input_shm_handles = [None for _ in range(len(self.inputs_names))]
         | 
| 61 | 
            +
                    self.output_shm_handles = [None for _ in range(len(self.outputs_names))]
         | 
| 62 | 
            +
             | 
| 63 | 
            +
                    if self.cuda_shm:
         | 
| 64 | 
            +
                        assert is_async == False and fixed_batch == True
         | 
| 65 | 
            +
                        self._fill_output_dynamic_axis()
         | 
| 66 | 
            +
                        self._create_input_output_shm_handles()
         | 
| 67 | 
            +
                        self._register_cuda_shm_regions()
         | 
| 68 | 
            +
             | 
| 69 | 
            +
                def io_summary(self):
         | 
| 70 | 
            +
                    return {
         | 
| 71 | 
            +
                        "model_name": self.model_name,
         | 
| 72 | 
            +
                        "url": self.url,
         | 
| 73 | 
            +
                        "scheme": self.scheme,
         | 
| 74 | 
            +
             | 
| 75 | 
            +
                        "inputs_shapes": self.inputs_shapes,
         | 
| 76 | 
            +
                        "inputs_names": self.inputs_names,
         | 
| 77 | 
            +
                        "triton_inputs_dtypes": self.triton_inputs_dtypes,
         | 
| 78 | 
            +
                        "np_inputs_dtypes": self.np_inputs_dtypes,
         | 
| 79 | 
            +
             | 
| 80 | 
            +
                        "outputs_shapes": self.outputs_shapes,
         | 
| 81 | 
            +
                        "outputs_names": self.outputs_names,
         | 
| 82 | 
            +
                        "triton_outputs_dtypes": self.triton_outputs_dtypes,
         | 
| 83 | 
            +
                        "np_outputs_dtypes": self.np_outputs_dtypes,
         | 
| 84 | 
            +
             | 
| 85 | 
            +
                        "fixed_batch": self.fixed_batch,
         | 
| 86 | 
            +
                        "async": self.is_async,
         | 
| 87 | 
            +
                        "cuda_shm": self.cuda_shm,
         | 
| 88 | 
            +
                        "max_shm_regions": self.max_shm_regions,
         | 
| 89 | 
            +
                    }
         | 
| 90 | 
            +
             | 
| 91 | 
            +
                def _init_triton(self):
         | 
| 92 | 
            +
                    if self.triton_client is not None:
         | 
| 93 | 
            +
                        # reinit
         | 
| 94 | 
            +
                        self.triton_client.close()
         | 
| 95 | 
            +
                        time.sleep(3)
         | 
| 96 | 
            +
             | 
| 97 | 
            +
                    self.triton_client = self.client_module.InferenceServerClient(
         | 
| 98 | 
            +
                                            url=self.url,
         | 
| 99 | 
            +
                                            verbose=False,
         | 
| 100 | 
            +
                                            ssl=False,
         | 
| 101 | 
            +
                                            network_timeout=self.triton_timeout,
         | 
| 102 | 
            +
                                            connection_timeout=self.triton_timeout
         | 
| 103 | 
            +
                                        )
         | 
| 104 | 
            +
             | 
| 105 | 
            +
                def _load_model_params(self, user_max_batch_size: int) -> None:
         | 
| 106 | 
            +
                    """
         | 
| 107 | 
            +
                    Load the model config from Triton Inferernce Server and update the class attributes.
         | 
| 108 | 
            +
             | 
| 109 | 
            +
                    Args:
         | 
| 110 | 
            +
                        user_max_batch_size (int): max_batch_size defined by user
         | 
| 111 | 
            +
                    """
         | 
| 112 | 
            +
                    if self.scheme == "grpc":
         | 
| 113 | 
            +
                        config = self.triton_client.get_model_config(self.model_name, as_json=True)
         | 
| 114 | 
            +
                        config = config["config"]
         | 
| 115 | 
            +
                    else:
         | 
| 116 | 
            +
                        config = self.triton_client.get_model_config(self.model_name)
         | 
| 117 | 
            +
                    
         | 
| 118 | 
            +
                    self.triton_inputs_dtypes, self.np_inputs_dtypes, \
         | 
| 119 | 
            +
                        self.inputs_shapes, self.inputs_names \
         | 
| 120 | 
            +
                            = self._parse_io_params(config['input'])
         | 
| 121 | 
            +
                    
         | 
| 122 | 
            +
                    self.triton_outputs_dtypes, self.np_outputs_dtypes, \
         | 
| 123 | 
            +
                        self.outputs_shapes, self.outputs_names \
         | 
| 124 | 
            +
                            = self._parse_io_params(config['output'])
         | 
| 125 | 
            +
                    
         | 
| 126 | 
            +
                    not_support_dynamic_batch = config['max_batch_size'] == 0
         | 
| 127 | 
            +
                    if not_support_dynamic_batch:
         | 
| 128 | 
            +
                        # use batch size from config
         | 
| 129 | 
            +
                        self.max_batch_size = config['input'][0]['dims'][0]
         | 
| 130 | 
            +
                        self.fixed_batch = True
         | 
| 131 | 
            +
                    else:
         | 
| 132 | 
            +
                        # user can decrease max_batch_size from config
         | 
| 133 | 
            +
                        if user_max_batch_size > 0:
         | 
| 134 | 
            +
                            self.max_batch_size = min(config['max_batch_size'], user_max_batch_size)
         | 
| 135 | 
            +
                        else:
         | 
| 136 | 
            +
                            self.max_batch_size = config['max_batch_size']
         | 
| 137 | 
            +
                        # in config's shape has no batch size
         | 
| 138 | 
            +
                        self.inputs_shapes = self._insert_batch_size_to_shapes(
         | 
| 139 | 
            +
                            self.inputs_shapes, self.max_batch_size
         | 
| 140 | 
            +
                            )
         | 
| 141 | 
            +
                        self.outputs_shapes = self._insert_batch_size_to_shapes(
         | 
| 142 | 
            +
                            self.outputs_shapes, self.max_batch_size
         | 
| 143 | 
            +
                            )
         | 
| 144 | 
            +
                
         | 
| 145 | 
            +
                def _fill_output_dynamic_axis(self) -> None:
         | 
| 146 | 
            +
                    """
         | 
| 147 | 
            +
                    Fill real values in the dynamic axis of the output shapes.
         | 
| 148 | 
            +
                    """
         | 
| 149 | 
            +
                    has_dynamic_shapes = any(
         | 
| 150 | 
            +
                            -1 in output_shape for output_shape in self.outputs_shapes
         | 
| 151 | 
            +
                        )
         | 
| 152 | 
            +
                    if has_dynamic_shapes:
         | 
| 153 | 
            +
                        start_cuda_shm_flag = self.cuda_shm
         | 
| 154 | 
            +
                        self.cuda_shm = False
         | 
| 155 | 
            +
                        outputs = self.forward(*self.sample_inputs)
         | 
| 156 | 
            +
                        self.outputs_shapes = [
         | 
| 157 | 
            +
                            list(outputs[output_name].shape) for output_name in self.outputs_names
         | 
| 158 | 
            +
                            ]
         | 
| 159 | 
            +
                        self.cuda_shm = start_cuda_shm_flag
         | 
| 160 | 
            +
             | 
| 161 | 
            +
                @staticmethod
         | 
| 162 | 
            +
                def _parse_io_params(io_params: List[Dict]) -> Tuple[List[str], List[np.dtype], List[List[int]], List[str]]:
         | 
| 163 | 
            +
                    """
         | 
| 164 | 
            +
                    Parse the input/output parameters from the model config.
         | 
| 165 | 
            +
             | 
| 166 | 
            +
                    Args:
         | 
| 167 | 
            +
                        io_params (List[Dict]): The input/output parameters.
         | 
| 168 | 
            +
             | 
| 169 | 
            +
                    Returns:
         | 
| 170 | 
            +
                        Tuple[List[str], List[np.dtype], List[List[int]], List[str]]: The input/output dtypes, shapes, and names.
         | 
| 171 | 
            +
                    """
         | 
| 172 | 
            +
                    triton_dtypes = []
         | 
| 173 | 
            +
                    np_dtypes = []
         | 
| 174 | 
            +
                    shapes = []
         | 
| 175 | 
            +
                    names = []
         | 
| 176 | 
            +
                    for params in io_params:
         | 
| 177 | 
            +
                        triton_dtypes.append(params['data_type'].replace('TYPE_', ''))
         | 
| 178 | 
            +
                        np_dtypes.append(utils.triton_to_np_dtype(triton_dtypes[-1]))
         | 
| 179 | 
            +
                        shapes.append(params['dims'])
         | 
| 180 | 
            +
                        names.append(params['name'])
         | 
| 181 | 
            +
             | 
| 182 | 
            +
                    return triton_dtypes, np_dtypes, shapes, names
         | 
| 183 | 
            +
             | 
| 184 | 
            +
                @staticmethod
         | 
| 185 | 
            +
                def _insert_batch_size_to_shapes(shapes: List[List], insert_batch_size: int) -> List[List[int]]:
         | 
| 186 | 
            +
                    """
         | 
| 187 | 
            +
                    Insert the batch size to the shapes.
         | 
| 188 | 
            +
             | 
| 189 | 
            +
                    Args:
         | 
| 190 | 
            +
                        shapes (List[List]): Shapes from config
         | 
| 191 | 
            +
                        insert_batch_size (int): Value for insert batch size to shape
         | 
| 192 | 
            +
             | 
| 193 | 
            +
                    Returns:
         | 
| 194 | 
            +
                        List[List[int]]: Result shape
         | 
| 195 | 
            +
                    """
         | 
| 196 | 
            +
                    return [[insert_batch_size] + shape for shape in shapes]
         | 
| 197 | 
            +
             | 
| 198 | 
            +
                def _generate_shm_name(self, ioname: str) -> str:
         | 
| 199 | 
            +
                    """
         | 
| 200 | 
            +
                    Generate shared region name
         | 
| 201 | 
            +
             | 
| 202 | 
            +
                    Args:
         | 
| 203 | 
            +
                        ioname (str): Input/output name
         | 
| 204 | 
            +
             | 
| 205 | 
            +
                    Returns:
         | 
| 206 | 
            +
                        str: Shared region name
         | 
| 207 | 
            +
                    """
         | 
| 208 | 
            +
                    return f'{self.model_name}_{ioname}_{time.time()}'
         | 
| 209 | 
            +
             | 
| 210 | 
            +
                def _get_old_regions_names(self, regions_statuses: list, new_triton_shm_name: str) -> List[str]:
         | 
| 211 | 
            +
                    """
         | 
| 212 | 
            +
                    Get old regions names for unregister
         | 
| 213 | 
            +
             | 
| 214 | 
            +
                    Args:
         | 
| 215 | 
            +
                        regions_statuses (list): responce of get_cuda_shared_memory_status from triton
         | 
| 216 | 
            +
                        new_triton_shm_name (str): name of new region
         | 
| 217 | 
            +
             | 
| 218 | 
            +
                    Returns:
         | 
| 219 | 
            +
                        List[str]: old regions names for unregister
         | 
| 220 | 
            +
                    """
         | 
| 221 | 
            +
                    i_sep = len(new_triton_shm_name) - new_triton_shm_name[::-1].index('_') - 1
         | 
| 222 | 
            +
                    region_name = new_triton_shm_name[:i_sep]
         | 
| 223 | 
            +
                    registrated_regions = [
         | 
| 224 | 
            +
                        (region['name'], float(region['name'][i_sep+1:])) 
         | 
| 225 | 
            +
                        for region in regions_statuses if region['name'].startswith(region_name)
         | 
| 226 | 
            +
                    ]
         | 
| 227 | 
            +
                    registrated_regions.sort(key=lambda x: x[1])
         | 
| 228 | 
            +
                    count_old_regions = len(registrated_regions) - self.max_shm_regions + 1
         | 
| 229 | 
            +
                    old_regions = []
         | 
| 230 | 
            +
                    if count_old_regions > 0:
         | 
| 231 | 
            +
                        old_regions = [name for name, _ in registrated_regions[:count_old_regions]]
         | 
| 232 | 
            +
                    return old_regions
         | 
| 233 | 
            +
             | 
| 234 | 
            +
                def _register_cuda_shm_regions(self):
         | 
| 235 | 
            +
                    """
         | 
| 236 | 
            +
                    Register CUDA shared memory regions in Triton
         | 
| 237 | 
            +
                    """
         | 
| 238 | 
            +
                    if self.scheme == "grpc":
         | 
| 239 | 
            +
                        regions_statuses = self.triton_client.get_cuda_shared_memory_status(as_json=True)['regions']
         | 
| 240 | 
            +
                    else:
         | 
| 241 | 
            +
                        regions_statuses = self.triton_client.get_cuda_shared_memory_status()
         | 
| 242 | 
            +
             | 
| 243 | 
            +
                    for shm_handle in self.input_shm_handles + self.output_shm_handles:
         | 
| 244 | 
            +
                        old_regions_names = self._get_old_regions_names(regions_statuses, shm_handle._triton_shm_name)
         | 
| 245 | 
            +
                        for old_region_name in old_regions_names:
         | 
| 246 | 
            +
                            self.triton_client.unregister_cuda_shared_memory(old_region_name)
         | 
| 247 | 
            +
                        self.triton_client.register_cuda_shared_memory(
         | 
| 248 | 
            +
                            shm_handle._triton_shm_name, cudashm.get_raw_handle(shm_handle), 0, shm_handle._byte_size
         | 
| 249 | 
            +
                        )
         | 
| 250 | 
            +
             | 
| 251 | 
            +
                def _create_cuda_shm_handle(self, shape: List[int], dtype: np.dtype, name: str) -> Any:
         | 
| 252 | 
            +
                    """
         | 
| 253 | 
            +
                    Create CUDA shared memory handle
         | 
| 254 | 
            +
             | 
| 255 | 
            +
                    Args:
         | 
| 256 | 
            +
                        shape (List[int]): Shape of cuda shared memory region
         | 
| 257 | 
            +
                        dtype (np.dtype): Data type of input/output data
         | 
| 258 | 
            +
                        name (str): Input/output name
         | 
| 259 | 
            +
             | 
| 260 | 
            +
                    Returns:
         | 
| 261 | 
            +
                        Any: CUDA shared memory handle
         | 
| 262 | 
            +
                    """
         | 
| 263 | 
            +
                    byte_size = int(np.prod(shape) * np.dtype(dtype).itemsize)
         | 
| 264 | 
            +
                    shm_name = self._generate_shm_name(name)
         | 
| 265 | 
            +
                    return cudashm.create_shared_memory_region(shm_name, byte_size, 0)
         | 
| 266 | 
            +
             | 
| 267 | 
            +
                def _create_cuda_shm_handles_for_io(self, shapes: List[List[int]], 
         | 
| 268 | 
            +
                                                    dtypes: List[np.dtype], 
         | 
| 269 | 
            +
                                                    names: List[str]) -> List[Any]:
         | 
| 270 | 
            +
                    """
         | 
| 271 | 
            +
                    Create CUDA shared memory handles for inputs or outputs
         | 
| 272 | 
            +
             | 
| 273 | 
            +
                    Args:
         | 
| 274 | 
            +
                        shapes (List[List[int]]): Shapes of cuda shared memory regions
         | 
| 275 | 
            +
                        dtypes (List[np.dtype]): Data types of input/output data
         | 
| 276 | 
            +
                        names (List[str]): Input/output names
         | 
| 277 | 
            +
             | 
| 278 | 
            +
                    Returns:
         | 
| 279 | 
            +
                        List[Any]: CUDA shared memory handles
         | 
| 280 | 
            +
                    """
         | 
| 281 | 
            +
                    return [self._create_cuda_shm_handle(shape, dtype, name) 
         | 
| 282 | 
            +
                            for shape, dtype, name in zip(shapes, dtypes, names)]
         | 
| 283 | 
            +
             | 
| 284 | 
            +
                def _create_input_output_shm_handles(self) -> None:
         | 
| 285 | 
            +
                    """
         | 
| 286 | 
            +
                    Create CUDA shared memory handles for inputs and outputs
         | 
| 287 | 
            +
                    """
         | 
| 288 | 
            +
                    self.input_shm_handles = self._create_cuda_shm_handles_for_io(
         | 
| 289 | 
            +
                        self.inputs_shapes, self.np_inputs_dtypes, self.inputs_names
         | 
| 290 | 
            +
                    )
         | 
| 291 | 
            +
                    self.output_shm_handles = self._create_cuda_shm_handles_for_io(
         | 
| 292 | 
            +
                        self.outputs_shapes, self.np_outputs_dtypes, self.outputs_names
         | 
| 293 | 
            +
                    )
         | 
| 294 | 
            +
             | 
| 295 | 
            +
                def _create_triton_input(self, input_data: np.ndarray, input_name: str, 
         | 
| 296 | 
            +
                                         config_input_format: str, shm_handle = None) -> Any:
         | 
| 297 | 
            +
                    """
         | 
| 298 | 
            +
                    Create triton InferInput
         | 
| 299 | 
            +
             | 
| 300 | 
            +
                    Args:
         | 
| 301 | 
            +
                        input_data (np.ndarray): data for send to model
         | 
| 302 | 
            +
                        input_name (str): name of input
         | 
| 303 | 
            +
                        config_input_format (str): triton input format
         | 
| 304 | 
            +
                        shm_handle (_type_, optional): CUDA shared memory handle. Defaults to None.
         | 
| 305 | 
            +
             | 
| 306 | 
            +
                    Returns:
         | 
| 307 | 
            +
                        Any: triton InferInput for sending request
         | 
| 308 | 
            +
                    """
         | 
| 309 | 
            +
                    infer_input = self.client_module.InferInput(input_name, input_data.shape, config_input_format)
         | 
| 310 | 
            +
                    if self.cuda_shm:
         | 
| 311 | 
            +
                        cudashm.set_shared_memory_region(shm_handle, [input_data])
         | 
| 312 | 
            +
                        infer_input.set_shared_memory(shm_handle._triton_shm_name, shm_handle._byte_size)
         | 
| 313 | 
            +
                    else:
         | 
| 314 | 
            +
                        infer_input.set_data_from_numpy(input_data)
         | 
| 315 | 
            +
                    return infer_input
         | 
| 316 | 
            +
             | 
| 317 | 
            +
                def _create_triton_output(self, output_name: str, binary: bool = True, shm_handle = None) -> Any:
         | 
| 318 | 
            +
                    """
         | 
| 319 | 
            +
                    Create triton InferRequestedOutput
         | 
| 320 | 
            +
             | 
| 321 | 
            +
                    Args:
         | 
| 322 | 
            +
                        output_name (str): output name
         | 
| 323 | 
            +
                        binary (bool, optional): Whether the output is binary. Defaults to True.
         | 
| 324 | 
            +
                        shm_handle (_type_, optional): CUDA shared memory handle. Defaults to None.
         | 
| 325 | 
            +
             | 
| 326 | 
            +
                    Returns:
         | 
| 327 | 
            +
                        Any: triton InferRequestedOutput for receiving response
         | 
| 328 | 
            +
                    """
         | 
| 329 | 
            +
                    if self.scheme == "grpc":
         | 
| 330 | 
            +
                        infer_output = self.client_module.InferRequestedOutput(output_name)
         | 
| 331 | 
            +
                    else:
         | 
| 332 | 
            +
                        infer_output = self.client_module.InferRequestedOutput(output_name, binary_data=binary)
         | 
| 333 | 
            +
                    if self.cuda_shm:
         | 
| 334 | 
            +
                        infer_output.set_shared_memory(shm_handle._triton_shm_name, shm_handle._byte_size)
         | 
| 335 | 
            +
                    return infer_output
         | 
| 336 | 
            +
             | 
| 337 | 
            +
                def _postprocess_triton_result(self, triton_response: Any, padding_size: int) -> Dict[str, np.ndarray]:
         | 
| 338 | 
            +
                    """
         | 
| 339 | 
            +
                    Postprocess triton response.
         | 
| 340 | 
            +
             | 
| 341 | 
            +
                    Args:
         | 
| 342 | 
            +
                        triton_response (Any): triton response
         | 
| 343 | 
            +
                        padding_size (int): padding size for unpad output data
         | 
| 344 | 
            +
             | 
| 345 | 
            +
                    Returns:
         | 
| 346 | 
            +
                        Dict[str, np.ndarray]: dict of output name and output data
         | 
| 347 | 
            +
                    """
         | 
| 348 | 
            +
                    result = dict()
         | 
| 349 | 
            +
                    for output_name, shm_op_handle in zip(self.outputs_names, self.output_shm_handles):
         | 
| 350 | 
            +
                        if self.cuda_shm:
         | 
| 351 | 
            +
                            if self.scheme == "grpc":
         | 
| 352 | 
            +
                                # output = triton_response.get_output(output_name, as_json=True) # WARN: bug in tritonclient library, return None
         | 
| 353 | 
            +
                                output = json.loads(MessageToJson(triton_response.get_output(output_name)))
         | 
| 354 | 
            +
                            else:
         | 
| 355 | 
            +
                                output = triton_response.get_output(output_name)
         | 
| 356 | 
            +
                            result[output_name] = cudashm.get_contents_as_numpy(
         | 
| 357 | 
            +
                                shm_op_handle,
         | 
| 358 | 
            +
                                utils.triton_to_np_dtype(output["datatype"]),
         | 
| 359 | 
            +
                                output["shape"],
         | 
| 360 | 
            +
                            )
         | 
| 361 | 
            +
                        else:
         | 
| 362 | 
            +
                            result[output_name] = triton_response.as_numpy(output_name)
         | 
| 363 | 
            +
             | 
| 364 | 
            +
                        if padding_size != 0:
         | 
| 365 | 
            +
                            result[output_name] = result[output_name][:-padding_size]
         | 
| 366 | 
            +
                            
         | 
| 367 | 
            +
                    return result
         | 
| 368 | 
            +
             | 
| 369 | 
            +
                def forward(self, *inputs_data: np.ndarray) -> Dict[str, np.ndarray]:
         | 
| 370 | 
            +
                    assert len(inputs_data) == len(self.inputs_names), 'inputs number is not equal to model inputs'
         | 
| 371 | 
            +
                    inputs_batches, batches_paddings = self._create_batches(*inputs_data)
         | 
| 372 | 
            +
             | 
| 373 | 
            +
                    result = defaultdict(list)
         | 
| 374 | 
            +
                    count_batches = len(next(iter(inputs_batches.values())))
         | 
| 375 | 
            +
                    
         | 
| 376 | 
            +
                    for i_batch in range(count_batches):
         | 
| 377 | 
            +
                        triton_inputs = []
         | 
| 378 | 
            +
                        for input_name, config_input_format, shm_ip_handle in \
         | 
| 379 | 
            +
                                zip(self.inputs_names, self.triton_inputs_dtypes, self.input_shm_handles):
         | 
| 380 | 
            +
                            triton_input = self._create_triton_input(
         | 
| 381 | 
            +
                                inputs_batches[input_name][i_batch], input_name, config_input_format, shm_ip_handle
         | 
| 382 | 
            +
                                )
         | 
| 383 | 
            +
                            triton_inputs.append(triton_input)
         | 
| 384 | 
            +
             | 
| 385 | 
            +
                        triton_outputs = []
         | 
| 386 | 
            +
                        for output_name, shm_op_handle in zip(self.outputs_names, self.output_shm_handles):
         | 
| 387 | 
            +
                            triton_output = self._create_triton_output(
         | 
| 388 | 
            +
                                output_name, binary=True, shm_handle=shm_op_handle
         | 
| 389 | 
            +
                                )
         | 
| 390 | 
            +
                            triton_outputs.append(triton_output)
         | 
| 391 | 
            +
             | 
| 392 | 
            +
                        triton_response = self.triton_client.infer(
         | 
| 393 | 
            +
                            model_name=self.model_name, 
         | 
| 394 | 
            +
                            inputs=triton_inputs, 
         | 
| 395 | 
            +
                            outputs=triton_outputs
         | 
| 396 | 
            +
                            )
         | 
| 397 | 
            +
                        
         | 
| 398 | 
            +
                        batch_result = self._postprocess_triton_result(triton_response, batches_paddings[i_batch])
         | 
| 399 | 
            +
             | 
| 400 | 
            +
                        for output_name, output_value in batch_result.items():
         | 
| 401 | 
            +
                            result[output_name].append(output_value)
         | 
| 402 | 
            +
             | 
| 403 | 
            +
                    for output_name, output_values in result.items(): 
         | 
| 404 | 
            +
                        result[output_name] = np.concatenate(output_values)
         | 
| 405 | 
            +
             | 
| 406 | 
            +
                    return result
         | 
| 407 | 
            +
             | 
| 408 | 
            +
                def send_async_requests(self, inputs_batches: Dict):
         | 
| 409 | 
            +
                    count_batches = len(next(iter(inputs_batches.values())))
         | 
| 410 | 
            +
                    
         | 
| 411 | 
            +
                    triton_response_handles = []
         | 
| 412 | 
            +
             | 
| 413 | 
            +
                    for i_batch in range(count_batches):
         | 
| 414 | 
            +
                        triton_inputs = []
         | 
| 415 | 
            +
                        for input_name, config_input_format, shm_ip_handle in \
         | 
| 416 | 
            +
                                zip(self.inputs_names, self.triton_inputs_dtypes, self.input_shm_handles):
         | 
| 417 | 
            +
                            triton_input = self._create_triton_input(
         | 
| 418 | 
            +
                                inputs_batches[input_name][i_batch], input_name, config_input_format, shm_ip_handle
         | 
| 419 | 
            +
                                )
         | 
| 420 | 
            +
                            triton_inputs.append(triton_input)
         | 
| 421 | 
            +
             | 
| 422 | 
            +
                        triton_outputs = []
         | 
| 423 | 
            +
                        for output_name, shm_op_handle in zip(self.outputs_names, self.output_shm_handles):
         | 
| 424 | 
            +
                            triton_output = self._create_triton_output(
         | 
| 425 | 
            +
                                output_name, binary=True, shm_handle=shm_op_handle
         | 
| 426 | 
            +
                                )
         | 
| 427 | 
            +
                            triton_outputs.append(triton_output)
         | 
| 428 | 
            +
                        
         | 
| 429 | 
            +
                        triton_response_handle = self.triton_client.async_infer(
         | 
| 430 | 
            +
                                    model_name=self.model_name, 
         | 
| 431 | 
            +
                                    inputs=triton_inputs, 
         | 
| 432 | 
            +
                                    outputs=triton_outputs
         | 
| 433 | 
            +
                                    )
         | 
| 434 | 
            +
                        triton_response_handles.append(triton_response_handle)
         | 
| 435 | 
            +
                    
         | 
| 436 | 
            +
                    return triton_response_handles
         | 
| 437 | 
            +
                
         | 
| 438 | 
            +
                def get_async_results(self, triton_response_handles, batches_paddings):
         | 
| 439 | 
            +
                    result = defaultdict(list)
         | 
| 440 | 
            +
                    for i_batch, triton_response_handle in enumerate(triton_response_handles):
         | 
| 441 | 
            +
                        triton_response = triton_response_handle.get_result()
         | 
| 442 | 
            +
                        batch_result = self._postprocess_triton_result(triton_response, batches_paddings[i_batch])
         | 
| 443 | 
            +
             | 
| 444 | 
            +
                        for output_name, output_value in batch_result.items():
         | 
| 445 | 
            +
                            result[output_name].append(output_value)
         | 
| 446 | 
            +
             | 
| 447 | 
            +
                    for output_name, output_values in result.items(): 
         | 
| 448 | 
            +
                        result[output_name] = np.concatenate(output_values)
         | 
| 449 | 
            +
                    
         | 
| 450 | 
            +
                    return result
         | 
| 451 | 
            +
                
         | 
| 452 | 
            +
                def async_forward(self, *inputs_data: np.ndarray):
         | 
| 453 | 
            +
                    assert len(inputs_data) == len(self.inputs_names), 'inputs number is not equal to model inputs'
         | 
| 454 | 
            +
                    inputs_batches, batches_paddings = self._create_batches(*inputs_data)
         | 
| 455 | 
            +
                    
         | 
| 456 | 
            +
                    triton_response_handles = self.send_async_requests(inputs_batches)
         | 
| 457 | 
            +
             | 
| 458 | 
            +
                    result = self.get_async_results(triton_response_handles, batches_paddings)
         | 
| 459 | 
            +
             | 
| 460 | 
            +
                    return result
         | 
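The asynchronous path above follows a split / fire / gather pattern: `async_forward` turns the inputs into padded batches, `send_async_requests` issues one `async_infer` call per batch and keeps the response handles, and `get_async_results` waits on each handle, strips the padding and concatenates every output across batches. The sketch below only illustrates that flow with plain numpy; it is not part of the package, and `fake_infer`, `MAX_BATCH` and the `embedding` output name are made-up stand-ins for the real Triton call and model outputs.

```
from collections import defaultdict
import numpy as np

MAX_BATCH = 4  # stand-in for max_batch_size

def fake_infer(batch: np.ndarray) -> dict:
    # placeholder for triton_client.async_infer(...).get_result();
    # pretend the model returns a single output named "embedding"
    return {"embedding": batch.reshape(len(batch), -1).mean(axis=1, keepdims=True)}

def forward(data: np.ndarray) -> dict:
    # split into fixed-size batches, remembering how much padding each one got
    batches, paddings = [], []
    for start in range(0, len(data), MAX_BATCH):
        chunk = data[start:start + MAX_BATCH]
        pad = MAX_BATCH - len(chunk)
        if pad:
            chunk = np.concatenate([chunk, np.zeros((pad, *chunk.shape[1:]), chunk.dtype)])
        batches.append(chunk)
        paddings.append(pad)

    # "fire" every request, then gather: strip the padding and concatenate per output
    result = defaultdict(list)
    for chunk, pad in zip(batches, paddings):
        for name, value in fake_infer(chunk).items():
            result[name].append(value[:len(value) - pad] if pad else value)
    return {name: np.concatenate(values) for name, values in result.items()}

print(forward(np.random.rand(10, 3, 8, 8).astype(np.float32))["embedding"].shape)  # (10, 1)
```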
    
        imb-1.0.1.dist-info/METADATA
    ADDED
    
    | @@ -0,0 +1,105 @@ | |
| 1 | 
            +
            Metadata-Version: 2.2
         | 
| 2 | 
            +
            Name: imb
         | 
| 3 | 
            +
            Version: 1.0.1
         | 
| 4 | 
            +
            Summary: Python library for run inference of deep learning models in different backends
         | 
| 5 | 
            +
            Home-page: https://github.com/TheConstant3/InferenceMultiBackend
         | 
| 6 | 
            +
            Author: p-constant
         | 
| 7 | 
            +
            Author-email: nikshorop@gmail.com
         | 
| 8 | 
            +
            Classifier: Programming Language :: Python :: 3.8
         | 
| 9 | 
            +
            Classifier: License :: OSI Approved :: MIT License
         | 
| 10 | 
            +
            Classifier: Operating System :: OS Independent
         | 
| 11 | 
            +
            Requires-Python: >=3.8
         | 
| 12 | 
            +
            Description-Content-Type: text/markdown
         | 
| 13 | 
            +
            License-File: LICENSE
         | 
| 14 | 
            +
            Requires-Dist: numpy
         | 
| 15 | 
            +
            Provides-Extra: triton
         | 
| 16 | 
            +
            Requires-Dist: tritonclient[all]>=2.38.0; extra == "triton"
         | 
| 17 | 
            +
            Provides-Extra: onnxcpu
         | 
| 18 | 
            +
            Requires-Dist: onnxruntime>=1.16.0; extra == "onnxcpu"
         | 
| 19 | 
            +
            Provides-Extra: onnxgpu
         | 
| 20 | 
            +
            Requires-Dist: onnxruntime-gpu>=1.16.0; extra == "onnxgpu"
         | 
| 21 | 
            +
            Provides-Extra: all
         | 
| 22 | 
            +
            Requires-Dist: tritonclient[all]>=2.38.0; extra == "all"
         | 
| 23 | 
            +
            Requires-Dist: onnxruntime>=1.16.0; extra == "all"
         | 
| 24 | 
            +
            Requires-Dist: onnxruntime-gpu>=1.16.0; extra == "all"
         | 
| 25 | 
            +
            Dynamic: author
         | 
| 26 | 
            +
            Dynamic: author-email
         | 
| 27 | 
            +
            Dynamic: classifier
         | 
| 28 | 
            +
            Dynamic: description
         | 
| 29 | 
            +
            Dynamic: description-content-type
         | 
| 30 | 
            +
            Dynamic: home-page
         | 
| 31 | 
            +
            Dynamic: provides-extra
         | 
| 32 | 
            +
            Dynamic: requires-dist
         | 
| 33 | 
            +
            Dynamic: requires-python
         | 
| 34 | 
            +
            Dynamic: summary
         | 
| 35 | 
            +
             | 
| 36 | 
            +
            # InferenceMultiBackend
         | 
| 37 | 
            +
             | 
| 38 | 
            +
             Python library for running inference of deep learning models on different backends
         | 
| 39 | 
            +
             | 
| 40 | 
            +
            ## Installation
         | 
| 41 | 
            +
             | 
| 42 | 
            +
             To use the Triton inference client:
         | 
| 43 | 
            +
            ```pip install imb[triton]```
         | 
| 44 | 
            +
             | 
| 45 | 
            +
             To use the onnxruntime-gpu client:
         | 
| 46 | 
            +
            ```pip install imb[onnxgpu]```
         | 
| 47 | 
            +
             | 
| 48 | 
            +
             To use the onnxruntime (CPU) client:
         | 
| 49 | 
            +
            ```pip install imb[onnxcpu]```
         | 
| 50 | 
            +
             | 
| 51 | 
            +
             To install support for all implemented clients:
         | 
| 52 | 
            +
            ```pip install imb[all]```
         | 
| 53 | 
            +
             | 
| 54 | 
            +
            ## Usage
         | 
| 55 | 
            +
             | 
| 56 | 
            +
            OnnxClient usage example
         | 
| 57 | 
            +
            ```
         | 
| 58 | 
            +
            onnx_client = OnnxClient(
         | 
| 59 | 
            +
                model_path='model.onnx',
         | 
| 60 | 
            +
                model_name='any name',
         | 
| 61 | 
            +
                providers=['CUDAExecutionProvider', 'CPUExecutionProvider'],
         | 
| 62 | 
            +
                max_batch_size=16,
         | 
| 63 | 
            +
                return_dict=True,
         | 
| 64 | 
            +
                fixed_batch=True,
         | 
| 65 | 
            +
                warmup=True
         | 
| 66 | 
            +
            )
         | 
| 67 | 
            +
             # if the model has a fixed input size (except the batch dimension), sample_inputs will be created
         | 
| 68 | 
            +
            sample_inputs = onnx_client.sample_inputs
         | 
| 69 | 
            +
            print('inputs shapes', [o.shape for o in sample_inputs])
         | 
| 70 | 
            +
            outputs = onnx_client(*sample_inputs)
         | 
| 71 | 
            +
            print('outputs shapes', [(o_name, o_value.shape) for o_name, o_value in outputs.items()])
         | 
| 72 | 
            +
            ```
         | 
| 73 | 
            +
             | 
| 74 | 
            +
            TritonClient usage example
         | 
| 75 | 
            +
            ```
         | 
| 76 | 
            +
            triton_client = TritonClient(
         | 
| 77 | 
            +
                url='localhost:8000',
         | 
| 78 | 
            +
                model_name='arcface',
         | 
| 79 | 
            +
                max_batch_size=16,
         | 
| 80 | 
            +
                timeout=10,
         | 
| 81 | 
            +
                resend_count=10,
         | 
| 82 | 
            +
                fixed_batch=True,
         | 
| 83 | 
            +
                is_async=False,
         | 
| 84 | 
            +
                cuda_shm=False,
         | 
| 85 | 
            +
                max_shm_regions=2,
         | 
| 86 | 
            +
                scheme='http',
         | 
| 87 | 
            +
                return_dict=True,
         | 
| 88 | 
            +
                warmup=False
         | 
| 89 | 
            +
            )
         | 
| 90 | 
            +
             # if the model has a fixed input size (except the batch dimension), sample_inputs will be created
         | 
| 91 | 
            +
            sample_inputs = triton_client.sample_inputs
         | 
| 92 | 
            +
            print('inputs shapes', [o.shape for o in sample_inputs])
         | 
| 93 | 
            +
            outputs = triton_client(*sample_inputs)
         | 
| 94 | 
            +
            print('outputs shapes', [(o_name, o_value.shape) for o_name, o_value in outputs.items()])
         | 
| 95 | 
            +
            ```
         | 
| 96 | 
            +
             | 
| 97 | 
            +
            ## Notes
         | 
| 98 | 
            +
             | 
| 99 | 
            +
             max_batch_size - maximum batch size for inference. If the input data is larger than max_batch_size, it will be split into several batches.
         | 
| 100 | 
            +
             | 
| 101 | 
            +
             fixed_batch - if True, each batch will have a fixed size (the smallest batch is padded up to max_batch_size).
         | 
| 102 | 
            +
             | 
| 103 | 
            +
             warmup - if True, the model will run several warmup calls on sample_inputs during initialization.
         | 
| 104 | 
            +
             | 
| 105 | 
            +
             return_dict - if True, __call__ returns a dict {'output_name1': output_value1, ...}; otherwise it returns a list [output_value1, ...].
         | 
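Putting the notes above together (a minimal sketch, assuming a single-input 'model.onnx' with a fixed input size so sample_inputs is available, and that `OnnxClient` can be imported from `imb.onnx`): with max_batch_size=16 and fixed_batch=True, 20 samples are processed as two batches of 16 (the second one padded), and the outputs are concatenated back to 20 rows with the padding removed.

```
import numpy as np
from imb.onnx import OnnxClient  # import path assumed from the 1.0.1 layout

client = OnnxClient(
    model_path='model.onnx',            # hypothetical single-input model
    model_name='demo',
    providers=['CPUExecutionProvider'],
    max_batch_size=16,
    return_dict=True,
    fixed_batch=True,
    warmup=False
)

# repeat the auto-generated sample input until there are 20 samples
big_input = np.concatenate([client.sample_inputs[0]] * 20, axis=0)[:20]
outputs = client(big_input)
for name, value in outputs.items():
    print(name, value.shape)  # first dimension is 20: two batches, padding dropped
```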
    
        imb-1.0.1.dist-info/RECORD
    ADDED
    
    | @@ -0,0 +1,13 @@ | |
| 1 | 
            +
            imb/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
         | 
| 2 | 
            +
            imb/base.py,sha256=oBmiTu4rHgzED5kCxKPvS9e3PhI229Pj5lxuPm7ep6M,5189
         | 
| 3 | 
            +
            imb/onnx.py,sha256=g3vQBJPeln0YUOQ1X9RjZce8AAi-7SXntLyevOZZdG8,4100
         | 
| 4 | 
            +
            imb/triton.py,sha256=hdnCtDjoRAl_Ss49_ayvW3-VhsYcY2MbNqh3ax6y-18,18629
         | 
| 5 | 
            +
            imb/inference_clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
         | 
| 6 | 
            +
            imb/inference_clients/base.py,sha256=oBmiTu4rHgzED5kCxKPvS9e3PhI229Pj5lxuPm7ep6M,5189
         | 
| 7 | 
            +
            imb/inference_clients/onnx.py,sha256=g3vQBJPeln0YUOQ1X9RjZce8AAi-7SXntLyevOZZdG8,4100
         | 
| 8 | 
            +
            imb/inference_clients/triton.py,sha256=hdnCtDjoRAl_Ss49_ayvW3-VhsYcY2MbNqh3ax6y-18,18629
         | 
| 9 | 
            +
            imb-1.0.1.dist-info/LICENSE,sha256=pAZXnNE2dxxwXFIduGyn1gpvPefJtUYOYZOi3yeGG94,1068
         | 
| 10 | 
            +
            imb-1.0.1.dist-info/METADATA,sha256=5x0Xa-Gbg8D8e0mNMzKZB4l27xwNlKfyUXzDwQfkJUA,3240
         | 
| 11 | 
            +
            imb-1.0.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
         | 
| 12 | 
            +
            imb-1.0.1.dist-info/top_level.txt,sha256=kY8Fp1i_MzTZhuoVhVexG762D8HBd-THfX_lfw4EZmY,4
         | 
| 13 | 
            +
            imb-1.0.1.dist-info/RECORD,,
         | 
    
        imb-1.0.0.dist-info/METADATA
    DELETED
    
    | @@ -1,30 +0,0 @@ | |
| 1 | 
            -
            Metadata-Version: 2.2
         | 
| 2 | 
            -
            Name: imb
         | 
| 3 | 
            -
            Version: 1.0.0
         | 
| 4 | 
            -
            Summary: Python library for run inference of deep learning models in different backends
         | 
| 5 | 
            -
            Home-page: https://github.com/TheConstant3/InferenceMultiBackend
         | 
| 6 | 
            -
            Author: p-constant
         | 
| 7 | 
            -
            Author-email: nikshorop@gmail.com
         | 
| 8 | 
            -
            Classifier: Programming Language :: Python :: 3.8
         | 
| 9 | 
            -
            Classifier: License :: OSI Approved :: MIT License
         | 
| 10 | 
            -
            Classifier: Operating System :: OS Independent
         | 
| 11 | 
            -
            Requires-Python: >=3.8
         | 
| 12 | 
            -
            Description-Content-Type: text/markdown
         | 
| 13 | 
            -
            License-File: LICENSE
         | 
| 14 | 
            -
            Requires-Dist: onnxruntime-gpu>=1.16.0
         | 
| 15 | 
            -
            Requires-Dist: tritonclient[all]>=2.38.0
         | 
| 16 | 
            -
            Requires-Dist: numpy>=1.19.4
         | 
| 17 | 
            -
            Dynamic: author
         | 
| 18 | 
            -
            Dynamic: author-email
         | 
| 19 | 
            -
            Dynamic: classifier
         | 
| 20 | 
            -
            Dynamic: description
         | 
| 21 | 
            -
            Dynamic: description-content-type
         | 
| 22 | 
            -
            Dynamic: home-page
         | 
| 23 | 
            -
            Dynamic: requires-dist
         | 
| 24 | 
            -
            Dynamic: requires-python
         | 
| 25 | 
            -
            Dynamic: summary
         | 
| 26 | 
            -
             | 
| 27 | 
            -
            # InferenceMultiBackend
         | 
| 28 | 
            -
             | 
| 29 | 
            -
            Python library for run inference of deep learning models in different backends
         | 
| 30 | 
            -
             | 
    
        imb-1.0.0.dist-info/RECORD
    DELETED
    
    | @@ -1,10 +0,0 @@ | |
| 1 | 
            -
            imb/__init__.py,sha256=8XoaonMp09UWmynubLMIu2bln41iKgIdWj-wxgsQjnk,55
         | 
| 2 | 
            -
            imb/inference_clients/__init__.py,sha256=Glv4yD0QdtZmCOiYFbILSl90VhxdwvPoH9gFczHlVFk,61
         | 
| 3 | 
            -
            imb/inference_clients/base.py,sha256=oBmiTu4rHgzED5kCxKPvS9e3PhI229Pj5lxuPm7ep6M,5189
         | 
| 4 | 
            -
            imb/inference_clients/onnx.py,sha256=g3vQBJPeln0YUOQ1X9RjZce8AAi-7SXntLyevOZZdG8,4100
         | 
| 5 | 
            -
            imb/inference_clients/triton.py,sha256=hdnCtDjoRAl_Ss49_ayvW3-VhsYcY2MbNqh3ax6y-18,18629
         | 
| 6 | 
            -
            imb-1.0.0.dist-info/LICENSE,sha256=pAZXnNE2dxxwXFIduGyn1gpvPefJtUYOYZOi3yeGG94,1068
         | 
| 7 | 
            -
            imb-1.0.0.dist-info/METADATA,sha256=NZcJPx91mzPg4Zo9FZxlMQE4c6zB2s_yPVhhRVxPBzM,898
         | 
| 8 | 
            -
            imb-1.0.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
         | 
| 9 | 
            -
            imb-1.0.0.dist-info/top_level.txt,sha256=kY8Fp1i_MzTZhuoVhVexG762D8HBd-THfX_lfw4EZmY,4
         | 
| 10 | 
            -
            imb-1.0.0.dist-info/RECORD,,
         | 
    
        {imb-1.0.0.dist-info → imb-1.0.1.dist-info}/LICENSE
    
    | 
            File without changes
         | 
    
        {imb-1.0.0.dist-info → imb-1.0.1.dist-info}/WHEEL
    
    | 
            File without changes
         | 
    
        {imb-1.0.0.dist-info → imb-1.0.1.dist-info}/top_level.txt
    
    | 
            File without changes
         |