matrice_inference-0.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of matrice-inference might be problematic.
- matrice_inference/__init__.py +72 -0
- matrice_inference/py.typed +0 -0
- matrice_inference/server/__init__.py +23 -0
- matrice_inference/server/inference_interface.py +176 -0
- matrice_inference/server/model/__init__.py +1 -0
- matrice_inference/server/model/model_manager.py +274 -0
- matrice_inference/server/model/model_manager_wrapper.py +550 -0
- matrice_inference/server/model/triton_model_manager.py +290 -0
- matrice_inference/server/model/triton_server.py +1248 -0
- matrice_inference/server/proxy_interface.py +371 -0
- matrice_inference/server/server.py +1004 -0
- matrice_inference/server/stream/__init__.py +0 -0
- matrice_inference/server/stream/app_deployment.py +228 -0
- matrice_inference/server/stream/consumer_worker.py +201 -0
- matrice_inference/server/stream/frame_cache.py +127 -0
- matrice_inference/server/stream/inference_worker.py +163 -0
- matrice_inference/server/stream/post_processing_worker.py +230 -0
- matrice_inference/server/stream/producer_worker.py +147 -0
- matrice_inference/server/stream/stream_pipeline.py +451 -0
- matrice_inference/server/stream/utils.py +23 -0
- matrice_inference/tmp/abstract_model_manager.py +58 -0
- matrice_inference/tmp/aggregator/__init__.py +18 -0
- matrice_inference/tmp/aggregator/aggregator.py +330 -0
- matrice_inference/tmp/aggregator/analytics.py +906 -0
- matrice_inference/tmp/aggregator/ingestor.py +438 -0
- matrice_inference/tmp/aggregator/latency.py +597 -0
- matrice_inference/tmp/aggregator/pipeline.py +968 -0
- matrice_inference/tmp/aggregator/publisher.py +431 -0
- matrice_inference/tmp/aggregator/synchronizer.py +594 -0
- matrice_inference/tmp/batch_manager.py +239 -0
- matrice_inference/tmp/overall_inference_testing.py +338 -0
- matrice_inference/tmp/triton_utils.py +638 -0
- matrice_inference-0.1.2.dist-info/METADATA +28 -0
- matrice_inference-0.1.2.dist-info/RECORD +37 -0
- matrice_inference-0.1.2.dist-info/WHEEL +5 -0
- matrice_inference-0.1.2.dist-info/licenses/LICENSE.txt +21 -0
- matrice_inference-0.1.2.dist-info/top_level.txt +1 -0
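For orientation, the server-side entry points sit under the matrice_inference.server package. A minimal import sketch, assuming the wheel installs as published and the module paths match the listing above:

# Import sketch based on the file listing above and the package's own imports;
# not taken from separate documentation.
from matrice_inference.server.model.triton_model_manager import TritonModelManager
from matrice_inference.server.model.triton_server import TritonServer, TritonInference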
matrice_inference/server/model/triton_model_manager.py

@@ -0,0 +1,290 @@
import logging
import numpy as np
import requests
import time
from typing import Tuple, Any, List, Union
from matrice_inference.server.model.triton_server import TritonServer, TritonInference


class TritonModelManager:
    """Model manager for Triton Inference Server, aligned with pipeline and inference interface."""

    def __init__(
        self,
        model_name: str,
        model_path: str,
        runtime_framework: str,
        internal_server_type: str,
        internal_port: int,
        internal_host: str,
        input_size: Union[int, List[int]] = 640,  # Priority Obj det
        num_classes: int = 10,
        num_model_instances: int = 1,
        use_dynamic_batching: bool = False,
        max_batch_size: int = 8,
        is_yolo: bool = False,
        is_ocr: bool = False,
        use_trt_accelerator: bool = False,
    ):
        try:
            if internal_server_type not in ["rest", "grpc"]:
                logging.warning(f"Invalid internal_server_type '{internal_server_type}', defaulting to 'rest'")

            self.internal_server_type = internal_server_type
            self.internal_port = internal_port
            self.internal_host = internal_host
            self.use_dynamic_batching = use_dynamic_batching
            self.max_batch_size = max_batch_size

            self.triton_server = TritonServer(
                model_name=model_name,
                model_path=model_path,
                runtime_framework=runtime_framework,
                input_size=input_size,
                num_classes=num_classes,
                dynamic_batching=use_dynamic_batching,
                num_model_instances=num_model_instances,
                max_batch_size=max_batch_size,
                connection_protocol=internal_server_type,
                is_yolo=is_yolo,
                is_ocr=is_ocr,
                use_trt_accelerator=use_trt_accelerator,
            )

            logging.info(f"Starting Triton server on {internal_host}:{internal_port}...")
            self.triton_server_process = self.triton_server.setup(internal_port)

            logging.info("Waiting for Triton server to be ready...")
            self._wait_for_ready()

            self.client = TritonInference(
                server_type=self.triton_server.connection_protocol,
                model_name=model_name,
                internal_port=internal_port,
                internal_host=internal_host,
                runtime_framework=self.triton_server.runtime_framework,
                is_yolo=self.triton_server.is_yolo,
                input_size=input_size,
            )

            logging.info(f"Initialized TritonModelManager with {num_model_instances} client instances, protocol: {self.triton_server.connection_protocol}")

        except Exception as e:
            logging.error(f"Failed to initialize TritonModelManager: {str(e)}", exc_info=True)
            raise

    def _wait_for_ready(self):
        """Wait for Triton server to be ready with fixed retries and 5s sleep."""
        max_attempts = 30  # 150 seconds wait time
        for attempt in range(max_attempts):
            try:
                if self.internal_server_type == "rest":
                    response = requests.get(
                        f"http://{self.internal_host}:{self.internal_port}/v2/health/ready",
                        timeout=5
                    )
                    if response.status_code == 200:
                        logging.info("========= Triton server is ready (REST) =========")
                        break
                    else:
                        logging.info(f"Attempt {attempt + 1}/{max_attempts} - server not ready, retrying in 5 seconds...")
                        time.sleep(5)

                else:  # gRPC
                    try:
                        import tritonclient.grpc as grpcclient
                    except ImportError:
                        grpcclient = None

                    if grpcclient is None:
                        raise ImportError("tritonclient.grpc required for gRPC")

                    with grpcclient.InferenceServerClient(f"{self.internal_host}:{self.internal_port}") as client:
                        if client.is_server_ready():
                            logging.info("========= Triton server is ready (gRPC) =========")
                            break
                        else:
                            logging.info(f"Attempt {attempt + 1}/{max_attempts} - server not ready, retrying in 5 seconds...")
                            time.sleep(5)

            except Exception as e:
                if attempt < max_attempts - 1:
                    logging.info(f"Attempt {attempt + 1}/{max_attempts} failed, retrying in 5 seconds... (Error: {str(e)})")
                    time.sleep(5)
                else:
                    logging.error("Triton server failed to become ready after maximum attempts")
                    raise

    def inference(
        self,
        input: bytes,
    ) -> Tuple[Any, bool]:
        """Perform synchronous single inference using TritonInference client.

        Args:
            input: Primary input data (e.g., image bytes).

        Returns:
            Tuple of (results, success_flag).
        """
        if input is None:
            raise ValueError("Input data cannot be None")
        try:
            client = self.client
            if not client:
                raise RuntimeError("No Triton client available")
            results = client.inference(input)
            results = client.format_response(results)
            return results, True
        except Exception as e:
            logging.error(f"Triton sync inference failed for: {str(e)}", exc_info=True)
            return None, False

    async def async_inference(
        self,
        input: Union[bytes, np.ndarray],
    ) -> Tuple[Any, bool]:
        """Perform asynchronous single inference using TritonInference client.
        Args:
            input: Primary input data (Image bytes or numpy array).

        Returns:
            Tuple of (results, success_flag).
        """


        if input is None:
            logging.error("Input data cannot be None")
            raise ValueError("Input data cannot be None")
        try:
            client = self.client
            if not client:
                logging.error("No Triton client available")
                raise RuntimeError("No Triton client available")
            results = await client.async_inference(input)
            results = client.format_response(results)
            logging.info(f"Async inference result: {results}")
            return results, True
        except Exception as e:
            logging.error(f"Triton async inference failed: {e}")
            return {"error": str(e), "predictions": None}, False

    def batch_inference(
        self,
        input: List[bytes],
    ) -> Tuple[List[Any], bool]:
        """Perform synchronous batch inference using TritonInference client.

        Args:
            input: List of primary input data (e.g., image bytes).

        Returns:
            Tuple of (results_list, success_flag).
        """
        if not input:
            raise ValueError("Batch input cannot be None")
        try:
            client = self.client
            if not client:
                raise RuntimeError("No Triton client available")
            results = []

            if self.use_dynamic_batching:
                input_array = self._preprocess_batch_inputs(input, client)
                batch_results = client.inference(input_array)
                results = self._split_batch_results(batch_results, len(input))
            else:
                for inp in input:
                    result = client.inference(inp)
                    results.append(result)

            results = [client.format_response(result) for result in results]
            return results, True
        except Exception as e:
            logging.error(f"Triton sync batch inference failed for: {str(e)}", exc_info=True)
            return None, False

    async def async_batch_inference(
        self,
        input: List[bytes],
    ) -> Tuple[List[Any], bool]:
        """Perform asynchronous batch inference using TritonInference client.

        Args:
            input: List of primary input data (e.g., image bytes).

        Returns:
            Tuple of (results_list, success_flag).
        """
        if not input:
            raise ValueError("Batch input cannot be None")
        try:
            client = self.client
            if not client:
                raise RuntimeError("No Triton client available")
            results = []

            if self.use_dynamic_batching:
                input_array = self._preprocess_batch_inputs(input, client)
                batch_results = await client.async_inference(input_array)
                split_results = self._split_batch_results(batch_results, len(input))
                results = [client.format_response(r) for r in split_results]
            else:
                for inp in input:
                    res = await client.async_inference(inp)
                    results.append(client.format_response(res))

            return results, True
        except Exception as e:
            logging.error(f"Triton async batch inference failed for: {str(e)}", exc_info=True)
            return None, False

    def _preprocess_batch_inputs(self, input: List[bytes], client: TritonInference) -> np.ndarray:
        """Preprocess batch inputs for Triton dynamic batching.

        Args:
            input: List of input data (e.g., image bytes).
            client: TritonInference client for shape and data type information.

        Returns:
            Preprocessed NumPy array for batch inference.
        """
        try:
            batch_inputs = []
            for inp in input:
                arr = client._preprocess_input(inp)

                if arr.ndim == 4 and arr.shape[0] == 1:
                    arr = np.squeeze(arr, axis=0)

                if arr.ndim != 3:
                    logging.warning(f"Unexpected input shape {arr.shape}, expected (C,H,W) after preprocessing")

                batch_inputs.append(arr)

            # Stack into final batch (B, C, H, W)
            stacked = np.stack(batch_inputs, axis=0)
            # Ensure C-contiguous (important for Triton)
            return np.ascontiguousarray(stacked)

        except Exception as e:
            logging.error(f"Failed to preprocess batch inputs: {str(e)}", exc_info=True)
            raise


    def _split_batch_results(self, batch_results: np.ndarray, batch_size: int) -> List[Any]:
        """Split batch results into individual results.

        Args:
            batch_results: NumPy array of batch inference results.
            batch_size: Number of inputs in the batch.

        Returns:
            List of individual results.
        """
        try:
            if batch_results.ndim == 1:
                return [batch_results] * batch_size
            return [batch_results[i] for i in range(batch_size)]
        except Exception as e:
            logging.error(f"Failed to split batch results: {str(e)}", exc_info=True)
            raise
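Taken together, TritonModelManager starts a Triton Inference Server via TritonServer.setup, polls the /v2/health/ready endpoint (or the gRPC equivalent) until the server responds, and then routes the sync/async single and batch entry points through one TritonInference client, each returning a (results, success_flag) tuple. A minimal usage sketch follows; the model path, framework string, host, and port are placeholders rather than values taken from the package, and a working local Triton installation is assumed:

from matrice_inference.server.model.triton_model_manager import TritonModelManager

# Hypothetical values: model path, framework name, host, and port are placeholders.
manager = TritonModelManager(
    model_name="detector",
    model_path="/models/detector.onnx",
    runtime_framework="onnx",
    internal_server_type="rest",
    internal_port=8000,
    internal_host="localhost",
    input_size=640,
    use_dynamic_batching=True,
    max_batch_size=8,
)

with open("frame.jpg", "rb") as f:
    image_bytes = f.read()

# Single synchronous request: returns (formatted results, success flag).
results, ok = manager.inference(image_bytes)

# Batch request: with use_dynamic_batching=True the inputs are stacked into one
# (B, C, H, W) array, sent in a single call, and split back into per-image results.
batch_results, ok = manager.batch_inference([image_bytes, image_bytes])

The async_inference and async_batch_inference coroutines mirror these calls for use inside an event loop.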