matrice-inference 0.1.58__tar.gz → 0.1.73__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/PKG-INFO +1 -1
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/matrice_inference.egg-info/PKG-INFO +1 -1
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/inference_interface.py +341 -25
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/model/model_manager_wrapper.py +1 -1
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/server.py +9 -1
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/analytics_publisher.py +43 -8
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/consumer_manager.py +91 -13
- matrice_inference-0.1.73/src/matrice_inference/server/stream/inference_worker.py +799 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/post_processing_manager.py +56 -29
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/producer_worker.py +53 -15
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/stream_pipeline.py +139 -99
- matrice_inference-0.1.58/src/matrice_inference/server/stream/inference_worker.py +0 -561
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/LICENSE.txt +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/README.md +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/matrice_inference.egg-info/SOURCES.txt +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/matrice_inference.egg-info/dependency_links.txt +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/matrice_inference.egg-info/not-zip-safe +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/matrice_inference.egg-info/top_level.txt +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/pyproject.toml +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/setup.cfg +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/setup.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/__init__.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/py.typed +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/__init__.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/model/__init__.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/model/model_manager.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/model/triton_model_manager.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/model/triton_server.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/proxy_interface.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/DATA_FLOW_DIAGRAM.md +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/STREAMING_PIPELINE_ARCHITECTURE.md +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/__init__.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/app_deployment.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/app_event_listener.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/camera_config_monitor.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/deployment_refresh_listener.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/frame_cache.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/inference_metric_logger.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/metric_publisher.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/utils.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/worker_metrics.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/abstract_model_manager.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/aggregator/__init__.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/aggregator/aggregator.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/aggregator/analytics.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/aggregator/ingestor.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/aggregator/latency.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/aggregator/pipeline.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/aggregator/publisher.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/aggregator/synchronizer.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/batch_manager.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/overall_inference_testing.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/triton_utils.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/tests/test_frame_cache_optimizations.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/tests/test_integration_real_components.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/tests/test_msgpack_simple.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/tests/test_msgpack_unpacking.py +0 -0
- {matrice_inference-0.1.58 → matrice_inference-0.1.73}/tests/test_streaming_pipeline_e2e.py +0 -0
{matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/inference_interface.py
RENAMED

```diff
@@ -1,5 +1,5 @@
 """
-InferenceInterface: Thread-safe inference with
+InferenceInterface: Thread-safe inference with worker queue routing.
 
 THREAD SAFETY & CONCURRENT REQUEST HANDLING:
 ============================================
```
```diff
@@ -10,41 +10,45 @@ This module solves the greenlet thread context switching problem that occurs whe
 
 The Problem:
 -----------
-- Streaming frames
-- Direct API calls
-- Models
+- Streaming frames are processed by inference worker processes with their own models
+- Direct API calls attempt to use models in the main process from different thread contexts
+- Models using gevent/greenlet internally cannot switch between different greenlet contexts
 - This causes: "Cannot switch to a different thread" errors
 
-The Solution:
-
-1. StreamingPipeline creates
-2.
-
-
-
-
+The Solution (Worker Queue Routing):
+-----------------------------------
+1. StreamingPipeline creates inference worker processes that load their own models
+2. When pipeline is active, ALL inference requests (streaming + direct API) are routed
+   through the same worker queue (inference_queue)
+3. Direct API calls (identity images) submit tasks to the worker queue and wait for
+   responses via a dedicated response queue (direct_api_response_queue)
+4. This ensures all inference uses the same greenlet context (worker process)
+5. High-priority requests bypass the streaming queue backpressure with priority handling
 
 Benefits:
 --------
-- No greenlet thread context errors
+- No greenlet thread context errors (all inference in worker process context)
 - Identity images work during streaming
-- Natural frame skipping:
-  dropped
-- Simple, robust
+- Natural frame skipping: Workers process identity images, streaming frames queue up
+  and may be dropped if queue fills (acceptable for continuous video streams)
+- Simple, robust architecture using multiprocessing queues
 
 Usage:
 -----
-1. StreamingPipeline calls: inference_interface.
-2.
-3.
+1. StreamingPipeline calls: inference_interface.set_worker_queues(input_queue, response_queue)
+2. Direct API calls automatically route through worker queue when pipeline is active
+3. High-priority requests (identity images) get dedicated handling
 """
 
 from matrice_inference.server.model.model_manager_wrapper import ModelManagerWrapper
-from typing import Dict, Any, Optional, Tuple, Union
+from typing import Dict, Any, List, Optional, Tuple, Union
 from datetime import datetime, timezone
 import logging
 import time
 import asyncio
+import multiprocessing as mp
+import uuid
+import queue
 from matrice_analytics.post_processing.post_processor import PostProcessor
 
 class InferenceInterface:
```
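The rewritten docstring describes the routing model in prose. The sketch below is a minimal, self-contained illustration of the same idea, assuming a toy `fake_model` and a single worker process; it is not the package's worker code (the real worker loop lives in the new inference_worker.py, which is not shown in this diff).

```python
# Minimal sketch: one worker process owns the "model"; both streaming frames and
# direct API requests reach it through the same input queue, and direct API results
# come back on a response queue correlated by request_id. All names here except the
# queue-routing idea itself are illustrative assumptions.
import multiprocessing as mp
import uuid


def fake_model(image_bytes: bytes) -> dict:
    # Stand-in for real model inference running inside the worker process.
    return {"detections": [], "num_bytes": len(image_bytes)}


def inference_worker(input_queue: mp.Queue, response_queue: mp.Queue) -> None:
    while True:
        task = input_queue.get()
        if task is None:  # shutdown sentinel
            break
        result = fake_model(task["input_bytes"])
        if task.get("type") == "direct_api":
            # Direct API callers wait for this correlated response.
            response_queue.put(
                {"request_id": task["request_id"], "success": True, "model_result": result}
            )
        # Streaming frames would instead flow on to post-processing (omitted here).


if __name__ == "__main__":
    input_q: mp.Queue = mp.Queue(maxsize=100)
    response_q: mp.Queue = mp.Queue()
    worker = mp.Process(target=inference_worker, args=(input_q, response_q), daemon=True)
    worker.start()

    # A direct API call routed through the same worker queue as streaming frames.
    request_id = str(uuid.uuid4())
    input_q.put({"type": "direct_api", "request_id": request_id, "input_bytes": b"\x00" * 16})
    reply = response_q.get(timeout=5.0)
    print(reply["request_id"] == request_id, reply["model_result"])

    input_q.put(None)  # stop the worker
    worker.join(timeout=2.0)
```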
```diff
@@ -68,6 +72,14 @@ class InferenceInterface:
         self.latest_inference_time = datetime.now(timezone.utc)
         self.pipeline_event_loop: Optional[asyncio.AbstractEventLoop] = None
 
+        # Worker queue routing for direct API calls
+        # When set, ALL inference requests are routed through worker processes
+        # to avoid greenlet context switching issues
+        self._worker_input_queues: Optional[List[mp.Queue]] = None
+        self._worker_response_queue: Optional[mp.Queue] = None
+        self._use_worker_queue_routing = False
+        self._direct_api_worker_counter = 0  # Round-robin counter for load balancing
+
         # Track concurrent inference requests for monitoring
         self._active_inference_count = 0
         self._inference_count_lock = asyncio.Lock() if asyncio else None
```
```diff
@@ -85,6 +97,38 @@ class InferenceInterface:
         self.pipeline_event_loop = event_loop
         self.logger.info("Pipeline event loop registered for thread-safe inference")
 
+    def set_worker_queues(
+        self,
+        input_queues: List[mp.Queue],
+        response_queue: mp.Queue,
+    ) -> None:
+        """Set worker queues for routing direct API calls through inference workers.
+
+        When set, direct API calls (e.g., identity images for face recognition) are
+        routed through the same inference worker processes that handle streaming frames.
+        This avoids greenlet context switching issues by ensuring all model inference
+        happens in the worker process context.
+
+        Args:
+            input_queues: List of multiprocessing queues (one per worker) for submitting tasks
+            response_queue: Multiprocessing queue for receiving inference results
+        """
+        self._worker_input_queues = input_queues
+        self._worker_response_queue = response_queue
+        self._use_worker_queue_routing = True
+        self._direct_api_worker_counter = 0  # Round-robin counter for load balancing
+        self.logger.info(
+            f"Worker queue routing enabled - direct API calls will use {len(input_queues)} inference workers"
+        )
+
+    def disable_worker_queue_routing(self) -> None:
+        """Disable worker queue routing (used when pipeline stops)."""
+        self._use_worker_queue_routing = False
+        self._worker_input_queues = None
+        self._worker_response_queue = None
+        self._direct_api_worker_counter = 0
+        self.logger.info("Worker queue routing disabled")
+
     def has_async_predict(self) -> bool:
         """Check if async_predict is available in the underlying model manager.
 
```
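`set_worker_queues` and `disable_worker_queue_routing` are meant to be called by the pipeline when it starts and stops its worker processes. A hedged sketch of that wiring, assuming hypothetical helper functions and queue sizes (the actual call sites are in stream_pipeline.py, whose diff is not shown in this view):

```python
# Hypothetical pipeline-side wiring; only set_worker_queues and
# disable_worker_queue_routing come from the diff above, the rest is illustrative.
import multiprocessing as mp


def start_pipeline(inference_interface, num_workers: int = 2):
    # One input queue per inference worker, plus a shared response queue
    # for direct API results.
    input_queues = [mp.Queue(maxsize=5000) for _ in range(num_workers)]
    direct_api_response_queue = mp.Queue()
    # ... spawn one inference worker process per input queue here ...
    inference_interface.set_worker_queues(input_queues, direct_api_response_queue)
    return input_queues, direct_api_response_queue


def stop_pipeline(inference_interface):
    # Restore direct, in-process inference once the workers are gone.
    inference_interface.disable_worker_queue_routing()
```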
```diff
@@ -106,6 +150,248 @@ class InferenceInterface:
         except Exception as e:
             self.logger.warning(f"Error checking async_predict availability: {e}")
             return False
+
+    def _route_through_worker_queue(
+        self,
+        input: Any,
+        extra_params: Optional[Dict[str, Any]] = None,
+        stream_key: Optional[str] = None,
+        stream_info: Optional[Dict[str, Any]] = None,
+        timeout: float = 5.0,
+    ) -> Tuple[Any, bool]:
+        """Route inference through worker queue to avoid greenlet context issues.
+
+        This method submits the inference task to the same queue used by streaming
+        frames, ensuring the model is accessed in the worker process context where
+        it was loaded. This avoids greenlet "Cannot switch to a different thread" errors.
+
+        Args:
+            input: Input data (image bytes)
+            extra_params: Additional parameters for inference
+            stream_key: Stream key identifier
+            stream_info: Stream metadata
+            timeout: Maximum time to wait for response (seconds)
+
+        Returns:
+            Tuple of (results, success_flag)
+
+        Raises:
+            RuntimeError: If worker queue routing fails
+        """
+        if not self._worker_input_queues:
+            raise RuntimeError("Worker queues not configured for routing")
+
+        # Generate unique request ID for correlation
+        request_id = str(uuid.uuid4())
+
+        # Create a dedicated response queue for this request to avoid cross-talk
+        response_queue = mp.Queue(maxsize=1)
+
+        # Create task for worker queue
+        # Uses special "direct_api" type so workers know to send response back
+        task = {
+            "type": "direct_api",
+            "request_id": request_id,
+            "input_bytes": input if isinstance(input, bytes) else bytes(input),
+            "extra_params": extra_params or {},
+            "stream_key": stream_key or f"direct_api_{request_id}",
+            "stream_info": stream_info,
+            "response_queue": response_queue,
+            # Required fields for worker validation (using placeholder values)
+            "camera_id": f"direct_api_{request_id[:8]}",
+            "frame_id": request_id,
+            "message": {"type": "direct_api"},
+            "camera_config": {"type": "direct_api"},
+        }
+
+        # Round-robin select a worker queue for load balancing
+        num_workers = len(self._worker_input_queues)
+        worker_id = self._direct_api_worker_counter % num_workers
+        self._direct_api_worker_counter += 1
+        target_queue = self._worker_input_queues[worker_id]
+
+        self.logger.debug(f"Submitting direct API task {request_id} to worker {worker_id}")
+
+        try:
+            # Submit task to worker queue (non-blocking with short timeout)
+            # This ensures we don't block forever if queue is full
+            target_queue.put(task, timeout=5.0)
+        except Exception as e:
+            self.logger.error(f"Failed to submit task to worker queue {worker_id}: {e}")
+            raise RuntimeError(f"Worker queue submission failed: {e}") from e
+
+        # Wait for response on the dedicated response queue
+        try:
+            result = response_queue.get(timeout=timeout)
+        except Exception:
+            raise RuntimeError(
+                f"Timeout waiting for worker response (request_id={request_id}, timeout={timeout}s)"
+            )
+
+        # Extract result
+        if result.get("success"):
+            self.logger.debug(f"Direct API task {request_id} completed successfully")
+            return result.get("model_result"), True
+        else:
+            error_msg = result.get("error", "Unknown worker error")
+            self.logger.error(f"Direct API task {request_id} failed: {error_msg}")
+            return None, False
+
+    async def _inference_via_worker_queue(
+        self,
+        input: Any,
+        extra_params: Optional[Dict[str, Any]] = None,
+        apply_post_processing: bool = False,
+        post_processing_config: Optional[Union[Dict[str, Any], str]] = None,
+        stream_key: Optional[str] = None,
+        stream_info: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[Any, Optional[Dict[str, Any]]]:
+        """Async wrapper for worker queue inference with optional post-processing.
+
+        Routes inference through worker queue and handles post-processing if requested.
+        This method is used for high-priority requests (e.g., identity images) when
+        streaming is active to avoid greenlet context switching issues.
+
+        Args:
+            input: Input data (image bytes)
+            extra_params: Additional parameters for inference
+            apply_post_processing: Whether to apply post-processing
+            post_processing_config: Configuration for post-processing
+            stream_key: Stream key identifier
+            stream_info: Stream metadata
+
+        Returns:
+            Tuple of (results, metadata)
+        """
+        model_start_time = time.time()
+
+        # Update latest inference time
+        self.latest_inference_time = datetime.now(timezone.utc)
+
+        try:
+            # Route through worker queue (synchronous call)
+            # Run in thread pool to avoid blocking async event loop
+            loop = asyncio.get_event_loop()
+            raw_results, success = await loop.run_in_executor(
+                None,  # Use default executor
+                self._route_through_worker_queue,
+                input,
+                extra_params,
+                stream_key,
+                stream_info,
+                6.0,  # timeout
+            )
+
+            model_inference_time = time.time() - model_start_time
+
+            if not success:
+                raise RuntimeError("Model inference via worker queue failed")
+
+            self.logger.debug(
+                f"Worker queue inference executed stream_key={stream_key} "
+                f"time={model_inference_time:.4f}s"
+            )
+
+        except Exception as exc:
+            error_msg = str(exc)
+            if "greenlet" in error_msg.lower() or "cannot switch" in error_msg.lower():
+                self.logger.error(
+                    f"Greenlet error in worker queue routing. This is unexpected - "
+                    f"worker queue routing should avoid greenlet issues. Error: {error_msg}",
+                    exc_info=True
+                )
+            else:
+                self.logger.error(f"Worker queue inference failed: {error_msg}", exc_info=True)
+            raise RuntimeError(f"Worker queue inference failed: {error_msg}") from exc
+
+        # If no post-processing requested, return raw results
+        if not apply_post_processing or not self.post_processor:
+            return raw_results, {
+                "timing_metadata": {
+                    "model_inference_time_sec": model_inference_time,
+                    "post_processing_time_sec": 0.0,
+                    "total_time_sec": model_inference_time,
+                },
+                "routing": "worker_queue",
+            }
+
+        # Apply post-processing using PostProcessor
+        try:
+            post_processing_start_time = time.time()
+
+            result = await self.post_processor.process(
+                data=raw_results,
+                config=post_processing_config,
+                input_bytes=input if isinstance(input, bytes) else None,
+                stream_key=stream_key,
+                stream_info=stream_info
+            )
+
+            post_processing_time = time.time() - post_processing_start_time
+
+            if result.is_success():
+                processed_raw_results = [] if (
+                    hasattr(result, 'usecase') and result.usecase == 'face_recognition'
+                ) else raw_results
+
+                agg_summary = {}
+                if hasattr(result, 'data') and isinstance(result.data, dict):
+                    agg_summary = result.data.get("agg_summary", {})
+
+                post_processing_result = {
+                    "status": "success",
+                    "processing_time": result.processing_time,
+                    "usecase": getattr(result, 'usecase', ''),
+                    "category": getattr(result, 'category', ''),
+                    "summary": getattr(result, 'summary', ''),
+                    "insights": getattr(result, 'insights', []),
+                    "metrics": getattr(result, 'metrics', {}),
+                    "predictions": getattr(result, 'predictions', []),
+                    "agg_summary": agg_summary,
+                    "stream_key": stream_key or "default_stream",
+                    "timing_metadata": {
+                        "model_inference_time_sec": model_inference_time,
+                        "post_processing_time_sec": post_processing_time,
+                        "total_time_sec": model_inference_time + post_processing_time,
+                    },
+                    "routing": "worker_queue",
+                }
+
+                return processed_raw_results, post_processing_result
+            else:
+                self.logger.error(f"Post-processing failed: {result.error_message}")
+                return raw_results, {
+                    "status": "post_processing_failed",
+                    "error": result.error_message,
+                    "error_type": getattr(result, 'error_type', 'ProcessingError'),
+                    "processing_time": result.processing_time,
+                    "processed_data": raw_results,
+                    "stream_key": stream_key or "default_stream",
+                    "timing_metadata": {
+                        "model_inference_time_sec": model_inference_time,
+                        "post_processing_time_sec": post_processing_time,
+                        "total_time_sec": model_inference_time + post_processing_time,
+                    },
+                    "routing": "worker_queue",
+                }
+
+        except Exception as e:
+            post_processing_time = time.time() - post_processing_start_time
+            self.logger.error(f"Post-processing exception: {str(e)}", exc_info=True)
+
+            return raw_results, {
+                "status": "post_processing_failed",
+                "error": str(e),
+                "error_type": type(e).__name__,
+                "processed_data": raw_results,
+                "stream_key": stream_key or "default_stream",
+                "timing_metadata": {
+                    "model_inference_time_sec": model_inference_time,
+                    "post_processing_time_sec": post_processing_time,
+                    "total_time_sec": model_inference_time + post_processing_time,
+                },
+                "routing": "worker_queue",
+            }
 
     async def inference(
         self,
```
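`_route_through_worker_queue` puts a per-request response queue inside the task and reads back a dict with `success`, `model_result`, and `error` keys. The handler below sketches what the worker-side counterpart has to do for `"direct_api"` tasks, assuming a hypothetical `run_model` callable; the real implementation is in the new inference_worker.py (+799 lines, listed in the file summary but not shown in this view).

```python
# Illustrative worker-side handler for "direct_api" tasks, matching the reply shape
# that _route_through_worker_queue reads. The run_model callable and the surrounding
# worker loop are assumptions, not the package's code.
from typing import Any, Callable, Dict


def handle_direct_api_task(task: Dict[str, Any], run_model: Callable[[bytes], Any]) -> None:
    response_queue = task["response_queue"]
    try:
        model_result = run_model(task["input_bytes"])
        response_queue.put({
            "request_id": task["request_id"],
            "success": True,
            "model_result": model_result,
        })
    except Exception as exc:  # report failures instead of leaving the caller to time out
        response_queue.put({
            "request_id": task["request_id"],
            "success": False,
            "error": str(exc),
        })
```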
```diff
@@ -138,10 +424,11 @@ class InferenceInterface:
             - Metadata about the inference and post-processing (if applicable)
 
         Note:
-            High-priority requests (like identity images for face recognition)
-
-
-
+            High-priority requests (like identity images for face recognition) are routed
+            through the worker queue when streaming is active. This avoids greenlet context
+            switching issues by ensuring all model inference happens in the worker process.
+            During their execution, streaming frames may be naturally skipped if the
+            inference queue fills up, which is acceptable for continuous streaming scenarios.
         """
         if input is None:
             raise ValueError("Input cannot be None")
```
```diff
@@ -150,6 +437,35 @@
         if is_high_priority:
             self.logger.info(f"Processing high-priority inference request (stream_key={stream_key})")
 
+        # CRITICAL: Route high-priority requests through worker queue when streaming is active
+        # This avoids greenlet "Cannot switch to a different thread" errors
+        # Only applies when: 1) high priority request AND 2) worker queue routing enabled AND 3) queues available
+        if (
+            is_high_priority
+            and self._use_worker_queue_routing
+            and self._worker_input_queues is not None
+        ):
+            self.logger.info(
+                f"Routing high-priority request through worker queue to avoid greenlet issues "
+                f"(stream_key={stream_key})"
+            )
+            try:
+                return await self._inference_via_worker_queue(
+                    input=input,
+                    extra_params=extra_params,
+                    apply_post_processing=apply_post_processing,
+                    post_processing_config=post_processing_config,
+                    stream_key=stream_key,
+                    stream_info=stream_info,
+                )
+            except Exception as worker_exc:
+                # If worker queue routing fails, log warning and fall back to direct inference
+                # This ensures the request still has a chance to complete
+                self.logger.warning(
+                    f"Worker queue routing failed, falling back to direct inference: {worker_exc}"
+                )
+                # Continue to original inference path below
+
         # Measure model inference time
         model_start_time = time.time()
 
```
```diff
@@ -180,7 +496,7 @@
                     event_loop_to_use
                 )
                 # High-priority requests get longer timeout
-                timeout =
+                timeout = 10.0 if is_high_priority else 6.0
                 raw_results, success = future.result(timeout=timeout)
             else:
                 # Fall back to sync inference (no async support or no event loop)
```
{matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/model/model_manager_wrapper.py
RENAMED

```diff
@@ -77,7 +77,7 @@ class ModelManagerWrapper:
         self.action_tracker = action_tracker
         self.test_env = test_env
         self.model_type = model_type.lower() if model_type else "default"
-
+        self.model_type = "default"  # TODO: remove this once BE is updated with the current types
         # Validate model_type
         if self.model_type not in ["default", "triton"]:
             raise ValueError(f"Invalid model_type '{self.model_type}'. Must be 'default' or 'triton'")
```
{matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/server.py
RENAMED
```diff
@@ -422,6 +422,11 @@ class MatriceDeployServer:
         except Exception as e:
             logging.warning(f"Failed to get index_to_category from action_tracker: {str(e)}")
 
+        # Store post-processing config for passing to StreamingPipeline (as dict, not extracted from post_processor)
+        self._post_processing_config = post_processing_config
+        self._index_to_category = index_to_category
+        self._target_categories = target_categories
+
         # Create PostProcessor
         self.post_processor = PostProcessor(
             post_processing_config=post_processing_config,
@@ -473,7 +478,6 @@
         # Create streaming pipeline with configured parameters
         self.streaming_pipeline = StreamingPipeline(
             inference_interface=self.inference_interface,
-            post_processor=self.post_processor,
             inference_queue_maxsize=self.job_params.get("inference_queue_maxsize", 5000),
             postproc_queue_maxsize=self.job_params.get("postproc_queue_maxsize", 5000),
             output_queue_maxsize=self.job_params.get("output_queue_maxsize", 5000),
@@ -499,6 +503,10 @@
             async_predict=self.async_predict,
             async_load_model=self.async_load_model,
             batch_predict=self.batch_predict,
+            # Pass post-processing configuration as dict (not extracted from post_processor)
+            post_processing_config=getattr(self, '_post_processing_config', {}),
+            index_to_category=getattr(self, '_index_to_category', None),
+            target_categories=getattr(self, '_target_categories', None),
         )
 
         # Start the pipeline (now manages its own event loop thread)
```
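Taken together, these three hunks stop passing a live `post_processor` into StreamingPipeline and pass the raw configuration instead. A plausible reading is that worker processes now build their own PostProcessor from the plain dict rather than sharing a pickled instance; the constructor below is only a sketch of that receiving side, since the stream_pipeline.py changes are not included in this view and its real signature may differ.

```python
# Hypothetical receiving side for the keyword arguments added above; every name
# other than post_processing_config, index_to_category, and target_categories is
# an illustrative assumption.
from typing import Any, Dict, List, Optional


class StreamingPipelineSketch:
    def __init__(
        self,
        inference_interface: Any,
        post_processing_config: Optional[Dict[str, Any]] = None,
        index_to_category: Optional[Dict[int, str]] = None,
        target_categories: Optional[List[str]] = None,
        **queue_and_worker_options: Any,
    ) -> None:
        # Keeping the raw dict lets each worker process construct its own
        # PostProcessor locally instead of receiving a shared live instance.
        self.post_processing_config = post_processing_config or {}
        self.index_to_category = index_to_category
        self.target_categories = target_categories
        self.options = queue_and_worker_options
```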
{matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/analytics_publisher.py
RENAMED

```diff
@@ -240,15 +240,33 @@ class AnalyticsPublisher:
         try:
             camera_id = task_data.get("camera_id")
             if not camera_id:
+                self.logger.debug("No camera_id in task_data, skipping analytics extraction")
                 return
-
+
             data = task_data.get("data", {})
             post_processing_result = data.get("post_processing_result", {})
-
-
-
+
+            # Check for agg_summary at top level (current format after flattening)
+            # or nested in data field (legacy format for backward compatibility)
+            agg_summary = post_processing_result.get("agg_summary")
+            if agg_summary is None and "data" in post_processing_result:
+                # Legacy format: agg_summary nested in data field
+                agg_summary = post_processing_result.get("data", {}).get("agg_summary")
+                if agg_summary:
+                    self.logger.debug(f"Found agg_summary in legacy nested format for camera {camera_id}")
+
+            # Skip if no agg_summary found
+            if not agg_summary or not isinstance(agg_summary, dict):
+                self.logger.debug(
+                    f"No valid agg_summary for camera {camera_id}. "
+                    f"post_processing_result keys: {list(post_processing_result.keys()) if post_processing_result else 'empty'}"
+                )
                 return
-
+
+            self.logger.debug(
+                f"Processing agg_summary for camera {camera_id} with {len(agg_summary)} frame(s)"
+            )
+
             # Process each frame in agg_summary
             for frame_id, frame_data in agg_summary.items():
                 tracking_stats = frame_data.get("tracking_stats", {})
```
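The extraction above accepts `agg_summary` either at the top level of `post_processing_result` (current, flattened format) or nested under `data` (legacy format). A small self-contained example of both shapes and the shared extraction logic, with illustrative frame payloads:

```python
# Two payload shapes the new code handles; the frame contents are minimal
# illustrative values, not real pipeline output.
current_format = {
    "status": "success",
    "agg_summary": {"frame_001": {"tracking_stats": {"current_counts": [], "total_counts": []}}},
}
legacy_format = {
    "status": "success",
    "data": {"agg_summary": {"frame_001": {"tracking_stats": {"current_counts": [], "total_counts": []}}}},
}


def extract_agg_summary(post_processing_result: dict) -> dict:
    # Same lookup order as the diff: top-level first, then the legacy nested field.
    agg_summary = post_processing_result.get("agg_summary")
    if agg_summary is None and "data" in post_processing_result:
        agg_summary = post_processing_result.get("data", {}).get("agg_summary")
    return agg_summary or {}


assert extract_agg_summary(current_format) == extract_agg_summary(legacy_format)
```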
```diff
@@ -349,15 +367,25 @@
     async def _publish_analytics(self) -> None:
         """Publish aggregated analytics to Redis and optionally Kafka."""
         try:
+            if not self.analytics_store:
+                self.logger.debug("No analytics data to publish (analytics_store is empty)")
+                return
+
+            self.logger.info(
+                f"Publishing analytics for {len(self.analytics_store)} camera(s) to results-agg"
+            )
+
             # Publish analytics for each camera
             for camera_id, analytics_data in self.analytics_store.items():
                 if not analytics_data:
+                    self.logger.debug(f"No analytics data for camera {camera_id}, skipping")
                     continue
 
                 # Build analytics message
                 message = self._build_analytics_message(camera_id, analytics_data)
 
                 if not message:
+                    self.logger.warning(f"Failed to build analytics message for camera {camera_id}")
                     continue
 
                 # Publish to Redis (required)
```
```diff
@@ -463,15 +491,22 @@
             if not self.redis_stream:
                 self.logger.warning("Redis stream not initialized, skipping publish")
                 return
-
+
             message_json = json.dumps(message)
             await self.redis_stream.async_add_message(
                 self.ANALYTICS_TOPIC,
                 message_json,
                 key=camera_id
             )
-
-
+
+            # Log at info level so we can see when data is being published
+            tracking_stats = message.get("tracking_stats", {})
+            current_counts = tracking_stats.get("current_counts", [])
+            total_counts = tracking_stats.get("total_counts", [])
+            self.logger.info(
+                f"Published analytics to Redis '{self.ANALYTICS_TOPIC}' for camera {camera_id}: "
+                f"current={current_counts}, total={total_counts}"
+            )
 
         except Exception as e:
             self.logger.error(f"Error publishing to Redis for {camera_id}: {e}", exc_info=True)
```
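The new log line reads `current_counts` and `total_counts` from the message's `tracking_stats`. The snippet below shows only the fields it touches; everything else in the example message, and the `results-agg` topic string standing in for `self.ANALYTICS_TOPIC`, is an assumption used purely for illustration.

```python
# Hypothetical analytics message; only the keys read by the new logging code
# (tracking_stats.current_counts / total_counts) are taken from the diff above.
message = {
    "camera_id": "cam-01",
    "tracking_stats": {
        "current_counts": [{"category": "person", "count": 3}],
        "total_counts": [{"category": "person", "count": 42}],
    },
}

tracking_stats = message.get("tracking_stats", {})
print(
    f"Published analytics to Redis 'results-agg' for camera {message['camera_id']}: "
    f"current={tracking_stats.get('current_counts', [])}, "
    f"total={tracking_stats.get('total_counts', [])}"
)
```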