matrice-inference 0.1.58__tar.gz → 0.1.73__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/PKG-INFO +1 -1
  2. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/matrice_inference.egg-info/PKG-INFO +1 -1
  3. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/inference_interface.py +341 -25
  4. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/model/model_manager_wrapper.py +1 -1
  5. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/server.py +9 -1
  6. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/analytics_publisher.py +43 -8
  7. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/consumer_manager.py +91 -13
  8. matrice_inference-0.1.73/src/matrice_inference/server/stream/inference_worker.py +799 -0
  9. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/post_processing_manager.py +56 -29
  10. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/producer_worker.py +53 -15
  11. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/stream_pipeline.py +139 -99
  12. matrice_inference-0.1.58/src/matrice_inference/server/stream/inference_worker.py +0 -561
  13. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/LICENSE.txt +0 -0
  14. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/README.md +0 -0
  15. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/matrice_inference.egg-info/SOURCES.txt +0 -0
  16. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/matrice_inference.egg-info/dependency_links.txt +0 -0
  17. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/matrice_inference.egg-info/not-zip-safe +0 -0
  18. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/matrice_inference.egg-info/top_level.txt +0 -0
  19. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/pyproject.toml +0 -0
  20. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/setup.cfg +0 -0
  21. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/setup.py +0 -0
  22. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/__init__.py +0 -0
  23. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/py.typed +0 -0
  24. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/__init__.py +0 -0
  25. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/model/__init__.py +0 -0
  26. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/model/model_manager.py +0 -0
  27. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/model/triton_model_manager.py +0 -0
  28. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/model/triton_server.py +0 -0
  29. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/proxy_interface.py +0 -0
  30. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/DATA_FLOW_DIAGRAM.md +0 -0
  31. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/STREAMING_PIPELINE_ARCHITECTURE.md +0 -0
  32. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/__init__.py +0 -0
  33. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/app_deployment.py +0 -0
  34. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/app_event_listener.py +0 -0
  35. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/camera_config_monitor.py +0 -0
  36. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/deployment_refresh_listener.py +0 -0
  37. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/frame_cache.py +0 -0
  38. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/inference_metric_logger.py +0 -0
  39. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/metric_publisher.py +0 -0
  40. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/utils.py +0 -0
  41. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/worker_metrics.py +0 -0
  42. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/abstract_model_manager.py +0 -0
  43. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/aggregator/__init__.py +0 -0
  44. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/aggregator/aggregator.py +0 -0
  45. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/aggregator/analytics.py +0 -0
  46. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/aggregator/ingestor.py +0 -0
  47. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/aggregator/latency.py +0 -0
  48. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/aggregator/pipeline.py +0 -0
  49. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/aggregator/publisher.py +0 -0
  50. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/aggregator/synchronizer.py +0 -0
  51. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/batch_manager.py +0 -0
  52. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/overall_inference_testing.py +0 -0
  53. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/triton_utils.py +0 -0
  54. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/tests/test_frame_cache_optimizations.py +0 -0
  55. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/tests/test_integration_real_components.py +0 -0
  56. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/tests/test_msgpack_simple.py +0 -0
  57. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/tests/test_msgpack_unpacking.py +0 -0
  58. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/tests/test_streaming_pipeline_e2e.py +0 -0
--- matrice_inference-0.1.58/PKG-INFO
+++ matrice_inference-0.1.73/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: matrice_inference
-Version: 0.1.58
+Version: 0.1.73
 Summary: Common server utilities for Matrice.ai services
 Author-email: "Matrice.ai" <dipendra@matrice.ai>
 License-Expression: MIT
--- matrice_inference-0.1.58/matrice_inference.egg-info/PKG-INFO
+++ matrice_inference-0.1.73/matrice_inference.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: matrice_inference
-Version: 0.1.58
+Version: 0.1.73
 Summary: Common server utilities for Matrice.ai services
 Author-email: "Matrice.ai" <dipendra@matrice.ai>
 License-Expression: MIT
--- matrice_inference-0.1.58/src/matrice_inference/server/inference_interface.py
+++ matrice_inference-0.1.73/src/matrice_inference/server/inference_interface.py
@@ -1,5 +1,5 @@
 """
-InferenceInterface: Thread-safe inference with unified event loop management.
+InferenceInterface: Thread-safe inference with worker queue routing.
 
 THREAD SAFETY & CONCURRENT REQUEST HANDLING:
 ============================================
@@ -10,41 +10,45 @@ This module solves the greenlet thread context switching problem that occurs whe
 
 The Problem:
 -----------
-- Streaming frames use the StreamingPipeline's dedicated event loop
-- Direct API calls may try to use a different event loop or thread context
-- Models loaded with gevent/greenlet cannot switch between different thread contexts
+- Streaming frames are processed by inference worker processes with their own models
+- Direct API calls attempt to use models in the main process from different thread contexts
+- Models using gevent/greenlet internally cannot switch between different greenlet contexts
 - This causes: "Cannot switch to a different thread" errors
 
-The Solution:
-------------
-1. StreamingPipeline creates a single dedicated event loop at startup
-2. All model instances are loaded in this event loop
-3. InferenceInterface stores a reference to this pipeline event loop
-4. ALL inference requests (streaming + direct API) use asyncio.run_coroutine_threadsafe()
-   to execute in the pipeline's event loop, regardless of which thread they originate from
-5. High-priority requests (identity images) get longer timeouts and always complete
+The Solution (Worker Queue Routing):
+-----------------------------------
+1. StreamingPipeline creates inference worker processes that load their own models
+2. When pipeline is active, ALL inference requests (streaming + direct API) are routed
+   through the same worker queue (inference_queue)
+3. Direct API calls (identity images) submit tasks to the worker queue and wait for
+   responses via a dedicated response queue (direct_api_response_queue)
+4. This ensures all inference uses the same greenlet context (worker process)
+5. High-priority requests bypass the streaming queue backpressure with priority handling
 
 Benefits:
 --------
-- No greenlet thread context errors
+- No greenlet thread context errors (all inference in worker process context)
 - Identity images work during streaming
-- Natural frame skipping: If identity processing takes time, streaming frames may be
-  dropped from the queue, which is acceptable for continuous video streams
-- Simple, robust, maintainable solution
+- Natural frame skipping: Workers process identity images, streaming frames queue up
+  and may be dropped if queue fills (acceptable for continuous video streams)
+- Simple, robust architecture using multiprocessing queues
 
 Usage:
 -----
-1. StreamingPipeline calls: inference_interface.set_pipeline_event_loop(event_loop)
-2. All inference calls automatically use this event loop via run_coroutine_threadsafe()
-3. Direct API calls set is_high_priority=True for guaranteed execution
+1. StreamingPipeline calls: inference_interface.set_worker_queues(input_queue, response_queue)
+2. Direct API calls automatically route through worker queue when pipeline is active
+3. High-priority requests (identity images) get dedicated handling
 """
 
 from matrice_inference.server.model.model_manager_wrapper import ModelManagerWrapper
-from typing import Dict, Any, Optional, Tuple, Union
+from typing import Dict, Any, List, Optional, Tuple, Union
 from datetime import datetime, timezone
 import logging
 import time
 import asyncio
+import multiprocessing as mp
+import uuid
+import queue
 from matrice_analytics.post_processing.post_processor import PostProcessor
 
 class InferenceInterface:
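The rewritten module docstring above describes the new routing contract in prose. As a reading aid only, here is a minimal caller-side sketch of that contract; it assumes an already-constructed inference_interface, and the worker start-up step, queue sizes, and num_workers value are illustrative assumptions, not part of this diff:

```python
# Hypothetical wiring sketch based on the Usage section above; num_workers,
# the queue sizes, and the worker start-up step are illustrative assumptions.
import multiprocessing as mp

num_workers = 2
input_queues = [mp.Queue(maxsize=5000) for _ in range(num_workers)]  # one queue per worker
response_queue = mp.Queue()                                          # shared results path

# ... StreamingPipeline would start one inference worker process per input queue here ...

# Step 1 of the Usage section: register the queues with the interface.
inference_interface.set_worker_queues(input_queues, response_queue)

# Steps 2-3: while the pipeline is active, high-priority direct API calls
# (is_high_priority=True) are routed through these worker queues automatically.

# On pipeline shutdown, routing is switched off again.
inference_interface.disable_worker_queue_routing()
```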
@@ -68,6 +72,14 @@ class InferenceInterface:
         self.latest_inference_time = datetime.now(timezone.utc)
         self.pipeline_event_loop: Optional[asyncio.AbstractEventLoop] = None
 
+        # Worker queue routing for direct API calls
+        # When set, ALL inference requests are routed through worker processes
+        # to avoid greenlet context switching issues
+        self._worker_input_queues: Optional[List[mp.Queue]] = None
+        self._worker_response_queue: Optional[mp.Queue] = None
+        self._use_worker_queue_routing = False
+        self._direct_api_worker_counter = 0  # Round-robin counter for load balancing
+
         # Track concurrent inference requests for monitoring
         self._active_inference_count = 0
         self._inference_count_lock = asyncio.Lock() if asyncio else None
@@ -85,6 +97,38 @@ class InferenceInterface:
         self.pipeline_event_loop = event_loop
         self.logger.info("Pipeline event loop registered for thread-safe inference")
 
+    def set_worker_queues(
+        self,
+        input_queues: List[mp.Queue],
+        response_queue: mp.Queue,
+    ) -> None:
+        """Set worker queues for routing direct API calls through inference workers.
+
+        When set, direct API calls (e.g., identity images for face recognition) are
+        routed through the same inference worker processes that handle streaming frames.
+        This avoids greenlet context switching issues by ensuring all model inference
+        happens in the worker process context.
+
+        Args:
+            input_queues: List of multiprocessing queues (one per worker) for submitting tasks
+            response_queue: Multiprocessing queue for receiving inference results
+        """
+        self._worker_input_queues = input_queues
+        self._worker_response_queue = response_queue
+        self._use_worker_queue_routing = True
+        self._direct_api_worker_counter = 0  # Round-robin counter for load balancing
+        self.logger.info(
+            f"Worker queue routing enabled - direct API calls will use {len(input_queues)} inference workers"
+        )
+
+    def disable_worker_queue_routing(self) -> None:
+        """Disable worker queue routing (used when pipeline stops)."""
+        self._use_worker_queue_routing = False
+        self._worker_input_queues = None
+        self._worker_response_queue = None
+        self._direct_api_worker_counter = 0
+        self.logger.info("Worker queue routing disabled")
+
     def has_async_predict(self) -> bool:
         """Check if async_predict is available in the underlying model manager.
 
@@ -106,6 +150,248 @@ class InferenceInterface:
         except Exception as e:
             self.logger.warning(f"Error checking async_predict availability: {e}")
             return False
+
+    def _route_through_worker_queue(
+        self,
+        input: Any,
+        extra_params: Optional[Dict[str, Any]] = None,
+        stream_key: Optional[str] = None,
+        stream_info: Optional[Dict[str, Any]] = None,
+        timeout: float = 5.0,
+    ) -> Tuple[Any, bool]:
+        """Route inference through worker queue to avoid greenlet context issues.
+
+        This method submits the inference task to the same queue used by streaming
+        frames, ensuring the model is accessed in the worker process context where
+        it was loaded. This avoids greenlet "Cannot switch to a different thread" errors.
+
+        Args:
+            input: Input data (image bytes)
+            extra_params: Additional parameters for inference
+            stream_key: Stream key identifier
+            stream_info: Stream metadata
+            timeout: Maximum time to wait for response (seconds)
+
+        Returns:
+            Tuple of (results, success_flag)
+
+        Raises:
+            RuntimeError: If worker queue routing fails
+        """
+        if not self._worker_input_queues:
+            raise RuntimeError("Worker queues not configured for routing")
+
+        # Generate unique request ID for correlation
+        request_id = str(uuid.uuid4())
+
+        # Create a dedicated response queue for this request to avoid cross-talk
+        response_queue = mp.Queue(maxsize=1)
+
+        # Create task for worker queue
+        # Uses special "direct_api" type so workers know to send response back
+        task = {
+            "type": "direct_api",
+            "request_id": request_id,
+            "input_bytes": input if isinstance(input, bytes) else bytes(input),
+            "extra_params": extra_params or {},
+            "stream_key": stream_key or f"direct_api_{request_id}",
+            "stream_info": stream_info,
+            "response_queue": response_queue,
+            # Required fields for worker validation (using placeholder values)
+            "camera_id": f"direct_api_{request_id[:8]}",
+            "frame_id": request_id,
+            "message": {"type": "direct_api"},
+            "camera_config": {"type": "direct_api"},
+        }
+
+        # Round-robin select a worker queue for load balancing
+        num_workers = len(self._worker_input_queues)
+        worker_id = self._direct_api_worker_counter % num_workers
+        self._direct_api_worker_counter += 1
+        target_queue = self._worker_input_queues[worker_id]
+
+        self.logger.debug(f"Submitting direct API task {request_id} to worker {worker_id}")
+
+        try:
+            # Submit task to worker queue (non-blocking with short timeout)
+            # This ensures we don't block forever if queue is full
+            target_queue.put(task, timeout=5.0)
+        except Exception as e:
+            self.logger.error(f"Failed to submit task to worker queue {worker_id}: {e}")
+            raise RuntimeError(f"Worker queue submission failed: {e}") from e
+
+        # Wait for response on the dedicated response queue
+        try:
+            result = response_queue.get(timeout=timeout)
+        except Exception:
+            raise RuntimeError(
+                f"Timeout waiting for worker response (request_id={request_id}, timeout={timeout}s)"
+            )
+
+        # Extract result
+        if result.get("success"):
+            self.logger.debug(f"Direct API task {request_id} completed successfully")
+            return result.get("model_result"), True
+        else:
+            error_msg = result.get("error", "Unknown worker error")
+            self.logger.error(f"Direct API task {request_id} failed: {error_msg}")
+            return None, False
+
+    async def _inference_via_worker_queue(
+        self,
+        input: Any,
+        extra_params: Optional[Dict[str, Any]] = None,
+        apply_post_processing: bool = False,
+        post_processing_config: Optional[Union[Dict[str, Any], str]] = None,
+        stream_key: Optional[str] = None,
+        stream_info: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[Any, Optional[Dict[str, Any]]]:
+        """Async wrapper for worker queue inference with optional post-processing.
+
+        Routes inference through worker queue and handles post-processing if requested.
+        This method is used for high-priority requests (e.g., identity images) when
+        streaming is active to avoid greenlet context switching issues.
+
+        Args:
+            input: Input data (image bytes)
+            extra_params: Additional parameters for inference
+            apply_post_processing: Whether to apply post-processing
+            post_processing_config: Configuration for post-processing
+            stream_key: Stream key identifier
+            stream_info: Stream metadata
+
+        Returns:
+            Tuple of (results, metadata)
+        """
+        model_start_time = time.time()
+
+        # Update latest inference time
+        self.latest_inference_time = datetime.now(timezone.utc)
+
+        try:
+            # Route through worker queue (synchronous call)
+            # Run in thread pool to avoid blocking async event loop
+            loop = asyncio.get_event_loop()
+            raw_results, success = await loop.run_in_executor(
+                None,  # Use default executor
+                self._route_through_worker_queue,
+                input,
+                extra_params,
+                stream_key,
+                stream_info,
+                6.0,  # timeout
+            )
+
+            model_inference_time = time.time() - model_start_time
+
+            if not success:
+                raise RuntimeError("Model inference via worker queue failed")
+
+            self.logger.debug(
+                f"Worker queue inference executed stream_key={stream_key} "
+                f"time={model_inference_time:.4f}s"
+            )
+
+        except Exception as exc:
+            error_msg = str(exc)
+            if "greenlet" in error_msg.lower() or "cannot switch" in error_msg.lower():
+                self.logger.error(
+                    f"Greenlet error in worker queue routing. This is unexpected - "
+                    f"worker queue routing should avoid greenlet issues. Error: {error_msg}",
+                    exc_info=True
+                )
+            else:
+                self.logger.error(f"Worker queue inference failed: {error_msg}", exc_info=True)
+            raise RuntimeError(f"Worker queue inference failed: {error_msg}") from exc
+
+        # If no post-processing requested, return raw results
+        if not apply_post_processing or not self.post_processor:
+            return raw_results, {
+                "timing_metadata": {
+                    "model_inference_time_sec": model_inference_time,
+                    "post_processing_time_sec": 0.0,
+                    "total_time_sec": model_inference_time,
+                },
+                "routing": "worker_queue",
+            }
+
+        # Apply post-processing using PostProcessor
+        try:
+            post_processing_start_time = time.time()
+
+            result = await self.post_processor.process(
+                data=raw_results,
+                config=post_processing_config,
+                input_bytes=input if isinstance(input, bytes) else None,
+                stream_key=stream_key,
+                stream_info=stream_info
+            )
+
+            post_processing_time = time.time() - post_processing_start_time
+
+            if result.is_success():
+                processed_raw_results = [] if (
+                    hasattr(result, 'usecase') and result.usecase == 'face_recognition'
+                ) else raw_results
+
+                agg_summary = {}
+                if hasattr(result, 'data') and isinstance(result.data, dict):
+                    agg_summary = result.data.get("agg_summary", {})
+
+                post_processing_result = {
+                    "status": "success",
+                    "processing_time": result.processing_time,
+                    "usecase": getattr(result, 'usecase', ''),
+                    "category": getattr(result, 'category', ''),
+                    "summary": getattr(result, 'summary', ''),
+                    "insights": getattr(result, 'insights', []),
+                    "metrics": getattr(result, 'metrics', {}),
+                    "predictions": getattr(result, 'predictions', []),
+                    "agg_summary": agg_summary,
+                    "stream_key": stream_key or "default_stream",
+                    "timing_metadata": {
+                        "model_inference_time_sec": model_inference_time,
+                        "post_processing_time_sec": post_processing_time,
+                        "total_time_sec": model_inference_time + post_processing_time,
+                    },
+                    "routing": "worker_queue",
+                }
+
+                return processed_raw_results, post_processing_result
+            else:
+                self.logger.error(f"Post-processing failed: {result.error_message}")
+                return raw_results, {
+                    "status": "post_processing_failed",
+                    "error": result.error_message,
+                    "error_type": getattr(result, 'error_type', 'ProcessingError'),
+                    "processing_time": result.processing_time,
+                    "processed_data": raw_results,
+                    "stream_key": stream_key or "default_stream",
+                    "timing_metadata": {
+                        "model_inference_time_sec": model_inference_time,
+                        "post_processing_time_sec": post_processing_time,
+                        "total_time_sec": model_inference_time + post_processing_time,
+                    },
+                    "routing": "worker_queue",
+                }
+
+        except Exception as e:
+            post_processing_time = time.time() - post_processing_start_time
+            self.logger.error(f"Post-processing exception: {str(e)}", exc_info=True)
+
+            return raw_results, {
+                "status": "post_processing_failed",
+                "error": str(e),
+                "error_type": type(e).__name__,
+                "processed_data": raw_results,
+                "stream_key": stream_key or "default_stream",
+                "timing_metadata": {
+                    "model_inference_time_sec": model_inference_time,
+                    "post_processing_time_sec": post_processing_time,
+                    "total_time_sec": model_inference_time + post_processing_time,
+                },
+                "routing": "worker_queue",
+            }
 
     async def inference(
         self,
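The task dict built in _route_through_worker_queue defines an informal contract with the worker side, and inference_worker.py (replaced by a new 799-line file in this release) is not shown in this diff. The following is only a plausible sketch of how a worker loop might honour the "direct_api" branch of that contract; handle_direct_api_task and run_model are invented names for illustration:

```python
# Hypothetical worker-side sketch of the "direct_api" reply contract implied by
# _route_through_worker_queue above. The real inference_worker.py is not shown in
# this diff; handle_direct_api_task and run_model are invented for illustration.
import queue

def handle_direct_api_task(task: dict, run_model) -> None:
    """Run inference for one direct API task and reply on its dedicated response queue."""
    response_queue = task["response_queue"]  # mp.Queue(maxsize=1) created by the caller
    try:
        model_result = run_model(task["input_bytes"], **task.get("extra_params", {}))
        reply = {"success": True, "model_result": model_result, "request_id": task["request_id"]}
    except Exception as exc:
        reply = {"success": False, "error": str(exc), "request_id": task["request_id"]}
    try:
        response_queue.put_nowait(reply)
    except queue.Full:
        # The caller only reads one reply and may already have timed out.
        pass
```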
@@ -138,10 +424,11 @@ class InferenceInterface:
             - Metadata about the inference and post-processing (if applicable)
 
         Note:
-            High-priority requests (like identity images for face recognition) will
-            always execute successfully. During their execution, streaming frames may
-            be naturally skipped if the inference queue fills up, which is acceptable
-            for continuous streaming scenarios.
+            High-priority requests (like identity images for face recognition) are routed
+            through the worker queue when streaming is active. This avoids greenlet context
+            switching issues by ensuring all model inference happens in the worker process.
+            During their execution, streaming frames may be naturally skipped if the
+            inference queue fills up, which is acceptable for continuous streaming scenarios.
         """
         if input is None:
             raise ValueError("Input cannot be None")
@@ -150,6 +437,35 @@
         if is_high_priority:
             self.logger.info(f"Processing high-priority inference request (stream_key={stream_key})")
 
+        # CRITICAL: Route high-priority requests through worker queue when streaming is active
+        # This avoids greenlet "Cannot switch to a different thread" errors
+        # Only applies when: 1) high priority request AND 2) worker queue routing enabled AND 3) queues available
+        if (
+            is_high_priority
+            and self._use_worker_queue_routing
+            and self._worker_input_queues is not None
+        ):
+            self.logger.info(
+                f"Routing high-priority request through worker queue to avoid greenlet issues "
+                f"(stream_key={stream_key})"
+            )
+            try:
+                return await self._inference_via_worker_queue(
+                    input=input,
+                    extra_params=extra_params,
+                    apply_post_processing=apply_post_processing,
+                    post_processing_config=post_processing_config,
+                    stream_key=stream_key,
+                    stream_info=stream_info,
+                )
+            except Exception as worker_exc:
+                # If worker queue routing fails, log warning and fall back to direct inference
+                # This ensures the request still has a chance to complete
+                self.logger.warning(
+                    f"Worker queue routing failed, falling back to direct inference: {worker_exc}"
+                )
+                # Continue to original inference path below
+
         # Measure model inference time
         model_start_time = time.time()
 
@@ -180,7 +496,7 @@ class InferenceInterface:
                 event_loop_to_use
             )
             # High-priority requests get longer timeout
-            timeout = 120.0 if is_high_priority else 60.0
+            timeout = 10.0 if is_high_priority else 6.0
             raw_results, success = future.result(timeout=timeout)
         else:
             # Fall back to sync inference (no async support or no event loop)
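Taken together with the routing block above, a direct API caller does not need to know which path serves its request. A hypothetical call during active streaming might look like this; the exact keyword set of inference() beyond the parameters visible in this diff, and the free names image_bytes and post_processing_config, are assumptions:

```python
# Hypothetical usage sketch: an identity image submitted while streaming is active.
# Parameter names mirror those visible in the diff; anything else is assumed.
raw_results, metadata = await inference_interface.inference(
    input=image_bytes,                  # image bytes for the identity request
    is_high_priority=True,              # triggers worker queue routing when the pipeline is active
    apply_post_processing=True,
    post_processing_config=post_processing_config,
    stream_key="identity_enrollment",
)
if metadata and metadata.get("routing") == "worker_queue":
    print("served by an inference worker process")
```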
--- matrice_inference-0.1.58/src/matrice_inference/server/model/model_manager_wrapper.py
+++ matrice_inference-0.1.73/src/matrice_inference/server/model/model_manager_wrapper.py
@@ -77,7 +77,7 @@ class ModelManagerWrapper:
         self.action_tracker = action_tracker
         self.test_env = test_env
         self.model_type = model_type.lower() if model_type else "default"
-
+        self.model_type = "default"  # TODO: remove this once BE is updated with the current types
         # Validate model_type
         if self.model_type not in ["default", "triton"]:
             raise ValueError(f"Invalid model_type '{self.model_type}'. Must be 'default' or 'triton'")
--- matrice_inference-0.1.58/src/matrice_inference/server/server.py
+++ matrice_inference-0.1.73/src/matrice_inference/server/server.py
@@ -422,6 +422,11 @@ class MatriceDeployServer:
         except Exception as e:
             logging.warning(f"Failed to get index_to_category from action_tracker: {str(e)}")
 
+        # Store post-processing config for passing to StreamingPipeline (as dict, not extracted from post_processor)
+        self._post_processing_config = post_processing_config
+        self._index_to_category = index_to_category
+        self._target_categories = target_categories
+
         # Create PostProcessor
         self.post_processor = PostProcessor(
             post_processing_config=post_processing_config,
@@ -473,7 +478,6 @@ class MatriceDeployServer:
         # Create streaming pipeline with configured parameters
         self.streaming_pipeline = StreamingPipeline(
             inference_interface=self.inference_interface,
-            post_processor=self.post_processor,
             inference_queue_maxsize=self.job_params.get("inference_queue_maxsize", 5000),
             postproc_queue_maxsize=self.job_params.get("postproc_queue_maxsize", 5000),
             output_queue_maxsize=self.job_params.get("output_queue_maxsize", 5000),
@@ -499,6 +503,10 @@ class MatriceDeployServer:
             async_predict=self.async_predict,
             async_load_model=self.async_load_model,
             batch_predict=self.batch_predict,
+            # Pass post-processing configuration as dict (not extracted from post_processor)
+            post_processing_config=getattr(self, '_post_processing_config', {}),
+            index_to_category=getattr(self, '_index_to_category', None),
+            target_categories=getattr(self, '_target_categories', None),
         )
 
         # Start the pipeline (now manages its own event loop thread)
--- matrice_inference-0.1.58/src/matrice_inference/server/stream/analytics_publisher.py
+++ matrice_inference-0.1.73/src/matrice_inference/server/stream/analytics_publisher.py
@@ -240,15 +240,33 @@ class AnalyticsPublisher:
         try:
             camera_id = task_data.get("camera_id")
             if not camera_id:
+                self.logger.debug("No camera_id in task_data, skipping analytics extraction")
                 return
-
+
             data = task_data.get("data", {})
             post_processing_result = data.get("post_processing_result", {})
-            agg_summary = post_processing_result.get("agg_summary", {})
-
-            if not agg_summary:
+
+            # Check for agg_summary at top level (current format after flattening)
+            # or nested in data field (legacy format for backward compatibility)
+            agg_summary = post_processing_result.get("agg_summary")
+            if agg_summary is None and "data" in post_processing_result:
+                # Legacy format: agg_summary nested in data field
+                agg_summary = post_processing_result.get("data", {}).get("agg_summary")
+                if agg_summary:
+                    self.logger.debug(f"Found agg_summary in legacy nested format for camera {camera_id}")
+
+            # Skip if no agg_summary found
+            if not agg_summary or not isinstance(agg_summary, dict):
+                self.logger.debug(
+                    f"No valid agg_summary for camera {camera_id}. "
+                    f"post_processing_result keys: {list(post_processing_result.keys()) if post_processing_result else 'empty'}"
+                )
                 return
-
+
+            self.logger.debug(
+                f"Processing agg_summary for camera {camera_id} with {len(agg_summary)} frame(s)"
+            )
+
             # Process each frame in agg_summary
             for frame_id, frame_data in agg_summary.items():
                 tracking_stats = frame_data.get("tracking_stats", {})
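The extraction change above accepts agg_summary either at the top level of post_processing_result (the current flattened format) or nested under its data key (the legacy format). A standalone restatement of just that fallback, as a hypothetical helper rather than code from the package:

```python
# Hypothetical helper restating the agg_summary fallback shown in the hunk above:
# prefer the flattened top-level key, fall back to the legacy nested location.
from typing import Any, Dict, Optional

def extract_agg_summary(post_processing_result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    agg_summary = post_processing_result.get("agg_summary")
    if agg_summary is None and "data" in post_processing_result:
        # Legacy format: agg_summary nested inside the "data" field
        agg_summary = post_processing_result.get("data", {}).get("agg_summary")
    return agg_summary if isinstance(agg_summary, dict) and agg_summary else None
```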
@@ -349,15 +367,25 @@ class AnalyticsPublisher:
     async def _publish_analytics(self) -> None:
         """Publish aggregated analytics to Redis and optionally Kafka."""
         try:
+            if not self.analytics_store:
+                self.logger.debug("No analytics data to publish (analytics_store is empty)")
+                return
+
+            self.logger.info(
+                f"Publishing analytics for {len(self.analytics_store)} camera(s) to results-agg"
+            )
+
             # Publish analytics for each camera
             for camera_id, analytics_data in self.analytics_store.items():
                 if not analytics_data:
+                    self.logger.debug(f"No analytics data for camera {camera_id}, skipping")
                     continue
 
                 # Build analytics message
                 message = self._build_analytics_message(camera_id, analytics_data)
 
                 if not message:
+                    self.logger.warning(f"Failed to build analytics message for camera {camera_id}")
                     continue
 
                 # Publish to Redis (required)
@@ -463,15 +491,22 @@ class AnalyticsPublisher:
             if not self.redis_stream:
                 self.logger.warning("Redis stream not initialized, skipping publish")
                 return
-
+
             message_json = json.dumps(message)
             await self.redis_stream.async_add_message(
                 self.ANALYTICS_TOPIC,
                 message_json,
                 key=camera_id
             )
-
-            self.logger.debug(f"Published analytics to Redis for camera {camera_id}")
+
+            # Log at info level so we can see when data is being published
+            tracking_stats = message.get("tracking_stats", {})
+            current_counts = tracking_stats.get("current_counts", [])
+            total_counts = tracking_stats.get("total_counts", [])
+            self.logger.info(
+                f"Published analytics to Redis '{self.ANALYTICS_TOPIC}' for camera {camera_id}: "
+                f"current={current_counts}, total={total_counts}"
+            )
 
         except Exception as e:
             self.logger.error(f"Error publishing to Redis for {camera_id}: {e}", exc_info=True)