matrice-inference 0.1.58__tar.gz → 0.1.73__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/PKG-INFO +1 -1
  2. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/matrice_inference.egg-info/PKG-INFO +1 -1
  3. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/inference_interface.py +341 -25
  4. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/model/model_manager_wrapper.py +1 -1
  5. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/server.py +9 -1
  6. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/analytics_publisher.py +43 -8
  7. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/consumer_manager.py +91 -13
  8. matrice_inference-0.1.73/src/matrice_inference/server/stream/inference_worker.py +799 -0
  9. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/post_processing_manager.py +56 -29
  10. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/producer_worker.py +53 -15
  11. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/stream_pipeline.py +139 -99
  12. matrice_inference-0.1.58/src/matrice_inference/server/stream/inference_worker.py +0 -561
  13. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/LICENSE.txt +0 -0
  14. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/README.md +0 -0
  15. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/matrice_inference.egg-info/SOURCES.txt +0 -0
  16. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/matrice_inference.egg-info/dependency_links.txt +0 -0
  17. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/matrice_inference.egg-info/not-zip-safe +0 -0
  18. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/matrice_inference.egg-info/top_level.txt +0 -0
  19. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/pyproject.toml +0 -0
  20. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/setup.cfg +0 -0
  21. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/setup.py +0 -0
  22. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/__init__.py +0 -0
  23. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/py.typed +0 -0
  24. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/__init__.py +0 -0
  25. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/model/__init__.py +0 -0
  26. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/model/model_manager.py +0 -0
  27. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/model/triton_model_manager.py +0 -0
  28. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/model/triton_server.py +0 -0
  29. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/proxy_interface.py +0 -0
  30. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/DATA_FLOW_DIAGRAM.md +0 -0
  31. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/STREAMING_PIPELINE_ARCHITECTURE.md +0 -0
  32. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/__init__.py +0 -0
  33. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/app_deployment.py +0 -0
  34. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/app_event_listener.py +0 -0
  35. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/camera_config_monitor.py +0 -0
  36. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/deployment_refresh_listener.py +0 -0
  37. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/frame_cache.py +0 -0
  38. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/inference_metric_logger.py +0 -0
  39. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/metric_publisher.py +0 -0
  40. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/utils.py +0 -0
  41. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/server/stream/worker_metrics.py +0 -0
  42. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/abstract_model_manager.py +0 -0
  43. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/aggregator/__init__.py +0 -0
  44. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/aggregator/aggregator.py +0 -0
  45. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/aggregator/analytics.py +0 -0
  46. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/aggregator/ingestor.py +0 -0
  47. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/aggregator/latency.py +0 -0
  48. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/aggregator/pipeline.py +0 -0
  49. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/aggregator/publisher.py +0 -0
  50. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/aggregator/synchronizer.py +0 -0
  51. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/batch_manager.py +0 -0
  52. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/overall_inference_testing.py +0 -0
  53. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/src/matrice_inference/tmp/triton_utils.py +0 -0
  54. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/tests/test_frame_cache_optimizations.py +0 -0
  55. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/tests/test_integration_real_components.py +0 -0
  56. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/tests/test_msgpack_simple.py +0 -0
  57. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/tests/test_msgpack_unpacking.py +0 -0
  58. {matrice_inference-0.1.58 → matrice_inference-0.1.73}/tests/test_streaming_pipeline_e2e.py +0 -0
--- matrice_inference-0.1.58/PKG-INFO
+++ matrice_inference-0.1.73/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: matrice_inference
-Version: 0.1.58
+Version: 0.1.73
 Summary: Common server utilities for Matrice.ai services
 Author-email: "Matrice.ai" <dipendra@matrice.ai>
 License-Expression: MIT
--- matrice_inference-0.1.58/matrice_inference.egg-info/PKG-INFO
+++ matrice_inference-0.1.73/matrice_inference.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: matrice_inference
-Version: 0.1.58
+Version: 0.1.73
 Summary: Common server utilities for Matrice.ai services
 Author-email: "Matrice.ai" <dipendra@matrice.ai>
 License-Expression: MIT
--- matrice_inference-0.1.58/src/matrice_inference/server/inference_interface.py
+++ matrice_inference-0.1.73/src/matrice_inference/server/inference_interface.py
@@ -1,5 +1,5 @@
 """
-InferenceInterface: Thread-safe inference with unified event loop management.
+InferenceInterface: Thread-safe inference with worker queue routing.
 
 THREAD SAFETY & CONCURRENT REQUEST HANDLING:
 ============================================
@@ -10,41 +10,45 @@ This module solves the greenlet thread context switching problem that occurs whe
 
 The Problem:
 -----------
-- Streaming frames use the StreamingPipeline's dedicated event loop
-- Direct API calls may try to use a different event loop or thread context
-- Models loaded with gevent/greenlet cannot switch between different thread contexts
+- Streaming frames are processed by inference worker processes with their own models
+- Direct API calls attempt to use models in the main process from different thread contexts
+- Models using gevent/greenlet internally cannot switch between different greenlet contexts
 - This causes: "Cannot switch to a different thread" errors
 
-The Solution:
-------------
-1. StreamingPipeline creates a single dedicated event loop at startup
-2. All model instances are loaded in this event loop
-3. InferenceInterface stores a reference to this pipeline event loop
-4. ALL inference requests (streaming + direct API) use asyncio.run_coroutine_threadsafe()
-   to execute in the pipeline's event loop, regardless of which thread they originate from
-5. High-priority requests (identity images) get longer timeouts and always complete
+The Solution (Worker Queue Routing):
+-----------------------------------
+1. StreamingPipeline creates inference worker processes that load their own models
+2. When pipeline is active, ALL inference requests (streaming + direct API) are routed
+   through the same worker queue (inference_queue)
+3. Direct API calls (identity images) submit tasks to the worker queue and wait for
+   responses via a dedicated response queue (direct_api_response_queue)
+4. This ensures all inference uses the same greenlet context (worker process)
+5. High-priority requests bypass the streaming queue backpressure with priority handling
 
 Benefits:
 --------
-- No greenlet thread context errors
+- No greenlet thread context errors (all inference in worker process context)
 - Identity images work during streaming
-- Natural frame skipping: If identity processing takes time, streaming frames may be
-  dropped from the queue, which is acceptable for continuous video streams
-- Simple, robust, maintainable solution
+- Natural frame skipping: Workers process identity images, streaming frames queue up
+  and may be dropped if queue fills (acceptable for continuous video streams)
+- Simple, robust architecture using multiprocessing queues
 
 Usage:
 -----
-1. StreamingPipeline calls: inference_interface.set_pipeline_event_loop(event_loop)
-2. All inference calls automatically use this event loop via run_coroutine_threadsafe()
-3. Direct API calls set is_high_priority=True for guaranteed execution
+1. StreamingPipeline calls: inference_interface.set_worker_queues(input_queue, response_queue)
+2. Direct API calls automatically route through worker queue when pipeline is active
+3. High-priority requests (identity images) get dedicated handling
 """
 
 from matrice_inference.server.model.model_manager_wrapper import ModelManagerWrapper
-from typing import Dict, Any, Optional, Tuple, Union
+from typing import Dict, Any, List, Optional, Tuple, Union
 from datetime import datetime, timezone
 import logging
 import time
 import asyncio
+import multiprocessing as mp
+import uuid
+import queue
 from matrice_analytics.post_processing.post_processor import PostProcessor
 
 class InferenceInterface:
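The rewritten module docstring above describes the new routing contract in prose. As a reading aid only, here is a minimal caller-side sketch of that contract; it assumes an already-constructed inference_interface, and the worker start-up step, queue sizes, and num_workers value are illustrative assumptions, not part of this diff:

```python
# Hypothetical wiring sketch based on the Usage section above; num_workers,
# the queue sizes, and the worker start-up step are illustrative assumptions.
import multiprocessing as mp

num_workers = 2
input_queues = [mp.Queue(maxsize=5000) for _ in range(num_workers)]  # one queue per worker
response_queue = mp.Queue()                                          # shared results path

# ... StreamingPipeline would start one inference worker process per input queue here ...

# Step 1 of the Usage section: register the queues with the interface.
inference_interface.set_worker_queues(input_queues, response_queue)

# Steps 2-3: while the pipeline is active, high-priority direct API calls
# (is_high_priority=True) are routed through these worker queues automatically.

# On pipeline shutdown, routing is switched off again.
inference_interface.disable_worker_queue_routing()
```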
@@ -68,6 +72,14 @@ class InferenceInterface:
         self.latest_inference_time = datetime.now(timezone.utc)
         self.pipeline_event_loop: Optional[asyncio.AbstractEventLoop] = None
 
+        # Worker queue routing for direct API calls
+        # When set, ALL inference requests are routed through worker processes
+        # to avoid greenlet context switching issues
+        self._worker_input_queues: Optional[List[mp.Queue]] = None
+        self._worker_response_queue: Optional[mp.Queue] = None
+        self._use_worker_queue_routing = False
+        self._direct_api_worker_counter = 0  # Round-robin counter for load balancing
+
         # Track concurrent inference requests for monitoring
         self._active_inference_count = 0
         self._inference_count_lock = asyncio.Lock() if asyncio else None
@@ -85,6 +97,38 @@ class InferenceInterface:
         self.pipeline_event_loop = event_loop
         self.logger.info("Pipeline event loop registered for thread-safe inference")
 
+    def set_worker_queues(
+        self,
+        input_queues: List[mp.Queue],
+        response_queue: mp.Queue,
+    ) -> None:
+        """Set worker queues for routing direct API calls through inference workers.
+
+        When set, direct API calls (e.g., identity images for face recognition) are
+        routed through the same inference worker processes that handle streaming frames.
+        This avoids greenlet context switching issues by ensuring all model inference
+        happens in the worker process context.
+
+        Args:
+            input_queues: List of multiprocessing queues (one per worker) for submitting tasks
+            response_queue: Multiprocessing queue for receiving inference results
+        """
+        self._worker_input_queues = input_queues
+        self._worker_response_queue = response_queue
+        self._use_worker_queue_routing = True
+        self._direct_api_worker_counter = 0  # Round-robin counter for load balancing
+        self.logger.info(
+            f"Worker queue routing enabled - direct API calls will use {len(input_queues)} inference workers"
+        )
+
+    def disable_worker_queue_routing(self) -> None:
+        """Disable worker queue routing (used when pipeline stops)."""
+        self._use_worker_queue_routing = False
+        self._worker_input_queues = None
+        self._worker_response_queue = None
+        self._direct_api_worker_counter = 0
+        self.logger.info("Worker queue routing disabled")
+
     def has_async_predict(self) -> bool:
         """Check if async_predict is available in the underlying model manager.
 
@@ -106,6 +150,248 @@ class InferenceInterface:
         except Exception as e:
             self.logger.warning(f"Error checking async_predict availability: {e}")
             return False
+
+    def _route_through_worker_queue(
+        self,
+        input: Any,
+        extra_params: Optional[Dict[str, Any]] = None,
+        stream_key: Optional[str] = None,
+        stream_info: Optional[Dict[str, Any]] = None,
+        timeout: float = 5.0,
+    ) -> Tuple[Any, bool]:
+        """Route inference through worker queue to avoid greenlet context issues.
+
+        This method submits the inference task to the same queue used by streaming
+        frames, ensuring the model is accessed in the worker process context where
+        it was loaded. This avoids greenlet "Cannot switch to a different thread" errors.
+
+        Args:
+            input: Input data (image bytes)
+            extra_params: Additional parameters for inference
+            stream_key: Stream key identifier
+            stream_info: Stream metadata
+            timeout: Maximum time to wait for response (seconds)
+
+        Returns:
+            Tuple of (results, success_flag)
+
+        Raises:
+            RuntimeError: If worker queue routing fails
+        """
+        if not self._worker_input_queues:
+            raise RuntimeError("Worker queues not configured for routing")
+
+        # Generate unique request ID for correlation
+        request_id = str(uuid.uuid4())
+
+        # Create a dedicated response queue for this request to avoid cross-talk
+        response_queue = mp.Queue(maxsize=1)
+
+        # Create task for worker queue
+        # Uses special "direct_api" type so workers know to send response back
+        task = {
+            "type": "direct_api",
+            "request_id": request_id,
+            "input_bytes": input if isinstance(input, bytes) else bytes(input),
+            "extra_params": extra_params or {},
+            "stream_key": stream_key or f"direct_api_{request_id}",
+            "stream_info": stream_info,
+            "response_queue": response_queue,
+            # Required fields for worker validation (using placeholder values)
+            "camera_id": f"direct_api_{request_id[:8]}",
+            "frame_id": request_id,
+            "message": {"type": "direct_api"},
+            "camera_config": {"type": "direct_api"},
+        }
+
+        # Round-robin select a worker queue for load balancing
+        num_workers = len(self._worker_input_queues)
+        worker_id = self._direct_api_worker_counter % num_workers
+        self._direct_api_worker_counter += 1
+        target_queue = self._worker_input_queues[worker_id]
+
+        self.logger.debug(f"Submitting direct API task {request_id} to worker {worker_id}")
+
+        try:
+            # Submit task to worker queue (non-blocking with short timeout)
+            # This ensures we don't block forever if queue is full
+            target_queue.put(task, timeout=5.0)
+        except Exception as e:
+            self.logger.error(f"Failed to submit task to worker queue {worker_id}: {e}")
+            raise RuntimeError(f"Worker queue submission failed: {e}") from e
+
+        # Wait for response on the dedicated response queue
+        try:
+            result = response_queue.get(timeout=timeout)
+        except Exception:
+            raise RuntimeError(
+                f"Timeout waiting for worker response (request_id={request_id}, timeout={timeout}s)"
+            )
+
+        # Extract result
+        if result.get("success"):
+            self.logger.debug(f"Direct API task {request_id} completed successfully")
+            return result.get("model_result"), True
+        else:
+            error_msg = result.get("error", "Unknown worker error")
+            self.logger.error(f"Direct API task {request_id} failed: {error_msg}")
+            return None, False
+
+    async def _inference_via_worker_queue(
+        self,
+        input: Any,
+        extra_params: Optional[Dict[str, Any]] = None,
+        apply_post_processing: bool = False,
+        post_processing_config: Optional[Union[Dict[str, Any], str]] = None,
+        stream_key: Optional[str] = None,
+        stream_info: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[Any, Optional[Dict[str, Any]]]:
+        """Async wrapper for worker queue inference with optional post-processing.
+
+        Routes inference through worker queue and handles post-processing if requested.
+        This method is used for high-priority requests (e.g., identity images) when
+        streaming is active to avoid greenlet context switching issues.
+
+        Args:
+            input: Input data (image bytes)
+            extra_params: Additional parameters for inference
+            apply_post_processing: Whether to apply post-processing
+            post_processing_config: Configuration for post-processing
+            stream_key: Stream key identifier
+            stream_info: Stream metadata
+
+        Returns:
+            Tuple of (results, metadata)
+        """
+        model_start_time = time.time()
+
+        # Update latest inference time
+        self.latest_inference_time = datetime.now(timezone.utc)
+
+        try:
+            # Route through worker queue (synchronous call)
+            # Run in thread pool to avoid blocking async event loop
+            loop = asyncio.get_event_loop()
+            raw_results, success = await loop.run_in_executor(
+                None,  # Use default executor
+                self._route_through_worker_queue,
+                input,
+                extra_params,
+                stream_key,
+                stream_info,
+                6.0,  # timeout
+            )
+
+            model_inference_time = time.time() - model_start_time
+
+            if not success:
+                raise RuntimeError("Model inference via worker queue failed")
+
+            self.logger.debug(
+                f"Worker queue inference executed stream_key={stream_key} "
+                f"time={model_inference_time:.4f}s"
+            )
+
+        except Exception as exc:
+            error_msg = str(exc)
+            if "greenlet" in error_msg.lower() or "cannot switch" in error_msg.lower():
+                self.logger.error(
+                    f"Greenlet error in worker queue routing. This is unexpected - "
+                    f"worker queue routing should avoid greenlet issues. Error: {error_msg}",
+                    exc_info=True
+                )
+            else:
+                self.logger.error(f"Worker queue inference failed: {error_msg}", exc_info=True)
+            raise RuntimeError(f"Worker queue inference failed: {error_msg}") from exc
+
+        # If no post-processing requested, return raw results
+        if not apply_post_processing or not self.post_processor:
+            return raw_results, {
+                "timing_metadata": {
+                    "model_inference_time_sec": model_inference_time,
+                    "post_processing_time_sec": 0.0,
+                    "total_time_sec": model_inference_time,
+                },
+                "routing": "worker_queue",
+            }
+
+        # Apply post-processing using PostProcessor
+        try:
+            post_processing_start_time = time.time()
+
+            result = await self.post_processor.process(
+                data=raw_results,
+                config=post_processing_config,
+                input_bytes=input if isinstance(input, bytes) else None,
+                stream_key=stream_key,
+                stream_info=stream_info
+            )
+
+            post_processing_time = time.time() - post_processing_start_time
+
+            if result.is_success():
+                processed_raw_results = [] if (
+                    hasattr(result, 'usecase') and result.usecase == 'face_recognition'
+                ) else raw_results
+
+                agg_summary = {}
+                if hasattr(result, 'data') and isinstance(result.data, dict):
+                    agg_summary = result.data.get("agg_summary", {})
+
+                post_processing_result = {
+                    "status": "success",
+                    "processing_time": result.processing_time,
+                    "usecase": getattr(result, 'usecase', ''),
+                    "category": getattr(result, 'category', ''),
+                    "summary": getattr(result, 'summary', ''),
+                    "insights": getattr(result, 'insights', []),
+                    "metrics": getattr(result, 'metrics', {}),
+                    "predictions": getattr(result, 'predictions', []),
+                    "agg_summary": agg_summary,
+                    "stream_key": stream_key or "default_stream",
+                    "timing_metadata": {
+                        "model_inference_time_sec": model_inference_time,
+                        "post_processing_time_sec": post_processing_time,
+                        "total_time_sec": model_inference_time + post_processing_time,
+                    },
+                    "routing": "worker_queue",
+                }
+
+                return processed_raw_results, post_processing_result
+            else:
+                self.logger.error(f"Post-processing failed: {result.error_message}")
+                return raw_results, {
+                    "status": "post_processing_failed",
+                    "error": result.error_message,
+                    "error_type": getattr(result, 'error_type', 'ProcessingError'),
+                    "processing_time": result.processing_time,
+                    "processed_data": raw_results,
+                    "stream_key": stream_key or "default_stream",
+                    "timing_metadata": {
+                        "model_inference_time_sec": model_inference_time,
+                        "post_processing_time_sec": post_processing_time,
+                        "total_time_sec": model_inference_time + post_processing_time,
+                    },
+                    "routing": "worker_queue",
+                }
+
+        except Exception as e:
+            post_processing_time = time.time() - post_processing_start_time
+            self.logger.error(f"Post-processing exception: {str(e)}", exc_info=True)
+
+            return raw_results, {
+                "status": "post_processing_failed",
+                "error": str(e),
+                "error_type": type(e).__name__,
+                "processed_data": raw_results,
+                "stream_key": stream_key or "default_stream",
+                "timing_metadata": {
+                    "model_inference_time_sec": model_inference_time,
+                    "post_processing_time_sec": post_processing_time,
+                    "total_time_sec": model_inference_time + post_processing_time,
+                },
+                "routing": "worker_queue",
+            }
 
     async def inference(
         self,
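The task dict built in _route_through_worker_queue defines an informal contract with the worker side, and inference_worker.py (replaced by a new 799-line file in this release) is not shown in this diff. The following is only a plausible sketch of how a worker loop might honour the "direct_api" branch of that contract; handle_direct_api_task and run_model are invented names for illustration:

```python
# Hypothetical worker-side sketch of the "direct_api" reply contract implied by
# _route_through_worker_queue above. The real inference_worker.py is not shown in
# this diff; handle_direct_api_task and run_model are invented for illustration.
import queue

def handle_direct_api_task(task: dict, run_model) -> None:
    """Run inference for one direct API task and reply on its dedicated response queue."""
    response_queue = task["response_queue"]  # mp.Queue(maxsize=1) created by the caller
    try:
        model_result = run_model(task["input_bytes"], **task.get("extra_params", {}))
        reply = {"success": True, "model_result": model_result, "request_id": task["request_id"]}
    except Exception as exc:
        reply = {"success": False, "error": str(exc), "request_id": task["request_id"]}
    try:
        response_queue.put_nowait(reply)
    except queue.Full:
        # The caller only reads one reply and may already have timed out.
        pass
```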
@@ -138,10 +424,11 @@ class InferenceInterface:
             - Metadata about the inference and post-processing (if applicable)
 
         Note:
-            High-priority requests (like identity images for face recognition) will
-            always execute successfully. During their execution, streaming frames may
-            be naturally skipped if the inference queue fills up, which is acceptable
-            for continuous streaming scenarios.
+            High-priority requests (like identity images for face recognition) are routed
+            through the worker queue when streaming is active. This avoids greenlet context
+            switching issues by ensuring all model inference happens in the worker process.
+            During their execution, streaming frames may be naturally skipped if the
+            inference queue fills up, which is acceptable for continuous streaming scenarios.
         """
         if input is None:
             raise ValueError("Input cannot be None")
@@ -150,6 +437,35 @@
         if is_high_priority:
             self.logger.info(f"Processing high-priority inference request (stream_key={stream_key})")
 
+        # CRITICAL: Route high-priority requests through worker queue when streaming is active
+        # This avoids greenlet "Cannot switch to a different thread" errors
+        # Only applies when: 1) high priority request AND 2) worker queue routing enabled AND 3) queues available
+        if (
+            is_high_priority
+            and self._use_worker_queue_routing
+            and self._worker_input_queues is not None
+        ):
+            self.logger.info(
+                f"Routing high-priority request through worker queue to avoid greenlet issues "
+                f"(stream_key={stream_key})"
+            )
+            try:
+                return await self._inference_via_worker_queue(
+                    input=input,
+                    extra_params=extra_params,
+                    apply_post_processing=apply_post_processing,
+                    post_processing_config=post_processing_config,
+                    stream_key=stream_key,
+                    stream_info=stream_info,
+                )
+            except Exception as worker_exc:
+                # If worker queue routing fails, log warning and fall back to direct inference
+                # This ensures the request still has a chance to complete
+                self.logger.warning(
+                    f"Worker queue routing failed, falling back to direct inference: {worker_exc}"
+                )
+                # Continue to original inference path below
+
         # Measure model inference time
         model_start_time = time.time()
 
@@ -180,7 +496,7 @@ class InferenceInterface:
                 event_loop_to_use
             )
             # High-priority requests get longer timeout
-            timeout = 120.0 if is_high_priority else 60.0
+            timeout = 10.0 if is_high_priority else 6.0
             raw_results, success = future.result(timeout=timeout)
         else:
             # Fall back to sync inference (no async support or no event loop)
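Taken together with the routing block above, a direct API caller does not need to know which path serves its request. A hypothetical call during active streaming might look like this; the exact keyword set of inference() beyond the parameters visible in this diff, and the free names image_bytes and post_processing_config, are assumptions:

```python
# Hypothetical usage sketch: an identity image submitted while streaming is active.
# Parameter names mirror those visible in the diff; anything else is assumed.
raw_results, metadata = await inference_interface.inference(
    input=image_bytes,                  # image bytes for the identity request
    is_high_priority=True,              # triggers worker queue routing when the pipeline is active
    apply_post_processing=True,
    post_processing_config=post_processing_config,
    stream_key="identity_enrollment",
)
if metadata and metadata.get("routing") == "worker_queue":
    print("served by an inference worker process")
```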
--- matrice_inference-0.1.58/src/matrice_inference/server/model/model_manager_wrapper.py
+++ matrice_inference-0.1.73/src/matrice_inference/server/model/model_manager_wrapper.py
@@ -77,7 +77,7 @@ class ModelManagerWrapper:
         self.action_tracker = action_tracker
         self.test_env = test_env
         self.model_type = model_type.lower() if model_type else "default"
-
+        self.model_type = "default"  # TODO: remove this once BE is updated with the current types
         # Validate model_type
         if self.model_type not in ["default", "triton"]:
             raise ValueError(f"Invalid model_type '{self.model_type}'. Must be 'default' or 'triton'")
--- matrice_inference-0.1.58/src/matrice_inference/server/server.py
+++ matrice_inference-0.1.73/src/matrice_inference/server/server.py
@@ -422,6 +422,11 @@ class MatriceDeployServer:
         except Exception as e:
             logging.warning(f"Failed to get index_to_category from action_tracker: {str(e)}")
 
+        # Store post-processing config for passing to StreamingPipeline (as dict, not extracted from post_processor)
+        self._post_processing_config = post_processing_config
+        self._index_to_category = index_to_category
+        self._target_categories = target_categories
+
         # Create PostProcessor
         self.post_processor = PostProcessor(
             post_processing_config=post_processing_config,
@@ -473,7 +478,6 @@ class MatriceDeployServer:
         # Create streaming pipeline with configured parameters
         self.streaming_pipeline = StreamingPipeline(
             inference_interface=self.inference_interface,
-            post_processor=self.post_processor,
             inference_queue_maxsize=self.job_params.get("inference_queue_maxsize", 5000),
             postproc_queue_maxsize=self.job_params.get("postproc_queue_maxsize", 5000),
             output_queue_maxsize=self.job_params.get("output_queue_maxsize", 5000),
@@ -499,6 +503,10 @@ class MatriceDeployServer:
             async_predict=self.async_predict,
             async_load_model=self.async_load_model,
             batch_predict=self.batch_predict,
+            # Pass post-processing configuration as dict (not extracted from post_processor)
+            post_processing_config=getattr(self, '_post_processing_config', {}),
+            index_to_category=getattr(self, '_index_to_category', None),
+            target_categories=getattr(self, '_target_categories', None),
         )
 
         # Start the pipeline (now manages its own event loop thread)
--- matrice_inference-0.1.58/src/matrice_inference/server/stream/analytics_publisher.py
+++ matrice_inference-0.1.73/src/matrice_inference/server/stream/analytics_publisher.py
@@ -240,15 +240,33 @@ class AnalyticsPublisher:
         try:
             camera_id = task_data.get("camera_id")
             if not camera_id:
+                self.logger.debug("No camera_id in task_data, skipping analytics extraction")
                 return
-
+
             data = task_data.get("data", {})
             post_processing_result = data.get("post_processing_result", {})
-            agg_summary = post_processing_result.get("agg_summary", {})
-
-            if not agg_summary:
+
+            # Check for agg_summary at top level (current format after flattening)
+            # or nested in data field (legacy format for backward compatibility)
+            agg_summary = post_processing_result.get("agg_summary")
+            if agg_summary is None and "data" in post_processing_result:
+                # Legacy format: agg_summary nested in data field
+                agg_summary = post_processing_result.get("data", {}).get("agg_summary")
+                if agg_summary:
+                    self.logger.debug(f"Found agg_summary in legacy nested format for camera {camera_id}")
+
+            # Skip if no agg_summary found
+            if not agg_summary or not isinstance(agg_summary, dict):
+                self.logger.debug(
+                    f"No valid agg_summary for camera {camera_id}. "
+                    f"post_processing_result keys: {list(post_processing_result.keys()) if post_processing_result else 'empty'}"
+                )
                 return
-
+
+            self.logger.debug(
+                f"Processing agg_summary for camera {camera_id} with {len(agg_summary)} frame(s)"
+            )
+
             # Process each frame in agg_summary
             for frame_id, frame_data in agg_summary.items():
                 tracking_stats = frame_data.get("tracking_stats", {})
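The extraction change above accepts agg_summary either at the top level of post_processing_result (the current flattened format) or nested under its data key (the legacy format). A standalone restatement of just that fallback, as a hypothetical helper rather than code from the package:

```python
# Hypothetical helper restating the agg_summary fallback shown in the hunk above:
# prefer the flattened top-level key, fall back to the legacy nested location.
from typing import Any, Dict, Optional

def extract_agg_summary(post_processing_result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    agg_summary = post_processing_result.get("agg_summary")
    if agg_summary is None and "data" in post_processing_result:
        # Legacy format: agg_summary nested inside the "data" field
        agg_summary = post_processing_result.get("data", {}).get("agg_summary")
    return agg_summary if isinstance(agg_summary, dict) and agg_summary else None
```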
@@ -349,15 +367,25 @@ class AnalyticsPublisher:
     async def _publish_analytics(self) -> None:
         """Publish aggregated analytics to Redis and optionally Kafka."""
         try:
+            if not self.analytics_store:
+                self.logger.debug("No analytics data to publish (analytics_store is empty)")
+                return
+
+            self.logger.info(
+                f"Publishing analytics for {len(self.analytics_store)} camera(s) to results-agg"
+            )
+
             # Publish analytics for each camera
             for camera_id, analytics_data in self.analytics_store.items():
                 if not analytics_data:
+                    self.logger.debug(f"No analytics data for camera {camera_id}, skipping")
                     continue
 
                 # Build analytics message
                 message = self._build_analytics_message(camera_id, analytics_data)
 
                 if not message:
+                    self.logger.warning(f"Failed to build analytics message for camera {camera_id}")
                     continue
 
                 # Publish to Redis (required)
@@ -463,15 +491,22 @@ class AnalyticsPublisher:
             if not self.redis_stream:
                 self.logger.warning("Redis stream not initialized, skipping publish")
                 return
-
+
             message_json = json.dumps(message)
             await self.redis_stream.async_add_message(
                 self.ANALYTICS_TOPIC,
                 message_json,
                 key=camera_id
             )
-
-            self.logger.debug(f"Published analytics to Redis for camera {camera_id}")
+
+            # Log at info level so we can see when data is being published
+            tracking_stats = message.get("tracking_stats", {})
+            current_counts = tracking_stats.get("current_counts", [])
+            total_counts = tracking_stats.get("total_counts", [])
+            self.logger.info(
+                f"Published analytics to Redis '{self.ANALYTICS_TOPIC}' for camera {camera_id}: "
+                f"current={current_counts}, total={total_counts}"
+            )
 
         except Exception as e:
             self.logger.error(f"Error publishing to Redis for {camera_id}: {e}", exc_info=True)