matrice-inference 0.1.33__tar.gz → 0.1.58__tar.gz

This diff shows the content of publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between the two versions.
Files changed (62)
  1. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/PKG-INFO +1 -1
  2. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/matrice_inference.egg-info/PKG-INFO +1 -1
  3. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/matrice_inference.egg-info/SOURCES.txt +10 -3
  4. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/server/__init__.py +11 -1
  5. matrice_inference-0.1.58/src/matrice_inference/server/inference_interface.py +459 -0
  6. matrice_inference-0.1.58/src/matrice_inference/server/model/model_manager.py +530 -0
  7. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/server/model/model_manager_wrapper.py +31 -10
  8. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/server/proxy_interface.py +4 -1
  9. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/server/server.py +49 -18
  10. matrice_inference-0.1.58/src/matrice_inference/server/stream/DATA_FLOW_DIAGRAM.md +360 -0
  11. matrice_inference-0.1.58/src/matrice_inference/server/stream/STREAMING_PIPELINE_ARCHITECTURE.md +1162 -0
  12. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/server/stream/analytics_publisher.py +51 -24
  13. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/server/stream/app_deployment.py +389 -14
  14. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/server/stream/camera_config_monitor.py +41 -21
  15. matrice_inference-0.1.58/src/matrice_inference/server/stream/consumer_manager.py +474 -0
  16. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/server/stream/frame_cache.py +59 -28
  17. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/server/stream/inference_metric_logger.py +176 -82
  18. matrice_inference-0.1.58/src/matrice_inference/server/stream/inference_worker.py +561 -0
  19. matrice_inference-0.1.58/src/matrice_inference/server/stream/post_processing_manager.py +287 -0
  20. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/server/stream/producer_worker.py +124 -16
  21. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/server/stream/stream_pipeline.py +476 -134
  22. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/server/stream/worker_metrics.py +20 -28
  23. matrice_inference-0.1.58/tests/test_frame_cache_optimizations.py +275 -0
  24. matrice_inference-0.1.58/tests/test_integration_real_components.py +311 -0
  25. matrice_inference-0.1.58/tests/test_msgpack_simple.py +107 -0
  26. matrice_inference-0.1.58/tests/test_msgpack_unpacking.py +137 -0
  27. matrice_inference-0.1.58/tests/test_streaming_pipeline_e2e.py +516 -0
  28. matrice_inference-0.1.33/src/matrice_inference/server/inference_interface.py +0 -176
  29. matrice_inference-0.1.33/src/matrice_inference/server/model/model_manager.py +0 -274
  30. matrice_inference-0.1.33/src/matrice_inference/server/stream/consumer_worker.py +0 -574
  31. matrice_inference-0.1.33/src/matrice_inference/server/stream/inference_worker.py +0 -285
  32. matrice_inference-0.1.33/src/matrice_inference/server/stream/post_processing_worker.py +0 -429
  33. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/LICENSE.txt +0 -0
  34. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/README.md +0 -0
  35. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/matrice_inference.egg-info/dependency_links.txt +0 -0
  36. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/matrice_inference.egg-info/not-zip-safe +0 -0
  37. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/matrice_inference.egg-info/top_level.txt +0 -0
  38. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/pyproject.toml +0 -0
  39. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/setup.cfg +0 -0
  40. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/setup.py +0 -0
  41. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/__init__.py +0 -0
  42. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/py.typed +0 -0
  43. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/server/model/__init__.py +0 -0
  44. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/server/model/triton_model_manager.py +0 -0
  45. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/server/model/triton_server.py +0 -0
  46. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/server/stream/__init__.py +0 -0
  47. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/server/stream/app_event_listener.py +0 -0
  48. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/server/stream/deployment_refresh_listener.py +0 -0
  49. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/server/stream/metric_publisher.py +0 -0
  50. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/server/stream/utils.py +0 -0
  51. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/tmp/abstract_model_manager.py +0 -0
  52. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/tmp/aggregator/__init__.py +0 -0
  53. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/tmp/aggregator/aggregator.py +0 -0
  54. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/tmp/aggregator/analytics.py +0 -0
  55. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/tmp/aggregator/ingestor.py +0 -0
  56. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/tmp/aggregator/latency.py +0 -0
  57. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/tmp/aggregator/pipeline.py +0 -0
  58. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/tmp/aggregator/publisher.py +0 -0
  59. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/tmp/aggregator/synchronizer.py +0 -0
  60. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/tmp/batch_manager.py +0 -0
  61. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/tmp/overall_inference_testing.py +0 -0
  62. {matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/tmp/triton_utils.py +0 -0
{matrice_inference-0.1.33 → matrice_inference-0.1.58}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: matrice_inference
- Version: 0.1.33
+ Version: 0.1.58
  Summary: Common server utilities for Matrice.ai services
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
  License-Expression: MIT
{matrice_inference-0.1.33 → matrice_inference-0.1.58}/matrice_inference.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: matrice_inference
- Version: 0.1.33
+ Version: 0.1.58
  Summary: Common server utilities for Matrice.ai services
  Author-email: "Matrice.ai" <dipendra@matrice.ai>
  License-Expression: MIT
{matrice_inference-0.1.33 → matrice_inference-0.1.58}/matrice_inference.egg-info/SOURCES.txt
@@ -18,18 +18,20 @@ src/matrice_inference/server/model/model_manager.py
  src/matrice_inference/server/model/model_manager_wrapper.py
  src/matrice_inference/server/model/triton_model_manager.py
  src/matrice_inference/server/model/triton_server.py
+ src/matrice_inference/server/stream/DATA_FLOW_DIAGRAM.md
+ src/matrice_inference/server/stream/STREAMING_PIPELINE_ARCHITECTURE.md
  src/matrice_inference/server/stream/__init__.py
  src/matrice_inference/server/stream/analytics_publisher.py
  src/matrice_inference/server/stream/app_deployment.py
  src/matrice_inference/server/stream/app_event_listener.py
  src/matrice_inference/server/stream/camera_config_monitor.py
- src/matrice_inference/server/stream/consumer_worker.py
+ src/matrice_inference/server/stream/consumer_manager.py
  src/matrice_inference/server/stream/deployment_refresh_listener.py
  src/matrice_inference/server/stream/frame_cache.py
  src/matrice_inference/server/stream/inference_metric_logger.py
  src/matrice_inference/server/stream/inference_worker.py
  src/matrice_inference/server/stream/metric_publisher.py
- src/matrice_inference/server/stream/post_processing_worker.py
+ src/matrice_inference/server/stream/post_processing_manager.py
  src/matrice_inference/server/stream/producer_worker.py
  src/matrice_inference/server/stream/stream_pipeline.py
  src/matrice_inference/server/stream/utils.py
@@ -45,4 +47,9 @@ src/matrice_inference/tmp/aggregator/ingestor.py
  src/matrice_inference/tmp/aggregator/latency.py
  src/matrice_inference/tmp/aggregator/pipeline.py
  src/matrice_inference/tmp/aggregator/publisher.py
- src/matrice_inference/tmp/aggregator/synchronizer.py
+ src/matrice_inference/tmp/aggregator/synchronizer.py
+ tests/test_frame_cache_optimizations.py
+ tests/test_integration_real_components.py
+ tests/test_msgpack_simple.py
+ tests/test_msgpack_unpacking.py
+ tests/test_streaming_pipeline_e2e.py
{matrice_inference-0.1.33 → matrice_inference-0.1.58}/src/matrice_inference/server/__init__.py
@@ -1,12 +1,22 @@
  import os
  import logging
+ from logging.handlers import RotatingFileHandler

  # Define paths
  log_path = os.path.join(os.getcwd(), "deploy_server.log")

  # Create handlers explicitly
  console_handler = logging.StreamHandler()
- file_handler = logging.FileHandler(log_path)
+
+ # Use RotatingFileHandler with 0.5 GB max size and 3 backup files.
+ # When the log reaches 0.5 GB, it is rotated to deploy_server.log.1, deploy_server.log.2, etc.
+ # The oldest log is automatically deleted when the backup count is exceeded.
+ file_handler = RotatingFileHandler(
+     log_path,
+     maxBytes=500 * 1024 * 1024,  # 0.5 GB
+     backupCount=3,  # keep 3 backup files (~2 GB max: 0.5 GB current + 3 x 0.5 GB backups)
+     encoding='utf-8'
+ )

  # Set levels
  console_handler.setLevel(logging.INFO)
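For reference, the rotation behaviour configured above can be observed in isolation using only the standard library. The snippet below is an illustrative sketch, not part of the package; it uses a deliberately tiny maxBytes so the rollover is quick to trigger, whereas the package uses 500 * 1024 * 1024 bytes and backupCount=3.

import logging
from logging.handlers import RotatingFileHandler

demo_logger = logging.getLogger("rotation_demo")
demo_logger.setLevel(logging.INFO)

handler = RotatingFileHandler(
    "demo.log",
    maxBytes=1024,   # tiny size so rotation happens quickly (illustrative only)
    backupCount=3,   # keeps demo.log.1 .. demo.log.3; older backups are deleted
    encoding="utf-8",
)
handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
demo_logger.addHandler(handler)

for i in range(200):
    demo_logger.info("log line %d", i)
# Afterwards the directory holds demo.log plus at most three rotated backups.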
matrice_inference-0.1.58/src/matrice_inference/server/inference_interface.py (new file)
@@ -0,0 +1,459 @@
+ """
+ InferenceInterface: Thread-safe inference with unified event loop management.
+
+ THREAD SAFETY & CONCURRENT REQUEST HANDLING:
+ ============================================
+
+ This module solves the greenlet thread context switching problem that occurs when:
+ 1. Streaming frames are being processed continuously through the StreamingPipeline
+ 2. Direct API calls (e.g., identity images for face recognition) arrive simultaneously
+
+ The Problem:
+ -----------
+ - Streaming frames use the StreamingPipeline's dedicated event loop
+ - Direct API calls may try to use a different event loop or thread context
+ - Models loaded with gevent/greenlet cannot switch between different thread contexts
+ - This causes: "Cannot switch to a different thread" errors
+
+ The Solution:
+ ------------
+ 1. StreamingPipeline creates a single dedicated event loop at startup
+ 2. All model instances are loaded in this event loop
+ 3. InferenceInterface stores a reference to this pipeline event loop
+ 4. ALL inference requests (streaming + direct API) use asyncio.run_coroutine_threadsafe()
+    to execute in the pipeline's event loop, regardless of which thread they originate from
+ 5. High-priority requests (identity images) get longer timeouts and always complete
+
+ Benefits:
+ --------
+ - No greenlet thread context errors
+ - Identity images work during streaming
+ - Natural frame skipping: If identity processing takes time, streaming frames may be
+   dropped from the queue, which is acceptable for continuous video streams
+ - Simple, robust, maintainable solution
+
+ Usage:
+ -----
+ 1. StreamingPipeline calls: inference_interface.set_pipeline_event_loop(event_loop)
+ 2. All inference calls automatically use this event loop via run_coroutine_threadsafe()
+ 3. Direct API calls set is_high_priority=True for guaranteed execution
+ """
+
+ from matrice_inference.server.model.model_manager_wrapper import ModelManagerWrapper
+ from typing import Dict, Any, Optional, Tuple, Union
+ from datetime import datetime, timezone
+ import logging
+ import time
+ import asyncio
+ from matrice_analytics.post_processing.post_processor import PostProcessor
+
+ class InferenceInterface:
+     """Interface for proxying requests to model servers with optional post-processing."""
+
+     def __init__(
+         self,
+         model_manager_wrapper: ModelManagerWrapper,
+         post_processor: Optional[PostProcessor] = None,
+     ):
+         """
+         Initialize the inference interface.
+
+         Args:
+             model_manager_wrapper: Model manager wrapper for model inference
+             post_processor: Post processor for post-processing
+         """
+         self.logger = logging.getLogger(__name__)
+         self.model_manager_wrapper = model_manager_wrapper
+         self.post_processor = post_processor
+         self.latest_inference_time = datetime.now(timezone.utc)
+         self.pipeline_event_loop: Optional[asyncio.AbstractEventLoop] = None
+
+         # Track concurrent inference requests for monitoring
+         self._active_inference_count = 0
+         self._inference_count_lock = asyncio.Lock() if asyncio else None
+
+     def get_latest_inference_time(self) -> datetime:
+         """Get the latest inference time."""
+         return self.latest_inference_time
+
+     def set_pipeline_event_loop(self, event_loop: asyncio.AbstractEventLoop) -> None:
+         """Set the pipeline event loop for thread-safe async operations.
+
+         Args:
+             event_loop: Event loop from StreamingPipeline
+         """
+         self.pipeline_event_loop = event_loop
+         self.logger.info("Pipeline event loop registered for thread-safe inference")
+
+     def has_async_predict(self) -> bool:
+         """Check if async_predict is available in the underlying model manager.
+
+         Returns:
+             bool: True if async_predict is available, False otherwise
+         """
+         try:
+             # Check if model_manager_wrapper has model_manager attribute
+             if not hasattr(self.model_manager_wrapper, 'model_manager'):
+                 return False
+
+             model_manager = self.model_manager_wrapper.model_manager
+
+             # Check if model_manager has async_predict and it's not None
+             if hasattr(model_manager, 'async_predict') and model_manager.async_predict is not None:
+                 return True
+
+             return False
+         except Exception as e:
+             self.logger.warning(f"Error checking async_predict availability: {e}")
+             return False
+
+     async def inference(
+         self,
+         input: Any,
+         extra_params: Optional[Dict[str, Any]] = None,
+         apply_post_processing: bool = False,
+         post_processing_config: Optional[Union[Dict[str, Any], str]] = None,
+         stream_key: Optional[str] = None,
+         stream_info: Optional[Dict[str, Any]] = None,
+         camera_info: Optional[Dict[str, Any]] = None,
+         pipeline_event_loop: Optional[asyncio.AbstractEventLoop] = None,
+         is_high_priority: bool = False,
+     ) -> Tuple[Any, Optional[Dict[str, Any]]]:
+         """Perform inference using the appropriate client with optional post-processing.
+
+         Args:
+             input: Primary input data (e.g., image bytes, numpy array)
+             extra_params: Additional parameters for inference (optional)
+             apply_post_processing: Whether to apply post-processing
+             post_processing_config: Configuration for post-processing
+             stream_key: Unique identifier for the input stream
+             stream_info: Additional metadata about the stream (optional)
+             camera_info: Additional metadata about the camera/source (optional)
+             pipeline_event_loop: Event loop from StreamingPipeline (if available)
+             is_high_priority: If True, this is a high-priority request (e.g., identity image)
+
+         Returns:
+             A tuple containing:
+             - The inference results (raw or post-processed)
+             - Metadata about the inference and post-processing (if applicable)
+
+         Note:
+             High-priority requests (like identity images for face recognition) will
+             always execute successfully. During their execution, streaming frames may
+             be naturally skipped if the inference queue fills up, which is acceptable
+             for continuous streaming scenarios.
+         """
+         if input is None:
+             raise ValueError("Input cannot be None")
+
+         # Log high-priority requests for monitoring
+         if is_high_priority:
+             self.logger.info(f"Processing high-priority inference request (stream_key={stream_key})")
+
+         # Measure model inference time
+         model_start_time = time.time()
+
+         # Update latest inference time
+         self.latest_inference_time = datetime.now(timezone.utc)
+
+         # Run model inference with proper thread-safety
+         try:
+             # Use provided event loop or fall back to stored pipeline event loop
+             event_loop_to_use = pipeline_event_loop or self.pipeline_event_loop
+
+             # If event loop is available and has async_predict, use thread-safe async inference
+             if event_loop_to_use and self.has_async_predict():
+                 # Run async inference in pipeline's event loop from any thread.
+                 # This ensures identity images and streaming frames use the same event loop
+                 # and prevents greenlet/gevent thread context switching errors.
+                 self.logger.debug(
+                     f"Using thread-safe async inference via pipeline event loop "
+                     f"(priority={'high' if is_high_priority else 'normal'})"
+                 )
+                 future = asyncio.run_coroutine_threadsafe(
+                     self.model_manager_wrapper.async_inference(
+                         input=input,
+                         extra_params=extra_params,
+                         stream_key=stream_key,
+                         stream_info=stream_info
+                     ),
+                     event_loop_to_use
+                 )
+                 # High-priority requests get longer timeout
+                 timeout = 120.0 if is_high_priority else 60.0
+                 raw_results, success = future.result(timeout=timeout)
+             else:
+                 # Fall back to sync inference (no async support or no event loop)
+                 self.logger.debug("Using synchronous inference (no async support or event loop)")
+                 raw_results, success = self.model_manager_wrapper.inference(
+                     input=input,
+                     extra_params=extra_params,
+                     stream_key=stream_key,
+                     stream_info=stream_info
+                 )
+
+             model_inference_time = time.time() - model_start_time
+
+             if not success:
+                 raise RuntimeError("Model inference failed")
+
+             self.logger.debug(
+                 f"Model inference executed stream_key={stream_key} "
+                 f"time={model_inference_time:.4f}s priority={'high' if is_high_priority else 'normal'}"
+             )
+
+         except Exception as exc:
+             # Add context about greenlet errors
+             error_msg = str(exc)
+             if "greenlet" in error_msg.lower() or "cannot switch" in error_msg.lower():
+                 self.logger.error(
+                     f"Greenlet thread context error detected. This typically means the model "
+                     f"is being accessed from multiple threads without proper event loop coordination. "
+                     f"Error: {error_msg}",
+                     exc_info=True
+                 )
+             else:
+                 self.logger.error(f"Model inference failed: {error_msg}", exc_info=True)
+             raise RuntimeError(f"Model inference failed: {error_msg}") from exc
+
+         # If no post-processing requested, return raw results
+         if not apply_post_processing or not self.post_processor:
+             return raw_results, {
+                 "timing_metadata": {
+                     "model_inference_time_sec": model_inference_time,
+                     "post_processing_time_sec": 0.0,
+                     "total_time_sec": model_inference_time,
+                 }
+             }
+
+         # Apply post-processing using PostProcessor
+         try:
+             post_processing_start_time = time.time()
+
+             # Use PostProcessor.process() method directly
+             result = await self.post_processor.process(
+                 data=raw_results,
+                 config=post_processing_config,
+                 input_bytes=input if isinstance(input, bytes) else None,
+                 stream_key=stream_key,
+                 stream_info=stream_info
+             )
+
+             post_processing_time = time.time() - post_processing_start_time
+
+             # Format the response based on PostProcessor result
+             if result.is_success():
+                 # For face recognition use case, return empty raw results
+                 processed_raw_results = [] if (
+                     hasattr(result, 'usecase') and result.usecase == 'face_recognition'
+                 ) else raw_results
+
+                 # Extract agg_summary from result data if available
+                 agg_summary = {}
+                 if hasattr(result, 'data') and isinstance(result.data, dict):
+                     agg_summary = result.data.get("agg_summary", {})
+
+                 post_processing_result = {
+                     "status": "success",
+                     "processing_time": result.processing_time,
+                     "usecase": getattr(result, 'usecase', ''),
+                     "category": getattr(result, 'category', ''),
+                     "summary": getattr(result, 'summary', ''),
+                     "insights": getattr(result, 'insights', []),
+                     "metrics": getattr(result, 'metrics', {}),
+                     "predictions": getattr(result, 'predictions', []),
+                     "agg_summary": agg_summary,
+                     "stream_key": stream_key or "default_stream",
+                     "timing_metadata": {
+                         "model_inference_time_sec": model_inference_time,
+                         "post_processing_time_sec": post_processing_time,
+                         "total_time_sec": model_inference_time + post_processing_time,
+                     }
+                 }
+
+                 return processed_raw_results, post_processing_result
+             else:
+                 # Post-processing failed
+                 self.logger.error(f"Post-processing failed: {result.error_message}")
+                 return raw_results, {
+                     "status": "post_processing_failed",
+                     "error": result.error_message,
+                     "error_type": getattr(result, 'error_type', 'ProcessingError'),
+                     "processing_time": result.processing_time,
+                     "processed_data": raw_results,
+                     "stream_key": stream_key or "default_stream",
+                     "timing_metadata": {
+                         "model_inference_time_sec": model_inference_time,
+                         "post_processing_time_sec": post_processing_time,
+                         "total_time_sec": model_inference_time + post_processing_time,
+                     }
+                 }
+
+         except Exception as e:
+             post_processing_time = time.time() - post_processing_start_time
+             self.logger.error(f"Post-processing exception: {str(e)}", exc_info=True)
+
+             return raw_results, {
+                 "status": "post_processing_failed",
+                 "error": str(e),
+                 "error_type": type(e).__name__,
+                 "processed_data": raw_results,
+                 "stream_key": stream_key or "default_stream",
+                 "timing_metadata": {
+                     "model_inference_time_sec": model_inference_time,
+                     "post_processing_time_sec": post_processing_time,
+                     "total_time_sec": model_inference_time + post_processing_time,
+                 }
+             }
+
+     async def async_inference(
+         self,
+         input: Any,
+         extra_params: Optional[Dict[str, Any]] = None,
+         apply_post_processing: bool = False,
+         post_processing_config: Optional[Union[Dict[str, Any], str]] = None,
+         stream_key: Optional[str] = None,
+         stream_info: Optional[Dict[str, Any]] = None,
+         camera_info: Optional[Dict[str, Any]] = None,
+         pipeline_event_loop: Optional[asyncio.AbstractEventLoop] = None,
+     ) -> Tuple[Any, Optional[Dict[str, Any]]]:
+         """Perform asynchronous inference using async_predict when available.
+
+         This method MUST be called within the pipeline's event loop.
+         For calls from other threads, use the regular inference() method, which
+         handles thread-safety automatically.
+
+         Args:
+             input: Primary input data (e.g., image bytes, numpy array)
+             extra_params: Additional parameters for inference (optional)
+             apply_post_processing: Whether to apply post-processing
+             post_processing_config: Configuration for post-processing
+             stream_key: Unique identifier for the input stream
+             stream_info: Additional metadata about the stream (optional)
+             camera_info: Additional metadata about the camera/source (optional)
+             pipeline_event_loop: Event loop from StreamingPipeline (optional, for validation)
+
+         Returns:
+             A tuple containing:
+             - The inference results (raw or post-processed)
+             - Metadata about the inference and post-processing (if applicable)
+         """
+         if input is None:
+             raise ValueError("Input cannot be None")
+
+         # Measure model inference time
+         model_start_time = time.time()
+
+         # Update latest inference time
+         self.latest_inference_time = datetime.now(timezone.utc)
+
+         # Run asynchronous model inference
+         try:
+             raw_results, success = await self.model_manager_wrapper.async_inference(
+                 input=input,
+                 extra_params=extra_params,
+                 stream_key=stream_key,
+                 stream_info=stream_info
+             )
+             model_inference_time = time.time() - model_start_time
+
+             if not success:
+                 raise RuntimeError("Model inference failed")
+
+             self.logger.debug(
+                 f"Async model inference executed stream_key={stream_key} time={model_inference_time:.4f}s"
+             )
+
+         except Exception as exc:
+             self.logger.error(f"Async model inference failed: {str(exc)}", exc_info=True)
+             raise RuntimeError(f"Async model inference failed: {str(exc)}") from exc
+
+         # If no post-processing requested, return raw results
+         if not apply_post_processing or not self.post_processor:
+             return raw_results, {
+                 "timing_metadata": {
+                     "model_inference_time_sec": model_inference_time,
+                     "post_processing_time_sec": 0.0,
+                     "total_time_sec": model_inference_time,
+                 }
+             }
+
+         # Apply post-processing using PostProcessor
+         try:
+             post_processing_start_time = time.time()
+
+             # Use PostProcessor.process() method directly (async)
+             result = await self.post_processor.process(
+                 data=raw_results,
+                 config=post_processing_config,
+                 input_bytes=input if isinstance(input, bytes) else None,
+                 stream_key=stream_key,
+                 stream_info=stream_info
+             )
+
+             post_processing_time = time.time() - post_processing_start_time
+
+             # Format the response based on PostProcessor result
+             if result.is_success():
+                 # For face recognition use case, return empty raw results
+                 processed_raw_results = [] if (
+                     hasattr(result, 'usecase') and result.usecase == 'face_recognition'
+                 ) else raw_results
+
+                 # Extract agg_summary from result data if available
+                 agg_summary = {}
+                 if hasattr(result, 'data') and isinstance(result.data, dict):
+                     agg_summary = result.data.get("agg_summary", {})
+
+                 post_processing_result = {
+                     "status": "success",
+                     "processing_time": result.processing_time,
+                     "usecase": getattr(result, 'usecase', ''),
+                     "category": getattr(result, 'category', ''),
+                     "summary": getattr(result, 'summary', ''),
+                     "insights": getattr(result, 'insights', []),
+                     "metrics": getattr(result, 'metrics', {}),
+                     "predictions": getattr(result, 'predictions', []),
+                     "agg_summary": agg_summary,
+                     "stream_key": stream_key or "default_stream",
+                     "timing_metadata": {
+                         "model_inference_time_sec": model_inference_time,
+                         "post_processing_time_sec": post_processing_time,
+                         "total_time_sec": model_inference_time + post_processing_time,
+                     }
+                 }
+
+                 return processed_raw_results, post_processing_result
+             else:
+                 # Post-processing failed
+                 self.logger.error(f"Post-processing failed: {result.error_message}")
+                 return raw_results, {
+                     "status": "post_processing_failed",
+                     "error": result.error_message,
+                     "error_type": getattr(result, 'error_type', 'ProcessingError'),
+                     "processing_time": result.processing_time,
+                     "processed_data": raw_results,
+                     "stream_key": stream_key or "default_stream",
+                     "timing_metadata": {
+                         "model_inference_time_sec": model_inference_time,
+                         "post_processing_time_sec": post_processing_time,
+                         "total_time_sec": model_inference_time + post_processing_time,
+                     }
+                 }
+
+         except Exception as e:
+             post_processing_time = time.time() - post_processing_start_time
+             self.logger.error(f"Post-processing exception: {str(e)}", exc_info=True)
+
+             return raw_results, {
+                 "status": "post_processing_failed",
+                 "error": str(e),
+                 "error_type": type(e).__name__,
+                 "processed_data": raw_results,
+                 "stream_key": stream_key or "default_stream",
+                 "timing_metadata": {
+                     "model_inference_time_sec": model_inference_time,
+                     "post_processing_time_sec": post_processing_time,
+                     "total_time_sec": model_inference_time + post_processing_time,
+                 }
+             }
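To make the event-loop contract described in the module docstring concrete, here is a minimal, self-contained sketch of the same pattern using only the standard library. It is not taken from matrice_inference; names such as start_pipeline_loop and fake_async_predict are illustrative stand-ins for the StreamingPipeline loop and ModelManagerWrapper.async_inference.

import asyncio
import threading

def start_pipeline_loop() -> asyncio.AbstractEventLoop:
    # One dedicated event loop, run forever in its own thread (the role the
    # StreamingPipeline loop plays for InferenceInterface).
    loop = asyncio.new_event_loop()
    threading.Thread(target=loop.run_forever, daemon=True).start()
    return loop

async def fake_async_predict(frame: bytes) -> dict:
    # Stand-in for the model call; it always executes in the loop's thread,
    # so greenlet/thread-affinity problems cannot arise.
    await asyncio.sleep(0.01)
    return {"detections": [], "frame_size": len(frame)}

pipeline_loop = start_pipeline_loop()

# Any other thread (e.g., an API handler receiving an identity image) submits
# its coroutine to the pipeline loop and blocks on the result, mirroring how
# InferenceInterface.inference() uses asyncio.run_coroutine_threadsafe().
future = asyncio.run_coroutine_threadsafe(fake_async_predict(b"\x00" * 1024), pipeline_loop)
print(future.result(timeout=120.0))  # the longer timeout used for high-priority requests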