matrice-inference 0.1.2 (matrice_inference-0.1.2-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of matrice-inference might be problematic.
- matrice_inference/__init__.py +72 -0
- matrice_inference/py.typed +0 -0
- matrice_inference/server/__init__.py +23 -0
- matrice_inference/server/inference_interface.py +176 -0
- matrice_inference/server/model/__init__.py +1 -0
- matrice_inference/server/model/model_manager.py +274 -0
- matrice_inference/server/model/model_manager_wrapper.py +550 -0
- matrice_inference/server/model/triton_model_manager.py +290 -0
- matrice_inference/server/model/triton_server.py +1248 -0
- matrice_inference/server/proxy_interface.py +371 -0
- matrice_inference/server/server.py +1004 -0
- matrice_inference/server/stream/__init__.py +0 -0
- matrice_inference/server/stream/app_deployment.py +228 -0
- matrice_inference/server/stream/consumer_worker.py +201 -0
- matrice_inference/server/stream/frame_cache.py +127 -0
- matrice_inference/server/stream/inference_worker.py +163 -0
- matrice_inference/server/stream/post_processing_worker.py +230 -0
- matrice_inference/server/stream/producer_worker.py +147 -0
- matrice_inference/server/stream/stream_pipeline.py +451 -0
- matrice_inference/server/stream/utils.py +23 -0
- matrice_inference/tmp/abstract_model_manager.py +58 -0
- matrice_inference/tmp/aggregator/__init__.py +18 -0
- matrice_inference/tmp/aggregator/aggregator.py +330 -0
- matrice_inference/tmp/aggregator/analytics.py +906 -0
- matrice_inference/tmp/aggregator/ingestor.py +438 -0
- matrice_inference/tmp/aggregator/latency.py +597 -0
- matrice_inference/tmp/aggregator/pipeline.py +968 -0
- matrice_inference/tmp/aggregator/publisher.py +431 -0
- matrice_inference/tmp/aggregator/synchronizer.py +594 -0
- matrice_inference/tmp/batch_manager.py +239 -0
- matrice_inference/tmp/overall_inference_testing.py +338 -0
- matrice_inference/tmp/triton_utils.py +638 -0
- matrice_inference-0.1.2.dist-info/METADATA +28 -0
- matrice_inference-0.1.2.dist-info/RECORD +37 -0
- matrice_inference-0.1.2.dist-info/WHEEL +5 -0
- matrice_inference-0.1.2.dist-info/licenses/LICENSE.txt +21 -0
- matrice_inference-0.1.2.dist-info/top_level.txt +1 -0
matrice_inference/tmp/batch_manager.py
@@ -0,0 +1,239 @@
"""
Dynamic batching manager for inference requests.

This module contains the batching logic separated from the main inference interface
to improve modularity and maintainability.
"""

import asyncio
import logging
import time
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

from matrice_analytics.post_processing.core.config import BaseConfig

@dataclass
class BatchRequest:
    """Represents a single inference request in a batch"""

    input1: Any
    input2: Optional[Any] = None
    extra_params: Optional[Dict[str, Any]] = None
    apply_post_processing: bool = False
    post_processing_config: Optional[Union[Dict[str, Any], BaseConfig]] = None
    future: asyncio.Future = field(default_factory=asyncio.Future)
    timestamp: float = field(default_factory=time.time)
    stream_key: Optional[str] = None
    stream_info: Optional[Dict[str, Any]] = None
    input_hash: Optional[str] = None
    camera_info: Optional[Dict[str, Any]] = None


class DynamicBatchManager:
    """Manages dynamic batching for inference requests"""

    def __init__(
        self,
        batch_size: int,
        max_batch_wait_time: float,
        model_manager,
        post_processing_fn: Callable,
    ):
        """
        Initialize the dynamic batch manager.

        Args:
            batch_size: Maximum batch size for processing
            max_batch_wait_time: Maximum wait time for batching
            model_manager: Model manager for inference
            post_processing_fn: Function to apply post-processing
        """
        self.logger = logging.getLogger(__name__)
        self.batch_size = batch_size
        self.max_batch_wait_time = max_batch_wait_time
        self.model_manager = model_manager
        self.post_processing_fn = post_processing_fn

        # Dynamic batching components
        self.batch_queue: List[BatchRequest] = []
        self.batch_lock = asyncio.Lock()
        self.processing_batch = False

    async def add_request(self, batch_request: BatchRequest) -> Tuple[Any, Optional[Dict[str, Any]]]:
        """Add a request to the batch queue and process if needed"""
        # Add to batch queue
        async with self.batch_lock:
            self.batch_queue.append(batch_request)

            # Check if we should process the batch
            should_process = (
                len(self.batch_queue) >= self.batch_size or not self.processing_batch
            )

            if should_process and not self.processing_batch:
                self.processing_batch = True
                # Start batch processing in background
                asyncio.create_task(self._process_batch())

        # Wait for the result
        try:
            return await batch_request.future
        except Exception as e:
            raise RuntimeError(f"Dynamic batch inference failed: {str(e)}") from e

    async def _process_batch(self):
        """Process batched inference requests"""
        try:
            # Wait for batch to fill up or timeout
            await asyncio.sleep(self.max_batch_wait_time)

            async with self.batch_lock:
                if not self.batch_queue:
                    self.processing_batch = False
                    return

                # Extract current batch
                current_batch = self.batch_queue[: self.batch_size]
                self.batch_queue = self.batch_queue[self.batch_size :]

                # Reset processing flag if no more items
                if not self.batch_queue:
                    self.processing_batch = False
                else:
                    # Continue processing remaining items
                    asyncio.create_task(self._process_batch())

            if not current_batch:
                return

            # Prepare batch inputs
            batch_input1 = [req.input1 for req in current_batch]
            batch_input2 = (
                [req.input2 for req in current_batch]
                if any(req.input2 is not None for req in current_batch)
                else None
            )
            batch_extra_params = [req.extra_params for req in current_batch]
            stream_key = current_batch[0].stream_key
            stream_info = current_batch[0].stream_info
            input_hash = current_batch[0].input_hash

            # Validate that all requests in the batch have the same stream_key
            batch_stream_keys = [req.stream_key for req in current_batch]
            if not all(sk == stream_key for sk in batch_stream_keys):
                self.logger.warning(
                    f"Batch contains requests with different stream keys: {set(batch_stream_keys)}. "
                    f"Using first request's stream key: {stream_key} for model inference, "
                    f"but individual stream keys for post-processing."
                )
            else:
                self.logger.debug(
                    f"Processing batch size={len(current_batch)} stream_key={stream_key}"
                )

            # Check if all requests have the same extra_params structure
            if batch_extra_params and all(
                params == batch_extra_params[0] for params in batch_extra_params
            ):
                merged_extra_params = batch_extra_params[0]
            else:
                # Handle heterogeneous extra_params - use first non-None or empty dict
                merged_extra_params = next(
                    (params for params in batch_extra_params if params), {}
                )

            try:
                # Perform batch inference
                batch_results, success = self.model_manager.batch_inference(
                    batch_input1,
                    batch_input2,
                    merged_extra_params,
                    stream_key,
                    stream_info,
                    input_hash
                )

                if not success:
                    raise RuntimeError("Batch inference failed")
                self.logger.debug(
                    f"Batch inference executed items={len(current_batch)} stream_key={stream_key}"
                )

                # Process results for each request
                for i, (request, result) in enumerate(
                    zip(current_batch, batch_results)
                ):
                    try:
                        if request.apply_post_processing:
                            processed_result, post_processing_result = (
                                await self.post_processing_fn(
                                    result,
                                    request.input1,
                                    request.post_processing_config,
                                    request.stream_key,
                                    request.stream_info,
                                    request.camera_info,
                                )
                            )
                            request.future.set_result(
                                (processed_result, post_processing_result)
                            )
                        else:
                            # Check if this is face recognition use case and return empty predictions for raw results
                            if self._is_face_recognition_request(request):
                                request.future.set_result(([], None))
                            else:
                                request.future.set_result((result, None))
                    except Exception as e:
                        request.future.set_exception(e)

            except Exception as e:
                # Set exception for all requests in the batch
                for request in current_batch:
                    if not request.future.done():
                        request.future.set_exception(e)

        except Exception as e:
            # Handle unexpected errors
            self.logger.error(f"Batch processing failed: {str(e)}")
            async with self.batch_lock:
                self.processing_batch = False

    def _is_face_recognition_request(self, request: BatchRequest) -> bool:
        """Check if a request is for face recognition use case."""
        try:
            # Parse the post-processing config to check if it's face recognition
            config = request.post_processing_config
            if isinstance(config, BaseConfig):
                return hasattr(config, 'usecase') and config.usecase == 'face_recognition'
            elif isinstance(config, dict):
                return config.get('usecase') == 'face_recognition'
            elif isinstance(config, str):
                return config == 'face_recognition'
            return False
        except Exception:
            return False

    def get_stats(self) -> Dict[str, Any]:
        """Get statistics about the current batching state."""
        return {
            "batch_size": self.batch_size,
            "max_batch_wait_time": self.max_batch_wait_time,
            "current_queue_size": len(self.batch_queue),
            "processing_batch": self.processing_batch,
        }

    async def flush_queue(self) -> int:
        """Force process all remaining items in the batch queue.

        Returns:
            Number of items processed
        """
        async with self.batch_lock:
            remaining_items = len(self.batch_queue)
            if remaining_items > 0 and not self.processing_batch:
                self.processing_batch = True
                asyncio.create_task(self._process_batch())

            return remaining_items
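A minimal usage sketch of the batching flow above, assuming `DynamicBatchManager` and `BatchRequest` are imported from this module; `_StubModelManager` and `_no_post_processing` are hypothetical stand-ins for the package's real model manager and post-processing callable, which live elsewhere in matrice_inference.

```python
import asyncio

# Hypothetical stand-ins for the package's model manager and post-processing hook.
class _StubModelManager:
    def batch_inference(self, inputs, inputs2, extra_params, stream_key, stream_info, input_hash):
        # Return one fake result per input plus a success flag, matching the
        # (batch_results, success) tuple that _process_batch expects.
        return [{"prediction": x} for x in inputs], True

async def _no_post_processing(result, raw_input, config, stream_key, stream_info, camera_info):
    return result, None

async def main():
    manager = DynamicBatchManager(
        batch_size=4,
        max_batch_wait_time=0.05,  # seconds to let the batch fill before processing
        model_manager=_StubModelManager(),
        post_processing_fn=_no_post_processing,
    )
    # Concurrent callers each await their own future; requests arriving within
    # the wait window are served by a single batch_inference call.
    results = await asyncio.gather(
        *(manager.add_request(BatchRequest(input1=f"frame-{i}")) for i in range(4))
    )
    print(results)  # four (result, None) tuples

asyncio.run(main())
```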
matrice_inference/tmp/overall_inference_testing.py
@@ -0,0 +1,338 @@
import asyncio
import os
import numpy as np
import logging
import time
from datetime import datetime
import subprocess
import psutil
from triton_model_manager import TritonModelManager
import GPUtil
import pytz

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
logger = logging.getLogger(__name__)

COCO_CLASSES = [
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck",
    "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
    "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra",
    "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
    "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
    "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
    "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
    "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
    "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
    "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
    "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier",
    "toothbrush"
]

async def triton_async_benchmark(image_dir, num_requests=100, output_report="master_benchmark_report_v1.md"):
    logger.info("Starting Triton Async Inference Master Benchmark at %s IST", datetime.now(pytz.timezone("Asia/Kolkata")).strftime('%Y-%m-%d %H:%M:%S'))
    MODEL_NAME = "yolov8n"
    MODEL_DIR = r"./models"
    # NOTE: Place your model files (yolov8n.onnx, yolov8n.plan) in the MODEL_DIR
    # wget https://github.com/Vedant-MatriceAI/Temporary_Model_Repository/raw/main/yolov8n.onnx
    # wget https://github.com/Vedant-MatriceAI/Temporary_Model_Repository/raw/main/yolov8n.plan

    INTERNAL_HOST = "localhost"
    INPUT_SIZE = 640
    NUM_CLASSES = 80
    NUM_MODEL_INSTANCES = 1
    MAX_BATCH_SIZE = 2
    IS_YOLO = True

    configurations = [
        {
            "model_path": os.path.join(MODEL_DIR, "yolov8n.plan"),
            "runtime_framework": "tensorrt",
            "server_type": server_type,
            "port": 8000 if server_type == "rest" else 8001,
            "dynamic_batching": dynamic_batching,
            "use_trt_accelerator": use_trt
        }
        for server_type in ["rest", "grpc"]
        for dynamic_batching in [True, False]
        for use_trt in [True, False]
    ] + [
        {
            "model_path": os.path.join(MODEL_DIR, "yolov8n.onnx"),
            "runtime_framework": "onnx",
            "server_type": server_type,
            "port": 8000 if server_type == "rest" else 8001,
            "dynamic_batching": dynamic_batching,
            "use_trt_accelerator": False
        }
        for server_type in ["rest", "grpc"]
        for dynamic_batching in [True, False]
    ]

    logger.info(f"Total configurations to test: {len(configurations)}")

    all_metrics = []

    system_info = {
        "triton_version": "2.37.0",
        "docker_image": "nvcr.io/nvidia/tritonserver:23.08-py3",
        "cuda_version": "12.1",
        "nvidia_driver_version": "535.216.03",
        "gpu_info": "NVIDIA L4 (ID: 0, Memory: 23034.0 MB)",
        "cpu_info": f"{psutil.cpu_count(logical=True)} logical cores, {psutil.cpu_count(logical=False)} physical cores",
        "memory_total": f"{psutil.virtual_memory().total / (1024**3):.2f} GB",
        "os": f"{subprocess.getoutput('cat /etc/os-release').split('PRETTY_NAME=')[1].splitlines()[0].strip()}"
    }
    try:
        system_info["cuda_version"] = subprocess.getoutput("nvcc --version | grep release").split("release ")[1].split(",")[0]
    except:
        logger.warning("Could not retrieve CUDA version, using fallback")
    try:
        system_info["nvidia_driver_version"] = subprocess.getoutput("nvidia-smi | grep Driver").split("Driver Version: ")[1].split()[0]
    except:
        logger.warning("Could not retrieve NVIDIA driver version, using fallback")
    try:
        gpus = GPUtil.getGPUs()
        system_info["gpu_info"] = ", ".join([f"{gpu.name} (ID: {gpu.id}, Memory: {gpu.memoryTotal} MB)" for gpu in gpus])
    except:
        logger.warning("Could not retrieve GPU info, using fallback")

    try:
        gpus = GPUtil.getGPUs()
        if gpus:
            temp = gpus[0].temperature
            mem_used = gpus[0].memoryUsed
            mem_total = gpus[0].memoryTotal
            logger.info(f"Initial GPU status: Temperature={temp}°C, Memory={mem_used}/{mem_total} MB")
            if temp > 55 or mem_used > 0.1 * mem_total:
                logger.info("Initial GPU temperature or memory usage high, waiting for stabilization...")
                for _ in range(60):
                    await asyncio.sleep(1)
                    gpus = GPUtil.getGPUs()
                    temp = gpus[0].temperature if gpus else 0
                    mem_used = gpus[0].memoryUsed if gpus else 0
                    if temp <= 55 and mem_used <= 0.1 * mem_total:
                        logger.info(f"GPU stabilized at {temp}°C, memory {mem_used}/{mem_total} MB")
                        break
                else:
                    logger.error(f"GPU still at {temp}°C, memory {mem_used}/{mem_total} MB after waiting. Aborting benchmark to prevent shutdown.")
                    raise RuntimeError("Initial GPU conditions unsafe for benchmarking")
    except Exception as e:
        logger.warning(f"Could not check initial GPU status: {str(e)}. Proceeding with caution.")

    image_files = [
        os.path.join(image_dir, f)
        for f in os.listdir(image_dir)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    ]
    if len(image_files) < num_requests:
        logger.warning(f"Requested {num_requests} images, but only {len(image_files)} found. Using available images.")
        num_requests = len(image_files)
    image_files = image_files[:num_requests]

    if not image_files:
        raise FileNotFoundError(f"No images found in {image_dir}")

    image_bytes_list = []
    for img_path in image_files:
        with open(img_path, "rb") as f:
            image_bytes_list.append(f.read())

    for idx, config in enumerate(configurations):
        logger.info(f"Running benchmark for configuration {idx + 1}/{len(configurations)}: {config}")
        metrics = {
            "latencies": [],
            "total_time": 0,
            "num_requests": num_requests,
            "successful_requests": 0,
            "failed_requests": 0,
            "total_objects_detected": 0,
            "failure_reason": ""
        }

        # GPU cool-down before each run
        try:
            gpus = GPUtil.getGPUs()
            if gpus:
                temp = gpus[0].temperature
                mem_used = gpus[0].memoryUsed
                mem_total = gpus[0].memoryTotal
                logger.info(f"GPU status before run: Temperature={temp}°C, Memory={mem_used}/{mem_total} MB")
                if temp > 55 or mem_used > 0.1 * mem_total:
                    logger.info("GPU temperature or memory usage high, waiting for cool-down...")
                    for _ in range(30):
                        await asyncio.sleep(1)
                        gpus = GPUtil.getGPUs()
                        temp = gpus[0].temperature if gpus else 0
                        mem_used = gpus[0].memoryUsed if gpus else 0
                        if temp <= 55 and mem_used <= 0.1 * mem_total:
                            logger.info(f"GPU cooled to {temp}°C, memory freed to {mem_used}/{mem_total} MB")
                            break
                    else:
                        logger.warning(f"GPU still at {temp}°C, memory {mem_used}/{mem_total} MB after waiting, proceeding with run")

        except Exception as e:
            logger.warning(f"Could not check GPU status: {str(e)}")

        try:
            if not os.path.exists(config["model_path"]):
                error_msg = f"Model file not found: {config['model_path']}"
                logger.error(error_msg)
                metrics["failed_requests"] = num_requests
                metrics["failure_reason"] = error_msg
                all_metrics.append((config, metrics))
                continue

            manager = TritonModelManager(
                model_name=MODEL_NAME,
                model_path=config["model_path"],
                runtime_framework=config["runtime_framework"],
                internal_server_type=config["server_type"],
                internal_port=config["port"],
                internal_host=INTERNAL_HOST,
                input_size=INPUT_SIZE,
                num_classes=NUM_CLASSES,
                num_model_instances=NUM_MODEL_INSTANCES,
                use_dynamic_batching=config["dynamic_batching"],
                max_batch_size=MAX_BATCH_SIZE,
                is_yolo=IS_YOLO,
                use_trt_accelerator=config["use_trt_accelerator"]
            )

            async def run_inference(image_bytes, img_idx):
                start_time = time.time()
                try:
                    result, success = await manager.async_inference(image_bytes)
                    if not success or result is None or result.get("predictions") is None:
                        raise RuntimeError(f"Inference failed for image {img_idx}")

                    # Extract predictions
                    predictions = result["predictions"]
                    boxes = np.array(predictions["boxes"])
                    scores = np.array(predictions["scores"])
                    class_ids = np.array(predictions["class_ids"])

                    # Log results for first few images
                    if img_idx < 3:
                        logger.info(f"======= Results for image {img_idx}: {os.path.basename(image_files[img_idx])} =======")
                        logger.info(f"Detected {boxes.shape[0]} objects")
                        for i in range(min(boxes.shape[0], 3)):
                            try:
                                box = boxes[i]
                                score = scores[i]
                                class_id = int(class_ids[i])
                                class_name = COCO_CLASSES[class_id] if 0 <= class_id < len(COCO_CLASSES) else "unknown"
                                logger.info(f"Object {i+1}: {class_name} (Score: {score:.4f}, Box: {box})")
                            except Exception as e:
                                logger.warning(f"Failed to log object {i+1} for image {img_idx}: {e}")
                        logger.info("=============================================")

                    metrics["successful_requests"] += 1
                    metrics["total_objects_detected"] += boxes.shape[0]
                    metrics["latencies"].append(time.time() - start_time)
                except Exception as e:
                    logger.error(f"Inference failed for image {img_idx}: {str(e)}")
                    metrics["failed_requests"] += 1

            start_total_time = time.time()
            tasks = [run_inference(image_bytes, idx) for idx, image_bytes in enumerate(image_bytes_list)]
            await asyncio.gather(*tasks, return_exceptions=True)
            metrics["total_time"] = time.time() - start_total_time

            # Calculate metrics
            metrics["throughput"] = metrics["successful_requests"] / metrics["total_time"] if metrics["total_time"] > 0 else 0
            metrics["avg_fps"] = metrics["successful_requests"] / metrics["total_time"] if metrics["total_time"] > 0 else 0
            metrics["avg_latency_ms"] = (sum(metrics["latencies"]) / len(metrics["latencies"]) * 1000) if metrics["latencies"] else 0
            metrics["min_latency_ms"] = min(metrics["latencies"]) * 1000 if metrics["latencies"] else 0
            metrics["max_latency_ms"] = max(metrics["latencies"]) * 1000 if metrics["latencies"] else 0
            metrics["p95_latency_ms"] = np.percentile(metrics["latencies"], 95) * 1000 if metrics["latencies"] else 0

        except Exception as e:
            error_msg = f"Benchmark error: {str(e)}"
            logger.error(error_msg)
            metrics["failed_requests"] = num_requests
            metrics["failure_reason"] = error_msg
        finally:
            try:
                manager.triton_server_process.terminate()
                manager.triton_server_process.wait(timeout=300)
                logger.info("Triton server terminated")
            except Exception as e:
                logger.warning(f"Cleanup failed: {str(e)}")
            all_metrics.append((config, metrics))

    report_content = f"""# Triton Inference Server Master Benchmark Report
*Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} UTC*
*Generated on {datetime.now(pytz.timezone("Asia/Kolkata")).strftime('%Y-%m-%d %H:%M:%S')} IST*

## System Configuration
- **Operating System**: {system_info["os"]}
- **Triton Server Version**: {system_info["triton_version"]}
- **Docker Image**: {system_info["docker_image"]}
- **CUDA Version**: {system_info["cuda_version"]}
- **NVIDIA Driver Version**: {system_info["nvidia_driver_version"]}
- **GPU Configuration**: {system_info["gpu_info"]}
- **CPU Configuration**: {system_info["cpu_info"]}
- **System Memory**: {system_info["memory_total"]}

## Benchmark Summary
| Config ID | Model Format | Server Protocol | Dynamic Batching | TensorRT Accelerator | Total Images | Failed Requests | Objects Detected | Total Time (s) | Throughput (img/s) | Avg FPS | Avg Latency (ms) | Min Latency (ms) | Max Latency (ms) | P95 Latency (ms) |
|-----------|--------------|-----------------|------------------|---------------------|--------------|-----------------|------------------|----------------|--------------------|---------|------------------|------------------|------------------|------------------|
"""
    for idx, (config, metrics) in enumerate(all_metrics):
        report_content += f"| {idx + 1} | {config['runtime_framework']} | {config['server_type']} | {config['dynamic_batching']} | {config['use_trt_accelerator']} | {metrics['successful_requests']} | {metrics['failed_requests']} | {metrics['total_objects_detected']} | {metrics['total_time']:.2f} | {metrics['throughput']:.2f} | {metrics['avg_fps']:.2f} | {metrics['avg_latency_ms']:.2f} | {metrics['min_latency_ms']:.2f} | {metrics['max_latency_ms']:.2f} | {metrics['p95_latency_ms']:.2f} |\n"

    report_content += """
## Detailed Results
"""
    for idx, (config, metrics) in enumerate(all_metrics):
        report_content += f"""
### Configuration {idx + 1}: {config['runtime_framework'].upper()} ({config['server_type'].upper()}, Dynamic Batching: {config['dynamic_batching']}, TensorRT: {config['use_trt_accelerator']})
- **Model Name**: {MODEL_NAME}
- **Model Path**: {config['model_path']}
- **Runtime Framework**: {config['runtime_framework']}
- **Server Protocol**: {config['server_type']}
- **Port**: {config['port']}
- **Input Size**: {INPUT_SIZE}x{INPUT_SIZE}
- **Number of Classes**: {NUM_CLASSES}
- **Number of Model Instances**: {NUM_MODEL_INSTANCES}
- **Dynamic Batching**: {config['dynamic_batching']}
- **Max Batch Size**: {MAX_BATCH_SIZE}
- **YOLO Model**: {IS_YOLO}
- **TensorRT Accelerator**: {config['use_trt_accelerator']}

#### Benchmark Results
"""
        if metrics["failure_reason"]:
            report_content += f"- **Status**: Failed\n- **Failure Reason**: {metrics['failure_reason']}\n"
        else:
            report_content += f"""- **Total Images Processed**: {metrics['successful_requests']}
- **Failed Requests**: {metrics['failed_requests']}
- **Total Objects Detected**: {metrics['total_objects_detected']}
- **Total Time**: {metrics['total_time']:.2f} seconds
- **Throughput**: {metrics['throughput']:.2f} images/second
- **Average FPS**: {metrics['avg_fps']:.2f} frames/second
- **Average Latency**: {metrics['avg_latency_ms']:.2f} ms
- **Min Latency**: {metrics['min_latency_ms']:.2f} ms
- **Max Latency**: {metrics['max_latency_ms']:.2f} ms
- **P95 Latency**: {metrics['p95_latency_ms']:.2f} ms
"""

    with open(output_report, "w") as f:
        f.write(report_content)
    logger.info(f"Master benchmark report saved to {output_report}")

if __name__ == "__main__":
    image_dir = r"./coco/val2017"
    # NOTE: Exec the below commands beforehand to prepare dataset

    # mkdir -p coco && cd coco
    # wget http://images.cocodataset.org/zips/val2017.zip
    # unzip val2017.zip

    num_requests = 100
    output_report = "master_benchmark_report_v1.md"
    asyncio.run(triton_async_benchmark(image_dir, num_requests, output_report))
    logger.info("Benchmarking completed for %d requests.", num_requests)
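The report's latency and throughput columns are simple aggregates over the per-request timings collected in run_inference; a small illustrative calculation with made-up latency values (not real benchmark output):

```python
import numpy as np

# Hypothetical per-request latencies in seconds; real values come from the
# time.time() deltas recorded in run_inference above.
latencies = [0.021, 0.025, 0.019, 0.032, 0.027]
total_time = 0.5  # wall-clock seconds for the whole asyncio.gather run

avg_latency_ms = sum(latencies) / len(latencies) * 1000
p95_latency_ms = np.percentile(latencies, 95) * 1000
throughput = len(latencies) / total_time  # successful requests per second

print(f"avg={avg_latency_ms:.2f} ms  p95={p95_latency_ms:.2f} ms  throughput={throughput:.2f} img/s")
```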