matrice_inference-0.1.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of matrice-inference might be problematic.

Files changed (37)
  1. matrice_inference/__init__.py +72 -0
  2. matrice_inference/py.typed +0 -0
  3. matrice_inference/server/__init__.py +23 -0
  4. matrice_inference/server/inference_interface.py +176 -0
  5. matrice_inference/server/model/__init__.py +1 -0
  6. matrice_inference/server/model/model_manager.py +274 -0
  7. matrice_inference/server/model/model_manager_wrapper.py +550 -0
  8. matrice_inference/server/model/triton_model_manager.py +290 -0
  9. matrice_inference/server/model/triton_server.py +1248 -0
  10. matrice_inference/server/proxy_interface.py +371 -0
  11. matrice_inference/server/server.py +1004 -0
  12. matrice_inference/server/stream/__init__.py +0 -0
  13. matrice_inference/server/stream/app_deployment.py +228 -0
  14. matrice_inference/server/stream/consumer_worker.py +201 -0
  15. matrice_inference/server/stream/frame_cache.py +127 -0
  16. matrice_inference/server/stream/inference_worker.py +163 -0
  17. matrice_inference/server/stream/post_processing_worker.py +230 -0
  18. matrice_inference/server/stream/producer_worker.py +147 -0
  19. matrice_inference/server/stream/stream_pipeline.py +451 -0
  20. matrice_inference/server/stream/utils.py +23 -0
  21. matrice_inference/tmp/abstract_model_manager.py +58 -0
  22. matrice_inference/tmp/aggregator/__init__.py +18 -0
  23. matrice_inference/tmp/aggregator/aggregator.py +330 -0
  24. matrice_inference/tmp/aggregator/analytics.py +906 -0
  25. matrice_inference/tmp/aggregator/ingestor.py +438 -0
  26. matrice_inference/tmp/aggregator/latency.py +597 -0
  27. matrice_inference/tmp/aggregator/pipeline.py +968 -0
  28. matrice_inference/tmp/aggregator/publisher.py +431 -0
  29. matrice_inference/tmp/aggregator/synchronizer.py +594 -0
  30. matrice_inference/tmp/batch_manager.py +239 -0
  31. matrice_inference/tmp/overall_inference_testing.py +338 -0
  32. matrice_inference/tmp/triton_utils.py +638 -0
  33. matrice_inference-0.1.2.dist-info/METADATA +28 -0
  34. matrice_inference-0.1.2.dist-info/RECORD +37 -0
  35. matrice_inference-0.1.2.dist-info/WHEEL +5 -0
  36. matrice_inference-0.1.2.dist-info/licenses/LICENSE.txt +21 -0
  37. matrice_inference-0.1.2.dist-info/top_level.txt +1 -0
matrice_inference/tmp/batch_manager.py
@@ -0,0 +1,239 @@
+ """
+ Dynamic batching manager for inference requests.
+
+ This module contains the batching logic separated from the main inference interface
+ to improve modularity and maintainability.
+ """
+
+ import asyncio
+ import logging
+ import time
+ from dataclasses import dataclass, field
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+
+ from matrice_analytics.post_processing.core.config import BaseConfig
+
+ @dataclass
+ class BatchRequest:
+     """Represents a single inference request in a batch"""
+
+     input1: Any
+     input2: Optional[Any] = None
+     extra_params: Optional[Dict[str, Any]] = None
+     apply_post_processing: bool = False
+     post_processing_config: Optional[Union[Dict[str, Any], BaseConfig]] = None
+     future: asyncio.Future = field(default_factory=asyncio.Future)
+     timestamp: float = field(default_factory=time.time)
+     stream_key: Optional[str] = None
+     stream_info: Optional[Dict[str, Any]] = None
+     input_hash: Optional[str] = None
+     camera_info: Optional[Dict[str, Any]] = None
+
+
+ class DynamicBatchManager:
+     """Manages dynamic batching for inference requests"""
+
+     def __init__(
+         self,
+         batch_size: int,
+         max_batch_wait_time: float,
+         model_manager,
+         post_processing_fn: Callable,
+     ):
+         """
+         Initialize the dynamic batch manager.
+
+         Args:
+             batch_size: Maximum batch size for processing
+             max_batch_wait_time: Maximum wait time for batching
+             model_manager: Model manager for inference
+             post_processing_fn: Function to apply post-processing
+         """
+         self.logger = logging.getLogger(__name__)
+         self.batch_size = batch_size
+         self.max_batch_wait_time = max_batch_wait_time
+         self.model_manager = model_manager
+         self.post_processing_fn = post_processing_fn
+
+         # Dynamic batching components
+         self.batch_queue: List[BatchRequest] = []
+         self.batch_lock = asyncio.Lock()
+         self.processing_batch = False
+
+     async def add_request(self, batch_request: BatchRequest) -> Tuple[Any, Optional[Dict[str, Any]]]:
+         """Add a request to the batch queue and process if needed"""
+         # Add to batch queue
+         async with self.batch_lock:
+             self.batch_queue.append(batch_request)
+
+             # Check if we should process the batch
+             should_process = (
+                 len(self.batch_queue) >= self.batch_size or not self.processing_batch
+             )
+
+             if should_process and not self.processing_batch:
+                 self.processing_batch = True
+                 # Start batch processing in background
+                 asyncio.create_task(self._process_batch())
+
+         # Wait for the result
+         try:
+             return await batch_request.future
+         except Exception as e:
+             raise RuntimeError(f"Dynamic batch inference failed: {str(e)}") from e
+
+     async def _process_batch(self):
+         """Process batched inference requests"""
+         try:
+             # Wait for batch to fill up or timeout
+             await asyncio.sleep(self.max_batch_wait_time)
+
+             async with self.batch_lock:
+                 if not self.batch_queue:
+                     self.processing_batch = False
+                     return
+
+                 # Extract current batch
+                 current_batch = self.batch_queue[: self.batch_size]
+                 self.batch_queue = self.batch_queue[self.batch_size :]
+
+                 # Reset processing flag if no more items
+                 if not self.batch_queue:
+                     self.processing_batch = False
+                 else:
+                     # Continue processing remaining items
+                     asyncio.create_task(self._process_batch())
+
+             if not current_batch:
+                 return
+
+             # Prepare batch inputs
+             batch_input1 = [req.input1 for req in current_batch]
+             batch_input2 = (
+                 [req.input2 for req in current_batch]
+                 if any(req.input2 is not None for req in current_batch)
+                 else None
+             )
+             batch_extra_params = [req.extra_params for req in current_batch]
+             stream_key = current_batch[0].stream_key
+             stream_info = current_batch[0].stream_info
+             input_hash = current_batch[0].input_hash
+
+             # Validate that all requests in the batch have the same stream_key
+             batch_stream_keys = [req.stream_key for req in current_batch]
+             if not all(sk == stream_key for sk in batch_stream_keys):
+                 self.logger.warning(
+                     f"Batch contains requests with different stream keys: {set(batch_stream_keys)}. "
+                     f"Using first request's stream key: {stream_key} for model inference, "
+                     f"but individual stream keys for post-processing."
+                 )
+             else:
+                 self.logger.debug(
+                     f"Processing batch size={len(current_batch)} stream_key={stream_key}"
+                 )
+
+             # Check if all requests have the same extra_params structure
+             if batch_extra_params and all(
+                 params == batch_extra_params[0] for params in batch_extra_params
+             ):
+                 merged_extra_params = batch_extra_params[0]
+             else:
+                 # Handle heterogeneous extra_params - use first non-None or empty dict
+                 merged_extra_params = next(
+                     (params for params in batch_extra_params if params), {}
+                 )
+
+             try:
+                 # Perform batch inference
+                 batch_results, success = self.model_manager.batch_inference(
+                     batch_input1,
+                     batch_input2,
+                     merged_extra_params,
+                     stream_key,
+                     stream_info,
+                     input_hash
+                 )
+
+                 if not success:
+                     raise RuntimeError("Batch inference failed")
+                 self.logger.debug(
+                     f"Batch inference executed items={len(current_batch)} stream_key={stream_key}"
+                 )
+
+                 # Process results for each request
+                 for i, (request, result) in enumerate(
+                     zip(current_batch, batch_results)
+                 ):
+                     try:
+                         if request.apply_post_processing:
+                             processed_result, post_processing_result = (
+                                 await self.post_processing_fn(
+                                     result,
+                                     request.input1,
+                                     request.post_processing_config,
+                                     request.stream_key,
+                                     request.stream_info,
+                                     request.camera_info,
+                                 )
+                             )
+                             request.future.set_result(
+                                 (processed_result, post_processing_result)
+                             )
+                         else:
+                             # Check if this is face recognition use case and return empty predictions for raw results
+                             if self._is_face_recognition_request(request):
+                                 request.future.set_result(([], None))
+                             else:
+                                 request.future.set_result((result, None))
+                     except Exception as e:
+                         request.future.set_exception(e)
+
+             except Exception as e:
+                 # Set exception for all requests in the batch
+                 for request in current_batch:
+                     if not request.future.done():
+                         request.future.set_exception(e)
+
+         except Exception as e:
+             # Handle unexpected errors
+             self.logger.error(f"Batch processing failed: {str(e)}")
+             async with self.batch_lock:
+                 self.processing_batch = False
+
+     def _is_face_recognition_request(self, request: BatchRequest) -> bool:
+         """Check if a request is for face recognition use case."""
+         try:
+             # Parse the post-processing config to check if it's face recognition
+             config = request.post_processing_config
+             if isinstance(config, BaseConfig):
+                 return hasattr(config, 'usecase') and config.usecase == 'face_recognition'
+             elif isinstance(config, dict):
+                 return config.get('usecase') == 'face_recognition'
+             elif isinstance(config, str):
+                 return config == 'face_recognition'
+             return False
+         except Exception:
+             return False
+
+     def get_stats(self) -> Dict[str, Any]:
+         """Get statistics about the current batching state."""
+         return {
+             "batch_size": self.batch_size,
+             "max_batch_wait_time": self.max_batch_wait_time,
+             "current_queue_size": len(self.batch_queue),
+             "processing_batch": self.processing_batch,
+         }
+
+     async def flush_queue(self) -> int:
+         """Force process all remaining items in the batch queue.
+
+         Returns:
+             Number of items processed
+         """
+         async with self.batch_lock:
+             remaining_items = len(self.batch_queue)
+             if remaining_items > 0 and not self.processing_batch:
+                 self.processing_batch = True
+                 asyncio.create_task(self._process_batch())
+
+         return remaining_items
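
For context, a minimal usage sketch of DynamicBatchManager follows (it is not part of the package diff). It assumes the module is importable as matrice_inference.tmp.batch_manager, that matrice_analytics (which provides BaseConfig) is installed, and it substitutes a hypothetical DummyModelManager plus a pass-through post-processing coroutine for the real Triton-backed components; the only assumption about batch_inference is that it returns a (results, success) tuple, as the code above expects.

    import asyncio

    from matrice_inference.tmp.batch_manager import BatchRequest, DynamicBatchManager


    class DummyModelManager:
        """Hypothetical stand-in for the real model manager."""

        def batch_inference(self, inputs, inputs2, extra_params, stream_key, stream_info, input_hash):
            # The batch manager expects a (results, success) tuple with one result per input.
            return [{"echo": item} for item in inputs], True


    async def passthrough_post_processing(result, raw_input, config, stream_key, stream_info, camera_info):
        # No-op post-processing: return the raw result and no post-processing payload.
        return result, None


    async def main():
        manager = DynamicBatchManager(
            batch_size=4,
            max_batch_wait_time=0.05,
            model_manager=DummyModelManager(),
            post_processing_fn=passthrough_post_processing,
        )
        # Concurrent callers share a batch; each awaits only its own future.
        requests = [BatchRequest(input1=f"frame-{i}") for i in range(8)]
        results = await asyncio.gather(*(manager.add_request(req) for req in requests))
        print(len(results), manager.get_stats())


    asyncio.run(main())

Requests issued within max_batch_wait_time of each other are served from a single batch_inference call, so the sketch above resolves all eight futures with two batched calls of four inputs each.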
matrice_inference/tmp/overall_inference_testing.py
@@ -0,0 +1,338 @@
+ import asyncio
+ import os
+ import numpy as np
+ import logging
+ import time
+ from datetime import datetime
+ import subprocess
+ import psutil
+ from triton_model_manager import TritonModelManager
+ import GPUtil
+ import pytz
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s [%(levelname)s] %(message)s",
+ )
+ logger = logging.getLogger(__name__)
+
+ COCO_CLASSES = [
+     "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck",
+     "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
+     "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra",
+     "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+     "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
+     "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
+     "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
+     "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
+     "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
+     "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
+     "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier",
+     "toothbrush"
+ ]
+
+ async def triton_async_benchmark(image_dir, num_requests=100, output_report="master_benchmark_report_v1.md"):
+     logger.info("Starting Triton Async Inference Master Benchmark at %s IST", datetime.now(pytz.timezone("Asia/Kolkata")).strftime('%Y-%m-%d %H:%M:%S'))
+     MODEL_NAME = "yolov8n"
+     MODEL_DIR = r"./models"
+     # NOTE: Place your model files (yolov8n.onnx, yolov8n.plan) in the MODEL_DIR
+     # wget https://github.com/Vedant-MatriceAI/Temporary_Model_Repository/raw/main/yolov8n.onnx
+     # wget https://github.com/Vedant-MatriceAI/Temporary_Model_Repository/raw/main/yolov8n.plan
+
+     INTERNAL_HOST = "localhost"
+     INPUT_SIZE = 640
+     NUM_CLASSES = 80
+     NUM_MODEL_INSTANCES = 1
+     MAX_BATCH_SIZE = 2
+     IS_YOLO = True
+
+     configurations = [
+         {
+             "model_path": os.path.join(MODEL_DIR, "yolov8n.plan"),
+             "runtime_framework": "tensorrt",
+             "server_type": server_type,
+             "port": 8000 if server_type == "rest" else 8001,
+             "dynamic_batching": dynamic_batching,
+             "use_trt_accelerator": use_trt
+         }
+         for server_type in ["rest", "grpc"]
+         for dynamic_batching in [True, False]
+         for use_trt in [True, False]
+     ] + [
+         {
+             "model_path": os.path.join(MODEL_DIR, "yolov8n.onnx"),
+             "runtime_framework": "onnx",
+             "server_type": server_type,
+             "port": 8000 if server_type == "rest" else 8001,
+             "dynamic_batching": dynamic_batching,
+             "use_trt_accelerator": False
+         }
+         for server_type in ["rest", "grpc"]
+         for dynamic_batching in [True, False]
+     ]
+
+     logger.info(f"Total configurations to test: {len(configurations)}")
+
+     all_metrics = []
+
+     system_info = {
+         "triton_version": "2.37.0",
+         "docker_image": "nvcr.io/nvidia/tritonserver:23.08-py3",
+         "cuda_version": "12.1",
+         "nvidia_driver_version": "535.216.03",
+         "gpu_info": "NVIDIA L4 (ID: 0, Memory: 23034.0 MB)",
+         "cpu_info": f"{psutil.cpu_count(logical=True)} logical cores, {psutil.cpu_count(logical=False)} physical cores",
+         "memory_total": f"{psutil.virtual_memory().total / (1024**3):.2f} GB",
+         "os": f"{subprocess.getoutput('cat /etc/os-release').split('PRETTY_NAME=')[1].splitlines()[0].strip()}"
+     }
+     try:
+         system_info["cuda_version"] = subprocess.getoutput("nvcc --version | grep release").split("release ")[1].split(",")[0]
+     except:
+         logger.warning("Could not retrieve CUDA version, using fallback")
+     try:
+         system_info["nvidia_driver_version"] = subprocess.getoutput("nvidia-smi | grep Driver").split("Driver Version: ")[1].split()[0]
+     except:
+         logger.warning("Could not retrieve NVIDIA driver version, using fallback")
+     try:
+         gpus = GPUtil.getGPUs()
+         system_info["gpu_info"] = ", ".join([f"{gpu.name} (ID: {gpu.id}, Memory: {gpu.memoryTotal} MB)" for gpu in gpus])
+     except:
+         logger.warning("Could not retrieve GPU info, using fallback")
+
+     try:
+         gpus = GPUtil.getGPUs()
+         if gpus:
+             temp = gpus[0].temperature
+             mem_used = gpus[0].memoryUsed
+             mem_total = gpus[0].memoryTotal
+             logger.info(f"Initial GPU status: Temperature={temp}°C, Memory={mem_used}/{mem_total} MB")
+             if temp > 55 or mem_used > 0.1 * mem_total:
+                 logger.info("Initial GPU temperature or memory usage high, waiting for stabilization...")
+                 for _ in range(60):
+                     await asyncio.sleep(1)
+                     gpus = GPUtil.getGPUs()
+                     temp = gpus[0].temperature if gpus else 0
+                     mem_used = gpus[0].memoryUsed if gpus else 0
+                     if temp <= 55 and mem_used <= 0.1 * mem_total:
+                         logger.info(f"GPU stabilized at {temp}°C, memory {mem_used}/{mem_total} MB")
+                         break
+                 else:
+                     logger.error(f"GPU still at {temp}°C, memory {mem_used}/{mem_total} MB after waiting. Aborting benchmark to prevent shutdown.")
+                     raise RuntimeError("Initial GPU conditions unsafe for benchmarking")
+     except Exception as e:
+         logger.warning(f"Could not check initial GPU status: {str(e)}. Proceeding with caution.")
+
+     image_files = [
+         os.path.join(image_dir, f)
+         for f in os.listdir(image_dir)
+         if f.lower().endswith((".jpg", ".jpeg", ".png"))
+     ]
+     if len(image_files) < num_requests:
+         logger.warning(f"Requested {num_requests} images, but only {len(image_files)} found. Using available images.")
+         num_requests = len(image_files)
+     image_files = image_files[:num_requests]
+
+     if not image_files:
+         raise FileNotFoundError(f"No images found in {image_dir}")
+
+     image_bytes_list = []
+     for img_path in image_files:
+         with open(img_path, "rb") as f:
+             image_bytes_list.append(f.read())
+
+     for idx, config in enumerate(configurations):
+         logger.info(f"Running benchmark for configuration {idx + 1}/{len(configurations)}: {config}")
+         metrics = {
+             "latencies": [],
+             "total_time": 0,
+             "num_requests": num_requests,
+             "successful_requests": 0,
+             "failed_requests": 0,
+             "total_objects_detected": 0,
+             "failure_reason": ""
+         }
+
+         # GPU cool-down before each run
+         try:
+             gpus = GPUtil.getGPUs()
+             if gpus:
+                 temp = gpus[0].temperature
+                 mem_used = gpus[0].memoryUsed
+                 mem_total = gpus[0].memoryTotal
+                 logger.info(f"GPU status before run: Temperature={temp}°C, Memory={mem_used}/{mem_total} MB")
+                 if temp > 55 or mem_used > 0.1 * mem_total:
+                     logger.info("GPU temperature or memory usage high, waiting for cool-down...")
+                     for _ in range(30):
+                         await asyncio.sleep(1)
+                         gpus = GPUtil.getGPUs()
+                         temp = gpus[0].temperature if gpus else 0
+                         mem_used = gpus[0].memoryUsed if gpus else 0
+                         if temp <= 55 and mem_used <= 0.1 * mem_total:
+                             logger.info(f"GPU cooled to {temp}°C, memory freed to {mem_used}/{mem_total} MB")
+                             break
+                     else:
+                         logger.warning(f"GPU still at {temp}°C, memory {mem_used}/{mem_total} MB after waiting, proceeding with run")
+
+         except Exception as e:
+             logger.warning(f"Could not check GPU status: {str(e)}")
+
+         try:
+             if not os.path.exists(config["model_path"]):
+                 error_msg = f"Model file not found: {config['model_path']}"
+                 logger.error(error_msg)
+                 metrics["failed_requests"] = num_requests
+                 metrics["failure_reason"] = error_msg
+                 all_metrics.append((config, metrics))
+                 continue
+
+             manager = TritonModelManager(
+                 model_name=MODEL_NAME,
+                 model_path=config["model_path"],
+                 runtime_framework=config["runtime_framework"],
+                 internal_server_type=config["server_type"],
+                 internal_port=config["port"],
+                 internal_host=INTERNAL_HOST,
+                 input_size=INPUT_SIZE,
+                 num_classes=NUM_CLASSES,
+                 num_model_instances=NUM_MODEL_INSTANCES,
+                 use_dynamic_batching=config["dynamic_batching"],
+                 max_batch_size=MAX_BATCH_SIZE,
+                 is_yolo=IS_YOLO,
+                 use_trt_accelerator=config["use_trt_accelerator"]
+             )
+
+             async def run_inference(image_bytes, img_idx):
+                 start_time = time.time()
+                 try:
+                     result, success = await manager.async_inference(image_bytes)
+                     if not success or result is None or result.get("predictions") is None:
+                         raise RuntimeError(f"Inference failed for image {img_idx}")
+
+                     # Extract predictions
+                     predictions = result["predictions"]
+                     boxes = np.array(predictions["boxes"])
+                     scores = np.array(predictions["scores"])
+                     class_ids = np.array(predictions["class_ids"])
+
+                     # Log results for first few images
+                     if img_idx < 3:
+                         logger.info(f"======= Results for image {img_idx}: {os.path.basename(image_files[img_idx])} =======")
+                         logger.info(f"Detected {boxes.shape[0]} objects")
+                         for i in range(min(boxes.shape[0], 3)):
+                             try:
+                                 box = boxes[i]
+                                 score = scores[i]
+                                 class_id = int(class_ids[i])
+                                 class_name = COCO_CLASSES[class_id] if 0 <= class_id < len(COCO_CLASSES) else "unknown"
+                                 logger.info(f"Object {i+1}: {class_name} (Score: {score:.4f}, Box: {box})")
+                             except Exception as e:
+                                 logger.warning(f"Failed to log object {i+1} for image {img_idx}: {e}")
+                         logger.info("=============================================")
+
+                     metrics["successful_requests"] += 1
+                     metrics["total_objects_detected"] += boxes.shape[0]
+                     metrics["latencies"].append(time.time() - start_time)
+                 except Exception as e:
+                     logger.error(f"Inference failed for image {img_idx}: {str(e)}")
+                     metrics["failed_requests"] += 1
+
+             start_total_time = time.time()
+             tasks = [run_inference(image_bytes, idx) for idx, image_bytes in enumerate(image_bytes_list)]
+             await asyncio.gather(*tasks, return_exceptions=True)
+             metrics["total_time"] = time.time() - start_total_time
+
+             # Calculate metrics
+             metrics["throughput"] = metrics["successful_requests"] / metrics["total_time"] if metrics["total_time"] > 0 else 0
+             metrics["avg_fps"] = metrics["successful_requests"] / metrics["total_time"] if metrics["total_time"] > 0 else 0
+             metrics["avg_latency_ms"] = (sum(metrics["latencies"]) / len(metrics["latencies"]) * 1000) if metrics["latencies"] else 0
+             metrics["min_latency_ms"] = min(metrics["latencies"]) * 1000 if metrics["latencies"] else 0
+             metrics["max_latency_ms"] = max(metrics["latencies"]) * 1000 if metrics["latencies"] else 0
+             metrics["p95_latency_ms"] = np.percentile(metrics["latencies"], 95) * 1000 if metrics["latencies"] else 0
+
+         except Exception as e:
+             error_msg = f"Benchmark error: {str(e)}"
+             logger.error(error_msg)
+             metrics["failed_requests"] = num_requests
+             metrics["failure_reason"] = error_msg
+         finally:
+             try:
+                 manager.triton_server_process.terminate()
+                 manager.triton_server_process.wait(timeout=300)
+                 logger.info("Triton server terminated")
+             except Exception as e:
+                 logger.warning(f"Cleanup failed: {str(e)}")
+         all_metrics.append((config, metrics))
+
+     report_content = f"""# Triton Inference Server Master Benchmark Report
+ *Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} UTC*
+ *Generated on {datetime.now(pytz.timezone("Asia/Kolkata")).strftime('%Y-%m-%d %H:%M:%S')} IST*
+
+ ## System Configuration
+ - **Operating System**: {system_info["os"]}
+ - **Triton Server Version**: {system_info["triton_version"]}
+ - **Docker Image**: {system_info["docker_image"]}
+ - **CUDA Version**: {system_info["cuda_version"]}
+ - **NVIDIA Driver Version**: {system_info["nvidia_driver_version"]}
+ - **GPU Configuration**: {system_info["gpu_info"]}
+ - **CPU Configuration**: {system_info["cpu_info"]}
+ - **System Memory**: {system_info["memory_total"]}
+
+ ## Benchmark Summary
+ | Config ID | Model Format | Server Protocol | Dynamic Batching | TensorRT Accelerator | Total Images | Failed Requests | Objects Detected | Total Time (s) | Throughput (img/s) | Avg FPS | Avg Latency (ms) | Min Latency (ms) | Max Latency (ms) | P95 Latency (ms) |
+ |-----------|--------------|-----------------|------------------|---------------------|--------------|-----------------|------------------|----------------|--------------------|---------|------------------|------------------|------------------|------------------|
+ """
+     for idx, (config, metrics) in enumerate(all_metrics):
+         report_content += f"| {idx + 1} | {config['runtime_framework']} | {config['server_type']} | {config['dynamic_batching']} | {config['use_trt_accelerator']} | {metrics['successful_requests']} | {metrics['failed_requests']} | {metrics['total_objects_detected']} | {metrics['total_time']:.2f} | {metrics['throughput']:.2f} | {metrics['avg_fps']:.2f} | {metrics['avg_latency_ms']:.2f} | {metrics['min_latency_ms']:.2f} | {metrics['max_latency_ms']:.2f} | {metrics['p95_latency_ms']:.2f} |\n"
+
+     report_content += """
+ ## Detailed Results
+ """
+     for idx, (config, metrics) in enumerate(all_metrics):
+         report_content += f"""
+ ### Configuration {idx + 1}: {config['runtime_framework'].upper()} ({config['server_type'].upper()}, Dynamic Batching: {config['dynamic_batching']}, TensorRT: {config['use_trt_accelerator']})
+ - **Model Name**: {MODEL_NAME}
+ - **Model Path**: {config['model_path']}
+ - **Runtime Framework**: {config['runtime_framework']}
+ - **Server Protocol**: {config['server_type']}
+ - **Port**: {config['port']}
+ - **Input Size**: {INPUT_SIZE}x{INPUT_SIZE}
+ - **Number of Classes**: {NUM_CLASSES}
+ - **Number of Model Instances**: {NUM_MODEL_INSTANCES}
+ - **Dynamic Batching**: {config['dynamic_batching']}
+ - **Max Batch Size**: {MAX_BATCH_SIZE}
+ - **YOLO Model**: {IS_YOLO}
+ - **TensorRT Accelerator**: {config['use_trt_accelerator']}
+
+ #### Benchmark Results
+ """
+         if metrics["failure_reason"]:
+             report_content += f"- **Status**: Failed\n- **Failure Reason**: {metrics['failure_reason']}\n"
+         else:
+             report_content += f"""- **Total Images Processed**: {metrics['successful_requests']}
+ - **Failed Requests**: {metrics['failed_requests']}
+ - **Total Objects Detected**: {metrics['total_objects_detected']}
+ - **Total Time**: {metrics['total_time']:.2f} seconds
+ - **Throughput**: {metrics['throughput']:.2f} images/second
+ - **Average FPS**: {metrics['avg_fps']:.2f} frames/second
+ - **Average Latency**: {metrics['avg_latency_ms']:.2f} ms
+ - **Min Latency**: {metrics['min_latency_ms']:.2f} ms
+ - **Max Latency**: {metrics['max_latency_ms']:.2f} ms
+ - **P95 Latency**: {metrics['p95_latency_ms']:.2f} ms
+ """
+
+     with open(output_report, "w") as f:
+         f.write(report_content)
+     logger.info(f"Master benchmark report saved to {output_report}")
+
+ if __name__ == "__main__":
+     image_dir = r"./coco/val2017"
+     # NOTE: Exec the below commands beforehand to prepare dataset
+
+     # mkdir -p coco && cd coco
+     # wget http://images.cocodataset.org/zips/val2017.zip
+     # unzip val2017.zip
+
+     num_requests = 100
+     output_report = "master_benchmark_report_v1.md"
+     asyncio.run(triton_async_benchmark(image_dir, num_requests, output_report))
+     logger.info("Benchmarking completed for %d requests.", num_requests)