matrice-streaming 0.1.14__py3-none-any.whl → 0.1.65__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. matrice_streaming/__init__.py +44 -32
  2. matrice_streaming/streaming_gateway/camera_streamer/__init__.py +68 -1
  3. matrice_streaming/streaming_gateway/camera_streamer/async_camera_worker.py +1388 -0
  4. matrice_streaming/streaming_gateway/camera_streamer/async_ffmpeg_worker.py +966 -0
  5. matrice_streaming/streaming_gateway/camera_streamer/camera_streamer.py +188 -24
  6. matrice_streaming/streaming_gateway/camera_streamer/device_detection.py +507 -0
  7. matrice_streaming/streaming_gateway/camera_streamer/encoding_pool_manager.py +136 -0
  8. matrice_streaming/streaming_gateway/camera_streamer/ffmpeg_camera_streamer.py +1048 -0
  9. matrice_streaming/streaming_gateway/camera_streamer/ffmpeg_config.py +192 -0
  10. matrice_streaming/streaming_gateway/camera_streamer/ffmpeg_worker_manager.py +470 -0
  11. matrice_streaming/streaming_gateway/camera_streamer/gstreamer_camera_streamer.py +1368 -0
  12. matrice_streaming/streaming_gateway/camera_streamer/gstreamer_worker.py +1063 -0
  13. matrice_streaming/streaming_gateway/camera_streamer/gstreamer_worker_manager.py +546 -0
  14. matrice_streaming/streaming_gateway/camera_streamer/message_builder.py +60 -15
  15. matrice_streaming/streaming_gateway/camera_streamer/nvdec.py +1330 -0
  16. matrice_streaming/streaming_gateway/camera_streamer/nvdec_worker_manager.py +412 -0
  17. matrice_streaming/streaming_gateway/camera_streamer/platform_pipelines.py +680 -0
  18. matrice_streaming/streaming_gateway/camera_streamer/stream_statistics.py +111 -4
  19. matrice_streaming/streaming_gateway/camera_streamer/video_capture_manager.py +223 -27
  20. matrice_streaming/streaming_gateway/camera_streamer/worker_manager.py +694 -0
  21. matrice_streaming/streaming_gateway/debug/__init__.py +27 -2
  22. matrice_streaming/streaming_gateway/debug/benchmark.py +727 -0
  23. matrice_streaming/streaming_gateway/debug/debug_gstreamer_gateway.py +599 -0
  24. matrice_streaming/streaming_gateway/debug/debug_streaming_gateway.py +245 -95
  25. matrice_streaming/streaming_gateway/debug/debug_utils.py +29 -0
  26. matrice_streaming/streaming_gateway/debug/test_videoplayback.py +318 -0
  27. matrice_streaming/streaming_gateway/dynamic_camera_manager.py +656 -39
  28. matrice_streaming/streaming_gateway/metrics_reporter.py +676 -139
  29. matrice_streaming/streaming_gateway/streaming_action.py +71 -20
  30. matrice_streaming/streaming_gateway/streaming_gateway.py +1026 -78
  31. matrice_streaming/streaming_gateway/streaming_gateway_utils.py +175 -20
  32. matrice_streaming/streaming_gateway/streaming_status_listener.py +89 -0
  33. {matrice_streaming-0.1.14.dist-info → matrice_streaming-0.1.65.dist-info}/METADATA +1 -1
  34. matrice_streaming-0.1.65.dist-info/RECORD +56 -0
  35. matrice_streaming-0.1.14.dist-info/RECORD +0 -38
  36. {matrice_streaming-0.1.14.dist-info → matrice_streaming-0.1.65.dist-info}/WHEEL +0 -0
  37. {matrice_streaming-0.1.14.dist-info → matrice_streaming-0.1.65.dist-info}/licenses/LICENSE.txt +0 -0
  38. {matrice_streaming-0.1.14.dist-info → matrice_streaming-0.1.65.dist-info}/top_level.txt +0 -0
matrice_streaming/streaming_gateway/camera_streamer/nvdec.py
@@ -0,0 +1,1330 @@
1
+ #!/usr/bin/env python3
2
+ """Streaming Gateway - CUDA IPC Video Producer (NVDEC Hardware Decode).
3
+
4
+ This module implements the producer side of the zero-copy video pipeline
5
+ using NVDEC hardware video decoding for maximum throughput.
6
+
7
+ Architecture:
8
+ =============
9
+
10
+ ┌─────────────────────────────────────────────────────────────────────────┐
11
+ │ STREAMING GATEWAY (Producer) │
12
+ ├─────────────────────────────────────────────────────────────────────────┤
13
+ │ │
14
+ │ ┌─────────────────────────────────────────────────────────────────┐ │
15
+ │ │ NVDEC Decoder Pool │ │
16
+ │ │ │ │
17
+ │ │ ┌────────────────┐ ┌────────────────┐ ┌────────────────┐ │ │
18
+ │ │ │ Decoder 0 │ │ Decoder 1 │ │ Decoder N │ │ │
19
+ │ │ │ │ │ │ │ │ │ │
20
+ │ │ │ NVDEC HW │ │ NVDEC HW │ │ NVDEC HW │ │ │
21
+ │ │ │ decode │ │ decode │ │ decode │ │ │
22
+ │ │ │ ↓ │ │ ↓ │ │ ↓ │ │ │
23
+ │ │ │ NV12 Resize │ │ NV12 Resize │ │ NV12 Resize │ │ │
24
+ │ │ │ ↓ │ │ ↓ │ │ ↓ │ │ │
25
+ │ │ │ CUDA IPC │ │ CUDA IPC │ │ CUDA IPC │ │ │
26
+ │ │ │ Ring Buf │ │ Ring Buf │ │ Ring Buf │ │ │
27
+ │ │ │ (NV12 0.6MB) │ │ (NV12 0.6MB) │ (NV12 0.6MB) │ │ │
28
+ │ │ └────────────────┘ └────────────────┘ └────────────────┘ │ │
29
+ │ │ │ │
30
+ │ └─────────────────────────────────────────────────────────────────┘ │
31
+ │ │ │
32
+ │ Output: NV12 (H*1.5, W) uint8 = 0.6 MB │
33
+ │ 50% less IPC bandwidth than RGB │
34
+ │ ↓ │
35
+ └───────────────────────────────┼─────────────────────────────────────────┘
36
+
37
+ Consumer reads via CUDA IPC
38
+ → NV12→RGB→CHW→FP16 in one kernel
39
+ → TensorRT inference
40
+
41
+ Usage:
42
+ ======
43
+ python streaming_gateway.py --video videoplayback.mp4 --num-streams 100
44
+
45
+ Requirements:
46
+ =============
47
+ - PyNvVideoCodec for NVDEC hardware decode
48
+ - CuPy with CUDA support
49
+ - cuda_shm_ring_buffer module
50
+ """
51
+
52
+ import argparse
53
+ import logging
54
+ import multiprocessing as mp
55
+ import os
56
+ import time
57
+ import threading
58
+ import queue as thread_queue
59
+ import hashlib
60
+ import tempfile
61
+ from dataclasses import dataclass
62
+ from pathlib import Path
63
+ from typing import Dict, List, Optional, Tuple, Any
64
+ from urllib.parse import urlparse, urlunparse
65
+
66
+ import numpy as np
67
+
68
+ try:
69
+ import requests
70
+ REQUESTS_AVAILABLE = True
71
+ except ImportError:
72
+ REQUESTS_AVAILABLE = False
73
+
74
+ try:
75
+ import cupy as cp
76
+ CUPY_AVAILABLE = True
77
+ except ImportError:
78
+ CUPY_AVAILABLE = False
79
+ cp = None
80
+
81
+ try:
82
+ import PyNvVideoCodec as nvc
83
+ PYNVCODEC_AVAILABLE = True
84
+ except ImportError:
85
+ PYNVCODEC_AVAILABLE = False
86
+ nvc = None
87
+
88
+ try:
89
+ from matrice_common.stream.cuda_shm_ring_buffer import CudaIpcRingBuffer, GlobalFrameCounter
90
+ RING_BUFFER_AVAILABLE = True
91
+ except ImportError:
92
+ RING_BUFFER_AVAILABLE = False
93
+
94
+ logger = logging.getLogger(__name__)
95
+
96
+ def setup_logging(quiet: bool = True):
97
+ """Configure logging level based on quiet mode."""
98
+ level = logging.WARNING if quiet else logging.INFO
99
+ logging.basicConfig(
100
+ level=level,
101
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
102
+ )
103
+ logging.getLogger('cuda_shm_ring_buffer').setLevel(logging.WARNING if quiet else logging.INFO)
104
+
105
+
106
+ # =============================================================================
107
+ # Video Downloader for HTTPS URLs (PyNvVideoCodec's FFmpeg lacks HTTPS support)
108
+ # =============================================================================
109
+
110
+ class VideoDownloader:
111
+ """Downloads and caches video files from HTTPS URLs.
112
+
113
+ PyNvVideoCodec uses a bundled FFmpeg that doesn't have HTTPS support.
114
+ This class downloads HTTPS videos to local files before passing them
115
+ to the NVDEC demuxer.
116
+
117
+ Features:
118
+ - URL deduplication: same video URL (ignoring query params) is only downloaded once
119
+ - Disk caching: reuses existing files across runs
120
+ - Progress tracking for large files
121
+ - Dynamic timeout based on file size
122
+ """
123
+
124
+ # Configuration
125
+ DOWNLOAD_TIMEOUT = 300 # Base timeout in seconds
126
+ DOWNLOAD_TIMEOUT_PER_100MB = 300 # Additional seconds per 100MB
127
+ MAX_DOWNLOAD_TIMEOUT = 6000 # 100 minutes max
128
+ DOWNLOAD_CHUNK_SIZE = 8192
129
+
130
+ # Singleton instance for process-wide caching
131
+ _instance: Optional['VideoDownloader'] = None
132
+ _lock = threading.Lock()
133
+
134
+ def __new__(cls):
135
+ """Singleton pattern for process-wide cache sharing."""
136
+ if cls._instance is None:
137
+ with cls._lock:
138
+ if cls._instance is None:
139
+ cls._instance = super().__new__(cls)
140
+ cls._instance._initialized = False
141
+ return cls._instance
142
+
143
+ def __init__(self):
144
+ """Initialize the video downloader."""
145
+ if self._initialized:
146
+ return
147
+
148
+ self._initialized = True
149
+ self.downloaded_files: Dict[str, str] = {}
150
+ self._normalized_url_to_path: Dict[str, str] = {}
151
+ self._download_lock = threading.Lock()
152
+ self.temp_dir = Path(tempfile.gettempdir()) / "nvdec_video_cache"
153
+ self.temp_dir.mkdir(exist_ok=True)
154
+ logger.info(f"VideoDownloader initialized, cache dir: {self.temp_dir}")
155
+
156
+ def prepare_source(self, video_path: str, camera_id: str) -> str:
157
+ """Prepare video source, downloading HTTPS URLs if needed.
158
+
159
+ Args:
160
+ video_path: Video file path, RTSP URL, or HTTPS URL
161
+ camera_id: Camera identifier for logging
162
+
163
+ Returns:
164
+ Local file path (downloaded if HTTPS) or original path
165
+ """
166
+ if not self._is_https_url(video_path):
167
+ return video_path
168
+
169
+ if not REQUESTS_AVAILABLE:
170
+ logger.warning(f"requests module not available, cannot download HTTPS URL for {camera_id}")
171
+ return video_path
172
+
173
+ local_path = self._download_video(video_path, camera_id)
174
+ if local_path:
175
+ return local_path
176
+
177
+ logger.warning(f"Failed to download {video_path} for {camera_id}, will try URL directly (may fail)")
178
+ return video_path
179
+
180
+ def _is_https_url(self, source: str) -> bool:
181
+ """Check if source is an HTTPS URL."""
182
+ return source.startswith('https://')
183
+
184
+ def _normalize_url(self, url: str) -> str:
185
+ """Normalize URL by stripping query parameters for deduplication."""
186
+ parsed = urlparse(url)
187
+ return urlunparse((
188
+ parsed.scheme,
189
+ parsed.netloc,
190
+ parsed.path,
191
+ '', '', '' # params, query, fragment
192
+ ))
193
+
194
+ def _get_url_hash(self, normalized_url: str) -> str:
195
+ """Generate a short hash for consistent file naming."""
196
+ return hashlib.md5(normalized_url.encode()).hexdigest()[:12]
197
+
198
+ def _download_video(self, url: str, camera_id: str) -> Optional[str]:
199
+ """Download video file from HTTPS URL with caching.
200
+
201
+ Thread-safe: uses lock to prevent duplicate downloads.
202
+
203
+ Args:
204
+ url: HTTPS video URL
205
+ camera_id: Camera identifier for logging
206
+
207
+ Returns:
208
+ Local file path or None if download failed
209
+ """
210
+ normalized_url = self._normalize_url(url)
211
+ file_ext = Path(url.split('?')[0]).suffix or '.mp4'
212
+ url_hash = self._get_url_hash(normalized_url)
213
+ expected_path = self.temp_dir / f"video_{url_hash}{file_ext}"
214
+ expected_path_str = str(expected_path)
215
+
216
+ # Quick check: file already on disk
217
+ if expected_path.exists():
218
+ existing_size = expected_path.stat().st_size
219
+ logger.info(
220
+ f"[{camera_id}] Reusing cached video: {expected_path.name} "
221
+ f"({existing_size / (1024*1024):.1f}MB)"
222
+ )
223
+ with self._download_lock:
224
+ self.downloaded_files[url] = expected_path_str
225
+ self._normalized_url_to_path[normalized_url] = expected_path_str
226
+ return expected_path_str
227
+
228
+ # Check memory cache
229
+ with self._download_lock:
230
+ if url in self.downloaded_files:
231
+ local_path = self.downloaded_files[url]
232
+ if os.path.exists(local_path):
233
+ logger.debug(f"[{camera_id}] Using cached path (exact URL match)")
234
+ return local_path
235
+
236
+ if normalized_url in self._normalized_url_to_path:
237
+ local_path = self._normalized_url_to_path[normalized_url]
238
+ if os.path.exists(local_path):
239
+ logger.info(f"[{camera_id}] Reusing download (same base URL)")
240
+ self.downloaded_files[url] = local_path
241
+ return local_path
242
+
243
+ # Need to download - acquire lock to prevent duplicate downloads
244
+ with self._download_lock:
245
+ # Double-check after acquiring lock
246
+ if expected_path.exists():
247
+ self.downloaded_files[url] = expected_path_str
248
+ self._normalized_url_to_path[normalized_url] = expected_path_str
249
+ return expected_path_str
250
+
251
+ return self._do_download(url, expected_path, camera_id)
252
+
253
+ def _do_download(self, url: str, dest_path: Path, camera_id: str) -> Optional[str]:
254
+ """Perform the actual download. Must be called with _download_lock held."""
255
+ content_length = 0
256
+ file_size_mb = 0.0
257
+ bytes_downloaded = 0
258
+ timeout = self.DOWNLOAD_TIMEOUT
259
+
260
+ try:
261
+ # HEAD request to get file size
262
+ try:
263
+ head_response = requests.head(url, timeout=10, allow_redirects=True)
264
+ content_length = int(head_response.headers.get('Content-Length', 0))
265
+ file_size_mb = content_length / (1024 * 1024)
266
+ except Exception as e:
267
+ logger.debug(f"[{camera_id}] HEAD request failed: {e}")
268
+
269
+ # Calculate dynamic timeout
270
+ if content_length > 0:
271
+ timeout = min(
272
+ self.DOWNLOAD_TIMEOUT + int(file_size_mb // 100) * self.DOWNLOAD_TIMEOUT_PER_100MB,
273
+ self.MAX_DOWNLOAD_TIMEOUT
274
+ )
275
+ logger.info(f"[{camera_id}] Downloading {file_size_mb:.1f}MB (timeout: {timeout}s)")
276
+ else:
277
+ logger.info(f"[{camera_id}] Downloading video (size unknown, timeout: {timeout}s)")
278
+
279
+ # Download with progress tracking
280
+ response = requests.get(url, stream=True, timeout=timeout)
281
+ response.raise_for_status()
282
+
283
+ if content_length == 0:
284
+ content_length = int(response.headers.get('Content-Length', 0))
285
+ file_size_mb = content_length / (1024 * 1024) if content_length > 0 else 0
286
+
287
+ last_progress_log = 0
288
+
289
+ with open(dest_path, 'wb') as f:
290
+ for chunk in response.iter_content(chunk_size=self.DOWNLOAD_CHUNK_SIZE):
291
+ f.write(chunk)
292
+ bytes_downloaded += len(chunk)
293
+
294
+ # Log progress every 50MB for large files
295
+ if content_length > 50_000_000:
296
+ mb_downloaded = bytes_downloaded // (1024 * 1024)
297
+ if mb_downloaded - last_progress_log >= 50:
298
+ progress = (bytes_downloaded / content_length * 100) if content_length else 0
299
+ logger.info(
300
+ f"[{camera_id}] Download progress: "
301
+ f"{mb_downloaded}MB / {file_size_mb:.0f}MB ({progress:.1f}%)"
302
+ )
303
+ last_progress_log = mb_downloaded
304
+
305
+ # Update caches
306
+ normalized_url = self._normalize_url(url)
307
+ dest_path_str = str(dest_path)
308
+ self.downloaded_files[url] = dest_path_str
309
+ self._normalized_url_to_path[normalized_url] = dest_path_str
310
+
311
+ logger.info(
312
+ f"[{camera_id}] Downloaded: {dest_path.name} "
313
+ f"({bytes_downloaded / (1024*1024):.1f}MB)"
314
+ )
315
+ return dest_path_str
316
+
317
+ except requests.Timeout:
318
+ logger.error(
319
+ f"[{camera_id}] Download timeout: {file_size_mb:.1f}MB, "
320
+ f"got {bytes_downloaded/(1024*1024):.1f}MB in {timeout}s"
321
+ )
322
+ except requests.HTTPError as e:
323
+ logger.error(f"[{camera_id}] HTTP error: {e.response.status_code} - {e.response.reason}")
324
+ except IOError as e:
325
+ logger.error(f"[{camera_id}] Disk I/O error: {e}")
326
+ except Exception as e:
327
+ logger.error(f"[{camera_id}] Download failed: {type(e).__name__}: {e}")
328
+
329
+ # Cleanup partial download
330
+ try:
331
+ if dest_path.exists():
332
+ dest_path.unlink()
333
+ except Exception:
334
+ pass
335
+
336
+ return None
337
+
338
+ def cleanup(self):
339
+ """Clean up downloaded temporary files."""
340
+ unique_files = set(self.downloaded_files.values())
341
+ unique_files.update(self._normalized_url_to_path.values())
342
+
343
+ for filepath in unique_files:
344
+ try:
345
+ if os.path.exists(filepath):
346
+ os.remove(filepath)
347
+ logger.debug(f"Removed temp file: {filepath}")
348
+ except Exception as e:
349
+ logger.warning(f"Failed to remove temp file {filepath}: {e}")
350
+
351
+ self.downloaded_files.clear()
352
+ self._normalized_url_to_path.clear()
353
+
354
+
355
+ # Global video downloader instance
356
+ _video_downloader: Optional[VideoDownloader] = None
357
+
358
+
359
+ def get_video_downloader() -> VideoDownloader:
360
+ """Get or create the global VideoDownloader instance."""
361
+ global _video_downloader
362
+ if _video_downloader is None:
363
+ _video_downloader = VideoDownloader()
364
+ return _video_downloader
365
+
366
+
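The singleton VideoDownloader above deduplicates pre-signed URLs by hashing the query-stripped URL, so repeated streams of the same object download it once. A minimal sketch, assuming the download succeeds (the URLs and camera IDs here are made up):

    downloader = get_video_downloader()
    # Both URLs point at the same object and differ only in their query string,
    # so _normalize_url() maps them to one cache key and the second call reuses
    # the file the first call wrote into <tmpdir>/nvdec_video_cache/.
    p1 = downloader.prepare_source("https://cdn.example.com/video.mp4?token=abc", "cam_0001")
    p2 = downloader.prepare_source("https://cdn.example.com/video.mp4?token=xyz", "cam_0002")
    assert p1 == p2

With the default constants the download timeout also scales with size: a 450 MB file gets min(300 + 4 * 300, 6000) = 1500 seconds.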
367
+ @dataclass
368
+ class StreamConfig:
369
+ """Configuration for a single video stream."""
370
+ camera_id: str
371
+ video_path: str
372
+ width: int = 640
373
+ height: int = 640
374
+ target_fps: int = 10
375
+ gpu_id: int = 0
376
+
377
+
378
+ @dataclass
379
+ class GatewayConfig:
380
+ """Configuration for the streaming gateway."""
381
+ video_path: str
382
+ num_streams: int = 100
383
+ target_fps: int = 0 # 0 = unlimited, >0 = FPS limit per stream
384
+ frame_width: int = 640
385
+ frame_height: int = 640
386
+ gpu_id: int = 0
387
+ num_gpus: int = 1
388
+ duration_sec: float = 30.0
389
+ nvdec_pool_size: int = 8
390
+ nvdec_burst_size: int = 4
391
+ num_slots: int = 32
392
+
393
+
394
+ @dataclass
395
+ class StreamState:
396
+ """Track state for each logical stream in NVDEC pool."""
397
+ stream_id: int
398
+ camera_id: str
399
+ video_path: str
400
+ demuxer: Any
401
+ frames_decoded: int = 0
402
+ width: int = 640
403
+ height: int = 640
404
+ empty_packets: int = 0
405
+
406
+
407
+ # =============================================================================
408
+ # CUDA Kernel: NV12 Resize (no color conversion - 50% less bandwidth)
409
+ # =============================================================================
410
+
411
+ _nv12_resize_kernel = None
412
+
413
+
414
+ def _get_nv12_resize_kernel():
415
+ """Get or compile the NV12 resize kernel.
416
+
417
+ This kernel resizes NV12 directly (no color conversion).
418
+ Output: concatenated Y (H×W) + UV ((H/2)×W) = H×W×1.5 bytes
419
+ This is 50% smaller than RGB (H×W×3 bytes).
420
+
421
+ Consumer will do: NV12→RGB→CHW→FP16 in one fused kernel.
422
+ """
423
+ global _nv12_resize_kernel
424
+ if _nv12_resize_kernel is None and CUPY_AVAILABLE:
425
+ _nv12_resize_kernel = cp.RawKernel(r'''
426
+ extern "C" __global__ void nv12_resize(
427
+ const unsigned char* src_y, // Source Y plane
428
+ const unsigned char* src_uv, // Source UV plane (interleaved)
429
+ unsigned char* dst, // Output: Y (H×W) followed by UV ((H/2)×W)
430
+ int src_h, int src_w,
431
+ int dst_h, int dst_w,
432
+ int y_stride, int uv_stride
433
+ ) {
434
+ int dst_x = blockIdx.x * blockDim.x + threadIdx.x;
435
+ int dst_y = blockIdx.y * blockDim.y + threadIdx.y;
436
+
437
+ // Total height in output: dst_h (Y) + dst_h/2 (UV) = dst_h * 1.5
438
+ int total_h = dst_h + dst_h / 2;
439
+ if (dst_x >= dst_w || dst_y >= total_h) return;
440
+
441
+ float scale_x = (float)src_w / dst_w;
442
+ float scale_y = (float)src_h / dst_h;
443
+
444
+ if (dst_y < dst_h) {
445
+ // Y plane region: resize Y
446
+ int src_x = min((int)(dst_x * scale_x), src_w - 1);
447
+ int src_y_coord = min((int)(dst_y * scale_y), src_h - 1);
448
+ int src_idx = src_y_coord * y_stride + src_x;
449
+ int dst_idx = dst_y * dst_w + dst_x;
450
+ dst[dst_idx] = src_y[src_idx];
451
+ } else {
452
+ // UV plane region: resize UV (UV is at half vertical resolution)
453
+ int uv_dst_y = dst_y - dst_h; // 0 to dst_h/2-1
454
+ int uv_src_y = min((int)(uv_dst_y * scale_y), src_h / 2 - 1);
455
+
456
+ // UV is interleaved, so we copy pairs (U, V) together
457
+ int src_uv_x = min((int)((dst_x / 2) * 2 * scale_x), src_w - 2);
458
+ src_uv_x = (src_uv_x / 2) * 2; // Ensure even
459
+
460
+ int src_idx = uv_src_y * uv_stride + src_uv_x + (dst_x % 2);
461
+ int dst_idx = dst_h * dst_w + uv_dst_y * dst_w + dst_x;
462
+ dst[dst_idx] = src_uv[src_idx];
463
+ }
464
+ }
465
+ ''', 'nv12_resize')
466
+ return _nv12_resize_kernel
467
+
468
+
469
+ def nv12_resize(y_plane: cp.ndarray, uv_plane: cp.ndarray,
470
+ y_stride: int, uv_stride: int,
471
+ src_h: int, src_w: int,
472
+ dst_h: int = 640, dst_w: int = 640) -> cp.ndarray:
473
+ """Resize NV12 without color conversion.
474
+
475
+ Output: concatenated Y (H×W) + UV ((H/2)×W) as single buffer.
476
+ Total size: H×W + (H/2)×W = H×W×1.5 bytes (50% of RGB).
477
+ """
478
+ kernel = _get_nv12_resize_kernel()
479
+ if kernel is None:
480
+ return None
481
+
482
+ total_h = dst_h + dst_h // 2
483
+ output = cp.empty((total_h, dst_w), dtype=cp.uint8)
484
+
485
+ block = (16, 16)
486
+ grid = ((dst_w + 15) // 16, (total_h + 15) // 16)
487
+
488
+ kernel(grid, block, (
489
+ y_plane, uv_plane, output,
490
+ cp.int32(src_h), cp.int32(src_w),
491
+ cp.int32(dst_h), cp.int32(dst_w),
492
+ cp.int32(y_stride), cp.int32(uv_stride)
493
+ ))
494
+
495
+ return output
496
+
497
+
498
+ def surface_to_nv12(frame, target_h: int = 640, target_w: int = 640) -> Optional[cp.ndarray]:
499
+ """Convert NVDEC surface to resized NV12 (50% smaller than RGB).
500
+
501
+ Output: (H + H/2, W) uint8 - concatenated Y + UV planes.
502
+ Total size: H×W×1.5 bytes (vs H×W×3 for RGB).
503
+ """
504
+ if not CUPY_AVAILABLE or frame is None:
505
+ return None
506
+
507
+ try:
508
+ cuda_views = frame.cuda()
509
+ if not cuda_views or len(cuda_views) < 2:
510
+ return None
511
+
512
+ # Extract Y plane
513
+ y_view = cuda_views[0]
514
+ y_cai = y_view.__cuda_array_interface__
515
+ y_shape = tuple(y_cai['shape'])
516
+ y_strides = tuple(y_cai['strides'])
517
+ y_ptr = y_cai['data'][0]
518
+ src_h, src_w = y_shape[:2]
519
+ y_stride = y_strides[0]
520
+
521
+ y_size = src_h * y_stride
522
+ y_mem = cp.cuda.UnownedMemory(y_ptr, y_size, owner=frame)
523
+ y_memptr = cp.cuda.MemoryPointer(y_mem, 0)
524
+ y_plane = cp.ndarray((src_h, src_w), dtype=cp.uint8, memptr=y_memptr,
525
+ strides=(y_stride, 1))
526
+
527
+ # Extract UV plane
528
+ uv_view = cuda_views[1]
529
+ uv_cai = uv_view.__cuda_array_interface__
530
+ uv_shape = tuple(uv_cai['shape'])
531
+ uv_strides = tuple(uv_cai['strides'])
532
+ uv_ptr = uv_cai['data'][0]
533
+ uv_stride = uv_strides[0]
534
+
535
+ uv_h = uv_shape[0]
536
+ uv_w = uv_shape[1] if len(uv_shape) > 1 else src_w
537
+ uv_size = uv_h * uv_stride
538
+ uv_mem = cp.cuda.UnownedMemory(uv_ptr, uv_size, owner=frame)
539
+ uv_memptr = cp.cuda.MemoryPointer(uv_mem, 0)
540
+ uv_plane = cp.ndarray((uv_h, uv_w), dtype=cp.uint8, memptr=uv_memptr,
541
+ strides=(uv_stride, 1))
542
+
543
+ # NV12 resize (no color conversion - 50% smaller output!)
544
+ nv12_frame = nv12_resize(y_plane, uv_plane, y_stride, uv_stride,
545
+ src_h, src_w, target_h, target_w)
546
+ # Add channel dimension for ring buffer compatibility: (H*1.5, W) -> (H*1.5, W, 1)
547
+ return nv12_frame[:, :, cp.newaxis] if nv12_frame is not None else None
548
+
549
+ except Exception as e:
550
+ # Safely encode error message (some CUDA errors contain non-ASCII chars like '×')
551
+ try:
552
+ err_msg = str(e).encode('ascii', errors='replace').decode('ascii')
553
+ except Exception:
554
+ err_msg = "unknown error"
555
+ logger.warning(f"surface_to_nv12 failed: {err_msg}")
556
+ return None
557
+
558
+
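surface_to_nv12() returns the planes stacked into a single (H*1.5, W, 1) buffer; a consumer reading such a buffer can split the planes back apart by slicing. A hypothetical helper (name and shapes assumed, using CuPy arrays as the module does):

    def split_nv12(buf, h=640, w=640):
        """Split a stacked NV12 buffer of shape (h * 3 // 2, w, 1) into Y and UV planes."""
        nv12 = buf.reshape(h * 3 // 2, w)
        y = nv12[:h]                                # (h, w) luma plane
        uv = nv12[h:].reshape(h // 2, w // 2, 2)    # interleaved U/V pairs -> (h/2, w/2, 2)
        return y, uv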
559
+ # =============================================================================
560
+ # NVDEC Decoder Pool
561
+ # =============================================================================
562
+
563
+ class NVDECDecoderPool:
564
+ """Pool of NVDEC decoders that time-multiplex streams.
565
+
566
+ Each decoder is exclusively owned by one worker thread.
567
+ Outputs NV12: 1.5×H×W bytes (50% smaller than RGB).
568
+ """
569
+
570
+ def __init__(self, pool_size: int, gpu_id: int = 0):
571
+ self.pool_size = pool_size
572
+ self.gpu_id = gpu_id
573
+ self.decoders = []
574
+ self.streams_per_decoder: List[List[StreamState]] = [[] for _ in range(pool_size)]
575
+
576
+ if not PYNVCODEC_AVAILABLE:
577
+ raise RuntimeError("PyNvVideoCodec not available")
578
+
579
+ if CUPY_AVAILABLE:
580
+ cp.cuda.Device(gpu_id).use()
581
+
582
+ for i in range(pool_size):
583
+ try:
584
+ decoder = nvc.CreateDecoder(
585
+ gpuid=gpu_id,
586
+ codec=nvc.cudaVideoCodec.H264,
587
+ usedevicememory=True
588
+ )
589
+ self.decoders.append(decoder)
590
+ except Exception as e:
591
+ logger.warning(f"Failed to create decoder {i}: {e}")
592
+ break
593
+
594
+ self.actual_pool_size = len(self.decoders)
595
+ logger.info(f"Created NVDEC pool: {self.actual_pool_size}/{pool_size} decoders on GPU {gpu_id}")
596
+
597
+ def assign_stream(self, stream_id: int, camera_id: str, video_path: str,
598
+ width: int = 640, height: int = 640) -> bool:
599
+ """Assign a stream to a decoder (round-robin).
600
+
601
+ Automatically downloads HTTPS URLs to local files since PyNvVideoCodec's
602
+ bundled FFmpeg doesn't support HTTPS protocol.
603
+ """
604
+ if self.actual_pool_size == 0:
605
+ return False
606
+
607
+ decoder_idx = stream_id % self.actual_pool_size
608
+
609
+ # Download HTTPS URLs to local files (PyNvVideoCodec lacks HTTPS support)
610
+ downloader = get_video_downloader()
611
+ local_path = downloader.prepare_source(video_path, camera_id)
612
+
613
+ try:
614
+ demuxer = nvc.CreateDemuxer(local_path)
615
+ except Exception as e:
616
+ logger.error(f"Failed to create demuxer for {camera_id}: {e}")
617
+ return False
618
+
619
+ stream_state = StreamState(
620
+ stream_id=stream_id,
621
+ camera_id=camera_id,
622
+ video_path=local_path, # Store local path for video looping
623
+ demuxer=demuxer,
624
+ width=width,
625
+ height=height
626
+ )
627
+ self.streams_per_decoder[decoder_idx].append(stream_state)
628
+ return True
629
+
630
+ def decode_round(self, decoder_idx: int, frames_per_stream: int = 4,
631
+ target_h: int = 640, target_w: int = 640) -> Tuple[int, List[Tuple[str, cp.ndarray]]]:
632
+ """Decode frames and convert to NV12.
633
+
634
+ Returns:
635
+ (total_frames, [(camera_id, nv12_tensor), ...])
636
+ """
637
+ if decoder_idx >= self.actual_pool_size:
638
+ return 0, []
639
+
640
+ decoder = self.decoders[decoder_idx]
641
+ streams = self.streams_per_decoder[decoder_idx]
642
+ total_frames = 0
643
+ decoded_frames = []
644
+
645
+ for stream in streams:
646
+ frames_this_stream = 0
647
+
648
+ while frames_this_stream < frames_per_stream:
649
+ try:
650
+ packet = stream.demuxer.Demux()
651
+ if packet is None:
652
+ stream.demuxer = nvc.CreateDemuxer(stream.video_path)
653
+ stream.empty_packets = 0
654
+ packet = stream.demuxer.Demux()
655
+ if packet is None:
656
+ break
657
+
658
+ frames_before = frames_this_stream
659
+ for surface in decoder.Decode(packet):
660
+ tensor = surface_to_nv12(surface, target_h, target_w)
661
+
662
+ if tensor is not None:
663
+ decoded_frames.append((stream.camera_id, tensor))
664
+ frames_this_stream += 1
665
+ stream.frames_decoded += 1
666
+ total_frames += 1
667
+ stream.empty_packets = 0
668
+
669
+ if frames_this_stream >= frames_per_stream:
670
+ break
671
+
672
+ if frames_this_stream == frames_before:
673
+ stream.empty_packets += 1
674
+ if stream.empty_packets >= 3:
675
+ stream.demuxer = nvc.CreateDemuxer(stream.video_path)
676
+ stream.empty_packets = 0
677
+
678
+ except Exception:
679
+ break
680
+
681
+ if frames_this_stream >= frames_per_stream:
682
+ break
683
+
684
+ return total_frames, decoded_frames
685
+
686
+ def get_camera_ids_for_decoder(self, decoder_idx: int) -> List[str]:
687
+ """Get camera IDs for a decoder."""
688
+ if decoder_idx >= self.actual_pool_size:
689
+ return []
690
+ return [s.camera_id for s in self.streams_per_decoder[decoder_idx]]
691
+
692
+ def close(self):
693
+ """Close all decoders."""
694
+ self.decoders.clear()
695
+ for streams in self.streams_per_decoder:
696
+ streams.clear()
697
+
698
+
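assign_stream() above spreads streams over the available decoders with plain modulo round-robin (decoder_idx = stream_id % actual_pool_size); with illustrative numbers:

    pool_size, num_streams = 8, 100
    per_decoder = [sum(1 for s in range(num_streams) if s % pool_size == d) for d in range(pool_size)]
    print(per_decoder)   # [13, 13, 13, 13, 12, 12, 12, 12] -> decoders 0-3 carry one extra stream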
699
+ # =============================================================================
700
+ # Worker Thread
701
+ # =============================================================================
702
+
703
+ def nvdec_pool_worker(
704
+ worker_id: int,
705
+ decoder_idx: int,
706
+ pool: NVDECDecoderPool,
707
+ ring_buffers: Dict[str, CudaIpcRingBuffer],
708
+ frame_counter: GlobalFrameCounter,
709
+ duration_sec: float,
710
+ result_queue: thread_queue.Queue,
711
+ stop_event: threading.Event,
712
+ burst_size: int = 4,
713
+ target_h: int = 640,
714
+ target_w: int = 640,
715
+ target_fps: int = 0,
716
+ shared_frame_count: Optional[mp.Value] = None,
717
+ gpu_frame_count: Optional[mp.Value] = None,
718
+ ):
719
+ """NVDEC worker thread.
720
+
721
+ Decodes frames and writes NV12 tensors to ring buffers.
722
+ Uses dedicated CUDA stream per worker for kernel overlap.
723
+ Supports FPS limiting when target_fps > 0.
724
+
725
+ Args:
726
+ shared_frame_count: Global counter (all GPUs)
727
+ gpu_frame_count: Per-GPU counter (this GPU only)
728
+ """
729
+ if CUPY_AVAILABLE:
730
+ cp.cuda.Device(pool.gpu_id).use()
731
+ cuda_stream = cp.cuda.Stream(non_blocking=True)
732
+ else:
733
+ cuda_stream = None
734
+
735
+ local_frames = 0
736
+ local_errors = 0
737
+ frames_since_counter_update = 0
738
+ counter_batch_size = 100
739
+ start_time = time.perf_counter()
740
+ camera_ids = pool.get_camera_ids_for_decoder(decoder_idx)
741
+ num_streams = len(camera_ids)
742
+
743
+ # FPS limiting: calculate frames per second target for this worker
744
+ # Each worker handles num_streams cameras at target_fps each
745
+ fps_limit_enabled = target_fps > 0 and num_streams > 0
746
+ if fps_limit_enabled:
747
+ # Total target frames per second for all streams handled by this worker
748
+ worker_target_fps = target_fps * num_streams
749
+ frame_interval = 1.0 / worker_target_fps
750
+ next_frame_time = start_time
751
+ fps_mode = f", FPS limit={target_fps}/stream"
752
+ else:
753
+ frame_interval = 0
754
+ next_frame_time = 0
755
+ fps_mode = ", unlimited FPS"
756
+
757
+ logger.debug(f"Worker {worker_id}: decoder={decoder_idx}, cams={num_streams}{fps_mode}")
758
+
759
+ while not stop_event.is_set():
760
+ if time.perf_counter() - start_time >= duration_sec:
761
+ break
762
+
763
+ # FPS limiting: wait until next scheduled frame time
764
+ if fps_limit_enabled:
765
+ current_time = time.perf_counter()
766
+ if current_time < next_frame_time:
767
+ sleep_time = next_frame_time - current_time
768
+ if sleep_time > 0.0001: # Only sleep if > 100us
769
+ time.sleep(sleep_time)
770
+
771
+ try:
772
+ with cuda_stream:
773
+ num_frames, decoded_frames = pool.decode_round(
774
+ decoder_idx,
775
+ frames_per_stream=burst_size,
776
+ target_h=target_h,
777
+ target_w=target_w
778
+ )
779
+
780
+ for cam_id, tensor in decoded_frames:
781
+ if cam_id in ring_buffers:
782
+ try:
783
+ ring_buffers[cam_id].write_frame_fast(tensor, sync=False)
784
+ local_frames += 1
785
+ frames_since_counter_update += 1
786
+
787
+ # Update global counter (all GPUs)
788
+ if shared_frame_count is not None:
789
+ with shared_frame_count.get_lock():
790
+ shared_frame_count.value += 1
791
+
792
+ # Update per-GPU counter (this GPU only)
793
+ if gpu_frame_count is not None:
794
+ with gpu_frame_count.get_lock():
795
+ gpu_frame_count.value += 1
796
+
797
+ # Update next frame time for FPS limiting
798
+ if fps_limit_enabled:
799
+ next_frame_time += frame_interval
800
+
801
+ except Exception as e:
802
+ local_errors += 1
803
+ if local_errors <= 3:
804
+ logger.error(f"Worker {worker_id} write error: {e}")
805
+
806
+ if decoded_frames and len(ring_buffers) > 0:
807
+ next(iter(ring_buffers.values())).sync_writes()
808
+
809
+ if num_frames == 0:
810
+ time.sleep(0.0001)
811
+ continue
812
+
813
+ if frames_since_counter_update >= counter_batch_size:
814
+ frame_counter.increment()
815
+ frames_since_counter_update = 0
816
+
817
+ except Exception as e:
818
+ local_errors += 1
819
+ if local_errors <= 3:
820
+ logger.error(f"Worker {worker_id} error: {e}")
821
+
822
+ if frames_since_counter_update > 0:
823
+ frame_counter.increment()
824
+
825
+ elapsed = time.perf_counter() - start_time
826
+ result_queue.put({
827
+ "worker_id": worker_id,
828
+ "decoder_idx": decoder_idx,
829
+ "elapsed_sec": elapsed,
830
+ "total_frames": local_frames,
831
+ "total_errors": local_errors,
832
+ "num_streams": len(camera_ids),
833
+ "fps": local_frames / elapsed if elapsed > 0 else 0,
834
+ })
835
+
836
+
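The FPS limiter in the worker above budgets one interval per written frame across all streams the worker owns; with assumed numbers (not from the package):

    target_fps = 10                                  # requested per-stream rate
    num_streams = 12                                 # streams assigned to this worker's decoder
    worker_target_fps = target_fps * num_streams     # 120 frames/s for the worker as a whole
    frame_interval = 1.0 / worker_target_fps         # ~8.3 ms; next_frame_time advances by this per frame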
837
+ # =============================================================================
838
+ # GPU Process
839
+ # =============================================================================
840
+
841
+ def nvdec_pool_process(
842
+ process_id: int,
843
+ camera_configs: List[StreamConfig],
844
+ pool_size: int,
845
+ duration_sec: float,
846
+ result_queue: mp.Queue,
847
+ stop_event: mp.Event,
848
+ burst_size: int = 4,
849
+ num_slots: int = 32,
850
+ target_fps: int = 0,
851
+ shared_frame_count: Optional[mp.Value] = None,
852
+ gpu_frame_counts: Optional[Dict[int, mp.Value]] = None,
853
+ total_num_streams: int = 0,
854
+ total_num_gpus: int = 1,
855
+ ):
856
+ """NVDEC process for one GPU.
857
+
858
+ Creates NV12 ring buffers: (H*1.5, W) = 0.6 MB/frame.
859
+
860
+ Args:
861
+ gpu_frame_counts: Dict mapping gpu_id -> per-GPU frame counter (for per-GPU stats)
862
+ shared_frame_count: Global frame counter (for overall stats)
863
+ total_num_streams: Total streams across ALL GPUs (for global per-stream calc)
864
+ total_num_gpus: Total number of GPUs (for context in logging)
865
+ """
866
+ if not camera_configs:
867
+ return
868
+
869
+ gpu_id = camera_configs[0].gpu_id
870
+ target_h = camera_configs[0].height
871
+ target_w = camera_configs[0].width
872
+
873
+ # Get per-GPU counter (or fall back to shared if not provided)
874
+ gpu_frame_count = gpu_frame_counts.get(gpu_id) if gpu_frame_counts else None
875
+
876
+ if CUPY_AVAILABLE:
877
+ cp.cuda.Device(gpu_id).use()
878
+
879
+ # Initialize global frame counter
880
+ frame_counter = GlobalFrameCounter(is_producer=True)
881
+ if process_id == 0:
882
+ frame_counter.initialize()
883
+ logger.info(f"Process {process_id}: GlobalFrameCounter initialized")
884
+ else:
885
+ max_retries = 50
886
+ for retry in range(max_retries):
887
+ try:
888
+ if os.path.exists("/dev/shm/global_frame_counter"):
889
+ frame_counter.connect()
890
+ logger.info(f"Process {process_id}: Connected to GlobalFrameCounter")
891
+ break
892
+ except Exception:
893
+ if retry == max_retries - 1:
894
+ raise
895
+ time.sleep(0.1)
896
+ else:
897
+ raise RuntimeError(f"Process {process_id}: GlobalFrameCounter not found")
898
+
899
+ # Create decoder pool
900
+ try:
901
+ pool = NVDECDecoderPool(pool_size, gpu_id)
902
+ except Exception as e:
903
+ logger.error(f"Process {process_id}: Failed to create decoder pool: {e}")
904
+ result_queue.put({
905
+ "process_id": process_id,
906
+ "error": str(e),
907
+ "total_frames": 0,
908
+ "total_errors": 1,
909
+ })
910
+ return
911
+
912
+ if pool.actual_pool_size == 0:
913
+ result_queue.put({
914
+ "process_id": process_id,
915
+ "error": "No decoders created",
916
+ "total_frames": 0,
917
+ "total_errors": 1,
918
+ })
919
+ return
920
+
921
+ # Create NV12 ring buffers: (H + H/2, W, 1) = 0.6 MB/frame
922
+ ring_buffers: Dict[str, CudaIpcRingBuffer] = {}
923
+ frame_size_mb = target_h * target_w * 1.5 / 1e6
924
+
925
+ try:
926
+ for i, config in enumerate(camera_configs):
927
+ rb = CudaIpcRingBuffer.create_producer(
928
+ config.camera_id,
929
+ gpu_id=config.gpu_id,
930
+ num_slots=num_slots,
931
+ width=config.width,
932
+ height=config.height + config.height // 2, # H * 1.5 for NV12
933
+ channels=1,
934
+ )
935
+ ring_buffers[config.camera_id] = rb
936
+
937
+ pool.assign_stream(
938
+ stream_id=i,
939
+ camera_id=config.camera_id,
940
+ video_path=config.video_path,
941
+ width=config.width,
942
+ height=config.height
943
+ )
944
+
945
+ logger.info(f"Process {process_id}: {pool.actual_pool_size} decoders, "
946
+ f"{len(camera_configs)} streams, NV12 ({frame_size_mb:.1f} MB/frame)")
947
+
948
+ thread_stop_event = threading.Event()
949
+ thread_result_queue = thread_queue.Queue()
950
+
951
+ threads = []
952
+ for decoder_idx in range(pool.actual_pool_size):
953
+ t = threading.Thread(
954
+ target=nvdec_pool_worker,
955
+ args=(
956
+ process_id * 100 + decoder_idx,
957
+ decoder_idx,
958
+ pool,
959
+ ring_buffers,
960
+ frame_counter,
961
+ duration_sec,
962
+ thread_result_queue,
963
+ thread_stop_event,
964
+ burst_size,
965
+ target_h,
966
+ target_w,
967
+ target_fps,
968
+ shared_frame_count,
969
+ gpu_frame_count, # Per-GPU counter
970
+ )
971
+ )
972
+ t.start()
973
+ threads.append(t)
974
+
975
+ # Progress monitoring loop with current/avg FPS tracking
976
+ start_time = time.perf_counter()
977
+ last_report_time = start_time
978
+ last_gpu_frame_count = 0
979
+ last_global_frame_count = 0
980
+ report_interval = 5.0
981
+ processing_start_time = None
982
+ gpu_frames_at_start = 0
983
+ global_frames_at_start = 0
984
+ num_gpu_streams = len(camera_configs)
985
+
986
+ while not stop_event.is_set():
987
+ current_time = time.perf_counter()
988
+ if current_time - start_time >= duration_sec:
989
+ break
990
+
991
+ # Periodic progress report with current and average FPS
992
+ if current_time - last_report_time >= report_interval:
993
+ elapsed = current_time - start_time
994
+ remaining = max(0, duration_sec - elapsed)
995
+
996
+ # Get per-GPU frame count (this GPU only)
997
+ gpu_frames = gpu_frame_count.value if gpu_frame_count else 0
998
+ gpu_interval_frames = gpu_frames - last_gpu_frame_count
999
+ gpu_interval_fps = gpu_interval_frames / report_interval
1000
+ gpu_per_stream_fps = gpu_interval_fps / num_gpu_streams if num_gpu_streams > 0 else 0
1001
+
1002
+ # Get global frame count (all GPUs)
1003
+ global_frames = shared_frame_count.value if shared_frame_count else 0
1004
+ global_interval_frames = global_frames - last_global_frame_count
1005
+ global_interval_fps = global_interval_frames / report_interval
1006
+ global_per_stream_fps = global_interval_fps / total_num_streams if total_num_streams > 0 else 0
1007
+
1008
+ # Track when processing actually starts (exclude warmup)
1009
+ if processing_start_time is None and gpu_frames > 0:
1010
+ processing_start_time = last_report_time
1011
+ gpu_frames_at_start = last_gpu_frame_count
1012
+ global_frames_at_start = last_global_frame_count
1013
+
1014
+ # Calculate average FPS excluding warmup
1015
+ if processing_start_time is not None:
1016
+ processing_elapsed = current_time - processing_start_time
1017
+
1018
+ # Per-GPU averages
1019
+ gpu_processing_frames = gpu_frames - gpu_frames_at_start
1020
+ gpu_avg_fps = gpu_processing_frames / processing_elapsed if processing_elapsed > 0 else 0
1021
+ gpu_avg_per_stream = gpu_avg_fps / num_gpu_streams if num_gpu_streams > 0 else 0
1022
+
1023
+ # Global averages
1024
+ global_processing_frames = global_frames - global_frames_at_start
1025
+ global_avg_fps = global_processing_frames / processing_elapsed if processing_elapsed > 0 else 0
1026
+ global_avg_per_stream = global_avg_fps / total_num_streams if total_num_streams > 0 else 0
1027
+
1028
+ # Log per-GPU stats
1029
+ logger.info(
1030
+ f"GPU{gpu_id} [{elapsed:5.1f}s] {gpu_frames:,} frames ({num_gpu_streams} cams) | "
1031
+ f"cur: {gpu_interval_fps:,.0f} FPS ({gpu_per_stream_fps:.1f}/cam) | "
1032
+ f"avg: {gpu_avg_fps:,.0f} FPS ({gpu_avg_per_stream:.1f}/cam)"
1033
+ )
1034
+
1035
+ # Log global stats (only from GPU0 to avoid spam)
1036
+ if gpu_id == 0:
1037
+ logger.info(
1038
+ f"GLOBAL [{elapsed:5.1f}s] {global_frames:,} frames ({total_num_streams} cams, {total_num_gpus} GPUs) | "
1039
+ f"cur: {global_interval_fps:,.0f} FPS ({global_per_stream_fps:.1f}/cam) | "
1040
+ f"avg: {global_avg_fps:,.0f} FPS ({global_avg_per_stream:.1f}/cam) | "
1041
+ f"{remaining:.0f}s left"
1042
+ )
1043
+
1044
+ last_gpu_frame_count = gpu_frames
1045
+ last_global_frame_count = global_frames
1046
+ last_report_time = current_time
1047
+
1048
+ time.sleep(0.1)
1049
+
1050
+ thread_stop_event.set()
1051
+
1052
+ for t in threads:
1053
+ t.join(timeout=30.0)
1054
+
1055
+ total_frames = 0
1056
+ total_errors = 0
1057
+ elapsed = time.perf_counter() - start_time
1058
+
1059
+ while not thread_result_queue.empty():
1060
+ try:
1061
+ r = thread_result_queue.get_nowait()
1062
+ total_frames += r.get("total_frames", 0)
1063
+ total_errors += r.get("total_errors", 0)
1064
+ except:
1065
+ break
1066
+
1067
+ pool.close()
1068
+ for rb in ring_buffers.values():
1069
+ rb.close()
1070
+
1071
+ result_queue.put({
1072
+ "process_id": process_id,
1073
+ "elapsed_sec": elapsed,
1074
+ "total_frames": total_frames,
1075
+ "total_errors": total_errors,
1076
+ "num_streams": len(camera_configs),
1077
+ "pool_size": pool.actual_pool_size,
1078
+ "fps": total_frames / elapsed if elapsed > 0 else 0,
1079
+ "per_stream_fps": total_frames / elapsed / len(camera_configs) if elapsed > 0 and camera_configs else 0,
1080
+ })
1081
+
1082
+ except Exception as e:
1083
+ logger.error(f"Process {process_id} error: {e}")
1084
+ import traceback
1085
+ traceback.print_exc()
1086
+
1087
+ pool.close()
1088
+ for rb in ring_buffers.values():
1089
+ rb.close()
1090
+
1091
+ result_queue.put({
1092
+ "process_id": process_id,
1093
+ "error": str(e),
1094
+ "total_frames": 0,
1095
+ "total_errors": 1,
1096
+ })
1097
+
1098
+
1099
+ # =============================================================================
1100
+ # Streaming Gateway
1101
+ # =============================================================================
1102
+
1103
+ class StreamingGateway:
1104
+ """Multi-stream video producer outputting NV12 tensors (minimal IPC payload)."""
1105
+
1106
+ def __init__(self, config: GatewayConfig):
1107
+ self.config = config
1108
+ self._workers: List[mp.Process] = []
1109
+ self._stop_event = mp.Event()
1110
+ self._result_queue = mp.Queue()
1111
+
1112
+ def start(self) -> Dict:
1113
+ """Start the gateway."""
1114
+ if not CUPY_AVAILABLE:
1115
+ raise RuntimeError("CuPy is required")
1116
+ if not RING_BUFFER_AVAILABLE:
1117
+ raise RuntimeError("CUDA IPC ring buffer not available")
1118
+ if not PYNVCODEC_AVAILABLE:
1119
+ raise RuntimeError("PyNvVideoCodec required")
1120
+ return self._start_nvdec_pool()
1121
+
1122
+ def _start_nvdec_pool(self) -> Dict:
1123
+ """Start NVDEC pool across GPUs."""
1124
+ num_gpus = min(self.config.num_gpus, 8)
1125
+ streams_per_gpu = self.config.num_streams // num_gpus
1126
+ extra_streams = self.config.num_streams % num_gpus
1127
+
1128
+ logger.info(f"Starting NVDEC on {num_gpus} GPU(s): {self.config.num_streams} streams, "
1129
+ f"pool_size={self.config.nvdec_pool_size}/GPU, output=NV12 (0.6 MB)")
1130
+
1131
+ ctx = mp.get_context("spawn")
1132
+ self._stop_event = ctx.Event()
1133
+ self._result_queue = ctx.Queue()
1134
+
1135
+ # Shared counter for real-time FPS tracking (use 'L' for large counts)
1136
+ shared_frame_count = ctx.Value('L', 0)
1137
+
1138
+ stream_idx = 0
1139
+ for gpu_id in range(num_gpus):
1140
+ n_streams = streams_per_gpu + (1 if gpu_id < extra_streams else 0)
1141
+
1142
+ gpu_configs = []
1143
+ for i in range(n_streams):
1144
+ config = StreamConfig(
1145
+ camera_id=f"cam_{stream_idx:04d}",
1146
+ video_path=self.config.video_path,
1147
+ width=self.config.frame_width,
1148
+ height=self.config.frame_height,
1149
+ target_fps=self.config.target_fps,
1150
+ gpu_id=gpu_id,
1151
+ )
1152
+ gpu_configs.append(config)
1153
+ stream_idx += 1
1154
+
1155
+ p = ctx.Process(
1156
+ target=nvdec_pool_process,
1157
+ args=(gpu_id, gpu_configs, self.config.nvdec_pool_size,
1158
+ self.config.duration_sec, self._result_queue, self._stop_event,
1159
+ self.config.nvdec_burst_size, self.config.num_slots,
1160
+ self.config.target_fps, shared_frame_count)
1161
+ )
1162
+ p.start()
1163
+ self._workers.append(p)
1164
+ logger.info(f"GPU {gpu_id}: {n_streams} streams")
1165
+ time.sleep(0.1)
1166
+
1167
+ # Progress monitoring loop - print progress every 5 seconds
1168
+ start_time = time.perf_counter()
1169
+ last_report_time = start_time
1170
+ last_frame_count = 0
1171
+ report_interval = 5.0 # seconds
1172
+ processing_start_time = None # Track when actual processing starts
1173
+ frames_at_processing_start = 0
1174
+
1175
+ print(f" [ 0.0s] Started {num_gpus} GPU workers...")
1176
+
1177
+ while any(p.is_alive() for p in self._workers):
1178
+ time.sleep(0.5)
1179
+ current_time = time.perf_counter()
1180
+
1181
+ # Periodic progress report with real-time FPS
1182
+ if current_time - last_report_time >= report_interval:
1183
+ elapsed = current_time - start_time
1184
+ remaining = max(0, self.config.duration_sec - elapsed)
1185
+
1186
+ # Read current frame count
1187
+ current_frames = shared_frame_count.value
1188
+ interval_frames = current_frames - last_frame_count
1189
+ interval_fps = interval_frames / report_interval # Current throughput
1190
+ per_stream_fps = interval_fps / self.config.num_streams if self.config.num_streams > 0 else 0
1191
+
1192
+ # Track when processing actually starts (exclude warmup from avg)
1193
+ if processing_start_time is None and current_frames > 0:
1194
+ processing_start_time = last_report_time # Use previous report time
1195
+ frames_at_processing_start = last_frame_count
1196
+
1197
+ # Calculate average FPS excluding warmup time
1198
+ if processing_start_time is not None:
1199
+ processing_elapsed = current_time - processing_start_time
1200
+ processing_frames = current_frames - frames_at_processing_start
1201
+ avg_fps = processing_frames / processing_elapsed if processing_elapsed > 0 else 0
1202
+ print(f" [{elapsed:5.1f}s] {current_frames:,} frames | cur: {interval_fps:,.0f} FPS ({per_stream_fps:.1f}/stream) | avg: {avg_fps:,.0f} FPS | {remaining:.0f}s left")
1203
+ else:
1204
+ print(f" [{elapsed:5.1f}s] Warming up... | {remaining:.0f}s left")
1205
+
1206
+ last_report_time = current_time
1207
+ last_frame_count = current_frames
1208
+
1209
+ # Wait for all workers to fully complete
1210
+ for p in self._workers:
1211
+ p.join(timeout=5)
1212
+
1213
+ results = []
1214
+ while not self._result_queue.empty():
1215
+ results.append(self._result_queue.get())
1216
+
1217
+ for r in results:
1218
+ if "error" in r:
1219
+ logger.error(f"NVDEC error: {r['error']}")
1220
+
1221
+ total_frames = sum(r.get("total_frames", 0) for r in results)
1222
+ total_errors = sum(r.get("total_errors", 0) for r in results)
1223
+ total_elapsed = max((r.get("elapsed_sec", 0) for r in results), default=0)
1224
+
1225
+ aggregate_fps = total_frames / total_elapsed if total_elapsed > 0 else 0
1226
+ per_stream_fps = aggregate_fps / self.config.num_streams if self.config.num_streams > 0 else 0
1227
+
1228
+ return {
1229
+ "num_streams": self.config.num_streams,
1230
+ "num_gpus": num_gpus,
1231
+ "pool_size": self.config.nvdec_pool_size,
1232
+ "duration_sec": total_elapsed,
1233
+ "total_frames": total_frames,
1234
+ "total_errors": total_errors,
1235
+ "aggregate_fps": aggregate_fps,
1236
+ "per_stream_fps": per_stream_fps,
1237
+ "gpu_results": results,
1238
+ }
1239
+
1240
+ def stop(self):
1241
+ """Stop all workers."""
1242
+ self._stop_event.set()
1243
+ for p in self._workers:
1244
+ p.join(timeout=5)
1245
+ if p.is_alive():
1246
+ p.terminate()
1247
+
1248
+
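_start_nvdec_pool() divides streams across GPUs and gives the remainder to the lowest-numbered GPUs; a quick sketch with assumed counts:

    num_streams, num_gpus = 100, 3
    streams_per_gpu, extra = divmod(num_streams, num_gpus)   # 33 each, remainder 1
    per_gpu = [streams_per_gpu + (1 if g < extra else 0) for g in range(num_gpus)]
    print(per_gpu)   # [34, 33, 33]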
1249
+ # =============================================================================
1250
+ # CLI
1251
+ # =============================================================================
1252
+
1253
+ def main():
1254
+ parser = argparse.ArgumentParser(description="Streaming Gateway - CUDA IPC Producer (NV12)")
1255
+ parser.add_argument("--video", "-v", required=True, help="Video file path")
1256
+ parser.add_argument("--num-streams", "-n", type=int, default=100, help="Number of streams")
1257
+ parser.add_argument("--fps", type=int, default=0, help="Target FPS limit per stream (0=unlimited)")
1258
+ parser.add_argument("--width", type=int, default=640, help="Frame width")
1259
+ parser.add_argument("--height", type=int, default=640, help="Frame height")
1260
+ parser.add_argument("--duration", "-d", type=float, default=30.0, help="Duration in seconds")
1261
+ parser.add_argument("--gpu", type=int, default=0, help="Primary GPU ID")
1262
+ parser.add_argument("--num-gpus", "-g", type=int, default=1, help="Number of GPUs (1-8)")
1263
+ parser.add_argument("--pool-size", type=int, default=8, help="NVDEC pool size per GPU")
1264
+ parser.add_argument("--burst-size", type=int, default=4, help="Frames per stream before rotating")
1265
+ parser.add_argument("--slots", type=int, default=32, help="Ring buffer slots per camera")
1266
+ parser.add_argument("--quiet", "-q", action="store_true", help="Quiet mode - only show final results")
1267
+ args = parser.parse_args()
1268
+
1269
+ # Setup logging based on quiet mode
1270
+ setup_logging(quiet=args.quiet)
1271
+
1272
+ config = GatewayConfig(
1273
+ video_path=args.video,
1274
+ num_streams=args.num_streams,
1275
+ target_fps=args.fps,
1276
+ frame_width=args.width,
1277
+ frame_height=args.height,
1278
+ gpu_id=args.gpu,
1279
+ num_gpus=args.num_gpus,
1280
+ duration_sec=args.duration,
1281
+ nvdec_pool_size=args.pool_size,
1282
+ nvdec_burst_size=args.burst_size,
1283
+ num_slots=args.slots,
1284
+ )
1285
+
1286
+ frame_size = args.width * args.height * 1.5
1287
+ output_fmt = f"NV12 ({args.width}x{args.height}x1.5 = {frame_size/1e6:.1f} MB/frame)"
1288
+ fps_limit_str = f"{args.fps} FPS/stream" if args.fps > 0 else "unlimited"
1289
+
1290
+ if not args.quiet:
1291
+ print("\n" + "=" * 60)
1292
+ print(" STREAMING GATEWAY - CUDA IPC Producer (NV12)")
1293
+ print("=" * 60)
1294
+ print(f" Video: {args.video}")
1295
+ print(f" Streams: {args.num_streams}")
1296
+ print(f" GPUs: {args.num_gpus}")
1297
+ print(f" Pool size: {args.pool_size} NVDEC decoders/GPU")
1298
+ print(f" FPS limit: {fps_limit_str}")
1299
+ print(f" Output: {output_fmt}")
1300
+ print(f" Duration: {args.duration}s")
1301
+ print("=" * 60)
1302
+
1303
+ gateway = StreamingGateway(config)
1304
+
1305
+ try:
1306
+ results = gateway.start()
1307
+ # Clean summary output
1308
+ print("\n")
1309
+ print("=" * 60)
1310
+ print(" STREAMING GATEWAY BENCHMARK RESULTS")
1311
+ print("=" * 60)
1312
+ print(f" Video: {args.video}")
1313
+ print(f" Streams: {args.num_streams}")
1314
+ print(f" GPUs: {args.num_gpus}")
1315
+ print(f" FPS limit: {fps_limit_str}")
1316
+ print(f" Duration: {args.duration}s")
1317
+ print("-" * 60)
1318
+ print(f" Total Frames: {results['total_frames']:,}")
1319
+ print("-" * 60)
1320
+ print(f" >>> AGGREGATE FPS: {results['aggregate_fps']:,.0f} <<<")
1321
+ print(f" >>> PER-STREAM FPS: {results['per_stream_fps']:.1f} <<<")
1322
+ print("=" * 60)
1323
+ print()
1324
+ except KeyboardInterrupt:
1325
+ gateway.stop()
1326
+ print("\nStopped")
1327
+
1328
+
1329
+ if __name__ == "__main__":
1330
+ main()
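For reference, an invocation exercising the flags defined above might look like this (file name, stream count, and duration are placeholders):

    python nvdec.py --video videoplayback.mp4 --num-streams 100 --num-gpus 2 --fps 10 --pool-size 8 --duration 60 --quiet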