matrice-streaming 0.1.14__py3-none-any.whl → 0.1.65__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matrice_streaming/__init__.py +44 -32
- matrice_streaming/streaming_gateway/camera_streamer/__init__.py +68 -1
- matrice_streaming/streaming_gateway/camera_streamer/async_camera_worker.py +1388 -0
- matrice_streaming/streaming_gateway/camera_streamer/async_ffmpeg_worker.py +966 -0
- matrice_streaming/streaming_gateway/camera_streamer/camera_streamer.py +188 -24
- matrice_streaming/streaming_gateway/camera_streamer/device_detection.py +507 -0
- matrice_streaming/streaming_gateway/camera_streamer/encoding_pool_manager.py +136 -0
- matrice_streaming/streaming_gateway/camera_streamer/ffmpeg_camera_streamer.py +1048 -0
- matrice_streaming/streaming_gateway/camera_streamer/ffmpeg_config.py +192 -0
- matrice_streaming/streaming_gateway/camera_streamer/ffmpeg_worker_manager.py +470 -0
- matrice_streaming/streaming_gateway/camera_streamer/gstreamer_camera_streamer.py +1368 -0
- matrice_streaming/streaming_gateway/camera_streamer/gstreamer_worker.py +1063 -0
- matrice_streaming/streaming_gateway/camera_streamer/gstreamer_worker_manager.py +546 -0
- matrice_streaming/streaming_gateway/camera_streamer/message_builder.py +60 -15
- matrice_streaming/streaming_gateway/camera_streamer/nvdec.py +1330 -0
- matrice_streaming/streaming_gateway/camera_streamer/nvdec_worker_manager.py +412 -0
- matrice_streaming/streaming_gateway/camera_streamer/platform_pipelines.py +680 -0
- matrice_streaming/streaming_gateway/camera_streamer/stream_statistics.py +111 -4
- matrice_streaming/streaming_gateway/camera_streamer/video_capture_manager.py +223 -27
- matrice_streaming/streaming_gateway/camera_streamer/worker_manager.py +694 -0
- matrice_streaming/streaming_gateway/debug/__init__.py +27 -2
- matrice_streaming/streaming_gateway/debug/benchmark.py +727 -0
- matrice_streaming/streaming_gateway/debug/debug_gstreamer_gateway.py +599 -0
- matrice_streaming/streaming_gateway/debug/debug_streaming_gateway.py +245 -95
- matrice_streaming/streaming_gateway/debug/debug_utils.py +29 -0
- matrice_streaming/streaming_gateway/debug/test_videoplayback.py +318 -0
- matrice_streaming/streaming_gateway/dynamic_camera_manager.py +656 -39
- matrice_streaming/streaming_gateway/metrics_reporter.py +676 -139
- matrice_streaming/streaming_gateway/streaming_action.py +71 -20
- matrice_streaming/streaming_gateway/streaming_gateway.py +1026 -78
- matrice_streaming/streaming_gateway/streaming_gateway_utils.py +175 -20
- matrice_streaming/streaming_gateway/streaming_status_listener.py +89 -0
- {matrice_streaming-0.1.14.dist-info → matrice_streaming-0.1.65.dist-info}/METADATA +1 -1
- matrice_streaming-0.1.65.dist-info/RECORD +56 -0
- matrice_streaming-0.1.14.dist-info/RECORD +0 -38
- {matrice_streaming-0.1.14.dist-info → matrice_streaming-0.1.65.dist-info}/WHEEL +0 -0
- {matrice_streaming-0.1.14.dist-info → matrice_streaming-0.1.65.dist-info}/licenses/LICENSE.txt +0 -0
- {matrice_streaming-0.1.14.dist-info → matrice_streaming-0.1.65.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,1330 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Streaming Gateway - CUDA IPC Video Producer (NVDEC Hardware Decode).
|
|
3
|
+
|
|
4
|
+
This module implements the producer side of the zero-copy video pipeline
|
|
5
|
+
using NVDEC hardware video decoding for maximum throughput.
|
|
6
|
+
|
|
7
|
+
Architecture:
|
|
8
|
+
=============
|
|
9
|
+
|
|
10
|
+
┌─────────────────────────────────────────────────────────────────────────┐
|
|
11
|
+
│ STREAMING GATEWAY (Producer) │
|
|
12
|
+
├─────────────────────────────────────────────────────────────────────────┤
|
|
13
|
+
│ │
|
|
14
|
+
│ ┌─────────────────────────────────────────────────────────────────┐ │
|
|
15
|
+
│ │ NVDEC Decoder Pool │ │
|
|
16
|
+
│ │ │ │
|
|
17
|
+
│ │ ┌────────────────┐ ┌────────────────┐ ┌────────────────┐ │ │
|
|
18
|
+
│ │ │ Decoder 0 │ │ Decoder 1 │ │ Decoder N │ │ │
|
|
19
|
+
│ │ │ │ │ │ │ │ │ │
|
|
20
|
+
│ │ │ NVDEC HW │ │ NVDEC HW │ │ NVDEC HW │ │ │
|
|
21
|
+
│ │ │ decode │ │ decode │ │ decode │ │ │
|
|
22
|
+
│ │ │ ↓ │ │ ↓ │ │ ↓ │ │ │
|
|
23
|
+
│ │ │ NV12 Resize │ │ NV12 Resize │ │ NV12 Resize │ │ │
|
|
24
|
+
│ │ │ ↓ │ │ ↓ │ │ ↓ │ │ │
|
|
25
|
+
│ │ │ CUDA IPC │ │ CUDA IPC │ │ CUDA IPC │ │ │
|
|
26
|
+
│ │ │ Ring Buf │ │ Ring Buf │ │ Ring Buf │ │ │
|
|
27
|
+
│ │ │ (NV12 0.6MB) │ │ (NV12 0.6MB) │ (NV12 0.6MB) │ │ │
|
|
28
|
+
│ │ └────────────────┘ └────────────────┘ └────────────────┘ │ │
|
|
29
|
+
│ │ │ │
|
|
30
|
+
│ └─────────────────────────────────────────────────────────────────┘ │
|
|
31
|
+
│ │ │
|
|
32
|
+
│ Output: NV12 (H*1.5, W) uint8 = 0.6 MB │
|
|
33
|
+
│ 50% less IPC bandwidth than RGB │
|
|
34
|
+
│ ↓ │
|
|
35
|
+
└───────────────────────────────┼─────────────────────────────────────────┘
|
|
36
|
+
│
|
|
37
|
+
Consumer reads via CUDA IPC
|
|
38
|
+
→ NV12→RGB→CHW→FP16 in one kernel
|
|
39
|
+
→ TensorRT inference
|
|
40
|
+
|
|
41
|
+
Usage:
|
|
42
|
+
======
|
|
43
|
+
python streaming_gateway.py --video videoplayback.mp4 --num-streams 100
|
|
44
|
+
|
|
45
|
+
Requirements:
|
|
46
|
+
=============
|
|
47
|
+
- PyNvVideoCodec for NVDEC hardware decode
|
|
48
|
+
- CuPy with CUDA support
|
|
49
|
+
- cuda_shm_ring_buffer module
|
|
50
|
+
"""
|
|
51
|
+
|
|
52
|
+
import argparse
|
|
53
|
+
import logging
|
|
54
|
+
import multiprocessing as mp
|
|
55
|
+
import os
|
|
56
|
+
import time
|
|
57
|
+
import threading
|
|
58
|
+
import queue as thread_queue
|
|
59
|
+
import hashlib
|
|
60
|
+
import tempfile
|
|
61
|
+
from dataclasses import dataclass
|
|
62
|
+
from pathlib import Path
|
|
63
|
+
from typing import Dict, List, Optional, Tuple, Any
|
|
64
|
+
from urllib.parse import urlparse, urlunparse
|
|
65
|
+
|
|
66
|
+
import numpy as np
|
|
67
|
+
|
|
68
|
+
try:
|
|
69
|
+
import requests
|
|
70
|
+
REQUESTS_AVAILABLE = True
|
|
71
|
+
except ImportError:
|
|
72
|
+
REQUESTS_AVAILABLE = False
|
|
73
|
+
|
|
74
|
+
try:
|
|
75
|
+
import cupy as cp
|
|
76
|
+
CUPY_AVAILABLE = True
|
|
77
|
+
except ImportError:
|
|
78
|
+
CUPY_AVAILABLE = False
|
|
79
|
+
cp = None
|
|
80
|
+
|
|
81
|
+
try:
|
|
82
|
+
import PyNvVideoCodec as nvc
|
|
83
|
+
PYNVCODEC_AVAILABLE = True
|
|
84
|
+
except ImportError:
|
|
85
|
+
PYNVCODEC_AVAILABLE = False
|
|
86
|
+
nvc = None
|
|
87
|
+
|
|
88
|
+
try:
|
|
89
|
+
from matrice_common.stream.cuda_shm_ring_buffer import CudaIpcRingBuffer, GlobalFrameCounter
|
|
90
|
+
RING_BUFFER_AVAILABLE = True
|
|
91
|
+
except ImportError:
|
|
92
|
+
RING_BUFFER_AVAILABLE = False
|
|
93
|
+
|
|
94
|
+
logger = logging.getLogger(__name__)
|
|
95
|
+
|
|
96
|
+
def setup_logging(quiet: bool = True):
    """Configure process-wide logging.

    Args:
        quiet: When True (default) only WARNING and above are emitted;
            when False, INFO-level messages are shown as well.
    """
    chosen_level = logging.INFO if not quiet else logging.WARNING
    logging.basicConfig(
        level=chosen_level,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    # Keep the ring-buffer module's logger in sync with the chosen verbosity.
    ring_logger = logging.getLogger('cuda_shm_ring_buffer')
    ring_logger.setLevel(chosen_level)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
# =============================================================================
|
|
107
|
+
# Video Downloader for HTTPS URLs (PyNvVideoCodec's FFmpeg lacks HTTPS support)
|
|
108
|
+
# =============================================================================
|
|
109
|
+
|
|
110
|
+
class VideoDownloader:
    """Downloads and caches video files from HTTPS URLs.

    PyNvVideoCodec uses a bundled FFmpeg that doesn't have HTTPS support.
    This class downloads HTTPS videos to local files before passing them
    to the NVDEC demuxer.

    Features:
    - URL deduplication: same video URL (ignoring query params) is only downloaded once
    - Disk caching: reuses existing files across runs
    - Progress tracking for large files
    - Dynamic timeout based on file size
    """

    # Configuration
    DOWNLOAD_TIMEOUT = 300  # Base timeout in seconds
    DOWNLOAD_TIMEOUT_PER_100MB = 300  # Additional seconds per 100MB
    MAX_DOWNLOAD_TIMEOUT = 6000  # 100 minutes max
    DOWNLOAD_CHUNK_SIZE = 8192  # bytes per streamed chunk

    # Singleton instance for process-wide caching
    _instance: Optional['VideoDownloader'] = None
    _lock = threading.Lock()

    def __new__(cls):
        """Singleton pattern for process-wide cache sharing."""
        # Double-checked locking: cheap unlocked test first, then re-check
        # under the lock so only one thread ever constructs the instance.
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    cls._instance = super().__new__(cls)
                    cls._instance._initialized = False
        return cls._instance

    def __init__(self):
        """Initialize the video downloader."""
        # __init__ runs on every VideoDownloader() call even though __new__
        # returns the shared instance; this flag makes initialization one-shot.
        if self._initialized:
            return

        self._initialized = True
        # exact URL (incl. query string) -> local file path
        self.downloaded_files: Dict[str, str] = {}
        # query-stripped URL -> local file path (dedup across signed URLs)
        self._normalized_url_to_path: Dict[str, str] = {}
        self._download_lock = threading.Lock()
        self.temp_dir = Path(tempfile.gettempdir()) / "nvdec_video_cache"
        self.temp_dir.mkdir(exist_ok=True)
        logger.info(f"VideoDownloader initialized, cache dir: {self.temp_dir}")

    def prepare_source(self, video_path: str, camera_id: str) -> str:
        """Prepare video source, downloading HTTPS URLs if needed.

        Args:
            video_path: Video file path, RTSP URL, or HTTPS URL
            camera_id: Camera identifier for logging

        Returns:
            Local file path (downloaded if HTTPS) or original path
        """
        # Non-HTTPS sources (local files, RTSP, plain HTTP) pass through untouched.
        if not self._is_https_url(video_path):
            return video_path

        if not REQUESTS_AVAILABLE:
            logger.warning(f"requests module not available, cannot download HTTPS URL for {camera_id}")
            return video_path

        local_path = self._download_video(video_path, camera_id)
        if local_path:
            return local_path

        # Best-effort fallback: hand the URL straight to the demuxer even
        # though its bundled FFmpeg will likely reject HTTPS.
        logger.warning(f"Failed to download {video_path} for {camera_id}, will try URL directly (may fail)")
        return video_path

    def _is_https_url(self, source: str) -> bool:
        """Check if source is an HTTPS URL."""
        return source.startswith('https://')

    def _normalize_url(self, url: str) -> str:
        """Normalize URL by stripping query parameters for deduplication."""
        # Signed URLs (e.g. presigned S3) differ only in the query string;
        # dropping params/query/fragment makes them cache-equivalent.
        parsed = urlparse(url)
        return urlunparse((
            parsed.scheme,
            parsed.netloc,
            parsed.path,
            '', '', ''  # params, query, fragment
        ))

    def _get_url_hash(self, normalized_url: str) -> str:
        """Generate a short hash for consistent file naming."""
        # md5 used purely for cache-file naming, not security.
        return hashlib.md5(normalized_url.encode()).hexdigest()[:12]

    def _download_video(self, url: str, camera_id: str) -> Optional[str]:
        """Download video file from HTTPS URL with caching.

        Thread-safe: uses lock to prevent duplicate downloads.

        Args:
            url: HTTPS video URL
            camera_id: Camera identifier for logging

        Returns:
            Local file path or None if download failed
        """
        normalized_url = self._normalize_url(url)
        # Preserve the original extension (before any query string) so the
        # demuxer can sniff the container; default to .mp4.
        file_ext = Path(url.split('?')[0]).suffix or '.mp4'
        url_hash = self._get_url_hash(normalized_url)
        expected_path = self.temp_dir / f"video_{url_hash}{file_ext}"
        expected_path_str = str(expected_path)

        # Quick check: file already on disk
        if expected_path.exists():
            existing_size = expected_path.stat().st_size
            logger.info(
                f"[{camera_id}] Reusing cached video: {expected_path.name} "
                f"({existing_size / (1024*1024):.1f}MB)"
            )
            with self._download_lock:
                self.downloaded_files[url] = expected_path_str
                self._normalized_url_to_path[normalized_url] = expected_path_str
            return expected_path_str

        # Check memory cache
        with self._download_lock:
            if url in self.downloaded_files:
                local_path = self.downloaded_files[url]
                if os.path.exists(local_path):
                    logger.debug(f"[{camera_id}] Using cached path (exact URL match)")
                    return local_path

            if normalized_url in self._normalized_url_to_path:
                local_path = self._normalized_url_to_path[normalized_url]
                if os.path.exists(local_path):
                    logger.info(f"[{camera_id}] Reusing download (same base URL)")
                    self.downloaded_files[url] = local_path
                    return local_path

        # Need to download - acquire lock to prevent duplicate downloads
        with self._download_lock:
            # Double-check after acquiring lock
            if expected_path.exists():
                self.downloaded_files[url] = expected_path_str
                self._normalized_url_to_path[normalized_url] = expected_path_str
                return expected_path_str

            return self._do_download(url, expected_path, camera_id)

    def _do_download(self, url: str, dest_path: Path, camera_id: str) -> Optional[str]:
        """Perform the actual download. Must be called with _download_lock held."""
        content_length = 0
        file_size_mb = 0.0
        bytes_downloaded = 0
        timeout = self.DOWNLOAD_TIMEOUT

        try:
            # HEAD request to get file size
            try:
                head_response = requests.head(url, timeout=10, allow_redirects=True)
                content_length = int(head_response.headers.get('Content-Length', 0))
                file_size_mb = content_length / (1024 * 1024)
            except Exception as e:
                # HEAD failure is non-fatal; we just lose the size-based timeout.
                logger.debug(f"[{camera_id}] HEAD request failed: {e}")

            # Calculate dynamic timeout: base + extra per 100MB, capped.
            if content_length > 0:
                timeout = min(
                    self.DOWNLOAD_TIMEOUT + int(file_size_mb // 100) * self.DOWNLOAD_TIMEOUT_PER_100MB,
                    self.MAX_DOWNLOAD_TIMEOUT
                )
                logger.info(f"[{camera_id}] Downloading {file_size_mb:.1f}MB (timeout: {timeout}s)")
            else:
                logger.info(f"[{camera_id}] Downloading video (size unknown, timeout: {timeout}s)")

            # Download with progress tracking
            response = requests.get(url, stream=True, timeout=timeout)
            response.raise_for_status()

            # If HEAD didn't report a size, fall back to the GET response headers.
            if content_length == 0:
                content_length = int(response.headers.get('Content-Length', 0))
                file_size_mb = content_length / (1024 * 1024) if content_length > 0 else 0

            last_progress_log = 0

            with open(dest_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=self.DOWNLOAD_CHUNK_SIZE):
                    f.write(chunk)
                    bytes_downloaded += len(chunk)

                    # Log progress every 50MB for large files
                    if content_length > 50_000_000:
                        mb_downloaded = bytes_downloaded // (1024 * 1024)
                        if mb_downloaded - last_progress_log >= 50:
                            progress = (bytes_downloaded / content_length * 100) if content_length else 0
                            logger.info(
                                f"[{camera_id}] Download progress: "
                                f"{mb_downloaded}MB / {file_size_mb:.0f}MB ({progress:.1f}%)"
                            )
                            last_progress_log = mb_downloaded

            # Update caches (caller holds _download_lock, so this is safe).
            normalized_url = self._normalize_url(url)
            dest_path_str = str(dest_path)
            self.downloaded_files[url] = dest_path_str
            self._normalized_url_to_path[normalized_url] = dest_path_str

            logger.info(
                f"[{camera_id}] Downloaded: {dest_path.name} "
                f"({bytes_downloaded / (1024*1024):.1f}MB)"
            )
            return dest_path_str

        except requests.Timeout:
            logger.error(
                f"[{camera_id}] Download timeout: {file_size_mb:.1f}MB, "
                f"got {bytes_downloaded/(1024*1024):.1f}MB in {timeout}s"
            )
        except requests.HTTPError as e:
            logger.error(f"[{camera_id}] HTTP error: {e.response.status_code} - {e.response.reason}")
        except IOError as e:
            logger.error(f"[{camera_id}] Disk I/O error: {e}")
        except Exception as e:
            logger.error(f"[{camera_id}] Download failed: {type(e).__name__}: {e}")

        # Cleanup partial download so a truncated file is never cached.
        try:
            if dest_path.exists():
                dest_path.unlink()
        except Exception:
            pass

        return None

    def cleanup(self):
        """Clean up downloaded temporary files."""
        # Union of both caches in case a path is recorded in only one map.
        unique_files = set(self.downloaded_files.values())
        unique_files.update(self._normalized_url_to_path.values())

        for filepath in unique_files:
            try:
                if os.path.exists(filepath):
                    os.remove(filepath)
                    logger.debug(f"Removed temp file: {filepath}")
            except Exception as e:
                logger.warning(f"Failed to remove temp file {filepath}: {e}")

        self.downloaded_files.clear()
        self._normalized_url_to_path.clear()
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
# Global video downloader instance
|
|
356
|
+
_video_downloader: Optional[VideoDownloader] = None
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def get_video_downloader() -> VideoDownloader:
    """Return the process-wide VideoDownloader, creating it on first use."""
    global _video_downloader
    if _video_downloader is not None:
        return _video_downloader
    # Lazily construct; VideoDownloader itself is a singleton, so this
    # module-level cache just avoids re-entering __new__/__init__.
    _video_downloader = VideoDownloader()
    return _video_downloader
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
@dataclass
class StreamConfig:
    """Configuration for a single video stream."""
    camera_id: str        # unique identifier for this camera/stream
    video_path: str       # file path, RTSP URL, or HTTPS URL of the source
    width: int = 640      # output frame width in pixels
    height: int = 640     # output frame height in pixels
    target_fps: int = 10  # desired decode rate for this stream
    gpu_id: int = 0       # CUDA device index used for decoding
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
@dataclass
class GatewayConfig:
    """Configuration for the streaming gateway."""
    video_path: str             # source video shared by all simulated streams
    num_streams: int = 100      # number of logical camera streams to run
    target_fps: int = 0         # 0 = unlimited, >0 = FPS limit per stream
    frame_width: int = 640      # output frame width in pixels
    frame_height: int = 640     # output frame height in pixels
    gpu_id: int = 0             # first CUDA device index to use
    num_gpus: int = 1           # number of GPUs to spread streams across
    duration_sec: float = 30.0  # how long the gateway runs before stopping
    nvdec_pool_size: int = 8    # decoders in the NVDEC pool (see NVDECDecoderPool)
    nvdec_burst_size: int = 4   # frames decoded per stream per round
    num_slots: int = 32         # slots per ring buffer — presumably CudaIpcRingBuffer capacity; confirm against consumer
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
@dataclass
class StreamState:
    """Track state for each logical stream in NVDEC pool."""
    stream_id: int           # index of this logical stream (drives decoder round-robin)
    camera_id: str           # camera identifier used as the per-frame tag
    video_path: str          # local path, used to re-create the demuxer when looping
    demuxer: Any             # PyNvVideoCodec demuxer instance for this stream
    frames_decoded: int = 0  # running count of frames successfully decoded
    width: int = 640         # target output width
    height: int = 640        # target output height
    empty_packets: int = 0   # consecutive packets yielding no frames (loop-reset heuristic)
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
# =============================================================================
|
|
408
|
+
# CUDA Kernel: NV12 Resize (no color conversion - 50% less bandwidth)
|
|
409
|
+
# =============================================================================
|
|
410
|
+
|
|
411
|
+
_nv12_resize_kernel = None
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
def _get_nv12_resize_kernel():
    """Get or compile the NV12 resize kernel.

    This kernel resizes NV12 directly (no color conversion).
    Output: concatenated Y (H×W) + UV ((H/2)×W) = H×W×1.5 bytes
    This is 50% smaller than RGB (H×W×3 bytes).

    Consumer will do: NV12→RGB→CHW→FP16 in one fused kernel.

    Returns:
        Compiled ``cp.RawKernel`` named ``nv12_resize``, or None when CuPy
        is unavailable.
    """
    global _nv12_resize_kernel
    # Compiled lazily on first call and cached at module level; nearest-
    # neighbor sampling, one thread per output byte over the (1.5*H, W) grid.
    if _nv12_resize_kernel is None and CUPY_AVAILABLE:
        _nv12_resize_kernel = cp.RawKernel(r'''
    extern "C" __global__ void nv12_resize(
        const unsigned char* src_y,      // Source Y plane
        const unsigned char* src_uv,     // Source UV plane (interleaved)
        unsigned char* dst,              // Output: Y (H×W) followed by UV ((H/2)×W)
        int src_h, int src_w,
        int dst_h, int dst_w,
        int y_stride, int uv_stride
    ) {
        int dst_x = blockIdx.x * blockDim.x + threadIdx.x;
        int dst_y = blockIdx.y * blockDim.y + threadIdx.y;

        // Total height in output: dst_h (Y) + dst_h/2 (UV) = dst_h * 1.5
        int total_h = dst_h + dst_h / 2;
        if (dst_x >= dst_w || dst_y >= total_h) return;

        float scale_x = (float)src_w / dst_w;
        float scale_y = (float)src_h / dst_h;

        if (dst_y < dst_h) {
            // Y plane region: resize Y
            int src_x = min((int)(dst_x * scale_x), src_w - 1);
            int src_y_coord = min((int)(dst_y * scale_y), src_h - 1);
            int src_idx = src_y_coord * y_stride + src_x;
            int dst_idx = dst_y * dst_w + dst_x;
            dst[dst_idx] = src_y[src_idx];
        } else {
            // UV plane region: resize UV (UV is at half vertical resolution)
            int uv_dst_y = dst_y - dst_h;  // 0 to dst_h/2-1
            int uv_src_y = min((int)(uv_dst_y * scale_y), src_h / 2 - 1);

            // UV is interleaved, so we copy pairs (U, V) together
            int src_uv_x = min((int)((dst_x / 2) * 2 * scale_x), src_w - 2);
            src_uv_x = (src_uv_x / 2) * 2;  // Ensure even

            int src_idx = uv_src_y * uv_stride + src_uv_x + (dst_x % 2);
            int dst_idx = dst_h * dst_w + uv_dst_y * dst_w + dst_x;
            dst[dst_idx] = src_uv[src_idx];
        }
    }
    ''', 'nv12_resize')
    return _nv12_resize_kernel
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
def nv12_resize(y_plane: cp.ndarray, uv_plane: cp.ndarray,
                y_stride: int, uv_stride: int,
                src_h: int, src_w: int,
                dst_h: int = 640, dst_w: int = 640) -> cp.ndarray:
    """Resize NV12 without color conversion.

    Output: concatenated Y (H×W) + UV ((H/2)×W) as single buffer.
    Total size: H×W + (H/2)×W = H×W×1.5 bytes (50% of RGB).

    Returns:
        uint8 CuPy array of shape (dst_h * 1.5, dst_w), or None when the
        resize kernel could not be compiled (no CuPy).
    """
    resize_kernel = _get_nv12_resize_kernel()
    if resize_kernel is None:
        return None

    # Output stacks the resized Y plane on top of the resized UV plane.
    out_rows = dst_h + dst_h // 2
    out_buf = cp.empty((out_rows, dst_w), dtype=cp.uint8)

    # One thread per output byte; 16x16 blocks with ceil-divided grid.
    threads_per_block = (16, 16)
    grid_dims = (-(-dst_w // 16), -(-out_rows // 16))

    kernel_args = (
        y_plane, uv_plane, out_buf,
        cp.int32(src_h), cp.int32(src_w),
        cp.int32(dst_h), cp.int32(dst_w),
        cp.int32(y_stride), cp.int32(uv_stride),
    )
    resize_kernel(grid_dims, threads_per_block, kernel_args)

    return out_buf
|
|
496
|
+
|
|
497
|
+
|
|
498
|
+
def surface_to_nv12(frame, target_h: int = 640, target_w: int = 640) -> Optional[cp.ndarray]:
    """Convert NVDEC surface to resized NV12 (50% smaller than RGB).

    Output: (H + H/2, W) uint8 - concatenated Y + UV planes.
    Total size: H×W×1.5 bytes (vs H×W×3 for RGB).

    Args:
        frame: decoded NVDEC surface whose ``.cuda()`` returns views exposing
            ``__cuda_array_interface__`` — assumes view 0 is the Y plane and
            view 1 the interleaved UV plane (NV12 layout); confirm against
            the PyNvVideoCodec version in use.
        target_h: output Y-plane height.
        target_w: output width.

    Returns:
        CuPy uint8 array of shape (target_h * 1.5, target_w, 1), or None on
        any failure (missing CuPy, unexpected surface layout, kernel error).
    """
    if not CUPY_AVAILABLE or frame is None:
        return None

    try:
        cuda_views = frame.cuda()
        if not cuda_views or len(cuda_views) < 2:
            return None

        # Extract Y plane
        y_view = cuda_views[0]
        y_cai = y_view.__cuda_array_interface__
        y_shape = tuple(y_cai['shape'])
        y_strides = tuple(y_cai['strides'])
        y_ptr = y_cai['data'][0]
        src_h, src_w = y_shape[:2]
        y_stride = y_strides[0]

        # Wrap the raw device pointer zero-copy; owner=frame keeps the
        # NVDEC surface alive for as long as the CuPy view exists.
        y_size = src_h * y_stride
        y_mem = cp.cuda.UnownedMemory(y_ptr, y_size, owner=frame)
        y_memptr = cp.cuda.MemoryPointer(y_mem, 0)
        y_plane = cp.ndarray((src_h, src_w), dtype=cp.uint8, memptr=y_memptr,
                             strides=(y_stride, 1))

        # Extract UV plane
        uv_view = cuda_views[1]
        uv_cai = uv_view.__cuda_array_interface__
        uv_shape = tuple(uv_cai['shape'])
        uv_strides = tuple(uv_cai['strides'])
        uv_ptr = uv_cai['data'][0]
        uv_stride = uv_strides[0]

        # Some surface layouts report a 1-D UV shape; fall back to src_w.
        uv_h = uv_shape[0]
        uv_w = uv_shape[1] if len(uv_shape) > 1 else src_w
        uv_size = uv_h * uv_stride
        uv_mem = cp.cuda.UnownedMemory(uv_ptr, uv_size, owner=frame)
        uv_memptr = cp.cuda.MemoryPointer(uv_mem, 0)
        uv_plane = cp.ndarray((uv_h, uv_w), dtype=cp.uint8, memptr=uv_memptr,
                              strides=(uv_stride, 1))

        # NV12 resize (no color conversion - 50% smaller output!)
        nv12_frame = nv12_resize(y_plane, uv_plane, y_stride, uv_stride,
                                 src_h, src_w, target_h, target_w)
        # Add channel dimension for ring buffer compatibility: (H*1.5, W) -> (H*1.5, W, 1)
        return nv12_frame[:, :, cp.newaxis] if nv12_frame is not None else None

    except Exception as e:
        # Safely encode error message (some CUDA errors contain non-ASCII chars like '×')
        try:
            err_msg = str(e).encode('ascii', errors='replace').decode('ascii')
        except Exception:
            err_msg = "unknown error"
        logger.warning(f"surface_to_nv12 failed: {err_msg}")
        return None
|
|
557
|
+
|
|
558
|
+
|
|
559
|
+
# =============================================================================
|
|
560
|
+
# NVDEC Decoder Pool
|
|
561
|
+
# =============================================================================
|
|
562
|
+
|
|
563
|
+
class NVDECDecoderPool:
    """Pool of NVDEC decoders that time-multiplex streams.

    Each decoder is exclusively owned by one worker thread.
    Outputs NV12: 1.5×H×W bytes (50% smaller than RGB).
    """

    def __init__(self, pool_size: int, gpu_id: int = 0):
        """Create up to ``pool_size`` H.264 hardware decoders on ``gpu_id``.

        Stops creating decoders on the first failure, so the effective pool
        (``actual_pool_size``) may be smaller than requested — presumably
        when the GPU's NVDEC session limit is hit; confirm on target HW.

        Raises:
            RuntimeError: if PyNvVideoCodec is not importable.
        """
        self.pool_size = pool_size
        self.gpu_id = gpu_id
        self.decoders = []
        # One stream list per decoder slot; streams are assigned round-robin.
        self.streams_per_decoder: List[List[StreamState]] = [[] for _ in range(pool_size)]

        if not PYNVCODEC_AVAILABLE:
            raise RuntimeError("PyNvVideoCodec not available")

        if CUPY_AVAILABLE:
            cp.cuda.Device(gpu_id).use()

        for i in range(pool_size):
            try:
                decoder = nvc.CreateDecoder(
                    gpuid=gpu_id,
                    codec=nvc.cudaVideoCodec.H264,
                    usedevicememory=True
                )
                self.decoders.append(decoder)
            except Exception as e:
                # First failure ends pool creation; keep whatever we got.
                logger.warning(f"Failed to create decoder {i}: {e}")
                break

        self.actual_pool_size = len(self.decoders)
        logger.info(f"Created NVDEC pool: {self.actual_pool_size}/{pool_size} decoders on GPU {gpu_id}")

    def assign_stream(self, stream_id: int, camera_id: str, video_path: str,
                      width: int = 640, height: int = 640) -> bool:
        """Assign a stream to a decoder (round-robin).

        Automatically downloads HTTPS URLs to local files since PyNvVideoCodec's
        bundled FFmpeg doesn't support HTTPS protocol.

        Returns:
            True if a demuxer was created and the stream registered,
            False when the pool is empty or demuxer creation failed.
        """
        if self.actual_pool_size == 0:
            return False

        # Round-robin placement keyed on the caller-provided stream_id.
        decoder_idx = stream_id % self.actual_pool_size

        # Download HTTPS URLs to local files (PyNvVideoCodec lacks HTTPS support)
        downloader = get_video_downloader()
        local_path = downloader.prepare_source(video_path, camera_id)

        try:
            demuxer = nvc.CreateDemuxer(local_path)
        except Exception as e:
            logger.error(f"Failed to create demuxer for {camera_id}: {e}")
            return False

        stream_state = StreamState(
            stream_id=stream_id,
            camera_id=camera_id,
            video_path=local_path,  # Store local path for video looping
            demuxer=demuxer,
            width=width,
            height=height
        )
        self.streams_per_decoder[decoder_idx].append(stream_state)
        return True

    def decode_round(self, decoder_idx: int, frames_per_stream: int = 4,
                     target_h: int = 640, target_w: int = 640) -> Tuple[int, List[Tuple[str, cp.ndarray]]]:
        """Decode frames and convert to NV12.

        Demuxes packets per stream, decodes them on this decoder, and converts
        each surface to a resized NV12 tensor. Videos loop: when the demuxer
        is exhausted it is re-created from the stored local path.

        Returns:
            (total_frames, [(camera_id, nv12_tensor), ...])
        """
        if decoder_idx >= self.actual_pool_size:
            return 0, []

        decoder = self.decoders[decoder_idx]
        streams = self.streams_per_decoder[decoder_idx]
        total_frames = 0
        decoded_frames = []

        for stream in streams:
            frames_this_stream = 0

            while frames_this_stream < frames_per_stream:
                try:
                    packet = stream.demuxer.Demux()
                    if packet is None:
                        # End of file: loop by re-creating the demuxer.
                        stream.demuxer = nvc.CreateDemuxer(stream.video_path)
                        stream.empty_packets = 0
                        packet = stream.demuxer.Demux()
                        if packet is None:
                            break

                    frames_before = frames_this_stream
                    for surface in decoder.Decode(packet):
                        tensor = surface_to_nv12(surface, target_h, target_w)

                        if tensor is not None:
                            decoded_frames.append((stream.camera_id, tensor))
                            frames_this_stream += 1
                            stream.frames_decoded += 1
                            total_frames += 1
                            stream.empty_packets = 0

                        if frames_this_stream >= frames_per_stream:
                            break

                    if frames_this_stream == frames_before:
                        # Packet produced no frames (e.g. decoder still
                        # buffering); after 3 in a row assume a stall and
                        # restart the demuxer.
                        stream.empty_packets += 1
                        if stream.empty_packets >= 3:
                            stream.demuxer = nvc.CreateDemuxer(stream.video_path)
                            stream.empty_packets = 0

                except Exception:
                    # Best-effort: abandon this stream for the round.
                    break

            # NOTE(review): this breaks out of the *stream* loop as soon as one
            # stream reaches its quota, so streams later in this decoder's list
            # are skipped for the round. With no rotation index, the first
            # stream can starve the others — confirm this is intentional.
            if frames_this_stream >= frames_per_stream:
                break

        return total_frames, decoded_frames

    def get_camera_ids_for_decoder(self, decoder_idx: int) -> List[str]:
        """Get camera IDs for a decoder."""
        if decoder_idx >= self.actual_pool_size:
            return []
        return [s.camera_id for s in self.streams_per_decoder[decoder_idx]]

    def close(self):
        """Close all decoders."""
        # No explicit destroy API is called; dropping the references lets the
        # decoder/demuxer objects be reclaimed by garbage collection.
        self.decoders.clear()
        for streams in self.streams_per_decoder:
            streams.clear()
|
|
697
|
+
|
|
698
|
+
|
|
699
|
+
# =============================================================================
|
|
700
|
+
# Worker Thread
|
|
701
|
+
# =============================================================================
|
|
702
|
+
|
|
703
|
+
def nvdec_pool_worker(
    worker_id: int,
    decoder_idx: int,
    pool: NVDECDecoderPool,
    ring_buffers: Dict[str, CudaIpcRingBuffer],
    frame_counter: GlobalFrameCounter,
    duration_sec: float,
    result_queue: thread_queue.Queue,
    stop_event: threading.Event,
    burst_size: int = 4,
    target_h: int = 640,
    target_w: int = 640,
    target_fps: int = 0,
    shared_frame_count: Optional[mp.Value] = None,
    gpu_frame_count: Optional[mp.Value] = None,
):
    """NVDEC worker thread.

    Decodes frames and writes NV12 tensors to ring buffers.
    Uses dedicated CUDA stream per worker for kernel overlap.
    Supports FPS limiting when target_fps > 0.

    Args:
        worker_id: Unique id, used only for logging and the result report.
        decoder_idx: Index of the decoder in ``pool`` that this worker drives.
        pool: Shared decoder pool; this worker only uses ``decoder_idx``.
        ring_buffers: camera_id -> ring buffer receiving decoded NV12 frames.
        frame_counter: Cross-process frame counter, updated in batches.
        duration_sec: Wall-clock run time before the worker exits.
        result_queue: Receives a single summary dict when the worker ends.
        stop_event: Cooperative shutdown signal.
        burst_size: Frames decoded per stream before rotating streams.
        target_h: Output frame height.
        target_w: Output frame width.
        target_fps: Per-stream FPS cap (0 = unlimited).
        shared_frame_count: Global counter (all GPUs)
        gpu_frame_count: Per-GPU counter (this GPU only)
    """
    import contextlib  # local import: only needed for the no-CuPy fallback

    if CUPY_AVAILABLE:
        cp.cuda.Device(pool.gpu_id).use()
        cuda_stream = cp.cuda.Stream(non_blocking=True)
    else:
        # FIX: previously this branch set cuda_stream = None, and the
        # `with cuda_stream:` below then raised TypeError on every loop
        # iteration when CuPy was unavailable (spinning and counting
        # errors).  A null context keeps the loop structure valid.
        cuda_stream = contextlib.nullcontext()

    local_frames = 0
    local_errors = 0
    frames_since_counter_update = 0
    # Batch GlobalFrameCounter updates to limit cross-process traffic.
    # NOTE(review): increment() is called once per counter_batch_size frames;
    # presumably GlobalFrameCounter accounts for that batching -- confirm.
    counter_batch_size = 100
    start_time = time.perf_counter()
    camera_ids = pool.get_camera_ids_for_decoder(decoder_idx)
    num_streams = len(camera_ids)

    # FPS limiting: calculate frames per second target for this worker.
    # Each worker handles num_streams cameras at target_fps each.
    fps_limit_enabled = target_fps > 0 and num_streams > 0
    if fps_limit_enabled:
        # Total target frames per second for all streams handled by this worker
        worker_target_fps = target_fps * num_streams
        frame_interval = 1.0 / worker_target_fps
        next_frame_time = start_time
        fps_mode = f", FPS limit={target_fps}/stream"
    else:
        frame_interval = 0
        next_frame_time = 0
        fps_mode = ", unlimited FPS"

    logger.debug(f"Worker {worker_id}: decoder={decoder_idx}, cams={num_streams}{fps_mode}")

    while not stop_event.is_set():
        if time.perf_counter() - start_time >= duration_sec:
            break

        # FPS limiting: wait until next scheduled frame time
        if fps_limit_enabled:
            current_time = time.perf_counter()
            if current_time < next_frame_time:
                sleep_time = next_frame_time - current_time
                if sleep_time > 0.0001:  # Only sleep if > 100us
                    time.sleep(sleep_time)

        try:
            with cuda_stream:
                num_frames, decoded_frames = pool.decode_round(
                    decoder_idx,
                    frames_per_stream=burst_size,
                    target_h=target_h,
                    target_w=target_w
                )

                for cam_id, tensor in decoded_frames:
                    if cam_id in ring_buffers:
                        try:
                            # Async write; a single sync_writes() below
                            # flushes all pending writes on this stream.
                            ring_buffers[cam_id].write_frame_fast(tensor, sync=False)
                            local_frames += 1
                            frames_since_counter_update += 1

                            # Update global counter (all GPUs)
                            if shared_frame_count is not None:
                                with shared_frame_count.get_lock():
                                    shared_frame_count.value += 1

                            # Update per-GPU counter (this GPU only)
                            if gpu_frame_count is not None:
                                with gpu_frame_count.get_lock():
                                    gpu_frame_count.value += 1

                            # Update next frame time for FPS limiting
                            if fps_limit_enabled:
                                next_frame_time += frame_interval

                        except Exception as e:
                            local_errors += 1
                            if local_errors <= 3:
                                logger.error(f"Worker {worker_id} write error: {e}")

                if decoded_frames and len(ring_buffers) > 0:
                    # All ring buffers share the device stream; syncing one
                    # flushes every pending async write from this round.
                    next(iter(ring_buffers.values())).sync_writes()

                if num_frames == 0:
                    # Decoder produced nothing this round; brief backoff.
                    time.sleep(0.0001)
                    continue

                if frames_since_counter_update >= counter_batch_size:
                    frame_counter.increment()
                    frames_since_counter_update = 0

        except Exception as e:
            local_errors += 1
            if local_errors <= 3:
                logger.error(f"Worker {worker_id} error: {e}")

    # Flush the remaining partial batch of counter updates.
    if frames_since_counter_update > 0:
        frame_counter.increment()

    elapsed = time.perf_counter() - start_time
    result_queue.put({
        "worker_id": worker_id,
        "decoder_idx": decoder_idx,
        "elapsed_sec": elapsed,
        "total_frames": local_frames,
        "total_errors": local_errors,
        "num_streams": len(camera_ids),
        "fps": local_frames / elapsed if elapsed > 0 else 0,
    })
|
|
835
|
+
|
|
836
|
+
|
|
837
|
+
# =============================================================================
|
|
838
|
+
# GPU Process
|
|
839
|
+
# =============================================================================
|
|
840
|
+
|
|
841
|
+
def nvdec_pool_process(
    process_id: int,
    camera_configs: List[StreamConfig],
    pool_size: int,
    duration_sec: float,
    result_queue: mp.Queue,
    stop_event: mp.Event,
    burst_size: int = 4,
    num_slots: int = 32,
    target_fps: int = 0,
    shared_frame_count: Optional[mp.Value] = None,
    gpu_frame_counts: Optional[Dict[int, mp.Value]] = None,
    total_num_streams: int = 0,
    total_num_gpus: int = 1,
):
    """NVDEC process for one GPU.

    Creates NV12 ring buffers: (H*1.5, W) = 0.6 MB/frame, spawns one
    nvdec_pool_worker thread per decoder, monitors per-GPU and global
    throughput, and reports an aggregate summary dict (or an error dict)
    on ``result_queue`` when done.

    Args:
        process_id: Index of this GPU process; process 0 initializes the
            shared GlobalFrameCounter, all others connect to it.
        camera_configs: Stream configs for this GPU; the gpu_id and output
            geometry are taken from the first entry.
        pool_size: Requested number of NVDEC decoders for this GPU.
        duration_sec: Wall-clock run time.
        result_queue: Receives exactly one result dict at exit.
        stop_event: Cooperative shutdown signal from the parent.
        burst_size: Frames decoded per stream before rotating streams.
        num_slots: Ring-buffer slots per camera.
        target_fps: Per-stream FPS cap forwarded to workers (0 = unlimited).
        gpu_frame_counts: Dict mapping gpu_id -> per-GPU frame counter (for per-GPU stats)
        shared_frame_count: Global frame counter (for overall stats)
        total_num_streams: Total streams across ALL GPUs (for global per-stream calc)
        total_num_gpus: Total number of GPUs (for context in logging)
    """
    # A GPU that was assigned no streams has nothing to do.
    if not camera_configs:
        return

    # All configs in this process share one GPU and one output geometry.
    gpu_id = camera_configs[0].gpu_id
    target_h = camera_configs[0].height
    target_w = camera_configs[0].width

    # Get per-GPU counter (or fall back to shared if not provided)
    gpu_frame_count = gpu_frame_counts.get(gpu_id) if gpu_frame_counts else None

    if CUPY_AVAILABLE:
        cp.cuda.Device(gpu_id).use()

    # Initialize global frame counter: process 0 creates the shared segment;
    # every other process polls until it appears, then attaches.
    frame_counter = GlobalFrameCounter(is_producer=True)
    if process_id == 0:
        frame_counter.initialize()
        logger.info(f"Process {process_id}: GlobalFrameCounter initialized")
    else:
        max_retries = 50
        for retry in range(max_retries):
            try:
                # NOTE(review): Linux-specific path -- presumably the counter
                # is backed by POSIX shared memory under /dev/shm; confirm.
                if os.path.exists("/dev/shm/global_frame_counter"):
                    frame_counter.connect()
                    logger.info(f"Process {process_id}: Connected to GlobalFrameCounter")
                    break
            except Exception:
                # Keep retrying; re-raise only on the final attempt.
                if retry == max_retries - 1:
                    raise
            time.sleep(0.1)
        else:
            # for/else: the loop finished without `break` => never connected.
            raise RuntimeError(f"Process {process_id}: GlobalFrameCounter not found")

    # Create decoder pool
    try:
        pool = NVDECDecoderPool(pool_size, gpu_id)
    except Exception as e:
        logger.error(f"Process {process_id}: Failed to create decoder pool: {e}")
        result_queue.put({
            "process_id": process_id,
            "error": str(e),
            "total_frames": 0,
            "total_errors": 1,
        })
        return

    # Pool constructed but produced no usable decoders: report and bail out.
    if pool.actual_pool_size == 0:
        result_queue.put({
            "process_id": process_id,
            "error": "No decoders created",
            "total_frames": 0,
            "total_errors": 1,
        })
        return

    # Create NV12 ring buffers: (H + H/2, W, 1) = 0.6 MB/frame
    ring_buffers: Dict[str, CudaIpcRingBuffer] = {}
    frame_size_mb = target_h * target_w * 1.5 / 1e6

    try:
        # One ring buffer per camera, and register each stream with the pool.
        for i, config in enumerate(camera_configs):
            rb = CudaIpcRingBuffer.create_producer(
                config.camera_id,
                gpu_id=config.gpu_id,
                num_slots=num_slots,
                width=config.width,
                height=config.height + config.height // 2,  # H * 1.5 for NV12
                channels=1,
            )
            ring_buffers[config.camera_id] = rb

            pool.assign_stream(
                stream_id=i,
                camera_id=config.camera_id,
                video_path=config.video_path,
                width=config.width,
                height=config.height
            )

        logger.info(f"Process {process_id}: {pool.actual_pool_size} decoders, "
                    f"{len(camera_configs)} streams, NV12 ({frame_size_mb:.1f} MB/frame)")

        # Thread-local control/reporting channels for the worker threads.
        thread_stop_event = threading.Event()
        thread_result_queue = thread_queue.Queue()

        # One worker thread per live decoder.
        threads = []
        for decoder_idx in range(pool.actual_pool_size):
            t = threading.Thread(
                target=nvdec_pool_worker,
                args=(
                    process_id * 100 + decoder_idx,
                    decoder_idx,
                    pool,
                    ring_buffers,
                    frame_counter,
                    duration_sec,
                    thread_result_queue,
                    thread_stop_event,
                    burst_size,
                    target_h,
                    target_w,
                    target_fps,
                    shared_frame_count,
                    gpu_frame_count,  # Per-GPU counter
                )
            )
            t.start()
            threads.append(t)

        # Progress monitoring loop with current/avg FPS tracking
        start_time = time.perf_counter()
        last_report_time = start_time
        last_gpu_frame_count = 0
        last_global_frame_count = 0
        report_interval = 5.0
        processing_start_time = None
        gpu_frames_at_start = 0
        global_frames_at_start = 0
        num_gpu_streams = len(camera_configs)

        while not stop_event.is_set():
            current_time = time.perf_counter()
            if current_time - start_time >= duration_sec:
                break

            # Periodic progress report with current and average FPS
            if current_time - last_report_time >= report_interval:
                elapsed = current_time - start_time
                remaining = max(0, duration_sec - elapsed)

                # Get per-GPU frame count (this GPU only)
                gpu_frames = gpu_frame_count.value if gpu_frame_count else 0
                gpu_interval_frames = gpu_frames - last_gpu_frame_count
                gpu_interval_fps = gpu_interval_frames / report_interval
                gpu_per_stream_fps = gpu_interval_fps / num_gpu_streams if num_gpu_streams > 0 else 0

                # Get global frame count (all GPUs)
                global_frames = shared_frame_count.value if shared_frame_count else 0
                global_interval_frames = global_frames - last_global_frame_count
                global_interval_fps = global_interval_frames / report_interval
                global_per_stream_fps = global_interval_fps / total_num_streams if total_num_streams > 0 else 0

                # Track when processing actually starts (exclude warmup)
                if processing_start_time is None and gpu_frames > 0:
                    processing_start_time = last_report_time
                    gpu_frames_at_start = last_gpu_frame_count
                    global_frames_at_start = last_global_frame_count

                # Calculate average FPS excluding warmup
                if processing_start_time is not None:
                    processing_elapsed = current_time - processing_start_time

                    # Per-GPU averages
                    gpu_processing_frames = gpu_frames - gpu_frames_at_start
                    gpu_avg_fps = gpu_processing_frames / processing_elapsed if processing_elapsed > 0 else 0
                    gpu_avg_per_stream = gpu_avg_fps / num_gpu_streams if num_gpu_streams > 0 else 0

                    # Global averages
                    global_processing_frames = global_frames - global_frames_at_start
                    global_avg_fps = global_processing_frames / processing_elapsed if processing_elapsed > 0 else 0
                    global_avg_per_stream = global_avg_fps / total_num_streams if total_num_streams > 0 else 0

                    # Log per-GPU stats
                    logger.info(
                        f"GPU{gpu_id} [{elapsed:5.1f}s] {gpu_frames:,} frames ({num_gpu_streams} cams) | "
                        f"cur: {gpu_interval_fps:,.0f} FPS ({gpu_per_stream_fps:.1f}/cam) | "
                        f"avg: {gpu_avg_fps:,.0f} FPS ({gpu_avg_per_stream:.1f}/cam)"
                    )

                    # Log global stats (only from GPU0 to avoid spam)
                    if gpu_id == 0:
                        logger.info(
                            f"GLOBAL [{elapsed:5.1f}s] {global_frames:,} frames ({total_num_streams} cams, {total_num_gpus} GPUs) | "
                            f"cur: {global_interval_fps:,.0f} FPS ({global_per_stream_fps:.1f}/cam) | "
                            f"avg: {global_avg_fps:,.0f} FPS ({global_avg_per_stream:.1f}/cam) | "
                            f"{remaining:.0f}s left"
                        )

                last_gpu_frame_count = gpu_frames
                last_global_frame_count = global_frames
                last_report_time = current_time

            time.sleep(0.1)

        # Signal the worker threads and wait for them to wind down.
        thread_stop_event.set()

        for t in threads:
            t.join(timeout=30.0)

        # Aggregate the per-worker summaries.
        total_frames = 0
        total_errors = 0
        elapsed = time.perf_counter() - start_time

        while not thread_result_queue.empty():
            try:
                r = thread_result_queue.get_nowait()
                total_frames += r.get("total_frames", 0)
                total_errors += r.get("total_errors", 0)
            except:
                # NOTE(review): bare except; presumably meant to catch
                # queue.Empty from get_nowait() -- consider narrowing.
                break

        # Release decoders and ring buffers before reporting.
        pool.close()
        for rb in ring_buffers.values():
            rb.close()

        result_queue.put({
            "process_id": process_id,
            "elapsed_sec": elapsed,
            "total_frames": total_frames,
            "total_errors": total_errors,
            "num_streams": len(camera_configs),
            "pool_size": pool.actual_pool_size,
            "fps": total_frames / elapsed if elapsed > 0 else 0,
            "per_stream_fps": total_frames / elapsed / len(camera_configs) if elapsed > 0 and camera_configs else 0,
        })

    except Exception as e:
        logger.error(f"Process {process_id} error: {e}")
        import traceback
        traceback.print_exc()

        # Best-effort cleanup on the failure path as well.
        pool.close()
        for rb in ring_buffers.values():
            rb.close()

        result_queue.put({
            "process_id": process_id,
            "error": str(e),
            "total_frames": 0,
            "total_errors": 1,
        })
|
|
1097
|
+
|
|
1098
|
+
|
|
1099
|
+
# =============================================================================
|
|
1100
|
+
# Streaming Gateway
|
|
1101
|
+
# =============================================================================
|
|
1102
|
+
|
|
1103
|
+
class StreamingGateway:
    """Multi-stream video producer outputting NV12 tensors (minimal IPC payload)."""

    def __init__(self, config: GatewayConfig):
        # Gateway-wide settings (stream/GPU counts, sizes, duration, etc.).
        self.config = config
        # One spawned process per GPU; populated by _start_nvdec_pool().
        self._workers: List[mp.Process] = []
        # Placeholders; replaced with spawn-context equivalents at start time.
        self._stop_event = mp.Event()
        self._result_queue = mp.Queue()

    def start(self) -> Dict:
        """Start the gateway.

        Validates that the required GPU components are importable, then runs
        the NVDEC pool to completion and returns the aggregated result dict.

        Raises:
            RuntimeError: If CuPy, the CUDA IPC ring buffer, or
                PyNvVideoCodec is unavailable.
        """
        if not CUPY_AVAILABLE:
            raise RuntimeError("CuPy is required")
        if not RING_BUFFER_AVAILABLE:
            raise RuntimeError("CUDA IPC ring buffer not available")
        if not PYNVCODEC_AVAILABLE:
            raise RuntimeError("PyNvVideoCodec required")
        return self._start_nvdec_pool()

    def _start_nvdec_pool(self) -> Dict:
        """Start NVDEC pool across GPUs.

        Partitions the configured streams across GPUs (round-robin of the
        remainder), launches one nvdec_pool_process per GPU via the spawn
        context, prints periodic progress while any worker is alive, then
        drains per-process results and returns an aggregate summary dict.
        """
        num_gpus = min(self.config.num_gpus, 8)
        # Even split; the first `extra_streams` GPUs take one extra stream.
        streams_per_gpu = self.config.num_streams // num_gpus
        extra_streams = self.config.num_streams % num_gpus

        logger.info(f"Starting NVDEC on {num_gpus} GPU(s): {self.config.num_streams} streams, "
                    f"pool_size={self.config.nvdec_pool_size}/GPU, output=NV12 (0.6 MB)")

        # Spawn context: child processes must not inherit CUDA state via fork.
        ctx = mp.get_context("spawn")
        self._stop_event = ctx.Event()
        self._result_queue = ctx.Queue()

        # Shared counter for real-time FPS tracking (use 'L' for large counts)
        shared_frame_count = ctx.Value('L', 0)

        stream_idx = 0
        for gpu_id in range(num_gpus):
            n_streams = streams_per_gpu + (1 if gpu_id < extra_streams else 0)

            # Build this GPU's stream configs with globally unique camera ids.
            gpu_configs = []
            for i in range(n_streams):
                config = StreamConfig(
                    camera_id=f"cam_{stream_idx:04d}",
                    video_path=self.config.video_path,
                    width=self.config.frame_width,
                    height=self.config.frame_height,
                    target_fps=self.config.target_fps,
                    gpu_id=gpu_id,
                )
                gpu_configs.append(config)
                stream_idx += 1

            p = ctx.Process(
                target=nvdec_pool_process,
                args=(gpu_id, gpu_configs, self.config.nvdec_pool_size,
                      self.config.duration_sec, self._result_queue, self._stop_event,
                      self.config.nvdec_burst_size, self.config.num_slots,
                      self.config.target_fps, shared_frame_count)
            )
            p.start()
            self._workers.append(p)
            logger.info(f"GPU {gpu_id}: {n_streams} streams")
            # Brief stagger between process launches.
            time.sleep(0.1)

        # Progress monitoring loop - print progress every 5 seconds
        start_time = time.perf_counter()
        last_report_time = start_time
        last_frame_count = 0
        report_interval = 5.0  # seconds
        processing_start_time = None  # Track when actual processing starts
        frames_at_processing_start = 0

        print(f" [ 0.0s] Started {num_gpus} GPU workers...")

        while any(p.is_alive() for p in self._workers):
            time.sleep(0.5)
            current_time = time.perf_counter()

            # Periodic progress report with real-time FPS
            if current_time - last_report_time >= report_interval:
                elapsed = current_time - start_time
                remaining = max(0, self.config.duration_sec - elapsed)

                # Read current frame count
                current_frames = shared_frame_count.value
                interval_frames = current_frames - last_frame_count
                interval_fps = interval_frames / report_interval  # Current throughput
                per_stream_fps = interval_fps / self.config.num_streams if self.config.num_streams > 0 else 0

                # Track when processing actually starts (exclude warmup from avg)
                if processing_start_time is None and current_frames > 0:
                    processing_start_time = last_report_time  # Use previous report time
                    frames_at_processing_start = last_frame_count

                # Calculate average FPS excluding warmup time
                if processing_start_time is not None:
                    processing_elapsed = current_time - processing_start_time
                    processing_frames = current_frames - frames_at_processing_start
                    avg_fps = processing_frames / processing_elapsed if processing_elapsed > 0 else 0
                    print(f" [{elapsed:5.1f}s] {current_frames:,} frames | cur: {interval_fps:,.0f} FPS ({per_stream_fps:.1f}/stream) | avg: {avg_fps:,.0f} FPS | {remaining:.0f}s left")
                else:
                    print(f" [{elapsed:5.1f}s] Warming up... | {remaining:.0f}s left")

                last_report_time = current_time
                last_frame_count = current_frames

        # Wait for all workers to fully complete
        for p in self._workers:
            p.join(timeout=5)

        # Drain per-process summaries.
        # NOTE(review): mp.Queue.empty() is documented as unreliable; this
        # assumes workers have fully flushed before the drain -- confirm.
        results = []
        while not self._result_queue.empty():
            results.append(self._result_queue.get())

        for r in results:
            if "error" in r:
                logger.error(f"NVDEC error: {r['error']}")

        # Aggregate across GPUs; elapsed is the slowest process's run time.
        total_frames = sum(r.get("total_frames", 0) for r in results)
        total_errors = sum(r.get("total_errors", 0) for r in results)
        total_elapsed = max((r.get("elapsed_sec", 0) for r in results), default=0)

        aggregate_fps = total_frames / total_elapsed if total_elapsed > 0 else 0
        per_stream_fps = aggregate_fps / self.config.num_streams if self.config.num_streams > 0 else 0

        return {
            "num_streams": self.config.num_streams,
            "num_gpus": num_gpus,
            "pool_size": self.config.nvdec_pool_size,
            "duration_sec": total_elapsed,
            "total_frames": total_frames,
            "total_errors": total_errors,
            "aggregate_fps": aggregate_fps,
            "per_stream_fps": per_stream_fps,
            "gpu_results": results,
        }

    def stop(self):
        """Stop all workers.

        Signals the shared stop event, then joins each worker process and
        force-terminates any that have not exited within 5 seconds.
        """
        self._stop_event.set()
        for p in self._workers:
            p.join(timeout=5)
            if p.is_alive():
                p.terminate()
|
|
1247
|
+
|
|
1248
|
+
|
|
1249
|
+
# =============================================================================
|
|
1250
|
+
# CLI
|
|
1251
|
+
# =============================================================================
|
|
1252
|
+
|
|
1253
|
+
def main():
    """CLI entry point: parse arguments, run the gateway, print a summary."""
    parser = argparse.ArgumentParser(description="Streaming Gateway - CUDA IPC Producer (NV12)")
    add = parser.add_argument
    add("--video", "-v", required=True, help="Video file path")
    add("--num-streams", "-n", type=int, default=100, help="Number of streams")
    add("--fps", type=int, default=0, help="Target FPS limit per stream (0=unlimited)")
    add("--width", type=int, default=640, help="Frame width")
    add("--height", type=int, default=640, help="Frame height")
    add("--duration", "-d", type=float, default=30.0, help="Duration in seconds")
    add("--gpu", type=int, default=0, help="Primary GPU ID")
    add("--num-gpus", "-g", type=int, default=1, help="Number of GPUs (1-8)")
    add("--pool-size", type=int, default=8, help="NVDEC pool size per GPU")
    add("--burst-size", type=int, default=4, help="Frames per stream before rotating")
    add("--slots", type=int, default=32, help="Ring buffer slots per camera")
    add("--quiet", "-q", action="store_true", help="Quiet mode - only show final results")
    args = parser.parse_args()

    # Configure log verbosity before anything else emits output.
    setup_logging(quiet=args.quiet)

    config = GatewayConfig(
        video_path=args.video,
        num_streams=args.num_streams,
        target_fps=args.fps,
        frame_width=args.width,
        frame_height=args.height,
        gpu_id=args.gpu,
        num_gpus=args.num_gpus,
        duration_sec=args.duration,
        nvdec_pool_size=args.pool_size,
        nvdec_burst_size=args.burst_size,
        num_slots=args.slots,
    )

    # Human-readable descriptors reused by both the banner and the summary.
    nv12_bytes = args.width * args.height * 1.5
    output_fmt = f"NV12 ({args.width}x{args.height}x1.5 = {nv12_bytes/1e6:.1f} MB/frame)"
    fps_limit_str = f"{args.fps} FPS/stream" if args.fps > 0 else "unlimited"
    divider = "=" * 60

    if not args.quiet:
        print("\n" + divider)
        print(" STREAMING GATEWAY - CUDA IPC Producer (NV12)")
        print(divider)
        print(f" Video: {args.video}")
        print(f" Streams: {args.num_streams}")
        print(f" GPUs: {args.num_gpus}")
        print(f" Pool size: {args.pool_size} NVDEC decoders/GPU")
        print(f" FPS limit: {fps_limit_str}")
        print(f" Output: {output_fmt}")
        print(f" Duration: {args.duration}s")
        print(divider)

    gateway = StreamingGateway(config)

    try:
        results = gateway.start()
        # Clean summary output
        print("\n")
        print(divider)
        print(" STREAMING GATEWAY BENCHMARK RESULTS")
        print(divider)
        print(f" Video: {args.video}")
        print(f" Streams: {args.num_streams}")
        print(f" GPUs: {args.num_gpus}")
        print(f" FPS limit: {fps_limit_str}")
        print(f" Duration: {args.duration}s")
        print("-" * 60)
        print(f" Total Frames: {results['total_frames']:,}")
        print("-" * 60)
        print(f" >>> AGGREGATE FPS: {results['aggregate_fps']:,.0f} <<<")
        print(f" >>> PER-STREAM FPS: {results['per_stream_fps']:.1f} <<<")
        print(divider)
        print()
    except KeyboardInterrupt:
        gateway.stop()
        print("\nStopped")
|
|
1327
|
+
|
|
1328
|
+
|
|
1329
|
+
# Allow running this module directly as a CLI script.
if __name__ == "__main__":
    main()
|