matrice-streaming 0.1.60__py3-none-any.whl → 0.1.61__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry; it is provided for informational purposes only.
@@ -0,0 +1,961 @@
1
+ #!/usr/bin/env python3
2
+ """Streaming Gateway - CUDA IPC Video Producer (NVDEC Hardware Decode).
3
+
4
+ This module implements the producer side of the zero-copy video pipeline
5
+ using NVDEC hardware video decoding for maximum throughput.
6
+
7
+ Architecture:
8
+ =============
9
+
10
+ ┌─────────────────────────────────────────────────────────────────────────┐
11
+ │ STREAMING GATEWAY (Producer) │
12
+ ├─────────────────────────────────────────────────────────────────────────┤
13
+ │ │
14
+ │ ┌─────────────────────────────────────────────────────────────────┐ │
15
+ │ │ NVDEC Decoder Pool │ │
16
+ │ │ │ │
17
+ │ │ ┌────────────────┐ ┌────────────────┐ ┌────────────────┐ │ │
18
+ │ │ │ Decoder 0 │ │ Decoder 1 │ │ Decoder N │ │ │
19
+ │ │ │ │ │ │ │ │ │ │
20
+ │ │ │ NVDEC HW │ │ NVDEC HW │ │ NVDEC HW │ │ │
21
+ │ │ │ decode │ │ decode │ │ decode │ │ │
22
+ │ │ │ ↓ │ │ ↓ │ │ ↓ │ │ │
23
+ │ │ │ NV12 Resize │ │ NV12 Resize │ │ NV12 Resize │ │ │
24
+ │ │ │ ↓ │ │ ↓ │ │ ↓ │ │ │
25
+ │ │ │ CUDA IPC │ │ CUDA IPC │ │ CUDA IPC │ │ │
26
+ │ │ │ Ring Buf │ │ Ring Buf │ │ Ring Buf │ │ │
27
+ │ │ │ (NV12 0.6MB) │ │ (NV12 0.6MB) │ │ (NV12 0.6MB) │ │ │
28
+ │ │ └────────────────┘ └────────────────┘ └────────────────┘ │ │
29
+ │ │ │ │
30
+ │ └─────────────────────────────────────────────────────────────────┘ │
31
+ │ │ │
32
+ │ Output: NV12 (H*1.5, W) uint8 = 0.6 MB │
33
+ │ 50% less IPC bandwidth than RGB │
34
+ │ ↓ │
35
+ └───────────────────────────────┼─────────────────────────────────────────┘
36
+
37
+ Consumer reads via CUDA IPC
38
+ → NV12→RGB→CHW→FP16 in one kernel
39
+ → TensorRT inference
40
+
41
+ Usage:
42
+ ======
43
+ python streaming_gateway.py --video videoplayback.mp4 --num-streams 100
44
+
45
+ Requirements:
46
+ =============
47
+ - PyNvVideoCodec for NVDEC hardware decode
48
+ - CuPy with CUDA support
49
+ - cuda_shm_ring_buffer module
50
+ """
51
+
52
+ import argparse
53
+ import logging
54
+ import multiprocessing as mp
55
+ import os
56
+ import time
57
+ import threading
58
+ import queue as thread_queue
59
+ from dataclasses import dataclass
60
+ from typing import Dict, List, Optional, Tuple, Any
61
+
62
+ import numpy as np
63
+
64
+ try:
65
+ import cupy as cp
66
+ CUPY_AVAILABLE = True
67
+ except ImportError:
68
+ CUPY_AVAILABLE = False
69
+ cp = None
70
+
71
+ try:
72
+ import PyNvVideoCodec as nvc
73
+ PYNVCODEC_AVAILABLE = True
74
+ except ImportError:
75
+ PYNVCODEC_AVAILABLE = False
76
+ nvc = None
77
+
78
+ try:
79
+ from matrice_common.stream.cuda_shm_ring_buffer import CudaIpcRingBuffer, GlobalFrameCounter
80
+ RING_BUFFER_AVAILABLE = True
81
+ except ImportError:
82
+ RING_BUFFER_AVAILABLE = False
83
+
84
+ logger = logging.getLogger(__name__)
85
+
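+ 
+ # ---------------------------------------------------------------------------
+ # Editor's sketch (not part of the package): a plain-CuPy reference for the
+ # consumer-side NV12 -> RGB step described in the module docstring. It only
+ # documents the buffer layout written by this producer (first H rows = Y,
+ # last H/2 rows = interleaved UV); a real consumer would fuse NV12 -> RGB ->
+ # CHW -> FP16 into a single kernel. BT.601 limited-range coefficients are
+ # assumed. Requires CuPy and a CUDA device.
+ # ---------------------------------------------------------------------------
+ def nv12_buffer_to_rgb(buf, height: int = 640, width: int = 640):
+     """Convert a (height*3//2, width, 1) uint8 NV12 buffer to (H, W, 3) RGB."""
+     plane = buf[:, :, 0].astype(cp.float32)
+     y = plane[:height]                                        # (H, W) luma
+     uv = plane[height:].reshape(height // 2, width // 2, 2)   # (H/2, W/2, [U, V])
+     u = cp.repeat(cp.repeat(uv[:, :, 0], 2, axis=0), 2, axis=1)  # nearest-neighbour
+     v = cp.repeat(cp.repeat(uv[:, :, 1], 2, axis=0), 2, axis=1)  # chroma upsample
+     c = 1.164 * (y - 16.0)
+     r = cp.clip(c + 1.596 * (v - 128.0), 0, 255)
+     g = cp.clip(c - 0.392 * (u - 128.0) - 0.813 * (v - 128.0), 0, 255)
+     b = cp.clip(c + 2.017 * (u - 128.0), 0, 255)
+     return cp.stack([r, g, b], axis=-1).astype(cp.uint8)
+ 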
86
+ def setup_logging(quiet: bool = True):
87
+ """Configure logging level based on quiet mode."""
88
+ level = logging.WARNING if quiet else logging.INFO
89
+ logging.basicConfig(
90
+ level=level,
91
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
92
+ )
93
+ logging.getLogger('cuda_shm_ring_buffer').setLevel(logging.WARNING if quiet else logging.INFO)
94
+
95
+
96
+ @dataclass
97
+ class StreamConfig:
98
+ """Configuration for a single video stream."""
99
+ camera_id: str
100
+ video_path: str
101
+ width: int = 640
102
+ height: int = 640
103
+ target_fps: int = 10
104
+ gpu_id: int = 0
105
+
106
+
107
+ @dataclass
108
+ class GatewayConfig:
109
+ """Configuration for the streaming gateway."""
110
+ video_path: str
111
+ num_streams: int = 100
112
+ target_fps: int = 0 # 0 = unlimited, >0 = FPS limit per stream
113
+ frame_width: int = 640
114
+ frame_height: int = 640
115
+ gpu_id: int = 0
116
+ num_gpus: int = 1
117
+ duration_sec: float = 30.0
118
+ nvdec_pool_size: int = 8
119
+ nvdec_burst_size: int = 4
120
+ num_slots: int = 32
121
+
122
+
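+ 
+ # Editor's sketch (not part of the package): the device-memory budget implied
+ # by the defaults above. NV12 uses 1.5 bytes/pixel, so a 640x640 frame is
+ # 640*640*1.5 = 614,400 B (~0.6 MB); with num_slots=32 that is ~19.7 MB of
+ # ring-buffer payload per camera, or ~1.97 GB for 100 streams on one GPU
+ # (any per-slot metadata added by CudaIpcRingBuffer is not counted).
+ def estimate_ring_buffer_bytes(cfg: GatewayConfig) -> int:
+     """Approximate total NV12 ring-buffer payload for a GatewayConfig."""
+     frame_bytes = int(cfg.frame_width * cfg.frame_height * 1.5)  # Y + interleaved UV
+     return frame_bytes * cfg.num_slots * cfg.num_streams
+ 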
123
+ @dataclass
124
+ class StreamState:
125
+ """Track state for each logical stream in NVDEC pool."""
126
+ stream_id: int
127
+ camera_id: str
128
+ video_path: str
129
+ demuxer: Any
130
+ frames_decoded: int = 0
131
+ width: int = 640
132
+ height: int = 640
133
+ empty_packets: int = 0
134
+
135
+
136
+ # =============================================================================
137
+ # CUDA Kernel: NV12 Resize (no color conversion - 50% less bandwidth)
138
+ # =============================================================================
139
+
140
+ _nv12_resize_kernel = None
141
+
142
+
143
+ def _get_nv12_resize_kernel():
144
+ """Get or compile the NV12 resize kernel.
145
+
146
+ This kernel resizes NV12 directly (no color conversion).
147
+ Output: concatenated Y (H×W) + UV ((H/2)×W) = H×W×1.5 bytes
148
+ This is 50% smaller than RGB (H×W×3 bytes).
149
+
150
+ Consumer will do: NV12→RGB→CHW→FP16 in one fused kernel.
151
+ """
152
+ global _nv12_resize_kernel
153
+ if _nv12_resize_kernel is None and CUPY_AVAILABLE:
154
+ _nv12_resize_kernel = cp.RawKernel(r'''
155
+ extern "C" __global__ void nv12_resize(
156
+ const unsigned char* src_y, // Source Y plane
157
+ const unsigned char* src_uv, // Source UV plane (interleaved)
158
+ unsigned char* dst, // Output: Y (H×W) followed by UV ((H/2)×W)
159
+ int src_h, int src_w,
160
+ int dst_h, int dst_w,
161
+ int y_stride, int uv_stride
162
+ ) {
163
+ int dst_x = blockIdx.x * blockDim.x + threadIdx.x;
164
+ int dst_y = blockIdx.y * blockDim.y + threadIdx.y;
165
+
166
+ // Total height in output: dst_h (Y) + dst_h/2 (UV) = dst_h * 1.5
167
+ int total_h = dst_h + dst_h / 2;
168
+ if (dst_x >= dst_w || dst_y >= total_h) return;
169
+
170
+ float scale_x = (float)src_w / dst_w;
171
+ float scale_y = (float)src_h / dst_h;
172
+
173
+ if (dst_y < dst_h) {
174
+ // Y plane region: resize Y
175
+ int src_x = min((int)(dst_x * scale_x), src_w - 1);
176
+ int src_y_coord = min((int)(dst_y * scale_y), src_h - 1);
177
+ int src_idx = src_y_coord * y_stride + src_x;
178
+ int dst_idx = dst_y * dst_w + dst_x;
179
+ dst[dst_idx] = src_y[src_idx];
180
+ } else {
181
+ // UV plane region: resize UV (UV is at half vertical resolution)
182
+ int uv_dst_y = dst_y - dst_h; // 0 to dst_h/2-1
183
+ int uv_src_y = min((int)(uv_dst_y * scale_y), src_h / 2 - 1);
184
+
185
+ // UV is interleaved, so we copy pairs (U, V) together
186
+ int src_uv_x = min((int)((dst_x / 2) * 2 * scale_x), src_w - 2);
187
+ src_uv_x = (src_uv_x / 2) * 2; // Ensure even
188
+
189
+ int src_idx = uv_src_y * uv_stride + src_uv_x + (dst_x % 2);
190
+ int dst_idx = dst_h * dst_w + uv_dst_y * dst_w + dst_x;
191
+ dst[dst_idx] = src_uv[src_idx];
192
+ }
193
+ }
194
+ ''', 'nv12_resize')
195
+ return _nv12_resize_kernel
196
+
197
+
198
+ def nv12_resize(y_plane: cp.ndarray, uv_plane: cp.ndarray,
199
+ y_stride: int, uv_stride: int,
200
+ src_h: int, src_w: int,
201
+ dst_h: int = 640, dst_w: int = 640) -> cp.ndarray:
202
+ """Resize NV12 without color conversion.
203
+
204
+ Output: concatenated Y (H×W) + UV ((H/2)×W) as single buffer.
205
+ Total size: H×W + (H/2)×W = H×W×1.5 bytes (50% of RGB).
206
+ """
207
+ kernel = _get_nv12_resize_kernel()
208
+ if kernel is None:
209
+ return None
210
+
211
+ total_h = dst_h + dst_h // 2
212
+ output = cp.empty((total_h, dst_w), dtype=cp.uint8)
213
+
214
+ block = (16, 16)
215
+ grid = ((dst_w + 15) // 16, (total_h + 15) // 16)
216
+
217
+ kernel(grid, block, (
218
+ y_plane, uv_plane, output,
219
+ cp.int32(src_h), cp.int32(src_w),
220
+ cp.int32(dst_h), cp.int32(dst_w),
221
+ cp.int32(y_stride), cp.int32(uv_stride)
222
+ ))
223
+
224
+ return output
225
+
226
+
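+ 
+ # Editor's sketch (not part of the package): a minimal shape check for
+ # nv12_resize() using synthetic, contiguous Y/UV planes (stride == width).
+ # It assumes a CUDA device with CuPy available; the expected output is
+ # 640 rows of Y followed by 320 rows of interleaved UV.
+ def _nv12_resize_smoke_test(src_h: int = 720, src_w: int = 1280) -> bool:
+     if not CUPY_AVAILABLE:
+         return False
+     y = cp.zeros((src_h, src_w), dtype=cp.uint8)          # synthetic luma plane
+     uv = cp.zeros((src_h // 2, src_w), dtype=cp.uint8)    # synthetic interleaved chroma
+     out = nv12_resize(y, uv, y_stride=src_w, uv_stride=src_w,
+                       src_h=src_h, src_w=src_w, dst_h=640, dst_w=640)
+     return out is not None and out.shape == (960, 640)
+ 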
227
+ def surface_to_nv12(frame, target_h: int = 640, target_w: int = 640) -> Optional[cp.ndarray]:
228
+ """Convert NVDEC surface to resized NV12 (50% smaller than RGB).
229
+
230
+ Output: (H + H/2, W) uint8 - concatenated Y + UV planes.
231
+ Total size: H×W×1.5 bytes (vs H×W×3 for RGB).
232
+ """
233
+ if not CUPY_AVAILABLE or frame is None:
234
+ return None
235
+
236
+ try:
237
+ cuda_views = frame.cuda()
238
+ if not cuda_views or len(cuda_views) < 2:
239
+ return None
240
+
241
+ # Extract Y plane
242
+ y_view = cuda_views[0]
243
+ y_cai = y_view.__cuda_array_interface__
244
+ y_shape = tuple(y_cai['shape'])
245
+ y_strides = tuple(y_cai['strides'])
246
+ y_ptr = y_cai['data'][0]
247
+ src_h, src_w = y_shape[:2]
248
+ y_stride = y_strides[0]
249
+
250
+ y_size = src_h * y_stride
251
+ y_mem = cp.cuda.UnownedMemory(y_ptr, y_size, owner=frame)
252
+ y_memptr = cp.cuda.MemoryPointer(y_mem, 0)
253
+ y_plane = cp.ndarray((src_h, src_w), dtype=cp.uint8, memptr=y_memptr,
254
+ strides=(y_stride, 1))
255
+
256
+ # Extract UV plane
257
+ uv_view = cuda_views[1]
258
+ uv_cai = uv_view.__cuda_array_interface__
259
+ uv_shape = tuple(uv_cai['shape'])
260
+ uv_strides = tuple(uv_cai['strides'])
261
+ uv_ptr = uv_cai['data'][0]
262
+ uv_stride = uv_strides[0]
263
+
264
+ uv_h = uv_shape[0]
265
+ uv_w = uv_shape[1] if len(uv_shape) > 1 else src_w
266
+ uv_size = uv_h * uv_stride
267
+ uv_mem = cp.cuda.UnownedMemory(uv_ptr, uv_size, owner=frame)
268
+ uv_memptr = cp.cuda.MemoryPointer(uv_mem, 0)
269
+ uv_plane = cp.ndarray((uv_h, uv_w), dtype=cp.uint8, memptr=uv_memptr,
270
+ strides=(uv_stride, 1))
271
+
272
+ # NV12 resize (no color conversion - 50% smaller output!)
273
+ nv12_frame = nv12_resize(y_plane, uv_plane, y_stride, uv_stride,
274
+ src_h, src_w, target_h, target_w)
275
+ # Add channel dimension for ring buffer compatibility: (H*1.5, W) -> (H*1.5, W, 1)
276
+ return nv12_frame[:, :, cp.newaxis] if nv12_frame is not None else None
277
+
278
+ except Exception as e:
279
+ logger.warning(f"surface_to_nv12 failed: {e}")
280
+ return None
281
+
282
+
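+ 
+ # Editor's sketch (not part of the package): the zero-copy wrapping pattern
+ # used in surface_to_nv12(), generalised to any object exposing a 2-D uint8
+ # __cuda_array_interface__. The `owner` argument keeps the external device
+ # allocation alive for as long as the CuPy view exists.
+ def wrap_cuda_array_interface(obj):
+     """Return a zero-copy CuPy view over an external 2-D uint8 CUDA buffer."""
+     if not CUPY_AVAILABLE:
+         return None
+     cai = obj.__cuda_array_interface__
+     rows, cols = cai['shape'][:2]
+     row_stride = cai['strides'][0] if cai.get('strides') else cols
+     mem = cp.cuda.UnownedMemory(cai['data'][0], rows * row_stride, owner=obj)
+     return cp.ndarray((rows, cols), dtype=cp.uint8,
+                       memptr=cp.cuda.MemoryPointer(mem, 0),
+                       strides=(row_stride, 1))
+ 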
283
+ # =============================================================================
284
+ # NVDEC Decoder Pool
285
+ # =============================================================================
286
+
287
+ class NVDECDecoderPool:
288
+ """Pool of NVDEC decoders that time-multiplex streams.
289
+
290
+ Each decoder is exclusively owned by one worker thread.
291
+ Outputs NV12: 1.5×H×W bytes (50% smaller than RGB).
292
+ """
293
+
294
+ def __init__(self, pool_size: int, gpu_id: int = 0):
295
+ self.pool_size = pool_size
296
+ self.gpu_id = gpu_id
297
+ self.decoders = []
298
+ self.streams_per_decoder: List[List[StreamState]] = [[] for _ in range(pool_size)]
299
+
300
+ if not PYNVCODEC_AVAILABLE:
301
+ raise RuntimeError("PyNvVideoCodec not available")
302
+
303
+ if CUPY_AVAILABLE:
304
+ cp.cuda.Device(gpu_id).use()
305
+
306
+ for i in range(pool_size):
307
+ try:
308
+ decoder = nvc.CreateDecoder(
309
+ gpuid=gpu_id,
310
+ codec=nvc.cudaVideoCodec.H264,
311
+ usedevicememory=True
312
+ )
313
+ self.decoders.append(decoder)
314
+ except Exception as e:
315
+ logger.warning(f"Failed to create decoder {i}: {e}")
316
+ break
317
+
318
+ self.actual_pool_size = len(self.decoders)
319
+ logger.info(f"Created NVDEC pool: {self.actual_pool_size}/{pool_size} decoders on GPU {gpu_id}")
320
+
321
+ def assign_stream(self, stream_id: int, camera_id: str, video_path: str,
322
+ width: int = 640, height: int = 640) -> bool:
323
+ """Assign a stream to a decoder (round-robin)."""
324
+ if self.actual_pool_size == 0:
325
+ return False
326
+
327
+ decoder_idx = stream_id % self.actual_pool_size
328
+ try:
329
+ demuxer = nvc.CreateDemuxer(video_path)
330
+ except Exception as e:
331
+ logger.error(f"Failed to create demuxer for {camera_id}: {e}")
332
+ return False
333
+
334
+ stream_state = StreamState(
335
+ stream_id=stream_id,
336
+ camera_id=camera_id,
337
+ video_path=video_path,
338
+ demuxer=demuxer,
339
+ width=width,
340
+ height=height
341
+ )
342
+ self.streams_per_decoder[decoder_idx].append(stream_state)
343
+ return True
344
+
345
+ def decode_round(self, decoder_idx: int, frames_per_stream: int = 4,
346
+ target_h: int = 640, target_w: int = 640) -> Tuple[int, List[Tuple[str, cp.ndarray]]]:
347
+ """Decode frames and convert to NV12.
348
+
349
+ Returns:
350
+ (total_frames, [(camera_id, nv12_tensor), ...])
351
+ """
352
+ if decoder_idx >= self.actual_pool_size:
353
+ return 0, []
354
+
355
+ decoder = self.decoders[decoder_idx]
356
+ streams = self.streams_per_decoder[decoder_idx]
357
+ total_frames = 0
358
+ decoded_frames = []
359
+
360
+ for stream in streams:
361
+ frames_this_stream = 0
362
+
363
+ while frames_this_stream < frames_per_stream:
364
+ try:
365
+ packet = stream.demuxer.Demux()
366
+ if packet is None:
367
+ stream.demuxer = nvc.CreateDemuxer(stream.video_path)
368
+ stream.empty_packets = 0
369
+ packet = stream.demuxer.Demux()
370
+ if packet is None:
371
+ break
372
+
373
+ frames_before = frames_this_stream
374
+ for surface in decoder.Decode(packet):
375
+ tensor = surface_to_nv12(surface, target_h, target_w)
376
+
377
+ if tensor is not None:
378
+ decoded_frames.append((stream.camera_id, tensor))
379
+ frames_this_stream += 1
380
+ stream.frames_decoded += 1
381
+ total_frames += 1
382
+ stream.empty_packets = 0
383
+
384
+ if frames_this_stream >= frames_per_stream:
385
+ break
386
+
387
+ if frames_this_stream == frames_before:
388
+ stream.empty_packets += 1
389
+ if stream.empty_packets >= 3:
390
+ stream.demuxer = nvc.CreateDemuxer(stream.video_path)
391
+ stream.empty_packets = 0
392
+
393
+ except Exception:
394
+ break
395
+
396
+ if frames_this_stream >= frames_per_stream:
397
+ break
398
+
399
+ return total_frames, decoded_frames
400
+
401
+ def get_camera_ids_for_decoder(self, decoder_idx: int) -> List[str]:
402
+ """Get camera IDs for a decoder."""
403
+ if decoder_idx >= self.actual_pool_size:
404
+ return []
405
+ return [s.camera_id for s in self.streams_per_decoder[decoder_idx]]
406
+
407
+ def close(self):
408
+ """Close all decoders."""
409
+ self.decoders.clear()
410
+ for streams in self.streams_per_decoder:
411
+ streams.clear()
412
+
413
+
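+ 
+ # Editor's sketch (not part of the package): the round-robin mapping used by
+ # assign_stream() made concrete. With the default pool_size=8 and 100 streams,
+ # decoder k serves every stream whose id satisfies id % 8 == k, i.e. 12 or 13
+ # streams per NVDEC instance.
+ def streams_per_decoder_counts(num_streams: int = 100, pool_size: int = 8) -> List[int]:
+     """Number of streams each decoder receives under stream_id % pool_size."""
+     counts = [0] * pool_size
+     for stream_id in range(num_streams):
+         counts[stream_id % pool_size] += 1
+     return counts  # [13, 13, 13, 13, 12, 12, 12, 12] for the defaults
+ 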
414
+ # =============================================================================
415
+ # Worker Thread
416
+ # =============================================================================
417
+
418
+ def nvdec_pool_worker(
419
+ worker_id: int,
420
+ decoder_idx: int,
421
+ pool: NVDECDecoderPool,
422
+ ring_buffers: Dict[str, CudaIpcRingBuffer],
423
+ frame_counter: GlobalFrameCounter,
424
+ duration_sec: float,
425
+ result_queue: thread_queue.Queue,
426
+ stop_event: threading.Event,
427
+ burst_size: int = 4,
428
+ target_h: int = 640,
429
+ target_w: int = 640,
430
+ target_fps: int = 0,
431
+ shared_frame_count: Optional[mp.Value] = None,
432
+ ):
433
+ """NVDEC worker thread.
434
+
435
+ Decodes frames and writes NV12 tensors to ring buffers.
436
+ Uses dedicated CUDA stream per worker for kernel overlap.
437
+ Supports FPS limiting when target_fps > 0.
438
+ """
439
+ if CUPY_AVAILABLE:
440
+ cp.cuda.Device(pool.gpu_id).use()
441
+ cuda_stream = cp.cuda.Stream(non_blocking=True)
442
+ else:
443
+ cuda_stream = None
444
+
445
+ local_frames = 0
446
+ local_errors = 0
447
+ frames_since_counter_update = 0
448
+ counter_batch_size = 100
449
+ start_time = time.perf_counter()
450
+ last_log_time = start_time
451
+ camera_ids = pool.get_camera_ids_for_decoder(decoder_idx)
452
+ num_streams = len(camera_ids)
453
+
454
+ # FPS limiting: calculate frames per second target for this worker
455
+ # Each worker handles num_streams cameras at target_fps each
456
+ fps_limit_enabled = target_fps > 0 and num_streams > 0
457
+ if fps_limit_enabled:
458
+ # Total target frames per second for all streams handled by this worker
459
+ worker_target_fps = target_fps * num_streams
460
+ frame_interval = 1.0 / worker_target_fps
461
+ next_frame_time = start_time
462
+ fps_mode = f", FPS limit={target_fps}/stream"
463
+ else:
464
+ frame_interval = 0
465
+ next_frame_time = 0
466
+ fps_mode = ", unlimited FPS"
467
+
468
+ logger.info(f"Worker {worker_id}: decoder={decoder_idx}, cams={num_streams}{fps_mode}")
469
+
470
+ while not stop_event.is_set():
471
+ if time.perf_counter() - start_time >= duration_sec:
472
+ break
473
+
474
+ now = time.perf_counter()
475
+ if now - last_log_time >= 5.0:
476
+ elapsed = now - start_time
477
+ fps = local_frames / elapsed if elapsed > 0 else 0
478
+ logger.info(f"Worker {worker_id}: {local_frames} frames, {fps:.0f} FPS")
479
+ last_log_time = now
480
+
481
+ # FPS limiting: wait until next scheduled frame time
482
+ if fps_limit_enabled:
483
+ current_time = time.perf_counter()
484
+ if current_time < next_frame_time:
485
+ sleep_time = next_frame_time - current_time
486
+ if sleep_time > 0.0001: # Only sleep if > 100us
487
+ time.sleep(sleep_time)
488
+
489
+ try:
490
+ with cuda_stream:
491
+ num_frames, decoded_frames = pool.decode_round(
492
+ decoder_idx,
493
+ frames_per_stream=burst_size,
494
+ target_h=target_h,
495
+ target_w=target_w
496
+ )
497
+
498
+ for cam_id, tensor in decoded_frames:
499
+ if cam_id in ring_buffers:
500
+ try:
501
+ ring_buffers[cam_id].write_frame_fast(tensor, sync=False)
502
+ local_frames += 1
503
+ frames_since_counter_update += 1
504
+
505
+ # Update shared counter for real-time progress
506
+ if shared_frame_count is not None:
507
+ with shared_frame_count.get_lock():
508
+ shared_frame_count.value += 1
509
+
510
+ # Update next frame time for FPS limiting
511
+ if fps_limit_enabled:
512
+ next_frame_time += frame_interval
513
+
514
+ except Exception as e:
515
+ local_errors += 1
516
+ if local_errors <= 3:
517
+ logger.error(f"Worker {worker_id} write error: {e}")
518
+
519
+ if decoded_frames and len(ring_buffers) > 0:
520
+ next(iter(ring_buffers.values())).sync_writes()
521
+
522
+ if num_frames == 0:
523
+ time.sleep(0.0001)
524
+ continue
525
+
526
+ if frames_since_counter_update >= counter_batch_size:
527
+ frame_counter.increment()
528
+ frames_since_counter_update = 0
529
+
530
+ except Exception as e:
531
+ local_errors += 1
532
+ if local_errors <= 3:
533
+ logger.error(f"Worker {worker_id} error: {e}")
534
+
535
+ if frames_since_counter_update > 0:
536
+ frame_counter.increment()
537
+
538
+ elapsed = time.perf_counter() - start_time
539
+ result_queue.put({
540
+ "worker_id": worker_id,
541
+ "decoder_idx": decoder_idx,
542
+ "elapsed_sec": elapsed,
543
+ "total_frames": local_frames,
544
+ "total_errors": local_errors,
545
+ "num_streams": len(camera_ids),
546
+ "fps": local_frames / elapsed if elapsed > 0 else 0,
547
+ })
548
+
549
+
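+ 
+ # Editor's sketch (not part of the package): the pacing arithmetic behind the
+ # FPS limit above. A worker serving 13 cameras at target_fps=10 aims for
+ # 130 frames/s, i.e. one frame every ~7.7 ms; next_frame_time advances by this
+ # interval per written frame and the worker sleeps only when ahead of schedule.
+ def worker_frame_interval(target_fps: int, num_streams: int) -> float:
+     """Seconds between consecutive frames for one worker (0.0 means unlimited)."""
+     if target_fps <= 0 or num_streams <= 0:
+         return 0.0
+     return 1.0 / (target_fps * num_streams)
+ 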
550
+ # =============================================================================
551
+ # GPU Process
552
+ # =============================================================================
553
+
554
+ def nvdec_pool_process(
555
+ process_id: int,
556
+ camera_configs: List[StreamConfig],
557
+ pool_size: int,
558
+ duration_sec: float,
559
+ result_queue: mp.Queue,
560
+ stop_event: mp.Event,
561
+ burst_size: int = 4,
562
+ num_slots: int = 32,
563
+ target_fps: int = 0,
564
+ shared_frame_count: Optional[mp.Value] = None,
565
+ ):
566
+ """NVDEC process for one GPU.
567
+
568
+ Creates NV12 ring buffers: (H*1.5, W) = 0.6 MB/frame.
569
+ """
570
+ if not camera_configs:
571
+ return
572
+
573
+ gpu_id = camera_configs[0].gpu_id
574
+ target_h = camera_configs[0].height
575
+ target_w = camera_configs[0].width
576
+
577
+ if CUPY_AVAILABLE:
578
+ cp.cuda.Device(gpu_id).use()
579
+
580
+ # Initialize global frame counter
581
+ frame_counter = GlobalFrameCounter(is_producer=True)
582
+ if process_id == 0:
583
+ frame_counter.initialize()
584
+ logger.info(f"Process {process_id}: GlobalFrameCounter initialized")
585
+ else:
586
+ max_retries = 50
587
+ for retry in range(max_retries):
588
+ try:
589
+ if os.path.exists("/dev/shm/global_frame_counter"):
590
+ frame_counter.connect()
591
+ logger.info(f"Process {process_id}: Connected to GlobalFrameCounter")
592
+ break
593
+ except Exception:
594
+ if retry == max_retries - 1:
595
+ raise
596
+ time.sleep(0.1)
597
+ else:
598
+ raise RuntimeError(f"Process {process_id}: GlobalFrameCounter not found")
599
+
600
+ # Create decoder pool
601
+ try:
602
+ pool = NVDECDecoderPool(pool_size, gpu_id)
603
+ except Exception as e:
604
+ logger.error(f"Process {process_id}: Failed to create decoder pool: {e}")
605
+ result_queue.put({
606
+ "process_id": process_id,
607
+ "error": str(e),
608
+ "total_frames": 0,
609
+ "total_errors": 1,
610
+ })
611
+ return
612
+
613
+ if pool.actual_pool_size == 0:
614
+ result_queue.put({
615
+ "process_id": process_id,
616
+ "error": "No decoders created",
617
+ "total_frames": 0,
618
+ "total_errors": 1,
619
+ })
620
+ return
621
+
622
+ # Create NV12 ring buffers: (H + H/2, W, 1) = 0.6 MB/frame
623
+ ring_buffers: Dict[str, CudaIpcRingBuffer] = {}
624
+ frame_size_mb = target_h * target_w * 1.5 / 1e6
625
+
626
+ try:
627
+ for i, config in enumerate(camera_configs):
628
+ rb = CudaIpcRingBuffer.create_producer(
629
+ config.camera_id,
630
+ gpu_id=config.gpu_id,
631
+ num_slots=num_slots,
632
+ width=config.width,
633
+ height=config.height + config.height // 2, # H * 1.5 for NV12
634
+ channels=1,
635
+ )
636
+ ring_buffers[config.camera_id] = rb
637
+
638
+ pool.assign_stream(
639
+ stream_id=i,
640
+ camera_id=config.camera_id,
641
+ video_path=config.video_path,
642
+ width=config.width,
643
+ height=config.height
644
+ )
645
+
646
+ logger.info(f"Process {process_id}: {pool.actual_pool_size} decoders, "
647
+ f"{len(camera_configs)} streams, NV12 ({frame_size_mb:.1f} MB/frame)")
648
+
649
+ thread_stop_event = threading.Event()
650
+ thread_result_queue = thread_queue.Queue()
651
+
652
+ threads = []
653
+ for decoder_idx in range(pool.actual_pool_size):
654
+ t = threading.Thread(
655
+ target=nvdec_pool_worker,
656
+ args=(
657
+ process_id * 100 + decoder_idx,
658
+ decoder_idx,
659
+ pool,
660
+ ring_buffers,
661
+ frame_counter,
662
+ duration_sec,
663
+ thread_result_queue,
664
+ thread_stop_event,
665
+ burst_size,
666
+ target_h,
667
+ target_w,
668
+ target_fps,
669
+ shared_frame_count,
670
+ )
671
+ )
672
+ t.start()
673
+ threads.append(t)
674
+
675
+ start_time = time.perf_counter()
676
+ while not stop_event.is_set():
677
+ if time.perf_counter() - start_time >= duration_sec:
678
+ break
679
+ time.sleep(0.1)
680
+
681
+ thread_stop_event.set()
682
+
683
+ for t in threads:
684
+ t.join(timeout=30.0)
685
+
686
+ total_frames = 0
687
+ total_errors = 0
688
+ elapsed = time.perf_counter() - start_time
689
+
690
+ while not thread_result_queue.empty():
691
+ try:
692
+ r = thread_result_queue.get_nowait()
693
+ total_frames += r.get("total_frames", 0)
694
+ total_errors += r.get("total_errors", 0)
695
+ except thread_queue.Empty:
696
+ break
697
+
698
+ pool.close()
699
+ for rb in ring_buffers.values():
700
+ rb.close()
701
+
702
+ result_queue.put({
703
+ "process_id": process_id,
704
+ "elapsed_sec": elapsed,
705
+ "total_frames": total_frames,
706
+ "total_errors": total_errors,
707
+ "num_streams": len(camera_configs),
708
+ "pool_size": pool.actual_pool_size,
709
+ "fps": total_frames / elapsed if elapsed > 0 else 0,
710
+ "per_stream_fps": total_frames / elapsed / len(camera_configs) if elapsed > 0 and camera_configs else 0,
711
+ })
712
+
713
+ except Exception as e:
714
+ logger.error(f"Process {process_id} error: {e}")
715
+ import traceback
716
+ traceback.print_exc()
717
+
718
+ pool.close()
719
+ for rb in ring_buffers.values():
720
+ rb.close()
721
+
722
+ result_queue.put({
723
+ "process_id": process_id,
724
+ "error": str(e),
725
+ "total_frames": 0,
726
+ "total_errors": 1,
727
+ })
728
+
729
+
730
+ # =============================================================================
731
+ # Streaming Gateway
732
+ # =============================================================================
733
+
734
+ class StreamingGateway:
735
+ """Multi-stream video producer outputting NV12 tensors (minimal IPC payload)."""
736
+
737
+ def __init__(self, config: GatewayConfig):
738
+ self.config = config
739
+ self._workers: List[mp.Process] = []
740
+ self._stop_event = mp.Event()
741
+ self._result_queue = mp.Queue()
742
+
743
+ def start(self) -> Dict:
744
+ """Start the gateway."""
745
+ if not CUPY_AVAILABLE:
746
+ raise RuntimeError("CuPy is required")
747
+ if not RING_BUFFER_AVAILABLE:
748
+ raise RuntimeError("CUDA IPC ring buffer not available")
749
+ if not PYNVCODEC_AVAILABLE:
750
+ raise RuntimeError("PyNvVideoCodec required")
751
+ return self._start_nvdec_pool()
752
+
753
+ def _start_nvdec_pool(self) -> Dict:
754
+ """Start NVDEC pool across GPUs."""
755
+ num_gpus = min(self.config.num_gpus, 8)
756
+ streams_per_gpu = self.config.num_streams // num_gpus
757
+ extra_streams = self.config.num_streams % num_gpus
758
+
759
+ logger.info(f"Starting NVDEC on {num_gpus} GPU(s): {self.config.num_streams} streams, "
760
+ f"pool_size={self.config.nvdec_pool_size}/GPU, output=NV12 (0.6 MB)")
761
+
762
+ ctx = mp.get_context("spawn")
763
+ self._stop_event = ctx.Event()
764
+ self._result_queue = ctx.Queue()
765
+
766
+ # Shared counter for real-time FPS tracking (use 'L' for large counts)
767
+ shared_frame_count = ctx.Value('L', 0)
768
+
769
+ stream_idx = 0
770
+ for gpu_id in range(num_gpus):
771
+ n_streams = streams_per_gpu + (1 if gpu_id < extra_streams else 0)
772
+
773
+ gpu_configs = []
774
+ for i in range(n_streams):
775
+ config = StreamConfig(
776
+ camera_id=f"cam_{stream_idx:04d}",
777
+ video_path=self.config.video_path,
778
+ width=self.config.frame_width,
779
+ height=self.config.frame_height,
780
+ target_fps=self.config.target_fps,
781
+ gpu_id=gpu_id,
782
+ )
783
+ gpu_configs.append(config)
784
+ stream_idx += 1
785
+
786
+ p = ctx.Process(
787
+ target=nvdec_pool_process,
788
+ args=(gpu_id, gpu_configs, self.config.nvdec_pool_size,
789
+ self.config.duration_sec, self._result_queue, self._stop_event,
790
+ self.config.nvdec_burst_size, self.config.num_slots,
791
+ self.config.target_fps, shared_frame_count)
792
+ )
793
+ p.start()
794
+ self._workers.append(p)
795
+ logger.info(f"GPU {gpu_id}: {n_streams} streams")
796
+ time.sleep(0.1)
797
+
798
+ # Progress monitoring loop - print progress every 5 seconds
799
+ start_time = time.perf_counter()
800
+ last_report_time = start_time
801
+ last_frame_count = 0
802
+ report_interval = 5.0 # seconds
803
+ processing_start_time = None # Track when actual processing starts
804
+ frames_at_processing_start = 0
805
+
806
+ print(f" [ 0.0s] Started {num_gpus} GPU workers...")
807
+
808
+ while any(p.is_alive() for p in self._workers):
809
+ time.sleep(0.5)
810
+ current_time = time.perf_counter()
811
+
812
+ # Periodic progress report with real-time FPS
813
+ if current_time - last_report_time >= report_interval:
814
+ elapsed = current_time - start_time
815
+ remaining = max(0, self.config.duration_sec - elapsed)
816
+
817
+ # Read current frame count
818
+ current_frames = shared_frame_count.value
819
+ interval_frames = current_frames - last_frame_count
820
+ interval_fps = interval_frames / (current_time - last_report_time) # Throughput over the actual interval
821
+ per_stream_fps = interval_fps / self.config.num_streams if self.config.num_streams > 0 else 0
822
+
823
+ # Track when processing actually starts (exclude warmup from avg)
824
+ if processing_start_time is None and current_frames > 0:
825
+ processing_start_time = last_report_time # Use previous report time
826
+ frames_at_processing_start = last_frame_count
827
+
828
+ # Calculate average FPS excluding warmup time
829
+ if processing_start_time is not None:
830
+ processing_elapsed = current_time - processing_start_time
831
+ processing_frames = current_frames - frames_at_processing_start
832
+ avg_fps = processing_frames / processing_elapsed if processing_elapsed > 0 else 0
833
+ print(f" [{elapsed:5.1f}s] {current_frames:,} frames | cur: {interval_fps:,.0f} FPS ({per_stream_fps:.1f}/stream) | avg: {avg_fps:,.0f} FPS | {remaining:.0f}s left")
834
+ else:
835
+ print(f" [{elapsed:5.1f}s] Warming up... | {remaining:.0f}s left")
836
+
837
+ last_report_time = current_time
838
+ last_frame_count = current_frames
839
+
840
+ # Wait for all workers to fully complete
841
+ for p in self._workers:
842
+ p.join(timeout=5)
843
+
844
+ results = []
845
+ while not self._result_queue.empty():
846
+ results.append(self._result_queue.get())
847
+
848
+ for r in results:
849
+ if "error" in r:
850
+ logger.error(f"NVDEC error: {r['error']}")
851
+
852
+ total_frames = sum(r.get("total_frames", 0) for r in results)
853
+ total_errors = sum(r.get("total_errors", 0) for r in results)
854
+ total_elapsed = max((r.get("elapsed_sec", 0) for r in results), default=0)
855
+
856
+ aggregate_fps = total_frames / total_elapsed if total_elapsed > 0 else 0
857
+ per_stream_fps = aggregate_fps / self.config.num_streams if self.config.num_streams > 0 else 0
858
+
859
+ return {
860
+ "num_streams": self.config.num_streams,
861
+ "num_gpus": num_gpus,
862
+ "pool_size": self.config.nvdec_pool_size,
863
+ "duration_sec": total_elapsed,
864
+ "total_frames": total_frames,
865
+ "total_errors": total_errors,
866
+ "aggregate_fps": aggregate_fps,
867
+ "per_stream_fps": per_stream_fps,
868
+ "gpu_results": results,
869
+ }
870
+
871
+ def stop(self):
872
+ """Stop all workers."""
873
+ self._stop_event.set()
874
+ for p in self._workers:
875
+ p.join(timeout=5)
876
+ if p.is_alive():
877
+ p.terminate()
878
+
879
+
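+ 
+ # Editor's sketch (not part of the package): programmatic use of the gateway,
+ # mirroring what main() below does from the CLI. The video path is a
+ # placeholder; a run requires PyNvVideoCodec, CuPy and the ring-buffer module.
+ def run_gateway_example() -> Dict:
+     cfg = GatewayConfig(
+         video_path="videoplayback.mp4",  # placeholder input
+         num_streams=16,
+         target_fps=10,        # cap each stream at 10 FPS
+         duration_sec=15.0,
+         nvdec_pool_size=4,
+     )
+     gateway = StreamingGateway(cfg)
+     try:
+         return gateway.start()   # blocks until duration_sec elapses, returns stats
+     except KeyboardInterrupt:
+         gateway.stop()
+         return {}
+ 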
880
+ # =============================================================================
881
+ # CLI
882
+ # =============================================================================
883
+
884
+ def main():
885
+ parser = argparse.ArgumentParser(description="Streaming Gateway - CUDA IPC Producer (NV12)")
886
+ parser.add_argument("--video", "-v", required=True, help="Video file path")
887
+ parser.add_argument("--num-streams", "-n", type=int, default=100, help="Number of streams")
888
+ parser.add_argument("--fps", type=int, default=0, help="Target FPS limit per stream (0=unlimited)")
889
+ parser.add_argument("--width", type=int, default=640, help="Frame width")
890
+ parser.add_argument("--height", type=int, default=640, help="Frame height")
891
+ parser.add_argument("--duration", "-d", type=float, default=30.0, help="Duration in seconds")
892
+ parser.add_argument("--gpu", type=int, default=0, help="Primary GPU ID")
893
+ parser.add_argument("--num-gpus", "-g", type=int, default=1, help="Number of GPUs (1-8)")
894
+ parser.add_argument("--pool-size", type=int, default=8, help="NVDEC pool size per GPU")
895
+ parser.add_argument("--burst-size", type=int, default=4, help="Frames per stream before rotating")
896
+ parser.add_argument("--slots", type=int, default=32, help="Ring buffer slots per camera")
897
+ parser.add_argument("--quiet", "-q", action="store_true", help="Quiet mode - only show final results")
898
+ args = parser.parse_args()
899
+
900
+ # Setup logging based on quiet mode
901
+ setup_logging(quiet=args.quiet)
902
+
903
+ config = GatewayConfig(
904
+ video_path=args.video,
905
+ num_streams=args.num_streams,
906
+ target_fps=args.fps,
907
+ frame_width=args.width,
908
+ frame_height=args.height,
909
+ gpu_id=args.gpu,
910
+ num_gpus=args.num_gpus,
911
+ duration_sec=args.duration,
912
+ nvdec_pool_size=args.pool_size,
913
+ nvdec_burst_size=args.burst_size,
914
+ num_slots=args.slots,
915
+ )
916
+
917
+ frame_size = args.width * args.height * 1.5
918
+ output_fmt = f"NV12 ({args.width}x{args.height}x1.5 = {frame_size/1e6:.1f} MB/frame)"
919
+ fps_limit_str = f"{args.fps} FPS/stream" if args.fps > 0 else "unlimited"
920
+
921
+ if not args.quiet:
922
+ print("\n" + "=" * 60)
923
+ print(" STREAMING GATEWAY - CUDA IPC Producer (NV12)")
924
+ print("=" * 60)
925
+ print(f" Video: {args.video}")
926
+ print(f" Streams: {args.num_streams}")
927
+ print(f" GPUs: {args.num_gpus}")
928
+ print(f" Pool size: {args.pool_size} NVDEC decoders/GPU")
929
+ print(f" FPS limit: {fps_limit_str}")
930
+ print(f" Output: {output_fmt}")
931
+ print(f" Duration: {args.duration}s")
932
+ print("=" * 60)
933
+
934
+ gateway = StreamingGateway(config)
935
+
936
+ try:
937
+ results = gateway.start()
938
+ # Clean summary output
939
+ print("\n")
940
+ print("=" * 60)
941
+ print(" STREAMING GATEWAY BENCHMARK RESULTS")
942
+ print("=" * 60)
943
+ print(f" Video: {args.video}")
944
+ print(f" Streams: {args.num_streams}")
945
+ print(f" GPUs: {args.num_gpus}")
946
+ print(f" FPS limit: {fps_limit_str}")
947
+ print(f" Duration: {args.duration}s")
948
+ print("-" * 60)
949
+ print(f" Total Frames: {results['total_frames']:,}")
950
+ print("-" * 60)
951
+ print(f" >>> AGGREGATE FPS: {results['aggregate_fps']:,.0f} <<<")
952
+ print(f" >>> PER-STREAM FPS: {results['per_stream_fps']:.1f} <<<")
953
+ print("=" * 60)
954
+ print()
955
+ except KeyboardInterrupt:
956
+ gateway.stop()
957
+ print("\nStopped")
958
+
959
+
960
+ if __name__ == "__main__":
961
+ main()
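+ 
+ # Editor's note (not from the package docs): example invocations assembled
+ # only from the flags defined in the argparse section above.
+ #   python streaming_gateway.py --video videoplayback.mp4 --num-streams 100 --duration 30
+ #   python streaming_gateway.py -v videoplayback.mp4 -n 200 -g 2 --pool-size 8 --fps 10 --quiet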