matrice-streaming 0.1.14-py3-none-any.whl → 0.1.65-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. matrice_streaming/__init__.py +44 -32
  2. matrice_streaming/streaming_gateway/camera_streamer/__init__.py +68 -1
  3. matrice_streaming/streaming_gateway/camera_streamer/async_camera_worker.py +1388 -0
  4. matrice_streaming/streaming_gateway/camera_streamer/async_ffmpeg_worker.py +966 -0
  5. matrice_streaming/streaming_gateway/camera_streamer/camera_streamer.py +188 -24
  6. matrice_streaming/streaming_gateway/camera_streamer/device_detection.py +507 -0
  7. matrice_streaming/streaming_gateway/camera_streamer/encoding_pool_manager.py +136 -0
  8. matrice_streaming/streaming_gateway/camera_streamer/ffmpeg_camera_streamer.py +1048 -0
  9. matrice_streaming/streaming_gateway/camera_streamer/ffmpeg_config.py +192 -0
  10. matrice_streaming/streaming_gateway/camera_streamer/ffmpeg_worker_manager.py +470 -0
  11. matrice_streaming/streaming_gateway/camera_streamer/gstreamer_camera_streamer.py +1368 -0
  12. matrice_streaming/streaming_gateway/camera_streamer/gstreamer_worker.py +1063 -0
  13. matrice_streaming/streaming_gateway/camera_streamer/gstreamer_worker_manager.py +546 -0
  14. matrice_streaming/streaming_gateway/camera_streamer/message_builder.py +60 -15
  15. matrice_streaming/streaming_gateway/camera_streamer/nvdec.py +1330 -0
  16. matrice_streaming/streaming_gateway/camera_streamer/nvdec_worker_manager.py +412 -0
  17. matrice_streaming/streaming_gateway/camera_streamer/platform_pipelines.py +680 -0
  18. matrice_streaming/streaming_gateway/camera_streamer/stream_statistics.py +111 -4
  19. matrice_streaming/streaming_gateway/camera_streamer/video_capture_manager.py +223 -27
  20. matrice_streaming/streaming_gateway/camera_streamer/worker_manager.py +694 -0
  21. matrice_streaming/streaming_gateway/debug/__init__.py +27 -2
  22. matrice_streaming/streaming_gateway/debug/benchmark.py +727 -0
  23. matrice_streaming/streaming_gateway/debug/debug_gstreamer_gateway.py +599 -0
  24. matrice_streaming/streaming_gateway/debug/debug_streaming_gateway.py +245 -95
  25. matrice_streaming/streaming_gateway/debug/debug_utils.py +29 -0
  26. matrice_streaming/streaming_gateway/debug/test_videoplayback.py +318 -0
  27. matrice_streaming/streaming_gateway/dynamic_camera_manager.py +656 -39
  28. matrice_streaming/streaming_gateway/metrics_reporter.py +676 -139
  29. matrice_streaming/streaming_gateway/streaming_action.py +71 -20
  30. matrice_streaming/streaming_gateway/streaming_gateway.py +1026 -78
  31. matrice_streaming/streaming_gateway/streaming_gateway_utils.py +175 -20
  32. matrice_streaming/streaming_gateway/streaming_status_listener.py +89 -0
  33. {matrice_streaming-0.1.14.dist-info → matrice_streaming-0.1.65.dist-info}/METADATA +1 -1
  34. matrice_streaming-0.1.65.dist-info/RECORD +56 -0
  35. matrice_streaming-0.1.14.dist-info/RECORD +0 -38
  36. {matrice_streaming-0.1.14.dist-info → matrice_streaming-0.1.65.dist-info}/WHEEL +0 -0
  37. {matrice_streaming-0.1.14.dist-info → matrice_streaming-0.1.65.dist-info}/licenses/LICENSE.txt +0 -0
  38. {matrice_streaming-0.1.14.dist-info → matrice_streaming-0.1.65.dist-info}/top_level.txt +0 -0
matrice_streaming/streaming_gateway/camera_streamer/worker_manager.py (new file)
@@ -0,0 +1,694 @@
+ """Worker manager for coordinating multiple async camera workers.
+
+ This module manages a pool of async worker processes, distributing cameras
+ across them and monitoring their health.
+ """
+ import logging
+ import multiprocessing
+ import os
+ import sys
+ import time
+ import signal
+ from typing import List, Dict, Any, Optional
+ from pathlib import Path
+
+ from .async_camera_worker import run_async_worker
+ from .encoding_pool_manager import EncodingPoolManager
+ from .camera_streamer import CameraStreamer
+
+ USE_SHM = os.getenv("USE_SHM", "false").lower() == "true"
+
+ class WorkerManager:
+     """Manages multiple async camera worker processes with dynamic scaling.
+
+     This manager coordinates worker processes based on available CPU cores,
+     distributing cameras across them for optimal throughput. Each worker handles
+     multiple cameras concurrently using async I/O.
+     """
+
+     def __init__(
+         self,
+         camera_configs: List[Dict[str, Any]],
+         stream_config: Dict[str, Any],
+         num_workers: Optional[int] = None,
+         cpu_percentage: float = 0.9,
+         num_encoding_workers: Optional[int] = None,
+         max_cameras_per_worker: int = 100,
+         # ================================================================
+         # SHM_MODE: New parameters for shared memory architecture
+         # ================================================================
+         use_shm: bool = USE_SHM,  # Enable SHM mode (raw frames in shared memory)
+         shm_slot_count: int = 1000,  # Ring buffer size per camera (increased for consumer lag)
+         shm_frame_format: str = "BGR",  # Frame format: "BGR", "RGB", or "NV12"
+         # ================================================================
+         # PERFORMANCE: New parameters for optimized frame capture
+         # ================================================================
+         drop_stale_frames: bool = True,  # Use grab()/grab()/retrieve() for latest frame
+         pin_cpu_affinity: bool = True,  # Pin workers to specific CPU cores
+         buffer_size: int = 1,  # VideoCapture buffer size (1 = minimal latency)
+     ):
+         """Initialize worker manager with dynamic CPU-based scaling.
+
+         Args:
+             camera_configs: List of all camera configurations
+             stream_config: Streaming configuration (Redis, Kafka, etc.)
+             num_workers: Number of worker processes (default: auto-calculated from CPU cores)
+             cpu_percentage: Percentage of CPU cores to use (default: 0.9 = 90%)
+             num_encoding_workers: Number of encoding workers (default: CPU_count - 2)
+             max_cameras_per_worker: Maximum cameras per worker for load balancing (default: 100)
+             use_shm: Enable SHM mode (raw frames in shared memory, metadata in Redis)
+             shm_slot_count: Number of frame slots per camera ring buffer
+             shm_frame_format: Frame format for SHM storage
+             drop_stale_frames: Use grab()/grab()/retrieve() pattern for latest frame
+             pin_cpu_affinity: Pin worker processes to specific CPU cores for cache locality
+             buffer_size: VideoCapture buffer size (1 = minimal latency)
+         """
+         self.camera_configs = camera_configs
+         self.stream_config = stream_config
+
+         # Calculate dynamic worker count based on CPU cores if not specified
+         if num_workers is None:
+             cpu_count = os.cpu_count() or 4  # Fall back to 4 if it can't be detected
+             num_cameras = len(camera_configs)
+
+             # For systems with 16+ cores OR large camera counts, use camera-based calculation.
+             # This applies to Docker containers with limited CPU allocation (e.g., 20 cores).
+             # Too many workers = process overhead; too few = underutilization.
+             # Target: ~25 cameras per worker for better read parallelism with video files.
+             if cpu_count >= 16 or num_cameras >= 100:
+                 # Use camera-based calculation for better distribution
+                 # (1000 cameras / 25 cameras per worker = 40 workers)
+                 target_cameras_per_worker = 25
+                 calculated_workers = max(4, min(num_cameras // target_cameras_per_worker, 50))
+             else:
+                 # Standard calculation for smaller systems
+                 calculated_workers = max(4, int(cpu_count * cpu_percentage))
+
+             # Cap at camera count (no point having more workers than cameras)
+             self.num_workers = min(calculated_workers, num_cameras) if num_cameras > 0 else calculated_workers
+         else:
+             self.num_workers = num_workers
+
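For reference, the sizing heuristic above reduces to a small pure function. A standalone sketch with worked values that follow directly from the constants in the code; pick_worker_count is a hypothetical name used only for illustration:

    import os

    def pick_worker_count(num_cameras: int, cpu_percentage: float = 0.9) -> int:
        cpu_count = os.cpu_count() or 4
        if cpu_count >= 16 or num_cameras >= 100:
            # Camera-based sizing: ~25 cameras per worker, clamped to [4, 50]
            workers = max(4, min(num_cameras // 25, 50))
        else:
            # CPU-based sizing for smaller hosts
            workers = max(4, int(cpu_count * cpu_percentage))
        # Never run more workers than cameras
        return min(workers, num_cameras) if num_cameras > 0 else workers

    # 1000 cameras (any host): min(max(4, min(1000 // 25, 50)), 1000) = 40 workers.
    # 8-core host, 10 cameras: min(max(4, int(8 * 0.9)), 10) = 7 workers.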
+         self.num_encoding_workers = num_encoding_workers
+         self.logger = logging.getLogger(__name__)
+
+         # Max cameras per worker (for load balancing)
+         self.max_cameras_per_worker = max_cameras_per_worker
+
+         # Log dynamic scaling info
+         cpu_count = os.cpu_count() or 4
+         self.logger.info(
+             f"Dynamic worker scaling: {cpu_count} CPU cores detected, "
+             f"using {cpu_percentage*100:.0f}% = {self.num_workers} workers "
+             f"for {len(camera_configs)} cameras"
+         )
+
+         # ================================================================
+         # SHM_MODE: Store shared memory configuration
+         # ================================================================
+         self.use_shm = use_shm
+         self.shm_slot_count = shm_slot_count
+         self.shm_frame_format = shm_frame_format
+
+         if use_shm:
+             self.logger.info(
+                 f"SHM_MODE ENABLED: format={shm_frame_format}, slots={shm_slot_count}"
+             )
+
+         # ================================================================
+         # PERFORMANCE: Store optimized frame capture configuration
+         # ================================================================
+         self.drop_stale_frames = drop_stale_frames
+         self.pin_cpu_affinity = pin_cpu_affinity
+         self.buffer_size = buffer_size
+
+         if drop_stale_frames or pin_cpu_affinity:
+             self.logger.info(
+                 f"PERFORMANCE OPTIMIZATIONS ENABLED: "
+                 f"drop_stale_frames={drop_stale_frames}, "
+                 f"pin_cpu_affinity={pin_cpu_affinity}, "
+                 f"buffer_size={buffer_size}"
+             )
+
+         # Note: Batch parameters are calculated per-worker in _start_worker()
+         # based on each worker's camera count, not the global total.
+         # This ensures optimal batching when cameras are distributed across workers.
+
+         # Multiprocessing primitives
+         self.stop_event = multiprocessing.Event()
+         self.health_queue = multiprocessing.Queue()
+
+         # Worker processes
+         self.workers: List[multiprocessing.Process] = []
+         self.worker_camera_assignments: Dict[int, List[Dict[str, Any]]] = {}
+
+         # Encoding pool
+         self.encoding_pool_manager: Optional[EncodingPoolManager] = None
+         self.encoding_pool: Optional[multiprocessing.Pool] = None
+
+         # Health monitoring
+         self.last_health_reports: Dict[int, Dict[str, Any]] = {}
+
+         # Dynamic camera support - command queues for each worker
+         self.command_queues: Dict[int, multiprocessing.Queue] = {}
+
+         # Response queue for acknowledgments from workers
+         self.response_queue = multiprocessing.Queue()
+
+         # Camera-to-worker mapping for targeted operations
+         self.camera_to_worker: Dict[str, int] = {}  # stream_key -> worker_id
+
+         # Worker load tracking
+         self.worker_camera_count: Dict[int, int] = {}  # worker_id -> camera_count
+
+         self.logger.info(
+             f"WorkerManager initialized: {self.num_workers} workers, "
+             f"{len(camera_configs)} cameras total, "
+             f"max {self.max_cameras_per_worker} cameras per worker"
+         )
+
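The drop_stale_frames and buffer_size parameters name OpenCV's grab()/retrieve() split. A minimal sketch of that pattern, assuming a cv2 capture and a backend that honors CAP_PROP_BUFFERSIZE; the real capture loop lives in async_camera_worker.py, which is not shown in this hunk:

    import cv2

    cap = cv2.VideoCapture("rtsp://camera/stream")
    cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)  # matches buffer_size=1 above

    # grab() dequeues a frame without decoding it, so calling it twice
    # discards a possibly stale buffered frame cheaply; retrieve() then
    # decodes only the most recent one.
    cap.grab()
    cap.grab()
    ok, frame = cap.retrieve()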
+     def start(self):
+         """Start all workers and begin streaming."""
+         try:
+             # Distribute cameras across workers (static partitioning)
+             self._distribute_cameras()
+
+             # Note: Encoding pool not needed - each worker uses asyncio.to_thread(),
+             # which provides good enough parallelism for JPEG encoding (mostly C code)
+
+             # Start worker processes
+             self.logger.info(f"Starting {self.num_workers} worker processes...")
+             for worker_id in range(self.num_workers):
+                 self._start_worker(worker_id)
+
+             self.logger.info(
+                 f"All workers started! "
+                 f"Streaming {len(self.camera_configs)} cameras across {self.num_workers} workers"
+             )
+
+         except Exception as exc:
+             self.logger.error(f"Failed to start workers: {exc}")
+             self.stop()
+             raise
+
+     def _distribute_cameras(self):
+         """Distribute cameras across workers using static partitioning."""
+         total_cameras = len(self.camera_configs)
+         cameras_per_worker = total_cameras // self.num_workers
+         remainder = total_cameras % self.num_workers
+
+         self.logger.info(
+             f"Distributing {total_cameras} cameras: "
+             f"~{cameras_per_worker} per worker"
+         )
+
+         camera_idx = 0
+         for worker_id in range(self.num_workers):
+             # Some workers get 1 extra camera if there's a remainder
+             num_cameras = cameras_per_worker + (1 if worker_id < remainder else 0)
+
+             worker_cameras = self.camera_configs[camera_idx:camera_idx + num_cameras]
+             self.worker_camera_assignments[worker_id] = worker_cameras
+
+             self.logger.info(
+                 f"Worker {worker_id}: {len(worker_cameras)} cameras "
+                 f"(indices {camera_idx} to {camera_idx + num_cameras - 1})"
+             )
+
+             camera_idx += num_cameras
+
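The partition gives the first `remainder` workers one extra camera each. A worked example of the same arithmetic, for 10 cameras across 3 workers:

    total, workers = 10, 3
    base, rem = divmod(total, workers)  # base=3, rem=1
    sizes = [base + (1 if w < rem else 0) for w in range(workers)]
    assert sizes == [4, 3, 3] and sum(sizes) == total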
+     def _start_worker(self, worker_id: int):
+         """Start a single worker process with a command queue for dynamic camera support.
+
+         Args:
+             worker_id: Worker identifier
+         """
+         worker_cameras = self.worker_camera_assignments.get(worker_id, [])
+
+         # Create a command queue for this worker (even if no cameras are assigned initially)
+         command_queue = multiprocessing.Queue()
+         self.command_queues[worker_id] = command_queue
+
+         # Track initial camera count
+         self.worker_camera_count[worker_id] = len(worker_cameras)
+
+         # Track initial camera-to-worker mapping
+         for cam_config in worker_cameras:
+             stream_key = cam_config.get('stream_key')
+             if stream_key:
+                 self.camera_to_worker[stream_key] = worker_id
+
+         if not worker_cameras:
+             self.logger.warning(f"Worker {worker_id} has no cameras assigned initially")
+
+         # Calculate batch parameters based on THIS worker's camera count, not the global total.
+         # Each worker only handles ~50 cameras (1000 / 20 workers), so batch settings
+         # should match the per-worker load, not the overall deployment size.
+         worker_stream_config = self.stream_config.copy()
+         num_worker_cameras = len(worker_cameras)
+         if num_worker_cameras > 0 and worker_stream_config.get('enable_batching', True):
+             batch_params = CameraStreamer.calculate_batch_parameters(num_worker_cameras)
+             worker_stream_config.update({
+                 'enable_batching': True,
+                 'batch_size': batch_params['batch_size'],
+                 'batch_timeout': batch_params['batch_timeout']
+             })
+             self.logger.debug(
+                 f"Worker {worker_id}: {num_worker_cameras} cameras → "
+                 f"batch_size={batch_params['batch_size']}, "
+                 f"batch_timeout={batch_params['batch_timeout']*1000:.1f}ms"
+             )
+
+         try:
+             # Use the 'fork' context on Linux to avoid re-importing modules in child processes.
+             # This prevents dependencies_check from running pip install in child processes.
+             # On Windows, 'fork' is not available, so we use 'spawn' (the only option).
+             if sys.platform == 'win32':
+                 # Windows only supports spawn
+                 ctx = multiprocessing.get_context('spawn')
+                 context_name = 'spawn'
+             else:
+                 # Linux/macOS: use 'fork' - the child inherits parent memory, no re-imports
+                 ctx = multiprocessing.get_context('fork')
+                 context_name = 'fork'
+
+             worker = ctx.Process(
+                 target=run_async_worker,
+                 args=(
+                     worker_id,
+                     worker_cameras,
+                     worker_stream_config,  # Per-worker config with correct batch params
+                     self.stop_event,
+                     self.health_queue,
+                     command_queue,  # Command queue for dynamic camera ops
+                     self.response_queue,  # Response queue for acknowledgments
+                     # SHM_MODE: Pass shared memory parameters
+                     self.use_shm,
+                     self.shm_slot_count,
+                     self.shm_frame_format,
+                     # PERFORMANCE: Pass optimized frame capture parameters
+                     self.drop_stale_frames,
+                     self.pin_cpu_affinity,
+                     self.num_workers,  # Total workers, for CPU affinity calculation
+                     self.buffer_size,
+                 ),
+                 name=f"AsyncWorker-{worker_id}",
+                 daemon=False  # Non-daemon so we can properly wait for shutdown
+             )
+             worker.start()
+             self.workers.append(worker)
+
+             self.logger.info(
+                 f"Started worker {worker_id} (PID: {worker.pid}) "
+                 f"with {len(worker_cameras)} cameras (context: {context_name})"
+             )
+
+         except Exception as exc:
+             self.logger.error(f"Failed to start worker {worker_id}: {exc}")
+             raise
+
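pin_cpu_affinity and the total worker count are forwarded to run_async_worker; how the pinning is actually computed there is not visible in this hunk. One plausible Linux-only sketch, using os.sched_setaffinity, purely for illustration:

    import os

    def pin_worker(worker_id: int, num_workers: int) -> None:
        # Split the visible cores into contiguous, equal-sized ranges so each
        # worker keeps cache locality; sched_setaffinity is Linux-only.
        cores = sorted(os.sched_getaffinity(0))
        per_worker = max(1, len(cores) // num_workers)
        start = (worker_id * per_worker) % len(cores)
        chunk = cores[start:start + per_worker] or cores
        os.sched_setaffinity(0, chunk)  # 0 = calling (worker) process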
+     def monitor(self, duration: Optional[float] = None):
+         """Monitor workers and collect health reports.
+
+         Args:
+             duration: How long to monitor (None = indefinite)
+         """
+         self.logger.info("Starting health monitoring...")
+
+         start_time = time.time()
+         last_summary_time = start_time
+
+         try:
+             while not self.stop_event.is_set():
+                 # Check if duration exceeded
+                 if duration and (time.time() - start_time) >= duration:
+                     self.logger.info(f"Monitoring duration ({duration}s) complete")
+                     break
+
+                 # Collect health reports
+                 while not self.health_queue.empty():
+                     try:
+                         report = self.health_queue.get_nowait()
+                         worker_id = report['worker_id']
+                         self.last_health_reports[worker_id] = report
+
+                         # Log significant status changes
+                         if report['status'] in ['error', 'stopped']:
+                             self.logger.warning(
+                                 f"Worker {worker_id} status: {report['status']}"
+                                 f" (error: {report.get('error', 'None')})"
+                             )
+
+                     except Exception as exc:
+                         self.logger.error(f"Error processing health report: {exc}")
+
+                 # Check worker processes
+                 for i, worker in enumerate(self.workers):
+                     if not worker.is_alive() and not self.stop_event.is_set():
+                         self.logger.error(
+                             f"Worker {i} (PID: {worker.pid}) died unexpectedly! "
+                             f"Exit code: {worker.exitcode}"
+                         )
+
+                 # Print summary every 10 seconds
+                 if time.time() - last_summary_time >= 10.0:
+                     self._print_health_summary()
+                     last_summary_time = time.time()
+
+                 time.sleep(0.5)
+
+         except KeyboardInterrupt:
+             self.logger.info("Monitoring interrupted by user")
+
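One caveat on the drain loop above: multiprocessing.Queue.empty() is documented as unreliable, so get_nowait() can still raise queue.Empty between the check and the read. A drain written against the exception rather than the check is the more defensive pattern; a sketch, not what this diff does:

    import queue

    def drain(q, handle) -> None:
        # Pull reports until the queue raises Empty, instead of
        # trusting q.empty() under concurrent producers.
        while True:
            try:
                report = q.get_nowait()
            except queue.Empty:
                break
            handle(report)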
+     def _print_health_summary(self):
+         """Print a summary of worker health."""
+         running_workers = sum(1 for w in self.workers if w.is_alive())
+         total_cameras = sum(
+             report.get('active_cameras', 0)
+             for report in self.last_health_reports.values()
+         )
+
+         self.logger.info(
+             f"Health Summary: {running_workers}/{len(self.workers)} workers alive, "
+             f"{total_cameras} active cameras"
+         )
+
+         # Detailed per-worker status
+         for worker_id, report in sorted(self.last_health_reports.items()):
+             status = report.get('status', 'unknown')
+             cameras = report.get('active_cameras', 0)
+             age = time.time() - report.get('timestamp', 0)
+
+             self.logger.debug(
+                 f"  Worker {worker_id}: {status}, {cameras} cameras, "
+                 f"last report {age:.1f}s ago"
+             )
+
+     def stop(self, timeout: float = 15.0):
+         """Stop all workers gracefully.
+
+         Args:
+             timeout: Maximum time to wait per worker (seconds)
+         """
+         self.logger.info("Stopping all workers...")
+
+         # Signal stop
+         self.stop_event.set()
+
+         # Wait for workers to finish
+         for i, worker in enumerate(self.workers):
+             if worker.is_alive():
+                 self.logger.info(f"Waiting for worker {i} to stop...")
+                 worker.join(timeout=timeout)
+
+                 if worker.is_alive():
+                     self.logger.warning(
+                         f"Worker {i} did not stop gracefully, terminating..."
+                     )
+                     worker.terminate()
+                     worker.join(timeout=5.0)
+
+                     if worker.is_alive():
+                         self.logger.error(f"Worker {i} could not be stopped!")
+                     else:
+                         self.logger.info(f"Worker {i} terminated")
+                 else:
+                     self.logger.info(f"Worker {i} stopped gracefully")
+
+         # Final summary
+         self.logger.info("="*60)
+         self.logger.info("SHUTDOWN COMPLETE")
+         self.logger.info("="*60)
+         self._print_final_summary()
+
+     def _print_final_summary(self):
+         """Print a final summary of worker status."""
+         total_cameras_assigned = sum(
+             len(cameras)
+             for cameras in self.worker_camera_assignments.values()
+         )
+
+         self.logger.info(f"Total cameras assigned: {total_cameras_assigned}")
+         self.logger.info(f"Workers started: {len(self.workers)}")
+
+         # Count workers by exit status
+         normal_exits = sum(1 for w in self.workers if w.exitcode == 0)
+         error_exits = sum(1 for w in self.workers if w.exitcode != 0 and w.exitcode is not None)
+         still_alive = sum(1 for w in self.workers if w.is_alive())
+
+         self.logger.info(
+             f"Exit status: {normal_exits} normal, {error_exits} errors, "
+             f"{still_alive} still alive"
+         )
+
+         # Last health reports
+         if self.last_health_reports:
+             self.logger.info("Last health reports:")
+             for worker_id in sorted(self.last_health_reports.keys()):
+                 report = self.last_health_reports[worker_id]
+                 self.logger.info(
+                     f"  Worker {worker_id}: {report['status']}, "
+                     f"{report.get('active_cameras', 0)} cameras"
+                 )
+
+     def run(self, duration: Optional[float] = None):
+         """Start workers and monitor until stopped.
+
+         This is the main entry point that combines start(), monitor(), and stop().
+
+         Args:
+             duration: How long to run (None = until interrupted)
+         """
+         try:
+             # Set up signal handlers for graceful shutdown
+             signal.signal(signal.SIGINT, self._signal_handler)
+             signal.signal(signal.SIGTERM, self._signal_handler)
+
+             # Start all workers
+             self.start()
+
+             # Monitor
+             self.monitor(duration=duration)
+
+         except Exception as exc:
+             self.logger.error(f"Error in run loop: {exc}", exc_info=True)
+
+         finally:
+             # Always clean up
+             self.stop()
+
+     def _signal_handler(self, signum, frame):
+         """Handle shutdown signals gracefully."""
+         signal_name = signal.Signals(signum).name
+         self.logger.info(f"Received {signal_name}, initiating graceful shutdown...")
+         self.stop_event.set()
+
+     # ========================================================================
+     # Dynamic Camera Management Methods
+     # ========================================================================
+
+     def add_camera(self, camera_config: Dict[str, Any]) -> bool:
+         """Add a camera to the least-loaded worker at runtime.
+
+         Args:
+             camera_config: Camera configuration dictionary with stream_key, source, etc.
+
+         Returns:
+             bool: True if camera was added successfully
+         """
+         stream_key = camera_config.get('stream_key')
+
+         if not stream_key:
+             self.logger.error("Camera config missing stream_key")
+             return False
+
+         if stream_key in self.camera_to_worker:
+             self.logger.warning(f"Camera {stream_key} already exists, use update_camera instead")
+             return False
+
+         # Find the least-loaded worker that's not at capacity
+         target_worker_id = self._find_least_loaded_worker()
+
+         if target_worker_id is None:
+             self.logger.error("All workers at capacity, cannot add camera")
+             return False
+
+         # Send command to worker
+         command = {
+             'type': 'add_camera',
+             'camera_config': camera_config,
+             'timestamp': time.time()
+         }
+
+         try:
+             self.command_queues[target_worker_id].put(command, timeout=5.0)
+
+             # Update tracking (optimistic - will be verified via health report)
+             self.camera_to_worker[stream_key] = target_worker_id
+             self.worker_camera_count[target_worker_id] += 1
+
+             self.logger.info(
+                 f"Sent add_camera command for {stream_key} to worker {target_worker_id}"
+             )
+             return True
+
+         except Exception as exc:
+             self.logger.error(f"Failed to send add_camera command: {exc}")
+             return False
+
+     def remove_camera(self, stream_key: str) -> bool:
+         """Remove a camera from its assigned worker.
+
+         Args:
+             stream_key: Unique identifier for the camera stream
+
+         Returns:
+             bool: True if camera removal was initiated
+         """
+         if stream_key not in self.camera_to_worker:
+             self.logger.warning(f"Camera {stream_key} not found in any worker")
+             return False
+
+         worker_id = self.camera_to_worker[stream_key]
+
+         command = {
+             'type': 'remove_camera',
+             'stream_key': stream_key,
+             'timestamp': time.time()
+         }
+
+         try:
+             self.command_queues[worker_id].put(command, timeout=5.0)
+
+             # Update tracking
+             del self.camera_to_worker[stream_key]
+             self.worker_camera_count[worker_id] -= 1
+
+             self.logger.info(
+                 f"Sent remove_camera command for {stream_key} to worker {worker_id}"
+             )
+             return True
+
+         except Exception as exc:
+             self.logger.error(f"Failed to send remove_camera command: {exc}")
+             return False
+
+     def update_camera(self, camera_config: Dict[str, Any]) -> bool:
+         """Update a camera's configuration (removes and re-adds it with the new config).
+
+         Args:
+             camera_config: Updated camera configuration
+
+         Returns:
+             bool: True if update was initiated
+         """
+         stream_key = camera_config.get('stream_key')
+
+         if not stream_key:
+             self.logger.error("Camera config missing stream_key")
+             return False
+
+         if stream_key not in self.camera_to_worker:
+             self.logger.warning(f"Camera {stream_key} not found, adding instead")
+             return self.add_camera(camera_config)
+
+         worker_id = self.camera_to_worker[stream_key]
+
+         command = {
+             'type': 'update_camera',
+             'camera_config': camera_config,
+             'stream_key': stream_key,
+             'timestamp': time.time()
+         }
+
+         try:
+             self.command_queues[worker_id].put(command, timeout=5.0)
+             self.logger.info(
+                 f"Sent update_camera command for {stream_key} to worker {worker_id}"
+             )
+             return True
+
+         except Exception as exc:
+             self.logger.error(f"Failed to send update_camera command: {exc}")
+             return False
+
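Taken together, the three commands form a small runtime API. A usage sketch, where manager is a running WorkerManager; stream_key and source come from the docstrings above, and the concrete values are illustrative:

    config = {"stream_key": "cam-042", "source": "rtsp://10.0.0.42/stream"}

    manager.add_camera(config)      # routed to the least-loaded live worker
    manager.update_camera({**config, "source": "rtsp://10.0.0.43/stream"})
                                    # re-sent to the same worker with the new config
    manager.remove_camera("cam-042")  # frees a slot on that worker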
+     def _find_least_loaded_worker(self) -> Optional[int]:
+         """Find the worker with the fewest cameras that's not at capacity.
+
+         Returns:
+             Worker ID, or None if all workers are at capacity
+         """
+         # Filter for workers that have capacity and are alive
+         available_workers = []
+         for worker_id, count in self.worker_camera_count.items():
+             if count < self.max_cameras_per_worker and worker_id in self.command_queues:
+                 # Check if worker is still alive
+                 if worker_id < len(self.workers) and self.workers[worker_id].is_alive():
+                     available_workers.append((worker_id, count))
+
+         if not available_workers:
+             return None
+
+         # Return the worker with the fewest cameras
+         return min(available_workers, key=lambda x: x[1])[0]
+
+     def get_camera_assignments(self) -> Dict[str, int]:
+         """Get current camera-to-worker assignments.
+
+         Returns:
+             Dict mapping stream_key to worker_id
+         """
+         return self.camera_to_worker.copy()
+
+     def _flush_health_queue(self):
+         """Consume all pending health reports from the queue."""
+         while not self.health_queue.empty():
+             try:
+                 report = self.health_queue.get_nowait()
+                 worker_id = report.get('worker_id')
+                 if worker_id is not None:
+                     self.last_health_reports[worker_id] = report
+             except Exception:
+                 break
+
+     def get_worker_statistics(self) -> Dict[str, Any]:
+         """Get detailed statistics about workers and cameras.
+
+         Returns:
+             Dict with worker statistics for metrics/monitoring
+         """
+         # First, flush all pending health reports from the queue
+         self._flush_health_queue()
+
+         # Aggregate per-camera stats from all worker health reports
+         per_camera_stats = {}
+         for worker_id, report in self.last_health_reports.items():
+             worker_camera_stats = report.get('per_camera_stats', {})
+             per_camera_stats.update(worker_camera_stats)
+
+         return {
+             'num_workers': len(self.workers),
+             'running_workers': sum(1 for w in self.workers if w.is_alive()),
+             'total_cameras': sum(self.worker_camera_count.values()),
+             'camera_assignments': self.camera_to_worker.copy(),
+             'worker_camera_counts': self.worker_camera_count.copy(),
+             'health_reports': {
+                 worker_id: {
+                     'status': report.get('status', 'unknown'),
+                     'active_cameras': report.get('active_cameras', 0),
+                     'timestamp': report.get('timestamp', 0),
+                     'metrics': report.get('metrics', {}),
+                 }
+                 for worker_id, report in self.last_health_reports.items()
+             },
+             'per_camera_stats': per_camera_stats,
+         }
+
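The returned dict is shaped for a metrics reporter. A sketch of a consumer, using only the keys constructed above (the print format is illustrative):

    stats = manager.get_worker_statistics()
    print(f"{stats['running_workers']}/{stats['num_workers']} workers up, "
          f"{stats['total_cameras']} cameras")
    for worker_id, health in stats["health_reports"].items():
        print(f"  worker {worker_id}: {health['status']} "
              f"({health['active_cameras']} cameras)")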
+     def __enter__(self):
+         """Context manager entry."""
+         self.start()
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         """Context manager exit."""
+         self.stop()
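End to end, the class is driven either through run() (which installs SIGINT/SIGTERM handlers, monitors, then stops) or as a context manager. A minimal driving sketch with placeholder camera configs; real stream_config values (Redis/Kafka settings) are omitted:

    configs = [{"stream_key": f"cam-{i}", "source": f"video{i}.mp4"} for i in range(8)]
    stream_config = {"enable_batching": True}  # endpoints omitted

    # Option 1: blocking run with signal handling and cleanup in finally
    WorkerManager(configs, stream_config).run(duration=60.0)

    # Option 2: context manager (start() on enter, stop() on exit)
    with WorkerManager(configs, stream_config, num_workers=4) as manager:
        manager.monitor(duration=60.0)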