matrice-inference 0.1.2 (matrice_inference-0.1.2-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of matrice-inference might be problematic.

Files changed (37)
  1. matrice_inference/__init__.py +72 -0
  2. matrice_inference/py.typed +0 -0
  3. matrice_inference/server/__init__.py +23 -0
  4. matrice_inference/server/inference_interface.py +176 -0
  5. matrice_inference/server/model/__init__.py +1 -0
  6. matrice_inference/server/model/model_manager.py +274 -0
  7. matrice_inference/server/model/model_manager_wrapper.py +550 -0
  8. matrice_inference/server/model/triton_model_manager.py +290 -0
  9. matrice_inference/server/model/triton_server.py +1248 -0
  10. matrice_inference/server/proxy_interface.py +371 -0
  11. matrice_inference/server/server.py +1004 -0
  12. matrice_inference/server/stream/__init__.py +0 -0
  13. matrice_inference/server/stream/app_deployment.py +228 -0
  14. matrice_inference/server/stream/consumer_worker.py +201 -0
  15. matrice_inference/server/stream/frame_cache.py +127 -0
  16. matrice_inference/server/stream/inference_worker.py +163 -0
  17. matrice_inference/server/stream/post_processing_worker.py +230 -0
  18. matrice_inference/server/stream/producer_worker.py +147 -0
  19. matrice_inference/server/stream/stream_pipeline.py +451 -0
  20. matrice_inference/server/stream/utils.py +23 -0
  21. matrice_inference/tmp/abstract_model_manager.py +58 -0
  22. matrice_inference/tmp/aggregator/__init__.py +18 -0
  23. matrice_inference/tmp/aggregator/aggregator.py +330 -0
  24. matrice_inference/tmp/aggregator/analytics.py +906 -0
  25. matrice_inference/tmp/aggregator/ingestor.py +438 -0
  26. matrice_inference/tmp/aggregator/latency.py +597 -0
  27. matrice_inference/tmp/aggregator/pipeline.py +968 -0
  28. matrice_inference/tmp/aggregator/publisher.py +431 -0
  29. matrice_inference/tmp/aggregator/synchronizer.py +594 -0
  30. matrice_inference/tmp/batch_manager.py +239 -0
  31. matrice_inference/tmp/overall_inference_testing.py +338 -0
  32. matrice_inference/tmp/triton_utils.py +638 -0
  33. matrice_inference-0.1.2.dist-info/METADATA +28 -0
  34. matrice_inference-0.1.2.dist-info/RECORD +37 -0
  35. matrice_inference-0.1.2.dist-info/WHEEL +5 -0
  36. matrice_inference-0.1.2.dist-info/licenses/LICENSE.txt +21 -0
  37. matrice_inference-0.1.2.dist-info/top_level.txt +1 -0
matrice_inference/tmp/aggregator/pipeline.py
@@ -0,0 +1,968 @@
1
+ import logging
2
+ import time
3
+ from typing import Dict
4
+ from queue import Queue
5
+ from matrice_common.session import Session
6
+ from matrice_inference.tmp.aggregator.ingestor import ResultsIngestor
7
+ from matrice_inference.tmp.aggregator.synchronizer import ResultsSynchronizer
8
+ from matrice_inference.tmp.aggregator.aggregator import ResultsAggregator
9
+ from matrice_inference.tmp.aggregator.publisher import ResultsPublisher
10
+ from matrice_inference.tmp.aggregator.analytics import AnalyticsSummarizer
11
+ from matrice_inference.tmp.aggregator.latency import LatencyTracker
12
+
13
+
14
+ class ResultsAggregationPipeline:
15
+ """
16
+ Enhanced deployments aggregator that handles multiple streams, synchronizes results,
17
+ and outputs aggregated results to Kafka topics with consistent structure.
18
+
19
+ This class orchestrates the complete pipeline for collecting, synchronizing, and
20
+ publishing results from multiple ML model deployments in an inference pipeline,
21
+ ensuring all results follow the same structure as individual deployment results.
22
+
23
+ Usage Example:
24
+ ```python
25
+ from matrice import Session
26
+ from matrice_inference.tmp.aggregator import ResultsAggregationPipeline
27
+
28
+ # Initialize session
29
+ session = Session(account_number="...", access_key="...", secret_key="...")
30
+
31
+ # Create aggregator for an inference pipeline
32
+ aggregator = ResultsAggregationPipeline(session, "your-action-record-id")
33
+
34
+ # Setup the aggregation pipeline
35
+ if aggregator.setup_components():
36
+ print(f"Setup complete for {len(aggregator.deployment_ids)} deployments")
37
+
38
+ # Start streaming and run until keyboard interrupt
39
+ try:
40
+ aggregator.start_streaming()
41
+ except KeyboardInterrupt:
42
+ print("Pipeline stopped by user")
43
+ finally:
44
+ aggregator.cleanup()
45
+ ```
46
+ """
47
+
48
+ def __init__(self, session: Session, action_record_id: str):
49
+ """
50
+ Initialize the deployments aggregator.
51
+
52
+ Args:
53
+ session: Session object for authentication
54
+ action_record_id: Action Record ID
55
+ """
56
+ self.session = session
57
+ self.rpc = session.rpc
58
+ self.action_record_id = action_record_id
59
+ url = f"/v1/project/action/{self.action_record_id}/details"
60
+ self.action_doc = self.rpc.get(url)["data"]
61
+ self.action_type = self.action_doc["action"]
62
+ self.job_params = self.action_doc["jobParams"]
63
+ self.action_details = self.action_doc["actionDetails"]
64
+
65
+ self.inference_pipeline_id = self.job_params["inference_pipeline_id"]
66
+ self.aggregator_id = self.job_params["aggregator_id"]
67
+
68
+ # self.inference_pipeline = InferencePipeline(session, pipeline_id=self.inference_pipeline_id) # TODO: Replace the usage with api call
69
+ self.inference_pipeline = None
70
+
71
+ # Initialize components
72
+ self.results_ingestor = None
73
+ self.results_synchronizer = None
74
+ self.results_aggregator = None
75
+ self.results_publisher = None
76
+ self.analytics_summarizer = None
77
+ self.latency_tracker = None
78
+
79
+ # Initialize the final results queue
80
+ self.final_results_queue = Queue()
81
+
82
+ # Statistics and monitoring
83
+ self.stats = {
84
+ "start_time": None,
85
+ "deployments_created": 0,
86
+ "pipeline_version": "2.0",
87
+ "errors": 0,
88
+ "last_error": None,
89
+ "last_error_time": None,
90
+ "component_status": {
91
+ "ingestor": "not_initialized",
92
+ "synchronizer": "not_initialized",
93
+ "aggregator": "not_initialized",
94
+ "analytics_summarizer": "not_initialized",
95
+ "latency_tracker": "not_initialized",
96
+ "publisher": "not_initialized"
97
+ }
98
+ }
99
+
100
+ # State management
101
+ self.components_setup = False
102
+ self.is_running = False
103
+ self.deployment_ids = []
104
+
105
+ logging.info("Action doc: %s", self.action_doc)
106
+ self.update_status(
107
+ "AGG_ACK",
108
+ "ACK",
109
+ "Action is acknowledged by aggregator",
110
+ )
111
+
112
+ def update_status(
113
+ self,
114
+ step_code: str,
115
+ status: str,
116
+ status_description: str,
117
+ ) -> None:
118
+ """Update status of data preparation.
119
+
120
+ Args:
121
+ step_code: Code indicating current step
122
+ status: Status of step
123
+ status_description: Description of status
124
+ """
125
+ try:
126
+ logging.info(status_description)
127
+ url = "/v1/actions"
128
+ payload = {
129
+ "_id": self.action_record_id,
130
+ "action": self.action_type,
131
+ "serviceName": self.action_doc["serviceName"],
132
+ "stepCode": step_code,
133
+ "status": status,
134
+ "statusDescription": status_description,
135
+ }
136
+
137
+ self.rpc.put(path=url, payload=payload)
138
+ except Exception as exc:
139
+ logging.error(
140
+ "Exception in update_status: %s",
141
+ str(exc),
142
+ )
143
+
144
+
145
+ def setup_components(self) -> bool:
146
+ """
147
+ Setup all components and initialize the aggregation pipeline.
148
+
149
+ Returns:
150
+ bool: True if all components initialized successfully, False otherwise
151
+ """
152
+ try:
153
+ self.components_setup = True
154
+ # Get deployment IDs from the inference pipeline
155
+ self.deployment_ids = self.inference_pipeline.deployment_ids
156
+ if not self.deployment_ids:
157
+ self._record_error("No deployment IDs found in inference pipeline")
158
+ return False
159
+
160
+ self.stats["deployments_created"] = len(self.deployment_ids)
161
+ self.stats["start_time"] = time.time()
162
+
163
+ # Initialize the results ingestor
164
+ logging.info("Initializing results ingestor...")
165
+ self.results_ingestor = ResultsIngestor(
166
+ deployment_ids=self.deployment_ids,
167
+ session=self.session,
168
+ consumer_timeout=300,
169
+ action_id=self.action_record_id
170
+ )
171
+ self.stats["component_status"]["ingestor"] = "initialized"
172
+
173
+ # Initialize the results synchronizer with reasonable timeout
174
+ logging.info("Initializing results synchronizer...")
175
+ self.results_synchronizer = ResultsSynchronizer(
176
+ results_queues=self.results_ingestor.results_queues,
177
+ sync_timeout=300  # 300-second timeout for synchronization
178
+ )
179
+ self.stats["component_status"]["synchronizer"] = "initialized"
180
+
181
+ # Initialize the results aggregator
182
+ logging.info("Initializing results aggregator...")
183
+ self.results_aggregator = ResultsAggregator(
184
+ synchronized_results_queue=self.results_synchronizer.synchronized_results_queue
185
+ )
186
+ self.stats["component_status"]["aggregator"] = "initialized"
187
+
188
+ # Initialize analytics summarizer (5-minute window) - optional component
189
+ logging.info("Initializing analytics summarizer...")
190
+ try:
191
+ self.analytics_summarizer = AnalyticsSummarizer(
192
+ session=self.session,
193
+ inference_pipeline_id=self.inference_pipeline_id,
194
+ flush_interval_seconds=300,
195
+ )
196
+ self.stats["component_status"]["analytics_summarizer"] = "initialized"
197
+ logging.info("Analytics summarizer initialized successfully")
198
+ except Exception as exc:
199
+ logging.error(f"Failed to initialize analytics summarizer (non-critical): {exc}", exc_info=True)
200
+ self.analytics_summarizer = None
201
+ self.stats["component_status"]["analytics_summarizer"] = "disabled"
202
+ logging.warning("Pipeline will continue without analytics summarizer")
203
+
204
+ # Initialize latency tracker (1-minute flush) - optional component
205
+ logging.info("Initializing latency tracker...")
206
+ try:
207
+ self.latency_tracker = LatencyTracker(
208
+ session=self.session,
209
+ inference_pipeline_id=self.inference_pipeline_id,
210
+ flush_interval_seconds=60,
211
+ max_samples=1000,
212
+ )
213
+ self.stats["component_status"]["latency_tracker"] = "initialized"
214
+ logging.info("Latency tracker initialized successfully")
215
+ except Exception as exc:
216
+ logging.error(f"Failed to initialize latency tracker (non-critical): {exc}", exc_info=True)
217
+ self.latency_tracker = None
218
+ self.stats["component_status"]["latency_tracker"] = "disabled"
219
+ logging.warning("Pipeline will continue without latency tracker")
220
+
221
+ # Initialize the results publisher
222
+ logging.info("Initializing results publisher...")
223
+ self.results_publisher = ResultsPublisher(
224
+ inference_pipeline_id=self.inference_pipeline_id,
225
+ session=self.session,
226
+ final_results_queue=self.results_aggregator.aggregated_results_queue,
227
+ analytics_summarizer=self.analytics_summarizer,
228
+ latency_tracker=self.latency_tracker
229
+ )
230
+ self.stats["component_status"]["publisher"] = "initialized"
231
+
232
+ logging.info(f"Successfully initialized aggregation pipeline for {len(self.deployment_ids)} deployments")
233
+ return True
234
+
235
+ except Exception as exc:
236
+ self._record_error(f"Failed to setup components: {str(exc)}")
237
+ return False
238
+
239
+ def start_streaming(self, block: bool = True) -> bool:
240
+ """
241
+ Start the complete streaming pipeline: ingestion, synchronization, aggregation, and publishing.
242
+
243
+ Returns:
244
+ bool: True if streaming started successfully, False otherwise
245
+ """
246
+ if not self.components_setup:
247
+ self.setup_components()
248
+
249
+ if not self.deployment_ids:
250
+ logging.error("No deployments available. Call setup_components() first.")
251
+ return False
252
+
253
+ try:
254
+ if self.is_running:
255
+ logging.warning("Streaming is already running")
256
+ return True
257
+
258
+ self.is_running = True
259
+
260
+ # Start components in order: ingestor -> synchronizer -> aggregator -> publisher
261
+
262
+ # Start results ingestion
263
+ logging.info("Starting results ingestion...")
264
+ if not self.results_ingestor.start_streaming():
265
+ self._record_error("Failed to start results ingestion")
266
+ return False
267
+ self.stats["component_status"]["ingestor"] = "running"
268
+
269
+ # Start results synchronization
270
+ logging.info("Starting results synchronization...")
271
+ if not self.results_synchronizer.start_synchronization():
272
+ self._record_error("Failed to start results synchronization")
273
+ return False
274
+ self.stats["component_status"]["synchronizer"] = "running"
275
+
276
+ # Start results aggregation
277
+ logging.info("Starting results aggregation...")
278
+ if not self.results_aggregator.start_aggregation():
279
+ self._record_error("Failed to start results aggregation")
280
+ return False
281
+ self.stats["component_status"]["aggregator"] = "running"
282
+
283
+ # Start analytics summarizer (if available)
284
+ if self.analytics_summarizer is not None:
285
+ logging.info("Starting analytics summarizer...")
286
+ try:
287
+ if not self.analytics_summarizer.start():
288
+ logging.warning("Analytics summarizer failed to start (non-critical)")
289
+ self.stats["component_status"]["analytics_summarizer"] = "failed"
290
+ else:
291
+ self.stats["component_status"]["analytics_summarizer"] = "running"
292
+ logging.info("Analytics summarizer started successfully")
293
+ except Exception as exc:
294
+ logging.warning(f"Failed to start analytics summarizer (non-critical): {exc}")
295
+ self.stats["component_status"]["analytics_summarizer"] = "failed"
296
+ else:
297
+ logging.info("Analytics summarizer is disabled, skipping startup")
298
+ self.stats["component_status"]["analytics_summarizer"] = "disabled"
299
+
300
+ # Start latency tracker (if available)
301
+ if self.latency_tracker is not None:
302
+ logging.info("Starting latency tracker...")
303
+ try:
304
+ if not self.latency_tracker.start():
305
+ logging.warning("Latency tracker failed to start (non-critical)")
306
+ self.stats["component_status"]["latency_tracker"] = "failed"
307
+ else:
308
+ self.stats["component_status"]["latency_tracker"] = "running"
309
+ logging.info("Latency tracker started successfully")
310
+ except Exception as exc:
311
+ logging.warning(f"Failed to start latency tracker (non-critical): {exc}")
312
+ self.stats["component_status"]["latency_tracker"] = "failed"
313
+ else:
314
+ logging.info("Latency tracker is disabled, skipping startup")
315
+ self.stats["component_status"]["latency_tracker"] = "disabled"
316
+
317
+ # Start results publishing
318
+ logging.info("Starting results publishing...")
319
+ if not self.results_publisher.start_streaming():
320
+ self._record_error("Failed to start results publishing")
321
+ return False
322
+ self.stats["component_status"]["publisher"] = "running"
323
+
324
+ # Update status to indicate successful startup
325
+ self.update_status(
326
+ "AGG_RUNNING",
327
+ "SUCCESS",
328
+ f"Aggregation pipeline started successfully with {len(self.deployment_ids)} deployments"
329
+ )
330
+
331
+ logging.info("Aggregation pipeline started successfully")
332
+ if block:
333
+ self.start_logging()
334
+ return True
335
+
336
+ except Exception as exc:
337
+ self._record_error(f"Failed to start streaming: {str(exc)}")
338
+ self.stop_streaming()
339
+ return False
340
+
341
+ def start_logging(self, status_interval: int = 30) -> None:
342
+ """
343
+ Start the pipeline logging and run until interrupted.
344
+ Args:
345
+ status_interval: Interval in seconds between status log messages
346
+ """
347
+ try:
348
+ logging.info("=" * 60)
349
+ logging.info("🚀 Aggregation pipeline is running!")
350
+ logging.info(f"📊 Processing results from {len(self.deployment_ids)} deployments")
351
+ logging.info(f"🔗 Inference Pipeline ID: {self.inference_pipeline_id}")
352
+ if self.deployment_ids:
353
+ logging.info(f"🎯 Deployment IDs: {', '.join(self.deployment_ids)}")
354
+ logging.info("💡 Press Ctrl+C to stop the pipeline")
355
+ logging.info("=" * 60)
356
+
357
+ last_status_time = time.time()
358
+
359
+ # Main loop - run until interrupted
360
+ while True:
361
+ try:
362
+ current_time = time.time()
363
+
364
+ # Periodic status logging
365
+ if current_time - last_status_time >= status_interval:
366
+ self._log_pipeline_status()
367
+ last_status_time = current_time
368
+
369
+ # Check pipeline health
370
+ health = self.get_health_status()
371
+ overall_status = health.get("overall_status")
372
+
373
+ if overall_status == "unhealthy":
374
+ issues = health.get("issues", [])
375
+ logging.error(f"Pipeline is UNHEALTHY with {len(issues)} critical issues:")
376
+ for i, issue in enumerate(issues, 1):
377
+ logging.error(f" {i}. {issue}")
378
+ logging.error("Pipeline will continue running but may need intervention")
379
+
380
+ elif overall_status == "degraded":
381
+ issues = health.get("issues", [])
382
+ logging.warning(f"Pipeline is DEGRADED with {len(issues)} issues:")
383
+ for i, issue in enumerate(issues, 1):
384
+ logging.warning(f" {i}. {issue}")
385
+
386
+ # Sleep for a short time to prevent busy waiting
387
+ time.sleep(1.0)
388
+
389
+ except KeyboardInterrupt:
390
+ # Re-raise to be caught by outer handler
391
+ raise
392
+ except Exception as exc:
393
+ logging.error(f"Error in main pipeline loop: {exc}")
394
+ # Continue running unless it's a critical error
395
+ time.sleep(5.0)
396
+
397
+ except KeyboardInterrupt:
398
+ logging.info("")
399
+ logging.info("🛑 Keyboard interrupt received - stopping pipeline...")
400
+
401
+ except Exception as exc:
402
+ logging.error(f"Critical error in pipeline: {exc}")
403
+ self._record_error(f"Critical pipeline error: {str(exc)}")
404
+
405
+ finally:
406
+ # Always cleanup
407
+ try:
408
+ logging.info("🧹 Cleaning up pipeline resources...")
409
+ self.cleanup()
410
+ logging.info("✅ Pipeline stopped successfully")
411
+ except KeyboardInterrupt:
412
+ # Handle second Ctrl+C during cleanup
413
+ logging.warning("⚠️ Second interrupt received during cleanup - forcing exit...")
414
+ try:
415
+ # Try quick cleanup
416
+ self.stop_streaming()
417
+ except:
418
+ pass
419
+ logging.info("✅ Pipeline force-stopped")
420
+ except Exception as exc:
421
+ logging.error(f"Error during cleanup: {exc}")
422
+
423
+ def _log_pipeline_status(self):
424
+ """Log current pipeline status and statistics."""
425
+ try:
426
+ stats = self.get_stats()
427
+ health = self.get_health_status()
428
+
429
+ logging.info("📈 Pipeline Status Report:")
430
+ logging.info(f" ⏱️ Runtime: {stats.get('runtime_seconds', 0):.1f} seconds")
431
+ logging.info(f" 🔄 Overall Health: {health.get('overall_status', 'unknown')}")
432
+
433
+ # Log health issues with details
434
+ issues = health.get("issues", [])
435
+ if issues:
436
+ logging.warning(f" ⚠️ Health Issues ({len(issues)}):")
437
+ for i, issue in enumerate(issues, 1):
438
+ logging.warning(f" {i}. {issue}")
439
+
440
+ # Component stats with error details
441
+ components = stats.get("components", {})
442
+
443
+ if "results_ingestor" in components:
444
+ ingestor_stats = components["results_ingestor"]
445
+ logging.info(f" 📥 Results Consumed: {ingestor_stats.get('results_consumed', 0)}")
446
+ if ingestor_stats.get("errors", 0) > 0:
447
+ logging.warning(f" └─ Ingestor Errors: {ingestor_stats['errors']} (last: {ingestor_stats.get('last_error', 'N/A')})")
448
+
449
+ if "results_synchronizer" in components:
450
+ sync_stats = components["results_synchronizer"]
451
+ logging.info(f" 🔗 Results Synchronized: {sync_stats.get('results_synchronized', 0)}")
452
+ logging.info(f" ✅ Complete Syncs: {sync_stats.get('complete_syncs', 0)}")
453
+ partial_syncs = sync_stats.get('partial_syncs', 0)
454
+ if partial_syncs > 0:
455
+ logging.warning(f" ⚠️ Partial Syncs: {partial_syncs}")
456
+ if sync_stats.get("errors", 0) > 0:
457
+ logging.warning(f" └─ Sync Errors: {sync_stats['errors']} (last: {sync_stats.get('last_error', 'N/A')})")
458
+
459
+ # Log sync performance details
460
+ completion_rate = sync_stats.get('completion_rate', 0.0)
461
+ avg_sync_time = sync_stats.get('avg_sync_time', 0.0)
462
+ if completion_rate < 0.9:
463
+ logging.warning(f" └─ Low Completion Rate: {completion_rate:.1%}")
464
+ if avg_sync_time > 5.0: # More than 5 seconds average
465
+ logging.warning(f" └─ High Avg Sync Time: {avg_sync_time:.2f}s")
466
+
467
+ if "results_aggregator" in components:
468
+ agg_stats = components["results_aggregator"]
469
+ logging.info(f" 🎯 Results Aggregated: {agg_stats.get('aggregations_created', 0)}")
470
+ if agg_stats.get("errors", 0) > 0:
471
+ logging.warning(f" └─ Aggregator Errors: {agg_stats['errors']} (last: {agg_stats.get('last_error', 'N/A')})")
472
+
473
+ if "analytics_summarizer" in components:
474
+ sum_stats = components["analytics_summarizer"]
475
+ if isinstance(sum_stats, dict) and sum_stats.get("summaries_published") is not None:
476
+ logging.info(f" 🧮 Summaries Published: {sum_stats.get('summaries_published', 0)}")
477
+ logging.info(f" 📍 Location Summaries: {sum_stats.get('location_summaries_published', 0)}")
478
+ logging.info(f" 🚨 Incidents Published: {sum_stats.get('incidents_published', 0)}")
479
+ if sum_stats.get("errors", 0) > 0:
480
+ logging.warning(f" └─ Summarizer Errors: {sum_stats['errors']} (last: {sum_stats.get('last_error', 'N/A')})")
481
+ else:
482
+ logging.info(" 🧮 Analytics: Disabled")
483
+
484
+ if "latency_tracker" in components:
485
+ lat_stats = components["latency_tracker"]
486
+ if isinstance(lat_stats, dict) and lat_stats.get("latency_reports_published") is not None:
487
+ logging.info(f" 📊 Latency Reports: {lat_stats.get('latency_reports_published', 0)}")
488
+ logging.info(f" ⚡ Alerts Triggered: {lat_stats.get('alerts_triggered', 0)}")
489
+ if lat_stats.get("errors", 0) > 0:
490
+ logging.warning(f" └─ Latency Tracker Errors: {lat_stats['errors']} (last: {lat_stats.get('last_error', 'N/A')})")
491
+ else:
492
+ logging.info(" 📊 Latency Tracking: Disabled")
493
+
494
+ if "results_publisher" in components:
495
+ pub_stats = components["results_publisher"]
496
+ logging.info(f" 📤 Messages Published: {pub_stats.get('messages_produced', 0)}")
497
+ kafka_errors = pub_stats.get('kafka_errors', 0)
498
+ validation_errors = pub_stats.get('validation_errors', 0)
499
+ if kafka_errors > 0 or validation_errors > 0:
500
+ logging.warning(f" └─ Publisher Errors: {kafka_errors} kafka, {validation_errors} validation")
501
+
502
+ # Pipeline metrics
503
+ pipeline_metrics = stats.get("pipeline_metrics", {})
504
+ if pipeline_metrics:
505
+ throughput = pipeline_metrics.get('throughput', 0)
506
+ completion_rate = pipeline_metrics.get('completion_rate', 0)
507
+ error_rate = pipeline_metrics.get('error_rate', 0)
508
+
509
+ logging.info(f" 🚀 Throughput: {throughput:.2f} msg/sec")
510
+ logging.info(f" 📊 Completion Rate: {completion_rate:.1%}")
511
+
512
+ if error_rate > 0.05: # More than 5% error rate
513
+ logging.warning(f" ❌ Error Rate: {error_rate:.1%}")
514
+ elif error_rate > 0:
515
+ logging.info(f" 📉 Error Rate: {error_rate:.1%}")
516
+
517
+ logging.info("─" * 50)
518
+
519
+ except Exception as exc:
520
+ logging.error(f"Error logging pipeline status: {exc}")
521
+ # Log basic fallback info
522
+ try:
523
+ health = self.get_health_status()
524
+ logging.error(f"Pipeline health: {health.get('overall_status', 'unknown')}, Issues: {len(health.get('issues', []))}")
525
+ except Exception:
526
+ logging.error("Unable to retrieve basic health status")
527
+
528
+ def stop_streaming(self):
529
+ """Stop all streaming operations in reverse order."""
530
+ logging.info("Stopping aggregation pipeline...")
531
+
532
+ if not self.is_running:
533
+ logging.info("Streaming is not running")
534
+ return
535
+
536
+ # Update status to indicate shutdown is starting
537
+ self.update_status(
538
+ "AGG_SHUTDOWN",
539
+ "IN_PROGRESS",
540
+ "Aggregation pipeline shutdown initiated"
541
+ )
542
+
543
+ self.is_running = False
544
+
545
+ # Stop components in reverse order: publisher -> aggregator -> synchronizer -> ingestor
546
+ if self.results_publisher:
547
+ try:
548
+ logging.info("Stopping results publisher...")
549
+ self.results_publisher.stop_streaming()
550
+ self.stats["component_status"]["publisher"] = "stopped"
551
+ except Exception as exc:
552
+ logging.error(f"Error stopping results publisher: {exc}")
553
+
554
+ if self.analytics_summarizer is not None:
555
+ try:
556
+ logging.info("Stopping analytics summarizer...")
557
+ self.analytics_summarizer.stop()
558
+ self.stats["component_status"]["analytics_summarizer"] = "stopped"
559
+ except Exception as exc:
560
+ logging.error(f"Error stopping analytics summarizer: {exc}")
561
+
562
+ if self.latency_tracker:
563
+ try:
564
+ logging.info("Stopping latency tracker...")
565
+ self.latency_tracker.stop()
566
+ self.stats["component_status"]["latency_tracker"] = "stopped"
567
+ except Exception as exc:
568
+ logging.error(f"Error stopping latency tracker: {exc}")
569
+
570
+ if self.results_aggregator:
571
+ try:
572
+ logging.info("Stopping results aggregator...")
573
+ self.results_aggregator.stop_aggregation()
574
+ self.stats["component_status"]["aggregator"] = "stopped"
575
+ except Exception as exc:
576
+ logging.error(f"Error stopping results aggregator: {exc}")
577
+
578
+ if self.results_synchronizer:
579
+ try:
580
+ logging.info("Stopping results synchronizer...")
581
+ self.results_synchronizer.stop_synchronization()
582
+ self.stats["component_status"]["synchronizer"] = "stopped"
583
+ except Exception as exc:
584
+ logging.error(f"Error stopping results synchronization: {exc}")
585
+
586
+ if self.results_ingestor:
587
+ try:
588
+ logging.info("Stopping results ingestor...")
589
+ self.results_ingestor.stop_streaming()
590
+ self.stats["component_status"]["ingestor"] = "stopped"
591
+ except Exception as exc:
592
+ logging.error(f"Error stopping results ingestion: {exc}")
593
+
594
+ # Update status to indicate successful shutdown
595
+ self.update_status(
596
+ "AGG_SHUTDOWN",
597
+ "SUCCESS",
598
+ "Aggregation pipeline stopped successfully"
599
+ )
600
+
601
+ logging.info("Aggregation pipeline stopped")
602
+
603
+ def get_stats(self) -> Dict:
604
+ """Get current statistics from all components."""
605
+ stats = self.stats.copy()
606
+ if stats["start_time"]:
607
+ stats["runtime_seconds"] = time.time() - stats["start_time"]
608
+
609
+ # Add component statistics
610
+ stats["components"] = {}
611
+
612
+ if self.results_ingestor:
613
+ stats["components"]["results_ingestor"] = self.results_ingestor.get_stats()
614
+
615
+ if self.results_synchronizer:
616
+ stats["components"]["results_synchronizer"] = self.results_synchronizer.get_stats()
617
+
618
+ if self.results_aggregator:
619
+ stats["components"]["results_aggregator"] = self.results_aggregator.get_stats()
620
+
621
+ if self.analytics_summarizer is not None:
622
+ stats["components"]["analytics_summarizer"] = self.analytics_summarizer.get_stats()
623
+
624
+ if self.latency_tracker is not None:
625
+ stats["components"]["latency_tracker"] = self.latency_tracker.get_stats()
626
+
627
+ if self.results_publisher:
628
+ stats["components"]["results_publisher"] = self.results_publisher.get_stats()
629
+
630
+ # Add pipeline-level metrics
631
+ stats["pipeline_metrics"] = self._calculate_pipeline_metrics()
632
+
633
+ return stats
634
+
635
+ def _calculate_pipeline_metrics(self) -> Dict:
636
+ """Calculate pipeline-level performance metrics."""
637
+ metrics = {
638
+ "throughput": 0.0,
639
+ "latency": 0.0,
640
+ "error_rate": 0.0,
641
+ "completion_rate": 0.0,
642
+ }
643
+
644
+ try:
645
+ # Calculate throughput (messages per second)
646
+ if self.stats["start_time"]:
647
+ runtime = time.time() - self.stats["start_time"]
648
+ if runtime > 0 and self.results_publisher:
649
+ publisher_stats = self.results_publisher.get_stats()
650
+ metrics["throughput"] = publisher_stats.get("messages_produced", 0) / runtime
651
+
652
+ # Calculate completion rate from synchronizer
653
+ if self.results_synchronizer:
654
+ sync_stats = self.results_synchronizer.get_stats()
655
+ total_syncs = sync_stats.get("complete_syncs", 0) + sync_stats.get("partial_syncs", 0)
656
+ if total_syncs > 0:
657
+ metrics["completion_rate"] = sync_stats.get("complete_syncs", 0) / total_syncs
658
+
659
+ # Calculate error rate
660
+ total_errors = self.stats["errors"]
661
+ total_processed = 0
662
+
663
+ if self.results_ingestor:
664
+ ingestor_stats = self.results_ingestor.get_stats()
665
+ total_processed += ingestor_stats.get("results_consumed", 0)
666
+ total_errors += ingestor_stats.get("errors", 0)
667
+
668
+ if total_processed > 0:
669
+ metrics["error_rate"] = total_errors / total_processed
670
+
671
+ # Calculate average latency from synchronizer
672
+ if self.results_synchronizer:
673
+ sync_stats = self.results_synchronizer.get_stats()
674
+ metrics["latency"] = sync_stats.get("avg_sync_time", 0.0)
675
+
676
+ except Exception as exc:
677
+ logging.error(f"Error calculating pipeline metrics: {exc}")
678
+
679
+ return metrics
680
+
681
+ def get_health_status(self) -> Dict:
682
+ """Get health status of all components."""
683
+ health = {
684
+ "overall_status": "healthy",
685
+ "is_running": self.is_running,
686
+ "pipeline_version": self.stats["pipeline_version"],
687
+ "deployment_count": len(self.deployment_ids),
688
+ "components": {},
689
+ "issues": [],
690
+ }
691
+
692
+ try:
693
+ # Check components health with detailed logging
694
+ if self.results_ingestor:
695
+ ingestor_health = self.results_ingestor.get_health_status()
696
+ health["components"]["results_ingestor"] = ingestor_health
697
+ if ingestor_health.get("status") != "healthy":
698
+ issue_detail = f"Results ingestor is {ingestor_health.get('status', 'unknown')}"
699
+ if "reason" in ingestor_health:
700
+ issue_detail += f": {ingestor_health['reason']}"
701
+ if ingestor_health.get("errors", 0) > 0:
702
+ issue_detail += f" ({ingestor_health['errors']} errors)"
703
+ health["issues"].append(issue_detail)
704
+ logging.warning(f"Ingestor health issue: {issue_detail}")
705
+ else:
706
+ health["issues"].append("Results ingestor not initialized")
707
+ logging.error("Results ingestor not initialized")
708
+
709
+ if self.results_synchronizer:
710
+ sync_health = self.results_synchronizer.get_health_status()
711
+ health["components"]["results_synchronizer"] = sync_health
712
+ if sync_health.get("status") != "healthy":
713
+ issue_detail = f"Results synchronizer is {sync_health.get('status', 'unknown')}"
714
+ if "issue" in sync_health:
715
+ issue_detail += f": {sync_health['issue']}"
716
+ if "recent_error" in sync_health:
717
+ issue_detail += f" (recent error: {sync_health['recent_error']})"
718
+ if sync_health.get("completion_rate", 1.0) < 0.8:
719
+ issue_detail += f" (completion rate: {sync_health.get('completion_rate', 0):.1%})"
720
+ health["issues"].append(issue_detail)
721
+ logging.warning(f"Synchronizer health issue: {issue_detail}")
722
+ else:
723
+ health["issues"].append("Results synchronizer not initialized")
724
+ logging.error("Results synchronizer not initialized")
725
+
726
+ if self.results_aggregator:
727
+ agg_health = self.results_aggregator.get_health_status()
728
+ health["components"]["results_aggregator"] = agg_health
729
+ if agg_health.get("status") != "healthy":
730
+ issue_detail = f"Results aggregator is {agg_health.get('status', 'unknown')}"
731
+ if agg_health.get("errors", 0) > 0:
732
+ issue_detail += f" ({agg_health['errors']} errors)"
733
+ if agg_health.get("output_queue_size", 0) > 100:
734
+ issue_detail += f" (output queue size: {agg_health['output_queue_size']})"
735
+ health["issues"].append(issue_detail)
736
+ logging.warning(f"Aggregator health issue: {issue_detail}")
737
+ else:
738
+ health["issues"].append("Results aggregator not initialized")
739
+ logging.error("Results aggregator not initialized")
740
+
741
+ if self.analytics_summarizer is not None:
742
+ sum_health = self.analytics_summarizer.get_health_status()
743
+ health["components"]["analytics_summarizer"] = sum_health
744
+ if sum_health.get("status") != "healthy":
745
+ issue_detail = f"Analytics summarizer is {sum_health.get('status', 'unknown')}"
746
+ if "reason" in sum_health:
747
+ issue_detail += f": {sum_health['reason']}"
748
+ if sum_health.get("errors", 0) > 0:
749
+ issue_detail += f" ({sum_health['errors']} errors)"
750
+ health["issues"].append(issue_detail)
751
+ logging.warning(f"Summarizer health issue: {issue_detail}")
752
+ else:
753
+ # Analytics summarizer is disabled - this is not an error
754
+ health["components"]["analytics_summarizer"] = {
755
+ "status": "disabled",
756
+ "reason": "Analytics summarizer is disabled due to initialization failure"
757
+ }
758
+ logging.debug("Analytics summarizer is disabled")
759
+
760
+ if self.latency_tracker is not None:
761
+ lat_health = self.latency_tracker.get_health_status()
762
+ health["components"]["latency_tracker"] = lat_health
763
+ if lat_health.get("status") != "healthy":
764
+ issue_detail = f"Latency tracker is {lat_health.get('status', 'unknown')}"
765
+ if "reason" in lat_health:
766
+ issue_detail += f": {lat_health['reason']}"
767
+ if lat_health.get("errors", 0) > 0:
768
+ issue_detail += f" ({lat_health['errors']} errors)"
769
+ health["issues"].append(issue_detail)
770
+ logging.warning(f"Latency tracker health issue: {issue_detail}")
771
+ else:
772
+ # Latency tracker is disabled - this is not an error
773
+ health["components"]["latency_tracker"] = {
774
+ "status": "disabled",
775
+ "reason": "Latency tracker is disabled due to initialization failure"
776
+ }
777
+ logging.debug("Latency tracker is disabled")
778
+
779
+ if self.results_publisher:
780
+ pub_health = self.results_publisher.get_health_status()
781
+ health["components"]["results_publisher"] = pub_health
782
+ if pub_health.get("status") != "healthy":
783
+ issue_detail = f"Results publisher is {pub_health.get('status', 'unknown')}"
784
+ if "reason" in pub_health:
785
+ issue_detail += f": {pub_health['reason']}"
786
+ if "last_error" in pub_health:
787
+ issue_detail += f" (last error: {pub_health['last_error']})"
788
+ if pub_health.get("kafka_errors", 0) > 0:
789
+ issue_detail += f" ({pub_health['kafka_errors']} kafka errors)"
790
+ health["issues"].append(issue_detail)
791
+ logging.warning(f"Publisher health issue: {issue_detail}")
792
+ else:
793
+ health["issues"].append("Results publisher not initialized")
794
+ logging.error("Results publisher not initialized")
795
+
796
+ # Determine overall status with logging
797
+ issue_count = len(health["issues"])
798
+ if issue_count > 0:
799
+ if issue_count >= 2:
800
+ health["overall_status"] = "unhealthy"
801
+ logging.error(f"Pipeline is UNHEALTHY with {issue_count} issues: {'; '.join(health['issues'])}")
802
+ else:
803
+ health["overall_status"] = "degraded"
804
+ logging.warning(f"Pipeline is DEGRADED with {issue_count} issue: {health['issues'][0]}")
805
+ else:
806
+ logging.debug("Pipeline health check: all components healthy")
807
+
808
+ except Exception as exc:
809
+ health["overall_status"] = "unhealthy"
810
+ health["error"] = str(exc)
811
+ error_msg = f"Error checking health: {str(exc)}"
812
+ health["issues"].append(error_msg)
813
+ logging.error(f"Pipeline health check failed: {error_msg}")
814
+
815
+ return health
816
+
817
+ def get_deployment_info(self) -> Dict:
818
+ """
819
+ Get information about the deployments in this aggregator.
820
+
821
+ Returns:
822
+ Dict: Deployment information including IDs, count, and status
823
+ """
824
+ return {
825
+ "inference_pipeline_id": self.inference_pipeline_id,
826
+ "deployment_ids": self.deployment_ids,
827
+ "deployment_count": len(self.deployment_ids),
828
+ "pipeline_status": getattr(self.inference_pipeline, 'status', None),
829
+ "aggregator_running": self.is_running,
830
+ "component_status": self.stats["component_status"].copy(),
831
+ }
832
+
833
+ def wait_for_ready(self, timeout: int = 300, poll_interval: int = 10) -> bool:
834
+ """
835
+ Wait for the aggregator to be ready and processing results.
836
+
837
+ Args:
838
+ timeout: Maximum time to wait in seconds
839
+ poll_interval: Time between checks in seconds
840
+
841
+ Returns:
842
+ bool: True if aggregator is ready, False if timeout
843
+ """
844
+ if not self.is_running:
845
+ logging.warning("Aggregator is not running")
846
+ return False
847
+
848
+ start_time = time.time()
849
+
850
+ while time.time() - start_time < timeout:
851
+ try:
852
+ health = self.get_health_status()
853
+
854
+ # Check if all components are healthy
855
+ if health.get("overall_status") == "healthy":
856
+ # Check if we're receiving and processing results
857
+ stats = self.get_stats()
858
+ components = stats.get("components", {})
859
+
860
+ ingestor_stats = components.get("results_ingestor", {})
861
+ sync_stats = components.get("results_synchronizer", {})
862
+
863
+ # Consider ready if we're consuming and synchronizing results
864
+ if (ingestor_stats.get("results_consumed", 0) > 0 and
865
+ sync_stats.get("results_synchronized", 0) > 0):
866
+ logging.info("Aggregation pipeline is ready and processing results")
867
+ return True
868
+
869
+ logging.debug(f"Waiting for pipeline readiness... Health: {health.get('overall_status')}")
870
+ time.sleep(poll_interval)
871
+
872
+ except Exception as exc:
873
+ logging.error(f"Error checking aggregator readiness: {exc}")
874
+ time.sleep(poll_interval)
875
+
876
+ logging.warning(f"Aggregation pipeline not ready after {timeout} seconds")
877
+ return False
878
+
879
+ def force_sync_pending_results(self) -> int:
880
+ """
881
+ Force synchronization of all pending results.
882
+
883
+ Returns:
884
+ int: Number of pending results that were synchronized
885
+ """
886
+ if not self.results_synchronizer:
887
+ logging.warning("Results synchronizer not initialized")
888
+ return 0
889
+
890
+ return self.results_synchronizer.force_sync_pending()
891
+
892
+ def _record_error(self, error_message: str):
893
+ """Record an error with timestamp."""
894
+ logging.error(error_message)
895
+ self.stats["errors"] += 1
896
+ self.stats["last_error"] = error_message
897
+ self.stats["last_error_time"] = time.time()
898
+
899
+ def cleanup(self):
900
+ """Clean up all resources."""
901
+ logging.info("Cleaning up aggregation pipeline resources...")
902
+
903
+ # Update status to indicate cleanup is starting
904
+ self.update_status(
905
+ "AGG_CLEANUP",
906
+ "IN_PROGRESS",
907
+ "Aggregation pipeline cleanup initiated"
908
+ )
909
+
910
+ # Stop streaming if running
911
+ if self.is_running:
912
+ self.stop_streaming()
913
+
914
+ # Cleanup components in reverse order
915
+ if self.results_publisher:
916
+ try:
917
+ if hasattr(self.results_publisher, 'cleanup'): self.results_publisher.cleanup()
918
+ except Exception as exc:
919
+ logging.error(f"Error cleaning up publisher: {exc}")
920
+
921
+ if self.results_aggregator:
922
+ try:
923
+ self.results_aggregator.cleanup()
924
+ except Exception as exc:
925
+ logging.error(f"Error cleaning up aggregator: {exc}")
926
+
927
+ if self.analytics_summarizer is not None:
928
+ try:
929
+ self.analytics_summarizer.cleanup()
930
+ except Exception as exc:
931
+ logging.error(f"Error cleaning up analytics summarizer: {exc}")
932
+
933
+ if self.latency_tracker:
934
+ try:
935
+ self.latency_tracker.cleanup()
936
+ except Exception as exc:
937
+ logging.error(f"Error cleaning up latency tracker: {exc}")
938
+
939
+ if self.results_synchronizer:
940
+ try:
941
+ self.results_synchronizer.cleanup()
942
+ except Exception as exc:
943
+ logging.error(f"Error cleaning up synchronizer: {exc}")
944
+
945
+ if self.results_ingestor:
946
+ try:
947
+ self.results_ingestor.cleanup()
948
+ except Exception as exc:
949
+ logging.error(f"Error cleaning up ingestor: {exc}")
950
+
951
+ # Clear the final results queue
952
+ if self.final_results_queue:
953
+ try:
954
+ while not self.final_results_queue.empty():
955
+ self.final_results_queue.get_nowait()
956
+ except Exception:
957
+ pass
958
+
959
+ # Update status to indicate successful cleanup
960
+ self.update_status(
961
+ "AGG_CLEANUP",
962
+ "SUCCESS",
963
+ "Aggregation pipeline cleanup completed successfully"
964
+ )
965
+
966
+ logging.info("Aggregation pipeline cleanup completed")
967
+
968
+
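
For context, a minimal sketch of how this aggregator might be driven end to end is shown below. It relies only on methods visible in `pipeline.py` above (`setup_components`, `start_streaming`, `wait_for_ready`, `get_stats`, `start_logging`, `cleanup`); the `Session` import path is taken from the module's own imports, and the credentials and action record ID are placeholders, not values from the published package.

```python
# Hypothetical runner for ResultsAggregationPipeline; all IDs and credentials are placeholders.
import logging

from matrice_common.session import Session  # import path assumed from pipeline.py's own imports
from matrice_inference.tmp.aggregator.pipeline import ResultsAggregationPipeline

logging.basicConfig(level=logging.INFO)

session = Session(account_number="...", access_key="...", secret_key="...")
pipeline = ResultsAggregationPipeline(session, action_record_id="<action-record-id>")

try:
    # setup_components() returns False and records the error if initialization fails.
    if not pipeline.setup_components():
        raise RuntimeError("aggregation pipeline setup failed")

    # block=False starts ingestor -> synchronizer -> aggregator -> publisher and returns;
    # block=True would enter the status-logging loop immediately instead.
    if not pipeline.start_streaming(block=False):
        raise RuntimeError("aggregation pipeline failed to start")

    # Optionally wait until results are actually being consumed and synchronized.
    if pipeline.wait_for_ready(timeout=300, poll_interval=10):
        logging.info("Throughput: %.2f msg/s",
                     pipeline.get_stats()["pipeline_metrics"]["throughput"])

    # Blocking monitor loop: logs status every 30 seconds and returns after Ctrl+C.
    pipeline.start_logging(status_interval=30)
finally:
    # Mirrors the class docstring's example: release resources even if startup
    # failed before start_logging() ran its own cleanup.
    pipeline.cleanup()
```

The `block=False` plus `wait_for_ready` combination is only one way to sequence startup; the class docstring's example simply calls `start_streaming()` with its default blocking behaviour and lets the built-in monitoring loop run until interrupted.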