matrice_inference-0.1.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of matrice-inference might be problematic.
- matrice_inference/__init__.py +72 -0
- matrice_inference/py.typed +0 -0
- matrice_inference/server/__init__.py +23 -0
- matrice_inference/server/inference_interface.py +176 -0
- matrice_inference/server/model/__init__.py +1 -0
- matrice_inference/server/model/model_manager.py +274 -0
- matrice_inference/server/model/model_manager_wrapper.py +550 -0
- matrice_inference/server/model/triton_model_manager.py +290 -0
- matrice_inference/server/model/triton_server.py +1248 -0
- matrice_inference/server/proxy_interface.py +371 -0
- matrice_inference/server/server.py +1004 -0
- matrice_inference/server/stream/__init__.py +0 -0
- matrice_inference/server/stream/app_deployment.py +228 -0
- matrice_inference/server/stream/consumer_worker.py +201 -0
- matrice_inference/server/stream/frame_cache.py +127 -0
- matrice_inference/server/stream/inference_worker.py +163 -0
- matrice_inference/server/stream/post_processing_worker.py +230 -0
- matrice_inference/server/stream/producer_worker.py +147 -0
- matrice_inference/server/stream/stream_pipeline.py +451 -0
- matrice_inference/server/stream/utils.py +23 -0
- matrice_inference/tmp/abstract_model_manager.py +58 -0
- matrice_inference/tmp/aggregator/__init__.py +18 -0
- matrice_inference/tmp/aggregator/aggregator.py +330 -0
- matrice_inference/tmp/aggregator/analytics.py +906 -0
- matrice_inference/tmp/aggregator/ingestor.py +438 -0
- matrice_inference/tmp/aggregator/latency.py +597 -0
- matrice_inference/tmp/aggregator/pipeline.py +968 -0
- matrice_inference/tmp/aggregator/publisher.py +431 -0
- matrice_inference/tmp/aggregator/synchronizer.py +594 -0
- matrice_inference/tmp/batch_manager.py +239 -0
- matrice_inference/tmp/overall_inference_testing.py +338 -0
- matrice_inference/tmp/triton_utils.py +638 -0
- matrice_inference-0.1.2.dist-info/METADATA +28 -0
- matrice_inference-0.1.2.dist-info/RECORD +37 -0
- matrice_inference-0.1.2.dist-info/WHEEL +5 -0
- matrice_inference-0.1.2.dist-info/licenses/LICENSE.txt +21 -0
- matrice_inference-0.1.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,968 @@
+import logging
+import time
+from typing import Dict
+from queue import Queue
+from matrice_common.session import Session
+from matrice_inference.tmp.aggregator.ingestor import ResultsIngestor
+from matrice_inference.tmp.aggregator.synchronizer import ResultsSynchronizer
+from matrice_inference.tmp.aggregator.aggregator import ResultsAggregator
+from matrice_inference.tmp.aggregator.publisher import ResultsPublisher
+from matrice_inference.tmp.aggregator.analytics import AnalyticsSummarizer
+from matrice_inference.tmp.aggregator.latency import LatencyTracker
+
+
+class ResultsAggregationPipeline:
+    """
+    Enhanced deployments aggregator that handles multiple streams, synchronizes results,
+    and outputs aggregated results to Kafka topics with consistent structure.
+
+    This class orchestrates the complete pipeline for collecting, synchronizing, and
+    publishing results from multiple ML model deployments in an inference pipeline,
+    ensuring all results follow the same structure as individual deployment results.
+
+    Usage Example:
+        ```python
+        from matrice import Session
+        from matrice_inference.tmp.aggregator import ResultsAggregationPipeline
+
+        # Initialize session
+        session = Session(account_number="...", access_key="...", secret_key="...")
+
+        # Create aggregator for an inference pipeline
+        aggregator = ResultsAggregationPipeline(session, "your-inference-pipeline-id")
+
+        # Setup the aggregation pipeline
+        if aggregator.setup_components():
+            print(f"Setup complete for {len(aggregator.deployment_ids)} deployments")
+
+        # Start streaming and run until keyboard interrupt
+        try:
+            aggregator.start_streaming()
+        except KeyboardInterrupt:
+            print("Pipeline stopped by user")
+        finally:
+            aggregator.cleanup()
+        ```
+    """
+
+    def __init__(self, session: Session, action_record_id: str):
+        """
+        Initialize the deployments aggregator.
+
+        Args:
+            session: Session object for authentication
+            action_record_id: Action Record ID
+        """
+        self.session = session
+        self.rpc = session.rpc
+        self.action_record_id = action_record_id
+        url = f"/v1/project/action/{self.action_record_id}/details"
+        self.action_doc = self.rpc.get(url)["data"]
+        self.action_type = self.action_doc["action"]
+        self.job_params = self.action_doc["jobParams"]
+        self.action_details = self.action_doc["actionDetails"]
+
+        self.inference_pipeline_id = self.job_params["inference_pipeline_id"]
+        self.aggregator_id = self.job_params["aggregator_id"]
+
+        # self.inference_pipeline = InferencePipeline(session, pipeline_id=self.inference_pipeline_id) # TODO: Replace the usage with api call
+        self.inference_pipeline = None
+
+        # Initialize components
+        self.results_ingestor = None
+        self.results_synchronizer = None
+        self.results_aggregator = None
+        self.results_publisher = None
+        self.analytics_summarizer = None
+        self.latency_tracker = None
+
+        # Initialize the final results queue
+        self.final_results_queue = Queue()
+
+        # Statistics and monitoring
+        self.stats = {
+            "start_time": None,
+            "deployments_created": 0,
+            "pipeline_version": "2.0",
+            "errors": 0,
+            "last_error": None,
+            "last_error_time": None,
+            "component_status": {
+                "ingestor": "not_initialized",
+                "synchronizer": "not_initialized",
+                "aggregator": "not_initialized",
+                "analytics_summarizer": "not_initialized",
+                "latency_tracker": "not_initialized",
+                "publisher": "not_initialized"
+            }
+        }
+
+        # State management
+        self.components_setup = False
+        self.is_running = False
+        self.deployment_ids = []
+
+        logging.info("Action doc: %s", self.action_doc)
+        self.update_status(
+            "AGG_ACK",
+            "ACK",
+            "Action is acknowledged by aggregator",
+        )
+
+    def update_status(
+        self,
+        step_code: str,
+        status: str,
+        status_description: str,
+    ) -> None:
+        """Update status of data preparation.
+
+        Args:
+            step_code: Code indicating current step
+            status: Status of step
+            status_description: Description of status
+        """
+        try:
+            logging.info(status_description)
+            url = "/v1/actions"
+            payload = {
+                "_id": self.action_record_id,
+                "action": self.action_type,
+                "serviceName": self.action_doc["serviceName"],
+                "stepCode": step_code,
+                "status": status,
+                "statusDescription": status_description,
+            }
+
+            self.rpc.put(path=url, payload=payload)
+        except Exception as exc:
+            logging.error(
+                "Exception in update_status: %s",
+                str(exc),
+            )
+
+
+    def setup_components(self) -> bool:
+        """
+        Setup all components and initialize the aggregation pipeline.
+
+        Returns:
+            bool: True if all components initialized successfully, False otherwise
+        """
+        try:
+            self.components_setup = True
+            # Get deployment IDs from the inference pipeline
+            self.deployment_ids = self.inference_pipeline.deployment_ids
+            if not self.deployment_ids:
+                self._record_error("No deployment IDs found in inference pipeline")
+                return False
+
+            self.stats["deployments_created"] = len(self.deployment_ids)
+            self.stats["start_time"] = time.time()
+
+            # Initialize the results ingestor
+            logging.info("Initializing results ingestor...")
+            self.results_ingestor = ResultsIngestor(
+                deployment_ids=self.deployment_ids,
+                session=self.session,
+                consumer_timeout=300,
+                action_id=self.action_record_id
+            )
+            self.stats["component_status"]["ingestor"] = "initialized"
+
+            # Initialize the results synchronizer with reasonable timeout
+            logging.info("Initializing results synchronizer...")
+            self.results_synchronizer = ResultsSynchronizer(
+                results_queues=self.results_ingestor.results_queues,
+                sync_timeout=300 # 60 seconds timeout for synchronization
+            )
+            self.stats["component_status"]["synchronizer"] = "initialized"
+
+            # Initialize the results aggregator
+            logging.info("Initializing results aggregator...")
+            self.results_aggregator = ResultsAggregator(
+                synchronized_results_queue=self.results_synchronizer.synchronized_results_queue
+            )
+            self.stats["component_status"]["aggregator"] = "initialized"
+
+            # Initialize analytics summarizer (5-minute window) - optional component
+            logging.info("Initializing analytics summarizer...")
+            try:
+                self.analytics_summarizer = AnalyticsSummarizer(
+                    session=self.session,
+                    inference_pipeline_id=self.inference_pipeline_id,
+                    flush_interval_seconds=300,
+                )
+                self.stats["component_status"]["analytics_summarizer"] = "initialized"
+                logging.info("Analytics summarizer initialized successfully")
+            except Exception as exc:
+                logging.error(f"Failed to initialize analytics summarizer (non-critical): {exc}", exc_info=True)
+                self.analytics_summarizer = None
+                self.stats["component_status"]["analytics_summarizer"] = "disabled"
+                logging.warning("Pipeline will continue without analytics summarizer")
+
+            # Initialize latency tracker (1-minute flush) - optional component
+            logging.info("Initializing latency tracker...")
+            try:
+                self.latency_tracker = LatencyTracker(
+                    session=self.session,
+                    inference_pipeline_id=self.inference_pipeline_id,
+                    flush_interval_seconds=60,
+                    max_samples=1000,
+                )
+                self.stats["component_status"]["latency_tracker"] = "initialized"
+                logging.info("Latency tracker initialized successfully")
+            except Exception as exc:
+                logging.error(f"Failed to initialize latency tracker (non-critical): {exc}", exc_info=True)
+                self.latency_tracker = None
+                self.stats["component_status"]["latency_tracker"] = "disabled"
+                logging.warning("Pipeline will continue without latency tracker")
+
+            # Initialize the results publisher
+            logging.info("Initializing results publisher...")
+            self.results_publisher = ResultsPublisher(
+                inference_pipeline_id=self.inference_pipeline_id,
+                session=self.session,
+                final_results_queue=self.results_aggregator.aggregated_results_queue,
+                analytics_summarizer=self.analytics_summarizer,
+                latency_tracker=self.latency_tracker
+            )
+            self.stats["component_status"]["publisher"] = "initialized"
+
+            logging.info(f"Successfully initialized aggregation pipeline for {len(self.deployment_ids)} deployments")
+            return True
+
+        except Exception as exc:
+            self._record_error(f"Failed to setup components: {str(exc)}")
+            return False
+
+    def start_streaming(self, block: bool = True) -> bool:
+        """
+        Start the complete streaming pipeline: ingestion, synchronization, aggregation, and publishing.
+
+        Returns:
+            bool: True if streaming started successfully, False otherwise
+        """
+        if not self.components_setup:
+            self.setup_components()
+
+        if not self.deployment_ids:
+            logging.error("No deployments available. Call setup_components() first.")
+            return False
+
+        try:
+            if self.is_running:
+                logging.warning("Streaming is already running")
+                return True
+
+            self.is_running = True
+
+            # Start components in order: ingestor -> synchronizer -> aggregator -> publisher
+
+            # Start results ingestion
+            logging.info("Starting results ingestion...")
+            if not self.results_ingestor.start_streaming():
+                self._record_error("Failed to start results ingestion")
+                return False
+            self.stats["component_status"]["ingestor"] = "running"
+
+            # Start results synchronization
+            logging.info("Starting results synchronization...")
+            if not self.results_synchronizer.start_synchronization():
+                self._record_error("Failed to start results synchronization")
+                return False
+            self.stats["component_status"]["synchronizer"] = "running"
+
+            # Start results aggregation
+            logging.info("Starting results aggregation...")
+            if not self.results_aggregator.start_aggregation():
+                self._record_error("Failed to start results aggregation")
+                return False
+            self.stats["component_status"]["aggregator"] = "running"
+
+            # Start analytics summarizer (if available)
+            if self.analytics_summarizer is not None:
+                logging.info("Starting analytics summarizer...")
+                try:
+                    if not self.analytics_summarizer.start():
+                        logging.warning("Analytics summarizer failed to start (non-critical)")
+                        self.stats["component_status"]["analytics_summarizer"] = "failed"
+                    else:
+                        self.stats["component_status"]["analytics_summarizer"] = "running"
+                        logging.info("Analytics summarizer started successfully")
+                except Exception as exc:
+                    logging.warning(f"Failed to start analytics summarizer (non-critical): {exc}")
+                    self.stats["component_status"]["analytics_summarizer"] = "failed"
+            else:
+                logging.info("Analytics summarizer is disabled, skipping startup")
+                self.stats["component_status"]["analytics_summarizer"] = "disabled"
+
+            # Start latency tracker (if available)
+            if self.latency_tracker is not None:
+                logging.info("Starting latency tracker...")
+                try:
+                    if not self.latency_tracker.start():
+                        logging.warning("Latency tracker failed to start (non-critical)")
+                        self.stats["component_status"]["latency_tracker"] = "failed"
+                    else:
+                        self.stats["component_status"]["latency_tracker"] = "running"
+                        logging.info("Latency tracker started successfully")
+                except Exception as exc:
+                    logging.warning(f"Failed to start latency tracker (non-critical): {exc}")
+                    self.stats["component_status"]["latency_tracker"] = "failed"
+            else:
+                logging.info("Latency tracker is disabled, skipping startup")
+                self.stats["component_status"]["latency_tracker"] = "disabled"
+
+            # Start results publishing
+            logging.info("Starting results publishing...")
+            if not self.results_publisher.start_streaming():
+                self._record_error("Failed to start results publishing")
+                return False
+            self.stats["component_status"]["publisher"] = "running"
+
+            # Update status to indicate successful startup
+            self.update_status(
+                "AGG_RUNNING",
+                "SUCCESS",
+                f"Aggregation pipeline started successfully with {len(self.deployment_ids)} deployments"
+            )
+
+            logging.info("Aggregation pipeline started successfully")
+            if block:
+                self.start_logging()
+            return True
+
+        except Exception as exc:
+            self._record_error(f"Failed to start streaming: {str(exc)}")
+            self.stop_streaming()
+            return False
+
+    def start_logging(self, status_interval: int = 30) -> None:
+        """
+        Start the pipeline logging and run until interrupted.
+        Args:
+            status_interval: Interval in seconds between status log messages
+        """
+        try:
+            logging.info("=" * 60)
+            logging.info("🚀 Aggregation pipeline is running!")
+            logging.info(f"📊 Processing results from {len(self.deployment_ids)} deployments")
+            logging.info(f"🔗 Inference Pipeline ID: {self.inference_pipeline_id}")
+            if self.deployment_ids:
+                logging.info(f"🎯 Deployment IDs: {', '.join(self.deployment_ids)}")
+            logging.info("💡 Press Ctrl+C to stop the pipeline")
+            logging.info("=" * 60)
+
+            last_status_time = time.time()
+
+            # Main loop - run until interrupted
+            while True:
+                try:
+                    current_time = time.time()
+
+                    # Periodic status logging
+                    if current_time - last_status_time >= status_interval:
+                        self._log_pipeline_status()
+                        last_status_time = current_time
+
+                    # Check pipeline health
+                    health = self.get_health_status()
+                    overall_status = health.get("overall_status")
+
+                    if overall_status == "unhealthy":
+                        issues = health.get("issues", [])
+                        logging.error(f"Pipeline is UNHEALTHY with {len(issues)} critical issues:")
+                        for i, issue in enumerate(issues, 1):
+                            logging.error(f" {i}. {issue}")
+                        logging.error("Pipeline will continue running but may need intervention")
+
+                    elif overall_status == "degraded":
+                        issues = health.get("issues", [])
+                        logging.warning(f"Pipeline is DEGRADED with {len(issues)} issues:")
+                        for i, issue in enumerate(issues, 1):
+                            logging.warning(f" {i}. {issue}")
+
+                    # Sleep for a short time to prevent busy waiting
+                    time.sleep(1.0)
+
+                except KeyboardInterrupt:
+                    # Re-raise to be caught by outer handler
+                    raise
+                except Exception as exc:
+                    logging.error(f"Error in main pipeline loop: {exc}")
+                    # Continue running unless it's a critical error
+                    time.sleep(5.0)
+
+        except KeyboardInterrupt:
+            logging.info("")
+            logging.info("🛑 Keyboard interrupt received - stopping pipeline...")
+
+        except Exception as exc:
+            logging.error(f"Critical error in pipeline: {exc}")
+            self._record_error(f"Critical pipeline error: {str(exc)}")
+
+        finally:
+            # Always cleanup
+            try:
+                logging.info("🧹 Cleaning up pipeline resources...")
+                self.cleanup()
+                logging.info("✅ Pipeline stopped successfully")
+            except KeyboardInterrupt:
+                # Handle second Ctrl+C during cleanup
+                logging.warning("⚠️ Second interrupt received during cleanup - forcing exit...")
+                try:
+                    # Try quick cleanup
+                    self.stop_streaming()
+                except:
+                    pass
+                logging.info("✅ Pipeline force-stopped")
+            except Exception as exc:
+                logging.error(f"Error during cleanup: {exc}")
+
+    def _log_pipeline_status(self):
+        """Log current pipeline status and statistics."""
+        try:
+            stats = self.get_stats()
+            health = self.get_health_status()
+
+            logging.info("📈 Pipeline Status Report:")
+            logging.info(f" ⏱️ Runtime: {stats.get('runtime_seconds', 0):.1f} seconds")
+            logging.info(f" 🔄 Overall Health: {health.get('overall_status', 'unknown')}")
+
+            # Log health issues with details
+            issues = health.get("issues", [])
+            if issues:
+                logging.warning(f" ⚠️ Health Issues ({len(issues)}):")
+                for i, issue in enumerate(issues, 1):
+                    logging.warning(f" {i}. {issue}")
+
+            # Component stats with error details
+            components = stats.get("components", {})
+
+            if "results_ingestor" in components:
+                ingestor_stats = components["results_ingestor"]
+                logging.info(f" 📥 Results Consumed: {ingestor_stats.get('results_consumed', 0)}")
+                if ingestor_stats.get("errors", 0) > 0:
+                    logging.warning(f" └─ Ingestor Errors: {ingestor_stats['errors']} (last: {ingestor_stats.get('last_error', 'N/A')})")
+
+            if "results_synchronizer" in components:
+                sync_stats = components["results_synchronizer"]
+                logging.info(f" 🔗 Results Synchronized: {sync_stats.get('results_synchronized', 0)}")
+                logging.info(f" ✅ Complete Syncs: {sync_stats.get('complete_syncs', 0)}")
+                partial_syncs = sync_stats.get('partial_syncs', 0)
+                if partial_syncs > 0:
+                    logging.warning(f" ⚠️ Partial Syncs: {partial_syncs}")
+                if sync_stats.get("errors", 0) > 0:
+                    logging.warning(f" └─ Sync Errors: {sync_stats['errors']} (last: {sync_stats.get('last_error', 'N/A')})")
+
+                # Log sync performance details
+                completion_rate = sync_stats.get('completion_rate', 0.0)
+                avg_sync_time = sync_stats.get('avg_sync_time', 0.0)
+                if completion_rate < 0.9:
+                    logging.warning(f" └─ Low Completion Rate: {completion_rate:.1%}")
+                if avg_sync_time > 5.0: # More than 5 seconds average
+                    logging.warning(f" └─ High Avg Sync Time: {avg_sync_time:.2f}s")
+
+            if "results_aggregator" in components:
+                agg_stats = components["results_aggregator"]
+                logging.info(f" 🎯 Results Aggregated: {agg_stats.get('aggregations_created', 0)}")
+                if agg_stats.get("errors", 0) > 0:
+                    logging.warning(f" └─ Aggregator Errors: {agg_stats['errors']} (last: {agg_stats.get('last_error', 'N/A')})")
+
+            if "analytics_summarizer" in components:
+                sum_stats = components["analytics_summarizer"]
+                if isinstance(sum_stats, dict) and sum_stats.get("summaries_published") is not None:
+                    logging.info(f" 🧮 Summaries Published: {sum_stats.get('summaries_published', 0)}")
+                    logging.info(f" 📍 Location Summaries: {sum_stats.get('location_summaries_published', 0)}")
+                    logging.info(f" 🚨 Incidents Published: {sum_stats.get('incidents_published', 0)}")
+                    if sum_stats.get("errors", 0) > 0:
+                        logging.warning(f" └─ Summarizer Errors: {sum_stats['errors']} (last: {sum_stats.get('last_error', 'N/A')})")
+                else:
+                    logging.info(" 🧮 Analytics: Disabled")
+
+            if "latency_tracker" in components:
+                lat_stats = components["latency_tracker"]
+                if isinstance(lat_stats, dict) and lat_stats.get("latency_reports_published") is not None:
+                    logging.info(f" 📊 Latency Reports: {lat_stats.get('latency_reports_published', 0)}")
+                    logging.info(f" ⚡ Alerts Triggered: {lat_stats.get('alerts_triggered', 0)}")
+                    if lat_stats.get("errors", 0) > 0:
+                        logging.warning(f" └─ Latency Tracker Errors: {lat_stats['errors']} (last: {lat_stats.get('last_error', 'N/A')})")
+                else:
+                    logging.info(" 📊 Latency Tracking: Disabled")
+
+            if "results_publisher" in components:
+                pub_stats = components["results_publisher"]
+                logging.info(f" 📤 Messages Published: {pub_stats.get('messages_produced', 0)}")
+                kafka_errors = pub_stats.get('kafka_errors', 0)
+                validation_errors = pub_stats.get('validation_errors', 0)
+                if kafka_errors > 0 or validation_errors > 0:
+                    logging.warning(f" └─ Publisher Errors: {kafka_errors} kafka, {validation_errors} validation")
+
+            # Pipeline metrics
+            pipeline_metrics = stats.get("pipeline_metrics", {})
+            if pipeline_metrics:
+                throughput = pipeline_metrics.get('throughput', 0)
+                completion_rate = pipeline_metrics.get('completion_rate', 0)
+                error_rate = pipeline_metrics.get('error_rate', 0)
+
+                logging.info(f" 🚀 Throughput: {throughput:.2f} msg/sec")
+                logging.info(f" 📊 Completion Rate: {completion_rate:.1%}")
+
+                if error_rate > 0.05: # More than 5% error rate
+                    logging.warning(f" ❌ Error Rate: {error_rate:.1%}")
+                elif error_rate > 0:
+                    logging.info(f" 📉 Error Rate: {error_rate:.1%}")
+
+            logging.info("─" * 50)
+
+        except Exception as exc:
+            logging.error(f"Error logging pipeline status: {exc}")
+            # Log basic fallback info
+            try:
+                health = self.get_health_status()
+                logging.error(f"Pipeline health: {health.get('overall_status', 'unknown')}, Issues: {len(health.get('issues', []))}")
+            except:
+                logging.error("Unable to retrieve basic health status")
+
+    def stop_streaming(self):
+        """Stop all streaming operations in reverse order."""
+        logging.info("Stopping aggregation pipeline...")
+
+        if not self.is_running:
+            logging.info("Streaming is not running")
+            return
+
+        # Update status to indicate shutdown is starting
+        self.update_status(
+            "AGG_SHUTDOWN",
+            "IN_PROGRESS",
+            "Aggregation pipeline shutdown initiated"
+        )
+
+        self.is_running = False
+
+        # Stop components in reverse order: publisher -> aggregator -> synchronizer -> ingestor
+        if self.results_publisher:
+            try:
+                logging.info("Stopping results publisher...")
+                self.results_publisher.stop_streaming()
+                self.stats["component_status"]["publisher"] = "stopped"
+            except Exception as exc:
+                logging.error(f"Error stopping results publisher: {exc}")
+
+        if self.analytics_summarizer is not None:
+            try:
+                logging.info("Stopping analytics summarizer...")
+                self.analytics_summarizer.stop()
+                self.stats["component_status"]["analytics_summarizer"] = "stopped"
+            except Exception as exc:
+                logging.error(f"Error stopping analytics summarizer: {exc}")
+
+        if self.latency_tracker:
+            try:
+                logging.info("Stopping latency tracker...")
+                self.latency_tracker.stop()
+                self.stats["component_status"]["latency_tracker"] = "stopped"
+            except Exception as exc:
+                logging.error(f"Error stopping latency tracker: {exc}")
+
+        if self.results_aggregator:
+            try:
+                logging.info("Stopping results aggregator...")
+                self.results_aggregator.stop_aggregation()
+                self.stats["component_status"]["aggregator"] = "stopped"
+            except Exception as exc:
+                logging.error(f"Error stopping results aggregator: {exc}")
+
+        if self.results_synchronizer:
+            try:
+                logging.info("Stopping results synchronizer...")
+                self.results_synchronizer.stop_synchronization()
+                self.stats["component_status"]["synchronizer"] = "stopped"
+            except Exception as exc:
+                logging.error(f"Error stopping results synchronization: {exc}")
+
+        if self.results_ingestor:
+            try:
+                logging.info("Stopping results ingestor...")
+                self.results_ingestor.stop_streaming()
+                self.stats["component_status"]["ingestor"] = "stopped"
+            except Exception as exc:
+                logging.error(f"Error stopping results ingestion: {exc}")
+
+        # Update status to indicate successful shutdown
+        self.update_status(
+            "AGG_SHUTDOWN",
+            "SUCCESS",
+            "Aggregation pipeline stopped successfully"
+        )
+
+        logging.info("Aggregation pipeline stopped")
+
+    def get_stats(self) -> Dict:
+        """Get current statistics from all components."""
+        stats = self.stats.copy()
+        if stats["start_time"]:
+            stats["runtime_seconds"] = time.time() - stats["start_time"]
+
+        # Add component statistics
+        stats["components"] = {}
+
+        if self.results_ingestor:
+            stats["components"]["results_ingestor"] = self.results_ingestor.get_stats()
+
+        if self.results_synchronizer:
+            stats["components"]["results_synchronizer"] = self.results_synchronizer.get_stats()
+
+        if self.results_aggregator:
+            stats["components"]["results_aggregator"] = self.results_aggregator.get_stats()
+
+        if self.analytics_summarizer is not None:
+            stats["components"]["analytics_summarizer"] = self.analytics_summarizer.get_stats()
+
+        if self.latency_tracker is not None:
+            stats["components"]["latency_tracker"] = self.latency_tracker.get_stats()
+
+        if self.results_publisher:
+            stats["components"]["results_publisher"] = self.results_publisher.get_stats()
+
+        # Add pipeline-level metrics
+        stats["pipeline_metrics"] = self._calculate_pipeline_metrics()
+
+        return stats
+
+    def _calculate_pipeline_metrics(self) -> Dict:
+        """Calculate pipeline-level performance metrics."""
+        metrics = {
+            "throughput": 0.0,
+            "latency": 0.0,
+            "error_rate": 0.0,
+            "completion_rate": 0.0,
+        }
+
+        try:
+            # Calculate throughput (messages per second)
+            if self.stats["start_time"]:
+                runtime = time.time() - self.stats["start_time"]
+                if runtime > 0 and self.results_publisher:
+                    publisher_stats = self.results_publisher.get_stats()
+                    metrics["throughput"] = publisher_stats.get("messages_produced", 0) / runtime
+
+            # Calculate completion rate from synchronizer
+            if self.results_synchronizer:
+                sync_stats = self.results_synchronizer.get_stats()
+                total_syncs = sync_stats.get("complete_syncs", 0) + sync_stats.get("partial_syncs", 0)
+                if total_syncs > 0:
+                    metrics["completion_rate"] = sync_stats.get("complete_syncs", 0) / total_syncs
+
+            # Calculate error rate
+            total_errors = self.stats["errors"]
+            total_processed = 0
+
+            if self.results_ingestor:
+                ingestor_stats = self.results_ingestor.get_stats()
+                total_processed += ingestor_stats.get("results_consumed", 0)
+                total_errors += ingestor_stats.get("errors", 0)
+
+            if total_processed > 0:
+                metrics["error_rate"] = total_errors / total_processed
+
+            # Calculate average latency from synchronizer
+            if self.results_synchronizer:
+                sync_stats = self.results_synchronizer.get_stats()
+                metrics["latency"] = sync_stats.get("avg_sync_time", 0.0)
+
+        except Exception as exc:
+            logging.error(f"Error calculating pipeline metrics: {exc}")
+
+        return metrics
+
+    def get_health_status(self) -> Dict:
+        """Get health status of all components."""
+        health = {
+            "overall_status": "healthy",
+            "is_running": self.is_running,
+            "pipeline_version": self.stats["pipeline_version"],
+            "deployment_count": len(self.deployment_ids),
+            "components": {},
+            "issues": [],
+        }
+
+        try:
+            # Check components health with detailed logging
+            if self.results_ingestor:
+                ingestor_health = self.results_ingestor.get_health_status()
+                health["components"]["results_ingestor"] = ingestor_health
+                if ingestor_health.get("status") != "healthy":
+                    issue_detail = f"Results ingestor is {ingestor_health.get('status', 'unknown')}"
+                    if "reason" in ingestor_health:
+                        issue_detail += f": {ingestor_health['reason']}"
+                    if ingestor_health.get("errors", 0) > 0:
+                        issue_detail += f" ({ingestor_health['errors']} errors)"
+                    health["issues"].append(issue_detail)
+                    logging.warning(f"Ingestor health issue: {issue_detail}")
+            else:
+                health["issues"].append("Results ingestor not initialized")
+                logging.error("Results ingestor not initialized")
+
+            if self.results_synchronizer:
+                sync_health = self.results_synchronizer.get_health_status()
+                health["components"]["results_synchronizer"] = sync_health
+                if sync_health.get("status") != "healthy":
+                    issue_detail = f"Results synchronizer is {sync_health.get('status', 'unknown')}"
+                    if "issue" in sync_health:
+                        issue_detail += f": {sync_health['issue']}"
+                    if "recent_error" in sync_health:
+                        issue_detail += f" (recent error: {sync_health['recent_error']})"
+                    if sync_health.get("completion_rate", 1.0) < 0.8:
+                        issue_detail += f" (completion rate: {sync_health.get('completion_rate', 0):.1%})"
+                    health["issues"].append(issue_detail)
+                    logging.warning(f"Synchronizer health issue: {issue_detail}")
+            else:
+                health["issues"].append("Results synchronizer not initialized")
+                logging.error("Results synchronizer not initialized")
+
+            if self.results_aggregator:
+                agg_health = self.results_aggregator.get_health_status()
+                health["components"]["results_aggregator"] = agg_health
+                if agg_health.get("status") != "healthy":
+                    issue_detail = f"Results aggregator is {agg_health.get('status', 'unknown')}"
+                    if agg_health.get("errors", 0) > 0:
+                        issue_detail += f" ({agg_health['errors']} errors)"
+                    if agg_health.get("output_queue_size", 0) > 100:
+                        issue_detail += f" (output queue size: {agg_health['output_queue_size']})"
+                    health["issues"].append(issue_detail)
+                    logging.warning(f"Aggregator health issue: {issue_detail}")
+            else:
+                health["issues"].append("Results aggregator not initialized")
+                logging.error("Results aggregator not initialized")
+
+            if self.analytics_summarizer is not None:
+                sum_health = self.analytics_summarizer.get_health_status()
+                health["components"]["analytics_summarizer"] = sum_health
+                if sum_health.get("status") != "healthy":
+                    issue_detail = f"Analytics summarizer is {sum_health.get('status', 'unknown')}"
+                    if "reason" in sum_health:
+                        issue_detail += f": {sum_health['reason']}"
+                    if sum_health.get("errors", 0) > 0:
+                        issue_detail += f" ({sum_health['errors']} errors)"
+                    health["issues"].append(issue_detail)
+                    logging.warning(f"Summarizer health issue: {issue_detail}")
+            else:
+                # Analytics summarizer is disabled - this is not an error
+                health["components"]["analytics_summarizer"] = {
+                    "status": "disabled",
+                    "reason": "Analytics summarizer is disabled due to initialization failure"
+                }
+                logging.debug("Analytics summarizer is disabled")
+
+            if self.latency_tracker is not None:
+                lat_health = self.latency_tracker.get_health_status()
+                health["components"]["latency_tracker"] = lat_health
+                if lat_health.get("status") != "healthy":
+                    issue_detail = f"Latency tracker is {lat_health.get('status', 'unknown')}"
+                    if "reason" in lat_health:
+                        issue_detail += f": {lat_health['reason']}"
+                    if lat_health.get("errors", 0) > 0:
+                        issue_detail += f" ({lat_health['errors']} errors)"
+                    health["issues"].append(issue_detail)
+                    logging.warning(f"Latency tracker health issue: {issue_detail}")
+            else:
+                # Latency tracker is disabled - this is not an error
+                health["components"]["latency_tracker"] = {
+                    "status": "disabled",
+                    "reason": "Latency tracker is disabled due to initialization failure"
+                }
+                logging.debug("Latency tracker is disabled")
+
+            if self.results_publisher:
+                pub_health = self.results_publisher.get_health_status()
+                health["components"]["results_publisher"] = pub_health
+                if pub_health.get("status") != "healthy":
+                    issue_detail = f"Results publisher is {pub_health.get('status', 'unknown')}"
+                    if "reason" in pub_health:
+                        issue_detail += f": {pub_health['reason']}"
+                    if "last_error" in pub_health:
+                        issue_detail += f" (last error: {pub_health['last_error']})"
+                    if pub_health.get("kafka_errors", 0) > 0:
+                        issue_detail += f" ({pub_health['kafka_errors']} kafka errors)"
+                    health["issues"].append(issue_detail)
+                    logging.warning(f"Publisher health issue: {issue_detail}")
+            else:
+                health["issues"].append("Results publisher not initialized")
+                logging.error("Results publisher not initialized")
+
+            # Determine overall status with logging
+            issue_count = len(health["issues"])
+            if issue_count > 0:
+                if issue_count >= 2:
+                    health["overall_status"] = "unhealthy"
+                    logging.error(f"Pipeline is UNHEALTHY with {issue_count} issues: {'; '.join(health['issues'])}")
+                else:
+                    health["overall_status"] = "degraded"
+                    logging.warning(f"Pipeline is DEGRADED with {issue_count} issue: {health['issues'][0]}")
+            else:
+                logging.debug("Pipeline health check: all components healthy")
+
+        except Exception as exc:
+            health["overall_status"] = "unhealthy"
+            health["error"] = str(exc)
+            error_msg = f"Error checking health: {str(exc)}"
+            health["issues"].append(error_msg)
+            logging.error(f"Pipeline health check failed: {error_msg}")
+
+        return health
+
+    def get_deployment_info(self) -> Dict:
+        """
+        Get information about the deployments in this aggregator.
+
+        Returns:
+            Dict: Deployment information including IDs, count, and status
+        """
+        return {
+            "inference_pipeline_id": self.inference_pipeline_id,
+            "deployment_ids": self.deployment_ids,
+            "deployment_count": len(self.deployment_ids),
+            "pipeline_status": getattr(self.inference_pipeline, 'status', None),
+            "aggregator_running": self.is_running,
+            "component_status": self.stats["component_status"].copy(),
+        }
+
+    def wait_for_ready(self, timeout: int = 300, poll_interval: int = 10) -> bool:
+        """
+        Wait for the aggregator to be ready and processing results.
+
+        Args:
+            timeout: Maximum time to wait in seconds
+            poll_interval: Time between checks in seconds
+
+        Returns:
+            bool: True if aggregator is ready, False if timeout
+        """
+        if not self.is_running:
+            logging.warning("Aggregator is not running")
+            return False
+
+        start_time = time.time()
+
+        while time.time() - start_time < timeout:
+            try:
+                health = self.get_health_status()
+
+                # Check if all components are healthy
+                if health.get("overall_status") == "healthy":
+                    # Check if we're receiving and processing results
+                    stats = self.get_stats()
+                    components = stats.get("components", {})
+
+                    ingestor_stats = components.get("results_ingestor", {})
+                    sync_stats = components.get("results_synchronizer", {})
+
+                    # Consider ready if we're consuming and synchronizing results
+                    if (ingestor_stats.get("results_consumed", 0) > 0 and
+                            sync_stats.get("results_synchronized", 0) > 0):
+                        logging.info("Aggregation pipeline is ready and processing results")
+                        return True
+
+                logging.debug(f"Waiting for pipeline readiness... Health: {health.get('overall_status')}")
+                time.sleep(poll_interval)
+
+            except Exception as exc:
+                logging.error(f"Error checking aggregator readiness: {exc}")
+                time.sleep(poll_interval)
+
+        logging.warning(f"Aggregation pipeline not ready after {timeout} seconds")
+        return False
+
+    def force_sync_pending_results(self) -> int:
+        """
+        Force synchronization of all pending results.
+
+        Returns:
+            int: Number of pending results that were synchronized
+        """
+        if not self.results_synchronizer:
+            logging.warning("Results synchronizer not initialized")
+            return 0
+
+        return self.results_synchronizer.force_sync_pending()
+
+    def _record_error(self, error_message: str):
+        """Record an error with timestamp."""
+        logging.error(error_message)
+        self.stats["errors"] += 1
+        self.stats["last_error"] = error_message
+        self.stats["last_error_time"] = time.time()
+
+    def cleanup(self):
+        """Clean up all resources."""
+        logging.info("Cleaning up aggregation pipeline resources...")
+
+        # Update status to indicate cleanup is starting
+        self.update_status(
+            "AGG_CLEANUP",
+            "IN_PROGRESS",
+            "Aggregation pipeline cleanup initiated"
+        )
+
+        # Stop streaming if running
+        if self.is_running:
+            self.stop_streaming()
+
+        # Cleanup components in reverse order
+        if self.results_publisher:
+            try:
+                self.results_publisher.cleanup() if hasattr(self.results_publisher, 'cleanup') else None
+            except Exception as exc:
+                logging.error(f"Error cleaning up publisher: {exc}")
+
+        if self.results_aggregator:
+            try:
+                self.results_aggregator.cleanup()
+            except Exception as exc:
+                logging.error(f"Error cleaning up aggregator: {exc}")
+
+        if self.analytics_summarizer is not None:
+            try:
+                self.analytics_summarizer.cleanup()
+            except Exception as exc:
+                logging.error(f"Error cleaning up analytics summarizer: {exc}")
+
+        if self.latency_tracker:
+            try:
+                self.latency_tracker.cleanup()
+            except Exception as exc:
+                logging.error(f"Error cleaning up latency tracker: {exc}")
+
+        if self.results_synchronizer:
+            try:
+                self.results_synchronizer.cleanup()
+            except Exception as exc:
+                logging.error(f"Error cleaning up synchronizer: {exc}")
+
+        if self.results_ingestor:
+            try:
+                self.results_ingestor.cleanup()
+            except Exception as exc:
+                logging.error(f"Error cleaning up ingestor: {exc}")
+
+        # Clear the final results queue
+        if self.final_results_queue:
+            try:
+                while not self.final_results_queue.empty():
+                    self.final_results_queue.get_nowait()
+            except Exception:
+                pass
+
+        # Update status to indicate successful cleanup
+        self.update_status(
+            "AGG_CLEANUP",
+            "SUCCESS",
+            "Aggregation pipeline cleanup completed successfully"
+        )
+
+        logging.info("Aggregation pipeline cleanup completed")
+