matrice-inference 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of matrice-inference might be problematic. Click here for more details.
- matrice_inference/__init__.py +72 -0
- matrice_inference/py.typed +0 -0
- matrice_inference/server/__init__.py +23 -0
- matrice_inference/server/inference_interface.py +176 -0
- matrice_inference/server/model/__init__.py +1 -0
- matrice_inference/server/model/model_manager.py +274 -0
- matrice_inference/server/model/model_manager_wrapper.py +550 -0
- matrice_inference/server/model/triton_model_manager.py +290 -0
- matrice_inference/server/model/triton_server.py +1248 -0
- matrice_inference/server/proxy_interface.py +371 -0
- matrice_inference/server/server.py +1004 -0
- matrice_inference/server/stream/__init__.py +0 -0
- matrice_inference/server/stream/app_deployment.py +228 -0
- matrice_inference/server/stream/consumer_worker.py +201 -0
- matrice_inference/server/stream/frame_cache.py +127 -0
- matrice_inference/server/stream/inference_worker.py +163 -0
- matrice_inference/server/stream/post_processing_worker.py +230 -0
- matrice_inference/server/stream/producer_worker.py +147 -0
- matrice_inference/server/stream/stream_pipeline.py +451 -0
- matrice_inference/server/stream/utils.py +23 -0
- matrice_inference/tmp/abstract_model_manager.py +58 -0
- matrice_inference/tmp/aggregator/__init__.py +18 -0
- matrice_inference/tmp/aggregator/aggregator.py +330 -0
- matrice_inference/tmp/aggregator/analytics.py +906 -0
- matrice_inference/tmp/aggregator/ingestor.py +438 -0
- matrice_inference/tmp/aggregator/latency.py +597 -0
- matrice_inference/tmp/aggregator/pipeline.py +968 -0
- matrice_inference/tmp/aggregator/publisher.py +431 -0
- matrice_inference/tmp/aggregator/synchronizer.py +594 -0
- matrice_inference/tmp/batch_manager.py +239 -0
- matrice_inference/tmp/overall_inference_testing.py +338 -0
- matrice_inference/tmp/triton_utils.py +638 -0
- matrice_inference-0.1.2.dist-info/METADATA +28 -0
- matrice_inference-0.1.2.dist-info/RECORD +37 -0
- matrice_inference-0.1.2.dist-info/WHEEL +5 -0
- matrice_inference-0.1.2.dist-info/licenses/LICENSE.txt +21 -0
- matrice_inference-0.1.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,431 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import threading
|
|
3
|
+
import time
|
|
4
|
+
from queue import Queue, Empty
|
|
5
|
+
from typing import Dict, Optional, Any
|
|
6
|
+
from matrice_common.session import Session
|
|
7
|
+
from matrice_common.stream.kafka_stream import MatriceKafkaDeployment
|
|
8
|
+
from matrice_inference.tmp.aggregator.analytics import AnalyticsSummarizer
|
|
9
|
+
from matrice_inference.tmp.aggregator.latency import LatencyTracker
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ResultsPublisher:
    """
    Optimized streaming of final aggregated results from inference pipeline to Kafka.
    Processes results immediately for low latency.

    A single daemon thread drains ``final_results_queue`` and publishes each
    result to Kafka; optional analytics/latency hooks are notified only after
    a successful publish, and their failures are never fatal.
    """

    def __init__(
        self,
        inference_pipeline_id: str,
        session: Session,
        final_results_queue: Queue,
        analytics_summarizer: Optional[AnalyticsSummarizer] = None,
        latency_tracker: Optional[LatencyTracker] = None,
    ):
        """
        Initialize the final results streamer.

        Args:
            inference_pipeline_id: ID of the inference pipeline
            session: Session object for authentication
            final_results_queue: Queue containing final aggregated results
            analytics_summarizer: Optional analytics summarizer for forwarding results
            latency_tracker: Optional latency tracker for performance monitoring
        """
        self.inference_pipeline_id = inference_pipeline_id
        self.session = session
        self.final_results_queue = final_results_queue

        self.kafka_handler = MatriceKafkaDeployment(
            session, inference_pipeline_id, type="server"
        )
        # Optional downstream hooks; notified best-effort after each publish.
        self.analytics_summarizer = analytics_summarizer
        self.latency_tracker = latency_tracker

        # Threading and state management
        self._stop_streaming = threading.Event()
        self._streaming_thread: Optional[threading.Thread] = None
        self._is_running = False
        # Non-reentrant lock guarding self.stats; never call another
        # lock-taking method (e.g. _record_error) while holding it.
        self._stats_lock = threading.Lock()

        # Statistics (all reads/writes must happen under _stats_lock).
        # NOTE(review): "validation_errors" is reported but never incremented
        # anywhere in this class; kept for backward-compatible stats shape.
        self.stats = {
            "start_time": None,
            "messages_produced": 0,
            "validation_errors": 0,
            "kafka_errors": 0,
            "errors": 0,
            "last_error": None,
            "last_error_time": None,
        }

    def start_streaming(self) -> bool:
        """
        Start streaming final results to Kafka.

        Idempotent: returns True immediately if already running.

        Returns:
            bool: True if streaming started successfully, False otherwise
        """
        if self._is_running:
            logging.warning("Final results streaming is already running")
            return True

        try:
            # Reset stop event and start streaming thread
            self._stop_streaming.clear()
            self._streaming_thread = threading.Thread(
                target=self._stream_results_to_kafka,
                name=f"FinalResultsStreamer-{self.inference_pipeline_id}",
                daemon=True,
            )
            self._streaming_thread.start()

            self._is_running = True
            self.stats["start_time"] = time.time()

            logging.info(f"Final results streaming started for pipeline: {self.inference_pipeline_id}")
            return True

        except Exception as exc:
            logging.error(f"Failed to start final results streaming: {exc}")
            self._record_error(f"Start streaming failed: {str(exc)}")
            return False

    def _extract_stream_key(self, aggregated_result: Dict) -> Optional[str]:
        """Best-effort extraction of the Kafka partition key for a result.

        Prefers ``aggregation_metadata.stream_key`` and falls back to
        ``camera_info.camera_name``; may return None.
        """
        stream_key = None
        aggregation_metadata = aggregated_result.get("aggregation_metadata", {})
        if aggregation_metadata:
            stream_key = aggregation_metadata.get("stream_key")
        if not stream_key:
            # Fallback to camera_info
            camera_info = aggregated_result.get('camera_info', {})
            stream_key = camera_info.get('camera_name')
        return stream_key

    def _forward_to_analytics(self, aggregated_result: Dict, messages_produced: int) -> None:
        """Forward a published result to the analytics summarizer (best effort)."""
        if self.analytics_summarizer is None or not hasattr(self.analytics_summarizer, 'ingest_result'):
            return
        try:
            self.analytics_summarizer.ingest_result(aggregated_result)
        except Exception as exc_inner:
            if messages_produced % 100 == 1:  # Log occasionally
                logging.warning(f"Failed to forward to analytics summarizer: {exc_inner}")

    def _forward_to_latency_tracker(self, aggregated_result: Dict, messages_produced: int) -> None:
        """Forward per-deployment results to the latency tracker (best effort)."""
        if self.latency_tracker is None or not hasattr(self.latency_tracker, 'ingest_result'):
            return
        try:
            # Forward each deployment result to latency tracker
            deployment_results = aggregated_result.get("deployment_results", {})
            for dep_id, deployment_result in deployment_results.items():
                result_data = deployment_result.get("result", {})
                if result_data:
                    self.latency_tracker.ingest_result(dep_id, result_data)
        except Exception as exc_inner:
            if messages_produced % 100 == 1:  # Log occasionally
                logging.warning(f"Failed to forward to latency tracker: {exc_inner}")

    def _stream_results_to_kafka(self):
        """Stream final results from queue to Kafka immediately.

        Runs in the daemon thread until the stop event is set. Each queue
        item is published at most once; publish failures are counted as
        kafka_errors and the item is dropped (task_done is always called).
        """
        logging.info("Starting final results streaming thread")
        last_log_time = time.time()
        log_interval = 30.0  # Log every 30 seconds

        while not self._stop_streaming.is_set():
            try:
                # Get result from queue with timeout so stop is responsive
                try:
                    aggregated_result = self.final_results_queue.get(timeout=1.0)
                except Empty:
                    continue

                # Process single result immediately
                try:
                    stream_key = self._extract_stream_key(aggregated_result)

                    # Produce message to Kafka immediately
                    self.kafka_handler.produce_message(
                        message=aggregated_result,
                        key=stream_key,
                    )

                    with self._stats_lock:
                        self.stats["messages_produced"] += 1
                        messages_produced = self.stats["messages_produced"]

                    # Forward to hooks only after a successful publish.
                    self._forward_to_analytics(aggregated_result, messages_produced)
                    self._forward_to_latency_tracker(aggregated_result, messages_produced)

                except Exception as exc:
                    with self._stats_lock:
                        self.stats["kafka_errors"] += 1
                        kafka_errors = self.stats["kafka_errors"]
                    # _record_error takes _stats_lock itself, so it must be
                    # called with the lock released (Lock is non-reentrant).
                    if kafka_errors % 10 == 1:  # Log every 10th error
                        self._record_error(f"Failed to produce aggregated result to Kafka: {str(exc)}")

                # Mark task as done
                self.final_results_queue.task_done()

                # Reduced frequency logging
                current_time = time.time()
                if (current_time - last_log_time) > log_interval:
                    with self._stats_lock:
                        messages_produced = self.stats["messages_produced"]
                        kafka_errors = self.stats["kafka_errors"]
                    if messages_produced > 0 or kafka_errors > 0:
                        logging.debug(f"Publisher: produced={messages_produced} messages, kafka_errors={kafka_errors}")
                    last_log_time = current_time

            except Exception as exc:
                if not self._stop_streaming.is_set():
                    logging.error(f"Error streaming final result: {exc}")
                    self._record_error(f"Streaming error: {str(exc)}")
                    time.sleep(0.1)  # Prevent tight error loops

        logging.info("Final results streaming thread stopped")

    def _record_error(self, error_message: str):
        """Record error in statistics.

        Acquires _stats_lock; must not be called while the lock is held.
        """
        with self._stats_lock:
            self.stats["errors"] += 1
            self.stats["last_error"] = error_message
            self.stats["last_error_time"] = time.time()
            error_count = self.stats["errors"]
        # Reduce logging frequency for performance
        if error_count % 10 == 1:  # Log every 10th error
            logging.error(f"Publisher error (#{error_count}): {error_message}")

    def stop_streaming(self):
        """Stop streaming final results.

        Signals the worker thread, waits up to 5 seconds for it to exit,
        then closes the Kafka handler. Safe to call when not running.
        """
        if not self._is_running:
            logging.warning("Final results streaming is not running")
            return

        logging.info("Stopping final results streaming...")

        # Signal stop and wait for thread
        self._stop_streaming.set()

        if self._streaming_thread and self._streaming_thread.is_alive():
            try:
                self._streaming_thread.join(timeout=5.0)
                if self._streaming_thread.is_alive():
                    logging.warning("Final results streaming thread did not stop gracefully")
            except Exception as exc:
                logging.error(f"Error joining streaming thread: {exc}")

        # Stop Kafka deployment
        try:
            self.kafka_handler.close()
        except Exception as exc:
            logging.error(f"Error stopping Kafka deployment: {exc}")

        self._is_running = False
        self._streaming_thread = None

        logging.info("Final results streaming stopped")

    def get_stats(self) -> Dict[str, Any]:
        """
        Get streaming statistics.

        Returns:
            Dict containing a snapshot of the statistics plus derived
            fields: is_running, queue_size, success_rate, and (once
            started) uptime and messages_per_second.
        """
        with self._stats_lock:
            stats = self.stats.copy()

        stats["is_running"] = self._is_running
        stats["queue_size"] = self.final_results_queue.qsize()

        # Calculate success rate (max(..., 1) avoids division by zero)
        total_attempts = stats["messages_produced"] + stats["validation_errors"] + stats["kafka_errors"]
        stats["success_rate"] = stats["messages_produced"] / max(total_attempts, 1)

        if stats["start_time"]:
            stats["uptime"] = time.time() - stats["start_time"]
            if stats["uptime"] > 0:
                stats["messages_per_second"] = stats["messages_produced"] / stats["uptime"]

        return stats

    def get_health_status(self) -> Dict[str, Any]:
        """Get health status of the publisher.

        Status is "healthy" unless degraded by recent errors, queue
        backlog, or a high error rate, and "unhealthy" when not running.
        Later checks overwrite earlier ones, so "unhealthy" wins.
        """
        # Snapshot under the lock so all derived values are consistent
        # (get_stats does the same; previously this read unlocked).
        with self._stats_lock:
            stats = self.stats.copy()

        queue_size = self.final_results_queue.qsize()
        health = {
            "status": "healthy",
            "is_running": self._is_running,
            "queue_size": queue_size,
            "errors": stats["errors"],
            "validation_errors": stats["validation_errors"],
            "kafka_errors": stats["kafka_errors"],
            "messages_produced": stats["messages_produced"],
        }

        # Check for recent errors (within last 60 seconds)
        if (
            stats["last_error_time"]
            and (time.time() - stats["last_error_time"]) < 60
        ):
            health["status"] = "degraded"
            health["last_error"] = stats["last_error"]
            health["reason"] = f"Recent error: {stats['last_error']}"
            logging.warning(f"Publisher degraded due to recent error: {stats['last_error']}")

        # Check queue size
        if queue_size > 1000:
            health["status"] = "degraded"
            health["reason"] = f"Queue size too large ({queue_size} items)"
            # Fixed: message previously claimed threshold 100; check is 1000.
            logging.warning(f"Publisher degraded: queue has {queue_size} items (threshold: 1000)")

        # Check error rates
        total_attempts = stats["messages_produced"] + stats["validation_errors"] + stats["kafka_errors"]
        if total_attempts > 10:  # Only check after some attempts
            error_rate = (stats["validation_errors"] + stats["kafka_errors"]) / total_attempts
            if error_rate > 0.1:  # More than 10% error rate
                health["status"] = "degraded"
                health["reason"] = f"High error rate: {error_rate:.2%} ({stats['kafka_errors']} kafka, {stats['validation_errors']} validation)"
                logging.warning(f"Publisher degraded: high error rate {error_rate:.2%} with {stats['kafka_errors']} kafka errors and {stats['validation_errors']} validation errors")

        # Check if not running when it should be
        if not self._is_running:
            health["status"] = "unhealthy"
            health["reason"] = "Publisher is not running"
            logging.error("Publisher is not running")

        return health

    @property
    def is_running(self) -> bool:
        """Check if the streamer is currently running."""
        return self._is_running