matrice-inference 0.1.2 (matrice_inference-0.1.2-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of matrice-inference might be problematic.

Files changed (37)
  1. matrice_inference/__init__.py +72 -0
  2. matrice_inference/py.typed +0 -0
  3. matrice_inference/server/__init__.py +23 -0
  4. matrice_inference/server/inference_interface.py +176 -0
  5. matrice_inference/server/model/__init__.py +1 -0
  6. matrice_inference/server/model/model_manager.py +274 -0
  7. matrice_inference/server/model/model_manager_wrapper.py +550 -0
  8. matrice_inference/server/model/triton_model_manager.py +290 -0
  9. matrice_inference/server/model/triton_server.py +1248 -0
  10. matrice_inference/server/proxy_interface.py +371 -0
  11. matrice_inference/server/server.py +1004 -0
  12. matrice_inference/server/stream/__init__.py +0 -0
  13. matrice_inference/server/stream/app_deployment.py +228 -0
  14. matrice_inference/server/stream/consumer_worker.py +201 -0
  15. matrice_inference/server/stream/frame_cache.py +127 -0
  16. matrice_inference/server/stream/inference_worker.py +163 -0
  17. matrice_inference/server/stream/post_processing_worker.py +230 -0
  18. matrice_inference/server/stream/producer_worker.py +147 -0
  19. matrice_inference/server/stream/stream_pipeline.py +451 -0
  20. matrice_inference/server/stream/utils.py +23 -0
  21. matrice_inference/tmp/abstract_model_manager.py +58 -0
  22. matrice_inference/tmp/aggregator/__init__.py +18 -0
  23. matrice_inference/tmp/aggregator/aggregator.py +330 -0
  24. matrice_inference/tmp/aggregator/analytics.py +906 -0
  25. matrice_inference/tmp/aggregator/ingestor.py +438 -0
  26. matrice_inference/tmp/aggregator/latency.py +597 -0
  27. matrice_inference/tmp/aggregator/pipeline.py +968 -0
  28. matrice_inference/tmp/aggregator/publisher.py +431 -0
  29. matrice_inference/tmp/aggregator/synchronizer.py +594 -0
  30. matrice_inference/tmp/batch_manager.py +239 -0
  31. matrice_inference/tmp/overall_inference_testing.py +338 -0
  32. matrice_inference/tmp/triton_utils.py +638 -0
  33. matrice_inference-0.1.2.dist-info/METADATA +28 -0
  34. matrice_inference-0.1.2.dist-info/RECORD +37 -0
  35. matrice_inference-0.1.2.dist-info/WHEEL +5 -0
  36. matrice_inference-0.1.2.dist-info/licenses/LICENSE.txt +21 -0
  37. matrice_inference-0.1.2.dist-info/top_level.txt +1 -0
matrice_inference/tmp/aggregator/latency.py
@@ -0,0 +1,597 @@
+ import logging
+ import threading
+ import time
+ import json
+ from typing import Dict, Any, Optional, List, Tuple
+ from collections import defaultdict, deque
+ from datetime import datetime, timezone
+ from statistics import mean, median, stdev
+
+ from matrice_common.session import Session
+ from confluent_kafka import Producer
+ import base64
+
+
+ class LatencyTracker:
+     """
+     Tracks and analyzes latency metrics from multiple deployments in real-time.
+
+     Provides detailed timing analysis including:
+     - Model inference times
+     - Post-processing times
+     - End-to-end latencies
+     - Client-side timings
+     - Server-side breakdown
+     - Cross-deployment comparisons
+     """
+
+     def __init__(
+         self,
+         session: Session,
+         inference_pipeline_id: str,
+         flush_interval_seconds: int = 60,
+         max_samples: int = 1000,
+     ) -> None:
+         """Initialize latency tracker.
+
+         Args:
+             session: Session object for authentication
+             inference_pipeline_id: ID of the inference pipeline
+             flush_interval_seconds: Interval for publishing latency reports
+             max_samples: Maximum number of samples to keep per metric
+         """
+         self.session = session
+         self.inference_pipeline_id = inference_pipeline_id
+         self.flush_interval_seconds = flush_interval_seconds
+         self.max_samples = max_samples
+
+         self.kafka_producer = self._setup_kafka_producer()
+
+         # Threading
+         self._stop = threading.Event()
+         self._thread: Optional[threading.Thread] = None
+         self._is_running = False
+         self._lock = threading.Lock()
+
+         # Latency data storage
+         # Structure: {deployment_id: {metric_name: deque([values])}}
+         self._latency_data: Dict[str, Dict[str, deque]] = defaultdict(
+             lambda: defaultdict(lambda: deque(maxlen=max_samples))
+         )
+
+         # Per-stream latency tracking
+         # Structure: {(deployment_id, stream_key): {metric_name: deque([values])}}
+         self._stream_latency_data: Dict[Tuple[str, str], Dict[str, deque]] = defaultdict(
+             lambda: defaultdict(lambda: deque(maxlen=max_samples))
+         )
+
+         # Cross-deployment analysis
+         self._deployment_summary: Dict[str, Dict[str, Any]] = defaultdict(dict)
+
+         # Stats
+         self.stats = {
+             "start_time": None,
+             "messages_processed": 0,
+             "latency_reports_published": 0,
+             "errors": 0,
+             "last_error": None,
+             "last_error_time": None,
+             "last_flush_time": None,
+         }
+
+     def _setup_kafka_producer(self):
+         """Set up the Kafka producer used to publish latency reports."""
+         try:
+             path = "/v1/actions/get_kafka_info"
+             response = self.session.rpc.get(path=path, raise_exception=True)
+
+             if not response or not response.get("success"):
+                 raise ValueError(f"Failed to fetch Kafka config: {(response or {}).get('message', 'No response')}")
+
+             # Decode base64 fields
+             encoded_ip = response["data"]["ip"]
+             encoded_port = response["data"]["port"]
+             ip = base64.b64decode(encoded_ip).decode("utf-8")
+             port = base64.b64decode(encoded_port).decode("utf-8")
+             bootstrap_servers = f"{ip}:{port}"
+
+             kafka_producer = Producer({
+                 "bootstrap.servers": bootstrap_servers,
+                 "acks": "all",
+                 "retries": 3,
+                 "retry.backoff.ms": 1000,
+                 "request.timeout.ms": 30000,
+                 "max.in.flight.requests.per.connection": 1,
+                 "linger.ms": 10,
+                 "batch.size": 4096,
+                 "queue.buffering.max.ms": 50,
+                 "log_level": 0,
+             })
+             return kafka_producer
+         except Exception as exc:
+             logging.error(f"Failed to setup Kafka producer for latency tracker: {exc}")
+             return None
+
+     def start(self) -> bool:
+         """Start the latency tracker."""
+         if self._is_running:
+             logging.warning("Latency tracker already running")
+             return True
+
+         try:
+             self._stop.clear()
+             self._is_running = True
+             self.stats["start_time"] = time.time()
+             self.stats["last_flush_time"] = time.time()
+
+             self._thread = threading.Thread(
+                 target=self._run,
+                 name=f"LatencyTracker-{self.inference_pipeline_id}",
+                 daemon=True
+             )
+             self._thread.start()
+
+             logging.info("Latency tracker started")
+             return True
+         except Exception as exc:
+             self._record_error(f"Failed to start latency tracker: {exc}")
+             self.stop()
+             return False
+
+     def stop(self) -> None:
+         """Stop the latency tracker."""
+         if not self._is_running:
+             logging.info("Latency tracker not running")
+             return
+
+         logging.info("Stopping latency tracker...")
+         self._is_running = False
+         self._stop.set()
+
+         try:
+             if self._thread and self._thread.is_alive():
+                 self._thread.join(timeout=5.0)
+         except Exception as exc:
+             logging.error(f"Error joining latency tracker thread: {exc}")
+
+         self._thread = None
+         logging.info("Latency tracker stopped")
+
+     def ingest_result(self, deployment_id: str, aggregated_result: Dict[str, Any]) -> None:
+         """Ingest a result for latency analysis.
+
+         Args:
+             deployment_id: ID of the deployment that produced this result
+             aggregated_result: Result payload containing latency data
+         """
+         try:
+             with self._lock:
+                 self._extract_and_store_latency_data(deployment_id, aggregated_result)
+                 self.stats["messages_processed"] += 1
+         except Exception as exc:
+             self._record_error(f"Failed to ingest latency data: {exc}")
+
+     def _extract_and_store_latency_data(self, deployment_id: str, result: Dict[str, Any]) -> None:
+         """Extract latency data from result and store it."""
+         # Extract stream key for per-stream tracking
+         camera_info = result.get("camera_info", {}) or {}
+         stream_key = camera_info.get("camera_name", "unknown")
+         stream_tuple = (deployment_id, stream_key)
+
+         # Extract latency stats from agg_apps
+         agg_apps = result.get("agg_apps", []) or []
+         current_time = time.time()
+
+         for app in agg_apps:
+             # Extract timing data from separated inference and post-processing workers
+             # Look for inference timing (from inference worker message)
+             inference_timing = app.get("inference_timing", {}) or {}
+
+             # Look for post-processing timing (from post-processing worker message)
+             post_processing_timing = app.get("post_processing_timing", {}) or {}
+
+             # Legacy server timing fallback (for backward compatibility)
+             server_timing = app.get("server_timing", {}) or {}
+
+             # Extract client timing data from input stream metadata
+             input_streams = app.get("input_streams", []) or []
+             client_timing = {}
+             if input_streams:
+                 first_input_stream = input_streams[0].get("input_stream", {}) or {}
+                 client_timing = {
+                     "last_read_time_sec": first_input_stream.get("last_read_time_sec", 0.0),
+                     "last_write_time_sec": first_input_stream.get("last_write_time_sec", 0.0),
+                     "last_process_time_sec": first_input_stream.get("last_process_time_sec", 0.0),
+                 }
+
+             # Legacy latency stats fallback
+             latency_stats = app.get("latency_stats", {}) or {}
+             server_breakdown = latency_stats.get("server_processing_breakdown", {}) or {}
+             client_breakdown = latency_stats.get("client_timing_breakdown", {}) or {}
+
+             # Extract all timing metrics from separated workers
+             timing_metrics = {
+                 # Model inference timing (from inference worker)
+                 "model_inference_time_sec": (
+                     inference_timing.get("model_inference_time_sec") or
+                     server_timing.get("model_inference_time_sec") or
+                     server_breakdown.get("model_inference_time_sec", 0.0)
+                 ),
+
+                 # Post-processing timing (from post-processing worker)
+                 "post_processing_time_sec": (
+                     post_processing_timing.get("post_processing_time_sec") or
+                     server_timing.get("post_processing_time_sec") or
+                     server_breakdown.get("post_processing_time_sec", 0.0)
+                 ),
+
+                 # Combined inference total time
+                 "inference_total_time_sec": (
+                     inference_timing.get("inference_total_time_sec") or
+                     server_timing.get("inference_total_time_sec") or
+                     server_breakdown.get("inference_total_time_sec", 0.0)
+                 ),
+
+                 # Individual worker times
+                 "inference_worker_time_sec": inference_timing.get("total_worker_time_sec", 0.0),
+                 "post_processing_worker_time_sec": post_processing_timing.get("total_worker_time_sec", 0.0),
+
+                 # Legacy total worker time (for backward compatibility)
+                 "total_worker_time_sec": server_timing.get("total_worker_time_sec", server_breakdown.get("total_worker_time_sec", 0.0)),
+
+                 # Client timing breakdown
+                 "client_read_time_sec": client_timing.get("last_read_time_sec", client_breakdown.get("last_read_time_sec", 0.0)),
+                 "client_write_time_sec": client_timing.get("last_write_time_sec", client_breakdown.get("last_write_time_sec", 0.0)),
+                 "client_process_time_sec": client_timing.get("last_process_time_sec", client_breakdown.get("last_process_time_sec", 0.0)),
+
+                 # Legacy/extended server timing fields
+                 "kafka_consume_time_sec": server_breakdown.get("kafka_consume_time_sec", 0.0),
+                 "kafka_produce_time_sec": server_breakdown.get("kafka_produce_time_sec", 0.0),
+                 "output_construct_time_sec": server_breakdown.get("output_construct_time_sec", 0.0),
+
+                 # Application-level latency (legacy format)
+                 "app_e2e_sec": latency_stats.get("app_e2e_sec", 0.0),
+                 "last_input_feed_sec": latency_stats.get("last_input_feed_sec", 0.0),
+                 "last_output_sec": latency_stats.get("last_output_sec", 0.0),
+
+                 # Model-specific latency (from model streams)
+                 "model_latency_sec": 0.0,
+                 "post_processing_latency_sec": 0.0,
+                 "inference_total_latency_sec": 0.0,
+
+                 # Calculate total end-to-end pipeline time (inference + post-processing)
+                 "total_e2e_pipeline_time_sec": 0.0,
+             }
+
+             # Calculate total end-to-end pipeline time
+             inference_worker_time = timing_metrics["inference_worker_time_sec"]
+             post_processing_worker_time = timing_metrics["post_processing_worker_time_sec"]
+
+             if inference_worker_time > 0 or post_processing_worker_time > 0:
+                 timing_metrics["total_e2e_pipeline_time_sec"] = inference_worker_time + post_processing_worker_time
+             elif timing_metrics["model_inference_time_sec"] > 0 or timing_metrics["post_processing_time_sec"] > 0:
+                 # Fallback to individual step times if worker times aren't available
+                 timing_metrics["total_e2e_pipeline_time_sec"] = (
+                     timing_metrics["model_inference_time_sec"] +
+                     timing_metrics["post_processing_time_sec"]
+                 )
+
+             # Extract model stream latency data
+             model_streams = app.get("model_streams", []) or []
+             for model_stream in model_streams:
+                 model_latency_stats = model_stream.get("model_stream", {}).get("latency_stats", {}) or {}
+                 timing_metrics.update({
+                     "model_latency_sec": model_latency_stats.get("model_latency_sec", 0.0),
+                     "post_processing_latency_sec": model_latency_stats.get("post_processing_latency_sec", 0.0),
+                     "inference_total_latency_sec": model_latency_stats.get("inference_total_latency_sec", 0.0),
+                 })
+                 break  # Take first model stream
+
+             # Store per-deployment metrics
+             for metric_name, value in timing_metrics.items():
+                 if isinstance(value, (int, float)) and value > 0:
+                     self._latency_data[deployment_id][metric_name].append((current_time, value))
+                     self._stream_latency_data[stream_tuple][metric_name].append((current_time, value))
+
+             # Send individual latency metrics to Kafka
+             self._send_latency_metrics(deployment_id, stream_key, timing_metrics, current_time)
+
+     def _send_latency_metrics(self, deployment_id: str, stream_key: str, metrics: Dict[str, float], timestamp: float) -> None:
+         """Send individual latency metrics to Kafka."""
+         if not self.kafka_producer:
+             return
+
+         latency_data = {
+             "deployment_id": deployment_id,
+             "stream_key": stream_key,
+             "timestamp": datetime.fromtimestamp(timestamp, timezone.utc).isoformat(),
+             "pipeline_id": self.inference_pipeline_id,
+             "metrics": {k: v for k, v in metrics.items() if isinstance(v, (int, float)) and v > 0}
+         }
+
+         try:
+             self.kafka_producer.produce(
+                 topic="Latency-Metrics",
+                 key=f"{deployment_id}-{stream_key}".encode("utf-8"),
+                 value=json.dumps(latency_data, separators=(",", ":")).encode("utf-8"),
+             )
+         except Exception as exc:
+             logging.error(f"Failed to send latency metrics: {exc}")
+
+     def _run(self) -> None:
+         """Main tracker loop."""
+         logging.info("Latency tracker worker started")
+
+         while not self._stop.is_set():
+             try:
+                 current_time = time.time()
+                 last_flush = self.stats.get("last_flush_time") or current_time
+
+                 if current_time - last_flush >= self.flush_interval_seconds:
+                     self._flush_latency_report(current_time)
+                     self.stats["last_flush_time"] = current_time
+
+                 time.sleep(1.0)  # Check every second
+
+             except Exception as exc:
+                 if not self._stop.is_set():
+                     self._record_error(f"Error in latency tracker loop: {exc}")
+                 time.sleep(1.0)
+
+         # Final flush on stop
+         try:
+             self._flush_latency_report(time.time())
+         except Exception as exc:
+             logging.error(f"Error during final latency flush: {exc}")
+
+         logging.info("Latency tracker worker stopped")
+
+     def _flush_latency_report(self, end_time: float) -> None:
+         """Generate and publish comprehensive latency report."""
+         with self._lock:
+             if not self._latency_data:
+                 return  # No data to report
+
+             # Generate deployment-level statistics
+             deployment_stats = {}
+             for deployment_id, metrics in self._latency_data.items():
+                 deployment_stats[deployment_id] = self._calculate_deployment_stats(metrics)
+
+             # Generate cross-deployment analysis
+             cross_deployment_analysis = self._analyze_cross_deployment_performance(deployment_stats)
+
+             # Generate stream-level analysis
+             stream_analysis = self._analyze_stream_performance()
+
+             # Create comprehensive report
+             latency_report = {
+                 "report_type": "latency_analysis",
+                 "pipeline_id": self.inference_pipeline_id,
+                 "report_timestamp": datetime.now(timezone.utc).isoformat(),
+                 "report_period_seconds": self.flush_interval_seconds,
+                 "deployment_statistics": deployment_stats,
+                 "cross_deployment_analysis": cross_deployment_analysis,
+                 "stream_analysis": stream_analysis,
+                 "summary": {
+                     "total_deployments": len(deployment_stats),
+                     "total_streams": len(self._stream_latency_data),
+                     "messages_processed": self.stats["messages_processed"],
+                 },
+                 "metadata": {
+                     "tracker_version": "1.0",
+                     "max_samples": self.max_samples,
+                 },
+             }
+
+         # Publish report (outside the lock so _record_error can safely re-acquire it on failure)
+         if self.kafka_producer:
+             try:
+                 self.kafka_producer.produce(
+                     topic="Latency-Analytics",
+                     key=str(self.inference_pipeline_id).encode("utf-8"),
+                     value=json.dumps(latency_report, separators=(",", ":")).encode("utf-8"),
+                 )
+                 self.kafka_producer.poll(0)
+                 self.stats["latency_reports_published"] += 1
+
+                 logging.info(
+                     f"Published latency report: {len(deployment_stats)} deployments, "
+                     f"{self.stats['messages_processed']} messages processed"
+                 )
+             except Exception as exc:
+                 self._record_error(f"Failed to publish latency report: {exc}")
+
+         # Reset message counter
+         self.stats["messages_processed"] = 0
+
+     def _calculate_deployment_stats(self, metrics: Dict[str, deque]) -> Dict[str, Any]:
+         """Calculate statistics for a single deployment."""
+         stats = {}
+
+         for metric_name, samples in metrics.items():
+             if not samples:
+                 continue
+
+             # Extract values (samples are (timestamp, value) tuples)
+             values = [sample[1] for sample in samples]
+
+             if values:
+                 stats[metric_name] = {
+                     "count": len(values),
+                     "mean": mean(values),
+                     "median": median(values),
+                     "min": min(values),
+                     "max": max(values),
+                     "std": stdev(values) if len(values) > 1 else 0.0,
+                     "p95": self._percentile(values, 95),
+                     "p99": self._percentile(values, 99),
+                 }
+
+         return stats
+
+     def _analyze_cross_deployment_performance(self, deployment_stats: Dict[str, Dict]) -> Dict[str, Any]:
+         """Analyze performance across all deployments."""
+         analysis = {
+             "performance_comparison": {},
+             "outlier_detection": {},
+             "recommendations": [],
+         }
+
+         # Compare key metrics across deployments (updated for separated worker architecture)
+         key_metrics = [
+             "model_inference_time_sec",
+             "post_processing_time_sec",
+             "inference_worker_time_sec",
+             "post_processing_worker_time_sec",
+             "total_e2e_pipeline_time_sec",
+             "app_e2e_sec"  # Legacy metric
+         ]
+
+         for metric in key_metrics:
+             metric_values = {}
+             for deployment_id, stats in deployment_stats.items():
+                 if metric in stats:
+                     metric_values[deployment_id] = stats[metric]["mean"]
+
+             if len(metric_values) > 1:
+                 values = list(metric_values.values())
+                 analysis["performance_comparison"][metric] = {
+                     "deployment_means": metric_values,
+                     "overall_mean": mean(values),
+                     "overall_std": stdev(values) if len(values) > 1 else 0.0,
+                     "fastest_deployment": min(metric_values.keys(), key=lambda k: metric_values[k]),
+                     "slowest_deployment": max(metric_values.keys(), key=lambda k: metric_values[k]),
+                     "performance_spread": max(values) - min(values),
+                 }
+
+                 # Detect outliers (deployments with >2 std deviations from mean)
+                 overall_mean = mean(values)
+                 overall_std = stdev(values) if len(values) > 1 else 0.0
+
+                 if overall_std > 0:
+                     outliers = []
+                     for deployment_id, value in metric_values.items():
+                         z_score = abs(value - overall_mean) / overall_std
+                         if z_score > 2.0:
+                             outliers.append({
+                                 "deployment_id": deployment_id,
+                                 "value": value,
+                                 "z_score": z_score,
+                             })
+
+                     if outliers:
+                         analysis["outlier_detection"][metric] = outliers
+
+         return analysis
+
+     def _analyze_stream_performance(self) -> Dict[str, Any]:
+         """Analyze performance per stream across deployments."""
+         stream_analysis = {}
+
+         # Group by stream key
+         streams = defaultdict(list)
+         for (deployment_id, stream_key), metrics in self._stream_latency_data.items():
+             streams[stream_key].append((deployment_id, metrics))
+
+         for stream_key, deployment_metrics in streams.items():
+             if len(deployment_metrics) > 1:  # Only analyze streams with multiple deployments
+                 stream_stats = {}
+
+                 for deployment_id, metrics in deployment_metrics:
+                     stream_stats[deployment_id] = self._calculate_deployment_stats(metrics)
+
+                 stream_analysis[stream_key] = {
+                     "deployment_count": len(deployment_metrics),
+                     "deployment_stats": stream_stats,
+                 }
+
+         return stream_analysis
+
+     def _percentile(self, values: List[float], percentile: int) -> float:
+         """Calculate percentile of a list of values."""
+         if not values:
+             return 0.0
+
+         sorted_values = sorted(values)
+         index = (percentile / 100.0) * (len(sorted_values) - 1)
+
+         if index.is_integer():
+             return sorted_values[int(index)]
+         else:
+             lower = sorted_values[int(index)]
+             upper = sorted_values[int(index) + 1]
+             return lower + (upper - lower) * (index - int(index))
+
+     def _record_error(self, error_message: str) -> None:
+         """Record an error in statistics."""
+         with self._lock:
+             self.stats["errors"] += 1
+             self.stats["last_error"] = error_message
+             self.stats["last_error_time"] = time.time()
+         logging.error(f"Latency tracker error: {error_message}")
+
+     def get_stats(self) -> Dict[str, Any]:
+         """Get current tracker statistics."""
+         with self._lock:
+             stats = dict(self.stats)
+
+             if stats.get("start_time"):
+                 stats["uptime_seconds"] = time.time() - stats["start_time"]
+
+             # Add data size information
+             stats["deployment_count"] = len(self._latency_data)
+             stats["stream_count"] = len(self._stream_latency_data)
+
+             total_samples = sum(
+                 sum(len(metric_queue) for metric_queue in deployment_metrics.values())
+                 for deployment_metrics in self._latency_data.values()
+             )
+             stats["total_samples"] = total_samples
+
+             return stats
+
+     def get_health_status(self) -> Dict[str, Any]:
+         """Get health status of the latency tracker."""
+         health = {
+             "status": "healthy",
+             "is_running": self._is_running,
+             "errors": self.stats["errors"],
+             "reports_published": self.stats["latency_reports_published"],
+             "messages_processed": self.stats["messages_processed"],
+         }
+
+         # Check for recent errors
+         if (
+             self.stats.get("last_error_time")
+             and (time.time() - self.stats["last_error_time"]) < 60
+         ):
+             health["status"] = "degraded"
+             health["reason"] = f"Recent error: {self.stats.get('last_error')}"
+
+         # Check if not running
+         if not self._is_running:
+             health["status"] = "unhealthy"
+             health["reason"] = "Latency tracker is not running"
+
+         return health
+
+     def cleanup(self) -> None:
+         """Clean up resources."""
+         try:
+             self.stop()
+         except Exception:
+             pass
+
+         with self._lock:
+             self._latency_data.clear()
+             self._stream_latency_data.clear()
+             self._deployment_summary.clear()
+
+         try:
+             if hasattr(self, "kafka_producer") and self.kafka_producer is not None:
+                 self.kafka_producer.flush(5)
+         except Exception as exc:
+             logging.error(f"Error flushing latency tracker kafka producer: {exc}")
+
+         logging.info("Latency tracker cleanup completed")
+
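
For context, here is a minimal sketch of how the LatencyTracker added in matrice_inference/tmp/aggregator/latency.py could be driven. It is illustrative only and not part of the package: the Session constructor arguments, the deployment ID, and the example payload are assumptions shaped after the fields the tracker actually reads (camera_info, agg_apps, inference_timing, post_processing_timing).

from matrice_common.session import Session
from matrice_inference.tmp.aggregator.latency import LatencyTracker

# Placeholder session; the real constructor arguments depend on matrice_common.
session = Session()

tracker = LatencyTracker(
    session=session,
    inference_pipeline_id="pipeline-123",  # hypothetical pipeline ID
    flush_interval_seconds=30,
    max_samples=500,
)

if tracker.start():
    # Minimal aggregated result, shaped after the keys read by _extract_and_store_latency_data.
    example_result = {
        "camera_info": {"camera_name": "cam-entrance"},
        "agg_apps": [
            {
                "inference_timing": {
                    "model_inference_time_sec": 0.012,
                    "total_worker_time_sec": 0.018,
                },
                "post_processing_timing": {
                    "post_processing_time_sec": 0.004,
                    "total_worker_time_sec": 0.005,
                },
            }
        ],
    }
    tracker.ingest_result("deployment-abc", example_result)

    print(tracker.get_stats())          # sample counts, uptime, error counters
    print(tracker.get_health_status())  # healthy / degraded / unhealthy

    tracker.cleanup()                   # stops the worker thread and flushes the producer

Per-message metrics are produced to the Latency-Metrics topic as results are ingested, and the aggregated report is published to Latency-Analytics every flush_interval_seconds by the background worker thread.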