matrice_inference-0.1.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of matrice-inference might be problematic.

Files changed (37)
  1. matrice_inference/__init__.py +72 -0
  2. matrice_inference/py.typed +0 -0
  3. matrice_inference/server/__init__.py +23 -0
  4. matrice_inference/server/inference_interface.py +176 -0
  5. matrice_inference/server/model/__init__.py +1 -0
  6. matrice_inference/server/model/model_manager.py +274 -0
  7. matrice_inference/server/model/model_manager_wrapper.py +550 -0
  8. matrice_inference/server/model/triton_model_manager.py +290 -0
  9. matrice_inference/server/model/triton_server.py +1248 -0
  10. matrice_inference/server/proxy_interface.py +371 -0
  11. matrice_inference/server/server.py +1004 -0
  12. matrice_inference/server/stream/__init__.py +0 -0
  13. matrice_inference/server/stream/app_deployment.py +228 -0
  14. matrice_inference/server/stream/consumer_worker.py +201 -0
  15. matrice_inference/server/stream/frame_cache.py +127 -0
  16. matrice_inference/server/stream/inference_worker.py +163 -0
  17. matrice_inference/server/stream/post_processing_worker.py +230 -0
  18. matrice_inference/server/stream/producer_worker.py +147 -0
  19. matrice_inference/server/stream/stream_pipeline.py +451 -0
  20. matrice_inference/server/stream/utils.py +23 -0
  21. matrice_inference/tmp/abstract_model_manager.py +58 -0
  22. matrice_inference/tmp/aggregator/__init__.py +18 -0
  23. matrice_inference/tmp/aggregator/aggregator.py +330 -0
  24. matrice_inference/tmp/aggregator/analytics.py +906 -0
  25. matrice_inference/tmp/aggregator/ingestor.py +438 -0
  26. matrice_inference/tmp/aggregator/latency.py +597 -0
  27. matrice_inference/tmp/aggregator/pipeline.py +968 -0
  28. matrice_inference/tmp/aggregator/publisher.py +431 -0
  29. matrice_inference/tmp/aggregator/synchronizer.py +594 -0
  30. matrice_inference/tmp/batch_manager.py +239 -0
  31. matrice_inference/tmp/overall_inference_testing.py +338 -0
  32. matrice_inference/tmp/triton_utils.py +638 -0
  33. matrice_inference-0.1.2.dist-info/METADATA +28 -0
  34. matrice_inference-0.1.2.dist-info/RECORD +37 -0
  35. matrice_inference-0.1.2.dist-info/WHEEL +5 -0
  36. matrice_inference-0.1.2.dist-info/licenses/LICENSE.txt +21 -0
  37. matrice_inference-0.1.2.dist-info/top_level.txt +1 -0
matrice_inference/tmp/aggregator/synchronizer.py (new file)
@@ -0,0 +1,594 @@
+from typing import List, Dict, Tuple, Set
+from queue import Queue, Empty, PriorityQueue
+import threading
+import time
+import logging
+from collections import defaultdict, deque
+import heapq
+
+
+class ResultsSynchronizer:
+    """
+    Optimized synchronization of results from multiple deployments by stream_key and input_order.
+    Ensures consistent structure and proper error handling for the aggregation pipeline.
+    """
+
+    def __init__(
+        self,
+        results_queues: Dict[str, PriorityQueue],
+        sync_timeout: float = 300,
+    ):
+        """
+        Initialize the results synchronizer.
+
+        Args:
+            results_queues: Dictionary of priority queues containing results from deployments
+            sync_timeout: Maximum time to wait for input_order synchronization (in seconds)
+        """
+        self.results_queues = results_queues
+        self.synchronized_results_queue = Queue()
+        self.sync_timeout = sync_timeout
+        self.deployment_ids = tuple(results_queues.keys())  # Use tuple for faster iteration
+        self.deployment_count = len(self.deployment_ids)
+
+        # State management
+        self._is_running = False
+        self._stop_synchronization = threading.Event()
+        # Use separate locks to reduce contention
+        self._pending_lock = threading.Lock()
+        self._stats_lock = threading.Lock()
+        self._synchronization_thread = None
+
+        # Optimized synchronization state using more efficient data structures
+        # Structure: {(stream_group_key, stream_key, input_order): {deployment_id: result, ...}}
+        self._pending_results: Dict[Tuple[str, str, int], Dict[str, Dict]] = {}
+        # Track when each key combination was first seen - use list for faster cleanup
+        self._result_timestamps: Dict[Tuple[str, str, int], float] = {}
+        # Timeout queue for efficient cleanup - (timestamp, key)
+        self._timeout_queue: List[Tuple[float, Tuple[str, str, int]]] = []
+        # Track keys that have been timed out to prevent duplicate processing
+        self._timed_out_keys: Set[Tuple[str, str, int]] = set()
+        # Track latest result per deployment for timeout scenarios
+        self._latest_deployment_results: Dict[Tuple[str, str, int], Dict[str, Dict]] = {}
+
+        # Statistics - use separate dict to reduce lock contention
+        self._stats = {
+            "results_consumed": 0,
+            "results_synchronized": 0,
+            "partial_syncs": 0,
+            "complete_syncs": 0,
+            "timeouts": 0,
+            "errors": 0,
+            "pending_keys": 0,
+            "timed_out_keys": 0,
+            "duplicates_prevented": 0,
+        }
+        self._timing_stats = {
+            "start_time": None,
+            "last_error": None,
+            "last_error_time": None,
+            "avg_sync_time": 0.0,
+            "max_sync_time": 0.0,
+            "total_sync_time": 0.0,
+        }
+
+    def _record_error(self, error_message: str):
+        """Record an error in statistics."""
+        with self._stats_lock:
+            self._stats["errors"] += 1
+            self._timing_stats["last_error"] = error_message
+            self._timing_stats["last_error_time"] = time.time()
+            # Reduce logging frequency for performance
+            if self._stats["errors"] % 10 == 1:  # Log every 10th error
+                logging.error(f"Synchronizer error (#{self._stats['errors']}): {error_message}")
+
+    def _collect_results_from_queues(self) -> int:
+        """Collect results from all deployment queues for immediate processing."""
+        results_collected = 0
+        current_time = time.time()
+
+        # Collect from all queues non-blocking
+        for deployment_id in self.deployment_ids:
+            queue = self.results_queues[deployment_id]
+
+            try:
+                # Get all available results from this queue
+                while True:
+                    try:
+                        priority_result = queue.get(block=False)
+                        # Extract result from priority queue tuple
+                        if isinstance(priority_result, tuple):
+                            result = priority_result[-1]  # Last element is always the result
+                        else:
+                            result = priority_result
+
+                        # Process immediately
+                        stream_key = result.get("stream_key")
+                        stream_group_key = result.get("stream_group_key")
+                        input_order = result.get("input_order")
+
+                        if not all([stream_key, stream_group_key, input_order is not None]):
+                            continue  # Skip invalid results
+
+                        key = (stream_group_key, stream_key, input_order)
+
+                        with self._pending_lock:
+                            # Skip if this key has already been timed out to prevent duplicates
+                            if key in self._timed_out_keys:
+                                with self._stats_lock:
+                                    self._stats["duplicates_prevented"] += 1
+                                logging.debug(f"Prevented duplicate processing for timed-out key: {key}")
+                                continue
+
+                            # Initialize if first result for this key
+                            if key not in self._pending_results:
+                                self._pending_results[key] = {}
+                                self._result_timestamps[key] = current_time
+                                self._latest_deployment_results[key] = {}
+                                # Add to timeout queue for efficient cleanup
+                                heapq.heappush(self._timeout_queue, (current_time + self.sync_timeout, key))
+
+                            # Add result to pending collection and track as latest
+                            self._pending_results[key][deployment_id] = result
+                            self._latest_deployment_results[key][deployment_id] = result
+                            results_collected += 1
+
+                    except Empty:
+                        break  # No more results in this queue
+
+            except Exception as exc:
+                if not self._stop_synchronization.is_set():
+                    self._record_error(f"Error collecting from {deployment_id}: {str(exc)}")
+
+        # Update stats
+        if results_collected > 0:
+            with self._stats_lock:
+                self._stats["results_consumed"] += results_collected
+                self._stats["pending_keys"] = len(self._pending_results)
+
+        return results_collected
+
+    def _create_synchronized_result(
+        self,
+        key: Tuple[str, str, int],
+        deployment_results: Dict[str, Dict],
+        is_complete: bool,
+        is_timeout: bool,
+        sync_start_time: float,
+    ) -> Dict:
+        """Create a synchronized result dictionary with enhanced metadata."""
+        stream_group_key, stream_key, input_order = key
+        current_time = time.time()
+        sync_duration = current_time - sync_start_time
+
+        # Update sync time statistics (batch update for performance)
+        with self._stats_lock:
+            self._timing_stats["max_sync_time"] = max(self._timing_stats["max_sync_time"], sync_duration)
+            self._timing_stats["total_sync_time"] += sync_duration
+            # Calculate running average more efficiently
+            sync_count = self._stats["results_synchronized"] + 1
+            self._timing_stats["avg_sync_time"] = self._timing_stats["total_sync_time"] / sync_count
+
+        # Pre-calculate metadata to avoid repeated calculations
+        deployments_count = len(deployment_results)
+        sync_completeness_ratio = deployments_count / self.deployment_count
+
+        # Create synchronized result with minimal object creation
+        synchronized_result = {
+            "stream_key": stream_key,
+            "input_order": input_order,
+            "stream_group_key": stream_group_key,
+            "deployment_results": deployment_results,  # Don't copy, transfer ownership
+            "synchronization_metadata": {
+                "deployments_count": deployments_count,
+                "expected_deployments": self.deployment_count,
+                "complete": is_complete,
+                "timeout": is_timeout,
+                "sync_duration_seconds": sync_duration,
+                "sync_start_timestamp": sync_start_time,
+                "sync_end_timestamp": current_time,
+                "sync_completeness_ratio": sync_completeness_ratio,
+                "synchronizer_version": "2.1",  # Updated optimized version
+            },
+        }
+
+        # Add missing deployments only if needed (avoid list comprehension when complete)
+        if not is_complete:
+            missing = []
+            for dep_id in self.deployment_ids:
+                if dep_id not in deployment_results:
+                    missing.append(dep_id)
+            synchronized_result["synchronization_metadata"]["missing_deployments"] = missing
+        else:
+            synchronized_result["synchronization_metadata"]["missing_deployments"] = []
+
+        # Add timeout reason if applicable
+        if is_timeout:
+            synchronized_result["synchronization_metadata"]["timeout_reason"] = (
+                f"Sync timeout after {self.sync_timeout} seconds"
+            )
+
+        return synchronized_result
+
+    def _process_synchronized_results(self) -> List[Dict]:
+        """Process pending results using efficient timeout queue and batch processing."""
+        synchronized_results = []
+        current_time = time.time()
+        keys_to_remove = []
+        complete_count = 0
+        partial_count = 0
+
+        with self._pending_lock:
+            # Process timeouts efficiently using heap
+            while self._timeout_queue and self._timeout_queue[0][0] <= current_time:
+                timeout_time, key = heapq.heappop(self._timeout_queue)
+                if key in self._pending_results and key not in self._timed_out_keys:
+                    # Use latest deployment results for timeout (ensures we get most recent data)
+                    deployment_results = self._latest_deployment_results.get(key, self._pending_results[key])
+                    is_complete = len(deployment_results) == self.deployment_count
+                    sync_start_time = self._result_timestamps[key]
+
+                    # Mark this key as timed out to prevent future processing
+                    self._timed_out_keys.add(key)
+
+                    synchronized_result = self._create_synchronized_result(
+                        key, deployment_results, is_complete, True, sync_start_time
+                    )
+                    synchronized_results.append(synchronized_result)
+                    keys_to_remove.append(key)
+
+                    if is_complete:
+                        complete_count += 1
+                    else:
+                        partial_count += 1
+
+                    logging.debug(f"Processed timeout for key {key} with {len(deployment_results)} deployments (complete: {is_complete})")
+
+            # Check for complete results (not timed out yet)
+            for key, deployment_results in list(self._pending_results.items()):
+                if key not in keys_to_remove and key not in self._timed_out_keys and len(deployment_results) == self.deployment_count:
+                    sync_start_time = self._result_timestamps[key]
+                    synchronized_result = self._create_synchronized_result(
+                        key, deployment_results, True, False, sync_start_time
+                    )
+                    synchronized_results.append(synchronized_result)
+                    keys_to_remove.append(key)
+                    complete_count += 1
+
+            # Batch remove processed keys and cleanup all related data structures
+            for key in keys_to_remove:
+                self._pending_results.pop(key, None)
+                self._result_timestamps.pop(key, None)
+                self._latest_deployment_results.pop(key, None)
+                # Don't remove from _timed_out_keys yet - keep for duplicate prevention
+
+        # Batch update statistics
+        if synchronized_results:
+            with self._stats_lock:
+                self._stats["complete_syncs"] += complete_count
+                self._stats["partial_syncs"] += partial_count
+                self._stats["results_synchronized"] += len(synchronized_results)
+                self._stats["pending_keys"] = len(self._pending_results)
+                if partial_count > 0:
+                    self._stats["timeouts"] += partial_count
+
+            # Reduce debug logging frequency for performance
+            if complete_count > 0 and self._stats["complete_syncs"] % 100 == 0:
+                logging.debug(f"Processed {complete_count} complete syncs, {partial_count} partial syncs")
+            elif partial_count > 0 and self._stats["partial_syncs"] % 10 == 0:
+                logging.warning(f"Processed {partial_count} partial syncs (timeouts), {complete_count} complete syncs")
+
+        return synchronized_results
+
+    def _cleanup_old_timed_out_keys(self, current_time: float):
+        """Clean up old timed-out keys to prevent memory leaks."""
+        # Clean up timed-out keys older than 2x the sync timeout
+        cleanup_age = self.sync_timeout * 2
+        keys_to_cleanup = []
+
+        for key in self._timed_out_keys:
+            # Check if we have timestamp info for this key
+            if key in self._result_timestamps:
+                key_age = current_time - self._result_timestamps[key]
+                if key_age > cleanup_age:
+                    keys_to_cleanup.append(key)
+            else:
+                # If no timestamp, it's safe to cleanup (shouldn't happen but defensive)
+                keys_to_cleanup.append(key)
+
+        # Remove old keys
+        for key in keys_to_cleanup:
+            self._timed_out_keys.discard(key)
+
+        if keys_to_cleanup:
+            logging.debug(f"Cleaned up {len(keys_to_cleanup)} old timed-out keys")
+
+    def _send_synchronized_result(self, synchronized_result: Dict):
+        """Send a single synchronized result to the output queue."""
+        try:
+            self.synchronized_results_queue.put(synchronized_result)
+
+            logging.debug(
+                f"Sent synchronized result for group {synchronized_result.get('stream_group_key')}, "
+                f"stream {synchronized_result['stream_key']}, "
+                f"order {synchronized_result['input_order']}"
+            )
+
+        except Exception as exc:
+            self._record_error(f"Error sending synchronized result: {str(exc)}")
+
+    def _synchronization_worker(self):
+        """Optimized main synchronization worker thread for immediate processing."""
+        logging.info("Results synchronization worker started")
+        last_log_time = time.time()
+        last_cleanup_time = time.time()
+        log_interval = 30.0  # Log every 30 seconds instead of every cycle
+        cleanup_interval = 120.0  # Clean up old timed-out keys every 2 minutes
+
+        while not self._stop_synchronization.is_set():
+            try:
+                # Collect new results for immediate processing
+                results_collected = self._collect_results_from_queues()
+
+                # Process synchronized results (complete or timed out)
+                synchronized_results = self._process_synchronized_results()
+
+                # Send results immediately
+                for synchronized_result in synchronized_results:
+                    self._send_synchronized_result(synchronized_result)
+
+                # Reduced frequency logging for performance
+                current_time = time.time()
+                if (results_collected > 0 or synchronized_results) and (current_time - last_log_time) > log_interval:
+                    with self._stats_lock:
+                        total_syncs = self._stats['complete_syncs'] + self._stats['partial_syncs']
+                        completion_rate = self._stats['complete_syncs'] / max(total_syncs, 1)
+                        logging.debug(
+                            f"Synchronizer: collected={results_collected}, "
+                            f"synchronized={len(synchronized_results)}, "
+                            f"pending_keys={self._stats['pending_keys']}, "
+                            f"timed_out_keys={len(self._timed_out_keys)}, "
+                            f"duplicates_prevented={self._stats['duplicates_prevented']}, "
+                            f"completion_rate={completion_rate:.3f}, "
+                            f"avg_sync_time={self._timing_stats['avg_sync_time']:.3f}s"
+                        )
+                    last_log_time = current_time
+
+                # Periodic cleanup of old timed-out keys
+                if (current_time - last_cleanup_time) > cleanup_interval:
+                    with self._pending_lock:
+                        self._cleanup_old_timed_out_keys(current_time)
+                    with self._stats_lock:
+                        self._stats["timed_out_keys"] = len(self._timed_out_keys)
+                    last_cleanup_time = current_time
+
+                # Minimal delay for immediate processing
+                if results_collected > 0 or synchronized_results:
+                    time.sleep(0.001)  # Activity detected, minimal delay
+                else:
+                    time.sleep(0.01)  # No activity, short delay
+
+            except Exception as exc:
+                if not self._stop_synchronization.is_set():
+                    self._record_error(f"Error in synchronization worker: {str(exc)}")
+                    time.sleep(0.1)  # Prevent tight error loops
+
+        # Process any remaining results before stopping
+        try:
+            final_results = self._process_synchronized_results()
+            if final_results:
+                for synchronized_result in final_results:
+                    self._send_synchronized_result(synchronized_result)
+                logging.info(f"Processed {len(final_results)} final results during shutdown")
+        except Exception as exc:
+            logging.error(f"Error processing final results: {exc}")
+
+        logging.info("Results synchronization worker stopped")
+
+    def start_synchronization(self) -> bool:
+        """
+        Start the results synchronization process.
+
+        Returns:
+            bool: True if synchronization started successfully, False otherwise
+        """
+
+        if self._is_running:
+            logging.warning("Results synchronization is already running")
+            return True
+
+        self._is_running = True
+        self._timing_stats["start_time"] = time.time()
+        self._stop_synchronization.clear()
+
+        try:
+            # Start synchronization thread
+            self._synchronization_thread = threading.Thread(
+                target=self._synchronization_worker,
+                name="ResultsSynchronizer",
+                daemon=True,
+            )
+            self._synchronization_thread.start()
+
+            logging.info(
+                f"Started results synchronization for {len(self.results_queues)} deployment queues "
+                f"with timeout {self.sync_timeout}s"
+            )
+            return True
+
+        except Exception as exc:
+            self._record_error(f"Failed to start results synchronization: {str(exc)}")
+            self.stop_synchronization()
+            return False
+
+    def stop_synchronization(self):
+        """Stop the results synchronization process."""
+        if not self._is_running:
+            logging.info("Results synchronization is not running")
+            return
+
+        self._is_running = False
+        self._stop_synchronization.set()
+
+        logging.info("Stopping results synchronization...")
+
+        # Wait for synchronization thread to complete
+        if self._synchronization_thread and self._synchronization_thread.is_alive():
+            try:
+                self._synchronization_thread.join(timeout=5.0)
+                if self._synchronization_thread.is_alive():
+                    logging.warning(
+                        "Results synchronization thread did not stop gracefully"
+                    )
+            except Exception as exc:
+                logging.error(f"Error joining synchronization thread: {exc}")
+
+        self._synchronization_thread = None
+        logging.info("Results synchronization stopped")
+
+    def get_stats(self) -> Dict:
+        """Get current synchronization statistics."""
+        with self._stats_lock:
+            stats = self._stats.copy()
+            timing_stats = self._timing_stats.copy()
+
+        # Merge statistics
+        stats.update(timing_stats)
+
+        # Add runtime statistics
+        if stats["start_time"]:
+            stats["runtime_seconds"] = time.time() - stats["start_time"]
+
+        # Add calculated metrics
+        total_syncs = stats["complete_syncs"] + stats["partial_syncs"]
+        if total_syncs > 0:
+            stats["completion_rate"] = stats["complete_syncs"] / total_syncs
+            stats["timeout_rate"] = stats["timeouts"] / total_syncs
+        else:
+            stats["completion_rate"] = 0.0
+            stats["timeout_rate"] = 0.0
+
+        stats["output_queue_size"] = self.synchronized_results_queue.qsize()
+
+        # Add performance metrics
+        stats["deployment_count"] = self.deployment_count
+
+        return stats
+
+    def get_health_status(self) -> Dict:
+        """Get health status of the synchronizer."""
+        health = {
+            "status": "healthy",
+            "is_running": self._is_running,
+            "deployments": len(self.results_queues),
+            "queue_sizes": {},
+            "pending_sync_keys": 0,
+            "errors": 0,
+            "completion_rate": 0.0,
+            "avg_sync_time": 0.0,
+        }
+
+        # Check queue sizes
+        for deployment_id, queue in self.results_queues.items():
+            queue_size = queue.qsize()
+            health["queue_sizes"][deployment_id] = queue_size
+
+        # Calculate completion rate
+        with self._stats_lock:
+            total_syncs = self._stats["complete_syncs"] + self._stats["partial_syncs"]
+            if total_syncs > 0:
+                health["completion_rate"] = self._stats["complete_syncs"] / total_syncs
+
+            health["errors"] = self._stats["errors"]
+            health["pending_sync_keys"] = self._stats["pending_keys"]
+            health["avg_sync_time"] = self._timing_stats["avg_sync_time"]
+
+        # Check for recent errors (within last 60 seconds)
+        if (
+            self._timing_stats["last_error_time"]
+            and (time.time() - self._timing_stats["last_error_time"]) < 60
+        ):
+            health["status"] = "degraded"
+            health["recent_error"] = self._timing_stats["last_error"]
+            health["issue"] = f"Recent error: {self._timing_stats['last_error']}"
+            logging.warning(f"Synchronizer degraded due to recent error: {self._timing_stats['last_error']}")
+
+        # Check for excessive pending keys (potential memory issue)
+        if self._stats["pending_keys"] > 1000:
+            health["status"] = "degraded"
+            health["issue"] = f"Too many pending sync keys ({self._stats['pending_keys']})"
+            logging.warning(f"Synchronizer degraded: too many pending sync keys ({self._stats['pending_keys']}, threshold: 1000)")
+
+        # Check completion rate
+        if total_syncs > 10 and health["completion_rate"] < 0.8:  # Less than 80% completion
+            health["status"] = "degraded"
+            health["issue"] = f"Low completion rate: {health['completion_rate']:.2%} ({self._stats['complete_syncs']}/{total_syncs})"
+            logging.warning(f"Synchronizer degraded: low completion rate {health['completion_rate']:.2%} ({self._stats['complete_syncs']}/{total_syncs} complete)")
+
+        # Check sync time
+        if self._timing_stats["avg_sync_time"] > self.sync_timeout * 0.8:  # Average sync time near timeout
+            health["status"] = "degraded"
+            health["issue"] = f"High average sync time: {self._timing_stats['avg_sync_time']:.2f}s (timeout: {self.sync_timeout}s)"
+            logging.warning(f"Synchronizer degraded: high average sync time {self._timing_stats['avg_sync_time']:.2f}s (timeout threshold: {self.sync_timeout * 0.8:.1f}s)")
+
+        # Check if not running when it should be
+        if not self._is_running:
+            health["status"] = "unhealthy"
+            health["issue"] = "Synchronizer is not running"
+            logging.error("Synchronizer is not running")
+
+        return health
+
+    def force_sync_pending(self) -> int:
+        """Force synchronization of all pending results regardless of completeness."""
+        with self._pending_lock:
+            pending_count = len(self._pending_results)
+            if pending_count == 0:
+                return 0
+
+            # Get all pending results
+            synchronized_results = []
+            for key, deployment_results in self._pending_results.items():
+                sync_start_time = self._result_timestamps.get(key, time.time())
+                synchronized_result = self._create_synchronized_result(
+                    key, deployment_results, False, True, sync_start_time
+                )
+                synchronized_results.append(synchronized_result)
+
+            # Clear pending state
+            self._pending_results.clear()
+            self._result_timestamps.clear()
+            self._timeout_queue.clear()
+            self._latest_deployment_results.clear()
+            # Don't clear _timed_out_keys to maintain duplicate prevention
+
+        with self._stats_lock:
+            self._stats["pending_keys"] = 0
+
+        # Send each result individually
+        for synchronized_result in synchronized_results:
+            self._send_synchronized_result(synchronized_result)
+
+        logging.info(f"Force synchronized {pending_count} pending result keys")
+        return pending_count
+
+    def cleanup(self):
+        """Clean up resources."""
+        self.stop_synchronization()
+
+        # Clear queues safely
+        try:
+            while not self.synchronized_results_queue.empty():
+                self.synchronized_results_queue.get_nowait()
+        except Exception:
+            pass
+
+        # Clear internal state
+        with self._pending_lock:
+            self._pending_results.clear()
+            self._result_timestamps.clear()
+            self._timeout_queue.clear()
+            self._timed_out_keys.clear()
+            self._latest_deployment_results.clear()
+
+        logging.info("Results synchronizer cleanup completed")
+
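For context on how this class is driven, the sketch below is a minimal, hypothetical usage example, not code shipped in the wheel. It assumes two illustrative deployment IDs ("det" and "cls"), that the surrounding aggregation pipeline enqueues per-deployment results as (input_order, deployment_id, result) tuples on the PriorityQueue objects passed to the constructor (the collector only reads the last tuple element), and that each result dict carries stream_key, stream_group_key, and input_order as the collection loop above expects. The import path mirrors the file's location in the wheel and may need adjustment since matrice_inference/tmp ships without an __init__.py.

from queue import PriorityQueue

# Import path follows the wheel layout; adjust if the tmp package is not importable as-is.
from matrice_inference.tmp.aggregator.synchronizer import ResultsSynchronizer

# Hypothetical deployment IDs; real IDs come from the deployment layer.
queues = {"det": PriorityQueue(), "cls": PriorityQueue()}
sync = ResultsSynchronizer(results_queues=queues, sync_timeout=5.0)
sync.start_synchronization()

# Each deployment reports a result for the same (stream_group_key, stream_key, input_order).
for dep_id, payload in [("det", {"boxes": []}), ("cls", {"label": "ok"})]:
    result = {
        "stream_group_key": "group-1",   # illustrative keys, not from the package
        "stream_key": "camera-1",
        "input_order": 1,
        "deployment_id": dep_id,
        "output": payload,
    }
    queues[dep_id].put((result["input_order"], dep_id, result))

# Once both deployments have reported for the key, a complete synchronized result
# is placed on the output queue by the worker thread.
merged = sync.synchronized_results_queue.get(timeout=10)
print(merged["synchronization_metadata"]["complete"])   # True
print(sorted(merged["deployment_results"].keys()))      # ['cls', 'det']

sync.stop_synchronization()
sync.cleanup()

If a deployment never reports, the timeout heap emits a partial result after sync_timeout seconds instead, with "complete": False and the absent IDs listed under "missing_deployments".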