truthound-dashboard 1.3.0-py3-none-any.whl → 1.4.0-py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only.
Files changed (169)
  1. truthound_dashboard/api/alerts.py +258 -0
  2. truthound_dashboard/api/anomaly.py +1302 -0
  3. truthound_dashboard/api/cross_alerts.py +352 -0
  4. truthound_dashboard/api/deps.py +143 -0
  5. truthound_dashboard/api/drift_monitor.py +540 -0
  6. truthound_dashboard/api/lineage.py +1151 -0
  7. truthound_dashboard/api/maintenance.py +363 -0
  8. truthound_dashboard/api/middleware.py +373 -1
  9. truthound_dashboard/api/model_monitoring.py +805 -0
  10. truthound_dashboard/api/notifications_advanced.py +2452 -0
  11. truthound_dashboard/api/plugins.py +2096 -0
  12. truthound_dashboard/api/profile.py +211 -14
  13. truthound_dashboard/api/reports.py +853 -0
  14. truthound_dashboard/api/router.py +147 -0
  15. truthound_dashboard/api/rule_suggestions.py +310 -0
  16. truthound_dashboard/api/schema_evolution.py +231 -0
  17. truthound_dashboard/api/sources.py +47 -3
  18. truthound_dashboard/api/triggers.py +190 -0
  19. truthound_dashboard/api/validations.py +13 -0
  20. truthound_dashboard/api/validators.py +333 -4
  21. truthound_dashboard/api/versioning.py +309 -0
  22. truthound_dashboard/api/websocket.py +301 -0
  23. truthound_dashboard/core/__init__.py +27 -0
  24. truthound_dashboard/core/anomaly.py +1395 -0
  25. truthound_dashboard/core/anomaly_explainer.py +633 -0
  26. truthound_dashboard/core/cache.py +206 -0
  27. truthound_dashboard/core/cached_services.py +422 -0
  28. truthound_dashboard/core/charts.py +352 -0
  29. truthound_dashboard/core/connections.py +1069 -42
  30. truthound_dashboard/core/cross_alerts.py +837 -0
  31. truthound_dashboard/core/drift_monitor.py +1477 -0
  32. truthound_dashboard/core/drift_sampling.py +669 -0
  33. truthound_dashboard/core/i18n/__init__.py +42 -0
  34. truthound_dashboard/core/i18n/detector.py +173 -0
  35. truthound_dashboard/core/i18n/messages.py +564 -0
  36. truthound_dashboard/core/lineage.py +971 -0
  37. truthound_dashboard/core/maintenance.py +443 -5
  38. truthound_dashboard/core/model_monitoring.py +1043 -0
  39. truthound_dashboard/core/notifications/channels.py +1020 -1
  40. truthound_dashboard/core/notifications/deduplication/__init__.py +143 -0
  41. truthound_dashboard/core/notifications/deduplication/policies.py +274 -0
  42. truthound_dashboard/core/notifications/deduplication/service.py +400 -0
  43. truthound_dashboard/core/notifications/deduplication/stores.py +2365 -0
  44. truthound_dashboard/core/notifications/deduplication/strategies.py +422 -0
  45. truthound_dashboard/core/notifications/dispatcher.py +43 -0
  46. truthound_dashboard/core/notifications/escalation/__init__.py +149 -0
  47. truthound_dashboard/core/notifications/escalation/backends.py +1384 -0
  48. truthound_dashboard/core/notifications/escalation/engine.py +429 -0
  49. truthound_dashboard/core/notifications/escalation/models.py +336 -0
  50. truthound_dashboard/core/notifications/escalation/scheduler.py +1187 -0
  51. truthound_dashboard/core/notifications/escalation/state_machine.py +330 -0
  52. truthound_dashboard/core/notifications/escalation/stores.py +2896 -0
  53. truthound_dashboard/core/notifications/events.py +49 -0
  54. truthound_dashboard/core/notifications/metrics/__init__.py +115 -0
  55. truthound_dashboard/core/notifications/metrics/base.py +528 -0
  56. truthound_dashboard/core/notifications/metrics/collectors.py +583 -0
  57. truthound_dashboard/core/notifications/routing/__init__.py +169 -0
  58. truthound_dashboard/core/notifications/routing/combinators.py +184 -0
  59. truthound_dashboard/core/notifications/routing/config.py +375 -0
  60. truthound_dashboard/core/notifications/routing/config_parser.py +867 -0
  61. truthound_dashboard/core/notifications/routing/engine.py +382 -0
  62. truthound_dashboard/core/notifications/routing/expression_engine.py +1269 -0
  63. truthound_dashboard/core/notifications/routing/jinja2_engine.py +774 -0
  64. truthound_dashboard/core/notifications/routing/rules.py +625 -0
  65. truthound_dashboard/core/notifications/routing/validator.py +678 -0
  66. truthound_dashboard/core/notifications/service.py +2 -0
  67. truthound_dashboard/core/notifications/stats_aggregator.py +850 -0
  68. truthound_dashboard/core/notifications/throttling/__init__.py +83 -0
  69. truthound_dashboard/core/notifications/throttling/builder.py +311 -0
  70. truthound_dashboard/core/notifications/throttling/stores.py +1859 -0
  71. truthound_dashboard/core/notifications/throttling/throttlers.py +633 -0
  72. truthound_dashboard/core/openlineage.py +1028 -0
  73. truthound_dashboard/core/plugins/__init__.py +39 -0
  74. truthound_dashboard/core/plugins/docs/__init__.py +39 -0
  75. truthound_dashboard/core/plugins/docs/extractor.py +703 -0
  76. truthound_dashboard/core/plugins/docs/renderers.py +804 -0
  77. truthound_dashboard/core/plugins/hooks/__init__.py +63 -0
  78. truthound_dashboard/core/plugins/hooks/decorators.py +367 -0
  79. truthound_dashboard/core/plugins/hooks/manager.py +403 -0
  80. truthound_dashboard/core/plugins/hooks/protocols.py +265 -0
  81. truthound_dashboard/core/plugins/lifecycle/__init__.py +41 -0
  82. truthound_dashboard/core/plugins/lifecycle/hot_reload.py +584 -0
  83. truthound_dashboard/core/plugins/lifecycle/machine.py +419 -0
  84. truthound_dashboard/core/plugins/lifecycle/states.py +266 -0
  85. truthound_dashboard/core/plugins/loader.py +504 -0
  86. truthound_dashboard/core/plugins/registry.py +810 -0
  87. truthound_dashboard/core/plugins/reporter_executor.py +588 -0
  88. truthound_dashboard/core/plugins/sandbox/__init__.py +59 -0
  89. truthound_dashboard/core/plugins/sandbox/code_validator.py +243 -0
  90. truthound_dashboard/core/plugins/sandbox/engines.py +770 -0
  91. truthound_dashboard/core/plugins/sandbox/protocols.py +194 -0
  92. truthound_dashboard/core/plugins/sandbox.py +617 -0
  93. truthound_dashboard/core/plugins/security/__init__.py +68 -0
  94. truthound_dashboard/core/plugins/security/analyzer.py +535 -0
  95. truthound_dashboard/core/plugins/security/policies.py +311 -0
  96. truthound_dashboard/core/plugins/security/protocols.py +296 -0
  97. truthound_dashboard/core/plugins/security/signing.py +842 -0
  98. truthound_dashboard/core/plugins/security.py +446 -0
  99. truthound_dashboard/core/plugins/validator_executor.py +401 -0
  100. truthound_dashboard/core/plugins/versioning/__init__.py +51 -0
  101. truthound_dashboard/core/plugins/versioning/constraints.py +377 -0
  102. truthound_dashboard/core/plugins/versioning/dependencies.py +541 -0
  103. truthound_dashboard/core/plugins/versioning/semver.py +266 -0
  104. truthound_dashboard/core/profile_comparison.py +601 -0
  105. truthound_dashboard/core/report_history.py +570 -0
  106. truthound_dashboard/core/reporters/__init__.py +57 -0
  107. truthound_dashboard/core/reporters/base.py +296 -0
  108. truthound_dashboard/core/reporters/csv_reporter.py +155 -0
  109. truthound_dashboard/core/reporters/html_reporter.py +598 -0
  110. truthound_dashboard/core/reporters/i18n/__init__.py +65 -0
  111. truthound_dashboard/core/reporters/i18n/base.py +494 -0
  112. truthound_dashboard/core/reporters/i18n/catalogs.py +930 -0
  113. truthound_dashboard/core/reporters/json_reporter.py +160 -0
  114. truthound_dashboard/core/reporters/junit_reporter.py +233 -0
  115. truthound_dashboard/core/reporters/markdown_reporter.py +207 -0
  116. truthound_dashboard/core/reporters/pdf_reporter.py +209 -0
  117. truthound_dashboard/core/reporters/registry.py +272 -0
  118. truthound_dashboard/core/rule_generator.py +2088 -0
  119. truthound_dashboard/core/scheduler.py +822 -12
  120. truthound_dashboard/core/schema_evolution.py +858 -0
  121. truthound_dashboard/core/services.py +152 -9
  122. truthound_dashboard/core/statistics.py +718 -0
  123. truthound_dashboard/core/streaming_anomaly.py +883 -0
  124. truthound_dashboard/core/triggers/__init__.py +45 -0
  125. truthound_dashboard/core/triggers/base.py +226 -0
  126. truthound_dashboard/core/triggers/evaluators.py +609 -0
  127. truthound_dashboard/core/triggers/factory.py +363 -0
  128. truthound_dashboard/core/unified_alerts.py +870 -0
  129. truthound_dashboard/core/validation_limits.py +509 -0
  130. truthound_dashboard/core/versioning.py +709 -0
  131. truthound_dashboard/core/websocket/__init__.py +59 -0
  132. truthound_dashboard/core/websocket/manager.py +512 -0
  133. truthound_dashboard/core/websocket/messages.py +130 -0
  134. truthound_dashboard/db/__init__.py +30 -0
  135. truthound_dashboard/db/models.py +3375 -3
  136. truthound_dashboard/main.py +22 -0
  137. truthound_dashboard/schemas/__init__.py +396 -1
  138. truthound_dashboard/schemas/anomaly.py +1258 -0
  139. truthound_dashboard/schemas/base.py +4 -0
  140. truthound_dashboard/schemas/cross_alerts.py +334 -0
  141. truthound_dashboard/schemas/drift_monitor.py +890 -0
  142. truthound_dashboard/schemas/lineage.py +428 -0
  143. truthound_dashboard/schemas/maintenance.py +154 -0
  144. truthound_dashboard/schemas/model_monitoring.py +374 -0
  145. truthound_dashboard/schemas/notifications_advanced.py +1363 -0
  146. truthound_dashboard/schemas/openlineage.py +704 -0
  147. truthound_dashboard/schemas/plugins.py +1293 -0
  148. truthound_dashboard/schemas/profile.py +420 -34
  149. truthound_dashboard/schemas/profile_comparison.py +242 -0
  150. truthound_dashboard/schemas/reports.py +285 -0
  151. truthound_dashboard/schemas/rule_suggestion.py +434 -0
  152. truthound_dashboard/schemas/schema_evolution.py +164 -0
  153. truthound_dashboard/schemas/source.py +117 -2
  154. truthound_dashboard/schemas/triggers.py +511 -0
  155. truthound_dashboard/schemas/unified_alerts.py +223 -0
  156. truthound_dashboard/schemas/validation.py +25 -1
  157. truthound_dashboard/schemas/validators/__init__.py +11 -0
  158. truthound_dashboard/schemas/validators/base.py +151 -0
  159. truthound_dashboard/schemas/versioning.py +152 -0
  160. truthound_dashboard/static/index.html +2 -2
  161. {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/METADATA +142 -18
  162. truthound_dashboard-1.4.0.dist-info/RECORD +239 -0
  163. truthound_dashboard/static/assets/index-BCA8H1hO.js +0 -574
  164. truthound_dashboard/static/assets/index-BNsSQ2fN.css +0 -1
  165. truthound_dashboard/static/assets/unmerged_dictionaries-CsJWCRx9.js +0 -1
  166. truthound_dashboard-1.3.0.dist-info/RECORD +0 -110
  167. {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/WHEEL +0 -0
  168. {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/entry_points.txt +0 -0
  169. {truthound_dashboard-1.3.0.dist-info → truthound_dashboard-1.4.0.dist-info}/licenses/LICENSE +0 -0
truthound_dashboard/core/notifications/deduplication/stores.py
@@ -0,0 +1,2365 @@
+ """Storage backends for deduplication state.
+
+ This module provides storage backends for tracking sent notifications
+ and detecting duplicates.
+
+ Storage Backends:
+     - InMemoryDeduplicationStore: Simple in-memory storage (development)
+     - SQLiteDeduplicationStore: Persistent SQLite storage (production)
+     - RedisDeduplicationStore: Redis-based storage (distributed deployments)
+
+ Each store tracks fingerprints with timestamps and supports
+ automatic cleanup of expired entries.
+ """
+
+ from __future__ import annotations
+
+ import json
+ import sqlite3
+ import threading
+ import time
+ from abc import ABC, abstractmethod
+ from dataclasses import dataclass, field
+ from datetime import datetime, timedelta
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any
+
+ # Optional Redis dependency
+ try:
+     import redis
+     import redis.asyncio
+
+     REDIS_AVAILABLE = True
+ except ImportError:
+     REDIS_AVAILABLE = False
+     redis = None  # type: ignore[assignment]
+
+ if TYPE_CHECKING:
+     import redis as redis_sync
+     import redis.asyncio as redis_async
+
+
+ @dataclass
+ class DeduplicationEntry:
+     """A stored deduplication entry.
+
+     Attributes:
+         fingerprint: Unique fingerprint identifying the notification.
+         first_seen: When this fingerprint was first seen.
+         last_seen: When this fingerprint was last seen.
+         count: Number of times this fingerprint was seen.
+         metadata: Additional entry metadata.
+     """
+
+     fingerprint: str
+     first_seen: datetime
+     last_seen: datetime
+     count: int = 1
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+     def is_expired(self, window_seconds: int) -> bool:
+         """Check if entry has expired based on window."""
+         expiry = self.last_seen + timedelta(seconds=window_seconds)
+         return datetime.utcnow() > expiry
+
+
+ class BaseDeduplicationStore(ABC):
+     """Abstract base class for deduplication storage.
+
+     All stores must implement methods for checking, recording,
+     and cleaning up deduplication entries.
+     """
+
+     @abstractmethod
+     def exists(self, fingerprint: str, window_seconds: int) -> bool:
+         """Check if fingerprint exists within window.
+
+         Args:
+             fingerprint: The fingerprint to check.
+             window_seconds: Time window in seconds.
+
+         Returns:
+             True if fingerprint exists and is not expired.
+         """
+         ...
+
+     @abstractmethod
+     def record(self, fingerprint: str, metadata: dict[str, Any] | None = None) -> None:
+         """Record a fingerprint as sent.
+
+         Args:
+             fingerprint: The fingerprint to record.
+             metadata: Optional metadata to store.
+         """
+         ...
+
+     @abstractmethod
+     def get(self, fingerprint: str) -> DeduplicationEntry | None:
+         """Get entry by fingerprint.
+
+         Args:
+             fingerprint: The fingerprint to look up.
+
+         Returns:
+             Entry if found, None otherwise.
+         """
+         ...
+
+     @abstractmethod
+     def cleanup(self, max_age_seconds: int) -> int:
+         """Remove expired entries.
+
+         Args:
+             max_age_seconds: Maximum age of entries to keep.
+
+         Returns:
+             Number of entries removed.
+         """
+         ...
+
+     @abstractmethod
+     def clear(self) -> None:
+         """Clear all entries."""
+         ...
+
+     @abstractmethod
+     def count(self) -> int:
+         """Get total entry count."""
+         ...
+
+
+ class InMemoryDeduplicationStore(BaseDeduplicationStore):
+     """In-memory deduplication storage.
+
+     Simple thread-safe in-memory storage suitable for
+     development and single-process deployments.
+
+     Note: Data is lost on process restart.
+     """
+
+     def __init__(self) -> None:
+         """Initialize in-memory store."""
+         self._entries: dict[str, DeduplicationEntry] = {}
+         self._lock = threading.RLock()
+
+     def exists(self, fingerprint: str, window_seconds: int) -> bool:
+         """Check if fingerprint exists within window."""
+         with self._lock:
+             entry = self._entries.get(fingerprint)
+             if entry is None:
+                 return False
+             return not entry.is_expired(window_seconds)
+
+     def record(self, fingerprint: str, metadata: dict[str, Any] | None = None) -> None:
+         """Record a fingerprint."""
+         now = datetime.utcnow()
+         with self._lock:
+             if fingerprint in self._entries:
+                 entry = self._entries[fingerprint]
+                 entry.last_seen = now
+                 entry.count += 1
+                 if metadata:
+                     entry.metadata.update(metadata)
+             else:
+                 self._entries[fingerprint] = DeduplicationEntry(
+                     fingerprint=fingerprint,
+                     first_seen=now,
+                     last_seen=now,
+                     count=1,
+                     metadata=metadata or {},
+                 )
+
+     def get(self, fingerprint: str) -> DeduplicationEntry | None:
+         """Get entry by fingerprint."""
+         with self._lock:
+             return self._entries.get(fingerprint)
+
+     def cleanup(self, max_age_seconds: int) -> int:
+         """Remove expired entries."""
+         cutoff = datetime.utcnow() - timedelta(seconds=max_age_seconds)
+         removed = 0
+
+         with self._lock:
+             expired = [
+                 fp for fp, entry in self._entries.items()
+                 if entry.last_seen < cutoff
+             ]
+             for fp in expired:
+                 del self._entries[fp]
+                 removed += 1
+
+         return removed
+
+     def clear(self) -> None:
+         """Clear all entries."""
+         with self._lock:
+             self._entries.clear()
+
+     def count(self) -> int:
+         """Get total entry count."""
+         with self._lock:
+             return len(self._entries)
+
+
+ class SQLiteDeduplicationStore(BaseDeduplicationStore):
+     """SQLite-based persistent deduplication storage.
+
+     Provides durable storage that survives process restarts.
+     Thread-safe using connection pooling.
+
+     Attributes:
+         db_path: Path to SQLite database file.
+     """
+
+     def __init__(self, db_path: str | Path = "deduplication.db") -> None:
+         """Initialize SQLite store.
+
+         Args:
+             db_path: Path to database file.
+         """
+         self.db_path = Path(db_path)
+         self._local = threading.local()
+         self._init_db()
+
+     def _get_connection(self) -> sqlite3.Connection:
+         """Get thread-local database connection."""
+         if not hasattr(self._local, "connection"):
+             self._local.connection = sqlite3.connect(
+                 str(self.db_path),
+                 check_same_thread=False,
+             )
+             self._local.connection.row_factory = sqlite3.Row
+         return self._local.connection
+
+     def _init_db(self) -> None:
+         """Initialize database schema."""
+         conn = self._get_connection()
+         conn.execute("""
+             CREATE TABLE IF NOT EXISTS deduplication_entries (
+                 fingerprint TEXT PRIMARY KEY,
+                 first_seen REAL NOT NULL,
+                 last_seen REAL NOT NULL,
+                 count INTEGER NOT NULL DEFAULT 1,
+                 metadata TEXT
+             )
+         """)
+         conn.execute("""
+             CREATE INDEX IF NOT EXISTS idx_dedup_last_seen
+             ON deduplication_entries(last_seen)
+         """)
+         conn.commit()
+
+     def exists(self, fingerprint: str, window_seconds: int) -> bool:
+         """Check if fingerprint exists within window."""
+         conn = self._get_connection()
+         cutoff = time.time() - window_seconds
+
+         cursor = conn.execute(
+             """
+             SELECT 1 FROM deduplication_entries
+             WHERE fingerprint = ? AND last_seen >= ?
+             """,
+             (fingerprint, cutoff),
+         )
+         return cursor.fetchone() is not None
+
+     def record(self, fingerprint: str, metadata: dict[str, Any] | None = None) -> None:
+         """Record a fingerprint."""
+         import json
+
+         now = time.time()
+         conn = self._get_connection()
+
+         # Try to update existing
+         cursor = conn.execute(
+             """
+             UPDATE deduplication_entries
+             SET last_seen = ?, count = count + 1
+             WHERE fingerprint = ?
+             """,
+             (now, fingerprint),
+         )
+
+         if cursor.rowcount == 0:
+             # Insert new entry
+             metadata_json = json.dumps(metadata) if metadata else None
+             conn.execute(
+                 """
+                 INSERT INTO deduplication_entries
+                 (fingerprint, first_seen, last_seen, count, metadata)
+                 VALUES (?, ?, ?, 1, ?)
+                 """,
+                 (fingerprint, now, now, metadata_json),
+             )
+
+         conn.commit()
+
+     def get(self, fingerprint: str) -> DeduplicationEntry | None:
+         """Get entry by fingerprint."""
+         import json
+
+         conn = self._get_connection()
+         cursor = conn.execute(
+             """
+             SELECT fingerprint, first_seen, last_seen, count, metadata
+             FROM deduplication_entries
+             WHERE fingerprint = ?
+             """,
+             (fingerprint,),
+         )
+         row = cursor.fetchone()
+
+         if row is None:
+             return None
+
+         metadata = {}
+         if row["metadata"]:
+             try:
+                 metadata = json.loads(row["metadata"])
+             except json.JSONDecodeError:
+                 pass
+
+         return DeduplicationEntry(
+             fingerprint=row["fingerprint"],
+             first_seen=datetime.fromtimestamp(row["first_seen"]),
+             last_seen=datetime.fromtimestamp(row["last_seen"]),
+             count=row["count"],
+             metadata=metadata,
+         )
+
+     def cleanup(self, max_age_seconds: int) -> int:
+         """Remove expired entries."""
+         conn = self._get_connection()
+         cutoff = time.time() - max_age_seconds
+
+         cursor = conn.execute(
+             """
+             DELETE FROM deduplication_entries
+             WHERE last_seen < ?
+             """,
+             (cutoff,),
+         )
+         conn.commit()
+
+         return cursor.rowcount
+
+     def clear(self) -> None:
+         """Clear all entries."""
+         conn = self._get_connection()
+         conn.execute("DELETE FROM deduplication_entries")
+         conn.commit()
+
+     def count(self) -> int:
+         """Get total entry count."""
+         conn = self._get_connection()
+         cursor = conn.execute("SELECT COUNT(*) FROM deduplication_entries")
+         return cursor.fetchone()[0]
+
+     def close(self) -> None:
+         """Close database connection."""
+         if hasattr(self._local, "connection"):
+             self._local.connection.close()
+             del self._local.connection
+
+
+ class RedisDeduplicationStore(BaseDeduplicationStore):
+     """Redis-based deduplication store for distributed deployments.
+
+     Uses Redis strings with TTL for automatic expiration.
+     Supports both sync and async Redis clients with connection pooling.
+
+     This store is ideal for:
+     - Multi-process deployments
+     - Distributed systems
+     - High-concurrency scenarios
+     - Deployments requiring shared state
+
+     Note: Requires the 'redis' optional dependency.
+     Install with: pip install truthound-dashboard[redis]
+
+     Attributes:
+         redis_url: Redis connection URL.
+         key_prefix: Prefix for all Redis keys.
+         default_ttl: Default TTL in seconds for entries.
+     """
+
+     def __init__(
+         self,
+         redis_url: str = "redis://localhost:6379/0",
+         key_prefix: str = "truthound:dedup:",
+         default_ttl: int = 3600,  # 1 hour
+         max_connections: int = 10,
+         socket_timeout: float = 5.0,
+         socket_connect_timeout: float = 5.0,
+         retry_on_timeout: bool = True,
+     ) -> None:
+         """Initialize Redis store.
+
+         Args:
+             redis_url: Redis connection URL (e.g., redis://localhost:6379/0).
+             key_prefix: Prefix for all deduplication keys.
+             default_ttl: Default TTL in seconds for entries.
+             max_connections: Maximum connections in the pool.
+             socket_timeout: Socket timeout in seconds.
+             socket_connect_timeout: Connection timeout in seconds.
+             retry_on_timeout: Whether to retry on timeout.
+
+         Raises:
+             ImportError: If redis package is not installed.
+         """
+         if not REDIS_AVAILABLE:
+             raise ImportError(
+                 "Redis support requires the 'redis' package. "
+                 "Install with: pip install truthound-dashboard[redis] "
+                 "or pip install redis"
+             )
+
+         self.redis_url = redis_url
+         self.key_prefix = key_prefix
+         self.default_ttl = default_ttl
+         self.max_connections = max_connections
+         self.socket_timeout = socket_timeout
+         self.socket_connect_timeout = socket_connect_timeout
+         self.retry_on_timeout = retry_on_timeout
+
+         # Connection pool for sync client
+         self._pool: redis.ConnectionPool | None = None
+         self._client: redis.Redis | None = None
+
+         # Connection pool for async client
+         self._async_pool: redis.asyncio.ConnectionPool | None = None
+         self._async_client: redis.asyncio.Redis | None = None
+
+         # Lock for thread-safe initialization
+         self._lock = threading.Lock()
+
+     def _get_key(self, fingerprint: str) -> str:
+         """Get full Redis key for fingerprint.
+
+         Args:
+             fingerprint: The fingerprint string.
+
+         Returns:
+             Full Redis key with prefix.
+         """
+         return f"{self.key_prefix}{fingerprint}"
+
+     def _create_pool(self) -> redis.ConnectionPool:
+         """Create a connection pool for sync client.
+
+         Returns:
+             Configured connection pool.
+         """
+         return redis.ConnectionPool.from_url(
+             self.redis_url,
+             max_connections=self.max_connections,
+             socket_timeout=self.socket_timeout,
+             socket_connect_timeout=self.socket_connect_timeout,
+             retry_on_timeout=self.retry_on_timeout,
+         )
+
+     async def _create_async_pool(self) -> redis.asyncio.ConnectionPool:
+         """Create a connection pool for async client.
+
+         Returns:
+             Configured async connection pool.
+         """
+         return redis.asyncio.ConnectionPool.from_url(
+             self.redis_url,
+             max_connections=self.max_connections,
+             socket_timeout=self.socket_timeout,
+             socket_connect_timeout=self.socket_connect_timeout,
+             retry_on_timeout=self.retry_on_timeout,
+         )
+
+     @property
+     def client(self) -> redis.Redis:
+         """Get sync Redis client with connection pooling.
+
+         Creates the connection pool and client on first access.
+
+         Returns:
+             Redis client instance.
+         """
+         if self._client is None:
+             with self._lock:
+                 if self._client is None:
+                     self._pool = self._create_pool()
+                     self._client = redis.Redis(connection_pool=self._pool)
+         return self._client
+
+     async def get_async_client(self) -> redis.asyncio.Redis:
+         """Get async Redis client with connection pooling.
+
+         Creates the async connection pool and client on first access.
+
+         Returns:
+             Async Redis client instance.
+         """
+         if self._async_client is None:
+             self._async_pool = await self._create_async_pool()
+             self._async_client = redis.asyncio.Redis(
+                 connection_pool=self._async_pool
+             )
+         return self._async_client
+
+     def exists(self, fingerprint: str, window_seconds: int) -> bool:
+         """Check if fingerprint exists in Redis.
+
+         Note: Redis handles expiration via TTL, so window_seconds is not
+         used here. The entry either exists (not expired) or doesn't.
+
+         Args:
+             fingerprint: The fingerprint to check.
+             window_seconds: Time window (unused, TTL handles expiration).
+
+         Returns:
+             True if fingerprint exists and hasn't expired.
+         """
+         key = self._get_key(fingerprint)
+         return self.client.exists(key) > 0
+
+     async def exists_async(self, fingerprint: str, window_seconds: int) -> bool:
+         """Async check if fingerprint exists in Redis.
+
+         Args:
+             fingerprint: The fingerprint to check.
+             window_seconds: Time window (unused, TTL handles expiration).
+
+         Returns:
+             True if fingerprint exists and hasn't expired.
+         """
+         client = await self.get_async_client()
+         key = self._get_key(fingerprint)
+         return await client.exists(key) > 0
+
+     def record(self, fingerprint: str, metadata: dict[str, Any] | None = None) -> None:
+         """Record fingerprint with TTL.
+
+         Stores the fingerprint with metadata and sets TTL for auto-expiration.
+         If the fingerprint already exists, updates metadata and resets TTL.
+
+         Args:
+             fingerprint: The fingerprint to record.
+             metadata: Optional metadata to store with the entry.
+         """
+         key = self._get_key(fingerprint)
+         now = time.time()
+
+         # Get existing entry to preserve first_seen and increment count
+         existing = self.client.get(key)
+         if existing:
+             try:
+                 data = json.loads(existing)
+                 data["last_seen"] = now
+                 data["count"] = data.get("count", 1) + 1
+                 if metadata:
+                     data["metadata"].update(metadata)
+             except (json.JSONDecodeError, KeyError):
+                 data = {
+                     "first_seen": now,
+                     "last_seen": now,
+                     "count": 1,
+                     "metadata": metadata or {},
+                 }
+         else:
+             data = {
+                 "first_seen": now,
+                 "last_seen": now,
+                 "count": 1,
+                 "metadata": metadata or {},
+             }
+
+         value = json.dumps(data)
+         self.client.setex(key, self.default_ttl, value)
+
+     async def record_async(
+         self, fingerprint: str, metadata: dict[str, Any] | None = None
+     ) -> None:
+         """Async record fingerprint with TTL.
+
+         Args:
+             fingerprint: The fingerprint to record.
+             metadata: Optional metadata to store with the entry.
+         """
+         client = await self.get_async_client()
+         key = self._get_key(fingerprint)
+         now = time.time()
+
+         # Get existing entry to preserve first_seen and increment count
+         existing = await client.get(key)
+         if existing:
+             try:
+                 data = json.loads(existing)
+                 data["last_seen"] = now
+                 data["count"] = data.get("count", 1) + 1
+                 if metadata:
+                     data["metadata"].update(metadata)
+             except (json.JSONDecodeError, KeyError):
+                 data = {
+                     "first_seen": now,
+                     "last_seen": now,
+                     "count": 1,
+                     "metadata": metadata or {},
+                 }
+         else:
+             data = {
+                 "first_seen": now,
+                 "last_seen": now,
+                 "count": 1,
+                 "metadata": metadata or {},
+             }
+
+         value = json.dumps(data)
+         await client.setex(key, self.default_ttl, value)
+
+     def get(self, fingerprint: str) -> DeduplicationEntry | None:
+         """Get entry by fingerprint.
+
+         Args:
+             fingerprint: The fingerprint to look up.
+
+         Returns:
+             Entry if found, None otherwise.
+         """
+         key = self._get_key(fingerprint)
+         data = self.client.get(key)
+
+         if data is None:
+             return None
+
+         try:
+             parsed = json.loads(data)
+             return DeduplicationEntry(
+                 fingerprint=fingerprint,
+                 first_seen=datetime.fromtimestamp(parsed["first_seen"]),
+                 last_seen=datetime.fromtimestamp(parsed["last_seen"]),
+                 count=parsed.get("count", 1),
+                 metadata=parsed.get("metadata", {}),
+             )
+         except (json.JSONDecodeError, KeyError):
+             return None
+
+     async def get_async(self, fingerprint: str) -> DeduplicationEntry | None:
+         """Async get entry by fingerprint.
+
+         Args:
+             fingerprint: The fingerprint to look up.
+
+         Returns:
+             Entry if found, None otherwise.
+         """
+         client = await self.get_async_client()
+         key = self._get_key(fingerprint)
+         data = await client.get(key)
+
+         if data is None:
+             return None
+
+         try:
+             parsed = json.loads(data)
+             return DeduplicationEntry(
+                 fingerprint=fingerprint,
+                 first_seen=datetime.fromtimestamp(parsed["first_seen"]),
+                 last_seen=datetime.fromtimestamp(parsed["last_seen"]),
+                 count=parsed.get("count", 1),
+                 metadata=parsed.get("metadata", {}),
+             )
+         except (json.JSONDecodeError, KeyError):
+             return None
+
+     def count(self) -> int:
+         """Count entries (approximate using SCAN).
+
+         Uses SCAN to iterate through keys without blocking Redis.
+
+         Returns:
+             Approximate count of deduplication entries.
+         """
+         count = 0
+         cursor = 0
+         pattern = f"{self.key_prefix}*"
+
+         while True:
+             cursor, keys = self.client.scan(cursor, match=pattern, count=100)
+             count += len(keys)
+             if cursor == 0:
+                 break
+
+         return count
+
+     async def count_async(self) -> int:
+         """Async count entries.
+
+         Returns:
+             Approximate count of deduplication entries.
+         """
+         client = await self.get_async_client()
+         count = 0
+         cursor = 0
+         pattern = f"{self.key_prefix}*"
+
+         while True:
+             cursor, keys = await client.scan(cursor, match=pattern, count=100)
+             count += len(keys)
+             if cursor == 0:
+                 break
+
+         return count
+
+     def cleanup(self, max_age_seconds: int) -> int:
+         """Redis handles expiration via TTL, no manual cleanup needed.
+
+         This method is a no-op for Redis as TTL handles expiration automatically.
+
+         Args:
+             max_age_seconds: Maximum age (unused for Redis).
+
+         Returns:
+             Always returns 0 as Redis handles expiration.
+         """
+         # Redis handles expiration automatically via TTL
+         return 0
+
+     def clear(self) -> None:
+         """Clear all deduplication keys.
+
+         Uses SCAN to find and delete all keys with the dedup prefix.
+         This is done in batches to avoid blocking Redis.
+         """
+         pattern = f"{self.key_prefix}*"
+         cursor = 0
+
+         while True:
+             cursor, keys = self.client.scan(cursor, match=pattern, count=100)
+             if keys:
+                 self.client.delete(*keys)
+             if cursor == 0:
+                 break
+
+     async def clear_async(self) -> None:
+         """Async clear all deduplication keys."""
+         client = await self.get_async_client()
+         pattern = f"{self.key_prefix}*"
+         cursor = 0
+
+         while True:
+             cursor, keys = await client.scan(cursor, match=pattern, count=100)
+             if keys:
+                 await client.delete(*keys)
+             if cursor == 0:
+                 break
+
+     def health_check(self) -> bool:
+         """Check Redis connection health.
+
+         Performs a PING command to verify connectivity.
+
+         Returns:
+             True if Redis is reachable, False otherwise.
+         """
+         try:
+             return self.client.ping()
+         except Exception:
+             return False
+
+     async def health_check_async(self) -> bool:
+         """Async check Redis connection health.
+
+         Returns:
+             True if Redis is reachable, False otherwise.
+         """
+         try:
+             client = await self.get_async_client()
+             return await client.ping()
+         except Exception:
+             return False
+
+     def get_info(self) -> dict[str, Any]:
+         """Get Redis server information.
+
+         Returns:
+             Dictionary containing Redis server info.
+         """
+         try:
+             info = self.client.info()
+             return {
+                 "redis_version": info.get("redis_version"),
+                 "connected_clients": info.get("connected_clients"),
+                 "used_memory_human": info.get("used_memory_human"),
+                 "uptime_in_seconds": info.get("uptime_in_seconds"),
+                 "db0": info.get("db0", {}),
+             }
+         except Exception as e:
+             return {"error": str(e)}
+
+     def set_ttl(self, fingerprint: str, ttl_seconds: int) -> bool:
+         """Set custom TTL for a specific fingerprint.
+
+         Args:
+             fingerprint: The fingerprint to update.
+             ttl_seconds: New TTL in seconds.
+
+         Returns:
+             True if TTL was set, False if key doesn't exist.
+         """
+         key = self._get_key(fingerprint)
+         return self.client.expire(key, ttl_seconds)
+
+     def get_ttl(self, fingerprint: str) -> int:
+         """Get remaining TTL for a fingerprint.
+
+         Args:
+             fingerprint: The fingerprint to check.
+
+         Returns:
+             TTL in seconds, -1 if no TTL, -2 if key doesn't exist.
+         """
+         key = self._get_key(fingerprint)
+         return self.client.ttl(key)
+
+     def close(self) -> None:
+         """Close all connections and pools.
+
+         Should be called when the store is no longer needed
+         to release resources.
+         """
+         if self._client is not None:
+             self._client.close()
+             self._client = None
+
+         if self._pool is not None:
+             self._pool.disconnect()
+             self._pool = None
+
+         # Note: Async client/pool should be closed in async context
+         # using close_async() method
+
+     async def close_async(self) -> None:
+         """Async close all connections and pools."""
+         if self._async_client is not None:
+             await self._async_client.close()
+             self._async_client = None
+
+         if self._async_pool is not None:
+             await self._async_pool.disconnect()
+             self._async_pool = None
+
+     def __enter__(self) -> "RedisDeduplicationStore":
+         """Context manager entry."""
+         return self
+
+     def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
+         """Context manager exit, closes connections."""
+         self.close()
+
+     async def __aenter__(self) -> "RedisDeduplicationStore":
+         """Async context manager entry."""
+         return self
+
+     async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
+         """Async context manager exit, closes connections."""
+         await self.close_async()
+
+
+ # ============================================================================
+ # Redis Streams Deduplication Store
+ # ============================================================================
+
+
+ @dataclass
+ class DeduplicationMetrics:
+     """Metrics for deduplication store operations.
+
+     Attributes:
+         hits: Number of duplicate detections (cache hits).
+         misses: Number of non-duplicate entries (cache misses).
+         records: Number of fingerprints recorded.
+         errors: Number of Redis errors encountered.
+         fallbacks: Number of times fallback to InMemory was used.
+         reconnections: Number of successful reconnections.
+     """
+
+     hits: int = 0
+     misses: int = 0
+     records: int = 0
+     errors: int = 0
+     fallbacks: int = 0
+     reconnections: int = 0
+
+     def to_dict(self) -> dict[str, int]:
+         """Convert metrics to dictionary."""
+         return {
+             "hits": self.hits,
+             "misses": self.misses,
+             "records": self.records,
+             "errors": self.errors,
+             "fallbacks": self.fallbacks,
+             "reconnections": self.reconnections,
+             "total_checks": self.hits + self.misses,
+             "hit_rate": round(self.hits / max(1, self.hits + self.misses) * 100, 2),
+         }
+
+
+ class RedisStreamsDeduplicationStore(BaseDeduplicationStore):
905
+ """Production-ready Redis Streams based deduplication store.
906
+
907
+ Uses Redis Streams for robust distributed deduplication with:
908
+ - Connection pool management with configurable pool size
909
+ - Automatic reconnection with exponential backoff
910
+ - TTL management for stream entries (auto-cleanup)
911
+ - Consumer groups for distributed deduplication
912
+ - Graceful degradation (fallback to InMemory on Redis failure)
913
+ - Health check endpoint support
914
+ - Comprehensive metrics collection
915
+
916
+ Configuration via environment variables:
917
+ TRUTHOUND_DEDUP_REDIS_URL: Redis connection URL (default: redis://localhost:6379/0)
918
+ TRUTHOUND_DEDUP_REDIS_PREFIX: Key prefix (default: truthound:dedup:streams:)
919
+ TRUTHOUND_DEDUP_REDIS_TTL: Default TTL in seconds (default: 3600)
920
+ TRUTHOUND_DEDUP_REDIS_POOL_SIZE: Connection pool size (default: 10)
921
+ TRUTHOUND_DEDUP_REDIS_SOCKET_TIMEOUT: Socket timeout (default: 5.0)
922
+ TRUTHOUND_DEDUP_REDIS_CONNECT_TIMEOUT: Connection timeout (default: 5.0)
923
+ TRUTHOUND_DEDUP_REDIS_MAX_RETRIES: Max retry attempts (default: 3)
924
+ TRUTHOUND_DEDUP_REDIS_RETRY_BASE_DELAY: Base delay for exponential backoff (default: 1.0)
925
+ TRUTHOUND_DEDUP_REDIS_CONSUMER_GROUP: Consumer group name (default: truthound-dedup)
926
+ TRUTHOUND_DEDUP_REDIS_CONSUMER_NAME: Consumer name (default: auto-generated)
927
+ TRUTHOUND_DEDUP_REDIS_STREAM_MAX_LEN: Max stream length (default: 100000)
928
+ TRUTHOUND_DEDUP_FALLBACK_ENABLED: Enable fallback to InMemory (default: true)
929
+
930
+ Example:
931
+ # Basic usage
932
+ store = RedisStreamsDeduplicationStore()
933
+
934
+ # Custom configuration
935
+ store = RedisStreamsDeduplicationStore(
936
+ redis_url="redis://myredis:6379/1",
937
+ default_ttl=7200,
938
+ max_connections=20,
939
+ enable_fallback=True,
940
+ )
941
+
942
+ # With context manager
943
+ async with RedisStreamsDeduplicationStore() as store:
944
+ if not await store.exists_async("fingerprint", 300):
945
+ await store.record_async("fingerprint", {"key": "value"})
946
+
947
+ Note: Requires the 'redis' optional dependency.
948
+ Install with: pip install truthound-dashboard[redis]
949
+ """
950
+
951
+ # Stream entry field names
952
+ FIELD_FINGERPRINT = "fingerprint"
953
+ FIELD_FIRST_SEEN = "first_seen"
954
+ FIELD_LAST_SEEN = "last_seen"
955
+ FIELD_COUNT = "count"
956
+ FIELD_METADATA = "metadata"
957
+
958
+ def __init__(
959
+ self,
960
+ redis_url: str | None = None,
961
+ key_prefix: str | None = None,
962
+ default_ttl: int | None = None,
963
+ max_connections: int | None = None,
964
+ socket_timeout: float | None = None,
965
+ socket_connect_timeout: float | None = None,
966
+ max_retries: int | None = None,
967
+ retry_base_delay: float | None = None,
968
+ consumer_group: str | None = None,
969
+ consumer_name: str | None = None,
970
+ stream_max_len: int | None = None,
971
+ enable_fallback: bool | None = None,
972
+ logger: Any | None = None,
973
+ ) -> None:
974
+ """Initialize Redis Streams deduplication store.
975
+
976
+ All parameters can be configured via environment variables if not
977
+ explicitly provided.
978
+
979
+ Args:
980
+ redis_url: Redis connection URL.
981
+ key_prefix: Prefix for all Redis keys.
982
+ default_ttl: Default TTL in seconds for entries.
983
+ max_connections: Maximum connections in the pool.
984
+ socket_timeout: Socket timeout in seconds.
985
+ socket_connect_timeout: Connection timeout in seconds.
986
+ max_retries: Maximum retry attempts for reconnection.
987
+ retry_base_delay: Base delay for exponential backoff.
988
+ consumer_group: Consumer group name for stream processing.
989
+ consumer_name: Consumer name (auto-generated if not provided).
990
+ stream_max_len: Maximum stream length (MAXLEN).
991
+ enable_fallback: Enable fallback to InMemory on Redis failure.
992
+ logger: Custom logger instance.
993
+
994
+ Raises:
995
+ ImportError: If redis package is not installed.
996
+ """
997
+ import logging
998
+ import os
999
+ import uuid
1000
+
1001
+ if not REDIS_AVAILABLE:
1002
+ raise ImportError(
1003
+ "Redis support requires the 'redis' package. "
1004
+ "Install with: pip install truthound-dashboard[redis] "
1005
+ "or pip install redis"
1006
+ )
1007
+
1008
+ # Configuration from environment or parameters
1009
+ self.redis_url = redis_url or os.getenv(
1010
+ "TRUTHOUND_DEDUP_REDIS_URL", "redis://localhost:6379/0"
1011
+ )
1012
+ self.key_prefix = key_prefix or os.getenv(
1013
+ "TRUTHOUND_DEDUP_REDIS_PREFIX", "truthound:dedup:streams:"
1014
+ )
1015
+ self.default_ttl = default_ttl or int(
1016
+ os.getenv("TRUTHOUND_DEDUP_REDIS_TTL", "3600")
1017
+ )
1018
+ self.max_connections = max_connections or int(
1019
+ os.getenv("TRUTHOUND_DEDUP_REDIS_POOL_SIZE", "10")
1020
+ )
1021
+ self.socket_timeout = socket_timeout or float(
1022
+ os.getenv("TRUTHOUND_DEDUP_REDIS_SOCKET_TIMEOUT", "5.0")
1023
+ )
1024
+ self.socket_connect_timeout = socket_connect_timeout or float(
1025
+ os.getenv("TRUTHOUND_DEDUP_REDIS_CONNECT_TIMEOUT", "5.0")
1026
+ )
1027
+ self.max_retries = max_retries or int(
1028
+ os.getenv("TRUTHOUND_DEDUP_REDIS_MAX_RETRIES", "3")
1029
+ )
1030
+ self.retry_base_delay = retry_base_delay or float(
1031
+ os.getenv("TRUTHOUND_DEDUP_REDIS_RETRY_BASE_DELAY", "1.0")
1032
+ )
1033
+ self.consumer_group = consumer_group or os.getenv(
1034
+ "TRUTHOUND_DEDUP_REDIS_CONSUMER_GROUP", "truthound-dedup"
1035
+ )
1036
+ self.consumer_name = consumer_name or os.getenv(
1037
+ "TRUTHOUND_DEDUP_REDIS_CONSUMER_NAME", f"consumer-{uuid.uuid4().hex[:8]}"
1038
+ )
1039
+ self.stream_max_len = stream_max_len or int(
1040
+ os.getenv("TRUTHOUND_DEDUP_REDIS_STREAM_MAX_LEN", "100000")
1041
+ )
1042
+
1043
+ fallback_env = os.getenv("TRUTHOUND_DEDUP_FALLBACK_ENABLED", "true")
1044
+ self.enable_fallback = (
1045
+ enable_fallback
1046
+ if enable_fallback is not None
1047
+ else fallback_env.lower() == "true"
1048
+ )
1049
+
1050
+ # Logger setup
1051
+ self._logger = logger or logging.getLogger(__name__)
1052
+
1053
+ # Connection pool for sync client
1054
+ self._pool: redis.ConnectionPool | None = None
1055
+ self._client: redis.Redis | None = None
1056
+
1057
+ # Connection pool for async client
1058
+ self._async_pool: redis.asyncio.ConnectionPool | None = None
1059
+ self._async_client: redis.asyncio.Redis | None = None
1060
+
1061
+ # Locks for thread-safe initialization
1062
+ self._lock = threading.Lock()
1063
+ self._async_lock: Any = None # Created lazily for asyncio
1064
+
1065
+ # Fallback store for graceful degradation
1066
+ self._fallback_store: InMemoryDeduplicationStore | None = None
1067
+ self._using_fallback = False
1068
+
1069
+ # Connection state tracking
1070
+ self._connected = False
1071
+ self._retry_count = 0
1072
+ self._last_error: Exception | None = None
1073
+ self._last_error_time: float | None = None
1074
+
1075
+ # Metrics
1076
+ self._metrics = DeduplicationMetrics()
1077
+
1078
+ # Index tracking key (for fast lookups)
1079
+ self._index_key = f"{self.key_prefix}index"
1080
+
1081
+ # Stream key
1082
+ self._stream_key = f"{self.key_prefix}stream"
1083
+
1084
+ def _get_key(self, fingerprint: str) -> str:
1085
+ """Get full Redis key for fingerprint.
1086
+
1087
+ Args:
1088
+ fingerprint: The fingerprint string.
1089
+
1090
+ Returns:
1091
+ Full Redis key with prefix.
1092
+ """
1093
+ return f"{self.key_prefix}fp:{fingerprint}"
1094
+
1095
+ def _create_pool(self) -> "redis.ConnectionPool":
1096
+ """Create a connection pool for sync client.
1097
+
1098
+ Returns:
1099
+ Configured connection pool.
1100
+ """
1101
+ return redis.ConnectionPool.from_url(
1102
+ self.redis_url,
1103
+ max_connections=self.max_connections,
1104
+ socket_timeout=self.socket_timeout,
1105
+ socket_connect_timeout=self.socket_connect_timeout,
1106
+ retry_on_timeout=True,
1107
+ decode_responses=True,
1108
+ )
1109
+
1110
+ async def _create_async_pool(self) -> "redis.asyncio.ConnectionPool":
1111
+ """Create a connection pool for async client.
1112
+
1113
+ Returns:
1114
+ Configured async connection pool.
1115
+ """
1116
+ return redis.asyncio.ConnectionPool.from_url(
1117
+ self.redis_url,
1118
+ max_connections=self.max_connections,
1119
+ socket_timeout=self.socket_timeout,
1120
+ socket_connect_timeout=self.socket_connect_timeout,
1121
+ retry_on_timeout=True,
1122
+ decode_responses=True,
1123
+ )
1124
+
1125
+ def _get_fallback_store(self) -> InMemoryDeduplicationStore:
1126
+ """Get or create fallback in-memory store.
1127
+
1128
+ Returns:
1129
+ InMemoryDeduplicationStore instance.
1130
+ """
1131
+ if self._fallback_store is None:
1132
+ self._fallback_store = InMemoryDeduplicationStore()
1133
+ return self._fallback_store
1134
+
1135
+ def _calculate_backoff_delay(self) -> float:
1136
+ """Calculate exponential backoff delay.
1137
+
1138
+ Returns:
1139
+ Delay in seconds.
1140
+ """
1141
+ import random
1142
+
1143
+ # Exponential backoff with jitter
1144
+ delay = self.retry_base_delay * (2**self._retry_count)
1145
+ # Add jitter (up to 25% of delay)
1146
+ jitter = delay * random.uniform(0, 0.25)
1147
+ return min(delay + jitter, 60.0) # Cap at 60 seconds
1148
+
1149
+ def _handle_redis_error(self, error: Exception, operation: str) -> None:
1150
+ """Handle Redis errors with logging and metrics.
1151
+
1152
+ Args:
1153
+ error: The exception that occurred.
1154
+ operation: Name of the operation that failed.
1155
+ """
1156
+ self._metrics.errors += 1
1157
+ self._last_error = error
1158
+ self._last_error_time = time.time()
1159
+ self._connected = False
1160
+
1161
+ self._logger.error(
1162
+ f"Redis error during {operation}: {error}",
1163
+ extra={
1164
+ "operation": operation,
1165
+ "error_type": type(error).__name__,
1166
+ "retry_count": self._retry_count,
1167
+ },
1168
+ )
1169
+
1170
+ def _try_reconnect_sync(self) -> bool:
1171
+ """Attempt to reconnect to Redis synchronously.
1172
+
1173
+ Returns:
1174
+ True if reconnection successful, False otherwise.
1175
+ """
1176
+ if self._retry_count >= self.max_retries:
1177
+ self._logger.warning(
1178
+ f"Max retries ({self.max_retries}) reached, using fallback"
1179
+ )
1180
+ return False
1181
+
1182
+ delay = self._calculate_backoff_delay()
1183
+ self._logger.info(
1184
+ f"Attempting Redis reconnection in {delay:.2f}s (attempt {self._retry_count + 1}/{self.max_retries})"
1185
+ )
1186
+
1187
+ time.sleep(delay)
1188
+ self._retry_count += 1
1189
+
1190
+ try:
1191
+ # Close existing connections
1192
+ if self._client:
1193
+ try:
1194
+ self._client.close()
1195
+ except Exception:
1196
+ pass
1197
+ self._client = None
1198
+
1199
+ if self._pool:
1200
+ try:
1201
+ self._pool.disconnect()
1202
+ except Exception:
1203
+ pass
1204
+ self._pool = None
1205
+
1206
+ # Create new connection
1207
+ self._pool = self._create_pool()
1208
+ self._client = redis.Redis(connection_pool=self._pool)
1209
+
1210
+ # Test connection
1211
+ if self._client.ping():
1212
+ self._connected = True
1213
+ self._retry_count = 0
1214
+ self._using_fallback = False
1215
+ self._metrics.reconnections += 1
1216
+ self._logger.info("Redis reconnection successful")
1217
+ return True
1218
+ except Exception as e:
1219
+ self._logger.warning(f"Reconnection attempt failed: {e}")
1220
+
1221
+ return False
1222
+
1223
+ async def _try_reconnect_async(self) -> bool:
1224
+ """Attempt to reconnect to Redis asynchronously.
1225
+
1226
+ Returns:
1227
+ True if reconnection successful, False otherwise.
1228
+ """
1229
+ import asyncio
1230
+
1231
+ if self._retry_count >= self.max_retries:
1232
+ self._logger.warning(
1233
+ f"Max retries ({self.max_retries}) reached, using fallback"
1234
+ )
1235
+ return False
1236
+
1237
+ delay = self._calculate_backoff_delay()
1238
+ self._logger.info(
1239
+ f"Attempting async Redis reconnection in {delay:.2f}s (attempt {self._retry_count + 1}/{self.max_retries})"
1240
+ )
1241
+
1242
+ await asyncio.sleep(delay)
1243
+ self._retry_count += 1
1244
+
1245
+ try:
1246
+ # Close existing connections
1247
+ if self._async_client:
1248
+ try:
1249
+ await self._async_client.close()
1250
+ except Exception:
1251
+ pass
1252
+ self._async_client = None
1253
+
1254
+ if self._async_pool:
1255
+ try:
1256
+ await self._async_pool.disconnect()
1257
+ except Exception:
1258
+ pass
1259
+ self._async_pool = None
1260
+
1261
+ # Create new connection
1262
+ self._async_pool = await self._create_async_pool()
1263
+ self._async_client = redis.asyncio.Redis(connection_pool=self._async_pool)
1264
+
1265
+ # Test connection
1266
+ if await self._async_client.ping():
1267
+ self._connected = True
1268
+ self._retry_count = 0
1269
+ self._using_fallback = False
1270
+ self._metrics.reconnections += 1
1271
+ self._logger.info("Async Redis reconnection successful")
1272
+ return True
1273
+ except Exception as e:
1274
+ self._logger.warning(f"Async reconnection attempt failed: {e}")
1275
+
1276
+ return False
1277
+
1278
+ @property
1279
+ def client(self) -> "redis.Redis":
1280
+ """Get sync Redis client with connection pooling.
1281
+
1282
+ Creates the connection pool and client on first access.
1283
+ Handles reconnection on failure.
1284
+
1285
+ Returns:
1286
+ Redis client instance.
1287
+ """
1288
+ if self._client is None or not self._connected:
1289
+ with self._lock:
1290
+ if self._client is None or not self._connected:
1291
+ try:
1292
+ self._pool = self._create_pool()
1293
+ self._client = redis.Redis(connection_pool=self._pool)
1294
+ # Test connection
1295
+ self._client.ping()
1296
+ self._connected = True
1297
+ self._retry_count = 0
1298
+ self._logger.debug("Redis sync client connected")
1299
+ except Exception as e:
1300
+ self._handle_redis_error(e, "client_init")
1301
+ raise
1302
+ return self._client
1303
+
1304
+ async def get_async_client(self) -> "redis.asyncio.Redis":
1305
+ """Get async Redis client with connection pooling.
1306
+
1307
+ Creates the async connection pool and client on first access.
1308
+
1309
+ Returns:
1310
+ Async Redis client instance.
1311
+ """
1312
+ import asyncio
1313
+
1314
+ if self._async_lock is None:
1315
+ self._async_lock = asyncio.Lock()
1316
+
1317
+ if self._async_client is None or not self._connected:
1318
+ async with self._async_lock:
1319
+ if self._async_client is None or not self._connected:
1320
+ try:
1321
+ self._async_pool = await self._create_async_pool()
1322
+ self._async_client = redis.asyncio.Redis(
1323
+ connection_pool=self._async_pool
1324
+ )
1325
+ # Test connection
1326
+ await self._async_client.ping()
1327
+ self._connected = True
1328
+ self._retry_count = 0
1329
+ self._logger.debug("Redis async client connected")
1330
+ except Exception as e:
1331
+ self._handle_redis_error(e, "async_client_init")
1332
+ raise
1333
+ return self._async_client
1334
+
1335
+ async def _ensure_consumer_group(self, client: "redis.asyncio.Redis") -> None:
1336
+ """Ensure consumer group exists for stream.
1337
+
1338
+ Args:
1339
+ client: Redis async client.
1340
+ """
1341
+ try:
1342
+ await client.xgroup_create(
1343
+ self._stream_key,
1344
+ self.consumer_group,
1345
+ id="0",
1346
+ mkstream=True,
1347
+ )
1348
+ self._logger.debug(f"Created consumer group: {self.consumer_group}")
1349
+ except redis.ResponseError as e:
1350
+ if "BUSYGROUP" not in str(e):
1351
+ raise
1352
+ # Group already exists, which is fine
1353
+
1354
+ def _serialize_entry(
1355
+ self,
1356
+ fingerprint: str,
1357
+ first_seen: float,
1358
+ last_seen: float,
1359
+ count: int,
1360
+ metadata: dict[str, Any] | None,
1361
+ ) -> dict[str, str]:
1362
+ """Serialize entry for Redis storage.
1363
+
1364
+ Args:
1365
+ fingerprint: The fingerprint.
1366
+ first_seen: First seen timestamp.
1367
+ last_seen: Last seen timestamp.
1368
+ count: Occurrence count.
1369
+ metadata: Optional metadata.
1370
+
1371
+ Returns:
1372
+ Dictionary suitable for Redis.
1373
+ """
1374
+ return {
1375
+ self.FIELD_FINGERPRINT: fingerprint,
1376
+ self.FIELD_FIRST_SEEN: str(first_seen),
1377
+ self.FIELD_LAST_SEEN: str(last_seen),
1378
+ self.FIELD_COUNT: str(count),
1379
+ self.FIELD_METADATA: json.dumps(metadata or {}),
1380
+ }
1381
+
1382
+ def _deserialize_entry(
1383
+ self, fingerprint: str, data: dict[str, str]
1384
+ ) -> DeduplicationEntry:
1385
+ """Deserialize entry from Redis storage.
1386
+
1387
+ Args:
1388
+ fingerprint: The fingerprint.
1389
+ data: Dictionary from Redis.
1390
+
1391
+ Returns:
1392
+ DeduplicationEntry instance.
1393
+ """
1394
+ metadata = {}
1395
+ if data.get(self.FIELD_METADATA):
1396
+ try:
1397
+ metadata = json.loads(data[self.FIELD_METADATA])
1398
+ except json.JSONDecodeError:
1399
+ pass
1400
+
1401
+ return DeduplicationEntry(
1402
+ fingerprint=fingerprint,
1403
+ first_seen=datetime.fromtimestamp(float(data.get(self.FIELD_FIRST_SEEN, 0))),
1404
+ last_seen=datetime.fromtimestamp(float(data.get(self.FIELD_LAST_SEEN, 0))),
1405
+ count=int(data.get(self.FIELD_COUNT, 1)),
1406
+ metadata=metadata,
1407
+ )
1408
+
1409
+ def exists(self, fingerprint: str, window_seconds: int) -> bool:
1410
+ """Check if fingerprint exists within window.
1411
+
1412
+ Falls back to InMemory store on Redis failure if enabled.
1413
+
1414
+ Args:
1415
+ fingerprint: The fingerprint to check.
1416
+ window_seconds: Time window in seconds.
1417
+
1418
+ Returns:
1419
+ True if fingerprint exists and is not expired.
1420
+ """
1421
+ # Use fallback if already in fallback mode
1422
+ if self._using_fallback and self.enable_fallback:
1423
+ result = self._get_fallback_store().exists(fingerprint, window_seconds)
1424
+ if result:
1425
+ self._metrics.hits += 1
1426
+ else:
1427
+ self._metrics.misses += 1
1428
+ return result
1429
+
1430
+ try:
1431
+ key = self._get_key(fingerprint)
1432
+ data = self.client.hgetall(key)
1433
+
1434
+ if not data:
1435
+ self._metrics.misses += 1
1436
+ return False
1437
+
1438
+ # Check if expired based on window
1439
+ last_seen = float(data.get(self.FIELD_LAST_SEEN, 0))
1440
+ cutoff = time.time() - window_seconds
1441
+
1442
+ if last_seen >= cutoff:
1443
+ self._metrics.hits += 1
1444
+ return True
1445
+ else:
1446
+ self._metrics.misses += 1
1447
+ return False
1448
+
1449
+ except Exception as e:
1450
+ self._handle_redis_error(e, "exists")
1451
+
1452
+ if self.enable_fallback:
1453
+ self._using_fallback = True
1454
+ self._metrics.fallbacks += 1
1455
+ self._logger.warning("Falling back to InMemory store")
1456
+ result = self._get_fallback_store().exists(fingerprint, window_seconds)
1457
+ if result:
1458
+ self._metrics.hits += 1
1459
+ else:
1460
+ self._metrics.misses += 1
1461
+ return result
1462
+
1463
+ raise
1464
+
+    async def exists_async(self, fingerprint: str, window_seconds: int) -> bool:
+        """Async check if fingerprint exists within window.
+
+        Falls back to InMemory store on Redis failure if enabled.
+
+        Args:
+            fingerprint: The fingerprint to check.
+            window_seconds: Time window in seconds.
+
+        Returns:
+            True if fingerprint exists and is not expired.
+        """
+        # Use fallback if already in fallback mode
+        if self._using_fallback and self.enable_fallback:
+            result = self._get_fallback_store().exists(fingerprint, window_seconds)
+            if result:
+                self._metrics.hits += 1
+            else:
+                self._metrics.misses += 1
+            return result
+
+        try:
+            client = await self.get_async_client()
+            key = self._get_key(fingerprint)
+            data = await client.hgetall(key)
+
+            if not data:
+                self._metrics.misses += 1
+                return False
+
+            # Check if expired based on window
+            last_seen = float(data.get(self.FIELD_LAST_SEEN, 0))
+            cutoff = time.time() - window_seconds
+
+            if last_seen >= cutoff:
+                self._metrics.hits += 1
+                return True
+            else:
+                self._metrics.misses += 1
+                return False
+
+        except Exception as e:
+            self._handle_redis_error(e, "exists_async")
+
+            if self.enable_fallback:
+                self._using_fallback = True
+                self._metrics.fallbacks += 1
+                self._logger.warning("Falling back to InMemory store")
+                result = self._get_fallback_store().exists(fingerprint, window_seconds)
+                if result:
+                    self._metrics.hits += 1
+                else:
+                    self._metrics.misses += 1
+                return result
+
+            raise
+
+    def record(self, fingerprint: str, metadata: dict[str, Any] | None = None) -> None:
+        """Record a fingerprint with automatic TTL and stream logging.
+
+        Args:
+            fingerprint: The fingerprint to record.
+            metadata: Optional metadata to store.
+        """
+        # Use fallback if already in fallback mode
+        if self._using_fallback and self.enable_fallback:
+            self._get_fallback_store().record(fingerprint, metadata)
+            self._metrics.records += 1
+            return
+
+        try:
+            key = self._get_key(fingerprint)
+            now = time.time()
+            client = self.client
+
+            # Use pipeline for atomicity
+            pipe = client.pipeline()
+
+            # Get existing entry
+            existing = client.hgetall(key)
+
+            if existing:
+                # Update existing entry
+                first_seen = float(existing.get(self.FIELD_FIRST_SEEN, now))
+                count = int(existing.get(self.FIELD_COUNT, 0)) + 1
+                old_metadata = {}
+                if existing.get(self.FIELD_METADATA):
+                    try:
+                        old_metadata = json.loads(existing[self.FIELD_METADATA])
+                    except json.JSONDecodeError:
+                        pass
+                if metadata:
+                    old_metadata.update(metadata)
+                final_metadata = old_metadata
+            else:
+                first_seen = now
+                count = 1
+                final_metadata = metadata or {}
+
+            # Store entry as hash
+            entry_data = self._serialize_entry(
+                fingerprint, first_seen, now, count, final_metadata
+            )
+            pipe.hset(key, mapping=entry_data)
+            pipe.expire(key, self.default_ttl)
+
+            # Add to index set for tracking
+            pipe.sadd(self._index_key, fingerprint)
+            pipe.expire(self._index_key, self.default_ttl * 2)
+
+            # Add to stream for audit/replay (with MAXLEN for auto-trimming)
+            stream_entry = {
+                "fingerprint": fingerprint,
+                "timestamp": str(now),
+                "action": "record",
+                "count": str(count),
+            }
+            pipe.xadd(
+                self._stream_key,
+                stream_entry,
+                maxlen=self.stream_max_len,
+                approximate=True,
+            )
+
+            pipe.execute()
+            self._metrics.records += 1
+
+        except Exception as e:
+            self._handle_redis_error(e, "record")
+
+            if self.enable_fallback:
+                self._using_fallback = True
+                self._metrics.fallbacks += 1
+                self._logger.warning("Falling back to InMemory store")
+                self._get_fallback_store().record(fingerprint, metadata)
+                self._metrics.records += 1
+                return
+
+            raise
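`record()` batches three writes in one pipeline (MULTI/EXEC by default in redis-py): the entry hash with its TTL, the fingerprint index set, and a length-capped audit stream. A minimal sketch of the same pattern against a raw client, assuming a local Redis; the key names here are illustrative, not the store's actual keys:

```python
import time

import redis  # assumes the optional redis dependency is installed

r = redis.Redis(decode_responses=True)
now = time.time()
fp, ttl = "abc123", 3600  # illustrative fingerprint and TTL

# Same write pattern as record(): one pipeline so the hash, the index
# set, and the stream entry are applied together.
pipe = r.pipeline()
pipe.hset(f"dedup:entry:{fp}", mapping={"last_seen": str(now), "count": "1"})
pipe.expire(f"dedup:entry:{fp}", ttl)              # per-entry TTL
pipe.sadd("dedup:index", fp)                       # membership index
pipe.expire("dedup:index", ttl * 2)                # index outlives entries
pipe.xadd("dedup:stream", {"fingerprint": fp, "timestamp": str(now)},
          maxlen=10_000, approximate=True)         # capped audit stream
pipe.execute()
```

Note that the read of the existing entry happens before the pipeline, so the update is read-modify-write rather than fully atomic; the pipeline only guarantees the writes land together.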
+
+    async def record_async(
+        self, fingerprint: str, metadata: dict[str, Any] | None = None
+    ) -> None:
+        """Async record a fingerprint with automatic TTL and stream logging.
+
+        Args:
+            fingerprint: The fingerprint to record.
+            metadata: Optional metadata to store.
+        """
+        # Use fallback if already in fallback mode
+        if self._using_fallback and self.enable_fallback:
+            self._get_fallback_store().record(fingerprint, metadata)
+            self._metrics.records += 1
+            return
+
+        try:
+            client = await self.get_async_client()
+            key = self._get_key(fingerprint)
+            now = time.time()
+
+            # Ensure consumer group exists
+            await self._ensure_consumer_group(client)
+
+            # Use pipeline for atomicity
+            pipe = client.pipeline()
+
+            # Get existing entry
+            existing = await client.hgetall(key)
+
+            if existing:
+                # Update existing entry
+                first_seen = float(existing.get(self.FIELD_FIRST_SEEN, now))
+                count = int(existing.get(self.FIELD_COUNT, 0)) + 1
+                old_metadata = {}
+                if existing.get(self.FIELD_METADATA):
+                    try:
+                        old_metadata = json.loads(existing[self.FIELD_METADATA])
+                    except json.JSONDecodeError:
+                        pass
+                if metadata:
+                    old_metadata.update(metadata)
+                final_metadata = old_metadata
+            else:
+                first_seen = now
+                count = 1
+                final_metadata = metadata or {}
+
+            # Store entry as hash
+            entry_data = self._serialize_entry(
+                fingerprint, first_seen, now, count, final_metadata
+            )
+            pipe.hset(key, mapping=entry_data)
+            pipe.expire(key, self.default_ttl)
+
+            # Add to index set for tracking
+            pipe.sadd(self._index_key, fingerprint)
+            pipe.expire(self._index_key, self.default_ttl * 2)
+
+            # Add to stream for audit/replay (with MAXLEN for auto-trimming)
+            stream_entry = {
+                "fingerprint": fingerprint,
+                "timestamp": str(now),
+                "action": "record",
+                "count": str(count),
+            }
+            pipe.xadd(
+                self._stream_key,
+                stream_entry,
+                maxlen=self.stream_max_len,
+                approximate=True,
+            )
+
+            await pipe.execute()
+            self._metrics.records += 1
+
+        except Exception as e:
+            self._handle_redis_error(e, "record_async")
+
+            if self.enable_fallback:
+                self._using_fallback = True
+                self._metrics.fallbacks += 1
+                self._logger.warning("Falling back to InMemory store")
+                self._get_fallback_store().record(fingerprint, metadata)
+                self._metrics.records += 1
+                return
+
+            raise
+
+    def get(self, fingerprint: str) -> DeduplicationEntry | None:
+        """Get entry by fingerprint.
+
+        Args:
+            fingerprint: The fingerprint to look up.
+
+        Returns:
+            Entry if found, None otherwise.
+        """
+        if self._using_fallback and self.enable_fallback:
+            return self._get_fallback_store().get(fingerprint)
+
+        try:
+            key = self._get_key(fingerprint)
+            data = self.client.hgetall(key)
+
+            if not data:
+                return None
+
+            return self._deserialize_entry(fingerprint, data)
+
+        except Exception as e:
+            self._handle_redis_error(e, "get")
+
+            if self.enable_fallback:
+                self._using_fallback = True
+                self._metrics.fallbacks += 1
+                return self._get_fallback_store().get(fingerprint)
+
+            raise
+
+    async def get_async(self, fingerprint: str) -> DeduplicationEntry | None:
+        """Async get entry by fingerprint.
+
+        Args:
+            fingerprint: The fingerprint to look up.
+
+        Returns:
+            Entry if found, None otherwise.
+        """
+        if self._using_fallback and self.enable_fallback:
+            return self._get_fallback_store().get(fingerprint)
+
+        try:
+            client = await self.get_async_client()
+            key = self._get_key(fingerprint)
+            data = await client.hgetall(key)
+
+            if not data:
+                return None
+
+            return self._deserialize_entry(fingerprint, data)
+
+        except Exception as e:
+            self._handle_redis_error(e, "get_async")
+
+            if self.enable_fallback:
+                self._using_fallback = True
+                self._metrics.fallbacks += 1
+                return self._get_fallback_store().get(fingerprint)
+
+            raise
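Because `get()` returns a typed `DeduplicationEntry` (or `None`), callers can inspect occurrence counts and timestamps directly. A usage sketch; `store` here is a hypothetical configured instance:

```python
def describe(store, fingerprint: str) -> str:
    """Summarize a stored fingerprint using the entry fields shown above."""
    entry = store.get(fingerprint)
    if entry is None:
        return f"{fingerprint}: not recorded"
    return (
        f"{fingerprint}: seen {entry.count}x "
        f"between {entry.first_seen:%H:%M:%S} and {entry.last_seen:%H:%M:%S}"
    )
```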
+
+    def cleanup(self, max_age_seconds: int) -> int:
+        """Remove expired entries.
+
+        Redis handles TTL automatically, but this method can be used
+        to perform explicit cleanup of old stream entries.
+
+        Args:
+            max_age_seconds: Maximum age of entries to keep.
+
+        Returns:
+            Number of entries removed.
+        """
+        if self._using_fallback and self.enable_fallback:
+            return self._get_fallback_store().cleanup(max_age_seconds)
+
+        try:
+            client = self.client
+            cutoff = time.time() - max_age_seconds
+            removed = 0
+
+            # Get all fingerprints from index
+            fingerprints = client.smembers(self._index_key)
+
+            for fp in fingerprints:
+                key = self._get_key(fp)
+                data = client.hgetall(key)
+
+                if not data:
+                    # Entry expired, remove from index
+                    client.srem(self._index_key, fp)
+                    removed += 1
+                elif float(data.get(self.FIELD_LAST_SEEN, 0)) < cutoff:
+                    # Entry is old, delete it
+                    client.delete(key)
+                    client.srem(self._index_key, fp)
+                    removed += 1
+
+            # Trim stream to remove old entries
+            try:
+                stream_info = client.xinfo_stream(self._stream_key)
+                if stream_info and stream_info.get("length", 0) > 0:
+                    # Get first entry timestamp
+                    first_entry = client.xrange(self._stream_key, count=1)
+                    if first_entry:
+                        entry_id = first_entry[0][0]
+                        # Stream ID format: timestamp-sequence
+                        entry_ts = int(entry_id.split("-")[0]) / 1000
+                        if entry_ts < cutoff:
+                            # Trim old entries
+                            cutoff_id = f"{int(cutoff * 1000)}-0"
+                            trimmed = client.xtrim(
+                                self._stream_key, minid=cutoff_id, approximate=True
+                            )
+                            removed += trimmed
+            except redis.ResponseError:
+                # Stream might not exist yet; mirror the guard in cleanup_async
+                pass
+
+            return removed
+
+        except Exception as e:
+            self._handle_redis_error(e, "cleanup")
+
+            if self.enable_fallback:
+                self._using_fallback = True
+                return self._get_fallback_store().cleanup(max_age_seconds)
+
+            raise
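Stream trimming leans on the fact that auto-generated Redis stream IDs are `<ms-timestamp>-<sequence>`, so a wall-clock cutoff converts directly to a `MINID`. A small worked example with an illustrative stream key:

```python
import time

cutoff = time.time() - 86_400          # keep only the last 24 hours
cutoff_id = f"{int(cutoff * 1000)}-0"  # stream IDs are millisecond-based

# XTRIM MINID ~ <cutoff_id> drops entries whose ID sorts below the cutoff;
# the approximate flag lets Redis trim at macro-node boundaries cheaply.
# client.xtrim("dedup:stream", minid=cutoff_id, approximate=True)
print(cutoff_id)  # e.g. "1718000000000-0"
```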
+
+    async def cleanup_async(self, max_age_seconds: int) -> int:
+        """Async remove expired entries.
+
+        Args:
+            max_age_seconds: Maximum age of entries to keep.
+
+        Returns:
+            Number of entries removed.
+        """
+        if self._using_fallback and self.enable_fallback:
+            return self._get_fallback_store().cleanup(max_age_seconds)
+
+        try:
+            client = await self.get_async_client()
+            cutoff = time.time() - max_age_seconds
+            removed = 0
+
+            # Get all fingerprints from index
+            fingerprints = await client.smembers(self._index_key)
+
+            for fp in fingerprints:
+                key = self._get_key(fp)
+                data = await client.hgetall(key)
+
+                if not data:
+                    # Entry expired, remove from index
+                    await client.srem(self._index_key, fp)
+                    removed += 1
+                elif float(data.get(self.FIELD_LAST_SEEN, 0)) < cutoff:
+                    # Entry is old, delete it
+                    await client.delete(key)
+                    await client.srem(self._index_key, fp)
+                    removed += 1
+
+            # Trim stream to remove old entries
+            try:
+                stream_info = await client.xinfo_stream(self._stream_key)
+                if stream_info and stream_info.get("length", 0) > 0:
+                    # Trim old entries
+                    cutoff_id = f"{int(cutoff * 1000)}-0"
+                    trimmed = await client.xtrim(
+                        self._stream_key, minid=cutoff_id, approximate=True
+                    )
+                    removed += trimmed
+            except redis.ResponseError:
+                # Stream might not exist
+                pass
+
+            return removed
+
+        except Exception as e:
+            self._handle_redis_error(e, "cleanup_async")
+
+            if self.enable_fallback:
+                self._using_fallback = True
+                return self._get_fallback_store().cleanup(max_age_seconds)
+
+            raise
+
+    def clear(self) -> None:
+        """Clear all deduplication entries."""
+        if self._using_fallback and self.enable_fallback:
+            self._get_fallback_store().clear()
+            return
+
+        try:
+            client = self.client
+
+            # Get all fingerprints from index
+            fingerprints = client.smembers(self._index_key)
+
+            if fingerprints:
+                # Delete all entry keys
+                keys_to_delete = [self._get_key(fp) for fp in fingerprints]
+                client.delete(*keys_to_delete)
+
+            # Delete index
+            client.delete(self._index_key)
+
+            # Delete stream
+            client.delete(self._stream_key)
+
+        except Exception as e:
+            self._handle_redis_error(e, "clear")
+
+            if self.enable_fallback:
+                self._using_fallback = True
+                self._get_fallback_store().clear()
+                return
+
+            raise
+
+    async def clear_async(self) -> None:
+        """Async clear all deduplication entries."""
+        if self._using_fallback and self.enable_fallback:
+            self._get_fallback_store().clear()
+            return
+
+        try:
+            client = await self.get_async_client()
+
+            # Get all fingerprints from index
+            fingerprints = await client.smembers(self._index_key)
+
+            if fingerprints:
+                # Delete all entry keys
+                keys_to_delete = [self._get_key(fp) for fp in fingerprints]
+                await client.delete(*keys_to_delete)
+
+            # Delete index
+            await client.delete(self._index_key)
+
+            # Delete stream
+            await client.delete(self._stream_key)
+
+        except Exception as e:
+            self._handle_redis_error(e, "clear_async")
+
+            if self.enable_fallback:
+                self._using_fallback = True
+                self._get_fallback_store().clear()
+                return
+
+            raise
+
+    def count(self) -> int:
+        """Get total entry count.
+
+        Returns:
+            Number of deduplication entries.
+        """
+        if self._using_fallback and self.enable_fallback:
+            return self._get_fallback_store().count()
+
+        try:
+            return self.client.scard(self._index_key)
+
+        except Exception as e:
+            self._handle_redis_error(e, "count")
+
+            if self.enable_fallback:
+                self._using_fallback = True
+                return self._get_fallback_store().count()
+
+            raise
+
+    async def count_async(self) -> int:
+        """Async get total entry count.
+
+        Returns:
+            Number of deduplication entries.
+        """
+        if self._using_fallback and self.enable_fallback:
+            return self._get_fallback_store().count()
+
+        try:
+            client = await self.get_async_client()
+            return await client.scard(self._index_key)
+
+        except Exception as e:
+            self._handle_redis_error(e, "count_async")
+
+            if self.enable_fallback:
+                self._using_fallback = True
+                return self._get_fallback_store().count()
+
+            raise
+
+    def health_check(self) -> dict[str, Any]:
+        """Perform health check and return status.
+
+        Returns:
+            Dictionary with health status information.
+        """
+        result = {
+            "healthy": False,
+            "connected": self._connected,
+            "using_fallback": self._using_fallback,
+            "redis_url": self._mask_url(self.redis_url),
+            "metrics": self._metrics.to_dict(),
+        }
+
+        if self._using_fallback and self.enable_fallback:
+            result["healthy"] = True
+            result["mode"] = "fallback"
+            result["fallback_entries"] = self._get_fallback_store().count()
+            return result
+
+        try:
+            client = self.client
+            ping_ok = client.ping()
+
+            if ping_ok:
+                result["healthy"] = True
+                result["mode"] = "redis"
+                result["entries"] = self.count()
+
+                # Get stream info
+                try:
+                    stream_info = client.xinfo_stream(self._stream_key)
+                    result["stream"] = {
+                        "length": stream_info.get("length", 0),
+                        "first_entry": stream_info.get("first-entry"),
+                        "last_entry": stream_info.get("last-entry"),
+                    }
+                except redis.ResponseError:
+                    result["stream"] = {"length": 0}
+
+                # Get Redis info
+                info = client.info(section="server")
+                result["redis_info"] = {
+                    "version": info.get("redis_version"),
+                    "uptime_seconds": info.get("uptime_in_seconds"),
+                }
+
+        except Exception as e:
+            result["error"] = str(e)
+            if self._last_error_time:
+                result["last_error_time"] = datetime.fromtimestamp(
+                    self._last_error_time
+                ).isoformat()
+
+        return result
+
+    async def health_check_async(self) -> dict[str, Any]:
+        """Async perform health check and return status.
+
+        Returns:
+            Dictionary with health status information.
+        """
+        result = {
+            "healthy": False,
+            "connected": self._connected,
+            "using_fallback": self._using_fallback,
+            "redis_url": self._mask_url(self.redis_url),
+            "metrics": self._metrics.to_dict(),
+        }
+
+        if self._using_fallback and self.enable_fallback:
+            result["healthy"] = True
+            result["mode"] = "fallback"
+            result["fallback_entries"] = self._get_fallback_store().count()
+            return result
+
+        try:
+            client = await self.get_async_client()
+            ping_ok = await client.ping()
+
+            if ping_ok:
+                result["healthy"] = True
+                result["mode"] = "redis"
+                result["entries"] = await self.count_async()
+
+                # Get stream info
+                try:
+                    stream_info = await client.xinfo_stream(self._stream_key)
+                    result["stream"] = {
+                        "length": stream_info.get("length", 0),
+                        "first_entry": stream_info.get("first-entry"),
+                        "last_entry": stream_info.get("last-entry"),
+                    }
+                except redis.ResponseError:
+                    result["stream"] = {"length": 0}
+
+                # Get Redis info
+                info = await client.info(section="server")
+                result["redis_info"] = {
+                    "version": info.get("redis_version"),
+                    "uptime_seconds": info.get("uptime_in_seconds"),
+                }
+
+        except Exception as e:
+            result["error"] = str(e)
+            if self._last_error_time:
+                result["last_error_time"] = datetime.fromtimestamp(
+                    self._last_error_time
+                ).isoformat()
+
+        return result
+
+    def _mask_url(self, url: str) -> str:
+        """Mask sensitive parts of Redis URL.
+
+        Args:
+            url: Redis URL to mask.
+
+        Returns:
+            Masked URL string.
+        """
+        import re
+
+        # Mask password if present
+        return re.sub(r"://[^:]+:[^@]+@", "://***:***@", url)
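The masking is a single substitution on the userinfo part of the URL, so host, port, and database index stay visible for debugging. For example:

```python
import re

def mask(url: str) -> str:
    # Replace "user:password@" with "***:***@", leaving host/port intact.
    return re.sub(r"://[^:]+:[^@]+@", "://***:***@", url)

print(mask("redis://default:s3cret@cache.internal:6379/0"))
# -> redis://***:***@cache.internal:6379/0
print(mask("redis://localhost:6379/0"))  # no credentials -> unchanged
```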
+
+    def get_metrics(self) -> dict[str, Any]:
+        """Get current metrics.
+
+        Returns:
+            Dictionary with metrics data.
+        """
+        return self._metrics.to_dict()
+
+    def reset_metrics(self) -> None:
+        """Reset all metrics to zero."""
+        self._metrics = DeduplicationMetrics()
+
+    async def read_stream(
+        self,
+        count: int = 100,
+        block_ms: int | None = None,
+    ) -> list[dict[str, Any]]:
+        """Read entries from the deduplication stream.
+
+        Useful for audit logging or replaying events.
+
+        Args:
+            count: Maximum number of entries to read.
+            block_ms: Block timeout in milliseconds (None = no blocking;
+                0 blocks indefinitely, per Redis BLOCK semantics).
+
+        Returns:
+            List of stream entries.
+        """
+        try:
+            client = await self.get_async_client()
+
+            # Ensure consumer group exists
+            await self._ensure_consumer_group(client)
+
+            # Read from stream using consumer group
+            entries = await client.xreadgroup(
+                self.consumer_group,
+                self.consumer_name,
+                {self._stream_key: ">"},
+                count=count,
+                block=block_ms,
+            )
+
+            result = []
+            if entries:
+                for stream_name, messages in entries:
+                    for msg_id, fields in messages:
+                        result.append(
+                            {
+                                "id": msg_id,
+                                "stream": stream_name,
+                                "fields": fields,
+                            }
+                        )
+
+                        # Acknowledge the message
+                        await client.xack(
+                            self._stream_key, self.consumer_group, msg_id
+                        )
+
+            return result
+
+        except Exception as e:
+            self._handle_redis_error(e, "read_stream")
+            return []
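Reading through a consumer group means each message is delivered to exactly one consumer and sits in the pending list until acknowledged; `read_stream()` acks each message as it buffers it. A usage sketch, assuming an initialized store:

```python
import asyncio

async def drain_audit_log(store) -> None:
    """Consume recent dedup events; `store` is a RedisStreamsDeduplicationStore."""
    entries = await store.read_stream(count=50)  # non-blocking read
    for entry in entries:
        # Each entry carries the stream message ID plus the raw fields
        # written by record(): fingerprint, timestamp, action, count.
        print(entry["id"], entry["fields"].get("fingerprint"))

# asyncio.run(drain_audit_log(store))
```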
+
+    async def get_pending_messages(self) -> dict[str, Any]:
+        """Get information about pending messages in consumer group.
+
+        Returns:
+            Dictionary with pending message information.
+        """
+        try:
+            client = await self.get_async_client()
+
+            pending = await client.xpending(self._stream_key, self.consumer_group)
+
+            return {
+                "pending_count": pending.get("pending", 0),
+                "min_id": pending.get("min"),
+                "max_id": pending.get("max"),
+                "consumers": pending.get("consumers", []),
+            }
+
+        except Exception as e:
+            self._handle_redis_error(e, "get_pending_messages")
+            return {"pending_count": 0, "error": str(e)}
+
+    def close(self) -> None:
+        """Close all connections and pools."""
+        if self._client is not None:
+            try:
+                self._client.close()
+            except Exception:
+                pass
+            self._client = None
+
+        if self._pool is not None:
+            try:
+                self._pool.disconnect()
+            except Exception:
+                pass
+            self._pool = None
+
+        self._connected = False
+
+    async def close_async(self) -> None:
+        """Async close all connections and pools."""
+        if self._async_client is not None:
+            try:
+                await self._async_client.close()
+            except Exception:
+                pass
+            self._async_client = None
+
+        if self._async_pool is not None:
+            try:
+                await self._async_pool.disconnect()
+            except Exception:
+                pass
+            self._async_pool = None
+
+        self._connected = False
+
+    def __enter__(self) -> "RedisStreamsDeduplicationStore":
+        """Context manager entry."""
+        return self
+
+    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
+        """Context manager exit, closes connections."""
+        self.close()
+
+    async def __aenter__(self) -> "RedisStreamsDeduplicationStore":
+        """Async context manager entry."""
+        return self
+
+    async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
+        """Async context manager exit, closes connections."""
+        await self.close_async()
+
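Both context-manager pairs guarantee the pooled connections are released even on error paths. A usage sketch; the constructor arguments shown are illustrative assumptions, not a confirmed signature:

```python
# Synchronous: connections are closed when the block exits.
with RedisStreamsDeduplicationStore(redis_url="redis://localhost:6379/0") as store:
    if not store.exists("abc123", window_seconds=300):
        store.record("abc123", metadata={"source": "orders"})

# Asynchronous: the async pool is disconnected via close_async().
# async with RedisStreamsDeduplicationStore(...) as store:
#     await store.record_async("abc123")
```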
+
+
+# ============================================================================
+# Factory Function
+# ============================================================================
+
+
+class DeduplicationStoreType:
+    """Store type constants."""
+
+    MEMORY = "memory"
+    SQLITE = "sqlite"
+    REDIS = "redis"
+    REDIS_STREAMS = "redis_streams"
+
+
+def create_deduplication_store(
+    store_type: str | None = None,
+    **kwargs: Any,
+) -> BaseDeduplicationStore:
+    """Factory function to create the appropriate deduplication store.
+
+    Selects the store type based on configuration or environment variables.
+
+    Environment variables:
+        TRUTHOUND_DEDUP_STORE_TYPE: Store type (memory, sqlite, redis, redis_streams)
+        TRUTHOUND_DEDUP_SQLITE_PATH: SQLite database path
+        TRUTHOUND_DEDUP_REDIS_URL: Redis connection URL (enables redis/redis_streams)
+
+    Args:
+        store_type: Explicit store type override. If None, auto-detects.
+        **kwargs: Additional arguments passed to the store constructor.
+
+    Returns:
+        Configured BaseDeduplicationStore instance.
+
+    Example:
+        # Auto-detect based on environment
+        store = create_deduplication_store()
+
+        # Explicit type
+        store = create_deduplication_store("redis_streams", default_ttl=7200)
+
+        # SQLite with custom path
+        store = create_deduplication_store("sqlite", db_path="/tmp/dedup.db")
+    """
+    import logging
+    import os
+
+    logger = logging.getLogger(__name__)
+
+    # Determine store type
+    if store_type is None:
+        store_type = os.getenv("TRUTHOUND_DEDUP_STORE_TYPE")
+
+    # Auto-detect if still None
+    if store_type is None:
+        redis_url = os.getenv("TRUTHOUND_DEDUP_REDIS_URL")
+        if redis_url and REDIS_AVAILABLE:
+            store_type = DeduplicationStoreType.REDIS_STREAMS
+            logger.info(
+                "Auto-detected Redis Streams store from TRUTHOUND_DEDUP_REDIS_URL"
+            )
+        elif os.getenv("TRUTHOUND_DEDUP_SQLITE_PATH"):
+            store_type = DeduplicationStoreType.SQLITE
+            logger.info("Auto-detected SQLite store from TRUTHOUND_DEDUP_SQLITE_PATH")
+        else:
+            store_type = DeduplicationStoreType.MEMORY
+            logger.info("Using default InMemory store")
+
+    # Normalize store type
+    store_type = store_type.lower().strip()
+
+    # Create store based on type
+    if store_type == DeduplicationStoreType.MEMORY:
+        logger.info("Creating InMemory deduplication store")
+        return InMemoryDeduplicationStore()
+
+    elif store_type == DeduplicationStoreType.SQLITE:
+        db_path = kwargs.pop("db_path", None) or os.getenv(
+            "TRUTHOUND_DEDUP_SQLITE_PATH", "deduplication.db"
+        )
+        logger.info(f"Creating SQLite deduplication store at {db_path}")
+        return SQLiteDeduplicationStore(db_path=db_path)
+
+    elif store_type == DeduplicationStoreType.REDIS:
+        if not REDIS_AVAILABLE:
+            logger.warning(
+                "Redis not available, falling back to InMemory store. "
+                "Install with: pip install truthound-dashboard[redis]"
+            )
+            return InMemoryDeduplicationStore()
+
+        logger.info("Creating Redis deduplication store (simple)")
+        return RedisDeduplicationStore(**kwargs)
+
+    elif store_type == DeduplicationStoreType.REDIS_STREAMS:
+        if not REDIS_AVAILABLE:
+            logger.warning(
+                "Redis not available, falling back to InMemory store. "
+                "Install with: pip install truthound-dashboard[redis]"
+            )
+            return InMemoryDeduplicationStore()
+
+        logger.info("Creating Redis Streams deduplication store (production)")
+        return RedisStreamsDeduplicationStore(**kwargs)
+
+    else:
+        logger.warning(
+            f"Unknown store type '{store_type}', falling back to InMemory store"
+        )
+        return InMemoryDeduplicationStore()
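Because the factory reads its configuration from the environment before falling back to an in-memory store, deployments can switch backends without code changes. A minimal sketch using the documented variables; the URL value is a placeholder:

```python
import os

# Placeholder values; any reachable Redis URL works.
os.environ["TRUTHOUND_DEDUP_STORE_TYPE"] = "redis_streams"
os.environ["TRUTHOUND_DEDUP_REDIS_URL"] = "redis://localhost:6379/0"

store = create_deduplication_store()             # picks Redis Streams from env
fallback = create_deduplication_store("memory")  # explicit argument wins over env
```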