headroom-ai 0.2.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. headroom/__init__.py +212 -0
  2. headroom/cache/__init__.py +76 -0
  3. headroom/cache/anthropic.py +517 -0
  4. headroom/cache/base.py +342 -0
  5. headroom/cache/compression_feedback.py +613 -0
  6. headroom/cache/compression_store.py +814 -0
  7. headroom/cache/dynamic_detector.py +1026 -0
  8. headroom/cache/google.py +884 -0
  9. headroom/cache/openai.py +584 -0
  10. headroom/cache/registry.py +175 -0
  11. headroom/cache/semantic.py +451 -0
  12. headroom/ccr/__init__.py +77 -0
  13. headroom/ccr/context_tracker.py +582 -0
  14. headroom/ccr/mcp_server.py +319 -0
  15. headroom/ccr/response_handler.py +772 -0
  16. headroom/ccr/tool_injection.py +415 -0
  17. headroom/cli.py +219 -0
  18. headroom/client.py +977 -0
  19. headroom/compression/__init__.py +42 -0
  20. headroom/compression/detector.py +424 -0
  21. headroom/compression/handlers/__init__.py +22 -0
  22. headroom/compression/handlers/base.py +219 -0
  23. headroom/compression/handlers/code_handler.py +506 -0
  24. headroom/compression/handlers/json_handler.py +418 -0
  25. headroom/compression/masks.py +345 -0
  26. headroom/compression/universal.py +465 -0
  27. headroom/config.py +474 -0
  28. headroom/exceptions.py +192 -0
  29. headroom/integrations/__init__.py +159 -0
  30. headroom/integrations/agno/__init__.py +53 -0
  31. headroom/integrations/agno/hooks.py +345 -0
  32. headroom/integrations/agno/model.py +625 -0
  33. headroom/integrations/agno/providers.py +154 -0
  34. headroom/integrations/langchain/__init__.py +106 -0
  35. headroom/integrations/langchain/agents.py +326 -0
  36. headroom/integrations/langchain/chat_model.py +1002 -0
  37. headroom/integrations/langchain/langsmith.py +324 -0
  38. headroom/integrations/langchain/memory.py +319 -0
  39. headroom/integrations/langchain/providers.py +200 -0
  40. headroom/integrations/langchain/retriever.py +371 -0
  41. headroom/integrations/langchain/streaming.py +341 -0
  42. headroom/integrations/mcp/__init__.py +37 -0
  43. headroom/integrations/mcp/server.py +533 -0
  44. headroom/memory/__init__.py +37 -0
  45. headroom/memory/extractor.py +390 -0
  46. headroom/memory/fast_store.py +621 -0
  47. headroom/memory/fast_wrapper.py +311 -0
  48. headroom/memory/inline_extractor.py +229 -0
  49. headroom/memory/store.py +434 -0
  50. headroom/memory/worker.py +260 -0
  51. headroom/memory/wrapper.py +321 -0
  52. headroom/models/__init__.py +39 -0
  53. headroom/models/registry.py +687 -0
  54. headroom/parser.py +293 -0
  55. headroom/pricing/__init__.py +51 -0
  56. headroom/pricing/anthropic_prices.py +81 -0
  57. headroom/pricing/litellm_pricing.py +113 -0
  58. headroom/pricing/openai_prices.py +91 -0
  59. headroom/pricing/registry.py +188 -0
  60. headroom/providers/__init__.py +61 -0
  61. headroom/providers/anthropic.py +621 -0
  62. headroom/providers/base.py +131 -0
  63. headroom/providers/cohere.py +362 -0
  64. headroom/providers/google.py +427 -0
  65. headroom/providers/litellm.py +297 -0
  66. headroom/providers/openai.py +566 -0
  67. headroom/providers/openai_compatible.py +521 -0
  68. headroom/proxy/__init__.py +19 -0
  69. headroom/proxy/server.py +2683 -0
  70. headroom/py.typed +0 -0
  71. headroom/relevance/__init__.py +124 -0
  72. headroom/relevance/base.py +106 -0
  73. headroom/relevance/bm25.py +255 -0
  74. headroom/relevance/embedding.py +255 -0
  75. headroom/relevance/hybrid.py +259 -0
  76. headroom/reporting/__init__.py +5 -0
  77. headroom/reporting/generator.py +549 -0
  78. headroom/storage/__init__.py +41 -0
  79. headroom/storage/base.py +125 -0
  80. headroom/storage/jsonl.py +220 -0
  81. headroom/storage/sqlite.py +289 -0
  82. headroom/telemetry/__init__.py +91 -0
  83. headroom/telemetry/collector.py +764 -0
  84. headroom/telemetry/models.py +880 -0
  85. headroom/telemetry/toin.py +1579 -0
  86. headroom/tokenizer.py +80 -0
  87. headroom/tokenizers/__init__.py +75 -0
  88. headroom/tokenizers/base.py +210 -0
  89. headroom/tokenizers/estimator.py +198 -0
  90. headroom/tokenizers/huggingface.py +317 -0
  91. headroom/tokenizers/mistral.py +245 -0
  92. headroom/tokenizers/registry.py +398 -0
  93. headroom/tokenizers/tiktoken_counter.py +248 -0
  94. headroom/transforms/__init__.py +106 -0
  95. headroom/transforms/base.py +57 -0
  96. headroom/transforms/cache_aligner.py +357 -0
  97. headroom/transforms/code_compressor.py +1313 -0
  98. headroom/transforms/content_detector.py +335 -0
  99. headroom/transforms/content_router.py +1158 -0
  100. headroom/transforms/llmlingua_compressor.py +638 -0
  101. headroom/transforms/log_compressor.py +529 -0
  102. headroom/transforms/pipeline.py +297 -0
  103. headroom/transforms/rolling_window.py +350 -0
  104. headroom/transforms/search_compressor.py +365 -0
  105. headroom/transforms/smart_crusher.py +2682 -0
  106. headroom/transforms/text_compressor.py +259 -0
  107. headroom/transforms/tool_crusher.py +338 -0
  108. headroom/utils.py +215 -0
  109. headroom_ai-0.2.13.dist-info/METADATA +315 -0
  110. headroom_ai-0.2.13.dist-info/RECORD +114 -0
  111. headroom_ai-0.2.13.dist-info/WHEEL +4 -0
  112. headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
  113. headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
  114. headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
headroom/cache/dynamic_detector.py
@@ -0,0 +1,1026 @@
+ """
+ Dynamic Content Detector for Cache Optimization.
+
+ This module provides a scalable, language-agnostic approach to detecting dynamic
+ content in prompts. Dynamic content (dates, prices, user data, session info) breaks
+ cache prefixes. By detecting and moving dynamic content to the end, we maximize
+ cache hits.
+
+ Design Philosophy:
+ - NO HARDCODED PATTERNS for locale-specific content (no month names, etc.)
+ - Structural detection: "Label: value" patterns where LABEL indicates dynamism
+ - Entropy-based detection: High entropy = likely dynamic (UUIDs, tokens, hashes)
+ - Universal patterns only: ISO 8601, UUIDs, Unix timestamps (truly universal)
+
+ Tiers (configurable, each adds latency):
+     Tier 1: Regex (~0ms) - Structural patterns, universal formats, entropy-based
+     Tier 2: NER (~5-10ms) - Named Entity Recognition for names, money, orgs
+     Tier 3: Semantic (~20-50ms) - Embedding similarity to known dynamic patterns
+
+ Usage:
+     from headroom.cache.dynamic_detector import DetectorConfig, DynamicContentDetector
+
+     detector = DynamicContentDetector(DetectorConfig(tiers=["regex", "ner"]))
+     result = detector.detect("Session: abc123. User: John paid $500.")
+
+     # result.spans = [
+     #     DynamicSpan(text="Session: abc123", category="session", tier="regex", ...),
+     #     DynamicSpan(text="John", category="person", tier="ner", ...),
+     #     DynamicSpan(text="$500", category="money", tier="ner", ...),
+     # ]
+ """
+
+ from __future__ import annotations
+
+ import math
+ import re
+ from dataclasses import dataclass, field
+ from enum import Enum
+ from typing import Any, Literal
+
+ # Optional imports - graceful degradation
+ _SPACY_AVAILABLE = False
+ _SENTENCE_TRANSFORMERS_AVAILABLE = False
+
+ try:
+     import spacy
+
+     _SPACY_AVAILABLE = True
+ except ImportError:
+     spacy = None  # type: ignore
+
+ try:
+     import numpy as np
+     from sentence_transformers import SentenceTransformer
+
+     _SENTENCE_TRANSFORMERS_AVAILABLE = True
+ except ImportError:
+     SentenceTransformer = None  # type: ignore
+     np = None  # type: ignore
+
+
+ class DynamicCategory(str, Enum):
+     """Categories of dynamic content."""
+
+     # Tier 1: Structural/Regex detectable
+     DATE = "date"
+     TIME = "time"
+     DATETIME = "datetime"
+     TIMESTAMP = "timestamp"
+     UUID = "uuid"
+     REQUEST_ID = "request_id"
+     VERSION = "version"
+     SESSION = "session"
+     USER_DATA = "user_data"
+     IDENTIFIER = "identifier"  # Generic high-entropy ID
+
+     # Tier 2: NER detectable
+     PERSON = "person"
+     MONEY = "money"
+     ORG = "org"
+     LOCATION = "location"
+
+     # Tier 3: Semantic
+     VOLATILE = "volatile"  # Semantically detected as changing
+     REALTIME = "realtime"
+
+     # Fallback
+     UNKNOWN = "unknown"
+
+
+ @dataclass
+ class DynamicSpan:
+     """A span of dynamic content detected in text."""
+
+     # The actual text matched
+     text: str
+
+     # Position in original content
+     start: int
+     end: int
+
+     # What category of dynamic content
+     category: DynamicCategory
+
+     # Which tier detected it
+     tier: Literal["regex", "ner", "semantic"]
+
+     # Confidence score (0-1)
+     confidence: float = 1.0
+
+     # Additional metadata (pattern name, entity type, etc.)
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+
+ @dataclass
+ class DetectionResult:
+     """Result of dynamic content detection."""
+
+     # All detected spans
+     spans: list[DynamicSpan]
+
+     # Content with dynamic parts removed
+     static_content: str
+
+     # Content that was extracted (for reinsertion at end)
+     dynamic_content: str
+
+     # Which tiers were used
+     tiers_used: list[str]
+
+     # Processing time in milliseconds
+     processing_time_ms: float = 0.0
+
+     # Any warnings (e.g., "spaCy not available, skipping NER")
+     warnings: list[str] = field(default_factory=list)
+
+
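The point of the static/dynamic split is cache alignment: `static_content` stays byte-identical across requests and can be served from a provider's prefix cache, while the extracted values are re-appended after it. A minimal sketch of a consumer (not part of the package diff; the prompt layout is an illustrative assumption):

    def build_cache_friendly_prompt(result: DetectionResult) -> str:
        # Stable text first, so a provider-side prefix cache can reuse it.
        parts = [result.static_content]
        # Volatile values go last, where they cannot invalidate the prefix.
        if result.dynamic_content:
            parts.append("Current values:\n" + result.dynamic_content)
        return "\n\n".join(parts)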
+ @dataclass
+ class DetectorConfig:
+     """Configuration for the dynamic content detector."""
+
+     # Which tiers to enable (order matters - later tiers can use earlier results)
+     tiers: list[Literal["regex", "ner", "semantic"]] = field(default_factory=lambda: ["regex"])
+
+     # Tier 1: Structural labels that indicate dynamic content
+     # These are the KEY names that hint the VALUE is dynamic
+     # Users can add domain-specific labels
+     dynamic_labels: list[str] = field(
+         default_factory=lambda: [
+             # Time-related
+             "date",
+             "time",
+             "timestamp",
+             "datetime",
+             "created",
+             "updated",
+             "modified",
+             "expires",
+             "last",
+             "current",
+             "today",
+             "now",
+             # Identifiers
+             "id",
+             "uuid",
+             "guid",
+             "session",
+             "request",
+             "trace",
+             "span",
+             "transaction",
+             "correlation",
+             "token",
+             "key",
+             "secret",
+             # User-related
+             "user",
+             "username",
+             "email",
+             "name",
+             "phone",
+             "address",
+             "customer",
+             "client",
+             "employee",
+             "member",
+             # System state
+             "version",
+             "build",
+             "commit",
+             "branch",
+             "revision",
+             "status",
+             "state",
+             "count",
+             "total",
+             "balance",
+             "remaining",
+             "load",
+             "queue",
+             "active",
+             "pending",
+             # Order/ticket related
+             "order",
+             "ticket",
+             "case",
+             "invoice",
+             "reference",
+         ]
+     )
+
+     # Tier 1: Custom regex patterns (user-provided)
+     custom_patterns: list[tuple[str, DynamicCategory]] = field(default_factory=list)
+
+     # Entropy threshold for detecting random strings (0-1 scale normalized)
+     # Higher = more selective (only very random strings)
+     entropy_threshold: float = 0.7
+
+     # Minimum length for entropy-based detection
+     min_entropy_length: int = 8
+
+     # Tier 2: NER config
+     spacy_model: str = "en_core_web_sm"
+     ner_entity_types: list[str] = field(
+         default_factory=lambda: ["DATE", "TIME", "MONEY", "PERSON", "ORG", "GPE"]
+     )
+
+     # Tier 3: Semantic config
+     embedding_model: str = "all-MiniLM-L6-v2"
+     semantic_threshold: float = 0.7
+
+     # General
+     min_span_length: int = 2
+     merge_overlapping: bool = True
+
+
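Since the labels and patterns are plain dataclass fields, extending them is straightforward. A hedged sketch (the extra labels and the ticket-ID regex below are illustrative, not shipped defaults):

    config = DetectorConfig(
        tiers=["regex"],
        # Append domain vocabulary to the shipped label list.
        dynamic_labels=DetectorConfig().dynamic_labels + ["shipment", "tracking"],
        # Custom pattern for internal ticket IDs such as "HR-12345" (hypothetical).
        custom_patterns=[(r"\bHR-\d{4,6}\b", DynamicCategory.REQUEST_ID)],
        entropy_threshold=0.75,  # slightly stricter than the 0.7 default
    )
    detector = DynamicContentDetector(config)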
+ def calculate_entropy(s: str) -> float:
+     """
+     Calculate Shannon entropy of a string, normalized to 0-1.
+
+     Higher entropy = more random/unpredictable = likely dynamic.
+     - "aaaaaaa" -> 0.0 (low entropy, predictable)
+     - "a1b2c3d4" -> 1.0 (all characters distinct)
+     - "550e8400-e29b-41d4" -> ~0.8 (high entropy, random-looking)
+
+     Returns:
+         Normalized entropy (0-1). Higher = more likely dynamic.
+     """
+     if not s:
+         return 0.0
+
+     # Count character frequencies
+     freq: dict[str, int] = {}
+     for char in s:
+         freq[char] = freq.get(char, 0) + 1
+
+     # Calculate entropy
+     length = len(s)
+     entropy = 0.0
+     for count in freq.values():
+         p = count / length
+         entropy -= p * math.log2(p)
+
+     # Normalize: max entropy for a string of length n with k unique chars
+     # is log2(min(n, alphabet_size)). We normalize by log2(length)
+     # to get a 0-1 scale.
+     max_entropy = math.log2(length) if length > 1 else 1.0
+     return entropy / max_entropy if max_entropy > 0 else 0.0
+
+
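A few hand-computed values make the normalization concrete (H is the Shannon entropy in bits, divided by log2 of the string length):

    calculate_entropy("aaaaaaaa")          # 0.0:   one symbol, H = 0
    calculate_entropy("abababab")          # ~0.33: two symbols, H = 1, log2(8) = 3
    calculate_entropy("a1b2c3d4")          # 1.0:   all 8 characters distinct
    calculate_entropy("550e8400e29b41d4")  # ~0.79: hex ID, near-uniform spread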
+ class RegexDetector:
+     """
+     Tier 1: Scalable pattern detection.
+
+     Uses THREE strategies (no hardcoded month names!):
+     1. Structural: "Label: value" patterns where label indicates dynamic content
+     2. Universal: Truly universal formats (ISO 8601, UUID, Unix timestamps)
+     3. Entropy: High-entropy strings (tokens, hashes, IDs)
+     """
+
+     # Universal patterns (these formats are language-agnostic)
+     UNIVERSAL_PATTERNS = [
+         # UUID - truly universal format
+         (
+             r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}",
+             DynamicCategory.UUID,
+             "uuid",
+         ),
+         # ISO 8601 datetime (most universal date format)
+         (
+             r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?",
+             DynamicCategory.DATETIME,
+             "iso_datetime",
+         ),
+         # ISO 8601 date only
+         (r"\d{4}-\d{2}-\d{2}(?!\d)", DynamicCategory.DATE, "iso_date"),
+         # Unix timestamps (10-13 digits, but NOT within longer numbers)
+         (r"(?<![0-9])\d{10,13}(?![0-9])", DynamicCategory.TIMESTAMP, "unix_timestamp"),
+         # Clock time HH:MM or HH:MM:SS, with optional AM/PM suffix
+         (
+             r"(?<![0-9])\d{1,2}:\d{2}(?::\d{2})?(?:\s*(?:AM|PM|am|pm))?(?![0-9])",
+             DynamicCategory.TIME,
+             "time",
+         ),
+         # Version numbers with v prefix (unambiguous)
+         (r"\bv\d+\.\d+(?:\.\d+)?(?:-[a-zA-Z0-9.]+)?", DynamicCategory.VERSION, "version"),
+         # API key/token patterns (prefix + random string)
+         (
+             r"\b(?:sk|pk|api|key|token|bearer|auth)[-_][a-zA-Z0-9]{16,}",
+             DynamicCategory.REQUEST_ID,
+             "api_key",
+         ),
+         # Common prefixed IDs (req_, sess_, txn_, etc.)
+         (r"\b[a-z]{2,6}_[a-zA-Z0-9]{8,}", DynamicCategory.REQUEST_ID, "prefixed_id"),
+         # Hex strings of common ID lengths (32 = MD5, 40 = SHA1, 64 = SHA256)
+         (r"\b[a-fA-F0-9]{32}\b", DynamicCategory.IDENTIFIER, "hex_32"),
+         (r"\b[a-fA-F0-9]{40}\b", DynamicCategory.IDENTIFIER, "hex_40"),
+         (r"\b[a-fA-F0-9]{64}\b", DynamicCategory.IDENTIFIER, "hex_64"),
+         # JWT tokens (three base64 sections separated by dots)
+         (
+             r"eyJ[a-zA-Z0-9_-]+\.eyJ[a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+",
+             DynamicCategory.REQUEST_ID,
+             "jwt",
+         ),
+     ]
+
+     def __init__(self, config: DetectorConfig):
+         """Initialize regex detector."""
+         self.config = config
+
+         # Compile universal patterns
+         self._universal_patterns: list[tuple[re.Pattern[str], DynamicCategory, str]] = [
+             (re.compile(pattern), category, name)
+             for pattern, category, name in self.UNIVERSAL_PATTERNS
+         ]
+
+         # Build structural pattern from dynamic labels
+         # Pattern: "label" followed by separator then value
+         labels_pattern = "|".join(re.escape(label) for label in config.dynamic_labels)
+         self._structural_pattern = re.compile(
+             rf"(?P<label>(?:{labels_pattern}))(?P<sep>\s*[:=]\s*|\s+)(?P<value>[^\n,;]+)",
+             re.IGNORECASE,
+         )
+
+         # Compile custom patterns
+         self._custom_patterns: list[tuple[re.Pattern[str], DynamicCategory]] = [
+             (re.compile(pattern, re.IGNORECASE), category)
+             for pattern, category in config.custom_patterns
+         ]
+
+     def detect(self, content: str) -> list[DynamicSpan]:
+         """Detect dynamic content using structural, universal, and entropy detection."""
+         spans: list[DynamicSpan] = []
+         seen_ranges: set[tuple[int, int]] = set()
+
+         # 1. Universal patterns first (most specific)
+         for pattern, category, pattern_name in self._universal_patterns:
+             for match in pattern.finditer(content):
+                 start, end = match.start(), match.end()
+                 if self._is_overlapping(start, end, seen_ranges):
+                     continue
+                 if end - start < self.config.min_span_length:
+                     continue
+
+                 spans.append(
+                     DynamicSpan(
+                         text=match.group(),
+                         start=start,
+                         end=end,
+                         category=category,
+                         tier="regex",
+                         confidence=1.0,
+                         metadata={"pattern": pattern_name, "method": "universal"},
+                     )
+                 )
+                 seen_ranges.add((start, end))
+
+         # 2. Structural detection: "Label: value" patterns
+         for match in self._structural_pattern.finditer(content):
+             # Get the full match range
+             start, end = match.start(), match.end()
+
+             # Skip if overlaps with universal patterns
+             if self._is_overlapping(start, end, seen_ranges):
+                 continue
+
+             label = match.group("label").lower()
+             value = match.group("value").strip()
+
+             # Determine category from label
+             category = self._categorize_label(label)
+
+             # Only add the value portion (keep label as static)
+             value_start = match.start("value")
+             value_end = match.end("value")
+
+             # Skip if value is too short or empty
+             if value_end - value_start < self.config.min_span_length:
+                 continue
+             if not value.strip():
+                 continue
+
+             spans.append(
+                 DynamicSpan(
+                     text=value,
+                     start=value_start,
+                     end=value_end,
+                     category=category,
+                     tier="regex",
+                     confidence=0.9,
+                     metadata={"pattern": "structural", "method": "structural", "label": label},
+                 )
+             )
+             seen_ranges.add((value_start, value_end))
+
+         # 3. Entropy-based detection for remaining potential IDs
+         spans.extend(self._detect_high_entropy(content, seen_ranges))
+
+         # 4. Custom patterns
+         for pattern, category in self._custom_patterns:
+             for match in pattern.finditer(content):
+                 start, end = match.start(), match.end()
+                 if self._is_overlapping(start, end, seen_ranges):
+                     continue
+                 if end - start < self.config.min_span_length:
+                     continue
+
+                 spans.append(
+                     DynamicSpan(
+                         text=match.group(),
+                         start=start,
+                         end=end,
+                         category=category,
+                         tier="regex",
+                         confidence=0.8,
+                         metadata={"pattern": "custom", "method": "custom"},
+                     )
+                 )
+                 seen_ranges.add((start, end))
+
+         return sorted(spans, key=lambda s: s.start)
+
+     def _detect_high_entropy(
+         self,
+         content: str,
+         seen_ranges: set[tuple[int, int]],
+     ) -> list[DynamicSpan]:
+         """
+         Detect high-entropy strings that look like IDs/tokens.
+
+         Finds alphanumeric sequences and checks their entropy.
+         High entropy = likely random/generated = dynamic.
+         """
+         spans: list[DynamicSpan] = []
+
+         # Find alphanumeric sequences (potential IDs)
+         # Must be at least min_entropy_length chars, mix of letters/numbers
+         pattern = re.compile(r"\b[a-zA-Z0-9_-]{8,}\b")
+
+         for match in pattern.finditer(content):
+             start, end = match.start(), match.end()
+             text = match.group()
+
+             # Skip if already detected
+             if self._is_overlapping(start, end, seen_ranges):
+                 continue
+
+             # Skip if too short
+             if len(text) < self.config.min_entropy_length:
+                 continue
+
+             # Skip if all letters or all numbers (not random-looking)
+             if text.isalpha() or text.isdigit():
+                 continue
+
+             # Skip common words that might look like IDs
+             if text.lower() in {"username", "password", "localhost", "undefined"}:
+                 continue
+
+             # Calculate entropy
+             entropy = calculate_entropy(text)
+
+             if entropy >= self.config.entropy_threshold:
+                 spans.append(
+                     DynamicSpan(
+                         text=text,
+                         start=start,
+                         end=end,
+                         category=DynamicCategory.IDENTIFIER,
+                         tier="regex",
+                         confidence=entropy,  # Use entropy as confidence
+                         metadata={"pattern": "entropy", "method": "entropy", "entropy": entropy},
+                     )
+                 )
+                 seen_ranges.add((start, end))
+
+         return spans
+
+     def _is_overlapping(
+         self,
+         start: int,
+         end: int,
+         seen_ranges: set[tuple[int, int]],
+     ) -> bool:
+         """Check if range overlaps with any existing range."""
+         return any(not (end <= s or start >= e) for s, e in seen_ranges)
+
+     def _categorize_label(self, label: str) -> DynamicCategory:
+         """Categorize based on the label name."""
+         label = label.lower()
+
+         # Time-related
+         if label in {"date", "datetime", "created", "updated", "modified", "expires", "today"}:
+             return DynamicCategory.DATE
+         if label in {"time", "timestamp", "now"}:
+             return DynamicCategory.TIMESTAMP
+         if label == "current":
+             return DynamicCategory.DATETIME
+
+         # Identifiers
+         if label in {"id", "uuid", "guid"}:
+             return DynamicCategory.UUID
+         if label in {"session", "request", "trace", "span", "transaction", "correlation"}:
+             return DynamicCategory.SESSION
+         if label in {"token", "key", "secret"}:
+             return DynamicCategory.REQUEST_ID
+
+         # User-related
+         if label in {
+             "user",
+             "username",
+             "email",
+             "name",
+             "phone",
+             "address",
+             "customer",
+             "client",
+             "employee",
+             "member",
+         }:
+             return DynamicCategory.USER_DATA
+
+         # System state
+         if label in {"version", "build", "commit", "branch", "revision"}:
+             return DynamicCategory.VERSION
+         if label in {
+             "status",
+             "state",
+             "count",
+             "total",
+             "balance",
+             "remaining",
+             "load",
+             "queue",
+             "active",
+             "pending",
+         }:
+             return DynamicCategory.VOLATILE
+
+         # Order/ticket
+         if label in {"order", "ticket", "case", "invoice", "reference"}:
+             return DynamicCategory.REQUEST_ID
+
+         return DynamicCategory.UNKNOWN
+
+
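A hedged walk-through of how the three strategies interact on one line of text, derived from the code above (not package documentation). Note the ordering effect: universal patterns claim ranges first, a structural match whose range overlaps an already-claimed span is skipped entirely, and the entropy pass then picks up whatever ID-like tokens remain:

    detector = RegexDetector(DetectorConfig())
    text = "session: abc123xyz9 started 2024-05-01T12:30:00Z, status: pending"
    for s in detector.detect(text):
        print(s.category.value, repr(s.text), s.metadata["method"])
    # identifier 'abc123xyz9' entropy      (the "session:" structural match is
    #                                       skipped: its value overlaps the datetime)
    # datetime '2024-05-01T12:30:00Z' universal
    # volatile 'pending' structural        (from the "status" label)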
+ class NERDetector:
+     """Tier 2: spaCy-based Named Entity Recognition."""
+
+     # Map spaCy entity types to our categories
+     ENTITY_MAP = {
+         "DATE": DynamicCategory.DATE,
+         "TIME": DynamicCategory.TIME,
+         "MONEY": DynamicCategory.MONEY,
+         "PERSON": DynamicCategory.PERSON,
+         "ORG": DynamicCategory.ORG,
+         "GPE": DynamicCategory.LOCATION,  # Geo-Political Entity
+         "LOC": DynamicCategory.LOCATION,
+         "FAC": DynamicCategory.LOCATION,  # Facility
+         "CARDINAL": DynamicCategory.UNKNOWN,  # Numbers
+         "ORDINAL": DynamicCategory.UNKNOWN,
+     }
+
+     def __init__(self, config: DetectorConfig):
+         """Initialize NER detector, loading spaCy model."""
+         self.config = config
+         self._nlp = None
+         self._load_error: str | None = None
+
+         if not _SPACY_AVAILABLE:
+             self._load_error = (
+                 "spaCy not installed. Install with: "
+                 "pip install spacy && python -m spacy download en_core_web_sm"
+             )
+             return
+
+         try:
+             self._nlp = spacy.load(config.spacy_model)
+         except OSError:
+             self._load_error = (
+                 f"spaCy model '{config.spacy_model}' not found. "
+                 f"Install with: python -m spacy download {config.spacy_model}"
+             )
+
+     @property
+     def is_available(self) -> bool:
+         """Check if NER is available."""
+         return self._nlp is not None
+
+     def detect(
+         self,
+         content: str,
+         existing_spans: list[DynamicSpan] | None = None,
+     ) -> tuple[list[DynamicSpan], str | None]:
+         """
+         Detect dynamic content using NER.
+
+         Args:
+             content: Text to analyze.
+             existing_spans: Spans already detected (to avoid duplicates).
+
+         Returns:
+             Tuple of (new_spans, warning_message).
+         """
+         if not self.is_available:
+             return [], self._load_error
+
+         # Get existing ranges to avoid duplicates
+         existing_ranges = set()
+         if existing_spans:
+             existing_ranges = {(s.start, s.end) for s in existing_spans}
+
+         doc = self._nlp(content)  # type: ignore[misc]
+         spans: list[DynamicSpan] = []
+
+         for ent in doc.ents:
+             # Skip entity types we don't care about
+             if ent.label_ not in self.config.ner_entity_types:
+                 continue
+
+             # Skip if already detected by regex
+             if (ent.start_char, ent.end_char) in existing_ranges:
+                 continue
+
+             # Check for overlap with existing spans
+             overlaps = any(
+                 not (ent.end_char <= s or ent.start_char >= e) for s, e in existing_ranges
+             )
+             if overlaps:
+                 continue
+
+             # Map to our category
+             category = self.ENTITY_MAP.get(ent.label_, DynamicCategory.UNKNOWN)
+
+             # Skip unknown categories
+             if category == DynamicCategory.UNKNOWN:
+                 continue
+
+             spans.append(
+                 DynamicSpan(
+                     text=ent.text,
+                     start=ent.start_char,
+                     end=ent.end_char,
+                     category=category,
+                     tier="ner",
+                     confidence=0.9,
+                     metadata={"entity_type": ent.label_},
+                 )
+             )
+             existing_ranges.add((ent.start_char, ent.end_char))
+
+         return sorted(spans, key=lambda s: s.start), None
+
+
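A sketch of using the NER tier directly, with the graceful-degradation path visible (requires spacy plus the en_core_web_sm model; entity boundaries and labels vary with the model version, so the printed output is indicative only):

    ner = NERDetector(DetectorConfig())
    if ner.is_available:
        spans, _warning = ner.detect("John Smith paid $500 to Acme Corp last Tuesday.")
        for s in spans:
            print(s.category.value, repr(s.text), s.metadata["entity_type"])
        # Typically: person 'John Smith' PERSON, money '$500' MONEY,
        #            org 'Acme Corp' ORG, date 'last Tuesday' DATE
    else:
        # No hard failure: detect() returns ([], warning_message) instead.
        print("NER tier unavailable")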
+ class SemanticDetector:
+     """Tier 3: Embedding-based semantic detection."""
+
+     # Known phrases that indicate dynamic content
+     # These are SEMANTIC patterns, not literal strings to match
+     DYNAMIC_EXEMPLARS = [
+         # Time-sensitive
+         "The current date is",
+         "As of today",
+         "Updated on",
+         "Last refreshed",
+         "Real-time data",
+         "Live prices",
+         "Current stock price",
+         # Session-specific
+         "Your session ID",
+         "Your account balance",
+         "Your recent orders",
+         "Your conversation history",
+         # User-specific
+         "Hello [user]",
+         "Dear customer",
+         "Your name is",
+         # System state
+         "Server status",
+         "System load",
+         "Queue length",
+         "Active users",
+     ]
+
+     def __init__(self, config: DetectorConfig):
+         """Initialize semantic detector with embedding model."""
+         self.config = config
+         self._model = None
+         self._exemplar_embeddings = None
+         self._load_error: str | None = None
+
+         if not _SENTENCE_TRANSFORMERS_AVAILABLE:
+             self._load_error = (
+                 "sentence-transformers not installed. "
+                 "Install with: pip install sentence-transformers"
+             )
+             return
+
+         try:
+             self._model = SentenceTransformer(config.embedding_model)
+             # Pre-compute exemplar embeddings
+             self._exemplar_embeddings = self._model.encode(
+                 self.DYNAMIC_EXEMPLARS,
+                 convert_to_numpy=True,
+             )
+         except Exception as e:
+             self._load_error = f"Failed to load embedding model: {e}"
+
+     @property
+     def is_available(self) -> bool:
+         """Check if semantic detection is available."""
+         return self._model is not None
+
+     def detect(
+         self,
+         content: str,
+         existing_spans: list[DynamicSpan] | None = None,
+     ) -> tuple[list[DynamicSpan], str | None]:
+         """
+         Detect dynamic content using semantic similarity.
+
+         Splits content into sentences and checks each against known
+         dynamic patterns using embedding similarity.
+
+         Args:
+             content: Text to analyze.
+             existing_spans: Spans already detected (to avoid duplicates).
+
+         Returns:
+             Tuple of (new_spans, warning_message).
+         """
+         if not self.is_available:
+             return [], self._load_error
+
+         # Simple sentence splitting (could use spaCy if available)
+         sentences = self._split_sentences(content)
+         spans: list[DynamicSpan] = []
+
+         # Get existing ranges
+         existing_ranges = set()
+         if existing_spans:
+             existing_ranges = {(s.start, s.end) for s in existing_spans}
+
+         # Encode all sentences
+         if not sentences:
+             return [], None
+
+         sentence_texts = [s[0] for s in sentences]
+         sentence_embeddings = self._model.encode(  # type: ignore[union-attr]
+             sentence_texts,
+             convert_to_numpy=True,
+         )
+
+         # Compute similarities
+         similarities = np.dot(sentence_embeddings, self._exemplar_embeddings.T)  # type: ignore[union-attr]
+
+         for i, (text, start, end) in enumerate(sentences):
+             # Get max similarity to any exemplar
+             max_sim = float(np.max(similarities[i]))
+
+             if max_sim < self.config.semantic_threshold:
+                 continue
+
+             # Check overlap with existing spans
+             overlaps = any(not (end <= s or start >= e) for s, e in existing_ranges)
+             if overlaps:
+                 continue
+
+             # Find which exemplar matched best
+             best_exemplar_idx = int(np.argmax(similarities[i]))
+             best_exemplar = self.DYNAMIC_EXEMPLARS[best_exemplar_idx]
+
+             # Determine category based on exemplar
+             category = self._categorize_exemplar(best_exemplar)
+
+             spans.append(
+                 DynamicSpan(
+                     text=text,
+                     start=start,
+                     end=end,
+                     category=category,
+                     tier="semantic",
+                     confidence=max_sim,
+                     metadata={
+                         "matched_exemplar": best_exemplar,
+                         "similarity": max_sim,
+                     },
+                 )
+             )
+             existing_ranges.add((start, end))
+
+         return sorted(spans, key=lambda s: s.start), None
+
+     def _split_sentences(self, content: str) -> list[tuple[str, int, int]]:
+         """Split content into sentences with positions."""
+         sentences: list[tuple[str, int, int]] = []
+         pattern = r"[^.!?\n]+[.!?\n]?"
+         for match in re.finditer(pattern, content):
+             text = match.group().strip()
+             if len(text) > 10:
+                 sentences.append((text, match.start(), match.end()))
+         return sentences
+
+     def _categorize_exemplar(self, exemplar: str) -> DynamicCategory:
+         """Categorize based on which exemplar matched."""
+         exemplar_lower = exemplar.lower()
+
+         if any(w in exemplar_lower for w in ["date", "today", "updated", "refreshed"]):
+             return DynamicCategory.DATE
+         elif any(w in exemplar_lower for w in ["price", "stock", "live", "real-time"]):
+             return DynamicCategory.REALTIME
+         elif any(w in exemplar_lower for w in ["session", "account", "your"]):
+             return DynamicCategory.SESSION
+         elif any(w in exemplar_lower for w in ["status", "load", "queue", "active"]):
+             return DynamicCategory.VOLATILE
+         else:
+             return DynamicCategory.VOLATILE
+
+
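The semantic tier flags whole sentences whose embeddings land near the exemplars above. With the default all-MiniLM-L6-v2, the encoded vectors come out unit-normalized, so the dot product in detect() behaves like cosine similarity; other embedding_model choices may need the same property for the 0.7 threshold to be meaningful. A sketch (requires sentence-transformers; scores are model-dependent):

    sem = SemanticDetector(DetectorConfig(tiers=["semantic"]))
    if sem.is_available:
        spans, _ = sem.detect(
            "You are a helpful assistant. The current date is March 3. "
            "Answer questions about our products."
        )
        for s in spans:
            print(round(s.confidence, 2), repr(s.text), s.metadata["matched_exemplar"])
        # Expected shape: the middle sentence scores high against the
        # "The current date is" exemplar and is flagged; the instructions stay static.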
+ class DynamicContentDetector:
+     """
+     Unified dynamic content detector with tiered detection.
+
+     Key Design Principles:
+     - NO hardcoded locale-specific patterns (no month names)
+     - Structural detection: Labels indicate what's dynamic
+     - Universal patterns: ISO 8601, UUIDs, Unix timestamps
+     - Entropy-based: High entropy = random/generated = dynamic
+
+     Usage:
+         # Fast mode (regex only - structural + universal + entropy)
+         detector = DynamicContentDetector(DetectorConfig(tiers=["regex"]))
+
+         # Balanced mode (regex + NER for names/money)
+         detector = DynamicContentDetector(DetectorConfig(tiers=["regex", "ner"]))
+
+         # Full mode (all tiers)
+         detector = DynamicContentDetector(DetectorConfig(
+             tiers=["regex", "ner", "semantic"]
+         ))
+
+         result = detector.detect("Session: abc123. User: John paid $500.")
+     """
+
+     def __init__(self, config: DetectorConfig | None = None):
+         """Initialize detector with configuration."""
+         self.config = config or DetectorConfig()
+
+         # Initialize detectors based on enabled tiers
+         self._regex_detector: RegexDetector | None = None
+         self._ner_detector: NERDetector | None = None
+         self._semantic_detector: SemanticDetector | None = None
+
+         if "regex" in self.config.tiers:
+             self._regex_detector = RegexDetector(self.config)
+
+         if "ner" in self.config.tiers:
+             self._ner_detector = NERDetector(self.config)
+
+         if "semantic" in self.config.tiers:
+             self._semantic_detector = SemanticDetector(self.config)
+
+     def detect(self, content: str) -> DetectionResult:
+         """
+         Detect dynamic content in text.
+
+         Runs enabled tiers in order, accumulating spans.
+         Each tier can see what previous tiers detected.
+
+         Args:
+             content: Text to analyze.
+
+         Returns:
+             DetectionResult with spans, static/dynamic content split, etc.
+         """
+         import time
+
+         start_time = time.perf_counter()
+
+         all_spans: list[DynamicSpan] = []
+         tiers_used: list[str] = []
+         warnings: list[str] = []
+
+         # Tier 1: Regex (structural + universal + entropy)
+         if self._regex_detector:
+             regex_spans = self._regex_detector.detect(content)
+             all_spans.extend(regex_spans)
+             tiers_used.append("regex")
+
+         # Tier 2: NER
+         if self._ner_detector:
+             ner_spans, ner_warning = self._ner_detector.detect(content, all_spans)
+             all_spans.extend(ner_spans)
+             if ner_warning:
+                 warnings.append(ner_warning)
+             elif ner_spans or self._ner_detector.is_available:
+                 tiers_used.append("ner")
+
+         # Tier 3: Semantic
+         if self._semantic_detector:
+             sem_spans, sem_warning = self._semantic_detector.detect(content, all_spans)
+             all_spans.extend(sem_spans)
+             if sem_warning:
+                 warnings.append(sem_warning)
+             elif sem_spans or self._semantic_detector.is_available:
+                 tiers_used.append("semantic")
+
+         # Sort by position
+         all_spans = sorted(all_spans, key=lambda s: s.start)
+
+         # Build static and dynamic content
+         static_content, dynamic_content = self._split_content(content, all_spans)
+
+         processing_time = (time.perf_counter() - start_time) * 1000
+
+         return DetectionResult(
+             spans=all_spans,
+             static_content=static_content,
+             dynamic_content=dynamic_content,
+             tiers_used=tiers_used,
+             processing_time_ms=processing_time,
+             warnings=warnings,
+         )
+
+     def _split_content(
+         self,
+         content: str,
+         spans: list[DynamicSpan],
+     ) -> tuple[str, str]:
+         """Split content into static and dynamic parts."""
+         if not spans:
+             return content, ""
+
+         static = content
+         dynamic_parts: list[str] = []
+
+         # Remove spans back-to-front so earlier offsets stay valid
+         for span in reversed(spans):
+             dynamic_parts.append(span.text)
+             static = static[: span.start] + static[span.end :]
+
+         static = self._clean_static_content(static)
+         dynamic_parts.reverse()
+         dynamic = "\n".join(dynamic_parts)
+
+         return static, dynamic
+
+     def _clean_static_content(self, content: str) -> str:
+         """Clean up static content after span removal."""
+         lines = content.split("\n")
+         cleaned_lines: list[str] = []
+         prev_blank = False
+
+         for line in lines:
+             is_blank = not line.strip()
+             if is_blank and prev_blank:
+                 continue
+             cleaned_lines.append(line.rstrip())
+             prev_blank = is_blank
+
+         return "\n".join(cleaned_lines).strip()
+
+     @property
+     def available_tiers(self) -> list[str]:
+         """Get list of actually available tiers (dependencies installed)."""
+         available = []
+
+         if self._regex_detector:
+             available.append("regex")
+
+         if self._ner_detector and self._ner_detector.is_available:
+             available.append("ner")
+
+         if self._semantic_detector and self._semantic_detector.is_available:
+             available.append("semantic")
+
+         return available
+
+
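End to end, the intended flow is: detect once, then rebuild the prompt with the stable text first. available_tiers and the result's warnings make the optional-dependency fallback observable. A sketch (the prompt text and final layout are illustrative assumptions, not package API):

    detector = DynamicContentDetector(DetectorConfig(tiers=["regex", "ner", "semantic"]))
    print(detector.available_tiers)  # e.g. ["regex"] if the optional extras are missing

    result = detector.detect(
        "You are a support agent for Acme.\n"
        "Today: 2024-05-01\n"
        "Session: sess_8fk2lq91x\n"
        "Always answer politely."
    )
    if result.warnings:
        print("degraded:", result.warnings)

    # Stable prefix first; volatile values re-appended where they cannot
    # break provider-side prefix caching.
    prompt = result.static_content + "\n\nCurrent values:\n" + result.dynamic_content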
+ # Convenience function
+ def detect_dynamic_content(
+     content: str,
+     tiers: list[Literal["regex", "ner", "semantic"]] | None = None,
+ ) -> DetectionResult:
+     """
+     Detect dynamic content in text.
+
+     Convenience function that creates a detector with specified tiers.
+
+     Args:
+         content: Text to analyze.
+         tiers: Which tiers to use. Default: ["regex"] for speed.
+
+     Returns:
+         DetectionResult with detected spans and split content.
+
+     Example:
+         >>> result = detect_dynamic_content(
+         ...     "Session: abc123xyz. User: John paid $500.",
+         ...     tiers=["regex", "ner"]
+         ... )
+         >>> print(result.static_content)
+         >>> print(result.dynamic_content)
+     """
+     config = DetectorConfig(tiers=tiers or ["regex"])
+     detector = DynamicContentDetector(config)
+     return detector.detect(content)
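One practical note, derived from the structural regex above: a matched value runs until the next newline, comma, or semicolon, so volatile fields split most cleanly when each sits on its own line. A final hedged sketch:

    # Each "Label: value" pair on its own line gives the cleanest split.
    result = detect_dynamic_content("Session: abc123xyz\nBalance: $41.50\nBe concise.")
    print(result.static_content)   # stable text with the volatile values removed
    print(result.dynamic_content)  # the extracted values, one per line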