headroom_ai-0.2.13-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- headroom/__init__.py +212 -0
- headroom/cache/__init__.py +76 -0
- headroom/cache/anthropic.py +517 -0
- headroom/cache/base.py +342 -0
- headroom/cache/compression_feedback.py +613 -0
- headroom/cache/compression_store.py +814 -0
- headroom/cache/dynamic_detector.py +1026 -0
- headroom/cache/google.py +884 -0
- headroom/cache/openai.py +584 -0
- headroom/cache/registry.py +175 -0
- headroom/cache/semantic.py +451 -0
- headroom/ccr/__init__.py +77 -0
- headroom/ccr/context_tracker.py +582 -0
- headroom/ccr/mcp_server.py +319 -0
- headroom/ccr/response_handler.py +772 -0
- headroom/ccr/tool_injection.py +415 -0
- headroom/cli.py +219 -0
- headroom/client.py +977 -0
- headroom/compression/__init__.py +42 -0
- headroom/compression/detector.py +424 -0
- headroom/compression/handlers/__init__.py +22 -0
- headroom/compression/handlers/base.py +219 -0
- headroom/compression/handlers/code_handler.py +506 -0
- headroom/compression/handlers/json_handler.py +418 -0
- headroom/compression/masks.py +345 -0
- headroom/compression/universal.py +465 -0
- headroom/config.py +474 -0
- headroom/exceptions.py +192 -0
- headroom/integrations/__init__.py +159 -0
- headroom/integrations/agno/__init__.py +53 -0
- headroom/integrations/agno/hooks.py +345 -0
- headroom/integrations/agno/model.py +625 -0
- headroom/integrations/agno/providers.py +154 -0
- headroom/integrations/langchain/__init__.py +106 -0
- headroom/integrations/langchain/agents.py +326 -0
- headroom/integrations/langchain/chat_model.py +1002 -0
- headroom/integrations/langchain/langsmith.py +324 -0
- headroom/integrations/langchain/memory.py +319 -0
- headroom/integrations/langchain/providers.py +200 -0
- headroom/integrations/langchain/retriever.py +371 -0
- headroom/integrations/langchain/streaming.py +341 -0
- headroom/integrations/mcp/__init__.py +37 -0
- headroom/integrations/mcp/server.py +533 -0
- headroom/memory/__init__.py +37 -0
- headroom/memory/extractor.py +390 -0
- headroom/memory/fast_store.py +621 -0
- headroom/memory/fast_wrapper.py +311 -0
- headroom/memory/inline_extractor.py +229 -0
- headroom/memory/store.py +434 -0
- headroom/memory/worker.py +260 -0
- headroom/memory/wrapper.py +321 -0
- headroom/models/__init__.py +39 -0
- headroom/models/registry.py +687 -0
- headroom/parser.py +293 -0
- headroom/pricing/__init__.py +51 -0
- headroom/pricing/anthropic_prices.py +81 -0
- headroom/pricing/litellm_pricing.py +113 -0
- headroom/pricing/openai_prices.py +91 -0
- headroom/pricing/registry.py +188 -0
- headroom/providers/__init__.py +61 -0
- headroom/providers/anthropic.py +621 -0
- headroom/providers/base.py +131 -0
- headroom/providers/cohere.py +362 -0
- headroom/providers/google.py +427 -0
- headroom/providers/litellm.py +297 -0
- headroom/providers/openai.py +566 -0
- headroom/providers/openai_compatible.py +521 -0
- headroom/proxy/__init__.py +19 -0
- headroom/proxy/server.py +2683 -0
- headroom/py.typed +0 -0
- headroom/relevance/__init__.py +124 -0
- headroom/relevance/base.py +106 -0
- headroom/relevance/bm25.py +255 -0
- headroom/relevance/embedding.py +255 -0
- headroom/relevance/hybrid.py +259 -0
- headroom/reporting/__init__.py +5 -0
- headroom/reporting/generator.py +549 -0
- headroom/storage/__init__.py +41 -0
- headroom/storage/base.py +125 -0
- headroom/storage/jsonl.py +220 -0
- headroom/storage/sqlite.py +289 -0
- headroom/telemetry/__init__.py +91 -0
- headroom/telemetry/collector.py +764 -0
- headroom/telemetry/models.py +880 -0
- headroom/telemetry/toin.py +1579 -0
- headroom/tokenizer.py +80 -0
- headroom/tokenizers/__init__.py +75 -0
- headroom/tokenizers/base.py +210 -0
- headroom/tokenizers/estimator.py +198 -0
- headroom/tokenizers/huggingface.py +317 -0
- headroom/tokenizers/mistral.py +245 -0
- headroom/tokenizers/registry.py +398 -0
- headroom/tokenizers/tiktoken_counter.py +248 -0
- headroom/transforms/__init__.py +106 -0
- headroom/transforms/base.py +57 -0
- headroom/transforms/cache_aligner.py +357 -0
- headroom/transforms/code_compressor.py +1313 -0
- headroom/transforms/content_detector.py +335 -0
- headroom/transforms/content_router.py +1158 -0
- headroom/transforms/llmlingua_compressor.py +638 -0
- headroom/transforms/log_compressor.py +529 -0
- headroom/transforms/pipeline.py +297 -0
- headroom/transforms/rolling_window.py +350 -0
- headroom/transforms/search_compressor.py +365 -0
- headroom/transforms/smart_crusher.py +2682 -0
- headroom/transforms/text_compressor.py +259 -0
- headroom/transforms/tool_crusher.py +338 -0
- headroom/utils.py +215 -0
- headroom_ai-0.2.13.dist-info/METADATA +315 -0
- headroom_ai-0.2.13.dist-info/RECORD +114 -0
- headroom_ai-0.2.13.dist-info/WHEEL +4 -0
- headroom_ai-0.2.13.dist-info/entry_points.txt +2 -0
- headroom_ai-0.2.13.dist-info/licenses/LICENSE +190 -0
- headroom_ai-0.2.13.dist-info/licenses/NOTICE +43 -0
headroom/cache/dynamic_detector.py
@@ -0,0 +1,1026 @@
"""
Dynamic Content Detector for Cache Optimization.

This module provides a scalable, language-agnostic approach to detecting dynamic
content in prompts. Dynamic content (dates, prices, user data, session info) breaks
cache prefixes. By detecting and moving dynamic content to the end, we maximize
cache hits.

Design Philosophy:
- NO HARDCODED PATTERNS for locale-specific content (no month names, etc.)
- Structural detection: "Label: value" patterns where LABEL indicates dynamism
- Entropy-based detection: High entropy = likely dynamic (UUIDs, tokens, hashes)
- Universal patterns only: ISO 8601, UUIDs, Unix timestamps (truly universal)

Tiers (configurable, each adds latency):
    Tier 1: Regex (~0ms) - Structural patterns, universal formats, entropy-based
    Tier 2: NER (~5-10ms) - Named Entity Recognition for names, money, orgs
    Tier 3: Semantic (~20-50ms) - Embedding similarity to known dynamic patterns

Usage:
    from headroom.cache.dynamic_detector import DynamicContentDetector

    detector = DynamicContentDetector(tiers=["regex", "ner"])
    result = detector.detect("Session: abc123. User: John paid $500.")

    # result.spans = [
    #     DynamicSpan(text="Session: abc123", category="session", tier="regex", ...),
    #     DynamicSpan(text="John", category="person", tier="ner", ...),
    #     DynamicSpan(text="$500", category="money", tier="ner", ...),
    # ]
"""

from __future__ import annotations

import math
import re
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Literal

# Optional imports - graceful degradation
_SPACY_AVAILABLE = False
_SENTENCE_TRANSFORMERS_AVAILABLE = False

try:
    import spacy

    _SPACY_AVAILABLE = True
except ImportError:
    spacy = None  # type: ignore

try:
    import numpy as np
    from sentence_transformers import SentenceTransformer

    _SENTENCE_TRANSFORMERS_AVAILABLE = True
except ImportError:
    SentenceTransformer = None  # type: ignore
    np = None  # type: ignore


class DynamicCategory(str, Enum):
    """Categories of dynamic content."""

    # Tier 1: Structural/Regex detectable
    DATE = "date"
    TIME = "time"
    DATETIME = "datetime"
    TIMESTAMP = "timestamp"
    UUID = "uuid"
    REQUEST_ID = "request_id"
    VERSION = "version"
    SESSION = "session"
    USER_DATA = "user_data"
    IDENTIFIER = "identifier"  # Generic high-entropy ID

    # Tier 2: NER detectable
    PERSON = "person"
    MONEY = "money"
    ORG = "org"
    LOCATION = "location"

    # Tier 3: Semantic
    VOLATILE = "volatile"  # Semantically detected as changing
    REALTIME = "realtime"

    # Fallback
    UNKNOWN = "unknown"


@dataclass
class DynamicSpan:
    """A span of dynamic content detected in text."""

    # The actual text matched
    text: str

    # Position in original content
    start: int
    end: int

    # What category of dynamic content
    category: DynamicCategory

    # Which tier detected it
    tier: Literal["regex", "ner", "semantic"]

    # Confidence score (0-1)
    confidence: float = 1.0

    # Additional metadata (pattern name, entity type, etc.)
    metadata: dict[str, Any] = field(default_factory=dict)


@dataclass
class DetectionResult:
    """Result of dynamic content detection."""

    # All detected spans
    spans: list[DynamicSpan]

    # Content with dynamic parts removed
    static_content: str

    # Content that was extracted (for reinsertion at end)
    dynamic_content: str

    # Which tiers were used
    tiers_used: list[str]

    # Processing time in milliseconds
    processing_time_ms: float = 0.0

    # Any warnings (e.g., "spaCy not available, skipping NER")
    warnings: list[str] = field(default_factory=list)


@dataclass
class DetectorConfig:
    """Configuration for the dynamic content detector."""

    # Which tiers to enable (order matters - later tiers can use earlier results)
    tiers: list[Literal["regex", "ner", "semantic"]] = field(default_factory=lambda: ["regex"])

    # Tier 1: Structural labels that indicate dynamic content
    # These are the KEY names that hint the VALUE is dynamic
    # Users can add domain-specific labels
    dynamic_labels: list[str] = field(
        default_factory=lambda: [
            # Time-related
            "date",
            "time",
            "timestamp",
            "datetime",
            "created",
            "updated",
            "modified",
            "expires",
            "last",
            "current",
            "today",
            "now",
            # Identifiers
            "id",
            "uuid",
            "guid",
            "session",
            "request",
            "trace",
            "span",
            "transaction",
            "correlation",
            "token",
            "key",
            "secret",
            # User-related
            "user",
            "username",
            "email",
            "name",
            "phone",
            "address",
            "customer",
            "client",
            "employee",
            "member",
            # System state
            "version",
            "build",
            "commit",
            "branch",
            "revision",
            "status",
            "state",
            "count",
            "total",
            "balance",
            "remaining",
            "load",
            "queue",
            "active",
            "pending",
            # Order/ticket related
            "order",
            "ticket",
            "case",
            "invoice",
            "reference",
        ]
    )

    # Tier 1: Custom regex patterns (user-provided)
    custom_patterns: list[tuple[str, DynamicCategory]] = field(default_factory=list)

    # Entropy threshold for detecting random strings (0-1 scale normalized)
    # Higher = more selective (only very random strings)
    entropy_threshold: float = 0.7

    # Minimum length for entropy-based detection
    min_entropy_length: int = 8

    # Tier 2: NER config
    spacy_model: str = "en_core_web_sm"
    ner_entity_types: list[str] = field(
        default_factory=lambda: ["DATE", "TIME", "MONEY", "PERSON", "ORG", "GPE"]
    )

    # Tier 3: Semantic config
    embedding_model: str = "all-MiniLM-L6-v2"
    semantic_threshold: float = 0.7

    # General
    min_span_length: int = 2
    merge_overlapping: bool = True


def calculate_entropy(s: str) -> float:
    """
    Calculate Shannon entropy of a string, normalized to 0-1.

    Higher entropy = more random/unpredictable = likely dynamic.
    - "aaaaaaa" -> ~0 (low entropy, predictable)
    - "a1b2c3d4" -> ~0.7 (medium entropy)
    - "550e8400-e29b-41d4" -> ~0.9 (high entropy, random-looking)

    Returns:
        Normalized entropy (0-1). Higher = more likely dynamic.
    """
    if not s:
        return 0.0

    # Count character frequencies
    freq: dict[str, int] = {}
    for char in s:
        freq[char] = freq.get(char, 0) + 1

    # Calculate entropy
    length = len(s)
    entropy = 0.0
    for count in freq.values():
        p = count / length
        entropy -= p * math.log2(p)

    # Normalize: max entropy for string of length n with k unique chars
    # is log2(min(n, alphabet_size)). We'll normalize by log2(length)
    # to get a 0-1 scale
    max_entropy = math.log2(length) if length > 1 else 1.0
    return entropy / max_entropy if max_entropy > 0 else 0.0


class RegexDetector:
    """
    Tier 1: Scalable pattern detection.

    Uses THREE strategies (no hardcoded month names!):
    1. Structural: "Label: value" patterns where label indicates dynamic content
    2. Universal: Truly universal formats (ISO 8601, UUID, Unix timestamps)
    3. Entropy: High-entropy strings (tokens, hashes, IDs)
    """

    # Universal patterns (these formats are language-agnostic)
    UNIVERSAL_PATTERNS = [
        # UUID - truly universal format
        (
            r"[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}",
            DynamicCategory.UUID,
            "uuid",
        ),
        # ISO 8601 datetime (most universal date format)
        (
            r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?",
            DynamicCategory.DATETIME,
            "iso_datetime",
        ),
        # ISO 8601 date only
        (r"\d{4}-\d{2}-\d{2}(?!\d)", DynamicCategory.DATE, "iso_date"),
        # Unix timestamps (10-13 digits, but NOT within longer numbers)
        (r"(?<![0-9])\d{10,13}(?![0-9])", DynamicCategory.TIMESTAMP, "unix_timestamp"),
        # 24-hour time HH:MM:SS or HH:MM
        (
            r"(?<![0-9])\d{1,2}:\d{2}(?::\d{2})?(?:\s*(?:AM|PM|am|pm))?(?![0-9])",
            DynamicCategory.TIME,
            "time",
        ),
        # Version numbers with v prefix (unambiguous)
        (r"\bv\d+\.\d+(?:\.\d+)?(?:-[a-zA-Z0-9.]+)?", DynamicCategory.VERSION, "version"),
        # API key/token patterns (prefix + random string)
        (
            r"\b(?:sk|pk|api|key|token|bearer|auth)[-_][a-zA-Z0-9]{16,}",
            DynamicCategory.REQUEST_ID,
            "api_key",
        ),
        # Common prefixed IDs (req_, sess_, txn_, etc.)
        (r"\b[a-z]{2,6}_[a-zA-Z0-9]{8,}", DynamicCategory.REQUEST_ID, "prefixed_id"),
        # Hex strings of common ID lengths (32 = MD5, 40 = SHA1, 64 = SHA256)
        (r"\b[a-fA-F0-9]{32}\b", DynamicCategory.IDENTIFIER, "hex_32"),
        (r"\b[a-fA-F0-9]{40}\b", DynamicCategory.IDENTIFIER, "hex_40"),
        (r"\b[a-fA-F0-9]{64}\b", DynamicCategory.IDENTIFIER, "hex_64"),
        # JWT tokens (three base64 sections separated by dots)
        (
            r"eyJ[a-zA-Z0-9_-]+\.eyJ[a-zA-Z0-9_-]+\.[a-zA-Z0-9_-]+",
            DynamicCategory.REQUEST_ID,
            "jwt",
        ),
    ]

    def __init__(self, config: DetectorConfig):
        """Initialize regex detector."""
        self.config = config

        # Compile universal patterns
        self._universal_patterns: list[tuple[re.Pattern[str], DynamicCategory, str]] = [
            (re.compile(pattern), category, name)
            for pattern, category, name in self.UNIVERSAL_PATTERNS
        ]

        # Build structural pattern from dynamic labels
        # Pattern: "label" followed by separator then value
        labels_pattern = "|".join(re.escape(label) for label in config.dynamic_labels)
        self._structural_pattern = re.compile(
            rf"(?P<label>(?:{labels_pattern}))(?P<sep>\s*[:=]\s*|\s+)(?P<value>[^\n,;]+)",
            re.IGNORECASE,
        )

        # Compile custom patterns
        self._custom_patterns: list[tuple[re.Pattern[str], DynamicCategory]] = [
            (re.compile(pattern, re.IGNORECASE), category)
            for pattern, category in config.custom_patterns
        ]

    def detect(self, content: str) -> list[DynamicSpan]:
        """Detect dynamic content using structural, universal, and entropy detection."""
        spans: list[DynamicSpan] = []
        seen_ranges: set[tuple[int, int]] = set()

        # 1. Universal patterns first (most specific)
        for pattern, category, pattern_name in self._universal_patterns:
            for match in pattern.finditer(content):
                start, end = match.start(), match.end()
                if self._is_overlapping(start, end, seen_ranges):
                    continue
                if end - start < self.config.min_span_length:
                    continue

                spans.append(
                    DynamicSpan(
                        text=match.group(),
                        start=start,
                        end=end,
                        category=category,
                        tier="regex",
                        confidence=1.0,
                        metadata={"pattern": pattern_name, "method": "universal"},
                    )
                )
                seen_ranges.add((start, end))

        # 2. Structural detection: "Label: value" patterns
        for match in self._structural_pattern.finditer(content):
            # Get the full match range
            start, end = match.start(), match.end()

            # Skip if overlaps with universal patterns
            if self._is_overlapping(start, end, seen_ranges):
                continue

            label = match.group("label").lower()
            value = match.group("value").strip()

            # Determine category from label
            category = self._categorize_label(label)

            # Only add the value portion (keep label as static)
            value_start = match.start("value")
            value_end = match.end("value")

            # Skip if value is too short or empty
            if value_end - value_start < self.config.min_span_length:
                continue
            if not value.strip():
                continue

            spans.append(
                DynamicSpan(
                    text=value,
                    start=value_start,
                    end=value_end,
                    category=category,
                    tier="regex",
                    confidence=0.9,
                    metadata={"pattern": "structural", "method": "structural", "label": label},
                )
            )
            seen_ranges.add((value_start, value_end))

        # 3. Entropy-based detection for remaining potential IDs
        spans.extend(self._detect_high_entropy(content, seen_ranges))

        # 4. Custom patterns
        for pattern, category in self._custom_patterns:
            for match in pattern.finditer(content):
                start, end = match.start(), match.end()
                if self._is_overlapping(start, end, seen_ranges):
                    continue
                if end - start < self.config.min_span_length:
                    continue

                spans.append(
                    DynamicSpan(
                        text=match.group(),
                        start=start,
                        end=end,
                        category=category,
                        tier="regex",
                        confidence=0.8,
                        metadata={"pattern": "custom", "method": "custom"},
                    )
                )
                seen_ranges.add((start, end))

        return sorted(spans, key=lambda s: s.start)

    def _detect_high_entropy(
        self,
        content: str,
        seen_ranges: set[tuple[int, int]],
    ) -> list[DynamicSpan]:
        """
        Detect high-entropy strings that look like IDs/tokens.

        Finds alphanumeric sequences and checks their entropy.
        High entropy = likely random/generated = dynamic.
        """
        spans: list[DynamicSpan] = []

        # Find alphanumeric sequences (potential IDs)
        # Must be at least min_entropy_length chars, mix of letters/numbers
        pattern = re.compile(r"\b[a-zA-Z0-9_-]{8,}\b")

        for match in pattern.finditer(content):
            start, end = match.start(), match.end()
            text = match.group()

            # Skip if already detected
            if self._is_overlapping(start, end, seen_ranges):
                continue

            # Skip if too short
            if len(text) < self.config.min_entropy_length:
                continue

            # Skip if all letters or all numbers (not random-looking)
            if text.isalpha() or text.isdigit():
                continue

            # Skip common words that might look like IDs
            if text.lower() in {"username", "password", "localhost", "undefined"}:
                continue

            # Calculate entropy
            entropy = calculate_entropy(text)

            if entropy >= self.config.entropy_threshold:
                spans.append(
                    DynamicSpan(
                        text=text,
                        start=start,
                        end=end,
                        category=DynamicCategory.IDENTIFIER,
                        tier="regex",
                        confidence=entropy,  # Use entropy as confidence
                        metadata={"pattern": "entropy", "method": "entropy", "entropy": entropy},
                    )
                )
                seen_ranges.add((start, end))

        return spans

    def _is_overlapping(
        self,
        start: int,
        end: int,
        seen_ranges: set[tuple[int, int]],
    ) -> bool:
        """Check if range overlaps with any existing range."""
        return any(not (end <= s or start >= e) for s, e in seen_ranges)

    def _categorize_label(self, label: str) -> DynamicCategory:
        """Categorize based on the label name."""
        label = label.lower()

        # Time-related
        if label in {"date", "datetime", "created", "updated", "modified", "expires", "today"}:
            return DynamicCategory.DATE
        if label in {"time", "timestamp", "now"}:
            return DynamicCategory.TIMESTAMP
        if label == "current":
            return DynamicCategory.DATETIME

        # Identifiers
        if label in {"id", "uuid", "guid"}:
            return DynamicCategory.UUID
        if label in {"session", "request", "trace", "span", "transaction", "correlation"}:
            return DynamicCategory.SESSION
        if label in {"token", "key", "secret"}:
            return DynamicCategory.REQUEST_ID

        # User-related
        if label in {
            "user",
            "username",
            "email",
            "name",
            "phone",
            "address",
            "customer",
            "client",
            "employee",
            "member",
        }:
            return DynamicCategory.USER_DATA

        # System state
        if label in {"version", "build", "commit", "branch", "revision"}:
            return DynamicCategory.VERSION
        if label in {
            "status",
            "state",
            "count",
            "total",
            "balance",
            "remaining",
            "load",
            "queue",
            "active",
            "pending",
        }:
            return DynamicCategory.VOLATILE

        # Order/ticket
        if label in {"order", "ticket", "case", "invoice", "reference"}:
            return DynamicCategory.REQUEST_ID

        return DynamicCategory.UNKNOWN


class NERDetector:
    """Tier 2: spaCy-based Named Entity Recognition."""

    # Map spaCy entity types to our categories
    ENTITY_MAP = {
        "DATE": DynamicCategory.DATE,
        "TIME": DynamicCategory.TIME,
        "MONEY": DynamicCategory.MONEY,
        "PERSON": DynamicCategory.PERSON,
        "ORG": DynamicCategory.ORG,
        "GPE": DynamicCategory.LOCATION,  # Geo-Political Entity
        "LOC": DynamicCategory.LOCATION,
        "FAC": DynamicCategory.LOCATION,  # Facility
        "CARDINAL": DynamicCategory.UNKNOWN,  # Numbers
        "ORDINAL": DynamicCategory.UNKNOWN,
    }

    def __init__(self, config: DetectorConfig):
        """Initialize NER detector, loading spaCy model."""
        self.config = config
        self._nlp = None
        self._load_error: str | None = None

        if not _SPACY_AVAILABLE:
            self._load_error = (
                "spaCy not installed. Install with: "
                "pip install spacy && python -m spacy download en_core_web_sm"
            )
            return

        try:
            self._nlp = spacy.load(config.spacy_model)
        except OSError:
            self._load_error = (
                f"spaCy model '{config.spacy_model}' not found. "
                f"Install with: python -m spacy download {config.spacy_model}"
            )

    @property
    def is_available(self) -> bool:
        """Check if NER is available."""
        return self._nlp is not None

    def detect(
        self,
        content: str,
        existing_spans: list[DynamicSpan] | None = None,
    ) -> tuple[list[DynamicSpan], str | None]:
        """
        Detect dynamic content using NER.

        Args:
            content: Text to analyze.
            existing_spans: Spans already detected (to avoid duplicates).

        Returns:
            Tuple of (new_spans, warning_message).
        """
        if not self.is_available:
            return [], self._load_error

        # Get existing ranges to avoid duplicates
        existing_ranges = set()
        if existing_spans:
            existing_ranges = {(s.start, s.end) for s in existing_spans}

        doc = self._nlp(content)  # type: ignore[misc]
        spans: list[DynamicSpan] = []

        for ent in doc.ents:
            # Skip entity types we don't care about
            if ent.label_ not in self.config.ner_entity_types:
                continue

            # Skip if already detected by regex
            if (ent.start_char, ent.end_char) in existing_ranges:
                continue

            # Check for overlap with existing spans
            overlaps = any(
                not (ent.end_char <= s or ent.start_char >= e) for s, e in existing_ranges
            )
            if overlaps:
                continue

            # Map to our category
            category = self.ENTITY_MAP.get(ent.label_, DynamicCategory.UNKNOWN)

            # Skip unknown categories
            if category == DynamicCategory.UNKNOWN:
                continue

            spans.append(
                DynamicSpan(
                    text=ent.text,
                    start=ent.start_char,
                    end=ent.end_char,
                    category=category,
                    tier="ner",
                    confidence=0.9,
                    metadata={"entity_type": ent.label_},
                )
            )
            existing_ranges.add((ent.start_char, ent.end_char))

        return sorted(spans, key=lambda s: s.start), None


class SemanticDetector:
    """Tier 3: Embedding-based semantic detection."""

    # Known phrases that indicate dynamic content
    # These are SEMANTIC patterns, not literal strings to match
    DYNAMIC_EXEMPLARS = [
        # Time-sensitive
        "The current date is",
        "As of today",
        "Updated on",
        "Last refreshed",
        "Real-time data",
        "Live prices",
        "Current stock price",
        # Session-specific
        "Your session ID",
        "Your account balance",
        "Your recent orders",
        "Your conversation history",
        # User-specific
        "Hello [user]",
        "Dear customer",
        "Your name is",
        # System state
        "Server status",
        "System load",
        "Queue length",
        "Active users",
    ]

    def __init__(self, config: DetectorConfig):
        """Initialize semantic detector with embedding model."""
        self.config = config
        self._model = None
        self._exemplar_embeddings = None
        self._load_error: str | None = None

        if not _SENTENCE_TRANSFORMERS_AVAILABLE:
            self._load_error = (
                "sentence-transformers not installed. "
                "Install with: pip install sentence-transformers"
            )
            return

        try:
            self._model = SentenceTransformer(config.embedding_model)
            # Pre-compute exemplar embeddings
            self._exemplar_embeddings = self._model.encode(
                self.DYNAMIC_EXEMPLARS,
                convert_to_numpy=True,
            )
        except Exception as e:
            self._load_error = f"Failed to load embedding model: {e}"

    @property
    def is_available(self) -> bool:
        """Check if semantic detection is available."""
        return self._model is not None

    def detect(
        self,
        content: str,
        existing_spans: list[DynamicSpan] | None = None,
    ) -> tuple[list[DynamicSpan], str | None]:
        """
        Detect dynamic content using semantic similarity.

        Splits content into sentences and checks each against known
        dynamic patterns using embedding similarity.

        Args:
            content: Text to analyze.
            existing_spans: Spans already detected (to avoid duplicates).

        Returns:
            Tuple of (new_spans, warning_message).
        """
        if not self.is_available:
            return [], self._load_error

        # Simple sentence splitting (could use spaCy if available)
        sentences = self._split_sentences(content)
        spans: list[DynamicSpan] = []

        # Get existing ranges
        existing_ranges = set()
        if existing_spans:
            existing_ranges = {(s.start, s.end) for s in existing_spans}

        # Encode all sentences
        if not sentences:
            return [], None

        sentence_texts = [s[0] for s in sentences]
        sentence_embeddings = self._model.encode(  # type: ignore[union-attr]
            sentence_texts,
            convert_to_numpy=True,
        )

        # Compute similarities
        similarities = np.dot(sentence_embeddings, self._exemplar_embeddings.T)  # type: ignore[union-attr]

        for i, (text, start, end) in enumerate(sentences):
            # Get max similarity to any exemplar
            max_sim = float(np.max(similarities[i]))

            if max_sim < self.config.semantic_threshold:
                continue

            # Check overlap with existing spans
            overlaps = any(not (end <= s or start >= e) for s, e in existing_ranges)
            if overlaps:
                continue

            # Find which exemplar matched best
            best_exemplar_idx = int(np.argmax(similarities[i]))
            best_exemplar = self.DYNAMIC_EXEMPLARS[best_exemplar_idx]

            # Determine category based on exemplar
            category = self._categorize_exemplar(best_exemplar)

            spans.append(
                DynamicSpan(
                    text=text,
                    start=start,
                    end=end,
                    category=category,
                    tier="semantic",
                    confidence=max_sim,
                    metadata={
                        "matched_exemplar": best_exemplar,
                        "similarity": max_sim,
                    },
                )
            )
            existing_ranges.add((start, end))

        return sorted(spans, key=lambda s: s.start), None

    def _split_sentences(self, content: str) -> list[tuple[str, int, int]]:
        """Split content into sentences with positions."""
        sentences: list[tuple[str, int, int]] = []
        pattern = r"[^.!?\n]+[.!?\n]?"
        for match in re.finditer(pattern, content):
            text = match.group().strip()
            if len(text) > 10:
                sentences.append((text, match.start(), match.end()))
        return sentences

    def _categorize_exemplar(self, exemplar: str) -> DynamicCategory:
        """Categorize based on which exemplar matched."""
        exemplar_lower = exemplar.lower()

        if any(w in exemplar_lower for w in ["date", "today", "updated", "refreshed"]):
            return DynamicCategory.DATE
        elif any(w in exemplar_lower for w in ["price", "stock", "live", "real-time"]):
            return DynamicCategory.REALTIME
        elif any(w in exemplar_lower for w in ["session", "account", "your"]):
            return DynamicCategory.SESSION
        elif any(w in exemplar_lower for w in ["status", "load", "queue", "active"]):
            return DynamicCategory.VOLATILE
        else:
            return DynamicCategory.VOLATILE


class DynamicContentDetector:
    """
    Unified dynamic content detector with tiered detection.

    Key Design Principles:
    - NO hardcoded locale-specific patterns (no month names)
    - Structural detection: Labels indicate what's dynamic
    - Universal patterns: ISO 8601, UUIDs, Unix timestamps
    - Entropy-based: High entropy = random/generated = dynamic

    Usage:
        # Fast mode (regex only - structural + universal + entropy)
        detector = DynamicContentDetector(DetectorConfig(tiers=["regex"]))

        # Balanced mode (regex + NER for names/money)
        detector = DynamicContentDetector(DetectorConfig(tiers=["regex", "ner"]))

        # Full mode (all tiers)
        detector = DynamicContentDetector(DetectorConfig(
            tiers=["regex", "ner", "semantic"]
        ))

        result = detector.detect("Session: abc123. User: John paid $500.")
    """

    def __init__(self, config: DetectorConfig | None = None):
        """Initialize detector with configuration."""
        self.config = config or DetectorConfig()

        # Initialize detectors based on enabled tiers
        self._regex_detector: RegexDetector | None = None
        self._ner_detector: NERDetector | None = None
        self._semantic_detector: SemanticDetector | None = None

        if "regex" in self.config.tiers:
            self._regex_detector = RegexDetector(self.config)

        if "ner" in self.config.tiers:
            self._ner_detector = NERDetector(self.config)

        if "semantic" in self.config.tiers:
            self._semantic_detector = SemanticDetector(self.config)

    def detect(self, content: str) -> DetectionResult:
        """
        Detect dynamic content in text.

        Runs enabled tiers in order, accumulating spans.
        Each tier can see what previous tiers detected.

        Args:
            content: Text to analyze.

        Returns:
            DetectionResult with spans, static/dynamic content split, etc.
        """
        import time

        start_time = time.perf_counter()

        all_spans: list[DynamicSpan] = []
        tiers_used: list[str] = []
        warnings: list[str] = []

        # Tier 1: Regex (structural + universal + entropy)
        if self._regex_detector:
            regex_spans = self._regex_detector.detect(content)
            all_spans.extend(regex_spans)
            tiers_used.append("regex")

        # Tier 2: NER
        if self._ner_detector:
            ner_spans, ner_warning = self._ner_detector.detect(content, all_spans)
            all_spans.extend(ner_spans)
            if ner_warning:
                warnings.append(ner_warning)
            elif ner_spans or self._ner_detector.is_available:
                tiers_used.append("ner")

        # Tier 3: Semantic
        if self._semantic_detector:
            sem_spans, sem_warning = self._semantic_detector.detect(content, all_spans)
            all_spans.extend(sem_spans)
            if sem_warning:
                warnings.append(sem_warning)
            elif sem_spans or self._semantic_detector.is_available:
                tiers_used.append("semantic")

        # Sort by position
        all_spans = sorted(all_spans, key=lambda s: s.start)

        # Build static and dynamic content
        static_content, dynamic_content = self._split_content(content, all_spans)

        processing_time = (time.perf_counter() - start_time) * 1000

        return DetectionResult(
            spans=all_spans,
            static_content=static_content,
            dynamic_content=dynamic_content,
            tiers_used=tiers_used,
            processing_time_ms=processing_time,
            warnings=warnings,
        )

    def _split_content(
        self,
        content: str,
        spans: list[DynamicSpan],
    ) -> tuple[str, str]:
        """Split content into static and dynamic parts."""
        if not spans:
            return content, ""

        static = content
        dynamic_parts: list[str] = []

        for span in reversed(spans):
            dynamic_parts.append(span.text)
            static = static[: span.start] + static[span.end :]

        static = self._clean_static_content(static)
        dynamic_parts.reverse()
        dynamic = "\n".join(dynamic_parts)

        return static, dynamic

    def _clean_static_content(self, content: str) -> str:
        """Clean up static content after span removal."""
        lines = content.split("\n")
        cleaned_lines: list[str] = []
        prev_blank = False

        for line in lines:
            is_blank = not line.strip()
            if is_blank and prev_blank:
                continue
            cleaned_lines.append(line.rstrip())
            prev_blank = is_blank

        return "\n".join(cleaned_lines).strip()

    @property
    def available_tiers(self) -> list[str]:
        """Get list of actually available tiers (dependencies installed)."""
        available = []

        if self._regex_detector:
            available.append("regex")

        if self._ner_detector and self._ner_detector.is_available:
            available.append("ner")

        if self._semantic_detector and self._semantic_detector.is_available:
            available.append("semantic")

        return available


# Convenience function
def detect_dynamic_content(
    content: str,
    tiers: list[Literal["regex", "ner", "semantic"]] | None = None,
) -> DetectionResult:
    """
    Detect dynamic content in text.

    Convenience function that creates a detector with specified tiers.

    Args:
        content: Text to analyze.
        tiers: Which tiers to use. Default: ["regex"] for speed.

    Returns:
        DetectionResult with detected spans and split content.

    Example:
        >>> result = detect_dynamic_content(
        ...     "Session: abc123xyz. User: John paid $500.",
        ...     tiers=["regex", "ner"]
        ... )
        >>> print(result.static_content)
        >>> print(result.dynamic_content)
    """
    config = DetectorConfig(tiers=tiers or ["regex"])
    detector = DynamicContentDetector(config)
    return detector.detect(content)