superlocalmemory 3.2.3 → 3.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +43 -1
- package/README.md +106 -71
- package/package.json +1 -2
- package/pyproject.toml +16 -1
- package/src/superlocalmemory/cli/commands.py +309 -0
- package/src/superlocalmemory/cli/main.py +44 -0
- package/src/superlocalmemory/core/config.py +276 -4
- package/src/superlocalmemory/core/consolidation_engine.py +37 -0
- package/src/superlocalmemory/core/engine.py +21 -0
- package/src/superlocalmemory/core/engine_wiring.py +58 -8
- package/src/superlocalmemory/dynamics/activation_guided_quantization.py +374 -0
- package/src/superlocalmemory/dynamics/eap_scheduler.py +276 -0
- package/src/superlocalmemory/dynamics/ebbinghaus_langevin_coupling.py +171 -0
- package/src/superlocalmemory/encoding/cognitive_consolidator.py +804 -0
- package/src/superlocalmemory/hooks/auto_invoker.py +46 -8
- package/src/superlocalmemory/hooks/auto_parameterize.py +147 -0
- package/src/superlocalmemory/infra/heartbeat_monitor.py +140 -0
- package/src/superlocalmemory/infra/pid_manager.py +193 -0
- package/src/superlocalmemory/infra/process_reaper.py +572 -0
- package/src/superlocalmemory/learning/consolidation_quantization_worker.py +115 -0
- package/src/superlocalmemory/learning/forgetting_scheduler.py +263 -0
- package/src/superlocalmemory/learning/quantization_scheduler.py +320 -0
- package/src/superlocalmemory/math/ebbinghaus.py +309 -0
- package/src/superlocalmemory/math/fisher_quantized.py +251 -0
- package/src/superlocalmemory/math/hopfield.py +279 -0
- package/src/superlocalmemory/math/polar_quant.py +379 -0
- package/src/superlocalmemory/math/qjl.py +115 -0
- package/src/superlocalmemory/mcp/server.py +2 -0
- package/src/superlocalmemory/mcp/tools_v3.py +10 -0
- package/src/superlocalmemory/mcp/tools_v33.py +351 -0
- package/src/superlocalmemory/parameterization/__init__.py +47 -0
- package/src/superlocalmemory/parameterization/pattern_extractor.py +534 -0
- package/src/superlocalmemory/parameterization/pii_filter.py +106 -0
- package/src/superlocalmemory/parameterization/prompt_injector.py +216 -0
- package/src/superlocalmemory/parameterization/prompt_lifecycle.py +275 -0
- package/src/superlocalmemory/parameterization/soft_prompt_generator.py +425 -0
- package/src/superlocalmemory/retrieval/engine.py +21 -3
- package/src/superlocalmemory/retrieval/forgetting_filter.py +145 -0
- package/src/superlocalmemory/retrieval/hopfield_channel.py +335 -0
- package/src/superlocalmemory/retrieval/quantization_aware_search.py +133 -0
- package/src/superlocalmemory/retrieval/strategy.py +16 -6
- package/src/superlocalmemory/server/routes/agents.py +68 -8
- package/src/superlocalmemory/server/routes/learning.py +18 -1
- package/src/superlocalmemory/server/routes/lifecycle.py +36 -17
- package/src/superlocalmemory/server/routes/v3_api.py +503 -1
- package/src/superlocalmemory/storage/database.py +206 -0
- package/src/superlocalmemory/storage/embedding_migrator.py +178 -0
- package/src/superlocalmemory/storage/migration_v33.py +140 -0
- package/src/superlocalmemory/storage/quantized_store.py +261 -0
- package/src/superlocalmemory/storage/schema_v32.py +137 -0
- package/conftest.py +0 -5
|
@@ -0,0 +1,534 @@
|
|
|
1
|
+
# Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
|
|
2
|
+
# Licensed under the MIT License - see LICENSE file
|
|
3
|
+
# Part of SuperLocalMemory V3.3
|
|
4
|
+
|
|
5
|
+
"""PatternExtractor — Mine patterns from 4 sources for soft prompt generation.
|
|
6
|
+
|
|
7
|
+
Sources: core memory blocks, behavioral patterns, cross-project preferences,
|
|
8
|
+
workflow sequences. Includes deduplication (independence assumption) and
|
|
9
|
+
contradiction resolution (temporal ordering + close-confidence alternate marking).
|
|
10
|
+
|
|
11
|
+
[AUDIT FIX F-1] Sheaf removed — heuristic key-value comparison for contradictions.
|
|
12
|
+
[AUDIT FIX F-4] source_ids as tuple for true immutability.
|
|
13
|
+
|
|
14
|
+
Part of Qualixar | Author: Varun Pratap Bhardwaj
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import json
|
|
20
|
+
import logging
|
|
21
|
+
from dataclasses import dataclass
|
|
22
|
+
from datetime import datetime, timezone
|
|
23
|
+
from enum import Enum
|
|
24
|
+
from typing import TYPE_CHECKING
|
|
25
|
+
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
from superlocalmemory.storage.database import DatabaseManager
|
|
28
|
+
from superlocalmemory.learning.behavioral import BehavioralPatternStore
|
|
29
|
+
from superlocalmemory.learning.cross_project import CrossProjectAggregator
|
|
30
|
+
from superlocalmemory.learning.workflows import WorkflowMiner
|
|
31
|
+
from superlocalmemory.core.config import ParameterizationConfig
|
|
32
|
+
|
|
33
|
+
logger = logging.getLogger(__name__)
|
|
34
|
+
|
|
35
|
+
# ---------------------------------------------------------------------------
|
|
36
|
+
# Data models
|
|
37
|
+
# ---------------------------------------------------------------------------
|
|
38
|
+
|
|
39
|
+
# Keyword vocabularies consulted by the text-classification heuristics.
# Each vocabulary is an immutable set of lower-case terms.

# Terms describing who the user is (role, seniority, job title).
_IDENTITY_KEYWORDS = frozenset((
    "role",
    "title",
    "senior",
    "architect",
    "engineer",
    "manager",
    "developer",
    "designer",
    "expertise",
    "lead",
    "director",
    "analyst",
))

# Terms naming languages, frameworks, platforms and tooling.
_TECH_KEYWORDS = frozenset((
    "typescript",
    "python",
    "react",
    "vue",
    "angular",
    "node",
    "java",
    "rust",
    "go",
    "framework",
    "library",
    "language",
    "database",
    "postgres",
    "mongodb",
    "redis",
    "aws",
    "azure",
    "gcp",
    "docker",
    "kubernetes",
    "tool",
    "sdk",
    "api",
))

# Terms describing how the user likes to be answered.
_STYLE_KEYWORDS = frozenset((
    "prefer",
    "style",
    "tone",
    "format",
    "concise",
    "verbose",
    "detailed",
    "brief",
    "formal",
    "casual",
))

# Terms signalling something the user wants to stay away from.
_AVOIDANCE_KEYWORDS = frozenset((
    "avoid",
    "never",
    "don't",
    "dont",
    "stop",
    "hate",
    "dislike",
    "not use",
    "refuse",
))

# Terms signalling a choice that was made in the past.
_DECISION_KEYWORDS = frozenset((
    "decided",
    "chose",
    "selected",
    "picked",
    "switched",
    "migrated",
    "adopted",
    "dropped",
))

# Behavioral pattern type -> PatternCategory value.
_BEHAVIORAL_TYPE_MAP: dict[str, str] = dict(
    entity_pref="tech_preference",
    query_type="workflow_pattern",
    time_of_day="workflow_pattern",
    refinement="communication_style",
    interest="tech_preference",
    archival="avoidance",
)

# Cross-project preference key -> PatternCategory value.
# Every known key currently maps to the same category.
_CROSS_PROJECT_KEY_MAP: dict[str, str] = dict.fromkeys(
    (
        "frontend_framework",
        "backend_framework",
        "language",
        "database",
        "cloud",
        "testing",
    ),
    "tech_preference",
)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class PatternCategory(str, Enum):
    """Closed set of categories an extracted assertion can belong to.

    Members are also ``str`` instances, so a member compares equal to its
    plain string value and can be rebuilt from it with
    ``PatternCategory("tech_preference")``.
    """

    # Who the user is.
    IDENTITY = "identity"
    # Languages, frameworks and tools.
    TECH_PREFERENCE = "tech_preference"
    # How answers should be phrased.
    COMMUNICATION_STYLE = "communication_style"
    # Recurring sequences of actions.
    WORKFLOW_PATTERN = "workflow_pattern"
    # Facts tied to a specific project.
    PROJECT_CONTEXT = "project_context"
    # Choices made in the past.
    DECISION_HISTORY = "decision_history"
    # Things to stay away from.
    AVOIDANCE = "avoidance"
    # Fallback when nothing more specific applies.
    CUSTOM = "custom"
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
@dataclass(frozen=True)
class PatternAssertion:
    """One immutable pattern statement together with where it came from.

    Instances are frozen: derive a modified copy instead of assigning to
    a field.
    """

    # Category the assertion was filed under.
    category: PatternCategory
    # Short identifier; assertions with equal category and key are compared.
    key: str
    # The asserted content itself.
    value: str
    # Strength of belief in the assertion.
    confidence: float
    # Number of observations backing the assertion.
    evidence_count: int
    # Origin: "core_memory" | "behavioral" | "cross_project" | "workflow"
    source: str
    # Identifiers of the originating records (a tuple, so it stays immutable).
    source_ids: tuple[str, ...] = ()
    # True when the assertion came from, or was merged with, cross-project data.
    cross_project_validated: bool = False
    # Timestamp string; empty when unknown.
    created_at: str = ""
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
# ---------------------------------------------------------------------------
|
|
113
|
+
# PatternExtractor class
|
|
114
|
+
# ---------------------------------------------------------------------------
|
|
115
|
+
|
|
116
|
+
class PatternExtractor:
    """Extract patterns from 4 SLM sources for soft prompt generation.

    Sources:
        1. Core Memory blocks (user_profile, behavioral_patterns, learned_preferences)
        2. Behavioral pattern store (entity_pref, query_type, etc.)
        3. Cross-project aggregator (transferable preferences)
        4. Workflow miner (action sequences)

    ``extract`` gathers assertions from every source, merges assertions that
    agree, resolves assertions that disagree, and returns the survivors
    ordered by confidence.
    """

    # Minimum support requested from the workflow miner.
    _WORKFLOW_MIN_SUPPORT = 0.3
    # Suffix given to the lower-confidence member of a kept-both conflict.
    _ALTERNATE_SUFFIX = "_alternate"
    # Punctuation trimmed from token edges before keyword lookup.
    _TOKEN_EDGE_CHARS = ".,;:!?()[]{}\"'"

    def __init__(
        self,
        db: DatabaseManager,
        behavioral_store: BehavioralPatternStore,
        cross_project: CrossProjectAggregator,
        workflow_miner: WorkflowMiner,
        config: ParameterizationConfig,
    ) -> None:
        """Store the collaborators after validating the configuration.

        Args:
            db: Database handle used to read core memory blocks.
            behavioral_store: Provider of behavioral patterns.
            cross_project: Provider of cross-project preferences.
            workflow_miner: Provider of mined workflow sequences.
            config: Parameterization settings; ``min_confidence`` is
                validated here, ``min_evidence`` and ``cross_project_boost``
                are read during extraction.

        Raises:
            ValueError: If ``config.min_confidence`` is outside [0.3, 1.0].
        """
        if not (0.3 <= config.min_confidence <= 1.0):
            raise ValueError(
                f"min_confidence must be in [0.3, 1.0], got {config.min_confidence}"
            )
        self._db = db
        self._behavioral_store = behavioral_store
        self._cross_project = cross_project
        self._workflow_miner = workflow_miner
        self._config = config

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def extract(self, profile_id: str) -> list[PatternAssertion]:
        """Master extraction pipeline across all 4 sources.

        Args:
            profile_id: Profile to extract patterns for.

        Returns:
            Deduplicated, contradiction-resolved list sorted by confidence DESC.
        """
        # One timestamp for the whole run: the sources provide no age
        # information, so per-assertion clock readings would only encode the
        # order in which sources were read and let "newer wins" pick an
        # arbitrary winner. With equal stamps, conflicts fall back to
        # confidence.
        stamp = self._utc_now_iso()

        all_patterns: list[PatternAssertion] = [
            *self._extract_from_core_memory(profile_id, created_at=stamp),
            *self._extract_from_behavioral(profile_id, created_at=stamp),
            *self._extract_from_cross_project(created_at=stamp),
            *self._extract_from_workflows(profile_id, created_at=stamp),
        ]
        if not all_patterns:
            return []

        deduped = self._deduplicate(all_patterns)
        resolved = self._check_contradictions(deduped, profile_id)
        resolved.sort(key=lambda p: p.confidence, reverse=True)
        return resolved

    # ------------------------------------------------------------------
    # Source extractors
    # ------------------------------------------------------------------

    def _extract_from_core_memory(
        self, profile_id: str, *, created_at: str | None = None,
    ) -> list[PatternAssertion]:
        """Extract patterns from core memory blocks.

        Every assertion found in a block inherits that block's confidence,
        ``min(number_of_source_facts / 10, 1.0)``; blocks below
        ``config.min_confidence`` are skipped entirely.

        Args:
            profile_id: Profile whose blocks are read.
            created_at: Timestamp to stamp on the assertions; defaults to
                the current UTC time.
        """
        stamp = created_at or self._utc_now_iso()
        rows = self._db.execute(
            "SELECT block_id, block_type, content, source_fact_ids "
            "FROM core_memory_blocks "
            "WHERE profile_id = ? AND block_type IN "
            "('user_profile', 'behavioral_patterns', 'learned_preferences')",
            (profile_id,),
        )
        patterns: list[PatternAssertion] = []
        for row in rows:
            evidence = self._count_source_facts(row["source_fact_ids"])
            # Confidence depends on the block only, so decide once per block.
            confidence = min(evidence / 10.0, 1.0)
            if confidence < self._config.min_confidence:
                continue
            block_id = row["block_id"]
            for text in self._split_assertions(row["content"]):
                patterns.append(PatternAssertion(
                    category=self._classify_text(text),
                    key=self._extract_key(text),
                    value=text.strip()[:200],
                    confidence=confidence,
                    evidence_count=evidence,
                    source="core_memory",
                    source_ids=(block_id,),
                    created_at=stamp,
                ))
        return patterns

    def _extract_from_behavioral(
        self, profile_id: str, *, created_at: str | None = None,
    ) -> list[PatternAssertion]:
        """Extract patterns from behavioral pattern store.

        Records may be plain dicts or attribute-style objects; both are read
        through ``_field``. Records with fewer than ``config.min_evidence``
        observations are skipped.

        Args:
            profile_id: Profile whose patterns are read.
            created_at: Timestamp to stamp on the assertions; defaults to
                the current UTC time.
        """
        stamp = created_at or self._utc_now_iso()
        raw_patterns = self._behavioral_store.get_patterns(
            profile_id, min_confidence=self._config.min_confidence,
        )
        patterns: list[PatternAssertion] = []
        for bp in raw_patterns:
            ev_count = self._field(bp, "evidence_count", 0)
            if ev_count < self._config.min_evidence:
                continue
            p_type = self._field(bp, "pattern_type", "")
            p_key = self._field(bp, "pattern_key", "")
            patterns.append(PatternAssertion(
                category=PatternCategory(_BEHAVIORAL_TYPE_MAP.get(p_type, "custom")),
                key=p_key,
                # The key doubles as the value when no value is recorded.
                value=self._field(bp, "pattern_value", p_key),
                confidence=self._field(bp, "confidence", 0.0),
                evidence_count=ev_count,
                source="behavioral",
                source_ids=(str(self._field(bp, "pattern_id", "")),),
                created_at=stamp,
            ))
        return patterns

    def _extract_from_cross_project(
        self, *, created_at: str | None = None,
    ) -> list[PatternAssertion]:
        """Extract patterns from cross-project aggregator.

        Confidence is multiplied by ``config.cross_project_boost`` and capped
        at 1.0; every assertion is flagged ``cross_project_validated``.

        Args:
            created_at: Timestamp to stamp on the assertions; defaults to
                the current UTC time.
        """
        stamp = created_at or self._utc_now_iso()
        preferences = self._cross_project.get_preferences(
            min_confidence=self._config.min_confidence,
        )
        patterns: list[PatternAssertion] = []
        for key, data in preferences.items():
            raw_conf = data.get("confidence", 0.0)
            patterns.append(PatternAssertion(
                category=PatternCategory(_CROSS_PROJECT_KEY_MAP.get(key, "custom")),
                key=key,
                value=data.get("value", ""),
                confidence=min(1.0, raw_conf * self._config.cross_project_boost),
                evidence_count=data.get("evidence_count", 0),
                source="cross_project",
                source_ids=(),
                cross_project_validated=True,
                created_at=stamp,
            ))
        return patterns

    def _extract_from_workflows(
        self, profile_id: str, *, created_at: str | None = None,
    ) -> list[PatternAssertion]:
        """Extract patterns from workflow miner.

        A mined workflow is kept when its ``count`` reaches
        ``config.min_evidence`` and its ``support`` reaches
        ``config.min_confidence``. The key encodes only the sequence length
        (``workflow_<n>gram``).

        Args:
            profile_id: Profile whose workflows are mined.
            created_at: Timestamp to stamp on the assertions; defaults to
                the current UTC time.
        """
        stamp = created_at or self._utc_now_iso()
        raw = self._workflow_miner.mine(
            profile_id, min_support=self._WORKFLOW_MIN_SUPPORT,
        )
        patterns: list[PatternAssertion] = []
        for wp in raw:
            count = wp.get("count", 0)
            if count < self._config.min_evidence:
                continue
            confidence = wp.get("support", 0.0)
            if confidence < self._config.min_confidence:
                continue
            # Tolerate a missing/None sequence and non-string steps.
            sequence = wp.get("sequence") or []
            patterns.append(PatternAssertion(
                category=PatternCategory.WORKFLOW_PATTERN,
                key=f"workflow_{len(sequence)}gram",
                value=" -> ".join(str(step) for step in sequence),
                confidence=confidence,
                evidence_count=count,
                source="workflow",
                created_at=stamp,
            ))
        return patterns

    # ------------------------------------------------------------------
    # Deduplication & contradiction resolution
    # ------------------------------------------------------------------

    def _deduplicate(
        self, patterns: list[PatternAssertion],
    ) -> list[PatternAssertion]:
        """Merge assertions that agree, using the independence assumption.

        Only assertions sharing category, key AND value are merged:
        ``c_merged = 1 - product(1 - c_i)``. Assertions that share a key but
        carry different values are left separate so that
        ``_check_contradictions`` can resolve them; merging them would let
        conflicting evidence raise confidence and would leave nothing for
        the contradiction step to inspect.

        Args:
            patterns: Assertions gathered from all sources.

        Returns:
            One assertion per distinct (category, key, value), in
            first-seen order. The merged assertion keeps the category, key,
            value and source of its highest-confidence member, sums the
            evidence counts, unions the source ids (order-preserving) and
            takes the latest non-empty timestamp.
        """
        from dataclasses import replace

        groups: dict[tuple[str, str, str], list[PatternAssertion]] = {}
        for p in patterns:
            # str() keeps the grouping key hashable even if a source hands
            # over a non-string value.
            groups.setdefault((p.category.value, p.key, str(p.value)), []).append(p)

        result: list[PatternAssertion] = []
        for group in groups.values():
            if len(group) == 1:
                result.append(group[0])
                continue

            c_complement = 1.0
            for p in group:
                # Clamp so an out-of-range input cannot push the merged
                # confidence outside [0, 1].
                c_complement *= 1.0 - min(max(p.confidence, 0.0), 1.0)

            best = max(group, key=lambda p: p.confidence)
            timestamps = [p.created_at for p in group if p.created_at]
            result.append(replace(
                best,
                confidence=1.0 - c_complement,
                evidence_count=sum(p.evidence_count for p in group),
                source_ids=tuple(dict.fromkeys(
                    sid for p in group for sid in p.source_ids
                )),
                cross_project_validated=any(
                    p.cross_project_validated for p in group
                ),
                created_at=max(timestamps) if timestamps else "",
            ))
        return result

    def _check_contradictions(
        self,
        patterns: list[PatternAssertion],
        profile_id: str,
    ) -> list[PatternAssertion]:
        """Resolve contradictions within same category+key.

        Resolution: temporal ordering (newer wins), falling back to higher
        confidence when timestamps are equal or missing. For tech_preference
        with close confidence (both >= 0.8, diff <= 0.1), keep both and mark
        the lower one with the ``_alternate`` key suffix.

        Args:
            patterns: Deduplicated assertions.
            profile_id: Accepted for interface compatibility; not used.

        Returns:
            Surviving assertions, grouped by category and then key in
            first-seen order.
        """
        by_category: dict[str, dict[str, list[PatternAssertion]]] = {}
        for p in patterns:
            by_category.setdefault(p.category.value, {}).setdefault(p.key, []).append(p)

        resolved: list[PatternAssertion] = []
        for cat_value, key_groups in by_category.items():
            for key, rivals in key_groups.items():
                if len(rivals) == 1:
                    resolved.append(rivals[0])
                else:
                    resolved.extend(self._resolve_rivals(cat_value, key, rivals))
        return resolved

    def _resolve_rivals(
        self,
        cat_value: str,
        key: str,
        rivals: list[PatternAssertion],
    ) -> list[PatternAssertion]:
        """Compare every pair of same-key assertions and drop the losers."""
        from dataclasses import replace

        suffix = self._ALTERNATE_SUFFIX
        surviving = list(rivals)
        dropped: set[int] = set()

        for i in range(len(surviving)):
            for j in range(i + 1, len(surviving)):
                if i in dropped or j in dropped:
                    continue
                p_a = surviving[i]
                p_b = surviving[j]

                if p_a.value == p_b.value:
                    continue  # Same value = not a contradiction

                # Close, high confidence for tech_preference: keep both.
                if (
                    cat_value == "tech_preference"
                    and p_a.confidence >= 0.8
                    and p_b.confidence >= 0.8
                    and abs(p_a.confidence - p_b.confidence) <= 0.1
                ):
                    lower = j if p_a.confidence >= p_b.confidence else i
                    # Mark once only; with 3+ close candidates the same
                    # entry can be the lower member of several pairs.
                    if not surviving[lower].key.endswith(suffix):
                        surviving[lower] = replace(
                            surviving[lower],
                            key=f"{surviving[lower].key}{suffix}",
                        )
                    logger.warning(
                        "Close confidence conflict in %s: '%s' vs '%s' "
                        "for key '%s'. Both kept as alternate.",
                        cat_value, p_a.value, p_b.value, key,
                    )
                    continue

                if (
                    p_a.created_at
                    and p_b.created_at
                    and p_a.created_at != p_b.created_at
                ):
                    loser = j if p_a.created_at > p_b.created_at else i
                    resolution = "temporal (newer wins)"
                else:
                    # Equal or missing timestamps — keep higher confidence
                    loser = j if p_a.confidence >= p_b.confidence else i
                    resolution = "confidence (higher wins)"
                dropped.add(loser)

                logger.warning(
                    "Contradiction in %s: '%s' vs '%s' for key '%s'. "
                    "Resolved by %s.",
                    cat_value, p_a.value, p_b.value, key, resolution,
                )

        return [p for idx, p in enumerate(surviving) if idx not in dropped]

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _utc_now_iso() -> str:
        """Return the current UTC time as an ISO-8601 string."""
        return datetime.now(timezone.utc).isoformat()

    @staticmethod
    def _field(record, name: str, default):
        """Read ``name`` from a dict record or an attribute-style record."""
        if isinstance(record, dict):
            return record.get(name, default)
        return getattr(record, name, default)

    @staticmethod
    def _count_source_facts(raw_ids) -> int:
        """Count the fact ids in a JSON-encoded list; 0 if absent or malformed."""
        if not raw_ids:
            return 0
        try:
            parsed = json.loads(raw_ids)
        except (json.JSONDecodeError, TypeError):
            return 0
        # Valid JSON that is not a list (number, object, string) carries no
        # usable id count.
        return len(parsed) if isinstance(parsed, list) else 0

    @staticmethod
    def _split_assertions(content: str) -> list[str]:
        """Split block content into atomic assertions.

        Content is split on line-leading bullets (``-``, ``*`` or ``.``
        followed by whitespace). Empty or ``None`` content yields ``[]``.
        """
        import re

        if not content:
            return []
        parts = re.split(r"\n[-*\.]\s+", content)
        # The split consumes the bullet of every part but the first, so only
        # the first part can still carry one. Requiring whitespace (or end of
        # text) after the marker keeps text such as ".NET" or "**bold**".
        parts[0] = re.sub(r"^\s*[-*\.](?:\s+|$)", "", parts[0])
        return [piece for piece in (part.strip() for part in parts) if piece]

    @staticmethod
    def _classify_text(text: str) -> PatternCategory:
        """Classify assertion text into a PatternCategory using keywords.

        Single-word keywords are matched against tokens with surrounding
        punctuation removed (so "python." matches "python"); keywords that
        contain a space are matched as substrings of the lower-cased text.
        Keyword groups are tried in priority order: avoidance, decision,
        identity, tech, style. The first hit wins, otherwise ``CUSTOM``.
        """
        lower = text.lower()
        edge = PatternExtractor._TOKEN_EDGE_CHARS
        tokens = {word.strip(edge) for word in lower.split()}

        def mentions(keywords: frozenset[str]) -> bool:
            return any(
                (kw in lower) if " " in kw else (kw in tokens)
                for kw in keywords
            )

        priority = (
            (_AVOIDANCE_KEYWORDS, PatternCategory.AVOIDANCE),
            (_DECISION_KEYWORDS, PatternCategory.DECISION_HISTORY),
            (_IDENTITY_KEYWORDS, PatternCategory.IDENTITY),
            (_TECH_KEYWORDS, PatternCategory.TECH_PREFERENCE),
            (_STYLE_KEYWORDS, PatternCategory.COMMUNICATION_STYLE),
        )
        for keywords, category in priority:
            if mentions(keywords):
                return category
        return PatternCategory.CUSTOM

    @staticmethod
    def _extract_key(text: str) -> str:
        """Extract a key from assertion text (first significant phrase).

        The first four whitespace-separated words are lower-cased, stripped
        of surrounding ``,.;:`` and joined with underscores. Returns
        ``"unknown"`` when that leaves nothing.
        """
        words = text.strip().split()[:4]
        key = "_".join(w.lower().strip(",.;:") for w in words if w)
        return key or "unknown"
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
|
|
2
|
+
# Licensed under the MIT License - see LICENSE file
|
|
3
|
+
# Part of SuperLocalMemory V3.3
|
|
4
|
+
|
|
5
|
+
"""PII Filter — Stateless PII detection and redaction for soft prompts.
|
|
6
|
+
|
|
7
|
+
Ensures no personally identifiable information leaks into generated prompts.
|
|
8
|
+
Patterns cover: email, phone, SSN, credit card, IP address, API keys.
|
|
9
|
+
|
|
10
|
+
Part of Qualixar | Author: Varun Pratap Bhardwaj
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import re
|
|
16
|
+
import logging
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
# ---------------------------------------------------------------------------
|
|
21
|
+
# PII regex patterns
|
|
22
|
+
# ---------------------------------------------------------------------------
|
|
23
|
+
|
|
24
|
+
# Order matters: longer/more-specific patterns first to prevent partial matches.
# Credit card (16 digits) must come before phone (7-12 digits).
# SSN (XXX-XX-XXXX) must come before phone to avoid partial match.
PII_PATTERNS: dict[str, re.Pattern[str]] = {
    # Prefixed secrets: "sk-", "pk-" or an "api key" spelling, followed by
    # at least 20 token characters.
    "api_key": re.compile(
        r"\b(?:sk-|pk-|api[_-]?key[_-]?)[A-Za-z0-9_-]{20,}\b", re.IGNORECASE
    ),
    # The TLD is letters only with no upper length bound: a "|" inside the
    # character class would be a literal pipe, and a short cap lets addresses
    # on long TLDs (e.g. ".technology") through unredacted.
    "email": re.compile(
        r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
    ),
    # Four groups of four digits, optionally separated by "-" or whitespace.
    "credit_card": re.compile(
        r"\b(?:\d{4}[-\s]?){3}\d{4}\b"
    ),
    # US social security number, XXX-XX-XXXX.
    "ssn": re.compile(
        r"\b\d{3}-\d{2}-\d{4}\b"
    ),
    # Dotted quad; octet ranges are not validated.
    "ip_address": re.compile(
        r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b"
    ),
    # Optional country code, then three digit groups with optional
    # "-", "." or whitespace separators.
    "phone": re.compile(
        r"\b(?:\+?\d{1,3}[-.\s]?)?\(?\d{2,4}\)?[-.\s]?\d{3,4}[-.\s]?\d{3,4}\b"
    ),
}
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# ---------------------------------------------------------------------------
|
|
50
|
+
# PIIFilter class
|
|
51
|
+
# ---------------------------------------------------------------------------
|
|
52
|
+
|
|
53
|
+
class PIIFilter:
    """Detect and redact PII using the module-level ``PII_PATTERNS`` table.

    The filter keeps no state of its own; every call works purely on the
    text it is given.

    Usage:
        pii = PIIFilter()
        clean = pii.filter_text("email user@test.com")
        # -> "email [REDACTED:email]"
    """

    def __init__(self) -> None:
        """Nothing to set up: the filter holds no state."""

    def filter_text(self, text: str) -> str:
        """Return ``text`` with every PII match replaced by a placeholder.

        Patterns are applied one after another in table order, each pass
        working on the output of the previous one.

        Args:
            text: Input text potentially containing PII.

        Returns:
            The text with each match replaced by ``[REDACTED:<type>]``.
        """
        redacted = text
        for label in PII_PATTERNS:
            placeholder = f"[REDACTED:{label}]"
            redacted = PII_PATTERNS[label].sub(placeholder, redacted)
        return redacted

    def has_pii(self, text: str) -> bool:
        """Report whether at least one PII pattern matches ``text``.

        Args:
            text: Input text to scan.

        Returns:
            True if any pattern matches, otherwise False.
        """
        return any(regex.search(text) for regex in PII_PATTERNS.values())

    def detect_pii_types(self, text: str) -> list[str]:
        """List the PII types whose pattern matches ``text``.

        Args:
            text: Input text to scan.

        Returns:
            Matching type names in table order (e.g., ["email", "phone"]).
        """
        return [
            label
            for label, regex in PII_PATTERNS.items()
            if regex.search(text)
        ]
|