brainlayer 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- brainlayer/__init__.py +3 -0
- brainlayer/cli/__init__.py +1545 -0
- brainlayer/cli/wizard.py +132 -0
- brainlayer/cli_new.py +151 -0
- brainlayer/client.py +164 -0
- brainlayer/clustering.py +736 -0
- brainlayer/daemon.py +1105 -0
- brainlayer/dashboard/README.md +129 -0
- brainlayer/dashboard/__init__.py +5 -0
- brainlayer/dashboard/app.py +151 -0
- brainlayer/dashboard/search.py +229 -0
- brainlayer/dashboard/views.py +230 -0
- brainlayer/embeddings.py +131 -0
- brainlayer/engine.py +550 -0
- brainlayer/index_new.py +87 -0
- brainlayer/mcp/__init__.py +1558 -0
- brainlayer/migrate.py +205 -0
- brainlayer/paths.py +43 -0
- brainlayer/pipeline/__init__.py +47 -0
- brainlayer/pipeline/analyze_communication.py +508 -0
- brainlayer/pipeline/brain_graph.py +567 -0
- brainlayer/pipeline/chat_tags.py +63 -0
- brainlayer/pipeline/chunk.py +422 -0
- brainlayer/pipeline/classify.py +472 -0
- brainlayer/pipeline/cluster_sampling.py +73 -0
- brainlayer/pipeline/enrichment.py +810 -0
- brainlayer/pipeline/extract.py +66 -0
- brainlayer/pipeline/extract_claude_desktop.py +149 -0
- brainlayer/pipeline/extract_corrections.py +231 -0
- brainlayer/pipeline/extract_markdown.py +195 -0
- brainlayer/pipeline/extract_whatsapp.py +227 -0
- brainlayer/pipeline/git_overlay.py +301 -0
- brainlayer/pipeline/longitudinal_analyzer.py +568 -0
- brainlayer/pipeline/obsidian_export.py +455 -0
- brainlayer/pipeline/operation_grouping.py +486 -0
- brainlayer/pipeline/plan_linking.py +313 -0
- brainlayer/pipeline/sanitize.py +549 -0
- brainlayer/pipeline/semantic_style.py +574 -0
- brainlayer/pipeline/session_enrichment.py +472 -0
- brainlayer/pipeline/style_embed.py +67 -0
- brainlayer/pipeline/style_index.py +139 -0
- brainlayer/pipeline/temporal_chains.py +203 -0
- brainlayer/pipeline/time_batcher.py +248 -0
- brainlayer/pipeline/unified_timeline.py +569 -0
- brainlayer/storage.py +66 -0
- brainlayer/store.py +155 -0
- brainlayer/taxonomy.json +80 -0
- brainlayer/vector_store.py +1891 -0
- brainlayer-1.0.0.dist-info/METADATA +313 -0
- brainlayer-1.0.0.dist-info/RECORD +53 -0
- brainlayer-1.0.0.dist-info/WHEEL +4 -0
- brainlayer-1.0.0.dist-info/entry_points.txt +4 -0
- brainlayer-1.0.0.dist-info/licenses/LICENSE +190 -0
|
@@ -0,0 +1,549 @@
|
|
|
1
|
+
"""PII sanitization pipeline for BrainLayer chunks.
|
|
2
|
+
|
|
3
|
+
Strips personally identifiable information from chunk content before sending
|
|
4
|
+
to external LLM APIs (Gemini, Groq). Three detection layers:
|
|
5
|
+
|
|
6
|
+
1. Regex — owner name, emails, file paths, IPs, JWTs, phone numbers, op:// refs
|
|
7
|
+
2. Known names dictionary — WhatsApp contacts + manual list (Hebrew + English)
|
|
8
|
+
3. spaCy NER — unknown English person names (en_core_web_sm)
|
|
9
|
+
|
|
10
|
+
Usage:
|
|
11
|
+
from brainlayer.pipeline.sanitize import Sanitizer
|
|
12
|
+
|
|
13
|
+
sanitizer = Sanitizer.from_env()
|
|
14
|
+
result = sanitizer.sanitize("Etan said hello to David")
|
|
15
|
+
print(result.sanitized) # "[OWNER] said hello to [PERSON_a1b2c3d4]"
|
|
16
|
+
print(result.pii_detected) # True
|
|
17
|
+
|
|
18
|
+
# Batch mode
|
|
19
|
+
results = sanitizer.sanitize_batch(chunks, parallel=4)
|
|
20
|
+
|
|
21
|
+
# Build name dictionary from WhatsApp contacts in DB
|
|
22
|
+
names = sanitizer.build_name_dictionary(store)
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import hashlib
|
|
28
|
+
import json
|
|
29
|
+
import os
|
|
30
|
+
import re
|
|
31
|
+
import threading
|
|
32
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
33
|
+
from dataclasses import dataclass, field
|
|
34
|
+
from pathlib import Path
|
|
35
|
+
from typing import Any, Optional
|
|
36
|
+
|
|
37
|
+
# AIDEV-NOTE: spaCy is lazy-loaded to avoid slow import on every pipeline import.
|
|
38
|
+
# Only loaded when use_spacy_ner=True, on the first sanitize() call or when sanitize_batch() pre-loads it.
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# ── Types ──────────────────────────────────────────────────────────────
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass(frozen=True)
class Replacement:
    """Immutable audit record for one PII span swapped for a placeholder."""

    # Kind of PII that was found: "owner", "person_name", "email",
    # "file_path", "ip", "jwt", "op_ref", "phone" or "github".
    category: str
    # The exact text that was matched and removed.
    original: str
    # The token that was written in its place.
    placeholder: str
    # Start / end offsets of the matched span.
    # NOTE(review): Sanitizer.sanitize() records these against the text as it
    # stands when the replacement is applied, so they can differ from offsets
    # in the untouched input once earlier placeholders changed its length.
    start: int
    end: int
    # Detection layer that produced the match: "regex", "spacy" or "name_dict".
    source: str
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass
class SanitizeResult:
    """Result of one sanitization pass: the scrubbed text plus audit data."""

    # Text with every detected PII span replaced by its placeholder.
    sanitized: str
    # Character length of the input before any replacement.
    original_length: int
    # One record per replacement that was applied (empty when nothing matched).
    replacements: list[Replacement] = field(default_factory=list)
    # True when at least one replacement was made.
    pii_detected: bool = False
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@dataclass(frozen=True)
class SanitizeConfig:
    """Immutable settings choosing which PII categories get scrubbed."""

    # Owner identity: name variants, e-mail addresses and path prefixes that
    # are replaced with owner-specific placeholders.
    owner_names: tuple[str, ...] = ()
    owner_emails: tuple[str, ...] = ()
    owner_paths: tuple[str, ...] = ()

    # Other people's names to pseudonymize via the dictionary layer.
    known_names: frozenset[str] = frozenset()

    # Per-category switches for the generic regex layer.
    strip_emails: bool = True
    strip_ips: bool = True
    strip_jwts: bool = True
    strip_op_refs: bool = True
    strip_phone_numbers: bool = True

    # Whether the spaCy NER layer should run.
    use_spacy_ner: bool = True
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# ── Helpers ─────────────────────────────────────────────────────────────
|
|
83
|
+
|
|
84
|
+
# Hebrew nikud (diacritics) range: U+0591 to U+05C7
|
|
85
|
+
_NIKUD_RE = re.compile(r"[\u0591-\u05c7]")
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _strip_nikud(text: str) -> str:
|
|
89
|
+
"""Remove Hebrew nikud (diacritical marks) for fuzzy name matching."""
|
|
90
|
+
return _NIKUD_RE.sub("", text)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _nikud_offset_map(original: str) -> list[int]:
|
|
94
|
+
"""Build mapping from nikud-stripped positions to original positions.
|
|
95
|
+
|
|
96
|
+
Returns a list where map[stripped_idx] = original_idx, plus one extra
|
|
97
|
+
entry at the end for end-of-string positions.
|
|
98
|
+
"""
|
|
99
|
+
offset_map: list[int] = []
|
|
100
|
+
for orig_idx, ch in enumerate(original):
|
|
101
|
+
if not _NIKUD_RE.match(ch):
|
|
102
|
+
offset_map.append(orig_idx)
|
|
103
|
+
# Sentinel for end positions
|
|
104
|
+
offset_map.append(len(original))
|
|
105
|
+
return offset_map
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# ── Regex patterns ─────────────────────────────────────────────────────
|
|
109
|
+
|
|
110
|
+
# Email: simplified RFC 5322
|
|
111
|
+
_EMAIL_RE = re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b")
|
|
112
|
+
|
|
113
|
+
# IPv4
|
|
114
|
+
_IPV4_RE = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")
|
|
115
|
+
|
|
116
|
+
# JWT tokens (3 base64 segments separated by dots)
|
|
117
|
+
_JWT_RE = re.compile(r"\beyJ[A-Za-z0-9_-]{10,}\.eyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b")
|
|
118
|
+
|
|
119
|
+
# 1Password references
|
|
120
|
+
_OP_REF_RE = re.compile(r"op://[^\s\"']+")
|
|
121
|
+
|
|
122
|
+
# Phone numbers: international format (+972..., +1..., etc.)
|
|
123
|
+
_PHONE_RE = re.compile(r"\+\d{1,3}[\s.-]?\d{1,4}[\s.-]?\d{3,4}[\s.-]?\d{3,4}\b")
|
|
124
|
+
|
|
125
|
+
# Code blocks (to exclude from NER)
|
|
126
|
+
_CODE_BLOCK_RE = re.compile(r"```[\s\S]*?```")
|
|
127
|
+
|
|
128
|
+
# GitHub URLs and @mentions
|
|
129
|
+
_GITHUB_URL_RE = re.compile(r"github\.com/([a-zA-Z0-9_-]+)")
|
|
130
|
+
_GITHUB_MENTION_RE = re.compile(r"@([a-zA-Z0-9_-]+)")
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
# ── Sanitizer ──────────────────────────────────────────────────────────
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
class Sanitizer:
    """Reusable PII sanitizer for BrainLayer chunks.

    Detection runs in three layers: regex patterns, a known-names dictionary
    and (optionally) spaCy NER. Pseudonyms are cached per instance behind a
    lock, and ``sanitize_batch`` loads the spaCy model before fanning out to
    worker threads so the model is created once and shared.
    """

    def __init__(self, config: SanitizeConfig) -> None:
        """Store *config* and pre-compile the owner / known-name patterns."""
        self.config = config
        self._nlp = None  # Lazy-loaded spaCy model (False once loading failed)
        self._name_to_pseudo: dict[str, str] = {}  # name.lower() → placeholder
        self._pseudo_lock = threading.Lock()  # Guards _name_to_pseudo
        self._owner_re: Optional[re.Pattern[str]] = None
        self._known_names_re: Optional[re.Pattern[str]] = None

        self._build_owner_regex()
        self._build_known_names_regex()

    def _build_owner_regex(self) -> None:
        """Compile a case-insensitive pattern matching any owner name variant.

        Leaves ``self._owner_re`` untouched when there is no usable name.
        """
        # Blank entries would add an empty alternative that matches at every
        # word boundary, so they are dropped before building the alternation.
        names = [name for name in self.config.owner_names if name and name.strip()]
        if not names:
            return
        # Longest first so "First Last" wins over "First".
        names.sort(key=len, reverse=True)
        alternation = "|".join(re.escape(name) for name in names)
        self._owner_re = re.compile(r"\b(?:" + alternation + r")\b", re.IGNORECASE)

    def _build_known_names_regex(self) -> None:
        """Compile the pattern for the known-names dictionary.

        Latin names are wrapped in ``\\b`` word boundaries. Names containing
        Hebrew characters are stripped of nikud and must be delimited by
        whitespace or a line start/end instead. Names shorter than two
        characters (after trimming) are ignored.
        """
        if not self.config.known_names:
            return

        latin_names: list[str] = []
        hebrew_names: list[str] = []

        # Longest first so multi-word names take priority within each group.
        for name in sorted(self.config.known_names, key=len, reverse=True):
            name = name.strip()
            if not name or len(name) < 2:
                continue
            # Hebrew Unicode block: U+0590..U+05FF
            if any("\u0590" <= ch <= "\u05ff" for ch in name):
                hebrew_names.append(re.escape(_strip_nikud(name)))
            else:
                latin_names.append(re.escape(name))

        parts: list[str] = []
        if latin_names:
            parts.append(r"\b(?:" + "|".join(latin_names) + r")\b")
        if hebrew_names:
            # Hebrew word boundary: preceded/followed by whitespace, start, or end
            parts.append(r"(?:^|(?<=\s))(?:" + "|".join(hebrew_names) + r")(?=\s|$)")

        if parts:
            self._known_names_re = re.compile("|".join(parts), re.IGNORECASE | re.MULTILINE)

    def _get_nlp(self):
        """Return the spaCy pipeline, loading it on first use.

        Returns None when NER is disabled in the config, or when spaCy or the
        ``en_core_web_sm`` model cannot be loaded. In the latter case a message
        is printed to stderr and the failure is remembered so loading is not
        retried.
        """
        if self._nlp is None and self.config.use_spacy_ner:
            try:
                import spacy

                self._nlp = spacy.load("en_core_web_sm", disable=["parser", "lemmatizer"])
            except (ImportError, OSError) as e:
                import sys

                print(f" spaCy unavailable ({e}), skipping NER layer", file=sys.stderr)
                self._nlp = False  # Sentinel: tried and failed
        return self._nlp if self._nlp is not False else None

    def _pseudonym(self, name: str) -> str:
        """Return the placeholder for *name*, creating and caching it if new.

        The key is the lower-cased, trimmed name; the placeholder embeds the
        first 8 hex digits of the key's SHA-256. Cache access is locked.
        """
        key = name.lower().strip()
        with self._pseudo_lock:
            if key not in self._name_to_pseudo:
                h = hashlib.sha256(key.encode("utf-8")).hexdigest()[:8]
                self._name_to_pseudo[key] = f"[PERSON_{h}]"
            return self._name_to_pseudo[key]

    @staticmethod
    def _record_span(spans: list[tuple[int, int]], start: int, end: int, new_len: int) -> None:
        """Update *spans* in place after ``text[start:end]`` became *new_len* chars.

        Spans lying at or after ``end`` are shifted by the length change so
        they keep pointing at their placeholder; spans fully inside the
        replaced range are dropped (their placeholder no longer exists); the
        new placeholder span is appended.
        """
        delta = new_len - (end - start)
        updated: list[tuple[int, int]] = []
        for s, e in spans:
            if start <= s and e <= end:
                continue  # swallowed by this replacement
            if s >= end:
                s, e = s + delta, e + delta
            updated.append((s, e))
        updated.append((start, start + new_len))
        spans[:] = updated

    def sanitize(
        self,
        content: str,
        metadata: Optional[dict[str, Any]] = None,
    ) -> SanitizeResult:
        """Sanitize a single chunk's content and return the cleaned text.

        Args:
            content: The raw chunk text to sanitize.
            metadata: Optional chunk metadata. Accepted for interface
                compatibility; not consulted by the current implementation.

        Returns:
            A SanitizeResult. Offsets stored in each Replacement refer to the
            working text at the moment that replacement was applied.
        """
        if not content:
            return SanitizeResult(sanitized="", original_length=0, pii_detected=False)

        original_length = len(content)
        replacements: list[Replacement] = []
        text = content

        # Placeholder spans, always expressed in coordinates of the CURRENT
        # text, so later layers never rewrite (part of) a placeholder.
        replaced_spans: list[tuple[int, int]] = []

        def _apply_replacements(
            text: str,
            matches: list[tuple[int, int, str, str, str]],
        ) -> str:
            """Apply (start, end, placeholder, category, source) tuples.

            All offsets must refer to *text* as passed in. Matches are applied
            right-to-left so pending offsets stay valid.
            """
            for start, end, placeholder, category, source in sorted(matches, key=lambda m: m[0], reverse=True):
                # Skip a match that begins or ends inside an existing placeholder.
                if any(s <= start < e or s < end <= e for s, e in replaced_spans):
                    continue
                replacements.append(
                    Replacement(
                        category=category,
                        original=text[start:end],
                        placeholder=placeholder,
                        start=start,
                        end=end,
                        source=source,
                    )
                )
                text = text[:start] + placeholder + text[end:]
                # Keep previously recorded spans aligned with the new text.
                self._record_span(replaced_spans, start, end, len(placeholder))
            return text

        # ── Layer 1: Regex (owner + known patterns) ──
        # Order matters: match longer/more-specific patterns first to avoid
        # partial matches (e.g., "jane" inside "jane@example.com").

        # Owner emails FIRST (before owner names, to avoid partial match)
        for email in self.config.owner_emails:
            if not email:
                continue  # an empty pattern would match at every position
            email_matches = [
                (m.start(), m.end(), "[OWNER_EMAIL]", "email", "regex")
                for m in re.finditer(re.escape(email), text, re.IGNORECASE)
            ]
            text = _apply_replacements(text, email_matches)

        # Owner file paths SECOND (before owner names, same reason)
        for path_prefix in self.config.owner_paths:
            if not path_prefix:
                continue  # an empty prefix would match every token
            path_matches = [
                (
                    m.start(),
                    m.end(),
                    m.group(0).replace(path_prefix, "/Users/[OWNER]"),
                    "file_path",
                    "regex",
                )
                for m in re.finditer(re.escape(path_prefix) + r"[^\s\"']*", text)
            ]
            text = _apply_replacements(text, path_matches)

        # GitHub username THIRD. All matches of one pattern are collected
        # against the same text and applied together, so no offset is used
        # after the text it was computed on has changed.
        owner_handles = {name.lower() for name in self.config.owner_names if name}
        if owner_handles:
            for pattern in (_GITHUB_URL_RE, _GITHUB_MENTION_RE):
                github_matches = [
                    (m.start(), m.end(), "[OWNER_GITHUB]", "github", "regex")
                    for m in pattern.finditer(text)
                    if m.group(1).lower() in owner_handles
                ]
                text = _apply_replacements(text, github_matches)

        # Owner names LAST (after emails/paths/github are already replaced)
        if self._owner_re:
            owner_matches = [(m.start(), m.end(), "[OWNER]", "owner", "regex") for m in self._owner_re.finditer(text)]
            text = _apply_replacements(text, owner_matches)

        # General emails: numbered per occurrence, in order of appearance
        if self.config.strip_emails:
            general_emails = [
                (m.start(), m.end(), f"[EMAIL_{number}]", "email", "regex")
                for number, m in enumerate(_EMAIL_RE.finditer(text), start=1)
            ]
            text = _apply_replacements(text, general_emails)

        # IPs
        if self.config.strip_ips:
            ip_matches = [
                (m.start(), m.end(), "[IP_ADDR]", "ip", "regex")
                for m in _IPV4_RE.finditer(text)
                # Skip common non-PII IPs
                if not m.group(0).startswith(("127.", "0.", "255."))
            ]
            text = _apply_replacements(text, ip_matches)

        # JWTs
        if self.config.strip_jwts:
            jwt_matches = [(m.start(), m.end(), "[JWT_TOKEN]", "jwt", "regex") for m in _JWT_RE.finditer(text)]
            text = _apply_replacements(text, jwt_matches)

        # 1Password refs
        if self.config.strip_op_refs:
            op_matches = [(m.start(), m.end(), "[OP_REF]", "op_ref", "regex") for m in _OP_REF_RE.finditer(text)]
            text = _apply_replacements(text, op_matches)

        # Phone numbers
        if self.config.strip_phone_numbers:
            phone_matches = [(m.start(), m.end(), "[PHONE]", "phone", "regex") for m in _PHONE_RE.finditer(text)]
            text = _apply_replacements(text, phone_matches)

        # ── Layer 2: Known names dictionary ──

        if self._known_names_re:
            # Match against nikud-stripped text but replace in original
            text_no_nikud = _strip_nikud(text)
            if text_no_nikud != text:
                # Text carries nikud: match on the stripped version and map
                # the match offsets back onto the unstripped text.
                omap = _nikud_offset_map(text)
                name_matches = [
                    (
                        omap[m.start()],
                        omap[m.end()],
                        self._pseudonym(m.group(0)),
                        "person_name",
                        "name_dict",
                    )
                    for m in self._known_names_re.finditer(text_no_nikud)
                ]
            else:
                name_matches = [
                    (m.start(), m.end(), self._pseudonym(m.group(0)), "person_name", "name_dict")
                    for m in self._known_names_re.finditer(text)
                ]
            text = _apply_replacements(text, name_matches)

        # ── Layer 3: spaCy NER (English names only) ──

        nlp = self._get_nlp()
        if nlp is not None:
            # Fenced code blocks are excluded from NER replacement
            code_spans = [(m.start(), m.end()) for m in _CODE_BLOCK_RE.finditer(text)]

            doc = nlp(text)
            ner_matches = []
            for ent in doc.ents:
                if ent.label_ != "PERSON":
                    continue
                # Skip if inside a code block
                if any(cs <= ent.start_char < ce for cs, ce in code_spans):
                    continue
                # Skip very short entities (likely false positives)
                if len(ent.text.strip()) < 3:
                    continue
                # Skip if it starts inside an existing placeholder
                if any(s <= ent.start_char < e for s, e in replaced_spans):
                    continue
                ner_matches.append(
                    (
                        ent.start_char,
                        ent.end_char,
                        self._pseudonym(ent.text),
                        "person_name",
                        "spacy",
                    )
                )
            text = _apply_replacements(text, ner_matches)

        return SanitizeResult(
            sanitized=text,
            original_length=original_length,
            replacements=replacements,
            pii_detected=len(replacements) > 0,
        )

    def sanitize_batch(
        self,
        chunks: list[dict[str, Any]],
        content_key: str = "content",
        metadata_key: str = "metadata",
        parallel: int = 1,
    ) -> list[SanitizeResult]:
        """Sanitize a batch of chunks.

        Args:
            chunks: List of chunk dicts with at least a content field.
            content_key: Key for content in each chunk dict.
            metadata_key: Key for metadata in each chunk dict.
            parallel: Number of parallel workers (1=sequential).

        Returns:
            List of SanitizeResult in same order as input chunks.
        """
        if not chunks:
            return []

        # Load the spaCy model up front so worker threads share one instance
        # instead of racing to create it.
        if self.config.use_spacy_ner:
            self._get_nlp()

        def _sanitize_chunk(chunk: dict[str, Any]) -> SanitizeResult:
            return self.sanitize(chunk.get(content_key, ""), chunk.get(metadata_key))

        if parallel <= 1:
            return [_sanitize_chunk(chunk) for chunk in chunks]

        # Executor.map yields results in input order.
        with ThreadPoolExecutor(max_workers=parallel) as pool:
            return list(pool.map(_sanitize_chunk, chunks))

    def build_name_dictionary(self, store: "VectorStore") -> set[str]:
        """Extract unique sender names from WhatsApp chunks in the DB.

        Queries the chunks table through ``store.conn`` for distinct non-empty
        sender values where source='whatsapp'. Returns the trimmed names as a
        set (can be passed to SanitizeConfig.known_names).
        """
        cursor = store.conn.cursor()
        rows = list(
            cursor.execute(
                "SELECT DISTINCT sender FROM chunks WHERE source = 'whatsapp' AND sender IS NOT NULL AND sender != ''"
            )
        )
        return {row[0].strip() for row in rows if row[0] and row[0].strip()}

    def save_mapping(self, path: Path) -> None:
        """Save the name→pseudonym mapping to a JSON file for reversibility.

        The file is written as UTF-8 (names are stored unescaped) and parent
        directories are created as needed.

        This file should NEVER be uploaded to external services.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        # Snapshot under the lock so a concurrent _pseudonym() call cannot
        # resize the dict while it is being serialized.
        with self._pseudo_lock:
            forward = dict(self._name_to_pseudo)
        mapping = {
            "name_to_pseudonym": forward,
            "pseudonym_to_name": {v: k for k, v in forward.items()},
        }
        path.write_text(json.dumps(mapping, indent=2, ensure_ascii=False), encoding="utf-8")

    def load_mapping(self, path: Path) -> None:
        """Load a previously saved mapping to maintain pseudonym consistency.

        A missing file is ignored silently. An unreadable or malformed file
        prints a warning to stderr and leaves the current mapping unchanged.
        """
        if not path.exists():
            return
        try:
            data = json.loads(path.read_text(encoding="utf-8"))
            existing = data.get("name_to_pseudonym", {}) if isinstance(data, dict) else None
            if not isinstance(existing, dict):
                raise ValueError("expected a JSON object under 'name_to_pseudonym'")
        # JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        except (ValueError, OSError) as e:
            import sys

            print(f" Warning: could not load PII mapping from {path}: {e}", file=sys.stderr)
            return
        loaded = {k: v for k, v in existing.items() if isinstance(k, str) and isinstance(v, str)}
        with self._pseudo_lock:
            self._name_to_pseudo.update(loaded)

    @classmethod
    def from_env(cls) -> Sanitizer:
        """Create a Sanitizer from ``BRAINLAYER_SANITIZE_*`` environment variables.

        List-valued variables are comma-separated; blank items are dropped.
        spaCy NER is enabled unless ``BRAINLAYER_SANITIZE_USE_SPACY`` is set to
        something other than "true", "1" or "yes" (case-insensitive).
        """

        def _csv(variable: str) -> tuple[str, ...]:
            """Split a comma-separated env var into trimmed, non-empty items."""
            raw = os.environ.get(variable, "")
            return tuple(item.strip() for item in raw.split(",") if item.strip())

        use_spacy = os.environ.get("BRAINLAYER_SANITIZE_USE_SPACY", "true").lower() in (
            "true",
            "1",
            "yes",
        )

        config = SanitizeConfig(
            owner_names=_csv("BRAINLAYER_SANITIZE_OWNER_NAMES"),
            owner_emails=_csv("BRAINLAYER_SANITIZE_OWNER_EMAILS"),
            owner_paths=_csv("BRAINLAYER_SANITIZE_OWNER_PATHS"),
            known_names=frozenset(_csv("BRAINLAYER_SANITIZE_EXTRA_NAMES")),
            use_spacy_ner=use_spacy,
        )
        return cls(config)
|