brainlayer 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53):
  1. brainlayer/__init__.py +3 -0
  2. brainlayer/cli/__init__.py +1545 -0
  3. brainlayer/cli/wizard.py +132 -0
  4. brainlayer/cli_new.py +151 -0
  5. brainlayer/client.py +164 -0
  6. brainlayer/clustering.py +736 -0
  7. brainlayer/daemon.py +1105 -0
  8. brainlayer/dashboard/README.md +129 -0
  9. brainlayer/dashboard/__init__.py +5 -0
  10. brainlayer/dashboard/app.py +151 -0
  11. brainlayer/dashboard/search.py +229 -0
  12. brainlayer/dashboard/views.py +230 -0
  13. brainlayer/embeddings.py +131 -0
  14. brainlayer/engine.py +550 -0
  15. brainlayer/index_new.py +87 -0
  16. brainlayer/mcp/__init__.py +1558 -0
  17. brainlayer/migrate.py +205 -0
  18. brainlayer/paths.py +43 -0
  19. brainlayer/pipeline/__init__.py +47 -0
  20. brainlayer/pipeline/analyze_communication.py +508 -0
  21. brainlayer/pipeline/brain_graph.py +567 -0
  22. brainlayer/pipeline/chat_tags.py +63 -0
  23. brainlayer/pipeline/chunk.py +422 -0
  24. brainlayer/pipeline/classify.py +472 -0
  25. brainlayer/pipeline/cluster_sampling.py +73 -0
  26. brainlayer/pipeline/enrichment.py +810 -0
  27. brainlayer/pipeline/extract.py +66 -0
  28. brainlayer/pipeline/extract_claude_desktop.py +149 -0
  29. brainlayer/pipeline/extract_corrections.py +231 -0
  30. brainlayer/pipeline/extract_markdown.py +195 -0
  31. brainlayer/pipeline/extract_whatsapp.py +227 -0
  32. brainlayer/pipeline/git_overlay.py +301 -0
  33. brainlayer/pipeline/longitudinal_analyzer.py +568 -0
  34. brainlayer/pipeline/obsidian_export.py +455 -0
  35. brainlayer/pipeline/operation_grouping.py +486 -0
  36. brainlayer/pipeline/plan_linking.py +313 -0
  37. brainlayer/pipeline/sanitize.py +549 -0
  38. brainlayer/pipeline/semantic_style.py +574 -0
  39. brainlayer/pipeline/session_enrichment.py +472 -0
  40. brainlayer/pipeline/style_embed.py +67 -0
  41. brainlayer/pipeline/style_index.py +139 -0
  42. brainlayer/pipeline/temporal_chains.py +203 -0
  43. brainlayer/pipeline/time_batcher.py +248 -0
  44. brainlayer/pipeline/unified_timeline.py +569 -0
  45. brainlayer/storage.py +66 -0
  46. brainlayer/store.py +155 -0
  47. brainlayer/taxonomy.json +80 -0
  48. brainlayer/vector_store.py +1891 -0
  49. brainlayer-1.0.0.dist-info/METADATA +313 -0
  50. brainlayer-1.0.0.dist-info/RECORD +53 -0
  51. brainlayer-1.0.0.dist-info/WHEEL +4 -0
  52. brainlayer-1.0.0.dist-info/entry_points.txt +4 -0
  53. brainlayer-1.0.0.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,549 @@
1
+ """PII sanitization pipeline for BrainLayer chunks.
2
+
3
+ Strips personally identifiable information from chunk content before sending
4
+ to external LLM APIs (Gemini, Groq). Three detection layers:
5
+
6
+ 1. Regex — owner name, emails, file paths, IPs, JWTs, phone numbers, op:// refs
7
+ 2. Known names dictionary — WhatsApp contacts + manual list (Hebrew + English)
8
+ 3. spaCy NER — unknown English person names (en_core_web_sm)
9
+
10
+ Usage:
11
+ from brainlayer.pipeline.sanitize import Sanitizer
12
+
13
+ sanitizer = Sanitizer.from_env()
14
+ result = sanitizer.sanitize("Etan said hello to David")
15
+ print(result.sanitized) # "[OWNER] said hello to [PERSON_a1b2c3d4]"
16
+ print(result.pii_detected) # True
17
+
18
+ # Batch mode
19
+ results = sanitizer.sanitize_batch(chunks, parallel=4)
20
+
21
+ # Build name dictionary from WhatsApp contacts in DB
22
+ names = sanitizer.build_name_dictionary(store)
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import hashlib
28
+ import json
29
+ import os
30
+ import re
31
+ import threading
32
+ from concurrent.futures import ThreadPoolExecutor, as_completed
33
+ from dataclasses import dataclass, field
34
+ from pathlib import Path
35
+ from typing import Any, Optional
36
+
37
+ # AIDEV-NOTE: spaCy is lazy-loaded to avoid slow import on every pipeline import.
38
+ # Only loaded when use_spacy_ner=True and sanitize() is first called.
39
+
40
+
41
+ # ── Types ──────────────────────────────────────────────────────────────
42
+
43
+
44
@dataclass(frozen=True)
class Replacement:
    """Single PII replacement record.

    The sanitizer appends one instance to ``SanitizeResult.replacements`` for
    every span it rewrites. Instances are immutable (``frozen=True``).
    """

    # Kind of PII that was found:
    # "owner", "person_name", "email", "file_path", "ip", "jwt", "op_ref", "phone", "github"
    category: str
    # The matched text that was removed
    original: str
    # What it was replaced with
    placeholder: str
    # Start offset of the match.
    # NOTE(review): confirm whether offsets refer to the raw input or to the
    # partially sanitized text at the moment the replacement was applied.
    start: int
    # End offset of the match (exclusive); same coordinate caveat as ``start``
    end: int
    # Detection layer that produced the match: "regex", "spacy", "name_dict"
    source: str
54
+
55
+
56
@dataclass
class SanitizeResult:
    """Output of sanitization — the cleaned text + audit metadata."""

    # Text with every detected PII span replaced by its placeholder
    sanitized: str
    # Character length of the input before any replacement (0 for empty input)
    original_length: int
    # One record per applied replacement, in the order they were applied
    replacements: list[Replacement] = field(default_factory=list)
    # True when at least one replacement was applied
    pii_detected: bool = False
64
+
65
+
66
@dataclass(frozen=True)
class SanitizeConfig:
    """What to sanitize and how.

    Immutable (``frozen=True``); every collection field defaults to empty and
    every detection switch defaults to enabled.
    """

    # Owner name variants: replaced with "[OWNER]"; also compared against GitHub
    # URL/@mention handles, which become "[OWNER_GITHUB]"
    owner_names: tuple[str, ...] = ()
    # Owner email addresses: matched case-insensitively, replaced with "[OWNER_EMAIL]"
    owner_emails: tuple[str, ...] = ()
    # Owner path prefixes: matched case-sensitively, the prefix becomes "/Users/[OWNER]"
    owner_paths: tuple[str, ...] = ()
    # Known person names (Latin or Hebrew): replaced with a stable "[PERSON_<hash>]"
    known_names: frozenset[str] = frozenset()
    # Replace any other email address with "[EMAIL_<n>]"
    strip_emails: bool = True
    # Replace IPv4 addresses with "[IP_ADDR]"
    strip_ips: bool = True
    # Replace JWT-shaped tokens with "[JWT_TOKEN]"
    strip_jwts: bool = True
    # Replace 1Password "op://" references with "[OP_REF]"
    strip_op_refs: bool = True
    # Replace "+"-prefixed international phone numbers with "[PHONE]"
    strip_phone_numbers: bool = True
    # Run the spaCy "en_core_web_sm" NER layer to catch PERSON entities
    use_spacy_ner: bool = True
80
+
81
+
82
+ # ── Helpers ─────────────────────────────────────────────────────────────
83
+
84
+ # Hebrew nikud (diacritics) range: U+0591 to U+05C7
85
+ _NIKUD_RE = re.compile(r"[\u0591-\u05c7]")
86
+
87
+
88
+ def _strip_nikud(text: str) -> str:
89
+ """Remove Hebrew nikud (diacritical marks) for fuzzy name matching."""
90
+ return _NIKUD_RE.sub("", text)
91
+
92
+
93
def _nikud_offset_map(original: str) -> list[int]:
    """Map offsets in the nikud-stripped text back to offsets in *original*.

    Entry ``i`` of the result is the index in *original* of the ``i``-th
    character that survives nikud stripping. A final entry equal to
    ``len(original)`` is appended so that exclusive end offsets (up to and
    including end-of-string) can be translated as well.
    """
    kept_positions = [
        position
        for position, char in enumerate(original)
        if _NIKUD_RE.match(char) is None
    ]
    # Sentinel: lets callers look up an end offset one past the last kept char.
    kept_positions.append(len(original))
    return kept_positions
106
+
107
+
108
+ # ── Regex patterns ─────────────────────────────────────────────────────
109
+
110
# Email: simplified RFC 5322 (ASCII local part and domain, TLD of 2+ letters)
_EMAIL_RE = re.compile(r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b")

# IPv4: four dot-separated groups of 1-3 digits. Octets are not range-checked,
# so strings such as "999.1.1.1" also match.
_IPV4_RE = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")

# JWT tokens (3 base64url segments separated by dots; the first two must start
# with "eyJ" and every segment must be at least 10 characters long)
_JWT_RE = re.compile(r"\beyJ[A-Za-z0-9_-]{10,}\.eyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_-]{10,}\b")

# 1Password references: "op://" up to the next whitespace or quote character
_OP_REF_RE = re.compile(r"op://[^\s\"']+")

# Phone numbers: international format (+972..., +1..., etc.).
# A leading "+" is required; groups may be separated by space, dot or hyphen.
_PHONE_RE = re.compile(r"\+\d{1,3}[\s.-]?\d{1,4}[\s.-]?\d{3,4}[\s.-]?\d{3,4}\b")

# Code blocks (to exclude from NER): non-greedy triple-backtick fenced regions
_CODE_BLOCK_RE = re.compile(r"```[\s\S]*?```")

# GitHub URLs and @mentions; group 1 captures the handle.
# The mention pattern matches any "@token", including the domain part of an
# email address, so callers need to filter the handle they care about.
_GITHUB_URL_RE = re.compile(r"github\.com/([a-zA-Z0-9_-]+)")
_GITHUB_MENTION_RE = re.compile(r"@([a-zA-Z0-9_-]+)")
131
+
132
+
133
+ # ── Sanitizer ──────────────────────────────────────────────────────────
134
+
135
+
136
+ class Sanitizer:
137
+ """Reusable PII sanitizer for BrainLayer chunks.
138
+
139
+ Thread-safe for batch processing — spaCy model is loaded once and shared.
140
+ """
141
+
142
+ def __init__(self, config: SanitizeConfig) -> None:
143
+ self.config = config
144
+ self._nlp = None # Lazy-loaded spaCy model
145
+ self._name_to_pseudo: dict[str, str] = {} # name.lower() → placeholder
146
+ self._pseudo_lock = threading.Lock() # Thread-safe pseudonym access
147
+ self._owner_re: Optional[re.Pattern[str]] = None
148
+ self._known_names_re: Optional[re.Pattern[str]] = None
149
+
150
+ self._build_owner_regex()
151
+ self._build_known_names_regex()
152
+
153
+ def _build_owner_regex(self) -> None:
154
+ """Build compiled regex for owner name variants."""
155
+ if not self.config.owner_names:
156
+ return
157
+ # Sort by length descending so longer matches take priority
158
+ sorted_names = sorted(self.config.owner_names, key=len, reverse=True)
159
+ escaped = [re.escape(name) for name in sorted_names]
160
+ self._owner_re = re.compile(
161
+ r"\b(?:" + "|".join(escaped) + r")\b",
162
+ re.IGNORECASE,
163
+ )
164
+
165
+ def _build_known_names_regex(self) -> None:
166
+ """Build compiled regex for known names dictionary.
167
+
168
+ Uses word boundaries for Latin names. For Hebrew names (containing
169
+ Hebrew Unicode chars), uses lookahead/lookbehind on whitespace since
170
+ \\b doesn't work reliably with Hebrew script.
171
+ """
172
+ if not self.config.known_names:
173
+ return
174
+
175
+ latin_names: list[str] = []
176
+ hebrew_names: list[str] = []
177
+
178
+ for name in sorted(self.config.known_names, key=len, reverse=True):
179
+ name = name.strip()
180
+ if not name or len(name) < 2:
181
+ continue
182
+ # Check if name contains Hebrew characters (Unicode block 0x0590-0x05FF)
183
+ if any("\u0590" <= ch <= "\u05ff" for ch in name):
184
+ # Normalize: strip nikud (diacritics U+0591-U+05C7) for matching
185
+ normalized = _strip_nikud(name)
186
+ hebrew_names.append(re.escape(normalized))
187
+ else:
188
+ latin_names.append(re.escape(name))
189
+
190
+ parts: list[str] = []
191
+ if latin_names:
192
+ parts.append(r"\b(?:" + "|".join(latin_names) + r")\b")
193
+ if hebrew_names:
194
+ # Hebrew word boundary: preceded/followed by whitespace, start, or end
195
+ parts.append(r"(?:^|(?<=\s))(?:" + "|".join(hebrew_names) + r")(?=\s|$)")
196
+
197
+ if parts:
198
+ self._known_names_re = re.compile("|".join(parts), re.IGNORECASE | re.MULTILINE)
199
+
200
+ def _get_nlp(self):
201
+ """Lazy-load spaCy model on first use."""
202
+ if self._nlp is None and self.config.use_spacy_ner:
203
+ try:
204
+ import spacy
205
+
206
+ self._nlp = spacy.load("en_core_web_sm", disable=["parser", "lemmatizer"])
207
+ except (ImportError, OSError) as e:
208
+ import sys
209
+
210
+ print(f" spaCy unavailable ({e}), skipping NER layer", file=sys.stderr)
211
+ self._nlp = False # Sentinel: tried and failed
212
+ return self._nlp if self._nlp is not False else None
213
+
214
+ def _pseudonym(self, name: str) -> str:
215
+ """Get or create a stable pseudonym for a name. Thread-safe."""
216
+ key = name.lower().strip()
217
+ with self._pseudo_lock:
218
+ if key not in self._name_to_pseudo:
219
+ h = hashlib.sha256(key.encode("utf-8")).hexdigest()[:8]
220
+ self._name_to_pseudo[key] = f"[PERSON_{h}]"
221
+ return self._name_to_pseudo[key]
222
+
223
def sanitize(
    self,
    content: str,
    metadata: Optional[dict[str, Any]] = None,
) -> SanitizeResult:
    """Sanitize a single chunk's content and return the cleaned text plus an audit trail.

    Detection runs in layers, most specific first, each on the output of the
    previous one:

    1. Owner emails, owner path prefixes, owner GitHub handles.
    2. Generic email addresses (before owner names, so a name inside an
       address such as "bob@etan.com" cannot split the address and leak
       the remainder).
    3. Owner names, then IPv4 addresses, JWTs, ``op://`` references, phones.
    4. Known-names dictionary (matched on nikud-stripped text).
    5. spaCy PERSON entities outside fenced code blocks.

    Placeholder spans are tracked in the coordinates of the *current* text and
    shifted whenever an earlier-positioned replacement changes the length, so
    later layers never rewrite part of a placeholder. A later match that fully
    contains earlier placeholders (e.g. an ``op://`` reference that holds an
    already replaced owner name) replaces the whole region.

    Args:
        content: The raw chunk text to sanitize.
        metadata: Optional chunk metadata (source, sender, etc.). Currently
            unused; accepted for interface compatibility.

    Returns:
        SanitizeResult with the cleaned text. Each ``Replacement`` records
        ``start``/``end`` as offsets into the raw ``content`` and ``original``
        as the raw text at those offsets.
    """
    if not content:
        return SanitizeResult(sanitized="", original_length=0, pii_detected=False)

    replacements: list[Replacement] = []
    # Live placeholders as (start, end, delta). start/end are offsets in the
    # CURRENT text; delta is how much longer the placeholder is than the raw
    # input region it stands for (negative when shorter).
    live_spans: list[tuple[int, int, int]] = []

    def _blocked(start: int, end: int) -> bool:
        """True when either edge of [start, end) lies inside a placeholder."""
        return any(s <= start < e or s < end <= e for s, e, _ in live_spans)

    def _apply_replacements(
        text: str,
        matches: list[tuple[int, int, str, str, str]],
    ) -> str:
        """Apply (start, end, placeholder, category, source) matches to *text*.

        All matches must have been computed on the *text* passed in. They are
        applied right-to-left so offsets of pending matches stay valid.
        """
        # Exclusive upper bound for the next match: guards against two matches
        # from the same pass overlapping each other.
        ceiling = len(text)
        for start, end, placeholder, category, source in sorted(matches, key=lambda m: m[0], reverse=True):
            if start >= end or end > ceiling or _blocked(start, end):
                continue
            # Translate current-text offsets back to raw-input offsets.
            shift_before = sum(d for _, e, d in live_spans if e <= start)
            swallowed_delta = sum(d for s, e, d in live_spans if start < s and e < end)
            orig_start = start - shift_before
            orig_end = end - shift_before - swallowed_delta
            replacements.append(
                Replacement(
                    category=category,
                    original=content[orig_start:orig_end],
                    placeholder=placeholder,
                    start=orig_start,
                    end=orig_end,
                    source=source,
                )
            )
            growth = len(placeholder) - (end - start)
            # Drop placeholders swallowed by this match; shift the ones to its right.
            live_spans[:] = [
                (s + growth, e + growth, d) if s >= end else (s, e, d)
                for s, e, d in live_spans
                if not (start < s and e < end)
            ]
            live_spans.append((start, start + len(placeholder), len(placeholder) - (orig_end - orig_start)))
            text = text[:start] + placeholder + text[end:]
            ceiling = start
        return text

    text = content

    # ── Layer 1: Regex (owner + known patterns) ──
    # Order matters: longer / more specific patterns go first so a short
    # pattern cannot eat part of a longer one.

    # Owner emails first. Blank entries are skipped: an empty pattern matches
    # at every position.
    for email in self.config.owner_emails:
        if not email:
            continue
        text = _apply_replacements(
            text,
            [
                (m.start(), m.end(), "[OWNER_EMAIL]", "email", "regex")
                for m in re.finditer(re.escape(email), text, re.IGNORECASE)
            ],
        )

    # Owner file paths: the prefix is rewritten, the rest of the path is kept.
    for path_prefix in self.config.owner_paths:
        if not path_prefix:
            continue
        path_re = re.escape(path_prefix) + r"[^\s\"']*"
        text = _apply_replacements(
            text,
            [
                (
                    m.start(),
                    m.end(),
                    m.group(0).replace(path_prefix, "/Users/[OWNER]"),
                    "file_path",
                    "regex",
                )
                for m in re.finditer(path_re, text)
            ],
        )

    # Owner GitHub handles. All matches are collected against one snapshot of
    # the text and applied in a single pass, so their offsets cannot go stale.
    owner_handles = {name.lower() for name in self.config.owner_names if name}
    if owner_handles:
        github_matches = [
            (m.start(), m.end(), "[OWNER_GITHUB]", "github", "regex")
            for m in _GITHUB_URL_RE.finditer(text)
            if m.group(1).lower() in owner_handles
        ]
        for m in _GITHUB_MENTION_RE.finditer(text):
            if m.group(1).lower() not in owner_handles:
                continue
            before = text[m.start() - 1] if m.start() > 0 else ""
            if before and (before.isalnum() or before in "._%+-"):
                # "@handle" glued to a preceding token is the domain part of
                # an email address, not a mention; leave it to the email layer.
                continue
            github_matches.append((m.start(), m.end(), "[OWNER_GITHUB]", "github", "regex"))
        text = _apply_replacements(text, github_matches)

    # General emails, numbered in reading order.
    if self.config.strip_emails:
        fresh_emails = [m for m in _EMAIL_RE.finditer(text) if not _blocked(m.start(), m.end())]
        text = _apply_replacements(
            text,
            [
                (m.start(), m.end(), f"[EMAIL_{number}]", "email", "regex")
                for number, m in enumerate(fresh_emails, start=1)
            ],
        )

    # Owner names (after emails/paths/github are already replaced)
    if self._owner_re:
        text = _apply_replacements(
            text,
            [(m.start(), m.end(), "[OWNER]", "owner", "regex") for m in self._owner_re.finditer(text)],
        )

    # IPs, skipping common non-PII addresses (loopback, 0.x, 255.x)
    if self.config.strip_ips:
        text = _apply_replacements(
            text,
            [
                (m.start(), m.end(), "[IP_ADDR]", "ip", "regex")
                for m in _IPV4_RE.finditer(text)
                if not m.group(0).startswith(("127.", "0.", "255."))
            ],
        )

    # JWTs, 1Password refs, phone numbers
    for enabled, pattern, placeholder, category in (
        (self.config.strip_jwts, _JWT_RE, "[JWT_TOKEN]", "jwt"),
        (self.config.strip_op_refs, _OP_REF_RE, "[OP_REF]", "op_ref"),
        (self.config.strip_phone_numbers, _PHONE_RE, "[PHONE]", "phone"),
    ):
        if enabled:
            text = _apply_replacements(
                text,
                [(m.start(), m.end(), placeholder, category, "regex") for m in pattern.finditer(text)],
            )

    # ── Layer 2: Known names dictionary ──

    if self._known_names_re:
        # Match against nikud-stripped text but replace in the current text.
        text_no_nikud = _strip_nikud(text)
        omap = _nikud_offset_map(text) if text_no_nikud != text else None
        name_matches = []
        for m in self._known_names_re.finditer(text_no_nikud):
            start, end = m.span()
            if omap is not None:
                start, end = omap[start], omap[end]
            if _blocked(start, end):
                # Inside a placeholder: skip before minting a pseudonym so
                # the saved mapping is not polluted with placeholder text.
                continue
            name_matches.append((start, end, self._pseudonym(m.group(0)), "person_name", "name_dict"))
        text = _apply_replacements(text, name_matches)

    # ── Layer 3: spaCy NER (English names only) ──

    nlp = self._get_nlp()
    if nlp is not None:
        # Fenced code blocks are excluded from NER.
        code_spans = [m.span() for m in _CODE_BLOCK_RE.finditer(text)]
        ner_matches = []
        for ent in nlp(text).ents:
            if ent.label_ != "PERSON":
                continue
            if any(cs <= ent.start_char < ce for cs, ce in code_spans):
                continue
            # Very short entities are likely false positives.
            if len(ent.text.strip()) < 3:
                continue
            if _blocked(ent.start_char, ent.end_char):
                continue
            ner_matches.append(
                (
                    ent.start_char,
                    ent.end_char,
                    self._pseudonym(ent.text),
                    "person_name",
                    "spacy",
                )
            )
        text = _apply_replacements(text, ner_matches)

    return SanitizeResult(
        sanitized=text,
        original_length=len(content),
        replacements=replacements,
        pii_detected=bool(replacements),
    )
414
+
415
+ def sanitize_batch(
416
+ self,
417
+ chunks: list[dict[str, Any]],
418
+ content_key: str = "content",
419
+ metadata_key: str = "metadata",
420
+ parallel: int = 1,
421
+ ) -> list[SanitizeResult]:
422
+ """Sanitize a batch of chunks.
423
+
424
+ Args:
425
+ chunks: List of chunk dicts with at least a content field.
426
+ content_key: Key for content in each chunk dict.
427
+ metadata_key: Key for metadata in each chunk dict.
428
+ parallel: Number of parallel workers (1=sequential).
429
+
430
+ Returns:
431
+ List of SanitizeResult in same order as input chunks.
432
+ """
433
+ if not chunks:
434
+ return []
435
+
436
+ # Pre-load spaCy model before parallel execution
437
+ if self.config.use_spacy_ner:
438
+ self._get_nlp()
439
+
440
+ if parallel <= 1:
441
+ return [
442
+ self.sanitize(
443
+ chunk.get(content_key, ""),
444
+ chunk.get(metadata_key),
445
+ )
446
+ for chunk in chunks
447
+ ]
448
+
449
+ # Parallel execution — spaCy model is thread-safe for inference
450
+ results: list[Optional[SanitizeResult]] = [None] * len(chunks)
451
+ with ThreadPoolExecutor(max_workers=parallel) as pool:
452
+ futures = {
453
+ pool.submit(
454
+ self.sanitize,
455
+ chunk.get(content_key, ""),
456
+ chunk.get(metadata_key),
457
+ ): idx
458
+ for idx, chunk in enumerate(chunks)
459
+ }
460
+ for future in as_completed(futures):
461
+ idx = futures[future]
462
+ results[idx] = future.result()
463
+
464
+ return [r for r in results if r is not None]
465
+
466
def build_name_dictionary(self, store: "VectorStore") -> set[str]:
    """Collect the distinct WhatsApp sender names stored in the chunks table.

    Runs one query on ``store.conn`` for distinct non-empty ``sender`` values
    where ``source = 'whatsapp'``. Each value is stripped and blank results
    are discarded.

    Returns:
        The set of names found (suitable for ``SanitizeConfig.known_names``).
    """
    query = "SELECT DISTINCT sender FROM chunks WHERE source = 'whatsapp' AND sender IS NOT NULL AND sender != ''"
    found: set[str] = set()
    for row in store.conn.cursor().execute(query):
        sender = row[0]
        if not sender:
            continue
        trimmed = sender.strip()
        if trimmed:
            found.add(trimmed)
    return found
480
+
481
+ def save_mapping(self, path: Path) -> None:
482
+ """Save the name→pseudonym mapping to a JSON file for reversibility.
483
+
484
+ This file should NEVER be uploaded to external services.
485
+ """
486
+ path.parent.mkdir(parents=True, exist_ok=True)
487
+ mapping = {
488
+ "name_to_pseudonym": self._name_to_pseudo,
489
+ "pseudonym_to_name": {v: k for k, v in self._name_to_pseudo.items()},
490
+ }
491
+ path.write_text(json.dumps(mapping, indent=2, ensure_ascii=False))
492
+
493
+ def load_mapping(self, path: Path) -> None:
494
+ """Load a previously saved mapping to maintain pseudonym consistency."""
495
+ if not path.exists():
496
+ return
497
+ try:
498
+ data = json.loads(path.read_text())
499
+ existing = data.get("name_to_pseudonym", {})
500
+ self._name_to_pseudo.update(existing)
501
+ except (json.JSONDecodeError, KeyError, OSError) as e:
502
+ import sys
503
+
504
+ print(f" Warning: could not load PII mapping from {path}: {e}", file=sys.stderr)
505
+
506
@classmethod
def from_env(cls) -> Sanitizer:
    """Build a Sanitizer configured from ``BRAINLAYER_SANITIZE_*`` variables.

    ``..._OWNER_NAMES``, ``..._OWNER_EMAILS``, ``..._OWNER_PATHS`` and
    ``..._EXTRA_NAMES`` are comma-separated lists; items are stripped and
    blank items dropped. ``..._USE_SPACY`` enables the NER layer when it
    equals "true", "1" or "yes" (case-insensitive) and defaults to "true".
    Unset variables yield empty lists.
    """

    def _csv(variable: str) -> list[str]:
        """Split a comma-separated environment variable into clean items."""
        pieces = os.environ.get(variable, "").split(",")
        return [piece.strip() for piece in pieces if piece.strip()]

    owner_names = tuple(_csv("BRAINLAYER_SANITIZE_OWNER_NAMES"))
    owner_emails = tuple(_csv("BRAINLAYER_SANITIZE_OWNER_EMAILS"))
    owner_paths = tuple(_csv("BRAINLAYER_SANITIZE_OWNER_PATHS"))
    extra_names = frozenset(_csv("BRAINLAYER_SANITIZE_EXTRA_NAMES"))

    spacy_flag = os.environ.get("BRAINLAYER_SANITIZE_USE_SPACY", "true").lower()
    use_spacy = spacy_flag in ("true", "1", "yes")

    return cls(
        SanitizeConfig(
            owner_names=owner_names,
            owner_emails=owner_emails,
            owner_paths=owner_paths,
            known_names=extra_names,
            use_spacy_ner=use_spacy,
        )
    )