@openthread/claude-code-plugin 0.1.5 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,669 @@
1
+ """Privacy masking library for Claude Code session content.
2
+
3
+ This is a Python port of ``apps/api/src/lib/privacy-mask.ts``. The
4
+ pipeline, regex patterns, and replacement values are kept in lock-step
5
+ with the TypeScript source so that client-side masked content matches
6
+ the server's expectations.
7
+
8
+ Pipeline:
9
+ 0. sanitize.normalize(text)
10
+ A. rewrite_project(text, cwds) -> cwd-relative [project] rewrite
11
+ B. paths -> home directories across OSes
12
+ C. secrets -> known prefixes, JWTs, etc.
13
+ D. entropy fallback -> high-entropy near sensitive labels
14
+ E. PII -> email / IP / MAC / shell prompts
15
+ F. usernames -> back-reference bare usernames
16
+
17
+ Python regex caveats vs JavaScript (documented here because they affect
18
+ fidelity with the TS source):
19
+
20
+ * Python's ``re`` does not support ``\p{L}`` / ``\p{N}``. We emulate
21
+ with a custom character class that matches unicode letters and
22
+ digits where needed. ``re.UNICODE`` is on by default in Python 3.
23
+ * Python does not need the TS "lastIndex" reset dance — we use
24
+ ``finditer`` and module-level compiled patterns.
25
+ * The TS ``^`` / ``$`` with ``/m`` is the same as Python's
26
+ ``re.MULTILINE``.
27
+ """
28
+
29
+ from __future__ import annotations
30
+
31
+ import math
32
+ import re
33
+ from typing import Any, Iterable
34
+
35
+ try:
36
+ from . import sanitize # package-style import (preferred)
37
+ except ImportError: # pragma: no cover - fallback when lib is on sys.path
38
+ import sanitize # type: ignore[no-redef]
39
+
40
+
41
+ # ---------------------------------------------------------------------------
42
+ # Unicode character-class helpers
43
+ # ---------------------------------------------------------------------------
44
+ #
45
+ # Python's stdlib re does not support \p{L} / \p{N}. We approximate the TS
46
+ # "username segment" class by matching the Latin-compatible character set:
47
+ # all letters/digits that Python treats as word characters except ``_`` and
48
+ # the additional punctuation the TS class allows. Because ``\w`` under
49
+ # ``re.UNICODE`` already spans ``\p{L}\p{N}_``, we widen it with
50
+ # ``._+-`` and explicitly strip the leading underscore behaviour when we
51
+ # need the stricter variant.
52
+
53
# Character-class body standing in for the TS \p{L}\p{N}._+- set. Under
# Python 3's default Unicode mode \w supplies letters, digits and "_".
USER_SEG_CHARS = r"\w.+\-"

# One username path segment: between 1 and 64 of the characters above.
USER_SEG = f"[{USER_SEG_CHARS}]{{1,64}}"

# Stricter "letter or digit" class: everything \w matches except "_".
# Meant for word-boundary style assertions in the username pass.
LETTER_OR_DIGIT = r"[^\W_]"
62
+
63
+
64
+ # ---------------------------------------------------------------------------
65
+ # A. Project (cwd) rewrite
66
+ # ---------------------------------------------------------------------------
67
+
68
# Characters that terminate a "path-like" run. Used as a lookahead so the
# cwd rewriter neither consumes trailing punctuation (comma, colon, etc.)
# nor matches a longer sibling name such as ``<cwd>lication``.
_PROJECT_BOUNDARY = r"(?=/|\s|$|[,;:\)\]\}\"'`>])"


def rewrite_project(text: str, cwds: Iterable[str] | None) -> str:
    """Replace occurrences of the user's working directories with ``[project]``.

    This runs BEFORE the generic home-dir rules so that deeper paths
    collapse to ``[project]/sub/path`` instead of ``[user-home]/...``.

    * Candidates are de-duplicated and tried longest-first, so nested
      working directories (``/Users/a/code/app`` and ``/Users/a/code``)
      are matched most-specific-first.
    * Matching is case-insensitive to cope with case-preserving
      filesystems.
    * Trailing ``/`` characters are stripped, and any entry that is empty
      afterwards (``"/"``, ``"//"``, ...) is skipped: an empty prefix would
      otherwise match at every boundary in the document.
    * A single ``str`` passed as ``cwds`` is treated as one directory
      rather than being iterated character by character.

    Args:
        text: The text to rewrite.
        cwds: Working-directory paths; non-string entries are ignored.

    Returns:
        ``text`` with every matched directory prefix replaced, or ``text``
        unchanged when there is nothing usable to match.
    """
    if not text or not cwds:
        return text
    if isinstance(cwds, str):
        cwds = (cwds,)
    # Strip first, THEN drop empties, so "//" cannot sneak through as "".
    stripped = (c.rstrip("/") for c in cwds if isinstance(c, str))
    unique = {c for c in stripped if c}
    if not unique:
        return text
    out = text
    # Longest first; the secondary key only makes the order deterministic.
    for cwd in sorted(unique, key=lambda c: (-len(c), c)):
        out = re.sub(
            re.escape(cwd) + _PROJECT_BOUNDARY,
            "[project]",
            out,
            flags=re.IGNORECASE,
        )
    return out
99
+
100
+
101
+ # ---------------------------------------------------------------------------
102
+ # B. Path rules
103
+ # ---------------------------------------------------------------------------
104
+
105
# Ordered ``(compiled_regex, replacement)`` pairs; earlier rules run first,
# so the more specific forms are listed before the generic ones.
_PATH_RULES: list[tuple[re.Pattern[str], str]] = [
    (re.compile(source), replacement)
    for source, replacement in (
        # Windows UNC: \\server\share\... -> \\[server]\[share]
        (
            r"\\\\[A-Za-z0-9._\-$]{1,64}\\[A-Za-z0-9._\-$]{1,64}",
            r"\\\\[server]\\[share]",
        ),
        # Windows drive: C:\Users\<name>
        (
            r"[A-Za-z]:\\[Uu][Ss][Ee][Rr][Ss]\\" + USER_SEG + r"(?=\\|$|[^\w.+\-])",
            r"C:\\Users\\[user]",
        ),
        # WSL: /mnt/<drive>/Users/<name>
        (
            r"/mnt/[a-zA-Z]/[Uu][Ss][Ee][Rr][Ss]/" + USER_SEG,
            "/mnt/[drive]/Users/[user]",
        ),
        # macOS Volumes: /Volumes/<vol> (the class deliberately allows spaces)
        (
            r"/Volumes/[\w.+\- ]{1,64}",
            "/Volumes/[volume]",
        ),
        # Linux mounted media: /mnt/<name> or /media/<name>
        (
            r"/(?:mnt|media)/" + USER_SEG,
            "/[mount]/[user]",
        ),
        # POSIX /Users/<name>
        (
            r"/[Uu][Ss][Ee][Rr][Ss]/" + USER_SEG,
            "/Users/[user]",
        ),
        # Linux /home/<name>
        (
            r"/home/" + USER_SEG,
            "/home/[user]",
        ),
        # /root (the replacement equals the match, so this rule is a no-op)
        (
            r"/root(?=/|$|[^\w])",
            "/root",
        ),
        # ~/... tilde paths. Shell metacharacters and brackets are excluded
        # so a prompt terminator is not consumed and an already redacted
        # "~/[path]" is not matched again.
        (
            r"~/[^\s\"'`\)\]\}>$#\[\]]{1,512}",
            "~/[path]",
        ),
    )
]
158
+
159
+
160
def _apply_path_rules(text: str) -> str:
    """Run every rule in ``_PATH_RULES`` over ``text``, in table order."""
    masked = text
    for rule in _PATH_RULES:
        regex, replacement = rule
        masked = regex.sub(replacement, masked)
    return masked
165
+
166
+
167
# Post-pass over the output of the path rules. The anonymised home roots
# ``/Users/[user]``, ``/home/[user]``, ``C:\Users\[user]`` and the WSL form
# ``/mnt/[drive]/Users/[user]`` are shortened to the single ``[user-home]``
# placeholder. Text already rewritten to ``[project]`` does not match this
# pattern, and neither does the tilde form ``~/[path]``.
_HOME_COLLAPSE_RE = re.compile(
    r"(?:/Users/\[user\]|/home/\[user\]|C:\\Users\\\[user\]|/mnt/\[drive\]/Users/\[user\])"
)


def _collapse_home(text: str) -> str:
    """Replace each anonymised home-root form in ``text`` with ``[user-home]``."""
    collapsed, _replaced = _HOME_COLLAPSE_RE.subn("[user-home]", text)
    return collapsed
181
+
182
+
183
+ # ---------------------------------------------------------------------------
184
+ # C. Secret rules
185
+ # ---------------------------------------------------------------------------
186
+
187
# Ordered ``(compiled_regex, replacement)`` pairs for known secret shapes.
# Order matters: multi-line key blocks first, then header/URL credentials,
# then vendor key prefixes, and finally the catch-all env-var line rule.
_SECRET_RULES: list[tuple[re.Pattern[str], str]] = [
    # PEM private keys (generic; up to four leading words such as "RSA")
    (
        re.compile(
            r"-----BEGIN[ \t]+(?:[A-Z0-9]{1,12}[ \t]){0,4}PRIVATE KEY-----"
            r"[\s\S]{1,16384}?"
            r"-----END[ \t]+(?:[A-Z0-9]{1,12}[ \t]){0,4}PRIVATE KEY-----"
        ),
        "[REDACTED_PRIVATE_KEY]",
    ),
    # PGP private key block
    (
        re.compile(
            r"-----BEGIN PGP PRIVATE KEY BLOCK-----"
            r"[\s\S]{1,16384}?"
            r"-----END PGP PRIVATE KEY BLOCK-----"
        ),
        "[REDACTED_PRIVATE_KEY]",
    ),
    # OpenSSH private key
    (
        re.compile(
            r"-----BEGIN OPENSSH PRIVATE KEY-----"
            r"[\s\S]{1,16384}?"
            r"-----END OPENSSH PRIVATE KEY-----"
        ),
        "[REDACTED_PRIVATE_KEY]",
    ),
    # JWT (header.payload.signature; the signature part is optional)
    (
        re.compile(
            r"\beyJ[A-Za-z0-9_-]{10,4096}\.[A-Za-z0-9_-]{10,4096}"
            r"(?:\.[A-Za-z0-9_-]{0,4096})?"
        ),
        "[REDACTED_JWT]",
    ),
    # Authorization: <anything up to end of line>
    (
        re.compile(r"Authorization[ \t]*:[ \t]*[^\r\n]{1,4096}", re.IGNORECASE),
        "Authorization: [REDACTED]",
    ),
    # Bearer tokens
    (
        re.compile(r"\bBearer[ \t]+[A-Za-z0-9_\-.~+/]{8,4096}=*"),
        "Bearer [REDACTED_TOKEN]",
    ),
    # Basic auth embedded in URL: scheme://user:pass@host
    (
        re.compile(
            r"\b([a-zA-Z][a-zA-Z0-9+.\-]{1,16}):"
            r"//[^\s/:@]{1,256}:[^\s/@]{1,256}@"
        ),
        r"\1://[REDACTED_BASIC_AUTH]@",
    ),
    # DB connection strings. This runs after the basic-auth scrub above, so
    # a URL that carried credentials now reads
    # ``scheme://[REDACTED_BASIC_AUTH]@host/db``. The first alternative
    # consumes that placeholder explicitly: the generic character class
    # excludes "]" and would otherwise stop inside the placeholder, leaving
    # "]@host/db" visible after the replacement.
    (
        re.compile(
            r"\b(?:postgres(?:ql)?|mysql|mongodb(?:\+srv)?|redis|amqp|amqps)"
            r"[+\w]{0,16}://"
            r"(?:\[REDACTED_BASIC_AUTH\]@[^\s\"'`\)\]\}>]{0,2048}"
            r"|[^\s\"'`\)\]\}>]{1,2048})",
            re.IGNORECASE,
        ),
        "[REDACTED_CONNECTION_STRING]",
    ),
    # Vendor key prefixes; every one maps to the same placeholder.
    *(
        (re.compile(source), "[REDACTED_KEY]")
        for source in (
            # Anthropic
            r"\bsk-ant-[A-Za-z0-9_\-]{20,200}",
            r"\banthropic-api03-[A-Za-z0-9_\-]{20,200}",
            # OpenAI
            r"\bsk-(?:proj-|svcacct-)?[A-Za-z0-9_\-]{20,200}",
            # Google API key
            r"\bAIza[0-9A-Za-z_\-]{35}",
            # AWS access keys
            r"\b(?:AKIA|ASIA)[0-9A-Z]{16}",
            # GitHub
            r"\bgh[pousr]_[A-Za-z0-9]{20,255}",
            r"\bgithub_pat_[A-Za-z0-9_]{20,255}",
            # GitLab
            r"\bglpat-[A-Za-z0-9_\-]{20,255}",
            # Slack
            r"\bxox[bpars]-[A-Za-z0-9\-]{10,255}",
            r"\bxapp-[A-Za-z0-9\-]{10,255}",
            # Stripe live/test secret/restricted/publishable
            r"\b(?:sk|rk|pk)_live_[A-Za-z0-9]{20,255}",
            r"\b(?:sk|rk|pk)_test_[A-Za-z0-9]{20,255}",
            # SK<32 hex> / AC<32 hex> identifiers. NOTE(review): previously
            # labelled "Stripe legacy"; this shape looks like Twilio API-key
            # and account SIDs -- confirm against the TS source.
            r"\b(?:SK|AC)[0-9a-fA-F]{32}\b",
            # Hugging Face
            r"\bhf_[A-Za-z0-9]{20,255}",
            # xAI
            r"\bxai-[A-Za-z0-9]{20,255}",
            # DigitalOcean PAT
            r"\bdop_v1_[a-f0-9]{40,128}",
        )
    ),
    # Generic env-var assignment lines: KEY=value. Values opening with "{"
    # or "(" are left alone.
    (
        re.compile(
            r"^([A-Z][A-Z0-9_]{2,64})[ \t]*=[ \t]*(?![ \t]*[{(])[^\r\n]{1,2048}$",
            re.MULTILINE,
        ),
        r"\1=[REDACTED]",
    ),
]
287
+
288
+
289
def _apply_secret_rules(text: str) -> str:
    """Run every rule in ``_SECRET_RULES`` over ``text``, in table order."""
    scrubbed = text
    for rule in _SECRET_RULES:
        regex, replacement = rule
        scrubbed = regex.sub(replacement, scrubbed)
    return scrubbed
294
+
295
+
296
+ # ---------------------------------------------------------------------------
297
+ # D. Entropy fallback
298
+ # ---------------------------------------------------------------------------
299
+
300
# Words that suggest a secret value follows (matched case-insensitively,
# anywhere in the text, including inside longer identifiers).
_SECRET_LABELS_RE = re.compile(
    r"(api[_\-]?key|api[_\-]?secret|auth(?:orization)?|access[_\-]?token|"
    r"refresh[_\-]?token|secret|password|passwd|passphrase|token|bearer|"
    r"credential|client[_\-]?secret|private[_\-]?key)",
    re.IGNORECASE,
)

# Candidate secret value: a run of 24..512 token-ish characters.
_ADJACENT_TOKEN_RE = re.compile(r"[A-Za-z0-9+/=._\-]{24,512}")


def _shannon_entropy(s: str) -> float:
    """Return the Shannon entropy of ``s`` in bits per character.

    The empty string has entropy ``0.0``.
    """
    if not s:
        return 0.0
    counts: dict[str, int] = {}
    for ch in s:
        counts[ch] = counts.get(ch, 0) + 1
    length = len(s)
    h = 0.0
    for c in counts.values():
        p = c / length
        h -= p * math.log2(p)
    return h


def _apply_entropy_rule(text: str) -> str:
    """Redact high-entropy tokens that appear shortly after a secret label.

    For every label match, the 256 characters after it are scanned and the
    first token with entropy >= 4.0 bits/char is replaced by
    ``[REDACTED_KEY]``.

    Several labels can point at the same token (``secret token: <value>``)
    or at overlapping ones, so the collected spans are merged before any
    replacement is made. Replacing duplicate or overlapping spans one after
    another with indices computed on the original text would delete text
    that follows the token.
    """
    spans: list[tuple[int, int]] = []
    for label in _SECRET_LABELS_RE.finditer(text):
        label_end = label.end()
        window = text[label_end : label_end + 256]
        for tok in _ADJACENT_TOKEN_RE.finditer(window):
            value = tok.group(0)
            start = label_end + tok.start()
            # Skip the inside of an existing "[REDACTED...]" placeholder.
            # "[" is not a token character, so it has to be looked up in
            # the text just before the token.
            if start > 0 and text[start - 1] == "[" and value.startswith("REDACTED"):
                continue
            if _shannon_entropy(value) >= 4.0:
                spans.append((start, start + len(value)))
                # Mask only the first qualifying token per label.
                break
    if not spans:
        return text

    # Merge duplicates/overlaps so each region is replaced exactly once.
    spans.sort()
    merged: list[tuple[int, int]] = []
    for start, end in spans:
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))

    # Rebuild left to right from slices of the untouched original.
    pieces: list[str] = []
    cursor = 0
    for start, end in merged:
        pieces.append(text[cursor:start])
        pieces.append("[REDACTED_KEY]")
        cursor = end
    pieces.append(text[cursor:])
    return "".join(pieces)
346
+
347
+
348
+ # ---------------------------------------------------------------------------
349
+ # E. PII rules
350
+ # ---------------------------------------------------------------------------
351
+
352
# IPv4 addresses. The negative lookaheads exempt loopback (127.x), the
# RFC1918 private ranges, link-local (169.254.x) and the literal 0.0.0.0.
_IPV4_RE = re.compile(
    "".join(
        (
            r"\b",
            r"(?!127\.)",
            r"(?!10\.)",
            r"(?!192\.168\.)",
            r"(?!172\.(?:1[6-9]|2\d|3[01])\.)",
            r"(?!169\.254\.)",
            r"(?!0\.0\.0\.0\b)",
            r"(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)",
            r"(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}",
            r"\b",
        )
    )
)

# IPv6-looking runs: colon-separated hex groups, a trailing "::", or a
# leading "::" that follows a word character.
_IPV6_RE = re.compile(
    "|".join(
        (
            r"\b(?:[0-9A-Fa-f]{1,4}:){2,7}[0-9A-Fa-f]{0,4}\b",
            r"\b(?:[0-9A-Fa-f]{1,4}:){1,7}:",
            r"\b::(?:[0-9A-Fa-f]{1,4}:){0,6}[0-9A-Fa-f]{1,4}\b",
        )
    )
)

# MAC addresses: six hex pairs separated by ":" or "-".
_MAC_RE = re.compile(r"\b(?:[0-9A-Fa-f]{2}[:\-]){5}[0-9A-Fa-f]{2}\b")

# E-mail addresses (ASCII local part and domain, 2..24 letter TLD).
_EMAIL_RE = re.compile(
    r"[A-Za-z0-9._%+\-]{1,64}@[A-Za-z0-9.\-]{1,253}\.[A-Za-z]{2,24}"
)

# Shell prompt: user@host:path followed by "$" or "#". \w supplies unicode
# letters/digits (and "_"); "." and "-" are added explicitly.
_SHELL_PROMPT_RE = re.compile(
    r"\b[\w.\-]{1,64}@[\w.\-]{1,253}:[^\s$#]{0,256}[$#]"
)
383
+
384
+
385
def _mask_ipv6(match: "re.Match[str]") -> str:
    """Replacement callback for ``_IPV6_RE``.

    The matched text is returned untouched when it has fewer than two
    colons or is exactly the loopback ``::1``; anything else becomes
    ``[ipv6]``.
    """
    candidate = match.group(0)
    keep_as_is = candidate.count(":") < 2 or candidate == "::1"
    return candidate if keep_as_is else "[ipv6]"
393
+
394
+
395
def _apply_pii_rules(text: str) -> str:
    """Mask shell prompts, e-mails, MAC addresses and IP addresses.

    The order is significant:

    * shell prompts run before e-mail so ``user@host`` is not consumed by
      the e-mail pattern;
    * MAC runs before IPv6 because the IPv6 pattern would otherwise eat
      MAC sequences.
    """
    steps = (
        (_SHELL_PROMPT_RE, "[shell-prompt]"),
        (_EMAIL_RE, "[email]"),
        (_MAC_RE, "[mac]"),
        (_IPV6_RE, _mask_ipv6),
        (_IPV4_RE, "[ip_address]"),
    )
    masked = text
    for regex, replacement in steps:
        masked = regex.sub(replacement, masked)
    return masked
405
+
406
+
407
+ # ---------------------------------------------------------------------------
408
+ # F. Username back-references
409
+ # ---------------------------------------------------------------------------
410
+
411
# Home-directory shapes whose first capture group is the username segment
# (2..64 characters from USER_SEG_CHARS).
_USERNAME_EXTRACT_PATTERNS: list[re.Pattern[str]] = [
    re.compile(r"/[Uu][Ss][Ee][Rr][Ss]/([" + USER_SEG_CHARS + r"]{2,64})"),
    re.compile(r"/home/([" + USER_SEG_CHARS + r"]{2,64})"),
    re.compile(
        r"[A-Za-z]:\\[Uu][Ss][Ee][Rr][Ss]\\([" + USER_SEG_CHARS + r"]{2,64})"
    ),
    re.compile(
        r"/mnt/[a-zA-Z]/[Uu][Ss][Ee][Rr][Ss]/([" + USER_SEG_CHARS + r"]{2,64})"
    ),
]

# Path segments that must never be treated as a personal username.
_RESERVED_USERNAMES = frozenset(
    {
        "[user]",
        "[path]",
        "Public",
        "Shared",
        "Default",
        "All Users",
        "root",
        "admin",
    }
)

# Case-folded view of the reserved names. The path patterns above accept
# any casing of "Users", so ``c:\users\default`` must be recognised as
# reserved just like ``C:\Users\Default``.
_RESERVED_USERNAMES_FOLDED = frozenset(
    name.casefold() for name in _RESERVED_USERNAMES
)


def _extract_usernames(text: str) -> set[str]:
    """Collect candidate usernames from home-directory paths in ``text``.

    A candidate is the path segment right after ``/Users/``, ``/home/``,
    ``X:\\Users\\`` or ``/mnt/<drive>/Users/``. Trailing dots are stripped
    so a path that ends a sentence (``... in /Users/alice.``) still yields
    ``alice``. Candidates shorter than 2 or longer than 64 characters are
    dropped, as are reserved names (compared case-insensitively so that
    common words such as "default" or "public" are not later masked as
    usernames throughout the text).

    Returns:
        The set of extracted usernames (possibly empty).
    """
    found: set[str] = set()
    for pattern in _USERNAME_EXTRACT_PATTERNS:
        for match in pattern.finditer(text):
            name = match.group(1).rstrip(".")
            if not 2 <= len(name) <= 64:
                continue
            if name.casefold() in _RESERVED_USERNAMES_FOLDED:
                continue
            found.add(name)
    return found
449
+
450
+
451
# Placeholders emitted by this module's masking passes. Username masking
# must never rewrite text inside them: a user literally called "user"
# (``/home/user``) would otherwise turn ``[user-home]`` into
# ``[[user]-home]``. The single capture group makes ``split`` return the
# placeholders at the odd indices.
_PLACEHOLDER_SPLIT_RE = re.compile(
    r"(\[(?:project|user-home|user|path|server|share|drive|volume|mount|"
    r"hostname|shell-prompt|email|mac|ipv6|ip_address|"
    r"REDACTED(?:_[A-Z_]+)?)\])"
)


def _sub_outside_placeholders(pattern: "re.Pattern[str]", repl: str, text: str) -> str:
    """Apply ``pattern.sub(repl, ...)`` to the parts of ``text`` between placeholders."""
    pieces = _PLACEHOLDER_SPLIT_RE.split(text)
    # Even indices are ordinary text; odd indices are placeholders.
    pieces[::2] = [pattern.sub(repl, piece) for piece in pieces[::2]]
    return "".join(pieces)


def _mask_bare_usernames(text: str, usernames: set[str]) -> str:
    """Mask standalone mentions of known usernames in ``text``.

    For each username of at least three characters, two passes run:

    1. hostname-style mentions (``alice-macbook``, ``alice.dev`` ...),
       matched case-insensitively, become ``[hostname]``;
    2. bare mentions delimited by non-word characters become ``[user]``.

    Usernames are processed longest-first in a deterministic order, so a
    name that contains a shorter known name (``alice.smith`` vs ``alice``)
    is masked as a whole instead of ending up as ``[user].smith`` depending
    on set iteration order. Existing placeholders are never rewritten.

    Args:
        text: Already-masked text to post-process.
        usernames: Usernames gathered from home-directory paths.

    Returns:
        ``text`` with the matching mentions replaced.
    """
    result = text
    candidates = sorted(
        (name for name in usernames if len(name) >= 3),
        key=lambda name: (-len(name), name),
    )
    for username in candidates:
        escaped = re.escape(username)
        # Hostname-style: alice-macbook, alice.dev
        hostname_re = re.compile(
            r"\b" + escaped
            + r"s?[-.](?:macbook|laptop|desktop|pc|dev|server|local)[\w.\-]{0,64}",
            re.IGNORECASE,
        )
        result = _sub_outside_placeholders(hostname_re, "[hostname]", result)
        # Bare standalone reference. JS \p{L}\p{N}_ boundaries are
        # approximated with Python's \w (unicode letters + digits + "_").
        bare_re = re.compile(r"(^|[^\w])" + escaped + r"(?=$|[^\w])")
        result = _sub_outside_placeholders(bare_re, r"\1[user]", result)
    return result
474
+
475
+
476
+ # ---------------------------------------------------------------------------
477
+ # Public API
478
+ # ---------------------------------------------------------------------------
479
+
480
+
481
def mask(
    text: str,
    cwds: list[str] | None = None,
    home: str | None = None,
) -> str:
    """Mask sensitive data in a single string.

    Args:
        text: The string to mask. Non-string or empty values are returned
            unchanged.
        cwds: Optional working directories to rewrite to ``[project]``.
            Provide this for Claude Code session content so the actual
            project root becomes a stable placeholder regardless of where
            the user cloned the repo.
        home: Optional explicit home-directory path to rewrite to
            ``[user-home]`` early. Surrounding whitespace and trailing
            ``/`` are ignored; a value that is empty after that (``"/"``,
            ``"//"``) is skipped, because an empty prefix would match at
            every boundary in the text. When omitted, home directories are
            still collapsed by the pattern-based path rules.

    Returns:
        The masked string.
    """
    if not isinstance(text, str) or not text:
        return text

    # Normalise once; the same value feeds the pipeline and, at the end,
    # the username extraction.
    normalized = sanitize.normalize(text)

    # Shell prompts first (see TS: before path rules so the prompt "$"
    # doesn't get consumed).
    out = _SHELL_PROMPT_RE.sub("[shell-prompt]", normalized)

    # A. cwd -> [project]
    if cwds:
        out = rewrite_project(out, cwds)

    # A'. explicit home rewrite (optional). Done after cwd so cwd wins
    # when it is a subpath of home.
    if isinstance(home, str):
        home_clean = home.strip().rstrip("/")
        if home_clean:
            out = re.sub(
                re.escape(home_clean) + _PROJECT_BOUNDARY,
                "[user-home]",
                out,
                flags=re.IGNORECASE,
            )

    # B. generic path rules
    out = _apply_path_rules(out)

    # B'. collapse residual /Users/[user], C:\Users\[user], etc. to the
    # shorter [user-home] placeholder.
    out = _collapse_home(out)

    # C. secrets
    out = _apply_secret_rules(out)

    # D. entropy fallback
    out = _apply_entropy_rule(out)

    # E. PII
    out = _apply_pii_rules(out)

    # F. username back-references. Extracted from the normalised ORIGINAL,
    # because the path rules have already erased them from ``out``.
    usernames = _extract_usernames(normalized)
    if usernames:
        out = _mask_bare_usernames(out, usernames)

    return out
545
+
546
+
547
def _mask_value_deep(value: Any, usernames: set[str], cwds: list[str] | None, home: str | None) -> Any:
    """Recursively mask every string inside a JSON-like value.

    Dicts keep their keys and get masked values; lists are masked element
    by element; strings go through ``mask`` and then the bare-username pass
    with the pre-gathered ``usernames``. Any other value is returned as is.
    """
    if isinstance(value, dict):
        return {
            key: _mask_value_deep(item, usernames, cwds, home)
            for key, item in value.items()
        }
    if isinstance(value, list):
        return [_mask_value_deep(item, usernames, cwds, home) for item in value]
    if not isinstance(value, str):
        return value
    masked = mask(value, cwds=cwds, home=home)
    return _mask_bare_usernames(masked, usernames) if usernames else masked
558
+
559
+
560
def _collect_block_strings(block: dict) -> list[str]:
    """Return every text-bearing string in a content block.

    The result feeds the first pass of the two-pass username scan. For
    each block type the relevant fields are visited recursively, so nested
    lists and dicts (for example a ``tool_result`` whose ``content`` is a
    list of sub-blocks) contribute their strings too. Both ``content`` and
    ``text`` are scanned for text-like blocks. Missing fields, ``None`` and
    non-string scalars contribute nothing; unknown block types yield an
    empty list.
    """
    fields_by_type: dict[str, tuple[str, ...]] = {
        "text": ("content", "text"),
        "thinking": ("content", "text"),
        "tool_result": ("content", "text"),
        "code": ("content", "filename"),
        "file": ("content", "filename"),
        "error": ("message",),
        "artifact": ("content", "title", "identifier"),
        "tool_use": ("input",),
    }
    parts: list[str] = []

    def visit(value: Any) -> None:
        if isinstance(value, str):
            parts.append(value)
        elif isinstance(value, list):
            for item in value:
                visit(item)
        elif isinstance(value, dict):
            for item in value.values():
                visit(item)

    btype = block.get("type")
    # Guard the lookup: an unhashable "type" must not raise here.
    fields = fields_by_type.get(btype, ()) if isinstance(btype, str) else ()
    for field in fields:
        visit(block.get(field))
    return parts
596
+
597
+
598
def mask_block_content(
    block: dict,
    usernames: set[str],
    *,
    cwds: list[str] | None = None,
    home: str | None = None,
) -> dict:
    """Mask a single content block and return the masked copy.

    ``usernames`` is the pre-gathered set from the two-pass scan, so bare
    references are caught even when the revealing path appeared in a
    different block.

    Field values are masked recursively: a string is masked directly, and a
    list or dict (for example a ``tool_result`` whose ``content`` is a list
    of sub-blocks) has every nested string masked instead of being passed
    through untouched. For text-like blocks the ``text`` field is masked as
    well when present, matching the fields ``_collect_block_strings`` scans.

    Args:
        block: The content block; its ``type`` selects the fields to mask.
        usernames: Usernames to mask as bare references.
        cwds: Working directories forwarded to ``mask``.
        home: Home directory forwarded to ``mask``.

    Returns:
        A new dict for known block types; the original ``block`` object for
        image, math and unknown types.
    """
    btype = block.get("type")

    def _mask_field(value: Any) -> Any:
        return _mask_value_deep(value, usernames, cwds, home)

    if btype in ("text", "thinking", "tool_result"):
        out = {**block, "content": _mask_field(block.get("content", ""))}
        if "text" in block:
            out["text"] = _mask_field(block["text"])
        return out
    if btype == "code":
        out = {**block, "content": _mask_field(block.get("content", ""))}
        if block.get("filename"):
            out["filename"] = _mask_field(block["filename"])
        return out
    if btype == "file":
        return {
            **block,
            "content": _mask_field(block.get("content", "")),
            "filename": _mask_field(block.get("filename", "")),
        }
    if btype == "error":
        return {**block, "message": _mask_field(block.get("message", ""))}
    if btype == "artifact":
        # NOTE(review): "identifier" is scanned for usernames but is not
        # masked here, as before -- confirm this is intentional.
        return {
            **block,
            "content": _mask_field(block.get("content", "")),
            "title": _mask_field(block.get("title", "")),
        }
    if btype == "tool_use":
        return {**block, "input": _mask_field(block.get("input"))}
    # image, math, unknown: pass through.
    return block
646
+
647
+
648
def mask_thread_blocks(
    blocks: list[dict],
    *,
    cwds: list[str] | None = None,
    home: str | None = None,
) -> list[dict]:
    """Mask a list of Claude Code content blocks in two passes.

    The first pass joins every text-bearing field of every block and
    extracts usernames from the normalised result. The second pass masks
    each block with that shared set, so a bare reference in one block is
    masked even when the path that revealed the name sits in another.
    """
    gathered = [
        piece for block in blocks for piece in _collect_block_strings(block)
    ]
    usernames = _extract_usernames(sanitize.normalize("\n".join(gathered)))

    masked_blocks: list[dict] = []
    for block in blocks:
        masked_blocks.append(
            mask_block_content(block, usernames, cwds=cwds, home=home)
        )
    return masked_blocks