traceredact 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,43 @@
1
+ """traceredact — redact PII & secrets from AI prompts, traces and tool-call args.
2
+
3
+ Quick start::
4
+
5
+ from traceredact import redact
6
+
7
+ result = redact({"args": {"email": "a@b.com", "key": "sk-abc123..."}})
8
+ result.value # -> {"args": {"email": "[REDACTED:pii]", "key": "[REDACTED:secret]"}}
9
+ result.findings # -> [Finding(json_path="args.email", ...), ...]
10
+ result.has_findings # -> True
11
+
12
+ Deterministic, no data retained. Configure via a :class:`Policy` (in code or a
13
+ ``traceredact.yml`` file).
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from typing import Any
19
+
20
+ from traceredact.detectors.base import Finding
21
+ from traceredact.engine import Engine, RedactionResult
22
+ from traceredact.policy import CustomPattern, Policy
23
+
24
+ __version__ = "0.1.0"
25
+
26
+ __all__ = [
27
+ "redact",
28
+ "Engine",
29
+ "RedactionResult",
30
+ "Finding",
31
+ "Policy",
32
+ "CustomPattern",
33
+ "__version__",
34
+ ]
35
+
36
+
37
def redact(value: Any, policy: Policy | None = None) -> RedactionResult:
    """One-shot helper: redact ``value`` and hand back the engine's result.

    ``value`` may be a str, dict, list, or any nested mix of those. When
    ``policy`` is omitted a sensible built-in policy is used. The input is
    never mutated; ``result.value`` is a redacted copy.
    """
    # A fresh engine per call keeps this helper stateless for callers.
    engine = Engine(policy)
    return engine.redact(value)
traceredact/cli.py ADDED
@@ -0,0 +1,172 @@
1
+ """``traceredact`` command-line interface.
2
+
3
+ Two commands, both CI-gateable (non-zero exit when findings exist):
4
+
5
+ * ``traceredact scan <path>`` — report findings, do not write anything.
6
+ * ``traceredact redact <file>`` — print/emit the redacted content.
7
+
8
+ ``--format pretty|json`` controls output. JSON is stable for piping into CI.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import json
14
+ from enum import StrEnum
15
+ from pathlib import Path
16
+ from typing import Any
17
+
18
+ import typer
19
+
20
+ from traceredact import __version__
21
+ from traceredact.engine import Engine, RedactionResult
22
+ from traceredact.policy import Policy
23
+
24
# Root Typer application. The ``scan``, ``redact`` and ``version`` functions
# below attach themselves to it through ``@app.command()``. Shell-completion
# installation options are disabled, and invoking the program with no
# arguments prints the help text.
app = typer.Typer(
    add_completion=False,
    help="Redact PII & secrets from AI prompts, traces and tool-call arguments.",
    no_args_is_help=True,
)
29
+
30
+
31
+ class OutputFormat(StrEnum):
32
+ pretty = "pretty"
33
+ json = "json"
34
+
35
+
36
def _load_policy(policy_path: Path | None) -> Policy:
    """Resolve the policy for one CLI invocation.

    An explicit ``policy_path`` always wins. Without one, a
    ``traceredact.yml`` in the current working directory is loaded when it
    exists; otherwise ``Policy.default()`` is returned.
    """
    if policy_path is None:
        # Auto-discovery: relative path, so it resolves against the cwd.
        discovered = Path("traceredact.yml")
        if not discovered.exists():
            return Policy.default()
        policy_path = discovered
    return Policy.load(policy_path)
42
+
43
+
44
def _parse_content(text: str) -> Any:
    """Parse JSON if it looks like JSON, else treat as raw text.

    Only documents whose first non-whitespace character is ``{`` or ``[`` are
    handed to the JSON parser; JSON scalars and everything else are returned
    unchanged as text. If parsing fails the *original* ``text`` (not the
    stripped copy) is returned so no content is lost.
    """
    stripped = text.strip()
    # ``startswith`` with a tuple is an exact prefix test. A substring test
    # such as ``stripped[:1] in "{["`` is also true for the empty string and
    # would send empty input through the JSON parser for no reason.
    if not stripped.startswith(("{", "[")):
        return text
    try:
        return json.loads(stripped)
    except json.JSONDecodeError:
        return text
53
+
54
+
55
def _iter_files(path: Path) -> list[Path]:
    """Expand ``path`` into the list of files to process.

    A directory is walked recursively and its regular files are returned in
    sorted order; anything else is returned as a single-element list.
    """
    if not path.is_dir():
        return [path]
    files = [entry for entry in path.rglob("*") if entry.is_file()]
    files.sort()
    return files
59
+
60
+
61
def _render_pretty(per_file: list[tuple[Path, RedactionResult]]) -> int:
    """Print a human-readable findings table per file and return the hit count.

    Files without findings are not listed. A summary line (red when there are
    findings, green otherwise) is printed last. The returned total is the
    number of findings across all files.
    """
    total = 0
    for source, outcome in per_file:
        hits = outcome.findings
        if not hits:
            continue
        typer.secho(f"\n{source}", fg=typer.colors.CYAN, bold=True)
        typer.echo(f" {'DETECTOR':<26} {'CONF':>5} {'PATH':<24} PREVIEW")
        for hit in hits:
            # Findings on a bare string have no JSON path.
            location = hit.json_path or "(root)"
            typer.echo(
                f" {hit.detector_id:<26} {hit.confidence:>5.2f} {location:<24} {hit.preview}"
            )
            total += 1

    if not total:
        typer.secho("No findings.", fg=typer.colors.GREEN)
        return total

    summary = f"\n{total} finding(s) across {len(per_file)} file(s)."
    typer.secho(summary, fg=typer.colors.RED, bold=True)
    return total
81
+
82
+
83
def _render_json(per_file: list[tuple[Path, RedactionResult]]) -> int:
    """Emit all findings as one JSON document and return the hit count.

    The document has the shape ``{"total": int, "files": [{"file": str,
    "findings": [...]}, ...]}``. Every scanned file gets an entry, including
    files with no findings.
    """
    files = []
    for source, outcome in per_file:
        entries = []
        for hit in outcome.findings:
            entries.append(
                {
                    "detector_id": hit.detector_id,
                    "category": hit.category,
                    "confidence": hit.confidence,
                    "json_path": hit.json_path,
                    "span": list(hit.span),
                    "preview": hit.preview,
                    "replacement": hit.replacement,
                }
            )
        files.append({"file": str(source), "findings": entries})

    total = sum(len(item["findings"]) for item in files)
    document = {"total": total, "files": files}
    typer.echo(json.dumps(document, indent=2))
    return total
103
+
104
+
105
@app.command()
def scan(
    path: Path = typer.Argument(..., exists=True, help="File or directory to scan."),
    fmt: OutputFormat = typer.Option(OutputFormat.pretty, "--format", "-f"),
    policy_path: Path | None = typer.Option(None, "--policy", "-p", help="traceredact.yml path."),
) -> None:
    """Scan a file or directory and report findings. Exits non-zero if any."""
    # NOTE: the docstring above doubles as the command's help text, so
    # implementation notes live in comments instead.
    engine = Engine(_load_policy(policy_path))
    per_file: list[tuple[Path, RedactionResult]] = []
    for fpath in _iter_files(path):
        try:
            # Decode as UTF-8 explicitly so results do not depend on the
            # platform's locale encoding; undecodable bytes are replaced so
            # binary-ish files never abort the scan.
            text = fpath.read_text(encoding="utf-8", errors="replace")
        except OSError as exc:
            # Best effort: keep scanning, but tell the user on stderr so the
            # skip is visible while stdout (e.g. JSON) stays clean.
            typer.secho(
                f"warning: skipping {fpath}: {exc}",
                fg=typer.colors.YELLOW,
                err=True,
            )
            continue
        per_file.append((fpath, engine.redact(_parse_content(text))))

    total = _render_json(per_file) if fmt is OutputFormat.json else _render_pretty(per_file)
    # Exit status 1 signals "findings exist" so the command can gate CI.
    raise typer.Exit(code=1 if total else 0)
124
+
125
+
126
@app.command()
def redact(
    file: Path = typer.Argument(..., exists=True, dir_okay=False, help="File to redact."),
    fmt: OutputFormat = typer.Option(OutputFormat.pretty, "--format", "-f"),
    policy_path: Path | None = typer.Option(None, "--policy", "-p", help="traceredact.yml path."),
    output: Path | None = typer.Option(None, "--output", "-o", help="Write redacted content here."),
) -> None:
    """Redact a file's content and print (or write) the result. Non-zero exit if findings."""
    # NOTE: the docstring above doubles as the command's help text, so
    # implementation notes live in comments instead.
    # ``dir_okay=False`` rejects directories at argument-parsing time rather
    # than failing later inside ``read_text``.
    engine = Engine(_load_policy(policy_path))
    # Explicit UTF-8 keeps behaviour independent of the platform locale.
    content = _parse_content(file.read_text(encoding="utf-8", errors="replace"))
    result = engine.redact(content)

    if fmt is OutputFormat.json:
        # JSON mode wraps the redacted value together with a findings count.
        rendered = json.dumps(
            {"value": result.value, "findings_count": len(result.findings)},
            indent=2,
            default=str,
        )
    elif isinstance(result.value, str):
        # Raw text input: emit the redacted text as-is.
        rendered = result.value
    else:
        # Structured input in pretty mode: emit the redacted document as JSON.
        rendered = json.dumps(result.value, indent=2, default=str)

    if output is not None:
        output.write_text(rendered, encoding="utf-8")
        typer.secho(
            f"Wrote redacted content to {output} ({len(result.findings)} finding(s)).",
            fg=typer.colors.GREEN,
        )
    else:
        typer.echo(rendered)

    # Exit status 1 signals "findings exist" so the command can gate CI.
    raise typer.Exit(code=1 if result.has_findings else 0)
159
+
160
+
161
# Registered as the ``version`` subcommand; echoes the ``__version__`` string
# imported from the ``traceredact`` package.
@app.command()
def version() -> None:
    """Print the version."""
    typer.echo(__version__)
165
+
166
+
167
def main() -> None:  # pragma: no cover - console-script shim
    """Entry point for the installed console script; runs the Typer app."""
    app()


if __name__ == "__main__":  # pragma: no cover
    # Go through the same shim as the console script so both entry paths
    # behave identically.
    main()
@@ -0,0 +1,17 @@
1
+ """Detector package: base protocol, secret and PII detectors."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from traceredact.detectors.base import Detector, Finding, RuleDetector
6
+ from traceredact.detectors.pii import pii_detectors
7
+ from traceredact.detectors.secrets import EntropyDetector, secret_detectors, shannon_entropy
8
+
9
+ __all__ = [
10
+ "Detector",
11
+ "Finding",
12
+ "RuleDetector",
13
+ "EntropyDetector",
14
+ "shannon_entropy",
15
+ "secret_detectors",
16
+ "pii_detectors",
17
+ ]
@@ -0,0 +1,109 @@
1
+ """Base detector protocol and the core Finding model.
2
+
3
+ A detector inspects a single *string* and yields zero or more :class:`Finding`
4
+ objects describing spans that should be redacted. Detectors never mutate input
5
+ and never retain data — the engine owns replacement and assembly.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from abc import ABC, abstractmethod
11
+ from collections.abc import Iterable, Iterator
12
+
13
+ from pydantic import BaseModel, Field
14
+
15
+ from traceredact.rules import PatternRule
16
+
17
+
18
class Finding(BaseModel):
    """One redactable hit produced by a detector.

    ``span`` is a half-open ``(start, end)`` index pair into the *string* that
    was scanned. ``json_path`` is filled in by the structured walker and
    records where in a nested document that string lived (e.g. ``args.email``
    or ``messages[0].content``). ``matched`` carries the raw matched text so
    the engine can build a replacement and apply allowlists; callers that must
    not handle the secret itself should use :attr:`preview` instead.
    """

    # Instances are immutable once created.
    model_config = {"frozen": True}

    detector_id: str
    category: str
    confidence: float = Field(ge=0.0, le=1.0)
    span: tuple[int, int]
    matched: str
    replacement: str
    json_path: str | None = None

    @property
    def preview(self) -> str:
        """A short, non-reversible preview safe to print in logs/tables.

        Matches of up to 8 characters are masked completely; longer ones keep
        their first and last two characters with the middle starred out.
        """
        raw = self.matched
        length = len(raw)
        if length > 8:
            masked_middle = "*" * (length - 4)
            return raw[:2] + masked_middle + raw[-2:]
        return "*" * length
46
+
47
+
48
class Detector(ABC):
    """Abstract parent of every detector.

    A concrete detector only implements :meth:`scan` for a single string.
    Allowlisting, replacement, overlap resolution and structured traversal are
    the engine's responsibility, which keeps detectors small and data-driven.
    """

    #: Stable identifier, e.g. ``"secrets.openai_key"``.
    detector_id: str
    #: Category bucket used in placeholders, e.g. ``"secret"`` / ``"pii"``.
    category: str

    @abstractmethod
    def scan(self, text: str) -> Iterable[Finding]:
        """Produce the findings for ``text`` without mutating or retaining it."""
        raise NotImplementedError

    def __iter__(self) -> Iterator[Detector]:
        # Convenience for flat iteration: a detector iterates as just itself.
        yield from (self,)
68
+
69
+
70
class RuleDetector(Detector):
    """Applies a collection of :class:`~traceredact.rules.PatternRule` to a string.

    For each rule the cheap literal prefilter runs first, then the (bounded)
    regex, then the rule's optional validator. The reported span comes from
    ``rule.group``, so rules such as the env-assignment pattern redact only
    the *value* and leave the key alone.
    """

    detector_id = "rules"
    category = "mixed"

    def __init__(self, rules: Iterable[PatternRule]) -> None:
        # Materialise once so the rule set can be iterated on every scan.
        self.rules = tuple(rules)

    def scan(self, text: str) -> Iterable[Finding]:
        """Yield a :class:`Finding` for every validated rule match in ``text``."""
        # The prefilter is given a lower-cased copy of the text.
        haystack = text.lower()
        for rule in self.rules:
            if not rule.prefilter_hit(haystack):
                continue
            for match in rule.regex.finditer(text):
                begin, stop = match.span(rule.group)
                # A negative start means the optional group took no part in
                # the match; an empty capture is equally uninteresting.
                captured = match.group(rule.group) if begin >= 0 else None
                if not captured:
                    continue
                score = rule.confidence
                if rule.validator is not None:
                    # The validator may veto the match (None) or rescore it.
                    score = rule.validator(captured)
                    if score is None:
                        continue
                yield Finding(
                    detector_id=rule.id,
                    category=rule.category,
                    confidence=score,
                    span=(begin, stop),
                    matched=captured,
                    replacement="",  # filled in by the engine from policy
                )
@@ -0,0 +1,15 @@
1
+ """PII detectors: email, phone, credit card (Luhn), IBAN (mod-97), IP.
2
+
3
+ All validation lives in :mod:`traceredact.rules` so the detector here is just a
4
+ thin wrapper over the PII rule set — keeping detection data-driven.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from traceredact.detectors.base import Detector, RuleDetector
10
+ from traceredact.rules import PII_RULES
11
+
12
+
13
def pii_detectors() -> list[Detector]:
    """Build the default PII detector stack: one rule detector over ``PII_RULES``."""
    rule_detector = RuleDetector(PII_RULES)
    return [rule_detector]
@@ -0,0 +1,72 @@
1
+ """Secret detectors: pattern rules + a Shannon-entropy detector.
2
+
3
+ The entropy detector catches *unstructured* high-entropy tokens (random API
4
+ keys, base64 blobs) that no named pattern covers. It is deliberately
5
+ conservative — long tokens only, with a tunable threshold — because entropy is
6
+ a fuzzy signal and we'd rather a pattern rule own a known secret class.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import math
12
+ import re
13
+ from collections.abc import Iterable
14
+
15
+ from traceredact.detectors.base import Detector, Finding, RuleDetector
16
+ from traceredact.rules import SECRET_RULES
17
+
18
# Candidate tokens for entropy scoring: runs of base64/hex characters (optional
# trailing '=' padding). We deliberately exclude '=', '_' and '-' as *internal*
# connectors so an identifier like ``API_KEY=...`` splits into "API_KEY" and the
# value, instead of being scored as one merged token. Named/base64url secrets
# are owned by the pattern rules; entropy is the catch-all fallback.
# NOTE(review): the quantifier caps one match at 512 characters, so a longer
# run is matched in successive chunks and a trailing remainder shorter than 16
# characters is not matched at all — confirm this is acceptable.
_TOKEN_RE = re.compile(r"[A-Za-z0-9+/]{16,512}={0,2}")
24
+
25
+
26
def shannon_entropy(value: str) -> float:
    """Return the Shannon entropy of ``value`` in bits per character.

    The empty string has entropy ``0.0`` by definition here.
    """
    if not value:
        return 0.0
    length = len(value)
    # Seed every distinct character (in first-seen order) with a zero count,
    # then tally occurrences.
    tally = dict.fromkeys(value, 0)
    for symbol in value:
        tally[symbol] += 1
    probabilities = (count / length for count in tally.values())
    return -sum(p * math.log2(p) for p in probabilities)
35
+
36
+
37
class EntropyDetector(Detector):
    """Reports long tokens whose Shannon entropy reaches a threshold as likely secrets."""

    detector_id = "secrets.high_entropy"
    category = "secret"

    def __init__(self, threshold: float = 4.0, min_len: int = 20) -> None:
        # Minimum bits-per-character a token needs before it is reported.
        self.threshold = threshold
        # Tokens shorter than this are ignored.
        self.min_len = min_len

    def scan(self, text: str) -> Iterable[Finding]:
        """Yield a finding for each candidate token that is long and random enough."""
        for match in _TOKEN_RE.finditer(text):
            candidate = match.group(0)
            # Length is checked first so entropy is only computed when needed.
            if (
                len(candidate) < self.min_len
                or (bits := shannon_entropy(candidate)) < self.threshold
            ):
                continue
            # Map entropy onto a confidence in [0.5, 0.95].
            surplus = (bits - self.threshold) / 4.0
            score = min(0.95, 0.5 + surplus)
            yield Finding(
                detector_id=self.detector_id,
                category=self.category,
                confidence=round(score, 3),
                span=match.span(0),
                matched=candidate,
                replacement="",
            )
65
+
66
+
67
def secret_detectors(entropy_threshold: float = 4.0, min_entropy_len: int = 20) -> list[Detector]:
    """Build the default secret detector stack.

    The stack is the pattern-rule detector over ``SECRET_RULES`` followed by
    an :class:`EntropyDetector` configured with the given threshold and
    minimum token length.
    """
    pattern_detector = RuleDetector(SECRET_RULES)
    entropy_detector = EntropyDetector(threshold=entropy_threshold, min_len=min_entropy_len)
    return [pattern_detector, entropy_detector]