traceredact 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- traceredact/__init__.py +43 -0
- traceredact/cli.py +172 -0
- traceredact/detectors/__init__.py +17 -0
- traceredact/detectors/base.py +109 -0
- traceredact/detectors/pii.py +15 -0
- traceredact/detectors/secrets.py +72 -0
- traceredact/engine.py +362 -0
- traceredact/integrations/__init__.py +5 -0
- traceredact/integrations/anthropic.py +46 -0
- traceredact/integrations/langchain.py +46 -0
- traceredact/integrations/openai.py +60 -0
- traceredact/policy.py +116 -0
- traceredact/py.typed +0 -0
- traceredact/rules.py +285 -0
- traceredact-0.1.0.dist-info/METADATA +140 -0
- traceredact-0.1.0.dist-info/RECORD +19 -0
- traceredact-0.1.0.dist-info/WHEEL +4 -0
- traceredact-0.1.0.dist-info/entry_points.txt +2 -0
- traceredact-0.1.0.dist-info/licenses/LICENSE +166 -0
traceredact/__init__.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""traceredact — redact PII & secrets from AI prompts, traces and tool-call args.
|
|
2
|
+
|
|
3
|
+
Quick start::
|
|
4
|
+
|
|
5
|
+
from traceredact import redact
|
|
6
|
+
|
|
7
|
+
result = redact({"args": {"email": "a@b.com", "key": "sk-abc123..."}})
|
|
8
|
+
result.value # -> {"args": {"email": "[REDACTED:pii]", "key": "[REDACTED:secret]"}}
|
|
9
|
+
result.findings # -> [Finding(json_path="args.email", ...), ...]
|
|
10
|
+
result.has_findings # -> True
|
|
11
|
+
|
|
12
|
+
Deterministic, no data retained. Configure via a :class:`Policy` (in code or a
|
|
13
|
+
``traceredact.yml`` file).
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
from traceredact.detectors.base import Finding
|
|
21
|
+
from traceredact.engine import Engine, RedactionResult
|
|
22
|
+
from traceredact.policy import CustomPattern, Policy
|
|
23
|
+
|
|
24
|
+
__version__ = "0.1.0"
|
|
25
|
+
|
|
26
|
+
__all__ = [
|
|
27
|
+
"redact",
|
|
28
|
+
"Engine",
|
|
29
|
+
"RedactionResult",
|
|
30
|
+
"Finding",
|
|
31
|
+
"Policy",
|
|
32
|
+
"CustomPattern",
|
|
33
|
+
"__version__",
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def redact(value: Any, policy: Policy | None = None) -> RedactionResult:
    """Run a one-off redaction over ``value`` and return the engine's result.

    ``value`` may be a str, dict, list, or a nested mix of those.  A fresh
    :class:`Engine` is built from ``policy`` on every call; passing ``None``
    hands the choice of policy to the engine.  The caller's input is not
    modified: ``result.value`` is a redacted copy.
    """
    engine = Engine(policy)
    return engine.redact(value)
|
traceredact/cli.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""``traceredact`` command-line interface.
|
|
2
|
+
|
|
3
|
+
Two commands, both CI-gateable (non-zero exit when findings exist):
|
|
4
|
+
|
|
5
|
+
* ``traceredact scan <path>`` — report findings, do not write anything.
|
|
6
|
+
* ``traceredact redact <file>`` — print/emit the redacted content.
|
|
7
|
+
|
|
8
|
+
``--format pretty|json`` controls output. JSON is stable for piping into CI.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import json
|
|
14
|
+
from enum import StrEnum
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
import typer
|
|
19
|
+
|
|
20
|
+
from traceredact import __version__
|
|
21
|
+
from traceredact.engine import Engine, RedactionResult
|
|
22
|
+
from traceredact.policy import Policy
|
|
23
|
+
|
|
24
|
+
# Root Typer application; the commands in this module register on it through
# ``@app.command()``.  Shell-completion options are switched off and invoking
# the program with no arguments prints the help text.
app = typer.Typer(
    add_completion=False,
    help="Redact PII & secrets from AI prompts, traces and tool-call arguments.",
    no_args_is_help=True,
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class OutputFormat(StrEnum):
|
|
32
|
+
pretty = "pretty"
|
|
33
|
+
json = "json"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _load_policy(policy_path: Path | None) -> Policy:
    """Resolve the policy to use for a CLI invocation.

    An explicit ``policy_path`` always wins.  Otherwise a ``traceredact.yml``
    in the current working directory is loaded when it exists, and the
    built-in default policy is used when it does not.
    """
    if policy_path is None:
        discovered = Path("traceredact.yml")
        if not discovered.exists():
            return Policy.default()
        return Policy.load(discovered)
    return Policy.load(policy_path)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _parse_content(text: str) -> Any:
|
|
45
|
+
"""Parse JSON if it looks like JSON, else treat as raw text."""
|
|
46
|
+
stripped = text.strip()
|
|
47
|
+
if stripped[:1] in "{[":
|
|
48
|
+
try:
|
|
49
|
+
return json.loads(stripped)
|
|
50
|
+
except json.JSONDecodeError:
|
|
51
|
+
return text
|
|
52
|
+
return text
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _iter_files(path: Path) -> list[Path]:
|
|
56
|
+
if path.is_dir():
|
|
57
|
+
return sorted(p for p in path.rglob("*") if p.is_file())
|
|
58
|
+
return [path]
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _render_pretty(per_file: list[tuple[Path, RedactionResult]]) -> int:
    """Print a human-readable findings table per file and return the hit count.

    Files without findings are skipped entirely.  After the tables a summary
    line is printed: red with the totals when anything was found, green
    "No findings." otherwise.  The file count in the summary is the number of
    entries in ``per_file``, not the number of files that had findings.
    """
    count = 0
    for file_path, outcome in per_file:
        if outcome.findings:
            typer.secho(f"\n{file_path}", fg=typer.colors.CYAN, bold=True)
            typer.echo(f" {'DETECTOR':<26} {'CONF':>5} {'PATH':<24} PREVIEW")
            for finding in outcome.findings:
                # Findings on a bare string have no JSON path.
                location = finding.json_path or "(root)"
                typer.echo(
                    f" {finding.detector_id:<26} {finding.confidence:>5.2f} {location:<24} {finding.preview}"
                )
                count += 1

    if not count:
        typer.secho("No findings.", fg=typer.colors.GREEN)
        return count

    typer.secho(
        f"\n{count} finding(s) across {len(per_file)} file(s).",
        fg=typer.colors.RED,
        bold=True,
    )
    return count
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _render_json(per_file: list[tuple[Path, RedactionResult]]) -> int:
    """Print all findings as one JSON document and return the hit count.

    The document has the shape ``{"total": int, "files": [{"file": str,
    "findings": [...]}]}``.  Every entry of ``per_file`` appears in ``files``,
    including those with no findings.  The raw matched text is not emitted,
    only the masked ``preview``.
    """
    files = []
    for file_path, outcome in per_file:
        rows = []
        for hit in outcome.findings:
            rows.append(
                {
                    "detector_id": hit.detector_id,
                    "category": hit.category,
                    "confidence": hit.confidence,
                    "json_path": hit.json_path,
                    "span": list(hit.span),
                    "preview": hit.preview,
                    "replacement": hit.replacement,
                }
            )
        files.append({"file": str(file_path), "findings": rows})

    grand_total = sum(len(entry["findings"]) for entry in files)
    typer.echo(json.dumps({"total": grand_total, "files": files}, indent=2))
    return grand_total
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
@app.command()
def scan(
    path: Path = typer.Argument(..., exists=True, help="File or directory to scan."),
    fmt: OutputFormat = typer.Option(OutputFormat.pretty, "--format", "-f"),
    policy_path: Path | None = typer.Option(None, "--policy", "-p", help="traceredact.yml path."),
) -> None:
    """Scan a file or directory and report findings. Exits non-zero if any."""
    # NOTE: the docstring above is shown by Typer as the command's help text,
    # so longer explanations live in comments.
    #
    # Nothing is written to disk.  The process exits with 1 when at least one
    # finding is reported and 0 otherwise.
    policy = _load_policy(policy_path)
    engine = Engine(policy)
    per_file: list[tuple[Path, RedactionResult]] = []
    for fpath in _iter_files(path):
        try:
            # Decode explicitly as UTF-8 so results do not depend on the
            # platform's locale encoding; undecodable bytes are replaced
            # rather than aborting the scan.
            text = fpath.read_text(encoding="utf-8", errors="replace")
        except OSError as exc:
            # Still best-effort, but say so: a scanner that silently skips a
            # file can report "No findings" for content it never looked at.
            # The warning goes to stderr so JSON output on stdout stays clean.
            typer.secho(
                f"warning: skipped {fpath}: {exc}",
                fg=typer.colors.YELLOW,
                err=True,
            )
            continue
        per_file.append((fpath, engine.redact(_parse_content(text))))

    total = _render_json(per_file) if fmt is OutputFormat.json else _render_pretty(per_file)
    raise typer.Exit(code=1 if total else 0)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@app.command()
def redact(
    file: Path = typer.Argument(..., exists=True, dir_okay=False, help="File to redact."),
    fmt: OutputFormat = typer.Option(OutputFormat.pretty, "--format", "-f"),
    policy_path: Path | None = typer.Option(None, "--policy", "-p", help="traceredact.yml path."),
    output: Path | None = typer.Option(None, "--output", "-o", help="Write redacted content here."),
) -> None:
    """Redact a file's content and print (or write) the result. Non-zero exit if findings."""
    # NOTE: the docstring above is shown by Typer as the command's help text,
    # so longer explanations live in comments.
    #
    # ``dir_okay=False`` makes a directory argument a clean usage error
    # instead of an unhandled exception from ``read_text``.
    policy = _load_policy(policy_path)
    engine = Engine(policy)
    # Decode explicitly as UTF-8 so behaviour does not depend on the locale.
    content = _parse_content(file.read_text(encoding="utf-8", errors="replace"))
    result = engine.redact(content)

    if fmt is OutputFormat.json:
        # JSON mode wraps the redacted value together with the finding count.
        rendered = json.dumps(
            {"value": result.value, "findings_count": len(result.findings)},
            indent=2,
            default=str,
        )
    elif isinstance(result.value, str):
        # Raw-text input is emitted as-is (already redacted).
        rendered = result.value
    else:
        # Structured input is pretty-printed back as JSON.
        rendered = json.dumps(result.value, indent=2, default=str)

    if output is not None:
        # Write UTF-8 explicitly: the redacted text may contain characters
        # (including U+FFFD from lossy decoding) that a legacy locale
        # encoding cannot represent.
        output.write_text(rendered, encoding="utf-8")
        typer.secho(
            f"Wrote redacted content to {output} ({len(result.findings)} finding(s)).",
            fg=typer.colors.GREEN,
        )
    else:
        typer.echo(rendered)

    # Exit 1 when anything was redacted so the command can gate CI.
    raise typer.Exit(code=1 if result.has_findings else 0)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
@app.command()
def version() -> None:
    """Print the version."""
    # Echoes the ``__version__`` string imported from the package root.
    typer.echo(__version__)
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def main() -> None:  # pragma: no cover - console-script shim
    """Invoke the Typer application."""
    app()


# Running the module directly goes through the same shim as the console script.
if __name__ == "__main__":  # pragma: no cover
    main()
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Detector package: base protocol, secret and PII detectors."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from traceredact.detectors.base import Detector, Finding, RuleDetector
|
|
6
|
+
from traceredact.detectors.pii import pii_detectors
|
|
7
|
+
from traceredact.detectors.secrets import EntropyDetector, secret_detectors, shannon_entropy
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"Detector",
|
|
11
|
+
"Finding",
|
|
12
|
+
"RuleDetector",
|
|
13
|
+
"EntropyDetector",
|
|
14
|
+
"shannon_entropy",
|
|
15
|
+
"secret_detectors",
|
|
16
|
+
"pii_detectors",
|
|
17
|
+
]
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""Base detector protocol and the core Finding model.
|
|
2
|
+
|
|
3
|
+
A detector inspects a single *string* and yields zero or more :class:`Finding`
|
|
4
|
+
objects describing spans that should be redacted. Detectors never mutate input
|
|
5
|
+
and never retain data — the engine owns replacement and assembly.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from abc import ABC, abstractmethod
|
|
11
|
+
from collections.abc import Iterable, Iterator
|
|
12
|
+
|
|
13
|
+
from pydantic import BaseModel, Field
|
|
14
|
+
|
|
15
|
+
from traceredact.rules import PatternRule
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class Finding(BaseModel):
    """One redactable hit reported by a detector.

    ``span`` is a half-open ``(start, end)`` index pair into the *string* that
    was scanned.  ``json_path`` records where that string lived inside a
    nested document (e.g. ``args.email`` or ``messages[0].content``); it is
    set by the structured walker and stays ``None`` otherwise.  ``matched``
    carries the raw matched text so the engine can build a replacement and
    apply allowlists; code that should not handle the secret itself should
    use :attr:`preview` instead.

    Instances are frozen (immutable after construction).
    """

    model_config = {"frozen": True}

    detector_id: str
    category: str
    # Constrained to the closed interval [0.0, 1.0].
    confidence: float = Field(ge=0.0, le=1.0)
    span: tuple[int, int]
    matched: str
    replacement: str
    json_path: str | None = None

    @property
    def preview(self) -> str:
        """Masked rendering of ``matched`` intended for logs and tables.

        Matches of up to 8 characters are fully replaced by ``*``.  Longer
        matches keep their first two and last two characters and mask the
        rest.  The preview always has the same length as the match.
        """
        raw = self.matched
        size = len(raw)
        if size > 8:
            head, tail = raw[:2], raw[-2:]
            return f"{head}{'*' * (size - 4)}{tail}"
        return "*" * size
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class Detector(ABC):
    """Abstract base for every detector.

    A subclass only has to implement :meth:`scan` for a single string.
    Allowlisting, replacement, overlap resolution and structured traversal
    are the engine's job, which keeps detectors small and data-driven.
    """

    #: Stable identifier, e.g. ``"secrets.openai_key"``.
    detector_id: str
    #: Category bucket used in placeholders, e.g. ``"secret"`` / ``"pii"``.
    category: str

    @abstractmethod
    def scan(self, text: str) -> Iterable[Finding]:
        """Produce the findings for ``text`` without mutating or retaining it."""
        raise NotImplementedError

    def __iter__(self) -> Iterator[Detector]:
        """Iterate over this detector alone, so it can be flattened like a list."""
        yield from (self,)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class RuleDetector(Detector):
    """Applies a fixed set of :class:`~traceredact.rules.PatternRule` to a string.

    For each rule the prefilter is consulted first (against the lowercased
    text), then the rule's regex is run, then its optional validator.  The
    reported span is that of ``rule.group`` rather than the whole match, so a
    rule such as the env-assignment pattern can redact only the *value* and
    leave the key alone.
    """

    detector_id = "rules"
    category = "mixed"

    def __init__(self, rules: Iterable[PatternRule]) -> None:
        # Snapshot the rules so later changes to the caller's iterable have no effect.
        self.rules = tuple(rules)

    def scan(self, text: str) -> Iterable[Finding]:
        """Yield one finding per accepted rule match, rule by rule, in match order."""
        haystack = text.lower()
        for rule in self.rules:
            if rule.prefilter_hit(haystack):
                yield from self._scan_rule(rule, text)

    def _scan_rule(self, rule: PatternRule, text: str) -> Iterable[Finding]:
        """Yield the findings a single rule produces for ``text``."""
        for match in rule.regex.finditer(text):
            begin, stop = match.span(rule.group)
            if begin < 0:
                # The target group is optional and did not take part in this match.
                continue
            hit = match.group(rule.group)
            if not hit:
                continue
            score = rule.confidence
            if rule.validator is not None:
                # A validator either vetoes the hit (None) or supplies the confidence.
                score = rule.validator(hit)
                if score is None:
                    continue
            yield Finding(
                detector_id=rule.id,
                category=rule.category,
                confidence=score,
                span=(begin, stop),
                matched=hit,
                replacement="",  # filled in by the engine from policy
            )
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""PII detectors: email, phone, credit card (Luhn), IBAN (mod-97), IP.
|
|
2
|
+
|
|
3
|
+
All validation lives in :mod:`traceredact.rules` so the detector here is just a
|
|
4
|
+
thin wrapper over the PII rule set — keeping detection data-driven.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from traceredact.detectors.base import Detector, RuleDetector
|
|
10
|
+
from traceredact.rules import PII_RULES
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def pii_detectors() -> list[Detector]:
    """Build the default PII stack: one rule detector over ``PII_RULES``."""
    stack: list[Detector] = []
    stack.append(RuleDetector(PII_RULES))
    return stack
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Secret detectors: pattern rules + a Shannon-entropy detector.
|
|
2
|
+
|
|
3
|
+
The entropy detector catches *unstructured* high-entropy tokens (random API
|
|
4
|
+
keys, base64 blobs) that no named pattern covers. It is deliberately
|
|
5
|
+
conservative — long tokens only, with a tunable threshold — because entropy is
|
|
6
|
+
a fuzzy signal and we'd rather a pattern rule own a known secret class.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import math
|
|
12
|
+
import re
|
|
13
|
+
from collections.abc import Iterable
|
|
14
|
+
|
|
15
|
+
from traceredact.detectors.base import Detector, Finding, RuleDetector
|
|
16
|
+
from traceredact.rules import SECRET_RULES
|
|
17
|
+
|
|
18
|
+
# Candidate tokens for entropy scoring: a run of 16 to 512 characters from the
# standard base64 alphabet (letters, digits, '+', '/'), which also covers hex,
# followed by up to two '=' padding characters.  '=', '_' and '-' are
# deliberately not accepted *inside* the run, so an assignment such as
# ``API_KEY=...`` splits into "API_KEY" and the value instead of being scored
# as one merged token.  Named/base64url secrets are owned by the pattern
# rules; entropy is the catch-all fallback.
_TOKEN_RE = re.compile(r"[A-Za-z0-9+/]{16,512}={0,2}")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def shannon_entropy(value: str) -> float:
    """Return the Shannon entropy of ``value`` in bits per character.

    The empty string has entropy ``0.0``.
    """
    total = len(value)
    if total == 0:
        return 0.0
    # Tally occurrences, keeping symbols in first-seen order.
    tally = dict.fromkeys(value, 0)
    for symbol in value:
        tally[symbol] += 1
    probabilities = [count / total for count in tally.values()]
    return -sum(p * math.log2(p) for p in probabilities)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class EntropyDetector(Detector):
    """Reports long tokens whose per-character entropy reaches a threshold.

    Candidates come from ``_TOKEN_RE``, which only matches runs of at least 16
    characters, so a ``min_len`` below 16 has no additional effect.
    """

    detector_id = "secrets.high_entropy"
    category = "secret"

    def __init__(self, threshold: float = 4.0, min_len: int = 20) -> None:
        # Minimum bits-per-character for a token to be reported.
        self.threshold = threshold
        # Minimum token length (padding included) for a token to be scored.
        self.min_len = min_len

    def _confidence_for(self, bits: float) -> float:
        """Map an entropy at or above the threshold onto a confidence in [0.5, 0.95]."""
        return min(0.95, 0.5 + (bits - self.threshold) / 4.0)

    def scan(self, text: str) -> Iterable[Finding]:
        """Yield a finding for every candidate token that is long and random enough."""
        for candidate in _TOKEN_RE.finditer(text):
            token = candidate.group()
            if len(token) < self.min_len:
                continue
            bits = shannon_entropy(token)
            if bits < self.threshold:
                continue
            yield Finding(
                detector_id=self.detector_id,
                category=self.category,
                confidence=round(self._confidence_for(bits), 3),
                span=candidate.span(),
                matched=token,
                replacement="",
            )
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def secret_detectors(entropy_threshold: float = 4.0, min_entropy_len: int = 20) -> list[Detector]:
    """Build the default secret stack: pattern rules first, then the entropy detector.

    ``entropy_threshold`` and ``min_entropy_len`` are passed to
    :class:`EntropyDetector` as ``threshold`` and ``min_len``.
    """
    pattern_stage = RuleDetector(SECRET_RULES)
    entropy_stage = EntropyDetector(threshold=entropy_threshold, min_len=min_entropy_len)
    return [pattern_stage, entropy_stage]
|