content-guard 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@
1
+ """Policy-driven content scanning and redaction."""
2
+
3
+ from .engine import GuardResult, scan_text, redact_text
4
+ from .policy import Policy, load_policy
5
+
6
# Public names of the package: the ones imported above from the engine and
# policy submodules.
__all__ = ["GuardResult", "Policy", "load_policy", "scan_text", "redact_text"]
@@ -0,0 +1,4 @@
1
+ from .cli import main
2
+
3
# Executed as a script: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
content_guard/cli.py ADDED
@@ -0,0 +1,176 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import difflib
5
+ import json
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ from .engine import scan_text
10
+ from .policy import load_policy
11
+ from .report import to_json, to_payload, to_text
12
+ from .types import ScanOptions
13
+
14
+
15
def main(argv: list[str] | None = None) -> int:
    """Parse the command line and run the selected subcommand.

    Args:
        argv: Arguments handed to the parser's ``parse_args``; ``None`` is
            forwarded unchanged.

    Returns:
        The exit status produced by the subcommand handler.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    # Map each subcommand name onto the function that implements it.
    handlers = {"scan": _scan, "redact": _redact, "diff": _diff}
    handler = handlers.get(args.command)
    if handler is not None:
        return handler(args)

    # Fallback for a command name that has no handler.
    parser.error(f"unknown command: {args.command}")
    return 2
28
+
29
+
30
def build_parser() -> argparse.ArgumentParser:
    """Build the ``content-guard`` parser with scan, redact and diff subcommands.

    All three subcommands share the same target and scanning arguments;
    ``scan`` additionally accepts ``--json`` and ``redact`` accepts
    ``--in-place``.
    """
    parser = argparse.ArgumentParser(
        prog="content-guard",
        description="Policy-driven content scanning and redaction.",
    )
    sub = parser.add_subparsers(dest="command", required=True)

    # (positional names/flags, keyword arguments) for every shared argument,
    # in the order they are registered on each subcommand.
    shared_arguments = (
        (("target",), {"nargs": "?", "help": "file to read, or stdin when omitted"}),
        (("--policy",), {"help": "JSON policy file"}),
        (("--opf",), {"action": "store_true", "help": "run optional OPF backend"}),
        (("--opf-bin",), {"help": "path to opf binary"}),
        (("--opf-device",), {"help": "OPF device, default comes from policy or cpu"}),
        (("--scan-frontmatter",), {"action": "store_true", "help": "scan YAML frontmatter"}),
        (("--skip-code-blocks",), {"action": "store_true", "help": "ignore fenced code blocks"}),
        (
            ("--no-allow-comments",),
            {"action": "store_true", "help": "ignore content-guard allow comments"},
        ),
    )

    commands = {}
    for name in ("scan", "redact", "diff"):
        command = sub.add_parser(name)
        for names, settings in shared_arguments:
            command.add_argument(*names, **settings)
        commands[name] = command

    # Subcommand-specific switches.
    commands["scan"].add_argument("--json", action="store_true", help="emit JSON report")
    commands["redact"].add_argument("--in-place", action="store_true", help="rewrite the target file")
    return parser
51
+
52
+
53
def _options(args: argparse.Namespace) -> ScanOptions:
    """Translate parsed command-line flags into a ``ScanOptions`` value."""
    settings = {
        "scan_frontmatter": args.scan_frontmatter,
        # The CLI flags are negative ("skip", "no-allow"); the options are positive.
        "scan_code_blocks": not args.skip_code_blocks,
        "honor_allow_comments": not args.no_allow_comments,
        "include_opf": args.opf,
        "opf_device": args.opf_device,
        "opf_bin": args.opf_bin,
    }
    return ScanOptions(**settings)
62
+
63
+
64
+ def _read_target(target: str | None) -> tuple[str, str | None]:
65
+ if not target or target == "-":
66
+ return sys.stdin.read(), None
67
+ path = Path(target)
68
+ return path.read_text(), str(path)
69
+
70
+
71
def _scan(args: argparse.Namespace) -> int:
    """Run the ``scan`` subcommand and print a text or JSON report.

    A directory target is scanned file by file via ``_scan_directory``; any
    other target is read with ``_read_target`` and scanned as one document.

    Returns:
        1 when any scanned result is blocked, otherwise 0.
    """
    policy = load_policy(args.policy)
    options = _options(args)
    target_path = None
    if args.target and args.target != "-":
        target_path = Path(args.target)

    if not (target_path and target_path.is_dir()):
        # Single document: a file or standard input.
        text, path = _read_target(args.target)
        result = scan_text(text, policy=policy, options=options)
        if args.json:
            print(to_json(result))
        else:
            print(to_text(result, path=path or "<stdin>"))
        return 1 if result.blocked else 0

    results = _scan_directory(target_path, policy, options)
    blocked = any(result.blocked for _, result in results)
    # Only files with at least one finding are reported individually.
    flagged = [(path, result) for path, result in results if result.findings]

    if args.json:
        payload = {
            "ok": not blocked,
            "blocked": blocked,
            "files_scanned": len(results),
            "files": [{"path": str(path), **to_payload(result)} for path, result in flagged],
        }
        print(json.dumps(payload, indent=2, sort_keys=True))
    elif not flagged:
        print(f"Clean. {len(results)} file(s) checked.")
    else:
        for path, result in flagged:
            print(to_text(result, path=str(path)))
    return 1 if blocked else 0
111
+
112
+
113
def _redact(args: argparse.Namespace) -> int:
    """Run the ``redact`` subcommand.

    For a directory target (which requires ``--in-place``) every changed file
    is rewritten with its redacted text. For a single target the redacted
    text is written back to the file with ``--in-place`` or to standard
    output otherwise.

    Returns:
        2 on a usage error, 1 when any result is blocked, otherwise 0.
    """
    policy = load_policy(args.policy)
    options = _options(args)
    target_path = None
    if args.target and args.target != "-":
        target_path = Path(args.target)

    if target_path and target_path.is_dir():
        if not args.in_place:
            print("directory redact requires --in-place", file=sys.stderr)
            return 2
        results = _scan_directory(target_path, policy, options)
        for file_path, result in results:
            if result.changed:
                file_path.write_text(result.redacted_text)
        blocked = any(result.blocked for _, result in results)
        return 1 if blocked else 0

    text, path = _read_target(args.target)
    result = scan_text(text, policy=policy, options=options)

    if not args.in_place:
        sys.stdout.write(result.redacted_text)
    elif path:
        Path(path).write_text(result.redacted_text)
    else:
        # Input came from standard input, so there is nothing to rewrite.
        print("--in-place requires a file target", file=sys.stderr)
        return 2
    return 1 if result.blocked else 0
139
+
140
+
141
def _diff(args: argparse.Namespace) -> int:
    """Run the ``diff`` subcommand: print unified diffs of the redactions.

    For a directory target only changed files produce a diff; for a single
    target the diff is always generated.

    Returns:
        1 when any result is blocked, otherwise 0.
    """
    policy = load_policy(args.policy)
    options = _options(args)
    target_path = None
    if args.target and args.target != "-":
        target_path = Path(args.target)

    if target_path and target_path.is_dir():
        results = _scan_directory(target_path, policy, options)
        for file_path, result in results:
            if result.changed:
                _write_diff(result.text, result.redacted_text, str(file_path))
        blocked = any(result.blocked for _, result in results)
        return 1 if blocked else 0

    text, path = _read_target(args.target)
    result = scan_text(text, policy=policy, options=options)
    _write_diff(text, result.redacted_text, path or "<stdin>")
    return 1 if result.blocked else 0
159
+
160
+
161
+ def _scan_directory(path: Path, policy, options: ScanOptions):
162
+ results = []
163
+ for file_path in sorted(path.rglob("*.md")):
164
+ text = file_path.read_text()
165
+ results.append((file_path, scan_text(text, policy=policy, options=options)))
166
+ return results
167
+
168
+
169
+ def _write_diff(text: str, redacted_text: str, source_name: str) -> None:
170
+ diff = difflib.unified_diff(
171
+ text.splitlines(keepends=True),
172
+ redacted_text.splitlines(keepends=True),
173
+ fromfile=source_name,
174
+ tofile=f"{source_name} (redacted)",
175
+ )
176
+ sys.stdout.writelines(diff)
@@ -0,0 +1 @@
1
+ """Detector backends."""
@@ -0,0 +1,52 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import subprocess
5
+ import tempfile
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+
9
+
10
@dataclass(frozen=True)
class OpfResult:
    """Immutable outcome of one OPF backend invocation."""

    # False when the opf binary does not exist or is not executable.
    available: bool
    # True when OPF's output differs from the text it was given.
    changed: bool
    # OPF's standard output on success; the unmodified input text on failure.
    redacted_text: str
    # Description of what went wrong; empty when the run succeeded.
    error: str = ""
16
+
17
+
18
def default_opf_bin() -> str:
    """Return the opf binary path to use when none is given explicitly.

    A non-empty ``CONTENT_GUARD_OPF_BIN`` environment variable wins;
    otherwise the path is ``~/.opf-venv/bin/opf``.
    """
    configured = os.environ.get("CONTENT_GUARD_OPF_BIN")
    if configured:
        return configured
    return str(Path.home() / ".opf-venv" / "bin" / "opf")
20
+
21
+
22
def run_opf(text: str, *, opf_bin: str | None = None, device: str = "cpu", timeout: int = 120) -> OpfResult:
    """Run the external OPF tool over *text* and report what it produced.

    The text is written to a temporary file that is passed to the tool with
    ``-f``; the tool's standard output is taken as the redacted text.

    Args:
        text: Content to hand to OPF.
        opf_bin: Path to the opf executable; defaults to ``default_opf_bin()``.
        device: Value passed to the tool's ``--device`` option.
        timeout: Seconds to wait for the tool before giving up.

    Returns:
        An ``OpfResult``. When the binary is missing or not executable the
        result is marked unavailable; when the run fails the result carries
        the original text and an error message.
    """
    opf = opf_bin or default_opf_bin()
    if not os.path.exists(opf) or not os.access(opf, os.X_OK):
        return OpfResult(False, False, text, f"opf binary not found: {opf}")

    # delete=False because the tool must reopen the file by name after we
    # close it; removal is therefore our responsibility.
    handle = tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False)
    path = handle.name
    try:
        # The write happens inside the try so that a failed write or close
        # can never leave the (potentially sensitive) text behind on disk.
        with handle:
            handle.write(text)

        try:
            proc = subprocess.run(
                [opf, "--device", device, "-f", path],
                capture_output=True,
                text=True,
                timeout=timeout,
                check=False,
            )
        except (OSError, subprocess.SubprocessError) as exc:
            return OpfResult(True, False, text, str(exc))
    finally:
        # Best-effort cleanup: a file that is already gone is not an error.
        try:
            os.unlink(path)
        except OSError:
            pass

    if proc.returncode != 0:
        error = (proc.stderr or proc.stdout or "opf failed").strip()
        return OpfResult(True, False, text, error)

    redacted = proc.stdout
    return OpfResult(True, redacted != text, redacted, "")
@@ -0,0 +1,230 @@
1
+ from __future__ import annotations
2
+
3
+ import bisect
4
+ import re
5
+
6
+ from .detectors.opf import run_opf
7
+ from .policy import Policy
8
+ from .types import Finding, GuardResult, Rule, ScanOptions, TextEdit
9
+
10
# Matches an allow directive of the form "content-guard: allow <token>".
# Group 1 captures the token: a run of letters, digits, "_", ".", ":" or "-",
# a character set that also covers the literal "all".
ALLOW_RE = re.compile(r"content-guard:\s*allow\s+([A-Za-z0-9_.:-]+|all)")
11
+
12
+
13
def scan_text(text: str, policy: Policy | None = None, options: ScanOptions | None = None) -> GuardResult:
    """Scan *text* against a policy and return the findings and redacted text.

    Every policy rule is applied as a regular expression. Matches that are
    empty, fall in a skipped range, or overlap a span already claimed by an
    earlier match are ignored. When OPF is enabled (by options or policy) it
    is run on the original text to detect model-found PII; if the resulting
    action is "redact" or "block" it is run again on the regex-redacted text
    to produce the final output. OPF failures are reported as "tooling"
    warnings.

    Args:
        text: Content to scan.
        policy: Policy to apply; a default ``Policy()`` when omitted.
        options: Scan options; a default ``ScanOptions()`` when omitted.

    Returns:
        A ``GuardResult`` with the original text, the redacted text and the
        findings sorted by line, column and rule id.
    """
    active_policy = policy or Policy()
    active_options = options or ScanOptions()

    line_starts = _line_starts(text)
    skipped_ranges = _skipped_ranges(text, active_options)
    allow_by_line = {}
    if active_options.honor_allow_comments:
        allow_by_line = _allow_comments_by_line(text)

    findings: list[Finding] = []
    occupied: list[tuple[int, int]] = []

    for rule in active_policy.all_rules():
        for match in re.compile(rule.pattern, rule.flags).finditer(text):
            start, end = match.span()
            if (
                start == end
                or _inside_ranges(start, end, skipped_ranges)
                or _overlaps(start, end, occupied)
            ):
                continue

            line = _line_for_offset(line_starts, start)
            allowed_by = _allowed_by(rule.id, line, allow_by_line)
            if allowed_by:
                # An allow comment overrides whatever the policy would do.
                action = "allow"
            else:
                action = active_policy.action_for(rule)
            findings.append(
                Finding(
                    rule_id=rule.id,
                    category=rule.category,
                    action=action,
                    match=match.group(0),
                    replacement=rule.replacement,
                    line=line,
                    column=start - line_starts[line - 1] + 1,
                    start=start,
                    end=end,
                    source="regex",
                    message=rule.description,
                    allowed_by=allowed_by,
                )
            )
            occupied.append((start, end))

    redacted = _apply_edits(text, _edits_for(findings))

    # Command-line options take precedence over the policy's OPF settings.
    include_opf = active_options.include_opf or active_policy.opf_backend.enabled
    opf_device = active_options.opf_device or active_policy.opf_backend.device
    opf_bin = active_options.opf_bin or active_policy.opf_backend.bin

    if include_opf:
        opf_rule = Rule(
            id="opf-pii",
            category="pii",
            pattern="",
            replacement="<PRIVATE_DATA>",
            description="OPF changed the text, indicating model-detected PII.",
        )
        opf_result = run_opf(text, opf_bin=opf_bin, device=opf_device)

        tooling_rule_id = None
        if opf_result.changed:
            action = active_policy.action_for(opf_rule)
            findings.append(
                Finding(
                    rule_id=opf_rule.id,
                    category=opf_rule.category,
                    action=action,
                    match="<OPF_DETECTED_PII>",
                    replacement="<PRIVATE_DATA>",
                    line=1,
                    column=1,
                    start=0,
                    end=0,
                    source="opf",
                    message="OPF redacted one or more spans.",
                )
            )
            if action in {"redact", "block"}:
                # Second pass over the regex-redacted text yields the output.
                second_pass = run_opf(redacted, opf_bin=opf_bin, device=opf_device)
                redacted = second_pass.redacted_text
        elif opf_result.available and opf_result.error:
            tooling_rule_id = "opf-error"
        elif not opf_result.available:
            tooling_rule_id = "opf-unavailable"

        if tooling_rule_id is not None:
            findings.append(
                Finding(
                    rule_id=tooling_rule_id,
                    category="tooling",
                    action="warn",
                    match="opf",
                    replacement="",
                    line=1,
                    column=1,
                    start=0,
                    end=0,
                    source="opf",
                    message=opf_result.error,
                )
            )

    findings.sort(key=lambda item: (item.line, item.column, item.rule_id))
    return GuardResult(text=text, redacted_text=redacted, findings=findings)
133
+
134
+
135
def redact_text(text: str, policy: Policy | None = None, options: ScanOptions | None = None) -> str:
    """Scan *text* and return only its redacted form."""
    result = scan_text(text, policy=policy, options=options)
    return result.redacted_text
137
+
138
+
139
+ def _line_starts(text: str) -> list[int]:
140
+ starts = [0]
141
+ for match in re.finditer("\n", text):
142
+ starts.append(match.end())
143
+ return starts
144
+
145
+
146
+ def _line_for_offset(starts: list[int], offset: int) -> int:
147
+ return bisect.bisect_right(starts, offset)
148
+
149
+
150
def _allow_comments_by_line(text: str) -> dict[int, set[str]]:
    """Map 1-based line numbers to the allow tokens that apply to them.

    An allow directive covers the line it appears on and the line directly
    after it.

    Lines are delimited by ``"\\n"`` only. ``str.splitlines()`` would also
    break on characters such as ``"\\r"``, form feed or ``"\\u2028"``, which
    would make these line numbers drift away from the ones findings are
    given (those are derived from ``"\\n"`` offsets), so a directive could
    end up attached to the wrong line.

    Returns:
        ``{line_number: {token, ...}}`` for every covered line.
    """
    allowed: dict[int, set[str]] = {}
    for line_no, line in enumerate(text.split("\n"), 1):
        match = ALLOW_RE.search(line)
        if match is None:
            continue
        token = match.group(1)
        # The directive applies to its own line and to the following one.
        for covered in (line_no, line_no + 1):
            allowed.setdefault(covered, set()).add(token)
    return allowed
160
+
161
+
162
+ def _allowed_by(rule_id: str, line: int, allow_by_line: dict[int, set[str]]) -> str | None:
163
+ tokens = allow_by_line.get(line, set())
164
+ if "all" in tokens:
165
+ return "all"
166
+ if rule_id in tokens:
167
+ return rule_id
168
+ return None
169
+
170
+
171
+ def _skipped_ranges(text: str, options: ScanOptions) -> list[tuple[int, int]]:
172
+ ranges: list[tuple[int, int]] = []
173
+ lines = text.splitlines(keepends=True)
174
+ offset = 0
175
+
176
+ if not options.scan_frontmatter and lines and lines[0].strip() == "---":
177
+ end = len(lines[0])
178
+ for line in lines[1:]:
179
+ end += len(line)
180
+ if line.strip() == "---":
181
+ ranges.append((0, end))
182
+ break
183
+
184
+ if not options.scan_code_blocks:
185
+ in_fence = False
186
+ fence_start = 0
187
+ current = 0
188
+ for line in lines:
189
+ stripped = line.lstrip()
190
+ if stripped.startswith("```") or stripped.startswith("~~~"):
191
+ if not in_fence:
192
+ in_fence = True
193
+ fence_start = current
194
+ else:
195
+ ranges.append((fence_start, current + len(line)))
196
+ in_fence = False
197
+ current += len(line)
198
+ if in_fence:
199
+ ranges.append((fence_start, len(text)))
200
+
201
+ current = 0
202
+ for line in lines:
203
+ if "content-guard:" in line:
204
+ ranges.append((current, current + len(line)))
205
+ current += len(line)
206
+
207
+ return ranges
208
+
209
+
210
+ def _inside_ranges(start: int, end: int, ranges: list[tuple[int, int]]) -> bool:
211
+ return any(start < range_end and end > range_start for range_start, range_end in ranges)
212
+
213
+
214
+ def _overlaps(start: int, end: int, occupied: list[tuple[int, int]]) -> bool:
215
+ return any(start < prev_end and end > prev_start for prev_start, prev_end in occupied)
216
+
217
+
218
+ def _edits_for(findings: list[Finding]) -> list[TextEdit]:
219
+ return [
220
+ TextEdit(finding.start, finding.end, finding.replacement)
221
+ for finding in findings
222
+ if finding.redacts and finding.start < finding.end
223
+ ]
224
+
225
+
226
+ def _apply_edits(text: str, edits: list[TextEdit]) -> str:
227
+ result = text
228
+ for edit in sorted(edits, key=lambda item: item.start, reverse=True):
229
+ result = result[: edit.start] + edit.replacement + result[edit.end :]
230
+ return result
@@ -0,0 +1,145 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import subprocess
6
+ import sys
7
+
8
+ from .engine import scan_text
9
+ from .policy import Policy, load_policy
10
+ from .report import to_text
11
+
12
+
13
def main(argv: list[str] | None = None) -> int:
    """Scan Git commit messages and report findings.

    The commits to inspect come from ``_commit_revs``; each message is read
    with ``git log -1 --format=%B`` and scanned. Only commits with findings
    are reported, as JSON with ``--json`` or as text otherwise.

    Args:
        argv: Arguments handed to the parser's ``parse_args``; ``None`` is
            forwarded unchanged.

    Returns:
        1 when any commit message is blocked, otherwise 0.
    """
    parser = argparse.ArgumentParser(
        prog="content-guard-commits",
        description="Scan Git commit messages before publishing or pushing.",
    )
    parser.add_argument("--policy", help="JSON policy file")
    parser.add_argument("--range", dest="rev_range", help="revision range to scan, for example origin/main..HEAD")
    parser.add_argument("--all", action="store_true", help="scan all reachable commits")
    parser.add_argument("--json", action="store_true", help="emit JSON report")
    args = parser.parse_args(argv)

    if args.policy:
        policy = load_policy(args.policy)
    else:
        policy = _default_commit_policy()
    revs = _commit_revs(args)

    flagged = []
    blocked = False
    for rev in revs:
        message = _git(["log", "-1", "--format=%B", rev])
        result = scan_text(message, policy=policy)
        if not result.findings:
            continue
        blocked = blocked or result.blocked
        # The subject is taken from the redacted message, not the original.
        flagged.append((rev, _subject(result.redacted_text), result))

    if args.json:
        commits = []
        for rev, subject, result in flagged:
            finding_rows = [
                {
                    "rule_id": finding.rule_id,
                    "category": finding.category,
                    "action": finding.action,
                    "line": finding.line,
                    "column": finding.column,
                    "source": finding.source,
                    "message": finding.message,
                }
                for finding in result.findings
            ]
            commits.append(
                {
                    "commit": rev,
                    "subject": subject,
                    "blocked": result.blocked,
                    "changed": result.changed,
                    "counts_by_action": result.counts_by_action(),
                    "counts_by_category": result.counts_by_category(),
                    "findings": finding_rows,
                }
            )
        report = {
            "ok": not blocked,
            "blocked": blocked,
            "commits_scanned": len(revs),
            "commits_with_findings": len(flagged),
            "commits": commits,
        }
        print(json.dumps(report, indent=2, sort_keys=True))
    elif flagged:
        for rev, subject, result in flagged:
            label = f"commit {rev[:12]} {subject}".strip()
            print(to_text(result, path=label))
    else:
        print(f"Clean. {len(revs)} commit message(s) checked.")

    return 1 if blocked else 0
80
+
81
+
82
def _default_commit_policy() -> Policy:
    """Return the policy used for commit messages when no policy file is given.

    Every content category blocks, "tooling" findings warn, and the
    ``opf-pii`` rule is set to warn.
    """
    blocking_categories = (
        "infrastructure",
        "secret",
        "pii",
        "personal",
        "business",
        "attribution",
    )
    category_actions = {category: "block" for category in blocking_categories}
    category_actions["tooling"] = "warn"
    rule_actions = {"opf-pii": "warn"}
    return Policy(
        name="public-commit-default",
        defaults=category_actions,
        rules=rule_actions,
    )
96
+
97
+
98
def _commit_revs(args: argparse.Namespace) -> list[str]:
    """Return the commit ids to scan, listed with ``git rev-list --reverse``.

    ``--all`` selects every reachable commit; otherwise the explicit
    ``--range`` is used, falling back to ``_default_range()``. Without an
    explicit range and without a resolvable ``HEAD`` the result is empty.
    """
    if args.all:
        selector = "--all"
    elif args.rev_range:
        selector = args.rev_range
    elif _has_head():
        selector = _default_range()
    else:
        return []

    output = _git(["rev-list", "--reverse", selector])
    return [line for line in output.splitlines() if line.strip()]
109
+
110
+
111
def _default_range() -> str:
    """Return the revision range to scan when none was given.

    When ``git rev-parse`` resolves ``@{upstream}`` the range is
    ``"<upstream>..HEAD"``; otherwise it is ``"HEAD"``.
    """
    command = ["git", "rev-parse", "--abbrev-ref", "--symbolic-full-name", "@{upstream}"]
    proc = subprocess.run(command, capture_output=True, text=True, check=False)
    if proc.returncode != 0:
        return "HEAD"
    upstream = proc.stdout.strip()
    return f"{upstream}..HEAD" if upstream else "HEAD"
122
+
123
+
124
def _has_head() -> bool:
    """Return True when ``git rev-parse --verify HEAD`` succeeds."""
    command = ["git", "rev-parse", "--verify", "HEAD"]
    completed = subprocess.run(command, capture_output=True, text=True, check=False)
    return completed.returncode == 0
127
+
128
+
129
+ def _git(args: list[str]) -> str:
130
+ proc = subprocess.run(["git", *args], capture_output=True, text=True, check=False)
131
+ if proc.returncode != 0:
132
+ print((proc.stderr or proc.stdout or "git command failed").strip(), file=sys.stderr)
133
+ raise SystemExit(2)
134
+ return proc.stdout
135
+
136
+
137
+ def _subject(message: str) -> str:
138
+ for line in message.splitlines():
139
+ if line.strip():
140
+ return line.strip()
141
+ return "(empty subject)"
142
+
143
+
144
# Executed as a script: run the commit scanner and use main()'s return value
# as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())