PyPI - content-guard - Versions diffs - 0.1.1__py3-none-any.whl - Mend

content-guard 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

content_guard/__init__.py +6 -0
content_guard/__main__.py +4 -0
content_guard/cli.py +176 -0
content_guard/detectors/__init__.py +1 -0
content_guard/detectors/opf.py +52 -0
content_guard/engine.py +230 -0
content_guard/git_commits.py +145 -0
content_guard/git_scan.py +123 -0
content_guard/n8n_advisory.py +95 -0
content_guard/n8n_validate.py +153 -0
content_guard/policies/openclaw-message.json +32 -0
content_guard/policies/pr-draft.json +23 -0
content_guard/policies/public-content.json +25 -0
content_guard/policies/public-repo.json +36 -0
content_guard/policy.py +168 -0
content_guard/pr_draft.py +73 -0
content_guard/pr_prepare.py +131 -0
content_guard/publish_check.py +257 -0
content_guard/report.py +39 -0
content_guard/rules.py +107 -0
content_guard/types.py +84 -0
content_guard-0.1.1.dist-info/METADATA +188 -0
content_guard-0.1.1.dist-info/RECORD +26 -0
content_guard-0.1.1.dist-info/WHEEL +5 -0
content_guard-0.1.1.dist-info/entry_points.txt +9 -0
content_guard-0.1.1.dist-info/top_level.txt +1 -0

content_guard/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""Policy-driven content scanning and redaction."""
+from .engine import GuardResult, scan_text, redact_text
+from .policy import Policy, load_policy
+__all__ = ["GuardResult", "Policy", "load_policy", "scan_text", "redact_text"]

content_guard/__main__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .cli import main
+# Entry point for ``python -m content_guard``: main()'s integer return value
+# is handed to SystemExit and so becomes the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())

content_guard/cli.py ADDED Viewed

@@ -0,0 +1,176 @@
+from __future__ import annotations
+import argparse
+import difflib
+import json
+import sys
+from pathlib import Path
+from .engine import scan_text
+from .policy import load_policy
+from .report import to_json, to_payload, to_text
+from .types import ScanOptions
+def main(argv: list[str] | None = None) -> int:
+    parser = build_parser()
+    args = parser.parse_args(argv)
+    if args.command == "scan":
+        return _scan(args)
+    if args.command == "redact":
+        return _redact(args)
+    if args.command == "diff":
+        return _diff(args)
+    parser.error(f"unknown command: {args.command}")
+    return 2
+def build_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        prog="content-guard",
+        description="Policy-driven content scanning and redaction.",
+    )
+    sub = parser.add_subparsers(dest="command", required=True)
+    for name in ("scan", "redact", "diff"):
+        cmd = sub.add_parser(name)
+        cmd.add_argument("target", nargs="?", help="file to read, or stdin when omitted")
+        cmd.add_argument("--policy", help="JSON policy file")
+        cmd.add_argument("--opf", action="store_true", help="run optional OPF backend")
+        cmd.add_argument("--opf-bin", help="path to opf binary")
+        cmd.add_argument("--opf-device", help="OPF device, default comes from policy or cpu")
+        cmd.add_argument("--scan-frontmatter", action="store_true", help="scan YAML frontmatter")
+        cmd.add_argument("--skip-code-blocks", action="store_true", help="ignore fenced code blocks")
+        cmd.add_argument("--no-allow-comments", action="store_true", help="ignore content-guard allow comments")
+    sub.choices["scan"].add_argument("--json", action="store_true", help="emit JSON report")
+    sub.choices["redact"].add_argument("--in-place", action="store_true", help="rewrite the target file")
+    return parser
+def _options(args: argparse.Namespace) -> ScanOptions:
+    return ScanOptions(
+        scan_frontmatter=args.scan_frontmatter,
+        scan_code_blocks=not args.skip_code_blocks,
+        honor_allow_comments=not args.no_allow_comments,
+        include_opf=args.opf,
+        opf_device=args.opf_device,
+        opf_bin=args.opf_bin,
+    )
+def _read_target(target: str | None) -> tuple[str, str | None]:
+    if not target or target == "-":
+        return sys.stdin.read(), None
+    path = Path(target)
+    return path.read_text(), str(path)
+def _scan(args: argparse.Namespace) -> int:
+    """Run the ``scan`` sub-command and print a report.
+
+    A directory target is scanned file by file via ``_scan_directory``; any
+    other target (file, ``-`` or omitted) is read with ``_read_target``.
+
+    Returns:
+        1 when any scanned result is blocked, otherwise 0.
+    """
+    policy = load_policy(args.policy)
+    options = _options(args)
+    # "-" and an omitted target both mean stdin, so neither is treated as a path.
+    target_path = Path(args.target) if args.target and args.target != "-" else None
+    if target_path and target_path.is_dir():
+        results = _scan_directory(target_path, policy, options)
+        blocked = any(result.blocked for _, result in results)
+        if args.json:
+            # Aggregate report: only files that produced findings are listed,
+            # while "files_scanned" counts every file that was read.
+            print(
+                json.dumps(
+                    {
+                        "ok": not blocked,
+                        "blocked": blocked,
+                        "files_scanned": len(results),
+                        "files": [
+                            {"path": str(path), **to_payload(result)}
+                            for path, result in results
+                            if result.findings
+                        ],
+                    },
+                    indent=2,
+                    sort_keys=True,
+                )
+            )
+        elif not any(result.findings for _, result in results):
+            print(f"Clean. {len(results)} file(s) checked.")
+        else:
+            # Text mode prints one report per file that has findings.
+            for path, result in results:
+                if result.findings:
+                    print(to_text(result, path=str(path)))
+        return 1 if blocked else 0
+    # Single input: a file path or standard input.
+    text, path = _read_target(args.target)
+    result = scan_text(text, policy=policy, options=options)
+    if args.json:
+        print(to_json(result))
+    else:
+        print(to_text(result, path=path or "<stdin>"))
+    return 1 if result.blocked else 0
+def _redact(args: argparse.Namespace) -> int:
+    policy = load_policy(args.policy)
+    options = _options(args)
+    target_path = Path(args.target) if args.target and args.target != "-" else None
+    if target_path and target_path.is_dir():
+        if not args.in_place:
+            print("directory redact requires --in-place", file=sys.stderr)
+            return 2
+        results = _scan_directory(target_path, policy, options)
+        for path, result in results:
+            if result.changed:
+                path.write_text(result.redacted_text)
+        return 1 if any(result.blocked for _, result in results) else 0
+    text, path = _read_target(args.target)
+    result = scan_text(text, policy=policy, options=options)
+    if args.in_place:
+        if not path:
+            print("--in-place requires a file target", file=sys.stderr)
+            return 2
+        Path(path).write_text(result.redacted_text)
+    else:
+        sys.stdout.write(result.redacted_text)
+    return 1 if result.blocked else 0
+def _diff(args: argparse.Namespace) -> int:
+    policy = load_policy(args.policy)
+    options = _options(args)
+    target_path = Path(args.target) if args.target and args.target != "-" else None
+    if target_path and target_path.is_dir():
+        results = _scan_directory(target_path, policy, options)
+        for path, result in results:
+            if not result.changed:
+                continue
+            _write_diff(result.text, result.redacted_text, str(path))
+        return 1 if any(result.blocked for _, result in results) else 0
+    text, path = _read_target(args.target)
+    result = scan_text(text, policy=policy, options=options)
+    source_name = path or "<stdin>"
+    _write_diff(text, result.redacted_text, source_name)
+    return 1 if result.blocked else 0
+def _scan_directory(path: Path, policy, options: ScanOptions):
+    results = []
+    for file_path in sorted(path.rglob("*.md")):
+        text = file_path.read_text()
+        results.append((file_path, scan_text(text, policy=policy, options=options)))
+    return results
+def _write_diff(text: str, redacted_text: str, source_name: str) -> None:
+    diff = difflib.unified_diff(
+        text.splitlines(keepends=True),
+        redacted_text.splitlines(keepends=True),
+        fromfile=source_name,
+        tofile=f"{source_name} (redacted)",
+    )
+    sys.stdout.writelines(diff)

content_guard/detectors/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+"""Detector backends."""

content_guard/detectors/opf.py ADDED Viewed

@@ -0,0 +1,52 @@
+from __future__ import annotations
+import os
+import subprocess
+import tempfile
+from dataclasses import dataclass
+from pathlib import Path
+@dataclass(frozen=True)
+class OpfResult:
+    """Immutable outcome of one ``run_opf`` call."""
+    # False when the opf binary was missing or not executable.
+    available: bool
+    # True when opf's output differs from the text it was given.
+    changed: bool
+    # opf's standard output on success; the unmodified input text otherwise.
+    redacted_text: str
+    # Failure description; empty when opf ran successfully.
+    error: str = ""
+def default_opf_bin() -> str:
+    return os.environ.get("CONTENT_GUARD_OPF_BIN") or str(Path.home() / ".opf-venv" / "bin" / "opf")
+def run_opf(text: str, *, opf_bin: str | None = None, device: str = "cpu", timeout: int = 120) -> OpfResult:
+    opf = opf_bin or default_opf_bin()
+    if not os.path.exists(opf) or not os.access(opf, os.X_OK):
+        return OpfResult(False, False, text, f"opf binary not found: {opf}")
+    with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as handle:
+        handle.write(text)
+        path = handle.name
+    try:
+        proc = subprocess.run(
+            [opf, "--device", device, "-f", path],
+            capture_output=True,
+            text=True,
+            timeout=timeout,
+            check=False,
+        )
+    except (OSError, subprocess.SubprocessError) as exc:
+        return OpfResult(True, False, text, str(exc))
+    finally:
+        try:
+            os.unlink(path)
+        except OSError:
+            pass
+    if proc.returncode != 0:
+        error = (proc.stderr or proc.stdout or "opf failed").strip()
+        return OpfResult(True, False, text, error)
+    redacted = proc.stdout
+    return OpfResult(True, redacted != text, redacted, "")

content_guard/engine.py ADDED Viewed

@@ -0,0 +1,230 @@
+from __future__ import annotations
+import bisect
+import re
+from .detectors.opf import run_opf
+from .policy import Policy
+from .types import Finding, GuardResult, Rule, ScanOptions, TextEdit
+# Matches an inline directive of the form "content-guard: allow <token>";
+# group 1 captures the token, which is a rule id or the literal "all".
+ALLOW_RE = re.compile(r"content-guard:\s*allow\s+([A-Za-z0-9_.:-]+|all)")
+def scan_text(text: str, policy: Policy | None = None, options: ScanOptions | None = None) -> GuardResult:
+    """Scan *text* against the policy rules and build a redacted copy.
+
+    Each rule's regex is applied to the whole text. A match is dropped when it
+    is empty, when it overlaps a skipped range, or when it overlaps a match
+    accepted earlier. A finding on a line covered by an allow directive gets
+    the action ``"allow"``; otherwise the policy chooses the action. When OPF
+    is enabled by the options or the policy, the external backend is run as
+    well and contributes its own finding.
+
+    Args:
+        text: Content to scan.
+        policy: Policy to apply; a default ``Policy()`` when omitted.
+        options: Scan options; a default ``ScanOptions()`` when omitted.
+
+    Returns:
+        A ``GuardResult`` holding the original text, the redacted text and the
+        findings sorted by line, column and rule id.
+    """
+    active_policy = policy or Policy()
+    active_options = options or ScanOptions()
+    line_starts = _line_starts(text)
+    skipped_ranges = _skipped_ranges(text, active_options)
+    allow_by_line = _allow_comments_by_line(text) if active_options.honor_allow_comments else {}
+    findings: list[Finding] = []
+    # Spans already claimed by an accepted match; later matches may not overlap them.
+    occupied: list[tuple[int, int]] = []
+    for rule in active_policy.all_rules():
+        regex = re.compile(rule.pattern, rule.flags)
+        for match in regex.finditer(text):
+            start, end = match.span()
+            if start == end:
+                # Zero-width matches carry nothing to report or redact.
+                continue
+            if _inside_ranges(start, end, skipped_ranges):
+                continue
+            if _overlaps(start, end, occupied):
+                continue
+            line = _line_for_offset(line_starts, start)
+            allowed_by = _allowed_by(rule.id, line, allow_by_line)
+            action = "allow" if allowed_by else active_policy.action_for(rule)
+            findings.append(
+                Finding(
+                    rule_id=rule.id,
+                    category=rule.category,
+                    action=action,
+                    match=match.group(0),
+                    replacement=rule.replacement,
+                    line=line,
+                    # 1-based column relative to the start of the line.
+                    column=start - line_starts[line - 1] + 1,
+                    start=start,
+                    end=end,
+                    source="regex",
+                    message=rule.description,
+                    allowed_by=allowed_by,
+                )
+            )
+            occupied.append((start, end))
+    redacted = _apply_edits(text, _edits_for(findings))
+    # Command-line options take precedence; the policy supplies the fallbacks.
+    include_opf = active_options.include_opf or active_policy.opf_backend.enabled
+    opf_device = active_options.opf_device or active_policy.opf_backend.device
+    opf_bin = active_options.opf_bin or active_policy.opf_backend.bin
+    if include_opf:
+        # Synthetic rule used only to look up the policy action for OPF findings.
+        opf_rule = Rule(
+            id="opf-pii",
+            category="pii",
+            pattern="",
+            replacement="<PRIVATE_DATA>",
+            description="OPF changed the text, indicating model-detected PII.",
+        )
+        # First pass runs on the original text, purely to detect a change.
+        opf_result = run_opf(
+            text,
+            opf_bin=opf_bin,
+            device=opf_device,
+        )
+        if opf_result.changed:
+            action = active_policy.action_for(opf_rule)
+            # The OPF finding has no span (start == end == 0); it is reported
+            # at line 1, column 1.
+            findings.append(
+                Finding(
+                    rule_id=opf_rule.id,
+                    category=opf_rule.category,
+                    action=action,
+                    match="<OPF_DETECTED_PII>",
+                    replacement="<PRIVATE_DATA>",
+                    line=1,
+                    column=1,
+                    start=0,
+                    end=0,
+                    source="opf",
+                    message="OPF redacted one or more spans.",
+                )
+            )
+            if action in {"redact", "block"}:
+                # Second pass runs on the regex-redacted text so both kinds of
+                # redaction end up in the final output.
+                redacted = run_opf(
+                    redacted,
+                    opf_bin=opf_bin,
+                    device=opf_device,
+                ).redacted_text
+        elif opf_result.available and opf_result.error:
+            # The binary exists but the run failed: report it as a warning.
+            findings.append(
+                Finding(
+                    rule_id="opf-error",
+                    category="tooling",
+                    action="warn",
+                    match="opf",
+                    replacement="",
+                    line=1,
+                    column=1,
+                    start=0,
+                    end=0,
+                    source="opf",
+                    message=opf_result.error,
+                )
+            )
+        elif not opf_result.available:
+            # OPF was requested but the binary could not be used.
+            findings.append(
+                Finding(
+                    rule_id="opf-unavailable",
+                    category="tooling",
+                    action="warn",
+                    match="opf",
+                    replacement="",
+                    line=1,
+                    column=1,
+                    start=0,
+                    end=0,
+                    source="opf",
+                    message=opf_result.error,
+                )
+            )
+    findings.sort(key=lambda item: (item.line, item.column, item.rule_id))
+    return GuardResult(text=text, redacted_text=redacted, findings=findings)
+def redact_text(text: str, policy: Policy | None = None, options: ScanOptions | None = None) -> str:
+    return scan_text(text, policy=policy, options=options).redacted_text
+def _line_starts(text: str) -> list[int]:
+    starts = [0]
+    for match in re.finditer("\n", text):
+        starts.append(match.end())
+    return starts
+def _line_for_offset(starts: list[int], offset: int) -> int:
+    return bisect.bisect_right(starts, offset)
+def _allow_comments_by_line(text: str) -> dict[int, set[str]]:
+    allowed: dict[int, set[str]] = {}
+    for line_no, line in enumerate(text.splitlines(), 1):
+        match = ALLOW_RE.search(line)
+        if not match:
+            continue
+        token = match.group(1)
+        allowed.setdefault(line_no, set()).add(token)
+        allowed.setdefault(line_no + 1, set()).add(token)
+    return allowed
+def _allowed_by(rule_id: str, line: int, allow_by_line: dict[int, set[str]]) -> str | None:
+    tokens = allow_by_line.get(line, set())
+    if "all" in tokens:
+        return "all"
+    if rule_id in tokens:
+        return rule_id
+    return None
+def _skipped_ranges(text: str, options: ScanOptions) -> list[tuple[int, int]]:
+    ranges: list[tuple[int, int]] = []
+    lines = text.splitlines(keepends=True)
+    offset = 0
+    if not options.scan_frontmatter and lines and lines[0].strip() == "---":
+        end = len(lines[0])
+        for line in lines[1:]:
+            end += len(line)
+            if line.strip() == "---":
+                ranges.append((0, end))
+                break
+    if not options.scan_code_blocks:
+        in_fence = False
+        fence_start = 0
+        current = 0
+        for line in lines:
+            stripped = line.lstrip()
+            if stripped.startswith("```") or stripped.startswith("~~~"):
+                if not in_fence:
+                    in_fence = True
+                    fence_start = current
+                else:
+                    ranges.append((fence_start, current + len(line)))
+                    in_fence = False
+            current += len(line)
+        if in_fence:
+            ranges.append((fence_start, len(text)))
+    current = 0
+    for line in lines:
+        if "content-guard:" in line:
+            ranges.append((current, current + len(line)))
+        current += len(line)
+    return ranges
+def _inside_ranges(start: int, end: int, ranges: list[tuple[int, int]]) -> bool:
+    return any(start < range_end and end > range_start for range_start, range_end in ranges)
+def _overlaps(start: int, end: int, occupied: list[tuple[int, int]]) -> bool:
+    return any(start < prev_end and end > prev_start for prev_start, prev_end in occupied)
+def _edits_for(findings: list[Finding]) -> list[TextEdit]:
+    return [
+        TextEdit(finding.start, finding.end, finding.replacement)
+        for finding in findings
+        if finding.redacts and finding.start < finding.end
+    ]
+def _apply_edits(text: str, edits: list[TextEdit]) -> str:
+    result = text
+    for edit in sorted(edits, key=lambda item: item.start, reverse=True):
+        result = result[: edit.start] + edit.replacement + result[edit.end :]
+    return result

content_guard/git_commits.py ADDED Viewed

@@ -0,0 +1,145 @@
+from __future__ import annotations
+import argparse
+import json
+import subprocess
+import sys
+from .engine import scan_text
+from .policy import Policy, load_policy
+from .report import to_text
+def main(argv: list[str] | None = None) -> int:
+    """Scan Git commit messages and report findings.
+
+    Args:
+        argv: Arguments to parse; ``None`` lets argparse fall back to
+            ``sys.argv``.
+
+    Returns:
+        1 when any scanned commit message is blocked, otherwise 0.
+    """
+    parser = argparse.ArgumentParser(
+        prog="content-guard-commits",
+        description="Scan Git commit messages before publishing or pushing.",
+    )
+    parser.add_argument("--policy", help="JSON policy file")
+    parser.add_argument("--range", dest="rev_range", help="revision range to scan, for example origin/main..HEAD")
+    parser.add_argument("--all", action="store_true", help="scan all reachable commits")
+    parser.add_argument("--json", action="store_true", help="emit JSON report")
+    args = parser.parse_args(argv)
+    # Without --policy the built-in commit policy is used.
+    policy = load_policy(args.policy) if args.policy else _default_commit_policy()
+    revs = _commit_revs(args)
+    # Only commits with at least one finding are kept for the report.
+    results = []
+    blocked = False
+    for rev in revs:
+        # %B is the raw commit message body (subject included).
+        message = _git(["log", "-1", "--format=%B", rev])
+        result = scan_text(message, policy=policy)
+        if result.findings:
+            blocked = blocked or result.blocked
+            # The subject is taken from the redacted text so the report itself
+            # does not repeat what was just redacted.
+            results.append((rev, _subject(result.redacted_text), result))
+    if args.json:
+        print(
+            json.dumps(
+                {
+                    "ok": not blocked,
+                    "blocked": blocked,
+                    "commits_scanned": len(revs),
+                    "commits_with_findings": len(results),
+                    "commits": [
+                        {
+                            "commit": rev,
+                            "subject": subject,
+                            "blocked": result.blocked,
+                            "changed": result.changed,
+                            "counts_by_action": result.counts_by_action(),
+                            "counts_by_category": result.counts_by_category(),
+                            # The matched text is left out of the JSON report.
+                            "findings": [
+                                {
+                                    "rule_id": finding.rule_id,
+                                    "category": finding.category,
+                                    "action": finding.action,
+                                    "line": finding.line,
+                                    "column": finding.column,
+                                    "source": finding.source,
+                                    "message": finding.message,
+                                }
+                                for finding in result.findings
+                            ],
+                        }
+                        for rev, subject, result in results
+                    ],
+                },
+                indent=2,
+                sort_keys=True,
+            )
+        )
+    elif not results:
+        print(f"Clean. {len(revs)} commit message(s) checked.")
+    else:
+        for rev, subject, result in results:
+            # Label each report with a 12-character abbreviated hash.
+            label = f"commit {rev[:12]} {subject}".strip()
+            print(to_text(result, path=label))
+    return 1 if blocked else 0
+def _default_commit_policy() -> Policy:
+    return Policy(
+        name="public-commit-default",
+        defaults={
+            "infrastructure": "block",
+            "secret": "block",
+            "pii": "block",
+            "personal": "block",
+            "business": "block",
+            "attribution": "block",
+            "tooling": "warn",
+        },
+        rules={"opf-pii": "warn"},
+    )
+def _commit_revs(args: argparse.Namespace) -> list[str]:
+    if args.all:
+        cmd = ["rev-list", "--reverse", "--all"]
+    else:
+        if not args.rev_range and not _has_head():
+            return []
+        rev_range = args.rev_range or _default_range()
+        cmd = ["rev-list", "--reverse", rev_range]
+    output = _git(cmd)
+    return [line for line in output.splitlines() if line.strip()]
+def _default_range() -> str:
+    proc = subprocess.run(
+        ["git", "rev-parse", "--abbrev-ref", "--symbolic-full-name", "@{upstream}"],
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    upstream = proc.stdout.strip()
+    if proc.returncode == 0 and upstream:
+        return f"{upstream}..HEAD"
+    return "HEAD"
+def _has_head() -> bool:
+    proc = subprocess.run(["git", "rev-parse", "--verify", "HEAD"], capture_output=True, text=True, check=False)
+    return proc.returncode == 0
+def _git(args: list[str]) -> str:
+    proc = subprocess.run(["git", *args], capture_output=True, text=True, check=False)
+    if proc.returncode != 0:
+        print((proc.stderr or proc.stdout or "git command failed").strip(), file=sys.stderr)
+        raise SystemExit(2)
+    return proc.stdout
+def _subject(message: str) -> str:
+    for line in message.splitlines():
+        if line.strip():
+            return line.strip()
+    return "(empty subject)"
+# Allow running this module directly as a script; main()'s integer return
+# value is handed to SystemExit and so becomes the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())