content-guard 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. content_guard-0.1.1/PKG-INFO +188 -0
  2. content_guard-0.1.1/README.md +176 -0
  3. content_guard-0.1.1/pyproject.toml +33 -0
  4. content_guard-0.1.1/setup.cfg +4 -0
  5. content_guard-0.1.1/src/content_guard/__init__.py +6 -0
  6. content_guard-0.1.1/src/content_guard/__main__.py +4 -0
  7. content_guard-0.1.1/src/content_guard/cli.py +176 -0
  8. content_guard-0.1.1/src/content_guard/detectors/__init__.py +1 -0
  9. content_guard-0.1.1/src/content_guard/detectors/opf.py +52 -0
  10. content_guard-0.1.1/src/content_guard/engine.py +230 -0
  11. content_guard-0.1.1/src/content_guard/git_commits.py +145 -0
  12. content_guard-0.1.1/src/content_guard/git_scan.py +123 -0
  13. content_guard-0.1.1/src/content_guard/n8n_advisory.py +95 -0
  14. content_guard-0.1.1/src/content_guard/n8n_validate.py +153 -0
  15. content_guard-0.1.1/src/content_guard/policies/openclaw-message.json +32 -0
  16. content_guard-0.1.1/src/content_guard/policies/pr-draft.json +23 -0
  17. content_guard-0.1.1/src/content_guard/policies/public-content.json +25 -0
  18. content_guard-0.1.1/src/content_guard/policies/public-repo.json +36 -0
  19. content_guard-0.1.1/src/content_guard/policy.py +168 -0
  20. content_guard-0.1.1/src/content_guard/pr_draft.py +73 -0
  21. content_guard-0.1.1/src/content_guard/pr_prepare.py +131 -0
  22. content_guard-0.1.1/src/content_guard/publish_check.py +257 -0
  23. content_guard-0.1.1/src/content_guard/report.py +39 -0
  24. content_guard-0.1.1/src/content_guard/rules.py +107 -0
  25. content_guard-0.1.1/src/content_guard/types.py +84 -0
  26. content_guard-0.1.1/src/content_guard.egg-info/PKG-INFO +188 -0
  27. content_guard-0.1.1/src/content_guard.egg-info/SOURCES.txt +31 -0
  28. content_guard-0.1.1/src/content_guard.egg-info/dependency_links.txt +1 -0
  29. content_guard-0.1.1/src/content_guard.egg-info/entry_points.txt +9 -0
  30. content_guard-0.1.1/src/content_guard.egg-info/top_level.txt +1 -0
  31. content_guard-0.1.1/tests/test_cli.py +569 -0
  32. content_guard-0.1.1/tests/test_engine.py +137 -0
  33. content_guard-0.1.1/tests/test_rules.py +162 -0
@@ -0,0 +1,188 @@
1
+ Metadata-Version: 2.4
2
+ Name: content-guard
3
+ Version: 0.1.1
4
+ Summary: Policy-driven content scanning and redaction for public publishing and agent output.
5
+ Author: Solomon Neas
6
+ License: Apache-2.0
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Programming Language :: Python :: 3.11
9
+ Classifier: License :: OSI Approved :: Apache Software License
10
+ Requires-Python: >=3.11
11
+ Description-Content-Type: text/markdown
12
+
13
+ <p align="center">
14
+ <img src="docs/assets/content-guard-banner.jpg" alt="Content Guard banner">
15
+ </p>
16
+
17
+ <h1 align="center">Content Guard</h1>
18
+
19
+ <p align="center">
20
+ <strong>Policy-driven scanning and redaction for public content, publishing pipelines, and agent output.</strong>
21
+ </p>
22
+
23
+ <p align="center">
24
+ <img src="https://img.shields.io/badge/python-3.11%2B-3776AB?style=for-the-badge&logo=python&logoColor=white" alt="Python 3.11+">
25
+ <img src="https://img.shields.io/badge/license-Apache--2.0-blue?style=for-the-badge" alt="Apache-2.0 license">
26
+ <img src="https://img.shields.io/badge/dependencies-zero_required-2ea44f?style=for-the-badge" alt="Zero required third-party dependencies">
27
+ <img src="https://img.shields.io/badge/OPF-optional-8A2BE2?style=for-the-badge" alt="Optional OPF backend">
28
+ <img src="https://img.shields.io/badge/markdown-aware-083344?style=for-the-badge&logo=markdown&logoColor=white" alt="Markdown aware">
29
+ </p>
30
+
31
+ Content Guard keeps private infrastructure, secrets, and personal context out of public surfaces before they ship. It is built for Markdown docs, PR bodies, social drafts, generated agent output, and automation pipelines where one sloppy paste can leak more than intended.
32
+
33
+ It takes the practical parts of the local content scrubber and the useful model-backed idea behind Privacy Filter, then turns them into one maintainable system.
34
+
35
+ ## What It Checks
36
+
37
+ - Deterministic rules for infrastructure, secrets, and high-confidence patterns
38
+ - Optional OPF backend for model-based PII review and redaction
39
+ - Custom policy files for private names, internal projects, unreleased plans, and environment-specific rules
40
+ - Blocking, warning, redaction, and allow decisions from one report format
41
+ - Markdown-aware scanning with frontmatter and allow-comment support
42
+
43
+ The core package has no required third-party dependencies. OPF is optional and runs through its CLI when available.
44
+
45
+ ## Quick Start
46
+
47
+ Install from a local clone:
48
+
49
+ ```bash
50
+ python -m pip install -e .
51
+ ```
52
+
53
+ Scan or redact a file:
54
+
55
+ ```bash
56
+ content-guard scan examples/sample.md --policy policies/public-content.json
57
+ content-guard redact examples/sample.md --policy policies/public-content.json
58
+ content-guard scan examples/sample.md --json
59
+ content-guard scan examples/ --policy policies/public-content.json
60
+ ```
61
+
62
+ Use OPF if it is installed locally:
63
+
64
+ ```bash
65
+ content-guard redact examples/sample.md --opf
66
+ ```
67
+
68
+ By default, `--opf` looks for `~/.opf-venv/bin/opf`. Override it with:
69
+
70
+ ```bash
71
+ CONTENT_GUARD_OPF_BIN=/path/to/opf content-guard scan file.md --opf
72
+ ```
73
+
74
+ OPF can also be enabled from a policy file:
75
+
76
+ ```json
77
+ {
78
+ "backends": {
79
+ "opf": {
80
+ "enabled": true,
81
+ "action": "warn",
82
+ "device": "cpu"
83
+ }
84
+ }
85
+ }
86
+ ```
87
+
88
+ ## Policies
89
+
90
+ Policies are JSON so the project stays dependency-free. A policy can set default actions by category, override individual rules, and add private custom regex rules.
91
+
92
+ ```json
93
+ {
94
+ "name": "public-content",
95
+ "defaults": {
96
+ "infrastructure": "block",
97
+ "secret": "block",
98
+ "pii": "warn"
99
+ },
100
+ "rules": {
101
+ "email": "warn"
102
+ },
103
+ "custom_rules": [
104
+ {
105
+ "id": "internal-hostname-example",
106
+ "category": "infrastructure",
107
+ "pattern": "\\binternal-host\\b",
108
+ "replacement": "[redacted-host]"
109
+ }
110
+ ]
111
+ }
112
+ ```
113
+
114
+ Actions:
115
+
116
+ - `block`: fail the scan, usually for publish gates
117
+ - `redact`: rewrite matching content
118
+ - `warn`: report without failing
119
+ - `allow`: ignore matching findings
120
+
121
+ ### Bundled Policies
122
+
123
+ Two bundled policies share the `infrastructure` category but treat it differently on purpose:
124
+
125
+ - `policies/public-repo.json`: for technical docs repos. It keeps `private-ipv4` (RFC 1918), secrets, PII, and `Co-authored-by` trailers as hard blocks, but downgrades `loopback-ipv4` (127.x), `localhost-port`, `localhost-bare`, and `port-reference` to warnings. README and CONTRIBUTING files often need to discuss `localhost`, named ports, and `127.0.0.1` for setup instructions. See [policies/public-repo.md](policies/public-repo.md) for the long-form rationale.
126
+ - `policies/public-content.json`: for blog posts and social drafts. It keeps the full infrastructure category at block because marketing surfaces have a higher leak risk and should not expose internal addresses or named ports.
127
+
128
+ ## Allow Comments
129
+
130
+ Use a local allow comment on the same line or directly above a line:
131
+
132
+ ```md
133
+ <!-- content-guard: allow localhost-bare -->
134
+ This tutorial uses localhost as an example.
135
+ ```
136
+
137
+ Use `content-guard: allow all` sparingly for examples where every finding is intentional.
138
+
139
+ ## PR and Git Guards
140
+
141
+ PR bodies and public repository content are publishing boundaries too. Use stricter policies before copying generated summaries, dogfood notes, local test output, fixtures, or docs into public GitHub surfaces:
142
+
143
+ ```bash
144
+ content-guard scan examples/pr-body.md --policy policies/pr-draft.json
145
+ content-guard diff examples/pr-body.md --policy policies/pr-draft.json
146
+ content-guard-pr examples/pr-body.md
147
+ content-guard-pr-prepare examples/pr-body.md --json
148
+ content-guard-publish-check --pr-body examples/pr-body.md --json
149
+ content-guard-n8n-advisory < payload.json
150
+ content-guard-n8n-validate --json
151
+ content-guard-git --policy policies/public-repo.json
152
+ content-guard-git --all-tracked --policy policies/public-repo.json
153
+ content-guard-commits --range origin/main..HEAD --policy policies/public-repo.json
154
+ ```
155
+
156
+ See [docs/PR_DRAFTS.md](docs/PR_DRAFTS.md) and [docs/GIT_PUBLIC_REPO_GUARD.md](docs/GIT_PUBLIC_REPO_GUARD.md).
157
+
158
+ Use `content-guard-publish-check` as the practical local pre-publish wrapper. It prepares a sanitized PR body when `--pr-body` is provided, scans staged files, scans commit messages, and can optionally scan all tracked files:
159
+
160
+ ```bash
161
+ content-guard-publish-check --pr-body pr-body.md --json
162
+ content-guard-publish-check --pr-body pr-body.md --all-tracked
163
+ ```
164
+
165
+ PR body findings are advisory by default because the wrapper writes a sanitized body and prints `publish_body_file`. Staged file, commit message, and optional all-tracked blockers fail the command unless `--advisory-only` is set.
166
+
167
+ Use `content-guard-pr-prepare` when a later PR publishing step needs a stable sanitized body path:
168
+
169
+ ```bash
170
+ content-guard-pr-prepare pr-body.md
171
+ gh pr create --body-file .content-guard/pr-drafts/pr-body.public.md
172
+ ```
173
+
174
+ For local run-alongside testing against the legacy scrubber, see [docs/DOGFOOD_TEST_REPO.md](docs/DOGFOOD_TEST_REPO.md).
175
+
176
+ For n8n publish workflows, start with an advisory step that reports findings without mutating live publishes. See [docs/N8N_ADVISORY.md](docs/N8N_ADVISORY.md) and [docs/N8N_WORKFLOW_RECIPE.md](docs/N8N_WORKFLOW_RECIPE.md). Validate cloned workflow wiring with [docs/N8N_VALIDATION_PACK.md](docs/N8N_VALIDATION_PACK.md).
177
+
178
+ ## OpenClaw Plugin
179
+
180
+ Content Guard can also run as an OpenClaw outbound message plugin. The plugin lives in `openclaw-plugin/` and shells out to the same Python engine, so OpenClaw messages use the same policy model as publish gates.
181
+
182
+ See [docs/OPENCLAW_PLUGIN.md](docs/OPENCLAW_PLUGIN.md).
183
+
184
+ ## Design Notes
185
+
186
+ Privacy Filter influenced the optional model-backed PII layer, especially the idea that some personal data detection benefits from context. Content Guard does not copy Privacy Filter code. OPF integration is a subprocess adapter so the deterministic engine remains portable and maintainable.
187
+
188
+ The deterministic rules are intentionally conservative. Public publishing should fail loudly on infrastructure and secret leakage, while model findings are better treated as review signals until a local policy proves they are reliable enough to block.
@@ -0,0 +1,176 @@
1
+ <p align="center">
2
+ <img src="docs/assets/content-guard-banner.jpg" alt="Content Guard banner">
3
+ </p>
4
+
5
+ <h1 align="center">Content Guard</h1>
6
+
7
+ <p align="center">
8
+ <strong>Policy-driven scanning and redaction for public content, publishing pipelines, and agent output.</strong>
9
+ </p>
10
+
11
+ <p align="center">
12
+ <img src="https://img.shields.io/badge/python-3.11%2B-3776AB?style=for-the-badge&logo=python&logoColor=white" alt="Python 3.11+">
13
+ <img src="https://img.shields.io/badge/license-Apache--2.0-blue?style=for-the-badge" alt="Apache-2.0 license">
14
+ <img src="https://img.shields.io/badge/dependencies-zero_required-2ea44f?style=for-the-badge" alt="Zero required third-party dependencies">
15
+ <img src="https://img.shields.io/badge/OPF-optional-8A2BE2?style=for-the-badge" alt="Optional OPF backend">
16
+ <img src="https://img.shields.io/badge/markdown-aware-083344?style=for-the-badge&logo=markdown&logoColor=white" alt="Markdown aware">
17
+ </p>
18
+
19
+ Content Guard keeps private infrastructure, secrets, and personal context out of public surfaces before they ship. It is built for Markdown docs, PR bodies, social drafts, generated agent output, and automation pipelines where one sloppy paste can leak more than intended.
20
+
21
+ It takes the practical parts of the local content scrubber and the useful model-backed idea behind Privacy Filter, then turns them into one maintainable system.
22
+
23
+ ## What It Checks
24
+
25
+ - Deterministic rules for infrastructure, secrets, and high-confidence patterns
26
+ - Optional OPF backend for model-based PII review and redaction
27
+ - Custom policy files for private names, internal projects, unreleased plans, and environment-specific rules
28
+ - Blocking, warning, redaction, and allow decisions from one report format
29
+ - Markdown-aware scanning with frontmatter and allow-comment support
30
+
31
+ The core package has no required third-party dependencies. OPF is optional and runs through its CLI when available.
32
+
33
+ ## Quick Start
34
+
35
+ Install from a local clone:
36
+
37
+ ```bash
38
+ python -m pip install -e .
39
+ ```
40
+
41
+ Scan or redact a file:
42
+
43
+ ```bash
44
+ content-guard scan examples/sample.md --policy policies/public-content.json
45
+ content-guard redact examples/sample.md --policy policies/public-content.json
46
+ content-guard scan examples/sample.md --json
47
+ content-guard scan examples/ --policy policies/public-content.json
48
+ ```
49
+
50
+ Use OPF if it is installed locally:
51
+
52
+ ```bash
53
+ content-guard redact examples/sample.md --opf
54
+ ```
55
+
56
+ By default, `--opf` looks for `~/.opf-venv/bin/opf`. Override it with:
57
+
58
+ ```bash
59
+ CONTENT_GUARD_OPF_BIN=/path/to/opf content-guard scan file.md --opf
60
+ ```
61
+
62
+ OPF can also be enabled from a policy file:
63
+
64
+ ```json
65
+ {
66
+ "backends": {
67
+ "opf": {
68
+ "enabled": true,
69
+ "action": "warn",
70
+ "device": "cpu"
71
+ }
72
+ }
73
+ }
74
+ ```
75
+
76
+ ## Policies
77
+
78
+ Policies are JSON so the project stays dependency-free. A policy can set default actions by category, override individual rules, and add private custom regex rules.
79
+
80
+ ```json
81
+ {
82
+ "name": "public-content",
83
+ "defaults": {
84
+ "infrastructure": "block",
85
+ "secret": "block",
86
+ "pii": "warn"
87
+ },
88
+ "rules": {
89
+ "email": "warn"
90
+ },
91
+ "custom_rules": [
92
+ {
93
+ "id": "internal-hostname-example",
94
+ "category": "infrastructure",
95
+ "pattern": "\\binternal-host\\b",
96
+ "replacement": "[redacted-host]"
97
+ }
98
+ ]
99
+ }
100
+ ```
101
+
102
+ Actions:
103
+
104
+ - `block`: fail the scan, usually for publish gates
105
+ - `redact`: rewrite matching content
106
+ - `warn`: report without failing
107
+ - `allow`: ignore matching findings
108
+
109
+ ### Bundled Policies
110
+
111
+ Two bundled policies share the `infrastructure` category but treat it differently on purpose:
112
+
113
+ - `policies/public-repo.json`: for technical docs repos. It keeps `private-ipv4` (RFC 1918), secrets, PII, and `Co-authored-by` trailers as hard blocks, but downgrades `loopback-ipv4` (127.x), `localhost-port`, `localhost-bare`, and `port-reference` to warnings. README and CONTRIBUTING files often need to discuss `localhost`, named ports, and `127.0.0.1` for setup instructions. See [policies/public-repo.md](policies/public-repo.md) for the long-form rationale.
114
+ - `policies/public-content.json`: for blog posts and social drafts. It keeps the full infrastructure category at block because marketing surfaces have a higher leak risk and should not expose internal addresses or named ports.
115
+
116
+ ## Allow Comments
117
+
118
+ Use a local allow comment on the same line or directly above a line:
119
+
120
+ ```md
121
+ <!-- content-guard: allow localhost-bare -->
122
+ This tutorial uses localhost as an example.
123
+ ```
124
+
125
+ Use `content-guard: allow all` sparingly for examples where every finding is intentional.
126
+
127
+ ## PR and Git Guards
128
+
129
+ PR bodies and public repository content are publishing boundaries too. Use stricter policies before copying generated summaries, dogfood notes, local test output, fixtures, or docs into public GitHub surfaces:
130
+
131
+ ```bash
132
+ content-guard scan examples/pr-body.md --policy policies/pr-draft.json
133
+ content-guard diff examples/pr-body.md --policy policies/pr-draft.json
134
+ content-guard-pr examples/pr-body.md
135
+ content-guard-pr-prepare examples/pr-body.md --json
136
+ content-guard-publish-check --pr-body examples/pr-body.md --json
137
+ content-guard-n8n-advisory < payload.json
138
+ content-guard-n8n-validate --json
139
+ content-guard-git --policy policies/public-repo.json
140
+ content-guard-git --all-tracked --policy policies/public-repo.json
141
+ content-guard-commits --range origin/main..HEAD --policy policies/public-repo.json
142
+ ```
143
+
144
+ See [docs/PR_DRAFTS.md](docs/PR_DRAFTS.md) and [docs/GIT_PUBLIC_REPO_GUARD.md](docs/GIT_PUBLIC_REPO_GUARD.md).
145
+
146
+ Use `content-guard-publish-check` as the practical local pre-publish wrapper. It prepares a sanitized PR body when `--pr-body` is provided, scans staged files, scans commit messages, and can optionally scan all tracked files:
147
+
148
+ ```bash
149
+ content-guard-publish-check --pr-body pr-body.md --json
150
+ content-guard-publish-check --pr-body pr-body.md --all-tracked
151
+ ```
152
+
153
+ PR body findings are advisory by default because the wrapper writes a sanitized body and prints `publish_body_file`. Staged file, commit message, and optional all-tracked blockers fail the command unless `--advisory-only` is set.
154
+
155
+ Use `content-guard-pr-prepare` when a later PR publishing step needs a stable sanitized body path:
156
+
157
+ ```bash
158
+ content-guard-pr-prepare pr-body.md
159
+ gh pr create --body-file .content-guard/pr-drafts/pr-body.public.md
160
+ ```
161
+
162
+ For local run-alongside testing against the legacy scrubber, see [docs/DOGFOOD_TEST_REPO.md](docs/DOGFOOD_TEST_REPO.md).
163
+
164
+ For n8n publish workflows, start with an advisory step that reports findings without mutating live publishes. See [docs/N8N_ADVISORY.md](docs/N8N_ADVISORY.md) and [docs/N8N_WORKFLOW_RECIPE.md](docs/N8N_WORKFLOW_RECIPE.md). Validate cloned workflow wiring with [docs/N8N_VALIDATION_PACK.md](docs/N8N_VALIDATION_PACK.md).
165
+
166
+ ## OpenClaw Plugin
167
+
168
+ Content Guard can also run as an OpenClaw outbound message plugin. The plugin lives in `openclaw-plugin/` and shells out to the same Python engine, so OpenClaw messages use the same policy model as publish gates.
169
+
170
+ See [docs/OPENCLAW_PLUGIN.md](docs/OPENCLAW_PLUGIN.md).
171
+
172
+ ## Design Notes
173
+
174
+ Privacy Filter influenced the optional model-backed PII layer, especially the idea that some personal data detection benefits from context. Content Guard does not copy Privacy Filter code. OPF integration is a subprocess adapter so the deterministic engine remains portable and maintainable.
175
+
176
+ The deterministic rules are intentionally conservative. Public publishing should fail loudly on infrastructure and secret leakage, while model findings are better treated as review signals until a local policy proves they are reliable enough to block.
@@ -0,0 +1,33 @@
1
+ [build-system]
2
+ requires = ["setuptools>=69"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "content-guard"
7
+ version = "0.1.1"
8
+ description = "Policy-driven content scanning and redaction for public publishing and agent output."
9
+ readme = "README.md"
10
+ requires-python = ">=3.11"
11
+ license = { text = "Apache-2.0" }
12
+ authors = [{ name = "Solomon Neas" }]
13
+ classifiers = [
14
+ "Programming Language :: Python :: 3",
15
+ "Programming Language :: Python :: 3.11",
16
+ "License :: OSI Approved :: Apache Software License",
17
+ ]
18
+
19
+ [project.scripts]
20
+ content-guard = "content_guard.cli:main"
21
+ content-guard-git = "content_guard.git_scan:main"
22
+ content-guard-commits = "content_guard.git_commits:main"
23
+ content-guard-publish-check = "content_guard.publish_check:main"
24
+ content-guard-n8n-advisory = "content_guard.n8n_advisory:main"
25
+ content-guard-n8n-validate = "content_guard.n8n_validate:main"
26
+ content-guard-pr = "content_guard.pr_draft:main"
27
+ content-guard-pr-prepare = "content_guard.pr_prepare:main"
28
+
29
+ [tool.setuptools.packages.find]
30
+ where = ["src"]
31
+
32
+ [tool.setuptools.package-data]
33
+ content_guard = ["policies/*.json"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,6 @@
1
+ """Policy-driven content scanning and redaction."""
2
+
3
+ from .engine import GuardResult, scan_text, redact_text
4
+ from .policy import Policy, load_policy
5
+
6
+ __all__ = ["GuardResult", "Policy", "load_policy", "scan_text", "redact_text"]
@@ -0,0 +1,4 @@
1
+ from .cli import main
2
+
3
+ if __name__ == "__main__":
4
+ raise SystemExit(main())
@@ -0,0 +1,176 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import difflib
5
+ import json
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ from .engine import scan_text
10
+ from .policy import load_policy
11
+ from .report import to_json, to_payload, to_text
12
+ from .types import ScanOptions
13
+
14
+
15
def main(argv: list[str] | None = None) -> int:
    """Parse the command line and run the selected subcommand.

    Args:
        argv: Arguments to parse. ``None`` is passed straight to
            ``parse_args``, which then falls back to ``sys.argv``.

    Returns:
        The integer status produced by the subcommand handler.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    # One (command name, handler) pair per subcommand registered by
    # build_parser().
    handlers = (
        ("scan", _scan),
        ("redact", _redact),
        ("diff", _diff),
    )
    for command, handler in handlers:
        if args.command == command:
            return handler(args)

    # parser.error() prints usage and exits; the trailing return only keeps
    # the declared ``int`` return type satisfied.
    parser.error(f"unknown command: {args.command}")
    return 2
28
+
29
+
30
def build_parser() -> argparse.ArgumentParser:
    """Build the ``content-guard`` argument parser.

    The parser has three required subcommands (``scan``, ``redact``,
    ``diff``) that share one optional positional ``target`` and a common set
    of flags. ``scan`` additionally accepts ``--json`` and ``redact``
    additionally accepts ``--in-place``.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        prog="content-guard",
        description="Policy-driven content scanning and redaction.",
    )
    subparsers = parser.add_subparsers(dest="command", required=True)

    # Flags every subcommand understands, listed in --help order.
    shared_flags = (
        ("--policy", {"help": "JSON policy file"}),
        ("--opf", {"action": "store_true", "help": "run optional OPF backend"}),
        ("--opf-bin", {"help": "path to opf binary"}),
        ("--opf-device", {"help": "OPF device, default comes from policy or cpu"}),
        ("--scan-frontmatter", {"action": "store_true", "help": "scan YAML frontmatter"}),
        ("--skip-code-blocks", {"action": "store_true", "help": "ignore fenced code blocks"}),
        ("--no-allow-comments", {"action": "store_true", "help": "ignore content-guard allow comments"}),
    )

    commands = {}
    for name in ("scan", "redact", "diff"):
        command = subparsers.add_parser(name)
        command.add_argument("target", nargs="?", help="file to read, or stdin when omitted")
        for flag, spec in shared_flags:
            command.add_argument(flag, **spec)
        commands[name] = command

    # Command-specific extras.
    commands["scan"].add_argument("--json", action="store_true", help="emit JSON report")
    commands["redact"].add_argument("--in-place", action="store_true", help="rewrite the target file")
    return parser
51
+
52
+
53
def _options(args: argparse.Namespace) -> ScanOptions:
    """Translate parsed CLI flags into a ``ScanOptions`` value.

    Args:
        args: Namespace produced by the parser from ``build_parser()``.

    Returns:
        A ``ScanOptions`` populated from the shared subcommand flags.
    """
    # The CLI exposes these two as opt-out flags, while ScanOptions takes the
    # positive form, so both are inverted here.
    scan_code_blocks = not args.skip_code_blocks
    honor_allow_comments = not args.no_allow_comments

    return ScanOptions(
        scan_frontmatter=args.scan_frontmatter,
        scan_code_blocks=scan_code_blocks,
        honor_allow_comments=honor_allow_comments,
        include_opf=args.opf,
        opf_device=args.opf_device,
        opf_bin=args.opf_bin,
    )
62
+
63
+
64
def _read_target(target: str | None) -> tuple[str, str | None]:
    """Load the text for a CLI target.

    Args:
        target: A file path, ``"-"``, an empty string, or ``None``.

    Returns:
        ``(text, path)``. For a file target, ``path`` is the path as a
        string. For standard input (no target or ``"-"``), ``path`` is
        ``None``.
    """
    if target and target != "-":
        source = Path(target)
        return source.read_text(), str(source)
    return sys.stdin.read(), None
69
+
70
+
71
def _scan(args: argparse.Namespace) -> int:
    """Run the ``scan`` subcommand and print a report.

    A directory target is scanned file by file through ``_scan_directory``;
    any other target (file path, ``-`` or nothing) is read as one document.

    Args:
        args: Parsed ``scan`` arguments.

    Returns:
        ``1`` when any scanned result is blocked, otherwise ``0``.
    """
    policy = load_policy(args.policy)
    options = _options(args)
    target = args.target

    is_directory = bool(target) and target != "-" and Path(target).is_dir()
    if not is_directory:
        # Single document: a file path or standard input.
        text, path = _read_target(target)
        result = scan_text(text, policy=policy, options=options)
        report = to_json(result) if args.json else to_text(result, path=path or "<stdin>")
        print(report)
        return 1 if result.blocked else 0

    results = _scan_directory(Path(target), policy, options)
    blocked = any(outcome.blocked for _, outcome in results)
    # Only files with findings are reported; clean files just count toward
    # the total.
    flagged = [(file_path, outcome) for file_path, outcome in results if outcome.findings]

    if args.json:
        summary = {
            "ok": not blocked,
            "blocked": blocked,
            "files_scanned": len(results),
            "files": [
                {"path": str(file_path), **to_payload(outcome)}
                for file_path, outcome in flagged
            ],
        }
        print(json.dumps(summary, indent=2, sort_keys=True))
    elif not flagged:
        print(f"Clean. {len(results)} file(s) checked.")
    else:
        for file_path, outcome in flagged:
            print(to_text(outcome, path=str(file_path)))
    return 1 if blocked else 0
111
+
112
+
113
def _redact(args: argparse.Namespace) -> int:
    """Run the ``redact`` subcommand.

    For a directory target, ``--in-place`` is mandatory and only files whose
    result reports a change are rewritten. For a single target the redacted
    text is written back to the file with ``--in-place``, or to standard
    output without it.

    Args:
        args: Parsed ``redact`` arguments.

    Returns:
        ``2`` on a usage error, ``1`` when any result is blocked,
        otherwise ``0``.
    """
    policy = load_policy(args.policy)
    options = _options(args)
    target = args.target

    if target and target != "-" and Path(target).is_dir():
        if not args.in_place:
            print("directory redact requires --in-place", file=sys.stderr)
            return 2
        results = _scan_directory(Path(target), policy, options)
        for file_path, outcome in results:
            if outcome.changed:
                file_path.write_text(outcome.redacted_text)
        any_blocked = any(outcome.blocked for _, outcome in results)
        return 1 if any_blocked else 0

    text, path = _read_target(target)
    result = scan_text(text, policy=policy, options=options)

    if not args.in_place:
        sys.stdout.write(result.redacted_text)
    elif path:
        Path(path).write_text(result.redacted_text)
    else:
        # Input came from standard input, so there is no file to rewrite.
        print("--in-place requires a file target", file=sys.stderr)
        return 2
    return 1 if result.blocked else 0
139
+
140
+
141
def _diff(args: argparse.Namespace) -> int:
    """Run the ``diff`` subcommand.

    Prints a unified diff between the original text and its redacted form.
    For a directory target, only files whose result reports a change are
    diffed.

    Args:
        args: Parsed ``diff`` arguments.

    Returns:
        ``1`` when any result is blocked, otherwise ``0``.
    """
    policy = load_policy(args.policy)
    options = _options(args)
    target = args.target

    if target and target != "-" and Path(target).is_dir():
        results = _scan_directory(Path(target), policy, options)
        for file_path, outcome in results:
            if outcome.changed:
                _write_diff(outcome.text, outcome.redacted_text, str(file_path))
        any_blocked = any(outcome.blocked for _, outcome in results)
        return 1 if any_blocked else 0

    text, path = _read_target(target)
    result = scan_text(text, policy=policy, options=options)
    _write_diff(text, result.redacted_text, path or "<stdin>")
    return 1 if result.blocked else 0
159
+
160
+
161
def _scan_directory(path: Path, policy, options: ScanOptions):
    """Scan every Markdown file below ``path``.

    Args:
        path: Directory to walk recursively for ``*.md`` entries.
        policy: Policy object forwarded unchanged to ``scan_text``.
        options: Scan options forwarded unchanged to ``scan_text``.

    Returns:
        A list of ``(file_path, result)`` tuples ordered by path, where
        ``result`` is whatever ``scan_text`` returned for that file.
    """
    results = []
    for file_path in sorted(path.rglob("*.md")):
        # rglob() matches on name only, so a directory called e.g.
        # "notes.md" would otherwise reach read_text() and abort the whole
        # scan with IsADirectoryError.
        if not file_path.is_file():
            continue
        text = file_path.read_text()
        results.append((file_path, scan_text(text, policy=policy, options=options)))
    return results
167
+
168
+
169
def _write_diff(text: str, redacted_text: str, source_name: str) -> None:
    """Write a unified diff of ``text`` against ``redacted_text`` to stdout.

    Args:
        text: Original content.
        redacted_text: Content after redaction.
        source_name: Label used for the "from" side of the diff header; the
            "to" side uses the same label followed by ``" (redacted)"``.
    """
    # keepends=True preserves line terminators so each diff line can be
    # written verbatim.
    before = text.splitlines(keepends=True)
    after = redacted_text.splitlines(keepends=True)
    delta = difflib.unified_diff(
        before,
        after,
        fromfile=source_name,
        tofile=f"{source_name} (redacted)",
    )
    for line in delta:
        sys.stdout.write(line)
@@ -0,0 +1 @@
1
+ """Detector backends."""
@@ -0,0 +1,52 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import subprocess
5
+ import tempfile
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+
9
+
10
@dataclass(frozen=True)
class OpfResult:
    """Immutable outcome of one attempt to run the OPF command-line tool."""

    # False only when the opf binary is missing or not executable.
    available: bool
    # True when the text returned by opf differs from the input text.
    changed: bool
    # Output of opf on success; the untouched input text on any failure.
    redacted_text: str
    # Failure description; empty string on success.
    error: str = ""
16
+
17
+
18
def default_opf_bin() -> str:
    """Return the path of the opf executable to use by default.

    A non-empty ``CONTENT_GUARD_OPF_BIN`` environment variable wins;
    otherwise the path is ``.opf-venv/bin/opf`` under the user's home
    directory.
    """
    override = os.environ.get("CONTENT_GUARD_OPF_BIN")
    if override:
        return override
    return str(Path.home().joinpath(".opf-venv", "bin", "opf"))
20
+
21
+
22
def run_opf(text: str, *, opf_bin: str | None = None, device: str = "cpu", timeout: int = 120) -> OpfResult:
    """Run the external ``opf`` tool on ``text`` and collect its output.

    The text is written to a temporary ``.txt`` file and ``opf`` is invoked
    as ``opf --device <device> -f <file>``. The temporary file is removed on
    every path out of this function, including a failed write, so the
    (potentially sensitive) input is not left behind on disk.

    Args:
        text: Content to hand to opf.
        opf_bin: Path to the opf executable; falls back to
            ``default_opf_bin()`` when empty or ``None``.
        device: Value passed to opf's ``--device`` option.
        timeout: Seconds to wait for opf before giving up.

    Returns:
        An ``OpfResult``. ``available`` is ``False`` when the binary is
        missing or not executable. When opf cannot be run, times out or
        exits non-zero, ``redacted_text`` is the original ``text`` and
        ``error`` describes the failure. On success ``redacted_text`` is
        opf's standard output.

    Raises:
        OSError, UnicodeError: If the temporary file cannot be written.
    """
    opf = opf_bin or default_opf_bin()
    if not os.path.exists(opf) or not os.access(opf, os.X_OK):
        return OpfResult(False, False, text, f"opf binary not found: {opf}")

    # delete=False so the file outlives the handle and opf can open it by
    # name; removal is handled by the finally clause below.
    handle = tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False)
    path = handle.name
    try:
        # The write sits inside the try so a failure here still triggers
        # cleanup of the temp file.
        with handle:
            handle.write(text)

        try:
            proc = subprocess.run(
                [opf, "--device", device, "-f", path],
                capture_output=True,
                text=True,
                timeout=timeout,
                check=False,
            )
        except (OSError, subprocess.SubprocessError) as exc:
            return OpfResult(True, False, text, str(exc))
    finally:
        # Best-effort removal: a file that is already gone is not an error.
        try:
            os.unlink(path)
        except OSError:
            pass

    if proc.returncode != 0:
        error = (proc.stderr or proc.stdout or "opf failed").strip()
        return OpfResult(True, False, text, error)

    redacted = proc.stdout
    return OpfResult(True, redacted != text, redacted, "")