content-guard 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- content_guard-0.1.1/PKG-INFO +188 -0
- content_guard-0.1.1/README.md +176 -0
- content_guard-0.1.1/pyproject.toml +33 -0
- content_guard-0.1.1/setup.cfg +4 -0
- content_guard-0.1.1/src/content_guard/__init__.py +6 -0
- content_guard-0.1.1/src/content_guard/__main__.py +4 -0
- content_guard-0.1.1/src/content_guard/cli.py +176 -0
- content_guard-0.1.1/src/content_guard/detectors/__init__.py +1 -0
- content_guard-0.1.1/src/content_guard/detectors/opf.py +52 -0
- content_guard-0.1.1/src/content_guard/engine.py +230 -0
- content_guard-0.1.1/src/content_guard/git_commits.py +145 -0
- content_guard-0.1.1/src/content_guard/git_scan.py +123 -0
- content_guard-0.1.1/src/content_guard/n8n_advisory.py +95 -0
- content_guard-0.1.1/src/content_guard/n8n_validate.py +153 -0
- content_guard-0.1.1/src/content_guard/policies/openclaw-message.json +32 -0
- content_guard-0.1.1/src/content_guard/policies/pr-draft.json +23 -0
- content_guard-0.1.1/src/content_guard/policies/public-content.json +25 -0
- content_guard-0.1.1/src/content_guard/policies/public-repo.json +36 -0
- content_guard-0.1.1/src/content_guard/policy.py +168 -0
- content_guard-0.1.1/src/content_guard/pr_draft.py +73 -0
- content_guard-0.1.1/src/content_guard/pr_prepare.py +131 -0
- content_guard-0.1.1/src/content_guard/publish_check.py +257 -0
- content_guard-0.1.1/src/content_guard/report.py +39 -0
- content_guard-0.1.1/src/content_guard/rules.py +107 -0
- content_guard-0.1.1/src/content_guard/types.py +84 -0
- content_guard-0.1.1/src/content_guard.egg-info/PKG-INFO +188 -0
- content_guard-0.1.1/src/content_guard.egg-info/SOURCES.txt +31 -0
- content_guard-0.1.1/src/content_guard.egg-info/dependency_links.txt +1 -0
- content_guard-0.1.1/src/content_guard.egg-info/entry_points.txt +9 -0
- content_guard-0.1.1/src/content_guard.egg-info/top_level.txt +1 -0
- content_guard-0.1.1/tests/test_cli.py +569 -0
- content_guard-0.1.1/tests/test_engine.py +137 -0
- content_guard-0.1.1/tests/test_rules.py +162 -0
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: content-guard
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Policy-driven content scanning and redaction for public publishing and agent output.
|
|
5
|
+
Author: Solomon Neas
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
9
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
10
|
+
Requires-Python: >=3.11
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
|
|
13
|
+
<p align="center">
|
|
14
|
+
<img src="docs/assets/content-guard-banner.jpg" alt="Content Guard banner">
|
|
15
|
+
</p>
|
|
16
|
+
|
|
17
|
+
<h1 align="center">Content Guard</h1>
|
|
18
|
+
|
|
19
|
+
<p align="center">
|
|
20
|
+
<strong>Policy-driven scanning and redaction for public content, publishing pipelines, and agent output.</strong>
|
|
21
|
+
</p>
|
|
22
|
+
|
|
23
|
+
<p align="center">
|
|
24
|
+
<img src="https://img.shields.io/badge/python-3.11%2B-3776AB?style=for-the-badge&logo=python&logoColor=white" alt="Python 3.11+">
|
|
25
|
+
<img src="https://img.shields.io/badge/license-Apache--2.0-blue?style=for-the-badge" alt="Apache-2.0 license">
|
|
26
|
+
<img src="https://img.shields.io/badge/dependencies-zero_required-2ea44f?style=for-the-badge" alt="Zero required third-party dependencies">
|
|
27
|
+
<img src="https://img.shields.io/badge/OPF-optional-8A2BE2?style=for-the-badge" alt="Optional OPF backend">
|
|
28
|
+
<img src="https://img.shields.io/badge/markdown-aware-083344?style=for-the-badge&logo=markdown&logoColor=white" alt="Markdown aware">
|
|
29
|
+
</p>
|
|
30
|
+
|
|
31
|
+
Content Guard keeps private infrastructure, secrets, and personal context out of public surfaces before they ship. It is built for Markdown docs, PR bodies, social drafts, generated agent output, and automation pipelines where one sloppy paste can leak more than intended.
|
|
32
|
+
|
|
33
|
+
It takes the practical parts of the local content scrubber and the useful model-backed idea behind Privacy Filter, then turns them into one maintainable system.
|
|
34
|
+
|
|
35
|
+
## What It Checks
|
|
36
|
+
|
|
37
|
+
- Deterministic rules for infrastructure, secrets, and high-confidence patterns
|
|
38
|
+
- Optional OPF backend for model-based PII review and redaction
|
|
39
|
+
- Custom policy files for private names, internal projects, unreleased plans, and environment-specific rules
|
|
40
|
+
- Blocking, warning, redaction, and allow decisions from one report format
|
|
41
|
+
- Markdown-aware scanning with frontmatter and allow-comment support
|
|
42
|
+
|
|
43
|
+
The core package has no required third-party dependencies. OPF is optional and runs through its CLI when available.
|
|
44
|
+
|
|
45
|
+
## Quick Start
|
|
46
|
+
|
|
47
|
+
Install from a local clone:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
python -m pip install -e .
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Scan or redact a file:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
content-guard scan examples/sample.md --policy policies/public-content.json
|
|
57
|
+
content-guard redact examples/sample.md --policy policies/public-content.json
|
|
58
|
+
content-guard scan examples/sample.md --json
|
|
59
|
+
content-guard scan examples/ --policy policies/public-content.json
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Use OPF if it is installed locally:
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
content-guard redact examples/sample.md --opf
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
By default, `--opf` looks for `~/.opf-venv/bin/opf`. Override it with:
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
CONTENT_GUARD_OPF_BIN=/path/to/opf content-guard scan file.md --opf
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
OPF can also be enabled from a policy file:
|
|
75
|
+
|
|
76
|
+
```json
|
|
77
|
+
{
|
|
78
|
+
"backends": {
|
|
79
|
+
"opf": {
|
|
80
|
+
"enabled": true,
|
|
81
|
+
"action": "warn",
|
|
82
|
+
"device": "cpu"
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Policies
|
|
89
|
+
|
|
90
|
+
Policies are JSON so the project stays dependency-free. A policy can set default actions by category, override individual rules, and add private custom regex rules.
|
|
91
|
+
|
|
92
|
+
```json
|
|
93
|
+
{
|
|
94
|
+
"name": "public-content",
|
|
95
|
+
"defaults": {
|
|
96
|
+
"infrastructure": "block",
|
|
97
|
+
"secret": "block",
|
|
98
|
+
"pii": "warn"
|
|
99
|
+
},
|
|
100
|
+
"rules": {
|
|
101
|
+
"email": "warn"
|
|
102
|
+
},
|
|
103
|
+
"custom_rules": [
|
|
104
|
+
{
|
|
105
|
+
"id": "internal-hostname-example",
|
|
106
|
+
"category": "infrastructure",
|
|
107
|
+
"pattern": "\\binternal-host\\b",
|
|
108
|
+
"replacement": "[redacted-host]"
|
|
109
|
+
}
|
|
110
|
+
]
|
|
111
|
+
}
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Actions:
|
|
115
|
+
|
|
116
|
+
- `block`: fail the scan, usually for publish gates
|
|
117
|
+
- `redact`: rewrite matching content
|
|
118
|
+
- `warn`: report without failing
|
|
119
|
+
- `allow`: ignore matching findings
|
|
120
|
+
|
|
121
|
+
### Bundled Policies
|
|
122
|
+
|
|
123
|
+
Two bundled policies share the `infrastructure` category but treat it differently on purpose:
|
|
124
|
+
|
|
125
|
+
- `policies/public-repo.json`: for technical docs repos. It keeps `private-ipv4` (RFC 1918), secrets, PII, and `Co-authored-by` trailers as hard blocks, but downgrades `loopback-ipv4` (127.x), `localhost-port`, `localhost-bare`, and `port-reference` to warnings. README and CONTRIBUTING files often need to discuss `localhost`, named ports, and `127.0.0.1` for setup instructions. See [policies/public-repo.md](policies/public-repo.md) for the long-form rationale.
|
|
126
|
+
- `policies/public-content.json`: for blog posts and social drafts. It keeps the full infrastructure category at block because marketing surfaces have a higher leak risk and should not expose internal addresses or named ports.
|
|
127
|
+
|
|
128
|
+
## Allow Comments
|
|
129
|
+
|
|
130
|
+
Use a local allow comment on the same line or directly above a line:
|
|
131
|
+
|
|
132
|
+
```md
|
|
133
|
+
<!-- content-guard: allow localhost-bare -->
|
|
134
|
+
This tutorial uses localhost as an example.
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
Use `content-guard: allow all` sparingly for examples where every finding is intentional.
|
|
138
|
+
|
|
139
|
+
## PR and Git Guards
|
|
140
|
+
|
|
141
|
+
PR bodies and public repository content are publishing boundaries too. Use stricter policies before copying generated summaries, dogfood notes, local test output, fixtures, or docs into public GitHub surfaces:
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
content-guard scan examples/pr-body.md --policy policies/pr-draft.json
|
|
145
|
+
content-guard diff examples/pr-body.md --policy policies/pr-draft.json
|
|
146
|
+
content-guard-pr examples/pr-body.md
|
|
147
|
+
content-guard-pr-prepare examples/pr-body.md --json
|
|
148
|
+
content-guard-publish-check --pr-body examples/pr-body.md --json
|
|
149
|
+
content-guard-n8n-advisory < payload.json
|
|
150
|
+
content-guard-n8n-validate --json
|
|
151
|
+
content-guard-git --policy policies/public-repo.json
|
|
152
|
+
content-guard-git --all-tracked --policy policies/public-repo.json
|
|
153
|
+
content-guard-commits --range origin/main..HEAD --policy policies/public-repo.json
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
See [docs/PR_DRAFTS.md](docs/PR_DRAFTS.md) and [docs/GIT_PUBLIC_REPO_GUARD.md](docs/GIT_PUBLIC_REPO_GUARD.md).
|
|
157
|
+
|
|
158
|
+
Use `content-guard-publish-check` as the practical local pre-publish wrapper. It prepares a sanitized PR body when `--pr-body` is provided, scans staged files, scans commit messages, and can optionally scan all tracked files:
|
|
159
|
+
|
|
160
|
+
```bash
|
|
161
|
+
content-guard-publish-check --pr-body pr-body.md --json
|
|
162
|
+
content-guard-publish-check --pr-body pr-body.md --all-tracked
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
PR body findings are advisory by default because the wrapper writes a sanitized body and prints `publish_body_file`. Staged file, commit message, and optional all-tracked blockers fail the command unless `--advisory-only` is set.
|
|
166
|
+
|
|
167
|
+
Use `content-guard-pr-prepare` when a later PR publishing step needs a stable sanitized body path:
|
|
168
|
+
|
|
169
|
+
```bash
|
|
170
|
+
content-guard-pr-prepare pr-body.md
|
|
171
|
+
gh pr create --body-file .content-guard/pr-drafts/pr-body.public.md
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
For local run-alongside testing against the legacy scrubber, see [docs/DOGFOOD_TEST_REPO.md](docs/DOGFOOD_TEST_REPO.md).
|
|
175
|
+
|
|
176
|
+
For n8n publish workflows, start with an advisory step that reports findings without mutating live publishes. See [docs/N8N_ADVISORY.md](docs/N8N_ADVISORY.md) and [docs/N8N_WORKFLOW_RECIPE.md](docs/N8N_WORKFLOW_RECIPE.md). Validate cloned workflow wiring with [docs/N8N_VALIDATION_PACK.md](docs/N8N_VALIDATION_PACK.md).
|
|
177
|
+
|
|
178
|
+
## OpenClaw Plugin
|
|
179
|
+
|
|
180
|
+
Content Guard can also run as an OpenClaw outbound message plugin. The plugin lives in `openclaw-plugin/` and shells out to the same Python engine, so OpenClaw messages use the same policy model as publish gates.
|
|
181
|
+
|
|
182
|
+
See [docs/OPENCLAW_PLUGIN.md](docs/OPENCLAW_PLUGIN.md).
|
|
183
|
+
|
|
184
|
+
## Design Notes
|
|
185
|
+
|
|
186
|
+
Privacy Filter influenced the optional model-backed PII layer, especially the idea that some personal data detection benefits from context. Content Guard does not copy Privacy Filter code. OPF integration is a subprocess adapter so the deterministic engine remains portable and maintainable.
|
|
187
|
+
|
|
188
|
+
The deterministic rules are intentionally conservative. Public publishing should fail loudly on infrastructure and secret leakage, while model findings are better treated as review signals until a local policy proves they are reliable enough to block.
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="docs/assets/content-guard-banner.jpg" alt="Content Guard banner">
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<h1 align="center">Content Guard</h1>
|
|
6
|
+
|
|
7
|
+
<p align="center">
|
|
8
|
+
<strong>Policy-driven scanning and redaction for public content, publishing pipelines, and agent output.</strong>
|
|
9
|
+
</p>
|
|
10
|
+
|
|
11
|
+
<p align="center">
|
|
12
|
+
<img src="https://img.shields.io/badge/python-3.11%2B-3776AB?style=for-the-badge&logo=python&logoColor=white" alt="Python 3.11+">
|
|
13
|
+
<img src="https://img.shields.io/badge/license-Apache--2.0-blue?style=for-the-badge" alt="Apache-2.0 license">
|
|
14
|
+
<img src="https://img.shields.io/badge/dependencies-zero_required-2ea44f?style=for-the-badge" alt="Zero required third-party dependencies">
|
|
15
|
+
<img src="https://img.shields.io/badge/OPF-optional-8A2BE2?style=for-the-badge" alt="Optional OPF backend">
|
|
16
|
+
<img src="https://img.shields.io/badge/markdown-aware-083344?style=for-the-badge&logo=markdown&logoColor=white" alt="Markdown aware">
|
|
17
|
+
</p>
|
|
18
|
+
|
|
19
|
+
Content Guard keeps private infrastructure, secrets, and personal context out of public surfaces before they ship. It is built for Markdown docs, PR bodies, social drafts, generated agent output, and automation pipelines where one sloppy paste can leak more than intended.
|
|
20
|
+
|
|
21
|
+
It takes the practical parts of the local content scrubber and the useful model-backed idea behind Privacy Filter, then turns them into one maintainable system.
|
|
22
|
+
|
|
23
|
+
## What It Checks
|
|
24
|
+
|
|
25
|
+
- Deterministic rules for infrastructure, secrets, and high-confidence patterns
|
|
26
|
+
- Optional OPF backend for model-based PII review and redaction
|
|
27
|
+
- Custom policy files for private names, internal projects, unreleased plans, and environment-specific rules
|
|
28
|
+
- Blocking, warning, redaction, and allow decisions from one report format
|
|
29
|
+
- Markdown-aware scanning with frontmatter and allow-comment support
|
|
30
|
+
|
|
31
|
+
The core package has no required third-party dependencies. OPF is optional and runs through its CLI when available.
|
|
32
|
+
|
|
33
|
+
## Quick Start
|
|
34
|
+
|
|
35
|
+
Install from a local clone:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
python -m pip install -e .
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Scan or redact a file:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
content-guard scan examples/sample.md --policy policies/public-content.json
|
|
45
|
+
content-guard redact examples/sample.md --policy policies/public-content.json
|
|
46
|
+
content-guard scan examples/sample.md --json
|
|
47
|
+
content-guard scan examples/ --policy policies/public-content.json
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
Use OPF if it is installed locally:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
content-guard redact examples/sample.md --opf
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
By default, `--opf` looks for `~/.opf-venv/bin/opf`. Override it with:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
CONTENT_GUARD_OPF_BIN=/path/to/opf content-guard scan file.md --opf
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
OPF can also be enabled from a policy file:
|
|
63
|
+
|
|
64
|
+
```json
|
|
65
|
+
{
|
|
66
|
+
"backends": {
|
|
67
|
+
"opf": {
|
|
68
|
+
"enabled": true,
|
|
69
|
+
"action": "warn",
|
|
70
|
+
"device": "cpu"
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Policies
|
|
77
|
+
|
|
78
|
+
Policies are JSON so the project stays dependency-free. A policy can set default actions by category, override individual rules, and add private custom regex rules.
|
|
79
|
+
|
|
80
|
+
```json
|
|
81
|
+
{
|
|
82
|
+
"name": "public-content",
|
|
83
|
+
"defaults": {
|
|
84
|
+
"infrastructure": "block",
|
|
85
|
+
"secret": "block",
|
|
86
|
+
"pii": "warn"
|
|
87
|
+
},
|
|
88
|
+
"rules": {
|
|
89
|
+
"email": "warn"
|
|
90
|
+
},
|
|
91
|
+
"custom_rules": [
|
|
92
|
+
{
|
|
93
|
+
"id": "internal-hostname-example",
|
|
94
|
+
"category": "infrastructure",
|
|
95
|
+
"pattern": "\\binternal-host\\b",
|
|
96
|
+
"replacement": "[redacted-host]"
|
|
97
|
+
}
|
|
98
|
+
]
|
|
99
|
+
}
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
Actions:
|
|
103
|
+
|
|
104
|
+
- `block`: fail the scan, usually for publish gates
|
|
105
|
+
- `redact`: rewrite matching content
|
|
106
|
+
- `warn`: report without failing
|
|
107
|
+
- `allow`: ignore matching findings
|
|
108
|
+
|
|
109
|
+
### Bundled Policies
|
|
110
|
+
|
|
111
|
+
Two bundled policies share the `infrastructure` category but treat it differently on purpose:
|
|
112
|
+
|
|
113
|
+
- `policies/public-repo.json`: for technical docs repos. It keeps `private-ipv4` (RFC 1918), secrets, PII, and `Co-authored-by` trailers as hard blocks, but downgrades `loopback-ipv4` (127.x), `localhost-port`, `localhost-bare`, and `port-reference` to warnings. README and CONTRIBUTING files often need to discuss `localhost`, named ports, and `127.0.0.1` for setup instructions. See [policies/public-repo.md](policies/public-repo.md) for the long-form rationale.
|
|
114
|
+
- `policies/public-content.json`: for blog posts and social drafts. It keeps the full infrastructure category at block because marketing surfaces have a higher leak risk and should not expose internal addresses or named ports.
|
|
115
|
+
|
|
116
|
+
## Allow Comments
|
|
117
|
+
|
|
118
|
+
Use a local allow comment on the same line or directly above a line:
|
|
119
|
+
|
|
120
|
+
```md
|
|
121
|
+
<!-- content-guard: allow localhost-bare -->
|
|
122
|
+
This tutorial uses localhost as an example.
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
Use `content-guard: allow all` sparingly for examples where every finding is intentional.
|
|
126
|
+
|
|
127
|
+
## PR and Git Guards
|
|
128
|
+
|
|
129
|
+
PR bodies and public repository content are publishing boundaries too. Use stricter policies before copying generated summaries, dogfood notes, local test output, fixtures, or docs into public GitHub surfaces:
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
content-guard scan examples/pr-body.md --policy policies/pr-draft.json
|
|
133
|
+
content-guard diff examples/pr-body.md --policy policies/pr-draft.json
|
|
134
|
+
content-guard-pr examples/pr-body.md
|
|
135
|
+
content-guard-pr-prepare examples/pr-body.md --json
|
|
136
|
+
content-guard-publish-check --pr-body examples/pr-body.md --json
|
|
137
|
+
content-guard-n8n-advisory < payload.json
|
|
138
|
+
content-guard-n8n-validate --json
|
|
139
|
+
content-guard-git --policy policies/public-repo.json
|
|
140
|
+
content-guard-git --all-tracked --policy policies/public-repo.json
|
|
141
|
+
content-guard-commits --range origin/main..HEAD --policy policies/public-repo.json
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
See [docs/PR_DRAFTS.md](docs/PR_DRAFTS.md) and [docs/GIT_PUBLIC_REPO_GUARD.md](docs/GIT_PUBLIC_REPO_GUARD.md).
|
|
145
|
+
|
|
146
|
+
Use `content-guard-publish-check` as the practical local pre-publish wrapper. It prepares a sanitized PR body when `--pr-body` is provided, scans staged files, scans commit messages, and can optionally scan all tracked files:
|
|
147
|
+
|
|
148
|
+
```bash
|
|
149
|
+
content-guard-publish-check --pr-body pr-body.md --json
|
|
150
|
+
content-guard-publish-check --pr-body pr-body.md --all-tracked
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
PR body findings are advisory by default because the wrapper writes a sanitized body and prints `publish_body_file`. Staged file, commit message, and optional all-tracked blockers fail the command unless `--advisory-only` is set.
|
|
154
|
+
|
|
155
|
+
Use `content-guard-pr-prepare` when a later PR publishing step needs a stable sanitized body path:
|
|
156
|
+
|
|
157
|
+
```bash
|
|
158
|
+
content-guard-pr-prepare pr-body.md
|
|
159
|
+
gh pr create --body-file .content-guard/pr-drafts/pr-body.public.md
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
For local run-alongside testing against the legacy scrubber, see [docs/DOGFOOD_TEST_REPO.md](docs/DOGFOOD_TEST_REPO.md).
|
|
163
|
+
|
|
164
|
+
For n8n publish workflows, start with an advisory step that reports findings without mutating live publishes. See [docs/N8N_ADVISORY.md](docs/N8N_ADVISORY.md) and [docs/N8N_WORKFLOW_RECIPE.md](docs/N8N_WORKFLOW_RECIPE.md). Validate cloned workflow wiring with [docs/N8N_VALIDATION_PACK.md](docs/N8N_VALIDATION_PACK.md).
|
|
165
|
+
|
|
166
|
+
## OpenClaw Plugin
|
|
167
|
+
|
|
168
|
+
Content Guard can also run as an OpenClaw outbound message plugin. The plugin lives in `openclaw-plugin/` and shells out to the same Python engine, so OpenClaw messages use the same policy model as publish gates.
|
|
169
|
+
|
|
170
|
+
See [docs/OPENCLAW_PLUGIN.md](docs/OPENCLAW_PLUGIN.md).
|
|
171
|
+
|
|
172
|
+
## Design Notes
|
|
173
|
+
|
|
174
|
+
Privacy Filter influenced the optional model-backed PII layer, especially the idea that some personal data detection benefits from context. Content Guard does not copy Privacy Filter code. OPF integration is a subprocess adapter so the deterministic engine remains portable and maintainable.
|
|
175
|
+
|
|
176
|
+
The deterministic rules are intentionally conservative. Public publishing should fail loudly on infrastructure and secret leakage, while model findings are better treated as review signals until a local policy proves they are reliable enough to block.
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=69"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "content-guard"
|
|
7
|
+
version = "0.1.1"
|
|
8
|
+
description = "Policy-driven content scanning and redaction for public publishing and agent output."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
license = { text = "Apache-2.0" }
|
|
12
|
+
authors = [{ name = "Solomon Neas" }]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Programming Language :: Python :: 3",
|
|
15
|
+
"Programming Language :: Python :: 3.11",
|
|
16
|
+
"License :: OSI Approved :: Apache Software License",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
[project.scripts]
|
|
20
|
+
content-guard = "content_guard.cli:main"
|
|
21
|
+
content-guard-git = "content_guard.git_scan:main"
|
|
22
|
+
content-guard-commits = "content_guard.git_commits:main"
|
|
23
|
+
content-guard-publish-check = "content_guard.publish_check:main"
|
|
24
|
+
content-guard-n8n-advisory = "content_guard.n8n_advisory:main"
|
|
25
|
+
content-guard-n8n-validate = "content_guard.n8n_validate:main"
|
|
26
|
+
content-guard-pr = "content_guard.pr_draft:main"
|
|
27
|
+
content-guard-pr-prepare = "content_guard.pr_prepare:main"
|
|
28
|
+
|
|
29
|
+
[tool.setuptools.packages.find]
|
|
30
|
+
where = ["src"]
|
|
31
|
+
|
|
32
|
+
[tool.setuptools.package-data]
|
|
33
|
+
content_guard = ["policies/*.json"]
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import difflib
|
|
5
|
+
import json
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from .engine import scan_text
|
|
10
|
+
from .policy import load_policy
|
|
11
|
+
from .report import to_json, to_payload, to_text
|
|
12
|
+
from .types import ScanOptions
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse the command line and run the selected subcommand.

    Args:
        argv: Argument list to parse; ``None`` lets argparse fall back to
            the process arguments.

    Returns:
        The exit code produced by the ``scan``, ``redact`` or ``diff`` handler.
    """
    cli = build_parser()
    namespace = cli.parse_args(argv)

    handlers = {"scan": _scan, "redact": _redact, "diff": _diff}
    handler = handlers.get(namespace.command)
    if handler is not None:
        return handler(namespace)

    # Defensive fallback for a command name that has no handler registered.
    cli.error(f"unknown command: {namespace.command}")
    return 2
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the ``content-guard`` argument parser.

    The ``scan``, ``redact`` and ``diff`` subcommands share one set of
    arguments; ``scan`` additionally accepts ``--json`` and ``redact``
    additionally accepts ``--in-place``.
    """
    root = argparse.ArgumentParser(
        prog="content-guard",
        description="Policy-driven content scanning and redaction.",
    )
    commands = root.add_subparsers(dest="command", required=True)

    # Boolean switches that every subcommand exposes, in help-output order.
    shared_switches = (
        ("--scan-frontmatter", "scan YAML frontmatter"),
        ("--skip-code-blocks", "ignore fenced code blocks"),
        ("--no-allow-comments", "ignore content-guard allow comments"),
    )

    created = {}
    for name in ("scan", "redact", "diff"):
        command = commands.add_parser(name)
        command.add_argument("target", nargs="?", help="file to read, or stdin when omitted")
        command.add_argument("--policy", help="JSON policy file")
        command.add_argument("--opf", action="store_true", help="run optional OPF backend")
        command.add_argument("--opf-bin", help="path to opf binary")
        command.add_argument("--opf-device", help="OPF device, default comes from policy or cpu")
        for flag, description in shared_switches:
            command.add_argument(flag, action="store_true", help=description)
        created[name] = command

    # Subcommand-specific flags.
    created["scan"].add_argument("--json", action="store_true", help="emit JSON report")
    created["redact"].add_argument("--in-place", action="store_true", help="rewrite the target file")
    return root
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _options(args: argparse.Namespace) -> ScanOptions:
    """Translate parsed CLI flags into a ``ScanOptions`` value.

    The CLI exposes two negative switches (``--skip-code-blocks`` and
    ``--no-allow-comments``); they are inverted here because the option
    fields are phrased positively.
    """
    settings = {
        "scan_frontmatter": args.scan_frontmatter,
        "scan_code_blocks": not args.skip_code_blocks,
        "honor_allow_comments": not args.no_allow_comments,
        "include_opf": args.opf,
        "opf_device": args.opf_device,
        "opf_bin": args.opf_bin,
    }
    return ScanOptions(**settings)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _read_target(target: str | None) -> tuple[str, str | None]:
|
|
65
|
+
if not target or target == "-":
|
|
66
|
+
return sys.stdin.read(), None
|
|
67
|
+
path = Path(target)
|
|
68
|
+
return path.read_text(), str(path)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _scan(args: argparse.Namespace) -> int:
    """Run the ``scan`` subcommand and print a report.

    A directory target is scanned file by file via ``_scan_directory``;
    any other target (a file, or stdin when omitted or ``-``) is scanned as
    one text. With ``--json`` the report is JSON, otherwise plain text.

    Returns:
        1 when any scanned result is blocked, otherwise 0.
    """
    policy = load_policy(args.policy)
    options = _options(args)
    names_a_path = bool(args.target) and args.target != "-"
    directory = Path(args.target) if names_a_path else None

    # Single text: a regular file or standard input.
    if directory is None or not directory.is_dir():
        text, path = _read_target(args.target)
        outcome = scan_text(text, policy=policy, options=options)
        report = to_json(outcome) if args.json else to_text(outcome, path=path or "<stdin>")
        print(report)
        return 1 if outcome.blocked else 0

    # Directory: aggregate the per-file results.
    results = _scan_directory(directory, policy, options)
    blocked = any(outcome.blocked for _, outcome in results)
    flagged = [(file_path, outcome) for file_path, outcome in results if outcome.findings]

    if args.json:
        # Only files that produced findings are listed; the count covers all.
        payload = {
            "ok": not blocked,
            "blocked": blocked,
            "files_scanned": len(results),
            "files": [
                {"path": str(file_path), **to_payload(outcome)}
                for file_path, outcome in flagged
            ],
        }
        print(json.dumps(payload, indent=2, sort_keys=True))
    elif flagged:
        for file_path, outcome in flagged:
            print(to_text(outcome, path=str(file_path)))
    else:
        print(f"Clean. {len(results)} file(s) checked.")
    return 1 if blocked else 0
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _redact(args: argparse.Namespace) -> int:
    """Run the ``redact`` subcommand.

    Directory targets require ``--in-place``; every Markdown file whose
    result is marked changed is rewritten with its redacted text. A single
    target is either rewritten in place or has its redacted text written to
    standard output.

    Returns:
        2 for a usage error (directory without ``--in-place``, or
        ``--in-place`` without a file target), 1 when any result is blocked,
        otherwise 0.
    """
    policy = load_policy(args.policy)
    options = _options(args)
    target_path = Path(args.target) if args.target and args.target != "-" else None

    if target_path and target_path.is_dir():
        if not args.in_place:
            print("directory redact requires --in-place", file=sys.stderr)
            return 2
        results = _scan_directory(target_path, policy, options)
        for path, result in results:
            if result.changed:
                path.write_text(result.redacted_text)
        return 1 if any(result.blocked for _, result in results) else 0

    # Reject the impossible combination before consuming stdin, so the
    # caller gets the usage error immediately instead of after a full read
    # and scan of the input.
    if args.in_place and target_path is None:
        print("--in-place requires a file target", file=sys.stderr)
        return 2

    text, path = _read_target(args.target)
    result = scan_text(text, policy=policy, options=options)

    if args.in_place:
        # Leave the file untouched when redaction produced identical text,
        # matching the directory branch, which only rewrites changed files.
        if result.redacted_text != text:
            Path(path).write_text(result.redacted_text)
    else:
        sys.stdout.write(result.redacted_text)
    return 1 if result.blocked else 0
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _diff(args: argparse.Namespace) -> int:
    """Run the ``diff`` subcommand: print unified diffs of proposed redactions.

    For a directory, a diff is written for each scanned file whose result is
    marked changed. For a single file or stdin, one diff of the input against
    its redacted text is written.

    Returns:
        1 when any scanned result is blocked, otherwise 0.
    """
    policy = load_policy(args.policy)
    options = _options(args)
    names_a_path = bool(args.target) and args.target != "-"
    directory = Path(args.target) if names_a_path else None

    if directory is not None and directory.is_dir():
        results = _scan_directory(directory, policy, options)
        for file_path, outcome in results:
            if outcome.changed:
                _write_diff(outcome.text, outcome.redacted_text, str(file_path))
        any_blocked = any(outcome.blocked for _, outcome in results)
        return 1 if any_blocked else 0

    text, path = _read_target(args.target)
    outcome = scan_text(text, policy=policy, options=options)
    label = path or "<stdin>"
    _write_diff(text, outcome.redacted_text, label)
    return 1 if outcome.blocked else 0
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def _scan_directory(path: Path, policy, options: ScanOptions):
|
|
162
|
+
results = []
|
|
163
|
+
for file_path in sorted(path.rglob("*.md")):
|
|
164
|
+
text = file_path.read_text()
|
|
165
|
+
results.append((file_path, scan_text(text, policy=policy, options=options)))
|
|
166
|
+
return results
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _write_diff(text: str, redacted_text: str, source_name: str) -> None:
|
|
170
|
+
diff = difflib.unified_diff(
|
|
171
|
+
text.splitlines(keepends=True),
|
|
172
|
+
redacted_text.splitlines(keepends=True),
|
|
173
|
+
fromfile=source_name,
|
|
174
|
+
tofile=f"{source_name} (redacted)",
|
|
175
|
+
)
|
|
176
|
+
sys.stdout.writelines(diff)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Detector backends."""
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import subprocess
|
|
5
|
+
import tempfile
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(frozen=True)
class OpfResult:
    """Immutable outcome of one attempt to run the OPF command-line tool."""

    # False when the opf binary was missing or not executable.
    available: bool
    # True when the returned text differs from the text that was submitted.
    changed: bool
    # OPF's output on success; the unmodified input text on any failure.
    redacted_text: str
    # Failure description; empty when the run succeeded.
    error: str = ""
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def default_opf_bin() -> str:
    """Return the path of the opf executable to use by default.

    A non-empty ``CONTENT_GUARD_OPF_BIN`` environment variable wins;
    otherwise the path is ``~/.opf-venv/bin/opf`` under the user's home.
    """
    override = os.environ.get("CONTENT_GUARD_OPF_BIN")
    if override:
        return override
    return str(Path.home().joinpath(".opf-venv", "bin", "opf"))
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def run_opf(text: str, *, opf_bin: str | None = None, device: str = "cpu", timeout: int = 120) -> OpfResult:
    """Run the OPF CLI over ``text`` and return its redacted output.

    The text is written to a temporary ``.txt`` file and OPF is invoked as
    ``<opf> --device <device> -f <file>``; its standard output is taken as
    the redacted text.

    Args:
        text: Content to pass to OPF.
        opf_bin: Path to the opf executable; falls back to
            ``default_opf_bin()`` when not given.
        device: Value passed to OPF's ``--device`` option.
        timeout: Seconds to wait for the subprocess before giving up.

    Returns:
        An ``OpfResult``. ``available`` is False only when the binary is
        missing or not executable. On a launch failure, timeout or non-zero
        exit status, the original text is returned unchanged together with
        an error message.
    """
    opf = opf_bin or default_opf_bin()
    if not os.path.exists(opf) or not os.access(opf, os.X_OK):
        return OpfResult(False, False, text, f"opf binary not found: {opf}")

    # delete=False because the file must outlive the handle so the OPF
    # subprocess can open it by name; removal is done explicitly below.
    handle = tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False)
    path = handle.name
    try:
        # The write sits under the same ``finally`` as the subprocess call
        # so that a failed write or flush cannot leave a copy of the
        # (possibly sensitive) text behind on disk.
        with handle:
            handle.write(text)

        try:
            proc = subprocess.run(
                [opf, "--device", device, "-f", path],
                capture_output=True,
                text=True,
                timeout=timeout,
                check=False,
            )
        except (OSError, subprocess.SubprocessError) as exc:
            return OpfResult(True, False, text, str(exc))
    finally:
        # Best-effort cleanup: a failure to remove the temp file must not
        # mask the scan outcome or the original exception.
        try:
            os.unlink(path)
        except OSError:
            pass

    if proc.returncode != 0:
        error = (proc.stderr or proc.stdout or "opf failed").strip()
        return OpfResult(True, False, text, error)

    redacted = proc.stdout
    return OpfResult(True, redacted != text, redacted, "")
|