@svayam-opensource/prj 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,211 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Privacy check for the publish branch.
4
+
5
+ Verifies that none of the per-org values from main's org-config.yaml have
6
+ leaked into the publish branch. This is the inverse of STRICT_PLACEHOLDERS=1
7
+ (which catches placeholders leaking into main): here we catch real values
8
+ leaking out of main into publish.
9
+
10
+ How it works:
11
+ 1. Read main's org-config.yaml via `git show main:org-config.yaml`.
12
+ 2. Extract values that are org-specific (skip generic defaults like
13
+ "main", "dev", "@*-tbd", placeholder strings).
14
+ 3. Grep the working tree for any of those values.
15
+ 4. Any match is a privacy leak — exit non-zero.
16
+
17
+ Used as a CI gate on PRs to publish (defense in depth alongside the
18
+ test-merge gate, scripts/sync-from-publish.sh's STRICT_PLACEHOLDERS=1
19
+ check, and the discipline of editing publish-side only).
20
+
21
+ Designed to run in the private (source) repo where main is accessible.
22
+ Not intended for the public mirror repo.
23
+
24
+ Usage:
25
+ python3 scripts/validate/check_privacy.py [REPO_ROOT]
26
+
27
+ Exits 0 if no leaks, 1 if leaks found, 2 on error.
28
+ """
29
+
30
+ import re
31
+ import subprocess
32
+ import sys
33
+ from pathlib import Path
34
+
35
+ try:
36
+ import yaml
37
+ except ImportError:
38
+ print("[FAIL] PyYAML not installed. Run: bash scripts/install-deps.sh", file=sys.stderr)
39
+ sys.exit(2)
40
+
41
+
42
# Keys whose values are scanned for leaks.
# main() only searches for a key's value when it is a string that
# is_generic() rejects; missing keys and non-string values are skipped.
PRIVATE_KEYS = [
    "org_name",
    "org_short_name",
    "org_slug",
    "org_slug_lower",
    "github_org",
    "workspace_repo",
    "policy_owner_email",
    "policy_owner_github",
    "legal_owner_github",
    "infra_owner_github",
    "system_arch_owner_github",
    "data_arch_owner_github",
]

# Generic values — never considered org-specific.
# Compared by exact (case-sensitive) equality in is_generic().
GENERIC_VALUES = {
    "", "main", "dev", "master",
    "YYYY-MM-DD",
    "Your Organization Name", "YourOrg",
    "ORG", "org",
    "your-github-org", "000-org-prj",
    "you@example.com", "@your-github-handle",
}

# Patterns that indicate a value is still a placeholder / not org-specific.
# Each pattern is anchored (^...$), so it must describe the whole value.
PLACEHOLDER_VALUE_PATTERNS = [
    re.compile(r"^@[a-z-]*-tbd$"),  # "@...-tbd" handles
    re.compile(r"^\{\{[A-Za-z_]+\}\}$"),  # "{{TOKEN}}" template markers
    re.compile(r"^\d{4}-\d{2}-\d{2}$"),  # date-shaped values (dddd-dd-dd)
]

# File patterns to scan.
# A file is scanned when its suffix is in SCAN_SUFFIXES or its exact name is
# in SCAN_NAMES (see scannable_files()).
SCAN_SUFFIXES = {".md", ".yaml", ".yml", ".sh", ".py"}
SCAN_NAMES = {"CODEOWNERS", "prj"}

# Files where leak values are expected and not actually leaks.
# Matched against the bare file name, so a file with one of these names is
# skipped wherever it sits in the tree.
ALLOWED_FILES = {
    "setup.sh",  # contains placeholder strings as part of substitution rules
    "org-config.yaml",  # the source of values; not itself in publish content
}

# Per-key attribution allowance: these (key, file) combinations are NOT leaks.
# Legitimate copyright/attribution of the framework's original author in the
# standard public-facing files. Any other key in those files, or these keys
# elsewhere, are still flagged.
# org_short_name is included because it is typically a substring of org_name
# (e.g., a short brand name appearing inside the full legal name) and would
# otherwise false-positive on every legitimate copyright line.
# ATTRIBUTION_FILES is also matched by bare file name in find_leaks().
ATTRIBUTION_KEYS = {"org_name", "org_short_name"}
ATTRIBUTION_FILES = {
    "LICENSE", "README.md",
    "CONTRIBUTING.md", "CODE_OF_CONDUCT.md", "SECURITY.md",
}
97
+
98
+
99
def is_generic(value: str) -> bool:
    """Return True when *value* should not be treated as an org-specific secret.

    Non-string values are always generic. A string is generic when it is one
    of the known default values or fully matches a placeholder pattern.
    """
    if not isinstance(value, str):
        return True
    return value in GENERIC_VALUES or any(
        rx.match(value) is not None for rx in PLACEHOLDER_VALUE_PATTERNS
    )
108
+
109
+
110
+ def get_main_config(repo_root: Path) -> dict | None:
111
+ try:
112
+ result = subprocess.run(
113
+ ["git", "-C", str(repo_root), "show", "main:org-config.yaml"],
114
+ capture_output=True, text=True, check=True,
115
+ )
116
+ config = yaml.safe_load(result.stdout)
117
+ if not isinstance(config, dict):
118
+ print("[ERROR] main:org-config.yaml is not a mapping.", file=sys.stderr)
119
+ return None
120
+ return config
121
+ except subprocess.CalledProcessError as e:
122
+ print(
123
+ "[ERROR] Cannot read 'main:org-config.yaml'. "
124
+ "Privacy check requires access to main branch in this repo.\n"
125
+ f" git error: {e.stderr.strip()}",
126
+ file=sys.stderr,
127
+ )
128
+ return None
129
+ except yaml.YAMLError as e:
130
+ print(f"[ERROR] main:org-config.yaml does not parse: {e}", file=sys.stderr)
131
+ return None
132
+
133
+
134
def scannable_files(repo_root: Path):
    """Yield every regular file under *repo_root* that the leak scan covers.

    Skipped: anything with a dot-prefixed path component (relative to
    *repo_root*) and any file whose name is in ALLOWED_FILES. Of the rest,
    only files with a scanned suffix or a scanned exact name are yielded.
    """
    for candidate in repo_root.rglob("*"):
        if not candidate.is_file():
            continue
        parts = candidate.relative_to(repo_root).parts
        hidden = any(part.startswith(".") for part in parts)
        if hidden or candidate.name in ALLOWED_FILES:
            continue
        wanted = candidate.suffix in SCAN_SUFFIXES or candidate.name in SCAN_NAMES
        if wanted:
            yield candidate
145
+
146
+
147
def find_leaks(repo_root: Path, key: str, value: str) -> list[tuple[Path, int, str]]:
    """Find every line in the scannable tree that contains *value*.

    Args:
        repo_root: root of the working tree to scan.
        key: org-config key the value came from; attribution keys are not
            reported in the standard attribution files.
        value: literal substring to look for.

    Returns:
        ``(path relative to repo_root, 1-based line number, stripped line)``
        for each matching line. An empty *value* yields no leaks, because the
        empty string is a substring of every line.
    """
    leaks: list[tuple[Path, int, str]] = []
    if not value:
        return leaks

    attribution_ok = key in ATTRIBUTION_KEYS
    for f in scannable_files(repo_root):
        # Skip attribution-allowed files for keys that are valid attribution
        if attribution_ok and f.name in ATTRIBUTION_FILES:
            continue
        rel = f.relative_to(repo_root)
        try:
            # Decode explicitly and leniently: relying on the locale default
            # (or bailing out on a decode error) would silently drop a file
            # from the privacy gate, hiding any leak it contains.
            text = f.read_text(encoding="utf-8", errors="replace")
        except OSError as e:
            print(f"[WARN] could not read {rel}: {e}", file=sys.stderr)
            continue
        leaks.extend(
            (rel, lineno, line.strip())
            for lineno, line in enumerate(text.splitlines(), 1)
            if value in line
        )
    return leaks
162
+
163
+
164
def main() -> int:
    """Run the privacy check and return the process exit status.

    The repository root is ``sys.argv[1]`` when given, else the current
    directory. Returns 0 when nothing leaked (or main has no org-specific
    values), 1 when at least one leak was found, and 2 when main's
    org-config.yaml could not be loaded.
    """
    repo_root = Path(sys.argv[1] if len(sys.argv) > 1 else ".").resolve()

    config = get_main_config(repo_root)
    if config is None:
        return 2

    # Collect non-generic values to check as (key, configured value, needle).
    to_check: list[tuple[str, str, str]] = []
    for key in PRIVATE_KEYS:
        value = config.get(key)
        if not isinstance(value, str) or is_generic(value):
            continue
        needle = value
        if value.startswith("@"):
            # For @-prefixed handles, search for the bare handle. It is a
            # substring of the @-form, so both "@handle" and e.g.
            # "github.com/handle" are caught by one pass. Fall back to the
            # full value when the bare form is empty or generic, so the check
            # is never weaker than a verbatim search.
            bare = value.lstrip("@")
            if bare and not is_generic(bare):
                needle = bare
        to_check.append((key, value, needle))

    if not to_check:
        print("=== no org-specific values to check (main appears to be unconfigured) ===")
        return 0

    print(f"Checking publish for leaks of {len(to_check)} value(s) from main:org-config.yaml...")
    print()

    total_leaks = 0
    for key, value, needle in to_check:
        leaks = find_leaks(repo_root, key, needle)
        if not leaks:
            continue
        total_leaks += len(leaks)
        searched = "" if needle == value else f" (searched as {needle!r})"
        print(f"[LEAK] {key}={value!r} appears in {len(leaks)} location(s){searched}:")
        # Cap the listing at 10 locations per value to keep CI logs readable.
        for relpath, lineno, line in leaks[:10]:
            snippet = line if len(line) <= 100 else line[:97] + "..."
            print(f" {relpath}:{lineno}: {snippet}")
        if len(leaks) > 10:
            print(f" ... and {len(leaks) - 10} more")
        print()

    if total_leaks:
        print(f"=== {total_leaks} privacy leak(s) found ===")
        print()
        print("These values from main:org-config.yaml have leaked into publish content.")
        print("Remove them or replace with placeholder/example values.")
        return 1

    print("=== no privacy leaks ===")
    return 0
208
+
209
+
210
+ if __name__ == "__main__":
211
+ sys.exit(main())
@@ -0,0 +1,117 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Session-start protocol integrity check (#54 Increment 1 — Layer 3-A).
4
+
5
+ The session-start protocol (agent/session-protocol.md) is only enforceable if it
6
+ is reliably DELIVERED. This validator gates that precondition, author-agnostic
7
+ and deterministic:
8
+
9
+ 1. agent/session-protocol.md exists and is non-empty.
10
+ 2. It still carries the §0 "agent speaks first / context manifest / before you
11
+ change any code" mandate — i.e. nobody gutted the protocol while leaving the
12
+ file in place.
13
+ 3. Every generated tool copy is in sync with the canonical protocol
14
+ (scripts/render-harness.sh --check) — i.e. no generated copy was hand-edited
15
+ to weaken or drop the protocol, and the protocol is rendered to every active
16
+ install path (a missing path counts as drift).
17
+
18
+ Invoked via scripts/validate/run.py (the test-merge `validate` gate) and runnable
19
+ directly: python3 scripts/validate/check_protocol.py [REPO_ROOT].
20
+ """
21
+
22
+ import subprocess
23
+ import sys
24
+ from pathlib import Path
25
+
26
# Stable anchors from session-protocol.md §0. If any disappears, the protocol's
# core mandate was removed even if the file still exists.
# check_protocol() looks each anchor up as a plain substring of the
# lower-cased file text, so entries here must themselves be lowercase.
MANDATE_ANCHORS = (
    "agent speaks first",
    "context manifest",
    "before you change any code",
)
33
+
34
+
35
def _has_content(path: Path) -> bool:
    """Return True when *path* is a readable regular file with non-blank text."""
    if not path.is_file():
        return False
    try:
        return bool(path.read_text(encoding="utf-8", errors="replace").strip())
    except OSError:
        return False


def _gate_errors(
    repo_root: Path, config_rel: str, marker: str, required: tuple, label: str
) -> list[str]:
    """Report required gate files that are missing/empty once a gate is configured.

    The gate counts as configured when *config_rel* is a file whose text
    contains *marker*; otherwise nothing is required and no error is returned.
    """
    config = repo_root / config_rel
    if not config.is_file():
        return []
    try:
        configured = marker in config.read_text(encoding="utf-8", errors="replace")
    except OSError as e:
        return [f"could not read {config_rel}: {e}"]
    if not configured:
        return []
    return [
        f"{label} is configured but {rel} is missing/empty"
        for rel in required
        if not _has_content(repo_root / rel)
    ]


def check_protocol(repo_root: Path) -> list[str]:
    """Validate that the session-start protocol is present and delivered.

    Checks, in order:
      1. agent/session-protocol.md exists (a missing file returns at once).
      2. It is non-empty and still contains every MANDATE_ANCHORS phrase
         (case-insensitive).
      3. When both scripts/render-harness.sh and agent/harness-manifest.yaml
         exist, ``render-harness.sh --check`` exits 0.
      4. When the Claude Code or Cursor gate is configured, every file that
         gate needs exists and is non-empty.

    Returns a list of human-readable error strings; empty means pass.
    Paths that exist but are not readable regular files (e.g. a directory
    where a script should be) are reported as errors instead of raising.
    """
    errors: list[str] = []

    protocol = repo_root / "agent" / "session-protocol.md"
    if not protocol.is_file():
        return ["agent/session-protocol.md is missing — the session-start protocol is undelivered"]
    try:
        text = protocol.read_text(encoding="utf-8", errors="replace")
    except OSError as e:
        return [f"agent/session-protocol.md could not be read: {e}"]

    if not text.strip():
        errors.append("agent/session-protocol.md is empty")
    else:
        low = text.lower()
        missing = [a for a in MANDATE_ANCHORS if a not in low]
        if missing:
            errors.append(
                "agent/session-protocol.md no longer contains its §0 mandate "
                f"(missing: {', '.join(missing)})"
            )

    # Generated copies must match the canonical protocol. Reuse render-harness's
    # own --check (it also treats a missing install path as drift).
    renderer = repo_root / "scripts" / "render-harness.sh"
    manifest = repo_root / "agent" / "harness-manifest.yaml"
    if renderer.exists() and manifest.exists():
        try:
            r = subprocess.run(
                ["bash", str(renderer), "--check"],
                cwd=str(repo_root), capture_output=True, text=True,
            )
        except OSError as e:
            errors.append(f"could not run render-harness.sh --check: {e}")
        else:
            if r.returncode != 0:
                # Flatten to one line and cap the length so the error list
                # stays readable in CI output.
                detail = (r.stdout + r.stderr).strip().replace("\n", " ")
                errors.append(
                    "generated protocol copies are out of sync with "
                    f"agent/session-protocol.md — run ./scripts/render-harness.sh ({detail[:300]})"
                )

    # #54 Increment 2 — if the Claude Code client gate is configured, its parts
    # must all be present (guard against a hook being deleted while settings.json
    # still points at it, which would break the gate). Conditional on
    # settings.json existing, so repos that haven't adopted the gate aren't forced
    # to.
    errors.extend(
        _gate_errors(
            repo_root,
            ".claude/settings.json",
            "session-start",
            (
                ".claude/hooks/session-start.sh",
                ".claude/hooks/pre-tool-gate.sh",
                ".claude/hooks/session-ack.sh",
                ".claude/commands/session-start.md",
            ),
            "session-start gate",
        )
    )

    # Same guard for the Cursor client gate (.cursor/hooks.json).
    errors.extend(
        _gate_errors(
            repo_root,
            ".cursor/hooks.json",
            "session-gate",
            (
                ".cursor/hooks/session-start.sh",
                ".cursor/hooks/session-gate.sh",
                ".cursor/hooks/session-ack.sh",
            ),
            "Cursor session-start gate",
        )
    )

    return errors
102
+
103
+
104
def main() -> int:
    """Run the protocol integrity check from the command line.

    Uses ``sys.argv[1]`` as the repository root when given, else the current
    directory. Prints the outcome and returns 0 on success, 1 on any error.
    """
    root_arg = sys.argv[1] if len(sys.argv) > 1 else "."
    problems = check_protocol(Path(root_arg).resolve())
    if not problems:
        print("=== protocol integrity ok ===")
        return 0

    print(f"[FAIL] protocol ({len(problems)} error(s)):")
    for problem in problems:
        print(f" - {problem}")
    return 1
114
+
115
+
116
+ if __name__ == "__main__":
117
+ sys.exit(main())
@@ -0,0 +1,175 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Secret / credential scanner (POL-143 enforcement).
4
+
5
+ PRJ-013 audit finding H8 (dimension H/I): POL-143 forbids committing secrets,
6
+ tokens, keys, and credentials, but the framework had ZERO automated
7
+ enforcement — privacy-check.yml/check_privacy.py only catch org-config values
8
+ leaking to publish, not credentials. This scanner closes that gap as a CI gate
9
+ (it runs inside scripts/validate/run.py, which the test-merge `validate` job
10
+ invokes on every PR to main/publish).
11
+
12
+ Design goals:
13
+ - HIGH SIGNAL, LOW false-positive. Patterns match only well-known,
14
+ structurally-distinctive credential shapes, not anything that merely looks
15
+ "password-ish". This keeps the gate trustworthy so it is never disabled.
16
+ - stdlib only (runs under any python3, no pip deps).
17
+ - tracked text files only (skips .git, skips binaries via a NUL-byte sniff).
18
+ - an inline `# pragma: allowlist secret` on the matching line suppresses the
19
+ finding (for fixtures, docs that must show an example token shape, etc.).
20
+
21
+ Usage:
22
+ python3 scripts/validate/check_secrets.py [REPO_ROOT]
23
+
24
+ Exits 0 if no secrets found, 1 if any high-confidence secret is found.
25
+ Invokable directly or via run.py's `check_secrets(repo_root)` entry point.
26
+ """
27
+
28
+ import re
29
+ import subprocess
30
+ import sys
31
+ from pathlib import Path
32
+
33
# Inline suppression marker (detect-secrets compatible style).
# scan_file() skips any line containing this exact substring.
ALLOWLIST_MARKER = "pragma: allowlist secret"

# Directories never scanned (in addition to anything git doesn't track).
# Matched against each component of a file's path relative to the repo root.
SKIP_DIR_PARTS = {".git"}
38
+
39
+ # A placeholder value is NOT a secret. Used by the generic assignment patterns
40
+ # (password=/api_key=) to avoid flagging example/empty/template values.
41
+ PLACEHOLDER_RE = re.compile(
42
+ r"""^(?:
43
+ | \s* # empty / whitespace
44
+ | x+ | \.\.\. | -+ | \*+ # x..., ---, ***
45
+ | changeme | example | placeholder
46
+ | your[-_].* | my[-_].* | some[-_].*
47
+ | redacted | dummy | sample | test(?:ing)? | fake
48
+ | none | null | nil | true | false
49
+ | \$\{[^}]+\} | \{\{[^}]+\}\} # ${VAR} / {{TOKEN}}
50
+ | \$[A-Za-z_][A-Za-z0-9_]* # $VAR
51
+ | <[^>]+> # <your-token>
52
+ )$""",
53
+ re.IGNORECASE | re.VERBOSE,
54
+ )
55
+
56
# ── High-signal patterns ──────────────────────────────────────────────────────
# Each entry: (label, compiled-regex). A regex matching anywhere on a line
# (that is not an allowlisted line) is a finding. These shapes are
# distinctive enough that a match is almost certainly a real credential.
PATTERNS = [
    (
        # PEM header, with any number of uppercase type words before
        # "PRIVATE KEY" (the bare form also matches).
        "private key block",
        re.compile(r"-----BEGIN (?:[A-Z0-9]+ )*PRIVATE KEY-----"),
    ),
    (
        # One of the ghp/gho/ghu/ghs/ghr prefixes, "_", then 36+ alphanumerics.
        "GitHub token",
        re.compile(r"\b(?:ghp|gho|ghu|ghs|ghr)_[A-Za-z0-9]{36,}\b"),
    ),
    (
        # "github_pat_" followed by 60+ alphanumerics/underscores.
        "GitHub fine-grained PAT",
        re.compile(r"\bgithub_pat_[A-Za-z0-9_]{60,}\b"),
    ),
    (
        # A known four-letter prefix followed by exactly 16 uppercase
        # alphanumerics.
        "AWS access key id",
        re.compile(r"\b(?:AKIA|ASIA|AGPA|AIDA|AROA|ANPA|ANVA)[0-9A-Z]{16}\b"),
    ),
    (
        # "xox" + one of b/a/p/r/s, "-", then 10+ alphanumerics or dashes.
        "Slack token",
        re.compile(r"\bxox[baprs]-[A-Za-z0-9-]{10,}\b"),
    ),
]
82
+
83
+ # Generic assignment patterns: `password = "..."` / `api_key: '...'` etc.
84
+ # These are the only patterns that gate on the VALUE not being a placeholder,
85
+ # since the keyword alone (password/api_key) is common in legitimate code/docs.
86
+ ASSIGNMENT_RE = re.compile(
87
+ r"""(?P<key>\b(?:password|passwd|pwd|api[_-]?key|secret[_-]?key
88
+ |access[_-]?token|auth[_-]?token|client[_-]?secret)\b)
89
+ \s*[:=]\s*
90
+ (?P<q>["'])(?P<val>[^"']{6,})(?P=q)""",
91
+ re.IGNORECASE | re.VERBOSE,
92
+ )
93
+
94
+
95
def is_placeholder(value: str) -> bool:
    """Return True when *value*, ignoring surrounding whitespace, is a placeholder."""
    candidate = value.strip()
    return PLACEHOLDER_RE.match(candidate) is not None
97
+
98
+
99
def tracked_files(repo_root: Path) -> list[Path]:
    """Return git-tracked files; fall back to a filesystem walk if not a repo.

    The fallback is used when ``git ls-files`` exits non-zero or git cannot
    be launched at all. It returns every regular file under *repo_root* that
    has no path component in SKIP_DIR_PARTS.
    """
    import os  # local import: only this function needs it

    try:
        result = subprocess.run(
            ["git", "-C", str(repo_root), "ls-files", "-z"],
            capture_output=True, check=True,
        )
    except (subprocess.CalledProcessError, OSError):
        return [
            f
            for f in repo_root.rglob("*")
            if f.is_file()
            and not any(p in SKIP_DIR_PARTS for p in f.relative_to(repo_root).parts)
        ]

    # git emits raw path bytes. os.fsdecode round-trips names that are not
    # valid UTF-8, whereas a lossy "replace" decode would produce a path that
    # does not exist, silently dropping that file from the scan.
    return [repo_root / os.fsdecode(raw) for raw in result.stdout.split(b"\0") if raw]
114
+
115
+
116
def is_binary(path: Path) -> bool:
    """Report whether *path* looks binary.

    A file is treated as binary when a NUL byte appears in its first 8192
    bytes. A file that cannot be opened or read is also reported as binary.
    """
    try:
        with open(path, "rb") as handle:
            head = handle.read(8192)
    except OSError:
        return True
    return b"\0" in head
122
+
123
+
124
def scan_file(repo_root: Path, path: Path) -> list[tuple[str, int, str, str]]:
    """Return (relpath, lineno, label, snippet) findings for one file.

    Lines containing ALLOWLIST_MARKER are skipped. Every other line is tested
    against each entry in PATTERNS, then against ASSIGNMENT_RE, whose match
    only counts when the quoted value is not a placeholder. Snippets are the
    stripped line truncated to 120 characters.

    A file that cannot be read yields no findings. Bytes that are not valid
    UTF-8 are replaced rather than aborting the scan.
    """
    findings: list[tuple[str, int, str, str]] = []
    rel = str(path.relative_to(repo_root))
    try:
        # Credential shapes are plain ASCII, so decode leniently: one stray
        # non-UTF-8 byte must not exempt the whole file from scanning.
        text = path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return findings

    for lineno, line in enumerate(text.splitlines(), 1):
        if ALLOWLIST_MARKER in line:
            continue
        snippet = line.strip()[:120]
        findings.extend(
            (rel, lineno, label, snippet)
            for label, pat in PATTERNS
            if pat.search(line)
        )
        m = ASSIGNMENT_RE.search(line)
        if m and not is_placeholder(m.group("val")):
            findings.append((rel, lineno, "hardcoded credential", snippet))
    return findings
143
+
144
+
145
def check_secrets(repo_root: Path) -> list[str]:
    """run.py entry point: return a list of error strings (empty == pass).

    Each error has the form ``relpath:lineno: label: snippet``. Paths under a
    skipped directory, paths that are not regular files, and files that look
    binary are ignored.
    """
    errors: list[str] = []
    for candidate in tracked_files(repo_root):
        rel_parts = candidate.relative_to(repo_root).parts
        if not SKIP_DIR_PARTS.isdisjoint(rel_parts):
            continue
        if candidate.is_file() and not is_binary(candidate):
            errors.extend(
                f"{rel}:{lineno}: {label}: {snippet}"
                for rel, lineno, label, snippet in scan_file(repo_root, candidate)
            )
    return errors
156
+
157
+
158
def main() -> int:
    """Run the secret scan from the command line.

    Uses ``sys.argv[1]`` as the repository root when given, else the current
    directory. Prints findings plus remediation guidance and returns 1 when
    anything was found; otherwise prints a success line and returns 0.
    """
    root_arg = sys.argv[1] if len(sys.argv) > 1 else "."
    findings = check_secrets(Path(root_arg).resolve())
    if not findings:
        print("=== no secrets found ===")
        return 0

    print(f"[FAIL] secrets ({len(findings)} finding(s)):")
    for finding in findings:
        print(f" - {finding}")
    print()
    for advice in (
        "POL-143: remove the credential and rotate it. If this is a known",
        "non-secret (fixture/example), append '# pragma: allowlist secret'",
        "to the line.",
    ):
        print(advice)
    return 1
172
+
173
+
174
+ if __name__ == "__main__":
175
+ sys.exit(main())