cursordoctrine 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/INSTALL.md +113 -0
  2. package/LICENSE +21 -0
  3. package/README.md +86 -0
  4. package/bin/cli.mjs +413 -0
  5. package/linux/USER-RULES.md +12 -0
  6. package/linux/doctrine.md +172 -0
  7. package/linux/hooks/anti-slop-audit.sh +163 -0
  8. package/linux/hooks/anti-slop.md +56 -0
  9. package/linux/hooks/final-review.md +52 -0
  10. package/linux/hooks/final-review.sh +99 -0
  11. package/linux/hooks/hook-common.sh +120 -0
  12. package/linux/hooks/minimal-edit-audit.sh +112 -0
  13. package/linux/hooks/permission-gate.sh +75 -0
  14. package/linux/hooks/post-tool-use.sh +53 -0
  15. package/linux/hooks/self-review-trigger.sh +56 -0
  16. package/linux/hooks/self-review.md +48 -0
  17. package/linux/hooks/subagent-stop-review.sh +93 -0
  18. package/linux/hooks.json +64 -0
  19. package/linux/inject-doctrine.sh +31 -0
  20. package/package.json +40 -0
  21. package/skills/anti-slop/SKILL.md +267 -0
  22. package/skills/anti-slop/scripts/scan_slop.py +986 -0
  23. package/windows/USER-RULES.md +12 -0
  24. package/windows/doctrine.md +172 -0
  25. package/windows/hooks/anti-slop-audit.ps1 +182 -0
  26. package/windows/hooks/anti-slop.md +56 -0
  27. package/windows/hooks/final-review.md +52 -0
  28. package/windows/hooks/final-review.ps1 +105 -0
  29. package/windows/hooks/hook-common.ps1 +84 -0
  30. package/windows/hooks/minimal-edit-audit.ps1 +116 -0
  31. package/windows/hooks/permission-gate.ps1 +98 -0
  32. package/windows/hooks/post-tool-use.ps1 +46 -0
  33. package/windows/hooks/self-review-trigger.ps1 +83 -0
  34. package/windows/hooks/self-review.md +48 -0
  35. package/windows/hooks/subagent-stop-review.ps1 +89 -0
  36. package/windows/hooks.json +64 -0
  37. package/windows/inject-doctrine.ps1 +58 -0
@@ -0,0 +1,986 @@
1
+ #!/usr/bin/env python3
2
+ """scan_slop.py - static AI-slop signal scanner for the anti-slop skill.
3
+
4
+ Reports cheap, high-precision slop signals so the agent starts its de-slop sweep
5
+ from a deterministic inventory. Semantic slop (cargo-cult, superficial tests) is
6
+ deliberately NOT guessed - that stays the model's job. This seeds it.
7
+
8
+ Scopes:
9
+ * DIFF (default): only what changed vs --base (git diff). Silent on a clean
10
+ tree by design. Flags NEW deps, premature abstractions, redundant comments,
11
+ AI residue (placeholder phrases / banner comments / emoji), type escapes
12
+ (as any, as unknown as, @ts-ignore, Python type-ignore pragmas), swallowed
13
+ errors (empty catch, broad except+pass), tautological asserts, pointless
14
+ async wrappers (await Promise.resolve, async executors), deepening guard
15
+ chains (the optional-chaining shape), boolean-pair call traps, SELECT *
16
+ in .sql files, and Tailwind class soup / magic-px values. All per-file
17
+ signals also run in AUDIT scope; only new-dependency detection is diff-only
18
+ (every line of an existing manifest would otherwise read as "new").
19
+ * AUDIT (--all, or explicit paths): the WHOLE codebase, with the duplication
20
+ analysis that catches the isRecord()-class slop:
21
+ - Clone Proliferation : same function name in multiple files
22
+ - Knowledge Duplication : identical body under different names (DRY)
23
+ - Semantic Fragmentation : near-identical bodies (same shape, drifted
24
+ names/values) - the diverged clones
25
+ - Semantic Density Collapse: tiny helpers used 0-1 times (dead / inline)
26
+ (--all only: reference counts over a partial file list are meaningless,
27
+ so explicit-path scans suppress this analysis instead of guessing)
28
+ - Generated Fingerprints : isRecord/safeParse/sleep/retry/... repeated
29
+ - Duplicate type/interface names across files
30
+ Functions/methods are parsed for JS/TS, Python, Go, Rust, Ruby, PHP, and
31
+ (best-effort) Java/Kotlin/C#/Scala.
32
+
33
+ Every duplication finding carries a confidence tier: exact (identical
34
+ normalized body) > structural (same shape, drifted names/values) > name-only
35
+ (same identifier). The report header states the scope; caps that were hit
36
+ (file list, per-file lines, per-body lines) are disclosed as notes.
37
+
38
+ Stdlib only; Python 3.9+. REPORTS only - never edits. Exits 0, except with
39
+ --gate: exit 1 when slop is found (the size-only "substantial change" note
40
+ never gates). The agent does the fixing.
41
+
42
+ Usage:
43
+ python scripts/scan_slop.py --all --root . # WHOLE-codebase audit (recommended)
44
+ python scripts/scan_slop.py --root . # diff vs HEAD (a change in progress)
45
+ python scripts/scan_slop.py src/foo.ts src/bar.py # audit specific files
46
+ python scripts/scan_slop.py --all --format json # machine-readable
47
+ """
48
+ from __future__ import annotations
49
+
50
+ import argparse
51
+ import hashlib
52
+ import json
53
+ import os
54
+ import re
55
+ import subprocess
56
+ import sys
57
+ from collections import Counter, defaultdict
58
+ from itertools import islice
59
+ from typing import Any
60
+
61
# Alias for the loosely typed dict records this module builds and passes around.
Finding = dict[str, Any]

# ---- per-file signal patterns -------------------------------------------
# Only manifest formats the DEP regex genuinely parses (name [:=@] version).
# go.mod / pom.xml / Gradle / Gemfile / .csproj declare deps in syntaxes DEP
# cannot match, so listing them would only over-claim coverage.
MANIFEST = re.compile(
    r"^(package\.json|requirements[\w.\-]*\.txt|pyproject\.toml|Pipfile"
    r"|Cargo\.toml|composer\.json)$"
)
# An optionally quoted name (at line start or right after `{` / `,`) followed
# by something version-shaped: `:`/`=` then a digit (with an optional range
# prefix), a comparison operator then a digit, or `@` then a digit.
DEP = re.compile(
    r"""(?:^|[{,])\s*(?:"|')?(?P<name>[A-Za-z@][\w@\-./\[\]]*)(?:"|')?\s*"""
    r"""(?:[:=]\s*(?:"|')?[\^~><=*v]?\d|[><=~!]=\s*\d|@\s*\^?\d)"""
)
# Manifest metadata that pairs a name with a version-looking value without
# declaring a dependency. Skipped only at line start: `serde = { version = ...`
# matches mid-line and IS a real dependency.
META_KEYS = frozenset({
    "version", "name", "node", "npm", "yarn", "pnpm", "packagemanager",
    "engines", "python", "private", "description",
})
# A capitalized type declaration whose name ends in a design-pattern suffix;
# group 1 is the full type name.
ABSTRACTION = re.compile(
    r"\b(?:class|interface|struct|trait|protocol)\s+"
    r"([A-Z][A-Za-z0-9_]*(?:Factory|Repository|Mediator|Strategy|Singleton"
    r"|Facade|Builder|Visitor|Decorator|Wrapper|Orchestrator|Registry))\b"
)
# [C]QRS: the brackets change nothing semantically but keep the alternative
# from matching its own definition when the scanner audits itself.
VOCAB = re.compile(
    r"\b([C]QRS|Event[\s\-]?Sourc(?:e|ing)|Domain[\s\-]?Driven|Aggregate\s?Root"
    r"|Bounded\s?Context|Hexagonal\s+Architecture|Onion\s+Architecture)\b",
    re.I,
)
# A comment line (//, #, /* or * leader) that opens with a word narrating the
# code beside it (increment, loop over, set x to, getter, ...).
COMMENT = re.compile(
    r"^\s*(?://|#|/\*+|\*+)\s*(?:increment|decrement|loop (?:over|through)|iterate"
    r"|returns?(?: the)?(?: result| value)?\s*$|set\s+\w+\s+to\b|getter\b"
    r"|setter\b|constructor\b|initiali[sz]e\b|instantiate\b|create (?:a |an |the )"
    r"|declare\b|define\b|assign\b|end (?:of|for)\b|begin\b|start (?:of|the))",
    re.I,
)
# The leading comment marker plus surrounding whitespace, for peeling a
# comment line down to its text.
_CMT_MARKER = re.compile(r"^\s*(?://+|#+|/\*+|\*+)\s*")
102
# ---- residue / type-escape / swallow / tautology signals -----------------
# Failure classes these seed: AI-Specific (prompt residue), Type-System
# (any-driven development), Defensive Code Inflation (swallowed errors) and
# Testing (test theater). \s+ between words instead of literal spaces doubles
# as robustness to spacing AND keeps each pattern from matching its own
# source when the scanner audits itself.
RESIDUE_PHRASE = re.compile(
    r"\bin\s+a\s+real\s+(?:app(?:lication)?|world|scenario)\b"
    r"|\bfor\s+production\s+use\b|\bthis\s+is\s+a\s+simplified\b"
    r"|\bTODO:?\s+implement\s+actual\b|\breplace\s+(?:this\s+)?with\s+your\b",
    re.I,
)
# = * # walls only: `# ----` section dividers are a long-standing human
# convention (numpy, stdlib); `// =====` banners are the generated-code tell.
RESIDUE_BANNER = re.compile(r"^\s*(?://|#|/\*)\s*[=*#]{5,}")
# Any code point in U+2600-U+27BF or U+1F300-U+1FAFF.
RESIDUE_EMOJI = re.compile("[\u2600-\u27bf\U0001f300-\U0001faff]")
# NOT @ts-expect-error: that one is the sanctioned, self-expiring form.
TS_SUPPRESS = re.compile(r"@ts-(?:ignore|nocheck)\b")
# `as unknown as`, `as any`, `any` right after `,` `:` or `<`, or `any[]`.
TS_ANY = re.compile(r"\bas\s+unknown\s+as\b|\bas\s+any\b|[,:<]\s*any\b|\bany\[\]")
PY_TYPE_ESCAPE = re.compile(r"#\s*type:\s*ignore\b")
# Only bare/broad swallows: `except ImportError: pass` is a legitimate idiom
# (optional dependency); `except Exception: pass` hides every failure.
_EXC_BROAD = r"^\s*except\b\s*(?:\(?\s*(?:Base)?Exception\s*\)?\s*(?:as\s+\w+\s*)?)?:"
# Same-line form: the broad except header immediately followed by `pass`.
PY_SWALLOW = re.compile(_EXC_BROAD + r"\s*pass\b")
# Two-line form: the header alone on its line (optional trailing comment) ...
PY_SWALLOW_HEAD = re.compile(_EXC_BROAD + r"\s*(?:#.*)?$")
# ... paired with a line holding nothing but `pass` (optional trailing comment).
PY_PASS = re.compile(r"^\s*pass\s*(?:#.*)?$")
# Empty `catch {}` blocks and `.catch(...)` handlers whose body is empty or
# is just null/undefined.
JS_SWALLOW = re.compile(
    r"\bcatch\s*(?:\(\s*[^)]*\))?\s*\{\s*\}"
    r"|\.catch\(\s*(?:\(\s*\w*\s*\)|\w+)\s*=>\s*(?:\{\s*\}|null|undefined)\s*\)"
    r"|\.catch\(\s*function\s*\(\s*\w*\s*\)\s*\{\s*\}\s*\)"
)
# expect(<literal>).toBe/toEqual/toStrictEqual(<same literal>) via the \1
# backreference, plus assert(true) / assertTrue(true).
JS_TAUTOLOGY = re.compile(
    r"\bexpect\(\s*(true|false|\d+|'[^']*'|\"[^\"]*\")\s*\)\s*\.\s*"
    r"(?:toBe|toEqual|toStrictEqual)\(\s*\1\s*\)"
    r"|\bassert(?:True)?\(\s*true\s*\)"
)
# `assert True` ending the statement (or followed by `,` / `#`), or
# assertTrue(True).
PY_TAUTOLOGY = re.compile(r"^\s*assert\s+True\s*(?:$|[,#])|\bassertTrue\(\s*True\s*\)")
139
# ---- framework failure modes (vibe-coding stack) --------------------------
# await Promise.resolve() is always pointless; an async promise executor
# swallows its own rejections.
ASYNC_WRAPPER = re.compile(
    r"\bawait\s+Promise\s*\.\s*resolve\s*\(|\bnew\s+Promise\s*\(\s*async\b"
)
# A lone negated guard-return; chains where each test deepens the previous
# (`!data` then `!data.user`) are the optional-chaining shape.
# Group 1 is the dotted path being tested.
GUARD_RETURN = re.compile(
    r"^\s*if\s*\(\s*!\s*([\w$][\w$.]*)\s*\)\s*(?:return|continue|break)\b[^;{}]*;?\s*$"
)
# Two adjacent literal booleans inside a call's argument list (Boolean Trap).
# Bracket/brace exclusion keeps array literals like useState([true, false]) out.
BOOL_PAIR_JS = re.compile(
    r"[\w$]\s*\(\s*[^()\[\]{}]*\b(?:true|false)\s*,\s*(?:true|false)\b"
)
# Python spelling of the same trap (capitalized True/False).
BOOL_PAIR_PY = re.compile(
    r"\w\s*\(\s*[^()\[\]{}]*\b(?:True|False)\s*,\s*(?:True|False)\b"
)
SELECT_STAR = re.compile(r"\bSELECT\s+\*", re.I)
# A class/className attribute whose quoted value runs to 200+ characters.
TAILWIND_SOUP = re.compile(r"class(?:Name)?\s*=\s*[{]?\s*['\"`][^'\"`]{200,}")
# Arbitrary >=100px values defeat the spacing scale (w-[347px] magic numbers).
TAILWIND_MAGIC_PX = re.compile(r"-\[\d{3,}(?:\.\d+)?px\]")
# File extensions treated as source code.
SOURCE = re.compile(
    r"\.(?:ts|tsx|js|jsx|mjs|cjs|py|go|rs|java|kt|kts|cs|cpp|cc|cxx|c|h|hpp|rb"
    r"|php|swift|scala|m|mm|sh|ps1|lua|dart|ex|exs|vue|svelte|astro|html|sql)$"
)
# Non-blank line count at which a diff-scope source file is marked "substantial".
CHECKLIST_LINES = 40
READ_CAP = 4000  # lines read per file
BODY_CAP = 80  # lines captured per function body
FILE_CAP = 6000  # tracked files scanned in --all
SHOW_CAP = 10  # findings printed per list (counts stay exact)
171
+
172
# ---- duplication / clone machinery --------------------------------------
# One identifier token (letters, digits, `_`, `$`; no leading digit).
ID = re.compile(r"[A-Za-z_$][\w$]*")
# `type Foo` / `interface Foo` with a capitalized name; group 1 is the name.
TYPE_DECL = re.compile(r"\b(?:type|interface)\s+([A-Z][A-Za-z0-9_]*)\b")
# camelCase helper prefixes (isFoo, hasFoo, toFoo, parseFoo, ...): the prefix
# must be followed directly by an uppercase letter or a digit.
MICRO_PREFIX = re.compile(
    r"^(?:is|has|can|should|assert|ensure|safe|to|from|get|set|make|create"
    r"|parse|format|with|map|build|validate)[A-Z0-9]"
)
# Lower-cased function names excluded from same-name clone reporting
# (entry points, framework hooks, lifecycle methods, generic verbs).
COMMON_NAMES = {
    "render", "main", "default", "index", "setup", "run", "start", "stop",
    "init", "handler", "handle", "callback", "loader", "action", "middleware",
    "reducer", "app", "page", "layout", "constructor", "tostring", "tojson",
    "valueof", "equals", "dispose", "close", "open", "connect", "get", "set",
    "update", "create", "delete", "list", "find", "save", "load", "execute",
    "process", "build", "test", "describe", "it", "expect", "beforeeach",
    "aftereach", "beforeall", "afterall", "getstaticprops", "getserversideprops",
    "getstaticpaths", "generatemetadata", "usestate", "useeffect", "wrapper",
    "new", "string", "tostr", "clone", "copy", "value", "data", "result",
    "componentdidmount", "componentwillunmount", "componentdidupdate",
    "componentdidcatch", "getderivedstatefromerror", "getderivedstatefromprops",
    "shouldcomponentupdate", "getsnapshotbeforeupdate", "ngoninit", "ngondestroy",
    "connectedcallback", "disconnectedcallback",
}
# Common per-module type names - excluded from type-duplication reporting.
COMMON_TYPES = frozenset({
    "props", "state", "options", "option", "params", "parameters", "result",
    "config", "configuration", "context", "ctx", "data", "item", "items",
    "response", "request", "error", "react", "window", "ref", "children",
    "theme", "style", "styles", "value", "values", "model", "entity", "dto",
    "payload", "meta", "args", "arg", "input", "output", "node", "element",
    "key", "id", "type", "types", "field", "fields", "row", "column", "event",
    "handler", "callback", "fn", "cb",
})
# Lower-cased helper names treated as "generated fingerprints" (the
# isRecord/safeParse/sleep/retry family named in the module docstring).
FINGERPRINTS = {
    "isrecord", "isobject", "isplainobject", "isdictionary", "isstring",
    "isnumber", "isboolean", "isarraylike", "isnil", "isempty", "isdefined",
    "ensurearray", "assertarray", "safeparse", "safeparsejson", "safejsonparse",
    "sleep", "delay", "retry", "assertnever", "deepclone", "deepequal", "noop",
    "clamp", "uniq", "unique", "capitalize", "classnames", "cn", "tryparse",
}
211
# Only genuine structural/control-flow keywords. Deliberately EXCLUDES words that
# are commonly variable/method names in other languages (val, map, go, select,
# ...) - masking those would break the structural hash (val stays val while value
# becomes I -> false-negative near-dups).
KEYWORDS = frozenset({
    "if", "else", "elif", "for", "while", "do", "switch", "case", "default",
    "break", "continue", "return", "function", "func", "fn", "def", "class",
    "struct", "interface", "type", "trait", "impl", "enum", "const", "let",
    "var", "new", "delete", "typeof", "instanceof", "in", "of", "is",
    "as", "not", "and", "or", "null", "nil", "none", "None", "true", "false",
    "True", "False", "undefined", "void", "this", "self", "super", "yield",
    "await", "async", "try", "catch", "except", "finally", "throw", "raise",
    "with", "public", "private", "protected", "static", "import",
    "from", "export", "package", "lambda", "pass", "end",
})
# Class-method signature (JS/TS + best-effort C-style); names that are really
# control flow are filtered out after match.
# JS/TS form: optional modifiers, a name, a parameter list, an optional return
# annotation, then `{`. Group 1 is the name.
METHOD_JS = re.compile(
    r"^\s*(?:public\s+|private\s+|protected\s+|static\s+|readonly\s+|async\s+"
    r"|get\s+|set\s+|override\s+|\*\s*)*([A-Za-z_$][\w$]*)\s*\([^;{]*\)\s*"
    r"(?::\s*[^={;]+)?\s*\{"
)
# C-style form: at least one modifier, a return type, a name, a parameter
# list, an optional `where` clause, then `{`. Group 1 is the name.
METHOD_CSTYLE = re.compile(
    r"^\s*(?:(?:public|private|protected|internal|static|final|virtual|override"
    r"|abstract|async|sealed|unsafe)\s+)+[\w<>\[\].,?]+\s+([A-Za-z_]\w*)\s*"
    r"\([^;{]*\)\s*(?:where[^{]+)?\{"
)
# Statement keywords the method regexes can capture as a "name"
# (`if (...) {`, `catch (...) {`); such matches are discarded.
NOT_METHOD = {
    "if", "for", "while", "switch", "catch", "return", "function", "do",
    "else", "with", "await", "new", "delete", "void", "yield", "case",
    "throw", "super", "typeof", "using", "lock", "fixed", "foreach",
}
EXPORT_KEYWORD = re.compile(r"\b(?:export|public|pub)\b")
# Declaration regexes per lang_of() family, tried in list order; group 1 of
# each is the declared name.
FUNC_PATTERNS = {
    "js": [
        re.compile(r"(?:^|\s)(?:export\s+)?(?:default\s+)?(?:async\s+)?function\s*\*?\s*([A-Za-z_$][\w$]*)\s*\("),
        re.compile(r"(?:^|\s)(?:export\s+)?(?:const|let|var)\s+([A-Za-z_$][\w$]*)\s*=\s*(?:async\s*)?(?:\([^)]*\)|[A-Za-z_$][\w$]*)\s*=>"),
        METHOD_JS,
    ],
    "py": [re.compile(r"^\s*(?:async\s+)?def\s+([A-Za-z_]\w*)\s*\(")],
    "go": [re.compile(r"^\s*func\s+(?:\([^)]*\)\s*)?([A-Za-z_]\w*)\s*\(")],
    "rust": [re.compile(r"\bfn\s+([A-Za-z_]\w*)\s*[(<]")],
    "ruby": [re.compile(r"^\s*def\s+(?:self\.)?([A-Za-z_]\w*[!?=]?)")],
    "php": [re.compile(r"\bfunction\s+([A-Za-z_]\w*)\s*\(")],
    "cstyle": [METHOD_CSTYLE],
}
257
+
258
# ---- comment/string tokenization (language-aware, single pass) -----------
# One combined regex per language family, string alternatives FIRST, so a //
# or # inside a string literal can never be amputated as a comment, and an
# unbalanced quote left by comment-stripping can never swallow real code.
# Known line-based blind spots (accepted for zero deps): JS regex literals,
# Rust lifetimes ('a), raw strings ending in a backslash.
_STR_DQ = r'"(?:[^"\\\n]|\\.)*"'  # double-quoted, single line, backslash escapes
_STR_SQ = r"'(?:[^'\\\n]|\\.)*'"  # single-quoted, single line, backslash escapes
_STR_BT = r"`(?:[^`\\]|\\.)*`"  # backtick string; newlines allowed inside
_STR_TRIPLE = r"'''(?s:.*?)'''|\"\"\"(?s:.*?)\"\"\""  # triple-quoted, shortest match
_CMT_SLASH = r"//[^\n]*"  # `//` to end of line
_CMT_HASH = r"#[^\n]*"  # `#` to end of line
_CMT_BLOCK = r"/\*(?s:.*?)\*/"  # `/* ... */`, shortest match, may span lines
_FAMILY_SYNTAX = {
    # family: (string alternatives, comment alternatives)
    "py": (f"{_STR_TRIPLE}|{_STR_DQ}|{_STR_SQ}", _CMT_HASH),
    "ruby": (f"{_STR_DQ}|{_STR_SQ}", _CMT_HASH),
    "php": (f"{_STR_DQ}|{_STR_SQ}", f"{_CMT_SLASH}|{_CMT_BLOCK}|{_CMT_HASH}"),
    "bt": (f"{_STR_BT}|{_STR_DQ}|{_STR_SQ}", f"{_CMT_SLASH}|{_CMT_BLOCK}"),
    "c": (f"{_STR_DQ}|{_STR_SQ}", f"{_CMT_SLASH}|{_CMT_BLOCK}"),
}
# lang_of() result -> tokenizer family; languages not listed here are looked
# up with a "c" default by the functions that use this table.
_LANG_FAMILY = {"py": "py", "ruby": "ruby", "php": "php", "js": "bt", "go": "bt"}
# Compiled tokenizer per family: named group `s` is a string literal, `c` a
# comment. Callers branch on m.lastgroup.
_TOKEN_RX = {
    fam: re.compile(f"(?P<s>{strs})|(?P<c>{cmts})")
    for fam, (strs, cmts) in _FAMILY_SYNTAX.items()
}
284
+
285
+
286
def _strip_comments(text: str, lang: str, string_repl: str | None) -> str:
    """Return `text` with every comment of `lang` replaced by one space.

    String literals are matched first so comment markers inside them are
    left alone. With string_repl=None each literal is kept verbatim;
    otherwise each literal is replaced by string_repl. Languages without
    their own tokenizer family use the "c" family.
    """
    family = _LANG_FAMILY.get(lang, "c")

    def _replace(tok: re.Match[str]) -> str:
        if tok.lastgroup != "s":
            # Comment token: leave a space so neighbouring tokens don't fuse.
            return " "
        if string_repl is None:
            return tok.group(0)
        return string_repl

    return _TOKEN_RX[family].sub(_replace, text)
297
+
298
+
299
def _mask_strings(text: str, lang: str) -> str:
    """Replace every string literal with the single letter L; keep comments.

    Used by detectors that live in comments and code (residue phrases,
    boolean call arguments): a slop phrase quoted inside a string is a
    fixture or UI copy, which is left for the model pass to judge in
    context rather than flagged by the scanner.
    """
    pattern = _TOKEN_RX[_LANG_FAMILY.get(lang, "c")]

    def _mask(tok: re.Match[str]) -> str:
        if tok.lastgroup == "s":
            return "L"
        return tok.group(0)

    return pattern.sub(_mask, text)
306
+
307
+
308
def lang_of(rel: str) -> str:
    """Map a path to the scanner's language family by file extension.

    The extension is whatever follows the last dot, compared
    case-insensitively. Unknown extensions and dot-less paths yield "other".
    """
    _, dot, tail = rel.rpartition(".")
    ext = tail.lower() if dot else ""
    families = {
        "py": ("py",),
        "go": ("go",),
        "rust": ("rs",),
        "ruby": ("rb",),
        "php": ("php",),
        "cstyle": ("java", "kt", "kts", "cs", "scala"),
        "js": ("ts", "tsx", "js", "jsx", "mjs", "cjs", "vue", "svelte", "astro"),
    }
    for family, extensions in families.items():
        if ext in extensions:
            return family
    return "other"
325
+
326
+
327
+ def git(root: str, *args: str) -> str | None:
328
+ # quotepath=false: git would otherwise octal-escape non-ASCII paths and the
329
+ # escaped name would never open. UTF-8 decode: text=True would use the
330
+ # locale codec (cp1252 on Windows) and crash on real UTF-8 diffs.
331
+ try:
332
+ p = subprocess.run(
333
+ ["git", "-C", root, "-c", "core.quotepath=false", *args],
334
+ capture_output=True, encoding="utf-8", errors="replace",
335
+ )
336
+ except OSError:
337
+ return None
338
+ return p.stdout if p.returncode == 0 else ""
339
+
340
+
341
+ def _unquote_path(p: str) -> str:
342
+ """Undo git's C-style quoting of paths with quotes/specials (rare once
343
+ core.quotepath=false handles non-ASCII)."""
344
+ if len(p) >= 2 and p.startswith('"') and p.endswith('"'):
345
+ return re.sub(
346
+ r"\\(.)",
347
+ lambda m: {"n": "\n", "t": "\t"}.get(m.group(1), m.group(1)),
348
+ p[1:-1],
349
+ )
350
+ return p
351
+
352
+
353
+ def parse_added_by_file(diff_text: str) -> dict[str, list[str]]:
354
+ """Added lines per file from ONE `git diff` run (not one process per file)."""
355
+ added: dict[str, list[str]] = {}
356
+ cur: str | None = None
357
+ for ln in diff_text.splitlines():
358
+ if ln.startswith("+++ "):
359
+ path = _unquote_path(ln[4:].strip())
360
+ if path == "/dev/null":
361
+ cur = None
362
+ else:
363
+ cur = path[2:] if path.startswith("b/") else path
364
+ elif cur is not None and ln.startswith("+") and not ln.startswith("+++"):
365
+ added.setdefault(cur, []).append(ln[1:])
366
+ return added
367
+
368
+
369
+ def read_whole(root: str, rel: str) -> tuple[list[str] | None, bool]:
370
+ """First READ_CAP lines and whether the file had more (truncated).
371
+ None = unreadable (missing/permission); callers must surface it, because a
372
+ silent skip turns a vanished file into a false 'no slop found'."""
373
+ try:
374
+ # utf-8-sig: PowerShell writes BOMs by default; a surviving \ufeff on
375
+ # line 1 would defeat every ^-anchored detector.
376
+ with open(os.path.join(root, rel), encoding="utf-8-sig", errors="ignore") as fh:
377
+ lines = [ln.rstrip("\n") for ln in islice(fh, READ_CAP + 1)]
378
+ except OSError:
379
+ return None, False
380
+ if len(lines) > READ_CAP:
381
+ return lines[:READ_CAP], True
382
+ return lines, False
383
+
384
+
385
def _read_for_scan(root: str, rel: str, unreadable: list[str]) -> tuple[list[str], bool]:
    """read_whole() for scan loops: never returns None.

    An unreadable file is recorded by appending `rel` to `unreadable`
    (mutated in place) and reported as ([], False).
    """
    lines, truncated = read_whole(root, rel)
    if lines is not None:
        return lines, truncated
    unreadable.append(rel)
    return [], False
391
+
392
+
393
+ def _uniq(items: list[str]) -> list[str]:
394
+ seen: set[str] = set()
395
+ out: list[str] = []
396
+ for it in items:
397
+ if it not in seen:
398
+ seen.add(it)
399
+ out.append(it)
400
+ return out
401
+
402
+
403
def is_redundant_comment(line: str) -> bool:
    """True when `line` is a short comment that merely narrates the code.

    The line must match COMMENT, and its text - with the leading comment
    marker and any trailing `*/` removed - must be six words or fewer.
    """
    if COMMENT.search(line) is None:
        return False
    without_marker = _CMT_MARKER.sub("", line)
    text = re.sub(r"\*/\s*$", "", without_marker).strip()
    word_count = len(text.split())
    return word_count <= 6
408
+
409
+
410
def _dep_line_hit(ln: str) -> bool:
    """True when `ln` holds at least one DEP match that is a real dependency.

    A match starting at column 0 whose name is a META_KEYS entry
    ("version": "2.1.0") is a manifest field, not a dependency, and does
    not count.
    """
    return any(
        not (hit.start() == 0 and hit.group("name").lower() in META_KEYS)
        for hit in DEP.finditer(ln)
    )
417
+
418
+
419
# Every per-file slop signal scan_lines emits, with its report label (padded
# to one column) and summary label. Gate, totals, and printing derive from
# this one table.
# Shape: finding key -> (report label, summary label).
# NOTE(review): the report labels in this listing are not all the same width
# even though they are described as padded - whitespace may have been
# collapsed in this rendering; confirm against the published file.
_SIGNALS = {
    "dependencies": ("new dependency ", "dep"),
    "abstractions": ("premature abstraction", "abstraction"),
    "redundant_comments": ("redundant comment ", "redundant-comment"),
    "ai_residue": ("AI residue ", "residue"),
    "type_escapes": ("type escape ", "type-escape"),
    "swallowed_errors": ("swallowed error ", "swallowed-error"),
    "tautological_tests": ("tautological test ", "tautology"),
    "async_wrappers": ("async wrapper ", "async-wrapper"),
    "guard_chains": ("guard chain (use ?.) ", "guard-chain"),
    "boolean_traps": ("boolean trap ", "boolean-trap"),
    "select_star": ("SELECT * ", "select-star"),
    "tailwind_slop": ("tailwind smell ", "tailwind"),
}
# Finding keys in table order.
_SIGNAL_KEYS = tuple(_SIGNALS)
437
+
438
+
439
def _file_slop(r: Finding) -> bool:
    """True when the per-file record `r` has at least one non-empty signal list."""
    for key in _SIGNAL_KEYS:
        if r[key]:
            return True
    return False
441
+
442
+
443
def scan_lines(rel: str, lines: list[str], audit: bool) -> Finding | None:
    """Run every per-line detector over `lines` of file `rel`.

    `audit` is True for whole-file scans and False for diff scans (where
    `lines` holds the added lines). New-dependency detection and the
    "substantial change" flag apply only when `audit` is False.

    Returns None when there are no lines, or when nothing was found and the
    change is not substantial. Otherwise returns a record with "file",
    "added_lines" (non-blank line count), "substantial", and one
    de-duplicated list per _SIGNAL_KEYS entry.
    """
    if not lines:
        return None
    lang = lang_of(rel)
    _, dot, tail = rel.rpartition(".")
    ext = tail.lower() if dot else ""
    is_source = SOURCE.search(rel) is not None
    check_deps = not audit and MANIFEST.search(os.path.basename(rel)) is not None
    brace_family = lang in ("js", "cstyle", "php")
    hits: dict[str, list[str]] = {key: [] for key in _SIGNAL_KEYS}
    last = len(lines) - 1

    for i, ln in enumerate(lines):
        snippet = ln.strip()[:100]
        following = lines[i + 1] if i < last else None

        # Dep detection reads raw lines: in manifests the strings ARE the data.
        if check_deps and _dep_line_hit(ln):
            hits["dependencies"].append(snippet)

        # Comment-habitat detectors run on the string-masked line (comments
        # kept): a slop pattern quoted in a string is a fixture, a log
        # message, or UI copy - context for the model pass, not the scanner.
        masked = _mask_strings(ln, lang)

        # Code signals only apply to source files: pattern vocabulary inside
        # markdown prose is documentation, not an abstraction. (Deps stay
        # separate - manifests are not SOURCE files.)
        # NOTE(review): nesting was reconstructed from a listing with
        # flattened indentation; confirm the redundant-comment and residue
        # checks belong under this gate.
        if is_source:
            pattern_hit = ABSTRACTION.search(masked) or VOCAB.search(masked)
            if pattern_hit:
                hits["abstractions"].append(pattern_hit.group(1))
            if is_redundant_comment(masked):
                hits["redundant_comments"].append(snippet)
            # Banner/emoji stay on the raw line (a celebration emoji in
            # user-facing copy IS the slop).
            residue = (
                RESIDUE_PHRASE.search(masked)
                or RESIDUE_BANNER.match(ln)
                or RESIDUE_EMOJI.search(ln)
            )
            if residue:
                hits["ai_residue"].append(snippet)

        if brace_family:
            # Mask strings + comments first: `as any` and `catch {}` are also
            # English prose / string content.
            code = _strip_comments(ln, lang, "L")
            if lang == "js":
                suppressed = TS_SUPPRESS.search(masked)
                any_cast = not _CMT_MARKER.match(ln) and TS_ANY.search(code)
                if suppressed or any_cast:
                    hits["type_escapes"].append(snippet)
            if JS_SWALLOW.search(code):
                hits["swallowed_errors"].append(snippet)
            # Raw line: the string-literal tautology alternative needs the
            # actual quotes, which masking would erase.
            if JS_TAUTOLOGY.search(ln):
                hits["tautological_tests"].append(snippet)
            if BOOL_PAIR_JS.search(code):
                hits["boolean_traps"].append(snippet)
            if lang == "js":
                if ASYNC_WRAPPER.search(code):
                    hits["async_wrappers"].append(snippet)
                guard = GUARD_RETURN.match(ln)
                if guard and following is not None:
                    deeper = GUARD_RETURN.match(following)
                    # Next guard tests a property of this guard's subject.
                    if deeper and deeper.group(1).startswith(guard.group(1) + "."):
                        hits["guard_chains"].append(snippet)
        elif lang == "py":
            if PY_TYPE_ESCAPE.search(masked):
                hits["type_escapes"].append(snippet)
            one_line_swallow = PY_SWALLOW.match(ln)
            two_line_swallow = (
                PY_SWALLOW_HEAD.match(ln)
                and following is not None
                and PY_PASS.match(following)
            )
            if one_line_swallow or two_line_swallow:
                hits["swallowed_errors"].append(snippet)
            if PY_TAUTOLOGY.search(ln):
                hits["tautological_tests"].append(snippet)
            if BOOL_PAIR_PY.search(masked):
                hits["boolean_traps"].append(snippet)

        if ext == "sql" and not ln.lstrip().startswith("--") and SELECT_STAR.search(ln):
            hits["select_star"].append(snippet)
        if (lang == "js" or ext == "html") and (
            TAILWIND_SOUP.search(ln) or TAILWIND_MAGIC_PX.search(ln)
        ):
            hits["tailwind_slop"].append(snippet)

    deduped = {key: _uniq(values) for key, values in hits.items()}
    added_count = len([ln for ln in lines if ln.strip()])
    substantial = (not audit) and is_source and added_count >= CHECKLIST_LINES
    if not substantial and not any(deduped.values()):
        return None
    report: Finding = {
        "file": rel,
        "added_lines": added_count,
        "substantial": substantial,
    }
    report.update(deduped)
    return report
529
+
530
+
531
+ # ---- body capture + hashing ---------------------------------------------
532
+ def _indent(s: str) -> int:
533
+ return len(s) - len(s.lstrip())
534
+
535
+
536
def capture_body(lines: list[str], start_idx: int, lang: str) -> tuple[str, bool]:
    """Capture the body of the definition whose signature is lines[start_idx].

    Returns (body text, truncated). truncated is True when the BODY_CAP
    window ran out before the end of the body was seen.

    Python/Ruby bodies end by indentation (Ruby: at an `end` no deeper than
    the signature). Every other language is walked by brace depth; an arrow
    function with no `{` after `=>` yields its expression up to the first
    `;`. A signature with no `{` within six lines, or one that hits a `;`
    first on a line without `=>`, yields ("", False).
    """
    if lang in ("py", "ruby"):
        base = _indent(lines[start_idx])
        first = start_idx + 1
        body: list[str] = []
        ended = False
        for ln in lines[first:first + BODY_CAP]:
            text = ln.strip()
            closes = (lang == "py" and text) or (lang == "ruby" and text == "end")
            if closes and _indent(ln) <= base:
                ended = True
                break
            body.append(ln)
        if ended:
            return "\n".join(body), False
        # Window exhausted without a terminator: truncated only if the file
        # actually continues past the window.
        return "\n".join(body), len(lines) > first + BODY_CAP

    # brace languages
    sig = lines[start_idx]
    arrow = sig.find("=>")
    if arrow != -1 and "{" not in sig[arrow:]:
        return sig[arrow + 2:].partition(";")[0], False

    for offset, candidate in enumerate(lines[start_idx:start_idx + 6]):
        if "{" in candidate:
            brace_line = start_idx + offset
            break
        if ";" in candidate and "=>" not in candidate:
            return "", False
    else:
        return "", False

    depth = 0
    started = False
    chars: list[str] = []
    for ln in lines[brace_line:brace_line + 1 + BODY_CAP]:
        for ch in ln:
            if ch == "{":
                depth += 1
                if depth == 1:
                    # The outermost opening brace is not part of the body.
                    started = True
                    continue
            elif ch == "}":
                depth -= 1
                if depth == 0 and started:
                    return "".join(chars), False
            if started:
                chars.append(ch)
        if started:
            chars.append("\n")
    # Never saw the closing brace inside the window: cap hit (or EOF mid-body).
    return "".join(chars), True
584
+
585
+
586
def normalize_body(text: str, lang: str) -> str:
    """Canonical text for the exact-duplicate hash.

    Comments are dropped and every whitespace run collapses to one space,
    but string literals survive verbatim (URLs differ => bodies differ).
    """
    without_comments = _strip_comments(text, lang, string_repl=None)
    collapsed = re.sub(r"\s+", " ", without_comments)
    return collapsed.strip()
591
+
592
+
593
def structural_body(text: str, lang: str) -> str:
    """Shape-only text for the near-duplicate hash.

    Comments are dropped, strings and numbers are replaced by placeholders,
    every identifier that is not in KEYWORDS becomes I, and all whitespace
    is removed. Two bodies with the same control-flow shape but drifted
    names/values therefore produce the same text.
    """
    shape = _strip_comments(text, lang, string_repl="L")
    shape = re.sub(r"\b\d[\w.]*\b", "N", shape)

    def _mask_identifier(tok: re.Match[str]) -> str:
        word = tok.group(0)
        return word if word in KEYWORDS else "I"

    # The L and N placeholders are identifier-shaped and not in KEYWORDS, so
    # this pass rewrites them to I as well.
    shape = ID.sub(_mask_identifier, shape)
    return re.sub(r"\s+", "", shape)
600
+
601
+
602
+ def _digest(s: str) -> str:
603
+ # blake2b, not md5: FIPS-enabled Pythons refuse md5 outright.
604
+ return hashlib.blake2b(s.encode("utf-8"), digest_size=16).hexdigest()
605
+
606
+
607
+ def _is_exported(name: str, line: str, lang: str) -> bool:
608
+ if lang == "py":
609
+ return not name.startswith("_") # public def: importable anywhere
610
+ if lang == "go":
611
+ return name[:1].isupper() # Go exports by capitalization
612
+ return (bool(EXPORT_KEYWORD.search(line))
613
+ or "module.exports" in line or "exports." in line)
614
+
615
+
616
def collect_defs(rel: str, lines: list[str]) -> list[Finding]:
    """Find function/method definitions in `lines` and fingerprint their bodies.

    Returns one record per definition with "name", "file", "line" (1-based),
    "exported", "exact" (digest of the normalized body, or None),
    "struct" (digest of the structural body, or None), "body_lines" and
    "truncated". Files whose language has no FUNC_PATTERNS entry yield [].
    """
    lang = lang_of(rel)
    decls = FUNC_PATTERNS.get(lang)
    if not decls:
        return []

    records: list[Finding] = []
    for idx, ln in enumerate(lines):
        name = None
        for rx in decls:
            hit = rx.search(ln)
            if hit is None:
                continue
            # The method regexes also match `if (...) {` and friends; drop
            # those and let the remaining patterns try.
            if rx in (METHOD_JS, METHOD_CSTYLE) and hit.group(1) in NOT_METHOD:
                continue
            name = hit.group(1)
            break
        if not name:
            continue

        raw, truncated = capture_body(lines, idx, lang)
        exact_text = normalize_body(raw, lang)
        shape_text = structural_body(raw, lang)
        # Non-blank lines only: the brace walk pads raw with edge newlines,
        # and counting them would let `super(props);` pass the 3-line floor.
        nonblank = [s for s in raw.splitlines() if s.strip()]
        body_lines = len(nonblank) or 1
        # Exact-dup hash needs substance (>=3 lines or >=60 chars): one-line
        # boilerplate like `super(props);` is not knowledge worth consolidating.
        # A truncated body is a prefix, not the function - never call it exact.
        substantial = body_lines >= 3 or len(exact_text) >= 60
        hash_exact = not truncated and len(exact_text) >= 12 and substantial
        # Structural hash skips trivial one-liners (return I;).
        hash_struct = len(shape_text) >= 20
        records.append({
            "name": name,
            "file": rel,
            "line": idx + 1,
            "exported": _is_exported(name, ln, lang),
            "exact": _digest(exact_text) if hash_exact else None,
            "struct": _digest(shape_text) if hash_struct else None,
            "body_lines": body_lines,
            "truncated": truncated,
        })
    return records
654
+
655
+
656
def collect_types(rel: str, lines: list[str]) -> list[Finding]:
    """Return a ``{"name", "file"}`` record for each ``TYPE_DECL`` match.

    Only files whose language resolves to ``"js"`` are inspected; any other
    language yields an empty list.
    """
    if lang_of(rel) != "js":
        return []
    hits = (TYPE_DECL.search(text) for text in lines)
    return [{"name": hit.group(1), "file": rel} for hit in hits if hit]
665
+
666
+
667
def analyze_duplication(defs: list[Finding], types: list[Finding],
                        idfreq: Counter[str], full_scope: bool) -> Finding:
    """Summarise duplication signals across every collected definition.

    Args:
        defs: definition records (as built by ``collect_defs``).
        types: type-declaration records with ``name`` and ``file``.
        idfreq: occurrence count of each identifier across the scanned lines.
        full_scope: whether reference counts may be trusted; dead/single-use
            helper analysis runs only when this is true.

    Returns:
        A dict with ``name_clones``, ``body_clones``, ``near_clones``,
        ``single_use``, ``type_clones``, ``fingerprints``, ``micro_count``,
        ``total_defs`` and ``truncated_defs``.
    """
    def distinct(group, field):
        # Sorted unique values of one field across a group of records.
        return sorted({item[field] for item in group})

    def by_count(clones):
        # Largest groups first; the sort is stable, so ties keep their order.
        return sorted(clones, key=lambda clone: clone["count"], reverse=True)

    by_name = defaultdict(list)
    by_exact = defaultdict(list)
    by_struct = defaultdict(list)
    for item in defs:
        by_name[item["name"]].append(item)
        for field, bucket in (("exact", by_exact), ("struct", by_struct)):
            digest = item[field]
            if digest:
                bucket[digest].append(item)

    # Same function name defined in two or more files.
    name_clones = []
    for name, group in by_name.items():
        files = distinct(group, "file")
        if len(files) < 2 or name.lower() in COMMON_NAMES:
            continue
        name_clones.append({"name": name, "count": len(group), "files": files,
                            "confidence": "name-only"})
    name_clones = by_count(name_clones)

    # Identical normalized bodies under different names and/or files.
    body_clones = []
    for group in by_exact.values():
        if len(group) < 2:
            continue
        names = distinct(group, "name")
        files = distinct(group, "file")
        if len(files) >= 2 or len(names) >= 2:
            body_clones.append({"names": names, "files": files,
                                "count": len(group), "confidence": "exact"})
    body_clones = by_count(body_clones)

    # Same structural shape but at least two genuinely different bodies.
    near_clones = []
    for group in by_struct.values():
        if len(group) < 2:
            continue
        exact_digests = {item["exact"] for item in group if item["exact"]}
        if len(exact_digests) < 2:
            continue
        near_clones.append({"names": distinct(group, "name"),
                            "files": distinct(group, "file"),
                            "count": len(group), "confidence": "structural"})
    near_clones = by_count(near_clones)

    # Reference counts only mean something when every file was scanned; on a
    # partial file list "0 references" is an artifact of the scope, and acting
    # on it would delete live code.
    single_use: list[Finding] = []
    if full_scope:
        reported = set()
        for item in defs:
            name = item["name"]
            lowered = name.lower()
            if name in reported or lowered in COMMON_NAMES:
                continue
            util_ish = bool(MICRO_PREFIX.search(name)) or lowered in FINGERPRINTS
            if item["body_lines"] > 3 and not util_ish:
                continue
            # Export-aware: exported defs may be public API / framework entry
            # points, so only repo-internal defs are judged.
            if item.get("exported"):
                continue
            # Occurrences that are not the definitions themselves.
            refs = idfreq.get(name, 0) - len(by_name[name])
            if refs == 0:
                kind = "dead"
            elif refs == 1 and util_ish:
                kind = "inline"
            else:
                continue
            reported.add(name)
            single_use.append({"name": name, "file": item["file"], "kind": kind})

    fingerprint_counts = Counter(
        item["name"] for item in defs if item["name"].lower() in FINGERPRINTS)
    fingerprints = dict(sorted(fingerprint_counts.items(),
                               key=lambda pair: pair[1], reverse=True))
    micro = len([item for item in defs
                 if MICRO_PREFIX.search(item["name"]) and item["body_lines"] <= 3])

    type_files = defaultdict(set)
    for decl in types:
        type_files[decl["name"]].add(decl["file"])
    type_clones = []
    for type_name, files in type_files.items():
        if len(files) >= 2 and type_name.lower() not in COMMON_TYPES:
            type_clones.append({"name": type_name, "files": sorted(files),
                                "confidence": "name-only"})

    return {
        "name_clones": name_clones,
        "body_clones": body_clones,
        "near_clones": near_clones,
        "single_use": single_use,
        "type_clones": type_clones,
        "fingerprints": fingerprints,
        "micro_count": micro,
        "total_defs": len(defs),
        "truncated_defs": sum(bool(item["truncated"]) for item in defs),
    }
752
+
753
+
754
def target_files(root: str, base: str, paths: list[str], is_git: bool,
                 all_mode: bool) -> tuple[list[str], set[str], bool]:
    """Resolve the files to scan.

    Returns ``(files, untracked, capped)``: the files to scan, the subset of
    them that git reports as untracked, and whether the list was cut at
    ``FILE_CAP``.

    Explicit *paths* take precedence over any git-derived list. Without paths
    and outside a git repo there is nothing to scan. With *all_mode* the list
    is the tracked files matching ``SOURCE``; otherwise it is the files
    changed against *base* plus untracked files.
    """
    if paths:
        # Normalise Windows separators so paths compare like git's output.
        return [p.replace("\\", "/") for p in paths], set(), False
    if not is_git:
        return [], set(), False

    if all_mode:
        sources = []
        for row in (git(root, "ls-files") or "").splitlines():
            entry = row.strip()
            if not entry:
                continue
            path = _unquote_path(entry)
            if SOURCE.search(path):
                sources.append(path)
        return sources[:FILE_CAP], set(), len(sources) > FILE_CAP

    changed = (git(root, "diff", "--name-only", base) or "").splitlines()
    others = (git(root, "ls-files", "--others", "--exclude-standard") or "").splitlines()
    untracked = {_unquote_path(row.strip()) for row in others if row.strip()}
    # Order-preserving de-duplication: changed files first, then untracked.
    unquoted = (_unquote_path(row.strip()) for row in changed + others)
    ordered = list(dict.fromkeys(path for path in unquoted if path))
    return ordered, untracked, False
777
+
778
+
779
def _dup_has_findings(dup: Finding | None) -> bool:
    """Report whether a duplication summary holds at least one signal.

    ``None`` or an empty summary counts as "no findings". The keys are
    checked lazily in a fixed order, stopping at the first non-empty one.
    """
    if not dup:
        return False
    signal_keys = ("name_clones", "body_clones", "near_clones", "single_use",
                   "type_clones", "fingerprints", "micro_count")
    return any(dup[key] for key in signal_keys)
785
+
786
+
787
def _print_capped(label: str, items: list[str]) -> None:
    """Print up to ``SHOW_CAP`` labelled items, then a count of the rest."""
    for entry in items[:SHOW_CAP]:
        print(f" {label}: {entry}")
    hidden = len(items) - SHOW_CAP
    if hidden > 0:
        print(f" ... +{hidden} more {label.strip()}")
792
+
793
+
794
def _print_duplication(dup: Finding) -> bool:
    """Print the duplication section of the text report.

    Returns False without printing when the summary holds no findings, and
    True once the section has been written. Each list is shown up to a fixed
    per-section limit.
    """
    name_clones, body_clones = dup["name_clones"], dup["body_clones"]
    near_clones, single_use = dup["near_clones"], dup["single_use"]
    type_clones, fingerprints = dup["type_clones"], dup["fingerprints"]
    if not _dup_has_findings(dup):
        return False

    def group_line(clone):
        # Shared layout for exact and structural clone groups.
        joined_names = "/".join(clone["names"][:4])
        joined_files = ", ".join(clone["files"][:5])
        return f" [{joined_names}] x{clone['count']} across {joined_files}"

    print("DUPLICATION (whole-codebase - the isRecord-class slop):")
    if name_clones:
        print(" Clone proliferation - same function name in multiple files [confidence: name-only]:")
        for clone in name_clones[:15]:
            shown_files = ", ".join(clone["files"][:6])
            print(f" {clone['name']:<22} x{clone['count']:<3} {shown_files}")
    if body_clones:
        print(" Knowledge duplication - identical body, consolidate to ONE (DRY) [confidence: exact]:")
        for clone in body_clones[:15]:
            print(group_line(clone))
    if near_clones:
        print(" Semantic fragmentation - near-identical bodies (drifted clones) [confidence: structural]:")
        for clone in near_clones[:12]:
            print(group_line(clone))
    if single_use:
        print(" Semantic density collapse - dead / single-use helpers:")
        for helper in single_use[:15]:
            if helper["kind"] == "dead":
                tag = "unused & not exported -> delete"
            else:
                tag = "used once -> inline"
            print(f" {helper['name']:<24} {tag:<32} {helper['file']}")
    if type_clones:
        print(" Duplicate type/interface names [confidence: name-only]:")
        for clone in type_clones[:10]:
            shown_files = ", ".join(clone["files"][:6])
            print(f" {clone['name']:<22} {shown_files}")
    if fingerprints:
        shown = list(fingerprints.items())[:12]
        print(" Generated-code fingerprints present: "
              + ", ".join(f"{name}({count})" for name, count in shown))
    if dup["micro_count"]:
        print(f" Micro-abstraction load: {dup['micro_count']} tiny is*/assert*/safe* "
              f"helpers of {dup['total_defs']} defs (Helper Hell risk)")
    print(" -> Consolidate clones to one shared definition, inline single-use helpers,")
    print(" re-point imports, delete the rest. One source of truth per concept.\n")
    return True
830
+
831
+
832
def main() -> int:
    """Run the scanner CLI and return the process exit status.

    Explicit ``paths`` or ``--all`` select audit mode: whole files are read
    and duplication analysis runs. Otherwise the scan covers the per-file
    lines that ``parse_added_by_file`` extracts from ``git diff <base>``, with
    untracked files read whole. Output is text or JSON per ``--format``.

    Dead/single-use helper analysis is only requested when the whole
    codebase was scanned completely: ``--all`` without explicit paths, and
    neither the file cap nor the per-file read cap was hit.

    Returns:
        1 when ``--gate`` is set and slop was found, otherwise 0.
    """
    # A cp1252 pipe (Windows capture) must degrade, not crash, on the
    # non-ASCII paths core.quotepath=false deliberately preserves.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(errors="replace")
    ap = argparse.ArgumentParser(description="Static AI-slop signal scanner (reports only).")
    ap.add_argument("paths", nargs="*", help="specific files to audit (default: the git diff)")
    ap.add_argument("--root", default=".", help="repo root (default: cwd)")
    ap.add_argument("--base", default="HEAD", help="git ref to diff against in diff scope")
    ap.add_argument("--all", action="store_true", help="audit ALL tracked source files + duplication")
    ap.add_argument("--gate", action="store_true",
                    help="exit non-zero if slop is found, in any output format "
                         "(the size-only 'substantial' note never gates)")
    ap.add_argument("--format", choices=["text", "json"], default="text")
    args = ap.parse_args()

    root = os.path.abspath(args.root)
    is_git = bool(git(root, "rev-parse", "--git-dir"))
    audit = args.all or bool(args.paths)
    full_scope = args.all and not args.paths
    files, untracked, files_capped = target_files(root, args.base, args.paths, is_git, args.all)

    warnings: list[str] = []
    if files_capped:
        warnings.append(f"file list capped at {FILE_CAP} tracked source files; scan is partial")
    if audit and not full_scope:
        warnings.append("partial scope (explicit paths): single-use/dead-helper analysis "
                        "suppressed - reference counts need the whole codebase (--all)")

    file_lines: dict[str, list[str]] = {}
    read_capped: list[str] = []
    unreadable: list[str] = []
    if audit:
        for f in files:
            lines, truncated = _read_for_scan(root, f, unreadable)
            file_lines[f] = lines
            if truncated:
                read_capped.append(f)
    else:
        # User diff.noprefix / diff.mnemonicPrefix config would change the
        # +++ headers and silently break the b/ stripping in the parser.
        diff_added = parse_added_by_file(
            git(root, "-c", "diff.noprefix=false", "-c", "diff.mnemonicprefix=false",
                "diff", args.base) or "") if is_git else {}
        for f in files:
            if f in untracked:
                # Untracked files have no diff; scan them whole.
                lines, truncated = _read_for_scan(root, f, unreadable)
                file_lines[f] = lines
                if truncated:
                    read_capped.append(f)
            else:
                file_lines[f] = diff_added.get(f, [])
    if read_capped:
        shown = ", ".join(read_capped[:5]) + (" ..." if len(read_capped) > 5 else "")
        warnings.append(f"{len(read_capped)} file(s) hit the {READ_CAP}-line read cap "
                        f"(tail unscanned): {shown}")
    if unreadable:
        shown = ", ".join(unreadable[:5]) + (" ..." if len(unreadable) > 5 else "")
        warnings.append(f"{len(unreadable)} file(s) could NOT be read and were NOT "
                        f"scanned (missing or unreadable): {shown}")

    # A capped file list or capped reads leave live code unscanned, so a
    # "0 references" count would be an artifact of the cap. Treat that as a
    # partial scope rather than flagging helpers as dead.
    complete_scan = bool(full_scope and not files_capped and not read_capped)
    if full_scope and not complete_scan:
        warnings.append("partial scan (file or read cap hit): single-use/dead-helper "
                        "analysis suppressed - reference counts need every file read in full")

    results = [r for r in (scan_lines(f, ls, audit) for f, ls in file_lines.items()) if r]

    dup: Finding | None = None
    if audit:
        idfreq: Counter[str] = Counter()
        defs: list[Finding] = []
        types: list[Finding] = []
        for f, ls in file_lines.items():
            for ln in ls:
                idfreq.update(ID.findall(ln))
            defs.extend(collect_defs(f, ls))
            types.extend(collect_types(f, ls))
        dup = analyze_duplication(defs, types, idfreq, complete_scan)
        if dup["truncated_defs"]:
            warnings.append(f"{dup['truncated_defs']} function bodies hit the {BODY_CAP}-line "
                            "capture cap; excluded from exact-duplicate hashing")
    mode = "audit" if audit else "diff"
    scope = ("explicit paths" if args.paths
             else "whole codebase (--all)" if args.all
             else f"diff vs {args.base}")

    totals: dict[str, int] = {k: sum(len(r[k]) for r in results)
                              for k in _SIGNAL_KEYS}
    totals["files"] = len(results)

    # The gate trips on slop only. "Substantial" is a checklist nudge for big
    # clean changes, not a defect - a clean 40-line diff must pass.
    slop_found = any(_file_slop(r) for r in results) or _dup_has_findings(dup)
    exit_code = 1 if args.gate and slop_found else 0

    if args.format == "json":
        print(json.dumps({"mode": mode, "scope": scope, "base": args.base, "git": is_git,
                          "files_scanned": len(files), "totals": totals,
                          "slop_found": slop_found, "warnings": warnings,
                          "files": results, "duplication": dup}, indent=2))
        return exit_code

    if not is_git and not args.paths:
        print("anti-slop scan: not a git repo and no paths given.")
        print("Audit files: `scan_slop.py src/foo.ts`, or run inside a git repo with --all.")
        return 0

    print(f"anti-slop scan - scope: {scope}, {len(files)} file(s)\n")
    for w in warnings:
        print(f" note: {w}")
    if warnings:
        print()

    has_dup = bool(dup and _print_duplication(dup))

    if results:
        print(f"{totals['files']} file(s) with per-file signals\n")
        for r in results:
            print(r["file"])
            for key, (label, _short) in _SIGNALS.items():
                _print_capped(label, r[key])
            # The size note appears only for files with no slop signals.
            if r["substantial"] and not _file_slop(r):
                print(f" +{r['added_lines']} added lines (>= {CHECKLIST_LINES}: run the checklist)")
            print()

    if not results and not has_dup:
        if audit:
            print(f"No static slop patterns across {len(files)} file(s).")
            print("Clean of the deterministic signals. Semantic slop still needs a model pass:")
            print("invoke the anti-slop skill for edge cases / superficial tests / cargo-cult.")
        elif not files:
            print(f"Nothing changed vs {args.base} (clean working tree).")
            print("Diff scope only vets a change in progress. To review existing code:")
            print(" scan_slop.py --all (whole codebase + duplication)")
            print(" scan_slop.py path/to/file (specific files)")
        else:
            print(f"{len(files)} changed file(s) scanned - no static slop signals.")
            print("Semantic slop still needs a model pass; walk the SKILL.md taxonomy.")
        if args.gate:
            print("GATE: PASS (no slop)")
        return exit_code

    parts = [f"{totals[k]} {short}" for k, (_label, short) in _SIGNALS.items()
             if totals[k]]
    print(f"SUMMARY ({mode}): "
          + (", ".join(parts) if parts else "no per-file slop signals"), end="")
    if dup:
        print(f"; {len(dup['name_clones'])} name-clone, {len(dup['body_clones'])} exact-dup, "
              f"{len(dup['near_clones'])} near-dup, {len(dup['single_use'])} single-use", end="")
    print(".")
    if slop_found:
        print("Fix every signal above, then walk the full taxonomy in SKILL.md and re-scan (expect clean).")
    if args.gate:
        print("GATE: FAIL (slop found)" if slop_found else "GATE: PASS (no slop)")
    return exit_code
983
+
984
+
985
# Script entry point: exit with main()'s status code.
if __name__ == "__main__":
    raise SystemExit(main())