cursordoctrine 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/INSTALL.md +113 -0
- package/LICENSE +21 -0
- package/README.md +86 -0
- package/bin/cli.mjs +413 -0
- package/linux/USER-RULES.md +12 -0
- package/linux/doctrine.md +172 -0
- package/linux/hooks/anti-slop-audit.sh +163 -0
- package/linux/hooks/anti-slop.md +56 -0
- package/linux/hooks/final-review.md +52 -0
- package/linux/hooks/final-review.sh +99 -0
- package/linux/hooks/hook-common.sh +120 -0
- package/linux/hooks/minimal-edit-audit.sh +112 -0
- package/linux/hooks/permission-gate.sh +75 -0
- package/linux/hooks/post-tool-use.sh +53 -0
- package/linux/hooks/self-review-trigger.sh +56 -0
- package/linux/hooks/self-review.md +48 -0
- package/linux/hooks/subagent-stop-review.sh +93 -0
- package/linux/hooks.json +64 -0
- package/linux/inject-doctrine.sh +31 -0
- package/package.json +40 -0
- package/skills/anti-slop/SKILL.md +267 -0
- package/skills/anti-slop/scripts/scan_slop.py +986 -0
- package/windows/USER-RULES.md +12 -0
- package/windows/doctrine.md +172 -0
- package/windows/hooks/anti-slop-audit.ps1 +182 -0
- package/windows/hooks/anti-slop.md +56 -0
- package/windows/hooks/final-review.md +52 -0
- package/windows/hooks/final-review.ps1 +105 -0
- package/windows/hooks/hook-common.ps1 +84 -0
- package/windows/hooks/minimal-edit-audit.ps1 +116 -0
- package/windows/hooks/permission-gate.ps1 +98 -0
- package/windows/hooks/post-tool-use.ps1 +46 -0
- package/windows/hooks/self-review-trigger.ps1 +83 -0
- package/windows/hooks/self-review.md +48 -0
- package/windows/hooks/subagent-stop-review.ps1 +89 -0
- package/windows/hooks.json +64 -0
- package/windows/inject-doctrine.ps1 +58 -0
|
@@ -0,0 +1,986 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""scan_slop.py - static AI-slop signal scanner for the anti-slop skill.
|
|
3
|
+
|
|
4
|
+
Reports cheap, high-precision slop signals so the agent starts its de-slop sweep
|
|
5
|
+
from a deterministic inventory. Semantic slop (cargo-cult, superficial tests) is
|
|
6
|
+
deliberately NOT guessed - that stays the model's job. This seeds it.
|
|
7
|
+
|
|
8
|
+
Scopes:
|
|
9
|
+
* DIFF (default): only what changed vs --base (git diff). Silent on a clean
|
|
10
|
+
tree by design. Flags NEW deps, premature abstractions, redundant comments,
|
|
11
|
+
AI residue (placeholder phrases / banner comments / emoji), type escapes
|
|
12
|
+
(as any, as unknown as, @ts-ignore, Python type-ignore pragmas), swallowed
|
|
13
|
+
errors (empty catch, broad except+pass), tautological asserts, pointless
|
|
14
|
+
async wrappers (await Promise.resolve, async executors), deepening guard
|
|
15
|
+
chains (the optional-chaining shape), boolean-pair call traps, SELECT *
|
|
16
|
+
in .sql files, and Tailwind class soup / magic-px values. All per-file
|
|
17
|
+
signals also run in AUDIT scope; only new-dependency detection is diff-only
|
|
18
|
+
(every line of an existing manifest would otherwise read as "new").
|
|
19
|
+
* AUDIT (--all, or explicit paths): the WHOLE codebase, with the duplication
|
|
20
|
+
analysis that catches the isRecord()-class slop:
|
|
21
|
+
- Clone Proliferation : same function name in multiple files
|
|
22
|
+
- Knowledge Duplication : identical body under different names (DRY)
|
|
23
|
+
- Semantic Fragmentation : near-identical bodies (same shape, drifted
|
|
24
|
+
names/values) - the diverged clones
|
|
25
|
+
- Semantic Density Collapse: tiny helpers used 0-1 times (dead / inline)
|
|
26
|
+
(--all only: reference counts over a partial file list are meaningless,
|
|
27
|
+
so explicit-path scans suppress this analysis instead of guessing)
|
|
28
|
+
- Generated Fingerprints : isRecord/safeParse/sleep/retry/... repeated
|
|
29
|
+
- Duplicate type/interface names across files
|
|
30
|
+
Functions/methods are parsed for JS/TS, Python, Go, Rust, Ruby, PHP, and
|
|
31
|
+
(best-effort) Java/Kotlin/C#/Scala.
|
|
32
|
+
|
|
33
|
+
Every duplication finding carries a confidence tier: exact (identical
|
|
34
|
+
normalized body) > structural (same shape, drifted names/values) > name-only
|
|
35
|
+
(same identifier). The report header states the scope; caps that were hit
|
|
36
|
+
(file list, per-file lines, per-body lines) are disclosed as notes.
|
|
37
|
+
|
|
38
|
+
Stdlib only; Python 3.9+. REPORTS only - never edits. Exits 0, except with
|
|
39
|
+
--gate: exit 1 when slop is found (the size-only "substantial change" note
|
|
40
|
+
never gates). The agent does the fixing.
|
|
41
|
+
|
|
42
|
+
Usage:
|
|
43
|
+
python scripts/scan_slop.py --all --root . # WHOLE-codebase audit (recommended)
|
|
44
|
+
python scripts/scan_slop.py --root . # diff vs HEAD (a change in progress)
|
|
45
|
+
python scripts/scan_slop.py src/foo.ts src/bar.py # audit specific files
|
|
46
|
+
python scripts/scan_slop.py --all --format json # machine-readable
|
|
47
|
+
"""
|
|
48
|
+
from __future__ import annotations
|
|
49
|
+
|
|
50
|
+
import argparse
|
|
51
|
+
import hashlib
|
|
52
|
+
import json
|
|
53
|
+
import os
|
|
54
|
+
import re
|
|
55
|
+
import subprocess
|
|
56
|
+
import sys
|
|
57
|
+
from collections import Counter, defaultdict
|
|
58
|
+
from itertools import islice
|
|
59
|
+
from typing import Any
|
|
60
|
+
|
|
61
|
+
# One per-file report record: scan_lines fills it with "file", "added_lines",
# "substantial" and one list per signal key.
Finding = dict[str, Any]

# ---- per-file signal patterns -------------------------------------------
# Only manifest formats the DEP regex genuinely parses (name [:=@] version).
# go.mod / pom.xml / Gradle / Gemfile / .csproj declare deps in syntaxes DEP
# cannot match, so listing them would only over-claim coverage.
MANIFEST = re.compile(
    r"^(package\.json|requirements[\w.\-]*\.txt|pyproject\.toml|Pipfile"
    r"|Cargo\.toml|composer\.json)$"
)
# A (possibly quoted) name at line start or after `{` / `,`, followed by a
# version-looking value; the name is captured in the `name` group.
DEP = re.compile(
    r"""(?:^|[{,])\s*(?:"|')?(?P<name>[A-Za-z@][\w@\-./\[\]]*)(?:"|')?\s*"""
    r"""(?:[:=]\s*(?:"|')?[\^~><=*v]?\d|[><=~!]=\s*\d|@\s*\^?\d)"""
)
# Manifest metadata that pairs a name with a version-looking value without
# declaring a dependency. Skipped only at line start: `serde = { version = ...`
# matches mid-line and IS a real dependency.
META_KEYS = frozenset({
    "version", "name", "node", "npm", "yarn", "pnpm", "packagemanager",
    "engines", "python", "private", "description",
})
# Type declarations whose capitalized name ends in a design-pattern suffix;
# group 1 is the full type name.
ABSTRACTION = re.compile(
    r"\b(?:class|interface|struct|trait|protocol)\s+"
    r"([A-Z][A-Za-z0-9_]*(?:Factory|Repository|Mediator|Strategy|Singleton"
    r"|Facade|Builder|Visitor|Decorator|Wrapper|Orchestrator|Registry))\b"
)
# [C]QRS: the brackets change nothing semantically but keep the alternative
# from matching its own definition when the scanner audits itself.
VOCAB = re.compile(
    r"\b([C]QRS|Event[\s\-]?Sourc(?:e|ing)|Domain[\s\-]?Driven|Aggregate\s?Root"
    r"|Bounded\s?Context|Hexagonal\s+Architecture|Onion\s+Architecture)\b",
    re.I,
)
# Comment lines that open with a narrate-the-code word (increment, loop over,
# returns, setter, ...). is_redundant_comment additionally requires the
# comment text to be at most six words.
COMMENT = re.compile(
    r"^\s*(?://|#|/\*+|\*+)\s*(?:increment|decrement|loop (?:over|through)|iterate"
    r"|returns?(?: the)?(?: result| value)?\s*$|set\s+\w+\s+to\b|getter\b"
    r"|setter\b|constructor\b|initiali[sz]e\b|instantiate\b|create (?:a |an |the )"
    r"|declare\b|define\b|assign\b|end (?:of|for)\b|begin\b|start (?:of|the))",
    re.I,
)
# Leading comment marker (with surrounding whitespace) at the start of a line.
_CMT_MARKER = re.compile(r"^\s*(?://+|#+|/\*+|\*+)\s*")
|
|
102
|
+
# ---- residue / type-escape / swallow / tautology signals -----------------
# Failure classes these seed: AI-Specific (prompt residue), Type-System
# (any-driven development), Defensive Code Inflation (swallowed errors) and
# Testing (test theater). \s+ between words instead of literal spaces doubles
# as robustness to spacing AND keeps each pattern from matching its own
# source when the scanner audits itself.
RESIDUE_PHRASE = re.compile(
    r"\bin\s+a\s+real\s+(?:app(?:lication)?|world|scenario)\b"
    r"|\bfor\s+production\s+use\b|\bthis\s+is\s+a\s+simplified\b"
    r"|\bTODO:?\s+implement\s+actual\b|\breplace\s+(?:this\s+)?with\s+your\b",
    re.I,
)
# = * # walls only: `# ----` section dividers are a long-standing human
# convention (numpy, stdlib); `// =====` banners are the generated-code tell.
RESIDUE_BANNER = re.compile(r"^\s*(?://|#|/\*)\s*[=*#]{5,}")
# Any character in U+2600..U+27BF or U+1F300..U+1FAFF.
RESIDUE_EMOJI = re.compile("[\u2600-\u27bf\U0001f300-\U0001faff]")
# NOT @ts-expect-error: that one is the sanctioned, self-expiring form.
TS_SUPPRESS = re.compile(r"@ts-(?:ignore|nocheck)\b")
# `as unknown as`, `as any`, `any` after `,` `:` `<`, or `any[]`.
TS_ANY = re.compile(r"\bas\s+unknown\s+as\b|\bas\s+any\b|[,:<]\s*any\b|\bany\[\]")
PY_TYPE_ESCAPE = re.compile(r"#\s*type:\s*ignore\b")
# Only bare/broad swallows: `except ImportError: pass` is a legitimate idiom
# (optional dependency); `except Exception: pass` hides every failure.
_EXC_BROAD = r"^\s*except\b\s*(?:\(?\s*(?:Base)?Exception\s*\)?\s*(?:as\s+\w+\s*)?)?:"
# One-line form: `except ...: pass`.
PY_SWALLOW = re.compile(_EXC_BROAD + r"\s*pass\b")
# Two-line form: the `except ...:` head, paired by scan_lines with a PY_PASS
# match on the following line.
PY_SWALLOW_HEAD = re.compile(_EXC_BROAD + r"\s*(?:#.*)?$")
PY_PASS = re.compile(r"^\s*pass\s*(?:#.*)?$")
# Empty `catch {}` blocks and `.catch(...)` handlers with an empty body (arrow
# handlers returning null/undefined included).
JS_SWALLOW = re.compile(
    r"\bcatch\s*(?:\(\s*[^)]*\))?\s*\{\s*\}"
    r"|\.catch\(\s*(?:\(\s*\w*\s*\)|\w+)\s*=>\s*(?:\{\s*\}|null|undefined)\s*\)"
    r"|\.catch\(\s*function\s*\(\s*\w*\s*\)\s*\{\s*\}\s*\)"
)
# expect(<literal>).toBe/toEqual/toStrictEqual(<same literal>) via the \1
# backreference, or assert(true) / assertTrue(true).
JS_TAUTOLOGY = re.compile(
    r"\bexpect\(\s*(true|false|\d+|'[^']*'|\"[^\"]*\")\s*\)\s*\.\s*"
    r"(?:toBe|toEqual|toStrictEqual)\(\s*\1\s*\)"
    r"|\bassert(?:True)?\(\s*true\s*\)"
)
PY_TAUTOLOGY = re.compile(r"^\s*assert\s+True\s*(?:$|[,#])|\bassertTrue\(\s*True\s*\)")
|
|
139
|
+
# ---- framework failure modes (vibe-coding stack) --------------------------
# await Promise.resolve() is always pointless; an async promise executor
# swallows its own rejections.
ASYNC_WRAPPER = re.compile(
    r"\bawait\s+Promise\s*\.\s*resolve\s*\(|\bnew\s+Promise\s*\(\s*async\b"
)
# A lone negated guard-return; chains where each test deepens the previous
# (`!data` then `!data.user`) are the optional-chaining shape.
# Group 1 is the tested (possibly dotted) name.
GUARD_RETURN = re.compile(
    r"^\s*if\s*\(\s*!\s*([\w$][\w$.]*)\s*\)\s*(?:return|continue|break)\b[^;{}]*;?\s*$"
)
# Two adjacent literal booleans inside a call's argument list (Boolean Trap).
# Bracket/brace exclusion keeps array literals like useState([true, false]) out.
BOOL_PAIR_JS = re.compile(
    r"[\w$]\s*\(\s*[^()\[\]{}]*\b(?:true|false)\s*,\s*(?:true|false)\b"
)
BOOL_PAIR_PY = re.compile(
    r"\w\s*\(\s*[^()\[\]{}]*\b(?:True|False)\s*,\s*(?:True|False)\b"
)
# scan_lines applies this only to .sql files, skipping lines that start
# with `--`.
SELECT_STAR = re.compile(r"\bSELECT\s+\*", re.I)
# A class/className attribute whose quoted value runs 200+ characters.
TAILWIND_SOUP = re.compile(r"class(?:Name)?\s*=\s*[{]?\s*['\"`][^'\"`]{200,}")
# Arbitrary >=100px values defeat the spacing scale (w-[347px] magic numbers).
TAILWIND_MAGIC_PX = re.compile(r"-\[\d{3,}(?:\.\d+)?px\]")
# File extensions scan_lines treats as source code (gates the code signals).
SOURCE = re.compile(
    r"\.(?:ts|tsx|js|jsx|mjs|cjs|py|go|rs|java|kt|kts|cs|cpp|cc|cxx|c|h|hpp|rb"
    r"|php|swift|scala|m|mm|sh|ps1|lua|dart|ex|exs|vue|svelte|astro|html|sql)$"
)
# Non-blank line count at which a diff-scope source file is marked substantial.
CHECKLIST_LINES = 40
READ_CAP = 4000  # lines read per file
BODY_CAP = 80  # lines captured per function body
FILE_CAP = 6000  # tracked files scanned in --all
SHOW_CAP = 10  # findings printed per list (counts stay exact)
|
|
171
|
+
|
|
172
|
+
# ---- duplication / clone machinery --------------------------------------
# One identifier token (letters, digits, `_`, `$`; not starting with a digit).
ID = re.compile(r"[A-Za-z_$][\w$]*")
# `type X` / `interface X` declarations with a capitalized name (group 1).
TYPE_DECL = re.compile(r"\b(?:type|interface)\s+([A-Z][A-Za-z0-9_]*)\b")
# Helper-style name prefixes (isX, hasX, toX, parseX, ...) followed by an
# uppercase letter or digit.
MICRO_PREFIX = re.compile(
    r"^(?:is|has|can|should|assert|ensure|safe|to|from|get|set|make|create"
    r"|parse|format|with|map|build|validate)[A-Z0-9]"
)
# Lowercased function names; NOTE(review): presumably names too generic to
# report as clones - the code that consults this set is outside this view.
COMMON_NAMES = {
    "render", "main", "default", "index", "setup", "run", "start", "stop",
    "init", "handler", "handle", "callback", "loader", "action", "middleware",
    "reducer", "app", "page", "layout", "constructor", "tostring", "tojson",
    "valueof", "equals", "dispose", "close", "open", "connect", "get", "set",
    "update", "create", "delete", "list", "find", "save", "load", "execute",
    "process", "build", "test", "describe", "it", "expect", "beforeeach",
    "aftereach", "beforeall", "afterall", "getstaticprops", "getserversideprops",
    "getstaticpaths", "generatemetadata", "usestate", "useeffect", "wrapper",
    "new", "string", "tostr", "clone", "copy", "value", "data", "result",
    "componentdidmount", "componentwillunmount", "componentdidupdate",
    "componentdidcatch", "getderivedstatefromerror", "getderivedstatefromprops",
    "shouldcomponentupdate", "getsnapshotbeforeupdate", "ngoninit", "ngondestroy",
    "connectedcallback", "disconnectedcallback",
}
# Common per-module type names - excluded from type-duplication reporting.
COMMON_TYPES = frozenset({
    "props", "state", "options", "option", "params", "parameters", "result",
    "config", "configuration", "context", "ctx", "data", "item", "items",
    "response", "request", "error", "react", "window", "ref", "children",
    "theme", "style", "styles", "value", "values", "model", "entity", "dto",
    "payload", "meta", "args", "arg", "input", "output", "node", "element",
    "key", "id", "type", "types", "field", "fields", "row", "column", "event",
    "handler", "callback", "fn", "cb",
})
# Lowercased helper names behind the "Generated Fingerprints" analysis that
# the module docstring describes (isRecord/safeParse/sleep/retry/...).
FINGERPRINTS = {
    "isrecord", "isobject", "isplainobject", "isdictionary", "isstring",
    "isnumber", "isboolean", "isarraylike", "isnil", "isempty", "isdefined",
    "ensurearray", "assertarray", "safeparse", "safeparsejson", "safejsonparse",
    "sleep", "delay", "retry", "assertnever", "deepclone", "deepequal", "noop",
    "clamp", "uniq", "unique", "capitalize", "classnames", "cn", "tryparse",
}
|
|
211
|
+
# Only genuine structural/control-flow keywords. Deliberately EXCLUDES words that
# are commonly variable/method names in other languages (val, map, go, select,
# ...) - masking those would break the structural hash (val stays val while value
# becomes I -> false-negative near-dups).
KEYWORDS = frozenset({
    "if", "else", "elif", "for", "while", "do", "switch", "case", "default",
    "break", "continue", "return", "function", "func", "fn", "def", "class",
    "struct", "interface", "type", "trait", "impl", "enum", "const", "let",
    "var", "new", "delete", "typeof", "instanceof", "in", "of", "is",
    "as", "not", "and", "or", "null", "nil", "none", "None", "true", "false",
    "True", "False", "undefined", "void", "this", "self", "super", "yield",
    "await", "async", "try", "catch", "except", "finally", "throw", "raise",
    "with", "public", "private", "protected", "static", "import",
    "from", "export", "package", "lambda", "pass", "end",
})
# Class-method signature (JS/TS + best-effort C-style); names that are really
# control flow are filtered out after match.
METHOD_JS = re.compile(
    r"^\s*(?:public\s+|private\s+|protected\s+|static\s+|readonly\s+|async\s+"
    r"|get\s+|set\s+|override\s+|\*\s*)*([A-Za-z_$][\w$]*)\s*\([^;{]*\)\s*"
    r"(?::\s*[^={;]+)?\s*\{"
)
# Requires at least one modifier keyword, then a return type, then the method
# name (group 1) and a parameter list that opens a `{` body.
METHOD_CSTYLE = re.compile(
    r"^\s*(?:(?:public|private|protected|internal|static|final|virtual|override"
    r"|abstract|async|sealed|unsafe)\s+)+[\w<>\[\].,?]+\s+([A-Za-z_]\w*)\s*"
    r"\([^;{]*\)\s*(?:where[^{]+)?\{"
)
# Control-flow and statement words that the method patterns can capture as a
# "name" but that are not methods.
NOT_METHOD = {
    "if", "for", "while", "switch", "catch", "return", "function", "do",
    "else", "with", "await", "new", "delete", "void", "yield", "case",
    "throw", "super", "typeof", "using", "lock", "fixed", "foreach",
}
EXPORT_KEYWORD = re.compile(r"\b(?:export|public|pub)\b")
# Function-definition patterns keyed by the language names lang_of returns;
# every pattern captures the function name in group 1.
FUNC_PATTERNS = {
    "js": [
        re.compile(r"(?:^|\s)(?:export\s+)?(?:default\s+)?(?:async\s+)?function\s*\*?\s*([A-Za-z_$][\w$]*)\s*\("),
        re.compile(r"(?:^|\s)(?:export\s+)?(?:const|let|var)\s+([A-Za-z_$][\w$]*)\s*=\s*(?:async\s*)?(?:\([^)]*\)|[A-Za-z_$][\w$]*)\s*=>"),
        METHOD_JS,
    ],
    "py": [re.compile(r"^\s*(?:async\s+)?def\s+([A-Za-z_]\w*)\s*\(")],
    "go": [re.compile(r"^\s*func\s+(?:\([^)]*\)\s*)?([A-Za-z_]\w*)\s*\(")],
    "rust": [re.compile(r"\bfn\s+([A-Za-z_]\w*)\s*[(<]")],
    "ruby": [re.compile(r"^\s*def\s+(?:self\.)?([A-Za-z_]\w*[!?=]?)")],
    "php": [re.compile(r"\bfunction\s+([A-Za-z_]\w*)\s*\(")],
    "cstyle": [METHOD_CSTYLE],
}
|
|
257
|
+
|
|
258
|
+
# ---- comment/string tokenization (language-aware, single pass) -----------
# One combined regex per language family, string alternatives FIRST, so a //
# or # inside a string literal can never be amputated as a comment, and an
# unbalanced quote left by comment-stripping can never swallow real code.
# Known line-based blind spots (accepted for zero deps): JS regex literals,
# Rust lifetimes ('a), raw strings ending in a backslash.
# Double- and single-quoted strings stop at a newline; backticks may span lines.
_STR_DQ = r'"(?:[^"\\\n]|\\.)*"'
_STR_SQ = r"'(?:[^'\\\n]|\\.)*'"
_STR_BT = r"`(?:[^`\\]|\\.)*`"
_STR_TRIPLE = r"'''(?s:.*?)'''|\"\"\"(?s:.*?)\"\"\""
_CMT_SLASH = r"//[^\n]*"
_CMT_HASH = r"#[^\n]*"
_CMT_BLOCK = r"/\*(?s:.*?)\*/"
_FAMILY_SYNTAX = {
    # family: (string alternatives, comment alternatives)
    "py": (f"{_STR_TRIPLE}|{_STR_DQ}|{_STR_SQ}", _CMT_HASH),
    "ruby": (f"{_STR_DQ}|{_STR_SQ}", _CMT_HASH),
    "php": (f"{_STR_DQ}|{_STR_SQ}", f"{_CMT_SLASH}|{_CMT_BLOCK}|{_CMT_HASH}"),
    "bt": (f"{_STR_BT}|{_STR_DQ}|{_STR_SQ}", f"{_CMT_SLASH}|{_CMT_BLOCK}"),
    "c": (f"{_STR_DQ}|{_STR_SQ}", f"{_CMT_SLASH}|{_CMT_BLOCK}"),
}
# Language -> syntax family; languages absent here are looked up with the
# "c" family as the default by _strip_comments and _mask_strings.
_LANG_FAMILY = {"py": "py", "ruby": "ruby", "php": "php", "js": "bt", "go": "bt"}
# Compiled tokenizer per family: group `s` matches a string literal, group `c`
# matches a comment.
_TOKEN_RX = {
    fam: re.compile(f"(?P<s>{strs})|(?P<c>{cmts})")
    for fam, (strs, cmts) in _FAMILY_SYNTAX.items()
}
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def _strip_comments(text: str, lang: str, string_repl: str | None) -> str:
    """Return `text` with every comment token of `lang` replaced by one space.

    String literals are tokenized ahead of comments, so a comment marker
    inside a literal is never stripped. With string_repl=None each literal is
    kept verbatim; otherwise each literal is replaced by string_repl.
    """
    family = _LANG_FAMILY.get(lang, "c")
    tokenizer = _TOKEN_RX[family]

    def _replace(match: re.Match[str]) -> str:
        # Anything that is not a string token is a comment token.
        if match.lastgroup != "s":
            return " "
        if string_repl is None:
            return match.group(0)
        return string_repl

    return tokenizer.sub(_replace, text)
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def _mask_strings(text: str, lang: str) -> str:
    """Replace each string literal in `text` with "L", leaving comments intact.

    Used by detectors that look at comments and code (residue phrases,
    boolean call arguments): a slop phrase quoted inside a string is a
    fixture or UI copy, which the model pass judges in context and the
    scanner must not flag.
    """
    family = _LANG_FAMILY.get(lang, "c")

    def _replace(match: re.Match[str]) -> str:
        if match.lastgroup == "s":
            return "L"
        return match.group(0)

    return _TOKEN_RX[family].sub(_replace, text)
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def lang_of(rel: str) -> str:
    """Map a path to the scanner's language name via its file extension.

    The extension is the text after the last "." in `rel`, compared
    case-insensitively. Paths without a "." and unrecognized extensions
    yield "other".
    """
    by_extension = {
        "py": "py",
        "go": "go",
        "rs": "rust",
        "rb": "ruby",
        "php": "php",
    }
    for cstyle_ext in ("java", "kt", "kts", "cs", "scala"):
        by_extension[cstyle_ext] = "cstyle"
    for js_ext in ("ts", "tsx", "js", "jsx", "mjs", "cjs", "vue", "svelte", "astro"):
        by_extension[js_ext] = "js"

    _, dot, suffix = rel.rpartition(".")
    if not dot:
        return "other"
    return by_extension.get(suffix.lower(), "other")
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
def git(root: str, *args: str) -> str | None:
    """Run `git -C root <args>` and return its stdout.

    Returns None when the process cannot be started (OSError), "" when git
    exits non-zero, and the decoded stdout otherwise.
    """
    # core.quotepath=false stops git from octal-escaping non-ASCII paths (the
    # escaped name would never open). Decoding is pinned to UTF-8 because
    # text=True would use the locale codec (cp1252 on Windows) and crash on
    # real UTF-8 diffs.
    command = ["git", "-C", root, "-c", "core.quotepath=false", *args]
    try:
        proc = subprocess.run(
            command,
            capture_output=True,
            encoding="utf-8",
            errors="replace",
        )
    except OSError:
        return None
    if proc.returncode != 0:
        return ""
    return proc.stdout
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
def _unquote_path(p: str) -> str:
|
|
342
|
+
"""Undo git's C-style quoting of paths with quotes/specials (rare once
|
|
343
|
+
core.quotepath=false handles non-ASCII)."""
|
|
344
|
+
if len(p) >= 2 and p.startswith('"') and p.endswith('"'):
|
|
345
|
+
return re.sub(
|
|
346
|
+
r"\\(.)",
|
|
347
|
+
lambda m: {"n": "\n", "t": "\t"}.get(m.group(1), m.group(1)),
|
|
348
|
+
p[1:-1],
|
|
349
|
+
)
|
|
350
|
+
return p
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
def parse_added_by_file(diff_text: str) -> dict[str, list[str]]:
|
|
354
|
+
"""Added lines per file from ONE `git diff` run (not one process per file)."""
|
|
355
|
+
added: dict[str, list[str]] = {}
|
|
356
|
+
cur: str | None = None
|
|
357
|
+
for ln in diff_text.splitlines():
|
|
358
|
+
if ln.startswith("+++ "):
|
|
359
|
+
path = _unquote_path(ln[4:].strip())
|
|
360
|
+
if path == "/dev/null":
|
|
361
|
+
cur = None
|
|
362
|
+
else:
|
|
363
|
+
cur = path[2:] if path.startswith("b/") else path
|
|
364
|
+
elif cur is not None and ln.startswith("+") and not ln.startswith("+++"):
|
|
365
|
+
added.setdefault(cur, []).append(ln[1:])
|
|
366
|
+
return added
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
def read_whole(root: str, rel: str) -> tuple[list[str] | None, bool]:
    """Read at most READ_CAP lines of `rel` (relative to `root`).

    Returns (lines, truncated): the lines without their trailing newline and
    whether the file held more than READ_CAP lines. An unreadable file
    (missing/permission) yields (None, False); callers must surface that,
    because a silent skip turns a vanished file into a false 'no slop found'.
    """
    path = os.path.join(root, rel)
    try:
        # utf-8-sig: PowerShell writes BOMs by default; a surviving \ufeff on
        # line 1 would defeat every ^-anchored detector.
        with open(path, encoding="utf-8-sig", errors="ignore") as fh:
            # One line beyond the cap is enough to tell that more exists.
            head = list(islice(fh, READ_CAP + 1))
    except OSError:
        return None, False
    truncated = len(head) > READ_CAP
    kept = [line.rstrip("\n") for line in head[:READ_CAP]]
    return kept, truncated
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
def _read_for_scan(root: str, rel: str, unreadable: list[str]) -> tuple[list[str], bool]:
    """Read `rel` via read_whole, recording unreadable files.

    An unreadable file has its path appended to `unreadable` and is returned
    as ([], False); otherwise read_whole's (lines, truncated) pair is passed
    through.
    """
    content, was_truncated = read_whole(root, rel)
    if content is not None:
        return content, was_truncated
    unreadable.append(rel)
    return [], False
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
def _uniq(items: list[str]) -> list[str]:
|
|
394
|
+
seen: set[str] = set()
|
|
395
|
+
out: list[str] = []
|
|
396
|
+
for it in items:
|
|
397
|
+
if it not in seen:
|
|
398
|
+
seen.add(it)
|
|
399
|
+
out.append(it)
|
|
400
|
+
return out
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
def is_redundant_comment(line: str) -> bool:
    """Tell whether `line` is a short comment that merely narrates the code.

    The line must match COMMENT, and its text - with the leading comment
    marker and any trailing `*/` removed - must be at most six words long.
    """
    if COMMENT.search(line) is None:
        return False
    without_marker = _CMT_MARKER.sub("", line)
    text = re.sub(r"\*/\s*$", "", without_marker).strip()
    word_count = len(text.split())
    return word_count <= 6
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
def _dep_line_hit(ln: str) -> bool:
    """Tell whether manifest line `ln` declares a dependency.

    Every DEP match counts except one that starts at column 0 and names a
    META_KEYS field: line-start metadata ("version": "2.1.0") is a manifest
    field, not a dep.
    """
    return any(
        match.start() != 0 or match.group("name").lower() not in META_KEYS
        for match in DEP.finditer(ln)
    )
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
# Every per-file slop signal scan_lines emits, with its report label (padded
|
|
420
|
+
# to one column) and summary label. Gate, totals, and printing derive from
|
|
421
|
+
# this one table.
|
|
422
|
+
_SIGNALS = {
|
|
423
|
+
"dependencies": ("new dependency ", "dep"),
|
|
424
|
+
"abstractions": ("premature abstraction", "abstraction"),
|
|
425
|
+
"redundant_comments": ("redundant comment ", "redundant-comment"),
|
|
426
|
+
"ai_residue": ("AI residue ", "residue"),
|
|
427
|
+
"type_escapes": ("type escape ", "type-escape"),
|
|
428
|
+
"swallowed_errors": ("swallowed error ", "swallowed-error"),
|
|
429
|
+
"tautological_tests": ("tautological test ", "tautology"),
|
|
430
|
+
"async_wrappers": ("async wrapper ", "async-wrapper"),
|
|
431
|
+
"guard_chains": ("guard chain (use ?.) ", "guard-chain"),
|
|
432
|
+
"boolean_traps": ("boolean trap ", "boolean-trap"),
|
|
433
|
+
"select_star": ("SELECT * ", "select-star"),
|
|
434
|
+
"tailwind_slop": ("tailwind smell ", "tailwind"),
|
|
435
|
+
}
|
|
436
|
+
_SIGNAL_KEYS = tuple(_SIGNALS)
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
def _file_slop(r: Finding) -> bool:
    """Tell whether finding `r` holds at least one non-empty signal list."""
    for key in _SIGNAL_KEYS:
        if r[key]:
            return True
    return False
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
def scan_lines(rel: str, lines: list[str], audit: bool) -> Finding | None:
    """Collect the per-file slop signals for `lines` of file `rel`.

    Args:
        rel: File path; its extension selects the language-specific checks
            and its basename decides whether it is a dependency manifest.
        lines: The lines to inspect.
        audit: When true, new-dependency detection and the "substantial"
            size note are both disabled.

    Returns:
        None when `lines` is empty, or when no signal fired and the file is
        not substantial. Otherwise a Finding with "file", "added_lines" (the
        count of non-blank lines), "substantial", and one de-duplicated list
        per signal key.
    """
    if not lines:
        return None
    lang = lang_of(rel)
    ext = rel.rsplit(".", 1)[-1].lower() if "." in rel else ""
    is_source = bool(SOURCE.search(rel))
    found: dict[str, list[str]] = {k: [] for k in _SIGNAL_KEYS}
    check_deps = (not audit) and bool(MANIFEST.search(os.path.basename(rel)))
    for i, ln in enumerate(lines):
        # Dep detection reads raw lines: in manifests the strings ARE the data.
        if check_deps and _dep_line_hit(ln):
            found["dependencies"].append(ln.strip()[:100])
        # Comment-habitat detectors run on the string-masked line (comments
        # kept): a slop pattern *quoted in a string* is a fixture, a log
        # message, or UI copy - context for the model pass, not the scanner.
        masked = _mask_strings(ln, lang)
        # Code signals only apply to source files: pattern vocabulary inside
        # markdown prose is documentation, not an abstraction. (Deps stay
        # separate - manifests are not SOURCE files.)
        if is_source:
            # A pattern-suffixed type name wins; architecture vocabulary is
            # only consulted when no such declaration is on the line.
            m = ABSTRACTION.search(masked)
            if m:
                found["abstractions"].append(m.group(1))
            else:
                v = VOCAB.search(masked)
                if v:
                    found["abstractions"].append(v.group(1))
            if is_redundant_comment(masked):
                found["redundant_comments"].append(ln.strip()[:100])
            # Banner/emoji stay on the raw line (a celebration emoji in
            # user-facing copy IS the slop).
            if (RESIDUE_PHRASE.search(masked) or RESIDUE_BANNER.match(ln)
                    or RESIDUE_EMOJI.search(ln)):
                found["ai_residue"].append(ln.strip()[:100])
        if lang in ("js", "cstyle", "php"):
            # Mask strings + comments first: `as any` and `catch {}` are also
            # English prose / string content.
            code = _strip_comments(ln, lang, "L")
            if lang == "js" and (
                TS_SUPPRESS.search(masked)
                or (not _CMT_MARKER.match(ln) and TS_ANY.search(code))
            ):
                found["type_escapes"].append(ln.strip()[:100])
            if JS_SWALLOW.search(code):
                found["swallowed_errors"].append(ln.strip()[:100])
            # Raw line: the string-literal tautology alternative needs the
            # actual quotes, which masking would erase.
            if JS_TAUTOLOGY.search(ln):
                found["tautological_tests"].append(ln.strip()[:100])
            if BOOL_PAIR_JS.search(code):
                found["boolean_traps"].append(ln.strip()[:100])
            if lang == "js":
                if ASYNC_WRAPPER.search(code):
                    found["async_wrappers"].append(ln.strip()[:100])
                # A guard chain is two consecutive guard lines where the
                # second tests a property of the first's subject.
                gm = GUARD_RETURN.match(ln)
                if gm and i + 1 < len(lines):
                    nxt = GUARD_RETURN.match(lines[i + 1])
                    if nxt and nxt.group(1).startswith(gm.group(1) + "."):
                        found["guard_chains"].append(ln.strip()[:100])
        elif lang == "py":
            if PY_TYPE_ESCAPE.search(masked):
                found["type_escapes"].append(ln.strip()[:100])
            # Either `except ...: pass` on one line, or the except head
            # followed directly by a `pass` line.
            if PY_SWALLOW.match(ln) or (
                PY_SWALLOW_HEAD.match(ln)
                and i + 1 < len(lines) and PY_PASS.match(lines[i + 1])
            ):
                found["swallowed_errors"].append(ln.strip()[:100])
            if PY_TAUTOLOGY.search(ln):
                found["tautological_tests"].append(ln.strip()[:100])
            if BOOL_PAIR_PY.search(masked):
                found["boolean_traps"].append(ln.strip()[:100])
        if ext == "sql" and not ln.lstrip().startswith("--") and SELECT_STAR.search(ln):
            found["select_star"].append(ln.strip()[:100])
        if (lang == "js" or ext == "html") and (
            TAILWIND_SOUP.search(ln) or TAILWIND_MAGIC_PX.search(ln)
        ):
            found["tailwind_slop"].append(ln.strip()[:100])
    # Repeated identical hits collapse to one entry, first occurrence first.
    found = {k: _uniq(v) for k, v in found.items()}
    added_count = sum(1 for ln in lines if ln.strip())
    # The size-only note applies to source files outside audit scope.
    substantial = (not audit) and is_source and added_count >= CHECKLIST_LINES
    if not (any(found.values()) or substantial):
        return None
    out: Finding = {"file": rel, "added_lines": added_count,
                    "substantial": substantial}
    out.update(found)
    return out
|
|
529
|
+
|
|
530
|
+
|
|
531
|
+
# ---- body capture + hashing ---------------------------------------------
|
|
532
|
+
def _indent(s: str) -> int:
|
|
533
|
+
return len(s) - len(s.lstrip())
|
|
534
|
+
|
|
535
|
+
|
|
536
|
+
def capture_body(lines: list[str], start_idx: int, lang: str) -> tuple[str, bool]:
    """Body text and whether capture hit the BODY_CAP window (truncated).

    Args:
        lines: All lines of the file being scanned.
        start_idx: Index in ``lines`` of the definition's signature line.
        lang: Language key; ``"py"`` and ``"ruby"`` use the indentation/``end``
            walk, everything else is treated as a brace-delimited language.

    Returns:
        ``(body, truncated)``. ``truncated`` is True when no terminator was
        found inside the capture window, so ``body`` may be only a prefix.
    """
    if lang in ("py", "ruby"):
        base = _indent(lines[start_idx])
        # Only the BODY_CAP lines after the signature are ever inspected.
        window = lines[start_idx + 1:start_idx + 1 + BODY_CAP]
        out: list[str] = []
        for ln in window:
            # Python: the body ends at the first non-blank line indented no
            # deeper than the signature line.
            if lang == "py" and ln.strip() and _indent(ln) <= base:
                break
            # Ruby: the body ends at an `end` line indented no deeper than
            # the signature line.
            if lang == "ruby" and ln.strip() == "end" and _indent(ln) <= base:
                break
            out.append(ln)
        else:
            # Window exhausted without a terminator: truncated only if the
            # file actually continues past the window.
            return "\n".join(out), len(lines) > start_idx + 1 + BODY_CAP
        return "\n".join(out), False
    # brace languages
    sig = lines[start_idx]
    arrow = sig.find("=>")
    # `=>` with no `{` at or after it on the signature line: the body is the
    # text after the arrow, up to the first `;`.
    if arrow >= 0 and sig.find("{", arrow) < 0:
        return sig[arrow + 2:].split(";")[0], False
    brace_line = -1
    # Look for the opening brace on the signature line or the five lines
    # after it.
    for k in range(start_idx, min(start_idx + 6, len(lines))):
        if "{" in lines[k]:
            brace_line = k
            break
        # A `;` before any brace (on a line without `=>`) yields an empty
        # body - presumably a declaration without an implementation.
        if ";" in lines[k] and "=>" not in lines[k]:
            return "", False
    if brace_line < 0:
        return "", False
    depth, started = 0, False
    out_chars: list[str] = []
    # Character-level brace matching. Every `{`/`}` is counted regardless of
    # context (the walk does not look at strings or comments).
    for ln in lines[brace_line:brace_line + 1 + BODY_CAP]:
        for ch in ln:
            if ch == "{":
                depth += 1
                if depth == 1:
                    # Outermost opening brace: start capturing, but do not
                    # include the brace itself.
                    started = True
                    continue
            elif ch == "}":
                depth -= 1
                if depth == 0 and started:
                    # Matching outermost close: done; the brace is excluded.
                    return "".join(out_chars), False
            if started:
                out_chars.append(ch)
        if started:
            # Re-insert the line break between captured lines.
            out_chars.append("\n")
    # Never saw the closing brace inside the window: cap hit (or EOF mid-body).
    return "".join(out_chars), True
|
|
584
|
+
|
|
585
|
+
|
|
586
|
+
def normalize_body(text: str, lang: str) -> str:
    """Canonical form of a body for the exact-duplicate hash.

    Comments are removed via ``_strip_comments`` (called with
    ``string_repl=None``, so string literals are left as they are and bodies
    that differ only in a literal still differ). Every whitespace run is then
    collapsed to one space and the ends are trimmed.
    """
    without_comments = _strip_comments(text, lang, string_repl=None)
    single_spaced = re.sub(r"\s+", " ", without_comments)
    return single_spaced.strip()
|
|
591
|
+
|
|
592
|
+
|
|
593
|
+
def structural_body(text: str, lang: str) -> str:
    """Shape-only form of a body for the near-duplicate hash.

    Comments are stripped with string literals replaced by ``"L"``, numeric
    tokens become ``"N"``, every ``ID`` match that is not in ``KEYWORDS``
    becomes ``"I"``, and all whitespace is removed. Bodies with the same
    control-flow shape but drifted names/values therefore compare equal.
    """
    def _mask_identifier(match):
        # Keywords carry structure and survive; any other identifier is masked.
        word = match.group(0)
        return word if word in KEYWORDS else "I"

    shape = _strip_comments(text, lang, string_repl="L")
    shape = re.sub(r"\b\d[\w.]*\b", "N", shape)
    shape = ID.sub(_mask_identifier, shape)
    return re.sub(r"\s+", "", shape)
|
|
600
|
+
|
|
601
|
+
|
|
602
|
+
def _digest(s: str) -> str:
    """Return a 32-hex-character (16-byte) BLAKE2b digest of ``s`` as UTF-8."""
    # blake2b, not md5: FIPS-enabled Pythons refuse md5 outright.
    hasher = hashlib.blake2b(digest_size=16)
    hasher.update(s.encode("utf-8"))
    return hasher.hexdigest()
|
|
605
|
+
|
|
606
|
+
|
|
607
|
+
def _is_exported(name: str, line: str, lang: str) -> bool:
    """Tell whether a definition is visible outside its own file.

    Args:
        name: The defined identifier.
        line: The source line holding the definition.
        lang: Language key of the file.
    """
    if lang == "py":
        # public def: importable anywhere
        return name[:1] != "_"
    if lang == "go":
        # Go exports by capitalization
        return name[:1].isupper()
    # Other languages: an export keyword or a CommonJS export on the same line.
    if EXPORT_KEYWORD.search(line):
        return True
    return any(marker in line for marker in ("module.exports", "exports."))
|
|
614
|
+
|
|
615
|
+
|
|
616
|
+
def collect_defs(rel: str, lines: list[str]) -> list[Finding]:
    """Find function/method definitions in one file and fingerprint each body.

    Args:
        rel: File path; its language (via ``lang_of``) selects the patterns.
        lines: The file's lines.

    Returns:
        One record per definition with its name, file, 1-based line, export
        flag, exact and structural digests (``None`` when the body is too
        small or was truncated), non-blank body line count and truncation
        flag. Empty when the language has no entry in ``FUNC_PATTERNS``.
    """
    lang = lang_of(rel)
    patterns = FUNC_PATTERNS.get(lang)
    if not patterns:
        return []
    records: list[Finding] = []
    for idx, text in enumerate(lines):
        name = None
        for pattern in patterns:
            hit = pattern.search(text)
            if hit is None:
                continue
            candidate = hit.group(1)
            # The method patterns also match names listed in NOT_METHOD;
            # reject those and let the remaining patterns try.
            if pattern in (METHOD_JS, METHOD_CSTYLE) and candidate in NOT_METHOD:
                continue
            name = candidate
            break
        if not name:
            continue

        raw, truncated = capture_body(lines, idx, lang)
        exact_text = normalize_body(raw, lang)
        shape_text = structural_body(raw, lang)
        # Non-blank lines only: the brace walk pads raw with edge newlines,
        # and counting them would let `super(props);` pass the 3-line floor.
        body_lines = len([s for s in raw.splitlines() if s.strip()]) or 1
        # Exact-dup hash needs substance (>=12 normalized chars, and >=3 lines
        # or >=60 chars): one-line boilerplate like `super(props);` is not
        # knowledge worth consolidating.
        # A truncated body is a prefix, not the function - never call it exact.
        has_substance = body_lines >= 3 or len(exact_text) >= 60
        hash_exact = not truncated and len(exact_text) >= 12 and has_substance
        exported = _is_exported(name, text, lang)
        exact_digest = _digest(exact_text) if hash_exact else None
        # skip trivial one-liners (return I;)
        struct_digest = _digest(shape_text) if len(shape_text) >= 20 else None
        records.append({
            "name": name,
            "file": rel,
            "line": idx + 1,
            "exported": exported,
            "exact": exact_digest,
            "struct": struct_digest,
            "body_lines": body_lines,
            "truncated": truncated,
        })
    return records
|
|
654
|
+
|
|
655
|
+
|
|
656
|
+
def collect_types(rel: str, lines: list[str]) -> list[Finding]:
    """List the type declarations matched by ``TYPE_DECL`` in a "js" file.

    Files whose language key is not ``"js"`` contribute nothing. Each result
    is ``{"name": <declared name>, "file": rel}``, one per matching line.
    """
    if lang_of(rel) != "js":
        return []
    hits = (TYPE_DECL.search(text) for text in lines)
    return [{"name": hit.group(1), "file": rel} for hit in hits if hit]
|
|
665
|
+
|
|
666
|
+
|
|
667
|
+
def analyze_duplication(defs: list[Finding], types: list[Finding],
                        idfreq: Counter[str], full_scope: bool) -> Finding:
    """Aggregate per-definition records into codebase-wide duplication signals.

    Args:
        defs: Definition records as produced by ``collect_defs``.
        types: Type records as produced by ``collect_types``.
        idfreq: Occurrence count of every identifier across the scanned lines.
        full_scope: True when the whole codebase was scanned; the
            dead/single-use analysis only runs in that case.

    Returns:
        A dict with ``name_clones``, ``body_clones``, ``near_clones``,
        ``single_use``, ``type_clones``, ``fingerprints``, ``micro_count``,
        ``total_defs`` and ``truncated_defs``.
    """
    by_name: dict[str, list[Finding]] = defaultdict(list)
    by_exact: dict[str, list[Finding]] = defaultdict(list)
    by_struct: dict[str, list[Finding]] = defaultdict(list)
    for entry in defs:
        by_name[entry["name"]].append(entry)
        if entry["exact"]:
            by_exact[entry["exact"]].append(entry)
        if entry["struct"]:
            by_struct[entry["struct"]].append(entry)
    name_def_counts: Counter[str] = Counter(entry["name"] for entry in defs)

    # Same name defined in two or more files.
    name_clones = []
    for name, group in by_name.items():
        if name.lower() in COMMON_NAMES:
            continue
        files = sorted({entry["file"] for entry in group})
        if len(files) < 2:
            continue
        name_clones.append({"name": name, "count": len(group), "files": files,
                            "confidence": "name-only"})
    name_clones.sort(key=lambda clone: clone["count"], reverse=True)

    # Identical normalized bodies under different names or in different files.
    body_clones = []
    for group in by_exact.values():
        if len(group) < 2:
            continue
        names = sorted({entry["name"] for entry in group})
        files = sorted({entry["file"] for entry in group})
        if len(files) >= 2 or len(names) >= 2:
            body_clones.append({"names": names, "files": files, "count": len(group),
                                "confidence": "exact"})
    body_clones.sort(key=lambda clone: clone["count"], reverse=True)

    # Same structural shape, but at least two genuinely different exact bodies.
    near_clones = []
    for group in by_struct.values():
        if len(group) < 2:
            continue
        distinct_exact = {entry["exact"] for entry in group if entry["exact"]}
        if len(distinct_exact) < 2:
            continue
        near_clones.append({"names": sorted({entry["name"] for entry in group}),
                            "files": sorted({entry["file"] for entry in group}),
                            "count": len(group),
                            "confidence": "structural"})
    near_clones.sort(key=lambda clone: clone["count"], reverse=True)

    # Reference counts only mean something when every file was scanned; on a
    # partial file list "0 references" is an artifact of the scope, and acting
    # on it would delete live code.
    single_use: list[Finding] = []
    if full_scope:
        reported: set[str] = set()
        for entry in defs:
            name = entry["name"]
            if name in reported or name.lower() in COMMON_NAMES:
                continue
            util_ish = bool(MICRO_PREFIX.search(name)) or name.lower() in FINGERPRINTS
            if entry["body_lines"] > 3 and not util_ish:
                continue
            # Export-aware: exported defs may be public API / framework entry
            # points, so we can't call them dead. Only judge repo-internal defs.
            if entry.get("exported"):
                continue
            refs = idfreq.get(name, 0) - name_def_counts[name]
            if refs == 0:
                kind = "dead"
            elif refs == 1 and util_ish:
                kind = "inline"
            else:
                continue
            reported.add(name)
            single_use.append({"name": name, "file": entry["file"], "kind": kind})

    # How often each fingerprint name is defined, most frequent first.
    fp_counts = Counter(entry["name"] for entry in defs
                        if entry["name"].lower() in FINGERPRINTS)
    fingerprints = dict(sorted(fp_counts.items(), key=lambda kv: kv[1], reverse=True))

    micro = sum(1 for entry in defs
                if MICRO_PREFIX.search(entry["name"]) and entry["body_lines"] <= 3)

    type_files: dict[str, set[str]] = defaultdict(set)
    for decl in types:
        type_files[decl["name"]].add(decl["file"])
    type_clones = []
    for type_name, where in type_files.items():
        if len(where) >= 2 and type_name.lower() not in COMMON_TYPES:
            type_clones.append({"name": type_name, "files": sorted(where),
                                "confidence": "name-only"})

    truncated_defs = sum(1 for entry in defs if entry["truncated"])
    return {"name_clones": name_clones, "body_clones": body_clones,
            "near_clones": near_clones, "single_use": single_use,
            "type_clones": type_clones, "fingerprints": fingerprints,
            "micro_count": micro, "total_defs": len(defs),
            "truncated_defs": truncated_defs}
|
|
752
|
+
|
|
753
|
+
|
|
754
|
+
def target_files(root: str, base: str, paths: list[str], is_git: bool,
                 all_mode: bool) -> tuple[list[str], set[str], bool]:
    """(files to scan, untracked subset, hit-the-FILE_CAP flag).

    Explicit ``paths`` win and are returned with backslashes turned into
    forward slashes. Outside a git repo there is nothing to list. With
    ``all_mode`` the tracked files matching ``SOURCE`` are returned, capped at
    ``FILE_CAP``. Otherwise the result is the files changed vs ``base``
    followed by untracked files, de-duplicated in first-seen order.
    """
    if paths:
        return [p.replace("\\", "/") for p in paths], set(), False
    if not is_git:
        return [], set(), False

    def _git_lines(*git_args: str) -> list[str]:
        # A falsy git() result is treated as empty output.
        return (git(root, *git_args) or "").splitlines()

    if all_mode:
        tracked = (_unquote_path(raw.strip())
                   for raw in _git_lines("ls-files") if raw.strip())
        srcs = [path for path in tracked if SOURCE.search(path)]
        return srcs[:FILE_CAP], set(), len(srcs) > FILE_CAP

    changed = _git_lines("diff", "--name-only", base)
    others = _git_lines("ls-files", "--others", "--exclude-standard")
    untracked = {_unquote_path(raw.strip()) for raw in others if raw.strip()}
    unquoted = (_unquote_path(raw.strip()) for raw in changed + others)
    # dict.fromkeys keeps the first occurrence of each path, in order.
    ordered = list(dict.fromkeys(path for path in unquoted if path))
    return ordered, untracked, False
|
|
777
|
+
|
|
778
|
+
|
|
779
|
+
def _dup_has_findings(dup: Finding | None) -> bool:
    """True when a duplication report holds at least one non-empty signal.

    ``total_defs`` and ``truncated_defs`` are bookkeeping, not signals, and
    are ignored. A missing or empty report counts as no findings.
    """
    if not dup:
        return False
    signal_keys = ("name_clones", "body_clones", "near_clones", "single_use",
                   "type_clones", "fingerprints", "micro_count")
    return any(dup[key] for key in signal_keys)
|
|
785
|
+
|
|
786
|
+
|
|
787
|
+
def _print_capped(label: str, items: list[str]) -> None:
    """Print at most ``SHOW_CAP`` labelled items, then a count of the rest."""
    for entry in items[:SHOW_CAP]:
        print(f" {label}: {entry}")
    hidden = len(items) - SHOW_CAP
    if hidden > 0:
        print(f" ... +{hidden} more {label.strip()}")
|
|
792
|
+
|
|
793
|
+
|
|
794
|
+
def _print_duplication(dup: Finding) -> bool:
    """Print the duplication section of the text report.

    Returns:
        False (printing nothing) when the report has no findings, else True.
    """
    name_clones = dup["name_clones"]
    body_clones = dup["body_clones"]
    near_clones = dup["near_clones"]
    single_use = dup["single_use"]
    type_clones = dup["type_clones"]
    fingerprints = dup["fingerprints"]
    if not _dup_has_findings(dup):
        return False

    def _group_line(clone):
        # Shared layout for exact and structural clone groups.
        names = "/".join(clone["names"][:4])
        where = ", ".join(clone["files"][:5])
        return f" [{names}] x{clone['count']} across {where}"

    print("DUPLICATION (whole-codebase - the isRecord-class slop):")
    if name_clones:
        print(" Clone proliferation - same function name in multiple files [confidence: name-only]:")
        for clone in name_clones[:15]:
            where = ", ".join(clone["files"][:6])
            print(f" {clone['name']:<22} x{clone['count']:<3} {where}")
    if body_clones:
        print(" Knowledge duplication - identical body, consolidate to ONE (DRY) [confidence: exact]:")
        for clone in body_clones[:15]:
            print(_group_line(clone))
    if near_clones:
        print(" Semantic fragmentation - near-identical bodies (drifted clones) [confidence: structural]:")
        for clone in near_clones[:12]:
            print(_group_line(clone))
    if single_use:
        print(" Semantic density collapse - dead / single-use helpers:")
        for helper in single_use[:15]:
            if helper["kind"] == "dead":
                tag = "unused & not exported -> delete"
            else:
                tag = "used once -> inline"
            print(f" {helper['name']:<24} {tag:<32} {helper['file']}")
    if type_clones:
        print(" Duplicate type/interface names [confidence: name-only]:")
        for clone in type_clones[:10]:
            where = ", ".join(clone["files"][:6])
            print(f" {clone['name']:<22} {where}")
    if fingerprints:
        shown = list(fingerprints.items())[:12]
        print(" Generated-code fingerprints present: "
              + ", ".join(f"{k}({v})" for k, v in shown))
    if dup["micro_count"]:
        print(f" Micro-abstraction load: {dup['micro_count']} tiny is*/assert*/safe* "
              f"helpers of {dup['total_defs']} defs (Helper Hell risk)")
    print(" -> Consolidate clones to one shared definition, inline single-use helpers,")
    print(" re-point imports, delete the rest. One source of truth per concept.\n")
    return True
|
|
830
|
+
|
|
831
|
+
|
|
832
|
+
def main() -> int:
    """Command-line entry point: scan the selected files and report slop signals.

    Scope is one of: explicit ``paths`` (audit those files), ``--all`` (audit
    the tracked source files plus duplication analysis), or the default diff
    scope (added lines vs ``--base`` plus untracked files).

    The dead/single-use helper analysis relies on reference counts, so it only
    runs when the whole codebase really was scanned: ``--all`` without explicit
    paths, and with no file cap, read cap or unreadable file cutting the scan
    short.

    Returns:
        The process exit code: 1 when ``--gate`` is set and slop was found,
        otherwise 0.
    """
    # A cp1252 pipe (Windows capture) must degrade, not crash, on the
    # non-ASCII paths core.quotepath=false deliberately preserves.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(errors="replace")
    ap = argparse.ArgumentParser(description="Static AI-slop signal scanner (reports only).")
    ap.add_argument("paths", nargs="*", help="specific files to audit (default: the git diff)")
    ap.add_argument("--root", default=".", help="repo root (default: cwd)")
    ap.add_argument("--base", default="HEAD", help="git ref to diff against in diff scope")
    ap.add_argument("--all", action="store_true", help="audit ALL tracked source files + duplication")
    ap.add_argument("--gate", action="store_true",
                    help="exit non-zero if slop is found, in any output format "
                         "(the size-only 'substantial' note never gates)")
    ap.add_argument("--format", choices=["text", "json"], default="text")
    args = ap.parse_args()

    root = os.path.abspath(args.root)
    is_git = bool(git(root, "rev-parse", "--git-dir"))
    audit = args.all or bool(args.paths)
    whole_codebase = args.all and not args.paths
    files, untracked, files_capped = target_files(root, args.base, args.paths, is_git, args.all)

    warnings: list[str] = []
    if files_capped:
        warnings.append(f"file list capped at {FILE_CAP} tracked source files; scan is partial")
    if audit and not whole_codebase:
        warnings.append("partial scope (explicit paths): single-use/dead-helper analysis "
                        "suppressed - reference counts need the whole codebase (--all)")

    file_lines: dict[str, list[str]] = {}
    read_capped: list[str] = []
    unreadable: list[str] = []
    if audit:
        # Audit scope: scan the full content of every selected file.
        for f in files:
            lines, truncated = _read_for_scan(root, f, unreadable)
            file_lines[f] = lines
            if truncated:
                read_capped.append(f)
    else:
        # User diff.noprefix / diff.mnemonicPrefix config would change the
        # +++ headers and silently break the b/ stripping in the parser.
        diff_added = parse_added_by_file(
            git(root, "-c", "diff.noprefix=false", "-c", "diff.mnemonicprefix=false",
                "diff", args.base) or "") if is_git else {}
        for f in files:
            if f in untracked:
                # Untracked files have no diff; scan their full content.
                lines, truncated = _read_for_scan(root, f, unreadable)
                file_lines[f] = lines
                if truncated:
                    read_capped.append(f)
            else:
                file_lines[f] = diff_added.get(f, [])
    if read_capped:
        shown = ", ".join(read_capped[:5]) + (" ..." if len(read_capped) > 5 else "")
        warnings.append(f"{len(read_capped)} file(s) hit the {READ_CAP}-line read cap "
                        f"(tail unscanned): {shown}")
    if unreadable:
        shown = ", ".join(unreadable[:5]) + (" ..." if len(unreadable) > 5 else "")
        warnings.append(f"{len(unreadable)} file(s) could NOT be read and were NOT "
                        f"scanned (missing or unreadable): {shown}")

    # A capped file list, a capped read or an unreadable file all leave code
    # unscanned, so "0 references" may just mean the reference was not seen.
    # Treat the scope as partial rather than report live helpers as dead.
    incomplete = files_capped or bool(read_capped) or bool(unreadable)
    full_scope = whole_codebase and not incomplete
    if whole_codebase and incomplete:
        warnings.append("incomplete scan (file cap, read cap or unreadable files): "
                        "single-use/dead-helper analysis suppressed - reference "
                        "counts need every file scanned")

    results = [r for r in (scan_lines(f, ls, audit) for f, ls in file_lines.items()) if r]

    dup: Finding | None = None
    if audit:
        idfreq: Counter[str] = Counter()
        defs: list[Finding] = []
        types: list[Finding] = []
        for f, ls in file_lines.items():
            for ln in ls:
                idfreq.update(ID.findall(ln))
            defs.extend(collect_defs(f, ls))
            types.extend(collect_types(f, ls))
        dup = analyze_duplication(defs, types, idfreq, full_scope)
        if dup["truncated_defs"]:
            warnings.append(f"{dup['truncated_defs']} function bodies hit the {BODY_CAP}-line "
                            "capture cap; excluded from exact-duplicate hashing")
    mode = "audit" if audit else "diff"
    scope = ("explicit paths" if args.paths
             else "whole codebase (--all)" if args.all
             else f"diff vs {args.base}")

    totals: dict[str, int] = {k: sum(len(r[k]) for r in results)
                              for k in _SIGNAL_KEYS}
    totals["files"] = len(results)

    # The gate trips on slop only. "Substantial" is a checklist nudge for big
    # clean changes, not a defect - a clean 40-line diff must pass.
    slop_found = any(_file_slop(r) for r in results) or _dup_has_findings(dup)
    exit_code = 1 if args.gate and slop_found else 0

    if args.format == "json":
        print(json.dumps({"mode": mode, "scope": scope, "base": args.base, "git": is_git,
                          "files_scanned": len(files), "totals": totals,
                          "slop_found": slop_found, "warnings": warnings,
                          "files": results, "duplication": dup}, indent=2))
        return exit_code

    if not is_git and not args.paths:
        print("anti-slop scan: not a git repo and no paths given.")
        print("Audit files: `scan_slop.py src/foo.ts`, or run inside a git repo with --all.")
        return 0

    print(f"anti-slop scan - scope: {scope}, {len(files)} file(s)\n")
    for w in warnings:
        print(f" note: {w}")
    if warnings:
        print()

    has_dup = bool(dup and _print_duplication(dup))

    if results:
        print(f"{totals['files']} file(s) with per-file signals\n")
        for r in results:
            print(r["file"])
            for key, (label, _short) in _SIGNALS.items():
                _print_capped(label, r[key])
            if r["substantial"] and not _file_slop(r):
                print(f" +{r['added_lines']} added lines (>= {CHECKLIST_LINES}: run the checklist)")
            print()

    if not results and not has_dup:
        # Nothing to report: explain why, depending on the scope.
        if audit:
            print(f"No static slop patterns across {len(files)} file(s).")
            print("Clean of the deterministic signals. Semantic slop still needs a model pass:")
            print("invoke the anti-slop skill for edge cases / superficial tests / cargo-cult.")
        elif not files:
            print(f"Nothing changed vs {args.base} (clean working tree).")
            print("Diff scope only vets a change in progress. To review existing code:")
            print(" scan_slop.py --all (whole codebase + duplication)")
            print(" scan_slop.py path/to/file (specific files)")
        else:
            print(f"{len(files)} changed file(s) scanned - no static slop signals.")
            print("Semantic slop still needs a model pass; walk the SKILL.md taxonomy.")
        if args.gate:
            print("GATE: PASS (no slop)")
        return exit_code

    parts = [f"{totals[k]} {short}" for k, (_label, short) in _SIGNALS.items()
             if totals[k]]
    print(f"SUMMARY ({mode}): "
          + (", ".join(parts) if parts else "no per-file slop signals"), end="")
    if dup:
        print(f"; {len(dup['name_clones'])} name-clone, {len(dup['body_clones'])} exact-dup, "
              f"{len(dup['near_clones'])} near-dup, {len(dup['single_use'])} single-use", end="")
    print(".")
    if slop_found:
        print("Fix every signal above, then walk the full taxonomy in SKILL.md and re-scan (expect clean).")
    if args.gate:
        print("GATE: FAIL (slop found)" if slop_found else "GATE: PASS (no slop)")
    return exit_code
|
|
983
|
+
|
|
984
|
+
|
|
985
|
+
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|