cursordoctrine 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/INSTALL.md +1 -1
- package/README.md +4 -3
- package/bin/cli.mjs +7 -6
- package/linux/hooks/minimal-edit-audit.sh +8 -0
- package/linux/hooks/semantic-density-audit.sh +151 -0
- package/linux/hooks/subagent-stop-review.sh +103 -103
- package/linux/hooks.json +7 -1
- package/package.json +5 -2
- package/skills/anti-slop/SKILL.md +9 -5
- package/skills/anti-slop/scripts/density_scan.py +78 -0
- package/skills/anti-slop/scripts/low_density.py +405 -0
- package/skills/anti-slop/scripts/scan_slop.py +29 -8
- package/windows/hooks/anti-slop-audit.ps1 +226 -226
- package/windows/hooks/final-review.md +67 -67
- package/windows/hooks/minimal-edit-audit.ps1 +124 -116
- package/windows/hooks/semantic-density-audit.ps1 +137 -0
- package/windows/hooks.json +70 -64
|
@@ -0,0 +1,405 @@
|
|
|
1
|
+
"""low_density.py - semantic-density scorer (the anti semantic-opacity layer).
|
|
2
|
+
|
|
3
|
+
Shared source of truth for the semantic-density signal. Used in two places:
|
|
4
|
+
|
|
5
|
+
1. scan_slop.py imports `score_identifiers()` to add a thirteenth signal
|
|
6
|
+
bucket (semantic_density) to its audit-of-record.
|
|
7
|
+
2. density_scan.py (the per-edit hook wrapper) imports the same functions
|
|
8
|
+
to flag only the identifiers the agent JUST introduced in the diff.
|
|
9
|
+
|
|
10
|
+
One denylist, two execution points, zero drift.
|
|
11
|
+
|
|
12
|
+
THE INVARIANT
|
|
13
|
+
If you cannot predict what a function/class/file does from its name alone,
|
|
14
|
+
there is semantic debt. DataManager, process(), utils.ts, CoreEngine -
|
|
15
|
+
names that exist but communicate no intent. High-density names
|
|
16
|
+
(InvoiceEmailSender, PostgresUserRepository, GenerateMonthlyReport) pass.
|
|
17
|
+
|
|
18
|
+
SCORING MODEL (three tiers, deliberately conservative on FAIL)
|
|
19
|
+
FAIL - the name IS a low-density token, or a generic-suffix class with no
|
|
20
|
+
domain noun before it (DataManager, CoreEngine, process, utils).
|
|
21
|
+
These almost never have a defensible reading.
|
|
22
|
+
WARN - the name carries a low-density token but has a domain noun, OR is
|
|
23
|
+
an anemic verb alone (UserManager, handle()). Suspicious but defensible
|
|
24
|
+
in context; the model judges. 1-2 char ids and placeholders (fn, x1,
|
|
25
|
+
tempFix) are always FAIL, never WARN.
|
|
26
|
+
OK - none of the above.
|
|
27
|
+
|
|
28
|
+
The Repository/Service/Provider DDD cases are the false-positive risk. They
|
|
29
|
+
land as WARN (not FAIL) when a domain noun precedes them
|
|
30
|
+
(PostgresUserRepository -> WARN, kept), and FAIL only when naked
|
|
31
|
+
(Repository -> FAIL). Calibrated against real DDD code before shipping.
|
|
32
|
+
|
|
33
|
+
Stdlib only; Python 3.9+. REPORTS only - never edits.
|
|
34
|
+
"""
|
|
35
|
+
from __future__ import annotations
|
|
36
|
+
|
|
37
|
+
import re
|
|
38
|
+
import sys
|
|
39
|
+
from typing import Any
|
|
40
|
+
|
|
41
|
+
# Resolve sibling scan_slop.py at runtime. The hook wrapper (density_scan.py)
|
|
42
|
+
# is invoked from arbitrary cwds with no package context, and scan_slop imports
|
|
43
|
+
# this module via `from low_density import ...` only when both sit in the same
|
|
44
|
+
# scripts/ dir. The sys.path insert makes both directions work regardless of
|
|
45
|
+
# where Python was launched from.
|
|
46
|
+
_SCRIPT_DIR = re.sub(r"[\\/][^\\/]+$", "", __file__) or "."
|
|
47
|
+
if _SCRIPT_DIR not in sys.path:
|
|
48
|
+
sys.path.insert(0, _SCRIPT_DIR)
|
|
49
|
+
|
|
50
|
+
# Reuse scan_slop's language detection + comment/string stripping rather than
|
|
51
|
+
# duplicating the per-language tokenization that already lives there.
|
|
52
|
+
import scan_slop # noqa: E402 (path set up above)
|
|
53
|
+
|
|
54
|
+
ID = scan_slop.ID
|
|
55
|
+
TYPE_DECL = scan_slop.TYPE_DECL
|
|
56
|
+
FUNC_PATTERNS = scan_slop.FUNC_PATTERNS
|
|
57
|
+
METHOD_JS = scan_slop.METHOD_JS
|
|
58
|
+
METHOD_CSTYLE = scan_slop.METHOD_CSTYLE
|
|
59
|
+
NOT_METHOD = scan_slop.NOT_METHOD
|
|
60
|
+
lang_of = scan_slop.lang_of
|
|
61
|
+
_strip_comments = scan_slop._strip_comments
|
|
62
|
+
|
|
63
|
+
# ---- the denylist (single source of truth) --------------------------------
|
|
64
|
+
# Token stems, lowercased. Matched case-insensitively against identifier
|
|
65
|
+
# tokens. Kept short and high-signal: every entry here is one a senior would
|
|
66
|
+
# flag on sight, and every false positive costs trust in the whole layer.
|
|
67
|
+
LOW_DENSITY_TOKENS = frozenset({
|
|
68
|
+
# generic role nouns - describe a category, not a thing
|
|
69
|
+
"manager", "mgr", "handler", "processor", "controller", "provider",
|
|
70
|
+
"service", "svc", "engine", "framework", "system", "base", "core",
|
|
71
|
+
"common", "shared", "generic", "universal", "global",
|
|
72
|
+
# filler nouns - mean nothing on their own
|
|
73
|
+
"data", "info", "thing", "things", "stuff", "object", "item", "entity",
|
|
74
|
+
"business", "misc", "util", "utils", "utility", "helper", "helpers",
|
|
75
|
+
"tool", "tools",
|
|
76
|
+
# placeholder / temporaries that leaked to prod
|
|
77
|
+
"temp", "tmp", "new", "old", "current", "local", "main", "simple",
|
|
78
|
+
})
|
|
79
|
+
|
|
80
|
+
# Filenames whose bare basename IS the low-density signal. A file named
|
|
81
|
+
# utils.ts, helpers.py, manager.go communicates nothing. The fix is a
|
|
82
|
+
# domain name: invoice_totals.ts, smtp_retry.py.
|
|
83
|
+
LOW_DENSITY_FILENAMES = frozenset({
|
|
84
|
+
"utils", "helpers", "helper", "common", "shared", "manager", "service",
|
|
85
|
+
"provider", "handler", "processor", "engine", "base", "core", "misc",
|
|
86
|
+
"stuff", "things", "temp", "tmp", "generic", "util", "utility",
|
|
87
|
+
"controller", "framework", "system", "business", "global",
|
|
88
|
+
# NOTE: 'main' and 'app' are intentionally EXCLUDED - they are conventional
|
|
89
|
+
# entry-point names (next.js app/, rails application.py, fastapi main.py).
|
|
90
|
+
# Flagging them would make the hook fire on every new project scaffold.
|
|
91
|
+
})
|
|
92
|
+
|
|
93
|
+
# Verbs that name an action without naming the object of the action. Fine as
|
|
94
|
+
# part of a longer name (GenerateMonthlyReport), suspect alone (process()).
|
|
95
|
+
# Deliberately EXCLUDES concrete action verbs (get/set/send/load/save/fetch/
|
|
96
|
+
# render/format/parse/validate/check/verify) - those name a specific operation
|
|
97
|
+
# and are legitimate as methods on a domain-noun class (InvoiceEmailSender.send).
|
|
98
|
+
# Only the truly content-free verbs (do/run/execute/process/handle/...) stay.
|
|
99
|
+
ANEMIC_VERBS = frozenset({
|
|
100
|
+
"do", "run", "execute", "process", "handle", "manage", "perform",
|
|
101
|
+
"apply", "compute", "calculate", "make", "build", "update",
|
|
102
|
+
"delete", "remove", "add",
|
|
103
|
+
})
|
|
104
|
+
|
|
105
|
+
# Suffixes that, on a class with no domain noun before them, are textbook
|
|
106
|
+
# Meaningless Abstraction: DataProvider, CoreEngine, SystemManager. When a
|
|
107
|
+
# domain noun precedes (PostgresUserRepository) the score stays WARN not FAIL.
|
|
108
|
+
GENERIC_SUFFIXES = re.compile(
|
|
109
|
+
r"(Manager|Handler|Processor|Controller|Provider|Service|Engine"
|
|
110
|
+
r"|Framework|System|Factory|Builder|Wrapper|Adapter|Resolver"
|
|
111
|
+
r"|Strategy|Mediator|Orchestrator|Registry|Repository)$"
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
# Placeholder fingerprints: tempFix, newThing, finalFinal, test2, abc, x1, fn.
|
|
115
|
+
# Case handling is scoped per group with (?i:...) instead of a global re.I.
# With re.I the boundary class [A-Z0-9_] also matched lowercase letters, so
# ordinary words that merely START with a marker (template, newsletter,
# fixture, copyright) were reported as placeholders. The marker words stay
# case-insensitive; the character right after a marker must now really be an
# uppercase letter, a digit or an underscore (tempFix, test2, temp_value).
# Capturing groups are kept where the previous pattern had them.
PLACEHOLDER = re.compile(
    r"^((?i:temp|tmp|new|old|final|test|fix|copy|backup|draft|wip))"
    r"[A-Z0-9_]"
    r"|^((?i:final)){2,}"                       # finalFinal, finalfinal
    r"|^(?i:[a-z]{0,2})\d+$"                    # x1, fn2: at most two letters, then digits only
    r"|^(?i:[a-z]{1,2})$"                       # bare 1-2 char alpha: fn, cb, x, aq - predict nothing
    r"|^((?i:foo|bar|baz|qux|tmp|asdf|qwerty))$"
)
|
|
124
|
+
|
|
125
|
+
# A "domain noun" is an identifier token that is NOT in LOW_DENSITY_TOKENS and
|
|
126
|
+
# NOT an anemic verb. Used to decide whether a generic-suffix class has real
|
|
127
|
+
# subject matter or is pure filler (DataManager vs InvoiceEmailSender).
|
|
128
|
+
def _tokens_of(name: str) -> list[str]:
|
|
129
|
+
"""Split a PascalCase / camelCase / snake_case name into lowercased word
|
|
130
|
+
stems. UserEmailSender -> [user, email, sender]. process_data ->
|
|
131
|
+
[process, data]. Single-word names return [self.lower()]."""
|
|
132
|
+
# split on non-alnum, then on case boundaries (aB -> a|B), then trailing digits
|
|
133
|
+
s = re.sub(r"[^A-Za-z0-9]+", " ", name)
|
|
134
|
+
s = re.sub(r"([a-z0-9])([A-Z])", r"\1 \2", s)
|
|
135
|
+
s = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1 \2", s) # HTTPSConnection -> HTTPS Connection
|
|
136
|
+
s = re.sub(r"(\d+)$", r" \1", s)
|
|
137
|
+
return [w.lower() for w in s.split() if w]
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _has_domain_noun(tokens: list[str]) -> bool:
    """Tell whether any token carries real subject matter.

    A token counts as a domain noun when it appears in neither
    LOW_DENSITY_TOKENS nor ANEMIC_VERBS. [data, manager] -> False (both are
    filler); [invoice, email, sender] -> True ('invoice' is in neither set).
    An empty token list yields False.
    """
    return any(
        tok not in LOW_DENSITY_TOKENS and tok not in ANEMIC_VERBS
        for tok in tokens
    )
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
# Conventional CLI/runtime entrypoint names that are exempt from the
|
|
155
|
+
# low-density signal even when they appear as a bare single token. `main` is
|
|
156
|
+
# THE Python/C convention (`if __name__ == "__main__": main()`); `run` is the
|
|
157
|
+
# common CLI entrypoint in Go binaries and many scripts. These get a pass only
|
|
158
|
+
# as exact single-token function names, never inside larger names.
|
|
159
|
+
ENTRYPOINT_NAMES = frozenset({"main", "run", "cli", "app"})
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
Finding = dict[str, Any]
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def score_density(name: str) -> tuple[str, list[str]]:
    """Return (severity, reasons) for one identifier name.

    severity is one of "ok", "warn", "fail". reasons is a list of short human
    strings describing the rule that fired; ("ok", []) means clean. Rules are
    tried in the numbered order below and the first hit wins.

    The function performs no I/O and mutates nothing; density_scan.py and
    scan_slop.py both rely on that.
    """
    if not name or not name.strip():
        return "ok", []
    name = name.strip()
    lower = name.lower()
    tokens = _tokens_of(name)

    # 0. Conventional CLI/runtime entrypoints get a free pass as bare names:
    #    `main` (Python/C), `run`/`cli` (CLIs). These are idioms, not slop.
    if lower in ENTRYPOINT_NAMES:
        return "ok", []

    # 1. Placeholder / temp-leaked-to-prod. Always FAIL - these never belong.
    #    The sub-checks only pick the most specific message.
    if PLACEHOLDER.search(name):
        if re.match(r"^(final){2,}", name, re.I):
            return "fail", ["placeholder name (finalFinal / repeated 'final')"]
        if re.match(r"^[a-z]{0,2}\d+$", name, re.I):
            return "fail", [f"cryptic short id '{name}' - predict nothing"]
        if re.match(r"^[a-z]{1,2}$", name, re.I):
            return "fail", [f"cryptic 1-2 char id '{name}' - predict nothing"]
        return "fail", [f"placeholder name '{name}' - temp/test marker leaked to prod"]

    # 2. Bare low-density single token: the name IS the filler word.
    #    "Helper", "Utils", "Manager", "Data" on their own.
    if len(tokens) == 1 and lower in LOW_DENSITY_TOKENS:
        return "fail", [f"bare low-density token '{name}' - names a category, not a thing"]

    # 3. Generic-suffix class. Only the part BEFORE the suffix can supply the
    #    domain noun: judging the whole name let suffix words that are not in
    #    LOW_DENSITY_TOKENS (Repository, Factory, Builder, ...) vouch for
    #    themselves, so a naked `Repository` or a `DataRepository` scored WARN
    #    instead of FAIL.
    suffix_match = GENERIC_SUFFIXES.search(name)
    if suffix_match:
        suffix = suffix_match.group(1)
        prefix_tokens = _tokens_of(name[:suffix_match.start()])
        if _has_domain_noun(prefix_tokens):
            # UserRepository, StripePaymentProvider: defensible DDD, still
            # worth a glance.
            return "warn", [f"{suffix} suffix (has domain noun -> defensible, still generic)"]
        # DataManager, CoreEngine, SystemProvider, naked Repository: the whole
        # point of the abstraction is hidden.
        return "fail", [
            f"{suffix} with no domain noun - predict nothing from the name",
            "fix: replace with verb+noun naming the concrete responsibility",
            " (e.g. DataManager -> InvoiceRepository or PersistUserSessions)",
        ]

    # 4. Anemic verb alone (process, handle, do). Function-shaped, empty.
    if len(tokens) == 1 and lower in ANEMIC_VERBS:
        return "warn", [f"anemic verb '{name}()' - names an action without its object"]

    # 5. Leading anemic verb followed only by filler: handleStuff,
    #    processData, doThing - the classic AI-slop function shape. Checked
    #    BEFORE the generic low-density rule: every name matching this shape
    #    also matches rule 6, so placed after it this branch never ran.
    #    Severity is "fail" either way; only the reason is more specific.
    if (
        len(tokens) >= 2
        and tokens[0] in ANEMIC_VERBS
        and all(t in LOW_DENSITY_TOKENS for t in tokens[1:])
    ):
        return "fail", [f"anemic verb + filler [{lower}] - action with no real object"]

    # 6. Any remaining low-density token present (multiword): userData,
    #    DataThing. WARN if there's any domain noun, FAIL if not.
    low_hits = [t for t in tokens if t in LOW_DENSITY_TOKENS]
    if low_hits:
        joined = ", ".join(sorted(set(low_hits)))
        if _has_domain_noun(tokens):
            return "warn", [f"low-density token(s) [{joined}] but has domain noun -> defensible"]
        return "fail", [f"low-density token(s) [{joined}] and no domain noun -> opaque"]

    return "ok", []
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def score_filename(base: str) -> tuple[str, list[str]]:
    """Score a file basename; one trailing extension, if present, is ignored.

    utils.ts -> FAIL, invoice_totals.ts -> OK. Returns the same
    (severity, reasons) shape as score_density. The stem is lowercased and
    split on non-alphanumeric characters only, so camelCase file names are
    treated as a single part.
    """
    if not base:
        return "ok", []

    stem = re.sub(r"\.[A-Za-z0-9]+$", "", base).lower()

    # The whole stem is a generic word: utils, helpers, manager, ...
    if stem in LOW_DENSITY_FILENAMES:
        return "fail", [f"file named '{base}' - basename is a generic category, not a module"]

    # Conventional entry-point / layout names get a pass.
    conventional = ("index", "mod", "main", "app", "server", "test", "tests",
                    "conftest", "__init__", "setup")
    if stem in conventional:
        return "ok", []

    words = [w for w in re.split(r"[^a-z0-9]+", stem) if w]
    generic = [w for w in words if w in LOW_DENSITY_FILENAMES]
    if not generic:
        return "ok", []

    listing = ", ".join(generic)
    if len(generic) == len(words):
        # Every part is generic, e.g. utils_helpers.py.
        return "fail", [f"file '{base}' - all name parts are generic ({listing})"]
    # At least one part is specific, so the generic part is only worth a glance.
    return "warn", [f"file '{base}' contains generic part(s) ({listing}) but has a specific part"]
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def _def_names_from_patterns(line: str, lang: str) -> list[tuple[str, str]]:
    """List the definitions declared on `line` as (kind, name) pairs.

    kind is one of "type", "class", "func", "method". Returns [] when the line
    declares nothing. Function-shaped names come from FUNC_PATTERNS[lang], the
    table shared with scan_slop, so both modules parse definitions alike.
    One line can yield both a "type" and a "class" entry; at most one
    func/method entry is produced per line.
    """
    found: list[tuple[str, str]] = []

    # Type-level declarations matched by scan_slop's TYPE_DECL.
    type_match = TYPE_DECL.search(line)
    if type_match:
        found.append(("type", type_match.group(1)))

    # class/struct/trait/protocol/interface/enum followed by a Capitalized name.
    class_match = re.search(
        r"\b(?:class|struct|trait|protocol|interface|enum)\s+([A-Z][A-Za-z0-9_]*)\b",
        line,
    )
    if class_match:
        found.append(("class", class_match.group(1)))

    # Function-shaped declarations: the first pattern that yields an accepted
    # name wins, later patterns are not tried.
    for rx in FUNC_PATTERNS.get(lang, []):
        func_match = rx.search(line)
        if not func_match:
            continue
        candidate = func_match.group(1)
        is_method = rx in (METHOD_JS, METHOD_CSTYLE)
        if is_method and candidate in NOT_METHOD:
            # Rejected by the NOT_METHOD denylist; try the next pattern.
            continue
        found.append(("method" if is_method else "func", candidate))
        break
    return found
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def extract_identifiers(added_lines: list[str], rel: str) -> list[Finding]:
    """Collect the non-"ok" naming findings for one file's added lines.

    Scores every function/class/type/method name DECLARED on `added_lines`,
    plus the file's own basename. Only declarations count, not references: we
    judge what the agent chose to NAME, not every token it touched. A call to
    `processData(x)` is not interesting unless `function processData` is
    declared as well.

    Args:
        added_lines: lines to inspect, in order. A finding's "line" is the
            1-based index into this list.
        rel: path of the file. Selects the language via lang_of() and supplies
            the basename to score; forward and back slashes are both accepted
            as separators.

    Returns:
        Finding dicts with keys name, line, kind, severity, reasons. File-name
        findings use line 0 and kind "file". Names scoring "ok" are omitted,
        and each declared name is reported at most once.
    """
    lang = lang_of(rel)
    findings: list[Finding] = []
    seen_names: set[str] = set()

    # Unknown languages have no declaration patterns; only the filename below
    # is scored for them.
    if lang != "other":
        for i, raw in enumerate(added_lines, start=1):
            if not raw.strip():
                continue
            # Match declarations against the comment-stripped, string-masked
            # text rather than the raw line. The raw line was matched before,
            # so a trailing comment ("x = 1  # class Manager ...") or a string
            # literal could still produce a finding.
            # NOTE(review): assumes _strip_comments leaves code outside
            # comments/strings untouched - confirm against scan_slop.
            masked = _strip_comments(raw, lang, "L")
            if not masked.strip():
                continue  # pure-comment line: documentation, not a naming decision
            for kind, name in _def_names_from_patterns(masked, lang):
                if name in seen_names:
                    continue
                seen_names.add(name)
                sevs, reasons = score_density(name)
                if sevs != "ok":
                    findings.append({"name": name, "line": i, "kind": kind,
                                     "severity": sevs, "reasons": reasons})

    # The file's own name is a naming decision too, and scoring it is cheap, so
    # it is scored on every call. Split on both separators: with "/" only, a
    # Windows-style path kept its directories in the basename.
    base = re.split(r"[\\/]", rel)[-1]
    if base and base not in seen_names:
        sevs, reasons = score_filename(base)
        if sevs != "ok":
            findings.append({"name": base, "line": 0, "kind": "file",
                             "severity": sevs, "reasons": reasons})

    return findings
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
def score_identifiers(added_lines: list[str], rel: str) -> list[Finding]:
    """Extract findings for `rel` and keep only those not scored "ok".

    Thin convenience wrapper over extract_identifiers(); this is the entry
    point scan_slop.py's signal bucket calls.
    """
    flagged: list[Finding] = []
    for finding in extract_identifiers(added_lines, rel):
        if finding["severity"] != "ok":
            flagged.append(finding)
    return flagged
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def format_for_report(findings: list[Finding]) -> list[str]:
    """Render each finding as one short report line.

    Output shape: "[SEVERITY] kind 'name' (line N): reason; reason". A falsy
    "line" (0) is shown as "file name", and an empty reasons list falls back
    to "low semantic density". Input order is preserved.
    """
    def _render(entry: Finding) -> str:
        severity = entry["severity"].upper()
        kind = entry["kind"]
        name = entry["name"]
        reasons = entry["reasons"]
        detail = "; ".join(reasons) if reasons else "low semantic density"
        where = f"line {entry['line']}" if entry["line"] else "file name"
        return f"[{severity}] {kind} '{name}' ({where}): {detail}"

    return [_render(entry) for entry in findings]
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
if __name__ == "__main__":
    # Smoke entrypoint: score names passed as argv so a human can sanity-check
    # the scorer without writing a test harness.
    #   --self-test   run the built-in expectation table; exit 1 on any mismatch
    #   NAME ...      print "<name>: <severity> (<reasons>)" for each name
    #   --json        print a pointer to density_scan.py (the real JSON mode)
    import json

    if len(sys.argv) > 1 and sys.argv[1] == "--self-test":
        # (name, expected severity). Names containing "." are scored as file
        # names, everything else as identifiers.
        cases = [
            ("DataManager", "fail"), ("CoreEngine", "fail"), ("process", "warn"),
            ("InvoiceEmailSender", "ok"), ("PostgresUserRepository", "warn"),
            ("GenerateMonthlyReport", "ok"), ("Helper", "fail"), ("Utils", "fail"),
            ("UserManager", "warn"), ("handle", "warn"), ("doStuff", "fail"),
            ("processData", "fail"), ("x1", "fail"), ("finalFinal", "fail"),
            ("tempFix", "fail"), ("SystemProvider", "fail"),
            ("StripePaymentProvider", "warn"), ("utils.ts", "fail"),
            ("invoice_totals.ts", "ok"), ("SendInvoiceEmail", "ok"),
            ("DiscordWebhookClient", "ok"), ("fn", "fail"),
            ("BaseService", "fail"), ("framework", "fail"), ("BusinessProcessor", "fail"),
        ]
        fails = 0
        for name, want in cases:
            sevs, _ = score_density(name) if "." not in name else score_filename(name)
            mark = "OK " if sevs == want else "BAD"
            if sevs != want:
                fails += 1
            print(f" {mark} {name:<32} got={sevs:<5} want={want}")
        verdict = "PASS" if not fails else f"{fails} FAILURES"
        print(f"\n{verdict}")
        sys.exit(1 if fails else 0)

    # default: score argv names as identifiers. Option flags such as --json
    # are skipped; they used to be scored and printed as if they were names.
    for n in sys.argv[1:]:
        if n.startswith("--"):
            continue
        sevs, reasons = score_density(n)
        print(f"{n}: {sevs} ({'; '.join(reasons)})")

    # JSON dump mode for the hook wrapper
    if "--json" in sys.argv:
        print(json.dumps({"note": "use density_scan.py for stdin->json"}))
|
|
@@ -13,9 +13,11 @@ Scopes:
|
|
|
13
13
|
errors (empty catch, broad except+pass), tautological asserts, pointless
|
|
14
14
|
async wrappers (await Promise.resolve, async executors), deepening guard
|
|
15
15
|
chains (the optional-chaining shape), boolean-pair call traps, SELECT *
|
|
16
|
-
in .sql files,
|
|
17
|
-
|
|
18
|
-
|
|
16
|
+
in .sql files, Tailwind class soup / magic-px values, and SEMANTIC OPACITY
|
|
17
|
+
(low-density identifiers - DataManager, process(), utils.ts - scored via
|
|
18
|
+
the shared low_density module). All per-file signals also run in AUDIT
|
|
19
|
+
scope; only new-dependency detection is diff-only (every line of an
|
|
20
|
+
existing manifest would otherwise read as "new").
|
|
19
21
|
* AUDIT (--all, or explicit paths): the WHOLE codebase, with the duplication
|
|
20
22
|
analysis that catches the isRecord()-class slop:
|
|
21
23
|
- Clone Proliferation : same function name in multiple files
|
|
@@ -432,6 +434,7 @@ _SIGNALS = {
|
|
|
432
434
|
"boolean_traps": ("boolean trap ", "boolean-trap"),
|
|
433
435
|
"select_star": ("SELECT * ", "select-star"),
|
|
434
436
|
"tailwind_slop": ("tailwind smell ", "tailwind"),
|
|
437
|
+
"semantic_density": ("semantic opacity ", "semantic-density"),
|
|
435
438
|
}
|
|
436
439
|
_SIGNAL_KEYS = tuple(_SIGNALS)
|
|
437
440
|
|
|
@@ -517,6 +520,22 @@ def scan_lines(rel: str, lines: list[str], audit: bool) -> Finding | None:
|
|
|
517
520
|
TAILWIND_SOUP.search(ln) or TAILWIND_MAGIC_PX.search(ln)
|
|
518
521
|
):
|
|
519
522
|
found["tailwind_slop"].append(ln.strip()[:100])
|
|
523
|
+
# Semantic opacity: low-density identifiers introduced in this file. Lazy
|
|
524
|
+
# import because low_density imports scan_slop at module load (sibling
|
|
525
|
+
# resolution) - a top-level import here would cycle. Only declarations
|
|
526
|
+
# count, not references, so a CALL to processData(x) does not trip unless
|
|
527
|
+
# the agent also declared function processData on an audited line.
|
|
528
|
+
if is_source:
|
|
529
|
+
try:
|
|
530
|
+
import low_density
|
|
531
|
+
for item in low_density.format_for_report(
|
|
532
|
+
low_density.score_identifiers(lines, rel)):
|
|
533
|
+
found["semantic_density"].append(item[:140])
|
|
534
|
+
except Exception:
|
|
535
|
+
# Never let the density layer break the rest of the scan. If
|
|
536
|
+
# low_density.py is absent (older install) or errors, the other
|
|
537
|
+
# twelve signals still run.
|
|
538
|
+
pass
|
|
520
539
|
found = {k: _uniq(v) for k, v in found.items()}
|
|
521
540
|
added_count = sum(1 for ln in lines if ln.strip())
|
|
522
541
|
substantial = (not audit) and is_source and added_count >= CHECKLIST_LINES
|
|
@@ -635,13 +654,15 @@ def collect_defs(rel: str, lines: list[str]) -> list[Finding]:
|
|
|
635
654
|
nb = normalize_body(raw, lang)
|
|
636
655
|
sb = structural_body(raw, lang)
|
|
637
656
|
# Non-blank lines only: the brace walk pads raw with edge newlines,
|
|
638
|
-
# and counting them would let `super(props);`
|
|
657
|
+
# and counting them would let `super(props);` pad its body_line count.
|
|
639
658
|
body_lines = sum(1 for s in raw.splitlines() if s.strip()) or 1
|
|
640
|
-
# Exact-dup hash needs substance (>=
|
|
641
|
-
#
|
|
659
|
+
# Exact-dup hash needs substance (>=12 normalized chars). An earlier
|
|
660
|
+
# >=3-lines-or->=60-chars floor excluded the skill's own marquee case -
|
|
661
|
+
# tiny predicates like isRecord/isObject (1 line, ~40 chars) whose
|
|
662
|
+
# byte-identical bodies are exactly the duplication worth surfacing.
|
|
663
|
+
# Boilerplate like `return;`/`return x;` stays under the 12-char floor.
|
|
642
664
|
# A truncated body is a prefix, not the function - never call it exact.
|
|
643
|
-
hash_exact = (not truncated and len(nb) >= 12
|
|
644
|
-
and (body_lines >= 3 or len(nb) >= 60))
|
|
665
|
+
hash_exact = (not truncated and len(nb) >= 12)
|
|
645
666
|
defs.append({
|
|
646
667
|
"name": name, "file": rel, "line": i + 1,
|
|
647
668
|
"exported": _is_exported(name, ln, lang),
|