docguard-cli 0.9.2 → 0.9.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli/commands/diagnose.mjs +2 -2
- package/cli/commands/generate.mjs +49 -10
- package/cli/validators/doc-quality.mjs +184 -163
- package/package.json +1 -1
|
@@ -120,10 +120,10 @@ export function runDiagnose(projectDir, config, flags) {
|
|
|
120
120
|
});
|
|
121
121
|
} catch { /* init may partially succeed */ }
|
|
122
122
|
|
|
123
|
-
// Run generate to fill in content
|
|
123
|
+
// Run generate to fill in MISSING content only (never --force, which would overwrite existing docs)
|
|
124
124
|
try {
|
|
125
125
|
const cliPath = resolve(dirname(fileURLToPath(import.meta.url)), '..', 'docguard.mjs');
|
|
126
|
-
execSync(`node "${cliPath}" generate --dir "${projectDir}"
|
|
126
|
+
execSync(`node "${cliPath}" generate --dir "${projectDir}"`, {
|
|
127
127
|
encoding: 'utf-8',
|
|
128
128
|
stdio: 'pipe',
|
|
129
129
|
});
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
* This is the "killer feature" — take any project and auto-generate CDD docs.
|
|
6
6
|
*/
|
|
7
7
|
|
|
8
|
-
import { existsSync, readFileSync, writeFileSync, readdirSync, statSync, mkdirSync } from 'node:fs';
|
|
8
|
+
import { existsSync, readFileSync, writeFileSync, readdirSync, statSync, mkdirSync, copyFileSync } from 'node:fs';
|
|
9
9
|
import { resolve, join, extname, basename, relative, dirname } from 'node:path';
|
|
10
10
|
import { c } from '../shared.mjs';
|
|
11
11
|
import { detectDocTools } from '../scanners/doc-tools.mjs';
|
|
@@ -18,6 +18,30 @@ const IGNORE_DIRS = new Set([
|
|
|
18
18
|
'.amplify-hosting', '.serverless',
|
|
19
19
|
]);
|
|
20
20
|
|
|
21
|
+
/**
 * Create a `.bak` copy of an existing file before it is overwritten.
 *
 * A backup is made only when the file exists and contains something other
 * than whitespace. The copy is written next to the original as
 * `<filePath>.bak`, replacing any earlier backup with that name.
 *
 * Backing up is best-effort: a failure never throws, so generation can
 * continue. The failure is reported on stderr instead of being swallowed,
 * because the caller is about to overwrite the file.
 *
 * @param {string} filePath - Path of the file that is about to be overwritten.
 * @returns {boolean} `true` when a backup was written, `false` when there was
 *   nothing to back up or the backup failed.
 */
function backupFile(filePath) {
  if (!existsSync(filePath)) return false;

  try {
    // Empty / whitespace-only files carry nothing worth preserving.
    if (readFileSync(filePath, 'utf-8').trim().length === 0) return false;

    // copyFileSync keeps the bytes exactly as they are on disk.
    copyFileSync(filePath, `${filePath}.bak`);
    return true;
  } catch (err) {
    // Non-fatal by design, but the user should know no backup exists.
    console.warn(`  ⚠️  Could not back up ${filePath}: ${err.message}`);
    return false;
  }
}
|
|
35
|
+
|
|
36
|
+
/**
 * Write `content` to `filePath`, taking a backup of any existing file first.
 *
 * Use this in place of a raw writeFileSync when emitting generated docs.
 * The output is always written as UTF-8; only the two declared parameters
 * are used, so any extra arguments supplied by a caller are ignored.
 *
 * @param {string} filePath - Destination path.
 * @param {string} content - Text to write.
 */
function safeWrite(filePath, content) {
  // Snapshot whatever is on disk before it is replaced.
  backupFile(filePath);
  writeFileSync(filePath, content, { encoding: 'utf-8' });
}
|
|
44
|
+
|
|
21
45
|
const CODE_EXTENSIONS = new Set([
|
|
22
46
|
'.js', '.mjs', '.cjs', '.ts', '.tsx', '.jsx',
|
|
23
47
|
'.py', '.java', '.go', '.rs', '.rb', '.php', '.cs',
|
|
@@ -137,6 +161,21 @@ export function runGenerate(projectDir, config, flags) {
|
|
|
137
161
|
mkdirSync(docsDir, { recursive: true });
|
|
138
162
|
}
|
|
139
163
|
|
|
164
|
+
// ── Safety: warn if --force will overwrite existing files ──
|
|
165
|
+
if (flags.force) {
|
|
166
|
+
const targetFiles = [
|
|
167
|
+
'docs-canonical/ARCHITECTURE.md', 'docs-canonical/API-REFERENCE.md',
|
|
168
|
+
'docs-canonical/DATA-MODEL.md', 'docs-canonical/ENVIRONMENT.md',
|
|
169
|
+
'docs-canonical/TEST-SPEC.md', 'docs-canonical/SECURITY.md',
|
|
170
|
+
'AGENTS.md', 'CHANGELOG.md', 'DRIFT-LOG.md',
|
|
171
|
+
];
|
|
172
|
+
const existing = targetFiles.filter(f => existsSync(resolve(projectDir, f)));
|
|
173
|
+
if (existing.length > 0) {
|
|
174
|
+
console.log(` ${c.yellow}⚠️ --force: ${existing.length} existing file(s) will be overwritten.${c.reset}`);
|
|
175
|
+
console.log(` ${c.dim} Backups saved as .bak files.${c.reset}\n`);
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
|
|
140
179
|
let created = 0;
|
|
141
180
|
let skipped = 0;
|
|
142
181
|
|
|
@@ -633,7 +672,7 @@ See \\\`docs-canonical/KNOWN-GOTCHAS.md\\\` for known issues.
|
|
|
633
672
|
| 0.1.0 | ${new Date().toISOString().split('T')[0]} | DocGuard Generate | Auto-generated (arc42 + C4 aligned) |
|
|
634
673
|
`;
|
|
635
674
|
|
|
636
|
-
|
|
675
|
+
safeWrite(path, appendStandardsCitation(content, 'ARCHITECTURE.md'), 'utf-8');
|
|
637
676
|
console.log(` ${c.green}✅ ARCHITECTURE.md${c.reset} (arc42 §1-§12, ${componentRows.length} components, ${Object.values(stack).filter(Boolean).length} tech)`);
|
|
638
677
|
return true;
|
|
639
678
|
}
|
|
@@ -730,7 +769,7 @@ ${resourceSections}
|
|
|
730
769
|
| 0.1.0 | ${new Date().toISOString().split('T')[0]} | DocGuard Generate | Auto-generated (${deepRoutes.length} endpoints from ${deepRoutes[0]?.source || 'code'}) |
|
|
731
770
|
`;
|
|
732
771
|
|
|
733
|
-
|
|
772
|
+
safeWrite(path, appendStandardsCitation(content, 'API-REFERENCE.md'), 'utf-8');
|
|
734
773
|
console.log(` ${c.green}✅ API-REFERENCE.md${c.reset} (${deepRoutes.length} endpoints, ${Object.keys(groups).length} resources)`);
|
|
735
774
|
return true;
|
|
736
775
|
}
|
|
@@ -885,7 +924,7 @@ ${erDiagram}
|
|
|
885
924
|
| 0.1.0 | ${new Date().toISOString().split('T')[0]} | DocGuard Generate | Auto-generated (${entities.length} entities, ${relationships.length} relationships from ${schemaSource}) |
|
|
886
925
|
`;
|
|
887
926
|
|
|
888
|
-
|
|
927
|
+
safeWrite(path, appendStandardsCitation(content, 'DATA-MODEL.md'), 'utf-8');
|
|
889
928
|
console.log(` ${c.green}✅ DATA-MODEL.md${c.reset} (${entities.length} entities, ${relationships.length} relationships from ${schemaSource})`);
|
|
890
929
|
return true;
|
|
891
930
|
}
|
|
@@ -948,7 +987,7 @@ ${envVarRows || '| <!-- No .env.example found --> | | | | |'}
|
|
|
948
987
|
| 0.1.0 | ${new Date().toISOString().split('T')[0]} | DocGuard Generate | Auto-generated (${scan.envVars.length} env vars found) |
|
|
949
988
|
`;
|
|
950
989
|
|
|
951
|
-
|
|
990
|
+
safeWrite(path, appendStandardsCitation(content, 'ENVIRONMENT.md'), 'utf-8');
|
|
952
991
|
console.log(` ${c.green}✅ ENVIRONMENT.md${c.reset} (${scan.envVars.length} env vars detected)`);
|
|
953
992
|
return true;
|
|
954
993
|
}
|
|
@@ -1033,7 +1072,7 @@ ${serviceRows || '| <!-- No services found --> | | | |'}
|
|
|
1033
1072
|
| 0.1.0 | ${new Date().toISOString().split('T')[0]} | DocGuard Generate | Auto-generated (${scan.tests.length} test files, ${serviceMap.filter(s => s.status === '✅').length}/${serviceMap.length} mapped) |
|
|
1034
1073
|
`;
|
|
1035
1074
|
|
|
1036
|
-
|
|
1075
|
+
safeWrite(path, appendStandardsCitation(content, 'TEST-SPEC.md'), 'utf-8');
|
|
1037
1076
|
console.log(` ${c.green}✅ TEST-SPEC.md${c.reset} (${scan.tests.length} tests, ${serviceMap.filter(s => s.status === '✅').length}/${serviceMap.length} services mapped)`);
|
|
1038
1077
|
return true;
|
|
1039
1078
|
}
|
|
@@ -1099,7 +1138,7 @@ ${scan.envVars.filter(v => isSecretVar(v.name)).map(v =>
|
|
|
1099
1138
|
| 0.1.0 | ${new Date().toISOString().split('T')[0]} | DocGuard Generate | Auto-generated |
|
|
1100
1139
|
`;
|
|
1101
1140
|
|
|
1102
|
-
|
|
1141
|
+
safeWrite(path, appendStandardsCitation(content, 'SECURITY.md'), 'utf-8');
|
|
1103
1142
|
console.log(` ${c.green}✅ SECURITY.md${c.reset} (auth: ${stack.auth || 'not detected'})`);
|
|
1104
1143
|
return true;
|
|
1105
1144
|
}
|
|
@@ -1209,7 +1248,7 @@ npx docguard-cli generate # Generate docs from code
|
|
|
1209
1248
|
- Test requirements in TEST-SPEC.md must be met
|
|
1210
1249
|
- Documentation changes must pass \`docguard guard\`
|
|
1211
1250
|
`;
|
|
1212
|
-
|
|
1251
|
+
safeWrite(agentsPath, content);
|
|
1213
1252
|
console.log(` ${c.green}✅ AGENTS.md${c.reset} (AGENTS.md standard compliant)`);
|
|
1214
1253
|
created++;
|
|
1215
1254
|
} else {
|
|
@@ -1231,7 +1270,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
|
|
|
1231
1270
|
### Added
|
|
1232
1271
|
- CDD documentation via DocGuard generate
|
|
1233
1272
|
`;
|
|
1234
|
-
|
|
1273
|
+
safeWrite(changelogPath, content);
|
|
1235
1274
|
console.log(` ${c.green}✅ CHANGELOG.md${c.reset}`);
|
|
1236
1275
|
created++;
|
|
1237
1276
|
} else {
|
|
@@ -1251,7 +1290,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
|
|
|
1251
1290
|
|------|------|---------------|-------------------|----------|------------|
|
|
1252
1291
|
| | | | | | |
|
|
1253
1292
|
`;
|
|
1254
|
-
|
|
1293
|
+
safeWrite(driftPath, content);
|
|
1255
1294
|
console.log(` ${c.green}✅ DRIFT-LOG.md${c.reset}`);
|
|
1256
1295
|
created++;
|
|
1257
1296
|
} else {
|
|
@@ -11,6 +11,11 @@
|
|
|
11
11
|
* Readability: Flesch Reading Ease, Flesch-Kincaid Grade Level
|
|
12
12
|
* Cognitive: Sentence Length, Negation Load, Conditional Load
|
|
13
13
|
*
|
|
14
|
+
* v0.9.3 — Prose-Only Extraction Engine:
|
|
15
|
+
* Instead of stripping markdown and measuring residue (which treats table
|
|
16
|
+
* cells as "long sentences"), this version extracts ONLY actual prose
|
|
17
|
+
* paragraphs. Docs that are mostly tables/code skip readability scoring.
|
|
18
|
+
*
|
|
14
19
|
* Optional: If `understanding` CLI is installed, runs a full 31-metric deep scan.
|
|
15
20
|
*
|
|
16
21
|
* Zero dependencies — pure Node.js built-ins only.
|
|
@@ -25,134 +30,197 @@ import { execSync } from 'node:child_process';
|
|
|
25
30
|
// Values are based on IEEE 830 best practices and readability research.
|
|
26
31
|
|
|
27
32
|
const THRESHOLDS = {
  // Warn when more than 25% is passive.
  passiveVoiceRatio: { warn: 0.25, label: 'Passive voice ratio' },
  // Warn when more than 15% are ambiguous pronouns.
  ambiguousPronounRatio: { warn: 0.15, label: 'Ambiguous pronoun ratio' },
  // Warn when more than 35% of sentences are compound.
  atomicityScore: { warn: 0.35, label: 'Non-atomic sentence ratio' },
  // Warn below 15: truly unreadable prose.
  fleschReadingEase: { warn: 15, label: 'Flesch reading ease' },
  // Warn above 18: graduate level and beyond.
  fleschKincaidGrade: { warn: 18, label: 'Flesch-Kincaid grade' },
  // Warn above 30 words per sentence: too long.
  avgSentenceLength: { warn: 30, label: 'Avg sentence length' },
  // Warn when more than 20% of sentences contain negation.
  negationLoad: { warn: 0.20, label: 'Negation load' },
  // Warn when more than 30% of sentences are conditional.
  conditionalLoad: { warn: 0.30, label: 'Conditional load' },
};

// Smallest amount of prose (in words) that is worth scoring for readability.
// Anything below this is treated as a reference doc (tables, code) and the
// readability metrics are skipped.
const MIN_PROSE_WORDS = 50;
|
|
46
|
+
|
|
47
|
+
// ──── Technical Vocabulary ────
|
|
48
|
+
// Terms the target audience knows. Treated as 2-syllable words for Flesch scoring
|
|
49
|
+
// so they don't artificially inflate difficulty.
|
|
50
|
+
|
|
51
|
+
// Entries are lowercase, letters only. Each term appears exactly once.
const TECH_VOCAB = new Set([
  // Infrastructure & databases
  'dynamodb', 'redis', 'postgres', 'postgresql', 'mongodb', 'mysql', 'sqlite',
  'kubernetes', 'docker', 'dockerfile', 'nginx', 'apache', 'cloudfront',
  'cloudwatch', 'elasticsearch', 'opensearch', 'terraform', 'ansible',
  'memcached', 'cassandra', 'rabbitmq', 'kafka',
  // Frameworks & languages
  'typescript', 'javascript', 'python', 'fastify', 'express', 'nextjs',
  'webpack', 'vite', 'vitest', 'playwright', 'cypress', 'mocha',
  'nestjs', 'angular', 'svelte', 'nuxtjs', 'gatsby', 'remix',
  // Protocols & patterns
  'websocket', 'websockets', 'middleware', 'microservice', 'microservices',
  'graphql', 'restful', 'oauth', 'openapi', 'webhook', 'webhooks',
  'grpc', 'protobuf', 'pubsub',
  // AWS services
  'lambda', 'cognito', 'amplify', 'apprunner', 'cloudformation',
  'apigateway', 'secretsmanager', 'parameterstore', 'eventbridge',
  'fargate', 'elasticache', 'sagemaker',
  // Common developer terms
  'namespace', 'endpoint', 'endpoints', 'timestamp', 'timestamps',
  'boolean', 'callback', 'callbacks', 'codebase', 'monorepo',
  'frontend', 'backend', 'fullstack', 'changelog', 'localhost',
  'hostname', 'username', 'eslint', 'prettier', 'rollup',
  'authentication', 'authorization', 'infrastructure', 'serialization',
  'deserialization', 'polymorphism', 'abstraction',
]);
|
|
77
|
+
|
|
78
|
+
// ──── Prose Extraction Engine ────
|
|
39
79
|
|
|
40
80
|
/**
 * Extract only the prose lines from markdown content.
 *
 * Rather than stripping markdown and measuring whatever is left (which turns
 * table cells into very long "sentences"), this keeps only lines that look
 * like readable text.
 *
 * A line is dropped when it:
 *   - belongs to a leading YAML frontmatter block (`---` ... `---`)
 *   - is inside a fenced code block (backtick or tilde fence) or an
 *     HTML comment that spans several lines
 *   - is a table row, header, image line, horizontal rule, table separator,
 *     single-line HTML comment or starts with an HTML tag
 *   - is shorter than 15 characters after markdown formatting is removed
 *   - has fewer than 55% alphabetic characters or fewer than 5 words
 *
 * Kept lines have links reduced to their text, and images, inline code,
 * emphasis markers and list markers removed.
 *
 * @param {string} content - Raw markdown.
 * @returns {string} The surviving lines joined with "\n" ('' when none).
 */
function extractProse(content) {
  const lines = content.split('\n');
  const proseLines = [];
  const frontmatterDelimiter = /^---\s*$/;
  // A fence opener: 3+ backticks (info string may not contain a backtick)
  // or 3+ tildes.
  const fencePattern = /^(?:(`{3,})[^`]*|(~{3,}).*)$/;

  // Frontmatter only counts at the very top of the file and only when it is
  // closed; its key/value lines must not be scored as prose.
  let firstLine = 0;
  if (frontmatterDelimiter.test(lines[0])) {
    const closing = lines.findIndex((l, idx) => idx > 0 && frontmatterDelimiter.test(l));
    if (closing !== -1) firstLine = closing + 1;
  }

  let openFence = null; // marker string of the fence currently open, if any
  let inHtmlComment = false;

  for (const rawLine of lines.slice(firstLine)) {
    const line = rawLine.trim();

    // Track code fences. A fence is closed only by a bare marker of the same
    // character that is at least as long as the opener, so a 4-backtick
    // fence can safely contain 3-backtick fences.
    const fenceMatch = fencePattern.exec(line);
    if (openFence !== null) {
      if (fenceMatch) {
        const marker = fenceMatch[1] || fenceMatch[2];
        const closes = marker[0] === openFence[0]
          && marker.length >= openFence.length
          && line.length === marker.length;
        if (closes) openFence = null;
      }
      continue;
    }
    if (fenceMatch) {
      openFence = fenceMatch[1] || fenceMatch[2];
      continue;
    }

    // Track multi-line HTML comments
    if (line.includes('<!--') && !line.includes('-->')) {
      inHtmlComment = true;
      continue;
    }
    if (inHtmlComment) {
      if (line.includes('-->')) inHtmlComment = false;
      continue;
    }

    // Skip non-prose line types
    if (line.length === 0) continue;                 // Empty lines
    if (line.startsWith('|')) continue;              // Table rows
    if (line.startsWith('#')) continue;              // Headers
    if (line.startsWith('!')) continue;              // Images
    if (/^[-*_]{3,}\s*$/.test(line)) continue;       // Horizontal rules
    if (/^[|:\-\s]+$/.test(line)) continue;          // Table separators
    if (/^<!--.*-->$/.test(line)) continue;          // Inline HTML comments
    if (/^<[^>]+>/.test(line)) continue;             // HTML tags

    // Clean the line. Images go first: once the link rule has run,
    // "![alt](url)" has already been rewritten to "!alt" and can no longer
    // be recognised as an image.
    const cleaned = line
      .replace(/!\[.*?\]\(.*?\)/g, '')               // Remove images
      .replace(/\[([^\]]*)\]\([^)]*\)/g, '$1')       // Links → text only
      .replace(/`[^`]+`/g, '')                       // Remove inline code
      .replace(/\*{1,3}([^*]+)\*{1,3}/g, '$1')       // Bold/italic → text
      .replace(/_{1,3}([^_]+)_{1,3}/g, '$1')         // Underscore emphasis
      .replace(/^[-*+]\s+/, '')                      // List markers
      .replace(/^\d+\.\s+/, '')                      // Numbered list markers
      .trim();

    if (cleaned.length < 15) continue;

    // Prose heuristic: ≥55% letters and ≥5 words
    const alphaCount = (cleaned.match(/[a-zA-Z]/g) || []).length;
    const alphaRatio = alphaCount / cleaned.length;
    const wordCount = cleaned.split(/\s+/).length;

    if (alphaRatio >= 0.55 && wordCount >= 5) {
      proseLines.push(cleaned);
    }
  }

  return proseLines.join('\n');
}
|
|
115
156
|
|
|
116
157
|
/**
 * Split text into sentences with markdown-aware boundary detection.
 *
 * A sentence ends at a run of `.`, `!` or `?` that is followed by whitespace
 * or the end of the text. Dots that are not sentence boundaries are
 * temporarily swapped for a placeholder so they cannot trigger a split:
 *   - dotted file names (package.json, auth.ts)
 *   - version numbers (v0.9.2, 1.2.3) and decimals (3.14)
 *   - URLs (punctuation trailing the URL is left alone, so a period that
 *     ends the sentence still ends it)
 *   - titles (Mr., Mrs., Ms., Dr., Prof., Sr., Jr. — matched with this exact
 *     capitalisation, so a unit such as "ms." is not mistaken for "Ms.")
 *   - vs., etc., approx., incl., e.g., i.e. (any capitalisation)
 *   - names ending in ".js" (Node.js, Vue.js)
 *
 * Fragments shorter than 10 characters or with fewer than 3 words are
 * discarded.
 *
 * @param {string} text - Prose text.
 * @returns {string[]} Sentences without their terminating punctuation.
 */
function splitSentences(text) {
  if (!text || text.trim().length === 0) return [];

  // Private-use code point: unlike a printable placeholder, it will not
  // collide with characters that legitimately occur in documentation.
  const DOT = '\uE000';
  const shield = (match) => match.replace(/\./g, DOT);

  const titles = ['Mr', 'Mrs', 'Ms', 'Dr', 'Prof', 'Sr', 'Jr'];
  const general = ['vs', 'etc', 'approx', 'incl'];
  const titlePattern = new RegExp(`\\b(?:${titles.join('|')})\\.`, 'g');
  const generalPattern = new RegExp(`\\b(?:${general.join('|')})\\.`, 'gi');

  const guarded = text
    // Dotted filenames (package.json, auth.ts)
    .replace(/[\w.-]+\.[a-z]{1,4}(?=[\s,;:)\]|]|$)/gi, shield)
    // Version numbers (v0.9.2, 1.2.3)
    .replace(/\bv?\d+\.\d+(?:\.\d+)*\b/g, shield)
    // URLs — the match must end on a non-punctuation character so that a
    // sentence-final "." after the URL is still available as a boundary.
    .replace(/https?:\/\/[^\s)]*[^\s).,;:!?]/g, shield)
    // Abbreviations
    .replace(titlePattern, shield)
    .replace(generalPattern, shield)
    .replace(/\b(?:e\.g|i\.e)\./gi, shield)
    // Node.js, Vue.js, etc.
    .replace(/\b\w+\.js\b/gi, shield)
    // Decimal numbers (3.14)
    .replace(/(\d)\.(\d)/g, `$1${DOT}$2`);

  return guarded
    .split(/[.!?]+(?:\s+|$)/)
    .map((part) => part.replace(/\uE000/g, '.').trim())
    .filter((part) => part.length >= 10 && part.split(/\s+/).length >= 3);
}
|
|
142
209
|
|
|
143
210
|
/**
|
|
144
|
-
* Count syllables
|
|
145
|
-
*
|
|
146
|
-
*
|
|
147
|
-
*
|
|
148
|
-
*
|
|
149
|
-
* 4. Minimum 1 syllable per word
|
|
211
|
+
* Count syllables with technical vocabulary normalization.
|
|
212
|
+
*
|
|
213
|
+
* Technical terms (DynamoDB, WebSocket, middleware) are normalized to
|
|
214
|
+
* 2 syllables. The target audience knows these terms — they don't make
|
|
215
|
+
* the text harder to read.
|
|
150
216
|
*/
|
|
151
217
|
function countSyllables(word) {
|
|
152
218
|
word = word.toLowerCase().replace(/[^a-z]/g, '');
|
|
153
219
|
if (word.length <= 2) return 1;
|
|
154
220
|
|
|
155
|
-
//
|
|
221
|
+
// Technical vocabulary → 2 syllables (known terms)
|
|
222
|
+
if (TECH_VOCAB.has(word)) return 2;
|
|
223
|
+
|
|
156
224
|
const exceptions = {
|
|
157
225
|
'the': 1, 'are': 1, 'were': 1, 'have': 1, 'there': 1,
|
|
158
226
|
'where': 1, 'here': 1, 'every': 3, 'everything': 4,
|
|
@@ -164,17 +232,16 @@ function countSyllables(word) {
|
|
|
164
232
|
const vowelGroups = word.match(/[aeiouy]+/g);
|
|
165
233
|
let count = vowelGroups ? vowelGroups.length : 1;
|
|
166
234
|
|
|
167
|
-
// Subtract silent-e at end (but not
|
|
235
|
+
// Subtract silent-e at end (but not -le, -ce, -ge)
|
|
168
236
|
if (word.endsWith('e') && !word.endsWith('le') && !word.endsWith('ce') && !word.endsWith('ge')) {
|
|
169
237
|
count--;
|
|
170
238
|
}
|
|
171
239
|
|
|
172
|
-
// Subtract for common
|
|
240
|
+
// Subtract for common past-tense endings
|
|
173
241
|
if (word.endsWith('ed') && !word.endsWith('ted') && !word.endsWith('ded')) {
|
|
174
242
|
count--;
|
|
175
243
|
}
|
|
176
244
|
|
|
177
|
-
// Ensure minimum 1 syllable
|
|
178
245
|
return Math.max(1, count);
|
|
179
246
|
}
|
|
180
247
|
|
|
@@ -202,7 +269,6 @@ function tokenizeWords(text) {
|
|
|
202
269
|
function measurePassiveVoice(sentences) {
|
|
203
270
|
if (sentences.length === 0) return { ratio: 0, count: 0, total: 0 };
|
|
204
271
|
|
|
205
|
-
// Passive voice pattern: be-verb followed by past participle
|
|
206
272
|
const passivePattern = /\b(is|was|were|been|being|are|be|am)\s+([\w]+\s+)?([\w]*(?:ed|en|wn|lt|nt|pt|ft|zed))\b/i;
|
|
207
273
|
|
|
208
274
|
let passiveCount = 0;
|
|
@@ -221,11 +287,6 @@ function measurePassiveVoice(sentences) {
|
|
|
221
287
|
|
|
222
288
|
/**
|
|
223
289
|
* Ambiguous Pronoun Ratio (Structure, 3.0% weight in Understanding)
|
|
224
|
-
*
|
|
225
|
-
* Counts pronouns that lack clear antecedents: it, this, that, they, them, these, those.
|
|
226
|
-
* In technical documentation, these often create confusion about what exactly is referenced.
|
|
227
|
-
*
|
|
228
|
-
* Returns ratio of ambiguous pronouns to total word count.
|
|
229
290
|
*/
|
|
230
291
|
function measureAmbiguousPronouns(words) {
|
|
231
292
|
if (words.length === 0) return { ratio: 0, count: 0, total: 0 };
|
|
@@ -250,30 +311,19 @@ function measureAmbiguousPronouns(words) {
|
|
|
250
311
|
}
|
|
251
312
|
|
|
252
313
|
/**
|
|
253
|
-
* Atomicity Score (Structure, 9.0% weight in Understanding
|
|
254
|
-
*
|
|
255
|
-
* Measures how "atomic" (single-purpose) sentences are.
|
|
256
|
-
* Compound sentences with and/or/also/additionally indicate non-atomic requirements.
|
|
257
|
-
* IEEE 830 §4.1 recommends atomic requirements that can be independently verified.
|
|
258
|
-
*
|
|
259
|
-
* Returns ratio of NON-atomic sentences (compound) to total sentences.
|
|
314
|
+
* Atomicity Score (Structure, 9.0% weight — HIGHEST in Understanding)
|
|
260
315
|
*/
|
|
261
316
|
function measureAtomicity(sentences) {
|
|
262
317
|
if (sentences.length === 0) return { ratio: 0, count: 0, total: 0 };
|
|
263
318
|
|
|
264
|
-
// Compound indicators (sentence-level conjunctions, not word-level)
|
|
265
|
-
// We match these only when preceded/followed by spaces to avoid matching within words
|
|
266
319
|
const compoundPattern = /\b(and also|and then|as well as|in addition to|additionally|furthermore|moreover)\b/i;
|
|
267
|
-
// Simple "and" / "or" — only flag if >1 occurrence in a sentence (natural language has legitimate single "and")
|
|
268
320
|
const simpleCompound = /\band\b/gi;
|
|
269
|
-
const simpleOr = /\bor\b/gi;
|
|
270
321
|
|
|
271
322
|
let compoundCount = 0;
|
|
272
323
|
for (const sentence of sentences) {
|
|
273
324
|
if (compoundPattern.test(sentence)) {
|
|
274
325
|
compoundCount++;
|
|
275
326
|
} else {
|
|
276
|
-
// Count simple "and" — 2+ indicates compound
|
|
277
327
|
const andMatches = sentence.match(simpleCompound);
|
|
278
328
|
if (andMatches && andMatches.length >= 2) {
|
|
279
329
|
compoundCount++;
|
|
@@ -289,16 +339,8 @@ function measureAtomicity(sentences) {
|
|
|
289
339
|
}
|
|
290
340
|
|
|
291
341
|
/**
|
|
292
|
-
* Flesch Reading Ease (Readability
|
|
293
|
-
*
|
|
294
|
-
* Formula: 206.835 - 1.015 * (total words / total sentences) - 84.6 * (total syllables / total words)
|
|
295
|
-
* Source: Flesch, R. (1948). "A new readability yardstick." Journal of Applied Psychology.
|
|
296
|
-
*
|
|
297
|
-
* Scale: 0-100, higher = easier to read.
|
|
298
|
-
* 90-100: Very Easy (5th grade)
|
|
299
|
-
* 60-69: Standard (8th-9th grade)
|
|
300
|
-
* 30-49: Difficult (college level)
|
|
301
|
-
* 0-29: Very Confusing (graduate level)
|
|
342
|
+
* Flesch Reading Ease (Readability)
|
|
343
|
+
* Formula: 206.835 - 1.015 * (words/sentences) - 84.6 * (syllables/words)
|
|
302
344
|
*/
|
|
303
345
|
function measureFleschReadingEase(words, sentences) {
|
|
304
346
|
if (words.length === 0 || sentences.length === 0) return 0;
|
|
@@ -312,12 +354,8 @@ function measureFleschReadingEase(words, sentences) {
|
|
|
312
354
|
}
|
|
313
355
|
|
|
314
356
|
/**
|
|
315
|
-
* Flesch-Kincaid Grade Level (Readability
|
|
316
|
-
*
|
|
317
|
-
* Formula: 0.39 * (total words / total sentences) + 11.8 * (total syllables / total words) - 15.59
|
|
318
|
-
* Source: Kincaid, J.P. et al. (1975). "Derivation of new readability formulas."
|
|
319
|
-
*
|
|
320
|
-
* Returns US grade level (8 = 8th grade, 12 = high school senior, 16+ = graduate)
|
|
357
|
+
* Flesch-Kincaid Grade Level (Readability)
|
|
358
|
+
* Formula: 0.39 * (words/sentences) + 11.8 * (syllables/words) - 15.59
|
|
321
359
|
*/
|
|
322
360
|
function measureFleschKincaidGrade(words, sentences) {
|
|
323
361
|
if (words.length === 0 || sentences.length === 0) return 0;
|
|
@@ -331,10 +369,7 @@ function measureFleschKincaidGrade(words, sentences) {
|
|
|
331
369
|
}
|
|
332
370
|
|
|
333
371
|
/**
|
|
334
|
-
* Sentence Length (Cognitive
|
|
335
|
-
*
|
|
336
|
-
* Average words per sentence. Cognitive load research (Sweller, 1988) shows that
|
|
337
|
-
* sentences over 25 words significantly increase processing effort.
|
|
372
|
+
* Sentence Length (Cognitive)
|
|
338
373
|
*/
|
|
339
374
|
function measureSentenceLength(words, sentences) {
|
|
340
375
|
if (sentences.length === 0) return 0;
|
|
@@ -342,11 +377,7 @@ function measureSentenceLength(words, sentences) {
|
|
|
342
377
|
}
|
|
343
378
|
|
|
344
379
|
/**
|
|
345
|
-
* Negation Load (Cognitive
|
|
346
|
-
*
|
|
347
|
-
* Ratio of sentences containing negation words.
|
|
348
|
-
* Negation increases cognitive load because readers must mentally invert meaning.
|
|
349
|
-
* IEEE 830 §4.3 recommends positive phrasing in requirements.
|
|
380
|
+
* Negation Load (Cognitive)
|
|
350
381
|
*/
|
|
351
382
|
function measureNegationLoad(sentences) {
|
|
352
383
|
if (sentences.length === 0) return { ratio: 0, count: 0, total: 0 };
|
|
@@ -368,10 +399,7 @@ function measureNegationLoad(sentences) {
|
|
|
368
399
|
}
|
|
369
400
|
|
|
370
401
|
/**
|
|
371
|
-
* Conditional Load (Cognitive
|
|
372
|
-
*
|
|
373
|
-
* Ratio of sentences containing conditional keywords.
|
|
374
|
-
* Excessive conditionals make documentation hard to follow and test.
|
|
402
|
+
* Conditional Load (Cognitive)
|
|
375
403
|
*/
|
|
376
404
|
function measureConditionalLoad(sentences) {
|
|
377
405
|
if (sentences.length === 0) return { ratio: 0, count: 0, total: 0 };
|
|
@@ -400,6 +428,7 @@ function getReadabilityLabel(score) {
|
|
|
400
428
|
if (score >= 60) return 'Standard';
|
|
401
429
|
if (score >= 50) return 'Fairly Difficult';
|
|
402
430
|
if (score >= 30) return 'Difficult';
|
|
431
|
+
if (score >= 15) return 'Hard — Technical';
|
|
403
432
|
return 'Very Confusing';
|
|
404
433
|
}
|
|
405
434
|
|
|
@@ -416,11 +445,9 @@ function getGradeLabel(grade) {
|
|
|
416
445
|
|
|
417
446
|
/**
|
|
418
447
|
* Check if the `understanding` CLI is available on the system.
|
|
419
|
-
* Returns the path to the executable or null.
|
|
420
448
|
*/
|
|
421
449
|
function findUnderstandingCli() {
|
|
422
450
|
try {
|
|
423
|
-
// Use 'which' on Unix/Mac, 'where' on Windows — never redirect to NUL (creates file on Mac)
|
|
424
451
|
const cmd = process.platform === 'win32' ? 'where understanding' : 'which understanding';
|
|
425
452
|
const result = execSync(`${cmd} 2>/dev/null`, {
|
|
426
453
|
encoding: 'utf-8',
|
|
@@ -434,7 +461,6 @@ function findUnderstandingCli() {
|
|
|
434
461
|
|
|
435
462
|
/**
|
|
436
463
|
* Run the `understanding` CLI on a file and parse results.
|
|
437
|
-
* Returns understanding's quality score or null if it fails.
|
|
438
464
|
*/
|
|
439
465
|
function runUnderstandingDeepScan(filePath) {
|
|
440
466
|
try {
|
|
@@ -484,20 +510,22 @@ function getCanonicalDocs(projectDir) {
|
|
|
484
510
|
|
|
485
511
|
/**
|
|
486
512
|
* Analyze a single document and return per-metric results.
|
|
513
|
+
*
|
|
514
|
+
* Uses extractProse() instead of stripMarkdown() — only actual prose
|
|
515
|
+
* paragraphs are scored. Documents that are mostly tables/code/reference
|
|
516
|
+
* material are skipped for readability (they'd score 0/100 unfairly).
|
|
487
517
|
*/
|
|
488
518
|
function analyzeDocument(doc) {
|
|
489
519
|
const content = readFileSync(doc.path, 'utf-8');
|
|
490
|
-
const
|
|
491
|
-
|
|
492
|
-
if (plainText.length < 50) {
|
|
493
|
-
return { skipped: true, reason: 'too short', name: doc.name };
|
|
494
|
-
}
|
|
520
|
+
const proseText = extractProse(content);
|
|
495
521
|
|
|
496
|
-
const sentences = splitSentences(
|
|
497
|
-
const words = tokenizeWords(
|
|
522
|
+
const sentences = splitSentences(proseText);
|
|
523
|
+
const words = tokenizeWords(proseText);
|
|
498
524
|
|
|
499
|
-
|
|
500
|
-
|
|
525
|
+
// Skip if insufficient prose content
|
|
526
|
+
// Reference docs (mostly tables, code, lists) shouldn't be scored for readability
|
|
527
|
+
if (words.length < MIN_PROSE_WORDS || sentences.length < 3) {
|
|
528
|
+
return { skipped: true, reason: 'insufficient prose (reference document)', name: doc.name };
|
|
501
529
|
}
|
|
502
530
|
|
|
503
531
|
const passive = measurePassiveVoice(sentences);
|
|
@@ -539,7 +567,6 @@ export function validateDocQuality(projectDir, config) {
|
|
|
539
567
|
|
|
540
568
|
const docs = getCanonicalDocs(projectDir);
|
|
541
569
|
if (docs.length === 0) {
|
|
542
|
-
// No docs to analyze — structure validator catches this
|
|
543
570
|
return results;
|
|
544
571
|
}
|
|
545
572
|
|
|
@@ -606,7 +633,7 @@ export function validateDocQuality(projectDir, config) {
|
|
|
606
633
|
} else {
|
|
607
634
|
results.warnings.push(
|
|
608
635
|
`${doc.name}: Reading level too high (grade ${m.fleschKincaidGrade} — ${getGradeLabel(m.fleschKincaidGrade)}). ` +
|
|
609
|
-
`Aim for grade
|
|
636
|
+
`Aim for grade 12-16 for technical docs`
|
|
610
637
|
);
|
|
611
638
|
}
|
|
612
639
|
|
|
@@ -617,7 +644,7 @@ export function validateDocQuality(projectDir, config) {
|
|
|
617
644
|
} else {
|
|
618
645
|
results.warnings.push(
|
|
619
646
|
`${doc.name}: Average sentence too long (${m.avgSentenceLength} words). ` +
|
|
620
|
-
`Target ≤
|
|
647
|
+
`Target ≤30 words per sentence for readability`
|
|
621
648
|
);
|
|
622
649
|
}
|
|
623
650
|
|
|
@@ -644,11 +671,5 @@ export function validateDocQuality(projectDir, config) {
|
|
|
644
671
|
}
|
|
645
672
|
}
|
|
646
673
|
|
|
647
|
-
// ── Optional: Understanding deep scan note ──
|
|
648
|
-
if (!understandingCli && docs.length > 0) {
|
|
649
|
-
// Don't add as warning — just a note in verbose mode
|
|
650
|
-
// Users who want full 31-metric scan can install understanding
|
|
651
|
-
}
|
|
652
|
-
|
|
653
674
|
return results;
|
|
654
675
|
}
|