docguard-cli 0.9.1 → 0.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/cli/commands/diagnose.mjs +1 -1
- package/cli/validators/doc-quality.mjs +186 -140
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -16,7 +16,7 @@
|
|
|
16
16
|
import { c } from '../shared.mjs';
|
|
17
17
|
import { runGuardInternal } from './guard.mjs';
|
|
18
18
|
import { runScoreInternal } from './score.mjs';
|
|
19
|
-
import { existsSync, readFileSync
|
|
19
|
+
import { existsSync, readFileSync } from 'node:fs';
|
|
20
20
|
import { resolve, dirname } from 'node:path';
|
|
21
21
|
import { fileURLToPath } from 'node:url';
|
|
22
22
|
import { execSync } from 'node:child_process';
|
|
@@ -11,6 +11,11 @@
|
|
|
11
11
|
* Readability: Flesch Reading Ease, Flesch-Kincaid Grade Level
|
|
12
12
|
* Cognitive: Sentence Length, Negation Load, Conditional Load
|
|
13
13
|
*
|
|
14
|
+
* v0.9.3 — Prose-Only Extraction Engine:
|
|
15
|
+
* Instead of stripping markdown and measuring residue (which treats table
|
|
16
|
+
* cells as "long sentences"), this version extracts ONLY actual prose
|
|
17
|
+
* paragraphs. Docs that are mostly tables/code skip readability scoring.
|
|
18
|
+
*
|
|
14
19
|
* Optional: If `understanding` CLI is installed, runs a full 31-metric deep scan.
|
|
15
20
|
*
|
|
16
21
|
* Zero dependencies — pure Node.js built-ins only.
|
|
@@ -25,111 +30,197 @@ import { execSync } from 'node:child_process';
|
|
|
25
30
|
// Values are based on IEEE 830 best practices and readability research.
|
|
26
31
|
|
|
27
32
|
const THRESHOLDS = {
|
|
28
|
-
passiveVoiceRatio: { warn: 0.
|
|
33
|
+
passiveVoiceRatio: { warn: 0.25, label: 'Passive voice ratio' }, // >25% passive = warn
|
|
29
34
|
ambiguousPronounRatio: { warn: 0.15, label: 'Ambiguous pronoun ratio' }, // >15% ambiguous pronouns = warn
|
|
30
|
-
atomicityScore: { warn: 0.
|
|
31
|
-
fleschReadingEase: { warn:
|
|
32
|
-
fleschKincaidGrade: { warn:
|
|
33
|
-
avgSentenceLength: { warn:
|
|
34
|
-
negationLoad: { warn: 0.
|
|
35
|
+
atomicityScore: { warn: 0.35, label: 'Non-atomic sentence ratio' }, // >35% compound sentences = warn
|
|
36
|
+
fleschReadingEase: { warn: 15, label: 'Flesch reading ease' }, // <15 = truly unreadable prose
|
|
37
|
+
fleschKincaidGrade: { warn: 18, label: 'Flesch-Kincaid grade' }, // >18 = graduate level+
|
|
38
|
+
avgSentenceLength: { warn: 30, label: 'Avg sentence length' }, // >30 words = too long
|
|
39
|
+
negationLoad: { warn: 0.20, label: 'Negation load' }, // >20% sentences with negation = warn
|
|
35
40
|
conditionalLoad: { warn: 0.30, label: 'Conditional load' }, // >30% sentences conditional = warn
|
|
36
41
|
};
|
|
37
42
|
|
|
38
|
-
//
|
|
43
|
+
// Minimum prose words required for readability scoring.
|
|
44
|
+
// Docs with less than this are reference docs (tables, code) — skip readability.
|
|
45
|
+
const MIN_PROSE_WORDS = 50;
|
|
46
|
+
|
|
47
|
+
// ──── Technical Vocabulary ────
|
|
48
|
+
// Terms the target audience knows. Treated as 2-syllable words for Flesch scoring
|
|
49
|
+
// so they don't artificially inflate difficulty.
|
|
50
|
+
|
|
51
|
+
// Entries are lowercase ASCII letters only: countSyllables() lowercases each
// word and strips every non-letter before looking it up in this set.
// Each term appears exactly once ('middleware' was previously listed twice).
const TECH_VOCAB = new Set([
  // Infrastructure & databases
  'dynamodb', 'redis', 'postgres', 'postgresql', 'mongodb', 'mysql', 'sqlite',
  'kubernetes', 'docker', 'dockerfile', 'nginx', 'apache', 'cloudfront',
  'cloudwatch', 'elasticsearch', 'opensearch', 'terraform', 'ansible',
  'memcached', 'cassandra', 'rabbitmq', 'kafka',
  // Frameworks & languages
  'typescript', 'javascript', 'python', 'fastify', 'express', 'nextjs',
  'webpack', 'vite', 'vitest', 'playwright', 'cypress', 'mocha',
  'nestjs', 'angular', 'svelte', 'nuxtjs', 'gatsby', 'remix',
  // Protocols & patterns
  'websocket', 'websockets', 'middleware', 'microservice', 'microservices',
  'graphql', 'restful', 'oauth', 'openapi', 'webhook', 'webhooks',
  'grpc', 'protobuf', 'pubsub',
  // AWS services
  'lambda', 'cognito', 'amplify', 'apprunner', 'cloudformation',
  'apigateway', 'secretsmanager', 'parameterstore', 'eventbridge',
  'fargate', 'elasticache', 'sagemaker',
  // Common developer terms
  'namespace', 'endpoint', 'endpoints', 'timestamp', 'timestamps',
  'boolean', 'callback', 'callbacks', 'codebase', 'monorepo',
  'frontend', 'backend', 'fullstack', 'changelog', 'localhost',
  'hostname', 'username', 'eslint', 'prettier', 'rollup',
  'authentication', 'authorization', 'infrastructure', 'serialization',
  'deserialization', 'polymorphism', 'abstraction',
]);
|
|
77
|
+
|
|
78
|
+
// ──── Prose Extraction Engine ────
|
|
39
79
|
|
|
40
80
|
/**
|
|
41
|
-
*
|
|
42
|
-
*
|
|
43
|
-
*
|
|
81
|
+
* Extract only prose paragraphs from markdown content.
|
|
82
|
+
*
|
|
83
|
+
* Instead of stripping markdown and measuring residue (where table cells
|
|
84
|
+
* become "146-word sentences"), this identifies actual prose — blocks of
|
|
85
|
+
* text that form readable sentences — and returns only those.
|
|
86
|
+
*
|
|
87
|
+
* A line qualifies as prose if it:
|
|
88
|
+
* - Is not inside a code block / HTML comment
|
|
89
|
+
* - Is not a table row, header, horizontal rule, or metadata
|
|
90
|
+
* - Has ≥55% alphabetic characters (filters out paths/URLs/symbol-heavy lines)
|
|
91
|
+
* - Has ≥5 words (fragments aren't prose)
|
|
44
92
|
*/
|
|
45
|
-
function
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
// Remove horizontal rules
|
|
62
|
-
text = text.replace(/^[-*_]{3,}\s*$/gm, '');
|
|
63
|
-
|
|
64
|
-
// Remove images: 
|
|
65
|
-
text = text.replace(/!\[.*?\]\(.*?\)/g, '');
|
|
66
|
-
|
|
67
|
-
// Remove links, keep link text: [text](url) → text
|
|
68
|
-
text = text.replace(/\[([^\]]*)\]\([^)]*\)/g, '$1');
|
|
69
|
-
|
|
70
|
-
// Remove inline code
|
|
71
|
-
text = text.replace(/`[^`]+`/g, '');
|
|
72
|
-
|
|
73
|
-
// Remove header markers (# ## ### etc.)
|
|
74
|
-
text = text.replace(/^#{1,6}\s+/gm, '');
|
|
75
|
-
|
|
76
|
-
// Remove list markers (-, *, 1.)
|
|
77
|
-
text = text.replace(/^\s*[-*+]\s+/gm, '');
|
|
78
|
-
text = text.replace(/^\s*\d+\.\s+/gm, '');
|
|
79
|
-
|
|
80
|
-
// Remove bold/italic markers
|
|
81
|
-
text = text.replace(/\*{1,3}([^*]+)\*{1,3}/g, '$1');
|
|
82
|
-
text = text.replace(/_{1,3}([^_]+)_{1,3}/g, '$1');
|
|
93
|
+
function extractProse(content) {
  // Purpose: return only the prose lines of a markdown document.
  //
  // @param {string} content  Raw markdown text.
  // @returns {string}        Cleaned prose lines joined with '\n'
  //                          ('' when no line qualifies).
  //
  // A line is kept when it is outside fenced code blocks and multi-line HTML
  // comments, is not a table row / header / image / rule / HTML line, and,
  // after markdown formatting is removed, has at least 15 characters,
  // at least 55% ASCII letters and at least 5 whitespace-separated words.
  const proseLines = [];
  let inCodeBlock = false;
  let inHtmlComment = false;

  for (const rawLine of content.split('\n')) {
    const line = rawLine.trim();

    // A line starting with three or more backticks toggles the fenced-code state.
    if (/^`{3,}/.test(line)) {
      inCodeBlock = !inCodeBlock;
      continue;
    }
    if (inCodeBlock) continue;

    // A comment opened but not closed on this line starts a multi-line comment.
    if (line.includes('<!--') && !line.includes('-->')) {
      inHtmlComment = true;
      continue;
    }
    if (inHtmlComment) {
      if (line.includes('-->')) inHtmlComment = false;
      continue;
    }

    // Skip non-prose line types
    if (line.startsWith('|')) continue;              // Table rows
    if (line.startsWith('#')) continue;              // Headers
    if (line.startsWith('!')) continue;              // Images
    if (/^[-*_]{3,}\s*$/.test(line)) continue;       // Horizontal rules
    if (/^[|:\-\s]+$/.test(line)) continue;          // Table separators
    if (/^<!--.*-->$/.test(line)) continue;          // Inline HTML comments
    if (/^<[^>]+>/.test(line)) continue;             // HTML tags
    if (/^---\s*$/.test(line)) continue;             // YAML frontmatter
    if (line.length === 0) continue;                 // Empty lines

    // Clean the line. Order matters:
    //  1. inline code first, so link/image syntax inside a code span is
    //     dropped with the code instead of being rewritten;
    //  2. images before links, otherwise the link pattern consumes the
    //     "[alt](url)" part of "![alt](url)" and leaves "!alt" behind;
    //  3. list markers before emphasis, otherwise a "* " bullet is paired
    //     with the first emphasis asterisk and a stray "*" survives.
    let cleaned = line;
    cleaned = cleaned.replace(/`[^`]+`/g, '');                    // Remove inline code
    cleaned = cleaned.replace(/!\[.*?\]\(.*?\)/g, '');            // Remove images
    cleaned = cleaned.replace(/\[([^\]]*)\]\([^)]*\)/g, '$1');    // Links → text only
    cleaned = cleaned.replace(/^[-*+]\s+/, '');                   // List markers
    cleaned = cleaned.replace(/^\d+\.\s+/, '');                   // Numbered list markers
    cleaned = cleaned.replace(/\*{1,3}([^*]+)\*{1,3}/g, '$1');    // Bold/italic → text
    cleaned = cleaned.replace(/_{1,3}([^_]+)_{1,3}/g, '$1');      // Underscore emphasis → text
    cleaned = cleaned.trim();

    // Very short remnants are fragments, not prose.
    if (cleaned.length < 15) continue;

    // Prose heuristic: share of ASCII letters and number of words.
    const alphaCount = (cleaned.match(/[a-zA-Z]/g) || []).length;
    const alphaRatio = alphaCount / cleaned.length;
    const wordCount = cleaned.split(/\s+/).length;

    // A prose line needs ≥55% letters and ≥5 words
    if (alphaRatio >= 0.55 && wordCount >= 5) {
      proseLines.push(cleaned);
    }
  }

  return proseLines.join('\n');
}
|
|
92
156
|
|
|
93
157
|
/**
|
|
94
|
-
* Split text into sentences
|
|
95
|
-
*
|
|
158
|
+
* Split text into sentences with markdown-aware boundary detection.
|
|
159
|
+
*
|
|
160
|
+
* Protects against false splits from:
|
|
161
|
+
* - File paths (src/services/auth.ts → the dot isn't a sentence boundary)
|
|
162
|
+
* - Version numbers (v0.9.2, Node.js 18)
|
|
163
|
+
* - URLs (https://example.com)
|
|
164
|
+
* - Common abbreviations (e.g., i.e., etc., vs.)
|
|
165
|
+
* - Technical dotted names (package.json, .env.local)
|
|
96
166
|
*/
|
|
97
167
|
function splitSentences(text) {
  // Purpose: split prose into sentences without breaking on dots that belong
  // to file names, version numbers, URLs, abbreviations or decimals.
  //
  // @param {string} text  Prose text (may span several lines).
  // @returns {string[]}   Trimmed sentences without their terminating
  //                       punctuation; pieces shorter than 10 characters or
  //                       with fewer than 3 words are dropped.
  if (!text || text.trim().length === 0) return [];

  // Dots that must not end a sentence are temporarily swapped for a sentinel.
  // The sentinel is chosen so that it does not occur in the input; a fixed
  // visible character such as '≈' would be turned into '.' on restore and
  // corrupt documents that really contain it.
  let sentinel = '\uE000';
  while (text.includes(sentinel)) {
    sentinel = String.fromCharCode(sentinel.charCodeAt(0) + 1);
  }
  // Replaces only the dots of a match, so the original casing is preserved.
  const shieldDots = (match) => match.replace(/\./g, sentinel);

  let protected_ = text;

  // Protect dotted filenames (package.json, .env.local, auth.ts)
  protected_ = protected_.replace(/[\w.-]+\.[a-z]{1,4}(?=[\s,;:)\]|]|$)/gi, shieldDots);

  // Protect version numbers (v0.9.2, 1.2.3)
  protected_ = protected_.replace(/\bv?\d+\.\d+(?:\.\d+)*\b/g, shieldDots);

  // Protect URLs
  protected_ = protected_.replace(/https?:\/\/[^\s)]+/g, shieldDots);

  // Protect common abbreviations
  const abbreviations = ['Mr', 'Mrs', 'Ms', 'Dr', 'Prof', 'Sr', 'Jr', 'vs', 'etc', 'approx', 'incl'];
  for (const abbr of abbreviations) {
    const regex = new RegExp(`\\b${abbr}\\.`, 'gi');
    protected_ = protected_.replace(regex, shieldDots);
  }

  // Protect e.g. and i.e. specifically (have dots in the abbreviation itself)
  protected_ = protected_.replace(/\be\.g\./gi, shieldDots);
  protected_ = protected_.replace(/\bi\.e\./gi, shieldDots);

  // Protect Node.js, Vue.js, etc.
  protected_ = protected_.replace(/\b\w+\.js\b/gi, shieldDots);

  // Protect decimal numbers (3.14)
  protected_ = protected_.replace(/\d\.\d/g, shieldDots);

  // Split on sentence-ending punctuation followed by whitespace/newline/end
  const raw = protected_.split(/[.!?]+(?:\s+|\n|$)/);

  // Restore protected dots and filter empties/fragments
  return raw
    .map((piece) => piece.split(sentinel).join('.').trim())
    .filter((sentence) => {
      if (sentence.length < 10) return false;
      return sentence.split(/\s+/).length >= 3; // At least 3 words
    });
}
|
|
119
209
|
|
|
120
210
|
/**
|
|
121
|
-
* Count syllables
|
|
122
|
-
*
|
|
123
|
-
*
|
|
124
|
-
*
|
|
125
|
-
*
|
|
126
|
-
* 4. Minimum 1 syllable per word
|
|
211
|
+
* Count syllables with technical vocabulary normalization.
|
|
212
|
+
*
|
|
213
|
+
* Technical terms (DynamoDB, WebSocket, middleware) are normalized to
|
|
214
|
+
* 2 syllables. The target audience knows these terms — they don't make
|
|
215
|
+
* the text harder to read.
|
|
127
216
|
*/
|
|
128
217
|
function countSyllables(word) {
|
|
129
218
|
word = word.toLowerCase().replace(/[^a-z]/g, '');
|
|
130
219
|
if (word.length <= 2) return 1;
|
|
131
220
|
|
|
132
|
-
//
|
|
221
|
+
// Technical vocabulary → 2 syllables (known terms)
|
|
222
|
+
if (TECH_VOCAB.has(word)) return 2;
|
|
223
|
+
|
|
133
224
|
const exceptions = {
|
|
134
225
|
'the': 1, 'are': 1, 'were': 1, 'have': 1, 'there': 1,
|
|
135
226
|
'where': 1, 'here': 1, 'every': 3, 'everything': 4,
|
|
@@ -141,17 +232,16 @@ function countSyllables(word) {
|
|
|
141
232
|
const vowelGroups = word.match(/[aeiouy]+/g);
|
|
142
233
|
let count = vowelGroups ? vowelGroups.length : 1;
|
|
143
234
|
|
|
144
|
-
// Subtract silent-e at end (but not
|
|
235
|
+
// Subtract silent-e at end (but not -le, -ce, -ge)
|
|
145
236
|
if (word.endsWith('e') && !word.endsWith('le') && !word.endsWith('ce') && !word.endsWith('ge')) {
|
|
146
237
|
count--;
|
|
147
238
|
}
|
|
148
239
|
|
|
149
|
-
// Subtract for common
|
|
240
|
+
// Subtract for common past-tense endings
|
|
150
241
|
if (word.endsWith('ed') && !word.endsWith('ted') && !word.endsWith('ded')) {
|
|
151
242
|
count--;
|
|
152
243
|
}
|
|
153
244
|
|
|
154
|
-
// Ensure minimum 1 syllable
|
|
155
245
|
return Math.max(1, count);
|
|
156
246
|
}
|
|
157
247
|
|
|
@@ -179,7 +269,6 @@ function tokenizeWords(text) {
|
|
|
179
269
|
function measurePassiveVoice(sentences) {
|
|
180
270
|
if (sentences.length === 0) return { ratio: 0, count: 0, total: 0 };
|
|
181
271
|
|
|
182
|
-
// Passive voice pattern: be-verb followed by past participle
|
|
183
272
|
const passivePattern = /\b(is|was|were|been|being|are|be|am)\s+([\w]+\s+)?([\w]*(?:ed|en|wn|lt|nt|pt|ft|zed))\b/i;
|
|
184
273
|
|
|
185
274
|
let passiveCount = 0;
|
|
@@ -198,11 +287,6 @@ function measurePassiveVoice(sentences) {
|
|
|
198
287
|
|
|
199
288
|
/**
|
|
200
289
|
* Ambiguous Pronoun Ratio (Structure, 3.0% weight in Understanding)
|
|
201
|
-
*
|
|
202
|
-
* Counts pronouns that lack clear antecedents: it, this, that, they, them, these, those.
|
|
203
|
-
* In technical documentation, these often create confusion about what exactly is referenced.
|
|
204
|
-
*
|
|
205
|
-
* Returns ratio of ambiguous pronouns to total word count.
|
|
206
290
|
*/
|
|
207
291
|
function measureAmbiguousPronouns(words) {
|
|
208
292
|
if (words.length === 0) return { ratio: 0, count: 0, total: 0 };
|
|
@@ -227,30 +311,19 @@ function measureAmbiguousPronouns(words) {
|
|
|
227
311
|
}
|
|
228
312
|
|
|
229
313
|
/**
|
|
230
|
-
* Atomicity Score (Structure, 9.0% weight in Understanding
|
|
231
|
-
*
|
|
232
|
-
* Measures how "atomic" (single-purpose) sentences are.
|
|
233
|
-
* Compound sentences with and/or/also/additionally indicate non-atomic requirements.
|
|
234
|
-
* IEEE 830 §4.1 recommends atomic requirements that can be independently verified.
|
|
235
|
-
*
|
|
236
|
-
* Returns ratio of NON-atomic sentences (compound) to total sentences.
|
|
314
|
+
* Atomicity Score (Structure, 9.0% weight — HIGHEST in Understanding)
|
|
237
315
|
*/
|
|
238
316
|
function measureAtomicity(sentences) {
|
|
239
317
|
if (sentences.length === 0) return { ratio: 0, count: 0, total: 0 };
|
|
240
318
|
|
|
241
|
-
// Compound indicators (sentence-level conjunctions, not word-level)
|
|
242
|
-
// We match these only when preceded/followed by spaces to avoid matching within words
|
|
243
319
|
const compoundPattern = /\b(and also|and then|as well as|in addition to|additionally|furthermore|moreover)\b/i;
|
|
244
|
-
// Simple "and" / "or" — only flag if >1 occurrence in a sentence (natural language has legitimate single "and")
|
|
245
320
|
const simpleCompound = /\band\b/gi;
|
|
246
|
-
const simpleOr = /\bor\b/gi;
|
|
247
321
|
|
|
248
322
|
let compoundCount = 0;
|
|
249
323
|
for (const sentence of sentences) {
|
|
250
324
|
if (compoundPattern.test(sentence)) {
|
|
251
325
|
compoundCount++;
|
|
252
326
|
} else {
|
|
253
|
-
// Count simple "and" — 2+ indicates compound
|
|
254
327
|
const andMatches = sentence.match(simpleCompound);
|
|
255
328
|
if (andMatches && andMatches.length >= 2) {
|
|
256
329
|
compoundCount++;
|
|
@@ -266,16 +339,8 @@ function measureAtomicity(sentences) {
|
|
|
266
339
|
}
|
|
267
340
|
|
|
268
341
|
/**
|
|
269
|
-
* Flesch Reading Ease (Readability
|
|
270
|
-
*
|
|
271
|
-
* Formula: 206.835 - 1.015 * (total words / total sentences) - 84.6 * (total syllables / total words)
|
|
272
|
-
* Source: Flesch, R. (1948). "A new readability yardstick." Journal of Applied Psychology.
|
|
273
|
-
*
|
|
274
|
-
* Scale: 0-100, higher = easier to read.
|
|
275
|
-
* 90-100: Very Easy (5th grade)
|
|
276
|
-
* 60-69: Standard (8th-9th grade)
|
|
277
|
-
* 30-49: Difficult (college level)
|
|
278
|
-
* 0-29: Very Confusing (graduate level)
|
|
342
|
+
* Flesch Reading Ease (Readability)
|
|
343
|
+
* Formula: 206.835 - 1.015 * (words/sentences) - 84.6 * (syllables/words)
|
|
279
344
|
*/
|
|
280
345
|
function measureFleschReadingEase(words, sentences) {
|
|
281
346
|
if (words.length === 0 || sentences.length === 0) return 0;
|
|
@@ -289,12 +354,8 @@ function measureFleschReadingEase(words, sentences) {
|
|
|
289
354
|
}
|
|
290
355
|
|
|
291
356
|
/**
|
|
292
|
-
* Flesch-Kincaid Grade Level (Readability
|
|
293
|
-
*
|
|
294
|
-
* Formula: 0.39 * (total words / total sentences) + 11.8 * (total syllables / total words) - 15.59
|
|
295
|
-
* Source: Kincaid, J.P. et al. (1975). "Derivation of new readability formulas."
|
|
296
|
-
*
|
|
297
|
-
* Returns US grade level (8 = 8th grade, 12 = high school senior, 16+ = graduate)
|
|
357
|
+
* Flesch-Kincaid Grade Level (Readability)
|
|
358
|
+
* Formula: 0.39 * (words/sentences) + 11.8 * (syllables/words) - 15.59
|
|
298
359
|
*/
|
|
299
360
|
function measureFleschKincaidGrade(words, sentences) {
|
|
300
361
|
if (words.length === 0 || sentences.length === 0) return 0;
|
|
@@ -308,10 +369,7 @@ function measureFleschKincaidGrade(words, sentences) {
|
|
|
308
369
|
}
|
|
309
370
|
|
|
310
371
|
/**
|
|
311
|
-
* Sentence Length (Cognitive
|
|
312
|
-
*
|
|
313
|
-
* Average words per sentence. Cognitive load research (Sweller, 1988) shows that
|
|
314
|
-
* sentences over 25 words significantly increase processing effort.
|
|
372
|
+
* Sentence Length (Cognitive)
|
|
315
373
|
*/
|
|
316
374
|
function measureSentenceLength(words, sentences) {
|
|
317
375
|
if (sentences.length === 0) return 0;
|
|
@@ -319,11 +377,7 @@ function measureSentenceLength(words, sentences) {
|
|
|
319
377
|
}
|
|
320
378
|
|
|
321
379
|
/**
|
|
322
|
-
* Negation Load (Cognitive
|
|
323
|
-
*
|
|
324
|
-
* Ratio of sentences containing negation words.
|
|
325
|
-
* Negation increases cognitive load because readers must mentally invert meaning.
|
|
326
|
-
* IEEE 830 §4.3 recommends positive phrasing in requirements.
|
|
380
|
+
* Negation Load (Cognitive)
|
|
327
381
|
*/
|
|
328
382
|
function measureNegationLoad(sentences) {
|
|
329
383
|
if (sentences.length === 0) return { ratio: 0, count: 0, total: 0 };
|
|
@@ -345,10 +399,7 @@ function measureNegationLoad(sentences) {
|
|
|
345
399
|
}
|
|
346
400
|
|
|
347
401
|
/**
|
|
348
|
-
* Conditional Load (Cognitive
|
|
349
|
-
*
|
|
350
|
-
* Ratio of sentences containing conditional keywords.
|
|
351
|
-
* Excessive conditionals make documentation hard to follow and test.
|
|
402
|
+
* Conditional Load (Cognitive)
|
|
352
403
|
*/
|
|
353
404
|
function measureConditionalLoad(sentences) {
|
|
354
405
|
if (sentences.length === 0) return { ratio: 0, count: 0, total: 0 };
|
|
@@ -377,6 +428,7 @@ function getReadabilityLabel(score) {
|
|
|
377
428
|
if (score >= 60) return 'Standard';
|
|
378
429
|
if (score >= 50) return 'Fairly Difficult';
|
|
379
430
|
if (score >= 30) return 'Difficult';
|
|
431
|
+
if (score >= 15) return 'Hard — Technical';
|
|
380
432
|
return 'Very Confusing';
|
|
381
433
|
}
|
|
382
434
|
|
|
@@ -393,11 +445,11 @@ function getGradeLabel(grade) {
|
|
|
393
445
|
|
|
394
446
|
/**
|
|
395
447
|
* Check if the `understanding` CLI is available on the system.
|
|
396
|
-
* Returns the path to the executable or null.
|
|
397
448
|
*/
|
|
398
449
|
function findUnderstandingCli() {
|
|
399
450
|
try {
|
|
400
|
-
const
|
|
451
|
+
const cmd = process.platform === 'win32' ? 'where understanding' : 'which understanding';
|
|
452
|
+
const result = execSync(`${cmd} 2>/dev/null`, {
|
|
401
453
|
encoding: 'utf-8',
|
|
402
454
|
timeout: 3000,
|
|
403
455
|
}).trim();
|
|
@@ -409,7 +461,6 @@ function findUnderstandingCli() {
|
|
|
409
461
|
|
|
410
462
|
/**
|
|
411
463
|
* Run the `understanding` CLI on a file and parse results.
|
|
412
|
-
* Returns understanding's quality score or null if it fails.
|
|
413
464
|
*/
|
|
414
465
|
function runUnderstandingDeepScan(filePath) {
|
|
415
466
|
try {
|
|
@@ -459,20 +510,22 @@ function getCanonicalDocs(projectDir) {
|
|
|
459
510
|
|
|
460
511
|
/**
|
|
461
512
|
* Analyze a single document and return per-metric results.
|
|
513
|
+
*
|
|
514
|
+
* Uses extractProse() instead of stripMarkdown() — only actual prose
|
|
515
|
+
* paragraphs are scored. Documents that are mostly tables/code/reference
|
|
516
|
+
* material are skipped for readability (they'd score 0/100 unfairly).
|
|
462
517
|
*/
|
|
463
518
|
function analyzeDocument(doc) {
|
|
464
519
|
const content = readFileSync(doc.path, 'utf-8');
|
|
465
|
-
const
|
|
466
|
-
|
|
467
|
-
if (plainText.length < 50) {
|
|
468
|
-
return { skipped: true, reason: 'too short', name: doc.name };
|
|
469
|
-
}
|
|
520
|
+
const proseText = extractProse(content);
|
|
470
521
|
|
|
471
|
-
const sentences = splitSentences(
|
|
472
|
-
const words = tokenizeWords(
|
|
522
|
+
const sentences = splitSentences(proseText);
|
|
523
|
+
const words = tokenizeWords(proseText);
|
|
473
524
|
|
|
474
|
-
|
|
475
|
-
|
|
525
|
+
// Skip if insufficient prose content
|
|
526
|
+
// Reference docs (mostly tables, code, lists) shouldn't be scored for readability
|
|
527
|
+
if (words.length < MIN_PROSE_WORDS || sentences.length < 3) {
|
|
528
|
+
return { skipped: true, reason: 'insufficient prose (reference document)', name: doc.name };
|
|
476
529
|
}
|
|
477
530
|
|
|
478
531
|
const passive = measurePassiveVoice(sentences);
|
|
@@ -514,7 +567,6 @@ export function validateDocQuality(projectDir, config) {
|
|
|
514
567
|
|
|
515
568
|
const docs = getCanonicalDocs(projectDir);
|
|
516
569
|
if (docs.length === 0) {
|
|
517
|
-
// No docs to analyze — structure validator catches this
|
|
518
570
|
return results;
|
|
519
571
|
}
|
|
520
572
|
|
|
@@ -581,7 +633,7 @@ export function validateDocQuality(projectDir, config) {
|
|
|
581
633
|
} else {
|
|
582
634
|
results.warnings.push(
|
|
583
635
|
`${doc.name}: Reading level too high (grade ${m.fleschKincaidGrade} — ${getGradeLabel(m.fleschKincaidGrade)}). ` +
|
|
584
|
-
`Aim for grade
|
|
636
|
+
`Aim for grade 12-16 for technical docs`
|
|
585
637
|
);
|
|
586
638
|
}
|
|
587
639
|
|
|
@@ -592,7 +644,7 @@ export function validateDocQuality(projectDir, config) {
|
|
|
592
644
|
} else {
|
|
593
645
|
results.warnings.push(
|
|
594
646
|
`${doc.name}: Average sentence too long (${m.avgSentenceLength} words). ` +
|
|
595
|
-
`Target ≤
|
|
647
|
+
`Target ≤30 words per sentence for readability`
|
|
596
648
|
);
|
|
597
649
|
}
|
|
598
650
|
|
|
@@ -619,11 +671,5 @@ export function validateDocQuality(projectDir, config) {
|
|
|
619
671
|
}
|
|
620
672
|
}
|
|
621
673
|
|
|
622
|
-
// ── Optional: Understanding deep scan note ──
|
|
623
|
-
if (!understandingCli && docs.length > 0) {
|
|
624
|
-
// Don't add as warning — just a note in verbose mode
|
|
625
|
-
// Users who want full 31-metric scan can install understanding
|
|
626
|
-
}
|
|
627
|
-
|
|
628
674
|
return results;
|
|
629
675
|
}
|