docguard-cli 0.9.2 → 0.9.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -120,10 +120,10 @@ export function runDiagnose(projectDir, config, flags) {
120
120
  });
121
121
  } catch { /* init may partially succeed */ }
122
122
 
123
- // Run generate to fill in content
123
+ // Run generate to fill in MISSING content only (never --force, which would overwrite existing docs)
124
124
  try {
125
125
  const cliPath = resolve(dirname(fileURLToPath(import.meta.url)), '..', 'docguard.mjs');
126
- execSync(`node "${cliPath}" generate --dir "${projectDir}" --force`, {
126
+ execSync(`node "${cliPath}" generate --dir "${projectDir}"`, {
127
127
  encoding: 'utf-8',
128
128
  stdio: 'pipe',
129
129
  });
@@ -5,7 +5,7 @@
5
5
  * This is the "killer feature" — take any project and auto-generate CDD docs.
6
6
  */
7
7
 
8
- import { existsSync, readFileSync, writeFileSync, readdirSync, statSync, mkdirSync } from 'node:fs';
8
+ import { existsSync, readFileSync, writeFileSync, readdirSync, statSync, mkdirSync, copyFileSync } from 'node:fs';
9
9
  import { resolve, join, extname, basename, relative, dirname } from 'node:path';
10
10
  import { c } from '../shared.mjs';
11
11
  import { detectDocTools } from '../scanners/doc-tools.mjs';
@@ -18,6 +18,30 @@ const IGNORE_DIRS = new Set([
18
18
  '.amplify-hosting', '.serverless',
19
19
  ]);
20
20
 
21
/**
 * Copy an existing, non-empty file to a backup path before it is overwritten.
 *
 * The first backup goes to `<file>.bak`. When that path is already taken the
 * next free `<file>.bak.N` (N = 1, 2, …) is used, so an earlier backup —
 * typically the user's hand-written original — is never clobbered by a later
 * `--force` run. If the newest existing backup already holds the same content,
 * no duplicate is created.
 *
 * Best-effort by design: filesystem errors are swallowed so a failed backup
 * never aborts generation.
 *
 * @param {string} filePath - File that is about to be overwritten.
 * @returns {string|null} Path of the backup that holds the current content, or
 *   null when nothing was backed up (missing or blank file, or an I/O error).
 */
function backupFile(filePath) {
  if (!existsSync(filePath)) return null;

  try {
    const current = readFileSync(filePath, 'utf-8');
    // Whitespace-only files carry nothing worth preserving.
    if (current.trim().length === 0) return null;

    // Walk .bak, .bak.1, .bak.2, … until a free slot is found, remembering
    // the last occupied one so identical content is not backed up twice.
    let target = `${filePath}.bak`;
    let newest = null;
    for (let n = 1; existsSync(target); n += 1) {
      newest = target;
      target = `${filePath}.bak.${n}`;
    }

    if (newest !== null) {
      let alreadySaved = false;
      try {
        alreadySaved = readFileSync(newest, 'utf-8') === current;
      } catch {
        // An unreadable previous backup is treated as "different".
        alreadySaved = false;
      }
      if (alreadySaved) return newest;
    }

    copyFileSync(filePath, target);
    return target;
  } catch {
    // Backup failure is non-fatal.
    return null;
  }
}

/**
 * Safe write — backs up any existing file, then writes the new content.
 * Call this instead of raw writeFileSync when generating docs.
 *
 * @param {string} filePath - Destination path.
 * @param {string} content - Text to write.
 * @param {string} [encoding='utf-8'] - Encoding passed to writeFileSync. Several
 *   call sites pass 'utf-8' as a third argument; it is now honored rather than
 *   silently dropped.
 */
function safeWrite(filePath, content, encoding = 'utf-8') {
  backupFile(filePath);
  writeFileSync(filePath, content, encoding);
}

44
+
21
45
  const CODE_EXTENSIONS = new Set([
22
46
  '.js', '.mjs', '.cjs', '.ts', '.tsx', '.jsx',
23
47
  '.py', '.java', '.go', '.rs', '.rb', '.php', '.cs',
@@ -137,6 +161,21 @@ export function runGenerate(projectDir, config, flags) {
137
161
  mkdirSync(docsDir, { recursive: true });
138
162
  }
139
163
 
164
+ // ── Safety: warn if --force will overwrite existing files ──
165
+ if (flags.force) {
166
+ const targetFiles = [
167
+ 'docs-canonical/ARCHITECTURE.md', 'docs-canonical/API-REFERENCE.md',
168
+ 'docs-canonical/DATA-MODEL.md', 'docs-canonical/ENVIRONMENT.md',
169
+ 'docs-canonical/TEST-SPEC.md', 'docs-canonical/SECURITY.md',
170
+ 'AGENTS.md', 'CHANGELOG.md', 'DRIFT-LOG.md',
171
+ ];
172
+ const existing = targetFiles.filter(f => existsSync(resolve(projectDir, f)));
173
+ if (existing.length > 0) {
174
+ console.log(` ${c.yellow}⚠️ --force: ${existing.length} existing file(s) will be overwritten.${c.reset}`);
175
+ console.log(` ${c.dim} Backups saved as .bak files.${c.reset}\n`);
176
+ }
177
+ }
178
+
140
179
  let created = 0;
141
180
  let skipped = 0;
142
181
 
@@ -633,7 +672,7 @@ See \\\`docs-canonical/KNOWN-GOTCHAS.md\\\` for known issues.
633
672
  | 0.1.0 | ${new Date().toISOString().split('T')[0]} | DocGuard Generate | Auto-generated (arc42 + C4 aligned) |
634
673
  `;
635
674
 
636
- writeFileSync(path, appendStandardsCitation(content, 'ARCHITECTURE.md'), 'utf-8');
675
+ safeWrite(path, appendStandardsCitation(content, 'ARCHITECTURE.md'), 'utf-8');
637
676
  console.log(` ${c.green}✅ ARCHITECTURE.md${c.reset} (arc42 §1-§12, ${componentRows.length} components, ${Object.values(stack).filter(Boolean).length} tech)`);
638
677
  return true;
639
678
  }
@@ -730,7 +769,7 @@ ${resourceSections}
730
769
  | 0.1.0 | ${new Date().toISOString().split('T')[0]} | DocGuard Generate | Auto-generated (${deepRoutes.length} endpoints from ${deepRoutes[0]?.source || 'code'}) |
731
770
  `;
732
771
 
733
- writeFileSync(path, appendStandardsCitation(content, 'API-REFERENCE.md'), 'utf-8');
772
+ safeWrite(path, appendStandardsCitation(content, 'API-REFERENCE.md'), 'utf-8');
734
773
  console.log(` ${c.green}✅ API-REFERENCE.md${c.reset} (${deepRoutes.length} endpoints, ${Object.keys(groups).length} resources)`);
735
774
  return true;
736
775
  }
@@ -885,7 +924,7 @@ ${erDiagram}
885
924
  | 0.1.0 | ${new Date().toISOString().split('T')[0]} | DocGuard Generate | Auto-generated (${entities.length} entities, ${relationships.length} relationships from ${schemaSource}) |
886
925
  `;
887
926
 
888
- writeFileSync(path, appendStandardsCitation(content, 'DATA-MODEL.md'), 'utf-8');
927
+ safeWrite(path, appendStandardsCitation(content, 'DATA-MODEL.md'), 'utf-8');
889
928
  console.log(` ${c.green}✅ DATA-MODEL.md${c.reset} (${entities.length} entities, ${relationships.length} relationships from ${schemaSource})`);
890
929
  return true;
891
930
  }
@@ -948,7 +987,7 @@ ${envVarRows || '| <!-- No .env.example found --> | | | | |'}
948
987
  | 0.1.0 | ${new Date().toISOString().split('T')[0]} | DocGuard Generate | Auto-generated (${scan.envVars.length} env vars found) |
949
988
  `;
950
989
 
951
- writeFileSync(path, appendStandardsCitation(content, 'ENVIRONMENT.md'), 'utf-8');
990
+ safeWrite(path, appendStandardsCitation(content, 'ENVIRONMENT.md'), 'utf-8');
952
991
  console.log(` ${c.green}✅ ENVIRONMENT.md${c.reset} (${scan.envVars.length} env vars detected)`);
953
992
  return true;
954
993
  }
@@ -1033,7 +1072,7 @@ ${serviceRows || '| <!-- No services found --> | | | |'}
1033
1072
  | 0.1.0 | ${new Date().toISOString().split('T')[0]} | DocGuard Generate | Auto-generated (${scan.tests.length} test files, ${serviceMap.filter(s => s.status === '✅').length}/${serviceMap.length} mapped) |
1034
1073
  `;
1035
1074
 
1036
- writeFileSync(path, appendStandardsCitation(content, 'TEST-SPEC.md'), 'utf-8');
1075
+ safeWrite(path, appendStandardsCitation(content, 'TEST-SPEC.md'), 'utf-8');
1037
1076
  console.log(` ${c.green}✅ TEST-SPEC.md${c.reset} (${scan.tests.length} tests, ${serviceMap.filter(s => s.status === '✅').length}/${serviceMap.length} services mapped)`);
1038
1077
  return true;
1039
1078
  }
@@ -1099,7 +1138,7 @@ ${scan.envVars.filter(v => isSecretVar(v.name)).map(v =>
1099
1138
  | 0.1.0 | ${new Date().toISOString().split('T')[0]} | DocGuard Generate | Auto-generated |
1100
1139
  `;
1101
1140
 
1102
- writeFileSync(path, appendStandardsCitation(content, 'SECURITY.md'), 'utf-8');
1141
+ safeWrite(path, appendStandardsCitation(content, 'SECURITY.md'), 'utf-8');
1103
1142
  console.log(` ${c.green}✅ SECURITY.md${c.reset} (auth: ${stack.auth || 'not detected'})`);
1104
1143
  return true;
1105
1144
  }
@@ -1209,7 +1248,7 @@ npx docguard-cli generate # Generate docs from code
1209
1248
  - Test requirements in TEST-SPEC.md must be met
1210
1249
  - Documentation changes must pass \`docguard guard\`
1211
1250
  `;
1212
- writeFileSync(agentsPath, content, 'utf-8');
1251
+ safeWrite(agentsPath, content);
1213
1252
  console.log(` ${c.green}✅ AGENTS.md${c.reset} (AGENTS.md standard compliant)`);
1214
1253
  created++;
1215
1254
  } else {
@@ -1231,7 +1270,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
1231
1270
  ### Added
1232
1271
  - CDD documentation via DocGuard generate
1233
1272
  `;
1234
- writeFileSync(changelogPath, content, 'utf-8');
1273
+ safeWrite(changelogPath, content);
1235
1274
  console.log(` ${c.green}✅ CHANGELOG.md${c.reset}`);
1236
1275
  created++;
1237
1276
  } else {
@@ -1251,7 +1290,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
1251
1290
  |------|------|---------------|-------------------|----------|------------|
1252
1291
  | | | | | | |
1253
1292
  `;
1254
- writeFileSync(driftPath, content, 'utf-8');
1293
+ safeWrite(driftPath, content);
1255
1294
  console.log(` ${c.green}✅ DRIFT-LOG.md${c.reset}`);
1256
1295
  created++;
1257
1296
  } else {
@@ -11,6 +11,11 @@
11
11
  * Readability: Flesch Reading Ease, Flesch-Kincaid Grade Level
12
12
  * Cognitive: Sentence Length, Negation Load, Conditional Load
13
13
  *
14
+ * v0.9.3 — Prose-Only Extraction Engine:
15
+ * Instead of stripping markdown and measuring residue (which treats table
16
+ * cells as "long sentences"), this version extracts ONLY actual prose
17
+ * paragraphs. Docs that are mostly tables/code skip readability scoring.
18
+ *
14
19
  * Optional: If `understanding` CLI is installed, runs a full 31-metric deep scan.
15
20
  *
16
21
  * Zero dependencies — pure Node.js built-ins only.
@@ -25,134 +30,197 @@ import { execSync } from 'node:child_process';
25
30
  // Values are based on IEEE 830 best practices and readability research.
26
31
 
27
32
  const THRESHOLDS = {
28
- passiveVoiceRatio: { warn: 0.20, label: 'Passive voice ratio' }, // >20% passive = warn
33
+ passiveVoiceRatio: { warn: 0.25, label: 'Passive voice ratio' }, // >25% passive = warn
29
34
  ambiguousPronounRatio: { warn: 0.15, label: 'Ambiguous pronoun ratio' }, // >15% ambiguous pronouns = warn
30
- atomicityScore: { warn: 0.30, label: 'Non-atomic sentence ratio' }, // >30% compound sentences = warn
31
- fleschReadingEase: { warn: 20, label: 'Flesch reading ease' }, // <20 = very hard to read (lowered from 30 for technical markdown)
32
- fleschKincaidGrade: { warn: 16, label: 'Flesch-Kincaid grade' }, // >16 = graduate level+
33
- avgSentenceLength: { warn: 25, label: 'Avg sentence length' }, // >25 words = too long
34
- negationLoad: { warn: 0.15, label: 'Negation load' }, // >15% sentences with negation = warn
35
+ atomicityScore: { warn: 0.35, label: 'Non-atomic sentence ratio' }, // >35% compound sentences = warn
36
+ fleschReadingEase: { warn: 15, label: 'Flesch reading ease' }, // <15 = truly unreadable prose
37
+ fleschKincaidGrade: { warn: 18, label: 'Flesch-Kincaid grade' }, // >18 = graduate level+
38
+ avgSentenceLength: { warn: 30, label: 'Avg sentence length' }, // >30 words = too long
39
+ negationLoad: { warn: 0.20, label: 'Negation load' }, // >20% sentences with negation = warn
35
40
  conditionalLoad: { warn: 0.30, label: 'Conditional load' }, // >30% sentences conditional = warn
36
41
  };
37
42
 
38
- // ──── Text Processing Utilities ────
43
+ // Minimum prose words required for readability scoring.
44
+ // Docs with less than this are reference docs (tables, code) — skip readability.
45
+ const MIN_PROSE_WORDS = 50;
46
+
47
+ // ──── Technical Vocabulary ────
48
+ // Terms the target audience knows. Treated as 2-syllable words for Flesch scoring
49
+ // so they don't artificially inflate difficulty.
50
+
51
+ const TECH_VOCAB = new Set([
52
+ // Infrastructure & databases
53
+ 'dynamodb', 'redis', 'postgres', 'postgresql', 'mongodb', 'mysql', 'sqlite',
54
+ 'kubernetes', 'docker', 'dockerfile', 'nginx', 'apache', 'cloudfront',
55
+ 'cloudwatch', 'elasticsearch', 'opensearch', 'terraform', 'ansible',
56
+ 'memcached', 'cassandra', 'rabbitmq', 'kafka',
57
+ // Frameworks & languages
58
+ 'typescript', 'javascript', 'python', 'fastify', 'express', 'nextjs',
59
+ 'webpack', 'vite', 'vitest', 'playwright', 'cypress', 'mocha',
60
+ 'nestjs', 'angular', 'svelte', 'nuxtjs', 'gatsby', 'remix',
61
+ // Protocols & patterns
62
+ 'websocket', 'websockets', 'middleware', 'microservice', 'microservices',
63
+ 'graphql', 'restful', 'oauth', 'openapi', 'webhook', 'webhooks',
64
+ 'grpc', 'protobuf', 'pubsub',
65
+ // AWS services
66
+ 'lambda', 'cognito', 'amplify', 'apprunner', 'cloudformation',
67
+ 'apigateway', 'secretsmanager', 'parameterstore', 'eventbridge',
68
+ 'fargate', 'elasticache', 'sagemaker',
69
+ // Common developer terms
70
+ 'namespace', 'endpoint', 'endpoints', 'timestamp', 'timestamps',
71
+ 'boolean', 'callback', 'callbacks', 'codebase', 'monorepo',
72
+ 'frontend', 'backend', 'fullstack', 'changelog', 'localhost',
73
+ 'hostname', 'username', 'eslint', 'prettier', 'rollup',
74
+ 'authentication', 'authorization', 'infrastructure', 'serialization',
75
+ 'deserialization', 'middleware', 'polymorphism', 'abstraction',
76
+ ]);
77
+
78
+ // ──── Prose Extraction Engine ────
39
79
 
40
80
  /**
41
- * Strip markdown formatting to get plain prose text.
42
- * Removes: code blocks, inline code, headers, links, images, tables,
43
- * HTML comments, metadata blocks, horizontal rules, list markers.
81
+ * Extract only prose paragraphs from markdown content.
82
+ *
83
+ * Instead of stripping markdown and measuring residue (where table cells
84
+ * become "146-word sentences"), this identifies actual prose — blocks of
85
+ * text that form readable sentences — and returns only those.
86
+ *
87
+ * A line qualifies as prose if it:
88
+ * - Is not inside a code block / HTML comment
89
+ * - Is not a table row, header, horizontal rule, or metadata
90
+ * - Has ≥55% alphabetic characters (filters out paths/URLs/symbol-heavy lines)
91
+ * - Has ≥5 words (fragments aren't prose)
44
92
  */
45
- function stripMarkdown(content) {
46
- let text = content;
47
-
48
- // Remove fenced code blocks (```...```) and (````...````)
49
- text = text.replace(/````[\s\S]*?````/g, '');
50
- text = text.replace(/```[\s\S]*?```/g, '');
51
-
52
- // Remove mermaid diagrams
53
- text = text.replace(/```mermaid[\s\S]*?```/g, '');
54
-
55
- // Remove HTML comments (<!-- ... -->)
56
- text = text.replace(/<!--[\s\S]*?-->/g, '');
57
-
58
- // Remove HTML tags
59
- text = text.replace(/<[^>]+>/g, '');
60
-
61
- // Remove YAML frontmatter (---...---)
62
- text = text.replace(/^---[\s\S]*?---\n/m, '');
63
-
64
- // Remove table rows (lines starting with |) and table separators
65
- text = text.replace(/^\|.*$/gm, '');
66
- text = text.replace(/^[|:\-\s]+$/gm, '');
67
-
68
- // Remove horizontal rules
69
- text = text.replace(/^[-*_]{3,}\s*$/gm, '');
70
-
71
- // Remove badge images (shield.io etc.) — before generic image removal
72
- text = text.replace(/!\[.*?\]\(https?:\/\/[^)]+\)/g, '');
73
-
74
- // Remove images: ![alt](url)
75
- text = text.replace(/!\[.*?\]\(.*?\)/g, '');
76
-
77
- // Remove links, keep link text: [text](url) → text
78
- text = text.replace(/\[([^\]]*)\]\([^)]*\)/g, '$1');
79
-
80
- // Remove inline code
81
- text = text.replace(/`[^`]+`/g, '');
82
-
83
- // Remove header markers (# ## ### etc.)
84
- text = text.replace(/^#{1,6}\s+/gm, '');
85
-
86
- // Remove list markers (-, *, 1.)
87
- text = text.replace(/^\s*[-*+]\s+/gm, '');
88
- text = text.replace(/^\s*\d+\.\s+/gm, '');
89
-
90
- // Remove bold/italic markers
91
- text = text.replace(/\*{1,3}([^*]+)\*{1,3}/g, '$1');
92
- text = text.replace(/_{1,3}([^_]+)_{1,3}/g, '$1');
93
-
94
- // Remove definition-style lines (key: value or key | value)
95
- text = text.replace(/^\s*\w[\w\s]*\s*[:|]\s*.*$/gm, (match) => {
96
- // Only strip if it looks like a key-value pair, not a sentence
97
- if (match.includes('.') || match.split(/\s+/).length > 8) return match;
98
- return '';
99
- });
93
function extractProse(content) {
  // Returns the prose lines of a markdown document joined by '\n'.
  // Structural lines (code, tables, headers, rules, HTML, frontmatter) and
  // lines that do not look like sentences are dropped.
  const lines = content.split('\n');
  const proseLines = [];
  let openFence = null; // marker of the fenced code block we are inside, if any
  let inHtmlComment = false;

  // YAML frontmatter can only start on the very first line. Skip its whole
  // body (not just the `---` delimiters) so `key: long value` pairs are not
  // scored as sentences. Without a closing delimiter the leading `---` is
  // treated as an ordinary horizontal rule further down.
  let start = 0;
  if (/^---\s*$/.test(lines[0])) {
    const closing = lines.findIndex((l, i) => i > 0 && /^(?:---|\.\.\.)\s*$/.test(l));
    if (closing > 0) start = closing + 1;
  }

  for (let i = start; i < lines.length; i += 1) {
    const rawLine = lines[i].trim();

    // Fenced code blocks: a block is closed only by a bare fence of the same
    // character that is at least as long as the opener, so a ``` line nested
    // inside a ```` block does not end it.
    const fence = parseFenceMarker(rawLine);
    if (openFence !== null) {
      const closes = fence !== null
        && fence.info === ''
        && fence.char === openFence.char
        && fence.length >= openFence.length;
      if (closes) openFence = null;
      continue;
    }
    if (fence !== null) {
      openFence = fence;
      continue;
    }

    // Multi-line HTML comments
    if (inHtmlComment) {
      if (rawLine.includes('-->')) inHtmlComment = false;
      continue;
    }
    // Drop complete inline comments; an opener left over starts a multi-line one.
    const line = rawLine.replace(/<!--.*?-->/g, '').trim();
    if (line.includes('<!--')) {
      inHtmlComment = true;
      continue;
    }

    // Skip non-prose line types
    if (line.length === 0) continue;                  // Empty lines
    if (line.startsWith('|')) continue;               // Table rows
    if (line.startsWith('#')) continue;               // Headers
    if (line.startsWith('!')) continue;               // Images
    if (/^[-*_]{3,}\s*$/.test(line)) continue;        // Horizontal rules (incl. stray ---)
    if (/^[|:\-\s]+$/.test(line)) continue;           // Table separators
    if (/^<[^>]+>/.test(line)) continue;              // HTML tags

    // Clean the line: extract text from markdown formatting.
    let cleaned = line;
    // List markers go first so a `* ` bullet is not mistaken for emphasis.
    cleaned = cleaned.replace(/^[-*+]\s+/, '');                  // List markers
    cleaned = cleaned.replace(/^\d+\.\s+/, '');                  // Numbered list markers
    // Images must be removed before links: the link pattern also matches the
    // `[alt](url)` tail of an image and would leave `!alt` behind.
    cleaned = cleaned.replace(/!\[.*?\]\(.*?\)/g, '');           // Remove images
    cleaned = cleaned.replace(/\[([^\]]*)\]\([^)]*\)/g, '$1');   // Links → text only
    cleaned = cleaned.replace(/`[^`]+`/g, '');                   // Remove inline code
    cleaned = cleaned.replace(/\*{1,3}([^*]+)\*{1,3}/g, '$1');   // Bold/italic → text
    cleaned = cleaned.replace(/_{1,3}([^_]+)_{1,3}/g, '$1');     // Underscore emphasis
    cleaned = cleaned.trim();

    if (cleaned.length < 15) continue;

    // Prose heuristic: a prose line needs ≥55% letters and ≥5 words.
    const alphaCount = (cleaned.match(/[a-zA-Z]/g) || []).length;
    const alphaRatio = alphaCount / cleaned.length;
    const wordCount = cleaned.split(/\s+/).length;

    if (alphaRatio >= 0.55 && wordCount >= 5) {
      proseLines.push(cleaned);
    }
  }

  return proseLines.join('\n');
}

/**
 * Parse a trimmed line as a fenced-code-block marker (``` or ~~~, 3+ chars).
 * Returns { char, length, info } or null when the line is not a fence.
 */
function parseFenceMarker(line) {
  const match = /^(`{3,}|~{3,})(.*)$/.exec(line);
  if (match === null) return null;

  const marker = match[1];
  const info = match[2].trim();
  // A backtick fence cannot carry backticks in its info string; such a line
  // is inline code (```like this```), not a fence.
  if (marker[0] === '`' && info.includes('`')) return null;

  return { char: marker[0], length: marker.length, info };
}
115
156
 
116
157
  /**
117
- * Split text into sentences using common sentence-ending punctuation.
118
- * Handles abbreviations (Mr., Dr., etc.) and decimal numbers to avoid false splits.
158
+ * Split text into sentences with markdown-aware boundary detection.
159
+ *
160
+ * Protects against false splits from:
161
+ * - File paths (src/services/auth.ts → the dot isn't a sentence boundary)
162
+ * - Version numbers (v0.9.2, Node.js 18)
163
+ * - URLs (https://example.com)
164
+ * - Common abbreviations (e.g., i.e., etc., vs.)
165
+ * - Technical dotted names (package.json, .env.local)
119
166
  */
120
167
  function splitSentences(text) {
121
168
  if (!text || text.trim().length === 0) return [];
122
169
 
123
- // Protect common abbreviations from false sentence splits
124
170
  let protected_ = text;
125
- const abbreviations = ['Mr', 'Mrs', 'Ms', 'Dr', 'Prof', 'Sr', 'Jr', 'vs', 'etc', 'i.e', 'e.g', 'cf'];
171
+
172
+ // Protect dotted filenames (package.json, .env.local, auth.ts)
173
+ protected_ = protected_.replace(/[\w.-]+\.[a-z]{1,4}(?=[\s,;:)\]|]|$)/gi, (m) => m.replace(/\./g, '≈'));
174
+
175
+ // Protect version numbers (v0.9.2, 1.2.3)
176
+ protected_ = protected_.replace(/\bv?\d+\.\d+(?:\.\d+)*\b/g, (m) => m.replace(/\./g, '≈'));
177
+
178
+ // Protect URLs
179
+ protected_ = protected_.replace(/https?:\/\/[^\s)]+/g, (m) => m.replace(/\./g, '≈'));
180
+
181
+ // Protect common abbreviations
182
+ const abbreviations = ['Mr', 'Mrs', 'Ms', 'Dr', 'Prof', 'Sr', 'Jr', 'vs', 'etc', 'approx', 'incl'];
126
183
  for (const abbr of abbreviations) {
127
184
  const regex = new RegExp(`\\b${abbr}\\.`, 'gi');
128
- protected_ = protected_.replace(regex, `${abbr}≈`);
185
+ protected_ = protected_.replace(regex, (m) => m.replace(/\./g, '≈'));
129
186
  }
130
187
 
188
+ // Protect e.g. and i.e. specifically (have dots in the abbreviation itself)
189
+ protected_ = protected_.replace(/\be\.g\./gi, 'e≈g≈');
190
+ protected_ = protected_.replace(/\bi\.e\./gi, 'i≈e≈');
191
+
192
+ // Protect Node.js, Vue.js, etc.
193
+ protected_ = protected_.replace(/\b(\w+)\.js\b/gi, '$1≈js');
194
+
131
195
  // Protect decimal numbers (3.14)
132
196
  protected_ = protected_.replace(/(\d)\.(\d)/g, '$1≈$2');
133
197
 
134
- // Split on sentence-ending punctuation followed by space or end
135
- const raw = protected_.split(/[.!?]+(?:\s+|$)/);
198
+ // Split on sentence-ending punctuation followed by whitespace/newline/end
199
+ const raw = protected_.split(/[.!?]+(?:\s+|\n|$)/);
136
200
 
137
- // Restore protected characters and filter empties
201
+ // Restore protected characters and filter empties/fragments
138
202
  return raw
139
203
  .map(s => s.replace(/≈/g, '.').trim())
140
- .filter(s => s.length > 3); // Ignore fragments under 4 chars
204
+ .filter(s => {
205
+ if (s.length < 10) return false;
206
+ return s.split(/\s+/).length >= 3; // At least 3 words
207
+ });
141
208
  }
142
209
 
143
210
  /**
144
- * Count syllables in a word using a heuristic approach.
145
- * Based on the algorithm used in readability research:
146
- * 1. Count vowel groups
147
- * 2. Subtract silent-e at end
148
- * 3. Add back for specific suffixes (-le, -les, -tion, etc.)
149
- * 4. Minimum 1 syllable per word
211
+ * Count syllables with technical vocabulary normalization.
212
+ *
213
+ * Technical terms (DynamoDB, WebSocket, middleware) are normalized to
214
+ * 2 syllables. The target audience knows these terms — they don't make
215
+ * the text harder to read.
150
216
  */
151
217
  function countSyllables(word) {
152
218
  word = word.toLowerCase().replace(/[^a-z]/g, '');
153
219
  if (word.length <= 2) return 1;
154
220
 
155
- // Exception list for common words with unusual syllable counts
221
+ // Technical vocabulary counts as 2 syllables (known terms)
222
+ if (TECH_VOCAB.has(word)) return 2;
223
+
156
224
  const exceptions = {
157
225
  'the': 1, 'are': 1, 'were': 1, 'have': 1, 'there': 1,
158
226
  'where': 1, 'here': 1, 'every': 3, 'everything': 4,
@@ -164,17 +232,16 @@ function countSyllables(word) {
164
232
  const vowelGroups = word.match(/[aeiouy]+/g);
165
233
  let count = vowelGroups ? vowelGroups.length : 1;
166
234
 
167
- // Subtract silent-e at end (but not for words like "able", "ible")
235
+ // Subtract silent-e at end (but not -le, -ce, -ge)
168
236
  if (word.endsWith('e') && !word.endsWith('le') && !word.endsWith('ce') && !word.endsWith('ge')) {
169
237
  count--;
170
238
  }
171
239
 
172
- // Subtract for common diphthong/double vowel endings
240
+ // Subtract for common past-tense endings
173
241
  if (word.endsWith('ed') && !word.endsWith('ted') && !word.endsWith('ded')) {
174
242
  count--;
175
243
  }
176
244
 
177
- // Ensure minimum 1 syllable
178
245
  return Math.max(1, count);
179
246
  }
180
247
 
@@ -202,7 +269,6 @@ function tokenizeWords(text) {
202
269
  function measurePassiveVoice(sentences) {
203
270
  if (sentences.length === 0) return { ratio: 0, count: 0, total: 0 };
204
271
 
205
- // Passive voice pattern: be-verb followed by past participle
206
272
  const passivePattern = /\b(is|was|were|been|being|are|be|am)\s+([\w]+\s+)?([\w]*(?:ed|en|wn|lt|nt|pt|ft|zed))\b/i;
207
273
 
208
274
  let passiveCount = 0;
@@ -221,11 +287,6 @@ function measurePassiveVoice(sentences) {
221
287
 
222
288
  /**
223
289
  * Ambiguous Pronoun Ratio (Structure, 3.0% weight in Understanding)
224
- *
225
- * Counts pronouns that lack clear antecedents: it, this, that, they, them, these, those.
226
- * In technical documentation, these often create confusion about what exactly is referenced.
227
- *
228
- * Returns ratio of ambiguous pronouns to total word count.
229
290
  */
230
291
  function measureAmbiguousPronouns(words) {
231
292
  if (words.length === 0) return { ratio: 0, count: 0, total: 0 };
@@ -250,30 +311,19 @@ function measureAmbiguousPronouns(words) {
250
311
  }
251
312
 
252
313
  /**
253
- * Atomicity Score (Structure, 9.0% weight in Understanding — HIGHEST)
254
- *
255
- * Measures how "atomic" (single-purpose) sentences are.
256
- * Compound sentences with and/or/also/additionally indicate non-atomic requirements.
257
- * IEEE 830 §4.1 recommends atomic requirements that can be independently verified.
258
- *
259
- * Returns ratio of NON-atomic sentences (compound) to total sentences.
314
+ * Atomicity Score (Structure, 9.0% weight — HIGHEST in Understanding)
260
315
  */
261
316
  function measureAtomicity(sentences) {
262
317
  if (sentences.length === 0) return { ratio: 0, count: 0, total: 0 };
263
318
 
264
- // Compound indicators (sentence-level conjunctions, not word-level)
265
- // We match these only when preceded/followed by spaces to avoid matching within words
266
319
  const compoundPattern = /\b(and also|and then|as well as|in addition to|additionally|furthermore|moreover)\b/i;
267
- // Simple "and" / "or" — only flag if >1 occurrence in a sentence (natural language has legitimate single "and")
268
320
  const simpleCompound = /\band\b/gi;
269
- const simpleOr = /\bor\b/gi;
270
321
 
271
322
  let compoundCount = 0;
272
323
  for (const sentence of sentences) {
273
324
  if (compoundPattern.test(sentence)) {
274
325
  compoundCount++;
275
326
  } else {
276
- // Count simple "and" — 2+ indicates compound
277
327
  const andMatches = sentence.match(simpleCompound);
278
328
  if (andMatches && andMatches.length >= 2) {
279
329
  compoundCount++;
@@ -289,16 +339,8 @@ function measureAtomicity(sentences) {
289
339
  }
290
340
 
291
341
  /**
292
- * Flesch Reading Ease (Readability, 3.75% weight in Understanding)
293
- *
294
- * Formula: 206.835 - 1.015 * (total words / total sentences) - 84.6 * (total syllables / total words)
295
- * Source: Flesch, R. (1948). "A new readability yardstick." Journal of Applied Psychology.
296
- *
297
- * Scale: 0-100, higher = easier to read.
298
- * 90-100: Very Easy (5th grade)
299
- * 60-69: Standard (8th-9th grade)
300
- * 30-49: Difficult (college level)
301
- * 0-29: Very Confusing (graduate level)
342
+ * Flesch Reading Ease (Readability)
343
+ * Formula: 206.835 - 1.015 * (words/sentences) - 84.6 * (syllables/words)
302
344
  */
303
345
  function measureFleschReadingEase(words, sentences) {
304
346
  if (words.length === 0 || sentences.length === 0) return 0;
@@ -312,12 +354,8 @@ function measureFleschReadingEase(words, sentences) {
312
354
  }
313
355
 
314
356
  /**
315
- * Flesch-Kincaid Grade Level (Readability, 2.25% weight in Understanding)
316
- *
317
- * Formula: 0.39 * (total words / total sentences) + 11.8 * (total syllables / total words) - 15.59
318
- * Source: Kincaid, J.P. et al. (1975). "Derivation of new readability formulas."
319
- *
320
- * Returns US grade level (8 = 8th grade, 12 = high school senior, 16+ = graduate)
357
+ * Flesch-Kincaid Grade Level (Readability)
358
+ * Formula: 0.39 * (words/sentences) + 11.8 * (syllables/words) - 15.59
321
359
  */
322
360
  function measureFleschKincaidGrade(words, sentences) {
323
361
  if (words.length === 0 || sentences.length === 0) return 0;
@@ -331,10 +369,7 @@ function measureFleschKincaidGrade(words, sentences) {
331
369
  }
332
370
 
333
371
  /**
334
- * Sentence Length (Cognitive, 3.0% weight in Understanding)
335
- *
336
- * Average words per sentence. Cognitive load research (Sweller, 1988) shows that
337
- * sentences over 25 words significantly increase processing effort.
372
+ * Sentence Length (Cognitive)
338
373
  */
339
374
  function measureSentenceLength(words, sentences) {
340
375
  if (sentences.length === 0) return 0;
@@ -342,11 +377,7 @@ function measureSentenceLength(words, sentences) {
342
377
  }
343
378
 
344
379
  /**
345
- * Negation Load (Cognitive, 1.5% weight in Understanding)
346
- *
347
- * Ratio of sentences containing negation words.
348
- * Negation increases cognitive load because readers must mentally invert meaning.
349
- * IEEE 830 §4.3 recommends positive phrasing in requirements.
380
+ * Negation Load (Cognitive)
350
381
  */
351
382
  function measureNegationLoad(sentences) {
352
383
  if (sentences.length === 0) return { ratio: 0, count: 0, total: 0 };
@@ -368,10 +399,7 @@ function measureNegationLoad(sentences) {
368
399
  }
369
400
 
370
401
  /**
371
- * Conditional Load (Cognitive, 1.5% weight in Understanding)
372
- *
373
- * Ratio of sentences containing conditional keywords.
374
- * Excessive conditionals make documentation hard to follow and test.
402
+ * Conditional Load (Cognitive)
375
403
  */
376
404
  function measureConditionalLoad(sentences) {
377
405
  if (sentences.length === 0) return { ratio: 0, count: 0, total: 0 };
@@ -400,6 +428,7 @@ function getReadabilityLabel(score) {
400
428
  if (score >= 60) return 'Standard';
401
429
  if (score >= 50) return 'Fairly Difficult';
402
430
  if (score >= 30) return 'Difficult';
431
+ if (score >= 15) return 'Hard — Technical';
403
432
  return 'Very Confusing';
404
433
  }
405
434
 
@@ -416,11 +445,9 @@ function getGradeLabel(grade) {
416
445
 
417
446
  /**
418
447
  * Check if the `understanding` CLI is available on the system.
419
- * Returns the path to the executable or null.
420
448
  */
421
449
  function findUnderstandingCli() {
422
450
  try {
423
- // Use 'which' on Unix/Mac, 'where' on Windows — never redirect to NUL (creates file on Mac)
424
451
  const cmd = process.platform === 'win32' ? 'where understanding' : 'which understanding';
425
452
  const result = execSync(`${cmd} 2>/dev/null`, {
426
453
  encoding: 'utf-8',
@@ -434,7 +461,6 @@ function findUnderstandingCli() {
434
461
 
435
462
  /**
436
463
  * Run the `understanding` CLI on a file and parse results.
437
- * Returns understanding's quality score or null if it fails.
438
464
  */
439
465
  function runUnderstandingDeepScan(filePath) {
440
466
  try {
@@ -484,20 +510,22 @@ function getCanonicalDocs(projectDir) {
484
510
 
485
511
  /**
486
512
  * Analyze a single document and return per-metric results.
513
+ *
514
+ * Uses extractProse() instead of stripMarkdown() — only actual prose
515
+ * paragraphs are scored. Documents that are mostly tables/code/reference
516
+ * material are skipped for readability (they'd score 0/100 unfairly).
487
517
  */
488
518
  function analyzeDocument(doc) {
489
519
  const content = readFileSync(doc.path, 'utf-8');
490
- const plainText = stripMarkdown(content);
491
-
492
- if (plainText.length < 50) {
493
- return { skipped: true, reason: 'too short', name: doc.name };
494
- }
520
+ const proseText = extractProse(content);
495
521
 
496
- const sentences = splitSentences(plainText);
497
- const words = tokenizeWords(plainText);
522
+ const sentences = splitSentences(proseText);
523
+ const words = tokenizeWords(proseText);
498
524
 
499
- if (sentences.length < 3 || words.length < 20) {
500
- return { skipped: true, reason: 'insufficient content', name: doc.name };
525
+ // Skip if insufficient prose content
526
+ // Reference docs (mostly tables, code, lists) shouldn't be scored for readability
527
+ if (words.length < MIN_PROSE_WORDS || sentences.length < 3) {
528
+ return { skipped: true, reason: 'insufficient prose (reference document)', name: doc.name };
501
529
  }
502
530
 
503
531
  const passive = measurePassiveVoice(sentences);
@@ -539,7 +567,6 @@ export function validateDocQuality(projectDir, config) {
539
567
 
540
568
  const docs = getCanonicalDocs(projectDir);
541
569
  if (docs.length === 0) {
542
- // No docs to analyze — structure validator catches this
543
570
  return results;
544
571
  }
545
572
 
@@ -606,7 +633,7 @@ export function validateDocQuality(projectDir, config) {
606
633
  } else {
607
634
  results.warnings.push(
608
635
  `${doc.name}: Reading level too high (grade ${m.fleschKincaidGrade} — ${getGradeLabel(m.fleschKincaidGrade)}). ` +
609
- `Aim for grade 10-12 for technical docs`
636
+ `Aim for grade 12-16 for technical docs`
610
637
  );
611
638
  }
612
639
 
@@ -617,7 +644,7 @@ export function validateDocQuality(projectDir, config) {
617
644
  } else {
618
645
  results.warnings.push(
619
646
  `${doc.name}: Average sentence too long (${m.avgSentenceLength} words). ` +
620
- `Target ≤25 words per sentence for readability (Sweller, 1988)`
647
+ `Target ≤30 words per sentence for readability`
621
648
  );
622
649
  }
623
650
 
@@ -644,11 +671,5 @@ export function validateDocQuality(projectDir, config) {
644
671
  }
645
672
  }
646
673
 
647
- // ── Optional: Understanding deep scan note ──
648
- if (!understandingCli && docs.length > 0) {
649
- // Don't add as warning — just a note in verbose mode
650
- // Users who want full 31-metric scan can install understanding
651
- }
652
-
653
674
  return results;
654
675
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "docguard-cli",
3
- "version": "0.9.2",
3
+ "version": "0.9.4",
4
4
  "description": "The enforcement tool for Canonical-Driven Development (CDD). Audit, generate, and guard your project documentation.",
5
5
  "type": "module",
6
6
  "bin": {