@buresmi7/agent-doc-rules-docs-duplicates 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,97 @@
1
+ # Docs AI Review
2
+
3
+ `@buresmi7/agent-doc-rules-docs-duplicates` provides Codex-backed documentation review.
4
+ It checks likely semantic duplicates and can review Markdown sentences for
5
+ style issues.
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ pnpm add -D @buresmi7/agent-doc-rules-docs-duplicates
11
+ ```
12
+
13
+ ## Command
14
+
15
+ ```bash
16
+ agent-doc-rules-docs-duplicates check
17
+ agent-doc-rules-docs-duplicates style
18
+ ```
19
+
20
+ `check` and `duplicates` run semantic duplicate review. `style` runs AI review
21
+ for Markdown sentences.
22
+
23
+ The command resolves the bundled `@openai/codex` binary from this package. It
24
+ does not rely on `codex` being present in `PATH`.
25
+
26
+ Default model settings for both AI checks:
27
+
28
+ - model: `gpt-5-nano`
29
+ - reasoning effort: `low`
30
+
31
+ `gpt-5-nano` is the default because duplicate review is a classification task
32
+ and OpenAI positions the nano GPT-5 variant as the fastest, lowest-cost GPT-5
33
+ option for tasks such as summarization and classification.
34
+
35
+ Use `--model <model>` or `agent-doc-rules.config.json` when your Codex account
36
+ does not expose the default model.
37
+
38
+ ## Flow
39
+
40
+ 1. Parse Markdown prose into text units.
41
+ 2. Skip code blocks, short noise, and `references/` directories by default.
42
+ 3. Build candidates with normalized exact matching, shingle overlap, word
43
+ overlap, and string similarity.
44
+ 4. Remove candidates that match configured `ignorePairs`.
45
+ 5. Send only candidate pairs to Codex.
46
+ 6. Map structured Codex JSON to `fail`, `warn`, and `ok`.
47
+
48
+ `fail` returns a non-zero exit code. Warning-only results return zero.
49
+
50
+ ## Style Review
51
+
52
+ Style review parses Markdown into sentence units, sends only those units to
53
+ Codex, and asks for `fail` or `warn` findings. It is meant for judgment calls
54
+ such as unclear workflow names, vague AI-like phrasing, long sentences, or
55
+ sentences that are understandable but need a maintainer rewrite.
56
+
57
+ Use deterministic wording checks for known banned terms. Use AI style review
58
+ when the question depends on the sentence.
59
+
60
+ ## Config
61
+
62
+ Duplicate settings live under `docs.duplicates` and style settings under `docs.style` in the root
63
+ `agent-doc-rules.config.json`. Command-line flags take precedence.
64
+
65
+ ```json
66
+ {
67
+ "docs": {
68
+ "duplicates": {
69
+ "includeReferences": false,
70
+ "ignorePairs": [
71
+ {
72
+ "left": "^e2e/",
73
+ "right": "^e2e/",
74
+ "reason": "E2E fixtures intentionally repeat scenario facts."
75
+ }
76
+ ],
77
+ "warnScore": 0.78,
78
+ "failScore": 0.92,
79
+ "model": "gpt-5-nano",
80
+ "reasoningEffort": "low"
81
+ },
82
+ "style": {
83
+ "includeReferences": false,
84
+ "maxUnits": 80,
85
+ "model": "gpt-5-nano",
86
+ "reasoningEffort": "low"
87
+ }
88
+ }
89
+ }
90
+ ```
91
+
92
+ The duplicate-review workflow is derived from the earlier `meta-work`
93
+ documentation maintenance workflow, where deterministic duplicate candidates
94
+ were reviewed separately from Markdown and link checks.
95
+
96
+ See the skill package [Config Reference](../agent-doc-rules-skill/docs/config-reference.md)
97
+ for shared include, exclude, wording, and AI style settings.
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env node
2
+
3
+ import { main } from '../src/cli.mjs';
4
+
5
// Run the CLI with the user-supplied arguments. Any rejection is printed to
// stderr (stack when available) and the process exits with status 1.
try {
  await main(process.argv.slice(2));
} catch (error) {
  console.error(error.stack ?? error.message);
  process.exit(1);
}
package/package.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "name": "@buresmi7/agent-doc-rules-docs-duplicates",
3
+ "version": "0.8.2",
4
+ "private": false,
5
+ "description": "Semantic documentation duplicate checker powered by Codex",
6
+ "type": "module",
7
+ "license": "MIT",
8
+ "bin": {
9
+ "agent-doc-rules-docs-duplicates": "bin/agent-doc-rules-docs-duplicates.mjs"
10
+ },
11
+ "files": [
12
+ "bin",
13
+ "src",
14
+ "README.md"
15
+ ],
16
+ "scripts": {
17
+ "test": "node --test test/*.test.mjs"
18
+ },
19
+ "publishConfig": {
20
+ "access": "public"
21
+ },
22
+ "dependencies": {
23
+ "@openai/codex": "^0.142.0",
24
+ "fast-glob": "^3.3.3",
25
+ "mdast-util-to-string": "^4.0.0",
26
+ "remark-parse": "^11.0.0",
27
+ "sentence-splitter": "^5.0.1",
28
+ "unified": "^11.0.5",
29
+ "unist-util-visit": "^5.0.0"
30
+ }
31
+ }
@@ -0,0 +1,175 @@
1
/**
 * Compare every pair of text units and keep the pairs that look similar
 * enough to be sent for review.
 *
 * @param {Array<object>} units - Units carrying `file`, `line`, `text` plus the
 *   fields consumed by `scorePair`.
 * @param {object} [options]
 * @param {boolean} [options.includeSameFile=false] - Also compare units from the same file.
 * @param {Array<{left: string, right: string}>} [options.ignorePairs=[]] - File-path regex pairs to skip.
 * @param {number} [options.warnScore=0.78] - Warning threshold; the heuristic cutoff never exceeds 0.72.
 * @param {number} [options.maxCandidates=50] - Upper bound on returned pairs.
 * @returns {Array<object>} Candidates sorted by descending score, ids `DUP-1..n`.
 */
export function findCandidatePairs(units, {
  includeSameFile = false,
  ignorePairs = [],
  warnScore = 0.78,
  maxCandidates = 50,
} = {}) {
  const cutoff = Math.min(warnScore, 0.72);
  const compiledIgnores = normalizeIgnorePairs(ignorePairs);
  const found = [];

  for (const [leftIndex, left] of units.entries()) {
    for (let rightIndex = leftIndex + 1; rightIndex < units.length; rightIndex += 1) {
      const right = units[rightIndex];
      const skippedAsSameFile = !includeSameFile && left.file === right.file;

      if (skippedAsSameFile || isIgnoredPair(left.file, right.file, compiledIgnores)) {
        continue;
      }

      const { score, reason } = scorePair(left, right);

      // Exact matches are always kept, whatever the cutoff is.
      if (score >= cutoff || reason === 'normalized exact match') {
        found.push({
          score,
          reason,
          left: pickUnit(left),
          right: pickUnit(right),
        });
      }
    }
  }

  found.sort((first, second) => second.score - first.score);

  // Ids are assigned after sorting and truncation so they are contiguous.
  return found
    .slice(0, maxCandidates)
    .map((candidate, position) => ({ id: `DUP-${position + 1}`, ...candidate }));
}
46
+
47
/**
 * Compile configured ignore pairs into RegExp objects.
 *
 * @param {Array<{left: string, right: string}>} [ignorePairs=[]]
 * @returns {Array<{left: RegExp, right: RegExp}>}
 * @throws {Error} When an entry lacks a `left` or `right` pattern.
 */
export function normalizeIgnorePairs(ignorePairs = []) {
  const compiled = [];

  for (const entry of ignorePairs) {
    if (!(entry?.left && entry?.right)) {
      throw new Error('Duplicate ignore pairs must include left and right regex strings.');
    }

    compiled.push({
      left: new RegExp(entry.left),
      right: new RegExp(entry.right),
    });
  }

  return compiled;
}
59
+
60
/**
 * Tell whether a pair of files matches any compiled ignore entry, in either
 * orientation (left/right or right/left).
 *
 * @param {string} leftFile
 * @param {string} rightFile
 * @param {Array<{left: RegExp, right: RegExp}>} [ignorePairs=[]]
 * @returns {boolean}
 */
export function isIgnoredPair(leftFile, rightFile, ignorePairs = []) {
  for (const { left, right } of ignorePairs) {
    const matchesAsGiven = left.test(leftFile) && right.test(rightFile);

    if (matchesAsGiven || (left.test(rightFile) && right.test(leftFile))) {
      return true;
    }
  }

  return false;
}
66
+
67
/**
 * Score how similar two text units are and name the signal that produced the
 * score.
 *
 * Three signals are computed: 4-word shingle Jaccard, word overlap (weighted
 * 0.96) and character-bigram Dice (weighted 0.9). The score is the largest
 * weighted signal, and the reason names that same signal. Previously the
 * reason was chosen from the unweighted values, so it could name a signal
 * that did not determine the score.
 *
 * @param {{normalized: string, words: string[]}} left
 * @param {{normalized: string, words: string[]}} right
 * @returns {{score: number, reason: string}}
 */
export function scorePair(left, right) {
  if (left.normalized === right.normalized) {
    return { score: 1, reason: 'normalized exact match' };
  }

  // Ordered by tie-break priority: on equal weighted values the earlier wins.
  const signals = [
    {
      reason: 'high shingle overlap',
      value: jaccard(shingles(left.words, 4), shingles(right.words, 4)),
    },
    {
      reason: 'high word overlap',
      value: overlap(left.words, right.words) * 0.96,
    },
    {
      reason: 'high string similarity',
      value: diceCoefficient(left.normalized, right.normalized) * 0.9,
    },
  ];

  let strongest = signals[0];

  for (const signal of signals.slice(1)) {
    if (signal.value > strongest.value) {
      strongest = signal;
    }
  }

  return { score: strongest.value, reason: strongest.reason };
}
87
+
88
/**
 * Reduce a text unit to the fields that are reported for a candidate.
 *
 * @param {{file: *, line: *, text: *}} unit
 * @returns {{file: *, line: *, text: *}}
 */
function pickUnit(unit) {
  const { file, line, text } = unit;

  return { file, line, text };
}
95
+
96
/**
 * Build the set of `size`-word shingles (space-joined sliding windows).
 * Inputs shorter than `size` fall back to the set of individual words.
 *
 * @param {string[]} words
 * @param {number} size
 * @returns {Set<string>}
 */
function shingles(words, size) {
  if (words.length < size) {
    return new Set(words);
  }

  const windowCount = words.length - size + 1;
  const windows = Array.from(
    { length: windowCount },
    (_, start) => words.slice(start, start + size).join(' '),
  );

  return new Set(windows);
}
109
+
110
/**
 * Jaccard index of two sets: |intersection| / |union|.
 *
 * @param {Set<*>} left
 * @param {Set<*>} right
 * @returns {number} 0 when either set is empty.
 */
function jaccard(left, right) {
  if (left.size === 0 || right.size === 0) {
    return 0;
  }

  const shared = [...left].filter((value) => right.has(value)).length;
  const unionSize = left.size + right.size - shared;

  return shared / unionSize;
}
125
+
126
/**
 * Overlap coefficient of two word lists: the share of the smaller distinct
 * word set that also appears in the larger one.
 *
 * @param {Iterable<string>} leftWords
 * @param {Iterable<string>} rightWords
 * @returns {number} 0 when the smaller set is empty.
 */
function overlap(leftWords, rightWords) {
  const leftSet = new Set(leftWords);
  const rightSet = new Set(rightWords);
  const [smaller, larger] = leftSet.size < rightSet.size
    ? [leftSet, rightSet]
    : [rightSet, leftSet];

  if (smaller.size === 0) {
    return 0;
  }

  const shared = [...smaller].filter((word) => larger.has(word)).length;

  return shared / smaller.size;
}
146
+
147
/**
 * Sørensen–Dice coefficient over the distinct character bigrams of two
 * strings: 2 * |shared| / (|left bigrams| + |right bigrams|).
 *
 * @param {string} left
 * @param {string} right
 * @returns {number} 0 when either string yields no bigrams.
 */
function diceCoefficient(left, right) {
  const leftPairs = bigrams(left);
  const rightPairs = bigrams(right);
  const combinedSize = leftPairs.size + rightPairs.size;

  if (leftPairs.size === 0 || rightPairs.size === 0) {
    return 0;
  }

  const shared = [...leftPairs].filter((pair) => rightPairs.has(pair)).length;

  return (2 * shared) / combinedSize;
}
165
+
166
/**
 * Collect the distinct two-character substrings of `text` after collapsing
 * each whitespace run into a single space.
 *
 * @param {string} text
 * @returns {Set<string>}
 */
function bigrams(text) {
  const collapsed = text.replace(/\s+/g, ' ');
  const pairs = new Set();

  // `end` is the exclusive end offset of each two-character window.
  for (let end = 2; end <= collapsed.length; end += 1) {
    pairs.add(collapsed.slice(end - 2, end));
  }

  return pairs;
}
package/src/check.mjs ADDED
@@ -0,0 +1,113 @@
1
+ import { findCandidatePairs } from './candidates.mjs';
2
+ import { runCodexClassifier } from './codex.mjs';
3
+ import { loadMarkdownUnits } from './markdown.mjs';
4
+
5
/**
 * Run the duplicate check: load units, build heuristic candidates, have them
 * classified, and assemble the result.
 *
 * The classifier is only invoked when at least one candidate exists.
 *
 * @param {object} options - Resolved duplicate options.
 * @param {object} [deps] - Optional overrides `loadMarkdownUnits` and
 *   `classifyCandidates`.
 * @returns {Promise<{code: number, files: Array, units: Array, candidates: Array, reviews: Array, report: string}>}
 *   `code` is 1 when any review has status `fail`, otherwise 0.
 */
export async function checkDuplicates(options, deps = {}) {
  const loadUnits = deps.loadMarkdownUnits ?? loadMarkdownUnits;
  const classifyCandidates = deps.classifyCandidates ?? runCodexClassifier;
  const { files, units } = await loadUnits(options);
  const candidates = findCandidatePairs(units, options);

  let reviews = [];
  let code = 0;

  if (candidates.length > 0) {
    const rawResult = await classifyCandidates(candidates, options);

    reviews = normalizeReviews({ candidates, rawResult, options });
    code = reviews.some((review) => review.status === 'fail') ? 1 : 0;
  }

  const report = formatReport({ files, units, candidates, reviews });

  return { code, files, units, candidates, reviews, report };
}
35
+
36
/**
 * Merge the classifier output with the heuristic candidates, producing one
 * review per candidate.
 *
 * A candidate without a usable classification is reported as `warn` with its
 * heuristic score. A malformed payload (`null`, missing or non-array
 * `matches`, non-object entries) is treated as "nothing classified" rather
 * than raising a TypeError.
 *
 * @param {object} params
 * @param {Array<object>} params.candidates - Heuristic candidates with `id` and `score`.
 * @param {*} params.rawResult - Parsed classifier response; expected `{ matches: [...] }`.
 * @param {{warnScore: number, failScore: number}} params.options - Thresholds
 *   used when a match carries no valid status.
 * @returns {Array<object>} Candidates extended with `status`, `duplicateScore`
 *   and `reviewReason`.
 */
export function normalizeReviews({ candidates, rawResult, options }) {
  const matches = Array.isArray(rawResult?.matches) ? rawResult.matches : [];
  const byId = new Map(
    matches
      .filter((match) => match !== null && typeof match === 'object')
      .map((match) => [match.id, match]),
  );

  return candidates.map((candidate) => {
    const match = byId.get(candidate.id);

    if (!match) {
      return {
        ...candidate,
        status: 'warn',
        duplicateScore: candidate.score,
        reviewReason: 'Codex did not classify this candidate.',
      };
    }

    const duplicateScore = Number(match.score);
    const hasScore = Number.isFinite(duplicateScore);
    // An explicit valid status wins; otherwise derive it from the score, and
    // fall back to `warn` when neither is usable.
    const status = normalizeStatus(match.status)
      ?? (hasScore ? statusFromScore(duplicateScore, options) : 'warn');

    return {
      ...candidate,
      status,
      duplicateScore: hasScore ? duplicateScore : candidate.score,
      reviewReason: match.reason,
    };
  });
}
63
+
64
/**
 * Render the plain-text duplicate report.
 *
 * The header lists file, unit and candidate counts. `fail` reviews are listed
 * first, then `warn` reviews; `ok` reviews only appear in the summary line.
 *
 * @param {object} params
 * @param {Array} params.files
 * @param {Array} params.units
 * @param {Array} params.candidates
 * @param {Array<object>} params.reviews
 * @returns {string} Report text ending with a newline.
 */
export function formatReport({ files, units, candidates, reviews }) {
  const lines = [
    'Docs semantic duplicate check',
    `Files: ${files.length}`,
    `Text units: ${units.length}`,
    `Candidates: ${candidates.length}`,
  ];

  if (candidates.length === 0) {
    return `${[...lines, 'No semantic duplicate candidates found.'].join('\n')}\n`;
  }

  const withStatus = (status) => reviews.filter((review) => review.status === status);
  const failures = withStatus('fail');
  const warnings = withStatus('warn');
  const passes = withStatus('ok');
  const reported = [
    ...failures.map((review) => ['fail', review]),
    ...warnings.map((review) => ['warn', review]),
  ];

  for (const [status, review] of reported) {
    lines.push(
      '',
      `[${status}] ${review.id} score=${review.duplicateScore.toFixed(2)}`,
      `${review.left.file}:${review.left.line}`,
      `${review.right.file}:${review.right.line}`,
      review.reviewReason,
    );
  }

  lines.push(
    '',
    `Summary: ${failures.length} fail, ${warnings.length} warn, ${passes.length} ok`,
  );

  return `${lines.join('\n')}\n`;
}
98
+
99
/**
 * Map a duplicate score to a status using the configured thresholds.
 *
 * @param {number} score
 * @param {{warnScore: number, failScore: number}} thresholds
 * @returns {'fail'|'warn'|'ok'}
 */
function statusFromScore(score, { warnScore, failScore }) {
  const reachesFail = score >= failScore;
  const reachesWarn = score >= warnScore;

  if (reachesFail) {
    return 'fail';
  }

  return reachesWarn ? 'warn' : 'ok';
}
110
+
111
/**
 * Return `status` when it is one of the known labels, otherwise `null`.
 *
 * @param {*} status
 * @returns {'fail'|'warn'|'ok'|null}
 */
function normalizeStatus(status) {
  switch (status) {
    case 'fail':
    case 'warn':
    case 'ok':
      return status;
    default:
      return null;
  }
}
package/src/cli.mjs ADDED
@@ -0,0 +1,120 @@
1
+ import { checkDuplicates } from './check.mjs';
2
+ import { resolveDuplicateOptions, resolveStyleOptions } from './config.mjs';
3
+ import { checkStyle } from './style.mjs';
4
+
5
/**
 * CLI entry point: parse arguments, run the selected review, print its report
 * and propagate a non-zero result code through `process.exitCode`.
 *
 * @param {string[]} [argv=process.argv.slice(2)] - Arguments without the node
 *   executable and script path.
 * @returns {Promise<void>}
 */
export async function main(argv = process.argv.slice(2)) {
  const parsed = parseArgs(argv);

  if (parsed.help) {
    console.log(usage());
    return;
  }

  let result;

  if (parsed.command === 'style') {
    const styleOptions = await resolveStyleOptions(parsed);
    result = await checkStyle(styleOptions);
  } else {
    // Both `check` and `duplicates` run the duplicate review.
    const duplicateOptions = await resolveDuplicateOptions(parsed);
    result = await checkDuplicates(duplicateOptions);
  }

  process.stdout.write(result.report);

  if (result.code !== 0) {
    process.exitCode = result.code;
  }
}
25
+
26
/**
 * Parse CLI arguments into a flags object.
 *
 * The first argument is the command (`check`, `duplicates` or `style`). A
 * missing command, `--help` or `-h` yields `{ command: 'check', help: true }`.
 * Numeric options are converted with `Number()` without validation.
 *
 * @param {string[]} argv
 * @returns {object} Parsed flags; `include` and `exclude` are always arrays
 *   when a command was given.
 * @throws {Error} On an unknown command, an unknown option, or an option
 *   missing its value.
 */
export function parseArgs(argv) {
  const [command, ...rest] = argv;

  if (!command || command === '--help' || command === '-h') {
    return { command: 'check', help: true };
  }

  if (!['check', 'duplicates', 'style'].includes(command)) {
    throw new Error(`Unknown command: ${command}`);
  }

  // Option name -> target property, grouped by how the value is handled.
  const listOptions = new Map([
    ['--include', 'include'],
    ['--exclude', 'exclude'],
  ]);
  const textOptions = new Map([
    ['--root', 'root'],
    ['--config', 'configPath'],
    ['--model', 'model'],
    ['--reasoning-effort', 'reasoningEffort'],
    ['--codex-bin', 'codexBin'],
  ]);
  const numberOptions = new Map([
    ['--warn-score', 'warnScore'],
    ['--fail-score', 'failScore'],
    ['--min-words', 'minWords'],
    ['--min-chars', 'minChars'],
    ['--max-candidates', 'maxCandidates'],
    ['--max-units', 'maxUnits'],
  ]);
  const switchOptions = new Map([
    ['--include-references', 'includeReferences'],
    ['--include-same-file', 'includeSameFile'],
    ['--help', 'help'],
    ['-h', 'help'],
  ]);

  const parsed = { command, include: [], exclude: [] };

  for (let index = 0; index < rest.length; index += 1) {
    const arg = rest[index];
    // Consumes the argument after the option, advancing the loop cursor.
    const takeValue = () => {
      index += 1;
      return readValue(rest, index, arg);
    };

    if (listOptions.has(arg)) {
      parsed[listOptions.get(arg)].push(takeValue());
    } else if (textOptions.has(arg)) {
      parsed[textOptions.get(arg)] = takeValue();
    } else if (numberOptions.has(arg)) {
      parsed[numberOptions.get(arg)] = Number(takeValue());
    } else if (switchOptions.has(arg)) {
      parsed[switchOptions.get(arg)] = true;
    } else {
      throw new Error(`Unknown option: ${arg}`);
    }
  }

  return parsed;
}
85
+
86
/**
 * Return the value at `index`, which must exist and must not look like
 * another `--option`.
 *
 * @param {string[]} args
 * @param {number} index - Position of the expected value.
 * @param {string} option - Option name used in the error message.
 * @returns {string}
 * @throws {Error} When the value is absent, empty, or starts with `--`.
 */
function readValue(args, index, option) {
  const value = args[index];

  if (value && !value.startsWith('--')) {
    return value;
  }

  throw new Error(`Missing value for ${option}`);
}
95
+
96
/**
 * Build the help text printed for `--help`.
 *
 * @returns {string} Multi-line usage text without a trailing newline.
 */
function usage() {
  const commands = [
    'check Run semantic duplicate review. Same as duplicates.',
    'duplicates Run semantic duplicate review.',
    'style Run AI style review for Markdown sentences.',
  ];
  const options = [
    '--root <dir> Repository root. Defaults to current directory.',
    '--include <glob> Include Markdown glob. Repeatable.',
    '--exclude <glob> Exclude glob. Repeatable.',
    '--config <path> Config file. Defaults to agent-doc-rules.config.json.',
    '--include-references Include files in references/ directories.',
    '--include-same-file Compare units from the same file.',
    '--warn-score <number> Score threshold for warnings.',
    '--fail-score <number> Score threshold for failures.',
    '--min-words <number> Minimum words per prose unit.',
    '--min-chars <number> Minimum characters per prose unit.',
    '--max-candidates <number> Maximum candidate pairs sent to Codex.',
    '--max-units <number> Maximum sentence units sent to Codex for style review.',
    '--model <model> Codex model. Defaults to gpt-5-nano.',
    '--reasoning-effort <effort> Codex reasoning effort. Defaults to low.',
    '--codex-bin <path> Override Codex binary for local debugging.',
  ];

  return [
    'Usage: agent-doc-rules-docs-duplicates <command> [options]',
    '',
    'Commands:',
    ...commands,
    '',
    'Options:',
    ...options,
  ].join('\n');
}
package/src/codex.mjs ADDED
@@ -0,0 +1,333 @@
1
+ import { spawn } from 'node:child_process';
2
+ import { existsSync, readFileSync } from 'node:fs';
3
+ import { mkdtemp, readFile, rm, writeFile } from 'node:fs/promises';
4
+ import { createRequire } from 'node:module';
5
+ import { tmpdir } from 'node:os';
6
+ import { dirname, join } from 'node:path';
7
+
8
+ const require = createRequire(import.meta.url);
9
+
10
// Shape of one duplicate classification returned by Codex.
const duplicateMatchSchema = {
  type: 'object',
  additionalProperties: false,
  properties: {
    id: { type: 'string' },
    score: { type: 'number', minimum: 0, maximum: 1 },
    status: { type: 'string', enum: ['fail', 'warn', 'ok'] },
    reason: { type: 'string' },
  },
  required: ['id', 'score', 'status', 'reason'],
};

/**
 * JSON schema for the duplicate-review response: an object whose only
 * property, `matches`, lists one classification per candidate.
 */
export const codexOutputSchema = {
  type: 'object',
  additionalProperties: false,
  properties: {
    matches: { type: 'array', items: duplicateMatchSchema },
  },
  required: ['matches'],
};
31
+
32
// Shape of one style finding returned by Codex.
const styleFindingSchema = {
  type: 'object',
  additionalProperties: false,
  properties: {
    id: { type: 'string' },
    status: { type: 'string', enum: ['fail', 'warn', 'ok'] },
    category: {
      type: 'string',
      enum: ['unclear', 'idiom', 'vague', 'ai-voice', 'too-long', 'passive', 'ok'],
    },
    issue: { type: 'string' },
    suggestion: { type: 'string' },
    confidence: { type: 'number', minimum: 0, maximum: 1 },
  },
  required: ['id', 'status', 'category', 'issue', 'suggestion', 'confidence'],
};

/**
 * JSON schema for the style-review response: an object whose only property,
 * `findings`, lists the reported sentences.
 */
export const styleOutputSchema = {
  type: 'object',
  additionalProperties: false,
  properties: {
    findings: { type: 'array', items: styleFindingSchema },
  },
  required: ['findings'],
};
58
+
59
/**
 * Classify duplicate candidates with Codex.
 *
 * Writes the output schema into a fresh temporary directory, runs Codex with
 * the duplicate prompt on stdin, and parses the last-message file it writes.
 * The temporary directory is removed whether or not the run succeeds.
 *
 * @param {Array<object>} candidates
 * @param {object} [options]
 * @param {string} [options.root] - Working directory passed to Codex.
 * @param {string} [options.model]
 * @param {string} [options.reasoningEffort]
 * @param {string} [options.codexBin] - Optional binary override.
 * @returns {Promise<*>} Parsed JSON response.
 */
export async function runCodexClassifier(candidates, {
  root,
  model,
  reasoningEffort,
  codexBin,
} = {}) {
  const workDir = await mkdtemp(join(tmpdir(), 'docs-duplicates-codex-'));
  const paths = {
    schemaFile: join(workDir, 'schema.json'),
    outputFile: join(workDir, 'last-message.json'),
  };

  try {
    await writeFile(paths.schemaFile, JSON.stringify(codexOutputSchema, null, 2));

    const prompt = buildCodexPrompt(candidates);
    const invocation = buildCodexInvocation({
      root,
      model,
      reasoningEffort,
      codexBin,
      ...paths,
    });

    await runCodex(invocation, prompt);

    const lastMessage = await readFile(paths.outputFile, 'utf8');
    return parseCodexResponse(lastMessage);
  } finally {
    await rm(workDir, { recursive: true, force: true });
  }
}
87
+
88
/**
 * Review sentence units for style with Codex.
 *
 * Writes the style schema into a fresh temporary directory, runs Codex with
 * the style prompt on stdin, and parses the last-message file it writes. The
 * temporary directory is removed whether or not the run succeeds.
 *
 * @param {Array<object>} units
 * @param {object} [options]
 * @param {string} [options.root] - Working directory passed to Codex.
 * @param {string} [options.model]
 * @param {string} [options.reasoningEffort]
 * @param {string} [options.codexBin] - Optional binary override.
 * @returns {Promise<*>} Parsed JSON response.
 */
export async function runCodexStyleReviewer(units, {
  root,
  model,
  reasoningEffort,
  codexBin,
} = {}) {
  const workDir = await mkdtemp(join(tmpdir(), 'docs-style-codex-'));
  const paths = {
    schemaFile: join(workDir, 'schema.json'),
    outputFile: join(workDir, 'last-message.json'),
  };

  try {
    await writeFile(paths.schemaFile, JSON.stringify(styleOutputSchema, null, 2));

    const prompt = buildStylePrompt(units);
    const invocation = buildCodexInvocation({
      root,
      model,
      reasoningEffort,
      codexBin,
      ...paths,
    });

    await runCodex(invocation, prompt);

    const lastMessage = await readFile(paths.outputFile, 'utf8');
    return parseCodexResponse(lastMessage);
  } finally {
    await rm(workDir, { recursive: true, force: true });
  }
}
116
+
117
/**
 * Build the duplicate-review prompt: fixed instructions followed by one
 * section per candidate pair.
 *
 * @param {Array<object>} candidates - Items with `id`, `score`, `reason` and
 *   `left`/`right` units (`file`, `line`, `text`).
 * @returns {string}
 */
export function buildCodexPrompt(candidates) {
  const describeSide = (label, unit) => [
    `${label}: ${unit.file}:${unit.line}`,
    `${unit.text}`,
  ];

  const sections = candidates.map((candidate) => [
    `## ${candidate.id}`,
    '',
    `Heuristic score: ${candidate.score.toFixed(3)}`,
    `Heuristic reason: ${candidate.reason}`,
    '',
    ...describeSide('Left', candidate.left),
    '',
    ...describeSide('Right', candidate.right),
  ].join('\n'));

  return [
    'You are reviewing a small list of possible duplicate documentation passages.',
    '',
    'Classify only the candidate pairs shown below. Do not inspect the repository or',
    'invent additional pairs.',
    '',
    'Use these labels:',
    '',
    '- fail: the passages repeat the same durable rule, fact, or procedure and one',
    'should be deduplicated.',
    '- warn: the passages overlap enough for a maintainer to review, but the',
    'duplication may be acceptable.',
    '- ok: the passages are not a meaningful duplicate.',
    '',
    'Use warn, not fail, when repetition appears intentional in README summaries,',
    'templates, E2E fixtures, E2E criteria, reference indexes, or short routing',
    'pointers.',
    '',
    'Return JSON matching the provided schema. Use score as duplicate confidence from',
    '0.0 to 1.0.',
    '',
    '# Candidate Pairs',
    '',
    sections.join('\n\n'),
  ].join('\n');
}
153
+
154
/**
 * Build the style-review prompt: fixed instructions followed by one section
 * per sentence unit.
 *
 * @param {Array<object>} units - Items with `id`, `file`, `line` and `text`.
 * @returns {string}
 */
export function buildStylePrompt(units) {
  const sections = units.map((unit) => [
    `## ${unit.id}`,
    '',
    `Location: ${unit.file}:${unit.line}`,
    `${unit.text}`,
  ].join('\n'));

  return [
    'You are reviewing repository documentation sentence by sentence.',
    '',
    'Review only the sentences listed below. Do not inspect the repository and do not',
    'invent findings for text that is not shown.',
    '',
    'Use these labels:',
    '',
    '- fail: the sentence has a clear style problem that should block documentation',
    'changes, such as an unclear idiom, metaphorical workflow name, vague AI-like',
    'phrasing, or wording that makes the task hard to understand.',
    '- warn: the sentence is understandable but a maintainer should consider a',
    'clearer rewrite.',
    '- ok: the sentence is clear enough for repository documentation.',
    '',
    'Prefer concrete wording. Be strict about workflow, process, and section names',
    'that sound clever but do not explain the task. Do not flag paths, commands,',
    'package names, code identifiers, or necessary technical terms.',
    '',
    'Return only findings that are fail or warn. If every sentence is ok, return an',
    'empty findings array. Use confidence from 0.0 to 1.0.',
    '',
    '# Sentences',
    '',
    sections.join('\n\n'),
  ].join('\n');
}
185
+
186
/**
 * Assemble the command and argument list for a `codex exec` run that reads
 * its prompt from stdin (`-`).
 *
 * With `codexBin` set, that binary is the command. Otherwise the bundled
 * Codex script is resolved and run through the current Node executable.
 *
 * @param {object} params
 * @param {string} params.root - Directory passed via `--cd`.
 * @param {string} params.model
 * @param {string} params.reasoningEffort - JSON-encoded into the `--config` override.
 * @param {string} [params.codexBin]
 * @param {string} params.schemaFile
 * @param {string} params.outputFile
 * @returns {{command: string, args: string[]}}
 */
export function buildCodexInvocation({
  root,
  model,
  reasoningEffort,
  codexBin,
  schemaFile,
  outputFile,
}) {
  const valuedFlags = [
    ['--sandbox', 'read-only'],
    ['--model', model],
    ['--config', `model_reasoning_effort=${JSON.stringify(reasoningEffort)}`],
    ['--output-schema', schemaFile],
    ['--output-last-message', outputFile],
    ['--color', 'never'],
    ['--cd', root],
  ];
  const args = ['exec', '--skip-git-repo-check', '--ephemeral', '--ignore-rules'];

  for (const [flag, value] of valuedFlags) {
    args.push(flag, value);
  }

  args.push('-');

  return codexBin
    ? { command: codexBin, args }
    : { command: process.execPath, args: [resolveCodexBin(), ...args] };
}
225
+
226
/**
 * Locate the `codex` executable script shipped by `@openai/codex`.
 *
 * Each candidate package.json is read; its `bin` entry (a string, or the
 * `codex` key of an object) is joined to the package directory, and the first
 * path that exists on disk is returned.
 *
 * @returns {string} Path to the Codex bin script.
 * @throws {Error} When no candidate exposes an existing binary.
 */
export function resolveCodexBin() {
  for (const manifestPath of resolvePackageJsonPaths('@openai/codex')) {
    const manifest = JSON.parse(readFileSync(manifestPath, 'utf8'));
    const relativeBin = typeof manifest.bin === 'string'
      ? manifest.bin
      : manifest.bin?.codex;

    if (relativeBin) {
      const candidate = join(dirname(manifestPath), relativeBin);

      if (existsSync(candidate)) {
        return candidate;
      }
    }
  }

  throw new Error('@openai/codex does not expose a codex binary.');
}
246
+
247
/**
 * Find package.json file(s) belonging to an installed package.
 *
 * First tries to resolve `<package>/package.json` directly. If the package's
 * `exports` map hides that path (ERR_PACKAGE_PATH_NOT_EXPORTED), the package
 * entry point is resolved instead and every ancestor directory is checked for
 * a package.json whose `name` equals `packageName`.
 *
 * @param {string} packageName
 * @returns {string[]} Matching package.json paths, possibly empty.
 * @throws Any resolution error other than ERR_PACKAGE_PATH_NOT_EXPORTED.
 */
export function resolvePackageJsonPaths(packageName) {
  try {
    return [require.resolve(`${packageName}/package.json`)];
  } catch (error) {
    if (error.code !== 'ERR_PACKAGE_PATH_NOT_EXPORTED') {
      throw error;
    }
  }

  const matches = [];
  let directory = dirname(require.resolve(packageName));

  for (;;) {
    const manifestPath = join(directory, 'package.json');
    const belongsToPackage = existsSync(manifestPath)
      && JSON.parse(readFileSync(manifestPath, 'utf8')).name === packageName;

    if (belongsToPackage) {
      matches.push(manifestPath);
    }

    const parent = dirname(directory);

    // dirname() of the filesystem root returns the root itself.
    if (parent === directory) {
      return [...new Set(matches)];
    }

    directory = parent;
  }
}
286
+
287
/**
 * Parse the JSON that Codex wrote as its last message.
 *
 * The whole trimmed text is tried first. If that fails and the text contains
 * a Markdown code fence, the content of the first fence is tried.
 *
 * Every parse failure, including an invalid fenced block, now surfaces as
 * the same descriptive error with the underlying `SyntaxError` attached as
 * `cause`. Previously an invalid fenced block leaked a bare `SyntaxError`.
 *
 * @param {string} text - Raw last-message content.
 * @returns {*} Parsed JSON value.
 * @throws {Error} When neither the text nor its fenced block is valid JSON.
 */
export function parseCodexResponse(text) {
  const trimmed = text.trim();
  const fenced = trimmed.match(/```(?:json)?\s*([\s\S]*?)```/);
  const attempts = fenced ? [trimmed, fenced[1]] : [trimmed];
  let lastError;

  for (const attempt of attempts) {
    try {
      return JSON.parse(attempt);
    } catch (error) {
      lastError = error;
    }
  }

  throw new Error('Codex did not return valid duplicate-check JSON.', { cause: lastError });
}
302
+
303
/**
 * Spawn the Codex process, feed it the prompt on stdin and wait for it to
 * finish.
 *
 * @param {{command: string, args: string[]}} invocation
 * @param {string} prompt - Text written to the child's stdin.
 * @returns {Promise<void>} Resolves on exit code 0. Rejects with the spawn
 *   error, an unexpected stdin error, or an Error carrying the exit code and
 *   the captured stderr/stdout.
 */
function runCodex({ command, args }, prompt) {
  return new Promise((resolve, reject) => {
    let stdout = '';
    let stderr = '';
    const child = spawn(command, args, {
      stdio: ['pipe', 'pipe', 'pipe'],
      env: {
        ...process.env,
        NO_COLOR: process.env.NO_COLOR ?? '1',
      },
    });

    // Decode at the stream level so a multi-byte character split across two
    // chunks is not corrupted.
    child.stdout.setEncoding('utf8');
    child.stderr.setEncoding('utf8');
    child.stdout.on('data', (chunk) => {
      stdout += chunk;
    });
    child.stderr.on('data', (chunk) => {
      stderr += chunk;
    });
    child.on('error', reject);
    // If the child exits before reading the whole prompt, the stdin write
    // fails. Without a listener that error is thrown as an uncaught
    // exception. The 'close' handler already reports the exit, so a
    // broken-pipe error is ignored here.
    child.stdin.on('error', (error) => {
      if (error.code !== 'EPIPE' && error.code !== 'ECONNRESET') {
        reject(error);
      }
    });
    child.on('close', (code) => {
      if (code === 0) {
        resolve();
      } else {
        const detail = [stderr.trim(), stdout.trim()].filter(Boolean).join('\n');
        reject(new Error(`Codex duplicate review failed with exit code ${code ?? 1}.\n${detail}`));
      }
    });

    child.stdin.end(prompt);
  });
}
package/src/config.mjs ADDED
@@ -0,0 +1,94 @@
1
+ import { readFile } from 'node:fs/promises';
2
+ import { isAbsolute, resolve } from 'node:path';
3
+ import {
4
+ defaultConfigFile,
5
+ defaultExclude,
6
+ defaultInclude,
7
+ duplicateDefaults,
8
+ styleDefaults,
9
+ } from './defaults.mjs';
10
+
11
/**
 * Load the docs configuration.
 *
 * Reads `configPath` (resolved against `root` when relative) or, if none is
 * given, the default config file in `root`. Returns the file's `docs` section
 * when present, otherwise the whole parsed object.
 *
 * @param {object} [params]
 * @param {string} [params.root=process.cwd()]
 * @param {string} [params.configPath]
 * @returns {Promise<object>} `{}` when the default config file does not exist.
 * @throws Read or parse errors. A missing file is only tolerated when no
 *   explicit `configPath` was given.
 */
export async function loadDocsConfig({ root = process.cwd(), configPath } = {}) {
  const baseDir = resolve(root);
  const configFile = configPath
    ? resolvePath(baseDir, configPath)
    : resolve(baseDir, defaultConfigFile);

  let raw;

  try {
    raw = await readFile(configFile, 'utf8');
  } catch (error) {
    const optionalFileMissing = error.code === 'ENOENT' && !configPath;

    if (optionalFileMissing) {
      return {};
    }

    throw error;
  }

  const parsed = JSON.parse(raw);

  return parsed.docs ?? parsed;
}
29
+
30
/**
 * Resolve the effective duplicate-check options.
 *
 * Precedence for each setting: CLI flag, then the `duplicates` config
 * section, then built-in defaults. `include`/`exclude` additionally fall back
 * to the top-level config lists. `ignorePairs` comes from config only.
 *
 * @param {object} [flags] - Parsed CLI flags.
 * @returns {Promise<object>} Fully resolved options with an absolute `root`.
 */
export async function resolveDuplicateOptions(flags = {}) {
  const root = resolve(flags.root ?? process.cwd());
  const config = await loadDocsConfig({ root, configPath: flags.configPath });
  const duplicateConfig = config.duplicates ?? {};

  const layered = (key) => flags[key] ?? duplicateConfig[key] ?? duplicateDefaults[key];
  const layeredNumber = (key) => chooseNumber(flags[key], duplicateConfig[key], duplicateDefaults[key]);
  const layeredGlobs = (key, fallback) => chooseArray(flags[key], duplicateConfig[key], config[key], fallback);

  return {
    root,
    include: layeredGlobs('include', defaultInclude),
    exclude: layeredGlobs('exclude', defaultExclude),
    includeReferences: layered('includeReferences'),
    includeSameFile: layered('includeSameFile'),
    ignorePairs: chooseArray(duplicateConfig.ignorePairs, duplicateDefaults.ignorePairs),
    warnScore: layeredNumber('warnScore'),
    failScore: layeredNumber('failScore'),
    minWords: layeredNumber('minWords'),
    minChars: layeredNumber('minChars'),
    maxCandidates: layeredNumber('maxCandidates'),
    model: layered('model'),
    reasoningEffort: layered('reasoningEffort'),
    // There is no built-in default for the binary override.
    codexBin: flags.codexBin ?? duplicateConfig.codexBin,
  };
}
52
+
53
/**
 * Resolve the effective style-review options.
 *
 * Precedence for each setting: CLI flag, then the `style` config section,
 * then built-in defaults. `include`/`exclude` additionally fall back to the
 * top-level config lists.
 *
 * @param {object} [flags] - Parsed CLI flags.
 * @returns {Promise<object>} Fully resolved options with an absolute `root`.
 */
export async function resolveStyleOptions(flags = {}) {
  const root = resolve(flags.root ?? process.cwd());
  const config = await loadDocsConfig({ root, configPath: flags.configPath });
  const styleConfig = config.style ?? {};

  const layered = (key) => flags[key] ?? styleConfig[key] ?? styleDefaults[key];
  const layeredNumber = (key) => chooseNumber(flags[key], styleConfig[key], styleDefaults[key]);
  const layeredGlobs = (key, fallback) => chooseArray(flags[key], styleConfig[key], config[key], fallback);

  return {
    root,
    include: layeredGlobs('include', defaultInclude),
    exclude: layeredGlobs('exclude', defaultExclude),
    includeReferences: layered('includeReferences'),
    minWords: layeredNumber('minWords'),
    minChars: layeredNumber('minChars'),
    maxUnits: layeredNumber('maxUnits'),
    model: layered('model'),
    reasoningEffort: layered('reasoningEffort'),
    // There is no built-in default for the binary override.
    codexBin: flags.codexBin ?? styleConfig.codexBin,
  };
}
71
+
72
/**
 * Return the first argument that is a non-empty array.
 *
 * @param {...*} candidates - Values in priority order.
 * @returns {Array} The first non-empty array, or a new empty array.
 */
function chooseArray(...candidates) {
  const firstNonEmpty = candidates.find(
    (candidate) => Array.isArray(candidate) && candidate.length > 0,
  );

  return firstNonEmpty ?? [];
}
81
+
82
/**
 * Return the first argument that is neither null nor undefined and converts
 * to a number other than NaN.
 *
 * @param {...*} candidates - Values in priority order.
 * @returns {number|undefined} The converted number, or `undefined` when no
 *   candidate qualifies.
 */
function chooseNumber(...candidates) {
  const usable = candidates.find(
    (candidate) => candidate != null && !Number.isNaN(Number(candidate)),
  );

  return usable === undefined ? undefined : Number(usable);
}
91
+
92
/**
 * Return `path` unchanged when it is absolute, otherwise resolve it against
 * `root`.
 *
 * @param {string} root
 * @param {string} path
 * @returns {string}
 */
function resolvePath(root, path) {
  if (isAbsolute(path)) {
    return path;
  }

  return resolve(root, path);
}
@@ -0,0 +1,43 @@
1
// Markdown globs scanned when neither flags nor config provide `include`.
export const defaultInclude = [
  '*.md',
  'docs/**/*.md',
  '**/AGENTS.md',
  '.agents/skills/**/*.md',
  'packages/**/*.md',
  'rules/**/*.md',
  '.codex/**/*.md',
];

// Globs skipped when neither flags nor config provide `exclude`.
export const defaultExclude = [
  'node_modules/**',
  '.git/**',
  'dist/**',
  'coverage/**',
  '.tmp/**',
  'repos/**',
  'worktrees/**',
];

// Config file looked up in the repository root when `--config` is absent.
export const defaultConfigFile = 'agent-doc-rules.config.json';

// Unit filtering and Codex model settings used by both reviews.
const unitFilterDefaults = { minWords: 6, minChars: 40 };
const codexModelDefaults = { model: 'gpt-5-nano', reasoningEffort: 'low' };

// Built-in settings for the duplicate review.
export const duplicateDefaults = {
  includeReferences: false,
  includeSameFile: false,
  ignorePairs: [],
  warnScore: 0.78,
  failScore: 0.92,
  ...unitFilterDefaults,
  maxCandidates: 50,
  ...codexModelDefaults,
};

// Built-in settings for the style review.
export const styleDefaults = {
  includeReferences: false,
  ...unitFilterDefaults,
  maxUnits: 80,
  ...codexModelDefaults,
};
@@ -0,0 +1,148 @@
1
+ import { readFile } from 'node:fs/promises';
2
+ import { join } from 'node:path';
3
+ import fastGlob from 'fast-glob';
4
+ import { toString } from 'mdast-util-to-string';
5
+ import { split } from 'sentence-splitter';
6
+ import remarkParse from 'remark-parse';
7
+ import { unified } from 'unified';
8
+ import { visit } from 'unist-util-visit';
9
+
10
/**
 * Lists the Markdown files to review, relative to `root`.
 *
 * @param {object} options
 * @param {string} options.root - Directory the patterns are matched in.
 * @param {string | string[]} options.include - Patterns passed to fast-glob.
 * @param {Iterable<string>} options.exclude - Patterns to ignore; each is
 *   expanded by `expandExcludePatterns` before being passed to fast-glob.
 * @param {boolean} [options.includeReferences=false] - When false, files with
 *   a `references` path segment are dropped.
 * @returns {Promise<string[]>} Matching `.md` paths sorted with `localeCompare`.
 */
export async function resolveDuplicateFiles({ root, include, exclude, includeReferences = false }) {
  const globOptions = {
    cwd: root,
    dot: true,
    ignore: expandExcludePatterns(exclude),
    onlyFiles: true,
    unique: true,
  };
  const matches = await fastGlob(include, globOptions);

  const isReviewable = (file) => {
    if (!file.endsWith('.md')) {
      return false;
    }

    return includeReferences || !hasPathSegment(file, 'references');
  };

  return matches
    .filter(isReviewable)
    .sort((left, right) => left.localeCompare(right));
}
24
+
25
/**
 * Resolves the Markdown files for `options` and extracts text units from each.
 *
 * Files are read one after another, in the order returned by
 * `resolveDuplicateFiles`.
 *
 * @param {object} options - Passed as-is to `resolveDuplicateFiles`; `root`,
 *   `minWords` and `minChars` are also used here.
 * @returns {Promise<{ files: string[], units: object[] }>} The resolved file
 *   list and all extracted units.
 */
export async function loadMarkdownUnits(options) {
  const files = await resolveDuplicateFiles(options);
  const { root, minWords, minChars } = options;
  const units = [];

  for (const file of files) {
    const content = await readFile(join(root, file), 'utf8');
    const extracted = extractMarkdownUnits({ file, content, minWords, minChars });
    units.push(...extracted);
  }

  return { files, units };
}
41
+
42
/**
 * Parses Markdown and returns the sentence-level text units found in its
 * headings and paragraphs.
 *
 * @param {object} options
 * @param {string} options.file - File label stored on each unit and used in its id.
 * @param {string} options.content - Markdown source text.
 * @param {number} [options.minWords=6] - Minimum word count of a kept unit.
 * @param {number} [options.minChars=40] - Minimum normalized length of a kept unit.
 * @returns {Array<{ id: string, file: string, line: number, text: string,
 *   normalized: string, words: string[] }>} Units in document order.
 */
export function extractMarkdownUnits({ file, content, minWords = 6, minChars = 40 }) {
  const tree = unified().use(remarkParse).parse(content);
  const units = [];

  visit(tree, ['heading', 'paragraph'], (node) => {
    // Paragraphs whose raw source looks like a pipe table are not treated as prose.
    const looksLikeTable =
      node.type === 'paragraph' && isMarkdownTableBlock(sliceNodeContent(content, node));

    if (looksLikeTable) {
      return;
    }

    // Falls back to line 1 when the parser supplies no position.
    const line = node.position?.start?.line ?? 1;
    const text = normalizeWhitespace(toString(node));

    for (const sentence of splitIntoUnits(text)) {
      const normalized = normalizeForDuplicateCheck(sentence);
      const words = normalized.split(' ').filter(Boolean);

      if (!isUsefulUnit({ text: sentence, normalized, words, minWords, minChars })) {
        continue;
      }

      // The trailing counter is the unit's 1-based position within this file.
      units.push({
        id: `${file}:${line}:${units.length + 1}`,
        file,
        line,
        text: sentence,
        normalized,
        words,
      });
    }
  });

  return units;
}
72
+
73
/**
 * Produces the comparison form of a text: lower-cased, with Markdown and
 * punctuation characters removed and whitespace collapsed.
 *
 * @param {string} text - Text to normalize.
 * @returns {string} The normalized text.
 */
export function normalizeForDuplicateCheck(text) {
  const lowered = normalizeWhitespace(text).toLowerCase();
  const stripped = lowered.replace(/[`*_~[\](){}#>.,:;!?'"“”‘’]/g, '');

  // Removing a stand-alone punctuation token can leave two adjacent spaces.
  return stripped.replace(/\s+/g, ' ').trim();
}
80
+
81
/**
 * Splits a text into whitespace-normalized sentences.
 *
 * @param {string} text - Text to split.
 * @returns {string[]} The non-empty sentences, or `[text]` when the splitter
 *   yields none.
 */
function splitIntoUnits(text) {
  const sentences = [];

  for (const node of split(text)) {
    if (node.type !== 'Sentence') {
      continue;
    }

    const cleaned = normalizeWhitespace(node.raw);

    if (cleaned) {
      sentences.push(cleaned);
    }
  }

  return sentences.length === 0 ? [text] : sentences;
}
89
+
90
/**
 * Decides whether a text unit is long enough and prose-like enough to review.
 *
 * A unit is kept when its normalized text and word list meet the minimums and
 * at least 45% of the characters of the original text are letters, combining
 * marks or digits. Letters and digits are matched with Unicode property
 * escapes so that accented and non-Latin prose is not mistaken for noise, as
 * an ASCII-only `[a-z0-9]` count would do.
 *
 * @param {object} unit
 * @param {string} unit.text - Original sentence text.
 * @param {string} unit.normalized - Normalized form of `text`.
 * @param {string[]} unit.words - Words of the normalized form.
 * @param {number} unit.minWords - Minimum number of words.
 * @param {number} unit.minChars - Minimum length of `normalized`.
 * @returns {boolean} True when the unit should be kept.
 */
function isUsefulUnit({ text, normalized, words, minWords, minChars }) {
  if (normalized.length < minChars || words.length < minWords) {
    return false;
  }

  const wordCharCount = (text.match(/[\p{L}\p{M}\p{N}]/gu) ?? []).length;
  // Count code points so the denominator uses the same unit as the `u`-flag matches.
  const totalCharCount = Math.max([...text].length, 1);

  return wordCharCount / totalCharCount >= 0.45;
}
98
+
99
/**
 * Returns the part of `content` covered by a node's position offsets.
 *
 * @param {string} content - Full source text.
 * @param {{ position?: { start?: { offset?: number }, end?: { offset?: number } } }} node
 *   - Node carrying optional position data.
 * @returns {string} The covered slice, or `''` when either offset is not an integer.
 */
function sliceNodeContent(content, node) {
  const { start, end } = node.position ?? {};
  const from = start?.offset;
  const to = end?.offset;

  return Number.isInteger(from) && Number.isInteger(to) ? content.slice(from, to) : '';
}
109
+
110
/**
 * Checks whether raw Markdown text looks like a pipe table.
 *
 * The text qualifies when it has at least two non-blank lines, every line
 * starts and ends with `|` and splits into at least four parts on `|`, and at
 * least one line is a delimiter row such as `| --- | :---: |`.
 *
 * @param {string} raw - Raw Markdown source of a block.
 * @returns {boolean} True when the block looks like a table.
 */
function isMarkdownTableBlock(raw) {
  const rows = raw
    .split(/\r?\n/)
    .map((row) => row.trim())
    .filter(Boolean);

  if (rows.length < 2) {
    return false;
  }

  const isPipeRow = (row) => row.startsWith('|') && row.endsWith('|') && row.split('|').length >= 4;
  const isDelimiterRow = (row) => /^\|\s*:?-{3,}:?\s*(\|\s*:?-{3,}:?\s*)+\|$/.test(row);

  return rows.every(isPipeRow) && rows.some(isDelimiterRow);
}
127
+
128
/**
 * Collapses every run of whitespace into a single space and removes leading
 * and trailing whitespace.
 *
 * @param {string} text - Text to clean up.
 * @returns {string} The cleaned text.
 */
function normalizeWhitespace(text) {
  return text
    .split(/\s+/)
    .filter(Boolean)
    .join(' ');
}
131
+
132
/**
 * Tells whether one of the path's segments equals `segment` exactly.
 * Both `/` and `\` are treated as separators.
 *
 * @param {string} file - Path to inspect.
 * @param {string} segment - Segment name to look for.
 * @returns {boolean} True when the segment is present.
 */
function hasPathSegment(file, segment) {
  const parts = file.split(/[\\/]/);

  return parts.some((part) => part === segment);
}
135
+
136
+ function expandExcludePatterns(exclude) {
137
+ const expanded = [];
138
+
139
+ for (const pattern of exclude) {
140
+ expanded.push(pattern);
141
+
142
+ if (!pattern.startsWith('**/') && !pattern.startsWith('/')) {
143
+ expanded.push(`**/${pattern}`);
144
+ }
145
+ }
146
+
147
+ return [...new Set(expanded)];
148
+ }
package/src/style.mjs ADDED
@@ -0,0 +1,93 @@
1
+ import { runCodexStyleReviewer } from './codex.mjs';
2
+ import { loadMarkdownUnits } from './markdown.mjs';
3
+
4
/**
 * Runs the AI style review over the Markdown units selected by `options`.
 *
 * At most `options.maxUnits` units are sent to the reviewer. The reviewer is
 * not called when there is nothing to review.
 *
 * @param {object} options - Review options; passed to the unit loader and to
 *   the reviewer.
 * @param {object} [deps] - Optional replacements for collaborators.
 * @param {Function} [deps.loadMarkdownUnits] - Replaces the unit loader.
 * @param {Function} [deps.reviewStyle] - Replaces the Codex style reviewer.
 * @returns {Promise<{ code: number, files: string[], units: object[],
 *   reviewUnits: object[], findings: object[], report: string }>} `code` is 1
 *   when at least one finding has status `fail`, otherwise 0.
 */
export async function checkStyle(options, deps = {}) {
  const loadUnits = deps.loadMarkdownUnits ?? loadMarkdownUnits;
  const reviewStyle = deps.reviewStyle ?? runCodexStyleReviewer;

  const { files, units } = await loadUnits(options);
  const reviewUnits = units.slice(0, options.maxUnits);

  let findings = [];

  if (reviewUnits.length > 0) {
    const rawResult = await reviewStyle(reviewUnits, options);
    findings = normalizeStyleFindings({ reviewUnits, rawResult });
  }

  const hasFailure = findings.some((finding) => finding.status === 'fail');

  return {
    code: hasFailure ? 1 : 0,
    files,
    units,
    reviewUnits,
    findings,
    report: formatStyleReport({ files, units, reviewUnits, findings }),
  };
}
34
+
35
/**
 * Converts the raw reviewer result into report-ready findings.
 *
 * Only entries with status `fail` or `warn` are kept. Each kept entry is
 * joined with the reviewed unit that has the same id. When no unit matches,
 * the location falls back to file `unknown`, line 1 and an empty text.
 *
 * The raw result is treated defensively: a missing result, a `findings` value
 * that is not an array, and entries that are null or undefined all yield no
 * findings instead of throwing.
 *
 * @param {object} input
 * @param {Array<{ id: string, file: string, line: number, text: string }>} input.reviewUnits
 *   - Units that were sent for review.
 * @param {{ findings?: unknown } | null | undefined} input.rawResult - Parsed reviewer output.
 * @returns {Array<{ id: unknown, status: string, category: unknown, issue: unknown,
 *   suggestion: unknown, confidence: number, file: string, line: number, text: string }>}
 *   Findings in the reviewer's order.
 */
export function normalizeStyleFindings({ reviewUnits, rawResult }) {
  const unitsById = new Map(reviewUnits.map((unit) => [unit.id, unit]));
  const rawFindings = Array.isArray(rawResult?.findings) ? rawResult.findings : [];

  return rawFindings
    .filter((finding) => finding != null && ['fail', 'warn'].includes(finding.status))
    .map((finding) => {
      const unit = unitsById.get(finding.id);

      return {
        id: finding.id,
        status: finding.status,
        category: finding.category,
        issue: finding.issue,
        suggestion: finding.suggestion,
        confidence: Number(finding.confidence),
        file: unit?.file ?? 'unknown',
        line: unit?.line ?? 1,
        text: unit?.text ?? '',
      };
    });
}
56
+
57
/**
 * Renders the plain-text style review report.
 *
 * The report starts with file and unit counts. Findings follow with all
 * `fail` entries before all `warn` entries, and a summary line closes the
 * report. The text always ends with a newline.
 *
 * @param {object} input
 * @param {unknown[]} input.files - Reviewed files; only the count is shown.
 * @param {unknown[]} input.units - All text units; only the count is shown.
 * @param {unknown[]} input.reviewUnits - Units sent for review; only the count is shown.
 * @param {object[]} input.findings - Normalized findings.
 * @returns {string} The report text.
 */
export function formatStyleReport({ files, units, reviewUnits, findings }) {
  const header = [
    'Docs AI style review',
    `Files: ${files.length}`,
    `Text units: ${units.length}`,
    `Reviewed units: ${reviewUnits.length}`,
  ];

  if (findings.length === 0) {
    return `${[...header, 'No AI style findings.'].join('\n')}\n`;
  }

  const withStatus = (status) => findings.filter((finding) => finding.status === status);
  const failures = withStatus('fail');
  const warnings = withStatus('warn');

  // Each finding is preceded by a blank line.
  const details = [...failures, ...warnings].flatMap((finding) => [
    '',
    `[${finding.status}] ${finding.id} confidence=${formatConfidence(finding.confidence)} category=${finding.category}`,
    `${finding.file}:${finding.line}`,
    `Issue: ${finding.issue}`,
    `Suggestion: ${finding.suggestion}`,
  ]);

  const summary = `Summary: ${failures.length} fail, ${warnings.length} warn`;

  return `${[...header, ...details, '', summary].join('\n')}\n`;
}
90
+
91
/**
 * Formats a confidence value with two decimals.
 *
 * @param {unknown} confidence - Value to format.
 * @returns {string} The formatted number, or `n/a` when the value is not a
 *   finite number.
 */
function formatConfidence(confidence) {
  if (!Number.isFinite(confidence)) {
    return 'n/a';
  }

  return confidence.toFixed(2);
}