@buresmi7/agent-doc-rules-docs-duplicates 0.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +97 -0
- package/bin/agent-doc-rules-docs-duplicates.mjs +8 -0
- package/package.json +31 -0
- package/src/candidates.mjs +175 -0
- package/src/check.mjs +113 -0
- package/src/cli.mjs +120 -0
- package/src/codex.mjs +333 -0
- package/src/config.mjs +94 -0
- package/src/defaults.mjs +43 -0
- package/src/markdown.mjs +148 -0
- package/src/style.mjs +93 -0
package/README.md
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# Docs AI Review
|
|
2
|
+
|
|
3
|
+
`@buresmi7/agent-doc-rules-docs-duplicates` provides Codex-backed documentation review.
|
|
4
|
+
It checks likely semantic duplicates and can review Markdown sentences for
|
|
5
|
+
style issues.
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pnpm add -D @buresmi7/agent-doc-rules-docs-duplicates
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Command
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
agent-doc-rules-docs-duplicates check
|
|
17
|
+
agent-doc-rules-docs-duplicates style
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
`check` and `duplicates` run semantic duplicate review. `style` runs AI review
|
|
21
|
+
for Markdown sentences.
|
|
22
|
+
|
|
23
|
+
The command resolves the bundled `@openai/codex` binary from this package. It
|
|
24
|
+
does not rely on `codex` being present in `PATH`.
|
|
25
|
+
|
|
26
|
+
Default model settings for both AI checks:
|
|
27
|
+
|
|
28
|
+
- model: `gpt-5-nano`
|
|
29
|
+
- reasoning effort: `low`
|
|
30
|
+
|
|
31
|
+
`gpt-5-nano` is the default because duplicate review is a classification task
|
|
32
|
+
and OpenAI positions the nano GPT-5 variant as the fastest, lowest-cost GPT-5
|
|
33
|
+
option for tasks such as summarization and classification.
|
|
34
|
+
|
|
35
|
+
Use `--model <model>` or `agent-doc-rules.config.json` when your Codex account
|
|
36
|
+
does not expose the default model.
|
|
37
|
+
|
|
38
|
+
## Flow
|
|
39
|
+
|
|
40
|
+
1. Parse Markdown prose into text units.
|
|
41
|
+
2. Skip code blocks, short noise, and `references/` directories by default.
|
|
42
|
+
3. Build candidates with normalized exact matching, shingle overlap, word
|
|
43
|
+
overlap, and string similarity.
|
|
44
|
+
4. Remove candidates that match configured `ignorePairs`.
|
|
45
|
+
5. Send only candidate pairs to Codex.
|
|
46
|
+
6. Map structured Codex JSON to `fail`, `warn`, and `ok`.
|
|
47
|
+
|
|
48
|
+
`fail` returns a non-zero exit code. Warning-only results return zero.
|
|
49
|
+
|
|
50
|
+
## Style Review
|
|
51
|
+
|
|
52
|
+
Style review parses Markdown into sentence units, sends only those units to
|
|
53
|
+
Codex, and asks for `fail` or `warn` findings. It is meant for judgment calls
|
|
54
|
+
such as unclear workflow names, vague AI-like phrasing, long sentences, or
|
|
55
|
+
sentences that are understandable but need a maintainer rewrite.
|
|
56
|
+
|
|
57
|
+
Use deterministic wording checks for known banned terms. Use AI style review
|
|
58
|
+
when the question depends on the sentence.
|
|
59
|
+
|
|
60
|
+
## Config
|
|
61
|
+
|
|
62
|
+
Duplicate settings live under `docs.duplicates` in the root
|
|
63
|
+
`agent-doc-rules.config.json`. Command-line flags take precedence.
|
|
64
|
+
|
|
65
|
+
```json
|
|
66
|
+
{
|
|
67
|
+
"docs": {
|
|
68
|
+
"duplicates": {
|
|
69
|
+
"includeReferences": false,
|
|
70
|
+
"ignorePairs": [
|
|
71
|
+
{
|
|
72
|
+
"left": "^e2e/",
|
|
73
|
+
"right": "^e2e/",
|
|
74
|
+
"reason": "E2E fixtures intentionally repeat scenario facts."
|
|
75
|
+
}
|
|
76
|
+
],
|
|
77
|
+
"warnScore": 0.78,
|
|
78
|
+
"failScore": 0.92,
|
|
79
|
+
"model": "gpt-5-nano",
|
|
80
|
+
"reasoningEffort": "low"
|
|
81
|
+
},
|
|
82
|
+
"style": {
|
|
83
|
+
"includeReferences": false,
|
|
84
|
+
"maxUnits": 80,
|
|
85
|
+
"model": "gpt-5-nano",
|
|
86
|
+
"reasoningEffort": "low"
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
The duplicate-review workflow is derived from the earlier `meta-work`
|
|
93
|
+
documentation maintenance workflow, where deterministic duplicate candidates
|
|
94
|
+
were reviewed separately from Markdown and link checks.
|
|
95
|
+
|
|
96
|
+
See the skill package [Config Reference](../agent-doc-rules-skill/docs/config-reference.md)
|
|
97
|
+
for shared include, exclude, wording, and AI style settings.
|
package/package.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@buresmi7/agent-doc-rules-docs-duplicates",
|
|
3
|
+
"version": "0.8.2",
|
|
4
|
+
"private": false,
|
|
5
|
+
"description": "Semantic documentation duplicate checker powered by Codex",
|
|
6
|
+
"type": "module",
|
|
7
|
+
"license": "MIT",
|
|
8
|
+
"bin": {
|
|
9
|
+
"agent-doc-rules-docs-duplicates": "bin/agent-doc-rules-docs-duplicates.mjs"
|
|
10
|
+
},
|
|
11
|
+
"files": [
|
|
12
|
+
"bin",
|
|
13
|
+
"src",
|
|
14
|
+
"README.md"
|
|
15
|
+
],
|
|
16
|
+
"scripts": {
|
|
17
|
+
"test": "node --test test/*.test.mjs"
|
|
18
|
+
},
|
|
19
|
+
"publishConfig": {
|
|
20
|
+
"access": "public"
|
|
21
|
+
},
|
|
22
|
+
"dependencies": {
|
|
23
|
+
"@openai/codex": "^0.142.0",
|
|
24
|
+
"fast-glob": "^3.3.3",
|
|
25
|
+
"mdast-util-to-string": "^4.0.0",
|
|
26
|
+
"remark-parse": "^11.0.0",
|
|
27
|
+
"sentence-splitter": "^5.0.1",
|
|
28
|
+
"unified": "^11.0.5",
|
|
29
|
+
"unist-util-visit": "^5.0.0"
|
|
30
|
+
}
|
|
31
|
+
}
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
/**
 * Rank pairs of text units that look like duplicates of each other.
 *
 * Every unordered pair of units is scored with `scorePair`. Pairs from the
 * same file are skipped unless `includeSameFile` is set, and pairs whose files
 * match a configured ignore rule are always skipped.
 *
 * @param {Array<object>} units - Units exposing `file`, `line`, `text`, `words` and `normalized`.
 * @param {object} [options]
 * @param {boolean} [options.includeSameFile=false] - Also compare units of one file.
 * @param {Array<{left: string, right: string}>} [options.ignorePairs=[]] - Regex-string file rules.
 * @param {number} [options.warnScore=0.78] - Warning threshold; the candidate cutoff is capped at 0.72.
 * @param {number} [options.maxCandidates=50] - Maximum number of pairs returned.
 * @returns {Array<object>} Candidates sorted by descending score with ids `DUP-1`, `DUP-2`, ...
 */
export function findCandidatePairs(units, {
  includeSameFile = false,
  ignorePairs = [],
  warnScore = 0.78,
  maxCandidates = 50,
} = {}) {
  // Candidates are admitted at the lower of warnScore and 0.72.
  const cutoff = Math.min(warnScore, 0.72);
  const ignoreRules = normalizeIgnorePairs(ignorePairs);
  const found = [];

  for (let leftIndex = 0; leftIndex < units.length; leftIndex += 1) {
    const left = units[leftIndex];

    for (let rightIndex = leftIndex + 1; rightIndex < units.length; rightIndex += 1) {
      const right = units[rightIndex];
      const sameFileSkipped = !includeSameFile && left.file === right.file;

      if (sameFileSkipped || isIgnoredPair(left.file, right.file, ignoreRules)) {
        continue;
      }

      const { score, reason } = scorePair(left, right);
      const isExactMatch = reason === 'normalized exact match';

      if (!(score >= cutoff || isExactMatch)) {
        continue;
      }

      found.push({
        score,
        reason,
        left: pickUnit(left),
        right: pickUnit(right),
      });
    }
  }

  // Ids are assigned after ranking so they follow the final order.
  return found
    .sort((first, second) => second.score - first.score)
    .slice(0, maxCandidates)
    .map((candidate, position) => ({
      id: `DUP-${position + 1}`,
      ...candidate,
    }));
}
|
|
46
|
+
|
|
47
|
+
/**
 * Compile configured ignore rules into pairs of regular expressions.
 *
 * @param {Array<{left: string, right: string}>} [ignorePairs=[]] - Rules with regex source strings.
 * @returns {Array<{left: RegExp, right: RegExp}>} Compiled rules, in input order.
 * @throws {Error} When an entry lacks a `left` or `right` pattern.
 */
export function normalizeIgnorePairs(ignorePairs = []) {
  const compile = (entry) => {
    const hasBothSides = Boolean(entry?.left) && Boolean(entry?.right);

    if (!hasBothSides) {
      throw new Error('Duplicate ignore pairs must include left and right regex strings.');
    }

    const { left, right } = entry;

    return { left: new RegExp(left), right: new RegExp(right) };
  };

  return ignorePairs.map((entry) => compile(entry));
}
|
|
59
|
+
|
|
60
|
+
/**
 * Tell whether a pair of files is covered by any ignore rule, in either order.
 *
 * @param {string} leftFile - Path of the first file.
 * @param {string} rightFile - Path of the second file.
 * @param {Array<{left: RegExp, right: RegExp}>} [ignorePairs=[]] - Compiled ignore rules.
 * @returns {boolean} True when a rule matches the pair directly or swapped.
 */
export function isIgnoredPair(leftFile, rightFile, ignorePairs = []) {
  const matchesInOrder = (rule, first, second) => rule.left.test(first) && rule.right.test(second);

  return ignorePairs.some((rule) => (
    matchesInOrder(rule, leftFile, rightFile) || matchesInOrder(rule, rightFile, leftFile)
  ));
}
|
|
66
|
+
|
|
67
|
+
/**
 * Score how similar two text units are and say which heuristic decided it.
 *
 * Identical `normalized` strings score 1. Otherwise the score is the largest
 * of three metrics: 4-word shingle Jaccard, word overlap weighted by 0.96, and
 * character-bigram Dice weighted by 0.9.
 *
 * The reason is chosen from the same weighted values that produce the score,
 * so the label always names the metric that actually won. Ties prefer shingle
 * overlap, then word overlap.
 *
 * @param {{normalized: string, words: string[]}} left - First unit.
 * @param {{normalized: string, words: string[]}} right - Second unit.
 * @returns {{score: number, reason: string}} Similarity score and its reason label.
 */
export function scorePair(left, right) {
  if (left.normalized === right.normalized) {
    return { score: 1, reason: 'normalized exact match' };
  }

  const shingleScore = jaccard(shingles(left.words, 4), shingles(right.words, 4));
  const wordScore = overlap(left.words, right.words) * 0.96;
  const charScore = diceCoefficient(left.normalized, right.normalized) * 0.9;
  const score = Math.max(shingleScore, wordScore, charScore);

  if (shingleScore >= wordScore && shingleScore >= charScore) {
    return { score, reason: 'high shingle overlap' };
  }

  if (wordScore >= charScore) {
    return { score, reason: 'high word overlap' };
  }

  return { score, reason: 'high string similarity' };
}
|
|
87
|
+
|
|
88
|
+
/**
 * Keep only the fields of a unit that are reported for a candidate.
 *
 * @param {{file: *, line: *, text: *}} unit - Source unit.
 * @returns {{file: *, line: *, text: *}} New object with `file`, `line` and `text`.
 */
function pickUnit(unit) {
  const { file, line, text } = unit;

  return { file, line, text };
}
|
|
95
|
+
|
|
96
|
+
/**
 * Build the set of contiguous word windows ("shingles") of a given size.
 *
 * @param {string[]} words - Words in order.
 * @param {number} size - Number of words per window.
 * @returns {Set<string>} Space-joined windows, or the set of individual words
 *   when there are fewer words than `size`.
 */
function shingles(words, size) {
  const windowCount = words.length - size + 1;

  if (windowCount <= 0) {
    return new Set(words);
  }

  const windows = Array.from(
    { length: windowCount },
    (_, start) => words.slice(start, start + size).join(' '),
  );

  return new Set(windows);
}
|
|
109
|
+
|
|
110
|
+
/**
 * Jaccard index of two sets: shared members divided by the size of the union.
 *
 * @param {Set<*>} left - First set.
 * @param {Set<*>} right - Second set.
 * @returns {number} Value in [0, 1]; 0 when either set is empty.
 */
function jaccard(left, right) {
  if (left.size === 0 || right.size === 0) {
    return 0;
  }

  const shared = [...left].filter((value) => right.has(value)).length;
  const unionSize = left.size + right.size - shared;

  return shared / unionSize;
}
|
|
125
|
+
|
|
126
|
+
/**
 * Overlap coefficient of two word lists: the share of the smaller distinct
 * word set that also appears in the larger one.
 *
 * @param {string[]} leftWords - Words of the first unit.
 * @param {string[]} rightWords - Words of the second unit.
 * @returns {number} Value in [0, 1]; 0 when the smaller set is empty.
 */
function overlap(leftWords, rightWords) {
  const leftSet = new Set(leftWords);
  const rightSet = new Set(rightWords);
  const [smaller, larger] = leftSet.size < rightSet.size
    ? [leftSet, rightSet]
    : [rightSet, leftSet];

  if (smaller.size === 0) {
    return 0;
  }

  let shared = 0;

  smaller.forEach((word) => {
    if (larger.has(word)) {
      shared += 1;
    }
  });

  return shared / smaller.size;
}
|
|
146
|
+
|
|
147
|
+
/**
 * Dice coefficient of two strings over their distinct character bigrams.
 *
 * @param {string} left - First string.
 * @param {string} right - Second string.
 * @returns {number} Twice the shared bigram count divided by the combined
 *   bigram count; 0 when either string yields no bigrams.
 */
function diceCoefficient(left, right) {
  const leftPairs = bigrams(left);
  const rightPairs = bigrams(right);

  if (leftPairs.size === 0 || rightPairs.size === 0) {
    return 0;
  }

  const shared = [...leftPairs].reduce(
    (count, pair) => count + (rightPairs.has(pair) ? 1 : 0),
    0,
  );
  const combinedSize = leftPairs.size + rightPairs.size;

  return (2 * shared) / combinedSize;
}
|
|
165
|
+
|
|
166
|
+
/**
 * Collect the distinct two-character substrings of a string after collapsing
 * every whitespace run to a single space.
 *
 * @param {string} text - Input text.
 * @returns {Set<string>} Distinct bigrams; empty for inputs shorter than two characters.
 */
function bigrams(text) {
  const collapsed = text.replace(/\s+/g, ' ');
  const pairs = new Set();
  let start = 0;

  while (start + 1 < collapsed.length) {
    pairs.add(collapsed.slice(start, start + 2));
    start += 1;
  }

  return pairs;
}
|
package/src/check.mjs
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
import { findCandidatePairs } from './candidates.mjs';
|
|
2
|
+
import { runCodexClassifier } from './codex.mjs';
|
|
3
|
+
import { loadMarkdownUnits } from './markdown.mjs';
|
|
4
|
+
|
|
5
|
+
/**
 * Run the duplicate check: load units, pick candidate pairs, have them
 * classified, and build the textual report.
 *
 * The classifier is only invoked when at least one candidate exists.
 *
 * @param {object} options - Resolved duplicate-check options, passed through
 *   to the loader, the candidate finder and the classifier.
 * @param {object} [deps] - Optional overrides for the collaborators.
 * @param {Function} [deps.loadMarkdownUnits] - Replacement unit loader.
 * @param {Function} [deps.classifyCandidates] - Replacement classifier.
 * @returns {Promise<object>} `{ code, files, units, candidates, reviews, report }`,
 *   where `code` is 1 when any review has status `fail` and 0 otherwise.
 */
export async function checkDuplicates(options, deps = {}) {
  const loadUnits = deps.loadMarkdownUnits ?? loadMarkdownUnits;
  const classify = deps.classifyCandidates ?? runCodexClassifier;
  const { files, units } = await loadUnits(options);
  const candidates = findCandidatePairs(units, options);
  let reviews = [];

  if (candidates.length > 0) {
    const rawResult = await classify(candidates, options);
    reviews = normalizeReviews({ candidates, rawResult, options });
  }

  const hasFailure = reviews.some((review) => review.status === 'fail');

  return {
    code: hasFailure ? 1 : 0,
    files,
    units,
    candidates,
    reviews,
    report: formatReport({ files, units, candidates, reviews }),
  };
}
|
|
35
|
+
|
|
36
|
+
/**
 * Merge classifier output into the candidate list, one review per candidate.
 *
 * A candidate without a matching entry becomes a `warn` carrying its heuristic
 * score. Otherwise a recognised `status` from the classifier is used as is;
 * when it is not recognised, the status is derived from the classifier score
 * if that is a finite number, and is `warn` if it is not.
 *
 * @param {object} params
 * @param {Array<object>} params.candidates - Candidates carrying `id` and `score`.
 * @param {{matches?: Array<object>}} params.rawResult - Parsed classifier output.
 * @param {{warnScore: number, failScore: number}} params.options - Score thresholds.
 * @returns {Array<object>} Candidates extended with `status`, `duplicateScore`
 *   and `reviewReason`.
 */
export function normalizeReviews({ candidates, rawResult, options }) {
  const matchesById = new Map();

  for (const match of rawResult.matches ?? []) {
    matchesById.set(match.id, match);
  }

  return candidates.map((candidate) => {
    const match = matchesById.get(candidate.id);

    if (!match) {
      return {
        ...candidate,
        status: 'warn',
        duplicateScore: candidate.score,
        reviewReason: 'Codex did not classify this candidate.',
      };
    }

    const modelScore = Number(match.score);
    const hasModelScore = Number.isFinite(modelScore);
    let status = normalizeStatus(match.status);

    if (status == null) {
      status = hasModelScore ? statusFromScore(modelScore, options) : 'warn';
    }

    return {
      ...candidate,
      status,
      duplicateScore: hasModelScore ? modelScore : candidate.score,
      reviewReason: match.reason,
    };
  });
}
|
|
63
|
+
|
|
64
|
+
/**
 * Render the duplicate-check result as plain text.
 *
 * The report starts with file, unit and candidate counts. `fail` reviews are
 * listed first, then `warn` reviews; `ok` reviews only appear in the summary.
 *
 * @param {object} params
 * @param {Array} params.files - Scanned files (only the count is printed).
 * @param {Array} params.units - Extracted text units (only the count is printed).
 * @param {Array} params.candidates - Candidate pairs (only the count is printed).
 * @param {Array<object>} params.reviews - Reviews with `status`, `id`,
 *   `duplicateScore`, `left`, `right` and `reviewReason`.
 * @returns {string} Newline-terminated report.
 */
export function formatReport({ files, units, candidates, reviews }) {
  const render = (lines) => `${lines.join('\n')}\n`;
  const header = [
    'Docs semantic duplicate check',
    `Files: ${files.length}`,
    `Text units: ${units.length}`,
    `Candidates: ${candidates.length}`,
  ];

  if (candidates.length === 0) {
    return render([...header, 'No semantic duplicate candidates found.']);
  }

  const withStatus = (status) => reviews.filter((review) => review.status === status);
  const failures = withStatus('fail');
  const warnings = withStatus('warn');
  const accepted = withStatus('ok');

  const details = [...failures, ...warnings].flatMap((review) => [
    '',
    `[${review.status}] ${review.id} score=${review.duplicateScore.toFixed(2)}`,
    `${review.left.file}:${review.left.line}`,
    `${review.right.file}:${review.right.line}`,
    review.reviewReason,
  ]);

  return render([
    ...header,
    ...details,
    '',
    `Summary: ${failures.length} fail, ${warnings.length} warn, ${accepted.length} ok`,
  ]);
}
|
|
98
|
+
|
|
99
|
+
/**
 * Map a duplicate score to a status using the configured thresholds.
 *
 * @param {number} score - Duplicate confidence.
 * @param {{warnScore: number, failScore: number}} thresholds - Inclusive lower bounds.
 * @returns {'fail'|'warn'|'ok'} `fail` at or above `failScore`, otherwise
 *   `warn` at or above `warnScore`, otherwise `ok`.
 */
function statusFromScore(score, { warnScore, failScore }) {
  // Checked from strictest to loosest; the first bound reached wins.
  const ladder = [
    [failScore, 'fail'],
    [warnScore, 'warn'],
  ];
  const reached = ladder.find(([minimum]) => score >= minimum);

  return reached ? reached[1] : 'ok';
}
|
|
110
|
+
|
|
111
|
+
/**
 * Accept only the three known status labels.
 *
 * @param {*} status - Value reported by the classifier.
 * @returns {'fail'|'warn'|'ok'|null} The status when recognised, otherwise null.
 */
function normalizeStatus(status) {
  const isKnown = status === 'fail' || status === 'warn' || status === 'ok';

  return isKnown ? status : null;
}
|
package/src/cli.mjs
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
import { checkDuplicates } from './check.mjs';
|
|
2
|
+
import { resolveDuplicateOptions, resolveStyleOptions } from './config.mjs';
|
|
3
|
+
import { checkStyle } from './style.mjs';
|
|
4
|
+
|
|
5
|
+
/**
 * CLI entry point: parse arguments, run the selected review, print its report
 * and set the process exit code.
 *
 * `style` runs the style review; every other accepted command runs the
 * duplicate review. With the help flag only the usage text is printed.
 *
 * @param {string[]} [argv=process.argv.slice(2)] - Command-line arguments.
 * @returns {Promise<void>}
 */
export async function main(argv = process.argv.slice(2)) {
  const parsed = parseArgs(argv);

  if (parsed.help) {
    console.log(usage());
    return;
  }

  let result;

  if (parsed.command === 'style') {
    result = await checkStyle(await resolveStyleOptions(parsed));
  } else {
    result = await checkDuplicates(await resolveDuplicateOptions(parsed));
  }

  process.stdout.write(result.report);

  // exitCode is only touched on failure so a clean run keeps the default.
  if (result.code !== 0) {
    process.exitCode = result.code;
  }
}
|
|
25
|
+
|
|
26
|
+
/**
 * Parse command-line arguments into a flags object.
 *
 * With no command, or with `--help`/`-h` in first position, the result is
 * `{ command: 'check', help: true }`. Otherwise the first argument must be
 * `check`, `duplicates` or `style`, and the remaining arguments are options.
 * Numeric options are validated so a typo such as `--warn-score abc` is
 * rejected instead of becoming `NaN`.
 *
 * @param {string[]} argv - Arguments without the node and script paths.
 * @returns {object} Parsed flags; `include` and `exclude` are always arrays
 *   once a command was given.
 * @throws {Error} On an unknown command or option, a missing option value, or
 *   a non-numeric value for a numeric option.
 */
export function parseArgs(argv) {
  const [command, ...rest] = argv;

  if (!command || command === '--help' || command === '-h') {
    return { command: 'check', help: true };
  }

  if (!['check', 'duplicates', 'style'].includes(command)) {
    throw new Error(`Unknown command: ${command}`);
  }

  const parsed = {
    command,
    include: [],
    exclude: [],
  };

  for (let index = 0; index < rest.length; index += 1) {
    const arg = rest[index];

    switch (arg) {
      case '--root':
        parsed.root = readValue(rest, ++index, arg);
        break;
      case '--include':
        parsed.include.push(readValue(rest, ++index, arg));
        break;
      case '--exclude':
        parsed.exclude.push(readValue(rest, ++index, arg));
        break;
      case '--config':
        parsed.configPath = readValue(rest, ++index, arg);
        break;
      case '--include-references':
        parsed.includeReferences = true;
        break;
      case '--include-same-file':
        parsed.includeSameFile = true;
        break;
      case '--warn-score':
        parsed.warnScore = readNumber(rest, ++index, arg);
        break;
      case '--fail-score':
        parsed.failScore = readNumber(rest, ++index, arg);
        break;
      case '--min-words':
        parsed.minWords = readNumber(rest, ++index, arg);
        break;
      case '--min-chars':
        parsed.minChars = readNumber(rest, ++index, arg);
        break;
      case '--max-candidates':
        parsed.maxCandidates = readNumber(rest, ++index, arg);
        break;
      case '--max-units':
        parsed.maxUnits = readNumber(rest, ++index, arg);
        break;
      case '--model':
        parsed.model = readValue(rest, ++index, arg);
        break;
      case '--reasoning-effort':
        parsed.reasoningEffort = readValue(rest, ++index, arg);
        break;
      case '--codex-bin':
        parsed.codexBin = readValue(rest, ++index, arg);
        break;
      case '--help':
      case '-h':
        parsed.help = true;
        break;
      default:
        throw new Error(`Unknown option: ${arg}`);
    }
  }

  return parsed;
}

/**
 * Read the value that follows an option.
 *
 * @param {string[]} args - Option arguments.
 * @param {number} index - Position where the value is expected.
 * @param {string} option - Option name, used in the error message.
 * @returns {string} The value.
 * @throws {Error} When the value is absent, empty, or starts with `--`.
 */
function readValue(args, index, option) {
  const value = args[index];

  if (!value || value.startsWith('--')) {
    throw new Error(`Missing value for ${option}`);
  }

  return value;
}

/**
 * Read the value that follows an option and convert it to a finite number.
 *
 * @param {string[]} args - Option arguments.
 * @param {number} index - Position where the value is expected.
 * @param {string} option - Option name, used in error messages.
 * @returns {number} The parsed number.
 * @throws {Error} When the value is missing or is not a finite number.
 */
function readNumber(args, index, option) {
  const raw = readValue(args, index, option);
  const value = Number(raw);

  if (!Number.isFinite(value)) {
    throw new Error(`Invalid number for ${option}: ${raw}`);
  }

  return value;
}
|
|
95
|
+
|
|
96
|
+
/**
 * Build the help text printed for `--help`.
 *
 * NOTE(review): any column alignment between option names and descriptions is
 * not visible in this view of the file; lines are kept exactly as shown.
 *
 * @returns {string} Multi-line usage text without a trailing newline.
 */
function usage() {
  const commands = [
    'check Run semantic duplicate review. Same as duplicates.',
    'duplicates Run semantic duplicate review.',
    'style Run AI style review for Markdown sentences.',
  ];
  const options = [
    '--root <dir> Repository root. Defaults to current directory.',
    '--include <glob> Include Markdown glob. Repeatable.',
    '--exclude <glob> Exclude glob. Repeatable.',
    '--config <path> Config file. Defaults to agent-doc-rules.config.json.',
    '--include-references Include files in references/ directories.',
    '--include-same-file Compare units from the same file.',
    '--warn-score <number> Score threshold for warnings.',
    '--fail-score <number> Score threshold for failures.',
    '--min-words <number> Minimum words per prose unit.',
    '--min-chars <number> Minimum characters per prose unit.',
    '--max-candidates <number> Maximum candidate pairs sent to Codex.',
    '--max-units <number> Maximum sentence units sent to Codex for style review.',
    '--model <model> Codex model. Defaults to gpt-5-nano.',
    '--reasoning-effort <effort> Codex reasoning effort. Defaults to low.',
    '--codex-bin <path> Override Codex binary for local debugging.',
  ];

  return [
    'Usage: agent-doc-rules-docs-duplicates <command> [options]',
    '',
    'Commands:',
    ...commands,
    '',
    'Options:',
    ...options,
  ].join('\n');
}
|
package/src/codex.mjs
ADDED
|
@@ -0,0 +1,333 @@
|
|
|
1
|
+
import { spawn } from 'node:child_process';
|
|
2
|
+
import { existsSync, readFileSync } from 'node:fs';
|
|
3
|
+
import { mkdtemp, readFile, rm, writeFile } from 'node:fs/promises';
|
|
4
|
+
import { createRequire } from 'node:module';
|
|
5
|
+
import { tmpdir } from 'node:os';
|
|
6
|
+
import { dirname, join } from 'node:path';
|
|
7
|
+
|
|
8
|
+
const require = createRequire(import.meta.url);
|
|
9
|
+
|
|
10
|
+
/**
 * JSON Schema for the duplicate classifier's reply: an object holding a
 * `matches` array with one `{ id, score, status, reason }` entry per pair.
 * Key order is kept stable because the schema is serialized to a file.
 */
export const codexOutputSchema = {
  type: 'object',
  additionalProperties: false,
  properties: {
    matches: {
      type: 'array',
      items: {
        type: 'object',
        additionalProperties: false,
        properties: {
          id: { type: 'string' },
          // Duplicate confidence between 0 and 1.
          score: { type: 'number', minimum: 0, maximum: 1 },
          status: { type: 'string', enum: ['fail', 'warn', 'ok'] },
          reason: { type: 'string' },
        },
        required: ['id', 'score', 'status', 'reason'],
      },
    },
  },
  required: ['matches'],
};
|
|
31
|
+
|
|
32
|
+
/**
 * JSON Schema for the style reviewer's reply: an object holding a `findings`
 * array with one entry per reported sentence.
 * Key order is kept stable because the schema is serialized to a file.
 */
export const styleOutputSchema = {
  type: 'object',
  additionalProperties: false,
  properties: {
    findings: {
      type: 'array',
      items: {
        type: 'object',
        additionalProperties: false,
        properties: {
          id: { type: 'string' },
          status: { type: 'string', enum: ['fail', 'warn', 'ok'] },
          category: {
            type: 'string',
            enum: ['unclear', 'idiom', 'vague', 'ai-voice', 'too-long', 'passive', 'ok'],
          },
          issue: { type: 'string' },
          suggestion: { type: 'string' },
          // Reviewer confidence between 0 and 1.
          confidence: { type: 'number', minimum: 0, maximum: 1 },
        },
        required: ['id', 'status', 'category', 'issue', 'suggestion', 'confidence'],
      },
    },
  },
  required: ['findings'],
};
|
|
58
|
+
|
|
59
|
+
/**
 * Ask Codex to classify candidate duplicate pairs.
 *
 * The output schema is written to a fresh temporary directory, Codex is run
 * with the prompt on stdin, and its last message is read back from a file in
 * that directory. The directory is removed afterwards, also on failure.
 *
 * @param {Array<object>} candidates - Candidate pairs to classify.
 * @param {object} [options]
 * @param {string} [options.root] - Directory Codex is pointed at via `--cd`.
 * @param {string} [options.model] - Codex model name.
 * @param {string} [options.reasoningEffort] - Codex reasoning effort.
 * @param {string} [options.codexBin] - Explicit Codex executable.
 * @returns {Promise<object>} Parsed classifier response.
 */
export async function runCodexClassifier(candidates, {
  root,
  model,
  reasoningEffort,
  codexBin,
} = {}) {
  const tempDir = await mkdtemp(join(tmpdir(), 'docs-duplicates-codex-'));
  const [schemaFile, outputFile] = ['schema.json', 'last-message.json']
    .map((fileName) => join(tempDir, fileName));

  try {
    await writeFile(schemaFile, JSON.stringify(codexOutputSchema, null, 2));

    const prompt = buildCodexPrompt(candidates);
    const invocation = buildCodexInvocation({
      root,
      model,
      reasoningEffort,
      codexBin,
      schemaFile,
      outputFile,
    });

    await runCodex(invocation, prompt);

    const lastMessage = await readFile(outputFile, 'utf8');

    return parseCodexResponse(lastMessage);
  } finally {
    await rm(tempDir, { recursive: true, force: true });
  }
}
|
|
87
|
+
|
|
88
|
+
/**
 * Ask Codex to review sentence units for style problems.
 *
 * The output schema is written to a fresh temporary directory, Codex is run
 * with the prompt on stdin, and its last message is read back from a file in
 * that directory. The directory is removed afterwards, also on failure.
 *
 * @param {Array<object>} units - Sentence units to review.
 * @param {object} [options]
 * @param {string} [options.root] - Directory Codex is pointed at via `--cd`.
 * @param {string} [options.model] - Codex model name.
 * @param {string} [options.reasoningEffort] - Codex reasoning effort.
 * @param {string} [options.codexBin] - Explicit Codex executable.
 * @returns {Promise<object>} Parsed reviewer response.
 */
export async function runCodexStyleReviewer(units, {
  root,
  model,
  reasoningEffort,
  codexBin,
} = {}) {
  const tempDir = await mkdtemp(join(tmpdir(), 'docs-style-codex-'));
  const [schemaFile, outputFile] = ['schema.json', 'last-message.json']
    .map((fileName) => join(tempDir, fileName));

  try {
    await writeFile(schemaFile, JSON.stringify(styleOutputSchema, null, 2));

    const prompt = buildStylePrompt(units);
    const invocation = buildCodexInvocation({
      root,
      model,
      reasoningEffort,
      codexBin,
      schemaFile,
      outputFile,
    });

    await runCodex(invocation, prompt);

    const lastMessage = await readFile(outputFile, 'utf8');

    return parseCodexResponse(lastMessage);
  } finally {
    await rm(tempDir, { recursive: true, force: true });
  }
}
|
|
116
|
+
|
|
117
|
+
/**
 * Build the classification prompt for a list of candidate pairs.
 *
 * The prompt holds fixed instructions followed by one `## <id>` section per
 * candidate with its heuristic score, reason and both passages.
 *
 * @param {Array<object>} candidates - Candidates with `id`, `score`, `reason`,
 *   and `left`/`right` objects carrying `file`, `line` and `text`.
 * @returns {string} Prompt text.
 */
export function buildCodexPrompt(candidates) {
  const describeSide = (label, side) => `${label}: ${side.file}:${side.line}\n${side.text}`;

  const sections = candidates.map((candidate) => [
    `## ${candidate.id}`,
    '',
    `Heuristic score: ${candidate.score.toFixed(3)}`,
    `Heuristic reason: ${candidate.reason}`,
    '',
    describeSide('Left', candidate.left),
    '',
    describeSide('Right', candidate.right),
  ].join('\n'));

  const instructions = [
    'You are reviewing a small list of possible duplicate documentation passages.',
    '',
    'Classify only the candidate pairs shown below. Do not inspect the repository or',
    'invent additional pairs.',
    '',
    'Use these labels:',
    '',
    '- fail: the passages repeat the same durable rule, fact, or procedure and one',
    'should be deduplicated.',
    '- warn: the passages overlap enough for a maintainer to review, but the',
    'duplication may be acceptable.',
    '- ok: the passages are not a meaningful duplicate.',
    '',
    'Use warn, not fail, when repetition appears intentional in README summaries,',
    'templates, E2E fixtures, E2E criteria, reference indexes, or short routing',
    'pointers.',
    '',
    'Return JSON matching the provided schema. Use score as duplicate confidence from',
    '0.0 to 1.0.',
    '',
    '# Candidate Pairs',
    '',
  ].join('\n');

  return `${instructions}\n${sections.join('\n\n')}`;
}
|
|
153
|
+
|
|
154
|
+
/**
 * Build the style-review prompt for a list of sentence units.
 *
 * The prompt holds fixed instructions followed by one `## <id>` section per
 * unit with its location and text.
 *
 * @param {Array<object>} units - Units with `id`, `file`, `line` and `text`.
 * @returns {string} Prompt text.
 */
export function buildStylePrompt(units) {
  const sections = units.map((unit) => [
    `## ${unit.id}`,
    '',
    `Location: ${unit.file}:${unit.line}`,
    `${unit.text}`,
  ].join('\n'));

  const instructions = [
    'You are reviewing repository documentation sentence by sentence.',
    '',
    'Review only the sentences listed below. Do not inspect the repository and do not',
    'invent findings for text that is not shown.',
    '',
    'Use these labels:',
    '',
    '- fail: the sentence has a clear style problem that should block documentation',
    'changes, such as an unclear idiom, metaphorical workflow name, vague AI-like',
    'phrasing, or wording that makes the task hard to understand.',
    '- warn: the sentence is understandable but a maintainer should consider a',
    'clearer rewrite.',
    '- ok: the sentence is clear enough for repository documentation.',
    '',
    'Prefer concrete wording. Be strict about workflow, process, and section names',
    'that sound clever but do not explain the task. Do not flag paths, commands,',
    'package names, code identifiers, or necessary technical terms.',
    '',
    'Return only findings that are fail or warn. If every sentence is ok, return an',
    'empty findings array. Use confidence from 0.0 to 1.0.',
    '',
    '# Sentences',
    '',
  ].join('\n');

  return `${instructions}\n${sections.join('\n\n')}`;
}
|
|
185
|
+
|
|
186
|
+
/**
 * Assemble the command and argument list for a `codex exec` run that reads
 * its prompt from stdin.
 *
 * @param {object} params
 * @param {string} params.root - Directory passed to `--cd`.
 * @param {string} params.model - Model passed to `--model`.
 * @param {string} params.reasoningEffort - Value for `model_reasoning_effort`.
 * @param {string} [params.codexBin] - Explicit executable. When absent, the
 *   resolved Codex script is run through the current Node executable.
 * @param {string} params.schemaFile - Path passed to `--output-schema`.
 * @param {string} params.outputFile - Path passed to `--output-last-message`.
 * @returns {{command: string, args: string[]}} Spawn-ready invocation.
 */
export function buildCodexInvocation({
  root,
  model,
  reasoningEffort,
  codexBin,
  schemaFile,
  outputFile,
}) {
  // The effort is JSON-encoded so it reaches Codex as a quoted string value.
  const effortSetting = `model_reasoning_effort=${JSON.stringify(reasoningEffort)}`;
  const valuedFlags = [
    ['--sandbox', 'read-only'],
    ['--model', model],
    ['--config', effortSetting],
    ['--output-schema', schemaFile],
    ['--output-last-message', outputFile],
    ['--color', 'never'],
    ['--cd', root],
  ];
  const args = [
    'exec',
    '--skip-git-repo-check',
    '--ephemeral',
    '--ignore-rules',
    ...valuedFlags.flat(),
    '-',
  ];

  return codexBin
    ? { command: codexBin, args }
    : { command: process.execPath, args: [resolveCodexBin(), ...args] };
}
|
|
225
|
+
|
|
226
|
+
/**
 * Locate the `codex` executable script shipped by `@openai/codex`.
 *
 * Each candidate package.json is inspected for a `bin` entry, either a plain
 * string or the `codex` key of an object. The first entry whose file exists
 * on disk is returned.
 *
 * @returns {string} Path to the Codex binary script.
 * @throws {Error} When no candidate manifest yields an existing binary.
 */
export function resolveCodexBin() {
  for (const manifestPath of resolvePackageJsonPaths('@openai/codex')) {
    const { bin } = JSON.parse(readFileSync(manifestPath, 'utf8'));
    const relativeBin = typeof bin === 'string' ? bin : bin?.codex;

    if (relativeBin) {
      const binPath = join(dirname(manifestPath), relativeBin);

      if (existsSync(binPath)) {
        return binPath;
      }
    }
  }

  throw new Error('@openai/codex does not expose a codex binary.');
}
|
|
246
|
+
|
|
247
|
+
/**
 * Find package.json files belonging to an installed package.
 *
 * `<package>/package.json` is resolved directly first. If the package does
 * not export that path, the search starts at the directory of the package's
 * resolved entry point and walks up to the filesystem root, collecting every
 * package.json whose `name` equals `packageName`.
 *
 * @param {string} packageName - Package to look up.
 * @returns {string[]} Unique package.json paths; may be empty.
 * @throws {Error} Any resolution error other than `ERR_PACKAGE_PATH_NOT_EXPORTED`.
 */
export function resolvePackageJsonPaths(packageName) {
  try {
    return [require.resolve(`${packageName}/package.json`)];
  } catch (error) {
    if (error.code !== 'ERR_PACKAGE_PATH_NOT_EXPORTED') {
      throw error;
    }
  }

  const manifests = [];
  let directory = dirname(require.resolve(packageName));
  let visited;

  do {
    const manifestPath = join(directory, 'package.json');

    if (
      existsSync(manifestPath)
      && JSON.parse(readFileSync(manifestPath, 'utf8')).name === packageName
    ) {
      manifests.push(manifestPath);
    }

    visited = directory;
    directory = dirname(directory);
    // dirname() of the filesystem root returns the root itself, ending the walk.
  } while (directory !== visited);

  return [...new Set(manifests)];
}
|
|
286
|
+
|
|
287
|
+
/**
 * Parse the JSON document Codex wrote as its last message.
 *
 * The trimmed text is parsed as JSON first. If that fails and the text holds
 * a Markdown code fence, the contents of the first fence are parsed instead.
 * Every failure, including invalid JSON inside the fence, is reported with
 * the same error so callers see one consistent message; the underlying parse
 * error is attached as `cause`.
 *
 * @param {string} text - Raw last-message text.
 * @returns {*} Parsed JSON value.
 * @throws {Error} When neither the text nor its first fence is valid JSON.
 */
export function parseCodexResponse(text) {
  const trimmed = text.trim();
  const fenced = trimmed.match(/```(?:json)?\s*([\s\S]*?)```/);
  const attempts = fenced ? [trimmed, fenced[1]] : [trimmed];
  let lastError;

  for (const attempt of attempts) {
    try {
      return JSON.parse(attempt);
    } catch (error) {
      lastError = error;
    }
  }

  throw new Error('Codex did not return valid duplicate-check JSON.', { cause: lastError });
}
|
|
302
|
+
|
|
303
|
+
/**
 * Run Codex as a child process, feeding it the prompt on stdin.
 *
 * stdout and stderr are captured only to enrich the error raised on a
 * non-zero exit. `NO_COLOR` defaults to `1` unless the environment sets it.
 *
 * @param {{command: string, args: string[]}} invocation - Executable and arguments.
 * @param {string} prompt - Text written to the child's stdin.
 * @returns {Promise<void>} Resolves on exit code 0; rejects when the process
 *   cannot be spawned, exits with another code, or stdin fails unexpectedly.
 */
function runCodex({ command, args }, prompt) {
  return new Promise((resolve, reject) => {
    let stdout = '';
    let stderr = '';
    const child = spawn(command, args, {
      stdio: ['pipe', 'pipe', 'pipe'],
      env: {
        ...process.env,
        NO_COLOR: process.env.NO_COLOR ?? '1',
      },
    });

    // Decode as streams so multi-byte characters split across chunks survive.
    child.stdout.setEncoding('utf8');
    child.stderr.setEncoding('utf8');
    child.stdout.on('data', (chunk) => {
      stdout += chunk;
    });
    child.stderr.on('data', (chunk) => {
      stderr += chunk;
    });
    child.on('error', reject);
    child.on('close', (code) => {
      if (code === 0) {
        resolve();
      } else {
        const detail = [stderr.trim(), stdout.trim()].filter(Boolean).join('\n');
        reject(new Error(`Codex duplicate review failed with exit code ${code ?? 1}.\n${detail}`));
      }
    });

    // A child that exits before reading the prompt makes the write fail with
    // EPIPE. Without a listener that stream error would be unhandled and crash
    // the process; the 'close' handler already reports the real failure.
    child.stdin.on('error', (error) => {
      if (error.code !== 'EPIPE') {
        reject(error);
      }
    });
    child.stdin.end(prompt);
  });
}
|
package/src/config.mjs
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import { readFile } from 'node:fs/promises';
|
|
2
|
+
import { isAbsolute, resolve } from 'node:path';
|
|
3
|
+
import {
|
|
4
|
+
defaultConfigFile,
|
|
5
|
+
defaultExclude,
|
|
6
|
+
defaultInclude,
|
|
7
|
+
duplicateDefaults,
|
|
8
|
+
styleDefaults,
|
|
9
|
+
} from './defaults.mjs';
|
|
10
|
+
|
|
11
|
+
/**
 * Read the docs configuration from a JSON file.
 *
 * @param {object} [params]
 * @param {string} [params.root=process.cwd()] - Directory that relative paths are resolved against.
 * @param {string} [params.configPath] - Explicit config file; when omitted the
 *   default config file name inside `root` is used.
 * @returns {Promise<object>} The `docs` property of the parsed file when it is
 *   not null/undefined, otherwise the parsed file itself. An empty object is
 *   returned when no `configPath` was given and the default file does not exist.
 * @throws Any other read error (including a missing explicit `configPath`)
 *   and any JSON parse error.
 */
export async function loadDocsConfig({ root = process.cwd(), configPath } = {}) {
  const baseDir = resolve(root);
  const usingDefaultLocation = !configPath;
  const target = usingDefaultLocation
    ? resolve(baseDir, defaultConfigFile)
    : resolvePath(baseDir, configPath);

  let raw;

  try {
    raw = await readFile(target, 'utf8');
  } catch (error) {
    // A missing default file just means "no configuration".
    if (usingDefaultLocation && error.code === 'ENOENT') {
      return {};
    }

    throw error;
  }

  const parsed = JSON.parse(raw);
  return parsed.docs ?? parsed;
}
|
|
29
|
+
|
|
30
|
+
/**
 * Build the effective options for the duplicate check.
 *
 * Each setting is taken from the first source that provides it: CLI flags,
 * then the `duplicates` section of the loaded config, then (for include and
 * exclude only) the top level of the config, then the built-in defaults.
 * `ignorePairs` is not read from flags, and `codexBin` has no default.
 *
 * @param {object} [flags] - Parsed command-line flags.
 * @returns {Promise<object>} The resolved duplicate-check options.
 */
export async function resolveDuplicateOptions(flags = {}) {
  const root = resolve(flags.root ?? process.cwd());
  const config = await loadDocsConfig({ root, configPath: flags.configPath });
  const section = config.duplicates ?? {};

  const pickGlobs = (key, fallback) => chooseArray(flags[key], section[key], config[key], fallback);
  const pickValue = (key) => flags[key] ?? section[key] ?? duplicateDefaults[key];
  const pickNumber = (key) => chooseNumber(flags[key], section[key], duplicateDefaults[key]);

  return {
    root,
    include: pickGlobs('include', defaultInclude),
    exclude: pickGlobs('exclude', defaultExclude),
    includeReferences: pickValue('includeReferences'),
    includeSameFile: pickValue('includeSameFile'),
    ignorePairs: chooseArray(section.ignorePairs, duplicateDefaults.ignorePairs),
    warnScore: pickNumber('warnScore'),
    failScore: pickNumber('failScore'),
    minWords: pickNumber('minWords'),
    minChars: pickNumber('minChars'),
    maxCandidates: pickNumber('maxCandidates'),
    model: pickValue('model'),
    reasoningEffort: pickValue('reasoningEffort'),
    codexBin: flags.codexBin ?? section.codexBin,
  };
}
|
|
52
|
+
|
|
53
|
+
/**
 * Build the effective options for the style review.
 *
 * Each setting is taken from the first source that provides it: CLI flags,
 * then the `style` section of the loaded config, then (for include and
 * exclude only) the top level of the config, then the built-in defaults.
 * `codexBin` has no default.
 *
 * @param {object} [flags] - Parsed command-line flags.
 * @returns {Promise<object>} The resolved style-review options.
 */
export async function resolveStyleOptions(flags = {}) {
  const root = resolve(flags.root ?? process.cwd());
  const config = await loadDocsConfig({ root, configPath: flags.configPath });
  const section = config.style ?? {};

  const pickGlobs = (key, fallback) => chooseArray(flags[key], section[key], config[key], fallback);
  const pickValue = (key) => flags[key] ?? section[key] ?? styleDefaults[key];
  const pickNumber = (key) => chooseNumber(flags[key], section[key], styleDefaults[key]);

  return {
    root,
    include: pickGlobs('include', defaultInclude),
    exclude: pickGlobs('exclude', defaultExclude),
    includeReferences: pickValue('includeReferences'),
    minWords: pickNumber('minWords'),
    minChars: pickNumber('minChars'),
    maxUnits: pickNumber('maxUnits'),
    model: pickValue('model'),
    reasoningEffort: pickValue('reasoningEffort'),
    codexBin: flags.codexBin ?? section.codexBin,
  };
}
|
|
71
|
+
|
|
72
|
+
/**
 * Return the first candidate that is a non-empty array.
 *
 * @param {...*} candidates - Values in priority order.
 * @returns {Array} The winning array itself (not a copy), or a new empty
 *   array when no candidate qualifies.
 */
function chooseArray(...candidates) {
  const winner = candidates.find((candidate) => Array.isArray(candidate) && candidate.length > 0);
  return winner ?? [];
}
|
|
81
|
+
|
|
82
|
+
/**
 * Return the first candidate that represents a number.
 *
 * Only real numbers and non-blank numeric strings are accepted. Blank strings,
 * booleans, arrays and other values are skipped: `Number()` would coerce them
 * to 0 or 1, turning an empty or valueless setting into a threshold instead of
 * letting the next candidate (for example the default) apply.
 *
 * @param {...*} candidates - Values in priority order.
 * @returns {number|undefined} The numeric value of the first usable
 *   candidate, or `undefined` when none qualifies.
 */
function chooseNumber(...candidates) {
  for (const candidate of candidates) {
    const isNumber = typeof candidate === 'number';
    const isNonBlankString = typeof candidate === 'string' && candidate.trim() !== '';

    if (!isNumber && !isNonBlankString) {
      continue;
    }

    const value = Number(candidate);

    if (!Number.isNaN(value)) {
      return value;
    }
  }

  return undefined;
}
|
|
91
|
+
|
|
92
|
+
/**
 * Resolve `path` against `root` unless it is already absolute.
 *
 * @param {string} root - Base directory for relative paths.
 * @param {string} path - Absolute or relative path.
 * @returns {string} An absolute `path` is returned unchanged; a relative one
 *   is resolved against `root`.
 */
function resolvePath(root, path) {
  if (isAbsolute(path)) {
    return path;
  }

  return resolve(root, path);
}
|
package/src/defaults.mjs
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/** Glob patterns selecting the Markdown files reviewed when nothing else is configured. */
export const defaultInclude = [
  '*.md',
  'docs/**/*.md',
  '**/AGENTS.md',
  '.agents/skills/**/*.md',
  'packages/**/*.md',
  'rules/**/*.md',
  '.codex/**/*.md',
];

/** Glob patterns excluded from the review when nothing else is configured. */
export const defaultExclude = [
  'node_modules/**',
  '.git/**',
  'dist/**',
  'coverage/**',
  '.tmp/**',
  'repos/**',
  'worktrees/**',
];

/** File name of the configuration file looked up when no explicit path is given. */
export const defaultConfigFile = 'agent-doc-rules.config.json';

/** Fallback settings for the duplicate check. */
export const duplicateDefaults = {
  includeReferences: false,
  includeSameFile: false,
  ignorePairs: [],
  warnScore: 0.78,
  failScore: 0.92,
  minWords: 6,
  minChars: 40,
  maxCandidates: 50,
  model: 'gpt-5-nano',
  reasoningEffort: 'low',
};

/** Fallback settings for the style review. */
export const styleDefaults = {
  includeReferences: false,
  minWords: 6,
  minChars: 40,
  maxUnits: 80,
  model: 'gpt-5-nano',
  reasoningEffort: 'low',
};
|
package/src/markdown.mjs
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
import { readFile } from 'node:fs/promises';
|
|
2
|
+
import { join } from 'node:path';
|
|
3
|
+
import fastGlob from 'fast-glob';
|
|
4
|
+
import { toString } from 'mdast-util-to-string';
|
|
5
|
+
import { split } from 'sentence-splitter';
|
|
6
|
+
import remarkParse from 'remark-parse';
|
|
7
|
+
import { unified } from 'unified';
|
|
8
|
+
import { visit } from 'unist-util-visit';
|
|
9
|
+
|
|
10
|
+
/**
 * List the Markdown files to review, relative to `root`.
 *
 * @param {object} params
 * @param {string} params.root - Directory the glob patterns are evaluated in.
 * @param {string[]} params.include - Glob patterns to match.
 * @param {string[]} params.exclude - Glob patterns to ignore (expanded before use).
 * @param {boolean} [params.includeReferences=false] - Keep files that have a
 *   `references` path segment.
 * @returns {Promise<string[]>} Matching `.md` paths sorted with `localeCompare`.
 */
export async function resolveDuplicateFiles({ root, include, exclude, includeReferences = false }) {
  const matched = await fastGlob(include, {
    cwd: root,
    dot: true,
    ignore: expandExcludePatterns(exclude),
    onlyFiles: true,
    unique: true,
  });

  const isMarkdown = (file) => file.endsWith('.md');
  const isAllowedLocation = (file) => includeReferences || !hasPathSegment(file, 'references');
  const byLocale = (left, right) => left.localeCompare(right);

  return matched
    .filter((file) => isMarkdown(file) && isAllowedLocation(file))
    .sort(byLocale);
}
|
|
24
|
+
|
|
25
|
+
/**
 * Resolve the Markdown files for `options` and extract text units from each.
 *
 * Files are read one after another, in the order the file list returns them,
 * so the units keep that order.
 *
 * @param {object} options - File-selection options plus `root`, `minWords`
 *   and `minChars`.
 * @returns {Promise<{ files: string[], units: object[] }>} The resolved file
 *   list and every unit extracted from those files.
 */
export async function loadMarkdownUnits(options) {
  const files = await resolveDuplicateFiles(options);
  const units = [];

  for (const file of files) {
    const absolutePath = join(options.root, file);
    const content = await readFile(absolutePath, 'utf8');
    const fileUnits = extractMarkdownUnits({
      file,
      content,
      minWords: options.minWords,
      minChars: options.minChars,
    });

    for (const unit of fileUnits) {
      units.push(unit);
    }
  }

  return { files, units };
}
|
|
41
|
+
|
|
42
|
+
/**
 * Split the headings and paragraphs of a Markdown document into text units.
 *
 * Paragraphs whose raw source looks like a pipe table are skipped. Every
 * remaining heading or paragraph is flattened to text, split into sentences,
 * and each sentence that passes the usefulness check becomes one unit.
 *
 * @param {object} params
 * @param {string} params.file - File label stored on each unit and used in its id.
 * @param {string} params.content - Markdown source.
 * @param {number} [params.minWords=6] - Minimum word count passed to the usefulness check.
 * @param {number} [params.minChars=40] - Minimum normalized length passed to the usefulness check.
 * @returns {Array<{ id: string, file: string, line: number, text: string, normalized: string, words: string[] }>}
 *   Units in document order; `line` is the start line of the source node (1 when unknown).
 */
export function extractMarkdownUnits({ file, content, minWords = 6, minChars = 40 }) {
  const collected = [];
  const ast = unified().use(remarkParse).parse(content);

  visit(ast, ['heading', 'paragraph'], (node) => {
    const isTable = node.type === 'paragraph' && isMarkdownTableBlock(sliceNodeContent(content, node));

    if (isTable) {
      return;
    }

    const line = node.position?.start?.line ?? 1;
    const sentences = splitIntoUnits(normalizeWhitespace(toString(node)));

    for (const text of sentences) {
      const normalized = normalizeForDuplicateCheck(text);
      const words = normalized.split(' ').filter(Boolean);

      if (!isUsefulUnit({ text, normalized, words, minWords, minChars })) {
        continue;
      }

      // The id suffix is the unit's 1-based position within this document.
      collected.push({
        id: `${file}:${line}:${collected.length + 1}`,
        file,
        line,
        text,
        normalized,
        words,
      });
    }
  });

  return collected;
}
|
|
72
|
+
|
|
73
|
+
/**
 * Reduce text to a canonical form for comparison: whitespace collapsed,
 * lower-cased, and Markdown/punctuation characters removed.
 *
 * @param {string} text - Text to normalize.
 * @returns {string} The normalized text.
 */
export function normalizeForDuplicateCheck(text) {
  const lowered = normalizeWhitespace(text).toLowerCase();
  const withoutPunctuation = lowered.replace(/[`*_~[\](){}#>.,:;!?'"“”‘’]/g, '');

  // Removing punctuation can leave double spaces behind, so collapse again.
  const collapsed = withoutPunctuation.replace(/\s+/g, ' ');
  return collapsed.trim();
}
|
|
80
|
+
|
|
81
|
+
/**
 * Split text into whitespace-normalized sentences.
 *
 * @param {string} text - Text to split.
 * @returns {string[]} The non-empty sentences, or the original text as the
 *   only element when the splitter yields no sentence.
 */
function splitIntoUnits(text) {
  const sentences = [];

  for (const node of split(text)) {
    if (node.type !== 'Sentence') {
      continue;
    }

    const cleaned = normalizeWhitespace(node.raw);

    if (cleaned) {
      sentences.push(cleaned);
    }
  }

  return sentences.length === 0 ? [text] : sentences;
}
|
|
89
|
+
|
|
90
|
+
/**
 * Decide whether a sentence is substantial enough to review.
 *
 * A unit is rejected when its normalized form is shorter than `minChars`,
 * when it has fewer than `minWords` words, or when less than 45% of its
 * characters are letters, digits or combining marks.
 *
 * @param {object} params
 * @param {string} params.text - Original sentence.
 * @param {string} params.normalized - Normalized sentence.
 * @param {string[]} params.words - Words of the normalized sentence.
 * @param {number} params.minWords - Minimum word count.
 * @param {number} params.minChars - Minimum normalized length.
 * @returns {boolean} True when the unit should be kept.
 */
function isUsefulUnit({ text, normalized, words, minWords, minChars }) {
  if (normalized.length < minChars || words.length < minWords) {
    return false;
  }

  // Count letters and digits of every script, not just ASCII, so sentences
  // written with accented or non-Latin characters are not mistaken for symbol noise.
  const alphaNumericCount = (text.match(/[\p{L}\p{N}\p{M}]/gu) ?? []).length;

  // Measure the length in code points to match what the `u`-flag regex counts.
  const characterCount = Array.from(text).length;

  return alphaNumericCount / Math.max(characterCount, 1) >= 0.45;
}
|
|
98
|
+
|
|
99
|
+
/**
 * Return the raw source text covered by a syntax-tree node.
 *
 * @param {string} content - Full source the node was parsed from.
 * @param {object} node - Node whose `position.start.offset` and
 *   `position.end.offset` delimit the slice.
 * @returns {string} The source slice, or an empty string when either offset
 *   is missing or not an integer.
 */
function sliceNodeContent(content, node) {
  const from = node.position?.start?.offset;
  const to = node.position?.end?.offset;
  const hasOffsets = Number.isInteger(from) && Number.isInteger(to);

  return hasOffsets ? content.slice(from, to) : '';
}
|
|
109
|
+
|
|
110
|
+
/**
 * Detect whether a raw paragraph is really a pipe table.
 *
 * The block counts as a table when it has at least two non-blank lines, every
 * line starts and ends with `|` and splits into at least four `|`-separated
 * parts, and at least one line is a delimiter row of two or more dash cells
 * (three or more dashes each, optional alignment colons).
 *
 * @param {string} raw - Raw source of the paragraph.
 * @returns {boolean} True when the block looks like a table.
 */
function isMarkdownTableBlock(raw) {
  const rows = [];

  for (const line of raw.split(/\r?\n/)) {
    const trimmed = line.trim();

    if (trimmed) {
      rows.push(trimmed);
    }
  }

  if (rows.length < 2) {
    return false;
  }

  const looksLikeRow = (row) => row.startsWith('|') && row.endsWith('|') && row.split('|').length >= 4;
  const isDelimiterRow = (row) => /^\|\s*:?-{3,}:?\s*(\|\s*:?-{3,}:?\s*)+\|$/.test(row);

  return rows.every(looksLikeRow) && rows.some(isDelimiterRow);
}
|
|
127
|
+
|
|
128
|
+
/**
 * Collapse every run of whitespace to a single space and strip the ends.
 *
 * @param {string} text - Text to clean.
 * @returns {string} The text with single spaces between tokens and no
 *   leading or trailing whitespace.
 */
function normalizeWhitespace(text) {
  // Splitting on whitespace runs and dropping the empty edge pieces gives the
  // same result as replacing each run with one space and trimming.
  return text
    .split(/\s+/)
    .filter(Boolean)
    .join(' ');
}
|
|
131
|
+
|
|
132
|
+
/**
 * Check whether a path contains `segment` as one complete path component.
 *
 * @param {string} file - Path using `/` or `\` separators.
 * @param {string} segment - Component name to look for.
 * @returns {boolean} True when one of the path's components equals `segment`.
 */
function hasPathSegment(file, segment) {
  const parts = file.split(/[\\/]/);
  return parts.some((part) => part === segment);
}
|
|
135
|
+
|
|
136
|
+
/**
 * Expand exclude patterns so they also apply below the root directory.
 *
 * Each pattern is kept, and a `**​/`-prefixed copy is added unless the
 * pattern already starts with `**​/` or `/`.
 *
 * @param {string[]} exclude - Exclude glob patterns.
 * @returns {string[]} The expanded patterns without duplicates, in first-seen order.
 */
function expandExcludePatterns(exclude) {
  const patterns = new Set();

  for (const pattern of exclude) {
    patterns.add(pattern);

    const isAnchored = pattern.startsWith('**/') || pattern.startsWith('/');

    if (!isAnchored) {
      patterns.add(`**/${pattern}`);
    }
  }

  return [...patterns];
}
|
package/src/style.mjs
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
import { runCodexStyleReviewer } from './codex.mjs';
|
|
2
|
+
import { loadMarkdownUnits } from './markdown.mjs';
|
|
3
|
+
|
|
4
|
+
/**
 * Run the AI style review over the configured Markdown files.
 *
 * At most `options.maxUnits` units are sent to the reviewer; when there is
 * nothing to review the reviewer is not called at all.
 *
 * @param {object} options - Resolved style options, passed through to the
 *   unit loader and the reviewer.
 * @param {object} [deps] - Optional replacements for the collaborators.
 * @param {Function} [deps.loadMarkdownUnits] - Replacement unit loader.
 * @param {Function} [deps.reviewStyle] - Replacement reviewer.
 * @returns {Promise<{ code: number, files: string[], units: object[], reviewUnits: object[], findings: object[], report: string }>}
 *   `code` is 1 when at least one finding has status `fail`, otherwise 0.
 */
export async function checkStyle(options, deps = {}) {
  const loadUnits = deps.loadMarkdownUnits ?? loadMarkdownUnits;
  const reviewStyle = deps.reviewStyle ?? runCodexStyleReviewer;

  const { files, units } = await loadUnits(options);
  const reviewUnits = units.slice(0, options.maxUnits);

  let findings = [];

  if (reviewUnits.length > 0) {
    const rawResult = await reviewStyle(reviewUnits, options);
    findings = normalizeStyleFindings({ reviewUnits, rawResult });
  }

  const hasFailure = findings.some((finding) => finding.status === 'fail');

  return {
    code: hasFailure ? 1 : 0,
    files,
    units,
    reviewUnits,
    findings,
    report: formatStyleReport({ files, units, reviewUnits, findings }),
  };
}
|
|
34
|
+
|
|
35
|
+
/**
 * Turn the reviewer's raw result into report-ready findings.
 *
 * Only findings with status `fail` or `warn` are kept. Each one is joined to
 * its reviewed unit by id to pick up file, line and text; a finding whose id
 * matches no unit gets the placeholders `'unknown'`, `1` and `''`.
 *
 * The raw result comes from a model and is not trusted: a missing result, a
 * `findings` value that is not an array, and entries that are not objects are
 * ignored instead of raising a TypeError.
 *
 * @param {object} params
 * @param {Array<{ id: string, file: string, line: number, text: string }>} params.reviewUnits -
 *   Units that were sent for review.
 * @param {{ findings?: object[] }} [params.rawResult] - Parsed reviewer output.
 * @returns {object[]} Normalized findings in the reviewer's order; `confidence`
 *   is converted with `Number()` and may be `NaN`.
 */
export function normalizeStyleFindings({ reviewUnits, rawResult }) {
  const unitsById = new Map(reviewUnits.map((unit) => [unit.id, unit]));
  const rawFindings = Array.isArray(rawResult?.findings) ? rawResult.findings : [];

  return rawFindings
    .filter((finding) => ['fail', 'warn'].includes(finding?.status))
    .map((finding) => {
      const unit = unitsById.get(finding.id);

      return {
        id: finding.id,
        status: finding.status,
        category: finding.category,
        issue: finding.issue,
        suggestion: finding.suggestion,
        confidence: Number(finding.confidence),
        file: unit?.file ?? 'unknown',
        line: unit?.line ?? 1,
        text: unit?.text ?? '',
      };
    });
}
|
|
56
|
+
|
|
57
|
+
/**
 * Render the style review as plain text.
 *
 * The report starts with a header of counts. Without findings a single
 * "no findings" line follows; otherwise every `fail` finding is listed, then
 * every `warn` finding, followed by a summary line.
 *
 * @param {object} params
 * @param {string[]} params.files - Reviewed files.
 * @param {object[]} params.units - All extracted units.
 * @param {object[]} params.reviewUnits - Units actually sent for review.
 * @param {object[]} params.findings - Normalized findings.
 * @returns {string} The report, terminated by a newline.
 */
export function formatStyleReport({ files, units, reviewUnits, findings }) {
  const header = [
    'Docs AI style review',
    `Files: ${files.length}`,
    `Text units: ${units.length}`,
    `Reviewed units: ${reviewUnits.length}`,
  ];

  if (findings.length === 0) {
    return `${[...header, 'No AI style findings.'].join('\n')}\n`;
  }

  const withStatus = (status) => findings.filter((finding) => finding.status === status);
  const failures = withStatus('fail');
  const warnings = withStatus('warn');

  // Each finding is a blank separator line followed by four detail lines.
  const details = [...failures, ...warnings].flatMap((finding) => [
    '',
    `[${finding.status}] ${finding.id} confidence=${formatConfidence(finding.confidence)} category=${finding.category}`,
    `${finding.file}:${finding.line}`,
    `Issue: ${finding.issue}`,
    `Suggestion: ${finding.suggestion}`,
  ]);

  const footer = ['', `Summary: ${failures.length} fail, ${warnings.length} warn`];

  return `${[...header, ...details, ...footer].join('\n')}\n`;
}
|
|
90
|
+
|
|
91
|
+
/**
 * Format a confidence value for the report.
 *
 * @param {number} confidence - Confidence reported for a finding.
 * @returns {string} The value with two decimals, or `'n/a'` when it is not a
 *   finite number.
 */
function formatConfidence(confidence) {
  if (!Number.isFinite(confidence)) {
    return 'n/a';
  }

  return confidence.toFixed(2);
}
|