docrev 0.9.6 → 0.9.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +20 -0
- package/dev_notes/bug_repro_comment_parser.md +71 -0
- package/dist/lib/anchor-match.d.ts +41 -0
- package/dist/lib/anchor-match.d.ts.map +1 -0
- package/dist/lib/anchor-match.js +192 -0
- package/dist/lib/anchor-match.js.map +1 -0
- package/dist/lib/annotations.d.ts.map +1 -1
- package/dist/lib/annotations.js +8 -5
- package/dist/lib/annotations.js.map +1 -1
- package/dist/lib/commands/index.d.ts +2 -1
- package/dist/lib/commands/index.d.ts.map +1 -1
- package/dist/lib/commands/index.js +3 -1
- package/dist/lib/commands/index.js.map +1 -1
- package/dist/lib/commands/section-boundaries.d.ts +22 -0
- package/dist/lib/commands/section-boundaries.d.ts.map +1 -0
- package/dist/lib/commands/section-boundaries.js +53 -0
- package/dist/lib/commands/section-boundaries.js.map +1 -0
- package/dist/lib/commands/sync.d.ts.map +1 -1
- package/dist/lib/commands/sync.js +135 -0
- package/dist/lib/commands/sync.js.map +1 -1
- package/dist/lib/commands/verify-anchors.d.ts +17 -0
- package/dist/lib/commands/verify-anchors.d.ts.map +1 -0
- package/dist/lib/commands/verify-anchors.js +215 -0
- package/dist/lib/commands/verify-anchors.js.map +1 -0
- package/dist/lib/import.d.ts +14 -8
- package/dist/lib/import.d.ts.map +1 -1
- package/dist/lib/import.js +16 -144
- package/dist/lib/import.js.map +1 -1
- package/dist/lib/word-extraction.d.ts +23 -0
- package/dist/lib/word-extraction.d.ts.map +1 -1
- package/dist/lib/word-extraction.js +79 -0
- package/dist/lib/word-extraction.js.map +1 -1
- package/lib/anchor-match.ts +238 -0
- package/lib/annotations.ts +9 -5
- package/lib/commands/index.ts +3 -0
- package/lib/commands/section-boundaries.ts +72 -0
- package/lib/commands/sync.ts +165 -0
- package/lib/commands/verify-anchors.ts +261 -0
- package/lib/import.ts +29 -165
- package/lib/word-extraction.ts +93 -0
- package/package.json +1 -1
- package/skill/REFERENCE.md +29 -2
- package/skill/SKILL.md +12 -2
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* VERIFY-ANCHORS command: report drift between Word comment anchors
|
|
3
|
+
* and the current markdown.
|
|
4
|
+
*
|
|
5
|
+
* Useful when prose has been revised between sending the docx out for
|
|
6
|
+
* review and receiving it back. Each comment is classified by how well
|
|
7
|
+
* its anchor still matches the current section prose:
|
|
8
|
+
*
|
|
9
|
+
* clean – exact or whitespace-normalized hit
|
|
10
|
+
* drift – anchor only matches via stripped/partial fallbacks
|
|
11
|
+
* context-only – anchor text is gone, only surrounding context survives
|
|
12
|
+
* ambiguous – multiple matches, can't pick one without context
|
|
13
|
+
* unmatched – nothing maps; user must place the comment manually
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import {
|
|
17
|
+
chalk,
|
|
18
|
+
fs,
|
|
19
|
+
path,
|
|
20
|
+
fmt,
|
|
21
|
+
loadConfig,
|
|
22
|
+
jsonMode,
|
|
23
|
+
jsonOutput,
|
|
24
|
+
} from './context.js';
|
|
25
|
+
import type { Command } from 'commander';
|
|
26
|
+
import { findAnchorInText, classifyStrategy, type AnchorMatchQuality } from '../anchor-match.js';
|
|
27
|
+
import type { CommentAnchorData } from '../word-extraction.js';
|
|
28
|
+
import { computeSectionBoundaries } from './section-boundaries.js';
|
|
29
|
+
|
|
30
|
+
/** Options commander hands to the `verify-anchors` action. */
interface VerifyOptions {
  /** Sections config file (`-c, --config`); resolved against `dir`. */
  config: string;
  /** Directory that holds the section files (`-d, --dir`). */
  dir: string;
  /** When set, the report is emitted as JSON instead of the formatted table (`--json`). */
  json?: boolean;
}
|
|
35
|
+
|
|
36
|
+
/** One row of the verification report: how a single Word comment maps onto the markdown. */
interface CommentReport {
  /** Comment id, as used to look the anchor up in the extracted anchor map. */
  id: string;
  /** Comment author, copied from the extracted comment. */
  author: string;
  /** Comment body, copied from the extracted comment. */
  text: string;
  /** Section file the comment was assigned to; null when none could be determined. */
  section: string | null;
  /** Match classification; 'ambiguous' is assigned here when a clean match has several hits. */
  quality: AnchorMatchQuality | 'ambiguous';
  /** Matching strategy name, or 'no-anchor' / 'no-section' / 'missing-file' for early exits. */
  strategy: string;
  /** Anchor text taken from Word ('' when the comment has no anchor). */
  anchor: string;
  /** Number of candidate positions found in the section markdown. */
  occurrences: number;
}
|
|
46
|
+
|
|
47
|
+
/**
 * Build the report row for a comment that could not be matched at all.
 */
function unmatchedReport(
  c: { id: string; author: string; text: string },
  section: string | null,
  strategy: string,
  anchor: string,
): CommentReport {
  return {
    id: c.id,
    author: c.author,
    text: c.text,
    section,
    quality: 'unmatched',
    strategy,
    anchor,
    occurrences: 0,
  };
}

/**
 * Pick the section file whose [start, end) range contains `docPosition`.
 * Positions before the first boundary are attributed to the first section;
 * anything else yields null.
 */
function sectionFileAt(
  boundaries: ReadonlyArray<{ start: number; end: number; file: string }>,
  docPosition: number,
): string | null {
  const hit = boundaries.find(b => docPosition >= b.start && docPosition < b.end);
  if (hit) return hit.file;
  const first = boundaries[0];
  return first && docPosition < first.start ? first.file : null;
}

/**
 * Register the `verify-anchors` command on the CLI program.
 *
 * The action reads comments, comment anchors and headings from the given
 * docx, assigns each comment to a section file via the computed section
 * boundaries, searches the section markdown for the anchor, and reports the
 * match quality per comment (table by default, JSON with `--json`).
 *
 * Exits the process with code 1 when the docx or the sections config is
 * missing, or when the docx cannot be read.
 */
export function register(program: Command): void {
  program
    .command('verify-anchors')
    .description('Report drift between Word comment anchors and current markdown')
    .argument('<file>', 'Word document with reviewer comments (.docx)')
    .option('-c, --config <file>', 'Sections config file', 'sections.yaml')
    .option('-d, --dir <directory>', 'Directory with section files', '.')
    .option('--json', 'Output JSON report (for scripting)')
    .action(async (docxPath: string, options: VerifyOptions) => {
      const asJson = Boolean(options.json || jsonMode);

      if (!fs.existsSync(docxPath)) {
        console.error(fmt.status('error', `File not found: ${docxPath}`));
        process.exit(1);
      }

      const configPath = path.resolve(options.dir, options.config);
      if (!fs.existsSync(configPath)) {
        console.error(fmt.status('error', `Config not found: ${configPath}`));
        console.error(chalk.dim('  Run "rev init" first to generate sections.yaml'));
        process.exit(1);
      }

      const config = loadConfig(configPath);
      const { extractWordComments, extractCommentAnchors, extractHeadings } = await import('../import.js');

      let comments;
      let anchors;
      let headings;
      try {
        comments = await extractWordComments(docxPath);
        const result = await extractCommentAnchors(docxPath);
        anchors = result.anchors;
        headings = await extractHeadings(docxPath);
      } catch (err) {
        // Narrow instead of asserting: a non-Error throw still gets a readable message.
        const error = err instanceof Error ? err : new Error(String(err));
        console.error(fmt.status('error', `Failed to read ${path.basename(docxPath)}: ${error.message}`));
        if (process.env.DEBUG) console.error(error.stack);
        process.exit(1);
      }

      if (comments.length === 0) {
        if (asJson) {
          // Scripts consuming --json must always receive a JSON document,
          // even when there is nothing to report.
          jsonOutput({
            file: docxPath,
            totalComments: 0,
            summary: tally([]),
            comments: [],
          });
        } else {
          console.log(fmt.status('info', 'No comments found in document.'));
        }
        return;
      }

      const boundaries = computeSectionBoundaries(config.sections, headings);

      // Cache section markdown on first read; a missing file is cached as null
      // so it is not re-checked for every comment that falls into that section.
      const sectionCache = new Map<string, string | null>();
      const loadSection = (file: string): string | null => {
        const cached = sectionCache.get(file);
        if (cached !== undefined) return cached;
        const sectionPath = path.join(options.dir, file);
        const content = fs.existsSync(sectionPath) ? fs.readFileSync(sectionPath, 'utf-8') : null;
        sectionCache.set(file, content);
        return content;
      };

      const reports: CommentReport[] = [];

      for (const c of comments) {
        const anchor: CommentAnchorData | undefined = anchors.get(c.id);
        if (!anchor) {
          reports.push(unmatchedReport(c, null, 'no-anchor', ''));
          continue;
        }
        const anchorText = anchor.anchor || '';

        // Determine which section file this comment lives in
        const sectionFile = sectionFileAt(boundaries, anchor.docPosition);
        if (!sectionFile) {
          reports.push(unmatchedReport(c, null, 'no-section', anchorText));
          continue;
        }

        const md = loadSection(sectionFile);
        if (md === null) {
          reports.push(unmatchedReport(c, sectionFile, 'missing-file', anchorText));
          continue;
        }

        const search = findAnchorInText(anchor.anchor, md, anchor.before, anchor.after);
        const hits = search.occurrences.length;
        const classified = classifyStrategy(search.strategy, hits);
        // A "clean" strategy that hit more than once cannot be placed unambiguously.
        const quality: CommentReport['quality'] =
          classified === 'clean' && hits > 1 ? 'ambiguous' : classified;

        reports.push({
          id: c.id,
          author: c.author,
          text: c.text,
          section: sectionFile,
          quality,
          strategy: search.strategy,
          anchor: anchorText,
          occurrences: hits,
        });
      }

      if (asJson) {
        jsonOutput({
          file: docxPath,
          totalComments: comments.length,
          summary: tally(reports),
          comments: reports,
        });
        return;
      }

      printReport(docxPath, reports);
    });
}
|
|
196
|
+
|
|
197
|
+
/**
 * Count reports per quality bucket.
 *
 * The five known buckets are always present in the result (zero when unused);
 * any other quality value encountered gets its own key.
 */
function tally(reports: CommentReport[]): Record<string, number> {
  const counts: Record<string, number> = {
    clean: 0,
    drift: 0,
    'context-only': 0,
    ambiguous: 0,
    unmatched: 0,
  };
  reports.forEach(({ quality }) => {
    const seen = counts[quality] || 0;
    counts[quality] = seen + 1;
  });
  return counts;
}
|
|
202
|
+
|
|
203
|
+
/**
 * Print the human-readable verification report to stdout.
 *
 * Output is a header, a boxed summary of counts per quality bucket, and a
 * table listing every comment whose quality is not 'clean'. When all
 * comments are clean, a success line is printed instead of the table.
 *
 * @param docxPath Path of the reviewed docx; only its basename is shown.
 * @param reports  Per-comment results produced by the verify-anchors action.
 */
function printReport(docxPath: string, reports: CommentReport[]): void {
  console.log(fmt.header(`Anchor Verification: ${path.basename(docxPath)}`));
  console.log();

  const totals = tally(reports);
  // The "clean" line is always shown; the other buckets only when non-zero.
  const summaryLines: string[] = [`${chalk.green(totals.clean)} clean (anchor still matches)`];
  if (totals.drift) {
    summaryLines.push(`${chalk.cyan(totals.drift)} drifted (matched via fallback strategies)`);
  }
  if (totals['context-only']) {
    summaryLines.push(`${chalk.yellow(totals['context-only'])} context-only (anchor text gone, neighbors survive)`);
  }
  if (totals.ambiguous) {
    summaryLines.push(`${chalk.magenta(totals.ambiguous)} ambiguous (multiple candidate positions)`);
  }
  if (totals.unmatched) {
    summaryLines.push(`${chalk.red(totals.unmatched)} unmatched (manual placement needed)`);
  }
  console.log(fmt.box(summaryLines.join('\n'), { title: 'Summary', padding: 0 }));
  console.log();

  // Per-comment table for everything that isn't a clean direct hit
  const problems = reports.filter(r => r.quality !== 'clean');
  if (problems.length === 0) {
    console.log(fmt.status('success', 'All comment anchors match the current markdown.'));
    return;
  }

  const rows = problems.map(r => [
    chalk.dim(`#${r.id}`),
    qualityColor(r.quality),
    r.section ? chalk.bold(r.section) : chalk.dim('—'),
    chalk.dim(r.strategy),
    truncate(r.anchor, 35),
    truncate(r.text, 35),
  ]);

  console.log(fmt.table(
    ['ID', 'Quality', 'Section', 'Strategy', 'Anchor (Word)', 'Comment'],
    rows,
    { align: ['right', 'left', 'left', 'left', 'left', 'left'] },
  ));

  if (totals.unmatched > 0 || totals.ambiguous > 0) {
    console.log();
    console.log(chalk.dim('Comments flagged "unmatched" or "ambiguous" need manual placement.'));
    // The prose-preserving sync mode is documented as `--comments-only`;
    // the hint names that flag so users are pointed at an option that exists.
    console.log(chalk.dim('Run "rev sync --comments-only" to import the matched ones without touching prose.'));
  }
}
|
|
245
|
+
|
|
246
|
+
/**
 * Colorize a quality label for the report table.
 *
 * Known qualities map to a fixed color; 'context-only' is shortened to
 * 'context'. Any other value is returned unchanged and uncolored.
 */
function qualityColor(q: string): string {
  // Painters are thunks so only the matching chalk call is ever made.
  const painters = new Map<string, () => string>([
    ['clean', () => chalk.green('clean')],
    ['drift', () => chalk.cyan('drift')],
    ['context-only', () => chalk.yellow('context')],
    ['ambiguous', () => chalk.magenta('ambiguous')],
    ['unmatched', () => chalk.red('unmatched')],
  ]);
  const paint = painters.get(q);
  return paint ? paint() : q;
}
|
|
256
|
+
|
|
257
|
+
/**
 * Flatten a string to a single line and cap its length for table display.
 *
 * Runs of whitespace collapse to one space and the ends are trimmed. If
 * nothing remains (empty, missing, or whitespace-only input) a dimmed dash
 * placeholder is returned so the table cell is never blank. Text longer than
 * `max` is cut to `max - 1` characters plus an ellipsis.
 *
 * @param s   Text to display.
 * @param max Maximum number of characters in the result, ellipsis included.
 * @returns The flattened, possibly shortened text, or the dash placeholder.
 */
function truncate(s: string, max: number): string {
  // Check emptiness after flattening so whitespace-only input also gets the placeholder.
  const flat = (s ?? '').replace(/\s+/g, ' ').trim();
  if (!flat) return chalk.dim('—');
  if (flat.length <= max) return flat;
  // Clamp the cut so a max of 0 or less cannot turn into a negative slice index.
  return flat.slice(0, Math.max(0, max - 1)) + '…';
}
|
package/lib/import.ts
CHANGED
|
@@ -36,12 +36,14 @@ import {
|
|
|
36
36
|
parseVisibleComments,
|
|
37
37
|
convertVisibleComments,
|
|
38
38
|
} from './restore-references.js';
|
|
39
|
+
import { findAnchorInText } from './anchor-match.js';
|
|
39
40
|
|
|
40
41
|
// Re-export everything so existing imports from './import.js' still work
|
|
41
42
|
export {
|
|
42
43
|
extractFromWord,
|
|
43
44
|
extractWordComments,
|
|
44
45
|
extractCommentAnchors,
|
|
46
|
+
extractHeadings,
|
|
45
47
|
extractWordTables,
|
|
46
48
|
} from './word-extraction.js';
|
|
47
49
|
export type {
|
|
@@ -49,6 +51,7 @@ export type {
|
|
|
49
51
|
TextNode,
|
|
50
52
|
CommentAnchorData,
|
|
51
53
|
CommentAnchorsResult,
|
|
54
|
+
DocxHeading,
|
|
52
55
|
WordTable,
|
|
53
56
|
ParsedRow,
|
|
54
57
|
ExtractFromWordOptions,
|
|
@@ -86,6 +89,17 @@ const execAsync = promisify(exec);
|
|
|
86
89
|
export interface InsertCommentsOptions {
|
|
87
90
|
quiet?: boolean;
|
|
88
91
|
sectionBoundary?: { start: number; end: number } | null;
|
|
92
|
+
/**
|
|
93
|
+
* When true (default), comments wrap their anchor text in `[anchor]{.mark}`
|
|
94
|
+
* so the rebuilt docx restores the original Word comment range. When false,
|
|
95
|
+
* comments are inserted as standalone `{>>...<<}` blocks adjacent to the
|
|
96
|
+
* anchor — the prose stays byte-identical except for the inserted blocks.
|
|
97
|
+
*
|
|
98
|
+
* Set to false from `sync --comments-only` so a draft revised after the
|
|
99
|
+
* docx was sent for review keeps its prose intact, and so multiple
|
|
100
|
+
* comments sharing one anchor don't produce nested broken markup.
|
|
101
|
+
*/
|
|
102
|
+
wrapAnchor?: boolean;
|
|
89
103
|
}
|
|
90
104
|
|
|
91
105
|
export interface CommentWithPos {
|
|
@@ -100,12 +114,7 @@ export interface CommentWithPos {
|
|
|
100
114
|
strategy?: string;
|
|
101
115
|
}
|
|
102
116
|
|
|
103
|
-
export
|
|
104
|
-
occurrences: number[];
|
|
105
|
-
matchedAnchor: string | null;
|
|
106
|
-
strategy: string;
|
|
107
|
-
stripped?: boolean;
|
|
108
|
-
}
|
|
117
|
+
export type { AnchorSearchResult } from './anchor-match.js';
|
|
109
118
|
|
|
110
119
|
export interface MarkdownPrefixResult {
|
|
111
120
|
prefix: string;
|
|
@@ -175,165 +184,14 @@ export function insertCommentsIntoMarkdown(
|
|
|
175
184
|
anchors: Map<string, CommentAnchorData | string>,
|
|
176
185
|
options: InsertCommentsOptions = {}
|
|
177
186
|
): string {
|
|
178
|
-
const { quiet = false, sectionBoundary = null } = options;
|
|
187
|
+
const { quiet = false, sectionBoundary = null, wrapAnchor = true } = options;
|
|
179
188
|
let result = markdown;
|
|
180
189
|
let unmatchedCount = 0;
|
|
181
190
|
const duplicateWarnings: string[] = [];
|
|
182
191
|
const usedPositions = new Set<number>(); // For tie-breaking: track used positions
|
|
183
192
|
|
|
184
|
-
//
|
|
185
|
-
|
|
186
|
-
return text
|
|
187
|
-
.replace(/\{\+\+([^+]*)\+\+\}/g, '$1') // insertions: keep inserted text
|
|
188
|
-
.replace(/\{--([^-]*)--\}/g, '') // deletions: remove deleted text
|
|
189
|
-
.replace(/\{~~([^~]*)~>([^~]*)~~\}/g, '$2') // substitutions: keep new text
|
|
190
|
-
.replace(/\{>>[^<]*<<\}/g, '') // comments: remove
|
|
191
|
-
.replace(/\[([^\]]*)\]\{\.mark\}/g, '$1'); // marked text: keep text
|
|
192
|
-
}
|
|
193
|
-
|
|
194
|
-
// Helper: Find anchor in text with multiple fallback strategies
|
|
195
|
-
function findAnchorInText(anchor: string, text: string, before: string = '', after: string = ''): AnchorSearchResult {
|
|
196
|
-
// If anchor is empty, skip directly to context-based matching
|
|
197
|
-
if (!anchor || anchor.trim().length === 0) {
|
|
198
|
-
// Jump to context-based strategies (Strategy 5)
|
|
199
|
-
if (before || after) {
|
|
200
|
-
const beforeLower = (before || '').toLowerCase();
|
|
201
|
-
const afterLower = (after || '').toLowerCase();
|
|
202
|
-
const textLower = text.toLowerCase();
|
|
203
|
-
|
|
204
|
-
if (before && after) {
|
|
205
|
-
const beforeIdx = textLower.indexOf(beforeLower.slice(-50));
|
|
206
|
-
if (beforeIdx !== -1) {
|
|
207
|
-
const searchStart = beforeIdx + beforeLower.slice(-50).length;
|
|
208
|
-
const afterIdx = textLower.indexOf(afterLower.slice(0, 50), searchStart);
|
|
209
|
-
if (afterIdx !== -1 && afterIdx - searchStart < 500) {
|
|
210
|
-
return { occurrences: [searchStart], matchedAnchor: null, strategy: 'context-both' };
|
|
211
|
-
}
|
|
212
|
-
}
|
|
213
|
-
}
|
|
214
|
-
|
|
215
|
-
if (before) {
|
|
216
|
-
const beforeIdx = textLower.lastIndexOf(beforeLower.slice(-30));
|
|
217
|
-
if (beforeIdx !== -1) {
|
|
218
|
-
return { occurrences: [beforeIdx + beforeLower.slice(-30).length], matchedAnchor: null, strategy: 'context-before' };
|
|
219
|
-
}
|
|
220
|
-
}
|
|
221
|
-
|
|
222
|
-
if (after) {
|
|
223
|
-
const afterIdx = textLower.indexOf(afterLower.slice(0, 30));
|
|
224
|
-
if (afterIdx !== -1) {
|
|
225
|
-
return { occurrences: [afterIdx], matchedAnchor: null, strategy: 'context-after' };
|
|
226
|
-
}
|
|
227
|
-
}
|
|
228
|
-
}
|
|
229
|
-
return { occurrences: [], matchedAnchor: null, strategy: 'empty-anchor' };
|
|
230
|
-
}
|
|
231
|
-
|
|
232
|
-
const anchorLower = anchor.toLowerCase();
|
|
233
|
-
const textLower = text.toLowerCase();
|
|
234
|
-
|
|
235
|
-
// Strategy 1: Direct match
|
|
236
|
-
let occurrences = findAllOccurrences(textLower, anchorLower);
|
|
237
|
-
if (occurrences.length > 0) {
|
|
238
|
-
return { occurrences, matchedAnchor: anchor, strategy: 'direct' };
|
|
239
|
-
}
|
|
240
|
-
|
|
241
|
-
// Strategy 2: Normalized whitespace
|
|
242
|
-
const normalizedAnchor = anchor.replace(/\s+/g, ' ').toLowerCase();
|
|
243
|
-
const normalizedText = text.replace(/\s+/g, ' ').toLowerCase();
|
|
244
|
-
let idx = normalizedText.indexOf(normalizedAnchor);
|
|
245
|
-
if (idx !== -1) {
|
|
246
|
-
return { occurrences: [idx], matchedAnchor: anchor, strategy: 'normalized' };
|
|
247
|
-
}
|
|
248
|
-
|
|
249
|
-
// Strategy 3: Try matching in stripped CriticMarkup version
|
|
250
|
-
const strippedText = stripCriticMarkup(text);
|
|
251
|
-
const strippedLower = strippedText.toLowerCase();
|
|
252
|
-
occurrences = findAllOccurrences(strippedLower, anchorLower);
|
|
253
|
-
if (occurrences.length > 0) {
|
|
254
|
-
return { occurrences, matchedAnchor: anchor, strategy: 'stripped', stripped: true };
|
|
255
|
-
}
|
|
256
|
-
|
|
257
|
-
// Strategy 4: First N words of anchor (for long anchors)
|
|
258
|
-
const words = anchor.split(/\s+/);
|
|
259
|
-
if (words.length > 3) {
|
|
260
|
-
for (let n = Math.min(6, words.length); n >= 3; n--) {
|
|
261
|
-
const partialAnchor = words.slice(0, n).join(' ').toLowerCase();
|
|
262
|
-
if (partialAnchor.length >= 15) {
|
|
263
|
-
occurrences = findAllOccurrences(textLower, partialAnchor);
|
|
264
|
-
if (occurrences.length > 0) {
|
|
265
|
-
return { occurrences, matchedAnchor: words.slice(0, n).join(' '), strategy: 'partial-start' };
|
|
266
|
-
}
|
|
267
|
-
occurrences = findAllOccurrences(strippedLower, partialAnchor);
|
|
268
|
-
if (occurrences.length > 0) {
|
|
269
|
-
return { occurrences, matchedAnchor: words.slice(0, n).join(' '), strategy: 'partial-start-stripped', stripped: true };
|
|
270
|
-
}
|
|
271
|
-
}
|
|
272
|
-
}
|
|
273
|
-
}
|
|
274
|
-
|
|
275
|
-
// Strategy 5: Use context (before/after) to find approximate position
|
|
276
|
-
if (before || after) {
|
|
277
|
-
const beforeLower = before.toLowerCase();
|
|
278
|
-
const afterLower = after.toLowerCase();
|
|
279
|
-
|
|
280
|
-
if (before && after) {
|
|
281
|
-
const beforeIdx = textLower.indexOf(beforeLower.slice(-50));
|
|
282
|
-
if (beforeIdx !== -1) {
|
|
283
|
-
const searchStart = beforeIdx + beforeLower.slice(-50).length;
|
|
284
|
-
const afterIdx = textLower.indexOf(afterLower.slice(0, 50), searchStart);
|
|
285
|
-
if (afterIdx !== -1 && afterIdx - searchStart < 500) {
|
|
286
|
-
return { occurrences: [searchStart], matchedAnchor: null, strategy: 'context-both' };
|
|
287
|
-
}
|
|
288
|
-
}
|
|
289
|
-
}
|
|
290
|
-
|
|
291
|
-
if (before) {
|
|
292
|
-
const beforeIdx = textLower.lastIndexOf(beforeLower.slice(-30));
|
|
293
|
-
if (beforeIdx !== -1) {
|
|
294
|
-
return { occurrences: [beforeIdx + beforeLower.slice(-30).length], matchedAnchor: null, strategy: 'context-before' };
|
|
295
|
-
}
|
|
296
|
-
}
|
|
297
|
-
|
|
298
|
-
if (after) {
|
|
299
|
-
const afterIdx = textLower.indexOf(afterLower.slice(0, 30));
|
|
300
|
-
if (afterIdx !== -1) {
|
|
301
|
-
return { occurrences: [afterIdx], matchedAnchor: null, strategy: 'context-after' };
|
|
302
|
-
}
|
|
303
|
-
}
|
|
304
|
-
}
|
|
305
|
-
|
|
306
|
-
// Strategy 6: Try splitting anchor on common transition words
|
|
307
|
-
const splitPatterns = [' ', ', ', '. ', ' - ', ' – '];
|
|
308
|
-
for (const sep of splitPatterns) {
|
|
309
|
-
if (anchor.includes(sep)) {
|
|
310
|
-
const parts = anchor.split(sep).filter(p => p.length >= 4);
|
|
311
|
-
for (const part of parts) {
|
|
312
|
-
const partLower = part.toLowerCase();
|
|
313
|
-
occurrences = findAllOccurrences(textLower, partLower);
|
|
314
|
-
if (occurrences.length > 0 && occurrences.length < 5) {
|
|
315
|
-
return { occurrences, matchedAnchor: part, strategy: 'split-match' };
|
|
316
|
-
}
|
|
317
|
-
}
|
|
318
|
-
}
|
|
319
|
-
}
|
|
320
|
-
|
|
321
|
-
return { occurrences: [], matchedAnchor: null, strategy: 'failed' };
|
|
322
|
-
}
|
|
323
|
-
|
|
324
|
-
// Helper: Find all occurrences of needle in haystack
|
|
325
|
-
function findAllOccurrences(haystack: string, needle: string): number[] {
|
|
326
|
-
if (!needle || needle.length === 0) {
|
|
327
|
-
return [];
|
|
328
|
-
}
|
|
329
|
-
const occurrences: number[] = [];
|
|
330
|
-
let idx = 0;
|
|
331
|
-
while ((idx = haystack.indexOf(needle, idx)) !== -1) {
|
|
332
|
-
occurrences.push(idx);
|
|
333
|
-
idx += 1;
|
|
334
|
-
}
|
|
335
|
-
return occurrences;
|
|
336
|
-
}
|
|
193
|
+
// Anchor matching primitives live in lib/anchor-match.ts so that
|
|
194
|
+
// `rev verify-anchors` can use the same strategies for drift reporting.
|
|
337
195
|
|
|
338
196
|
// Get all positions in order (for sequential tie-breaking)
|
|
339
197
|
const commentsWithPositions = comments.map((c): CommentWithPos => {
|
|
@@ -489,18 +347,24 @@ export function insertCommentsIntoMarkdown(
|
|
|
489
347
|
// Sort by position descending (insert from end to avoid offset issues)
|
|
490
348
|
matched.sort((a, b) => b.pos - a.pos);
|
|
491
349
|
|
|
492
|
-
// Insert each comment
|
|
350
|
+
// Insert each comment. With `wrapAnchor` (the default), the anchor text
|
|
351
|
+
// gets wrapped in `[anchor]{.mark}` so the rebuilt docx restores the
|
|
352
|
+
// original Word comment range. Without it, the comment block is inserted
|
|
353
|
+
// adjacent to the anchor and prose stays untouched — required for
|
|
354
|
+
// comments-only sync where multiple comments may share one anchor.
|
|
493
355
|
for (const c of matched) {
|
|
494
356
|
const comment = `{>>${c.author}: ${c.text}<<}`;
|
|
495
|
-
if (c.anchorText && c.anchorEnd) {
|
|
496
|
-
// Replace anchor text with: {>>comment<<}[anchor]{.mark}
|
|
357
|
+
if (wrapAnchor && c.anchorText && c.anchorEnd) {
|
|
497
358
|
const before = result.slice(0, c.pos);
|
|
498
359
|
const anchor = result.slice(c.pos, c.anchorEnd);
|
|
499
360
|
const after = result.slice(c.anchorEnd);
|
|
500
361
|
result = before + comment + `[${anchor}]{.mark}` + after;
|
|
501
362
|
} else {
|
|
502
|
-
//
|
|
503
|
-
|
|
363
|
+
// Insert comment at the anchor position with no surrounding whitespace
|
|
364
|
+
// tweaks; CriticMarkup blocks are invisible to readers, and adding a
|
|
365
|
+
// leading space would shift prose byte-for-byte (relevant when callers
|
|
366
|
+
// verify that --comments-only didn't touch the original).
|
|
367
|
+
result = result.slice(0, c.pos) + comment + result.slice(c.pos);
|
|
504
368
|
}
|
|
505
369
|
}
|
|
506
370
|
|
package/lib/word-extraction.ts
CHANGED
|
@@ -42,6 +42,17 @@ export interface CommentAnchorsResult {
|
|
|
42
42
|
fullDocText: string;
|
|
43
43
|
}
|
|
44
44
|
|
|
45
|
+
/** A heading paragraph found in a docx body. */
export interface DocxHeading {
  /** Value of the paragraph's `<w:pStyle>`, e.g. "Heading1". */
  style: string;
  /** Heading depth (1, 2, 3, ...) taken from the digits in the style name; 0 when the name has none. */
  level: number;
  /** Text of the heading paragraph, runs concatenated and trimmed. */
  text: string;
  /** Offset in fullDocText — the same coordinate system as CommentAnchorData.docPosition. */
  docPosition: number;
}
|
|
55
|
+
|
|
45
56
|
export interface WordTable {
|
|
46
57
|
markdown: string;
|
|
47
58
|
rowCount: number;
|
|
@@ -331,6 +342,88 @@ export async function extractCommentAnchors(docxPath: string): Promise<CommentAn
|
|
|
331
342
|
return { anchors, fullDocText };
|
|
332
343
|
}
|
|
333
344
|
|
|
345
|
+
/**
 * Extract heading paragraphs from a docx together with their text offsets.
 *
 * Headings are paragraphs whose `<w:pStyle>` value contains "heading"
 * (case-insensitive). Reading styles directly is more reliable than
 * keyword-matching the concatenated body text: once paragraph boundaries are
 * gone, a heading word can just as well occur inside ordinary prose, and a
 * structured-abstract label such as "Methods:" loses its colon when text
 * runs are concatenated.
 *
 * Offsets are computed from the cumulative decoded length of every `<w:t>`
 * node in `word/document.xml`. This is meant to be the same coordinate
 * system as `extractCommentAnchors`'s `fullDocText` and
 * `CommentAnchorData.docPosition`, so the text-node pattern here must stay
 * in sync with that function.
 *
 * @param docxPath Path to the .docx file.
 * @returns Headings in document order; empty when the archive has no
 *          `word/document.xml`.
 * @throws Error when `docxPath` does not exist.
 */
export async function extractHeadings(docxPath: string): Promise<DocxHeading[]> {
  const AdmZip = (await import('adm-zip')).default;

  if (!fs.existsSync(docxPath)) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const zip = new AdmZip(docxPath);
  const docEntry = zip.getEntry('word/document.xml');
  if (!docEntry) return [];
  const xml = docEntry.getData().toString('utf8');

  // For each <w:t> node, record where it ends in the XML and where its text
  // starts in the concatenated document text.
  const nodes: Array<{ xmlEnd: number; textStart: number }> = [];
  let textPos = 0;
  for (const m of xml.matchAll(/<w:t[^>]*>([^<]*)<\/w:t>/g)) {
    const decoded = decodeXmlEntities(m[1] ?? '');
    nodes.push({ xmlEnd: (m.index ?? 0) + m[0].length, textStart: textPos });
    textPos += decoded.length;
  }
  const totalTextLength = textPos;

  // Heading text includes w:delText so a heading inside a tracked deletion is
  // still surfaced (verifying anchors against an original draft).
  const headingTextPattern = /<w:t[^>]*>([^<]*)<\/w:t>|<w:delText[^>]*>([^<]*)<\/w:delText>/g;
  const stylePattern = /<w:pStyle[^>]*w:val="([^"]+)"/;

  const headings: DocxHeading[] = [];
  // Paragraphs are visited in ascending XML order, so the text-node cursor
  // only ever moves forward; no rescan from the start per heading.
  let cursor = 0;

  for (const pm of xml.matchAll(/<w:p\b[^>]*>([\s\S]*?)<\/w:p>/g)) {
    const inner = pm[1] ?? '';
    const style = inner.match(stylePattern)?.[1];
    if (!style || !/heading/i.test(style)) continue;

    let txt = '';
    for (const tm of inner.matchAll(headingTextPattern)) {
      txt += decodeXmlEntities(tm[1] || tm[2] || '');
    }
    const trimmed = txt.trim();
    if (!trimmed) continue;

    // Map the paragraph's XML offset to the start of the first text node that
    // ends after it; past the last node, fall back to the end of the text.
    const paraStart = pm.index ?? 0;
    let node = nodes[cursor];
    while (node && node.xmlEnd <= paraStart) {
      cursor += 1;
      node = nodes[cursor];
    }
    const docPosition = node ? node.textStart : totalTextLength;

    const digits = style.match(/\d+/);
    headings.push({
      style,
      level: digits ? parseInt(digits[0], 10) : 0,
      text: trimmed,
      docPosition,
    });
  }

  return headings;
}
|
|
426
|
+
|
|
334
427
|
/**
|
|
335
428
|
* Decode XML entities in text
|
|
336
429
|
*/
|
package/package.json
CHANGED
package/skill/REFERENCE.md
CHANGED
|
@@ -21,11 +21,38 @@ rev import manuscript.docx --output ./project
|
|
|
21
21
|
### rev sync
|
|
22
22
|
Sync feedback from a reviewed Word document into existing markdown sections.
|
|
23
23
|
```bash
|
|
24
|
-
rev sync reviewed.docx
|
|
25
|
-
rev sync
|
|
24
|
+
rev sync reviewed.docx # Updates markdown with track changes/comments
|
|
25
|
+
rev sync # Auto-detect most recent .docx
|
|
26
26
|
rev sync reviewed.docx methods # Sync only methods section
|
|
27
|
+
rev sync reviewed.docx --comments-only # Insert comments only; never modify prose
|
|
27
28
|
```
|
|
28
29
|
|
|
30
|
+
`--comments-only` skips the Word→Markdown diff entirely. Use it when the
|
|
31
|
+
markdown has been revised between sending the docx out for review and
|
|
32
|
+
receiving it back: applying track changes from a stale draft would clobber
|
|
33
|
+
newer edits, but comments still need to land. Comments are placed at
|
|
34
|
+
fuzzy-matched anchors against the current prose. Pair with
|
|
35
|
+
`rev verify-anchors` to see which ones won't fit before you run sync.
|
|
36
|
+
|
|
37
|
+
### rev verify-anchors
|
|
38
|
+
Report drift between Word comment anchors and the current markdown.
|
|
39
|
+
```bash
|
|
40
|
+
rev verify-anchors reviewed.docx # Print per-comment match quality
|
|
41
|
+
rev verify-anchors reviewed.docx --json # Machine-readable report
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Each comment is classified by how well its anchor still matches the current
|
|
45
|
+
section prose:
|
|
46
|
+
|
|
47
|
+
- `clean` – exact or whitespace-normalized hit
|
|
48
|
+
- `drift` – anchor only matches via stripped-CriticMarkup or partial-prefix fallbacks
|
|
49
|
+
- `context-only` – anchor text is gone, only surrounding context survives
|
|
50
|
+
- `ambiguous` – multiple candidate positions; needs context to disambiguate
|
|
51
|
+
- `unmatched` – nothing maps; user must place the comment manually
|
|
52
|
+
|
|
53
|
+
Useful before `rev sync --comments-only` to plan which comments will land
|
|
54
|
+
automatically and which need manual placement.
|
|
55
|
+
|
|
29
56
|
### rev build
|
|
30
57
|
Build output documents from markdown sections.
|
|
31
58
|
```bash
|