docrev 0.9.6 → 0.9.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/CHANGELOG.md +20 -0
  2. package/dev_notes/bug_repro_comment_parser.md +71 -0
  3. package/dist/lib/anchor-match.d.ts +41 -0
  4. package/dist/lib/anchor-match.d.ts.map +1 -0
  5. package/dist/lib/anchor-match.js +192 -0
  6. package/dist/lib/anchor-match.js.map +1 -0
  7. package/dist/lib/annotations.d.ts.map +1 -1
  8. package/dist/lib/annotations.js +8 -5
  9. package/dist/lib/annotations.js.map +1 -1
  10. package/dist/lib/commands/index.d.ts +2 -1
  11. package/dist/lib/commands/index.d.ts.map +1 -1
  12. package/dist/lib/commands/index.js +3 -1
  13. package/dist/lib/commands/index.js.map +1 -1
  14. package/dist/lib/commands/section-boundaries.d.ts +22 -0
  15. package/dist/lib/commands/section-boundaries.d.ts.map +1 -0
  16. package/dist/lib/commands/section-boundaries.js +53 -0
  17. package/dist/lib/commands/section-boundaries.js.map +1 -0
  18. package/dist/lib/commands/sync.d.ts.map +1 -1
  19. package/dist/lib/commands/sync.js +135 -0
  20. package/dist/lib/commands/sync.js.map +1 -1
  21. package/dist/lib/commands/verify-anchors.d.ts +17 -0
  22. package/dist/lib/commands/verify-anchors.d.ts.map +1 -0
  23. package/dist/lib/commands/verify-anchors.js +215 -0
  24. package/dist/lib/commands/verify-anchors.js.map +1 -0
  25. package/dist/lib/import.d.ts +14 -8
  26. package/dist/lib/import.d.ts.map +1 -1
  27. package/dist/lib/import.js +16 -144
  28. package/dist/lib/import.js.map +1 -1
  29. package/dist/lib/word-extraction.d.ts +23 -0
  30. package/dist/lib/word-extraction.d.ts.map +1 -1
  31. package/dist/lib/word-extraction.js +79 -0
  32. package/dist/lib/word-extraction.js.map +1 -1
  33. package/lib/anchor-match.ts +238 -0
  34. package/lib/annotations.ts +9 -5
  35. package/lib/commands/index.ts +3 -0
  36. package/lib/commands/section-boundaries.ts +72 -0
  37. package/lib/commands/sync.ts +165 -0
  38. package/lib/commands/verify-anchors.ts +261 -0
  39. package/lib/import.ts +29 -165
  40. package/lib/word-extraction.ts +93 -0
  41. package/package.json +1 -1
  42. package/skill/REFERENCE.md +29 -2
  43. package/skill/SKILL.md +12 -2
@@ -0,0 +1,261 @@
1
+ /**
2
+ * VERIFY-ANCHORS command: report drift between Word comment anchors
3
+ * and the current markdown.
4
+ *
5
+ * Useful when prose has been revised between sending the docx out for
6
+ * review and receiving it back. Each comment is classified by how well
7
+ * its anchor still matches the current section prose:
8
+ *
9
+ * clean – exact or whitespace-normalized hit
10
+ * drift – anchor only matches via stripped/partial fallbacks
11
+ * context-only – anchor text is gone, only surrounding context survives
12
+ * ambiguous – multiple matches, can't pick one without context
13
+ * unmatched – nothing maps; user must place the comment manually
14
+ */
15
+
16
+ import {
17
+ chalk,
18
+ fs,
19
+ path,
20
+ fmt,
21
+ loadConfig,
22
+ jsonMode,
23
+ jsonOutput,
24
+ } from './context.js';
25
+ import type { Command } from 'commander';
26
+ import { findAnchorInText, classifyStrategy, type AnchorMatchQuality } from '../anchor-match.js';
27
+ import type { CommentAnchorData } from '../word-extraction.js';
28
+ import { computeSectionBoundaries } from './section-boundaries.js';
29
+
30
/** Parsed CLI options for the `verify-anchors` command. */
interface VerifyOptions {
  /** Sections config file name; resolved relative to `dir`. */
  config: string;
  /** Directory that holds the section markdown files. */
  dir: string;
  /** When set, emit a JSON report instead of the formatted one. */
  json?: boolean;
}
35
+
36
/** One row of the verification report: how a single Word comment maps onto the markdown. */
interface CommentReport {
  /** Comment id as extracted from the docx. */
  id: string;
  /** Comment author. */
  author: string;
  /** Comment body text. */
  text: string;
  /** Section file the comment was attributed to, or `null` when none could be determined. */
  section: string | null;
  /** Match classification; `'ambiguous'` is assigned by this command, not by the matcher. */
  quality: AnchorMatchQuality | 'ambiguous';
  /** Matcher strategy name, or `no-anchor` / `no-section` / `missing-file` for early failures. */
  strategy: string;
  /** Anchor text taken from the Word document (`''` when the comment has no anchor). */
  anchor: string;
  /** Number of candidate positions found in the section markdown. */
  occurrences: number;
}
46
+
47
/**
 * Register the `verify-anchors` command on the given commander program.
 *
 * The command reads comments, comment anchors and headings from a .docx,
 * attributes each comment to a section file via the heading-derived section
 * boundaries, and classifies how well the comment's anchor still matches the
 * current markdown of that section. The result is printed either as a
 * formatted report or, with `--json` / global JSON mode, as a JSON document.
 *
 * Exits the process with code 1 when the docx or the sections config is
 * missing, or when the docx cannot be read.
 */
export function register(program: Command): void {
  program
    .command('verify-anchors')
    .description('Report drift between Word comment anchors and current markdown')
    .argument('<file>', 'Word document with reviewer comments (.docx)')
    .option('-c, --config <file>', 'Sections config file', 'sections.yaml')
    .option('-d, --dir <directory>', 'Directory with section files', '.')
    .option('--json', 'Output JSON report (for scripting)')
    .action(async (docxPath: string, options: VerifyOptions) => {
      if (!fs.existsSync(docxPath)) {
        console.error(fmt.status('error', `File not found: ${docxPath}`));
        process.exit(1);
      }

      const configPath = path.resolve(options.dir, options.config);
      if (!fs.existsSync(configPath)) {
        console.error(fmt.status('error', `Config not found: ${configPath}`));
        console.error(chalk.dim(' Run "rev init" first to generate sections.yaml'));
        process.exit(1);
      }

      const config = loadConfig(configPath);
      const { extractWordComments, extractCommentAnchors, extractHeadings } = await import('../import.js');

      let comments;
      let anchors;
      let headings;
      try {
        comments = await extractWordComments(docxPath);
        const result = await extractCommentAnchors(docxPath);
        anchors = result.anchors;
        headings = await extractHeadings(docxPath);
      } catch (err) {
        // Anything can be thrown; normalise so `.message` / `.stack` are always available.
        const error = err instanceof Error ? err : new Error(String(err));
        console.error(fmt.status('error', `Failed to read ${path.basename(docxPath)}: ${error.message}`));
        if (process.env.DEBUG) console.error(error.stack);
        process.exit(1);
      }

      const wantsJson = options.json || jsonMode;

      if (comments.length === 0) {
        if (wantsJson) {
          // Scripted consumers must always receive a JSON document, even when
          // there is nothing to report.
          jsonOutput({
            file: docxPath,
            totalComments: 0,
            summary: tally([]),
            comments: [],
          });
        } else {
          console.log(fmt.status('info', 'No comments found in document.'));
        }
        return;
      }

      const boundaries = computeSectionBoundaries(config.sections, headings);

      // Cache section markdown contents on first read; missing files are not
      // cached and simply yield null each time.
      const sectionCache = new Map<string, string>();
      function loadSection(file: string): string | null {
        const cached = sectionCache.get(file);
        if (cached !== undefined) return cached;
        const sectionPath = path.join(options.dir, file);
        if (!fs.existsSync(sectionPath)) return null;
        const content = fs.readFileSync(sectionPath, 'utf-8');
        sectionCache.set(file, content);
        return content;
      }

      const firstBoundaryStart = boundaries.length > 0 ? boundaries[0].start : 0;
      const reports: CommentReport[] = [];

      for (const c of comments) {
        // Shared shape for every "could not even attempt a match" outcome.
        const unmatched = (strategy: string, section: string | null, anchorText: string): CommentReport => ({
          id: c.id,
          author: c.author,
          text: c.text,
          section,
          quality: 'unmatched',
          strategy,
          anchor: anchorText,
          occurrences: 0,
        });

        const anchor: CommentAnchorData | undefined = anchors.get(c.id);
        if (!anchor) {
          reports.push(unmatched('no-anchor', null, ''));
          continue;
        }
        const anchorText = anchor.anchor || '';

        // Determine which section file this comment lives in: the first
        // boundary whose [start, end) range contains the anchor position.
        let sectionFile: string | null = null;
        for (const b of boundaries) {
          if (anchor.docPosition >= b.start && anchor.docPosition < b.end) {
            sectionFile = b.file;
            break;
          }
        }
        // Anchors located before the first boundary are attributed to the first section.
        if (!sectionFile && boundaries.length > 0 && anchor.docPosition < firstBoundaryStart) {
          sectionFile = boundaries[0].file;
        }

        if (!sectionFile) {
          reports.push(unmatched('no-section', null, anchorText));
          continue;
        }

        const md = loadSection(sectionFile);
        if (md === null) {
          reports.push(unmatched('missing-file', sectionFile, anchorText));
          continue;
        }

        const search = findAnchorInText(anchor.anchor, md, anchor.before, anchor.after);
        let quality: AnchorMatchQuality | 'ambiguous' = classifyStrategy(search.strategy, search.occurrences.length);
        // A textually clean hit that occurs more than once cannot be placed unambiguously.
        if (quality === 'clean' && search.occurrences.length > 1) {
          quality = 'ambiguous';
        }

        reports.push({
          id: c.id,
          author: c.author,
          text: c.text,
          section: sectionFile,
          quality,
          strategy: search.strategy,
          anchor: anchorText,
          occurrences: search.occurrences.length,
        });
      }

      if (wantsJson) {
        jsonOutput({
          file: docxPath,
          totalComments: comments.length,
          summary: tally(reports),
          comments: reports,
        });
        return;
      }

      printReport(docxPath, reports);
    });
}
196
+
197
/**
 * Count reports per quality bucket.
 *
 * The five known buckets are always present (zero when unused) and keep a
 * fixed key order; any other quality value is appended as encountered.
 */
function tally(reports: CommentReport[]): Record<string, number> {
  const seed: Record<string, number> = {
    clean: 0,
    drift: 0,
    'context-only': 0,
    ambiguous: 0,
    unmatched: 0,
  };
  return reports.reduce((counts, { quality }) => {
    counts[quality] = (counts[quality] || 0) + 1;
    return counts;
  }, seed);
}
202
+
203
/**
 * Print the formatted (non-JSON) verification report to stdout.
 *
 * Output consists of a header, a summary box with per-quality counts, and,
 * when at least one comment is not `clean`, a table of those comments
 * followed by a hint if any are unmatched or ambiguous.
 */
function printReport(docxPath: string, reports: CommentReport[]): void {
  console.log(fmt.header(`Anchor Verification: ${path.basename(docxPath)}`));
  console.log();

  const counts = tally(reports);
  const contextOnly = counts['context-only'];

  // The "clean" line is always shown; the other buckets only when non-zero.
  const summary: string[] = [
    `${chalk.green(counts.clean)} clean (anchor still matches)`,
    ...(counts.drift
      ? [`${chalk.cyan(counts.drift)} drifted (matched via fallback strategies)`]
      : []),
    ...(contextOnly
      ? [`${chalk.yellow(contextOnly)} context-only (anchor text gone, neighbors survive)`]
      : []),
    ...(counts.ambiguous
      ? [`${chalk.magenta(counts.ambiguous)} ambiguous (multiple candidate positions)`]
      : []),
    ...(counts.unmatched
      ? [`${chalk.red(counts.unmatched)} unmatched (manual placement needed)`]
      : []),
  ];
  console.log(fmt.box(summary.join('\n'), { title: 'Summary', padding: 0 }));
  console.log();

  // Only comments that are not a clean hit get a table row.
  const flagged = reports.filter(({ quality }) => quality !== 'clean');
  if (flagged.length === 0) {
    console.log(fmt.status('success', 'All comment anchors match the current markdown.'));
    return;
  }

  const columns = ['ID', 'Quality', 'Section', 'Strategy', 'Anchor (Word)', 'Comment'];
  const tableRows = flagged.map(({ id, quality, section, strategy, anchor, text }) => [
    chalk.dim(`#${id}`),
    qualityColor(quality),
    section ? chalk.bold(section) : chalk.dim('—'),
    chalk.dim(strategy),
    truncate(anchor, 35),
    truncate(text, 35),
  ]);
  console.log(fmt.table(columns, tableRows, {
    align: ['right', 'left', 'left', 'left', 'left', 'left'],
  }));

  const needsManualPlacement = counts.unmatched > 0 || counts.ambiguous > 0;
  if (!needsManualPlacement) return;

  console.log();
  console.log(chalk.dim('Comments flagged "unmatched" or "ambiguous" need manual placement.'));
  // NOTE(review): the reference docs describe `rev sync --comments-only` for
  // prose-preserving imports; confirm `--no-overwrite` is the intended flag here.
  console.log(chalk.dim('Run "rev sync --no-overwrite" to import the matched ones without touching prose.'));
}
245
+
246
/**
 * Colorize a quality label for terminal output.
 *
 * Known qualities map to a fixed color and display label (note that
 * `context-only` is shortened to `context`); anything else is returned as-is.
 */
function qualityColor(q: string): string {
  const painters = new Map<string, () => string>([
    ['clean', () => chalk.green('clean')],
    ['drift', () => chalk.cyan('drift')],
    ['context-only', () => chalk.yellow('context')],
    ['ambiguous', () => chalk.magenta('ambiguous')],
    ['unmatched', () => chalk.red('unmatched')],
  ]);
  const paint = painters.get(q);
  return paint ? paint() : q;
}
256
+
257
/**
 * Flatten a string onto one line and shorten it for table display.
 *
 * Runs of whitespace (including newlines) collapse to a single space and the
 * result is trimmed. If nothing visible remains — empty, undefined-ish, or
 * whitespace-only input — a dimmed dash placeholder is returned so the table
 * cell is never blank. Text longer than `max` characters is cut to `max - 1`
 * characters plus an ellipsis.
 *
 * Length is measured in Unicode code points rather than UTF-16 code units, so
 * truncation never splits a surrogate pair (e.g. an emoji) in half.
 *
 * @param s   Text to display.
 * @param max Maximum number of characters in the result, ellipsis included.
 * @returns The flattened, possibly shortened text, or the dash placeholder.
 */
function truncate(s: string, max: number): string {
  const flat = s ? s.replace(/\s+/g, ' ').trim() : '';
  if (!flat) return chalk.dim('—');

  const chars = Array.from(flat);
  if (chars.length <= max) return flat;

  // Clamp so a non-positive `max` cannot turn into a negative slice end.
  const keep = Math.max(0, max - 1);
  return chars.slice(0, keep).join('') + '…';
}
package/lib/import.ts CHANGED
@@ -36,12 +36,14 @@ import {
36
36
  parseVisibleComments,
37
37
  convertVisibleComments,
38
38
  } from './restore-references.js';
39
+ import { findAnchorInText } from './anchor-match.js';
39
40
 
40
41
  // Re-export everything so existing imports from './import.js' still work
41
42
  export {
42
43
  extractFromWord,
43
44
  extractWordComments,
44
45
  extractCommentAnchors,
46
+ extractHeadings,
45
47
  extractWordTables,
46
48
  } from './word-extraction.js';
47
49
  export type {
@@ -49,6 +51,7 @@ export type {
49
51
  TextNode,
50
52
  CommentAnchorData,
51
53
  CommentAnchorsResult,
54
+ DocxHeading,
52
55
  WordTable,
53
56
  ParsedRow,
54
57
  ExtractFromWordOptions,
@@ -86,6 +89,17 @@ const execAsync = promisify(exec);
86
89
  export interface InsertCommentsOptions {
87
90
  quiet?: boolean;
88
91
  sectionBoundary?: { start: number; end: number } | null;
92
+ /**
93
+ * When true (default), comments wrap their anchor text in `[anchor]{.mark}`
94
+ * so the rebuilt docx restores the original Word comment range. When false,
95
+ * comments are inserted as standalone `{>>...<<}` blocks adjacent to the
96
+ * anchor — the prose stays byte-identical except for the inserted blocks.
97
+ *
98
+ * Set to false from `sync --comments-only` so a draft revised after the
99
+ * docx was sent for review keeps its prose intact, and so multiple
100
+ * comments sharing one anchor don't produce nested broken markup.
101
+ */
102
+ wrapAnchor?: boolean;
89
103
  }
90
104
 
91
105
  export interface CommentWithPos {
@@ -100,12 +114,7 @@ export interface CommentWithPos {
100
114
  strategy?: string;
101
115
  }
102
116
 
103
- export interface AnchorSearchResult {
104
- occurrences: number[];
105
- matchedAnchor: string | null;
106
- strategy: string;
107
- stripped?: boolean;
108
- }
117
+ export type { AnchorSearchResult } from './anchor-match.js';
109
118
 
110
119
  export interface MarkdownPrefixResult {
111
120
  prefix: string;
@@ -175,165 +184,14 @@ export function insertCommentsIntoMarkdown(
175
184
  anchors: Map<string, CommentAnchorData | string>,
176
185
  options: InsertCommentsOptions = {}
177
186
  ): string {
178
- const { quiet = false, sectionBoundary = null } = options;
187
+ const { quiet = false, sectionBoundary = null, wrapAnchor = true } = options;
179
188
  let result = markdown;
180
189
  let unmatchedCount = 0;
181
190
  const duplicateWarnings: string[] = [];
182
191
  const usedPositions = new Set<number>(); // For tie-breaking: track used positions
183
192
 
184
- // Helper: Strip CriticMarkup from text to get "clean" version for matching
185
- function stripCriticMarkup(text: string): string {
186
- return text
187
- .replace(/\{\+\+([^+]*)\+\+\}/g, '$1') // insertions: keep inserted text
188
- .replace(/\{--([^-]*)--\}/g, '') // deletions: remove deleted text
189
- .replace(/\{~~([^~]*)~>([^~]*)~~\}/g, '$2') // substitutions: keep new text
190
- .replace(/\{>>[^<]*<<\}/g, '') // comments: remove
191
- .replace(/\[([^\]]*)\]\{\.mark\}/g, '$1'); // marked text: keep text
192
- }
193
-
194
- // Helper: Find anchor in text with multiple fallback strategies
195
- function findAnchorInText(anchor: string, text: string, before: string = '', after: string = ''): AnchorSearchResult {
196
- // If anchor is empty, skip directly to context-based matching
197
- if (!anchor || anchor.trim().length === 0) {
198
- // Jump to context-based strategies (Strategy 5)
199
- if (before || after) {
200
- const beforeLower = (before || '').toLowerCase();
201
- const afterLower = (after || '').toLowerCase();
202
- const textLower = text.toLowerCase();
203
-
204
- if (before && after) {
205
- const beforeIdx = textLower.indexOf(beforeLower.slice(-50));
206
- if (beforeIdx !== -1) {
207
- const searchStart = beforeIdx + beforeLower.slice(-50).length;
208
- const afterIdx = textLower.indexOf(afterLower.slice(0, 50), searchStart);
209
- if (afterIdx !== -1 && afterIdx - searchStart < 500) {
210
- return { occurrences: [searchStart], matchedAnchor: null, strategy: 'context-both' };
211
- }
212
- }
213
- }
214
-
215
- if (before) {
216
- const beforeIdx = textLower.lastIndexOf(beforeLower.slice(-30));
217
- if (beforeIdx !== -1) {
218
- return { occurrences: [beforeIdx + beforeLower.slice(-30).length], matchedAnchor: null, strategy: 'context-before' };
219
- }
220
- }
221
-
222
- if (after) {
223
- const afterIdx = textLower.indexOf(afterLower.slice(0, 30));
224
- if (afterIdx !== -1) {
225
- return { occurrences: [afterIdx], matchedAnchor: null, strategy: 'context-after' };
226
- }
227
- }
228
- }
229
- return { occurrences: [], matchedAnchor: null, strategy: 'empty-anchor' };
230
- }
231
-
232
- const anchorLower = anchor.toLowerCase();
233
- const textLower = text.toLowerCase();
234
-
235
- // Strategy 1: Direct match
236
- let occurrences = findAllOccurrences(textLower, anchorLower);
237
- if (occurrences.length > 0) {
238
- return { occurrences, matchedAnchor: anchor, strategy: 'direct' };
239
- }
240
-
241
- // Strategy 2: Normalized whitespace
242
- const normalizedAnchor = anchor.replace(/\s+/g, ' ').toLowerCase();
243
- const normalizedText = text.replace(/\s+/g, ' ').toLowerCase();
244
- let idx = normalizedText.indexOf(normalizedAnchor);
245
- if (idx !== -1) {
246
- return { occurrences: [idx], matchedAnchor: anchor, strategy: 'normalized' };
247
- }
248
-
249
- // Strategy 3: Try matching in stripped CriticMarkup version
250
- const strippedText = stripCriticMarkup(text);
251
- const strippedLower = strippedText.toLowerCase();
252
- occurrences = findAllOccurrences(strippedLower, anchorLower);
253
- if (occurrences.length > 0) {
254
- return { occurrences, matchedAnchor: anchor, strategy: 'stripped', stripped: true };
255
- }
256
-
257
- // Strategy 4: First N words of anchor (for long anchors)
258
- const words = anchor.split(/\s+/);
259
- if (words.length > 3) {
260
- for (let n = Math.min(6, words.length); n >= 3; n--) {
261
- const partialAnchor = words.slice(0, n).join(' ').toLowerCase();
262
- if (partialAnchor.length >= 15) {
263
- occurrences = findAllOccurrences(textLower, partialAnchor);
264
- if (occurrences.length > 0) {
265
- return { occurrences, matchedAnchor: words.slice(0, n).join(' '), strategy: 'partial-start' };
266
- }
267
- occurrences = findAllOccurrences(strippedLower, partialAnchor);
268
- if (occurrences.length > 0) {
269
- return { occurrences, matchedAnchor: words.slice(0, n).join(' '), strategy: 'partial-start-stripped', stripped: true };
270
- }
271
- }
272
- }
273
- }
274
-
275
- // Strategy 5: Use context (before/after) to find approximate position
276
- if (before || after) {
277
- const beforeLower = before.toLowerCase();
278
- const afterLower = after.toLowerCase();
279
-
280
- if (before && after) {
281
- const beforeIdx = textLower.indexOf(beforeLower.slice(-50));
282
- if (beforeIdx !== -1) {
283
- const searchStart = beforeIdx + beforeLower.slice(-50).length;
284
- const afterIdx = textLower.indexOf(afterLower.slice(0, 50), searchStart);
285
- if (afterIdx !== -1 && afterIdx - searchStart < 500) {
286
- return { occurrences: [searchStart], matchedAnchor: null, strategy: 'context-both' };
287
- }
288
- }
289
- }
290
-
291
- if (before) {
292
- const beforeIdx = textLower.lastIndexOf(beforeLower.slice(-30));
293
- if (beforeIdx !== -1) {
294
- return { occurrences: [beforeIdx + beforeLower.slice(-30).length], matchedAnchor: null, strategy: 'context-before' };
295
- }
296
- }
297
-
298
- if (after) {
299
- const afterIdx = textLower.indexOf(afterLower.slice(0, 30));
300
- if (afterIdx !== -1) {
301
- return { occurrences: [afterIdx], matchedAnchor: null, strategy: 'context-after' };
302
- }
303
- }
304
- }
305
-
306
- // Strategy 6: Try splitting anchor on common transition words
307
- const splitPatterns = [' ', ', ', '. ', ' - ', ' – '];
308
- for (const sep of splitPatterns) {
309
- if (anchor.includes(sep)) {
310
- const parts = anchor.split(sep).filter(p => p.length >= 4);
311
- for (const part of parts) {
312
- const partLower = part.toLowerCase();
313
- occurrences = findAllOccurrences(textLower, partLower);
314
- if (occurrences.length > 0 && occurrences.length < 5) {
315
- return { occurrences, matchedAnchor: part, strategy: 'split-match' };
316
- }
317
- }
318
- }
319
- }
320
-
321
- return { occurrences: [], matchedAnchor: null, strategy: 'failed' };
322
- }
323
-
324
- // Helper: Find all occurrences of needle in haystack
325
- function findAllOccurrences(haystack: string, needle: string): number[] {
326
- if (!needle || needle.length === 0) {
327
- return [];
328
- }
329
- const occurrences: number[] = [];
330
- let idx = 0;
331
- while ((idx = haystack.indexOf(needle, idx)) !== -1) {
332
- occurrences.push(idx);
333
- idx += 1;
334
- }
335
- return occurrences;
336
- }
193
+ // Anchor matching primitives live in lib/anchor-match.ts so that
194
+ // `rev verify-anchors` can use the same strategies for drift reporting.
337
195
 
338
196
  // Get all positions in order (for sequential tie-breaking)
339
197
  const commentsWithPositions = comments.map((c): CommentWithPos => {
@@ -489,18 +347,24 @@ export function insertCommentsIntoMarkdown(
489
347
  // Sort by position descending (insert from end to avoid offset issues)
490
348
  matched.sort((a, b) => b.pos - a.pos);
491
349
 
492
- // Insert each comment with anchor marking
350
+ // Insert each comment. With `wrapAnchor` (the default), the anchor text
351
+ // gets wrapped in `[anchor]{.mark}` so the rebuilt docx restores the
352
+ // original Word comment range. Without it, the comment block is inserted
353
+ // adjacent to the anchor and prose stays untouched — required for
354
+ // comments-only sync where multiple comments may share one anchor.
493
355
  for (const c of matched) {
494
356
  const comment = `{>>${c.author}: ${c.text}<<}`;
495
- if (c.anchorText && c.anchorEnd) {
496
- // Replace anchor text with: {>>comment<<}[anchor]{.mark}
357
+ if (wrapAnchor && c.anchorText && c.anchorEnd) {
497
358
  const before = result.slice(0, c.pos);
498
359
  const anchor = result.slice(c.pos, c.anchorEnd);
499
360
  const after = result.slice(c.anchorEnd);
500
361
  result = before + comment + `[${anchor}]{.mark}` + after;
501
362
  } else {
502
- // No anchor - just insert comment at position
503
- result = result.slice(0, c.pos) + ` ${comment}` + result.slice(c.pos);
363
+ // Insert comment at the anchor position with no surrounding whitespace
364
+ // tweaks; CriticMarkup blocks are invisible to readers, and adding a
365
+ // leading space would shift prose byte-for-byte (relevant when callers
366
+ // verify that --comments-only didn't touch the original).
367
+ result = result.slice(0, c.pos) + comment + result.slice(c.pos);
504
368
  }
505
369
  }
506
370
 
@@ -42,6 +42,17 @@ export interface CommentAnchorsResult {
42
42
  fullDocText: string;
43
43
  }
44
44
 
45
/** A heading paragraph found in a docx body. */
export interface DocxHeading {
  /** Value of the paragraph's `<w:pStyle>` element, e.g. "Heading1". */
  style: string;
  /** Depth (1, 2, 3, ...) taken from the digits in the style name; 0 when the name has none. */
  level: number;
  /** Text of the heading paragraph, with all its text runs joined. */
  text: string;
  /** Offset into fullDocText — the coordinate system shared with CommentAnchorData.docPosition. */
  docPosition: number;
}
55
+
45
56
  export interface WordTable {
46
57
  markdown: string;
47
58
  rowCount: number;
@@ -331,6 +342,88 @@ export async function extractCommentAnchors(docxPath: string): Promise<CommentAn
331
342
  return { anchors, fullDocText };
332
343
  }
333
344
 
345
/**
 * Extract heading paragraphs from a docx, with their text positions in the
 * same coordinate system as `extractCommentAnchors`'s `fullDocText` and
 * `CommentAnchorData.docPosition`.
 *
 * Headings are paragraphs whose `<w:pStyle>` value contains "heading"
 * (case-insensitive). Reading styles directly is more reliable than
 * keyword-matching the concatenated body text — there, paragraph boundaries
 * are gone, so a heading word such as "Results" can appear inside prose
 * ("results across countries") and the structured-abstract label "Methods:"
 * loses its colon when text runs are concatenated.
 *
 * Paragraphs with a heading style but no visible text are skipped. The
 * heading level is the first run of digits in the style name, or 0 when the
 * style name contains none.
 *
 * @param docxPath Path to the .docx file.
 * @returns Headings in document order; empty when the archive has no
 *   `word/document.xml`.
 * @throws Error when `docxPath` does not exist.
 */
export async function extractHeadings(docxPath: string): Promise<DocxHeading[]> {
  const AdmZip = (await import('adm-zip')).default;

  if (!fs.existsSync(docxPath)) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const zip = new AdmZip(docxPath);
  const docEntry = zip.getEntry('word/document.xml');
  if (!docEntry) return [];
  const xml = docEntry.getData().toString('utf8');

  // Build the same xml-pos → text-pos mapping that extractCommentAnchors does:
  // every <w:t> node contributes its decoded length to the running text offset.
  const textNodePattern = /<w:t[^>]*>([^<]*)<\/w:t>/g;
  const nodes: Array<{ xmlEnd: number; textStart: number }> = [];
  let textPos = 0;
  let m;
  while ((m = textNodePattern.exec(xml)) !== null) {
    const decoded = decodeXmlEntities(m[1] ?? '');
    nodes.push({ xmlEnd: m.index + m[0].length, textStart: textPos });
    textPos += decoded.length;
  }

  // Paragraphs are visited in ascending XML order, so the lookup only ever
  // moves forward: keep a cursor instead of rescanning all nodes per heading.
  let cursor = 0;
  function xmlToTextPos(xmlPos: number): number {
    while (cursor < nodes.length && nodes[cursor].xmlEnd <= xmlPos) cursor++;
    // First text node that ends after xmlPos (containing or following it);
    // past the last node, fall back to the total text length.
    return cursor < nodes.length ? nodes[cursor].textStart : textPos;
  }

  // Concatenate text runs; include w:delText so a heading inside a tracked
  // deletion is still surfaced (verifying anchors against an original draft).
  const textInRange = /<w:t[^>]*>([^<]*)<\/w:t>|<w:delText[^>]*>([^<]*)<\/w:delText>/g;

  const headings: DocxHeading[] = [];
  const paraPattern = /<w:p\b[^>]*>([\s\S]*?)<\/w:p>/g;
  let pm;
  while ((pm = paraPattern.exec(xml)) !== null) {
    const inner = pm[1];
    const styleMatch = inner.match(/<w:pStyle[^>]*w:val="([^"]+)"/);
    if (!styleMatch) continue;
    const style = styleMatch[1];
    if (!/heading/i.test(style)) continue;

    let txt = '';
    let tm;
    textInRange.lastIndex = 0; // shared global regex: restart for each paragraph
    while ((tm = textInRange.exec(inner)) !== null) {
      // Exactly one alternative participates; the other group is undefined.
      txt += decodeXmlEntities(tm[1] ?? tm[2] ?? '');
    }
    const trimmed = txt.trim();
    if (!trimmed) continue;

    const levelMatch = style.match(/(\d+)/);
    const level = levelMatch ? parseInt(levelMatch[1], 10) : 0;
    headings.push({
      style,
      level,
      text: trimmed,
      docPosition: xmlToTextPos(pm.index),
    });
  }

  return headings;
}
426
+
334
427
  /**
335
428
  * Decode XML entities in text
336
429
  */
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "docrev",
3
- "version": "0.9.6",
3
+ "version": "0.9.7",
4
4
  "description": "Academic paper revision workflow: Word ↔ Markdown round-trips, DOI validation, reviewer comments",
5
5
  "type": "module",
6
6
  "types": "dist/lib/types.d.ts",
@@ -21,11 +21,38 @@ rev import manuscript.docx --output ./project
21
21
  ### rev sync
22
22
  Sync feedback from a reviewed Word document into existing markdown sections.
23
23
  ```bash
24
- rev sync reviewed.docx # Updates markdown with track changes/comments
25
- rev sync # Auto-detect most recent .docx
24
+ rev sync reviewed.docx # Updates markdown with track changes/comments
25
+ rev sync # Auto-detect most recent .docx
26
26
  rev sync reviewed.docx methods # Sync only methods section
27
+ rev sync reviewed.docx --comments-only # Insert comments only; never modify prose
27
28
  ```
28
29
 
30
+ `--comments-only` skips the Word→Markdown diff entirely. Use it when the
31
+ markdown has been revised between sending the docx out for review and
32
+ receiving it back: applying track changes from a stale draft would clobber
33
+ newer edits, but comments still need to land. Comments are placed at
34
+ fuzzy-matched anchors against the current prose. Pair with
35
+ `rev verify-anchors` to see which ones won't fit before you run sync.
36
+
37
+ ### rev verify-anchors
38
+ Report drift between Word comment anchors and the current markdown.
39
+ ```bash
40
+ rev verify-anchors reviewed.docx # Print per-comment match quality
41
+ rev verify-anchors reviewed.docx --json # Machine-readable report
42
+ ```
43
+
44
+ Each comment is classified by how well its anchor still matches the current
45
+ section prose:
46
+
47
+ - `clean` – exact or whitespace-normalized hit
48
+ - `drift` – anchor only matches via stripped-CriticMarkup or partial-prefix fallbacks
49
+ - `context-only` – anchor text is gone, only surrounding context survives
50
+ - `ambiguous` – multiple candidate positions; needs context to disambiguate
51
+ - `unmatched` – nothing maps; user must place the comment manually
52
+
53
+ Useful before `rev sync --comments-only` to plan which comments will land
54
+ automatically and which need manual placement.
55
+
29
56
  ### rev build
30
57
  Build output documents from markdown sections.
31
58
  ```bash