docrev 0.9.6 → 0.9.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. package/CHANGELOG.md +41 -0
  2. package/dev_notes/bug_repro_comment_parser.md +71 -0
  3. package/dev_notes/stress2/adversarial.docx +0 -0
  4. package/dev_notes/stress2/build_adversarial.ts +186 -0
  5. package/dev_notes/stress2/drift_matcher.ts +62 -0
  6. package/dev_notes/stress2/probe_anchors.ts +35 -0
  7. package/dev_notes/stress2/project/adversarial.docx +0 -0
  8. package/dev_notes/stress2/project/discussion.before.md +3 -0
  9. package/dev_notes/stress2/project/discussion.md +3 -0
  10. package/dev_notes/stress2/project/methods.before.md +20 -0
  11. package/dev_notes/stress2/project/methods.md +20 -0
  12. package/dev_notes/stress2/project/rev.yaml +5 -0
  13. package/dev_notes/stress2/project/sections.yaml +4 -0
  14. package/dev_notes/stress2/sections.yaml +5 -0
  15. package/dev_notes/stress2/trace_placement.ts +50 -0
  16. package/dev_notes/stresstest_boundaries.ts +27 -0
  17. package/dev_notes/stresstest_drift_apply.ts +43 -0
  18. package/dev_notes/stresstest_drift_compare.ts +43 -0
  19. package/dev_notes/stresstest_drift_v2.ts +54 -0
  20. package/dev_notes/stresstest_inspect.ts +54 -0
  21. package/dev_notes/stresstest_pstyle.ts +55 -0
  22. package/dev_notes/stresstest_section_debug.ts +23 -0
  23. package/dev_notes/stresstest_split.ts +70 -0
  24. package/dev_notes/stresstest_trace.ts +19 -0
  25. package/dev_notes/stresstest_verify_no_overwrite.ts +40 -0
  26. package/dist/lib/anchor-match.d.ts +51 -0
  27. package/dist/lib/anchor-match.d.ts.map +1 -0
  28. package/dist/lib/anchor-match.js +227 -0
  29. package/dist/lib/anchor-match.js.map +1 -0
  30. package/dist/lib/annotations.d.ts.map +1 -1
  31. package/dist/lib/annotations.js +24 -11
  32. package/dist/lib/annotations.js.map +1 -1
  33. package/dist/lib/commands/index.d.ts +2 -1
  34. package/dist/lib/commands/index.d.ts.map +1 -1
  35. package/dist/lib/commands/index.js +3 -1
  36. package/dist/lib/commands/index.js.map +1 -1
  37. package/dist/lib/commands/quality.js +1 -1
  38. package/dist/lib/commands/quality.js.map +1 -1
  39. package/dist/lib/commands/section-boundaries.d.ts +22 -0
  40. package/dist/lib/commands/section-boundaries.d.ts.map +1 -0
  41. package/dist/lib/commands/section-boundaries.js +63 -0
  42. package/dist/lib/commands/section-boundaries.js.map +1 -0
  43. package/dist/lib/commands/sync.d.ts.map +1 -1
  44. package/dist/lib/commands/sync.js +141 -0
  45. package/dist/lib/commands/sync.js.map +1 -1
  46. package/dist/lib/commands/verify-anchors.d.ts +17 -0
  47. package/dist/lib/commands/verify-anchors.d.ts.map +1 -0
  48. package/dist/lib/commands/verify-anchors.js +226 -0
  49. package/dist/lib/commands/verify-anchors.js.map +1 -0
  50. package/dist/lib/comment-realign.js +2 -2
  51. package/dist/lib/comment-realign.js.map +1 -1
  52. package/dist/lib/import.d.ts +26 -8
  53. package/dist/lib/import.d.ts.map +1 -1
  54. package/dist/lib/import.js +166 -187
  55. package/dist/lib/import.js.map +1 -1
  56. package/dist/lib/response.js +1 -1
  57. package/dist/lib/response.js.map +1 -1
  58. package/dist/lib/word-extraction.d.ts +23 -0
  59. package/dist/lib/word-extraction.d.ts.map +1 -1
  60. package/dist/lib/word-extraction.js +79 -0
  61. package/dist/lib/word-extraction.js.map +1 -1
  62. package/dist/lib/wordcomments.d.ts.map +1 -1
  63. package/dist/lib/wordcomments.js +165 -73
  64. package/dist/lib/wordcomments.js.map +1 -1
  65. package/lib/anchor-match.ts +276 -0
  66. package/lib/annotations.ts +25 -11
  67. package/lib/commands/index.ts +3 -0
  68. package/lib/commands/quality.ts +1 -1
  69. package/lib/commands/section-boundaries.ts +82 -0
  70. package/lib/commands/sync.ts +170 -0
  71. package/lib/commands/verify-anchors.ts +272 -0
  72. package/lib/comment-realign.ts +2 -2
  73. package/lib/import.ts +197 -209
  74. package/lib/response.ts +1 -1
  75. package/lib/word-extraction.ts +93 -0
  76. package/lib/wordcomments.ts +180 -82
  77. package/package.json +1 -1
  78. package/skill/REFERENCE.md +29 -2
  79. package/skill/SKILL.md +12 -2
  80. package/dist/package.json +0 -137
package/lib/import.ts CHANGED
@@ -36,12 +36,72 @@ import {
36
36
  parseVisibleComments,
37
37
  convertVisibleComments,
38
38
  } from './restore-references.js';
39
+ import { findAnchorInText, findAllOccurrences } from './anchor-match.js';
40
+
41
/**
 * Choose which of several candidate anchor positions a comment belongs to.
 *
 * Every position in `occurrences` that is not already in `usedPositions` is
 * scored against the docx-side context (case-insensitively):
 *   - +2 for each word longer than three characters from `before` that occurs
 *     in the `before.length + 20` characters of `result` preceding the
 *     candidate, and likewise for words of `after` in the
 *     `after.length + 20` characters following the anchor;
 *   - +5 when the last 30 characters of `before` (respectively the first 30
 *     characters of `after`) occur verbatim in that window.
 * The highest score wins; ties go to the smallest position.
 *
 * @param occurrences   Candidate start offsets of the anchor within `result`.
 * @param result        Text the offsets refer to.
 * @param before        Docx text preceding the anchor ('' when unavailable).
 * @param after         Docx text following the anchor ('' when unavailable).
 * @param anchorLen     Length of the matched anchor; locates the "after" window.
 * @param usedPositions Offsets already claimed by other comments. Read only —
 *                      the caller is responsible for recording the result.
 * @returns The chosen offset, or -1 when there is no candidate or every
 *          candidate is already used.
 */
function pickBestOccurrence(
  occurrences: readonly number[],
  result: string,
  before: string,
  after: string,
  anchorLen: number,
  usedPositions: ReadonlySet<number>,
): number {
  const candidates = occurrences.filter(pos => !usedPositions.has(pos));
  const firstFree = candidates[0];
  if (firstFree === undefined) return -1;
  // A lone free candidate wins regardless of its score.
  if (candidates.length === 1) return firstFree;

  // Context preprocessing does not depend on the candidate, so do it once.
  const isContextWord = (word: string): boolean => word.length > 3;
  const beforeLower = before.toLowerCase();
  const beforeWords = beforeLower.split(/\s+/).filter(isContextWord);
  const beforeTail = beforeLower.slice(-30);
  const afterLower = after.toLowerCase();
  const afterWords = afterLower.split(/\s+/).filter(isContextWord);
  const afterHead = afterLower.slice(0, 30);

  let bestIdx = firstFree;
  let bestScore = -1;

  for (const pos of candidates) {
    let score = 0;

    if (before) {
      const windowStart = Math.max(0, pos - before.length - 20);
      const contextBefore = result.slice(windowStart, pos).toLowerCase();
      for (const word of beforeWords) {
        if (contextBefore.includes(word)) score += 2;
      }
      if (contextBefore.includes(beforeTail)) score += 5;
    }

    if (after) {
      const windowFrom = pos + anchorLen;
      const contextAfter = result.slice(windowFrom, windowFrom + after.length + 20).toLowerCase();
      for (const word of afterWords) {
        if (contextAfter.includes(word)) score += 2;
      }
      if (contextAfter.includes(afterHead)) score += 5;
    }

    // Strictly better score wins; on a tie prefer the earliest position so
    // the outcome does not depend on the order of `occurrences`.
    if (score > bestScore || (score === bestScore && pos < bestIdx)) {
      bestScore = score;
      bestIdx = pos;
    }
  }

  return bestIdx;
}
39
98
 
40
99
  // Re-export everything so existing imports from './import.js' still work
41
100
  export {
42
101
  extractFromWord,
43
102
  extractWordComments,
44
103
  extractCommentAnchors,
104
+ extractHeadings,
45
105
  extractWordTables,
46
106
  } from './word-extraction.js';
47
107
  export type {
@@ -49,6 +109,7 @@ export type {
49
109
  TextNode,
50
110
  CommentAnchorData,
51
111
  CommentAnchorsResult,
112
+ DocxHeading,
52
113
  WordTable,
53
114
  ParsedRow,
54
115
  ExtractFromWordOptions,
@@ -86,6 +147,25 @@ const execAsync = promisify(exec);
86
147
  export interface InsertCommentsOptions {
87
148
  quiet?: boolean;
88
149
  sectionBoundary?: { start: number; end: number } | null;
150
+ /**
151
+ * When true (default), comments wrap their anchor text in `[anchor]{.mark}`
152
+ * so the rebuilt docx restores the original Word comment range. When false,
153
+ * comments are inserted as standalone `{>>...<<}` blocks adjacent to the
154
+ * anchor — the prose stays byte-identical except for the inserted blocks.
155
+ *
156
+ * Set to false from `sync --comments-only` so a draft revised after the
157
+ * docx was sent for review keeps its prose intact, and so multiple
158
+ * comments sharing one anchor don't produce nested broken markup.
159
+ */
160
+ wrapAnchor?: boolean;
161
+ /**
162
+ * Mutable output: when provided, the function fills in counters so callers
163
+ * can distinguish placement outcomes in their summary. `placed` counts new
164
+ * insertions, `deduped` counts comments that were already present at their
165
+ * anchor (skipped to avoid duplication on re-sync), `unmatched` counts
166
+ * comments whose anchor couldn't be located.
167
+ */
168
+ outStats?: { placed: number; deduped: number; unmatched: number };
89
169
  }
90
170
 
91
171
  export interface CommentWithPos {
@@ -100,12 +180,7 @@ export interface CommentWithPos {
100
180
  strategy?: string;
101
181
  }
102
182
 
103
- export interface AnchorSearchResult {
104
- occurrences: number[];
105
- matchedAnchor: string | null;
106
- strategy: string;
107
- stripped?: boolean;
108
- }
183
+ export type { AnchorSearchResult } from './anchor-match.js';
109
184
 
110
185
  export interface MarkdownPrefixResult {
111
186
  prefix: string;
@@ -166,6 +241,34 @@ export interface MoveExtractedMediaResult {
166
241
  // Functions
167
242
  // ============================================
168
243
 
244
/**
 * Keep an insertion point out of a section file's leading heading.
 *
 * A comment authored at the very start of a Word section maps to offset 0,
 * but inserting `{>>...<<}` at column 0 of a markdown file that begins with
 * `# Heading` puts it in front of the heading marker — Pandoc then treats the
 * line as ordinary paragraph text and the comment renders in the previous
 * section. Likewise, an offset at the end of the heading line or in the blank
 * line after it would glue the comment to the heading.
 *
 * Only a heading that opens the text counts (blank lines may precede it).
 * Headings further down — e.g. a `## Subsection` after some body prose — are
 * ignored, so positions inside that earlier prose are never moved.
 *
 * @param text Markdown content of one section file.
 * @param pos  Proposed insertion offset into `text`.
 * @returns `pos` unchanged when the text has no leading heading or `pos` is
 *          already in the body; otherwise the offset of the first character
 *          after the heading line and the line breaks that follow it.
 */
function pushPastSectionHeading(text: string, pos: number): number {
  // No `m` flag: `^` must be the start of the text, so only the leading
  // heading (optionally preceded by blank lines) can match.
  const leadingHeading = /^(?:[ \t]*\r?\n)*#{1,6}[ \t].+/.exec(text);
  if (!leadingHeading) return pos;

  // `.` stops at the line terminator; skip it and any blank lines so we land
  // at the start of the first body paragraph.
  let bodyStart = leadingHeading[0].length;
  while (bodyStart < text.length && (text[bodyStart] === '\n' || text[bodyStart] === '\r')) {
    bodyStart++;
  }

  return pos >= bodyStart ? pos : bodyStart;
}
271
+
169
272
  /**
170
273
  * Insert comments into markdown text based on anchor texts with context
171
274
  */
@@ -175,165 +278,15 @@ export function insertCommentsIntoMarkdown(
175
278
  anchors: Map<string, CommentAnchorData | string>,
176
279
  options: InsertCommentsOptions = {}
177
280
  ): string {
178
- const { quiet = false, sectionBoundary = null } = options;
281
+ const { quiet = false, sectionBoundary = null, wrapAnchor = true, outStats } = options;
179
282
  let result = markdown;
180
283
  let unmatchedCount = 0;
284
+ let placedCount = 0;
181
285
  const duplicateWarnings: string[] = [];
182
286
  const usedPositions = new Set<number>(); // For tie-breaking: track used positions
183
287
 
184
- // Helper: Strip CriticMarkup from text to get "clean" version for matching
185
- function stripCriticMarkup(text: string): string {
186
- return text
187
- .replace(/\{\+\+([^+]*)\+\+\}/g, '$1') // insertions: keep inserted text
188
- .replace(/\{--([^-]*)--\}/g, '') // deletions: remove deleted text
189
- .replace(/\{~~([^~]*)~>([^~]*)~~\}/g, '$2') // substitutions: keep new text
190
- .replace(/\{>>[^<]*<<\}/g, '') // comments: remove
191
- .replace(/\[([^\]]*)\]\{\.mark\}/g, '$1'); // marked text: keep text
192
- }
193
-
194
- // Helper: Find anchor in text with multiple fallback strategies
195
- function findAnchorInText(anchor: string, text: string, before: string = '', after: string = ''): AnchorSearchResult {
196
- // If anchor is empty, skip directly to context-based matching
197
- if (!anchor || anchor.trim().length === 0) {
198
- // Jump to context-based strategies (Strategy 5)
199
- if (before || after) {
200
- const beforeLower = (before || '').toLowerCase();
201
- const afterLower = (after || '').toLowerCase();
202
- const textLower = text.toLowerCase();
203
-
204
- if (before && after) {
205
- const beforeIdx = textLower.indexOf(beforeLower.slice(-50));
206
- if (beforeIdx !== -1) {
207
- const searchStart = beforeIdx + beforeLower.slice(-50).length;
208
- const afterIdx = textLower.indexOf(afterLower.slice(0, 50), searchStart);
209
- if (afterIdx !== -1 && afterIdx - searchStart < 500) {
210
- return { occurrences: [searchStart], matchedAnchor: null, strategy: 'context-both' };
211
- }
212
- }
213
- }
214
-
215
- if (before) {
216
- const beforeIdx = textLower.lastIndexOf(beforeLower.slice(-30));
217
- if (beforeIdx !== -1) {
218
- return { occurrences: [beforeIdx + beforeLower.slice(-30).length], matchedAnchor: null, strategy: 'context-before' };
219
- }
220
- }
221
-
222
- if (after) {
223
- const afterIdx = textLower.indexOf(afterLower.slice(0, 30));
224
- if (afterIdx !== -1) {
225
- return { occurrences: [afterIdx], matchedAnchor: null, strategy: 'context-after' };
226
- }
227
- }
228
- }
229
- return { occurrences: [], matchedAnchor: null, strategy: 'empty-anchor' };
230
- }
231
-
232
- const anchorLower = anchor.toLowerCase();
233
- const textLower = text.toLowerCase();
234
-
235
- // Strategy 1: Direct match
236
- let occurrences = findAllOccurrences(textLower, anchorLower);
237
- if (occurrences.length > 0) {
238
- return { occurrences, matchedAnchor: anchor, strategy: 'direct' };
239
- }
240
-
241
- // Strategy 2: Normalized whitespace
242
- const normalizedAnchor = anchor.replace(/\s+/g, ' ').toLowerCase();
243
- const normalizedText = text.replace(/\s+/g, ' ').toLowerCase();
244
- let idx = normalizedText.indexOf(normalizedAnchor);
245
- if (idx !== -1) {
246
- return { occurrences: [idx], matchedAnchor: anchor, strategy: 'normalized' };
247
- }
248
-
249
- // Strategy 3: Try matching in stripped CriticMarkup version
250
- const strippedText = stripCriticMarkup(text);
251
- const strippedLower = strippedText.toLowerCase();
252
- occurrences = findAllOccurrences(strippedLower, anchorLower);
253
- if (occurrences.length > 0) {
254
- return { occurrences, matchedAnchor: anchor, strategy: 'stripped', stripped: true };
255
- }
256
-
257
- // Strategy 4: First N words of anchor (for long anchors)
258
- const words = anchor.split(/\s+/);
259
- if (words.length > 3) {
260
- for (let n = Math.min(6, words.length); n >= 3; n--) {
261
- const partialAnchor = words.slice(0, n).join(' ').toLowerCase();
262
- if (partialAnchor.length >= 15) {
263
- occurrences = findAllOccurrences(textLower, partialAnchor);
264
- if (occurrences.length > 0) {
265
- return { occurrences, matchedAnchor: words.slice(0, n).join(' '), strategy: 'partial-start' };
266
- }
267
- occurrences = findAllOccurrences(strippedLower, partialAnchor);
268
- if (occurrences.length > 0) {
269
- return { occurrences, matchedAnchor: words.slice(0, n).join(' '), strategy: 'partial-start-stripped', stripped: true };
270
- }
271
- }
272
- }
273
- }
274
-
275
- // Strategy 5: Use context (before/after) to find approximate position
276
- if (before || after) {
277
- const beforeLower = before.toLowerCase();
278
- const afterLower = after.toLowerCase();
279
-
280
- if (before && after) {
281
- const beforeIdx = textLower.indexOf(beforeLower.slice(-50));
282
- if (beforeIdx !== -1) {
283
- const searchStart = beforeIdx + beforeLower.slice(-50).length;
284
- const afterIdx = textLower.indexOf(afterLower.slice(0, 50), searchStart);
285
- if (afterIdx !== -1 && afterIdx - searchStart < 500) {
286
- return { occurrences: [searchStart], matchedAnchor: null, strategy: 'context-both' };
287
- }
288
- }
289
- }
290
-
291
- if (before) {
292
- const beforeIdx = textLower.lastIndexOf(beforeLower.slice(-30));
293
- if (beforeIdx !== -1) {
294
- return { occurrences: [beforeIdx + beforeLower.slice(-30).length], matchedAnchor: null, strategy: 'context-before' };
295
- }
296
- }
297
-
298
- if (after) {
299
- const afterIdx = textLower.indexOf(afterLower.slice(0, 30));
300
- if (afterIdx !== -1) {
301
- return { occurrences: [afterIdx], matchedAnchor: null, strategy: 'context-after' };
302
- }
303
- }
304
- }
305
-
306
- // Strategy 6: Try splitting anchor on common transition words
307
- const splitPatterns = [' ', ', ', '. ', ' - ', ' – '];
308
- for (const sep of splitPatterns) {
309
- if (anchor.includes(sep)) {
310
- const parts = anchor.split(sep).filter(p => p.length >= 4);
311
- for (const part of parts) {
312
- const partLower = part.toLowerCase();
313
- occurrences = findAllOccurrences(textLower, partLower);
314
- if (occurrences.length > 0 && occurrences.length < 5) {
315
- return { occurrences, matchedAnchor: part, strategy: 'split-match' };
316
- }
317
- }
318
- }
319
- }
320
-
321
- return { occurrences: [], matchedAnchor: null, strategy: 'failed' };
322
- }
323
-
324
- // Helper: Find all occurrences of needle in haystack
325
- function findAllOccurrences(haystack: string, needle: string): number[] {
326
- if (!needle || needle.length === 0) {
327
- return [];
328
- }
329
- const occurrences: number[] = [];
330
- let idx = 0;
331
- while ((idx = haystack.indexOf(needle, idx)) !== -1) {
332
- occurrences.push(idx);
333
- idx += 1;
334
- }
335
- return occurrences;
336
- }
288
+ // Anchor matching primitives live in lib/anchor-match.ts so that
289
+ // `rev verify-anchors` can use the same strategies for drift reporting.
337
290
 
338
291
  // Get all positions in order (for sequential tie-breaking)
339
292
  const commentsWithPositions = comments.map((c): CommentWithPos => {
@@ -364,6 +317,18 @@ export function insertCommentsIntoMarkdown(
364
317
  const proportion = Math.min(relativePos / sectionLength, 1.0);
365
318
  const markdownPos = Math.floor(proportion * result.length);
366
319
 
320
+ // For empty anchors, before/after context is the only signal that
321
+ // pinpoints the original split — without it, proportional placement
322
+ // can land mid-word or split unrelated phrases. Try context match
323
+ // first; only fall through to proportional when context is gone.
324
+ if ((!anchor || isEmpty) && (before || after)) {
325
+ const ctx = findAnchorInText('', result, before, after);
326
+ if (ctx.occurrences.length > 0) {
327
+ const pos = pushPastSectionHeading(result, ctx.occurrences[0]);
328
+ return { ...c, pos, anchorText: null, isEmpty: true, strategy: `ctx:${ctx.strategy}` };
329
+ }
330
+ }
331
+
367
332
  let insertPos = markdownPos;
368
333
 
369
334
  // Look for nearby word boundary
@@ -373,26 +338,52 @@ export function insertCommentsIntoMarkdown(
373
338
  insertPos = Math.max(0, markdownPos - 25) + spaceIdx;
374
339
  }
375
340
 
376
- // If we have anchor text, try to find it near this position
341
+ // If we have anchor text, try to find it near this position.
342
+ // Collect ALL occurrences in the local window, then disambiguate
343
+ // via before/after context + usedPositions — otherwise two
344
+ // comments sharing the same anchor word would both collide at
345
+ // the leftmost match. The context-scoring helper handles the
346
+ // "repeated formulaic prose" case using docx-side context, which
347
+ // is a stronger signal than raw distance to the proportional
348
+ // insertPos (insertPos is itself an approximation).
377
349
  if (anchor && !isEmpty) {
378
350
  const searchStart = Math.max(0, insertPos - 200);
379
351
  const searchEnd = Math.min(result.length, insertPos + 200);
380
352
  const localSearch = result.slice(searchStart, searchEnd).toLowerCase();
381
353
  const anchorLower = anchor.toLowerCase();
382
- const localIdx = localSearch.indexOf(anchorLower);
383
- if (localIdx !== -1) {
384
- return { ...c, pos: searchStart + localIdx, anchorText: anchor, anchorEnd: searchStart + localIdx + anchor.length, strategy: 'position+text' };
354
+
355
+ const localHits = findAllOccurrences(localSearch, anchorLower).map(i => searchStart + i);
356
+ if (localHits.length > 0) {
357
+ const chosen = pickBestOccurrence(localHits, result, before, after, anchor.length, usedPositions);
358
+ if (chosen >= 0) {
359
+ if (localHits.length > 1) {
360
+ duplicateWarnings.push(`"${anchor.slice(0, 40)}${anchor.length > 40 ? '...' : ''}" appears ${localHits.length} times in section window`);
361
+ }
362
+ usedPositions.add(chosen);
363
+ return { ...c, pos: chosen, anchorText: anchor, anchorEnd: chosen + anchor.length, strategy: 'position+text' };
364
+ }
385
365
  }
366
+
386
367
  // Try first few words
387
368
  const words = anchor.split(/\s+/).slice(0, 4).join(' ').toLowerCase();
388
369
  if (words.length >= 10) {
389
- const partialIdx = localSearch.indexOf(words);
390
- if (partialIdx !== -1) {
391
- return { ...c, pos: searchStart + partialIdx, anchorText: words, anchorEnd: searchStart + partialIdx + words.length, strategy: 'position+partial' };
370
+ const partialHits = findAllOccurrences(localSearch, words).map(i => searchStart + i);
371
+ if (partialHits.length > 0) {
372
+ const chosen = pickBestOccurrence(partialHits, result, before, after, words.length, usedPositions);
373
+ if (chosen >= 0) {
374
+ usedPositions.add(chosen);
375
+ return { ...c, pos: chosen, anchorText: words, anchorEnd: chosen + words.length, strategy: 'position+partial' };
376
+ }
392
377
  }
393
378
  }
394
379
  }
395
380
 
381
+ // A docPosition at the very start of a section maps to markdownPos=0,
382
+ // which sits before the file's `# Heading` line and gets rendered in
383
+ // the previous section. Push past the heading line so the comment
384
+ // stays inside the section it was authored in.
385
+ insertPos = pushPastSectionHeading(result, insertPos);
386
+
396
387
  return { ...c, pos: insertPos, anchorText: null, strategy: 'position-only' };
397
388
  }
398
389
  }
@@ -432,46 +423,14 @@ export function insertCommentsIntoMarkdown(
432
423
  duplicateWarnings.push(`"${matchedAnchor.slice(0, 40)}${matchedAnchor.length > 40 ? '...' : ''}" appears ${occurrences.length} times`);
433
424
  }
434
425
 
435
- let bestIdx = occurrences.find(p => !usedPositions.has(p)) ?? occurrences[0];
436
- let bestScore = -1;
437
-
438
- for (const pos of occurrences) {
439
- if (usedPositions.has(pos)) continue;
440
-
441
- let score = 0;
442
-
443
- if (before) {
444
- const contextBefore = result.slice(Math.max(0, pos - before.length - 20), pos).toLowerCase();
445
- const beforeLower = before.toLowerCase();
446
- const beforeWords = beforeLower.split(/\s+/).filter(w => w.length > 3);
447
- for (const word of beforeWords) {
448
- if (contextBefore.includes(word)) score += 2;
449
- }
450
- if (contextBefore.includes(beforeLower.slice(-30))) score += 5;
451
- }
452
-
453
- if (after) {
454
- const contextAfter = result.slice(pos + anchorLen, pos + anchorLen + after.length + 20).toLowerCase();
455
- const afterLower = after.toLowerCase();
456
- const afterWords = afterLower.split(/\s+/).filter(w => w.length > 3);
457
- for (const word of afterWords) {
458
- if (contextAfter.includes(word)) score += 2;
459
- }
460
- if (contextAfter.includes(afterLower.slice(0, 30))) score += 5;
461
- }
462
-
463
- if (score > bestScore || (score === bestScore && pos < bestIdx)) {
464
- bestScore = score;
465
- bestIdx = pos;
466
- }
467
- }
468
-
469
- usedPositions.add(bestIdx);
426
+ const bestIdx = pickBestOccurrence(occurrences, result, before, after, anchorLen, usedPositions);
427
+ const finalIdx = bestIdx >= 0 ? bestIdx : occurrences[0];
428
+ usedPositions.add(finalIdx);
470
429
 
471
430
  if (matchedAnchor) {
472
- return { ...c, pos: bestIdx, anchorText: matchedAnchor, anchorEnd: bestIdx + anchorLen };
431
+ return { ...c, pos: finalIdx, anchorText: matchedAnchor, anchorEnd: finalIdx + anchorLen };
473
432
  } else {
474
- return { ...c, pos: bestIdx, anchorText: null };
433
+ return { ...c, pos: finalIdx, anchorText: null };
475
434
  }
476
435
  });
477
436
 
@@ -489,19 +448,45 @@ export function insertCommentsIntoMarkdown(
489
448
  // Sort by position descending (insert from end to avoid offset issues)
490
449
  matched.sort((a, b) => b.pos - a.pos);
491
450
 
492
- // Insert each comment with anchor marking
451
+ // Insert each comment. With `wrapAnchor` (the default), the anchor text
452
+ // gets wrapped in `[anchor]{.mark}` so the rebuilt docx restores the
453
+ // original Word comment range. Without it, the comment block is inserted
454
+ // adjacent to the anchor and prose stays untouched — required for
455
+ // comments-only sync where multiple comments may share one anchor.
456
+ // Skip insertion when an identical comment already lives near the target.
457
+ // Re-running sync against the same docx would otherwise stack duplicate
458
+ // CriticMarkup blocks (`{>>R1: ...<<}{>>R1: ...<<}...`) on each invocation.
459
+ // A 200-char window catches both wrapped (`{>>...<<}[anchor]{.mark}`) and
460
+ // bare (`{>>...<<}anchor`) forms while ignoring incidental matches farther
461
+ // away.
462
+ let dedupedCount = 0;
493
463
  for (const c of matched) {
494
464
  const comment = `{>>${c.author}: ${c.text}<<}`;
495
- if (c.anchorText && c.anchorEnd) {
496
- // Replace anchor text with: {>>comment<<}[anchor]{.mark}
465
+ const windowStart = Math.max(0, c.pos - 200);
466
+ const windowEnd = Math.min(result.length, c.pos + 200);
467
+ if (result.slice(windowStart, windowEnd).includes(comment)) {
468
+ dedupedCount++;
469
+ continue;
470
+ }
471
+ if (wrapAnchor && c.anchorText && c.anchorEnd) {
497
472
  const before = result.slice(0, c.pos);
498
473
  const anchor = result.slice(c.pos, c.anchorEnd);
499
474
  const after = result.slice(c.anchorEnd);
500
475
  result = before + comment + `[${anchor}]{.mark}` + after;
501
476
  } else {
502
- // No anchor - just insert comment at position
503
- result = result.slice(0, c.pos) + ` ${comment}` + result.slice(c.pos);
477
+ // Insert comment at the anchor position with no surrounding whitespace
478
+ // tweaks; CriticMarkup blocks are invisible to readers, and adding a
479
+ // leading space would shift prose byte-for-byte (relevant when callers
480
+ // verify that --comments-only didn't touch the original).
481
+ result = result.slice(0, c.pos) + comment + result.slice(c.pos);
504
482
  }
483
+ placedCount++;
484
+ }
485
+
486
+ if (outStats) {
487
+ outStats.placed = placedCount;
488
+ outStats.deduped = dedupedCount;
489
+ outStats.unmatched = unmatchedCount;
505
490
  }
506
491
 
507
492
  // Log warnings unless quiet mode
@@ -509,6 +494,9 @@ export function insertCommentsIntoMarkdown(
509
494
  if (unmatchedCount > 0) {
510
495
  console.warn(`Warning: ${unmatchedCount} comment(s) could not be matched to anchor text`);
511
496
  }
497
+ if (dedupedCount > 0) {
498
+ console.warn(`Note: ${dedupedCount} comment(s) already present at anchor — skipped to avoid duplication`);
499
+ }
512
500
  if (duplicateWarnings.length > 0) {
513
501
  console.warn(`Warning: Duplicate anchor text found (using context & tie-breaks for placement):`);
514
502
  for (const w of duplicateWarnings) {
package/lib/response.ts CHANGED
@@ -46,7 +46,7 @@ export function parseCommentsWithReplies(text: string, file: string = ''): Comme
46
46
  if (matches.length === 0) continue;
47
47
 
48
48
  // Get context (surrounding text without comments)
49
- const contextLine = line.replace(/\{>>[^<]+<<\}/g, '').trim();
49
+ const contextLine = line.replace(/\{>>[\s\S]+?<<\}/g, '').trim();
50
50
  const context = contextLine.slice(0, 100) + (contextLine.length > 100 ? '...' : '');
51
51
 
52
52
  // First match is the original comment, rest are replies
@@ -42,6 +42,17 @@ export interface CommentAnchorsResult {
42
42
  fullDocText: string;
43
43
  }
44
44
 
45
/** One heading paragraph found in a docx body. */
export interface DocxHeading {
  /** Value of the paragraph's `<w:pStyle>` attribute, e.g. "Heading1". */
  style: string;
  /** Heading depth (1, 2, 3, ...) parsed from the style name; 0 if unknown. */
  level: number;
  /** Text content of the heading paragraph, concatenated across its runs. */
  text: string;
  /** Offset into fullDocText — the same coordinate system as CommentAnchorData.docPosition. */
  docPosition: number;
}
55
+
45
56
  export interface WordTable {
46
57
  markdown: string;
47
58
  rowCount: number;
@@ -331,6 +342,88 @@ export async function extractCommentAnchors(docxPath: string): Promise<CommentAn
331
342
  return { anchors, fullDocText };
332
343
  }
333
344
 
345
/**
 * Extract the heading paragraphs of a docx together with their text offsets.
 *
 * Offsets are meant to share the coordinate system of `extractCommentAnchors`'s
 * `fullDocText` / `CommentAnchorData.docPosition`: the running length of all
 * decoded `<w:t>` contents in `word/document.xml`.
 * NOTE(review): `extractCommentAnchors` is not visible from here — keep the
 * two position mappings in sync.
 *
 * Headings are paragraphs whose `<w:pStyle>` value contains "heading"
 * (case-insensitive). Reading styles directly is more reliable than
 * keyword-matching the concatenated body text — there, paragraph boundaries
 * are gone, so a section name such as "Results" can appear inside prose
 * ("results across countries") and the structured-abstract label "Methods:"
 * loses its colon when text runs are concatenated.
 *
 * @param docxPath Path to the .docx file.
 * @returns Headings in document order; an empty array when the archive has no
 *          `word/document.xml`. Paragraphs whose text is empty after trimming
 *          are skipped.
 * @throws Error when `docxPath` does not exist.
 */
export async function extractHeadings(docxPath: string): Promise<DocxHeading[]> {
  const { default: AdmZip } = await import('adm-zip');

  if (!fs.existsSync(docxPath)) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const documentEntry = new AdmZip(docxPath).getEntry('word/document.xml');
  if (!documentEntry) return [];
  const xml = documentEntry.getData().toString('utf8');

  // Record, for every <w:t> node, where it sits in the XML and where its
  // decoded content sits in the concatenated document text.
  interface TextSpan {
    xmlStart: number;
    xmlEnd: number;
    textStart: number;
    textEnd: number;
  }
  const spans: TextSpan[] = [];
  const textNodePattern = /<w:t[^>]*>([^<]*)<\/w:t>/g;
  let cursor = 0;
  for (let hit = textNodePattern.exec(xml); hit !== null; hit = textNodePattern.exec(xml)) {
    const decodedLength = decodeXmlEntities(hit[1] ?? '').length;
    spans.push({
      xmlStart: hit.index,
      xmlEnd: hit.index + hit[0].length,
      textStart: cursor,
      textEnd: cursor + decodedLength,
    });
    cursor += decodedLength;
  }

  // Map an XML offset to a text offset: the start of the text node that
  // contains it or, failing that, of the next text node after it. Past the
  // last node, fall back to the end of the document text (0 if no text).
  const toTextPos = (xmlPos: number): number => {
    const span = spans.find(s => xmlPos < s.xmlEnd);
    if (span) return span.textStart;
    const last = spans[spans.length - 1];
    return last ? last.textEnd : 0;
  };

  const found: DocxHeading[] = [];
  const paraPattern = /<w:p\b[^>]*>([\s\S]*?)<\/w:p>/g;
  for (let para = paraPattern.exec(xml); para !== null; para = paraPattern.exec(xml)) {
    const inner = para[1];

    const style = inner.match(/<w:pStyle[^>]*w:val="([^"]+)"/)?.[1];
    if (!style || !/heading/i.test(style)) continue;

    // Concatenate text runs; w:delText is included so a heading inside a
    // tracked deletion is still surfaced (verifying anchors against an
    // original draft).
    const textInRange = /<w:t[^>]*>([^<]*)<\/w:t>|<w:delText[^>]*>([^<]*)<\/w:delText>/g;
    const pieces: string[] = [];
    for (let run = textInRange.exec(inner); run !== null; run = textInRange.exec(inner)) {
      pieces.push(decodeXmlEntities(run[1] || run[2] || ''));
    }
    const text = pieces.join('').trim();
    if (!text) continue;

    // First run of digits in the style name is taken as the depth.
    const digits = style.match(/(\d+)/);
    found.push({
      style,
      level: digits ? parseInt(digits[1], 10) : 0,
      text,
      docPosition: toTextPos(para.index),
    });
  }

  return found;
}
426
+
334
427
  /**
335
428
  * Decode XML entities in text
336
429
  */