docrev 0.9.18 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. package/.gitattributes +1 -1
  2. package/CHANGELOG.md +173 -149
  3. package/PLAN-tables-and-postprocess.md +850 -850
  4. package/README.md +431 -406
  5. package/bin/rev.js +11 -11
  6. package/bin/rev.ts +145 -145
  7. package/completions/rev.bash +127 -127
  8. package/completions/rev.ps1 +210 -210
  9. package/completions/rev.zsh +207 -207
  10. package/dist/lib/build.d.ts +8 -0
  11. package/dist/lib/build.d.ts.map +1 -1
  12. package/dist/lib/build.js +62 -6
  13. package/dist/lib/build.js.map +1 -1
  14. package/dist/lib/commands/context.d.ts +1 -1
  15. package/dist/lib/commands/context.d.ts.map +1 -1
  16. package/dist/lib/commands/context.js +1 -1
  17. package/dist/lib/commands/context.js.map +1 -1
  18. package/dist/lib/commands/sections.js +7 -7
  19. package/dist/lib/commands/sections.js.map +1 -1
  20. package/dist/lib/commands/sync.d.ts.map +1 -1
  21. package/dist/lib/commands/sync.js +15 -14
  22. package/dist/lib/commands/sync.js.map +1 -1
  23. package/dist/lib/commands/utilities.js +164 -164
  24. package/dist/lib/commands/verify-anchors.js +6 -6
  25. package/dist/lib/commands/verify-anchors.js.map +1 -1
  26. package/dist/lib/commands/word-tools.js +8 -8
  27. package/dist/lib/grammar.js +3 -3
  28. package/dist/lib/macro-filter.lua +201 -0
  29. package/dist/lib/macros.d.ts +102 -0
  30. package/dist/lib/macros.d.ts.map +1 -0
  31. package/dist/lib/macros.js +218 -0
  32. package/dist/lib/macros.js.map +1 -0
  33. package/dist/lib/pdf-comments.js +44 -44
  34. package/dist/lib/plugins.js +57 -57
  35. package/dist/lib/pptx-color-filter.lua +37 -0
  36. package/dist/lib/pptx-themes.js +115 -115
  37. package/dist/lib/schema.d.ts.map +1 -1
  38. package/dist/lib/schema.js +34 -0
  39. package/dist/lib/schema.js.map +1 -1
  40. package/dist/lib/sections.d.ts +35 -0
  41. package/dist/lib/sections.d.ts.map +1 -1
  42. package/dist/lib/sections.js +81 -0
  43. package/dist/lib/sections.js.map +1 -1
  44. package/dist/lib/spelling.js +2 -2
  45. package/dist/lib/templates.js +387 -387
  46. package/dist/lib/themes.js +51 -51
  47. package/eslint.config.js +27 -27
  48. package/lib/anchor-match.ts +276 -276
  49. package/lib/annotations.ts +644 -644
  50. package/lib/build.ts +1766 -1694
  51. package/lib/citations.ts +160 -160
  52. package/lib/commands/build.ts +855 -855
  53. package/lib/commands/citations.ts +515 -515
  54. package/lib/commands/comments.ts +1050 -1050
  55. package/lib/commands/context.ts +176 -174
  56. package/lib/commands/core.ts +309 -309
  57. package/lib/commands/doi.ts +435 -435
  58. package/lib/commands/file-ops.ts +372 -372
  59. package/lib/commands/history.ts +320 -320
  60. package/lib/commands/index.ts +87 -87
  61. package/lib/commands/init.ts +259 -259
  62. package/lib/commands/merge-resolve.ts +378 -378
  63. package/lib/commands/preview.ts +178 -178
  64. package/lib/commands/project-info.ts +244 -244
  65. package/lib/commands/quality.ts +517 -517
  66. package/lib/commands/response.ts +454 -454
  67. package/lib/commands/section-boundaries.ts +82 -82
  68. package/lib/commands/sections.ts +451 -451
  69. package/lib/commands/sync.ts +709 -706
  70. package/lib/commands/text-ops.ts +449 -449
  71. package/lib/commands/utilities.ts +448 -448
  72. package/lib/commands/verify-anchors.ts +272 -272
  73. package/lib/commands/word-tools.ts +340 -340
  74. package/lib/comment-realign.ts +517 -517
  75. package/lib/config.ts +84 -84
  76. package/lib/crossref.ts +781 -781
  77. package/lib/csl.ts +191 -191
  78. package/lib/dependencies.ts +98 -98
  79. package/lib/diff-engine.ts +465 -465
  80. package/lib/doi-cache.ts +115 -115
  81. package/lib/doi.ts +897 -897
  82. package/lib/equations.ts +506 -506
  83. package/lib/errors.ts +346 -346
  84. package/lib/format.ts +541 -541
  85. package/lib/git.ts +326 -326
  86. package/lib/grammar.ts +303 -303
  87. package/lib/image-registry.ts +180 -180
  88. package/lib/import.ts +911 -911
  89. package/lib/journals.ts +543 -543
  90. package/lib/macro-filter.lua +201 -0
  91. package/lib/macros.ts +273 -0
  92. package/lib/merge.ts +633 -633
  93. package/lib/orcid.ts +144 -144
  94. package/lib/pdf-comments.ts +263 -263
  95. package/lib/pdf-import.ts +524 -524
  96. package/lib/plugins.ts +362 -362
  97. package/lib/postprocess.ts +188 -188
  98. package/lib/pptx-color-filter.lua +37 -37
  99. package/lib/pptx-template.ts +469 -469
  100. package/lib/pptx-themes.ts +483 -483
  101. package/lib/protect-restore.ts +520 -520
  102. package/lib/rate-limiter.ts +94 -94
  103. package/lib/response.ts +197 -197
  104. package/lib/restore-references.ts +240 -240
  105. package/lib/review.ts +327 -327
  106. package/lib/schema.ts +488 -454
  107. package/lib/scientific-words.ts +73 -73
  108. package/lib/sections.ts +425 -335
  109. package/lib/slides.ts +756 -756
  110. package/lib/spelling.ts +334 -334
  111. package/lib/templates.ts +526 -526
  112. package/lib/themes.ts +742 -742
  113. package/lib/trackchanges.ts +247 -247
  114. package/lib/tui.ts +450 -450
  115. package/lib/types.ts +550 -550
  116. package/lib/undo.ts +250 -250
  117. package/lib/utils.ts +69 -69
  118. package/lib/variables.ts +179 -179
  119. package/lib/word-extraction.ts +806 -806
  120. package/lib/word.ts +643 -643
  121. package/lib/wordcomments.ts +840 -840
  122. package/package.json +137 -137
  123. package/scripts/postbuild.js +47 -28
  124. package/skill/REFERENCE.md +539 -539
  125. package/skill/SKILL.md +295 -295
  126. package/tsconfig.json +26 -26
  127. package/types/index.d.ts +525 -525
  128. package/issues.md +0 -180
  129. package/site/assets/extra.css +0 -208
  130. package/site/commands.html +0 -926
  131. package/site/configuration.html +0 -469
  132. package/site/index.html +0 -288
  133. package/site/troubleshooting.html +0 -461
  134. package/site/workflow.html +0 -518
package/lib/import.ts CHANGED
@@ -1,911 +1,911 @@
1
- /**
2
- * Import functionality - convert Word docs to annotated Markdown
3
- *
4
- * Orchestration workflows + re-exports from extraction/diff/restore modules
5
- */
6
-
7
- import * as fs from 'fs';
8
- import * as path from 'path';
9
- import { stripAnnotations } from './annotations.js';
10
- import { readImageRegistry } from './image-registry.js';
11
- import { exec } from 'child_process';
12
- import { promisify } from 'util';
13
-
14
- // Import from split modules
15
- import {
16
- extractFromWord,
17
- extractWordComments,
18
- extractCommentAnchors,
19
- extractWordTables,
20
- } from './word-extraction.js';
21
- import type {
22
- WordComment,
23
- CommentAnchorData,
24
- WordTable,
25
- ExtractFromWordResult,
26
- } from './word-extraction.js';
27
- import {
28
- generateSmartDiff,
29
- generateAnnotatedDiff,
30
- cleanupAnnotations,
31
- fixCitationAnnotations,
32
- } from './diff-engine.js';
33
- import {
34
- restoreCrossrefFromWord,
35
- restoreImagesFromRegistry,
36
- parseVisibleComments,
37
- convertVisibleComments,
38
- } from './restore-references.js';
39
- import { findAnchorInText, findAllOccurrences } from './anchor-match.js';
40
-
41
/**
 * Choose which of several candidate positions in `result` an anchor most
 * likely refers to, using the docx-side `before` / `after` context.
 *
 * Scoring per candidate (higher wins, ties go to the lowest position):
 *  - +2 for every context word longer than 3 characters that appears in the
 *    text window adjacent to the candidate;
 *  - +5 when the last 30 characters of `before` (or the first 30 of `after`)
 *    appear verbatim in that window.
 * All comparisons are case-insensitive.
 *
 * @param occurrences   Candidate start offsets into `result`.
 * @param result        The markdown text being searched.
 * @param before        Text that preceded the anchor in the docx ('' if none).
 * @param after         Text that followed the anchor in the docx ('' if none).
 * @param anchorLen     Length of the matched anchor, used to locate the
 *                      "after" window.
 * @param usedPositions Offsets already claimed by other comments; these are
 *                      never returned so distinct comments do not stack on
 *                      the same anchor instance.
 * @returns The chosen offset, or -1 if there are no candidates or every
 *          candidate is already used.
 */
function pickBestOccurrence(
  occurrences: number[],
  result: string,
  before: string,
  after: string,
  anchorLen: number,
  usedPositions: Set<number>,
): number {
  if (occurrences.length === 0) return -1;
  if (occurrences.length === 1) {
    return usedPositions.has(occurrences[0]) ? -1 : occurrences[0];
  }

  let bestIdx = occurrences.find(p => !usedPositions.has(p)) ?? -1;
  if (bestIdx < 0) return -1;
  let bestScore = -1;

  // The context strings do not change between candidates, so normalise them
  // once instead of on every loop iteration.
  const beforeLower = before.toLowerCase();
  const beforeWords = beforeLower.split(/\s+/).filter(w => w.length > 3);
  const beforeTail = beforeLower.slice(-30);
  const afterLower = after.toLowerCase();
  const afterWords = afterLower.split(/\s+/).filter(w => w.length > 3);
  const afterHead = afterLower.slice(0, 30);

  for (const pos of occurrences) {
    if (usedPositions.has(pos)) continue;
    let score = 0;

    if (before) {
      // Window: the text just left of the candidate, with 20 chars of slack.
      const contextBefore = result
        .slice(Math.max(0, pos - before.length - 20), pos)
        .toLowerCase();
      for (const word of beforeWords) {
        if (contextBefore.includes(word)) score += 2;
      }
      if (contextBefore.includes(beforeTail)) score += 5;
    }

    if (after) {
      // Window: the text just right of the anchor, with 20 chars of slack.
      const contextAfter = result
        .slice(pos + anchorLen, pos + anchorLen + after.length + 20)
        .toLowerCase();
      for (const word of afterWords) {
        if (contextAfter.includes(word)) score += 2;
      }
      if (contextAfter.includes(afterHead)) score += 5;
    }

    // Strictly better score wins; on a tie prefer the earlier position.
    if (score > bestScore || (score === bestScore && pos < bestIdx)) {
      bestScore = score;
      bestIdx = pos;
    }
  }

  return bestIdx;
}
98
-
99
- // Re-export everything so existing imports from './import.js' still work
100
- export {
101
- extractFromWord,
102
- extractWordComments,
103
- extractCommentAnchors,
104
- extractHeadings,
105
- extractWordTables,
106
- } from './word-extraction.js';
107
- export type {
108
- WordComment,
109
- TextNode,
110
- CommentAnchorData,
111
- CommentAnchorsResult,
112
- DocxHeading,
113
- WordTable,
114
- ParsedRow,
115
- ExtractFromWordOptions,
116
- ExtractMessage,
117
- ExtractFromWordResult,
118
- } from './word-extraction.js';
119
-
120
- export {
121
- generateSmartDiff,
122
- generateAnnotatedDiff,
123
- cleanupAnnotations,
124
- fixCitationAnnotations,
125
- } from './diff-engine.js';
126
- export type {
127
- GenerateSmartDiffOptions,
128
- } from './diff-engine.js';
129
-
130
- export {
131
- restoreCrossrefFromWord,
132
- restoreImagesFromRegistry,
133
- parseVisibleComments,
134
- convertVisibleComments,
135
- } from './restore-references.js';
136
- export type {
137
- RestoreCrossrefResult,
138
- RestoreImagesResult,
139
- } from './restore-references.js';
140
-
141
- const execAsync = promisify(exec);
142
-
143
- // ============================================
144
- // Type Definitions (orchestration-specific)
145
- // ============================================
146
-
147
/** Options accepted by `insertCommentsIntoMarkdown`. */
export interface InsertCommentsOptions {
  /** When true, the summary `console.warn` messages are suppressed. Defaults to false. */
  quiet?: boolean;
  /**
   * Start/end offsets of the section within the docx text. When provided
   * (and the anchor carries a `docPosition`), placement first maps the
   * comment proportionally into the markdown before trying text matching.
   */
  sectionBoundary?: { start: number; end: number } | null;
  /**
   * Controls whether the anchor text is wrapped.
   *
   * - `true` (default): the anchor is rewritten as `[anchor]{.mark}` so the
   *   rebuilt docx restores the original Word comment range.
   * - `false`: comments are emitted as standalone `{>>...<<}` blocks next to
   *   the anchor and the prose stays byte-identical apart from those blocks.
   *
   * `sync --comments-only` passes `false` so that a draft revised after the
   * docx went out for review keeps its prose intact, and so several comments
   * sharing one anchor do not produce nested, broken markup.
   */
  wrapAnchor?: boolean;
  /**
   * Mutable output parameter. When supplied, the function writes its
   * counters here so callers can report placement outcomes:
   * `placed` = newly inserted comments, `deduped` = comments already present
   * at their anchor (skipped on re-sync), `unmatched` = comments whose
   * anchor could not be located.
   */
  outStats?: { placed: number; deduped: number; unmatched: number };
}
170
-
171
/**
 * A comment together with the position resolved for it in the markdown.
 * Produced inside `insertCommentsIntoMarkdown` for every input comment.
 */
export interface CommentWithPos {
  id: string;
  author: string;
  text: string;
  date: string;
  /** Insertion offset in the markdown; -1 when no position was resolved. */
  pos: number;
  /** The anchor text that was matched, or null when placement is position-only. */
  anchorText: string | null;
  /** Offset just past the matched anchor (`pos` + anchor length), when an anchor matched. */
  anchorEnd?: number;
  /** Set when the docx anchor span was empty. */
  isEmpty?: boolean;
  /** Label of the placement strategy that produced `pos` (e.g. 'reply', 'position-only'). */
  strategy?: string;
}
182
-
183
- export type { AnchorSearchResult } from './anchor-match.js';
184
-
185
/**
 * A markdown string split into a `prefix` and the remaining `content`.
 * NOTE(review): the producer of this type is outside the visible part of the
 * file — confirm the exact meaning of `prefix` against its caller.
 */
export interface MarkdownPrefixResult {
  prefix: string;
  content: string;
}
189
-
190
/** Options for `importWordWithTrackChanges`. */
export interface ImportWordWithTrackChangesOptions {
  /** Directory for extracted media; defaults to `media` next to the docx. */
  mediaDir?: string;
  /** Project directory used for the image registry; defaults to the docx's directory. */
  projectDir?: string;
}
194
-
195
/**
 * Value resolved by `importWordWithTrackChanges`.
 * NOTE(review): how each counter in `stats` is computed is outside the
 * visible part of the file — confirm before relying on exact semantics.
 */
export interface ImportWordWithTrackChangesResult {
  /** The imported markdown text. */
  text: string;
  /** Change and comment counters for the import. */
  stats: {
    insertions: number;
    deletions: number;
    substitutions: number;
    comments: number;
    total: number;
    hasTrackChanges: boolean;
    trackChangeStats: { insertions: number; deletions: number };
  };
  /** Media files reported as extracted from the docx. */
  extractedMedia: string[];
  /** Comments extracted from the docx. */
  comments: WordComment[];
}
209
-
210
/**
 * Optional inputs for a Word import.
 * NOTE(review): the consumer of these options is outside the visible part of
 * the file; the field meanings below are inferred from names only — confirm.
 */
export interface ImportFromWordOptions {
  /** Presumably the author name attributed to generated annotations. */
  author?: string;
  /** Presumably the existing markdown of the section being imported into. */
  sectionContent?: string;
  /** Presumably the directory holding figure files. */
  figuresDir?: string;
  /** Tables already extracted from the docx, if the caller has them. */
  wordTables?: WordTable[];
}
216
-
217
/**
 * Result of a Word import: annotated markdown plus summary counters.
 * NOTE(review): the producer is outside the visible part of the file —
 * confirm how the counters are derived.
 */
export interface ImportFromWordResult {
  /** Markdown carrying the generated annotations. */
  annotated: string;
  /** Change and comment counters for the import. */
  stats: {
    insertions: number;
    deletions: number;
    substitutions: number;
    comments: number;
    total: number;
  };
  /** Media files reported as extracted from the docx. */
  extractedMedia: string[];
}
228
-
229
/**
 * Record of one relocated file.
 * NOTE(review): presumably `from`/`to` are source and destination paths and
 * `name` is the file name — the producer is not visible here; confirm.
 */
export interface MovedFile {
  from: string;
  to: string;
  name: string;
}
234
-
235
/**
 * Outcome of moving extracted media: the files that were moved and the
 * error messages collected along the way.
 * NOTE(review): the producing function is outside the visible part of the
 * file — confirm what each `errors` entry contains.
 */
export interface MoveExtractedMediaResult {
  moved: MovedFile[];
  errors: string[];
}
239
-
240
- // ============================================
241
- // Functions
242
- // ============================================
243
-
244
/**
 * Keep an insertion point from landing on or before a section's heading.
 *
 * A comment authored at the very start of a Word section maps to offset 0,
 * but a section file normally opens with `# Heading`. Inserting `{>>...<<}`
 * in front of the heading marker makes Pandoc treat the line as ordinary
 * paragraph text, and the comment renders in the previous section. To avoid
 * that, any `pos` that falls before the end of the heading line is moved to
 * the first non-blank character after it.
 *
 * Note: the heading used is the first ATX heading line (`#`–`######` followed
 * by whitespace and text) found anywhere in `text`, not only one at offset 0.
 *
 * @param text Markdown being edited.
 * @param pos  Proposed insertion offset.
 * @returns `pos` unchanged when there is no heading or `pos` is already at or
 *          past the end of the heading line; otherwise the offset of the
 *          first character after the heading line and any blank lines.
 */
function pushPastSectionHeading(text: string, pos: number): number {
  // One match serves both purposes: where the heading text ends, and where
  // the line (including its trailing newline, if any) ends.
  const heading = /^#{1,6}\s.+(?:\n|$)/m.exec(text);
  if (heading === null) return pos;

  const matched = heading[0];
  const textLength = matched.endsWith('\n') ? matched.length - 1 : matched.length;
  const headingEnd = heading.index + textLength;

  // Already past the heading text: nothing to adjust.
  if (pos > 0 && pos >= headingEnd) return pos;

  // Otherwise jump past the heading line, then over blank lines, so the
  // result sits at the start of the first body paragraph.
  let cursor = heading.index + matched.length;
  for (; cursor < text.length; cursor += 1) {
    const ch = text[cursor];
    if (ch !== '\n' && ch !== '\r') break;
  }
  return cursor;
}
271
-
272
/**
 * Move `pos` to a nearby whitespace character so that a proportionally
 * computed insertion point never splits a word.
 *
 * The search widens one character at a time up to 50 characters in each
 * direction; at equal distance the position after `pos` is preferred over
 * the one before it.
 *
 * @param text Text the position refers to.
 * @param pos  Proposed offset.
 * @returns 0 for `pos <= 0`, `text.length` for `pos >= text.length`, the
 *          nearest whitespace offset within 50 characters, or `pos` itself
 *          when no whitespace is found in range.
 */
function snapToWordBoundary(text: string, pos: number): number {
  const isWhitespaceAt = (index: number): boolean => /\s/.test(text[index] ?? '');

  if (pos <= 0) return 0;
  if (pos >= text.length) return text.length;
  if (isWhitespaceAt(pos)) return pos;

  let distance = 1;
  while (distance <= 50) {
    const ahead = pos + distance;
    if (ahead < text.length && isWhitespaceAt(ahead)) return ahead;

    const behind = pos - distance;
    if (behind >= 0 && isWhitespaceAt(behind)) return behind;

    distance += 1;
  }
  return pos;
}
286
-
287
/**
 * Last-resort placement, used when every text-matching strategy has failed.
 *
 * The docx still records where the comment's range marker
 * (`<w:commentRangeStart w:id="N">`) sat inside the body text. That offset is
 * a structural anchor even when the anchored span is empty and the
 * surrounding context no longer matches the target.
 *
 * The marker's relative position in the docx (`docPosition / docLength`,
 * capped at 1) is scaled to the target's length, snapped to a word boundary,
 * and kept clear of the section heading. The result is approximate when the
 * document was heavily restructured, but the comment lands in roughly the
 * right neighbourhood instead of being dropped, and the reviewer can
 * relocate it on their next pass.
 *
 * @param anchorData Anchor record carrying `docPosition` and `docLength`.
 * @param target     Markdown text to place the comment in.
 * @returns The insertion offset, or null when `docLength` is not positive.
 */
function proportionalFallback(
  anchorData: CommentAnchorData,
  target: string,
): number | null {
  const { docPosition, docLength } = anchorData;
  if (docLength <= 0) return null;

  const fraction = Math.min(docPosition / docLength, 1.0);
  const approximate = Math.floor(fraction * target.length);
  const snapped = snapToWordBoundary(target, approximate);
  return pushPastSectionHeading(target, snapped);
}
308
-
309
/**
 * Insert Word review comments into markdown as CriticMarkup
 * `{>>author: text<<}` blocks, placing each one at (or next to) the text it
 * was anchored to in the docx.
 *
 * Placement is tried in this order for each root comment:
 *  1. Position-based (only with `options.sectionBoundary` and an anchor that
 *     carries `docPosition`): map the docx offset proportionally into the
 *     markdown, then look for the anchor text within ±200 characters.
 *  2. Context-only matching for empty anchors.
 *  3. Text matching via `findAnchorInText`, disambiguated by
 *     `pickBestOccurrence` when the anchor occurs more than once.
 *  4. `proportionalFallback` when nothing matched.
 * Replies never go through matching; they are emitted directly after their
 * root comment with a `↪ ` author prefix.
 *
 * Side effects: unless `options.quiet` is set, summary messages are written
 * with `console.warn`; when `process.env.DEBUG` is set, diagnostics are
 * written with `console.log`; when `options.outStats` is supplied its
 * `placed` / `deduped` / `unmatched` fields are overwritten.
 *
 * @param markdown Markdown text to annotate (not modified; a new string is returned).
 * @param comments Comments extracted from the docx, replies included.
 * @param anchors  Anchor info keyed by comment id: either the legacy plain
 *                 anchor string or a `CommentAnchorData` record.
 * @param options  See `InsertCommentsOptions`.
 * @returns The markdown with comment blocks inserted.
 */
export function insertCommentsIntoMarkdown(
  markdown: string,
  comments: WordComment[],
  anchors: Map<string, CommentAnchorData | string>,
  options: InsertCommentsOptions = {}
): string {
  const { quiet = false, sectionBoundary = null, wrapAnchor = true, outStats } = options;
  let result = markdown;
  let unmatchedCount = 0;
  let placedCount = 0;
  const duplicateWarnings: string[] = [];
  const usedPositions = new Set<number>(); // For tie-breaking: track used positions

  // Resolve threading: replies share their parent's anchor in Word, so they
  // must inherit the parent's position and ride alongside it as one cluster.
  // Letting each reply run through anchor scoring scatters the cluster (the
  // same docPosition forces `usedPositions` to push later replies onto a
  // different occurrence), which on re-build looks like independent comments
  // and loses the paraIdParent threading. See gcol33/docrev issue #2.
  const inputById = new Map<string, WordComment>();
  for (const c of comments) inputById.set(c.id, c);

  // Walk parent links up to the top-most comment present in the input.
  // `seen` guards against cyclic parent references.
  function rootIdOf(c: WordComment): string {
    let cur: WordComment = c;
    const seen = new Set<string>();
    while (cur.parentId && !seen.has(cur.id)) {
      seen.add(cur.id);
      const parent = inputById.get(cur.parentId);
      if (!parent || parent === cur) break;
      cur = parent;
    }
    return cur.id;
  }

  // reply id -> root id; comments that are their own root are absent.
  const replyRootId = new Map<string, string>();
  for (const c of comments) {
    const root = rootIdOf(c);
    if (root !== c.id) replyRootId.set(c.id, root);
  }

  // Anchor matching primitives live in lib/anchor-match.ts so that
  // `rev verify-anchors` can use the same strategies for drift reporting.

  // Get all positions in order (for sequential tie-breaking).
  // Replies skip scoring entirely — they piggyback on their root's position
  // in the emit pass below.
  const commentsWithPositions = comments.map((c): CommentWithPos => {
    if (replyRootId.has(c.id)) {
      return { ...c, pos: -1, anchorText: null, strategy: 'reply' };
    }
    const anchorData = anchors.get(c.id);
    if (!anchorData) {
      unmatchedCount++;
      return { ...c, pos: -1, anchorText: null };
    }

    // Support both old format (string) and new format ({anchor, before, after})
    const anchor = typeof anchorData === 'string' ? anchorData : anchorData.anchor;
    const before = typeof anchorData === 'object' ? anchorData.before : '';
    const after = typeof anchorData === 'object' ? anchorData.after : '';
    const isEmpty = typeof anchorData === 'object' && anchorData.isEmpty;
    const docPosition = typeof anchorData === 'object' ? anchorData.docPosition : undefined;

    // Position-based insertion (most reliable)
    if (sectionBoundary && docPosition !== undefined) {
      const sectionLength = sectionBoundary.end - sectionBoundary.start;
      if (sectionLength > 0) {
        // Positions before the section start are clamped to its beginning.
        const relativePos = Math.max(0, docPosition - sectionBoundary.start);

        const proportion = Math.min(relativePos / sectionLength, 1.0);
        const markdownPos = Math.floor(proportion * result.length);

        // For empty anchors, before/after context is the only signal that
        // pinpoints the original split — without it, proportional placement
        // can land mid-word or split unrelated phrases. Try context match
        // first; only fall through to proportional when context is gone.
        if ((!anchor || isEmpty) && (before || after)) {
          const ctx = findAnchorInText('', result, before, after);
          if (ctx.occurrences.length > 0) {
            const pos = pushPastSectionHeading(result, ctx.occurrences[0]);
            return { ...c, pos, anchorText: null, isEmpty: true, strategy: `ctx:${ctx.strategy}` };
          }
        }

        let insertPos = markdownPos;

        // Snap forward to the next space within 25 characters. The search
        // starts at markdownPos itself; a fixed offset into a window that
        // begins at max(0, markdownPos - 25) would start too late whenever
        // markdownPos < 25 and skip nearer word boundaries.
        const spaceOffset = result.slice(markdownPos, markdownPos + 25).indexOf(' ');
        if (spaceOffset !== -1) {
          insertPos = markdownPos + spaceOffset;
        }

        // If we have anchor text, try to find it near this position.
        // Collect ALL occurrences in the local window, then disambiguate
        // via before/after context + usedPositions — otherwise two
        // comments sharing the same anchor word would both collide at
        // the leftmost match. The context-scoring helper handles the
        // "repeated formulaic prose" case using docx-side context, which
        // is a stronger signal than raw distance to the proportional
        // insertPos (insertPos is itself an approximation).
        if (anchor && !isEmpty) {
          const searchStart = Math.max(0, insertPos - 200);
          const searchEnd = Math.min(result.length, insertPos + 200);
          const localSearch = result.slice(searchStart, searchEnd).toLowerCase();
          const anchorLower = anchor.toLowerCase();

          const localHits = findAllOccurrences(localSearch, anchorLower).map(i => searchStart + i);
          if (localHits.length > 0) {
            const chosen = pickBestOccurrence(localHits, result, before, after, anchor.length, usedPositions);
            if (chosen >= 0) {
              if (localHits.length > 1) {
                duplicateWarnings.push(`"${anchor.slice(0, 40)}${anchor.length > 40 ? '...' : ''}" appears ${localHits.length} times in section window`);
              }
              usedPositions.add(chosen);
              return { ...c, pos: chosen, anchorText: anchor, anchorEnd: chosen + anchor.length, strategy: 'position+text' };
            }
          }

          // Try first few words
          const words = anchor.split(/\s+/).slice(0, 4).join(' ').toLowerCase();
          if (words.length >= 10) {
            const partialHits = findAllOccurrences(localSearch, words).map(i => searchStart + i);
            if (partialHits.length > 0) {
              const chosen = pickBestOccurrence(partialHits, result, before, after, words.length, usedPositions);
              if (chosen >= 0) {
                usedPositions.add(chosen);
                return { ...c, pos: chosen, anchorText: words, anchorEnd: chosen + words.length, strategy: 'position+partial' };
              }
            }
          }
        }

        // A docPosition at the very start of a section maps to markdownPos=0,
        // which sits before the file's `# Heading` line and gets rendered in
        // the previous section. Push past the heading line so the comment
        // stays inside the section it was authored in.
        insertPos = pushPastSectionHeading(result, insertPos);

        return { ...c, pos: insertPos, anchorText: null, strategy: 'position-only' };
      }
    }

    // Handle empty anchors
    if (!anchor || isEmpty) {
      if (before || after) {
        const { occurrences } = findAnchorInText('', result, before, after);
        if (occurrences.length > 0) {
          return { ...c, pos: occurrences[0], anchorText: null, isEmpty: true };
        }
      }
      // Last resort: docx carried a structural marker at docPosition; map
      // it proportionally into the target so the comment isn't dropped.
      if (typeof anchorData === 'object') {
        const fallback = proportionalFallback(anchorData, result);
        if (fallback !== null) {
          return { ...c, pos: fallback, anchorText: null, isEmpty: true, strategy: 'proportional-fallback' };
        }
      }
      unmatchedCount++;
      return { ...c, pos: -1, anchorText: null, isEmpty: true };
    }

    // Text-based matching strategies
    const { occurrences, matchedAnchor } = findAnchorInText(anchor, result, before, after);

    if (occurrences.length === 0) {
      // Same last-resort as the empty-anchor path: anchor text is gone from
      // the target, but the marker's text-offset survived extraction.
      if (typeof anchorData === 'object') {
        const fallback = proportionalFallback(anchorData, result);
        if (fallback !== null) {
          return { ...c, pos: fallback, anchorText: null, strategy: 'proportional-fallback' };
        }
      }
      unmatchedCount++;
      return { ...c, pos: -1, anchorText: null };
    }

    const anchorLen = matchedAnchor ? matchedAnchor.length : 0;

    if (occurrences.length === 1) {
      if (matchedAnchor) {
        return { ...c, pos: occurrences[0], anchorText: matchedAnchor, anchorEnd: occurrences[0] + anchorLen };
      } else {
        return { ...c, pos: occurrences[0], anchorText: null };
      }
    }

    // Multiple occurrences - use context for disambiguation
    if (matchedAnchor) {
      duplicateWarnings.push(`"${matchedAnchor.slice(0, 40)}${matchedAnchor.length > 40 ? '...' : ''}" appears ${occurrences.length} times`);
    }

    const bestIdx = pickBestOccurrence(occurrences, result, before, after, anchorLen, usedPositions);
    // Every occurrence already claimed: reuse the first rather than drop the comment.
    const finalIdx = bestIdx >= 0 ? bestIdx : occurrences[0];
    usedPositions.add(finalIdx);

    if (matchedAnchor) {
      return { ...c, pos: finalIdx, anchorText: matchedAnchor, anchorEnd: finalIdx + anchorLen };
    } else {
      return { ...c, pos: finalIdx, anchorText: null };
    }
  });

  // Group comments into clusters (root + ordered replies). The root carries
  // the resolved position; replies inherit it and ride along in input order
  // so the rebuilt CriticMarkup looks like `{>>p<<}{>>r1<<}{>>r2<<}[anchor]`
  // and adjacency-based reply detection picks the cluster up again.
  const byId = new Map<string, CommentWithPos>();
  for (const cwp of commentsWithPositions) byId.set(cwp.id, cwp);
  const repliesByRoot = new Map<string, CommentWithPos[]>();
  for (const c of comments) {
    const rootId = replyRootId.get(c.id);
    if (!rootId) continue;
    const cwp = byId.get(c.id);
    if (!cwp) continue;
    const list = repliesByRoot.get(rootId);
    if (list) list.push(cwp);
    else repliesByRoot.set(rootId, [cwp]);
  }

  // Replies whose root never resolved (parent missing from the input slice or
  // parent unmatched) count as unmatched too — there's no position to attach
  // them to.
  for (const [rootId, replies] of repliesByRoot) {
    const root = byId.get(rootId);
    if (!root || root.pos < 0) {
      unmatchedCount += replies.length;
    }
  }

  // Roots only — replies attach during emission.
  const rootsWithPos = commentsWithPositions.filter(
    c => !replyRootId.has(c.id)
  );

  // Log any unmatched roots for debugging
  const unmatched = rootsWithPos.filter((c) => c.pos < 0);
  if (process.env.DEBUG) {
    console.log(`[DEBUG] insertComments: ${comments.length} input, ${rootsWithPos.length} roots, ${unmatched.length} unmatched roots, ${replyRootId.size} replies`);
    if (unmatched.length > 0) {
      unmatched.forEach(c => console.log(`[DEBUG] Unmatched ID=${c.id}: anchor="${(c.anchorText || 'none').slice(0,30)}"`));
    }
  }

  const matchedRoots = rootsWithPos.filter((c) => c.pos >= 0);

  // Sort by position descending (insert from end to avoid offset issues)
  matchedRoots.sort((a, b) => b.pos - a.pos);

  // Insert each cluster. With `wrapAnchor` (the default), the anchor text
  // gets wrapped in `[anchor]{.mark}` so the rebuilt docx restores the
  // original Word comment range. Without it, the comment block is inserted
  // adjacent to the anchor and prose stays untouched — required for
  // comments-only sync where multiple comments may share one anchor.
  // Skip insertion when the parent's CriticMarkup already lives near the
  // target — re-running sync against the same docx would otherwise stack
  // duplicates. A 200-char window catches both wrapped
  // (`{>>...<<}[anchor]{.mark}`) and bare (`{>>...<<}anchor`) forms while
  // ignoring incidental matches farther away.
  let dedupedCount = 0;
  for (const c of matchedRoots) {
    const parentBlock = `{>>${c.author}: ${c.text}<<}`;
    const replies = repliesByRoot.get(c.id) ?? [];
    const windowStart = Math.max(0, c.pos - 200);
    const windowEnd = Math.min(result.length, c.pos + 200);
    if (result.slice(windowStart, windowEnd).includes(parentBlock)) {
      // Cluster already synced; treat all members as deduped.
      dedupedCount += 1 + replies.length;
      continue;
    }
    // Replies carry an explicit `↪ ` author prefix so the round-trip does not
    // depend on positional adjacency in the markdown. On dense reviewer docs
    // distinct clusters frequently land at the same anchor position; without
    // the prefix the re-parse would misthread them. The injection side strips
    // `↪ ` back off the author so Word renders the original name.
    const replyBlocks = replies.map(r => `{>>↪ ${r.author}: ${r.text}<<}`);
    const combined = parentBlock + replyBlocks.join('');
    if (wrapAnchor && c.anchorText && c.anchorEnd) {
      const before = result.slice(0, c.pos);
      const anchor = result.slice(c.pos, c.anchorEnd);
      const after = result.slice(c.anchorEnd);
      result = before + combined + `[${anchor}]{.mark}` + after;
    } else {
      result = result.slice(0, c.pos) + combined + result.slice(c.pos);
    }
    placedCount += 1 + replies.length;
  }

  if (outStats) {
    outStats.placed = placedCount;
    outStats.deduped = dedupedCount;
    outStats.unmatched = unmatchedCount;
  }

  // Log warnings unless quiet mode
  if (!quiet) {
    if (unmatchedCount > 0) {
      console.warn(`Warning: ${unmatchedCount} comment(s) could not be matched to anchor text`);
    }
    if (dedupedCount > 0) {
      console.warn(`Note: ${dedupedCount} comment(s) already present at anchor — skipped to avoid duplication`);
    }
    if (duplicateWarnings.length > 0) {
      console.warn(`Warning: Duplicate anchor text found (using context & tie-breaks for placement):`);
      for (const w of duplicateWarnings) {
        console.warn(` - ${w}`);
      }
    }
  }

  return result;
}
629
-
630
/**
 * Import Word document with track changes directly as CriticMarkup.
 *
 * Steps, in order:
 *  1. Run pandoc once on the docx to count `![...](media/...)` image
 *     references and compare that with the number of figures in the project's
 *     image registry; media is only extracted when the docx has more images
 *     than the registry.
 *  2. Extract text, comments and anchors via `extractFromWord`.
 *  3. Restore crossref references and registry images, insert comments, and
 *     clean up the resulting annotations.
 *  4. Count the CriticMarkup opening markers in the final text.
 *
 * @param docxPath - Path to the `.docx` file.
 * @param options - `mediaDir` defaults to `<docx dir>/media`; `projectDir`
 *   defaults to the docx's directory.
 * @returns The annotated text, marker counts, extracted media paths and the
 *   raw comments returned by the extractor.
 * @throws Rejects if pandoc cannot be executed or exits non-zero, or if
 *   extraction fails.
 */
export async function importWordWithTrackChanges(
  docxPath: string,
  options: ImportWordWithTrackChangesOptions = {}
): Promise<ImportWordWithTrackChangesResult> {
  const { mediaDir, projectDir } = options;
  const docxDir = path.dirname(docxPath);
  const targetMediaDir = mediaDir || path.join(docxDir, 'media');
  const targetProjectDir = projectDir || docxDir;

  const registry = readImageRegistry(targetProjectDir);
  const registryCount = registry?.figures?.length ?? 0;
  const hasRegistry = registryCount > 0;

  // First pass: count images.
  // The path is passed as an argv entry (no shell), so quotes, `$`, backticks
  // or spaces in a reviewer-supplied file name cannot alter the command.
  const { execFile } = await import('child_process');
  const { stdout: rawText } = await promisify(execFile)(
    'pandoc',
    [docxPath, '-t', 'markdown', '--wrap=none', '--track-changes=all'],
    { maxBuffer: 50 * 1024 * 1024 }
  );

  const wordImageCount = (rawText.match(/!\[[^\]]*\]\(media\/[^)]+\)/g) || []).length;
  const needsMediaExtraction = wordImageCount > registryCount;

  if (hasRegistry) {
    console.log(`Registry has ${registryCount} figures, Word doc has ${wordImageCount} images`);
    if (needsMediaExtraction) {
      console.log(`Extracting media (${wordImageCount - registryCount} new image(s) detected)`);
    } else {
      console.log(`Using existing figures from registry`);
    }
  }

  // Extract from Word
  const extracted = await extractFromWord(docxPath, {
    mediaDir: targetMediaDir,
    skipMediaExtraction: !needsMediaExtraction,
  });

  let text = extracted.text;
  const extractedMedia = extracted.extractedMedia || [];
  const comments = extracted.comments || [];
  const anchors = extracted.anchors || new Map();

  // Log messages
  for (const msg of extracted.messages || []) {
    if (msg.type === 'info') {
      console.log(msg.message);
    } else if (msg.type === 'warning') {
      console.warn(`Warning: ${msg.message}`);
    }
  }

  // Restore crossref
  const crossrefResult = restoreCrossrefFromWord(text, targetProjectDir);
  text = crossrefResult.text;
  if (crossrefResult.restored > 0) {
    console.log(`Restored ${crossrefResult.restored} crossref reference(s)`);
  }

  // Restore images
  const imageRestoreResult = restoreImagesFromRegistry(text, targetProjectDir, crossrefResult.restoredLabels);
  text = imageRestoreResult.text;
  if (imageRestoreResult.restored > 0) {
    console.log(`Restored ${imageRestoreResult.restored} image(s) from registry`);
  }

  // Insert comments. Report how many were actually placed rather than how
  // many were supplied: unmatched or already-present comments are not inserted.
  if (comments.length > 0) {
    const placement = { placed: 0, deduped: 0, unmatched: 0 };
    text = insertCommentsIntoMarkdown(text, comments, anchors, { outStats: placement });
    console.log(`Inserted ${placement.placed} comment(s)`);
  }

  // Clean up
  text = cleanupAnnotations(text);

  // Count final changes by their CriticMarkup opening markers
  const insertions = (text.match(/\{\+\+/g) || []).length;
  const deletions = (text.match(/\{--/g) || []).length;
  const substitutions = (text.match(/\{~~/g) || []).length;
  const commentCount = (text.match(/\{>>/g) || []).length;

  return {
    text,
    stats: {
      insertions,
      deletions,
      substitutions,
      comments: commentCount,
      total: insertions + deletions + substitutions + commentCount,
      hasTrackChanges: extracted.hasTrackChanges,
      trackChangeStats: extracted.trackChangeStats,
    },
    extractedMedia,
    comments,
  };
}
729
-
730
/**
 * Legacy import function: Word doc → annotated MD via diff.
 *
 * Three paths:
 *  - `options.sectionContent` provided: the docx is not read; the content is
 *    cleaned up, its CriticMarkup markers are counted, and it is returned.
 *  - The docx has track changes: crossrefs and registry images are restored,
 *    comments inserted, annotations cleaned, and the result returned without
 *    diffing against the original markdown.
 *  - No track changes: a warning is printed and the extracted Word text is
 *    diffed against the annotation-stripped original markdown.
 *
 * @param docxPath - Path to the reviewed `.docx`.
 * @param originalMdPath - Path to the original markdown; its directory is used
 *   as the project directory for crossref/image restoration.
 * @param options - See `ImportFromWordOptions`. `author` defaults to
 *   `'Reviewer'`; `figuresDir` defaults to the docx's directory.
 * @returns Annotated markdown, marker counts and extracted media paths.
 */
export async function importFromWord(
  docxPath: string,
  originalMdPath: string,
  options: ImportFromWordOptions = {}
): Promise<ImportFromWordResult> {
  const { author = 'Reviewer', sectionContent, figuresDir } = options;
  const projectDir = path.dirname(originalMdPath);

  // Tally CriticMarkup opening markers; shared by every return path.
  const countMarkup = (text: string): ImportFromWordResult['stats'] => {
    const occurrencesOf = (marker: RegExp): number => (text.match(marker) || []).length;
    const insertions = occurrencesOf(/\{\+\+/g);
    const deletions = occurrencesOf(/\{--/g);
    const substitutions = occurrencesOf(/\{~~/g);
    const comments = occurrencesOf(/\{>>/g);
    return {
      insertions,
      deletions,
      substitutions,
      comments,
      total: insertions + deletions + substitutions + comments,
    };
  };

  // Restore crossref references, then registry images, logging what changed.
  const restoreReferences = (text: string): string => {
    const crossrefResult = restoreCrossrefFromWord(text, projectDir);
    if (crossrefResult.restored > 0) {
      console.log(`Restored ${crossrefResult.restored} crossref reference(s)`);
    }
    const imageRestoreResult = restoreImagesFromRegistry(
      crossrefResult.text,
      projectDir,
      crossrefResult.restoredLabels
    );
    if (imageRestoreResult.restored > 0) {
      console.log(`Restored ${imageRestoreResult.restored} image(s) from registry`);
    }
    return imageRestoreResult.text;
  };

  // Pre-extracted section content: nothing to read from the docx.
  if (sectionContent !== undefined) {
    const annotated = cleanupAnnotations(sectionContent);
    return { annotated, stats: countMarkup(annotated), extractedMedia: [] };
  }

  const mediaDir = figuresDir || path.dirname(docxPath);
  const extracted = await extractFromWord(docxPath, { mediaDir });
  const extractedMedia: string[] = extracted.extractedMedia || [];
  // Prefer tables found in the docx; fall back to caller-supplied tables so
  // `options.wordTables` is not silently discarded when extraction has none.
  const wordTables: WordTable[] = extracted.tables ?? options.wordTables ?? [];
  const hasTrackChanges = extracted.hasTrackChanges || false;

  for (const msg of extracted.messages || []) {
    if (msg.type === 'info') {
      console.log(msg.message);
    } else if (msg.type === 'warning') {
      console.warn(`Warning: ${msg.message}`);
    }
  }

  if (hasTrackChanges) {
    let wordText = restoreReferences(extracted.text);

    const comments = extracted.comments || [];
    const anchors = extracted.anchors || new Map();
    if (comments.length > 0) {
      // Report how many comments were actually placed, not how many were given.
      const placement = { placed: 0, deduped: 0, unmatched: 0 };
      wordText = insertCommentsIntoMarkdown(wordText, comments, anchors, { outStats: placement });
      console.log(`Inserted ${placement.placed} comment(s)`);
    }

    wordText = cleanupAnnotations(wordText);

    return { annotated: wordText, stats: countMarkup(wordText), extractedMedia };
  }

  console.warn('Warning: No track changes detected in Word document.');
  console.warn(' For best results, reviewers should use Track Changes in Word.');
  console.warn(' Falling back to diff-based import (comparing against original MD).');
  console.warn(' This approach may produce less accurate change annotations.');

  const wordText = restoreReferences(extracted.text);

  // Read original markdown and strip existing annotations
  const originalMd = stripAnnotations(fs.readFileSync(originalMdPath, 'utf-8'), { keepComments: false });

  // Load image registry
  const imageRegistry = readImageRegistry(projectDir);

  // Generate diff, then clean up, fix citation annotations, convert visible comments
  let annotated = generateSmartDiff(originalMd, wordText, author, { wordTables, imageRegistry });
  annotated = cleanupAnnotations(annotated);
  annotated = fixCitationAnnotations(annotated, originalMd);
  annotated = convertVisibleComments(annotated);

  return { annotated, stats: countMarkup(annotated), extractedMedia };
}
880
-
881
/**
 * Copy extracted media files into a figures directory with better names.
 *
 * Despite the name, sources are copied (`fs.copyFileSync`), not moved — the
 * originals stay in place. Files are renamed `<prefix><n><ext>` where `n` is
 * the 1-based index in `mediaFiles` and `ext` is the lower-cased source
 * extension. The directory is created (recursively) when it does not exist.
 *
 * @param mediaFiles - Paths of the files to copy, in naming order.
 * @param figuresDir - Destination directory.
 * @param prefix - File-name prefix for the copies (default `'figure'`).
 * @returns `moved` lists each successful copy; `errors` carries one message
 *   per file that could not be copied. A copy failure never throws.
 */
export function moveExtractedMedia(
  mediaFiles: string[],
  figuresDir: string,
  prefix: string = 'figure'
): MoveExtractedMediaResult {
  const moved: MovedFile[] = [];
  const errors: string[] = [];

  if (!fs.existsSync(figuresDir)) {
    fs.mkdirSync(figuresDir, { recursive: true });
  }

  for (const [index, src] of mediaFiles.entries()) {
    const ext = path.extname(src).toLowerCase();
    const newName = `${prefix}${index + 1}${ext}`;
    const dest = path.join(figuresDir, newName);

    try {
      fs.copyFileSync(src, dest);
      moved.push({ from: src, to: dest, name: newName });
    } catch (err: unknown) {
      // Narrow instead of assuming an Error: anything can be thrown.
      const reason = err instanceof Error ? err.message : String(err);
      errors.push(`Failed to copy ${src}: ${reason}`);
    }
  }

  return { moved, errors };
}
1
+ /**
2
+ * Import functionality - convert Word docs to annotated Markdown
3
+ *
4
+ * Orchestration workflows + re-exports from extraction/diff/restore modules
5
+ */
6
+
7
+ import * as fs from 'fs';
8
+ import * as path from 'path';
9
+ import { stripAnnotations } from './annotations.js';
10
+ import { readImageRegistry } from './image-registry.js';
11
+ import { exec } from 'child_process';
12
+ import { promisify } from 'util';
13
+
14
+ // Import from split modules
15
+ import {
16
+ extractFromWord,
17
+ extractWordComments,
18
+ extractCommentAnchors,
19
+ extractWordTables,
20
+ } from './word-extraction.js';
21
+ import type {
22
+ WordComment,
23
+ CommentAnchorData,
24
+ WordTable,
25
+ ExtractFromWordResult,
26
+ } from './word-extraction.js';
27
+ import {
28
+ generateSmartDiff,
29
+ generateAnnotatedDiff,
30
+ cleanupAnnotations,
31
+ fixCitationAnnotations,
32
+ } from './diff-engine.js';
33
+ import {
34
+ restoreCrossrefFromWord,
35
+ restoreImagesFromRegistry,
36
+ parseVisibleComments,
37
+ convertVisibleComments,
38
+ } from './restore-references.js';
39
+ import { findAnchorInText, findAllOccurrences } from './anchor-match.js';
40
+
41
/**
 * Pick the best position from candidate `occurrences` given the
 * surrounding `before` / `after` context from the docx, while
 * respecting `usedPositions` to avoid stacking distinct comments at
 * the same anchor instance.
 *
 * Scoring per unused candidate (case-insensitive):
 *  - +2 for each word longer than 3 characters of `before` found in the text
 *    window ending at the candidate (window = `before.length + 20` chars);
 *  - +5 if the last 30 characters of `before` appear in that window;
 *  - the same two rules for `after`, using the window that starts right
 *    after the anchor (`pos + anchorLen`) and the first 30 characters.
 * An empty `before` / `after` contributes nothing. The highest score wins;
 * ties go to the smallest position.
 *
 * @param occurrences - Candidate start offsets of the anchor in `result`.
 * @param result - The text the offsets refer to.
 * @param before - Context preceding the anchor in the docx (may be empty).
 * @param after - Context following the anchor in the docx (may be empty).
 * @param anchorLen - Length of the matched anchor text.
 * @param usedPositions - Offsets already claimed; never returned. Not mutated.
 * @returns The chosen position, or -1 if every candidate is already used.
 */
function pickBestOccurrence(
  occurrences: number[],
  result: string,
  before: string,
  after: string,
  anchorLen: number,
  usedPositions: Set<number>,
): number {
  const candidates = occurrences.filter(p => !usedPositions.has(p));
  if (candidates.length === 0) return -1;
  // A lone free candidate wins regardless of its score.
  if (candidates.length === 1) return candidates[0] ?? -1;

  // Context-derived values do not depend on the candidate: compute them once
  // instead of once per occurrence.
  const significantWords = (s: string): string[] => s.split(/\s+/).filter(w => w.length > 3);
  const beforeLower = before.toLowerCase();
  const beforeWords = significantWords(beforeLower);
  const beforeTail = beforeLower.slice(-30);
  const afterLower = after.toLowerCase();
  const afterWords = significantWords(afterLower);
  const afterHead = afterLower.slice(0, 30);

  const scoreWindow = (windowText: string, words: string[], phrase: string): number => {
    let points = 0;
    for (const word of words) {
      if (windowText.includes(word)) points += 2;
    }
    if (windowText.includes(phrase)) points += 5;
    return points;
  };

  let bestPos = -1;
  let bestScore = -1;

  for (const pos of candidates) {
    let score = 0;

    // Guard on the raw strings: an empty phrase is "included" in any window
    // and would otherwise earn a spurious +5.
    if (before) {
      const contextBefore = result.slice(Math.max(0, pos - before.length - 20), pos).toLowerCase();
      score += scoreWindow(contextBefore, beforeWords, beforeTail);
    }

    if (after) {
      const start = pos + anchorLen;
      const contextAfter = result.slice(start, start + after.length + 20).toLowerCase();
      score += scoreWindow(contextAfter, afterWords, afterHead);
    }

    if (score > bestScore || (score === bestScore && pos < bestPos)) {
      bestScore = score;
      bestPos = pos;
    }
  }

  return bestPos;
}
98
+
99
+ // Re-export everything so existing imports from './import.js' still work
100
+ export {
101
+ extractFromWord,
102
+ extractWordComments,
103
+ extractCommentAnchors,
104
+ extractHeadings,
105
+ extractWordTables,
106
+ } from './word-extraction.js';
107
+ export type {
108
+ WordComment,
109
+ TextNode,
110
+ CommentAnchorData,
111
+ CommentAnchorsResult,
112
+ DocxHeading,
113
+ WordTable,
114
+ ParsedRow,
115
+ ExtractFromWordOptions,
116
+ ExtractMessage,
117
+ ExtractFromWordResult,
118
+ } from './word-extraction.js';
119
+
120
+ export {
121
+ generateSmartDiff,
122
+ generateAnnotatedDiff,
123
+ cleanupAnnotations,
124
+ fixCitationAnnotations,
125
+ } from './diff-engine.js';
126
+ export type {
127
+ GenerateSmartDiffOptions,
128
+ } from './diff-engine.js';
129
+
130
+ export {
131
+ restoreCrossrefFromWord,
132
+ restoreImagesFromRegistry,
133
+ parseVisibleComments,
134
+ convertVisibleComments,
135
+ } from './restore-references.js';
136
+ export type {
137
+ RestoreCrossrefResult,
138
+ RestoreImagesResult,
139
+ } from './restore-references.js';
140
+
141
+ const execAsync = promisify(exec);
142
+
143
+ // ============================================
144
+ // Type Definitions (orchestration-specific)
145
+ // ============================================
146
+
147
/** Options accepted by `insertCommentsIntoMarkdown`. */
export interface InsertCommentsOptions {
  /** When true, the summary warnings printed after insertion are suppressed. */
  quiet?: boolean;
  /**
   * Start/end offsets of the section being processed, compared against each
   * anchor's `docPosition`. When set (and the anchor carries a `docPosition`),
   * placement starts from a proportional mapping of that offset into the
   * markdown text.
   */
  sectionBoundary?: { start: number; end: number } | null;
  /**
   * When true (default), comments wrap their anchor text in `[anchor]{.mark}`
   * so the rebuilt docx restores the original Word comment range. When false,
   * comments are inserted as standalone `{>>...<<}` blocks adjacent to the
   * anchor — the prose stays byte-identical except for the inserted blocks.
   *
   * Set to false from `sync --comments-only` so a draft revised after the
   * docx was sent for review keeps its prose intact, and so multiple
   * comments sharing one anchor don't produce nested broken markup.
   */
  wrapAnchor?: boolean;
  /**
   * Mutable output: when provided, the function fills in counters so callers
   * can distinguish placement outcomes in their summary. `placed` counts new
   * insertions, `deduped` counts comments that were already present at their
   * anchor (skipped to avoid duplication on re-sync), `unmatched` counts
   * comments whose anchor couldn't be located.
   */
  outStats?: { placed: number; deduped: number; unmatched: number };
}

/** A comment together with the insertion position resolved for it. */
export interface CommentWithPos {
  id: string;
  author: string;
  text: string;
  date: string;
  /** Offset in the markdown where the comment is inserted; -1 when no position was resolved. */
  pos: number;
  /** Matched anchor text, or null when the comment is placed without wrapping an anchor. */
  anchorText: string | null;
  /** Offset just past the matched anchor (`pos` + anchor length) when an anchor was matched. */
  anchorEnd?: number;
  /** Set when the comment's anchor was empty. */
  isEmpty?: boolean;
  /** Label of the placement strategy that produced `pos` (e.g. `'reply'`, `'position+text'`). */
  strategy?: string;
}

export type { AnchorSearchResult } from './anchor-match.js';

/**
 * A `prefix` / `content` pair.
 * NOTE(review): no producer or consumer is visible in this part of the file —
 * confirm the intended meaning against its users.
 */
export interface MarkdownPrefixResult {
  prefix: string;
  content: string;
}

/** Options for `importWordWithTrackChanges`. */
export interface ImportWordWithTrackChangesOptions {
  /** Directory for extracted media; defaults to `media` next to the docx. */
  mediaDir?: string;
  /** Project directory used for the image registry and crossref restore; defaults to the docx's directory. */
  projectDir?: string;
}

/** Result of `importWordWithTrackChanges`. */
export interface ImportWordWithTrackChangesResult {
  /** Markdown text with CriticMarkup annotations. */
  text: string;
  stats: {
    /** Number of `{++` markers in `text`. */
    insertions: number;
    /** Number of `{--` markers in `text`. */
    deletions: number;
    /** Number of `{~~` markers in `text`. */
    substitutions: number;
    /** Number of `{>>` markers in `text`. */
    comments: number;
    /** Sum of the four counts above. */
    total: number;
    /** Passed through from the Word extraction result. */
    hasTrackChanges: boolean;
    /** Passed through from the Word extraction result. */
    trackChangeStats: { insertions: number; deletions: number };
  };
  /** Media file paths reported by the extraction step. */
  extractedMedia: string[];
  /** Comments as returned by the extraction step. */
  comments: WordComment[];
}

/** Options for `importFromWord`. */
export interface ImportFromWordOptions {
  /** Author passed to the diff generator; defaults to `'Reviewer'`. */
  author?: string;
  /** When provided, the docx is not read: this content is cleaned up and returned with its marker counts. */
  sectionContent?: string;
  /** Directory for extracted media; defaults to the docx's directory. */
  figuresDir?: string;
  /**
   * Tables to hand to the diff generator.
   * NOTE(review): in `importFromWord` this value is replaced by the tables
   * returned from extraction before it is used — confirm whether callers rely on it.
   */
  wordTables?: WordTable[];
}

/** Result of `importFromWord`. */
export interface ImportFromWordResult {
  /** Markdown text with CriticMarkup annotations. */
  annotated: string;
  /** Counts of `{++`, `{--`, `{~~` and `{>>` markers in `annotated`, plus their sum. */
  stats: {
    insertions: number;
    deletions: number;
    substitutions: number;
    comments: number;
    total: number;
  };
  /** Media file paths reported by the extraction step (empty when `sectionContent` was used). */
  extractedMedia: string[];
}

/** One file copied by `moveExtractedMedia`. */
export interface MovedFile {
  /** Source path. */
  from: string;
  /** Destination path inside the figures directory. */
  to: string;
  /** File name given to the copy. */
  name: string;
}

/** Result of `moveExtractedMedia`. */
export interface MoveExtractedMediaResult {
  /** Files that were copied successfully. */
  moved: MovedFile[];
  /** One message per file that failed to copy. */
  errors: string[];
}
239
+
240
+ // ============================================
241
+ // Functions
242
+ // ============================================
243
+
244
/**
 * If `pos` lands at or before the end of the first `# Heading` line in `text`
 * (or on the blank line(s) right after it), advance to the start of the first
 * body paragraph so the comment stays inside the section. A comment authored
 * at the very start of a Word section maps to `pos === 0`, but inserting at
 * column 0 of a markdown file that begins with `# Heading` puts the
 * `{>>...<<}` before the heading marker — Pandoc then treats the line as
 * ordinary paragraph text and the comment renders in the previous section.
 * Likewise, inserting exactly at the end of the heading line would append the
 * comment to the heading text itself.
 *
 * The heading considered is the first line in `text` matching
 * `#{1,6}` + whitespace + text; any `pos` before the body that follows it is
 * advanced. NOTE(review): this presumes `text` is a section file that starts
 * with its heading — confirm against callers.
 *
 * @param text - Markdown text the position refers to.
 * @param pos - Candidate insertion offset.
 * @returns `pos` unchanged when there is no heading or `pos` is already in
 *   the body; otherwise the offset of the first character after the heading
 *   line and its trailing line breaks (which is `text.length` when nothing
 *   follows the heading).
 */
function pushPastSectionHeading(text: string, pos: number): number {
  const heading = /^#{1,6}\s.+$/m.exec(text);
  if (!heading) return pos;

  // Skip the heading's own line terminator and any blank lines so we land at
  // the start of the first body paragraph.
  let bodyStart = heading.index + heading[0].length;
  while (bodyStart < text.length && (text[bodyStart] === '\n' || text[bodyStart] === '\r')) {
    bodyStart++;
  }

  return pos >= bodyStart ? pos : bodyStart;
}
271
+
272
/**
 * Move `pos` onto the closest whitespace character, looking at most 50
 * characters in either direction, so that a proportional fallback insertion
 * does not split a word. At equal distance the position to the right is
 * preferred. Positions at or beyond the ends of `text` are clamped to `0` /
 * `text.length`; when no whitespace lies within range, `pos` is returned
 * unchanged.
 */
function snapToWordBoundary(text: string, pos: number): number {
  const isWhitespaceAt = (index: number): boolean => /\s/.test(text[index] ?? '');

  if (pos <= 0) return 0;
  if (pos >= text.length) return text.length;
  if (isWhitespaceAt(pos)) return pos;

  let distance = 1;
  while (distance <= 50) {
    const right = pos + distance;
    if (right < text.length && isWhitespaceAt(right)) return right;

    const left = pos - distance;
    if (left >= 0 && isWhitespaceAt(left)) return left;

    distance += 1;
  }

  return pos;
}
286
+
287
/**
 * Final-resort placement when every text-matching strategy failed. The docx
 * carries a real `<w:commentRangeStart w:id="N">` marker at a known offset
 * inside its body text — that's a structural anchor, even if the anchored
 * span itself is empty and the surrounding context drifted in the target.
 *
 * Map docPosition into the target markdown proportionally and snap to a word
 * boundary. This is approximate when the document was heavily restructured,
 * but it's strictly better than silently dropping a reviewer's comment: the
 * comment lands in roughly the right neighborhood and the reviewer can
 * relocate it during their next pass.
 *
 * @param anchorData - Anchor record; only `docPosition` and `docLength` are read.
 * @param target - Markdown text to place the comment in.
 * @returns An insertion offset in `target`, or `null` when the anchor has no
 *   usable position (`docLength` not positive, or either value not a finite
 *   number). Returning `null` lets the caller count the comment as unmatched;
 *   a `NaN` offset would be neither `< 0` nor `>= 0` and the comment would
 *   vanish without being reported.
 */
function proportionalFallback(
  anchorData: CommentAnchorData,
  target: string,
): number | null {
  const { docPosition, docLength } = anchorData;
  if (!Number.isFinite(docPosition) || !Number.isFinite(docLength) || docLength <= 0) {
    return null;
  }
  // Clamp to [0, 1] so the mapped offset always lies inside `target`.
  const proportion = Math.min(Math.max(docPosition / docLength, 0), 1.0);
  const rawPos = Math.floor(proportion * target.length);
  return pushPastSectionHeading(target, snapToWordBoundary(target, rawPos));
}
308
+
309
+ /**
310
+ * Insert comments into markdown text based on anchor texts with context
311
+ */
312
+ export function insertCommentsIntoMarkdown(
313
+ markdown: string,
314
+ comments: WordComment[],
315
+ anchors: Map<string, CommentAnchorData | string>,
316
+ options: InsertCommentsOptions = {}
317
+ ): string {
318
+ const { quiet = false, sectionBoundary = null, wrapAnchor = true, outStats } = options;
319
+ let result = markdown;
320
+ let unmatchedCount = 0;
321
+ let placedCount = 0;
322
+ const duplicateWarnings: string[] = [];
323
+ const usedPositions = new Set<number>(); // For tie-breaking: track used positions
324
+
325
+ // Resolve threading: replies share their parent's anchor in Word, so they
326
+ // must inherit the parent's position and ride alongside it as one cluster.
327
+ // Letting each reply run through anchor scoring scatters the cluster (the
328
+ // same docPosition forces `usedPositions` to push later replies onto a
329
+ // different occurrence), which on re-build looks like independent comments
330
+ // and loses the paraIdParent threading. See gcol33/docrev issue #2.
331
+ const inputById = new Map<string, WordComment>();
332
+ for (const c of comments) inputById.set(c.id, c);
333
+ function rootIdOf(c: WordComment): string {
334
+ let cur: WordComment = c;
335
+ const seen = new Set<string>();
336
+ while (cur.parentId && !seen.has(cur.id)) {
337
+ seen.add(cur.id);
338
+ const parent = inputById.get(cur.parentId);
339
+ if (!parent || parent === cur) break;
340
+ cur = parent;
341
+ }
342
+ return cur.id;
343
+ }
344
+ const replyRootId = new Map<string, string>();
345
+ for (const c of comments) {
346
+ const root = rootIdOf(c);
347
+ if (root !== c.id) replyRootId.set(c.id, root);
348
+ }
349
+
350
+ // Anchor matching primitives live in lib/anchor-match.ts so that
351
+ // `rev verify-anchors` can use the same strategies for drift reporting.
352
+
353
+ // Get all positions in order (for sequential tie-breaking).
354
+ // Replies skip scoring entirely — they piggyback on their root's position
355
+ // in the emit pass below.
356
+ const commentsWithPositions = comments.map((c): CommentWithPos => {
357
+ if (replyRootId.has(c.id)) {
358
+ return { ...c, pos: -1, anchorText: null, strategy: 'reply' };
359
+ }
360
+ const anchorData = anchors.get(c.id);
361
+ if (!anchorData) {
362
+ unmatchedCount++;
363
+ return { ...c, pos: -1, anchorText: null };
364
+ }
365
+
366
+ // Support both old format (string) and new format ({anchor, before, after})
367
+ const anchor = typeof anchorData === 'string' ? anchorData : anchorData.anchor;
368
+ const before = typeof anchorData === 'object' ? anchorData.before : '';
369
+ const after = typeof anchorData === 'object' ? anchorData.after : '';
370
+ const isEmpty = typeof anchorData === 'object' && anchorData.isEmpty;
371
+ const docPosition = typeof anchorData === 'object' ? anchorData.docPosition : undefined;
372
+
373
+ // Position-based insertion (most reliable)
374
+ if (sectionBoundary && docPosition !== undefined) {
375
+ const sectionLength = sectionBoundary.end - sectionBoundary.start;
376
+ if (sectionLength > 0) {
377
+ let relativePos;
378
+ if (docPosition < sectionBoundary.start) {
379
+ relativePos = 0;
380
+ } else {
381
+ relativePos = docPosition - sectionBoundary.start;
382
+ }
383
+
384
+ const proportion = Math.min(relativePos / sectionLength, 1.0);
385
+ const markdownPos = Math.floor(proportion * result.length);
386
+
387
+ // For empty anchors, before/after context is the only signal that
388
+ // pinpoints the original split — without it, proportional placement
389
+ // can land mid-word or split unrelated phrases. Try context match
390
+ // first; only fall through to proportional when context is gone.
391
+ if ((!anchor || isEmpty) && (before || after)) {
392
+ const ctx = findAnchorInText('', result, before, after);
393
+ if (ctx.occurrences.length > 0) {
394
+ const pos = pushPastSectionHeading(result, ctx.occurrences[0]);
395
+ return { ...c, pos, anchorText: null, isEmpty: true, strategy: `ctx:${ctx.strategy}` };
396
+ }
397
+ }
398
+
399
+ let insertPos = markdownPos;
400
+
401
+ // Look for nearby word boundary
402
+ const searchWindow = result.slice(Math.max(0, markdownPos - 25), Math.min(result.length, markdownPos + 25));
403
+ const spaceIdx = searchWindow.indexOf(' ', 25);
404
+ if (spaceIdx !== -1 && spaceIdx < 50) {
405
+ insertPos = Math.max(0, markdownPos - 25) + spaceIdx;
406
+ }
407
+
408
+ // If we have anchor text, try to find it near this position.
409
+ // Collect ALL occurrences in the local window, then disambiguate
410
+ // via before/after context + usedPositions — otherwise two
411
+ // comments sharing the same anchor word would both collide at
412
+ // the leftmost match. The context-scoring helper handles the
413
+ // "repeated formulaic prose" case using docx-side context, which
414
+ // is a stronger signal than raw distance to the proportional
415
+ // insertPos (insertPos is itself an approximation).
416
+ if (anchor && !isEmpty) {
417
+ const searchStart = Math.max(0, insertPos - 200);
418
+ const searchEnd = Math.min(result.length, insertPos + 200);
419
+ const localSearch = result.slice(searchStart, searchEnd).toLowerCase();
420
+ const anchorLower = anchor.toLowerCase();
421
+
422
+ const localHits = findAllOccurrences(localSearch, anchorLower).map(i => searchStart + i);
423
+ if (localHits.length > 0) {
424
+ const chosen = pickBestOccurrence(localHits, result, before, after, anchor.length, usedPositions);
425
+ if (chosen >= 0) {
426
+ if (localHits.length > 1) {
427
+ duplicateWarnings.push(`"${anchor.slice(0, 40)}${anchor.length > 40 ? '...' : ''}" appears ${localHits.length} times in section window`);
428
+ }
429
+ usedPositions.add(chosen);
430
+ return { ...c, pos: chosen, anchorText: anchor, anchorEnd: chosen + anchor.length, strategy: 'position+text' };
431
+ }
432
+ }
433
+
434
+ // Try first few words
435
+ const words = anchor.split(/\s+/).slice(0, 4).join(' ').toLowerCase();
436
+ if (words.length >= 10) {
437
+ const partialHits = findAllOccurrences(localSearch, words).map(i => searchStart + i);
438
+ if (partialHits.length > 0) {
439
+ const chosen = pickBestOccurrence(partialHits, result, before, after, words.length, usedPositions);
440
+ if (chosen >= 0) {
441
+ usedPositions.add(chosen);
442
+ return { ...c, pos: chosen, anchorText: words, anchorEnd: chosen + words.length, strategy: 'position+partial' };
443
+ }
444
+ }
445
+ }
446
+ }
447
+
448
+ // A docPosition at the very start of a section maps to markdownPos=0,
449
+ // which sits before the file's `# Heading` line and gets rendered in
450
+ // the previous section. Push past the heading line so the comment
451
+ // stays inside the section it was authored in.
452
+ insertPos = pushPastSectionHeading(result, insertPos);
453
+
454
+ return { ...c, pos: insertPos, anchorText: null, strategy: 'position-only' };
455
+ }
456
+ }
457
+
458
+ // Handle empty anchors
459
+ if (!anchor || isEmpty) {
460
+ if (before || after) {
461
+ const { occurrences } = findAnchorInText('', result, before, after);
462
+ if (occurrences.length > 0) {
463
+ return { ...c, pos: occurrences[0], anchorText: null, isEmpty: true };
464
+ }
465
+ }
466
+ // Last resort: docx carried a structural marker at docPosition; map
467
+ // it proportionally into the target so the comment isn't dropped.
468
+ if (typeof anchorData === 'object') {
469
+ const fallback = proportionalFallback(anchorData, result);
470
+ if (fallback !== null) {
471
+ return { ...c, pos: fallback, anchorText: null, isEmpty: true, strategy: 'proportional-fallback' };
472
+ }
473
+ }
474
+ unmatchedCount++;
475
+ return { ...c, pos: -1, anchorText: null, isEmpty: true };
476
+ }
477
+
478
+ // Text-based matching strategies
479
+ const { occurrences, matchedAnchor, strategy, stripped } = findAnchorInText(anchor, result, before, after);
480
+
481
+ if (occurrences.length === 0) {
482
+ // Same last-resort as the empty-anchor path: anchor text is gone from
483
+ // the target, but the marker's text-offset survived extraction.
484
+ if (typeof anchorData === 'object') {
485
+ const fallback = proportionalFallback(anchorData, result);
486
+ if (fallback !== null) {
487
+ return { ...c, pos: fallback, anchorText: null, strategy: 'proportional-fallback' };
488
+ }
489
+ }
490
+ unmatchedCount++;
491
+ return { ...c, pos: -1, anchorText: null };
492
+ }
493
+
494
+ const anchorLen = matchedAnchor ? matchedAnchor.length : 0;
495
+
496
+ if (occurrences.length === 1) {
497
+ if (matchedAnchor) {
498
+ return { ...c, pos: occurrences[0], anchorText: matchedAnchor, anchorEnd: occurrences[0] + anchorLen };
499
+ } else {
500
+ return { ...c, pos: occurrences[0], anchorText: null };
501
+ }
502
+ }
503
+
504
+ // Multiple occurrences - use context for disambiguation
505
+ if (matchedAnchor) {
506
+ duplicateWarnings.push(`"${matchedAnchor.slice(0, 40)}${matchedAnchor.length > 40 ? '...' : ''}" appears ${occurrences.length} times`);
507
+ }
508
+
509
+ const bestIdx = pickBestOccurrence(occurrences, result, before, after, anchorLen, usedPositions);
510
+ const finalIdx = bestIdx >= 0 ? bestIdx : occurrences[0];
511
+ usedPositions.add(finalIdx);
512
+
513
+ if (matchedAnchor) {
514
+ return { ...c, pos: finalIdx, anchorText: matchedAnchor, anchorEnd: finalIdx + anchorLen };
515
+ } else {
516
+ return { ...c, pos: finalIdx, anchorText: null };
517
+ }
518
+ });
519
+
520
+ // Group comments into clusters (root + ordered replies). The root carries
521
+ // the resolved position; replies inherit it and ride along in input order
522
+ // so the rebuilt CriticMarkup looks like `{>>p<<}{>>r1<<}{>>r2<<}[anchor]`
523
+ // and adjacency-based reply detection picks the cluster up again.
524
+ const byId = new Map<string, CommentWithPos>();
525
+ for (const cwp of commentsWithPositions) byId.set(cwp.id, cwp);
526
+ const repliesByRoot = new Map<string, CommentWithPos[]>();
527
+ for (const c of comments) {
528
+ const rootId = replyRootId.get(c.id);
529
+ if (!rootId) continue;
530
+ const cwp = byId.get(c.id);
531
+ if (!cwp) continue;
532
+ const list = repliesByRoot.get(rootId);
533
+ if (list) list.push(cwp);
534
+ else repliesByRoot.set(rootId, [cwp]);
535
+ }
536
+
537
+ // Replies whose root never resolved (parent missing from the input slice or
538
+ // parent unmatched) count as unmatched too — there's no position to attach
539
+ // them to.
540
+ for (const [rootId, replies] of repliesByRoot) {
541
+ const root = byId.get(rootId);
542
+ if (!root || root.pos < 0) {
543
+ unmatchedCount += replies.length;
544
+ }
545
+ }
546
+
547
+ // Roots only — replies attach during emission.
548
+ const rootsWithPos = commentsWithPositions.filter(
549
+ c => !replyRootId.has(c.id)
550
+ );
551
+
552
+ // Log any unmatched roots for debugging
553
+ const unmatched = rootsWithPos.filter((c) => c.pos < 0);
554
+ if (process.env.DEBUG) {
555
+ console.log(`[DEBUG] insertComments: ${comments.length} input, ${rootsWithPos.length} roots, ${unmatched.length} unmatched roots, ${replyRootId.size} replies`);
556
+ if (unmatched.length > 0) {
557
+ unmatched.forEach(c => console.log(`[DEBUG] Unmatched ID=${c.id}: anchor="${(c.anchorText || 'none').slice(0,30)}"`));
558
+ }
559
+ }
560
+
561
+ const matchedRoots = rootsWithPos.filter((c) => c.pos >= 0);
562
+
563
+ // Sort by position descending (insert from end to avoid offset issues)
564
+ matchedRoots.sort((a, b) => b.pos - a.pos);
565
+
566
+ // Insert each cluster. With `wrapAnchor` (the default), the anchor text
567
+ // gets wrapped in `[anchor]{.mark}` so the rebuilt docx restores the
568
+ // original Word comment range. Without it, the comment block is inserted
569
+ // adjacent to the anchor and prose stays untouched — required for
570
+ // comments-only sync where multiple comments may share one anchor.
571
+ // Skip insertion when the parent's CriticMarkup already lives near the
572
+ // target — re-running sync against the same docx would otherwise stack
573
+ // duplicates. A 200-char window catches both wrapped
574
+ // (`{>>...<<}[anchor]{.mark}`) and bare (`{>>...<<}anchor`) forms while
575
+ // ignoring incidental matches farther away.
576
+ let dedupedCount = 0;
577
+ for (const c of matchedRoots) {
578
+ const parentBlock = `{>>${c.author}: ${c.text}<<}`;
579
+ const replies = repliesByRoot.get(c.id) ?? [];
580
+ const windowStart = Math.max(0, c.pos - 200);
581
+ const windowEnd = Math.min(result.length, c.pos + 200);
582
+ if (result.slice(windowStart, windowEnd).includes(parentBlock)) {
583
+ // Cluster already synced; treat all members as deduped.
584
+ dedupedCount += 1 + replies.length;
585
+ continue;
586
+ }
587
+ // Replies carry an explicit `↪ ` author prefix so the round-trip does not
588
+ // depend on positional adjacency in the markdown. On dense reviewer docs
589
+ // distinct clusters frequently land at the same anchor position; without
590
+ // the prefix the re-parse would misthread them. The injection side strips
591
+ // `↪ ` back off the author so Word renders the original name.
592
+ const replyBlocks = replies.map(r => `{>>↪ ${r.author}: ${r.text}<<}`);
593
+ const combined = parentBlock + replyBlocks.join('');
594
+ if (wrapAnchor && c.anchorText && c.anchorEnd) {
595
+ const before = result.slice(0, c.pos);
596
+ const anchor = result.slice(c.pos, c.anchorEnd);
597
+ const after = result.slice(c.anchorEnd);
598
+ result = before + combined + `[${anchor}]{.mark}` + after;
599
+ } else {
600
+ result = result.slice(0, c.pos) + combined + result.slice(c.pos);
601
+ }
602
+ placedCount += 1 + replies.length;
603
+ }
604
+
605
+ if (outStats) {
606
+ outStats.placed = placedCount;
607
+ outStats.deduped = dedupedCount;
608
+ outStats.unmatched = unmatchedCount;
609
+ }
610
+
611
+ // Log warnings unless quiet mode
612
+ if (!quiet) {
613
+ if (unmatchedCount > 0) {
614
+ console.warn(`Warning: ${unmatchedCount} comment(s) could not be matched to anchor text`);
615
+ }
616
+ if (dedupedCount > 0) {
617
+ console.warn(`Note: ${dedupedCount} comment(s) already present at anchor — skipped to avoid duplication`);
618
+ }
619
+ if (duplicateWarnings.length > 0) {
620
+ console.warn(`Warning: Duplicate anchor text found (using context & tie-breaks for placement):`);
621
+ for (const w of duplicateWarnings) {
622
+ console.warn(` - ${w}`);
623
+ }
624
+ }
625
+ }
626
+
627
+ return result;
628
+ }
629
+
630
/**
 * Import a Word document and express its tracked changes and comments as
 * CriticMarkup.
 *
 * Processing order:
 *  1. A preliminary pandoc run counts the `media/` image references in the
 *     document.
 *  2. That count is compared against the project's image registry to decide
 *     whether media has to be extracted again.
 *  3. The document is extracted, crossrefs and registry images are restored,
 *     comments are inserted, and the annotations are cleaned up.
 *
 * @param docxPath - Path of the `.docx` file to import.
 * @param options - `mediaDir` overrides the media directory (default: a
 *   `media` folder next to the docx); `projectDir` overrides the directory
 *   used for the image registry and crossref restoration (default: the
 *   directory containing the docx).
 * @returns The annotated text, per-marker counts taken from that text, the
 *   extracted media list, and the comments reported by the extraction.
 */
export async function importWordWithTrackChanges(
  docxPath: string,
  options: ImportWordWithTrackChangesOptions = {}
): Promise<ImportWordWithTrackChangesResult> {
  const docxDir = path.dirname(docxPath);
  const targetMediaDir = options.mediaDir || path.join(docxDir, 'media');
  const targetProjectDir = options.projectDir || docxDir;

  // A registry only counts as present when it lists at least one figure.
  const registry = readImageRegistry(targetProjectDir);
  const registryCount = registry?.figures?.length ?? 0;
  const hasRegistry = registryCount > 0;

  // Preliminary pass: render to markdown just to count image references.
  // NOTE(review): docxPath is interpolated into a shell command string; a path
  // containing `"`, `$` or backticks would break (or alter) the command.
  // Consider an argument-array API such as execFile.
  const pandocRun = await execAsync(
    `pandoc "${docxPath}" -t markdown --wrap=none --track-changes=all`,
    { maxBuffer: 50 * 1024 * 1024 }
  );
  const imageRefs = pandocRun.stdout.match(/!\[[^\]]*\]\(media\/[^)]+\)/g);
  const wordImageCount = imageRefs ? imageRefs.length : 0;

  // Media is re-extracted only when the document references more images than
  // the registry already knows about.
  const needsMediaExtraction = wordImageCount > registryCount;

  if (hasRegistry) {
    console.log(`Registry has ${registryCount} figures, Word doc has ${wordImageCount} images`);
    console.log(
      needsMediaExtraction
        ? `Extracting media (${wordImageCount - registryCount} new image(s) detected)`
        : `Using existing figures from registry`
    );
  }

  const extracted = await extractFromWord(docxPath, {
    mediaDir: targetMediaDir,
    skipMediaExtraction: !needsMediaExtraction,
  });

  const extractedMedia = extracted.extractedMedia || [];
  const comments = extracted.comments || [];
  const anchors = extracted.anchors || new Map();

  // Relay extraction messages; types other than info/warning are ignored.
  for (const msg of extracted.messages || []) {
    switch (msg.type) {
      case 'info':
        console.log(msg.message);
        break;
      case 'warning':
        console.warn(`Warning: ${msg.message}`);
        break;
    }
  }

  let text = extracted.text;

  // Crossref restoration runs first because image restoration consumes the
  // labels it restored.
  const crossrefResult = restoreCrossrefFromWord(text, targetProjectDir);
  text = crossrefResult.text;
  if (crossrefResult.restored > 0) {
    console.log(`Restored ${crossrefResult.restored} crossref reference(s)`);
  }

  const imageRestoreResult = restoreImagesFromRegistry(
    text,
    targetProjectDir,
    crossrefResult.restoredLabels
  );
  text = imageRestoreResult.text;
  if (imageRestoreResult.restored > 0) {
    console.log(`Restored ${imageRestoreResult.restored} image(s) from registry`);
  }

  if (comments.length > 0) {
    text = insertCommentsIntoMarkdown(text, comments, anchors);
    // NOTE(review): this reports the number of extracted comments, which can
    // exceed the number actually placed when some are unmatched or deduped.
    console.log(`Inserted ${comments.length} comment(s)`);
  }

  text = cleanupAnnotations(text);

  // Stats are counted from the opening markers present in the final text.
  const countOf = (marker: RegExp): number => (text.match(marker) || []).length;
  const insertions = countOf(/\{\+\+/g);
  const deletions = countOf(/\{--/g);
  const substitutions = countOf(/\{~~/g);
  const commentCount = countOf(/\{>>/g);

  return {
    text,
    stats: {
      insertions,
      deletions,
      substitutions,
      comments: commentCount,
      total: insertions + deletions + substitutions + commentCount,
      hasTrackChanges: extracted.hasTrackChanges,
      trackChangeStats: extracted.trackChangeStats,
    },
    extractedMedia,
    comments,
  };
}
729
+
730
/**
 * Legacy import: turn a Word document into annotated Markdown.
 *
 * Three paths, checked in this order:
 *  - `options.sectionContent` is defined: the docx is not read; the supplied
 *    content is cleaned up and returned with its marker counts.
 *  - The extraction reports track changes: the extracted text (crossrefs and
 *    registry images restored, comments inserted, annotations cleaned) is
 *    returned directly.
 *  - Otherwise: a warning is printed and the result is produced by diffing the
 *    extracted text against the original Markdown file.
 *
 * @param docxPath - Path of the `.docx` file to import.
 * @param originalMdPath - Path of the original Markdown; its directory is used
 *   as the project directory and its content as the diff baseline.
 * @param options - `author` (default `'Reviewer'`) is passed to the diff;
 *   `sectionContent` short-circuits extraction; `figuresDir` overrides the
 *   media directory (default: the directory containing the docx).
 * @returns The annotated Markdown, counts of CriticMarkup opening markers, and
 *   the list of extracted media.
 */
export async function importFromWord(
  docxPath: string,
  originalMdPath: string,
  options: ImportFromWordOptions = {}
): Promise<ImportFromWordResult> {
  const { author = 'Reviewer', sectionContent, figuresDir } = options;
  const projectDir = path.dirname(originalMdPath);

  // Count CriticMarkup opening markers in a piece of Markdown.
  const tallyMarkup = (markdown: string) => {
    const insertions = (markdown.match(/\{\+\+/g) || []).length;
    const deletions = (markdown.match(/\{--/g) || []).length;
    const substitutions = (markdown.match(/\{~~/g) || []).length;
    const comments = (markdown.match(/\{>>/g) || []).length;
    return {
      insertions,
      deletions,
      substitutions,
      comments,
      total: insertions + deletions + substitutions + comments,
    };
  };

  // Pre-annotated section content: clean it up and return without touching
  // the docx at all.
  if (sectionContent !== undefined) {
    const cleaned = cleanupAnnotations(sectionContent);
    return {
      annotated: cleaned,
      stats: tallyMarkup(cleaned),
      extractedMedia: [],
    };
  }

  const docxDir = path.dirname(docxPath);
  const extracted = await extractFromWord(docxPath, { mediaDir: figuresDir || docxDir });

  let wordText: string = extracted.text;
  const extractedMedia: string[] = extracted.extractedMedia || [];
  // NOTE(review): `options.wordTables` is not consulted; tables always come
  // from the extraction result. Confirm that is intended.
  const wordTables: WordTable[] = extracted.tables || [];
  const hasTrackChanges = extracted.hasTrackChanges || false;

  // Relay extraction messages; types other than info/warning are ignored.
  for (const msg of extracted.messages || []) {
    switch (msg.type) {
      case 'info':
        console.log(msg.message);
        break;
      case 'warning':
        console.warn(`Warning: ${msg.message}`);
        break;
    }
  }

  // The fallback notice is printed before the restoration logs.
  if (!hasTrackChanges) {
    console.warn('Warning: No track changes detected in Word document.');
    console.warn(' For best results, reviewers should use Track Changes in Word.');
    console.warn(' Falling back to diff-based import (comparing against original MD).');
    console.warn(' This approach may produce less accurate change annotations.');
  }

  // Both remaining paths restore crossrefs first, then images (which consume
  // the labels the crossref step restored).
  const crossref = restoreCrossrefFromWord(wordText, projectDir);
  wordText = crossref.text;
  if (crossref.restored > 0) {
    console.log(`Restored ${crossref.restored} crossref reference(s)`);
  }

  const imageRestore = restoreImagesFromRegistry(wordText, projectDir, crossref.restoredLabels);
  wordText = imageRestore.text;
  if (imageRestore.restored > 0) {
    console.log(`Restored ${imageRestore.restored} image(s) from registry`);
  }

  if (hasTrackChanges) {
    // Track-changes path: the extracted text is the result; no diff needed.
    const wordComments = extracted.comments || [];
    const anchors = extracted.anchors || new Map();
    if (wordComments.length > 0) {
      wordText = insertCommentsIntoMarkdown(wordText, wordComments, anchors);
      console.log(`Inserted ${wordComments.length} comment(s)`);
    }

    wordText = cleanupAnnotations(wordText);

    return {
      annotated: wordText,
      stats: tallyMarkup(wordText),
      extractedMedia,
    };
  }

  // Diff path: compare against the original Markdown with any existing
  // annotations (comments included) stripped out.
  const originalMd = stripAnnotations(fs.readFileSync(originalMdPath, 'utf-8'), {
    keepComments: false,
  });
  const imageRegistry = readImageRegistry(projectDir);

  const diffed = generateSmartDiff(originalMd, wordText, author, { wordTables, imageRegistry });
  const cleanedDiff = cleanupAnnotations(diffed);
  const citationFixed = fixCitationAnnotations(cleanedDiff, originalMd);
  const annotated = convertVisibleComments(citationFixed);

  return {
    annotated,
    stats: tallyMarkup(annotated),
    extractedMedia,
  };
}
880
+
881
/**
 * Copy extracted media files into a figures directory under sequential names.
 *
 * Despite the function name, files are copied with `fs.copyFileSync`; the
 * source files are left in place. Each file is named
 * `<prefix><1-based index><lower-cased extension>`, where the index is the
 * file's position in `mediaFiles`, so a failed copy leaves a gap in the
 * numbering rather than shifting later names.
 *
 * @param mediaFiles - Paths of the files to copy; order determines numbering.
 * @param figuresDir - Destination directory; created recursively when it does
 *   not exist yet.
 * @param prefix - Text placed before the index in each new file name.
 * @returns `moved` lists every successful copy (source, destination, new
 *   name); `errors` holds one message per file that could not be copied.
 */
export function moveExtractedMedia(
  mediaFiles: string[],
  figuresDir: string,
  prefix: string = 'figure'
): MoveExtractedMediaResult {
  const moved: MovedFile[] = [];
  const errors: string[] = [];

  if (!fs.existsSync(figuresDir)) {
    fs.mkdirSync(figuresDir, { recursive: true });
  }

  for (const [index, src] of mediaFiles.entries()) {
    const ext = path.extname(src).toLowerCase();
    const newName = `${prefix}${index + 1}${ext}`;
    const dest = path.join(figuresDir, newName);

    try {
      fs.copyFileSync(src, dest);
      moved.push({ from: src, to: dest, name: newName });
    } catch (err: unknown) {
      // A failed copy is recorded and the loop continues with the next file.
      // Narrow before reading `.message` so a non-Error throw still yields a
      // meaningful description instead of "undefined".
      const reason = err instanceof Error ? err.message : String(err);
      errors.push(`Failed to copy ${src}: ${reason}`);
    }
  }

  return { moved, errors };
}