docrev 0.9.5 → 0.9.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99) hide show
  1. package/CHANGELOG.md +20 -0
  2. package/dev_notes/bug_repro_comment_parser.md +71 -0
  3. package/dist/lib/anchor-match.d.ts +41 -0
  4. package/dist/lib/anchor-match.d.ts.map +1 -0
  5. package/dist/lib/anchor-match.js +192 -0
  6. package/dist/lib/anchor-match.js.map +1 -0
  7. package/dist/lib/annotations.d.ts.map +1 -1
  8. package/dist/lib/annotations.js +8 -5
  9. package/dist/lib/annotations.js.map +1 -1
  10. package/dist/lib/commands/file-ops.d.ts +11 -0
  11. package/dist/lib/commands/file-ops.d.ts.map +1 -0
  12. package/dist/lib/commands/file-ops.js +301 -0
  13. package/dist/lib/commands/file-ops.js.map +1 -0
  14. package/dist/lib/commands/index.d.ts +10 -1
  15. package/dist/lib/commands/index.d.ts.map +1 -1
  16. package/dist/lib/commands/index.js +19 -1
  17. package/dist/lib/commands/index.js.map +1 -1
  18. package/dist/lib/commands/merge-resolve.d.ts +12 -0
  19. package/dist/lib/commands/merge-resolve.d.ts.map +1 -0
  20. package/dist/lib/commands/merge-resolve.js +318 -0
  21. package/dist/lib/commands/merge-resolve.js.map +1 -0
  22. package/dist/lib/commands/preview.d.ts +11 -0
  23. package/dist/lib/commands/preview.d.ts.map +1 -0
  24. package/dist/lib/commands/preview.js +138 -0
  25. package/dist/lib/commands/preview.js.map +1 -0
  26. package/dist/lib/commands/project-info.d.ts +11 -0
  27. package/dist/lib/commands/project-info.d.ts.map +1 -0
  28. package/dist/lib/commands/project-info.js +187 -0
  29. package/dist/lib/commands/project-info.js.map +1 -0
  30. package/dist/lib/commands/quality.d.ts +11 -0
  31. package/dist/lib/commands/quality.d.ts.map +1 -0
  32. package/dist/lib/commands/quality.js +384 -0
  33. package/dist/lib/commands/quality.js.map +1 -0
  34. package/dist/lib/commands/section-boundaries.d.ts +22 -0
  35. package/dist/lib/commands/section-boundaries.d.ts.map +1 -0
  36. package/dist/lib/commands/section-boundaries.js +53 -0
  37. package/dist/lib/commands/section-boundaries.js.map +1 -0
  38. package/dist/lib/commands/sections.d.ts +3 -2
  39. package/dist/lib/commands/sections.d.ts.map +1 -1
  40. package/dist/lib/commands/sections.js +4 -736
  41. package/dist/lib/commands/sections.js.map +1 -1
  42. package/dist/lib/commands/sync.d.ts +11 -0
  43. package/dist/lib/commands/sync.d.ts.map +1 -0
  44. package/dist/lib/commands/sync.js +576 -0
  45. package/dist/lib/commands/sync.js.map +1 -0
  46. package/dist/lib/commands/text-ops.d.ts +11 -0
  47. package/dist/lib/commands/text-ops.d.ts.map +1 -0
  48. package/dist/lib/commands/text-ops.js +357 -0
  49. package/dist/lib/commands/text-ops.js.map +1 -0
  50. package/dist/lib/commands/utilities.d.ts +2 -4
  51. package/dist/lib/commands/utilities.d.ts.map +1 -1
  52. package/dist/lib/commands/utilities.js +3 -1572
  53. package/dist/lib/commands/utilities.js.map +1 -1
  54. package/dist/lib/commands/verify-anchors.d.ts +17 -0
  55. package/dist/lib/commands/verify-anchors.d.ts.map +1 -0
  56. package/dist/lib/commands/verify-anchors.js +215 -0
  57. package/dist/lib/commands/verify-anchors.js.map +1 -0
  58. package/dist/lib/commands/word-tools.d.ts +11 -0
  59. package/dist/lib/commands/word-tools.d.ts.map +1 -0
  60. package/dist/lib/commands/word-tools.js +272 -0
  61. package/dist/lib/commands/word-tools.js.map +1 -0
  62. package/dist/lib/diff-engine.d.ts +25 -0
  63. package/dist/lib/diff-engine.d.ts.map +1 -0
  64. package/dist/lib/diff-engine.js +354 -0
  65. package/dist/lib/diff-engine.js.map +1 -0
  66. package/dist/lib/import.d.ts +44 -118
  67. package/dist/lib/import.d.ts.map +1 -1
  68. package/dist/lib/import.js +25 -1173
  69. package/dist/lib/import.js.map +1 -1
  70. package/dist/lib/restore-references.d.ts +35 -0
  71. package/dist/lib/restore-references.d.ts.map +1 -0
  72. package/dist/lib/restore-references.js +188 -0
  73. package/dist/lib/restore-references.js.map +1 -0
  74. package/dist/lib/word-extraction.d.ts +100 -0
  75. package/dist/lib/word-extraction.d.ts.map +1 -0
  76. package/dist/lib/word-extraction.js +594 -0
  77. package/dist/lib/word-extraction.js.map +1 -0
  78. package/lib/anchor-match.ts +238 -0
  79. package/lib/annotations.ts +9 -5
  80. package/lib/commands/file-ops.ts +372 -0
  81. package/lib/commands/index.ts +27 -0
  82. package/lib/commands/merge-resolve.ts +378 -0
  83. package/lib/commands/preview.ts +178 -0
  84. package/lib/commands/project-info.ts +244 -0
  85. package/lib/commands/quality.ts +517 -0
  86. package/lib/commands/section-boundaries.ts +72 -0
  87. package/lib/commands/sections.ts +3 -870
  88. package/lib/commands/sync.ts +701 -0
  89. package/lib/commands/text-ops.ts +449 -0
  90. package/lib/commands/utilities.ts +62 -2043
  91. package/lib/commands/verify-anchors.ts +261 -0
  92. package/lib/commands/word-tools.ts +340 -0
  93. package/lib/diff-engine.ts +465 -0
  94. package/lib/import.ts +108 -1504
  95. package/lib/restore-references.ts +240 -0
  96. package/lib/word-extraction.ts +759 -0
  97. package/package.json +1 -1
  98. package/skill/REFERENCE.md +29 -2
  99. package/skill/SKILL.md +12 -2
@@ -1,668 +1,38 @@
1
1
  /**
2
2
  * Import functionality - convert Word docs to annotated Markdown
3
+ *
4
+ * Orchestration workflows + re-exports from extraction/diff/restore modules
3
5
  */
4
6
  import * as fs from 'fs';
5
7
  import * as path from 'path';
6
- import { diffWords } from 'diff';
7
8
  import { stripAnnotations } from './annotations.js';
8
9
  import { readImageRegistry } from './image-registry.js';
9
10
  import { exec } from 'child_process';
10
11
  import { promisify } from 'util';
11
- import { extractMarkdownPrefix, protectAnchors, restoreAnchors, protectCrossrefs, restoreCrossrefs, protectMath, restoreMath, replaceRenderedMath, protectCitations, restoreCitations, replaceRenderedCitations, protectImages, restoreImages, matchWordImagesToOriginal, protectTables, restoreTables, } from './protect-restore.js';
12
- import { normalizeWhitespace } from './utils.js';
12
+ // Import from split modules
13
+ import { extractFromWord, } from './word-extraction.js';
14
+ import { generateSmartDiff, cleanupAnnotations, fixCitationAnnotations, } from './diff-engine.js';
15
+ import { restoreCrossrefFromWord, restoreImagesFromRegistry, convertVisibleComments, } from './restore-references.js';
16
+ import { findAnchorInText } from './anchor-match.js';
17
+ // Re-export everything so existing imports from './import.js' still work
18
+ export { extractFromWord, extractWordComments, extractCommentAnchors, extractHeadings, extractWordTables, } from './word-extraction.js';
19
+ export { generateSmartDiff, generateAnnotatedDiff, cleanupAnnotations, fixCitationAnnotations, } from './diff-engine.js';
20
+ export { restoreCrossrefFromWord, restoreImagesFromRegistry, parseVisibleComments, convertVisibleComments, } from './restore-references.js';
13
21
  const execAsync = promisify(exec);
14
22
  // ============================================
15
23
  // Functions
16
24
  // ============================================
17
- /**
18
- * Extract comments directly from Word docx comments.xml
19
- */
20
- export async function extractWordComments(docxPath) {
21
- const AdmZip = (await import('adm-zip')).default;
22
- const { parseStringPromise } = await import('xml2js');
23
- const comments = [];
24
- // Validate file exists
25
- if (!fs.existsSync(docxPath)) {
26
- throw new Error(`File not found: ${docxPath}`);
27
- }
28
- try {
29
- let zip;
30
- try {
31
- zip = new AdmZip(docxPath);
32
- }
33
- catch (err) {
34
- throw new Error(`Invalid Word document (not a valid .docx file): ${err.message}`);
35
- }
36
- const commentsEntry = zip.getEntry('word/comments.xml');
37
- if (!commentsEntry) {
38
- return comments;
39
- }
40
- let commentsXml;
41
- try {
42
- commentsXml = commentsEntry.getData().toString('utf8');
43
- }
44
- catch (err) {
45
- throw new Error(`Failed to read comments from document: ${err.message}`);
46
- }
47
- const parsed = await parseStringPromise(commentsXml, { explicitArray: false });
48
- const ns = 'w:';
49
- const commentsRoot = parsed['w:comments'];
50
- if (!commentsRoot || !commentsRoot['w:comment']) {
51
- return comments;
52
- }
53
- // Ensure it's an array
54
- const commentNodes = Array.isArray(commentsRoot['w:comment'])
55
- ? commentsRoot['w:comment']
56
- : [commentsRoot['w:comment']];
57
- for (const comment of commentNodes) {
58
- const id = comment.$?.['w:id'] || '';
59
- const author = comment.$?.['w:author'] || 'Unknown';
60
- const date = comment.$?.['w:date'] || '';
61
- // Extract text from nested w:p/w:r/w:t elements
62
- let text = '';
63
- const extractText = (node) => {
64
- if (!node)
65
- return;
66
- if (typeof node === 'string') {
67
- text += node;
68
- return;
69
- }
70
- if (node['w:t']) {
71
- const t = node['w:t'];
72
- text += typeof t === 'string' ? t : (t._ || t);
73
- }
74
- if (node['w:r']) {
75
- const runs = Array.isArray(node['w:r']) ? node['w:r'] : [node['w:r']];
76
- runs.forEach(extractText);
77
- }
78
- if (node['w:p']) {
79
- const paras = Array.isArray(node['w:p']) ? node['w:p'] : [node['w:p']];
80
- paras.forEach(extractText);
81
- }
82
- };
83
- extractText(comment);
84
- comments.push({ id, author, date: date.slice(0, 10), text: text.trim() });
85
- }
86
- }
87
- catch (err) {
88
- // Re-throw with more context if it's already an Error we created
89
- if (err.message.includes('Invalid Word document') || err.message.includes('File not found')) {
90
- throw err;
91
- }
92
- throw new Error(`Error extracting comments from ${path.basename(docxPath)}: ${err.message}`);
93
- }
94
- return comments;
95
- }
96
- /**
97
- * Extract comment anchor texts from document.xml with surrounding context
98
- * Returns map of comment ID -> {anchor, before, after, docPosition, isEmpty} for better matching
99
- * Also returns fullDocText for section boundary matching
100
- */
101
- export async function extractCommentAnchors(docxPath) {
102
- const AdmZip = (await import('adm-zip')).default;
103
- const anchors = new Map();
104
- let fullDocText = '';
105
- try {
106
- const zip = new AdmZip(docxPath);
107
- const docEntry = zip.getEntry('word/document.xml');
108
- if (!docEntry) {
109
- return { anchors, fullDocText };
110
- }
111
- const docXml = docEntry.getData().toString('utf8');
112
- // ========================================
113
- // STEP 1: Build text position mapping
114
- // ========================================
115
- const textNodePattern = /<w:t[^>]*>([^<]*)<\/w:t>/g;
116
- const textNodes = [];
117
- let textPosition = 0;
118
- let nodeMatch;
119
- while ((nodeMatch = textNodePattern.exec(docXml)) !== null) {
120
- const rawText = nodeMatch[1] ?? '';
121
- const decodedText = decodeXmlEntities(rawText);
122
- textNodes.push({
123
- xmlStart: nodeMatch.index,
124
- xmlEnd: nodeMatch.index + nodeMatch[0].length,
125
- textStart: textPosition,
126
- textEnd: textPosition + decodedText.length,
127
- text: decodedText
128
- });
129
- textPosition += decodedText.length;
130
- }
131
- fullDocText = textNodes.map(n => n.text).join('');
132
- // Helper: convert XML position to text position
133
- function xmlPosToTextPos(xmlPos) {
134
- for (let i = 0; i < textNodes.length; i++) {
135
- const node = textNodes[i];
136
- if (!node)
137
- continue;
138
- if (xmlPos >= node.xmlStart && xmlPos < node.xmlEnd) {
139
- return node.textStart;
140
- }
141
- if (xmlPos < node.xmlStart) {
142
- return node.textStart;
143
- }
144
- }
145
- const lastNode = textNodes[textNodes.length - 1];
146
- return lastNode ? lastNode.textEnd : 0;
147
- }
148
- // Helper: extract context before a position
149
- function getContextBefore(position, maxLength = 150) {
150
- const beforeText = fullDocText.slice(Math.max(0, position - maxLength), position);
151
- const sentenceStart = beforeText.search(/[.!?]\s+[A-Z][^.!?]*$/);
152
- return sentenceStart >= 0
153
- ? beforeText.slice(sentenceStart + 2).trim()
154
- : beforeText.slice(-80).trim();
155
- }
156
- // Helper: extract context after a position
157
- function getContextAfter(position, maxLength = 150) {
158
- const afterText = fullDocText.slice(position, position + maxLength);
159
- const sentenceEnd = afterText.search(/[.!?]\s/);
160
- return sentenceEnd >= 0
161
- ? afterText.slice(0, sentenceEnd + 1).trim()
162
- : afterText.slice(0, 80).trim();
163
- }
164
- // ========================================
165
- // STEP 2: Collect all start/end markers separately
166
- // ========================================
167
- const startPattern = /<w:commentRangeStart[^>]*w:id="(\d+)"[^>]*\/?>/g;
168
- const endPattern = /<w:commentRangeEnd[^>]*w:id="(\d+)"[^>]*\/?>/g;
169
- const starts = new Map(); // id -> position after start tag
170
- const ends = new Map(); // id -> position before end tag
171
- let match;
172
- while ((match = startPattern.exec(docXml)) !== null) {
173
- const id = match[1];
174
- if (!starts.has(id)) {
175
- starts.set(id, match.index + match[0].length);
176
- }
177
- }
178
- while ((match = endPattern.exec(docXml)) !== null) {
179
- const id = match[1];
180
- if (!ends.has(id)) {
181
- ends.set(id, match.index);
182
- }
183
- }
184
- // ========================================
185
- // STEP 3: Process each comment range by ID
186
- // ========================================
187
- for (const [id, startXmlPos] of starts) {
188
- const endXmlPos = ends.get(id);
189
- // Missing end marker - skip with warning
190
- if (endXmlPos === undefined) {
191
- console.warn(`Comment ${id}: missing end marker`);
192
- continue;
193
- }
194
- // Calculate text position
195
- const docPosition = xmlPosToTextPos(startXmlPos);
196
- // Handle empty or inverted ranges
197
- if (endXmlPos <= startXmlPos) {
198
- anchors.set(id, {
199
- anchor: '',
200
- before: getContextBefore(docPosition),
201
- after: getContextAfter(docPosition),
202
- docPosition,
203
- docLength: fullDocText.length,
204
- isEmpty: true
205
- });
206
- continue;
207
- }
208
- // Extract XML segment between markers
209
- const segment = docXml.slice(startXmlPos, endXmlPos);
210
- // Extract text from w:t (regular) AND w:delText (deleted text in track changes)
211
- const textInRangePattern = /<w:t[^>]*>([^<]*)<\/w:t>|<w:delText[^>]*>([^<]*)<\/w:delText>/g;
212
- let anchorText = '';
213
- let tm;
214
- while ((tm = textInRangePattern.exec(segment)) !== null) {
215
- anchorText += tm[1] || tm[2] || '';
216
- }
217
- anchorText = decodeXmlEntities(anchorText);
218
- // Get context
219
- const anchorLength = anchorText.length;
220
- const before = getContextBefore(docPosition);
221
- const after = getContextAfter(docPosition + anchorLength);
222
- // ALWAYS add entry (even if anchor is empty)
223
- anchors.set(id, {
224
- anchor: anchorText.trim(),
225
- before,
226
- after,
227
- docPosition,
228
- docLength: fullDocText.length,
229
- isEmpty: !anchorText.trim()
230
- });
231
- }
232
- }
233
- catch (err) {
234
- console.error('Error extracting comment anchors:', err.message);
235
- return { anchors, fullDocText: '' };
236
- }
237
- return { anchors, fullDocText };
238
- }
239
- /**
240
- * Decode XML entities in text
241
- */
242
- function decodeXmlEntities(text) {
243
- return text
244
- .replace(/&amp;/g, '&')
245
- .replace(/&lt;/g, '<')
246
- .replace(/&gt;/g, '>')
247
- .replace(/&quot;/g, '"')
248
- .replace(/&apos;/g, "'")
249
- .replace(/&#(\d+);/g, (_, code) => String.fromCharCode(parseInt(code, 10)))
250
- .replace(/&#x([0-9a-fA-F]+);/g, (_, code) => String.fromCharCode(parseInt(code, 16)));
251
- }
252
- /**
253
- * Extract text content from a Word XML cell
254
- */
255
- function extractCellText(cellXml) {
256
- const parts = [];
257
- // Check for OMML math - replace with [math] placeholder
258
- if (cellXml.includes('<m:oMath')) {
259
- // Try to extract the text representation of math
260
- const mathTextMatches = cellXml.match(/<m:t>([^<]*)<\/m:t>/g) || [];
261
- if (mathTextMatches.length > 0) {
262
- const mathText = mathTextMatches.map((t) => t.replace(/<[^>]+>/g, '')).join('');
263
- parts.push(mathText);
264
- }
265
- else {
266
- parts.push('[math]');
267
- }
268
- }
269
- // Extract regular text from w:t elements
270
- const textMatches = cellXml.match(/<w:t[^>]*>([^<]*)<\/w:t>/g) || [];
271
- for (const match of textMatches) {
272
- const text = match.replace(/<[^>]+>/g, '');
273
- if (text) {
274
- parts.push(text);
275
- }
276
- }
277
- let result = parts.join('').trim();
278
- result = decodeXmlEntities(result);
279
- // Escape pipe characters in cell content (would break table)
280
- result = result.replace(/\|/g, '\\|');
281
- return result;
282
- }
283
- /**
284
- * Parse a table row, handling merged cells (gridSpan)
285
- */
286
- function parseTableRow(rowXml, expectedCols) {
287
- // Match cells - handle both <w:tc> and <w:tc ...>
288
- const cellMatches = rowXml.match(/<w:tc(?:\s[^>]*)?>[\s\S]*?<\/w:tc>/g) || [];
289
- const cells = [];
290
- const colSpans = [];
291
- for (const cellXml of cellMatches) {
292
- // Check for horizontal merge (gridSpan)
293
- const gridSpanMatch = cellXml.match(/<w:gridSpan\s+w:val="(\d+)"/);
294
- const span = gridSpanMatch ? parseInt(gridSpanMatch[1], 10) : 1;
295
- // Check for vertical merge continuation (vMerge without restart)
296
- // If vMerge is present without w:val="restart", it's a continuation - use empty
297
- const vMergeMatch = cellXml.match(/<w:vMerge(?:\s+w:val="([^"]+)")?/);
298
- const isVMergeContinuation = vMergeMatch && vMergeMatch[1] !== 'restart';
299
- const cellText = isVMergeContinuation ? '' : extractCellText(cellXml);
300
- // Add the cell content
301
- cells.push(cellText);
302
- colSpans.push(span);
303
- // For gridSpan > 1, add empty cells to maintain column alignment
304
- for (let i = 1; i < span; i++) {
305
- cells.push('');
306
- colSpans.push(0); // 0 indicates this is a spanned cell
307
- }
308
- }
309
- return { cells, colSpans };
310
- }
311
- /**
312
- * Determine table grid column count from table XML
313
- */
314
- function getTableGridCols(tableXml) {
315
- // Try to get from tblGrid
316
- const gridColMatches = tableXml.match(/<w:gridCol/g) || [];
317
- if (gridColMatches.length > 0) {
318
- return gridColMatches.length;
319
- }
320
- // Fallback: count max cells in any row
321
- const rowMatches = tableXml.match(/<w:tr[\s\S]*?<\/w:tr>/g) || [];
322
- let maxCols = 0;
323
- for (const rowXml of rowMatches) {
324
- const { cells } = parseTableRow(rowXml, 0);
325
- maxCols = Math.max(maxCols, cells.length);
326
- }
327
- return maxCols;
328
- }
329
- /**
330
- * Extract tables directly from Word document XML and convert to markdown pipe tables
331
- */
332
- export async function extractWordTables(docxPath) {
333
- const AdmZip = (await import('adm-zip')).default;
334
- const tables = [];
335
- try {
336
- const zip = new AdmZip(docxPath);
337
- const docEntry = zip.getEntry('word/document.xml');
338
- if (!docEntry) {
339
- return tables;
340
- }
341
- const xml = docEntry.getData().toString('utf8');
342
- // Find all table elements
343
- const tableMatches = xml.match(/<w:tbl>[\s\S]*?<\/w:tbl>/g) || [];
344
- for (const tableXml of tableMatches) {
345
- // Determine expected column count from grid
346
- const expectedCols = getTableGridCols(tableXml);
347
- // Extract rows
348
- const rowMatches = tableXml.match(/<w:tr[\s\S]*?<\/w:tr>/g) || [];
349
- const rows = [];
350
- for (const rowXml of rowMatches) {
351
- const { cells } = parseTableRow(rowXml, expectedCols);
352
- if (cells.length > 0) {
353
- rows.push(cells);
354
- }
355
- }
356
- if (rows.length > 0) {
357
- // Convert to markdown pipe table
358
- const markdown = convertRowsToMarkdownTable(rows);
359
- tables.push({ markdown, rowCount: rows.length, colCount: expectedCols || rows[0]?.length || 0 });
360
- }
361
- }
362
- }
363
- catch (err) {
364
- console.error('Error extracting tables from Word:', err.message);
365
- }
366
- return tables;
367
- }
368
- /**
369
- * Convert array of rows (each row is array of cell strings) to markdown pipe table
370
- */
371
- function convertRowsToMarkdownTable(rows) {
372
- if (rows.length === 0)
373
- return '';
374
- // Normalize column count (use max across all rows)
375
- const colCount = Math.max(...rows.map((r) => r.length));
376
- // Pad rows to have consistent column count
377
- const normalizedRows = rows.map((row) => {
378
- while (row.length < colCount) {
379
- row.push('');
380
- }
381
- return row;
382
- });
383
- // Build markdown table
384
- const lines = [];
385
- // Header row
386
- const header = normalizedRows[0];
387
- lines.push('| ' + header.join(' | ') + ' |');
388
- // Separator row
389
- lines.push('|' + header.map(() => '---').join('|') + '|');
390
- // Data rows
391
- for (let i = 1; i < normalizedRows.length; i++) {
392
- lines.push('| ' + normalizedRows[i].join(' | ') + ' |');
393
- }
394
- return lines.join('\n');
395
- }
396
- /**
397
- * Extract text from Word document using pandoc with track changes preserved
398
- */
399
- export async function extractFromWord(docxPath, options = {}) {
400
- let text;
401
- let messages = [];
402
- let extractedMedia = [];
403
- let hasTrackChanges = false;
404
- let trackChangeStats = { insertions: 0, deletions: 0 };
405
- // Determine media extraction directory
406
- const docxDir = path.dirname(docxPath);
407
- const mediaDir = options.mediaDir || path.join(docxDir, 'media');
408
- // Skip media extraction if figures already exist (e.g., when re-importing with existing source)
409
- const skipMediaExtraction = options.skipMediaExtraction || false;
410
- // Extract tables directly from Word XML (reliable, no heuristics)
411
- const wordTables = await extractWordTables(docxPath);
412
- // Try pandoc first with --track-changes=all to preserve reviewer edits
413
- try {
414
- // Build pandoc command
415
- let pandocCmd = `pandoc "${docxPath}" -t markdown --wrap=none --track-changes=all`;
416
- if (!skipMediaExtraction) {
417
- pandocCmd += ` --extract-media="${mediaDir}"`;
418
- }
419
- const { stdout } = await execAsync(pandocCmd, { maxBuffer: 50 * 1024 * 1024 });
420
- text = stdout;
421
- // Convert pandoc's track change format to CriticMarkup
422
- const origLength = text.length;
423
- // Use a more robust pattern that handles nested content
424
- text = text.replace(/\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]\{\.insertion[^}]*\}/g, (match, content) => {
425
- if (content.trim()) {
426
- trackChangeStats.insertions++;
427
- return `{++${content}++}`;
428
- }
429
- return ''; // Empty insertions are removed
430
- });
431
- text = text.replace(/\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]\{\.deletion[^}]*\}/g, (match, content) => {
432
- if (content.trim()) {
433
- trackChangeStats.deletions++;
434
- return `{--${content}--}`;
435
- }
436
- return ''; // Empty deletions are removed
437
- });
438
- // Handle any remaining pandoc track change patterns
439
- let prevText;
440
- do {
441
- prevText = text;
442
- text = text.replace(/\[([^\]]*)\]\{\.insertion[^}]*\}/g, (match, content) => {
443
- if (content.trim()) {
444
- trackChangeStats.insertions++;
445
- return `{++${content}++}`;
446
- }
447
- return '';
448
- });
449
- text = text.replace(/\[([^\]]*)\]\{\.deletion[^}]*\}/g, (match, content) => {
450
- if (content.trim()) {
451
- trackChangeStats.deletions++;
452
- return `{--${content}--}`;
453
- }
454
- return '';
455
- });
456
- } while (text !== prevText);
457
- // Handle pandoc comment patterns - remove comment text from body
458
- text = text.replace(/\[[^\]]*\]\{\.comment-start[^}]*\}/g, '');
459
- text = text.replace(/\[\]\{\.comment-end[^}]*\}/g, '');
460
- // Also handle {.mark} spans
461
- text = text.replace(/\[([^\]]*)\]\{\.mark\}/g, '$1');
462
- hasTrackChanges = trackChangeStats.insertions > 0 || trackChangeStats.deletions > 0;
463
- if (hasTrackChanges) {
464
- messages.push({
465
- type: 'info',
466
- message: `Found ${trackChangeStats.insertions} insertion(s) and ${trackChangeStats.deletions} deletion(s) from track changes`
467
- });
468
- }
469
- // Find extracted media files
470
- const mediaSubdir = path.join(mediaDir, 'media');
471
- if (fs.existsSync(mediaSubdir)) {
472
- extractedMedia = fs.readdirSync(mediaSubdir)
473
- .filter(f => /\.(png|jpg|jpeg|gif|svg|emf|wmf|tiff?)$/i.test(f))
474
- .map(f => path.join(mediaSubdir, f));
475
- if (extractedMedia.length > 0) {
476
- messages.push({
477
- type: 'info',
478
- message: `Extracted ${extractedMedia.length} image(s) to ${mediaSubdir}`
479
- });
480
- }
481
- }
482
- }
483
- catch (pandocErr) {
484
- // Pandoc not available — use XML-based extraction with track change support
485
- const { extractPlainTextWithTrackChanges } = await import('./word.js');
486
- const { getInstallInstructions } = await import('./dependencies.js');
487
- const installCmd = getInstallInstructions('pandoc');
488
- const xmlResult = await extractPlainTextWithTrackChanges(docxPath);
489
- text = xmlResult.text;
490
- hasTrackChanges = xmlResult.hasTrackChanges;
491
- trackChangeStats = xmlResult.stats;
492
- if (hasTrackChanges) {
493
- messages.push({
494
- type: 'warning',
495
- message: `Pandoc not installed. Using built-in XML extractor (${trackChangeStats.insertions} insertions, ${trackChangeStats.deletions} deletions preserved). Formatting may differ. Install pandoc for best results: ${installCmd}`
496
- });
497
- }
498
- else {
499
- messages.push({
500
- type: 'warning',
501
- message: `Pandoc not installed. Using built-in XML extractor (no track changes found). Install pandoc for better formatting: ${installCmd}`
502
- });
503
- }
504
- }
505
- // Extract comments directly from docx XML
506
- const comments = await extractWordComments(docxPath);
507
- // Extract comment anchor texts
508
- const { anchors } = await extractCommentAnchors(docxPath);
509
- return {
510
- text,
511
- comments,
512
- anchors,
513
- messages,
514
- extractedMedia,
515
- tables: wordTables,
516
- hasTrackChanges,
517
- trackChangeStats,
518
- };
519
- }
520
25
  /**
521
26
  * Insert comments into markdown text based on anchor texts with context
522
27
  */
523
28
  export function insertCommentsIntoMarkdown(markdown, comments, anchors, options = {}) {
524
- const { quiet = false, sectionBoundary = null } = options;
29
+ const { quiet = false, sectionBoundary = null, wrapAnchor = true } = options;
525
30
  let result = markdown;
526
31
  let unmatchedCount = 0;
527
32
  const duplicateWarnings = [];
528
33
  const usedPositions = new Set(); // For tie-breaking: track used positions
529
- // Helper: Strip CriticMarkup from text to get "clean" version for matching
530
- function stripCriticMarkup(text) {
531
- return text
532
- .replace(/\{\+\+([^+]*)\+\+\}/g, '$1') // insertions: keep inserted text
533
- .replace(/\{--([^-]*)--\}/g, '') // deletions: remove deleted text
534
- .replace(/\{~~([^~]*)~>([^~]*)~~\}/g, '$2') // substitutions: keep new text
535
- .replace(/\{>>[^<]*<<\}/g, '') // comments: remove
536
- .replace(/\[([^\]]*)\]\{\.mark\}/g, '$1'); // marked text: keep text
537
- }
538
- // Helper: Find anchor in text with multiple fallback strategies
539
- function findAnchorInText(anchor, text, before = '', after = '') {
540
- // If anchor is empty, skip directly to context-based matching
541
- if (!anchor || anchor.trim().length === 0) {
542
- // Jump to context-based strategies (Strategy 5)
543
- if (before || after) {
544
- const beforeLower = (before || '').toLowerCase();
545
- const afterLower = (after || '').toLowerCase();
546
- const textLower = text.toLowerCase();
547
- if (before && after) {
548
- const beforeIdx = textLower.indexOf(beforeLower.slice(-50));
549
- if (beforeIdx !== -1) {
550
- const searchStart = beforeIdx + beforeLower.slice(-50).length;
551
- const afterIdx = textLower.indexOf(afterLower.slice(0, 50), searchStart);
552
- if (afterIdx !== -1 && afterIdx - searchStart < 500) {
553
- return { occurrences: [searchStart], matchedAnchor: null, strategy: 'context-both' };
554
- }
555
- }
556
- }
557
- if (before) {
558
- const beforeIdx = textLower.lastIndexOf(beforeLower.slice(-30));
559
- if (beforeIdx !== -1) {
560
- return { occurrences: [beforeIdx + beforeLower.slice(-30).length], matchedAnchor: null, strategy: 'context-before' };
561
- }
562
- }
563
- if (after) {
564
- const afterIdx = textLower.indexOf(afterLower.slice(0, 30));
565
- if (afterIdx !== -1) {
566
- return { occurrences: [afterIdx], matchedAnchor: null, strategy: 'context-after' };
567
- }
568
- }
569
- }
570
- return { occurrences: [], matchedAnchor: null, strategy: 'empty-anchor' };
571
- }
572
- const anchorLower = anchor.toLowerCase();
573
- const textLower = text.toLowerCase();
574
- // Strategy 1: Direct match
575
- let occurrences = findAllOccurrences(textLower, anchorLower);
576
- if (occurrences.length > 0) {
577
- return { occurrences, matchedAnchor: anchor, strategy: 'direct' };
578
- }
579
- // Strategy 2: Normalized whitespace
580
- const normalizedAnchor = anchor.replace(/\s+/g, ' ').toLowerCase();
581
- const normalizedText = text.replace(/\s+/g, ' ').toLowerCase();
582
- let idx = normalizedText.indexOf(normalizedAnchor);
583
- if (idx !== -1) {
584
- return { occurrences: [idx], matchedAnchor: anchor, strategy: 'normalized' };
585
- }
586
- // Strategy 3: Try matching in stripped CriticMarkup version
587
- const strippedText = stripCriticMarkup(text);
588
- const strippedLower = strippedText.toLowerCase();
589
- occurrences = findAllOccurrences(strippedLower, anchorLower);
590
- if (occurrences.length > 0) {
591
- return { occurrences, matchedAnchor: anchor, strategy: 'stripped', stripped: true };
592
- }
593
- // Strategy 4: First N words of anchor (for long anchors)
594
- const words = anchor.split(/\s+/);
595
- if (words.length > 3) {
596
- for (let n = Math.min(6, words.length); n >= 3; n--) {
597
- const partialAnchor = words.slice(0, n).join(' ').toLowerCase();
598
- if (partialAnchor.length >= 15) {
599
- occurrences = findAllOccurrences(textLower, partialAnchor);
600
- if (occurrences.length > 0) {
601
- return { occurrences, matchedAnchor: words.slice(0, n).join(' '), strategy: 'partial-start' };
602
- }
603
- occurrences = findAllOccurrences(strippedLower, partialAnchor);
604
- if (occurrences.length > 0) {
605
- return { occurrences, matchedAnchor: words.slice(0, n).join(' '), strategy: 'partial-start-stripped', stripped: true };
606
- }
607
- }
608
- }
609
- }
610
- // Strategy 5: Use context (before/after) to find approximate position
611
- if (before || after) {
612
- const beforeLower = before.toLowerCase();
613
- const afterLower = after.toLowerCase();
614
- if (before && after) {
615
- const beforeIdx = textLower.indexOf(beforeLower.slice(-50));
616
- if (beforeIdx !== -1) {
617
- const searchStart = beforeIdx + beforeLower.slice(-50).length;
618
- const afterIdx = textLower.indexOf(afterLower.slice(0, 50), searchStart);
619
- if (afterIdx !== -1 && afterIdx - searchStart < 500) {
620
- return { occurrences: [searchStart], matchedAnchor: null, strategy: 'context-both' };
621
- }
622
- }
623
- }
624
- if (before) {
625
- const beforeIdx = textLower.lastIndexOf(beforeLower.slice(-30));
626
- if (beforeIdx !== -1) {
627
- return { occurrences: [beforeIdx + beforeLower.slice(-30).length], matchedAnchor: null, strategy: 'context-before' };
628
- }
629
- }
630
- if (after) {
631
- const afterIdx = textLower.indexOf(afterLower.slice(0, 30));
632
- if (afterIdx !== -1) {
633
- return { occurrences: [afterIdx], matchedAnchor: null, strategy: 'context-after' };
634
- }
635
- }
636
- }
637
- // Strategy 6: Try splitting anchor on common transition words
638
- const splitPatterns = [' ', ', ', '. ', ' - ', ' – '];
639
- for (const sep of splitPatterns) {
640
- if (anchor.includes(sep)) {
641
- const parts = anchor.split(sep).filter(p => p.length >= 4);
642
- for (const part of parts) {
643
- const partLower = part.toLowerCase();
644
- occurrences = findAllOccurrences(textLower, partLower);
645
- if (occurrences.length > 0 && occurrences.length < 5) {
646
- return { occurrences, matchedAnchor: part, strategy: 'split-match' };
647
- }
648
- }
649
- }
650
- }
651
- return { occurrences: [], matchedAnchor: null, strategy: 'failed' };
652
- }
653
- // Helper: Find all occurrences of needle in haystack
654
- function findAllOccurrences(haystack, needle) {
655
- if (!needle || needle.length === 0) {
656
- return [];
657
- }
658
- const occurrences = [];
659
- let idx = 0;
660
- while ((idx = haystack.indexOf(needle, idx)) !== -1) {
661
- occurrences.push(idx);
662
- idx += 1;
663
- }
664
- return occurrences;
665
- }
34
+ // Anchor matching primitives live in lib/anchor-match.ts so that
35
+ // `rev verify-anchors` can use the same strategies for drift reporting.
666
36
  // Get all positions in order (for sequential tie-breaking)
667
37
  const commentsWithPositions = comments.map((c) => {
668
38
  const anchorData = anchors.get(c.id);
@@ -800,19 +170,25 @@ export function insertCommentsIntoMarkdown(markdown, comments, anchors, options
800
170
  const matched = commentsWithPositions.filter((c) => c.pos >= 0);
801
171
  // Sort by position descending (insert from end to avoid offset issues)
802
172
  matched.sort((a, b) => b.pos - a.pos);
803
- // Insert each comment with anchor marking
173
+ // Insert each comment. With `wrapAnchor` (the default), the anchor text
174
+ // gets wrapped in `[anchor]{.mark}` so the rebuilt docx restores the
175
+ // original Word comment range. Without it, the comment block is inserted
176
+ // adjacent to the anchor and prose stays untouched — required for
177
+ // comments-only sync where multiple comments may share one anchor.
804
178
  for (const c of matched) {
805
179
  const comment = `{>>${c.author}: ${c.text}<<}`;
806
- if (c.anchorText && c.anchorEnd) {
807
- // Replace anchor text with: {>>comment<<}[anchor]{.mark}
180
+ if (wrapAnchor && c.anchorText && c.anchorEnd) {
808
181
  const before = result.slice(0, c.pos);
809
182
  const anchor = result.slice(c.pos, c.anchorEnd);
810
183
  const after = result.slice(c.anchorEnd);
811
184
  result = before + comment + `[${anchor}]{.mark}` + after;
812
185
  }
813
186
  else {
814
- // No anchor - just insert comment at position
815
- result = result.slice(0, c.pos) + ` ${comment}` + result.slice(c.pos);
187
+ // Insert comment at the anchor position with no surrounding whitespace
188
+ // tweaks; CriticMarkup blocks are invisible to readers, and adding a
189
+ // leading space would shift prose byte-for-byte (relevant when callers
190
+ // verify that --comments-only didn't touch the original).
191
+ result = result.slice(0, c.pos) + comment + result.slice(c.pos);
816
192
  }
817
193
  }
818
194
  // Log warnings unless quiet mode
@@ -829,530 +205,6 @@ export function insertCommentsIntoMarkdown(markdown, comments, anchors, options
829
205
  }
830
206
  return result;
831
207
  }
832
- /**
833
- * Fix citation and math annotations by preserving original markdown syntax
834
- */
835
- function fixCitationAnnotations(text, originalMd) {
836
- // Fix math annotations - preserve inline and display math
837
- text = text.replace(/\{--(\$[^$]+\$)--\}/g, '$1');
838
- text = text.replace(/\{--(\$\$[^$]+\$\$)--\}/g, '$1');
839
- text = text.replace(/\{~~(\$[^$]+\$)~>[^~]+~~\}/g, '$1');
840
- text = text.replace(/\{~~(\$\$[^$]+\$\$)~>[^~]+~~\}/g, '$1');
841
- // Extract all citations from original markdown
842
- const citationPattern = /\[@[^\]]+\]/g;
843
- const originalCitations = [...originalMd.matchAll(citationPattern)].map(m => m[0]);
844
- // Fix substitutions where left side has markdown citation
845
- text = text.replace(/\{~~(\[@[^\]]+\])~>[^~]+~~\}/g, '$1');
846
- // Fix substitutions where left side STARTS with markdown citation
847
- text = text.replace(/\{~~(\[@[^\]]+\])\s*([^~]*)~>([^~]*)~~\}/g, (match, cite, oldText, newText) => {
848
- if (oldText.trim() === '' && newText.trim() === '') {
849
- return cite;
850
- }
851
- if (oldText.trim() || newText.trim()) {
852
- return cite + (oldText.trim() !== newText.trim() ? ` {~~${oldText.trim()}~>${newText.trim()}~~}` : ` ${newText}`);
853
- }
854
- return cite;
855
- });
856
- // Fix deletions of markdown citations
857
- text = text.replace(/\{--(\[@[^\]]+\])--\}/g, '$1');
858
- // Fix insertions of rendered citations
859
- text = text.replace(/\{\+\+\([A-Z][^)]*\d{4}[^)]*\)\+\+\}/g, '');
860
- // Clean up broken multi-part substitutions
861
- text = text.replace(/\{~~(@[A-Za-z]+\d{4})~>[^~]+~~\}/g, '[$1]');
862
- // Fix citations split across substitution boundaries
863
- text = text.replace(/\{~~\[@~>[^~]*~~\}([A-Za-z]+\d{4})\]/g, '[@$1]');
864
- // Clean up any remaining partial citations
865
- text = text.replace(/\{~~;\s*@([A-Za-z]+\d{4})\]~>[^~]*~~\}/g, '; [@$1]');
866
- // Remove rendered citation insertions (with Unicode support)
867
- text = text.replace(/\{\+\+\(\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\+\+\}/gu, '');
868
- text = text.replace(/\{\+\+\(\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');
869
- // Trailing citation fragments
870
- text = text.replace(/\{\+\+\d{4}[a-z]?(?:[;,]\s*(?:\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+)?\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');
871
- text = text.replace(/\{\+\+\d{4}[a-z]?(?:[;,]\s*(?:\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+)?\d{4}[a-z]?)*\)\s*\+\+\}/gu, '');
872
- // Just year with closing paren
873
- text = text.replace(/\{\+\+\d{4}[a-z]?\)\.\s*\+\+\}/g, '');
874
- text = text.replace(/\{\+\+\d{4}[a-z]?\)\s*\+\+\}/g, '');
875
- // Leading citation fragments
876
- text = text.replace(/\{\+\+\(?\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s*\+\+\}/gu, '');
877
- // Semicolon-separated fragments
878
- text = text.replace(/\{\+\+[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?\+\+\}/gu, '');
879
- // Year ranges with authors
880
- text = text.replace(/\{\+\+\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\s*\+\+\}/gu, '');
881
- text = text.replace(/\{\+\+\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');
882
- // Clean up double spaces and orphaned punctuation
883
- text = text.replace(/ +/g, ' ');
884
- text = text.replace(/\s+\./g, '.');
885
- text = text.replace(/\s+,/g, ',');
886
- // Final cleanup - remove empty annotations
887
- text = text.replace(/\{~~\s*~>\s*~~\}/g, '');
888
- text = text.replace(/\{\+\+\s*\+\+\}/g, '');
889
- text = text.replace(/\{--\s*--\}/g, '');
890
- return text;
891
- }
892
- /**
893
- * Strip markdown syntax to get plain text
894
- */
895
- function stripMarkdownSyntax(md) {
896
- return md
897
- .replace(/^---[\s\S]*?---\n*/m, '')
898
- .replace(/^#{1,6}\s+/gm, '')
899
- .replace(/(\*\*|__)(.*?)\1/g, '$2')
900
- .replace(/(\*|_)(.*?)\1/g, '$2')
901
- .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
902
- .replace(/!\[([^\]]*)\]\([^)]+\)/g, '')
903
- .replace(/`([^`]+)`/g, '$1')
904
- .replace(/```[\s\S]*?```/g, '')
905
- .replace(/^>\s*/gm, '')
906
- .replace(/^[-*_]{3,}\s*$/gm, '')
907
- .replace(/^[\s]*[-*+]\s+/gm, '')
908
- .replace(/^[\s]*\d+\.\s+/gm, '')
909
- .replace(/\|/g, ' ')
910
- .replace(/^[-:]+$/gm, '')
911
- .replace(/\n{3,}/g, '\n\n')
912
- .trim();
913
- }
914
- /**
915
- * Generate annotated markdown by diffing original MD against Word text
916
- */
917
- export function generateAnnotatedDiff(originalMd, wordText, author = 'Reviewer') {
918
- const normalizedOriginal = normalizeWhitespace(originalMd);
919
- const normalizedWord = normalizeWhitespace(wordText);
920
- const changes = diffWords(normalizedOriginal, normalizedWord);
921
- let result = '';
922
- for (const part of changes) {
923
- if (part.added) {
924
- result += `{++${part.value}++}`;
925
- }
926
- else if (part.removed) {
927
- result += `{--${part.value}--}`;
928
- }
929
- else {
930
- result += part.value;
931
- }
932
- }
933
- return result;
934
- }
935
- /**
936
- * Inject Word tables (extracted from XML) into pandoc text output
937
- */
938
- function injectWordTables(pandocText, wordTables) {
939
- if (!wordTables || wordTables.length === 0) {
940
- return pandocText;
941
- }
942
- let result = pandocText;
943
- for (const table of wordTables) {
944
- const firstLine = table.markdown.split('\n')[0];
945
- const headerCells = firstLine
946
- .split('|')
947
- .map((c) => c.trim())
948
- .filter((c) => c.length > 0);
949
- if (headerCells.length === 0)
950
- continue;
951
- const firstCell = headerCells[0];
952
- const startIdx = result.indexOf(firstCell);
953
- if (startIdx === -1)
954
- continue;
955
- const lastLine = table.markdown.split('\n').pop();
956
- const lastCells = lastLine
957
- .split('|')
958
- .map((c) => c.trim())
959
- .filter((c) => c.length > 0);
960
- const lastCell = lastCells[lastCells.length - 1] || lastCells[0];
961
- const endIdx = result.indexOf(lastCell, startIdx);
962
- if (endIdx === -1)
963
- continue;
964
- let regionStart = result.lastIndexOf('\n\n', startIdx);
965
- if (regionStart === -1)
966
- regionStart = 0;
967
- else
968
- regionStart += 2;
969
- let regionEnd = result.indexOf('\n\n', endIdx + lastCell.length);
970
- if (regionEnd === -1)
971
- regionEnd = result.length;
972
- result = result.slice(0, regionStart) + table.markdown + '\n\n' + result.slice(regionEnd);
973
- }
974
- return result;
975
- }
976
- /**
977
- * Smart paragraph-level diff that preserves markdown structure
978
- */
979
- export function generateSmartDiff(originalMd, wordText, author = 'Reviewer', options = {}) {
980
- const { wordTables = [], imageRegistry = null } = options;
981
- // Inject Word tables into pandoc output
982
- let wordTextWithTables = injectWordTables(wordText, wordTables);
983
- // Protect markdown tables
984
- const { text: mdWithTablesProtected, tables } = protectTables(originalMd);
985
- // Also protect tables in Word text
986
- const { text: wordWithTablesProtected, tables: wordTableBlocks } = protectTables(wordTextWithTables);
987
- // Protect images
988
- const { text: mdWithImagesProtected, images: origImages } = protectImages(mdWithTablesProtected, imageRegistry);
989
- const { text: wordWithImagesProtected, images: wordImages } = protectImages(wordWithTablesProtected, imageRegistry);
990
- // Match Word images to original images
991
- const imageMapping = matchWordImagesToOriginal(origImages, wordImages, imageRegistry);
992
- // Replace Word image placeholders with matching original placeholders
993
- let wordWithMappedImages = wordWithImagesProtected;
994
- for (const [wordPlaceholder, origPlaceholder] of imageMapping) {
995
- wordWithMappedImages = wordWithMappedImages.split(wordPlaceholder).join(origPlaceholder);
996
- }
997
- // Protect figure/table anchors
998
- const { text: mdWithAnchorsProtected, anchors: figAnchors } = protectAnchors(mdWithImagesProtected);
999
- // Protect cross-references
1000
- const { text: mdWithXrefsProtected, crossrefs } = protectCrossrefs(mdWithAnchorsProtected);
1001
- // Protect math
1002
- const { text: mdWithMathProtected, mathBlocks } = protectMath(mdWithXrefsProtected);
1003
- // Protect citations
1004
- const { text: mdProtected, citations } = protectCitations(mdWithMathProtected);
1005
- // Replace rendered elements in Word text
1006
- let wordProtected = wordWithMappedImages;
1007
- wordProtected = replaceRenderedMath(wordProtected, mathBlocks);
1008
- wordProtected = replaceRenderedCitations(wordProtected, citations.length);
1009
- // Split into paragraphs
1010
- const originalParas = mdProtected.split(/\n\n+/);
1011
- const wordParas = wordProtected.split(/\n\n+/);
1012
- const result = [];
1013
- // Try to match paragraphs intelligently
1014
- let wordIdx = 0;
1015
- for (let i = 0; i < originalParas.length; i++) {
1016
- const orig = originalParas[i] || '';
1017
- const { prefix: mdPrefix, content: origContent } = extractMarkdownPrefix(orig.split('\n')[0]);
1018
- // Find best matching word paragraph
1019
- let bestMatch = -1;
1020
- let bestScore = 0;
1021
- for (let j = wordIdx; j < Math.min(wordIdx + 3, wordParas.length); j++) {
1022
- const wordPara = wordParas[j] || '';
1023
- const origWords = new Set(origContent.toLowerCase().split(/\s+/));
1024
- const wordWords = wordPara.toLowerCase().split(/\s+/);
1025
- const common = wordWords.filter((w) => origWords.has(w)).length;
1026
- const score = common / Math.max(origWords.size, wordWords.length);
1027
- if (score > bestScore && score > 0.3) {
1028
- bestScore = score;
1029
- bestMatch = j;
1030
- }
1031
- }
1032
- if (bestMatch === -1) {
1033
- if (mdPrefix && wordIdx < wordParas.length) {
1034
- const wordPara = wordParas[wordIdx];
1035
- if (wordPara.toLowerCase().includes(origContent.toLowerCase().slice(0, 20))) {
1036
- bestMatch = wordIdx;
1037
- }
1038
- }
1039
- }
1040
- if (bestMatch >= 0) {
1041
- const word = wordParas[bestMatch];
1042
- const origStripped = stripMarkdownSyntax(orig);
1043
- const wordNormalized = normalizeWhitespace(word);
1044
- if (origStripped === wordNormalized) {
1045
- result.push(orig);
1046
- }
1047
- else {
1048
- const changes = diffWords(origStripped, wordNormalized);
1049
- let annotated = mdPrefix;
1050
- for (const part of changes) {
1051
- if (part.added) {
1052
- annotated += `{++${part.value}++}`;
1053
- }
1054
- else if (part.removed) {
1055
- annotated += `{--${part.value}--}`;
1056
- }
1057
- else {
1058
- annotated += part.value;
1059
- }
1060
- }
1061
- result.push(annotated);
1062
- }
1063
- wordIdx = bestMatch + 1;
1064
- }
1065
- else {
1066
- // Paragraph deleted entirely
1067
- if (mdPrefix && mdPrefix.match(/^#{1,6}\s+/)) {
1068
- result.push(orig);
1069
- }
1070
- else {
1071
- result.push(`{--${orig}--}`);
1072
- }
1073
- }
1074
- }
1075
- // Any remaining word paragraphs are additions
1076
- for (let j = wordIdx; j < wordParas.length; j++) {
1077
- const word = wordParas[j];
1078
- if (word.trim()) {
1079
- result.push(`{++${word}++}`);
1080
- }
1081
- }
1082
- // Restore protected content
1083
- let finalResult = result.join('\n\n');
1084
- finalResult = restoreCitations(finalResult, citations);
1085
- finalResult = restoreMath(finalResult, mathBlocks);
1086
- finalResult = restoreCrossrefs(finalResult, crossrefs);
1087
- finalResult = restoreAnchors(finalResult, figAnchors);
1088
- finalResult = restoreImages(finalResult, origImages);
1089
- finalResult = restoreImages(finalResult, wordImages);
1090
- finalResult = restoreTables(finalResult, tables);
1091
- finalResult = restoreTables(finalResult, wordTableBlocks);
1092
- return finalResult;
1093
- }
1094
- /**
1095
- * Clean up redundant adjacent annotations
1096
- */
1097
- export function cleanupAnnotations(text) {
1098
- // Convert adjacent delete+insert to substitution
1099
- text = text.replace(/\{--(.+?)--\}\s*\{\+\+(.+?)\+\+\}/g, '{~~$1~>$2~~}');
1100
- // Also handle insert+delete
1101
- text = text.replace(/\{\+\+(.+?)\+\+\}\s*\{--(.+?)--\}/g, '{~~$2~>$1~~}');
1102
- // Fix malformed patterns
1103
- text = text.replace(/\{--([^}]+?)~>([^}]+?)~~\}/g, '{~~$1~>$2~~}');
1104
- // Fix malformed substitutions that got split
1105
- text = text.replace(/\{~~([^~]+)\s*--\}/g, '{--$1--}');
1106
- text = text.replace(/\{\+\+([^+]+)~~\}/g, '{++$1++}');
1107
- // Clean up empty annotations
1108
- text = text.replace(/\{--\s*--\}/g, '');
1109
- text = text.replace(/\{\+\+\s*\+\+\}/g, '');
1110
- // Clean up double spaces in prose, but preserve table formatting
1111
- const lines = text.split('\n');
1112
- let inTable = false;
1113
- const processedLines = lines.map((line, idx) => {
1114
- const isSeparator = /^[-]+(\s+[-]+)+\s*$/.test(line.trim());
1115
- const looksLikeTableRow = /\S+\s{2,}\S+/.test(line);
1116
- if (isSeparator) {
1117
- if (!inTable) {
1118
- inTable = true;
1119
- }
1120
- return line;
1121
- }
1122
- if (inTable) {
1123
- if (line.trim() === '') {
1124
- let lookAhead = idx + 1;
1125
- let foundTableContent = false;
1126
- let foundEndSeparator = false;
1127
- while (lookAhead < lines.length && lookAhead < idx + 20) {
1128
- const nextLine = lines[lookAhead].trim();
1129
- if (nextLine === '') {
1130
- lookAhead++;
1131
- continue;
1132
- }
1133
- if (/^[-]+(\s+[-]+)+\s*$/.test(nextLine)) {
1134
- foundEndSeparator = true;
1135
- break;
1136
- }
1137
- if (/\S+\s{2,}\S+/.test(nextLine)) {
1138
- foundTableContent = true;
1139
- break;
1140
- }
1141
- if (/^\*[^*]+\*\s*$/.test(nextLine)) {
1142
- foundTableContent = true;
1143
- break;
1144
- }
1145
- if (lines[lookAhead].startsWith(' ')) {
1146
- lookAhead++;
1147
- continue;
1148
- }
1149
- break;
1150
- }
1151
- if (foundTableContent || foundEndSeparator) {
1152
- return line;
1153
- }
1154
- inTable = false;
1155
- return line;
1156
- }
1157
- return line;
1158
- }
1159
- if (looksLikeTableRow) {
1160
- let nextIdx = idx + 1;
1161
- while (nextIdx < lines.length && lines[nextIdx].trim() === '') {
1162
- nextIdx++;
1163
- }
1164
- if (nextIdx < lines.length && /^[-]+(\s+[-]+)+\s*$/.test(lines[nextIdx].trim())) {
1165
- return line;
1166
- }
1167
- }
1168
- if (line.trim().startsWith('|')) {
1169
- return line;
1170
- }
1171
- return line.replace(/ +/g, ' ');
1172
- });
1173
- text = processedLines.join('\n');
1174
- return text;
1175
- }
1176
- /**
1177
- * Parse visible comment markers from Word text
1178
- */
1179
- export function parseVisibleComments(text) {
1180
- const comments = [];
1181
- const pattern = /\[([^\]:]+):\s*([^\]]+)\]/g;
1182
- let match;
1183
- while ((match = pattern.exec(text)) !== null) {
1184
- comments.push({
1185
- author: match[1].trim(),
1186
- text: match[2].trim(),
1187
- position: match.index,
1188
- });
1189
- }
1190
- return comments;
1191
- }
1192
- /**
1193
- * Convert visible comments to CriticMarkup format
1194
- */
1195
- export function convertVisibleComments(text) {
1196
- return text.replace(/\[([^\]:]+):\s*([^\]]+)\]/g, '{>>$1: $2<<}');
1197
- }
1198
- /**
1199
- * Restore pandoc-crossref figure/table references from Word-rendered format
1200
- */
1201
- export function restoreCrossrefFromWord(text, projectDir, restoredLabels = null) {
1202
- const messages = [];
1203
- let restored = 0;
1204
- let result = text;
1205
- const registry = readImageRegistry(projectDir);
1206
- if (!restoredLabels) {
1207
- restoredLabels = new Set();
1208
- }
1209
- // Pattern 1: [Figure]{.mark} [N]{.mark}
1210
- result = result.replace(/\[(Figure|Table|Fig\.?)\]\{\.mark\}\s*\[(\d+|S\d+)\]\{\.mark\}/gi, (match, type, num) => {
1211
- const prefix = type.toLowerCase().startsWith('tab') ? 'tbl' : 'fig';
1212
- if (registry) {
1213
- const entry = registry.byNumber?.get(`${prefix}:${num}`);
1214
- if (entry && entry.label) {
1215
- restored++;
1216
- return `@${prefix}:${entry.label}`;
1217
- }
1218
- }
1219
- restored++;
1220
- messages.push(`Restored ${type} ${num} (no label found, using placeholder)`);
1221
- return `@${prefix}:fig${num}`;
1222
- });
1223
- // Pattern 2: Plain "Figure N" or "Fig. N"
1224
- result = result.replace(/(?<!!)\b(Figure|Fig\.?|Table|Tbl\.?)\s+(\d+|S\d+)\b(?!\s*:)/gi, (match, type, num) => {
1225
- const prefix = type.toLowerCase().startsWith('tab') ? 'tbl' : 'fig';
1226
- if (registry) {
1227
- const entry = registry.byNumber?.get(`${prefix}:${num}`);
1228
- if (entry && entry.label) {
1229
- restored++;
1230
- return `@${prefix}:${entry.label}`;
1231
- }
1232
- }
1233
- return match;
1234
- });
1235
- // Pattern 3: Remove duplicate plain-text captions
1236
- result = result.replace(/(\!\[[^\]]+\]\([^)]+\)(?:\{[^}]*\})?)\s*\n+\s*(?:Figure|Fig\.?|Table|Tbl\.?)\s+\d+[:\.]?\s*[^\n]+/gi, '$1');
1237
- // Pattern 4: Clean up image captions that start with "Figure N: "
1238
- result = result.replace(/!\[(Figure|Fig\.?|Table|Tbl\.?)\s+(\d+|S\d+)[:\.]?\s*([^\]]*)\]\(([^)]+)\)(?:\{[^}]*\})?/gi, (match, type, num, caption, imgPath) => {
1239
- const prefix = type.toLowerCase().startsWith('tab') ? 'tbl' : 'fig';
1240
- const labelKey = `${prefix}:${num}`;
1241
- if (registry) {
1242
- const entry = registry.byNumber?.get(labelKey);
1243
- if (entry) {
1244
- if (restoredLabels.has(labelKey)) {
1245
- messages.push(`Skipped duplicate ${prefix}:${entry.label} (already restored)`);
1246
- return `![${entry.caption}](${entry.path})`;
1247
- }
1248
- restoredLabels.add(labelKey);
1249
- restored++;
1250
- messages.push(`Restored image ${prefix}:${entry.label} from Figure ${num}`);
1251
- return `![${entry.caption}](${entry.path}){#${prefix}:${entry.label}}`;
1252
- }
1253
- }
1254
- const cleanCaption = caption.trim();
1255
- return `![${cleanCaption}](${imgPath})`;
1256
- });
1257
- return { text: result, restored, messages, restoredLabels };
1258
- }
1259
- /**
1260
- * Restore proper markdown image syntax from Word-extracted text using image registry
1261
- */
1262
- export function restoreImagesFromRegistry(text, projectDir, restoredLabels = null) {
1263
- const messages = [];
1264
- let restored = 0;
1265
- const registry = readImageRegistry(projectDir);
1266
- if (!registry || !registry.figures || registry.figures.length === 0) {
1267
- return { text, restored: 0, messages: ['No image registry found'] };
1268
- }
1269
- if (!restoredLabels) {
1270
- restoredLabels = new Set();
1271
- }
1272
- let result = text;
1273
- // Pattern 1: Caption-like text
1274
- const captionPatterns = [
1275
- /@(fig|tbl):([a-zA-Z0-9_-]+):\s*([^\n]+)/gi,
1276
- /^(Figure|Fig\.?)\s+(\d+|S\d+)[.:]\s*([^\n]+)/gim,
1277
- /\|\s*@(fig|tbl):([a-zA-Z0-9_-]+):\s*([^|]+)\s*\|/gi,
1278
- ];
1279
- // Fix @fig:label: caption patterns
1280
- result = result.replace(captionPatterns[0], (match, type, label, caption) => {
1281
- const key = `${type}:${label}`;
1282
- const entry = registry.byLabel.get(key);
1283
- if (entry) {
1284
- if (restoredLabels.has(key)) {
1285
- messages.push(`Skipped duplicate ${key} (already restored)`);
1286
- return `![${entry.caption}](${entry.path})`;
1287
- }
1288
- restoredLabels.add(key);
1289
- restored++;
1290
- messages.push(`Restored ${type}:${label} from registry`);
1291
- return `![${entry.caption}](${entry.path}){#${type}:${label}}`;
1292
- }
1293
- return match;
1294
- });
1295
- // Fix table-wrapped captions
1296
- result = result.replace(captionPatterns[2], (match, type, label, caption) => {
1297
- const key = `${type}:${label}`;
1298
- const entry = registry.byLabel.get(key);
1299
- if (entry) {
1300
- if (restoredLabels.has(key)) {
1301
- messages.push(`Skipped duplicate ${key} from table wrapper`);
1302
- return `![${entry.caption}](${entry.path})`;
1303
- }
1304
- restoredLabels.add(key);
1305
- restored++;
1306
- messages.push(`Restored ${type}:${label} from table wrapper`);
1307
- return `![${entry.caption}](${entry.path}){#${type}:${label}}`;
1308
- }
1309
- return match;
1310
- });
1311
- // Clean up empty table structures
1312
- result = result.replace(/\|\s*\|\s*\n\|:--:\|\s*\n/g, '');
1313
- // Fix "Figure N:" standalone lines
1314
- result = result.replace(captionPatterns[1], (match, prefix, num, caption) => {
1315
- const numKey = `fig:${num}`;
1316
- const entry = registry.byNumber.get(numKey);
1317
- if (entry) {
1318
- const labelKey = `fig:${entry.label}`;
1319
- if (restoredLabels.has(labelKey)) {
1320
- messages.push(`Skipped duplicate Figure ${num} (already restored)`);
1321
- return `![${entry.caption}](${entry.path})`;
1322
- }
1323
- restoredLabels.add(labelKey);
1324
- restored++;
1325
- messages.push(`Restored Figure ${num} by number lookup`);
1326
- return `![${entry.caption}](${entry.path}){#fig:${entry.label}}`;
1327
- }
1328
- return match;
1329
- });
1330
- // Fix generic media paths by matching caption text
1331
- const genericImagePattern = /!\[([^\]]*)\]\(media\/[^)]+\)/g;
1332
- result = result.replace(genericImagePattern, (match, caption) => {
1333
- if (!caption || caption.trim() === '') {
1334
- return match;
1335
- }
1336
- const captionKey = caption.slice(0, 50).toLowerCase().trim();
1337
- const entry = registry.byCaption.get(captionKey);
1338
- if (entry) {
1339
- const labelKey = entry.label ? `${entry.type}:${entry.label}` : null;
1340
- if (labelKey && restoredLabels.has(labelKey)) {
1341
- messages.push(`Skipped duplicate by caption match: ${captionKey.slice(0, 30)}...`);
1342
- return `![${entry.caption}](${entry.path})`;
1343
- }
1344
- if (labelKey) {
1345
- restoredLabels.add(labelKey);
1346
- }
1347
- restored++;
1348
- messages.push(`Restored image by caption match: ${captionKey.slice(0, 30)}...`);
1349
- const anchor = (entry.label && !restoredLabels.has(labelKey)) ? `{#${entry.type}:${entry.label}}` : '';
1350
- return `![${entry.caption}](${entry.path})${anchor}`;
1351
- }
1352
- return match;
1353
- });
1354
- return { text: result, restored, messages };
1355
- }
1356
208
  /**
1357
209
  * Import Word document with track changes directly as CriticMarkup
1358
210
  */