docrev 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/import.js ADDED
@@ -0,0 +1,1018 @@
1
+ /**
2
+ * Import functionality - convert Word docs to annotated Markdown
3
+ */
4
+
5
+ import * as fs from 'fs';
6
+ import * as path from 'path';
7
+ import { diffWords } from 'diff';
8
+
9
/**
 * Extract reviewer comments directly from the `word/comments.xml` part of a
 * .docx archive.
 *
 * Errors while opening or parsing the archive are logged with `console.error`
 * and whatever was collected so far (possibly nothing) is returned, so callers
 * can treat comment extraction as best-effort.
 *
 * @param {string} docxPath - Path to the .docx file
 * @returns {Promise<Array<{id: string, author: string, date: string, text: string}>>}
 *   One entry per `w:comment`; `date` is truncated to its first 10 characters.
 */
export async function extractWordComments(docxPath) {
  const AdmZip = (await import('adm-zip')).default;
  const { parseStringPromise } = await import('xml2js');

  const comments = [];

  // With `explicitArray: false` a child is a bare value when it occurs once
  // and an array when it repeats; normalise both shapes to an array.
  const toArray = (value) => (Array.isArray(value) ? value : [value]);

  // Collect the text of a node by walking w:t, then w:r, then w:p children.
  const collectText = (node) => {
    if (!node) return '';
    if (typeof node === 'string') return node;
    if (typeof node !== 'object') return '';

    let text = '';
    if (node['w:t']) {
      for (const t of toArray(node['w:t'])) {
        if (typeof t === 'string') {
          text += t;
        } else if (t && typeof t._ === 'string') {
          // Element carrying attributes (e.g. xml:space): text lives in `_`.
          text += t._;
        }
        // Attribute-only elements have no `_`; they contribute no text
        // (previously they were stringified as "[object Object]").
      }
    }
    if (node['w:r']) {
      for (const run of toArray(node['w:r'])) text += collectText(run);
    }
    if (node['w:p']) {
      for (const para of toArray(node['w:p'])) text += collectText(para);
    }
    return text;
  };

  try {
    const zip = new AdmZip(docxPath);
    const commentsEntry = zip.getEntry('word/comments.xml');

    if (!commentsEntry) {
      return comments;
    }

    const commentsXml = commentsEntry.getData().toString('utf8');
    const parsed = await parseStringPromise(commentsXml, { explicitArray: false });

    const commentsRoot = parsed['w:comments'];
    if (!commentsRoot || !commentsRoot['w:comment']) {
      return comments;
    }

    for (const comment of toArray(commentsRoot['w:comment'])) {
      const id = comment.$?.['w:id'] || '';
      const author = comment.$?.['w:author'] || 'Unknown';
      const date = comment.$?.['w:date'] || '';

      comments.push({
        id,
        author,
        date: date.slice(0, 10),
        text: collectText(comment).trim(),
      });
    }
  } catch (err) {
    console.error('Error extracting comments:', err.message);
  }

  return comments;
}
78
+
79
/**
 * Extract, for every comment range in `word/document.xml`, the text the
 * comment is anchored to plus a short piece of surrounding context.
 *
 * The XML is scanned with regular expressions rather than parsed. Text is
 * taken from `<w:t>` elements only and XML character entities are decoded, so
 * the returned strings are plain text.
 *
 * Errors are logged with `console.error` and the anchors collected so far are
 * returned (best-effort).
 *
 * @param {string} docxPath - Path to the .docx file
 * @returns {Promise<Map<string, {anchor: string, before: string, after: string}>>}
 *   Map of comment id -> trimmed anchor text and context. `before`/`after`
 *   are empty strings when the anchor cannot be located in the document text.
 */
export async function extractCommentAnchors(docxPath) {
  const AdmZip = (await import('adm-zip')).default;
  const anchors = new Map();

  const NAMED_ENTITIES = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'" };

  // Decode the predefined XML entities and numeric character references.
  const decodeXmlEntities = (raw) =>
    raw.replace(/&(#x[0-9a-fA-F]+|#\d+|amp|lt|gt|quot|apos);/g, (whole, name) => {
      if (name[0] !== '#') return NAMED_ENTITIES[name];
      const code = name[1] === 'x'
        ? Number.parseInt(name.slice(2), 16)
        : Number.parseInt(name.slice(1), 10);
      return code <= 0x10ffff ? String.fromCodePoint(code) : whole;
    });

  // Concatenate the decoded content of every <w:t> element in `xml`.
  // `(?:\s[^>]*)?` keeps the tag name exact so <w:tab>, <w:tbl>, ... never
  // start a match.
  const collectRunText = (xml) => {
    let out = '';
    for (const m of xml.matchAll(/<w:t(?:\s[^>]*)?>([^<]*)<\/w:t>/g)) {
      out += m[1];
    }
    return decodeXmlEntities(out);
  };

  try {
    const zip = new AdmZip(docxPath);
    const docEntry = zip.getEntry('word/document.xml');

    if (!docEntry) {
      return anchors;
    }

    const docXml = docEntry.getData().toString('utf8');

    // All document text in order, used to look up context around an anchor
    const fullDocText = collectRunText(docXml);

    // Find commentRangeStart...commentRangeEnd pairs sharing the same id
    const rangePattern = /<w:commentRangeStart[^>]*w:id="(\d+)"[^>]*\/?>[\s\S]*?<w:commentRangeEnd[^>]*w:id="\1"[^>]*\/?>/g;

    for (const match of docXml.matchAll(rangePattern)) {
      const id = match[1];
      const anchor = collectRunText(match[0]).trim();
      if (!anchor) continue;

      let before = '';
      let after = '';

      // First occurrence of the anchor text in the whole document
      const anchorPos = fullDocText.indexOf(anchor);
      if (anchorPos >= 0) {
        // Up to 150 chars before; cut at the last sentence start if one is
        // found, otherwise keep the final 80 chars.
        const beforeText = fullDocText.slice(Math.max(0, anchorPos - 150), anchorPos);
        const sentenceStart = beforeText.search(/[.!?]\s+[A-Z][^.!?]*$/);
        before = sentenceStart >= 0
          ? beforeText.slice(sentenceStart + 2).trim()
          : beforeText.slice(-80).trim();

        // Up to 150 chars after; the offset uses the trimmed anchor length
        // because `anchorPos` was found with the trimmed anchor.
        const afterStart = anchorPos + anchor.length;
        const afterText = fullDocText.slice(afterStart, afterStart + 150);
        const sentenceEnd = afterText.search(/[.!?]\s/);
        after = sentenceEnd >= 0
          ? afterText.slice(0, sentenceEnd + 1).trim()
          : afterText.slice(0, 80).trim();
      }

      anchors.set(id, { anchor, before, after });
    }
  } catch (err) {
    console.error('Error extracting comment anchors:', err.message);
  }

  return anchors;
}
146
+
147
/**
 * Extract the raw text, reviewer comments and comment anchors of a Word
 * document.
 *
 * The four extraction steps only read the file and do not depend on each
 * other, so they are started together and awaited as a group.
 *
 * @param {string} docxPath - Path to the .docx file
 * @returns {Promise<{text: string, comments: Array, anchors: Map, messages: Array}>}
 *   `text` is mammoth's raw-text value; `messages` concatenates the messages
 *   of mammoth's raw-text and HTML conversions (the HTML itself is discarded).
 */
export async function extractFromWord(docxPath) {
  const mammoth = await import('mammoth');

  const [textResult, htmlResult, comments, anchors] = await Promise.all([
    // Raw text of the document
    mammoth.extractRawText({ path: docxPath }),
    // HTML conversion is run only for the messages it reports
    mammoth.convertToHtml({ path: docxPath }),
    // Comments read straight from the docx XML
    extractWordComments(docxPath),
    // Text ranges the comments are attached to
    extractCommentAnchors(docxPath),
  ]);

  return {
    text: textResult.value,
    comments,
    anchors,
    messages: [...textResult.messages, ...htmlResult.messages],
  };
}
174
+
175
/**
 * Insert comments into markdown as CriticMarkup ` {>>author: text<<}` marks,
 * placed directly after the text each comment is anchored to.
 *
 * Matching is case-insensitive. When the anchor text occurs exactly once it is
 * used directly. When it occurs several times, each occurrence is scored by
 * how well the text around it matches the anchor's `before`/`after` context;
 * ties go to the earliest occurrence not already taken by a previous comment.
 * When it does not occur at all, a second attempt treats every whitespace run
 * in the anchor as "any whitespace" and locates the match in the original
 * markdown.
 *
 * @param {string} markdown - The markdown text
 * @param {Array} comments - Array of {id, author, text}
 * @param {Map} anchors - Map of comment id -> {anchor, before, after} or string (legacy)
 * @param {object} options - Options {quiet: boolean}; `quiet` suppresses the warnings
 * @returns {string} - Markdown with comments inserted
 */
export function insertCommentsIntoMarkdown(markdown, comments, anchors, options = {}) {
  const { quiet = false } = options;
  let result = markdown;
  let unmatchedCount = 0;
  const duplicateWarnings = [];
  const usedPositions = new Set(); // For tie-breaking: track used positions

  // `result` is not modified until the insertion loop below, so the
  // lower-cased copy can be computed once for all comments.
  const resultLower = result.toLowerCase();

  const escapeRegExp = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  const unmatched = (c) => {
    unmatchedCount++;
    return { ...c, pos: -1, anchorText: null };
  };

  // Score how well the text around `pos` agrees with a context string.
  // +2 for each context word longer than 3 chars found nearby, +5 when the
  // given 30-char slice of the context is found verbatim.
  const scoreContext = (nearby, context, slice30) => {
    let score = 0;
    const contextLower = context.toLowerCase();
    for (const word of contextLower.split(/\s+/).filter((w) => w.length > 3)) {
      if (nearby.includes(word)) score += 2;
    }
    if (nearby.includes(slice30(contextLower))) score += 5;
    return score;
  };

  const commentsWithPositions = comments.map((c) => {
    const anchorData = anchors.get(c.id);
    if (!anchorData) {
      return unmatched(c);
    }

    // Support both old format (string) and new format ({anchor, before, after})
    const anchor = typeof anchorData === 'string' ? anchorData : anchorData.anchor;
    const before = typeof anchorData === 'object' ? anchorData.before : '';
    const after = typeof anchorData === 'object' ? anchorData.after : '';

    // An empty anchor would make the indexOf loop below never terminate
    // (indexOf('') always succeeds), and it carries no position anyway.
    if (typeof anchor !== 'string' || anchor.length === 0) {
      return unmatched(c);
    }

    const anchorLower = anchor.toLowerCase();

    // Find ALL occurrences of anchor text (overlapping ones included)
    const occurrences = [];
    let searchIdx = 0;
    while ((searchIdx = resultLower.indexOf(anchorLower, searchIdx)) !== -1) {
      occurrences.push(searchIdx);
      searchIdx += 1;
    }

    if (occurrences.length === 0) {
      // Whitespace-tolerant match, searched in the ORIGINAL text so the
      // resulting offset is valid for insertion.
      const flexible = new RegExp(anchor.split(/\s+/).map(escapeRegExp).join('\\s+'), 'i');
      const found = flexible.exec(result);
      if (found) {
        return { ...c, pos: found.index + found[0].length, anchorText: anchor };
      }
      return unmatched(c);
    }

    if (occurrences.length === 1) {
      // Unique match - easy case
      return { ...c, pos: occurrences[0] + anchor.length, anchorText: anchor };
    }

    // Multiple occurrences - use context for disambiguation
    duplicateWarnings.push(`"${anchor.slice(0, 40)}${anchor.length > 40 ? '...' : ''}" appears ${occurrences.length} times`);

    // Initialize to first UNUSED occurrence (for tie-break correctness);
    // falls back to the first occurrence when every one is already used.
    let bestIdx = occurrences.find((p) => !usedPositions.has(p)) ?? occurrences[0];
    let bestScore = -1; // Start at -1 so first valid candidate wins

    for (const pos of occurrences) {
      // Skip positions already used by previous comments
      if (usedPositions.has(pos)) continue;

      let score = 0;

      if (before) {
        const contextBefore = result.slice(Math.max(0, pos - before.length - 20), pos).toLowerCase();
        score += scoreContext(contextBefore, before, (s) => s.slice(-30));
      }

      if (after) {
        const contextAfter = result.slice(pos + anchor.length, pos + anchor.length + after.length + 20).toLowerCase();
        score += scoreContext(contextAfter, after, (s) => s.slice(0, 30));
      }

      // Tie-break: prefer earlier unused occurrence (document order)
      if (score > bestScore || (score === bestScore && pos < bestIdx)) {
        bestScore = score;
        bestIdx = pos;
      }
    }

    // Mark this position as used for tie-breaking subsequent comments
    usedPositions.add(bestIdx);

    return { ...c, pos: bestIdx + anchor.length, anchorText: anchor };
  }).filter((c) => c.pos > 0);

  // Sort by position descending (insert from end so earlier offsets stay valid)
  commentsWithPositions.sort((a, b) => b.pos - a.pos);

  // Insert each comment
  for (const c of commentsWithPositions) {
    const commentMark = ` {>>${c.author}: ${c.text}<<}`;
    result = result.slice(0, c.pos) + commentMark + result.slice(c.pos);
  }

  // Log warnings unless quiet mode
  if (!quiet) {
    if (unmatchedCount > 0) {
      console.warn(`Warning: ${unmatchedCount} comment(s) could not be matched to anchor text`);
    }
    if (duplicateWarnings.length > 0) {
      console.warn(`Warning: Duplicate anchor text found (using context & tie-breaks for placement):`);
      for (const w of duplicateWarnings) {
        console.warn(` - ${w}`);
      }
    }
  }

  return result;
}
310
+
311
/**
 * Bring text into a canonical whitespace form before comparison:
 * CRLF becomes LF, each tab becomes a space, runs of spaces shrink to one
 * space, and leading/trailing whitespace is removed.
 * @param {string} text
 * @returns {string}
 */
function normalizeText(text) {
  const unixNewlines = text.split('\r\n').join('\n');
  const withoutTabs = unixNewlines.split('\t').join(' ');
  const singleSpaced = withoutTabs.replace(/ {2,}/g, ' ');
  return singleSpaced.trim();
}
323
+
324
/**
 * Clean up CriticMarkup annotations that only exist because Word rendered
 * markdown citations or math differently (e.g. `[@Author2021]` shown as
 * "(Author et al. 2021)" or `$p$` shown as "p"). The original markdown syntax
 * is kept and the spurious insertion/deletion/substitution marks are removed.
 *
 * The steps are regex heuristics applied in a fixed order; later steps rely
 * on the output of earlier ones.
 *
 * @param {string} text - Annotated text
 * @param {string} originalMd - Original markdown with proper citations and
 *   math. Currently not consulted; kept so existing callers stay valid.
 * @returns {string}
 */
function fixCitationAnnotations(text, originalMd) {
  // Step 0: Fix math annotations - preserve inline and display math
  // Deletions of inline math should keep the math: {--$p$--} -> $p$
  text = text.replace(/\{--(\$[^$]+\$)--\}/g, '$1');
  text = text.replace(/\{--(\$\$[^$]+\$\$)--\}/g, '$1');

  // Substitutions where math was "changed" to rendered form: {~~$p$~>p~~} -> $p$
  text = text.replace(/\{~~(\$[^$]+\$)~>[^~]+~~\}/g, '$1');
  text = text.replace(/\{~~(\$\$[^$]+\$\$)~>[^~]+~~\}/g, '$1');

  // Step 1: Fix substitutions where left side has markdown citation
  // {~~[@Author]~>rendered~~} -> [@Author]
  text = text.replace(/\{~~(\[@[^\]]+\])~>[^~]+~~\}/g, '$1');

  // Step 2: Fix substitutions where left side STARTS with markdown citation
  // {~~[@Author] more text~>rendered more~~} -> [@Author] {~~more text~>more~~}
  text = text.replace(/\{~~(\[@[^\]]+\])\s*([^~]*)~>([^~]*)~~\}/g, (match, cite, oldText, newText) => {
    const oldTrimmed = oldText.trim();
    const newTrimmed = newText.trim();
    // Nothing but the citation on either side: keep just the citation
    if (oldTrimmed === '' && newTrimmed === '') {
      return cite;
    }
    // Same remainder on both sides: no substitution needed for it
    if (oldTrimmed === newTrimmed) {
      return `${cite} ${newText}`;
    }
    // Keep the citation and re-wrap only the remainder as a substitution
    return `${cite} {~~${oldTrimmed}~>${newTrimmed}~~}`;
  });

  // Step 3: Fix deletions of markdown citations (should keep them)
  text = text.replace(/\{--(\[@[^\]]+\])--\}/g, '$1');

  // Step 4: Fix insertions of rendered citations (usually duplicates, remove)
  // {++(Author et al. 2021)++} or {++(Author 2021)++}
  text = text.replace(/\{\+\+\([A-Z][^)]*\d{4}[^)]*\)\+\+\}/g, '');

  // Step 5: Clean up broken multi-part substitutions involving citations
  // {~~@Author2021~>rendered~~} -> [@Author2021]
  text = text.replace(/\{~~(@[A-Za-z]+\d{4})~>[^~]+~~\}/g, '[$1]');

  // Step 6: Fix citations split across substitution boundaries
  // {~~[@~>something~~}Author2021] -> [@Author2021]
  text = text.replace(/\{~~\[@~>[^~]*~~\}([A-Za-z]+\d{4})\]/g, '[@$1]');

  // Step 7: Clean up any remaining partial citations in substitutions
  // {~~; @Author2021]~>something~~} -> ; [@Author2021]
  text = text.replace(/\{~~;\s*@([A-Za-z]+\d{4})\]~>[^~]*~~\}/g, '; [@$1]');

  // Step 8: Remove rendered citation insertions (fragments left over from citation matching)
  // \p{L} / \p{Lu} are used so accented author names (š, é, ü, ...) match.

  // Full rendered citations in parentheses: {++(Author et al. 2021)++}
  text = text.replace(/\{\+\+\(\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\+\+\}/gu, '');
  text = text.replace(/\{\+\+\(\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');

  // Trailing citation fragments: {++2019; IPBES 2023). ++} or {++2008b; Rouget et al. 2016). ++}
  text = text.replace(/\{\+\+\d{4}[a-z]?(?:[;,]\s*(?:\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+)?\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');
  text = text.replace(/\{\+\+\d{4}[a-z]?(?:[;,]\s*(?:\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+)?\d{4}[a-z]?)*\)\s*\+\+\}/gu, '');

  // Just year with closing paren: {++2021)++} or {++2021).++}
  text = text.replace(/\{\+\+\d{4}[a-z]?\)\.\s*\+\+\}/g, '');
  text = text.replace(/\{\+\+\d{4}[a-z]?\)\s*\+\+\}/g, '');

  // Leading citation fragments: {++Author et al.++} or {++(Author++}
  text = text.replace(/\{\+\+\(?\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s*\+\+\}/gu, '');

  // Semicolon-separated author-year fragments: {++; Author 2021++}
  text = text.replace(/\{\+\+[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?\+\+\}/gu, '');

  // Year ranges with authors: {++Author 2019; Other 2020)++}
  text = text.replace(/\{\+\+\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\s*\+\+\}/gu, '');
  text = text.replace(/\{\+\+\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');

  // Step 9: Clean up double spaces and orphaned punctuation
  text = text.replace(/ +/g, ' ');
  text = text.replace(/\s+\./g, '.');
  text = text.replace(/\s+,/g, ',');

  // Step 10: Final cleanup - remove empty annotations
  text = text.replace(/\{~~\s*~>\s*~~\}/g, '');
  text = text.replace(/\{\+\+\s*\+\+\}/g, '');
  text = text.replace(/\{--\s*--\}/g, '');

  return text;
}
420
+
421
/**
 * Strip markdown syntax to get plain text (for comparison with Word output).
 *
 * This is a regex-based approximation, not a markdown parser. The order of
 * the steps matters:
 *  - front matter is only recognised at the very start of the input, so a
 *    pair of `---` rules further down never swallows the text between them;
 *  - fenced code blocks are removed before inline code, otherwise the inline
 *    rule consumes part of the fence and the block survives;
 *  - images are removed before links, otherwise the link rule turns
 *    `![alt](url)` into `!alt`.
 *
 * @param {string} md
 * @returns {string}
 */
function stripMarkdownSyntax(md) {
  return md
    // Remove YAML front matter (start of input only)
    .replace(/^---[\s\S]*?---\n*/, '')
    // Code blocks: ```...``` → (remove)
    .replace(/```[\s\S]*?```/g, '')
    // Headers: # Title → Title
    .replace(/^#{1,6}\s+/gm, '')
    // Bold/italic: **text** or *text* or __text__ or _text_ → text
    .replace(/(\*\*|__)(.*?)\1/g, '$2')
    .replace(/(\*|_)(.*?)\1/g, '$2')
    // Images: ![alt](url) → (remove entirely)
    .replace(/!\[([^\]]*)\]\([^)]+\)/g, '')
    // Links: [text](url) → text
    .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
    // Inline code: `code` → code
    .replace(/`([^`]+)`/g, '$1')
    // Blockquotes: > text → text
    .replace(/^>\s*/gm, '')
    // Horizontal rules
    .replace(/^[-*_]{3,}\s*$/gm, '')
    // List markers: - item or * item or 1. item → item
    .replace(/^[\s]*[-*+]\s+/gm, '')
    .replace(/^[\s]*\d+\.\s+/gm, '')
    // Citations: [@Author2020] → (keep as-is, Word might have them)
    // Tables: simplified handling
    .replace(/\|/g, ' ')
    .replace(/^[-:]+$/gm, '')
    // Clean up extra blank lines
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
458
+
459
/**
 * Produce CriticMarkup by word-diffing the original markdown against the
 * text extracted from Word. Both inputs are whitespace-normalised first;
 * added parts become `{++...++}`, removed parts `{--...--}`, and unchanged
 * parts are copied through.
 * @param {string} originalMd - Original markdown content
 * @param {string} wordText - Text extracted from Word
 * @param {string} author - Author name for the changes (not used in the output)
 * @returns {string} Annotated markdown with CriticMarkup
 */
export function generateAnnotatedDiff(originalMd, wordText, author = 'Reviewer') {
  const renderPart = (part) => {
    if (part.added) return `{++${part.value}++}`;
    if (part.removed) return `{--${part.value}--}`;
    return part.value;
  };

  const left = normalizeText(originalMd);
  const right = normalizeText(wordText);

  return diffWords(left, right).map(renderPart).join('');
}
491
+
492
/**
 * Split a line into its leading markdown marker and the remaining content.
 * Markers are tried in this order: header (`#` to `######`), list item
 * (bullet or numbered, optionally indented), blockquote (`>`). A line with
 * none of these yields an empty prefix.
 * @param {string} line
 * @returns {{prefix: string, content: string}}
 */
function extractMarkdownPrefix(line) {
  const markerPatterns = [
    /^(#{1,6}\s+)/, // headers
    /^(\s*[-*+]\s+|\s*\d+\.\s+)/, // list items
    /^(>\s*)/, // blockquotes
  ];

  for (const pattern of markerPatterns) {
    const hit = pattern.exec(line);
    if (hit !== null) {
      const prefix = hit[1];
      return { prefix, content: line.slice(prefix.length) };
    }
  }

  return { prefix: '', content: line };
}
518
+
519
/**
 * Swap figure/table/equation/section/listing anchors for opaque placeholders
 * so the diff step cannot alter or drop them.
 * Matches `{#fig:...}`, `{#tbl:...}`, `{#eq:...}`, `{#sec:...}`, `{#lst:...}`,
 * including any attributes before the closing brace. Placeholders are
 * numbered in order of appearance: `ANCHORBLOCK0ENDANCHOR`, ...
 * @param {string} md
 * @returns {{text: string, anchors: Array<{original: string, placeholder: string}>}}
 */
function protectAnchors(md) {
  const anchors = [];

  const stash = (original) => {
    const placeholder = `ANCHORBLOCK${anchors.length}ENDANCHOR`;
    anchors.push({ original, placeholder });
    return placeholder;
  };

  const text = md.replace(/\{#(fig|tbl|eq|sec|lst):[^}]+\}/g, (whole) => stash(whole));

  return { text, anchors };
}
539
+
540
/**
 * Put the original anchors back in place of their placeholders.
 * A placeholder caught inside a deletion or substitution annotation is moved
 * out of it, so the anchor itself is never marked as deleted or replaced:
 *   {--a PLACEHOLDER b--}   -> {--a--}ANCHOR{--b--}
 *   {~~a PLACEHOLDER~>n~~}  -> {~~a~>n~~}ANCHOR   (or just `n` + ANCHOR when
 *                                                  old and new text are equal)
 * Any remaining placeholders are replaced directly.
 * @param {string} text
 * @param {Array} anchors
 * @returns {string}
 */
function restoreAnchors(text, anchors) {
  let restored = text;

  for (const { original, placeholder } of anchors) {
    const insideDeletion = new RegExp(`\\{--([^}]*?)${placeholder}([^}]*?)--\\}`, 'g');
    const insideSubstitution = new RegExp(`\\{~~([^~]*?)${placeholder}([^~]*?)~>([^~]*)~~\\}`, 'g');

    // Deletions: keep the surrounding deleted text, pull the anchor out
    restored = restored.replace(insideDeletion, (whole, lead, trail) => {
      const leadText = lead.trim();
      const trailText = trail.trim();
      const pieces = [];
      if (leadText) pieces.push(`{--${leadText}--}`);
      pieces.push(original);
      if (trailText) pieces.push(`{--${trailText}--}`);
      return pieces.join('');
    });

    // Substitutions: rebuild without the placeholder, append the anchor
    restored = restored.replace(insideSubstitution, (whole, oldLead, oldTrail, replacement) => {
      const newSide = replacement.trim();
      const oldSide = (oldLead.trim() + ' ' + oldTrail.trim()).trim();
      const body = oldSide === newSide ? newSide : `{~~${oldSide}~>${newSide}~~}`;
      return body + original;
    });

    // Everything else: plain replacement
    restored = restored.split(placeholder).join(original);
  }

  return restored;
}
584
+
585
/**
 * Swap cross-references (`@fig:label`, `@tbl:label`, `@eq:label`,
 * `@sec:label`, `@lst:label`) for opaque placeholders before diffing.
 * Only the `@type:label` part is replaced, so surrounding parentheses or
 * brackets stay in the text. Placeholders are numbered in order of
 * appearance: `XREFBLOCK0ENDXREF`, ...
 * @param {string} md
 * @returns {{text: string, crossrefs: Array<{original: string, placeholder: string}>}}
 */
function protectCrossrefs(md) {
  const crossrefs = [];
  const xrefPattern = /@(fig|tbl|eq|sec|lst):[a-zA-Z0-9_-]+/g;

  const text = md.replace(xrefPattern, (original) => {
    const entry = { original, placeholder: `XREFBLOCK${crossrefs.length}ENDXREF` };
    crossrefs.push(entry);
    return entry.placeholder;
  });

  return { text, crossrefs };
}
605
+
606
/**
 * Put the original cross-references back in place of their placeholders.
 * A placeholder inside a deletion annotation is pulled out of it, and a
 * substitution whose old side is exactly the placeholder (the rendered form,
 * e.g. "Figure 1", replaced it) collapses back to the reference. Remaining
 * placeholders are replaced directly.
 * @param {string} text
 * @param {Array} crossrefs
 * @returns {string}
 */
function restoreCrossrefs(text, crossrefs) {
  let restored = text;

  for (const { original, placeholder } of crossrefs) {
    // Deletions - restore the reference even if marked deleted
    const insideDeletion = new RegExp(`\\{--([^}]*?)${placeholder}([^}]*?)--\\}`, 'g');
    restored = restored.replace(insideDeletion, (whole, lead, trail) => {
      const leadText = lead.trim();
      const trailText = trail.trim();
      const head = leadText ? `{--${leadText}--}` : '';
      const tail = trailText ? `{--${trailText}--}` : '';
      return head + original + tail;
    });

    // {~~XREFBLOCK0ENDXREF~>Figure 1~~} -> @fig:label
    const renderedSubstitution = new RegExp(`\\{~~${placeholder}~>[^~]+~~\\}`, 'g');
    restored = restored.replace(renderedSubstitution, original);

    // Normal replacement
    restored = restored.split(placeholder).join(original);
  }

  return restored;
}
636
+
637
/**
 * Swap math for opaque placeholders before diffing.
 * Display math (`$$...$$`) is handled first so its delimiters are not picked
 * up by the inline rule; inline math (`$...$`, single line) follows.
 * Placeholders share one counter: `MATHBLOCK0ENDMATH`, ...
 * Each recorded block also stores a simplified form of its LaTeX content for
 * later matching against Word text.
 * @param {string} md
 * @returns {{text: string, mathBlocks: Array<{original: string, placeholder: string, type: string, simplified: string}>}}
 */
function protectMath(md) {
  const mathBlocks = [];

  // Build a replace() callback that records a block of the given type.
  const recorderFor = (type) => (original, content) => {
    const placeholder = `MATHBLOCK${mathBlocks.length}ENDMATH`;
    const simplified = simplifyMathForMatching(content);
    mathBlocks.push({ original, placeholder, type, simplified });
    return placeholder;
  };

  const withoutDisplay = md.replace(/\$\$([^$]+)\$\$/g, recorderFor('display'));
  const text = withoutDisplay.replace(/\$([^$\n]+)\$/g, recorderFor('inline'));

  return { text, mathBlocks };
}
667
+
668
/**
 * Reduce LaTeX math to an approximation of how it reads as plain text, for
 * fuzzy matching against text extracted from Word.
 * The rewrite rules run in the listed order; the catch-all backslash and
 * brace removals must stay last.
 * @param {string} latex
 * @returns {string}
 */
function simplifyMathForMatching(latex) {
  const rewriteRules = [
    // Unwrap common LaTeX commands
    [/\\text\{([^}]+)\}/g, '$1'],
    [/\\hat\{([^}]+)\}/g, '$1'],
    [/\\bar\{([^}]+)\}/g, '$1'],
    [/\\frac\{([^}]+)\}\{([^}]+)\}/g, '$1/$2'],
    // Symbols
    [/\\sum_([a-z])/g, 'Σ'],
    [/\\sum/g, 'Σ'],
    [/\\cdot/g, '·'],
    // Spacing commands
    [/\\quad/g, ' '],
    [/\\,/g, ' '],
    // Escaped characters
    [/\\_/g, '_'],
    [/\\{/g, '{'],
    [/\\}/g, '}'],
    // Remove remaining backslashes, then braces
    [/\\/g, ''],
    [/[{}]/g, ''],
    // Collapse whitespace
    [/\s+/g, ' '],
  ];

  let simplified = latex;
  for (const [pattern, replacement] of rewriteRules) {
    simplified = simplified.replace(pattern, replacement);
  }
  return simplified.trim();
}
694
+
695
/**
 * Replace every math placeholder in the text with the original math source.
 * @param {string} text
 * @param {Array} mathBlocks - entries with `placeholder` and `original`
 * @returns {string}
 */
function restoreMath(text, mathBlocks) {
  let restored = text;
  for (const { placeholder, original } of mathBlocks) {
    restored = restored.split(placeholder).join(original);
  }
  return restored;
}
707
+
708
/**
 * Replace rendered math in Word text with the matching placeholders.
 * Heuristic: for each block whose simplified form is at least 2 characters
 * long, the first exact occurrence of that form in the text is replaced by
 * the block's placeholder. Shorter forms are skipped.
 * @param {string} wordText
 * @param {Array} mathBlocks - entries with `simplified` and `placeholder`
 * @returns {string}
 */
function replaceRenderedMath(wordText, mathBlocks) {
  let output = wordText;

  for (const { simplified, placeholder } of mathBlocks) {
    const longEnough = simplified.length >= 2;
    if (longEnough && output.includes(simplified)) {
      output = output.replace(simplified, placeholder);
    }
  }

  return output;
}
730
+
731
/**
 * Swap bracketed markdown citations (`[@...]`) for numbered placeholders
 * (`CITEREF0ENDCITE`, ...) before diffing. The original citations are
 * returned in order, so index i belongs to placeholder i.
 * @param {string} md
 * @returns {{text: string, citations: string[]}}
 */
function protectCitations(md) {
  const citations = [];

  const text = md.replace(/\[@[^\]]+\]/g, (citation) => {
    const placeholder = `CITEREF${citations.length}ENDCITE`;
    citations.push(citation);
    return placeholder;
  });

  return { text, citations };
}
745
+
746
/**
 * Replace every `CITEREF<i>ENDCITE` placeholder with `citations[i]`,
 * wherever it appears (including inside annotations).
 * @param {string} text
 * @param {string[]} citations
 * @returns {string}
 */
function restoreCitations(text, citations) {
  let restored = text;
  let index = 0;
  while (index < citations.length) {
    restored = restored.split(`CITEREF${index}ENDCITE`).join(citations[index]);
    index += 1;
  }
  return restored;
}
760
+
761
/**
 * Replace rendered citations in Word text with citation placeholders.
 *
 * Parenthesised author-year citations such as "(Author 2021)",
 * "(Author et al. 2021)" or "(Author 2019; Other et al. 2020b)" are replaced,
 * in order of appearance, by `CITEREF0ENDCITE`, `CITEREF1ENDCITE`, ... until
 * `count` placeholders have been handed out; later matches are left as-is.
 * The pairing with the markdown citations is purely positional.
 *
 * Author names are matched with Unicode letter classes (an uppercase letter
 * followed by one or more letters), so accented names such as "Müller" are
 * recognised, in line with the citation clean-up patterns used elsewhere in
 * this file.
 *
 * @param {string} wordText
 * @param {number} count - Number of placeholders available
 * @returns {string}
 */
function replaceRenderedCitations(wordText, count) {
  // Match rendered citation patterns: (Author 2021), (Author et al. 2021), etc.
  const pattern = /\((?:\p{Lu}\p{L}+(?:\s+et\s+al\.?)?(?:\s*[&,;]\s*\p{Lu}\p{L}+(?:\s+et\s+al\.?)?)*\s+\d{4}(?:[a-z])?(?:\s*[,;]\s*(?:\p{Lu}\p{L}+(?:\s+et\s+al\.?)?\s+)?\d{4}(?:[a-z])?)*)\)/gu;

  let idx = 0;
  return wordText.replace(pattern, (match) => {
    if (idx >= count) {
      return match;
    }
    const placeholder = `CITEREF${idx}ENDCITE`;
    idx++;
    return placeholder;
  });
}
781
+
782
/**
 * Paragraph-level diff between the original Markdown and the text extracted
 * from Word, producing CriticMarkup while keeping Markdown structure intact.
 *
 * Figure/table anchors, cross-references, math and citations are swapped for
 * placeholders before diffing and restored afterwards, so they are compared
 * as opaque tokens. Each original paragraph is matched against a window of up
 * to three upcoming Word paragraphs by word overlap (score must exceed 0.3).
 *
 * - A matched paragraph is kept verbatim when equal after stripping Markdown
 *   syntax, otherwise it is word-diffed with its Markdown prefix preserved.
 * - An unmatched original paragraph is emitted as a deletion.
 * - Word paragraphs without an original counterpart are emitted as
 *   insertions, both those skipped inside the match window and those left
 *   over at the end.
 *
 * @param {string} originalMd - Original Markdown source
 * @param {string} wordText - Text extracted from the Word document
 * @param {string} author - Reviewer name; accepted for API compatibility but
 *   not referenced by the diff itself
 * @returns {string} Markdown annotated with {++…++} / {--…--} markers
 */
export function generateSmartDiff(originalMd, wordText, author = 'Reviewer') {
  // Protection order matters: anchors first, then crossrefs, math, citations.

  // Figure/table anchors must never be deleted by the diff.
  const { text: mdWithAnchorsProtected, anchors: figAnchors } = protectAnchors(originalMd);

  // Cross-references (@fig:label, @tbl:label).
  const { text: mdWithXrefsProtected, crossrefs } = protectCrossrefs(mdWithAnchorsProtected);

  // Math goes before citations, since citations might be inside math.
  const { text: mdWithMathProtected, mathBlocks } = protectMath(mdWithXrefsProtected);

  const { text: mdProtected, citations } = protectCitations(mdWithMathProtected);

  // Give the Word text the same placeholders so both sides line up.
  let wordProtected = replaceRenderedMath(wordText, mathBlocks);
  wordProtected = replaceRenderedCitations(wordProtected, citations.length);

  const originalParas = mdProtected.split(/\n\n+/);
  const wordParas = wordProtected.split(/\n\n+/);

  const result = [];

  // Index of the first Word paragraph that has not been consumed yet.
  let wordIdx = 0;

  for (const orig of originalParas) {
    const { prefix: mdPrefix, content: origContent } = extractMarkdownPrefix(orig.split('\n')[0]);

    // Depends only on the original paragraph, so build it once per paragraph
    // rather than once per candidate.
    const origWords = new Set(origContent.toLowerCase().split(/\s+/));

    // Find the best matching Word paragraph within a short lookahead window.
    let bestMatch = -1;
    let bestScore = 0;
    const windowEnd = Math.min(wordIdx + 3, wordParas.length);

    for (let j = wordIdx; j < windowEnd; j++) {
      // Simple similarity: share of words in common.
      const wordWords = wordParas[j].toLowerCase().split(/\s+/);
      const common = wordWords.filter((w) => origWords.has(w)).length;
      const score = common / Math.max(origWords.size, wordWords.length);

      if (score > bestScore && score > 0.3) {
        bestScore = score;
        bestMatch = j;
      }
    }

    // No similarity match: it may still be a header that Word rendered
    // without its Markdown marker, so probe the next Word paragraph for the
    // first 20 characters of the header content.
    if (bestMatch === -1 && mdPrefix && wordIdx < wordParas.length) {
      const headerProbe = origContent.toLowerCase().slice(0, 20);
      if (wordParas[wordIdx].toLowerCase().includes(headerProbe)) {
        bestMatch = wordIdx;
      }
    }

    if (bestMatch === -1) {
      // Paragraph deleted entirely (or changed beyond recognition).
      result.push(`{--${orig}--}`);
      continue;
    }

    // Word paragraphs jumped over by the lookahead have no counterpart in the
    // original. Emit them as insertions; advancing wordIdx past them without
    // recording them would silently drop reviewer text.
    for (let j = wordIdx; j < bestMatch; j++) {
      if (wordParas[j].trim()) {
        result.push(`{++${wordParas[j]}++}`);
      }
    }

    // Compare with Markdown syntax stripped so formatting alone is not a change.
    const origStripped = stripMarkdownSyntax(orig);
    const wordNormalized = normalizeText(wordParas[bestMatch]);

    if (origStripped === wordNormalized) {
      // Unchanged (ignoring markdown syntax): keep the original source.
      result.push(orig);
    } else {
      // Modified: diff the content and re-attach the header/list marker.
      let annotated = mdPrefix;

      for (const part of diffWords(origStripped, wordNormalized)) {
        if (part.added) {
          annotated += `{++${part.value}++}`;
        } else if (part.removed) {
          annotated += `{--${part.value}--}`;
        } else {
          annotated += part.value;
        }
      }

      result.push(annotated);
    }

    wordIdx = bestMatch + 1;
  }

  // Any remaining Word paragraphs are additions.
  for (let j = wordIdx; j < wordParas.length; j++) {
    if (wordParas[j].trim()) {
      result.push(`{++${wordParas[j]}++}`);
    }
  }

  // Restore protected content in reverse order of protection.
  let finalResult = result.join('\n\n');
  finalResult = restoreCitations(finalResult, citations);
  finalResult = restoreMath(finalResult, mathBlocks);
  finalResult = restoreCrossrefs(finalResult, crossrefs);
  finalResult = restoreAnchors(finalResult, figAnchors);

  return finalResult;
}
903
+
904
/**
 * Tidy CriticMarkup produced by the diff step.
 *
 * - Merges an adjacent deletion + insertion (in either order, optionally
 *   separated by whitespace) into a substitution:
 *   {--old--}{++new++} → {~~old~>new~~}
 * - Repairs a few known malformed marker combinations.
 * - Drops empty deletions/insertions and collapses runs of spaces.
 *
 * @param {string} text - Annotated Markdown
 * @returns {string} Cleaned-up annotated Markdown
 */
export function cleanupAnnotations(text) {
  let cleaned = text;

  // Convert adjacent delete+insert to a substitution. Each capture uses a
  // tempered dot, `(?:(?!--\}).)`, so it cannot run past its own closing
  // marker. A plain lazy `.+?` would stretch across an unrelated annotation
  // to reach a later pair and produce broken markup such as
  // `{~~z~>x++} mid {++y~~}`.
  cleaned = cleaned.replace(
    /\{--((?:(?!--\}).)+?)--\}\s*\{\+\+((?:(?!\+\+\}).)+?)\+\+\}/g,
    '{~~$1~>$2~~}',
  );

  // Same for insert+delete (less common but possible).
  cleaned = cleaned.replace(
    /\{\+\+((?:(?!\+\+\}).)+?)\+\+\}\s*\{--((?:(?!--\}).)+?)--\}/g,
    '{~~$2~>$1~~}',
  );

  // Repair markers where {-- got merged with ~>:
  // {--key~>critical~~} → {~~key~>critical~~}
  cleaned = cleaned.replace(/\{--([^}]+?)~>([^}]+?)~~\}/g, '{~~$1~>$2~~}');

  // Repair substitutions that were split into mismatched halves:
  // {~~word --} → {--word --} and {++other~~} → {++other++}
  cleaned = cleaned.replace(/\{~~([^~]+)\s*--\}/g, '{--$1--}');
  cleaned = cleaned.replace(/\{\+\+([^+]+)~~\}/g, '{++$1++}');

  // Remove annotations with nothing (or only whitespace) inside.
  cleaned = cleaned.replace(/\{--\s*--\}/g, '');
  cleaned = cleaned.replace(/\{\+\+\s*\+\+\}/g, '');

  // Collapse runs of spaces left behind by the steps above.
  cleaned = cleaned.replace(/ {2,}/g, ' ');

  return cleaned;
}
936
+
937
/**
 * Collect inline comment markers of the form `[Author: comment text]`.
 *
 * Author and comment text are trimmed; `position` is the index of the
 * opening bracket within the input.
 *
 * @param {string} text - Text to scan
 * @returns {Array<{author: string, text: string, position: number}>}
 */
export function parseVisibleComments(text) {
  const markerRe = /\[([^\]:]+):\s*([^\]]+)\]/g;

  return Array.from(String(text).matchAll(markerRe), (hit) => ({
    author: hit[1].trim(),
    text: hit[2].trim(),
    position: hit.index,
  }));
}
958
+
959
/**
 * Rewrite visible `[Author: comment]` markers as CriticMarkup comments,
 * i.e. `{>>Author: comment<<}`. Text outside the markers is left untouched.
 *
 * @param {string} text - Text containing visible comment markers
 * @returns {string} Text with the markers converted
 */
export function convertVisibleComments(text) {
  const markerRe = /\[([^\]:]+):\s*([^\]]+)\]/g;
  const toCritic = (_whole, who, remark) => `{>>${who}: ${remark}<<}`;

  return text.replace(markerRe, toCritic);
}
967
+
968
/**
 * End-to-end import: Word document → Markdown annotated with CriticMarkup.
 *
 * The reviewed text comes from `options.sectionContent` when it is supplied
 * (anything other than `undefined`); otherwise it is extracted from the
 * document at `docxPath`. It is diffed against the Markdown file at
 * `originalMdPath`, the annotations are cleaned up, citation annotations are
 * fixed against the original Markdown, and visible `[Author: text]` comments
 * are converted. Statistics are counts of the opening markers found in the
 * final text.
 *
 * @param {string} docxPath - Path to Word document
 * @param {string} originalMdPath - Path to original markdown
 * @param {{author?: string, sectionContent?: string}} options
 * @returns {Promise<{annotated: string, stats: object}>}
 */
export async function importFromWord(docxPath, originalMdPath, options = {}) {
  const { sectionContent, author = 'Reviewer' } = options;

  // Only open the Word file when the caller did not hand over the text.
  const wordText =
    sectionContent !== undefined ? sectionContent : (await extractFromWord(docxPath)).text;

  const originalMd = fs.readFileSync(originalMdPath, 'utf-8');

  // Diff, merge adjacent del/ins into substitutions, keep Markdown citations,
  // then turn visible comments into CriticMarkup comments.
  const diffed = generateSmartDiff(originalMd, wordText, author);
  const tidied = cleanupAnnotations(diffed);
  const citationsFixed = fixCitationAnnotations(tidied, originalMd);
  const annotated = convertVisibleComments(citationsFixed);

  // Count occurrences of an opening marker in the final text.
  const tally = (markerRe) => (annotated.match(markerRe) || []).length;

  const insertions = tally(/\{\+\+/g);
  const deletions = tally(/\{--/g);
  const substitutions = tally(/\{~~/g);
  const comments = tally(/\{>>/g);

  const stats = {
    insertions,
    deletions,
    substitutions,
    comments,
    total: insertions + deletions + substitutions + comments,
  };

  return { annotated, stats };
}