docrev 0.9.4 → 0.9.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. package/dist/lib/commands/comments.d.ts.map +1 -1
  2. package/dist/lib/commands/comments.js +19 -27
  3. package/dist/lib/commands/comments.js.map +1 -1
  4. package/dist/lib/commands/context.d.ts +1 -0
  5. package/dist/lib/commands/context.d.ts.map +1 -1
  6. package/dist/lib/commands/context.js +1 -2
  7. package/dist/lib/commands/context.js.map +1 -1
  8. package/dist/lib/commands/file-ops.d.ts +11 -0
  9. package/dist/lib/commands/file-ops.d.ts.map +1 -0
  10. package/dist/lib/commands/file-ops.js +301 -0
  11. package/dist/lib/commands/file-ops.js.map +1 -0
  12. package/dist/lib/commands/index.d.ts +9 -1
  13. package/dist/lib/commands/index.d.ts.map +1 -1
  14. package/dist/lib/commands/index.js +17 -1
  15. package/dist/lib/commands/index.js.map +1 -1
  16. package/dist/lib/commands/merge-resolve.d.ts +12 -0
  17. package/dist/lib/commands/merge-resolve.d.ts.map +1 -0
  18. package/dist/lib/commands/merge-resolve.js +318 -0
  19. package/dist/lib/commands/merge-resolve.js.map +1 -0
  20. package/dist/lib/commands/preview.d.ts +11 -0
  21. package/dist/lib/commands/preview.d.ts.map +1 -0
  22. package/dist/lib/commands/preview.js +138 -0
  23. package/dist/lib/commands/preview.js.map +1 -0
  24. package/dist/lib/commands/project-info.d.ts +11 -0
  25. package/dist/lib/commands/project-info.d.ts.map +1 -0
  26. package/dist/lib/commands/project-info.js +187 -0
  27. package/dist/lib/commands/project-info.js.map +1 -0
  28. package/dist/lib/commands/quality.d.ts +11 -0
  29. package/dist/lib/commands/quality.d.ts.map +1 -0
  30. package/dist/lib/commands/quality.js +384 -0
  31. package/dist/lib/commands/quality.js.map +1 -0
  32. package/dist/lib/commands/sections.d.ts +3 -2
  33. package/dist/lib/commands/sections.d.ts.map +1 -1
  34. package/dist/lib/commands/sections.js +4 -723
  35. package/dist/lib/commands/sections.js.map +1 -1
  36. package/dist/lib/commands/sync.d.ts +11 -0
  37. package/dist/lib/commands/sync.d.ts.map +1 -0
  38. package/dist/lib/commands/sync.js +441 -0
  39. package/dist/lib/commands/sync.js.map +1 -0
  40. package/dist/lib/commands/text-ops.d.ts +11 -0
  41. package/dist/lib/commands/text-ops.d.ts.map +1 -0
  42. package/dist/lib/commands/text-ops.js +357 -0
  43. package/dist/lib/commands/text-ops.js.map +1 -0
  44. package/dist/lib/commands/utilities.d.ts +2 -4
  45. package/dist/lib/commands/utilities.d.ts.map +1 -1
  46. package/dist/lib/commands/utilities.js +3 -1605
  47. package/dist/lib/commands/utilities.js.map +1 -1
  48. package/dist/lib/commands/word-tools.d.ts +11 -0
  49. package/dist/lib/commands/word-tools.d.ts.map +1 -0
  50. package/dist/lib/commands/word-tools.js +272 -0
  51. package/dist/lib/commands/word-tools.js.map +1 -0
  52. package/dist/lib/comment-realign.d.ts.map +1 -1
  53. package/dist/lib/comment-realign.js +0 -7
  54. package/dist/lib/comment-realign.js.map +1 -1
  55. package/dist/lib/dependencies.d.ts.map +1 -1
  56. package/dist/lib/dependencies.js +11 -23
  57. package/dist/lib/dependencies.js.map +1 -1
  58. package/dist/lib/diff-engine.d.ts +25 -0
  59. package/dist/lib/diff-engine.d.ts.map +1 -0
  60. package/dist/lib/diff-engine.js +354 -0
  61. package/dist/lib/diff-engine.js.map +1 -0
  62. package/dist/lib/git.d.ts.map +1 -1
  63. package/dist/lib/git.js +18 -28
  64. package/dist/lib/git.js.map +1 -1
  65. package/dist/lib/import.d.ts +37 -117
  66. package/dist/lib/import.d.ts.map +1 -1
  67. package/dist/lib/import.js +10 -1039
  68. package/dist/lib/import.js.map +1 -1
  69. package/dist/lib/merge.d.ts.map +1 -1
  70. package/dist/lib/merge.js +29 -117
  71. package/dist/lib/merge.js.map +1 -1
  72. package/dist/lib/pdf-comments.d.ts.map +1 -1
  73. package/dist/lib/pdf-comments.js +1 -13
  74. package/dist/lib/pdf-comments.js.map +1 -1
  75. package/dist/lib/pptx-themes.d.ts.map +1 -1
  76. package/dist/lib/pptx-themes.js +0 -403
  77. package/dist/lib/pptx-themes.js.map +1 -1
  78. package/dist/lib/protect-restore.d.ts.map +1 -1
  79. package/dist/lib/protect-restore.js +34 -36
  80. package/dist/lib/protect-restore.js.map +1 -1
  81. package/dist/lib/restore-references.d.ts +35 -0
  82. package/dist/lib/restore-references.d.ts.map +1 -0
  83. package/dist/lib/restore-references.js +188 -0
  84. package/dist/lib/restore-references.js.map +1 -0
  85. package/dist/lib/slides.d.ts.map +1 -1
  86. package/dist/lib/slides.js +0 -35
  87. package/dist/lib/slides.js.map +1 -1
  88. package/dist/lib/trackchanges.d.ts.map +1 -1
  89. package/dist/lib/trackchanges.js +1 -11
  90. package/dist/lib/trackchanges.js.map +1 -1
  91. package/dist/lib/tui.d.ts +36 -45
  92. package/dist/lib/tui.d.ts.map +1 -1
  93. package/dist/lib/tui.js +92 -108
  94. package/dist/lib/tui.js.map +1 -1
  95. package/dist/lib/undo.d.ts +3 -4
  96. package/dist/lib/undo.d.ts.map +1 -1
  97. package/dist/lib/undo.js +0 -7
  98. package/dist/lib/undo.js.map +1 -1
  99. package/dist/lib/utils.d.ts +12 -0
  100. package/dist/lib/utils.d.ts.map +1 -1
  101. package/dist/lib/utils.js +26 -0
  102. package/dist/lib/utils.js.map +1 -1
  103. package/dist/lib/word-extraction.d.ts +77 -0
  104. package/dist/lib/word-extraction.d.ts.map +1 -0
  105. package/dist/lib/word-extraction.js +515 -0
  106. package/dist/lib/word-extraction.js.map +1 -0
  107. package/dist/lib/wordcomments.d.ts.map +1 -1
  108. package/dist/lib/wordcomments.js +1 -8
  109. package/dist/lib/wordcomments.js.map +1 -1
  110. package/dist/package.json +137 -0
  111. package/lib/commands/comments.ts +20 -25
  112. package/lib/commands/context.ts +1 -2
  113. package/lib/commands/file-ops.ts +372 -0
  114. package/lib/commands/index.ts +24 -0
  115. package/lib/commands/merge-resolve.ts +378 -0
  116. package/lib/commands/preview.ts +178 -0
  117. package/lib/commands/project-info.ts +244 -0
  118. package/lib/commands/quality.ts +517 -0
  119. package/lib/commands/sections.ts +3 -857
  120. package/lib/commands/sync.ts +536 -0
  121. package/lib/commands/text-ops.ts +449 -0
  122. package/lib/commands/utilities.ts +62 -2066
  123. package/lib/commands/word-tools.ts +340 -0
  124. package/lib/comment-realign.ts +0 -8
  125. package/lib/dependencies.ts +12 -20
  126. package/lib/diff-engine.ts +465 -0
  127. package/lib/git.ts +24 -31
  128. package/lib/import.ts +78 -1348
  129. package/lib/merge.ts +42 -132
  130. package/lib/pdf-comments.ts +2 -14
  131. package/lib/pptx-themes.ts +0 -413
  132. package/lib/protect-restore.ts +48 -44
  133. package/lib/restore-references.ts +240 -0
  134. package/lib/slides.ts +0 -37
  135. package/lib/trackchanges.ts +1 -12
  136. package/lib/{tui.js → tui.ts} +139 -126
  137. package/lib/undo.ts +3 -12
  138. package/lib/utils.ts +28 -0
  139. package/lib/word-extraction.ts +666 -0
  140. package/lib/wordcomments.ts +1 -9
  141. package/package.json +1 -1
@@ -1,521 +1,26 @@
1
1
  /**
2
2
  * Import functionality - convert Word docs to annotated Markdown
3
+ *
4
+ * Orchestration workflows + re-exports from extraction/diff/restore modules
3
5
  */
4
6
  import * as fs from 'fs';
5
7
  import * as path from 'path';
6
- import { diffWords } from 'diff';
7
8
  import { stripAnnotations } from './annotations.js';
8
9
  import { readImageRegistry } from './image-registry.js';
9
10
  import { exec } from 'child_process';
10
11
  import { promisify } from 'util';
11
- import { extractMarkdownPrefix, protectAnchors, restoreAnchors, protectCrossrefs, restoreCrossrefs, protectMath, restoreMath, replaceRenderedMath, protectCitations, restoreCitations, replaceRenderedCitations, protectImages, restoreImages, matchWordImagesToOriginal, protectTables, restoreTables, } from './protect-restore.js';
12
+ // Import from split modules
13
+ import { extractFromWord, } from './word-extraction.js';
14
+ import { generateSmartDiff, cleanupAnnotations, fixCitationAnnotations, } from './diff-engine.js';
15
+ import { restoreCrossrefFromWord, restoreImagesFromRegistry, convertVisibleComments, } from './restore-references.js';
16
+ // Re-export everything so existing imports from './import.js' still work
17
+ export { extractFromWord, extractWordComments, extractCommentAnchors, extractWordTables, } from './word-extraction.js';
18
+ export { generateSmartDiff, generateAnnotatedDiff, cleanupAnnotations, fixCitationAnnotations, } from './diff-engine.js';
19
+ export { restoreCrossrefFromWord, restoreImagesFromRegistry, parseVisibleComments, convertVisibleComments, } from './restore-references.js';
12
20
  const execAsync = promisify(exec);
13
21
  // ============================================
14
22
  // Functions
15
23
  // ============================================
16
- /**
17
- * Extract comments directly from Word docx comments.xml
18
- */
19
- export async function extractWordComments(docxPath) {
20
- const AdmZip = (await import('adm-zip')).default;
21
- const { parseStringPromise } = await import('xml2js');
22
- const comments = [];
23
- // Validate file exists
24
- if (!fs.existsSync(docxPath)) {
25
- throw new Error(`File not found: ${docxPath}`);
26
- }
27
- try {
28
- let zip;
29
- try {
30
- zip = new AdmZip(docxPath);
31
- }
32
- catch (err) {
33
- throw new Error(`Invalid Word document (not a valid .docx file): ${err.message}`);
34
- }
35
- const commentsEntry = zip.getEntry('word/comments.xml');
36
- if (!commentsEntry) {
37
- return comments;
38
- }
39
- let commentsXml;
40
- try {
41
- commentsXml = commentsEntry.getData().toString('utf8');
42
- }
43
- catch (err) {
44
- throw new Error(`Failed to read comments from document: ${err.message}`);
45
- }
46
- const parsed = await parseStringPromise(commentsXml, { explicitArray: false });
47
- const ns = 'w:';
48
- const commentsRoot = parsed['w:comments'];
49
- if (!commentsRoot || !commentsRoot['w:comment']) {
50
- return comments;
51
- }
52
- // Ensure it's an array
53
- const commentNodes = Array.isArray(commentsRoot['w:comment'])
54
- ? commentsRoot['w:comment']
55
- : [commentsRoot['w:comment']];
56
- for (const comment of commentNodes) {
57
- const id = comment.$?.['w:id'] || '';
58
- const author = comment.$?.['w:author'] || 'Unknown';
59
- const date = comment.$?.['w:date'] || '';
60
- // Extract text from nested w:p/w:r/w:t elements
61
- let text = '';
62
- const extractText = (node) => {
63
- if (!node)
64
- return;
65
- if (typeof node === 'string') {
66
- text += node;
67
- return;
68
- }
69
- if (node['w:t']) {
70
- const t = node['w:t'];
71
- text += typeof t === 'string' ? t : (t._ || t);
72
- }
73
- if (node['w:r']) {
74
- const runs = Array.isArray(node['w:r']) ? node['w:r'] : [node['w:r']];
75
- runs.forEach(extractText);
76
- }
77
- if (node['w:p']) {
78
- const paras = Array.isArray(node['w:p']) ? node['w:p'] : [node['w:p']];
79
- paras.forEach(extractText);
80
- }
81
- };
82
- extractText(comment);
83
- comments.push({ id, author, date: date.slice(0, 10), text: text.trim() });
84
- }
85
- }
86
- catch (err) {
87
- // Re-throw with more context if it's already an Error we created
88
- if (err.message.includes('Invalid Word document') || err.message.includes('File not found')) {
89
- throw err;
90
- }
91
- throw new Error(`Error extracting comments from ${path.basename(docxPath)}: ${err.message}`);
92
- }
93
- return comments;
94
- }
95
- /**
96
- * Extract comment anchor texts from document.xml with surrounding context
97
- * Returns map of comment ID -> {anchor, before, after, docPosition, isEmpty} for better matching
98
- * Also returns fullDocText for section boundary matching
99
- */
100
- export async function extractCommentAnchors(docxPath) {
101
- const AdmZip = (await import('adm-zip')).default;
102
- const anchors = new Map();
103
- let fullDocText = '';
104
- try {
105
- const zip = new AdmZip(docxPath);
106
- const docEntry = zip.getEntry('word/document.xml');
107
- if (!docEntry) {
108
- return { anchors, fullDocText };
109
- }
110
- const docXml = docEntry.getData().toString('utf8');
111
- // ========================================
112
- // STEP 1: Build text position mapping
113
- // ========================================
114
- const textNodePattern = /<w:t[^>]*>([^<]*)<\/w:t>/g;
115
- const textNodes = [];
116
- let textPosition = 0;
117
- let nodeMatch;
118
- while ((nodeMatch = textNodePattern.exec(docXml)) !== null) {
119
- const rawText = nodeMatch[1] ?? '';
120
- const decodedText = decodeXmlEntities(rawText);
121
- textNodes.push({
122
- xmlStart: nodeMatch.index,
123
- xmlEnd: nodeMatch.index + nodeMatch[0].length,
124
- textStart: textPosition,
125
- textEnd: textPosition + decodedText.length,
126
- text: decodedText
127
- });
128
- textPosition += decodedText.length;
129
- }
130
- fullDocText = textNodes.map(n => n.text).join('');
131
- // Helper: convert XML position to text position
132
- function xmlPosToTextPos(xmlPos) {
133
- for (let i = 0; i < textNodes.length; i++) {
134
- const node = textNodes[i];
135
- if (!node)
136
- continue;
137
- if (xmlPos >= node.xmlStart && xmlPos < node.xmlEnd) {
138
- return node.textStart;
139
- }
140
- if (xmlPos < node.xmlStart) {
141
- return node.textStart;
142
- }
143
- }
144
- const lastNode = textNodes[textNodes.length - 1];
145
- return lastNode ? lastNode.textEnd : 0;
146
- }
147
- // Helper: extract context before a position
148
- function getContextBefore(position, maxLength = 150) {
149
- const beforeText = fullDocText.slice(Math.max(0, position - maxLength), position);
150
- const sentenceStart = beforeText.search(/[.!?]\s+[A-Z][^.!?]*$/);
151
- return sentenceStart >= 0
152
- ? beforeText.slice(sentenceStart + 2).trim()
153
- : beforeText.slice(-80).trim();
154
- }
155
- // Helper: extract context after a position
156
- function getContextAfter(position, maxLength = 150) {
157
- const afterText = fullDocText.slice(position, position + maxLength);
158
- const sentenceEnd = afterText.search(/[.!?]\s/);
159
- return sentenceEnd >= 0
160
- ? afterText.slice(0, sentenceEnd + 1).trim()
161
- : afterText.slice(0, 80).trim();
162
- }
163
- // ========================================
164
- // STEP 2: Collect all start/end markers separately
165
- // ========================================
166
- const startPattern = /<w:commentRangeStart[^>]*w:id="(\d+)"[^>]*\/?>/g;
167
- const endPattern = /<w:commentRangeEnd[^>]*w:id="(\d+)"[^>]*\/?>/g;
168
- const starts = new Map(); // id -> position after start tag
169
- const ends = new Map(); // id -> position before end tag
170
- let match;
171
- while ((match = startPattern.exec(docXml)) !== null) {
172
- const id = match[1];
173
- if (!starts.has(id)) {
174
- starts.set(id, match.index + match[0].length);
175
- }
176
- }
177
- while ((match = endPattern.exec(docXml)) !== null) {
178
- const id = match[1];
179
- if (!ends.has(id)) {
180
- ends.set(id, match.index);
181
- }
182
- }
183
- // ========================================
184
- // STEP 3: Process each comment range by ID
185
- // ========================================
186
- for (const [id, startXmlPos] of starts) {
187
- const endXmlPos = ends.get(id);
188
- // Missing end marker - skip with warning
189
- if (endXmlPos === undefined) {
190
- console.warn(`Comment ${id}: missing end marker`);
191
- continue;
192
- }
193
- // Calculate text position
194
- const docPosition = xmlPosToTextPos(startXmlPos);
195
- // Handle empty or inverted ranges
196
- if (endXmlPos <= startXmlPos) {
197
- anchors.set(id, {
198
- anchor: '',
199
- before: getContextBefore(docPosition),
200
- after: getContextAfter(docPosition),
201
- docPosition,
202
- docLength: fullDocText.length,
203
- isEmpty: true
204
- });
205
- continue;
206
- }
207
- // Extract XML segment between markers
208
- const segment = docXml.slice(startXmlPos, endXmlPos);
209
- // Extract text from w:t (regular) AND w:delText (deleted text in track changes)
210
- const textInRangePattern = /<w:t[^>]*>([^<]*)<\/w:t>|<w:delText[^>]*>([^<]*)<\/w:delText>/g;
211
- let anchorText = '';
212
- let tm;
213
- while ((tm = textInRangePattern.exec(segment)) !== null) {
214
- anchorText += tm[1] || tm[2] || '';
215
- }
216
- anchorText = decodeXmlEntities(anchorText);
217
- // Get context
218
- const anchorLength = anchorText.length;
219
- const before = getContextBefore(docPosition);
220
- const after = getContextAfter(docPosition + anchorLength);
221
- // ALWAYS add entry (even if anchor is empty)
222
- anchors.set(id, {
223
- anchor: anchorText.trim(),
224
- before,
225
- after,
226
- docPosition,
227
- docLength: fullDocText.length,
228
- isEmpty: !anchorText.trim()
229
- });
230
- }
231
- }
232
- catch (err) {
233
- console.error('Error extracting comment anchors:', err.message);
234
- return { anchors, fullDocText: '' };
235
- }
236
- return { anchors, fullDocText };
237
- }
238
- /**
239
- * Decode XML entities in text
240
- */
241
- function decodeXmlEntities(text) {
242
- return text
243
- .replace(/&amp;/g, '&')
244
- .replace(/&lt;/g, '<')
245
- .replace(/&gt;/g, '>')
246
- .replace(/&quot;/g, '"')
247
- .replace(/&apos;/g, "'")
248
- .replace(/&#(\d+);/g, (_, code) => String.fromCharCode(parseInt(code, 10)))
249
- .replace(/&#x([0-9a-fA-F]+);/g, (_, code) => String.fromCharCode(parseInt(code, 16)));
250
- }
251
- /**
252
- * Extract text content from a Word XML cell
253
- */
254
- function extractCellText(cellXml) {
255
- const parts = [];
256
- // Check for OMML math - replace with [math] placeholder
257
- if (cellXml.includes('<m:oMath')) {
258
- // Try to extract the text representation of math
259
- const mathTextMatches = cellXml.match(/<m:t>([^<]*)<\/m:t>/g) || [];
260
- if (mathTextMatches.length > 0) {
261
- const mathText = mathTextMatches.map((t) => t.replace(/<[^>]+>/g, '')).join('');
262
- parts.push(mathText);
263
- }
264
- else {
265
- parts.push('[math]');
266
- }
267
- }
268
- // Extract regular text from w:t elements
269
- const textMatches = cellXml.match(/<w:t[^>]*>([^<]*)<\/w:t>/g) || [];
270
- for (const match of textMatches) {
271
- const text = match.replace(/<[^>]+>/g, '');
272
- if (text) {
273
- parts.push(text);
274
- }
275
- }
276
- let result = parts.join('').trim();
277
- result = decodeXmlEntities(result);
278
- // Escape pipe characters in cell content (would break table)
279
- result = result.replace(/\|/g, '\\|');
280
- return result;
281
- }
282
- /**
283
- * Parse a table row, handling merged cells (gridSpan)
284
- */
285
- function parseTableRow(rowXml, expectedCols) {
286
- // Match cells - handle both <w:tc> and <w:tc ...>
287
- const cellMatches = rowXml.match(/<w:tc(?:\s[^>]*)?>[\s\S]*?<\/w:tc>/g) || [];
288
- const cells = [];
289
- const colSpans = [];
290
- for (const cellXml of cellMatches) {
291
- // Check for horizontal merge (gridSpan)
292
- const gridSpanMatch = cellXml.match(/<w:gridSpan\s+w:val="(\d+)"/);
293
- const span = gridSpanMatch ? parseInt(gridSpanMatch[1], 10) : 1;
294
- // Check for vertical merge continuation (vMerge without restart)
295
- // If vMerge is present without w:val="restart", it's a continuation - use empty
296
- const vMergeMatch = cellXml.match(/<w:vMerge(?:\s+w:val="([^"]+)")?/);
297
- const isVMergeContinuation = vMergeMatch && vMergeMatch[1] !== 'restart';
298
- const cellText = isVMergeContinuation ? '' : extractCellText(cellXml);
299
- // Add the cell content
300
- cells.push(cellText);
301
- colSpans.push(span);
302
- // For gridSpan > 1, add empty cells to maintain column alignment
303
- for (let i = 1; i < span; i++) {
304
- cells.push('');
305
- colSpans.push(0); // 0 indicates this is a spanned cell
306
- }
307
- }
308
- return { cells, colSpans };
309
- }
310
- /**
311
- * Determine table grid column count from table XML
312
- */
313
- function getTableGridCols(tableXml) {
314
- // Try to get from tblGrid
315
- const gridColMatches = tableXml.match(/<w:gridCol/g) || [];
316
- if (gridColMatches.length > 0) {
317
- return gridColMatches.length;
318
- }
319
- // Fallback: count max cells in any row
320
- const rowMatches = tableXml.match(/<w:tr[\s\S]*?<\/w:tr>/g) || [];
321
- let maxCols = 0;
322
- for (const rowXml of rowMatches) {
323
- const { cells } = parseTableRow(rowXml, 0);
324
- maxCols = Math.max(maxCols, cells.length);
325
- }
326
- return maxCols;
327
- }
328
- /**
329
- * Extract tables directly from Word document XML and convert to markdown pipe tables
330
- */
331
- export async function extractWordTables(docxPath) {
332
- const AdmZip = (await import('adm-zip')).default;
333
- const tables = [];
334
- try {
335
- const zip = new AdmZip(docxPath);
336
- const docEntry = zip.getEntry('word/document.xml');
337
- if (!docEntry) {
338
- return tables;
339
- }
340
- const xml = docEntry.getData().toString('utf8');
341
- // Find all table elements
342
- const tableMatches = xml.match(/<w:tbl>[\s\S]*?<\/w:tbl>/g) || [];
343
- for (const tableXml of tableMatches) {
344
- // Determine expected column count from grid
345
- const expectedCols = getTableGridCols(tableXml);
346
- // Extract rows
347
- const rowMatches = tableXml.match(/<w:tr[\s\S]*?<\/w:tr>/g) || [];
348
- const rows = [];
349
- for (const rowXml of rowMatches) {
350
- const { cells } = parseTableRow(rowXml, expectedCols);
351
- if (cells.length > 0) {
352
- rows.push(cells);
353
- }
354
- }
355
- if (rows.length > 0) {
356
- // Convert to markdown pipe table
357
- const markdown = convertRowsToMarkdownTable(rows);
358
- tables.push({ markdown, rowCount: rows.length, colCount: expectedCols || rows[0]?.length || 0 });
359
- }
360
- }
361
- }
362
- catch (err) {
363
- console.error('Error extracting tables from Word:', err.message);
364
- }
365
- return tables;
366
- }
367
- /**
368
- * Convert array of rows (each row is array of cell strings) to markdown pipe table
369
- */
370
- function convertRowsToMarkdownTable(rows) {
371
- if (rows.length === 0)
372
- return '';
373
- // Normalize column count (use max across all rows)
374
- const colCount = Math.max(...rows.map((r) => r.length));
375
- // Pad rows to have consistent column count
376
- const normalizedRows = rows.map((row) => {
377
- while (row.length < colCount) {
378
- row.push('');
379
- }
380
- return row;
381
- });
382
- // Build markdown table
383
- const lines = [];
384
- // Header row
385
- const header = normalizedRows[0];
386
- lines.push('| ' + header.join(' | ') + ' |');
387
- // Separator row
388
- lines.push('|' + header.map(() => '---').join('|') + '|');
389
- // Data rows
390
- for (let i = 1; i < normalizedRows.length; i++) {
391
- lines.push('| ' + normalizedRows[i].join(' | ') + ' |');
392
- }
393
- return lines.join('\n');
394
- }
395
- /**
396
- * Extract text from Word document using pandoc with track changes preserved
397
- */
398
- export async function extractFromWord(docxPath, options = {}) {
399
- let text;
400
- let messages = [];
401
- let extractedMedia = [];
402
- let hasTrackChanges = false;
403
- let trackChangeStats = { insertions: 0, deletions: 0 };
404
- // Determine media extraction directory
405
- const docxDir = path.dirname(docxPath);
406
- const mediaDir = options.mediaDir || path.join(docxDir, 'media');
407
- // Skip media extraction if figures already exist (e.g., when re-importing with existing source)
408
- const skipMediaExtraction = options.skipMediaExtraction || false;
409
- // Extract tables directly from Word XML (reliable, no heuristics)
410
- const wordTables = await extractWordTables(docxPath);
411
- // Try pandoc first with --track-changes=all to preserve reviewer edits
412
- try {
413
- // Build pandoc command
414
- let pandocCmd = `pandoc "${docxPath}" -t markdown --wrap=none --track-changes=all`;
415
- if (!skipMediaExtraction) {
416
- pandocCmd += ` --extract-media="${mediaDir}"`;
417
- }
418
- const { stdout } = await execAsync(pandocCmd, { maxBuffer: 50 * 1024 * 1024 });
419
- text = stdout;
420
- // Convert pandoc's track change format to CriticMarkup
421
- const origLength = text.length;
422
- // Use a more robust pattern that handles nested content
423
- text = text.replace(/\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]\{\.insertion[^}]*\}/g, (match, content) => {
424
- if (content.trim()) {
425
- trackChangeStats.insertions++;
426
- return `{++${content}++}`;
427
- }
428
- return ''; // Empty insertions are removed
429
- });
430
- text = text.replace(/\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]\{\.deletion[^}]*\}/g, (match, content) => {
431
- if (content.trim()) {
432
- trackChangeStats.deletions++;
433
- return `{--${content}--}`;
434
- }
435
- return ''; // Empty deletions are removed
436
- });
437
- // Handle any remaining pandoc track change patterns
438
- let prevText;
439
- do {
440
- prevText = text;
441
- text = text.replace(/\[([^\]]*)\]\{\.insertion[^}]*\}/g, (match, content) => {
442
- if (content.trim()) {
443
- trackChangeStats.insertions++;
444
- return `{++${content}++}`;
445
- }
446
- return '';
447
- });
448
- text = text.replace(/\[([^\]]*)\]\{\.deletion[^}]*\}/g, (match, content) => {
449
- if (content.trim()) {
450
- trackChangeStats.deletions++;
451
- return `{--${content}--}`;
452
- }
453
- return '';
454
- });
455
- } while (text !== prevText);
456
- // Handle pandoc comment patterns - remove comment text from body
457
- text = text.replace(/\[[^\]]*\]\{\.comment-start[^}]*\}/g, '');
458
- text = text.replace(/\[\]\{\.comment-end[^}]*\}/g, '');
459
- // Also handle {.mark} spans
460
- text = text.replace(/\[([^\]]*)\]\{\.mark\}/g, '$1');
461
- hasTrackChanges = trackChangeStats.insertions > 0 || trackChangeStats.deletions > 0;
462
- if (hasTrackChanges) {
463
- messages.push({
464
- type: 'info',
465
- message: `Found ${trackChangeStats.insertions} insertion(s) and ${trackChangeStats.deletions} deletion(s) from track changes`
466
- });
467
- }
468
- // Find extracted media files
469
- const mediaSubdir = path.join(mediaDir, 'media');
470
- if (fs.existsSync(mediaSubdir)) {
471
- extractedMedia = fs.readdirSync(mediaSubdir)
472
- .filter(f => /\.(png|jpg|jpeg|gif|svg|emf|wmf|tiff?)$/i.test(f))
473
- .map(f => path.join(mediaSubdir, f));
474
- if (extractedMedia.length > 0) {
475
- messages.push({
476
- type: 'info',
477
- message: `Extracted ${extractedMedia.length} image(s) to ${mediaSubdir}`
478
- });
479
- }
480
- }
481
- }
482
- catch (pandocErr) {
483
- // Pandoc not available — use XML-based extraction with track change support
484
- const { extractPlainTextWithTrackChanges } = await import('./word.js');
485
- const { getInstallInstructions } = await import('./dependencies.js');
486
- const installCmd = getInstallInstructions('pandoc');
487
- const xmlResult = await extractPlainTextWithTrackChanges(docxPath);
488
- text = xmlResult.text;
489
- hasTrackChanges = xmlResult.hasTrackChanges;
490
- trackChangeStats = xmlResult.stats;
491
- if (hasTrackChanges) {
492
- messages.push({
493
- type: 'warning',
494
- message: `Pandoc not installed. Using built-in XML extractor (${trackChangeStats.insertions} insertions, ${trackChangeStats.deletions} deletions preserved). Formatting may differ. Install pandoc for best results: ${installCmd}`
495
- });
496
- }
497
- else {
498
- messages.push({
499
- type: 'warning',
500
- message: `Pandoc not installed. Using built-in XML extractor (no track changes found). Install pandoc for better formatting: ${installCmd}`
501
- });
502
- }
503
- }
504
- // Extract comments directly from docx XML
505
- const comments = await extractWordComments(docxPath);
506
- // Extract comment anchor texts
507
- const { anchors } = await extractCommentAnchors(docxPath);
508
- return {
509
- text,
510
- comments,
511
- anchors,
512
- messages,
513
- extractedMedia,
514
- tables: wordTables,
515
- hasTrackChanges,
516
- trackChangeStats,
517
- };
518
- }
519
24
  /**
520
25
  * Insert comments into markdown text based on anchor texts with context
521
26
  */
@@ -828,540 +333,6 @@ export function insertCommentsIntoMarkdown(markdown, comments, anchors, options
828
333
  }
829
334
  return result;
830
335
  }
831
/**
 * Normalize text for comparison, smoothing over whitespace-only differences.
 * CRLF line endings become LF, tabs become spaces, runs of spaces collapse
 * to one, and leading/trailing whitespace is trimmed. Newlines themselves
 * are preserved (only space runs are collapsed).
 */
function normalizeWhitespace(text) {
    const unixEndings = text.replace(/\r\n/g, '\n');
    const noTabs = unixEndings.replace(/\t/g, ' ');
    const singleSpaced = noTabs.replace(/ +/g, ' ');
    return singleSpaced.trim();
}
841
/**
 * Fix citation and math annotations in CriticMarkup-annotated text by
 * stripping spurious diff markup around markdown math (`$...$`, `$$...$$`)
 * and pandoc citations (`[@key]`), and removing insertions that are merely
 * Word-rendered citation text (e.g. "(Smith 2020)").
 *
 * The regex passes are order-sensitive and applied sequentially.
 *
 * @param {string} text - CriticMarkup-annotated text to clean.
 * @param {string} originalMd - Original markdown source. Currently unused
 *   (retained for interface compatibility; a dead citation-extraction pass
 *   that read it was removed).
 * @returns {string} Cleaned text.
 */
function fixCitationAnnotations(text, originalMd) {
    // Math wrapped in delete/substitute annotations: keep the original math.
    text = text.replace(/\{--(\$[^$]+\$)--\}/g, '$1');
    text = text.replace(/\{--(\$\$[^$]+\$\$)--\}/g, '$1');
    text = text.replace(/\{~~(\$[^$]+\$)~>[^~]+~~\}/g, '$1');
    text = text.replace(/\{~~(\$\$[^$]+\$\$)~>[^~]+~~\}/g, '$1');
    // Substitutions whose left side is exactly a markdown citation: keep it.
    text = text.replace(/\{~~(\[@[^\]]+\])~>[^~]+~~\}/g, '$1');
    // Substitutions whose left side STARTS with a markdown citation: keep the
    // citation and re-emit any genuine text change after it.
    text = text.replace(/\{~~(\[@[^\]]+\])\s*([^~]*)~>([^~]*)~~\}/g, (match, cite, oldText, newText) => {
        if (oldText.trim() === '' && newText.trim() === '') {
            return cite;
        }
        if (oldText.trim() || newText.trim()) {
            return cite + (oldText.trim() !== newText.trim() ? ` {~~${oldText.trim()}~>${newText.trim()}~~}` : ` ${newText}`);
        }
        return cite;
    });
    // Deletions of markdown citations: restore the citation.
    text = text.replace(/\{--(\[@[^\]]+\])--\}/g, '$1');
    // Insertions of rendered citations like "(Author 2020)": drop them — the
    // markdown citation elsewhere already renders this text.
    text = text.replace(/\{\+\+\([A-Z][^)]*\d{4}[^)]*\)\+\+\}/g, '');
    // Broken multi-part substitutions around bare "@Author2020" keys.
    text = text.replace(/\{~~(@[A-Za-z]+\d{4})~>[^~]+~~\}/g, '[$1]');
    // Citations split across substitution boundaries.
    text = text.replace(/\{~~\[@~>[^~]*~~\}([A-Za-z]+\d{4})\]/g, '[@$1]');
    // Remaining partial citations after a semicolon separator.
    text = text.replace(/\{~~;\s*@([A-Za-z]+\d{4})\]~>[^~]*~~\}/g, '; [@$1]');
    // Rendered citation insertions, Unicode-aware author names (needs /u).
    text = text.replace(/\{\+\+\(\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\+\+\}/gu, '');
    text = text.replace(/\{\+\+\(\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');
    // Trailing citation fragments (year lists ending in a close-paren).
    text = text.replace(/\{\+\+\d{4}[a-z]?(?:[;,]\s*(?:\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+)?\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');
    text = text.replace(/\{\+\+\d{4}[a-z]?(?:[;,]\s*(?:\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+)?\d{4}[a-z]?)*\)\s*\+\+\}/gu, '');
    // A lone year plus closing paren.
    text = text.replace(/\{\+\+\d{4}[a-z]?\)\.\s*\+\+\}/g, '');
    text = text.replace(/\{\+\+\d{4}[a-z]?\)\s*\+\+\}/g, '');
    // Leading citation fragments (author name with optional open-paren).
    text = text.replace(/\{\+\+\(?\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s*\+\+\}/gu, '');
    // Semicolon-separated fragments.
    text = text.replace(/\{\+\+[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?\+\+\}/gu, '');
    // Year ranges with authors.
    text = text.replace(/\{\+\+\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\s*\+\+\}/gu, '');
    text = text.replace(/\{\+\+\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');
    // Collapse double spaces and orphaned punctuation left by removals.
    text = text.replace(/ +/g, ' ');
    text = text.replace(/\s+\./g, '.');
    text = text.replace(/\s+,/g, ',');
    // Final cleanup: remove empty annotations.
    text = text.replace(/\{~~\s*~>\s*~~\}/g, '');
    text = text.replace(/\{\+\+\s*\+\+\}/g, '');
    text = text.replace(/\{--\s*--\}/g, '');
    return text;
}
901
/**
 * Strip markdown syntax to get plain text: removes YAML front matter,
 * heading markers, emphasis, links (keeping link text), images, code,
 * blockquote markers, horizontal rules, list markers, and table pipes.
 * Passes run in a fixed order; each feeds the next.
 */
function stripMarkdownSyntax(md) {
    const passes = [
        [/^---[\s\S]*?---\n*/m, ''],        // YAML front matter
        [/^#{1,6}\s+/gm, ''],               // heading markers
        [/(\*\*|__)(.*?)\1/g, '$2'],        // bold -> text
        [/(\*|_)(.*?)\1/g, '$2'],           // italic -> text
        [/\[([^\]]+)\]\([^)]+\)/g, '$1'],   // link -> link text
        [/!\[([^\]]*)\]\([^)]+\)/g, ''],    // images removed
        [/`([^`]+)`/g, '$1'],               // inline code -> text
        [/```[\s\S]*?```/g, ''],            // fenced code removed
        [/^>\s*/gm, ''],                    // blockquote markers
        [/^[-*_]{3,}\s*$/gm, ''],           // horizontal rules
        [/^[\s]*[-*+]\s+/gm, ''],           // bullet markers
        [/^[\s]*\d+\.\s+/gm, ''],           // ordered-list markers
        [/\|/g, ' '],                       // table pipes -> spaces
        [/^[-:]+$/gm, ''],                  // table separator rows
        [/\n{3,}/g, '\n\n'],                // squeeze blank runs
    ];
    let plain = md;
    for (const [pattern, replacement] of passes) {
        plain = plain.replace(pattern, replacement);
    }
    return plain.trim();
}
923
/**
 * Generate annotated markdown by diffing the original markdown against the
 * Word-extracted text. Additions become {++...++}, removals {--...--};
 * unchanged runs pass through. Both inputs are whitespace-normalized first.
 *
 * @param {string} originalMd - Original markdown text.
 * @param {string} wordText - Text extracted from the Word document.
 * @param {string} [author='Reviewer'] - Reserved; not used by the diff itself.
 * @returns {string} CriticMarkup-annotated text.
 */
export function generateAnnotatedDiff(originalMd, wordText, author = 'Reviewer') {
    const leftSide = normalizeWhitespace(originalMd);
    const rightSide = normalizeWhitespace(wordText);
    const pieces = [];
    for (const part of diffWords(leftSide, rightSide)) {
        if (part.added) {
            pieces.push(`{++${part.value}++}`);
        } else if (part.removed) {
            pieces.push(`{--${part.value}--}`);
        } else {
            pieces.push(part.value);
        }
    }
    return pieces.join('');
}
944
/**
 * Inject Word tables (extracted from XML) into pandoc text output.
 * For each table, the flattened cell text in the pandoc output is located by
 * searching for the table's first header cell and its final cell, then the
 * enclosing paragraph region is replaced by the table's markdown.
 * Tables whose cells cannot be located are left alone.
 */
function injectWordTables(pandocText, wordTables) {
    if (!wordTables || wordTables.length === 0) {
        return pandocText;
    }
    let output = pandocText;
    for (const table of wordTables) {
        const rows = table.markdown.split('\n');
        const headCells = rows[0]
            .split('|')
            .map((c) => c.trim())
            .filter((c) => c.length > 0);
        if (headCells.length === 0) {
            continue;
        }
        // Anchor the search on the first header cell's text.
        const anchorCell = headCells[0];
        const startIdx = output.indexOf(anchorCell);
        if (startIdx === -1) {
            continue;
        }
        const tailCells = rows[rows.length - 1]
            .split('|')
            .map((c) => c.trim())
            .filter((c) => c.length > 0);
        const tailCell = tailCells[tailCells.length - 1] || tailCells[0];
        const endIdx = output.indexOf(tailCell, startIdx);
        if (endIdx === -1) {
            continue;
        }
        // Expand to paragraph boundaries (blank lines) around the match.
        let regionStart = output.lastIndexOf('\n\n', startIdx);
        regionStart = regionStart === -1 ? 0 : regionStart + 2;
        let regionEnd = output.indexOf('\n\n', endIdx + tailCell.length);
        if (regionEnd === -1) {
            regionEnd = output.length;
        }
        output = output.slice(0, regionStart) + table.markdown + '\n\n' + output.slice(regionEnd);
    }
    return output;
}
985
/**
 * Smart paragraph-level diff that preserves markdown structure.
 *
 * Pipeline: (1) inject Word-XML tables into the pandoc text; (2) "protect"
 * structured markdown (tables, images, anchors, cross-references, math,
 * citations) by swapping it for placeholders via the protect* helpers;
 * (3) align paragraphs between the two texts by word overlap; (4) word-diff
 * mismatched paragraph pairs into CriticMarkup; (5) restore all protected
 * content in reverse order.
 *
 * @param {string} originalMd - Original markdown source.
 * @param {string} wordText - Text extracted from the Word document.
 * @param {string} [author='Reviewer'] - Reserved; not read in this body.
 * @param {object} [options] - { wordTables, imageRegistry } — shapes assumed
 *   compatible with injectWordTables/protectImages; confirm at call sites.
 * @returns {string} CriticMarkup-annotated markdown.
 */
export function generateSmartDiff(originalMd, wordText, author = 'Reviewer', options = {}) {
    const { wordTables = [], imageRegistry = null } = options;
    // Inject Word tables into pandoc output so table text is comparable.
    let wordTextWithTables = injectWordTables(wordText, wordTables);
    // Protect markdown tables on both sides so the diff never splits them.
    const { text: mdWithTablesProtected, tables } = protectTables(originalMd);
    const { text: wordWithTablesProtected, tables: wordTableBlocks } = protectTables(wordTextWithTables);
    // Protect images on both sides.
    const { text: mdWithImagesProtected, images: origImages } = protectImages(mdWithTablesProtected, imageRegistry);
    const { text: wordWithImagesProtected, images: wordImages } = protectImages(wordWithTablesProtected, imageRegistry);
    // Map Word-side image placeholders onto the original-side placeholders so
    // identical images do not show up as delete+insert pairs.
    const imageMapping = matchWordImagesToOriginal(origImages, wordImages, imageRegistry);
    let wordWithMappedImages = wordWithImagesProtected;
    for (const [wordPlaceholder, origPlaceholder] of imageMapping) {
        // split/join = global literal replacement without regex escaping.
        wordWithMappedImages = wordWithMappedImages.split(wordPlaceholder).join(origPlaceholder);
    }
    // Remaining protections apply to the original side only.
    const { text: mdWithAnchorsProtected, anchors: figAnchors } = protectAnchors(mdWithImagesProtected);
    const { text: mdWithXrefsProtected, crossrefs } = protectCrossrefs(mdWithAnchorsProtected);
    const { text: mdWithMathProtected, mathBlocks } = protectMath(mdWithXrefsProtected);
    const { text: mdProtected, citations } = protectCitations(mdWithMathProtected);
    // On the Word side, replace rendered math/citations with the matching
    // placeholders so they compare equal to the protected original.
    let wordProtected = wordWithMappedImages;
    wordProtected = replaceRenderedMath(wordProtected, mathBlocks);
    wordProtected = replaceRenderedCitations(wordProtected, citations.length);
    // Paragraphs are blank-line-separated blocks.
    const originalParas = mdProtected.split(/\n\n+/);
    const wordParas = wordProtected.split(/\n\n+/);
    const result = [];
    // wordIdx advances monotonically: matches are only sought at or after it,
    // so paragraph order is preserved.
    let wordIdx = 0;
    for (let i = 0; i < originalParas.length; i++) {
        const orig = originalParas[i] || '';
        // Split off any markdown prefix (e.g. heading markers) of the first line.
        const { prefix: mdPrefix, content: origContent } = extractMarkdownPrefix(orig.split('\n')[0]);
        // Score the next few (up to 3) Word paragraphs by shared-word ratio.
        let bestMatch = -1;
        let bestScore = 0;
        for (let j = wordIdx; j < Math.min(wordIdx + 3, wordParas.length); j++) {
            const wordPara = wordParas[j] || '';
            const origWords = new Set(origContent.toLowerCase().split(/\s+/));
            const wordWords = wordPara.toLowerCase().split(/\s+/);
            const common = wordWords.filter((w) => origWords.has(w)).length;
            const score = common / Math.max(origWords.size, wordWords.length);
            // 0.3 overlap threshold gates what counts as "the same paragraph".
            if (score > bestScore && score > 0.3) {
                bestScore = score;
                bestMatch = j;
            }
        }
        if (bestMatch === -1) {
            // Fallback for prefixed (e.g. heading) paragraphs: accept the next
            // Word paragraph if it contains the first 20 chars of the content.
            if (mdPrefix && wordIdx < wordParas.length) {
                const wordPara = wordParas[wordIdx];
                if (wordPara.toLowerCase().includes(origContent.toLowerCase().slice(0, 20))) {
                    bestMatch = wordIdx;
                }
            }
        }
        if (bestMatch >= 0) {
            const word = wordParas[bestMatch];
            const origStripped = stripMarkdownSyntax(orig);
            const wordNormalized = normalizeWhitespace(word);
            if (origStripped === wordNormalized) {
                // Identical content: keep the original (with its markdown intact).
                result.push(orig);
            }
            else {
                // Word-level diff; reattach the markdown prefix to the result.
                const changes = diffWords(origStripped, wordNormalized);
                let annotated = mdPrefix;
                for (const part of changes) {
                    if (part.added) {
                        annotated += `{++${part.value}++}`;
                    }
                    else if (part.removed) {
                        annotated += `{--${part.value}--}`;
                    }
                    else {
                        annotated += part.value;
                    }
                }
                result.push(annotated);
            }
            wordIdx = bestMatch + 1;
        }
        else {
            // No match: paragraph was deleted — except headings, which are kept
            // verbatim (deleting structure markers would break the document).
            if (mdPrefix && mdPrefix.match(/^#{1,6}\s+/)) {
                result.push(orig);
            }
            else {
                result.push(`{--${orig}--}`);
            }
        }
    }
    // Any Word paragraphs never matched are insertions.
    for (let j = wordIdx; j < wordParas.length; j++) {
        const word = wordParas[j];
        if (word.trim()) {
            result.push(`{++${word}++}`);
        }
    }
    // Restore protected content, innermost-first (reverse of protection order).
    let finalResult = result.join('\n\n');
    finalResult = restoreCitations(finalResult, citations);
    finalResult = restoreMath(finalResult, mathBlocks);
    finalResult = restoreCrossrefs(finalResult, crossrefs);
    finalResult = restoreAnchors(finalResult, figAnchors);
    finalResult = restoreImages(finalResult, origImages);
    finalResult = restoreImages(finalResult, wordImages);
    finalResult = restoreTables(finalResult, tables);
    finalResult = restoreTables(finalResult, wordTableBlocks);
    return finalResult;
}
1103
/**
 * Clean up redundant adjacent CriticMarkup annotations: merge adjacent
 * delete+insert pairs into substitutions, repair malformed/split markers,
 * drop empty annotations, and collapse repeated spaces in prose while
 * leaving table formatting (grid/pipe tables) untouched.
 */
export function cleanupAnnotations(text) {
    // Adjacent delete+insert (either order) becomes one substitution.
    text = text.replace(/\{--(.+?)--\}\s*\{\+\+(.+?)\+\+\}/g, '{~~$1~>$2~~}');
    text = text.replace(/\{\+\+(.+?)\+\+\}\s*\{--(.+?)--\}/g, '{~~$2~>$1~~}');
    // Repair malformed / half-split substitution markers.
    text = text.replace(/\{--([^}]+?)~>([^}]+?)~~\}/g, '{~~$1~>$2~~}');
    text = text.replace(/\{~~([^~]+)\s*--\}/g, '{--$1--}');
    text = text.replace(/\{\+\+([^+]+)~~\}/g, '{++$1++}');
    // Remove annotations with no content.
    text = text.replace(/\{--\s*--\}/g, '');
    text = text.replace(/\{\+\+\s*\+\+\}/g, '');
    // Collapse double spaces line by line, skipping anything table-shaped.
    const allLines = text.split('\n');
    let insideTable = false;
    const cleaned = allLines.map((line, lineNo) => {
        // Grid-table separator row ("----  ----"): enter table mode, keep as-is.
        if (/^[-]+(\s+[-]+)+\s*$/.test(line.trim())) {
            insideTable = true;
            return line;
        }
        if (insideTable) {
            if (line.trim() !== '') {
                return line;
            }
            // Blank line inside a table: look ahead (up to 20 lines) to decide
            // whether the table continues or has ended.
            let continues = false;
            let scan = lineNo + 1;
            while (scan < allLines.length && scan < lineNo + 20) {
                const ahead = allLines[scan].trim();
                if (ahead === '') {
                    scan++;
                    continue;
                }
                if (/^[-]+(\s+[-]+)+\s*$/.test(ahead)) {   // closing separator
                    continues = true;
                    break;
                }
                if (/\S+\s{2,}\S+/.test(ahead)) {          // another table row
                    continues = true;
                    break;
                }
                if (/^\*[^*]+\*\s*$/.test(ahead)) {        // italic caption line
                    continues = true;
                    break;
                }
                if (allLines[scan].startsWith(' ')) {      // indented continuation
                    scan++;
                    continue;
                }
                break;
            }
            if (!continues) {
                insideTable = false;
            }
            return line;
        }
        // A row-looking line immediately above a separator is a table header.
        if (/\S+\s{2,}\S+/.test(line)) {
            let probe = lineNo + 1;
            while (probe < allLines.length && allLines[probe].trim() === '') {
                probe++;
            }
            if (probe < allLines.length && /^[-]+(\s+[-]+)+\s*$/.test(allLines[probe].trim())) {
                return line;
            }
        }
        // Pipe-table rows are preserved verbatim.
        if (line.trim().startsWith('|')) {
            return line;
        }
        // Ordinary prose: collapse space runs.
        return line.replace(/ +/g, ' ');
    });
    return cleaned.join('\n');
}
1185
/**
 * Parse visible comment markers of the form "[Author: comment text]" from
 * Word-extracted text.
 *
 * @param {string} text - Text possibly containing visible comment markers.
 * @returns {{author: string, text: string, position: number}[]} One entry
 *   per marker, with `position` being the marker's character offset.
 */
export function parseVisibleComments(text) {
    const found = [];
    for (const m of text.matchAll(/\[([^\]:]+):\s*([^\]]+)\]/g)) {
        found.push({
            author: m[1].trim(),
            text: m[2].trim(),
            position: m.index,
        });
    }
    return found;
}
1201
/**
 * Convert visible "[Author: comment]" markers into CriticMarkup comment
 * syntax "{>>Author: comment<<}".
 */
export function convertVisibleComments(text) {
    const visibleMarker = /\[([^\]:]+):\s*([^\]]+)\]/g;
    return text.replace(visibleMarker, '{>>$1: $2<<}');
}
1207
/**
 * Restore pandoc-crossref figure/table references from Word-rendered format
 * (e.g. "Figure 3", "[Figure]{.mark} [3]{.mark}") back to "@fig:label" /
 * "@tbl:label" form, using the project's image registry for number→label
 * lookups.
 *
 * @param {string} text - Word-extracted text to rewrite.
 * @param {string} projectDir - Project directory; passed to readImageRegistry.
 * @param {Set<string>|null} [restoredLabels] - Labels already restored in an
 *   earlier pass (shared across calls to suppress duplicate anchors);
 *   created fresh when null.
 * @returns {{text: string, restored: number, messages: string[], restoredLabels: Set<string>}}
 */
export function restoreCrossrefFromWord(text, projectDir, restoredLabels = null) {
    const messages = [];
    let restored = 0;
    let result = text;
    // Registry shape assumed: byNumber is a Map keyed "fig:N"/"tbl:N" with
    // entries carrying label/caption/path — confirm against readImageRegistry.
    const registry = readImageRegistry(projectDir);
    if (!restoredLabels) {
        restoredLabels = new Set();
    }
    // Pattern 1: highlighted "[Figure]{.mark} [N]{.mark}" spans.
    result = result.replace(/\[(Figure|Table|Fig\.?)\]\{\.mark\}\s*\[(\d+|S\d+)\]\{\.mark\}/gi, (match, type, num) => {
        const prefix = type.toLowerCase().startsWith('tab') ? 'tbl' : 'fig';
        if (registry) {
            const entry = registry.byNumber?.get(`${prefix}:${num}`);
            if (entry && entry.label) {
                restored++;
                return `@${prefix}:${entry.label}`;
            }
        }
        restored++;
        messages.push(`Restored ${type} ${num} (no label found, using placeholder)`);
        // NOTE(review): placeholder label is "fig<N>" even when prefix is
        // "tbl" — confirm whether table placeholders should read "tbl<N>".
        return `@${prefix}:fig${num}`;
    });
    // Pattern 2: plain "Figure N" / "Fig. N" in running text. The (?<!!)
    // lookbehind skips image syntax; the (?!\s*:) lookahead skips captions.
    // Unresolvable references are left untouched (no placeholder here).
    result = result.replace(/(?<!!)\b(Figure|Fig\.?|Table|Tbl\.?)\s+(\d+|S\d+)\b(?!\s*:)/gi, (match, type, num) => {
        const prefix = type.toLowerCase().startsWith('tab') ? 'tbl' : 'fig';
        if (registry) {
            const entry = registry.byNumber?.get(`${prefix}:${num}`);
            if (entry && entry.label) {
                restored++;
                return `@${prefix}:${entry.label}`;
            }
        }
        return match;
    });
    // Pattern 3: drop plain-text caption lines duplicated right below an image.
    result = result.replace(/(\!\[[^\]]+\]\([^)]+\)(?:\{[^}]*\})?)\s*\n+\s*(?:Figure|Fig\.?|Table|Tbl\.?)\s+\d+[:\.]?\s*[^\n]+/gi, '$1');
    // Pattern 4: image captions beginning "Figure N:" — rebuild the image from
    // the registry entry (caption, path, and #fig:/#tbl: anchor).
    result = result.replace(/!\[(Figure|Fig\.?|Table|Tbl\.?)\s+(\d+|S\d+)[:\.]?\s*([^\]]*)\]\(([^)]+)\)(?:\{[^}]*\})?/gi, (match, type, num, caption, imgPath) => {
        const prefix = type.toLowerCase().startsWith('tab') ? 'tbl' : 'fig';
        const labelKey = `${prefix}:${num}`;
        if (registry) {
            const entry = registry.byNumber?.get(labelKey);
            if (entry) {
                if (restoredLabels.has(labelKey)) {
                    // Anchor already emitted elsewhere — emit image without it
                    // to avoid duplicate anchors in the document.
                    messages.push(`Skipped duplicate ${prefix}:${entry.label} (already restored)`);
                    return `![${entry.caption}](${entry.path})`;
                }
                restoredLabels.add(labelKey);
                restored++;
                messages.push(`Restored image ${prefix}:${entry.label} from Figure ${num}`);
                return `![${entry.caption}](${entry.path}){#${prefix}:${entry.label}}`;
            }
        }
        // No registry entry: keep the image but strip the "Figure N:" prefix.
        const cleanCaption = caption.trim();
        return `![${cleanCaption}](${imgPath})`;
    });
    return { text: result, restored, messages, restoredLabels };
}
1268
/**
 * Restore proper markdown image syntax from Word-extracted text using the
 * project's image registry. Handles several Word-mangled forms: "@fig:label:"
 * caption lines, table-wrapped captions, standalone "Figure N:" lines, and
 * generic "media/" image paths matched back to registry entries by caption.
 *
 * @param {string} text - Word-extracted text to rewrite.
 * @param {string} projectDir - Project directory; passed to readImageRegistry.
 * @param {Set<string>|null} [restoredLabels] - Labels already restored by an
 *   earlier pass (shared to suppress duplicate anchors); created when null.
 * @returns {{text: string, restored: number, messages: string[], restoredLabels: Set<string>}}
 */
export function restoreImagesFromRegistry(text, projectDir, restoredLabels = null) {
    const messages = [];
    let restored = 0;
    if (!restoredLabels) {
        restoredLabels = new Set();
    }
    // Registry shape assumed: byLabel/byNumber/byCaption Maps with entries
    // carrying label/type/caption/path — confirm against readImageRegistry.
    const registry = readImageRegistry(projectDir);
    if (!registry || !registry.figures || registry.figures.length === 0) {
        // Also return restoredLabels here, for consistency with the sibling
        // restoreCrossrefFromWord and the success path below.
        return { text, restored: 0, messages: ['No image registry found'], restoredLabels };
    }
    let result = text;
    // Caption-like patterns: [0] "@fig:label: caption", [1] "Figure N: caption"
    // standalone lines, [2] table-wrapped "@fig:label: caption" cells.
    const captionPatterns = [
        /@(fig|tbl):([a-zA-Z0-9_-]+):\s*([^\n]+)/gi,
        /^(Figure|Fig\.?)\s+(\d+|S\d+)[.:]\s*([^\n]+)/gim,
        /\|\s*@(fig|tbl):([a-zA-Z0-9_-]+):\s*([^|]+)\s*\|/gi,
    ];
    // Fix "@fig:label: caption" patterns via label lookup.
    result = result.replace(captionPatterns[0], (match, type, label, caption) => {
        const key = `${type}:${label}`;
        const entry = registry.byLabel.get(key);
        if (entry) {
            if (restoredLabels.has(key)) {
                // Anchor already emitted elsewhere — omit it to avoid duplicates.
                messages.push(`Skipped duplicate ${key} (already restored)`);
                return `![${entry.caption}](${entry.path})`;
            }
            restoredLabels.add(key);
            restored++;
            messages.push(`Restored ${type}:${label} from registry`);
            return `![${entry.caption}](${entry.path}){#${type}:${label}}`;
        }
        return match;
    });
    // Fix table-wrapped captions the same way.
    result = result.replace(captionPatterns[2], (match, type, label, caption) => {
        const key = `${type}:${label}`;
        const entry = registry.byLabel.get(key);
        if (entry) {
            if (restoredLabels.has(key)) {
                messages.push(`Skipped duplicate ${key} from table wrapper`);
                return `![${entry.caption}](${entry.path})`;
            }
            restoredLabels.add(key);
            restored++;
            messages.push(`Restored ${type}:${label} from table wrapper`);
            return `![${entry.caption}](${entry.path}){#${type}:${label}}`;
        }
        return match;
    });
    // Clean up empty single-cell table shells left behind by the unwrapping.
    result = result.replace(/\|\s*\|\s*\n\|:--:\|\s*\n/g, '');
    // Fix standalone "Figure N:" lines via number lookup.
    result = result.replace(captionPatterns[1], (match, prefix, num, caption) => {
        const numKey = `fig:${num}`;
        const entry = registry.byNumber.get(numKey);
        if (entry) {
            const labelKey = `fig:${entry.label}`;
            if (restoredLabels.has(labelKey)) {
                messages.push(`Skipped duplicate Figure ${num} (already restored)`);
                return `![${entry.caption}](${entry.path})`;
            }
            restoredLabels.add(labelKey);
            restored++;
            messages.push(`Restored Figure ${num} by number lookup`);
            return `![${entry.caption}](${entry.path}){#fig:${entry.label}}`;
        }
        return match;
    });
    // Fix generic "media/" paths by matching the first 50 chars of the caption.
    const genericImagePattern = /!\[([^\]]*)\]\(media\/[^)]+\)/g;
    result = result.replace(genericImagePattern, (match, caption) => {
        if (!caption || caption.trim() === '') {
            return match;
        }
        const captionKey = caption.slice(0, 50).toLowerCase().trim();
        const entry = registry.byCaption.get(captionKey);
        if (!entry) {
            return match;
        }
        const labelKey = entry.label ? `${entry.type}:${entry.label}` : null;
        if (labelKey && restoredLabels.has(labelKey)) {
            messages.push(`Skipped duplicate by caption match: ${captionKey.slice(0, 30)}...`);
            return `![${entry.caption}](${entry.path})`;
        }
        // BUG FIX: the anchor must be decided BEFORE labelKey is added to
        // restoredLabels. Previously the code added the key first and then
        // tested `!restoredLabels.has(labelKey)`, which was always false, so
        // the {#fig:label} anchor was never emitted on first restoration.
        const anchor = labelKey ? `{#${entry.type}:${entry.label}}` : '';
        if (labelKey) {
            restoredLabels.add(labelKey);
        }
        restored++;
        messages.push(`Restored image by caption match: ${captionKey.slice(0, 30)}...`);
        return `![${entry.caption}](${entry.path})${anchor}`;
    });
    return { text: result, restored, messages, restoredLabels };
}
1365
336
  /**
1366
337
  * Import Word document with track changes directly as CriticMarkup
1367
338
  */