docrev 0.9.4 → 0.9.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. package/dist/lib/commands/comments.d.ts.map +1 -1
  2. package/dist/lib/commands/comments.js +19 -27
  3. package/dist/lib/commands/comments.js.map +1 -1
  4. package/dist/lib/commands/context.d.ts +1 -0
  5. package/dist/lib/commands/context.d.ts.map +1 -1
  6. package/dist/lib/commands/context.js +1 -2
  7. package/dist/lib/commands/context.js.map +1 -1
  8. package/dist/lib/commands/file-ops.d.ts +11 -0
  9. package/dist/lib/commands/file-ops.d.ts.map +1 -0
  10. package/dist/lib/commands/file-ops.js +301 -0
  11. package/dist/lib/commands/file-ops.js.map +1 -0
  12. package/dist/lib/commands/index.d.ts +9 -1
  13. package/dist/lib/commands/index.d.ts.map +1 -1
  14. package/dist/lib/commands/index.js +17 -1
  15. package/dist/lib/commands/index.js.map +1 -1
  16. package/dist/lib/commands/merge-resolve.d.ts +12 -0
  17. package/dist/lib/commands/merge-resolve.d.ts.map +1 -0
  18. package/dist/lib/commands/merge-resolve.js +318 -0
  19. package/dist/lib/commands/merge-resolve.js.map +1 -0
  20. package/dist/lib/commands/preview.d.ts +11 -0
  21. package/dist/lib/commands/preview.d.ts.map +1 -0
  22. package/dist/lib/commands/preview.js +138 -0
  23. package/dist/lib/commands/preview.js.map +1 -0
  24. package/dist/lib/commands/project-info.d.ts +11 -0
  25. package/dist/lib/commands/project-info.d.ts.map +1 -0
  26. package/dist/lib/commands/project-info.js +187 -0
  27. package/dist/lib/commands/project-info.js.map +1 -0
  28. package/dist/lib/commands/quality.d.ts +11 -0
  29. package/dist/lib/commands/quality.d.ts.map +1 -0
  30. package/dist/lib/commands/quality.js +384 -0
  31. package/dist/lib/commands/quality.js.map +1 -0
  32. package/dist/lib/commands/sections.d.ts +3 -2
  33. package/dist/lib/commands/sections.d.ts.map +1 -1
  34. package/dist/lib/commands/sections.js +4 -723
  35. package/dist/lib/commands/sections.js.map +1 -1
  36. package/dist/lib/commands/sync.d.ts +11 -0
  37. package/dist/lib/commands/sync.d.ts.map +1 -0
  38. package/dist/lib/commands/sync.js +441 -0
  39. package/dist/lib/commands/sync.js.map +1 -0
  40. package/dist/lib/commands/text-ops.d.ts +11 -0
  41. package/dist/lib/commands/text-ops.d.ts.map +1 -0
  42. package/dist/lib/commands/text-ops.js +357 -0
  43. package/dist/lib/commands/text-ops.js.map +1 -0
  44. package/dist/lib/commands/utilities.d.ts +2 -4
  45. package/dist/lib/commands/utilities.d.ts.map +1 -1
  46. package/dist/lib/commands/utilities.js +3 -1605
  47. package/dist/lib/commands/utilities.js.map +1 -1
  48. package/dist/lib/commands/word-tools.d.ts +11 -0
  49. package/dist/lib/commands/word-tools.d.ts.map +1 -0
  50. package/dist/lib/commands/word-tools.js +272 -0
  51. package/dist/lib/commands/word-tools.js.map +1 -0
  52. package/dist/lib/comment-realign.d.ts.map +1 -1
  53. package/dist/lib/comment-realign.js +0 -7
  54. package/dist/lib/comment-realign.js.map +1 -1
  55. package/dist/lib/dependencies.d.ts.map +1 -1
  56. package/dist/lib/dependencies.js +11 -23
  57. package/dist/lib/dependencies.js.map +1 -1
  58. package/dist/lib/diff-engine.d.ts +25 -0
  59. package/dist/lib/diff-engine.d.ts.map +1 -0
  60. package/dist/lib/diff-engine.js +354 -0
  61. package/dist/lib/diff-engine.js.map +1 -0
  62. package/dist/lib/git.d.ts.map +1 -1
  63. package/dist/lib/git.js +18 -28
  64. package/dist/lib/git.js.map +1 -1
  65. package/dist/lib/import.d.ts +37 -117
  66. package/dist/lib/import.d.ts.map +1 -1
  67. package/dist/lib/import.js +10 -1039
  68. package/dist/lib/import.js.map +1 -1
  69. package/dist/lib/merge.d.ts.map +1 -1
  70. package/dist/lib/merge.js +29 -117
  71. package/dist/lib/merge.js.map +1 -1
  72. package/dist/lib/pdf-comments.d.ts.map +1 -1
  73. package/dist/lib/pdf-comments.js +1 -13
  74. package/dist/lib/pdf-comments.js.map +1 -1
  75. package/dist/lib/pptx-themes.d.ts.map +1 -1
  76. package/dist/lib/pptx-themes.js +0 -403
  77. package/dist/lib/pptx-themes.js.map +1 -1
  78. package/dist/lib/protect-restore.d.ts.map +1 -1
  79. package/dist/lib/protect-restore.js +34 -36
  80. package/dist/lib/protect-restore.js.map +1 -1
  81. package/dist/lib/restore-references.d.ts +35 -0
  82. package/dist/lib/restore-references.d.ts.map +1 -0
  83. package/dist/lib/restore-references.js +188 -0
  84. package/dist/lib/restore-references.js.map +1 -0
  85. package/dist/lib/slides.d.ts.map +1 -1
  86. package/dist/lib/slides.js +0 -35
  87. package/dist/lib/slides.js.map +1 -1
  88. package/dist/lib/trackchanges.d.ts.map +1 -1
  89. package/dist/lib/trackchanges.js +1 -11
  90. package/dist/lib/trackchanges.js.map +1 -1
  91. package/dist/lib/tui.d.ts +36 -45
  92. package/dist/lib/tui.d.ts.map +1 -1
  93. package/dist/lib/tui.js +92 -108
  94. package/dist/lib/tui.js.map +1 -1
  95. package/dist/lib/undo.d.ts +3 -4
  96. package/dist/lib/undo.d.ts.map +1 -1
  97. package/dist/lib/undo.js +0 -7
  98. package/dist/lib/undo.js.map +1 -1
  99. package/dist/lib/utils.d.ts +12 -0
  100. package/dist/lib/utils.d.ts.map +1 -1
  101. package/dist/lib/utils.js +26 -0
  102. package/dist/lib/utils.js.map +1 -1
  103. package/dist/lib/word-extraction.d.ts +77 -0
  104. package/dist/lib/word-extraction.d.ts.map +1 -0
  105. package/dist/lib/word-extraction.js +515 -0
  106. package/dist/lib/word-extraction.js.map +1 -0
  107. package/dist/lib/wordcomments.d.ts.map +1 -1
  108. package/dist/lib/wordcomments.js +1 -8
  109. package/dist/lib/wordcomments.js.map +1 -1
  110. package/dist/package.json +137 -0
  111. package/lib/commands/comments.ts +20 -25
  112. package/lib/commands/context.ts +1 -2
  113. package/lib/commands/file-ops.ts +372 -0
  114. package/lib/commands/index.ts +24 -0
  115. package/lib/commands/merge-resolve.ts +378 -0
  116. package/lib/commands/preview.ts +178 -0
  117. package/lib/commands/project-info.ts +244 -0
  118. package/lib/commands/quality.ts +517 -0
  119. package/lib/commands/sections.ts +3 -857
  120. package/lib/commands/sync.ts +536 -0
  121. package/lib/commands/text-ops.ts +449 -0
  122. package/lib/commands/utilities.ts +62 -2066
  123. package/lib/commands/word-tools.ts +340 -0
  124. package/lib/comment-realign.ts +0 -8
  125. package/lib/dependencies.ts +12 -20
  126. package/lib/diff-engine.ts +465 -0
  127. package/lib/git.ts +24 -31
  128. package/lib/import.ts +78 -1348
  129. package/lib/merge.ts +42 -132
  130. package/lib/pdf-comments.ts +2 -14
  131. package/lib/pptx-themes.ts +0 -413
  132. package/lib/protect-restore.ts +48 -44
  133. package/lib/restore-references.ts +240 -0
  134. package/lib/slides.ts +0 -37
  135. package/lib/trackchanges.ts +1 -12
  136. package/lib/{tui.js → tui.ts} +139 -126
  137. package/lib/undo.ts +3 -12
  138. package/lib/utils.ts +28 -0
  139. package/lib/word-extraction.ts +666 -0
  140. package/lib/wordcomments.ts +1 -9
  141. package/package.json +1 -1
package/lib/import.ts CHANGED
@@ -1,107 +1,94 @@
1
1
  /**
2
2
  * Import functionality - convert Word docs to annotated Markdown
3
+ *
4
+ * Orchestration workflows + re-exports from extraction/diff/restore modules
3
5
  */
4
6
 
5
7
  import * as fs from 'fs';
6
8
  import * as path from 'path';
7
- import { diffWords, Change } from 'diff';
8
9
  import { stripAnnotations } from './annotations.js';
9
10
  import { readImageRegistry } from './image-registry.js';
10
11
  import { exec } from 'child_process';
11
12
  import { promisify } from 'util';
13
+
14
+ // Import from split modules
15
+ import {
16
+ extractFromWord,
17
+ extractWordComments,
18
+ extractCommentAnchors,
19
+ extractWordTables,
20
+ } from './word-extraction.js';
21
+ import type {
22
+ WordComment,
23
+ CommentAnchorData,
24
+ WordTable,
25
+ ExtractFromWordResult,
26
+ } from './word-extraction.js';
12
27
  import {
13
- extractMarkdownPrefix,
14
- protectAnchors,
15
- restoreAnchors,
16
- protectCrossrefs,
17
- restoreCrossrefs,
18
- simplifyMathForMatching,
19
- protectMath,
20
- restoreMath,
21
- replaceRenderedMath,
22
- protectCitations,
23
- restoreCitations,
24
- replaceRenderedCitations,
25
- protectImages,
26
- restoreImages,
27
- matchWordImagesToOriginal,
28
- protectTables,
29
- restoreTables,
30
- } from './protect-restore.js';
28
+ generateSmartDiff,
29
+ generateAnnotatedDiff,
30
+ cleanupAnnotations,
31
+ fixCitationAnnotations,
32
+ } from './diff-engine.js';
33
+ import {
34
+ restoreCrossrefFromWord,
35
+ restoreImagesFromRegistry,
36
+ parseVisibleComments,
37
+ convertVisibleComments,
38
+ } from './restore-references.js';
39
+
40
+ // Re-export everything so existing imports from './import.js' still work
41
+ export {
42
+ extractFromWord,
43
+ extractWordComments,
44
+ extractCommentAnchors,
45
+ extractWordTables,
46
+ } from './word-extraction.js';
47
+ export type {
48
+ WordComment,
49
+ TextNode,
50
+ CommentAnchorData,
51
+ CommentAnchorsResult,
52
+ WordTable,
53
+ ParsedRow,
54
+ ExtractFromWordOptions,
55
+ ExtractMessage,
56
+ ExtractFromWordResult,
57
+ } from './word-extraction.js';
58
+
59
+ export {
60
+ generateSmartDiff,
61
+ generateAnnotatedDiff,
62
+ cleanupAnnotations,
63
+ fixCitationAnnotations,
64
+ } from './diff-engine.js';
65
+ export type {
66
+ GenerateSmartDiffOptions,
67
+ } from './diff-engine.js';
68
+
69
+ export {
70
+ restoreCrossrefFromWord,
71
+ restoreImagesFromRegistry,
72
+ parseVisibleComments,
73
+ convertVisibleComments,
74
+ } from './restore-references.js';
75
+ export type {
76
+ RestoreCrossrefResult,
77
+ RestoreImagesResult,
78
+ } from './restore-references.js';
31
79
 
32
80
  const execAsync = promisify(exec);
33
81
 
34
82
  // ============================================
35
- // Type Definitions
83
+ // Type Definitions (orchestration-specific)
36
84
  // ============================================
37
85
 
38
- interface WordComment {
39
- id: string;
40
- author: string;
41
- date: string;
42
- text: string;
43
- }
44
-
45
- interface TextNode {
46
- xmlStart: number;
47
- xmlEnd: number;
48
- textStart: number;
49
- textEnd: number;
50
- text: string;
51
- }
52
-
53
- interface CommentAnchorData {
54
- anchor: string;
55
- before: string;
56
- after: string;
57
- docPosition: number;
58
- docLength: number;
59
- isEmpty: boolean;
60
- }
61
-
62
- interface CommentAnchorsResult {
63
- anchors: Map<string, CommentAnchorData>;
64
- fullDocText: string;
65
- }
66
-
67
- interface WordTable {
68
- markdown: string;
69
- rowCount: number;
70
- colCount: number;
71
- }
72
-
73
- interface ParsedRow {
74
- cells: string[];
75
- colSpans: number[];
76
- }
77
-
78
- interface ExtractFromWordOptions {
79
- mediaDir?: string;
80
- skipMediaExtraction?: boolean;
81
- }
82
-
83
- interface ExtractMessage {
84
- type: 'info' | 'warning';
85
- message: string;
86
- }
87
-
88
- interface ExtractFromWordResult {
89
- text: string;
90
- comments: WordComment[];
91
- anchors: Map<string, CommentAnchorData>;
92
- messages: ExtractMessage[];
93
- extractedMedia: string[];
94
- tables: WordTable[];
95
- hasTrackChanges: boolean;
96
- trackChangeStats: { insertions: number; deletions: number };
97
- }
98
-
99
- interface InsertCommentsOptions {
86
+ export interface InsertCommentsOptions {
100
87
  quiet?: boolean;
101
88
  sectionBoundary?: { start: number; end: number } | null;
102
89
  }
103
90
 
104
- interface CommentWithPos {
91
+ export interface CommentWithPos {
105
92
  id: string;
106
93
  author: string;
107
94
  text: string;
@@ -113,42 +100,24 @@ interface CommentWithPos {
113
100
  strategy?: string;
114
101
  }
115
102
 
116
- interface AnchorSearchResult {
103
+ export interface AnchorSearchResult {
117
104
  occurrences: number[];
118
105
  matchedAnchor: string | null;
119
106
  strategy: string;
120
107
  stripped?: boolean;
121
108
  }
122
109
 
123
- interface MarkdownPrefixResult {
110
+ export interface MarkdownPrefixResult {
124
111
  prefix: string;
125
112
  content: string;
126
113
  }
127
114
 
128
- interface GenerateSmartDiffOptions {
129
- wordTables?: WordTable[];
130
- imageRegistry?: any;
131
- }
132
-
133
- interface RestoreCrossrefResult {
134
- text: string;
135
- restored: number;
136
- messages: string[];
137
- restoredLabels: Set<string>;
138
- }
139
-
140
- interface RestoreImagesResult {
141
- text: string;
142
- restored: number;
143
- messages: string[];
144
- }
145
-
146
- interface ImportWordWithTrackChangesOptions {
115
+ export interface ImportWordWithTrackChangesOptions {
147
116
  mediaDir?: string;
148
117
  projectDir?: string;
149
118
  }
150
119
 
151
- interface ImportWordWithTrackChangesResult {
120
+ export interface ImportWordWithTrackChangesResult {
152
121
  text: string;
153
122
  stats: {
154
123
  insertions: number;
@@ -163,14 +132,14 @@ interface ImportWordWithTrackChangesResult {
163
132
  comments: WordComment[];
164
133
  }
165
134
 
166
- interface ImportFromWordOptions {
135
+ export interface ImportFromWordOptions {
167
136
  author?: string;
168
137
  sectionContent?: string;
169
138
  figuresDir?: string;
170
139
  wordTables?: WordTable[];
171
140
  }
172
141
 
173
- interface ImportFromWordResult {
142
+ export interface ImportFromWordResult {
174
143
  annotated: string;
175
144
  stats: {
176
145
  insertions: number;
@@ -182,13 +151,13 @@ interface ImportFromWordResult {
182
151
  extractedMedia: string[];
183
152
  }
184
153
 
185
- interface MovedFile {
154
+ export interface MovedFile {
186
155
  from: string;
187
156
  to: string;
188
157
  name: string;
189
158
  }
190
159
 
191
- interface MoveExtractedMediaResult {
160
+ export interface MoveExtractedMediaResult {
192
161
  moved: MovedFile[];
193
162
  errors: string[];
194
163
  }
@@ -197,593 +166,6 @@ interface MoveExtractedMediaResult {
197
166
  // Functions
198
167
  // ============================================
199
168
 
200
- /**
201
- * Extract comments directly from Word docx comments.xml
202
- */
203
- export async function extractWordComments(docxPath: string): Promise<WordComment[]> {
204
- const AdmZip = (await import('adm-zip')).default;
205
- const { parseStringPromise } = await import('xml2js');
206
-
207
- const comments: WordComment[] = [];
208
-
209
- // Validate file exists
210
- if (!fs.existsSync(docxPath)) {
211
- throw new Error(`File not found: ${docxPath}`);
212
- }
213
-
214
- try {
215
- let zip;
216
- try {
217
- zip = new AdmZip(docxPath);
218
- } catch (err: any) {
219
- throw new Error(`Invalid Word document (not a valid .docx file): ${err.message}`);
220
- }
221
-
222
- const commentsEntry = zip.getEntry('word/comments.xml');
223
-
224
- if (!commentsEntry) {
225
- return comments;
226
- }
227
-
228
- let commentsXml;
229
- try {
230
- commentsXml = commentsEntry.getData().toString('utf8');
231
- } catch (err: any) {
232
- throw new Error(`Failed to read comments from document: ${err.message}`);
233
- }
234
-
235
- const parsed = await parseStringPromise(commentsXml, { explicitArray: false });
236
-
237
- const ns = 'w:';
238
- const commentsRoot = parsed['w:comments'];
239
- if (!commentsRoot || !commentsRoot['w:comment']) {
240
- return comments;
241
- }
242
-
243
- // Ensure it's an array
244
- const commentNodes = Array.isArray(commentsRoot['w:comment'])
245
- ? commentsRoot['w:comment']
246
- : [commentsRoot['w:comment']];
247
-
248
- for (const comment of commentNodes) {
249
- const id = comment.$?.['w:id'] || '';
250
- const author = comment.$?.['w:author'] || 'Unknown';
251
- const date = comment.$?.['w:date'] || '';
252
-
253
- // Extract text from nested w:p/w:r/w:t elements
254
- let text = '';
255
- const extractText = (node: any): void => {
256
- if (!node) return;
257
- if (typeof node === 'string') {
258
- text += node;
259
- return;
260
- }
261
- if (node['w:t']) {
262
- const t = node['w:t'];
263
- text += typeof t === 'string' ? t : (t._ || t);
264
- }
265
- if (node['w:r']) {
266
- const runs = Array.isArray(node['w:r']) ? node['w:r'] : [node['w:r']];
267
- runs.forEach(extractText);
268
- }
269
- if (node['w:p']) {
270
- const paras = Array.isArray(node['w:p']) ? node['w:p'] : [node['w:p']];
271
- paras.forEach(extractText);
272
- }
273
- };
274
- extractText(comment);
275
-
276
- comments.push({ id, author, date: date.slice(0, 10), text: text.trim() });
277
- }
278
- } catch (err: any) {
279
- // Re-throw with more context if it's already an Error we created
280
- if (err.message.includes('Invalid Word document') || err.message.includes('File not found')) {
281
- throw err;
282
- }
283
- throw new Error(`Error extracting comments from ${path.basename(docxPath)}: ${err.message}`);
284
- }
285
-
286
- return comments;
287
- }
288
-
289
- /**
290
- * Extract comment anchor texts from document.xml with surrounding context
291
- * Returns map of comment ID -> {anchor, before, after, docPosition, isEmpty} for better matching
292
- * Also returns fullDocText for section boundary matching
293
- */
294
- export async function extractCommentAnchors(docxPath: string): Promise<CommentAnchorsResult> {
295
- const AdmZip = (await import('adm-zip')).default;
296
- const anchors = new Map<string, CommentAnchorData>();
297
- let fullDocText = '';
298
-
299
- try {
300
- const zip = new AdmZip(docxPath);
301
- const docEntry = zip.getEntry('word/document.xml');
302
-
303
- if (!docEntry) {
304
- return { anchors, fullDocText };
305
- }
306
-
307
- const docXml = docEntry.getData().toString('utf8');
308
-
309
- // ========================================
310
- // STEP 1: Build text position mapping
311
- // ========================================
312
- const textNodePattern = /<w:t[^>]*>([^<]*)<\/w:t>/g;
313
- const textNodes: TextNode[] = [];
314
- let textPosition = 0;
315
- let nodeMatch;
316
-
317
- while ((nodeMatch = textNodePattern.exec(docXml)) !== null) {
318
- const rawText = nodeMatch[1] ?? '';
319
- const decodedText = decodeXmlEntities(rawText);
320
- textNodes.push({
321
- xmlStart: nodeMatch.index,
322
- xmlEnd: nodeMatch.index + nodeMatch[0].length,
323
- textStart: textPosition,
324
- textEnd: textPosition + decodedText.length,
325
- text: decodedText
326
- });
327
- textPosition += decodedText.length;
328
- }
329
-
330
- fullDocText = textNodes.map(n => n.text).join('');
331
-
332
- // Helper: convert XML position to text position
333
- function xmlPosToTextPos(xmlPos: number): number {
334
- for (let i = 0; i < textNodes.length; i++) {
335
- const node = textNodes[i];
336
- if (!node) continue;
337
- if (xmlPos >= node.xmlStart && xmlPos < node.xmlEnd) {
338
- return node.textStart;
339
- }
340
- if (xmlPos < node.xmlStart) {
341
- return node.textStart;
342
- }
343
- }
344
- const lastNode = textNodes[textNodes.length - 1];
345
- return lastNode ? lastNode.textEnd : 0;
346
- }
347
-
348
- // Helper: extract context before a position
349
- function getContextBefore(position: number, maxLength: number = 150): string {
350
- const beforeText = fullDocText.slice(Math.max(0, position - maxLength), position);
351
- const sentenceStart = beforeText.search(/[.!?]\s+[A-Z][^.!?]*$/);
352
- return sentenceStart >= 0
353
- ? beforeText.slice(sentenceStart + 2).trim()
354
- : beforeText.slice(-80).trim();
355
- }
356
-
357
- // Helper: extract context after a position
358
- function getContextAfter(position: number, maxLength: number = 150): string {
359
- const afterText = fullDocText.slice(position, position + maxLength);
360
- const sentenceEnd = afterText.search(/[.!?]\s/);
361
- return sentenceEnd >= 0
362
- ? afterText.slice(0, sentenceEnd + 1).trim()
363
- : afterText.slice(0, 80).trim();
364
- }
365
-
366
- // ========================================
367
- // STEP 2: Collect all start/end markers separately
368
- // ========================================
369
- const startPattern = /<w:commentRangeStart[^>]*w:id="(\d+)"[^>]*\/?>/g;
370
- const endPattern = /<w:commentRangeEnd[^>]*w:id="(\d+)"[^>]*\/?>/g;
371
-
372
- const starts = new Map<string, number>(); // id -> position after start tag
373
- const ends = new Map<string, number>(); // id -> position before end tag
374
-
375
- let match;
376
- while ((match = startPattern.exec(docXml)) !== null) {
377
- const id = match[1];
378
- if (!starts.has(id)) {
379
- starts.set(id, match.index + match[0].length);
380
- }
381
- }
382
-
383
- while ((match = endPattern.exec(docXml)) !== null) {
384
- const id = match[1];
385
- if (!ends.has(id)) {
386
- ends.set(id, match.index);
387
- }
388
- }
389
-
390
- // ========================================
391
- // STEP 3: Process each comment range by ID
392
- // ========================================
393
- for (const [id, startXmlPos] of starts) {
394
- const endXmlPos = ends.get(id);
395
-
396
- // Missing end marker - skip with warning
397
- if (endXmlPos === undefined) {
398
- console.warn(`Comment ${id}: missing end marker`);
399
- continue;
400
- }
401
-
402
- // Calculate text position
403
- const docPosition = xmlPosToTextPos(startXmlPos);
404
-
405
- // Handle empty or inverted ranges
406
- if (endXmlPos <= startXmlPos) {
407
- anchors.set(id, {
408
- anchor: '',
409
- before: getContextBefore(docPosition),
410
- after: getContextAfter(docPosition),
411
- docPosition,
412
- docLength: fullDocText.length,
413
- isEmpty: true
414
- });
415
- continue;
416
- }
417
-
418
- // Extract XML segment between markers
419
- const segment = docXml.slice(startXmlPos, endXmlPos);
420
-
421
- // Extract text from w:t (regular) AND w:delText (deleted text in track changes)
422
- const textInRangePattern = /<w:t[^>]*>([^<]*)<\/w:t>|<w:delText[^>]*>([^<]*)<\/w:delText>/g;
423
- let anchorText = '';
424
- let tm;
425
- while ((tm = textInRangePattern.exec(segment)) !== null) {
426
- anchorText += tm[1] || tm[2] || '';
427
- }
428
- anchorText = decodeXmlEntities(anchorText);
429
-
430
- // Get context
431
- const anchorLength = anchorText.length;
432
- const before = getContextBefore(docPosition);
433
- const after = getContextAfter(docPosition + anchorLength);
434
-
435
- // ALWAYS add entry (even if anchor is empty)
436
- anchors.set(id, {
437
- anchor: anchorText.trim(),
438
- before,
439
- after,
440
- docPosition,
441
- docLength: fullDocText.length,
442
- isEmpty: !anchorText.trim()
443
- });
444
- }
445
- } catch (err: any) {
446
- console.error('Error extracting comment anchors:', err.message);
447
- return { anchors, fullDocText: '' };
448
- }
449
-
450
- return { anchors, fullDocText };
451
- }
452
-
453
- /**
454
- * Decode XML entities in text
455
- */
456
- function decodeXmlEntities(text: string): string {
457
- return text
458
- .replace(/&amp;/g, '&')
459
- .replace(/&lt;/g, '<')
460
- .replace(/&gt;/g, '>')
461
- .replace(/&quot;/g, '"')
462
- .replace(/&apos;/g, "'")
463
- .replace(/&#(\d+);/g, (_, code) => String.fromCharCode(parseInt(code, 10)))
464
- .replace(/&#x([0-9a-fA-F]+);/g, (_, code) => String.fromCharCode(parseInt(code, 16)));
465
- }
466
-
467
- /**
468
- * Extract text content from a Word XML cell
469
- */
470
- function extractCellText(cellXml: string): string {
471
- const parts: string[] = [];
472
-
473
- // Check for OMML math - replace with [math] placeholder
474
- if (cellXml.includes('<m:oMath')) {
475
- // Try to extract the text representation of math
476
- const mathTextMatches = cellXml.match(/<m:t>([^<]*)<\/m:t>/g) || [];
477
- if (mathTextMatches.length > 0) {
478
- const mathText = mathTextMatches.map((t) => t.replace(/<[^>]+>/g, '')).join('');
479
- parts.push(mathText);
480
- } else {
481
- parts.push('[math]');
482
- }
483
- }
484
-
485
- // Extract regular text from w:t elements
486
- const textMatches = cellXml.match(/<w:t[^>]*>([^<]*)<\/w:t>/g) || [];
487
- for (const match of textMatches) {
488
- const text = match.replace(/<[^>]+>/g, '');
489
- if (text) {
490
- parts.push(text);
491
- }
492
- }
493
-
494
- let result = parts.join('').trim();
495
- result = decodeXmlEntities(result);
496
-
497
- // Escape pipe characters in cell content (would break table)
498
- result = result.replace(/\|/g, '\\|');
499
-
500
- return result;
501
- }
502
-
503
- /**
504
- * Parse a table row, handling merged cells (gridSpan)
505
- */
506
- function parseTableRow(rowXml: string, expectedCols: number): ParsedRow {
507
- // Match cells - handle both <w:tc> and <w:tc ...>
508
- const cellMatches = rowXml.match(/<w:tc(?:\s[^>]*)?>[\s\S]*?<\/w:tc>/g) || [];
509
- const cells: string[] = [];
510
- const colSpans: number[] = [];
511
-
512
- for (const cellXml of cellMatches) {
513
- // Check for horizontal merge (gridSpan)
514
- const gridSpanMatch = cellXml.match(/<w:gridSpan\s+w:val="(\d+)"/);
515
- const span = gridSpanMatch ? parseInt(gridSpanMatch[1], 10) : 1;
516
-
517
- // Check for vertical merge continuation (vMerge without restart)
518
- // If vMerge is present without w:val="restart", it's a continuation - use empty
519
- const vMergeMatch = cellXml.match(/<w:vMerge(?:\s+w:val="([^"]+)")?/);
520
- const isVMergeContinuation = vMergeMatch && vMergeMatch[1] !== 'restart';
521
-
522
- const cellText = isVMergeContinuation ? '' : extractCellText(cellXml);
523
-
524
- // Add the cell content
525
- cells.push(cellText);
526
- colSpans.push(span);
527
-
528
- // For gridSpan > 1, add empty cells to maintain column alignment
529
- for (let i = 1; i < span; i++) {
530
- cells.push('');
531
- colSpans.push(0); // 0 indicates this is a spanned cell
532
- }
533
- }
534
-
535
- return { cells, colSpans };
536
- }
537
-
538
- /**
539
- * Determine table grid column count from table XML
540
- */
541
- function getTableGridCols(tableXml: string): number {
542
- // Try to get from tblGrid
543
- const gridColMatches = tableXml.match(/<w:gridCol/g) || [];
544
- if (gridColMatches.length > 0) {
545
- return gridColMatches.length;
546
- }
547
-
548
- // Fallback: count max cells in any row
549
- const rowMatches = tableXml.match(/<w:tr[\s\S]*?<\/w:tr>/g) || [];
550
- let maxCols = 0;
551
- for (const rowXml of rowMatches) {
552
- const { cells } = parseTableRow(rowXml, 0);
553
- maxCols = Math.max(maxCols, cells.length);
554
- }
555
- return maxCols;
556
- }
557
-
558
- /**
559
- * Extract tables directly from Word document XML and convert to markdown pipe tables
560
- */
561
- export async function extractWordTables(docxPath: string): Promise<WordTable[]> {
562
- const AdmZip = (await import('adm-zip')).default;
563
- const tables: WordTable[] = [];
564
-
565
- try {
566
- const zip = new AdmZip(docxPath);
567
- const docEntry = zip.getEntry('word/document.xml');
568
-
569
- if (!docEntry) {
570
- return tables;
571
- }
572
-
573
- const xml = docEntry.getData().toString('utf8');
574
-
575
- // Find all table elements
576
- const tableMatches = xml.match(/<w:tbl>[\s\S]*?<\/w:tbl>/g) || [];
577
-
578
- for (const tableXml of tableMatches) {
579
- // Determine expected column count from grid
580
- const expectedCols = getTableGridCols(tableXml);
581
-
582
- // Extract rows
583
- const rowMatches = tableXml.match(/<w:tr[\s\S]*?<\/w:tr>/g) || [];
584
- const rows: string[][] = [];
585
-
586
- for (const rowXml of rowMatches) {
587
- const { cells } = parseTableRow(rowXml, expectedCols);
588
- if (cells.length > 0) {
589
- rows.push(cells);
590
- }
591
- }
592
-
593
- if (rows.length > 0) {
594
- // Convert to markdown pipe table
595
- const markdown = convertRowsToMarkdownTable(rows);
596
- tables.push({ markdown, rowCount: rows.length, colCount: expectedCols || rows[0]?.length || 0 });
597
- }
598
- }
599
- } catch (err: any) {
600
- console.error('Error extracting tables from Word:', err.message);
601
- }
602
-
603
- return tables;
604
- }
605
-
606
- /**
607
- * Convert array of rows (each row is array of cell strings) to markdown pipe table
608
- */
609
- function convertRowsToMarkdownTable(rows: string[][]): string {
610
- if (rows.length === 0) return '';
611
-
612
- // Normalize column count (use max across all rows)
613
- const colCount = Math.max(...rows.map((r) => r.length));
614
-
615
- // Pad rows to have consistent column count
616
- const normalizedRows = rows.map((row) => {
617
- while (row.length < colCount) {
618
- row.push('');
619
- }
620
- return row;
621
- });
622
-
623
- // Build markdown table
624
- const lines: string[] = [];
625
-
626
- // Header row
627
- const header = normalizedRows[0];
628
- lines.push('| ' + header.join(' | ') + ' |');
629
-
630
- // Separator row
631
- lines.push('|' + header.map(() => '---').join('|') + '|');
632
-
633
- // Data rows
634
- for (let i = 1; i < normalizedRows.length; i++) {
635
- lines.push('| ' + normalizedRows[i].join(' | ') + ' |');
636
- }
637
-
638
- return lines.join('\n');
639
- }
640
-
641
- /**
642
- * Extract text from Word document using pandoc with track changes preserved
643
- */
644
- export async function extractFromWord(
645
- docxPath: string,
646
- options: ExtractFromWordOptions = {}
647
- ): Promise<ExtractFromWordResult> {
648
- let text: string;
649
- let messages: ExtractMessage[] = [];
650
- let extractedMedia: string[] = [];
651
- let hasTrackChanges = false;
652
- let trackChangeStats = { insertions: 0, deletions: 0 };
653
-
654
- // Determine media extraction directory
655
- const docxDir = path.dirname(docxPath);
656
- const mediaDir = options.mediaDir || path.join(docxDir, 'media');
657
-
658
- // Skip media extraction if figures already exist (e.g., when re-importing with existing source)
659
- const skipMediaExtraction = options.skipMediaExtraction || false;
660
-
661
- // Extract tables directly from Word XML (reliable, no heuristics)
662
- const wordTables = await extractWordTables(docxPath);
663
-
664
- // Try pandoc first with --track-changes=all to preserve reviewer edits
665
- try {
666
- // Build pandoc command
667
- let pandocCmd = `pandoc "${docxPath}" -t markdown --wrap=none --track-changes=all`;
668
- if (!skipMediaExtraction) {
669
- pandocCmd += ` --extract-media="${mediaDir}"`;
670
- }
671
-
672
- const { stdout } = await execAsync(pandocCmd, { maxBuffer: 50 * 1024 * 1024 });
673
- text = stdout;
674
-
675
- // Convert pandoc's track change format to CriticMarkup
676
- const origLength = text.length;
677
-
678
- // Use a more robust pattern that handles nested content
679
- text = text.replace(/\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]\{\.insertion[^}]*\}/g, (match, content) => {
680
- if (content.trim()) {
681
- trackChangeStats.insertions++;
682
- return `{++${content}++}`;
683
- }
684
- return ''; // Empty insertions are removed
685
- });
686
-
687
- text = text.replace(/\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]\{\.deletion[^}]*\}/g, (match, content) => {
688
- if (content.trim()) {
689
- trackChangeStats.deletions++;
690
- return `{--${content}--}`;
691
- }
692
- return ''; // Empty deletions are removed
693
- });
694
-
695
- // Handle any remaining pandoc track change patterns
696
- let prevText;
697
- do {
698
- prevText = text;
699
- text = text.replace(/\[([^\]]*)\]\{\.insertion[^}]*\}/g, (match, content) => {
700
- if (content.trim()) {
701
- trackChangeStats.insertions++;
702
- return `{++${content}++}`;
703
- }
704
- return '';
705
- });
706
- text = text.replace(/\[([^\]]*)\]\{\.deletion[^}]*\}/g, (match, content) => {
707
- if (content.trim()) {
708
- trackChangeStats.deletions++;
709
- return `{--${content}--}`;
710
- }
711
- return '';
712
- });
713
- } while (text !== prevText);
714
-
715
- // Handle pandoc comment patterns - remove comment text from body
716
- text = text.replace(/\[[^\]]*\]\{\.comment-start[^}]*\}/g, '');
717
- text = text.replace(/\[\]\{\.comment-end[^}]*\}/g, '');
718
-
719
- // Also handle {.mark} spans
720
- text = text.replace(/\[([^\]]*)\]\{\.mark\}/g, '$1');
721
-
722
- hasTrackChanges = trackChangeStats.insertions > 0 || trackChangeStats.deletions > 0;
723
-
724
- if (hasTrackChanges) {
725
- messages.push({
726
- type: 'info',
727
- message: `Found ${trackChangeStats.insertions} insertion(s) and ${trackChangeStats.deletions} deletion(s) from track changes`
728
- });
729
- }
730
-
731
- // Find extracted media files
732
- const mediaSubdir = path.join(mediaDir, 'media');
733
- if (fs.existsSync(mediaSubdir)) {
734
- extractedMedia = fs.readdirSync(mediaSubdir)
735
- .filter(f => /\.(png|jpg|jpeg|gif|svg|emf|wmf|tiff?)$/i.test(f))
736
- .map(f => path.join(mediaSubdir, f));
737
-
738
- if (extractedMedia.length > 0) {
739
- messages.push({
740
- type: 'info',
741
- message: `Extracted ${extractedMedia.length} image(s) to ${mediaSubdir}`
742
- });
743
- }
744
- }
745
- } catch (pandocErr: any) {
746
- // Pandoc not available — use XML-based extraction with track change support
747
- const { extractPlainTextWithTrackChanges } = await import('./word.js');
748
- const { getInstallInstructions } = await import('./dependencies.js');
749
- const installCmd = getInstallInstructions('pandoc');
750
-
751
- const xmlResult = await extractPlainTextWithTrackChanges(docxPath);
752
- text = xmlResult.text;
753
- hasTrackChanges = xmlResult.hasTrackChanges;
754
- trackChangeStats = xmlResult.stats;
755
-
756
- if (hasTrackChanges) {
757
- messages.push({
758
- type: 'warning',
759
- message: `Pandoc not installed. Using built-in XML extractor (${trackChangeStats.insertions} insertions, ${trackChangeStats.deletions} deletions preserved). Formatting may differ. Install pandoc for best results: ${installCmd}`
760
- });
761
- } else {
762
- messages.push({
763
- type: 'warning',
764
- message: `Pandoc not installed. Using built-in XML extractor (no track changes found). Install pandoc for better formatting: ${installCmd}`
765
- });
766
- }
767
- }
768
-
769
- // Extract comments directly from docx XML
770
- const comments = await extractWordComments(docxPath);
771
-
772
- // Extract comment anchor texts
773
- const { anchors } = await extractCommentAnchors(docxPath);
774
-
775
- return {
776
- text,
777
- comments,
778
- anchors,
779
- messages,
780
- extractedMedia,
781
- tables: wordTables,
782
- hasTrackChanges,
783
- trackChangeStats,
784
- };
785
- }
786
-
787
169
  /**
788
170
  * Insert comments into markdown text based on anchor texts with context
789
171
  */
@@ -1138,658 +520,6 @@ export function insertCommentsIntoMarkdown(
1138
520
  return result;
1139
521
  }
1140
522
 
1141
- /**
1142
- * Normalize text for comparison (handle whitespace differences)
1143
- */
1144
- function normalizeWhitespace(text: string): string {
1145
- return text
1146
- .replace(/\r\n/g, '\n') // Normalize line endings
1147
- .replace(/\t/g, ' ') // Tabs to spaces
1148
- .replace(/ +/g, ' ') // Collapse multiple spaces
1149
- .trim();
1150
- }
1151
-
1152
- /**
1153
- * Fix citation and math annotations by preserving original markdown syntax
1154
- */
1155
- function fixCitationAnnotations(text: string, originalMd: string): string {
1156
- // Fix math annotations - preserve inline and display math
1157
- text = text.replace(/\{--(\$[^$]+\$)--\}/g, '$1');
1158
- text = text.replace(/\{--(\$\$[^$]+\$\$)--\}/g, '$1');
1159
-
1160
- text = text.replace(/\{~~(\$[^$]+\$)~>[^~]+~~\}/g, '$1');
1161
- text = text.replace(/\{~~(\$\$[^$]+\$\$)~>[^~]+~~\}/g, '$1');
1162
-
1163
- // Extract all citations from original markdown
1164
- const citationPattern = /\[@[^\]]+\]/g;
1165
- const originalCitations = [...originalMd.matchAll(citationPattern)].map(m => m[0]);
1166
-
1167
- // Fix substitutions where left side has markdown citation
1168
- text = text.replace(/\{~~(\[@[^\]]+\])~>[^~]+~~\}/g, '$1');
1169
-
1170
- // Fix substitutions where left side STARTS with markdown citation
1171
- text = text.replace(/\{~~(\[@[^\]]+\])\s*([^~]*)~>([^~]*)~~\}/g, (match, cite, oldText, newText) => {
1172
- if (oldText.trim() === '' && newText.trim() === '') {
1173
- return cite;
1174
- }
1175
- if (oldText.trim() || newText.trim()) {
1176
- return cite + (oldText.trim() !== newText.trim() ? ` {~~${oldText.trim()}~>${newText.trim()}~~}` : ` ${newText}`);
1177
- }
1178
- return cite;
1179
- });
1180
-
1181
- // Fix deletions of markdown citations
1182
- text = text.replace(/\{--(\[@[^\]]+\])--\}/g, '$1');
1183
-
1184
- // Fix insertions of rendered citations
1185
- text = text.replace(/\{\+\+\([A-Z][^)]*\d{4}[^)]*\)\+\+\}/g, '');
1186
-
1187
- // Clean up broken multi-part substitutions
1188
- text = text.replace(/\{~~(@[A-Za-z]+\d{4})~>[^~]+~~\}/g, '[$1]');
1189
-
1190
- // Fix citations split across substitution boundaries
1191
- text = text.replace(/\{~~\[@~>[^~]*~~\}([A-Za-z]+\d{4})\]/g, '[@$1]');
1192
-
1193
- // Clean up any remaining partial citations
1194
- text = text.replace(/\{~~;\s*@([A-Za-z]+\d{4})\]~>[^~]*~~\}/g, '; [@$1]');
1195
-
1196
- // Remove rendered citation insertions (with Unicode support)
1197
- text = text.replace(/\{\+\+\(\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\+\+\}/gu, '');
1198
- text = text.replace(/\{\+\+\(\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');
1199
-
1200
- // Trailing citation fragments
1201
- text = text.replace(/\{\+\+\d{4}[a-z]?(?:[;,]\s*(?:\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+)?\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');
1202
- text = text.replace(/\{\+\+\d{4}[a-z]?(?:[;,]\s*(?:\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+)?\d{4}[a-z]?)*\)\s*\+\+\}/gu, '');
1203
-
1204
- // Just year with closing paren
1205
- text = text.replace(/\{\+\+\d{4}[a-z]?\)\.\s*\+\+\}/g, '');
1206
- text = text.replace(/\{\+\+\d{4}[a-z]?\)\s*\+\+\}/g, '');
1207
-
1208
- // Leading citation fragments
1209
- text = text.replace(/\{\+\+\(?\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s*\+\+\}/gu, '');
1210
-
1211
- // Semicolon-separated fragments
1212
- text = text.replace(/\{\+\+[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?\+\+\}/gu, '');
1213
-
1214
- // Year ranges with authors
1215
- text = text.replace(/\{\+\+\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\s*\+\+\}/gu, '');
1216
- text = text.replace(/\{\+\+\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');
1217
-
1218
- // Clean up double spaces and orphaned punctuation
1219
- text = text.replace(/ +/g, ' ');
1220
- text = text.replace(/\s+\./g, '.');
1221
- text = text.replace(/\s+,/g, ',');
1222
-
1223
- // Final cleanup - remove empty annotations
1224
- text = text.replace(/\{~~\s*~>\s*~~\}/g, '');
1225
- text = text.replace(/\{\+\+\s*\+\+\}/g, '');
1226
- text = text.replace(/\{--\s*--\}/g, '');
1227
-
1228
- return text;
1229
- }
1230
-
1231
- /**
1232
- * Strip markdown syntax to get plain text
1233
- */
1234
- function stripMarkdownSyntax(md: string): string {
1235
- return md
1236
- .replace(/^---[\s\S]*?---\n*/m, '')
1237
- .replace(/^#{1,6}\s+/gm, '')
1238
- .replace(/(\*\*|__)(.*?)\1/g, '$2')
1239
- .replace(/(\*|_)(.*?)\1/g, '$2')
1240
- .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
1241
- .replace(/!\[([^\]]*)\]\([^)]+\)/g, '')
1242
- .replace(/`([^`]+)`/g, '$1')
1243
- .replace(/```[\s\S]*?```/g, '')
1244
- .replace(/^>\s*/gm, '')
1245
- .replace(/^[-*_]{3,}\s*$/gm, '')
1246
- .replace(/^[\s]*[-*+]\s+/gm, '')
1247
- .replace(/^[\s]*\d+\.\s+/gm, '')
1248
- .replace(/\|/g, ' ')
1249
- .replace(/^[-:]+$/gm, '')
1250
- .replace(/\n{3,}/g, '\n\n')
1251
- .trim();
1252
- }
1253
-
1254
- /**
1255
- * Generate annotated markdown by diffing original MD against Word text
1256
- */
1257
- export function generateAnnotatedDiff(originalMd: string, wordText: string, author: string = 'Reviewer'): string {
1258
- const normalizedOriginal = normalizeWhitespace(originalMd);
1259
- const normalizedWord = normalizeWhitespace(wordText);
1260
-
1261
- const changes = diffWords(normalizedOriginal, normalizedWord);
1262
-
1263
- let result = '';
1264
-
1265
- for (const part of changes) {
1266
- if (part.added) {
1267
- result += `{++${part.value}++}`;
1268
- } else if (part.removed) {
1269
- result += `{--${part.value}--}`;
1270
- } else {
1271
- result += part.value;
1272
- }
1273
- }
1274
-
1275
- return result;
1276
- }
1277
-
1278
- /**
1279
- * Inject Word tables (extracted from XML) into pandoc text output
1280
- */
1281
- function injectWordTables(pandocText: string, wordTables: WordTable[]): string {
1282
- if (!wordTables || wordTables.length === 0) {
1283
- return pandocText;
1284
- }
1285
-
1286
- let result = pandocText;
1287
-
1288
- for (const table of wordTables) {
1289
- const firstLine = table.markdown.split('\n')[0];
1290
- const headerCells = firstLine
1291
- .split('|')
1292
- .map((c) => c.trim())
1293
- .filter((c) => c.length > 0);
1294
-
1295
- if (headerCells.length === 0) continue;
1296
-
1297
- const firstCell = headerCells[0];
1298
- const startIdx = result.indexOf(firstCell);
1299
-
1300
- if (startIdx === -1) continue;
1301
-
1302
- const lastLine = table.markdown.split('\n').pop();
1303
- const lastCells = lastLine!
1304
- .split('|')
1305
- .map((c) => c.trim())
1306
- .filter((c) => c.length > 0);
1307
- const lastCell = lastCells[lastCells.length - 1] || lastCells[0];
1308
-
1309
- const endIdx = result.indexOf(lastCell, startIdx);
1310
- if (endIdx === -1) continue;
1311
-
1312
- let regionStart = result.lastIndexOf('\n\n', startIdx);
1313
- if (regionStart === -1) regionStart = 0;
1314
- else regionStart += 2;
1315
-
1316
- let regionEnd = result.indexOf('\n\n', endIdx + lastCell.length);
1317
- if (regionEnd === -1) regionEnd = result.length;
1318
-
1319
- result = result.slice(0, regionStart) + table.markdown + '\n\n' + result.slice(regionEnd);
1320
- }
1321
-
1322
- return result;
1323
- }
1324
-
1325
- /**
1326
- * Smart paragraph-level diff that preserves markdown structure
1327
- */
1328
- export function generateSmartDiff(
1329
- originalMd: string,
1330
- wordText: string,
1331
- author: string = 'Reviewer',
1332
- options: GenerateSmartDiffOptions = {}
1333
- ): string {
1334
- const { wordTables = [], imageRegistry = null } = options;
1335
-
1336
- // Inject Word tables into pandoc output
1337
- let wordTextWithTables = injectWordTables(wordText, wordTables);
1338
-
1339
- // Protect markdown tables
1340
- const { text: mdWithTablesProtected, tables } = protectTables(originalMd);
1341
-
1342
- // Also protect tables in Word text
1343
- const { text: wordWithTablesProtected, tables: wordTableBlocks } = protectTables(wordTextWithTables);
1344
-
1345
- // Protect images
1346
- const { text: mdWithImagesProtected, images: origImages } = protectImages(mdWithTablesProtected, imageRegistry);
1347
-
1348
- const { text: wordWithImagesProtected, images: wordImages } = protectImages(wordWithTablesProtected, imageRegistry);
1349
-
1350
- // Match Word images to original images
1351
- const imageMapping = matchWordImagesToOriginal(origImages, wordImages, imageRegistry);
1352
-
1353
- // Replace Word image placeholders with matching original placeholders
1354
- let wordWithMappedImages = wordWithImagesProtected;
1355
- for (const [wordPlaceholder, origPlaceholder] of imageMapping) {
1356
- wordWithMappedImages = wordWithMappedImages.split(wordPlaceholder).join(origPlaceholder);
1357
- }
1358
-
1359
- // Protect figure/table anchors
1360
- const { text: mdWithAnchorsProtected, anchors: figAnchors } = protectAnchors(mdWithImagesProtected);
1361
-
1362
- // Protect cross-references
1363
- const { text: mdWithXrefsProtected, crossrefs } = protectCrossrefs(mdWithAnchorsProtected);
1364
-
1365
- // Protect math
1366
- const { text: mdWithMathProtected, mathBlocks } = protectMath(mdWithXrefsProtected);
1367
-
1368
- // Protect citations
1369
- const { text: mdProtected, citations } = protectCitations(mdWithMathProtected);
1370
-
1371
- // Replace rendered elements in Word text
1372
- let wordProtected = wordWithMappedImages;
1373
- wordProtected = replaceRenderedMath(wordProtected, mathBlocks);
1374
- wordProtected = replaceRenderedCitations(wordProtected, citations.length);
1375
-
1376
- // Split into paragraphs
1377
- const originalParas = mdProtected.split(/\n\n+/);
1378
- const wordParas = wordProtected.split(/\n\n+/);
1379
-
1380
- const result: string[] = [];
1381
-
1382
- // Try to match paragraphs intelligently
1383
- let wordIdx = 0;
1384
-
1385
- for (let i = 0; i < originalParas.length; i++) {
1386
- const orig = originalParas[i] || '';
1387
- const { prefix: mdPrefix, content: origContent } = extractMarkdownPrefix(orig.split('\n')[0]);
1388
-
1389
- // Find best matching word paragraph
1390
- let bestMatch = -1;
1391
- let bestScore = 0;
1392
-
1393
- for (let j = wordIdx; j < Math.min(wordIdx + 3, wordParas.length); j++) {
1394
- const wordPara = wordParas[j] || '';
1395
- const origWords = new Set(origContent.toLowerCase().split(/\s+/));
1396
- const wordWords = wordPara.toLowerCase().split(/\s+/);
1397
- const common = wordWords.filter((w) => origWords.has(w)).length;
1398
- const score = common / Math.max(origWords.size, wordWords.length);
1399
-
1400
- if (score > bestScore && score > 0.3) {
1401
- bestScore = score;
1402
- bestMatch = j;
1403
- }
1404
- }
1405
-
1406
- if (bestMatch === -1) {
1407
- if (mdPrefix && wordIdx < wordParas.length) {
1408
- const wordPara = wordParas[wordIdx];
1409
- if (wordPara.toLowerCase().includes(origContent.toLowerCase().slice(0, 20))) {
1410
- bestMatch = wordIdx;
1411
- }
1412
- }
1413
- }
1414
-
1415
- if (bestMatch >= 0) {
1416
- const word = wordParas[bestMatch];
1417
-
1418
- const origStripped = stripMarkdownSyntax(orig);
1419
- const wordNormalized = normalizeWhitespace(word);
1420
-
1421
- if (origStripped === wordNormalized) {
1422
- result.push(orig);
1423
- } else {
1424
- const changes = diffWords(origStripped, wordNormalized);
1425
- let annotated = mdPrefix;
1426
-
1427
- for (const part of changes) {
1428
- if (part.added) {
1429
- annotated += `{++${part.value}++}`;
1430
- } else if (part.removed) {
1431
- annotated += `{--${part.value}--}`;
1432
- } else {
1433
- annotated += part.value;
1434
- }
1435
- }
1436
-
1437
- result.push(annotated);
1438
- }
1439
-
1440
- wordIdx = bestMatch + 1;
1441
- } else {
1442
- // Paragraph deleted entirely
1443
- if (mdPrefix && mdPrefix.match(/^#{1,6}\s+/)) {
1444
- result.push(orig);
1445
- } else {
1446
- result.push(`{--${orig}--}`);
1447
- }
1448
- }
1449
- }
1450
-
1451
- // Any remaining word paragraphs are additions
1452
- for (let j = wordIdx; j < wordParas.length; j++) {
1453
- const word = wordParas[j];
1454
- if (word.trim()) {
1455
- result.push(`{++${word}++}`);
1456
- }
1457
- }
1458
-
1459
- // Restore protected content
1460
- let finalResult = result.join('\n\n');
1461
- finalResult = restoreCitations(finalResult, citations);
1462
- finalResult = restoreMath(finalResult, mathBlocks);
1463
- finalResult = restoreCrossrefs(finalResult, crossrefs);
1464
- finalResult = restoreAnchors(finalResult, figAnchors);
1465
- finalResult = restoreImages(finalResult, origImages);
1466
- finalResult = restoreImages(finalResult, wordImages);
1467
- finalResult = restoreTables(finalResult, tables);
1468
- finalResult = restoreTables(finalResult, wordTableBlocks);
1469
-
1470
- return finalResult;
1471
- }
1472
-
1473
- /**
1474
- * Clean up redundant adjacent annotations
1475
- */
1476
- export function cleanupAnnotations(text: string): string {
1477
- // Convert adjacent delete+insert to substitution
1478
- text = text.replace(/\{--(.+?)--\}\s*\{\+\+(.+?)\+\+\}/g, '{~~$1~>$2~~}');
1479
-
1480
- // Also handle insert+delete
1481
- text = text.replace(/\{\+\+(.+?)\+\+\}\s*\{--(.+?)--\}/g, '{~~$2~>$1~~}');
1482
-
1483
- // Fix malformed patterns
1484
- text = text.replace(/\{--([^}]+?)~>([^}]+?)~~\}/g, '{~~$1~>$2~~}');
1485
-
1486
- // Fix malformed substitutions that got split
1487
- text = text.replace(/\{~~([^~]+)\s*--\}/g, '{--$1--}');
1488
- text = text.replace(/\{\+\+([^+]+)~~\}/g, '{++$1++}');
1489
-
1490
- // Clean up empty annotations
1491
- text = text.replace(/\{--\s*--\}/g, '');
1492
- text = text.replace(/\{\+\+\s*\+\+\}/g, '');
1493
-
1494
- // Clean up double spaces in prose, but preserve table formatting
1495
- const lines = text.split('\n');
1496
- let inTable = false;
1497
-
1498
- const processedLines = lines.map((line, idx) => {
1499
- const isSeparator = /^[-]+(\s+[-]+)+\s*$/.test(line.trim());
1500
-
1501
- const looksLikeTableRow = /\S+\s{2,}\S+/.test(line);
1502
-
1503
- if (isSeparator) {
1504
- if (!inTable) {
1505
- inTable = true;
1506
- }
1507
- return line;
1508
- }
1509
-
1510
- if (inTable) {
1511
- if (line.trim() === '') {
1512
- let lookAhead = idx + 1;
1513
- let foundTableContent = false;
1514
- let foundEndSeparator = false;
1515
-
1516
- while (lookAhead < lines.length && lookAhead < idx + 20) {
1517
- const nextLine = lines[lookAhead].trim();
1518
-
1519
- if (nextLine === '') {
1520
- lookAhead++;
1521
- continue;
1522
- }
1523
-
1524
- if (/^[-]+(\s+[-]+)+\s*$/.test(nextLine)) {
1525
- foundEndSeparator = true;
1526
- break;
1527
- }
1528
-
1529
- if (/\S+\s{2,}\S+/.test(nextLine)) {
1530
- foundTableContent = true;
1531
- break;
1532
- }
1533
-
1534
- if (/^\*[^*]+\*\s*$/.test(nextLine)) {
1535
- foundTableContent = true;
1536
- break;
1537
- }
1538
-
1539
- if (lines[lookAhead].startsWith(' ')) {
1540
- lookAhead++;
1541
- continue;
1542
- }
1543
-
1544
- break;
1545
- }
1546
-
1547
- if (foundTableContent || foundEndSeparator) {
1548
- return line;
1549
- }
1550
-
1551
- inTable = false;
1552
- return line;
1553
- }
1554
-
1555
- return line;
1556
- }
1557
-
1558
- if (looksLikeTableRow) {
1559
- let nextIdx = idx + 1;
1560
- while (nextIdx < lines.length && lines[nextIdx].trim() === '') {
1561
- nextIdx++;
1562
- }
1563
- if (nextIdx < lines.length && /^[-]+(\s+[-]+)+\s*$/.test(lines[nextIdx].trim())) {
1564
- return line;
1565
- }
1566
- }
1567
-
1568
- if (line.trim().startsWith('|')) {
1569
- return line;
1570
- }
1571
-
1572
- return line.replace(/ +/g, ' ');
1573
- });
1574
- text = processedLines.join('\n');
1575
-
1576
- return text;
1577
- }
1578
-
1579
- /**
1580
- * Parse visible comment markers from Word text
1581
- */
1582
- export function parseVisibleComments(text: string): Array<{ author: string; text: string; position: number }> {
1583
- const comments: Array<{ author: string; text: string; position: number }> = [];
1584
- const pattern = /\[([^\]:]+):\s*([^\]]+)\]/g;
1585
-
1586
- let match;
1587
- while ((match = pattern.exec(text)) !== null) {
1588
- comments.push({
1589
- author: match[1].trim(),
1590
- text: match[2].trim(),
1591
- position: match.index,
1592
- });
1593
- }
1594
-
1595
- return comments;
1596
- }
1597
-
1598
- /**
1599
- * Convert visible comments to CriticMarkup format
1600
- */
1601
- export function convertVisibleComments(text: string): string {
1602
- return text.replace(/\[([^\]:]+):\s*([^\]]+)\]/g, '{>>$1: $2<<}');
1603
- }
1604
-
1605
- /**
1606
- * Restore pandoc-crossref figure/table references from Word-rendered format
1607
- */
1608
- export function restoreCrossrefFromWord(
1609
- text: string,
1610
- projectDir: string,
1611
- restoredLabels: Set<string> | null = null
1612
- ): RestoreCrossrefResult {
1613
- const messages: string[] = [];
1614
- let restored = 0;
1615
- let result = text;
1616
-
1617
- const registry = readImageRegistry(projectDir);
1618
-
1619
- if (!restoredLabels) {
1620
- restoredLabels = new Set<string>();
1621
- }
1622
-
1623
- // Pattern 1: [Figure]{.mark} [N]{.mark}
1624
- result = result.replace(/\[(Figure|Table|Fig\.?)\]\{\.mark\}\s*\[(\d+|S\d+)\]\{\.mark\}/gi, (match, type, num) => {
1625
- const prefix = type.toLowerCase().startsWith('tab') ? 'tbl' : 'fig';
1626
- if (registry) {
1627
- const entry = registry.byNumber?.get(`${prefix}:${num}`);
1628
- if (entry && entry.label) {
1629
- restored++;
1630
- return `@${prefix}:${entry.label}`;
1631
- }
1632
- }
1633
- restored++;
1634
- messages.push(`Restored ${type} ${num} (no label found, using placeholder)`);
1635
- return `@${prefix}:fig${num}`;
1636
- });
1637
-
1638
- // Pattern 2: Plain "Figure N" or "Fig. N"
1639
- result = result.replace(/(?<!!)\b(Figure|Fig\.?|Table|Tbl\.?)\s+(\d+|S\d+)\b(?!\s*:)/gi, (match, type, num) => {
1640
- const prefix = type.toLowerCase().startsWith('tab') ? 'tbl' : 'fig';
1641
- if (registry) {
1642
- const entry = registry.byNumber?.get(`${prefix}:${num}`);
1643
- if (entry && entry.label) {
1644
- restored++;
1645
- return `@${prefix}:${entry.label}`;
1646
- }
1647
- }
1648
- return match;
1649
- });
1650
-
1651
- // Pattern 3: Remove duplicate plain-text captions
1652
- result = result.replace(/(\!\[[^\]]+\]\([^)]+\)(?:\{[^}]*\})?)\s*\n+\s*(?:Figure|Fig\.?|Table|Tbl\.?)\s+\d+[:\.]?\s*[^\n]+/gi, '$1');
1653
-
1654
- // Pattern 4: Clean up image captions that start with "Figure N: "
1655
- result = result.replace(/!\[(Figure|Fig\.?|Table|Tbl\.?)\s+(\d+|S\d+)[:\.]?\s*([^\]]*)\]\(([^)]+)\)(?:\{[^}]*\})?/gi,
1656
- (match, type, num, caption, imgPath) => {
1657
- const prefix = type.toLowerCase().startsWith('tab') ? 'tbl' : 'fig';
1658
- const labelKey = `${prefix}:${num}`;
1659
-
1660
- if (registry) {
1661
- const entry = registry.byNumber?.get(labelKey);
1662
- if (entry) {
1663
- if (restoredLabels!.has(labelKey)) {
1664
- messages.push(`Skipped duplicate ${prefix}:${entry.label} (already restored)`);
1665
- return `![${entry.caption}](${entry.path})`;
1666
- }
1667
- restoredLabels!.add(labelKey);
1668
- restored++;
1669
- messages.push(`Restored image ${prefix}:${entry.label} from Figure ${num}`);
1670
- return `![${entry.caption}](${entry.path}){#${prefix}:${entry.label}}`;
1671
- }
1672
- }
1673
- const cleanCaption = caption.trim();
1674
- return `![${cleanCaption}](${imgPath})`;
1675
- });
1676
-
1677
- return { text: result, restored, messages, restoredLabels };
1678
- }
1679
-
1680
- /**
1681
- * Restore proper markdown image syntax from Word-extracted text using image registry
1682
- */
1683
- export function restoreImagesFromRegistry(
1684
- text: string,
1685
- projectDir: string,
1686
- restoredLabels: Set<string> | null = null
1687
- ): RestoreImagesResult {
1688
- const messages: string[] = [];
1689
- let restored = 0;
1690
-
1691
- const registry = readImageRegistry(projectDir);
1692
- if (!registry || !registry.figures || registry.figures.length === 0) {
1693
- return { text, restored: 0, messages: ['No image registry found'] };
1694
- }
1695
-
1696
- if (!restoredLabels) {
1697
- restoredLabels = new Set<string>();
1698
- }
1699
-
1700
- let result = text;
1701
-
1702
- // Pattern 1: Caption-like text
1703
- const captionPatterns = [
1704
- /@(fig|tbl):([a-zA-Z0-9_-]+):\s*([^\n]+)/gi,
1705
- /^(Figure|Fig\.?)\s+(\d+|S\d+)[.:]\s*([^\n]+)/gim,
1706
- /\|\s*@(fig|tbl):([a-zA-Z0-9_-]+):\s*([^|]+)\s*\|/gi,
1707
- ];
1708
-
1709
- // Fix @fig:label: caption patterns
1710
- result = result.replace(captionPatterns[0], (match, type, label, caption) => {
1711
- const key = `${type}:${label}`;
1712
- const entry = registry.byLabel.get(key);
1713
- if (entry) {
1714
- if (restoredLabels!.has(key)) {
1715
- messages.push(`Skipped duplicate ${key} (already restored)`);
1716
- return `![${entry.caption}](${entry.path})`;
1717
- }
1718
- restoredLabels!.add(key);
1719
- restored++;
1720
- messages.push(`Restored ${type}:${label} from registry`);
1721
- return `![${entry.caption}](${entry.path}){#${type}:${label}}`;
1722
- }
1723
- return match;
1724
- });
1725
-
1726
- // Fix table-wrapped captions
1727
- result = result.replace(captionPatterns[2], (match, type, label, caption) => {
1728
- const key = `${type}:${label}`;
1729
- const entry = registry.byLabel.get(key);
1730
- if (entry) {
1731
- if (restoredLabels!.has(key)) {
1732
- messages.push(`Skipped duplicate ${key} from table wrapper`);
1733
- return `![${entry.caption}](${entry.path})`;
1734
- }
1735
- restoredLabels!.add(key);
1736
- restored++;
1737
- messages.push(`Restored ${type}:${label} from table wrapper`);
1738
- return `![${entry.caption}](${entry.path}){#${type}:${label}}`;
1739
- }
1740
- return match;
1741
- });
1742
-
1743
- // Clean up empty table structures
1744
- result = result.replace(/\|\s*\|\s*\n\|:--:\|\s*\n/g, '');
1745
-
1746
- // Fix "Figure N:" standalone lines
1747
- result = result.replace(captionPatterns[1], (match, prefix, num, caption) => {
1748
- const numKey = `fig:${num}`;
1749
- const entry = registry.byNumber.get(numKey);
1750
- if (entry) {
1751
- const labelKey = `fig:${entry.label}`;
1752
- if (restoredLabels!.has(labelKey)) {
1753
- messages.push(`Skipped duplicate Figure ${num} (already restored)`);
1754
- return `![${entry.caption}](${entry.path})`;
1755
- }
1756
- restoredLabels!.add(labelKey);
1757
- restored++;
1758
- messages.push(`Restored Figure ${num} by number lookup`);
1759
- return `![${entry.caption}](${entry.path}){#fig:${entry.label}}`;
1760
- }
1761
- return match;
1762
- });
1763
-
1764
- // Fix generic media paths by matching caption text
1765
- const genericImagePattern = /!\[([^\]]*)\]\(media\/[^)]+\)/g;
1766
- result = result.replace(genericImagePattern, (match, caption) => {
1767
- if (!caption || caption.trim() === '') {
1768
- return match;
1769
- }
1770
-
1771
- const captionKey = caption.slice(0, 50).toLowerCase().trim();
1772
- const entry = registry.byCaption.get(captionKey);
1773
- if (entry) {
1774
- const labelKey = entry.label ? `${entry.type}:${entry.label}` : null;
1775
- if (labelKey && restoredLabels!.has(labelKey)) {
1776
- messages.push(`Skipped duplicate by caption match: ${captionKey.slice(0, 30)}...`);
1777
- return `![${entry.caption}](${entry.path})`;
1778
- }
1779
- if (labelKey) {
1780
- restoredLabels!.add(labelKey);
1781
- }
1782
- restored++;
1783
- messages.push(`Restored image by caption match: ${captionKey.slice(0, 30)}...`);
1784
- const anchor = (entry.label && !restoredLabels!.has(labelKey!)) ? `{#${entry.type}:${entry.label}}` : '';
1785
- return `![${entry.caption}](${entry.path})${anchor}`;
1786
- }
1787
- return match;
1788
- });
1789
-
1790
- return { text: result, restored, messages };
1791
- }
1792
-
1793
523
  /**
1794
524
  * Import Word document with track changes directly as CriticMarkup
1795
525
  */