docrev 0.9.5 → 0.9.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99) hide show
  1. package/CHANGELOG.md +20 -0
  2. package/dev_notes/bug_repro_comment_parser.md +71 -0
  3. package/dist/lib/anchor-match.d.ts +41 -0
  4. package/dist/lib/anchor-match.d.ts.map +1 -0
  5. package/dist/lib/anchor-match.js +192 -0
  6. package/dist/lib/anchor-match.js.map +1 -0
  7. package/dist/lib/annotations.d.ts.map +1 -1
  8. package/dist/lib/annotations.js +8 -5
  9. package/dist/lib/annotations.js.map +1 -1
  10. package/dist/lib/commands/file-ops.d.ts +11 -0
  11. package/dist/lib/commands/file-ops.d.ts.map +1 -0
  12. package/dist/lib/commands/file-ops.js +301 -0
  13. package/dist/lib/commands/file-ops.js.map +1 -0
  14. package/dist/lib/commands/index.d.ts +10 -1
  15. package/dist/lib/commands/index.d.ts.map +1 -1
  16. package/dist/lib/commands/index.js +19 -1
  17. package/dist/lib/commands/index.js.map +1 -1
  18. package/dist/lib/commands/merge-resolve.d.ts +12 -0
  19. package/dist/lib/commands/merge-resolve.d.ts.map +1 -0
  20. package/dist/lib/commands/merge-resolve.js +318 -0
  21. package/dist/lib/commands/merge-resolve.js.map +1 -0
  22. package/dist/lib/commands/preview.d.ts +11 -0
  23. package/dist/lib/commands/preview.d.ts.map +1 -0
  24. package/dist/lib/commands/preview.js +138 -0
  25. package/dist/lib/commands/preview.js.map +1 -0
  26. package/dist/lib/commands/project-info.d.ts +11 -0
  27. package/dist/lib/commands/project-info.d.ts.map +1 -0
  28. package/dist/lib/commands/project-info.js +187 -0
  29. package/dist/lib/commands/project-info.js.map +1 -0
  30. package/dist/lib/commands/quality.d.ts +11 -0
  31. package/dist/lib/commands/quality.d.ts.map +1 -0
  32. package/dist/lib/commands/quality.js +384 -0
  33. package/dist/lib/commands/quality.js.map +1 -0
  34. package/dist/lib/commands/section-boundaries.d.ts +22 -0
  35. package/dist/lib/commands/section-boundaries.d.ts.map +1 -0
  36. package/dist/lib/commands/section-boundaries.js +53 -0
  37. package/dist/lib/commands/section-boundaries.js.map +1 -0
  38. package/dist/lib/commands/sections.d.ts +3 -2
  39. package/dist/lib/commands/sections.d.ts.map +1 -1
  40. package/dist/lib/commands/sections.js +4 -736
  41. package/dist/lib/commands/sections.js.map +1 -1
  42. package/dist/lib/commands/sync.d.ts +11 -0
  43. package/dist/lib/commands/sync.d.ts.map +1 -0
  44. package/dist/lib/commands/sync.js +576 -0
  45. package/dist/lib/commands/sync.js.map +1 -0
  46. package/dist/lib/commands/text-ops.d.ts +11 -0
  47. package/dist/lib/commands/text-ops.d.ts.map +1 -0
  48. package/dist/lib/commands/text-ops.js +357 -0
  49. package/dist/lib/commands/text-ops.js.map +1 -0
  50. package/dist/lib/commands/utilities.d.ts +2 -4
  51. package/dist/lib/commands/utilities.d.ts.map +1 -1
  52. package/dist/lib/commands/utilities.js +3 -1572
  53. package/dist/lib/commands/utilities.js.map +1 -1
  54. package/dist/lib/commands/verify-anchors.d.ts +17 -0
  55. package/dist/lib/commands/verify-anchors.d.ts.map +1 -0
  56. package/dist/lib/commands/verify-anchors.js +215 -0
  57. package/dist/lib/commands/verify-anchors.js.map +1 -0
  58. package/dist/lib/commands/word-tools.d.ts +11 -0
  59. package/dist/lib/commands/word-tools.d.ts.map +1 -0
  60. package/dist/lib/commands/word-tools.js +272 -0
  61. package/dist/lib/commands/word-tools.js.map +1 -0
  62. package/dist/lib/diff-engine.d.ts +25 -0
  63. package/dist/lib/diff-engine.d.ts.map +1 -0
  64. package/dist/lib/diff-engine.js +354 -0
  65. package/dist/lib/diff-engine.js.map +1 -0
  66. package/dist/lib/import.d.ts +44 -118
  67. package/dist/lib/import.d.ts.map +1 -1
  68. package/dist/lib/import.js +25 -1173
  69. package/dist/lib/import.js.map +1 -1
  70. package/dist/lib/restore-references.d.ts +35 -0
  71. package/dist/lib/restore-references.d.ts.map +1 -0
  72. package/dist/lib/restore-references.js +188 -0
  73. package/dist/lib/restore-references.js.map +1 -0
  74. package/dist/lib/word-extraction.d.ts +100 -0
  75. package/dist/lib/word-extraction.d.ts.map +1 -0
  76. package/dist/lib/word-extraction.js +594 -0
  77. package/dist/lib/word-extraction.js.map +1 -0
  78. package/lib/anchor-match.ts +238 -0
  79. package/lib/annotations.ts +9 -5
  80. package/lib/commands/file-ops.ts +372 -0
  81. package/lib/commands/index.ts +27 -0
  82. package/lib/commands/merge-resolve.ts +378 -0
  83. package/lib/commands/preview.ts +178 -0
  84. package/lib/commands/project-info.ts +244 -0
  85. package/lib/commands/quality.ts +517 -0
  86. package/lib/commands/section-boundaries.ts +72 -0
  87. package/lib/commands/sections.ts +3 -870
  88. package/lib/commands/sync.ts +701 -0
  89. package/lib/commands/text-ops.ts +449 -0
  90. package/lib/commands/utilities.ts +62 -2043
  91. package/lib/commands/verify-anchors.ts +261 -0
  92. package/lib/commands/word-tools.ts +340 -0
  93. package/lib/diff-engine.ts +465 -0
  94. package/lib/import.ts +108 -1504
  95. package/lib/restore-references.ts +240 -0
  96. package/lib/word-extraction.ts +759 -0
  97. package/package.json +1 -1
  98. package/skill/REFERENCE.md +29 -2
  99. package/skill/SKILL.md +12 -2
package/lib/import.ts CHANGED
@@ -1,108 +1,108 @@
1
1
  /**
2
2
  * Import functionality - convert Word docs to annotated Markdown
3
+ *
4
+ * Orchestration workflows + re-exports from extraction/diff/restore modules
3
5
  */
4
6
 
5
7
  import * as fs from 'fs';
6
8
  import * as path from 'path';
7
- import { diffWords, Change } from 'diff';
8
9
  import { stripAnnotations } from './annotations.js';
9
10
  import { readImageRegistry } from './image-registry.js';
10
11
  import { exec } from 'child_process';
11
12
  import { promisify } from 'util';
13
+
14
+ // Import from split modules
15
+ import {
16
+ extractFromWord,
17
+ extractWordComments,
18
+ extractCommentAnchors,
19
+ extractWordTables,
20
+ } from './word-extraction.js';
21
+ import type {
22
+ WordComment,
23
+ CommentAnchorData,
24
+ WordTable,
25
+ ExtractFromWordResult,
26
+ } from './word-extraction.js';
12
27
  import {
13
- extractMarkdownPrefix,
14
- protectAnchors,
15
- restoreAnchors,
16
- protectCrossrefs,
17
- restoreCrossrefs,
18
- simplifyMathForMatching,
19
- protectMath,
20
- restoreMath,
21
- replaceRenderedMath,
22
- protectCitations,
23
- restoreCitations,
24
- replaceRenderedCitations,
25
- protectImages,
26
- restoreImages,
27
- matchWordImagesToOriginal,
28
- protectTables,
29
- restoreTables,
30
- } from './protect-restore.js';
31
- import { normalizeWhitespace } from './utils.js';
28
+ generateSmartDiff,
29
+ generateAnnotatedDiff,
30
+ cleanupAnnotations,
31
+ fixCitationAnnotations,
32
+ } from './diff-engine.js';
33
+ import {
34
+ restoreCrossrefFromWord,
35
+ restoreImagesFromRegistry,
36
+ parseVisibleComments,
37
+ convertVisibleComments,
38
+ } from './restore-references.js';
39
+ import { findAnchorInText } from './anchor-match.js';
40
+
41
+ // Re-export everything so existing imports from './import.js' still work
42
+ export {
43
+ extractFromWord,
44
+ extractWordComments,
45
+ extractCommentAnchors,
46
+ extractHeadings,
47
+ extractWordTables,
48
+ } from './word-extraction.js';
49
+ export type {
50
+ WordComment,
51
+ TextNode,
52
+ CommentAnchorData,
53
+ CommentAnchorsResult,
54
+ DocxHeading,
55
+ WordTable,
56
+ ParsedRow,
57
+ ExtractFromWordOptions,
58
+ ExtractMessage,
59
+ ExtractFromWordResult,
60
+ } from './word-extraction.js';
61
+
62
+ export {
63
+ generateSmartDiff,
64
+ generateAnnotatedDiff,
65
+ cleanupAnnotations,
66
+ fixCitationAnnotations,
67
+ } from './diff-engine.js';
68
+ export type {
69
+ GenerateSmartDiffOptions,
70
+ } from './diff-engine.js';
71
+
72
+ export {
73
+ restoreCrossrefFromWord,
74
+ restoreImagesFromRegistry,
75
+ parseVisibleComments,
76
+ convertVisibleComments,
77
+ } from './restore-references.js';
78
+ export type {
79
+ RestoreCrossrefResult,
80
+ RestoreImagesResult,
81
+ } from './restore-references.js';
32
82
 
33
83
  const execAsync = promisify(exec);
34
84
 
35
85
  // ============================================
36
- // Type Definitions
86
+ // Type Definitions (orchestration-specific)
37
87
  // ============================================
38
88
 
39
- interface WordComment {
40
- id: string;
41
- author: string;
42
- date: string;
43
- text: string;
44
- }
45
-
46
- interface TextNode {
47
- xmlStart: number;
48
- xmlEnd: number;
49
- textStart: number;
50
- textEnd: number;
51
- text: string;
52
- }
53
-
54
- interface CommentAnchorData {
55
- anchor: string;
56
- before: string;
57
- after: string;
58
- docPosition: number;
59
- docLength: number;
60
- isEmpty: boolean;
61
- }
62
-
63
- interface CommentAnchorsResult {
64
- anchors: Map<string, CommentAnchorData>;
65
- fullDocText: string;
66
- }
67
-
68
- interface WordTable {
69
- markdown: string;
70
- rowCount: number;
71
- colCount: number;
72
- }
73
-
74
- interface ParsedRow {
75
- cells: string[];
76
- colSpans: number[];
77
- }
78
-
79
- interface ExtractFromWordOptions {
80
- mediaDir?: string;
81
- skipMediaExtraction?: boolean;
82
- }
83
-
84
- interface ExtractMessage {
85
- type: 'info' | 'warning';
86
- message: string;
87
- }
88
-
89
- interface ExtractFromWordResult {
90
- text: string;
91
- comments: WordComment[];
92
- anchors: Map<string, CommentAnchorData>;
93
- messages: ExtractMessage[];
94
- extractedMedia: string[];
95
- tables: WordTable[];
96
- hasTrackChanges: boolean;
97
- trackChangeStats: { insertions: number; deletions: number };
98
- }
99
-
100
- interface InsertCommentsOptions {
89
+ export interface InsertCommentsOptions {
101
90
  quiet?: boolean;
102
91
  sectionBoundary?: { start: number; end: number } | null;
103
- }
104
-
105
- interface CommentWithPos {
92
+ /**
93
+ * When true (default), comments wrap their anchor text in `[anchor]{.mark}`
94
+ * so the rebuilt docx restores the original Word comment range. When false,
95
+ * comments are inserted as standalone `{>>...<<}` blocks adjacent to the
96
+ * anchor — the prose stays byte-identical except for the inserted blocks.
97
+ *
98
+ * Set to false from `sync --comments-only` so a draft revised after the
99
+ * docx was sent for review keeps its prose intact, and so multiple
100
+ * comments sharing one anchor don't produce nested broken markup.
101
+ */
102
+ wrapAnchor?: boolean;
103
+ }
104
+
105
+ export interface CommentWithPos {
106
106
  id: string;
107
107
  author: string;
108
108
  text: string;
@@ -114,42 +114,19 @@ interface CommentWithPos {
114
114
  strategy?: string;
115
115
  }
116
116
 
117
- interface AnchorSearchResult {
118
- occurrences: number[];
119
- matchedAnchor: string | null;
120
- strategy: string;
121
- stripped?: boolean;
122
- }
117
+ export type { AnchorSearchResult } from './anchor-match.js';
123
118
 
124
- interface MarkdownPrefixResult {
119
+ export interface MarkdownPrefixResult {
125
120
  prefix: string;
126
121
  content: string;
127
122
  }
128
123
 
129
- interface GenerateSmartDiffOptions {
130
- wordTables?: WordTable[];
131
- imageRegistry?: any;
132
- }
133
-
134
- interface RestoreCrossrefResult {
135
- text: string;
136
- restored: number;
137
- messages: string[];
138
- restoredLabels: Set<string>;
139
- }
140
-
141
- interface RestoreImagesResult {
142
- text: string;
143
- restored: number;
144
- messages: string[];
145
- }
146
-
147
- interface ImportWordWithTrackChangesOptions {
124
+ export interface ImportWordWithTrackChangesOptions {
148
125
  mediaDir?: string;
149
126
  projectDir?: string;
150
127
  }
151
128
 
152
- interface ImportWordWithTrackChangesResult {
129
+ export interface ImportWordWithTrackChangesResult {
153
130
  text: string;
154
131
  stats: {
155
132
  insertions: number;
@@ -164,14 +141,14 @@ interface ImportWordWithTrackChangesResult {
164
141
  comments: WordComment[];
165
142
  }
166
143
 
167
- interface ImportFromWordOptions {
144
+ export interface ImportFromWordOptions {
168
145
  author?: string;
169
146
  sectionContent?: string;
170
147
  figuresDir?: string;
171
148
  wordTables?: WordTable[];
172
149
  }
173
150
 
174
- interface ImportFromWordResult {
151
+ export interface ImportFromWordResult {
175
152
  annotated: string;
176
153
  stats: {
177
154
  insertions: number;
@@ -183,13 +160,13 @@ interface ImportFromWordResult {
183
160
  extractedMedia: string[];
184
161
  }
185
162
 
186
- interface MovedFile {
163
+ export interface MovedFile {
187
164
  from: string;
188
165
  to: string;
189
166
  name: string;
190
167
  }
191
168
 
192
- interface MoveExtractedMediaResult {
169
+ export interface MoveExtractedMediaResult {
193
170
  moved: MovedFile[];
194
171
  errors: string[];
195
172
  }
@@ -198,593 +175,6 @@ interface MoveExtractedMediaResult {
198
175
  // Functions
199
176
  // ============================================
200
177
 
201
- /**
202
- * Extract comments directly from Word docx comments.xml
203
- */
204
- export async function extractWordComments(docxPath: string): Promise<WordComment[]> {
205
- const AdmZip = (await import('adm-zip')).default;
206
- const { parseStringPromise } = await import('xml2js');
207
-
208
- const comments: WordComment[] = [];
209
-
210
- // Validate file exists
211
- if (!fs.existsSync(docxPath)) {
212
- throw new Error(`File not found: ${docxPath}`);
213
- }
214
-
215
- try {
216
- let zip;
217
- try {
218
- zip = new AdmZip(docxPath);
219
- } catch (err: any) {
220
- throw new Error(`Invalid Word document (not a valid .docx file): ${err.message}`);
221
- }
222
-
223
- const commentsEntry = zip.getEntry('word/comments.xml');
224
-
225
- if (!commentsEntry) {
226
- return comments;
227
- }
228
-
229
- let commentsXml;
230
- try {
231
- commentsXml = commentsEntry.getData().toString('utf8');
232
- } catch (err: any) {
233
- throw new Error(`Failed to read comments from document: ${err.message}`);
234
- }
235
-
236
- const parsed = await parseStringPromise(commentsXml, { explicitArray: false });
237
-
238
- const ns = 'w:';
239
- const commentsRoot = parsed['w:comments'];
240
- if (!commentsRoot || !commentsRoot['w:comment']) {
241
- return comments;
242
- }
243
-
244
- // Ensure it's an array
245
- const commentNodes = Array.isArray(commentsRoot['w:comment'])
246
- ? commentsRoot['w:comment']
247
- : [commentsRoot['w:comment']];
248
-
249
- for (const comment of commentNodes) {
250
- const id = comment.$?.['w:id'] || '';
251
- const author = comment.$?.['w:author'] || 'Unknown';
252
- const date = comment.$?.['w:date'] || '';
253
-
254
- // Extract text from nested w:p/w:r/w:t elements
255
- let text = '';
256
- const extractText = (node: any): void => {
257
- if (!node) return;
258
- if (typeof node === 'string') {
259
- text += node;
260
- return;
261
- }
262
- if (node['w:t']) {
263
- const t = node['w:t'];
264
- text += typeof t === 'string' ? t : (t._ || t);
265
- }
266
- if (node['w:r']) {
267
- const runs = Array.isArray(node['w:r']) ? node['w:r'] : [node['w:r']];
268
- runs.forEach(extractText);
269
- }
270
- if (node['w:p']) {
271
- const paras = Array.isArray(node['w:p']) ? node['w:p'] : [node['w:p']];
272
- paras.forEach(extractText);
273
- }
274
- };
275
- extractText(comment);
276
-
277
- comments.push({ id, author, date: date.slice(0, 10), text: text.trim() });
278
- }
279
- } catch (err: any) {
280
- // Re-throw with more context if it's already an Error we created
281
- if (err.message.includes('Invalid Word document') || err.message.includes('File not found')) {
282
- throw err;
283
- }
284
- throw new Error(`Error extracting comments from ${path.basename(docxPath)}: ${err.message}`);
285
- }
286
-
287
- return comments;
288
- }
289
-
290
- /**
291
- * Extract comment anchor texts from document.xml with surrounding context
292
- * Returns map of comment ID -> {anchor, before, after, docPosition, isEmpty} for better matching
293
- * Also returns fullDocText for section boundary matching
294
- */
295
- export async function extractCommentAnchors(docxPath: string): Promise<CommentAnchorsResult> {
296
- const AdmZip = (await import('adm-zip')).default;
297
- const anchors = new Map<string, CommentAnchorData>();
298
- let fullDocText = '';
299
-
300
- try {
301
- const zip = new AdmZip(docxPath);
302
- const docEntry = zip.getEntry('word/document.xml');
303
-
304
- if (!docEntry) {
305
- return { anchors, fullDocText };
306
- }
307
-
308
- const docXml = docEntry.getData().toString('utf8');
309
-
310
- // ========================================
311
- // STEP 1: Build text position mapping
312
- // ========================================
313
- const textNodePattern = /<w:t[^>]*>([^<]*)<\/w:t>/g;
314
- const textNodes: TextNode[] = [];
315
- let textPosition = 0;
316
- let nodeMatch;
317
-
318
- while ((nodeMatch = textNodePattern.exec(docXml)) !== null) {
319
- const rawText = nodeMatch[1] ?? '';
320
- const decodedText = decodeXmlEntities(rawText);
321
- textNodes.push({
322
- xmlStart: nodeMatch.index,
323
- xmlEnd: nodeMatch.index + nodeMatch[0].length,
324
- textStart: textPosition,
325
- textEnd: textPosition + decodedText.length,
326
- text: decodedText
327
- });
328
- textPosition += decodedText.length;
329
- }
330
-
331
- fullDocText = textNodes.map(n => n.text).join('');
332
-
333
- // Helper: convert XML position to text position
334
- function xmlPosToTextPos(xmlPos: number): number {
335
- for (let i = 0; i < textNodes.length; i++) {
336
- const node = textNodes[i];
337
- if (!node) continue;
338
- if (xmlPos >= node.xmlStart && xmlPos < node.xmlEnd) {
339
- return node.textStart;
340
- }
341
- if (xmlPos < node.xmlStart) {
342
- return node.textStart;
343
- }
344
- }
345
- const lastNode = textNodes[textNodes.length - 1];
346
- return lastNode ? lastNode.textEnd : 0;
347
- }
348
-
349
- // Helper: extract context before a position
350
- function getContextBefore(position: number, maxLength: number = 150): string {
351
- const beforeText = fullDocText.slice(Math.max(0, position - maxLength), position);
352
- const sentenceStart = beforeText.search(/[.!?]\s+[A-Z][^.!?]*$/);
353
- return sentenceStart >= 0
354
- ? beforeText.slice(sentenceStart + 2).trim()
355
- : beforeText.slice(-80).trim();
356
- }
357
-
358
- // Helper: extract context after a position
359
- function getContextAfter(position: number, maxLength: number = 150): string {
360
- const afterText = fullDocText.slice(position, position + maxLength);
361
- const sentenceEnd = afterText.search(/[.!?]\s/);
362
- return sentenceEnd >= 0
363
- ? afterText.slice(0, sentenceEnd + 1).trim()
364
- : afterText.slice(0, 80).trim();
365
- }
366
-
367
- // ========================================
368
- // STEP 2: Collect all start/end markers separately
369
- // ========================================
370
- const startPattern = /<w:commentRangeStart[^>]*w:id="(\d+)"[^>]*\/?>/g;
371
- const endPattern = /<w:commentRangeEnd[^>]*w:id="(\d+)"[^>]*\/?>/g;
372
-
373
- const starts = new Map<string, number>(); // id -> position after start tag
374
- const ends = new Map<string, number>(); // id -> position before end tag
375
-
376
- let match;
377
- while ((match = startPattern.exec(docXml)) !== null) {
378
- const id = match[1];
379
- if (!starts.has(id)) {
380
- starts.set(id, match.index + match[0].length);
381
- }
382
- }
383
-
384
- while ((match = endPattern.exec(docXml)) !== null) {
385
- const id = match[1];
386
- if (!ends.has(id)) {
387
- ends.set(id, match.index);
388
- }
389
- }
390
-
391
- // ========================================
392
- // STEP 3: Process each comment range by ID
393
- // ========================================
394
- for (const [id, startXmlPos] of starts) {
395
- const endXmlPos = ends.get(id);
396
-
397
- // Missing end marker - skip with warning
398
- if (endXmlPos === undefined) {
399
- console.warn(`Comment ${id}: missing end marker`);
400
- continue;
401
- }
402
-
403
- // Calculate text position
404
- const docPosition = xmlPosToTextPos(startXmlPos);
405
-
406
- // Handle empty or inverted ranges
407
- if (endXmlPos <= startXmlPos) {
408
- anchors.set(id, {
409
- anchor: '',
410
- before: getContextBefore(docPosition),
411
- after: getContextAfter(docPosition),
412
- docPosition,
413
- docLength: fullDocText.length,
414
- isEmpty: true
415
- });
416
- continue;
417
- }
418
-
419
- // Extract XML segment between markers
420
- const segment = docXml.slice(startXmlPos, endXmlPos);
421
-
422
- // Extract text from w:t (regular) AND w:delText (deleted text in track changes)
423
- const textInRangePattern = /<w:t[^>]*>([^<]*)<\/w:t>|<w:delText[^>]*>([^<]*)<\/w:delText>/g;
424
- let anchorText = '';
425
- let tm;
426
- while ((tm = textInRangePattern.exec(segment)) !== null) {
427
- anchorText += tm[1] || tm[2] || '';
428
- }
429
- anchorText = decodeXmlEntities(anchorText);
430
-
431
- // Get context
432
- const anchorLength = anchorText.length;
433
- const before = getContextBefore(docPosition);
434
- const after = getContextAfter(docPosition + anchorLength);
435
-
436
- // ALWAYS add entry (even if anchor is empty)
437
- anchors.set(id, {
438
- anchor: anchorText.trim(),
439
- before,
440
- after,
441
- docPosition,
442
- docLength: fullDocText.length,
443
- isEmpty: !anchorText.trim()
444
- });
445
- }
446
- } catch (err: any) {
447
- console.error('Error extracting comment anchors:', err.message);
448
- return { anchors, fullDocText: '' };
449
- }
450
-
451
- return { anchors, fullDocText };
452
- }
453
-
454
- /**
455
- * Decode XML entities in text
456
- */
457
- function decodeXmlEntities(text: string): string {
458
- return text
459
- .replace(/&amp;/g, '&')
460
- .replace(/&lt;/g, '<')
461
- .replace(/&gt;/g, '>')
462
- .replace(/&quot;/g, '"')
463
- .replace(/&apos;/g, "'")
464
- .replace(/&#(\d+);/g, (_, code) => String.fromCharCode(parseInt(code, 10)))
465
- .replace(/&#x([0-9a-fA-F]+);/g, (_, code) => String.fromCharCode(parseInt(code, 16)));
466
- }
467
-
468
- /**
469
- * Extract text content from a Word XML cell
470
- */
471
- function extractCellText(cellXml: string): string {
472
- const parts: string[] = [];
473
-
474
- // Check for OMML math - replace with [math] placeholder
475
- if (cellXml.includes('<m:oMath')) {
476
- // Try to extract the text representation of math
477
- const mathTextMatches = cellXml.match(/<m:t>([^<]*)<\/m:t>/g) || [];
478
- if (mathTextMatches.length > 0) {
479
- const mathText = mathTextMatches.map((t) => t.replace(/<[^>]+>/g, '')).join('');
480
- parts.push(mathText);
481
- } else {
482
- parts.push('[math]');
483
- }
484
- }
485
-
486
- // Extract regular text from w:t elements
487
- const textMatches = cellXml.match(/<w:t[^>]*>([^<]*)<\/w:t>/g) || [];
488
- for (const match of textMatches) {
489
- const text = match.replace(/<[^>]+>/g, '');
490
- if (text) {
491
- parts.push(text);
492
- }
493
- }
494
-
495
- let result = parts.join('').trim();
496
- result = decodeXmlEntities(result);
497
-
498
- // Escape pipe characters in cell content (would break table)
499
- result = result.replace(/\|/g, '\\|');
500
-
501
- return result;
502
- }
503
-
504
- /**
505
- * Parse a table row, handling merged cells (gridSpan)
506
- */
507
- function parseTableRow(rowXml: string, expectedCols: number): ParsedRow {
508
- // Match cells - handle both <w:tc> and <w:tc ...>
509
- const cellMatches = rowXml.match(/<w:tc(?:\s[^>]*)?>[\s\S]*?<\/w:tc>/g) || [];
510
- const cells: string[] = [];
511
- const colSpans: number[] = [];
512
-
513
- for (const cellXml of cellMatches) {
514
- // Check for horizontal merge (gridSpan)
515
- const gridSpanMatch = cellXml.match(/<w:gridSpan\s+w:val="(\d+)"/);
516
- const span = gridSpanMatch ? parseInt(gridSpanMatch[1], 10) : 1;
517
-
518
- // Check for vertical merge continuation (vMerge without restart)
519
- // If vMerge is present without w:val="restart", it's a continuation - use empty
520
- const vMergeMatch = cellXml.match(/<w:vMerge(?:\s+w:val="([^"]+)")?/);
521
- const isVMergeContinuation = vMergeMatch && vMergeMatch[1] !== 'restart';
522
-
523
- const cellText = isVMergeContinuation ? '' : extractCellText(cellXml);
524
-
525
- // Add the cell content
526
- cells.push(cellText);
527
- colSpans.push(span);
528
-
529
- // For gridSpan > 1, add empty cells to maintain column alignment
530
- for (let i = 1; i < span; i++) {
531
- cells.push('');
532
- colSpans.push(0); // 0 indicates this is a spanned cell
533
- }
534
- }
535
-
536
- return { cells, colSpans };
537
- }
538
-
539
- /**
540
- * Determine table grid column count from table XML
541
- */
542
- function getTableGridCols(tableXml: string): number {
543
- // Try to get from tblGrid
544
- const gridColMatches = tableXml.match(/<w:gridCol/g) || [];
545
- if (gridColMatches.length > 0) {
546
- return gridColMatches.length;
547
- }
548
-
549
- // Fallback: count max cells in any row
550
- const rowMatches = tableXml.match(/<w:tr[\s\S]*?<\/w:tr>/g) || [];
551
- let maxCols = 0;
552
- for (const rowXml of rowMatches) {
553
- const { cells } = parseTableRow(rowXml, 0);
554
- maxCols = Math.max(maxCols, cells.length);
555
- }
556
- return maxCols;
557
- }
558
-
559
- /**
560
- * Extract tables directly from Word document XML and convert to markdown pipe tables
561
- */
562
- export async function extractWordTables(docxPath: string): Promise<WordTable[]> {
563
- const AdmZip = (await import('adm-zip')).default;
564
- const tables: WordTable[] = [];
565
-
566
- try {
567
- const zip = new AdmZip(docxPath);
568
- const docEntry = zip.getEntry('word/document.xml');
569
-
570
- if (!docEntry) {
571
- return tables;
572
- }
573
-
574
- const xml = docEntry.getData().toString('utf8');
575
-
576
- // Find all table elements
577
- const tableMatches = xml.match(/<w:tbl>[\s\S]*?<\/w:tbl>/g) || [];
578
-
579
- for (const tableXml of tableMatches) {
580
- // Determine expected column count from grid
581
- const expectedCols = getTableGridCols(tableXml);
582
-
583
- // Extract rows
584
- const rowMatches = tableXml.match(/<w:tr[\s\S]*?<\/w:tr>/g) || [];
585
- const rows: string[][] = [];
586
-
587
- for (const rowXml of rowMatches) {
588
- const { cells } = parseTableRow(rowXml, expectedCols);
589
- if (cells.length > 0) {
590
- rows.push(cells);
591
- }
592
- }
593
-
594
- if (rows.length > 0) {
595
- // Convert to markdown pipe table
596
- const markdown = convertRowsToMarkdownTable(rows);
597
- tables.push({ markdown, rowCount: rows.length, colCount: expectedCols || rows[0]?.length || 0 });
598
- }
599
- }
600
- } catch (err: any) {
601
- console.error('Error extracting tables from Word:', err.message);
602
- }
603
-
604
- return tables;
605
- }
606
-
607
- /**
608
- * Convert array of rows (each row is array of cell strings) to markdown pipe table
609
- */
610
- function convertRowsToMarkdownTable(rows: string[][]): string {
611
- if (rows.length === 0) return '';
612
-
613
- // Normalize column count (use max across all rows)
614
- const colCount = Math.max(...rows.map((r) => r.length));
615
-
616
- // Pad rows to have consistent column count
617
- const normalizedRows = rows.map((row) => {
618
- while (row.length < colCount) {
619
- row.push('');
620
- }
621
- return row;
622
- });
623
-
624
- // Build markdown table
625
- const lines: string[] = [];
626
-
627
- // Header row
628
- const header = normalizedRows[0];
629
- lines.push('| ' + header.join(' | ') + ' |');
630
-
631
- // Separator row
632
- lines.push('|' + header.map(() => '---').join('|') + '|');
633
-
634
- // Data rows
635
- for (let i = 1; i < normalizedRows.length; i++) {
636
- lines.push('| ' + normalizedRows[i].join(' | ') + ' |');
637
- }
638
-
639
- return lines.join('\n');
640
- }
641
-
642
- /**
643
- * Extract text from Word document using pandoc with track changes preserved
644
- */
645
- export async function extractFromWord(
646
- docxPath: string,
647
- options: ExtractFromWordOptions = {}
648
- ): Promise<ExtractFromWordResult> {
649
- let text: string;
650
- let messages: ExtractMessage[] = [];
651
- let extractedMedia: string[] = [];
652
- let hasTrackChanges = false;
653
- let trackChangeStats = { insertions: 0, deletions: 0 };
654
-
655
- // Determine media extraction directory
656
- const docxDir = path.dirname(docxPath);
657
- const mediaDir = options.mediaDir || path.join(docxDir, 'media');
658
-
659
- // Skip media extraction if figures already exist (e.g., when re-importing with existing source)
660
- const skipMediaExtraction = options.skipMediaExtraction || false;
661
-
662
- // Extract tables directly from Word XML (reliable, no heuristics)
663
- const wordTables = await extractWordTables(docxPath);
664
-
665
- // Try pandoc first with --track-changes=all to preserve reviewer edits
666
- try {
667
- // Build pandoc command
668
- let pandocCmd = `pandoc "${docxPath}" -t markdown --wrap=none --track-changes=all`;
669
- if (!skipMediaExtraction) {
670
- pandocCmd += ` --extract-media="${mediaDir}"`;
671
- }
672
-
673
- const { stdout } = await execAsync(pandocCmd, { maxBuffer: 50 * 1024 * 1024 });
674
- text = stdout;
675
-
676
- // Convert pandoc's track change format to CriticMarkup
677
- const origLength = text.length;
678
-
679
- // Use a more robust pattern that handles nested content
680
- text = text.replace(/\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]\{\.insertion[^}]*\}/g, (match, content) => {
681
- if (content.trim()) {
682
- trackChangeStats.insertions++;
683
- return `{++${content}++}`;
684
- }
685
- return ''; // Empty insertions are removed
686
- });
687
-
688
- text = text.replace(/\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]\{\.deletion[^}]*\}/g, (match, content) => {
689
- if (content.trim()) {
690
- trackChangeStats.deletions++;
691
- return `{--${content}--}`;
692
- }
693
- return ''; // Empty deletions are removed
694
- });
695
-
696
- // Handle any remaining pandoc track change patterns
697
- let prevText;
698
- do {
699
- prevText = text;
700
- text = text.replace(/\[([^\]]*)\]\{\.insertion[^}]*\}/g, (match, content) => {
701
- if (content.trim()) {
702
- trackChangeStats.insertions++;
703
- return `{++${content}++}`;
704
- }
705
- return '';
706
- });
707
- text = text.replace(/\[([^\]]*)\]\{\.deletion[^}]*\}/g, (match, content) => {
708
- if (content.trim()) {
709
- trackChangeStats.deletions++;
710
- return `{--${content}--}`;
711
- }
712
- return '';
713
- });
714
- } while (text !== prevText);
715
-
716
- // Handle pandoc comment patterns - remove comment text from body
717
- text = text.replace(/\[[^\]]*\]\{\.comment-start[^}]*\}/g, '');
718
- text = text.replace(/\[\]\{\.comment-end[^}]*\}/g, '');
719
-
720
- // Also handle {.mark} spans
721
- text = text.replace(/\[([^\]]*)\]\{\.mark\}/g, '$1');
722
-
723
- hasTrackChanges = trackChangeStats.insertions > 0 || trackChangeStats.deletions > 0;
724
-
725
- if (hasTrackChanges) {
726
- messages.push({
727
- type: 'info',
728
- message: `Found ${trackChangeStats.insertions} insertion(s) and ${trackChangeStats.deletions} deletion(s) from track changes`
729
- });
730
- }
731
-
732
- // Find extracted media files
733
- const mediaSubdir = path.join(mediaDir, 'media');
734
- if (fs.existsSync(mediaSubdir)) {
735
- extractedMedia = fs.readdirSync(mediaSubdir)
736
- .filter(f => /\.(png|jpg|jpeg|gif|svg|emf|wmf|tiff?)$/i.test(f))
737
- .map(f => path.join(mediaSubdir, f));
738
-
739
- if (extractedMedia.length > 0) {
740
- messages.push({
741
- type: 'info',
742
- message: `Extracted ${extractedMedia.length} image(s) to ${mediaSubdir}`
743
- });
744
- }
745
- }
746
- } catch (pandocErr: any) {
747
- // Pandoc not available — use XML-based extraction with track change support
748
- const { extractPlainTextWithTrackChanges } = await import('./word.js');
749
- const { getInstallInstructions } = await import('./dependencies.js');
750
- const installCmd = getInstallInstructions('pandoc');
751
-
752
- const xmlResult = await extractPlainTextWithTrackChanges(docxPath);
753
- text = xmlResult.text;
754
- hasTrackChanges = xmlResult.hasTrackChanges;
755
- trackChangeStats = xmlResult.stats;
756
-
757
- if (hasTrackChanges) {
758
- messages.push({
759
- type: 'warning',
760
- message: `Pandoc not installed. Using built-in XML extractor (${trackChangeStats.insertions} insertions, ${trackChangeStats.deletions} deletions preserved). Formatting may differ. Install pandoc for best results: ${installCmd}`
761
- });
762
- } else {
763
- messages.push({
764
- type: 'warning',
765
- message: `Pandoc not installed. Using built-in XML extractor (no track changes found). Install pandoc for better formatting: ${installCmd}`
766
- });
767
- }
768
- }
769
-
770
- // Extract comments directly from docx XML
771
- const comments = await extractWordComments(docxPath);
772
-
773
- // Extract comment anchor texts
774
- const { anchors } = await extractCommentAnchors(docxPath);
775
-
776
- return {
777
- text,
778
- comments,
779
- anchors,
780
- messages,
781
- extractedMedia,
782
- tables: wordTables,
783
- hasTrackChanges,
784
- trackChangeStats,
785
- };
786
- }
787
-
788
178
  /**
789
179
  * Insert comments into markdown text based on anchor texts with context
790
180
  */
@@ -794,165 +184,14 @@ export function insertCommentsIntoMarkdown(
794
184
  anchors: Map<string, CommentAnchorData | string>,
795
185
  options: InsertCommentsOptions = {}
796
186
  ): string {
797
- const { quiet = false, sectionBoundary = null } = options;
187
+ const { quiet = false, sectionBoundary = null, wrapAnchor = true } = options;
798
188
  let result = markdown;
799
189
  let unmatchedCount = 0;
800
190
  const duplicateWarnings: string[] = [];
801
191
  const usedPositions = new Set<number>(); // For tie-breaking: track used positions
802
192
 
803
- // Helper: Strip CriticMarkup from text to get "clean" version for matching
804
- function stripCriticMarkup(text: string): string {
805
- return text
806
- .replace(/\{\+\+([^+]*)\+\+\}/g, '$1') // insertions: keep inserted text
807
- .replace(/\{--([^-]*)--\}/g, '') // deletions: remove deleted text
808
- .replace(/\{~~([^~]*)~>([^~]*)~~\}/g, '$2') // substitutions: keep new text
809
- .replace(/\{>>[^<]*<<\}/g, '') // comments: remove
810
- .replace(/\[([^\]]*)\]\{\.mark\}/g, '$1'); // marked text: keep text
811
- }
812
-
813
- // Helper: Find anchor in text with multiple fallback strategies
814
- function findAnchorInText(anchor: string, text: string, before: string = '', after: string = ''): AnchorSearchResult {
815
- // If anchor is empty, skip directly to context-based matching
816
- if (!anchor || anchor.trim().length === 0) {
817
- // Jump to context-based strategies (Strategy 5)
818
- if (before || after) {
819
- const beforeLower = (before || '').toLowerCase();
820
- const afterLower = (after || '').toLowerCase();
821
- const textLower = text.toLowerCase();
822
-
823
- if (before && after) {
824
- const beforeIdx = textLower.indexOf(beforeLower.slice(-50));
825
- if (beforeIdx !== -1) {
826
- const searchStart = beforeIdx + beforeLower.slice(-50).length;
827
- const afterIdx = textLower.indexOf(afterLower.slice(0, 50), searchStart);
828
- if (afterIdx !== -1 && afterIdx - searchStart < 500) {
829
- return { occurrences: [searchStart], matchedAnchor: null, strategy: 'context-both' };
830
- }
831
- }
832
- }
833
-
834
- if (before) {
835
- const beforeIdx = textLower.lastIndexOf(beforeLower.slice(-30));
836
- if (beforeIdx !== -1) {
837
- return { occurrences: [beforeIdx + beforeLower.slice(-30).length], matchedAnchor: null, strategy: 'context-before' };
838
- }
839
- }
840
-
841
- if (after) {
842
- const afterIdx = textLower.indexOf(afterLower.slice(0, 30));
843
- if (afterIdx !== -1) {
844
- return { occurrences: [afterIdx], matchedAnchor: null, strategy: 'context-after' };
845
- }
846
- }
847
- }
848
- return { occurrences: [], matchedAnchor: null, strategy: 'empty-anchor' };
849
- }
850
-
851
- const anchorLower = anchor.toLowerCase();
852
- const textLower = text.toLowerCase();
853
-
854
- // Strategy 1: Direct match
855
- let occurrences = findAllOccurrences(textLower, anchorLower);
856
- if (occurrences.length > 0) {
857
- return { occurrences, matchedAnchor: anchor, strategy: 'direct' };
858
- }
859
-
860
- // Strategy 2: Normalized whitespace
861
- const normalizedAnchor = anchor.replace(/\s+/g, ' ').toLowerCase();
862
- const normalizedText = text.replace(/\s+/g, ' ').toLowerCase();
863
- let idx = normalizedText.indexOf(normalizedAnchor);
864
- if (idx !== -1) {
865
- return { occurrences: [idx], matchedAnchor: anchor, strategy: 'normalized' };
866
- }
867
-
868
- // Strategy 3: Try matching in stripped CriticMarkup version
869
- const strippedText = stripCriticMarkup(text);
870
- const strippedLower = strippedText.toLowerCase();
871
- occurrences = findAllOccurrences(strippedLower, anchorLower);
872
- if (occurrences.length > 0) {
873
- return { occurrences, matchedAnchor: anchor, strategy: 'stripped', stripped: true };
874
- }
875
-
876
- // Strategy 4: First N words of anchor (for long anchors)
877
- const words = anchor.split(/\s+/);
878
- if (words.length > 3) {
879
- for (let n = Math.min(6, words.length); n >= 3; n--) {
880
- const partialAnchor = words.slice(0, n).join(' ').toLowerCase();
881
- if (partialAnchor.length >= 15) {
882
- occurrences = findAllOccurrences(textLower, partialAnchor);
883
- if (occurrences.length > 0) {
884
- return { occurrences, matchedAnchor: words.slice(0, n).join(' '), strategy: 'partial-start' };
885
- }
886
- occurrences = findAllOccurrences(strippedLower, partialAnchor);
887
- if (occurrences.length > 0) {
888
- return { occurrences, matchedAnchor: words.slice(0, n).join(' '), strategy: 'partial-start-stripped', stripped: true };
889
- }
890
- }
891
- }
892
- }
893
-
894
- // Strategy 5: Use context (before/after) to find approximate position
895
- if (before || after) {
896
- const beforeLower = before.toLowerCase();
897
- const afterLower = after.toLowerCase();
898
-
899
- if (before && after) {
900
- const beforeIdx = textLower.indexOf(beforeLower.slice(-50));
901
- if (beforeIdx !== -1) {
902
- const searchStart = beforeIdx + beforeLower.slice(-50).length;
903
- const afterIdx = textLower.indexOf(afterLower.slice(0, 50), searchStart);
904
- if (afterIdx !== -1 && afterIdx - searchStart < 500) {
905
- return { occurrences: [searchStart], matchedAnchor: null, strategy: 'context-both' };
906
- }
907
- }
908
- }
909
-
910
- if (before) {
911
- const beforeIdx = textLower.lastIndexOf(beforeLower.slice(-30));
912
- if (beforeIdx !== -1) {
913
- return { occurrences: [beforeIdx + beforeLower.slice(-30).length], matchedAnchor: null, strategy: 'context-before' };
914
- }
915
- }
916
-
917
- if (after) {
918
- const afterIdx = textLower.indexOf(afterLower.slice(0, 30));
919
- if (afterIdx !== -1) {
920
- return { occurrences: [afterIdx], matchedAnchor: null, strategy: 'context-after' };
921
- }
922
- }
923
- }
924
-
925
- // Strategy 6: Try splitting anchor on common transition words
926
- const splitPatterns = [' ', ', ', '. ', ' - ', ' – '];
927
- for (const sep of splitPatterns) {
928
- if (anchor.includes(sep)) {
929
- const parts = anchor.split(sep).filter(p => p.length >= 4);
930
- for (const part of parts) {
931
- const partLower = part.toLowerCase();
932
- occurrences = findAllOccurrences(textLower, partLower);
933
- if (occurrences.length > 0 && occurrences.length < 5) {
934
- return { occurrences, matchedAnchor: part, strategy: 'split-match' };
935
- }
936
- }
937
- }
938
- }
939
-
940
- return { occurrences: [], matchedAnchor: null, strategy: 'failed' };
941
- }
942
-
943
- // Helper: Find all occurrences of needle in haystack
944
- function findAllOccurrences(haystack: string, needle: string): number[] {
945
- if (!needle || needle.length === 0) {
946
- return [];
947
- }
948
- const occurrences: number[] = [];
949
- let idx = 0;
950
- while ((idx = haystack.indexOf(needle, idx)) !== -1) {
951
- occurrences.push(idx);
952
- idx += 1;
953
- }
954
- return occurrences;
955
- }
193
+ // Anchor matching primitives live in lib/anchor-match.ts so that
194
+ // `rev verify-anchors` can use the same strategies for drift reporting.
956
195
 
957
196
  // Get all positions in order (for sequential tie-breaking)
958
197
  const commentsWithPositions = comments.map((c): CommentWithPos => {
@@ -1108,18 +347,24 @@ export function insertCommentsIntoMarkdown(
1108
347
  // Sort by position descending (insert from end to avoid offset issues)
1109
348
  matched.sort((a, b) => b.pos - a.pos);
1110
349
 
1111
- // Insert each comment with anchor marking
350
+ // Insert each comment. With `wrapAnchor` (the default), the anchor text
351
+ // gets wrapped in `[anchor]{.mark}` so the rebuilt docx restores the
352
+ // original Word comment range. Without it, the comment block is inserted
353
+ // adjacent to the anchor and prose stays untouched — required for
354
+ // comments-only sync where multiple comments may share one anchor.
1112
355
  for (const c of matched) {
1113
356
  const comment = `{>>${c.author}: ${c.text}<<}`;
1114
- if (c.anchorText && c.anchorEnd) {
1115
- // Replace anchor text with: {>>comment<<}[anchor]{.mark}
357
+ if (wrapAnchor && c.anchorText && c.anchorEnd) {
1116
358
  const before = result.slice(0, c.pos);
1117
359
  const anchor = result.slice(c.pos, c.anchorEnd);
1118
360
  const after = result.slice(c.anchorEnd);
1119
361
  result = before + comment + `[${anchor}]{.mark}` + after;
1120
362
  } else {
1121
- // No anchor - just insert comment at position
1122
- result = result.slice(0, c.pos) + ` ${comment}` + result.slice(c.pos);
363
+ // Insert comment at the anchor position with no surrounding whitespace
364
+ // tweaks; CriticMarkup blocks are invisible to readers, and adding a
365
+ // leading space would shift prose byte-for-byte (relevant when callers
366
+ // verify that --comments-only didn't touch the original).
367
+ result = result.slice(0, c.pos) + comment + result.slice(c.pos);
1123
368
  }
1124
369
  }
1125
370
 
@@ -1139,647 +384,6 @@ export function insertCommentsIntoMarkdown(
1139
384
  return result;
1140
385
  }
1141
386
 
1142
- /**
1143
- * Fix citation and math annotations by preserving original markdown syntax
1144
- */
1145
- function fixCitationAnnotations(text: string, originalMd: string): string {
1146
- // Fix math annotations - preserve inline and display math
1147
- text = text.replace(/\{--(\$[^$]+\$)--\}/g, '$1');
1148
- text = text.replace(/\{--(\$\$[^$]+\$\$)--\}/g, '$1');
1149
-
1150
- text = text.replace(/\{~~(\$[^$]+\$)~>[^~]+~~\}/g, '$1');
1151
- text = text.replace(/\{~~(\$\$[^$]+\$\$)~>[^~]+~~\}/g, '$1');
1152
-
1153
- // Extract all citations from original markdown
1154
- const citationPattern = /\[@[^\]]+\]/g;
1155
- const originalCitations = [...originalMd.matchAll(citationPattern)].map(m => m[0]);
1156
-
1157
- // Fix substitutions where left side has markdown citation
1158
- text = text.replace(/\{~~(\[@[^\]]+\])~>[^~]+~~\}/g, '$1');
1159
-
1160
- // Fix substitutions where left side STARTS with markdown citation
1161
- text = text.replace(/\{~~(\[@[^\]]+\])\s*([^~]*)~>([^~]*)~~\}/g, (match, cite, oldText, newText) => {
1162
- if (oldText.trim() === '' && newText.trim() === '') {
1163
- return cite;
1164
- }
1165
- if (oldText.trim() || newText.trim()) {
1166
- return cite + (oldText.trim() !== newText.trim() ? ` {~~${oldText.trim()}~>${newText.trim()}~~}` : ` ${newText}`);
1167
- }
1168
- return cite;
1169
- });
1170
-
1171
- // Fix deletions of markdown citations
1172
- text = text.replace(/\{--(\[@[^\]]+\])--\}/g, '$1');
1173
-
1174
- // Fix insertions of rendered citations
1175
- text = text.replace(/\{\+\+\([A-Z][^)]*\d{4}[^)]*\)\+\+\}/g, '');
1176
-
1177
- // Clean up broken multi-part substitutions
1178
- text = text.replace(/\{~~(@[A-Za-z]+\d{4})~>[^~]+~~\}/g, '[$1]');
1179
-
1180
- // Fix citations split across substitution boundaries
1181
- text = text.replace(/\{~~\[@~>[^~]*~~\}([A-Za-z]+\d{4})\]/g, '[@$1]');
1182
-
1183
- // Clean up any remaining partial citations
1184
- text = text.replace(/\{~~;\s*@([A-Za-z]+\d{4})\]~>[^~]*~~\}/g, '; [@$1]');
1185
-
1186
- // Remove rendered citation insertions (with Unicode support)
1187
- text = text.replace(/\{\+\+\(\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\+\+\}/gu, '');
1188
- text = text.replace(/\{\+\+\(\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');
1189
-
1190
- // Trailing citation fragments
1191
- text = text.replace(/\{\+\+\d{4}[a-z]?(?:[;,]\s*(?:\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+)?\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');
1192
- text = text.replace(/\{\+\+\d{4}[a-z]?(?:[;,]\s*(?:\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+)?\d{4}[a-z]?)*\)\s*\+\+\}/gu, '');
1193
-
1194
- // Just year with closing paren
1195
- text = text.replace(/\{\+\+\d{4}[a-z]?\)\.\s*\+\+\}/g, '');
1196
- text = text.replace(/\{\+\+\d{4}[a-z]?\)\s*\+\+\}/g, '');
1197
-
1198
- // Leading citation fragments
1199
- text = text.replace(/\{\+\+\(?\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s*\+\+\}/gu, '');
1200
-
1201
- // Semicolon-separated fragments
1202
- text = text.replace(/\{\+\+[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?\+\+\}/gu, '');
1203
-
1204
- // Year ranges with authors
1205
- text = text.replace(/\{\+\+\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\s*\+\+\}/gu, '');
1206
- text = text.replace(/\{\+\+\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');
1207
-
1208
- // Clean up double spaces and orphaned punctuation
1209
- text = text.replace(/ +/g, ' ');
1210
- text = text.replace(/\s+\./g, '.');
1211
- text = text.replace(/\s+,/g, ',');
1212
-
1213
- // Final cleanup - remove empty annotations
1214
- text = text.replace(/\{~~\s*~>\s*~~\}/g, '');
1215
- text = text.replace(/\{\+\+\s*\+\+\}/g, '');
1216
- text = text.replace(/\{--\s*--\}/g, '');
1217
-
1218
- return text;
1219
- }
1220
-
1221
- /**
1222
- * Strip markdown syntax to get plain text
1223
- */
1224
- function stripMarkdownSyntax(md: string): string {
1225
- return md
1226
- .replace(/^---[\s\S]*?---\n*/m, '')
1227
- .replace(/^#{1,6}\s+/gm, '')
1228
- .replace(/(\*\*|__)(.*?)\1/g, '$2')
1229
- .replace(/(\*|_)(.*?)\1/g, '$2')
1230
- .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
1231
- .replace(/!\[([^\]]*)\]\([^)]+\)/g, '')
1232
- .replace(/`([^`]+)`/g, '$1')
1233
- .replace(/```[\s\S]*?```/g, '')
1234
- .replace(/^>\s*/gm, '')
1235
- .replace(/^[-*_]{3,}\s*$/gm, '')
1236
- .replace(/^[\s]*[-*+]\s+/gm, '')
1237
- .replace(/^[\s]*\d+\.\s+/gm, '')
1238
- .replace(/\|/g, ' ')
1239
- .replace(/^[-:]+$/gm, '')
1240
- .replace(/\n{3,}/g, '\n\n')
1241
- .trim();
1242
- }
1243
-
1244
- /**
1245
- * Generate annotated markdown by diffing original MD against Word text
1246
- */
1247
- export function generateAnnotatedDiff(originalMd: string, wordText: string, author: string = 'Reviewer'): string {
1248
- const normalizedOriginal = normalizeWhitespace(originalMd);
1249
- const normalizedWord = normalizeWhitespace(wordText);
1250
-
1251
- const changes = diffWords(normalizedOriginal, normalizedWord);
1252
-
1253
- let result = '';
1254
-
1255
- for (const part of changes) {
1256
- if (part.added) {
1257
- result += `{++${part.value}++}`;
1258
- } else if (part.removed) {
1259
- result += `{--${part.value}--}`;
1260
- } else {
1261
- result += part.value;
1262
- }
1263
- }
1264
-
1265
- return result;
1266
- }
1267
-
1268
- /**
1269
- * Inject Word tables (extracted from XML) into pandoc text output
1270
- */
1271
- function injectWordTables(pandocText: string, wordTables: WordTable[]): string {
1272
- if (!wordTables || wordTables.length === 0) {
1273
- return pandocText;
1274
- }
1275
-
1276
- let result = pandocText;
1277
-
1278
- for (const table of wordTables) {
1279
- const firstLine = table.markdown.split('\n')[0];
1280
- const headerCells = firstLine
1281
- .split('|')
1282
- .map((c) => c.trim())
1283
- .filter((c) => c.length > 0);
1284
-
1285
- if (headerCells.length === 0) continue;
1286
-
1287
- const firstCell = headerCells[0];
1288
- const startIdx = result.indexOf(firstCell);
1289
-
1290
- if (startIdx === -1) continue;
1291
-
1292
- const lastLine = table.markdown.split('\n').pop();
1293
- const lastCells = lastLine!
1294
- .split('|')
1295
- .map((c) => c.trim())
1296
- .filter((c) => c.length > 0);
1297
- const lastCell = lastCells[lastCells.length - 1] || lastCells[0];
1298
-
1299
- const endIdx = result.indexOf(lastCell, startIdx);
1300
- if (endIdx === -1) continue;
1301
-
1302
- let regionStart = result.lastIndexOf('\n\n', startIdx);
1303
- if (regionStart === -1) regionStart = 0;
1304
- else regionStart += 2;
1305
-
1306
- let regionEnd = result.indexOf('\n\n', endIdx + lastCell.length);
1307
- if (regionEnd === -1) regionEnd = result.length;
1308
-
1309
- result = result.slice(0, regionStart) + table.markdown + '\n\n' + result.slice(regionEnd);
1310
- }
1311
-
1312
- return result;
1313
- }
1314
-
1315
- /**
1316
- * Smart paragraph-level diff that preserves markdown structure
1317
- */
1318
- export function generateSmartDiff(
1319
- originalMd: string,
1320
- wordText: string,
1321
- author: string = 'Reviewer',
1322
- options: GenerateSmartDiffOptions = {}
1323
- ): string {
1324
- const { wordTables = [], imageRegistry = null } = options;
1325
-
1326
- // Inject Word tables into pandoc output
1327
- let wordTextWithTables = injectWordTables(wordText, wordTables);
1328
-
1329
- // Protect markdown tables
1330
- const { text: mdWithTablesProtected, tables } = protectTables(originalMd);
1331
-
1332
- // Also protect tables in Word text
1333
- const { text: wordWithTablesProtected, tables: wordTableBlocks } = protectTables(wordTextWithTables);
1334
-
1335
- // Protect images
1336
- const { text: mdWithImagesProtected, images: origImages } = protectImages(mdWithTablesProtected, imageRegistry);
1337
-
1338
- const { text: wordWithImagesProtected, images: wordImages } = protectImages(wordWithTablesProtected, imageRegistry);
1339
-
1340
- // Match Word images to original images
1341
- const imageMapping = matchWordImagesToOriginal(origImages, wordImages, imageRegistry);
1342
-
1343
- // Replace Word image placeholders with matching original placeholders
1344
- let wordWithMappedImages = wordWithImagesProtected;
1345
- for (const [wordPlaceholder, origPlaceholder] of imageMapping) {
1346
- wordWithMappedImages = wordWithMappedImages.split(wordPlaceholder).join(origPlaceholder);
1347
- }
1348
-
1349
- // Protect figure/table anchors
1350
- const { text: mdWithAnchorsProtected, anchors: figAnchors } = protectAnchors(mdWithImagesProtected);
1351
-
1352
- // Protect cross-references
1353
- const { text: mdWithXrefsProtected, crossrefs } = protectCrossrefs(mdWithAnchorsProtected);
1354
-
1355
- // Protect math
1356
- const { text: mdWithMathProtected, mathBlocks } = protectMath(mdWithXrefsProtected);
1357
-
1358
- // Protect citations
1359
- const { text: mdProtected, citations } = protectCitations(mdWithMathProtected);
1360
-
1361
- // Replace rendered elements in Word text
1362
- let wordProtected = wordWithMappedImages;
1363
- wordProtected = replaceRenderedMath(wordProtected, mathBlocks);
1364
- wordProtected = replaceRenderedCitations(wordProtected, citations.length);
1365
-
1366
- // Split into paragraphs
1367
- const originalParas = mdProtected.split(/\n\n+/);
1368
- const wordParas = wordProtected.split(/\n\n+/);
1369
-
1370
- const result: string[] = [];
1371
-
1372
- // Try to match paragraphs intelligently
1373
- let wordIdx = 0;
1374
-
1375
- for (let i = 0; i < originalParas.length; i++) {
1376
- const orig = originalParas[i] || '';
1377
- const { prefix: mdPrefix, content: origContent } = extractMarkdownPrefix(orig.split('\n')[0]);
1378
-
1379
- // Find best matching word paragraph
1380
- let bestMatch = -1;
1381
- let bestScore = 0;
1382
-
1383
- for (let j = wordIdx; j < Math.min(wordIdx + 3, wordParas.length); j++) {
1384
- const wordPara = wordParas[j] || '';
1385
- const origWords = new Set(origContent.toLowerCase().split(/\s+/));
1386
- const wordWords = wordPara.toLowerCase().split(/\s+/);
1387
- const common = wordWords.filter((w) => origWords.has(w)).length;
1388
- const score = common / Math.max(origWords.size, wordWords.length);
1389
-
1390
- if (score > bestScore && score > 0.3) {
1391
- bestScore = score;
1392
- bestMatch = j;
1393
- }
1394
- }
1395
-
1396
- if (bestMatch === -1) {
1397
- if (mdPrefix && wordIdx < wordParas.length) {
1398
- const wordPara = wordParas[wordIdx];
1399
- if (wordPara.toLowerCase().includes(origContent.toLowerCase().slice(0, 20))) {
1400
- bestMatch = wordIdx;
1401
- }
1402
- }
1403
- }
1404
-
1405
- if (bestMatch >= 0) {
1406
- const word = wordParas[bestMatch];
1407
-
1408
- const origStripped = stripMarkdownSyntax(orig);
1409
- const wordNormalized = normalizeWhitespace(word);
1410
-
1411
- if (origStripped === wordNormalized) {
1412
- result.push(orig);
1413
- } else {
1414
- const changes = diffWords(origStripped, wordNormalized);
1415
- let annotated = mdPrefix;
1416
-
1417
- for (const part of changes) {
1418
- if (part.added) {
1419
- annotated += `{++${part.value}++}`;
1420
- } else if (part.removed) {
1421
- annotated += `{--${part.value}--}`;
1422
- } else {
1423
- annotated += part.value;
1424
- }
1425
- }
1426
-
1427
- result.push(annotated);
1428
- }
1429
-
1430
- wordIdx = bestMatch + 1;
1431
- } else {
1432
- // Paragraph deleted entirely
1433
- if (mdPrefix && mdPrefix.match(/^#{1,6}\s+/)) {
1434
- result.push(orig);
1435
- } else {
1436
- result.push(`{--${orig}--}`);
1437
- }
1438
- }
1439
- }
1440
-
1441
- // Any remaining word paragraphs are additions
1442
- for (let j = wordIdx; j < wordParas.length; j++) {
1443
- const word = wordParas[j];
1444
- if (word.trim()) {
1445
- result.push(`{++${word}++}`);
1446
- }
1447
- }
1448
-
1449
- // Restore protected content
1450
- let finalResult = result.join('\n\n');
1451
- finalResult = restoreCitations(finalResult, citations);
1452
- finalResult = restoreMath(finalResult, mathBlocks);
1453
- finalResult = restoreCrossrefs(finalResult, crossrefs);
1454
- finalResult = restoreAnchors(finalResult, figAnchors);
1455
- finalResult = restoreImages(finalResult, origImages);
1456
- finalResult = restoreImages(finalResult, wordImages);
1457
- finalResult = restoreTables(finalResult, tables);
1458
- finalResult = restoreTables(finalResult, wordTableBlocks);
1459
-
1460
- return finalResult;
1461
- }
1462
-
1463
- /**
1464
- * Clean up redundant adjacent annotations
1465
- */
1466
- export function cleanupAnnotations(text: string): string {
1467
- // Convert adjacent delete+insert to substitution
1468
- text = text.replace(/\{--(.+?)--\}\s*\{\+\+(.+?)\+\+\}/g, '{~~$1~>$2~~}');
1469
-
1470
- // Also handle insert+delete
1471
- text = text.replace(/\{\+\+(.+?)\+\+\}\s*\{--(.+?)--\}/g, '{~~$2~>$1~~}');
1472
-
1473
- // Fix malformed patterns
1474
- text = text.replace(/\{--([^}]+?)~>([^}]+?)~~\}/g, '{~~$1~>$2~~}');
1475
-
1476
- // Fix malformed substitutions that got split
1477
- text = text.replace(/\{~~([^~]+)\s*--\}/g, '{--$1--}');
1478
- text = text.replace(/\{\+\+([^+]+)~~\}/g, '{++$1++}');
1479
-
1480
- // Clean up empty annotations
1481
- text = text.replace(/\{--\s*--\}/g, '');
1482
- text = text.replace(/\{\+\+\s*\+\+\}/g, '');
1483
-
1484
- // Clean up double spaces in prose, but preserve table formatting
1485
- const lines = text.split('\n');
1486
- let inTable = false;
1487
-
1488
- const processedLines = lines.map((line, idx) => {
1489
- const isSeparator = /^[-]+(\s+[-]+)+\s*$/.test(line.trim());
1490
-
1491
- const looksLikeTableRow = /\S+\s{2,}\S+/.test(line);
1492
-
1493
- if (isSeparator) {
1494
- if (!inTable) {
1495
- inTable = true;
1496
- }
1497
- return line;
1498
- }
1499
-
1500
- if (inTable) {
1501
- if (line.trim() === '') {
1502
- let lookAhead = idx + 1;
1503
- let foundTableContent = false;
1504
- let foundEndSeparator = false;
1505
-
1506
- while (lookAhead < lines.length && lookAhead < idx + 20) {
1507
- const nextLine = lines[lookAhead].trim();
1508
-
1509
- if (nextLine === '') {
1510
- lookAhead++;
1511
- continue;
1512
- }
1513
-
1514
- if (/^[-]+(\s+[-]+)+\s*$/.test(nextLine)) {
1515
- foundEndSeparator = true;
1516
- break;
1517
- }
1518
-
1519
- if (/\S+\s{2,}\S+/.test(nextLine)) {
1520
- foundTableContent = true;
1521
- break;
1522
- }
1523
-
1524
- if (/^\*[^*]+\*\s*$/.test(nextLine)) {
1525
- foundTableContent = true;
1526
- break;
1527
- }
1528
-
1529
- if (lines[lookAhead].startsWith(' ')) {
1530
- lookAhead++;
1531
- continue;
1532
- }
1533
-
1534
- break;
1535
- }
1536
-
1537
- if (foundTableContent || foundEndSeparator) {
1538
- return line;
1539
- }
1540
-
1541
- inTable = false;
1542
- return line;
1543
- }
1544
-
1545
- return line;
1546
- }
1547
-
1548
- if (looksLikeTableRow) {
1549
- let nextIdx = idx + 1;
1550
- while (nextIdx < lines.length && lines[nextIdx].trim() === '') {
1551
- nextIdx++;
1552
- }
1553
- if (nextIdx < lines.length && /^[-]+(\s+[-]+)+\s*$/.test(lines[nextIdx].trim())) {
1554
- return line;
1555
- }
1556
- }
1557
-
1558
- if (line.trim().startsWith('|')) {
1559
- return line;
1560
- }
1561
-
1562
- return line.replace(/ +/g, ' ');
1563
- });
1564
- text = processedLines.join('\n');
1565
-
1566
- return text;
1567
- }
1568
-
1569
- /**
1570
- * Parse visible comment markers from Word text
1571
- */
1572
- export function parseVisibleComments(text: string): Array<{ author: string; text: string; position: number }> {
1573
- const comments: Array<{ author: string; text: string; position: number }> = [];
1574
- const pattern = /\[([^\]:]+):\s*([^\]]+)\]/g;
1575
-
1576
- let match;
1577
- while ((match = pattern.exec(text)) !== null) {
1578
- comments.push({
1579
- author: match[1].trim(),
1580
- text: match[2].trim(),
1581
- position: match.index,
1582
- });
1583
- }
1584
-
1585
- return comments;
1586
- }
1587
-
1588
- /**
1589
- * Convert visible comments to CriticMarkup format
1590
- */
1591
- export function convertVisibleComments(text: string): string {
1592
- return text.replace(/\[([^\]:]+):\s*([^\]]+)\]/g, '{>>$1: $2<<}');
1593
- }
1594
-
1595
- /**
1596
- * Restore pandoc-crossref figure/table references from Word-rendered format
1597
- */
1598
- export function restoreCrossrefFromWord(
1599
- text: string,
1600
- projectDir: string,
1601
- restoredLabels: Set<string> | null = null
1602
- ): RestoreCrossrefResult {
1603
- const messages: string[] = [];
1604
- let restored = 0;
1605
- let result = text;
1606
-
1607
- const registry = readImageRegistry(projectDir);
1608
-
1609
- if (!restoredLabels) {
1610
- restoredLabels = new Set<string>();
1611
- }
1612
-
1613
- // Pattern 1: [Figure]{.mark} [N]{.mark}
1614
- result = result.replace(/\[(Figure|Table|Fig\.?)\]\{\.mark\}\s*\[(\d+|S\d+)\]\{\.mark\}/gi, (match, type, num) => {
1615
- const prefix = type.toLowerCase().startsWith('tab') ? 'tbl' : 'fig';
1616
- if (registry) {
1617
- const entry = registry.byNumber?.get(`${prefix}:${num}`);
1618
- if (entry && entry.label) {
1619
- restored++;
1620
- return `@${prefix}:${entry.label}`;
1621
- }
1622
- }
1623
- restored++;
1624
- messages.push(`Restored ${type} ${num} (no label found, using placeholder)`);
1625
- return `@${prefix}:fig${num}`;
1626
- });
1627
-
1628
- // Pattern 2: Plain "Figure N" or "Fig. N"
1629
- result = result.replace(/(?<!!)\b(Figure|Fig\.?|Table|Tbl\.?)\s+(\d+|S\d+)\b(?!\s*:)/gi, (match, type, num) => {
1630
- const prefix = type.toLowerCase().startsWith('tab') ? 'tbl' : 'fig';
1631
- if (registry) {
1632
- const entry = registry.byNumber?.get(`${prefix}:${num}`);
1633
- if (entry && entry.label) {
1634
- restored++;
1635
- return `@${prefix}:${entry.label}`;
1636
- }
1637
- }
1638
- return match;
1639
- });
1640
-
1641
- // Pattern 3: Remove duplicate plain-text captions
1642
- result = result.replace(/(\!\[[^\]]+\]\([^)]+\)(?:\{[^}]*\})?)\s*\n+\s*(?:Figure|Fig\.?|Table|Tbl\.?)\s+\d+[:\.]?\s*[^\n]+/gi, '$1');
1643
-
1644
- // Pattern 4: Clean up image captions that start with "Figure N: "
1645
- result = result.replace(/!\[(Figure|Fig\.?|Table|Tbl\.?)\s+(\d+|S\d+)[:\.]?\s*([^\]]*)\]\(([^)]+)\)(?:\{[^}]*\})?/gi,
1646
- (match, type, num, caption, imgPath) => {
1647
- const prefix = type.toLowerCase().startsWith('tab') ? 'tbl' : 'fig';
1648
- const labelKey = `${prefix}:${num}`;
1649
-
1650
- if (registry) {
1651
- const entry = registry.byNumber?.get(labelKey);
1652
- if (entry) {
1653
- if (restoredLabels!.has(labelKey)) {
1654
- messages.push(`Skipped duplicate ${prefix}:${entry.label} (already restored)`);
1655
- return `![${entry.caption}](${entry.path})`;
1656
- }
1657
- restoredLabels!.add(labelKey);
1658
- restored++;
1659
- messages.push(`Restored image ${prefix}:${entry.label} from Figure ${num}`);
1660
- return `![${entry.caption}](${entry.path}){#${prefix}:${entry.label}}`;
1661
- }
1662
- }
1663
- const cleanCaption = caption.trim();
1664
- return `![${cleanCaption}](${imgPath})`;
1665
- });
1666
-
1667
- return { text: result, restored, messages, restoredLabels };
1668
- }
1669
-
1670
- /**
1671
- * Restore proper markdown image syntax from Word-extracted text using image registry
1672
- */
1673
- export function restoreImagesFromRegistry(
1674
- text: string,
1675
- projectDir: string,
1676
- restoredLabels: Set<string> | null = null
1677
- ): RestoreImagesResult {
1678
- const messages: string[] = [];
1679
- let restored = 0;
1680
-
1681
- const registry = readImageRegistry(projectDir);
1682
- if (!registry || !registry.figures || registry.figures.length === 0) {
1683
- return { text, restored: 0, messages: ['No image registry found'] };
1684
- }
1685
-
1686
- if (!restoredLabels) {
1687
- restoredLabels = new Set<string>();
1688
- }
1689
-
1690
- let result = text;
1691
-
1692
- // Pattern 1: Caption-like text
1693
- const captionPatterns = [
1694
- /@(fig|tbl):([a-zA-Z0-9_-]+):\s*([^\n]+)/gi,
1695
- /^(Figure|Fig\.?)\s+(\d+|S\d+)[.:]\s*([^\n]+)/gim,
1696
- /\|\s*@(fig|tbl):([a-zA-Z0-9_-]+):\s*([^|]+)\s*\|/gi,
1697
- ];
1698
-
1699
- // Fix @fig:label: caption patterns
1700
- result = result.replace(captionPatterns[0], (match, type, label, caption) => {
1701
- const key = `${type}:${label}`;
1702
- const entry = registry.byLabel.get(key);
1703
- if (entry) {
1704
- if (restoredLabels!.has(key)) {
1705
- messages.push(`Skipped duplicate ${key} (already restored)`);
1706
- return `![${entry.caption}](${entry.path})`;
1707
- }
1708
- restoredLabels!.add(key);
1709
- restored++;
1710
- messages.push(`Restored ${type}:${label} from registry`);
1711
- return `![${entry.caption}](${entry.path}){#${type}:${label}}`;
1712
- }
1713
- return match;
1714
- });
1715
-
1716
- // Fix table-wrapped captions
1717
- result = result.replace(captionPatterns[2], (match, type, label, caption) => {
1718
- const key = `${type}:${label}`;
1719
- const entry = registry.byLabel.get(key);
1720
- if (entry) {
1721
- if (restoredLabels!.has(key)) {
1722
- messages.push(`Skipped duplicate ${key} from table wrapper`);
1723
- return `![${entry.caption}](${entry.path})`;
1724
- }
1725
- restoredLabels!.add(key);
1726
- restored++;
1727
- messages.push(`Restored ${type}:${label} from table wrapper`);
1728
- return `![${entry.caption}](${entry.path}){#${type}:${label}}`;
1729
- }
1730
- return match;
1731
- });
1732
-
1733
- // Clean up empty table structures
1734
- result = result.replace(/\|\s*\|\s*\n\|:--:\|\s*\n/g, '');
1735
-
1736
- // Fix "Figure N:" standalone lines
1737
- result = result.replace(captionPatterns[1], (match, prefix, num, caption) => {
1738
- const numKey = `fig:${num}`;
1739
- const entry = registry.byNumber.get(numKey);
1740
- if (entry) {
1741
- const labelKey = `fig:${entry.label}`;
1742
- if (restoredLabels!.has(labelKey)) {
1743
- messages.push(`Skipped duplicate Figure ${num} (already restored)`);
1744
- return `![${entry.caption}](${entry.path})`;
1745
- }
1746
- restoredLabels!.add(labelKey);
1747
- restored++;
1748
- messages.push(`Restored Figure ${num} by number lookup`);
1749
- return `![${entry.caption}](${entry.path}){#fig:${entry.label}}`;
1750
- }
1751
- return match;
1752
- });
1753
-
1754
- // Fix generic media paths by matching caption text
1755
- const genericImagePattern = /!\[([^\]]*)\]\(media\/[^)]+\)/g;
1756
- result = result.replace(genericImagePattern, (match, caption) => {
1757
- if (!caption || caption.trim() === '') {
1758
- return match;
1759
- }
1760
-
1761
- const captionKey = caption.slice(0, 50).toLowerCase().trim();
1762
- const entry = registry.byCaption.get(captionKey);
1763
- if (entry) {
1764
- const labelKey = entry.label ? `${entry.type}:${entry.label}` : null;
1765
- if (labelKey && restoredLabels!.has(labelKey)) {
1766
- messages.push(`Skipped duplicate by caption match: ${captionKey.slice(0, 30)}...`);
1767
- return `![${entry.caption}](${entry.path})`;
1768
- }
1769
- if (labelKey) {
1770
- restoredLabels!.add(labelKey);
1771
- }
1772
- restored++;
1773
- messages.push(`Restored image by caption match: ${captionKey.slice(0, 30)}...`);
1774
- const anchor = (entry.label && !restoredLabels!.has(labelKey!)) ? `{#${entry.type}:${entry.label}}` : '';
1775
- return `![${entry.caption}](${entry.path})${anchor}`;
1776
- }
1777
- return match;
1778
- });
1779
-
1780
- return { text: result, restored, messages };
1781
- }
1782
-
1783
387
  /**
1784
388
  * Import Word document with track changes directly as CriticMarkup
1785
389
  */