docrev 0.9.11 → 0.9.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138)
  1. package/.claude/settings.local.json +9 -9
  2. package/.gitattributes +1 -1
  3. package/CHANGELOG.md +149 -149
  4. package/PLAN-tables-and-postprocess.md +850 -850
  5. package/README.md +391 -391
  6. package/bin/rev.js +11 -11
  7. package/bin/rev.ts +145 -145
  8. package/completions/rev.bash +127 -127
  9. package/completions/rev.ps1 +210 -210
  10. package/completions/rev.zsh +207 -207
  11. package/dev_notes/stress2/build_adversarial.ts +186 -186
  12. package/dev_notes/stress2/drift_matcher.ts +62 -62
  13. package/dev_notes/stress2/probe_anchors.ts +35 -35
  14. package/dev_notes/stress2/project/discussion.before.md +3 -3
  15. package/dev_notes/stress2/project/discussion.md +3 -3
  16. package/dev_notes/stress2/project/methods.before.md +20 -20
  17. package/dev_notes/stress2/project/methods.md +20 -20
  18. package/dev_notes/stress2/project/rev.yaml +5 -5
  19. package/dev_notes/stress2/project/sections.yaml +4 -4
  20. package/dev_notes/stress2/sections.yaml +5 -5
  21. package/dev_notes/stress2/trace_placement.ts +50 -50
  22. package/dev_notes/stresstest_boundaries.ts +27 -27
  23. package/dev_notes/stresstest_drift_apply.ts +43 -43
  24. package/dev_notes/stresstest_drift_compare.ts +43 -43
  25. package/dev_notes/stresstest_drift_v2.ts +54 -54
  26. package/dev_notes/stresstest_inspect.ts +54 -54
  27. package/dev_notes/stresstest_pstyle.ts +55 -55
  28. package/dev_notes/stresstest_section_debug.ts +23 -23
  29. package/dev_notes/stresstest_split.ts +70 -70
  30. package/dev_notes/stresstest_trace.ts +19 -19
  31. package/dev_notes/stresstest_verify_no_overwrite.ts +40 -40
  32. package/dist/lib/build.d.ts +50 -1
  33. package/dist/lib/build.d.ts.map +1 -1
  34. package/dist/lib/build.js +80 -30
  35. package/dist/lib/build.js.map +1 -1
  36. package/dist/lib/commands/build.d.ts.map +1 -1
  37. package/dist/lib/commands/build.js +38 -5
  38. package/dist/lib/commands/build.js.map +1 -1
  39. package/dist/lib/commands/utilities.js +164 -164
  40. package/dist/lib/commands/word-tools.js +8 -8
  41. package/dist/lib/grammar.js +3 -3
  42. package/dist/lib/import.d.ts.map +1 -1
  43. package/dist/lib/import.js +146 -24
  44. package/dist/lib/import.js.map +1 -1
  45. package/dist/lib/pdf-comments.js +44 -44
  46. package/dist/lib/plugins.js +57 -57
  47. package/dist/lib/pptx-themes.js +115 -115
  48. package/dist/lib/spelling.js +2 -2
  49. package/dist/lib/templates.js +387 -387
  50. package/dist/lib/themes.js +51 -51
  51. package/dist/lib/types.d.ts +20 -0
  52. package/dist/lib/types.d.ts.map +1 -1
  53. package/dist/lib/word-extraction.d.ts +6 -0
  54. package/dist/lib/word-extraction.d.ts.map +1 -1
  55. package/dist/lib/word-extraction.js +46 -3
  56. package/dist/lib/word-extraction.js.map +1 -1
  57. package/dist/lib/wordcomments.d.ts.map +1 -1
  58. package/dist/lib/wordcomments.js +23 -5
  59. package/dist/lib/wordcomments.js.map +1 -1
  60. package/eslint.config.js +27 -27
  61. package/lib/anchor-match.ts +276 -276
  62. package/lib/annotations.ts +644 -644
  63. package/lib/build.ts +1300 -1227
  64. package/lib/citations.ts +160 -160
  65. package/lib/commands/build.ts +833 -801
  66. package/lib/commands/citations.ts +515 -515
  67. package/lib/commands/comments.ts +1050 -1050
  68. package/lib/commands/context.ts +174 -174
  69. package/lib/commands/core.ts +309 -309
  70. package/lib/commands/doi.ts +435 -435
  71. package/lib/commands/file-ops.ts +372 -372
  72. package/lib/commands/history.ts +320 -320
  73. package/lib/commands/index.ts +87 -87
  74. package/lib/commands/init.ts +259 -259
  75. package/lib/commands/merge-resolve.ts +378 -378
  76. package/lib/commands/preview.ts +178 -178
  77. package/lib/commands/project-info.ts +244 -244
  78. package/lib/commands/quality.ts +517 -517
  79. package/lib/commands/response.ts +454 -454
  80. package/lib/commands/section-boundaries.ts +82 -82
  81. package/lib/commands/sections.ts +451 -451
  82. package/lib/commands/sync.ts +706 -706
  83. package/lib/commands/text-ops.ts +449 -449
  84. package/lib/commands/utilities.ts +448 -448
  85. package/lib/commands/verify-anchors.ts +272 -272
  86. package/lib/commands/word-tools.ts +340 -340
  87. package/lib/comment-realign.ts +517 -517
  88. package/lib/config.ts +84 -84
  89. package/lib/crossref.ts +781 -781
  90. package/lib/csl.ts +191 -191
  91. package/lib/dependencies.ts +98 -98
  92. package/lib/diff-engine.ts +465 -465
  93. package/lib/doi-cache.ts +115 -115
  94. package/lib/doi.ts +897 -897
  95. package/lib/equations.ts +506 -506
  96. package/lib/errors.ts +346 -346
  97. package/lib/format.ts +541 -541
  98. package/lib/git.ts +326 -326
  99. package/lib/grammar.ts +303 -303
  100. package/lib/image-registry.ts +180 -180
  101. package/lib/import.ts +911 -792
  102. package/lib/journals.ts +543 -543
  103. package/lib/merge.ts +633 -633
  104. package/lib/orcid.ts +144 -144
  105. package/lib/pdf-comments.ts +263 -263
  106. package/lib/pdf-import.ts +524 -524
  107. package/lib/plugins.ts +362 -362
  108. package/lib/postprocess.ts +188 -188
  109. package/lib/pptx-color-filter.lua +37 -37
  110. package/lib/pptx-template.ts +469 -469
  111. package/lib/pptx-themes.ts +483 -483
  112. package/lib/protect-restore.ts +520 -520
  113. package/lib/rate-limiter.ts +94 -94
  114. package/lib/response.ts +197 -197
  115. package/lib/restore-references.ts +240 -240
  116. package/lib/review.ts +327 -327
  117. package/lib/schema.ts +417 -417
  118. package/lib/scientific-words.ts +73 -73
  119. package/lib/sections.ts +335 -335
  120. package/lib/slides.ts +756 -756
  121. package/lib/spelling.ts +334 -334
  122. package/lib/templates.ts +526 -526
  123. package/lib/themes.ts +742 -742
  124. package/lib/trackchanges.ts +247 -247
  125. package/lib/tui.ts +450 -450
  126. package/lib/types.ts +550 -530
  127. package/lib/undo.ts +250 -250
  128. package/lib/utils.ts +69 -69
  129. package/lib/variables.ts +179 -179
  130. package/lib/word-extraction.ts +806 -759
  131. package/lib/word.ts +643 -643
  132. package/lib/wordcomments.ts +817 -798
  133. package/package.json +137 -137
  134. package/scripts/postbuild.js +28 -28
  135. package/skill/REFERENCE.md +431 -431
  136. package/skill/SKILL.md +258 -258
  137. package/tsconfig.json +26 -26
  138. package/types/index.d.ts +525 -525
@@ -1,520 +1,520 @@
1
- /**
2
- * Protection and restoration utilities for markdown elements during Word import
3
- *
4
- * These functions protect special markdown syntax (anchors, cross-refs, math, citations,
5
- * images, tables) by replacing them with placeholders before diffing, then restore them after.
6
- */
7
-
8
- // =============================================================================
9
- // Interfaces
10
- // =============================================================================
11
-
12
- interface MarkdownPrefix {
13
- prefix: string;
14
- content: string;
15
- }
16
-
17
- interface ProtectedItem {
18
- original: string;
19
- placeholder: string;
20
- }
21
-
22
- interface ProtectedMath extends ProtectedItem {
23
- type: 'inline' | 'display';
24
- simplified: string;
25
- }
26
-
27
- interface ProtectedImage extends ProtectedItem {
28
- label: string | null;
29
- caption: string;
30
- path: string;
31
- figureNumber: string | null;
32
- }
33
-
34
- interface ProtectedTable extends ProtectedItem {
35
- cellCount: number;
36
- }
37
-
38
- interface ProtectAnchorsResult {
39
- text: string;
40
- anchors: ProtectedItem[];
41
- }
42
-
43
- interface ProtectCrossrefsResult {
44
- text: string;
45
- crossrefs: ProtectedItem[];
46
- }
47
-
48
- interface ProtectMathResult {
49
- text: string;
50
- mathBlocks: ProtectedMath[];
51
- }
52
-
53
- interface ProtectCitationsResult {
54
- text: string;
55
- citations: string[];
56
- }
57
-
58
- interface ProtectImagesResult {
59
- text: string;
60
- images: ProtectedImage[];
61
- }
62
-
63
- interface ProtectTablesResult {
64
- text: string;
65
- tables: ProtectedTable[];
66
- }
67
-
68
- interface ImageRegistry {
69
- byNumber?: Map<string, { label: string }>;
70
- }
71
-
72
- // =============================================================================
73
- // Shared Helpers
74
- // =============================================================================
75
-
76
- /**
77
- * Replace regex matches with indexed placeholders and collect originals
78
- */
79
- function collectAndReplace(
80
- text: string,
81
- pattern: RegExp,
82
- prefix: string,
83
- suffix: string,
84
- ): { text: string; items: ProtectedItem[] } {
85
- const items: ProtectedItem[] = [];
86
- const result = text.replace(pattern, (match) => {
87
- const idx = items.length;
88
- const placeholder = `${prefix}${idx}${suffix}`;
89
- items.push({ original: match, placeholder });
90
- return placeholder;
91
- });
92
- return { text: result, items };
93
- }
94
-
95
- /**
96
- * Restore protected items from placeholders, handling annotation wrappers
97
- * (deletion {--...--} and insertion {++...++} wrappers are unwrapped)
98
- */
99
- function restoreProtectedItems(text: string, items: ProtectedItem[]): string {
100
- for (const item of items) {
101
- const deletionPattern = new RegExp(`\\{--[^}]*?${item.placeholder}[^}]*?--\\}`, 'g');
102
- text = text.replace(deletionPattern, item.original);
103
-
104
- const insertionPattern = new RegExp(`\\{\\+\\+[^}]*?${item.placeholder}[^}]*?\\+\\+\\}`, 'g');
105
- text = text.replace(insertionPattern, item.original);
106
-
107
- text = text.split(item.placeholder).join(item.original);
108
- }
109
- return text;
110
- }
111
-
112
- // =============================================================================
113
- // Public Functions
114
- // =============================================================================
115
-
116
- /**
117
- * Extract markdown prefix (headers, list markers) from a line
118
- */
119
- export function extractMarkdownPrefix(line: string): MarkdownPrefix {
120
- // Headers
121
- const headerMatch = line.match(/^(#{1,6}\s+)/);
122
- if (headerMatch && headerMatch[1]) {
123
- return { prefix: headerMatch[1], content: line.slice(headerMatch[1].length) };
124
- }
125
-
126
- // List items
127
- const listMatch = line.match(/^(\s*[-*+]\s+|\s*\d+\.\s+)/);
128
- if (listMatch && listMatch[1]) {
129
- return { prefix: listMatch[1], content: line.slice(listMatch[1].length) };
130
- }
131
-
132
- // Blockquotes
133
- const quoteMatch = line.match(/^(>\s*)/);
134
- if (quoteMatch && quoteMatch[1]) {
135
- return { prefix: quoteMatch[1], content: line.slice(quoteMatch[1].length) };
136
- }
137
-
138
- return { prefix: '', content: line };
139
- }
140
-
141
- /**
142
- * Protect figure/table anchors before diffing
143
- * Anchors like {#fig:heatmap} and {#tbl:results} should never be deleted
144
- */
145
- export function protectAnchors(md: string): ProtectAnchorsResult {
146
- // Match {#fig:label}, {#tbl:label}, {#eq:label}, {#sec:label} etc.
147
- // Also match with additional attributes like {#fig:label width=50%}
148
- const { text, items: anchors } = collectAndReplace(
149
- md, /\{#(fig|tbl|eq|sec|lst):[^}]+\}/g, 'ANCHORBLOCK', 'ENDANCHOR',
150
- );
151
- return { text, anchors };
152
- }
153
-
154
- /**
155
- * Restore anchors from placeholders
156
- */
157
- export function restoreAnchors(text: string, anchors: ProtectedItem[]): string {
158
- for (const anchor of anchors) {
159
- // Handle case where anchor is inside a deletion annotation
160
- // {--...ANCHORBLOCK0ENDANCHOR--} should become {--...--}{#fig:label}
161
- const deletionPattern = new RegExp(`\\{--([^}]*?)${anchor.placeholder}([^}]*?)--\\}`, 'g');
162
- text = text.replace(deletionPattern, (match, before, after) => {
163
- const cleanBefore = before.trim();
164
- const cleanAfter = after.trim();
165
- let result = '';
166
- if (cleanBefore) result += `{--${cleanBefore}--}`;
167
- result += anchor.original;
168
- if (cleanAfter) result += `{--${cleanAfter}--}`;
169
- return result;
170
- });
171
-
172
- // Handle case where anchor is inside a substitution
173
- // {~~old ANCHORBLOCK0ENDANCHOR~>new~~} -> {~~old~>new~~}{#fig:label}
174
- const substitutionPattern = new RegExp(`\\{~~([^~]*?)${anchor.placeholder}([^~]*?)~>([^~]*)~~\\}`, 'g');
175
- text = text.replace(substitutionPattern, (match: string, oldBefore: string, oldAfter: string, newText: string) => {
176
- const cleanOldBefore = (oldBefore ?? '').trim();
177
- const cleanOldAfter = (oldAfter ?? '').trim();
178
- const cleanNew = (newText ?? '').trim();
179
- const oldText = (cleanOldBefore + ' ' + cleanOldAfter).trim();
180
- let result = '';
181
- if (oldText !== cleanNew) {
182
- result += `{~~${oldText}~>${cleanNew}~~}`;
183
- } else {
184
- result += cleanNew;
185
- }
186
- result += anchor.original;
187
- return result;
188
- });
189
-
190
- // Normal replacement
191
- text = text.split(anchor.placeholder).join(anchor.original);
192
- }
193
- return text;
194
- }
195
-
196
- /**
197
- * Protect cross-references before diffing
198
- * References like @fig:label, @tbl:label should be preserved
199
- */
200
- export function protectCrossrefs(md: string): ProtectCrossrefsResult {
201
- // Match @fig:label, @tbl:label, @eq:label, @sec:label
202
- // Can appear as @fig:label or (@fig:label) or [@fig:label]
203
- const { text, items: crossrefs } = collectAndReplace(
204
- md, /@(fig|tbl|eq|sec|lst):[a-zA-Z0-9_-]+/g, 'XREFBLOCK', 'ENDXREF',
205
- );
206
- return { text, crossrefs };
207
- }
208
-
209
- /**
210
- * Restore cross-references from placeholders
211
- */
212
- export function restoreCrossrefs(text: string, crossrefs: ProtectedItem[]): string {
213
- for (const xref of crossrefs) {
214
- // Handle deletions - restore the reference even if marked deleted
215
- const deletionPattern = new RegExp(`\\{--([^}]*?)${xref.placeholder}([^}]*?)--\\}`, 'g');
216
- text = text.replace(deletionPattern, (match, before, after) => {
217
- const cleanBefore = before.trim();
218
- const cleanAfter = after.trim();
219
- let result = '';
220
- if (cleanBefore) result += `{--${cleanBefore}--}`;
221
- result += xref.original;
222
- if (cleanAfter) result += `{--${cleanAfter}--}`;
223
- return result;
224
- });
225
-
226
- // Handle substitutions where rendered form (Figure 1) replaced the reference
227
- // {~~XREFBLOCK0ENDXREF~>Figure 1~~} -> @fig:label
228
- const substitutionPattern = new RegExp(`\\{~~${xref.placeholder}~>[^~]+~~\\}`, 'g');
229
- text = text.replace(substitutionPattern, xref.original);
230
-
231
- // Normal replacement
232
- text = text.split(xref.placeholder).join(xref.original);
233
- }
234
- return text;
235
- }
236
-
237
- /**
238
- * Simplify LaTeX math for fuzzy matching against Word text
239
- * Word renders math as text, so we need to match the rendered form
240
- */
241
- export function simplifyMathForMatching(latex: string): string {
242
- return latex
243
- // Remove common LaTeX commands
244
- .replace(/\\text\{([^}]+)\}/g, '$1')
245
- .replace(/\\hat\{([^}]+)\}/g, '$1')
246
- .replace(/\\bar\{([^}]+)\}/g, '$1')
247
- .replace(/\\frac\{([^}]+)\}\{([^}]+)\}/g, '$1/$2')
248
- .replace(/\\sum_([a-z])/g, 'Σ')
249
- .replace(/\\sum/g, 'Σ')
250
- .replace(/\\cdot/g, '·')
251
- .replace(/\\quad/g, ' ')
252
- .replace(/\\,/g, ' ')
253
- .replace(/\\_/g, '_')
254
- .replace(/\\{/g, '{')
255
- .replace(/\\}/g, '}')
256
- .replace(/\\/g, '') // Remove remaining backslashes
257
- .replace(/[{}]/g, '') // Remove braces
258
- .replace(/\s+/g, ' ')
259
- .trim();
260
- }
261
-
262
- /**
263
- * Protect mathematical notation before diffing by replacing with placeholders
264
- * Handles both inline $...$ and display $$...$$ math
265
- */
266
- export function protectMath(md: string): ProtectMathResult {
267
- const mathBlocks: ProtectedMath[] = [];
268
-
269
- // First protect display math ($$...$$) - must be done before inline math
270
- let text = md.replace(/\$\$([^$]+)\$\$/g, (match, content) => {
271
- const idx = mathBlocks.length;
272
- const placeholder = `MATHBLOCK${idx}ENDMATH`;
273
- // Create simplified version for matching in Word text
274
- const simplified = simplifyMathForMatching(content);
275
- mathBlocks.push({ original: match, placeholder, type: 'display', simplified });
276
- return placeholder;
277
- });
278
-
279
- // Then protect inline math ($...$)
280
- text = text.replace(/\$([^$\n]+)\$/g, (match, content) => {
281
- const idx = mathBlocks.length;
282
- const placeholder = `MATHBLOCK${idx}ENDMATH`;
283
- const simplified = simplifyMathForMatching(content);
284
- mathBlocks.push({ original: match, placeholder, type: 'inline', simplified });
285
- return placeholder;
286
- });
287
-
288
- return { text, mathBlocks };
289
- }
290
-
291
- /**
292
- * Restore math from placeholders
293
- */
294
- export function restoreMath(text: string, mathBlocks: ProtectedMath[]): string {
295
- for (const block of mathBlocks) {
296
- text = text.split(block.placeholder).join(block.original);
297
- }
298
- return text;
299
- }
300
-
301
- /**
302
- * Replace rendered math in Word text with matching placeholders
303
- * This is heuristic-based since Word can render math in various ways
304
- */
305
- export function replaceRenderedMath(wordText: string, mathBlocks: ProtectedMath[]): string {
306
- let result = wordText;
307
-
308
- for (const block of mathBlocks) {
309
- // For inline math, try to find the simplified form in Word text
310
- if (block.simplified.length >= 2) {
311
- // Try exact match first
312
- if (result.includes(block.simplified)) {
313
- result = result.replace(block.simplified, block.placeholder);
314
- }
315
- }
316
- }
317
-
318
- return result;
319
- }
320
-
321
- /**
322
- * Protect citations before diffing by replacing with placeholders
323
- */
324
- export function protectCitations(md: string): ProtectCitationsResult {
325
- const citations: string[] = [];
326
- const text = md.replace(/\[@[^\]]+\]/g, (match) => {
327
- const idx = citations.length;
328
- citations.push(match);
329
- return `CITEREF${idx}ENDCITE`;
330
- });
331
- return { text, citations };
332
- }
333
-
334
- /**
335
- * Restore citations from placeholders
336
- */
337
- export function restoreCitations(text: string, citations: string[]): string {
338
- for (let i = 0; i < citations.length; i++) {
339
- // Handle cases where placeholder might be inside annotations
340
- const placeholder = `CITEREF${i}ENDCITE`;
341
- text = text.split(placeholder).join(citations[i]);
342
- }
343
- return text;
344
- }
345
-
346
- /**
347
- * Remove rendered citations from Word text (replace with matching placeholders)
348
- */
349
- export function replaceRenderedCitations(wordText: string, count: number): string {
350
- // Match rendered citation patterns: (Author 2021), (Author et al. 2021), etc.
351
- const pattern = /\((?:[A-Z][a-zé]+(?:\s+et\s+al\.?)?(?:\s*[&,;]\s*[A-Z][a-zé]+(?:\s+et\s+al\.?)?)*\s+\d{4}(?:[a-z])?(?:\s*[,;]\s*(?:[A-Z][a-zé]+(?:\s+et\s+al\.?)?\s+)?\d{4}(?:[a-z])?)*)\)/g;
352
-
353
- let idx = 0;
354
- return wordText.replace(pattern, (match) => {
355
- if (idx < count) {
356
- const placeholder = `CITEREF${idx}ENDCITE`;
357
- idx++;
358
- return placeholder;
359
- }
360
- return match;
361
- });
362
- }
363
-
364
- /**
365
- * Protect markdown images before diffing by replacing with placeholders
366
- * Images are treated as atomic blocks to prevent corruption during diff
367
- *
368
- * Matches: ![caption](path){#fig:label} or ![caption](path)
369
- * Also matches Word-style: ![Figure N: caption](media/path)
370
- */
371
- export function protectImages(md: string, registry: ImageRegistry | null = null): ProtectImagesResult {
372
- const images: ProtectedImage[] = [];
373
-
374
- // Match markdown images: ![caption](path){#anchor} or ![caption](path)
375
- // The anchor is optional and can have additional attributes
376
- const imagePattern = /!\[([^\]]*)\]\(([^)]+)\)(?:\{([^}]+)\})?/g;
377
-
378
- const text = md.replace(imagePattern, (match, caption, path, anchor) => {
379
- const idx = images.length;
380
- const placeholder = `IMAGEBLOCK${idx}ENDIMAGE`;
381
-
382
- // Extract label from anchor if present (e.g., "#fig:map" -> "map")
383
- let label: string | null = null;
384
- if (anchor) {
385
- const labelMatch = anchor.match(/#(fig|tbl):([a-zA-Z0-9_-]+)/);
386
- if (labelMatch) {
387
- label = labelMatch[2];
388
- }
389
- }
390
-
391
- // Try to extract figure number from Word-style caption "Figure N: ..."
392
- let figureNumber: string | null = null;
393
- const figNumMatch = caption.match(/^(?:Figure|Fig\.?|Table|Tbl\.?)\s+(\d+|S\d+)[:\.]?\s*/i);
394
- if (figNumMatch) {
395
- figureNumber = figNumMatch[1];
396
- }
397
-
398
- images.push({
399
- original: match,
400
- placeholder,
401
- label,
402
- caption: caption.trim(),
403
- path,
404
- figureNumber,
405
- });
406
-
407
- return placeholder;
408
- });
409
-
410
- return { text, images };
411
- }
412
-
413
- /**
414
- * Restore images from placeholders
415
- */
416
- export function restoreImages(text: string, images: ProtectedImage[]): string {
417
- return restoreProtectedItems(text, images);
418
- }
419
-
420
- /**
421
- * Match Word-extracted images to original images using registry
422
- * Returns a mapping of Word image placeholders to original image placeholders
423
- */
424
- export function matchWordImagesToOriginal(
425
- originalImages: ProtectedImage[],
426
- wordImages: ProtectedImage[],
427
- registry: ImageRegistry | null = null
428
- ): Map<string, string> {
429
- const mapping = new Map<string, string>();
430
- const usedOriginals = new Set<string>();
431
-
432
- for (const wordImg of wordImages) {
433
- let bestMatch: ProtectedImage | null = null;
434
- let bestScore = 0;
435
-
436
- for (const origImg of originalImages) {
437
- if (usedOriginals.has(origImg.placeholder)) continue;
438
-
439
- let score = 0;
440
-
441
- // Match by label (most reliable)
442
- if (wordImg.label && origImg.label && wordImg.label === origImg.label) {
443
- score += 100;
444
- }
445
-
446
- // Match by figure number via registry
447
- if (wordImg.figureNumber && registry) {
448
- const entry = registry.byNumber?.get(`fig:${wordImg.figureNumber}`);
449
- if (entry && entry.label === origImg.label) {
450
- score += 90;
451
- }
452
- }
453
-
454
- // Match by caption similarity (first 50 chars, normalized)
455
- const wordCaption = wordImg.caption.replace(/^(?:Figure|Fig\.?|Table|Tbl\.?)\s+\d+[:\.]?\s*/i, '').toLowerCase().slice(0, 50);
456
- const origCaption = origImg.caption.toLowerCase().slice(0, 50);
457
- if (wordCaption && origCaption && wordCaption === origCaption) {
458
- score += 80;
459
- } else if (wordCaption && origCaption && (wordCaption.includes(origCaption.slice(0, 30)) || origCaption.includes(wordCaption.slice(0, 30)))) {
460
- score += 40;
461
- }
462
-
463
- // Match by path similarity (filename)
464
- const wordFile = wordImg.path.split('/').pop()?.toLowerCase() || '';
465
- const origFile = origImg.path.split('/').pop()?.toLowerCase() || '';
466
- if (wordFile === origFile) {
467
- score += 30;
468
- }
469
-
470
- if (score > bestScore) {
471
- bestScore = score;
472
- bestMatch = origImg;
473
- }
474
- }
475
-
476
- if (bestMatch && bestScore >= 40) {
477
- mapping.set(wordImg.placeholder, bestMatch.placeholder);
478
- usedOriginals.add(bestMatch.placeholder);
479
- }
480
- }
481
-
482
- return mapping;
483
- }
484
-
485
- /**
486
- * Protect markdown tables before diffing by replacing with placeholders
487
- * Tables are treated as atomic blocks to prevent corruption during diff
488
- */
489
- export function protectTables(md: string): ProtectTablesResult {
490
- const tables: ProtectedTable[] = [];
491
-
492
- // Match markdown tables: lines starting with | and containing |
493
- // A table is: optional caption, header row, separator row (|---|), data rows
494
- const tablePattern = /(?:^(?:\*\*)?Table[^\n]*\n\n?)?(?:^\|[^\n]+\|\n)+/gm;
495
-
496
- const text = md.replace(tablePattern, (match) => {
497
- // Verify it's actually a table (has separator row with dashes)
498
- if (!match.includes('|---') && !match.includes('| ---') && !match.includes('|:--')) {
499
- return match; // Not a real table, just lines with pipes
500
- }
501
-
502
- const idx = tables.length;
503
- const placeholder = `\n\nTABLEBLOCK${idx}ENDTABLE\n\n`;
504
-
505
- // Count cells for matching in Word (approximate)
506
- const cellCount = (match.match(/\|/g) || []).length;
507
-
508
- tables.push({ original: match.trim(), placeholder: placeholder.trim(), cellCount });
509
- return placeholder;
510
- });
511
-
512
- return { text, tables };
513
- }
514
-
515
- /**
516
- * Restore tables from placeholders
517
- */
518
- export function restoreTables(text: string, tables: ProtectedTable[]): string {
519
- return restoreProtectedItems(text, tables);
520
- }
1
+ /**
2
+ * Protection and restoration utilities for markdown elements during Word import
3
+ *
4
+ * These functions protect special markdown syntax (anchors, cross-refs, math, citations,
5
+ * images, tables) by replacing them with placeholders before diffing, then restore them after.
6
+ */
7
+
8
+ // =============================================================================
9
+ // Interfaces
10
+ // =============================================================================
11
+
12
/** A line split into its leading markdown marker and the remaining text. */
interface MarkdownPrefix {
  /** Header, list or blockquote marker (including trailing whitespace); empty when none matched. */
  prefix: string;
  /** The rest of the line after `prefix`. */
  content: string;
}

/** Pairing of a piece of markdown with the placeholder standing in for it. */
interface ProtectedItem {
  /** Exact source text that was swapped out. */
  original: string;
  /** Token that replaces `original` while the text is diffed. */
  placeholder: string;
}

/** A protected math span. */
interface ProtectedMath extends ProtectedItem {
  /** `display` for `$$...$$`, `inline` for `$...$`. */
  type: 'inline' | 'display';
  /** LaTeX reduced to plain text for matching against Word output. */
  simplified: string;
}

/** A protected markdown image. */
interface ProtectedImage extends ProtectedItem {
  /** Label taken from a `#fig:`/`#tbl:` anchor, or null when there is none. */
  label: string | null;
  /** Trimmed caption text. */
  caption: string;
  /** Image target as written between the parentheses. */
  path: string;
  /** Number parsed from a "Figure N"-style caption, or null. */
  figureNumber: string | null;
}

/** A protected markdown table. */
interface ProtectedTable extends ProtectedItem {
  /** Count of `|` characters in the table (an approximate cell count). */
  cellCount: number;
}

/** Output of anchor protection. */
interface ProtectAnchorsResult {
  text: string;
  anchors: ProtectedItem[];
}

/** Output of cross-reference protection. */
interface ProtectCrossrefsResult {
  text: string;
  crossrefs: ProtectedItem[];
}

/** Output of math protection. */
interface ProtectMathResult {
  text: string;
  mathBlocks: ProtectedMath[];
}

/** Output of citation protection; citations are indexed by their position. */
interface ProtectCitationsResult {
  text: string;
  citations: string[];
}

/** Output of image protection. */
interface ProtectImagesResult {
  text: string;
  images: ProtectedImage[];
}

/** Output of table protection. */
interface ProtectTablesResult {
  text: string;
  tables: ProtectedTable[];
}

/** Lookup used to map figure numbers back to labels. */
interface ImageRegistry {
  /** Keyed by strings of the form `fig:<number>`. */
  byNumber?: Map<string, { label: string }>;
}
71
+
72
+ // =============================================================================
73
+ // Shared Helpers
74
+ // =============================================================================
75
+
76
/**
 * Swap every match of `pattern` in `text` for a numbered placeholder.
 *
 * Each placeholder is `prefix + index + suffix`, where `index` is the
 * zero-based position of the match in the returned `items` list.
 *
 * @param text - Input to scan.
 * @param pattern - Expression whose matches are swapped out.
 * @param prefix - Text placed before the index in each placeholder.
 * @param suffix - Text placed after the index in each placeholder.
 * @returns The rewritten text and the original/placeholder pairs in match order.
 */
function collectAndReplace(
  text: string,
  pattern: RegExp,
  prefix: string,
  suffix: string,
): { text: string; items: ProtectedItem[] } {
  const collected: ProtectedItem[] = [];

  // Only the matched text is needed; extra replacer arguments are ignored.
  const swap = (original: string): string => {
    const placeholder = prefix + String(collected.length) + suffix;
    collected.push({ original, placeholder });
    return placeholder;
  };

  const rewritten = text.replace(pattern, swap);
  return { text: rewritten, items: collected };
}
94
+
95
/**
 * Put protected originals back in place of their placeholders.
 *
 * When a placeholder sits inside a deletion (`{--...--}`) or insertion
 * (`{++...++}`) annotation, the whole annotation (including any other text
 * inside it, as long as that text contains no `}`) is replaced by the
 * original, i.e. the wrapper is dropped. Any remaining bare placeholders are
 * then replaced directly.
 *
 * Originals are inserted literally: replacer functions are used so that `$`
 * sequences inside an original (`$$`, `$&`, `$1`, ...) are not interpreted as
 * replacement patterns by `String.prototype.replace`.
 *
 * @param text - Text containing placeholders.
 * @param items - Original/placeholder pairs to restore, processed in order.
 * @returns The text with every listed placeholder restored.
 */
function restoreProtectedItems(text: string, items: ProtectedItem[]): string {
  for (const item of items) {
    const insertOriginal = (): string => item.original;
    // Placeholders are plain alphanumerics today; escape anyway so a future
    // placeholder format cannot alter the patterns below.
    const token = item.placeholder.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

    const deletionPattern = new RegExp(`\\{--[^}]*?${token}[^}]*?--\\}`, 'g');
    text = text.replace(deletionPattern, insertOriginal);

    const insertionPattern = new RegExp(`\\{\\+\\+[^}]*?${token}[^}]*?\\+\\+\\}`, 'g');
    text = text.replace(insertionPattern, insertOriginal);

    text = text.split(item.placeholder).join(item.original);
  }
  return text;
}
111
+
112
+ // =============================================================================
113
+ // Public Functions
114
+ // =============================================================================
115
+
116
/**
 * Split a line into its leading markdown marker and the remaining text.
 *
 * Markers are tried in a fixed order and the first hit wins:
 * header (`#` to `######` followed by whitespace), list item (`-`, `*`, `+`
 * or `N.` with optional indentation), then blockquote (`>`).
 *
 * @param line - A single line of markdown.
 * @returns The marker as `prefix` (empty when none matched) and the rest as `content`.
 */
export function extractMarkdownPrefix(line: string): MarkdownPrefix {
  const leadingMarkers: RegExp[] = [
    /^(#{1,6}\s+)/, // headers
    /^(\s*[-*+]\s+|\s*\d+\.\s+)/, // list items
    /^(>\s*)/, // blockquotes
  ];

  for (const marker of leadingMarkers) {
    const found = marker.exec(line)?.[1];
    if (found) {
      return { prefix: found, content: line.slice(found.length) };
    }
  }

  return { prefix: '', content: line };
}
140
+
141
/**
 * Hide figure/table/equation/section/listing anchors behind placeholders
 * before diffing, so that anchors such as {#fig:heatmap} or {#tbl:results}
 * cannot be altered by the diff.
 *
 * @param md - Markdown source.
 * @returns The text with anchors replaced by `ANCHORBLOCK<n>ENDANCHOR`, plus the replaced anchors.
 */
export function protectAnchors(md: string): ProtectAnchorsResult {
  // Covers {#fig:label}, {#tbl:label}, {#eq:label}, {#sec:label}, {#lst:label},
  // including trailing attributes such as {#fig:label width=50%}.
  const anchorPattern = /\{#(fig|tbl|eq|sec|lst):[^}]+\}/g;
  const replaced = collectAndReplace(md, anchorPattern, 'ANCHORBLOCK', 'ENDANCHOR');
  return { text: replaced.text, anchors: replaced.items };
}
153
+
154
/**
 * Put anchors back in place of their placeholders.
 *
 * An anchor caught inside an annotation is moved out of it so that it
 * survives regardless of how the edit is resolved:
 * - deletion: `{--a PLACEHOLDER b--}` becomes `{--a--}<anchor>{--b--}`
 *   (empty sides are omitted);
 * - substitution: `{~~old PLACEHOLDER~>new~~}` becomes `{~~old~>new~~}<anchor>`,
 *   or just `new<anchor>` when old and new are equal after trimming.
 *
 * @param text - Text containing anchor placeholders.
 * @param anchors - Original/placeholder pairs to restore, processed in order.
 * @returns The text with anchors restored.
 */
export function restoreAnchors(text: string, anchors: ProtectedItem[]): string {
  // Re-wrap a fragment as a deletion, or drop it when only whitespace is left.
  const asDeletion = (fragment: string): string => {
    const trimmed = fragment.trim();
    return trimmed ? `{--${trimmed}--}` : '';
  };

  return anchors.reduce((current, { placeholder, original }) => {
    const insideDeletion = new RegExp(`\\{--([^}]*?)${placeholder}([^}]*?)--\\}`, 'g');
    const insideSubstitution = new RegExp(`\\{~~([^~]*?)${placeholder}([^~]*?)~>([^~]*)~~\\}`, 'g');

    return current
      .replace(
        insideDeletion,
        (_whole: string, lead: string, trail: string) => asDeletion(lead) + original + asDeletion(trail),
      )
      .replace(
        insideSubstitution,
        (_whole: string, oldLead: string, oldTrail: string, replacement: string) => {
          const previous = ((oldLead ?? '').trim() + ' ' + (oldTrail ?? '').trim()).trim();
          const next = (replacement ?? '').trim();
          const kept = previous !== next ? `{~~${previous}~>${next}~~}` : next;
          return kept + original;
        },
      )
      // Anything not inside an annotation: plain swap.
      .split(placeholder)
      .join(original);
  }, text);
}
195
+
196
/**
 * Hide cross-references (@fig:label, @tbl:label, @eq:label, @sec:label,
 * @lst:label) behind placeholders before diffing so they are preserved.
 *
 * @param md - Markdown source.
 * @returns The text with references replaced by `XREFBLOCK<n>ENDXREF`, plus the replaced references.
 */
export function protectCrossrefs(md: string): ProtectCrossrefsResult {
  // Only the @type:label token is captured, so surrounding brackets in
  // forms like (@fig:label) or [@fig:label] stay in the text.
  const xrefPattern = /@(fig|tbl|eq|sec|lst):[a-zA-Z0-9_-]+/g;
  const replaced = collectAndReplace(md, xrefPattern, 'XREFBLOCK', 'ENDXREF');
  return { text: replaced.text, crossrefs: replaced.items };
}
208
+
209
/**
 * Put cross-references back in place of their placeholders.
 *
 * For every cross-reference, in order:
 *  1. A placeholder inside a deletion `{--...--}` is lifted out, so the
 *     reference survives even though it was marked deleted; leftover text on
 *     either side stays wrapped as a deletion.
 *  2. A substitution whose old side is exactly the placeholder
 *     (e.g. `{~~XREFBLOCK0ENDXREF~>Figure 1~~}`) collapses to the reference.
 *  3. Every remaining placeholder is replaced verbatim.
 *
 * @param text - Text containing cross-reference placeholders
 * @param crossrefs - Placeholder/original pairs to restore
 * @returns Text with all cross-reference placeholders replaced
 */
export function restoreCrossrefs(text: string, crossrefs: ProtectedItem[]): string {
  let restored = text;

  crossrefs.forEach(({ placeholder, original }) => {
    // Reference caught inside a deletion annotation
    const insideDeletion = new RegExp(`\\{--([^}]*?)${placeholder}([^}]*?)--\\}`, 'g');
    restored = restored.replace(insideDeletion, (_whole, lead, tail) => {
      const keptLead = lead.trim();
      const keptTail = tail.trim();
      const prefix = keptLead ? `{--${keptLead}--}` : '';
      const suffix = keptTail ? `{--${keptTail}--}` : '';
      return prefix + original + suffix;
    });

    // Rendered form (e.g. "Figure 1") substituted for the reference
    const renderedSubstitution = new RegExp(`\\{~~${placeholder}~>[^~]+~~\\}`, 'g');
    restored = restored.replace(renderedSubstitution, original);

    // Plain occurrences
    restored = restored.split(placeholder).join(original);
  });

  return restored;
}
236
+
237
/**
 * Reduce a LaTeX math snippet to an approximation of the plain text Word
 * shows for it, so the snippet can be located in Word-extracted text.
 *
 * The rewrites run strictly in the order listed: wrapper commands are
 * unwrapped first, then symbols are substituted, then any leftover
 * backslashes and braces are stripped and whitespace is collapsed.
 *
 * @param latex - LaTeX math content (without the surrounding `$` delimiters)
 * @returns Simplified, trimmed text
 */
export function simplifyMathForMatching(latex: string): string {
  // Order matters: later rules assume earlier ones already ran.
  const rewrites: ReadonlyArray<readonly [RegExp, string]> = [
    [/\\text\{([^}]+)\}/g, '$1'],
    [/\\hat\{([^}]+)\}/g, '$1'],
    [/\\bar\{([^}]+)\}/g, '$1'],
    [/\\frac\{([^}]+)\}\{([^}]+)\}/g, '$1/$2'],
    [/\\sum_([a-z])/g, 'Σ'],
    [/\\sum/g, 'Σ'],
    [/\\cdot/g, '·'],
    [/\\quad/g, ' '],
    [/\\,/g, ' '],
    [/\\_/g, '_'],
    [/\\{/g, '{'],
    [/\\}/g, '}'],
    [/\\/g, ''], // any backslash still left
    [/[{}]/g, ''], // grouping braces
    [/\s+/g, ' '],
  ];

  let simplified = latex;
  for (const [pattern, replacement] of rewrites) {
    simplified = simplified.replace(pattern, replacement);
  }
  return simplified.trim();
}
261
+
262
/**
 * Protect mathematical notation before diffing by replacing it with
 * placeholders. Handles display math (`$$...$$`) and inline math (`$...$`).
 *
 * Display math is replaced first so that its `$$` delimiters are not picked
 * up by the inline pass. Placeholders are numbered in the order the blocks
 * are recorded (all display blocks, then all inline blocks).
 *
 * Inline math follows Pandoc's `tex_math_dollars` delimiter rules so that
 * ordinary dollar amounts are not mistaken for math:
 *  - the opening `$` must not be backslash-escaped and must be followed
 *    immediately by a non-space character;
 *  - the closing `$` must be preceded immediately by a non-space character
 *    and must not be followed by a digit.
 * Hence "$5 and $10" and "\$x\$" are left untouched, while "$x + y$" is
 * protected.
 *
 * @param md - Markdown source
 * @returns The rewritten text and the recorded math blocks, each with a
 *          simplified form for matching against Word text
 */
export function protectMath(md: string): ProtectMathResult {
  const mathBlocks: ProtectedMath[] = [];

  // Record one math block and hand back the placeholder that stands in for it.
  const stash = (original: string, content: string, type: ProtectedMath['type']): string => {
    const placeholder = `MATHBLOCK${mathBlocks.length}ENDMATH`;
    // Simplified version used to find the rendered math in Word text
    const simplified = simplifyMathForMatching(content);
    mathBlocks.push({ original, placeholder, type, simplified });
    return placeholder;
  };

  // Display math first - must be done before inline math
  let text = md.replace(
    /\$\$([^$]+)\$\$/g,
    (match: string, content: string) => stash(match, content, 'display'),
  );

  // Inline math, with Pandoc-style delimiter constraints (see doc comment)
  text = text.replace(
    /(?<!\\)\$(?!\s)([^$\n]+)(?<!\s)\$(?!\d)/g,
    (match: string, content: string) => stash(match, content, 'inline'),
  );

  return { text, mathBlocks };
}
290
+
291
/**
 * Put math back in place of its placeholders.
 *
 * @param text - Text containing math placeholders
 * @param mathBlocks - Math blocks whose placeholders should be expanded
 * @returns Text with every occurrence of each placeholder replaced by the
 *          block's original source
 */
export function restoreMath(text: string, mathBlocks: ProtectedMath[]): string {
  return mathBlocks.reduce(
    (restored, { placeholder, original }) => restored.split(placeholder).join(original),
    text,
  );
}
300
+
301
/**
 * Replace rendered math found in Word text with the matching placeholders.
 *
 * This is a heuristic: for each block, the first exact occurrence of its
 * simplified form is swapped for the block's placeholder. Blocks whose
 * simplified form is shorter than two characters are skipped.
 *
 * @param wordText - Text extracted from Word
 * @param mathBlocks - Math blocks carrying `simplified` and `placeholder`
 * @returns Word text with matched math replaced by placeholders
 */
export function replaceRenderedMath(wordText: string, mathBlocks: ProtectedMath[]): string {
  return mathBlocks.reduce((current, { simplified, placeholder }) => {
    const matchable = simplified.length >= 2 && current.includes(simplified);
    // Exact match only; one occurrence per block
    return matchable ? current.replace(simplified, placeholder) : current;
  }, wordText);
}
320
+
321
/**
 * Swap bracketed citations (`[@key]`, `[@a; @b]`, ...) for numbered
 * placeholders so diffing leaves them intact.
 *
 * @param md - Markdown source
 * @returns The rewritten text and the citations in order of appearance;
 *          placeholder `CITEREF<n>ENDCITE` stands for `citations[n]`
 */
export function protectCitations(md: string): ProtectCitationsResult {
  const citations: string[] = [];
  const text = md.replace(
    /\[@[^\]]+\]/g,
    // push() returns the new length, so the stored citation's index is length - 1
    (found) => `CITEREF${citations.push(found) - 1}ENDCITE`,
  );
  return { text, citations };
}
333
+
334
/**
 * Put citations back in place of their placeholders.
 *
 * Placeholders are replaced wherever they occur, including inside
 * annotations.
 *
 * @param text - Text containing `CITEREF<n>ENDCITE` placeholders
 * @param citations - Original citations, indexed by placeholder number
 * @returns Text with every placeholder replaced by its citation
 */
export function restoreCitations(text: string, citations: string[]): string {
  return citations.reduce(
    (restored, citation, index) => restored.split(`CITEREF${index}ENDCITE`).join(citation),
    text,
  );
}
345
+
346
/**
 * Replace rendered author-year citations in Word text with citation
 * placeholders.
 *
 * Matches parenthesised forms such as (Author 2021), (Author et al. 2021)
 * and multi-citation groups. Placeholders are handed out in order of
 * appearance; once `count` placeholders have been used, further matches are
 * left as they are.
 *
 * @param wordText - Text extracted from Word
 * @param count - Maximum number of citations to replace
 * @returns Word text with up to `count` rendered citations replaced
 */
export function replaceRenderedCitations(wordText: string, count: number): string {
  const renderedCitation = /\((?:[A-Z][a-zé]+(?:\s+et\s+al\.?)?(?:\s*[&,;]\s*[A-Z][a-zé]+(?:\s+et\s+al\.?)?)*\s+\d{4}(?:[a-z])?(?:\s*[,;]\s*(?:[A-Z][a-zé]+(?:\s+et\s+al\.?)?\s+)?\d{4}(?:[a-z])?)*)\)/g;

  let assigned = 0;
  return wordText.replace(renderedCitation, (rendered) => {
    if (assigned >= count) {
      return rendered;
    }
    const placeholder = `CITEREF${assigned}ENDCITE`;
    assigned += 1;
    return placeholder;
  });
}
363
+
364
/**
 * Swap markdown images for placeholders so each image is diffed as one
 * atomic unit.
 *
 * Recognised forms:
 *   ![caption](path){#fig:label}   - optional attribute block after the path
 *   ![caption](path)
 *   ![Figure N: caption](media/path) - Word-style numbered caption
 *
 * For each image the following is recorded: the full original markup, its
 * placeholder, the label from a `#fig:` / `#tbl:` attribute (if any), the
 * trimmed caption, the path, and a figure number parsed from a leading
 * "Figure N" / "Fig. N" / "Table N" / "Tbl. N" caption prefix (N may be a
 * number or `S` followed by a number).
 *
 * @param md - Markdown source
 * @param registry - Accepted for interface compatibility; not read by this function
 * @returns The rewritten text and the recorded images in order of appearance
 */
export function protectImages(md: string, registry: ImageRegistry | null = null): ProtectImagesResult {
  const images: ProtectedImage[] = [];

  // caption, path, and an optional trailing {attributes} block
  const imagePattern = /!\[([^\]]*)\]\(([^)]+)\)(?:\{([^}]+)\})?/g;

  const text = md.replace(
    imagePattern,
    (whole: string, caption: string, path: string, anchor: string | undefined) => {
      const placeholder = `IMAGEBLOCK${images.length}ENDIMAGE`;

      // "#fig:map" -> "map"; stays null without a fig/tbl identifier
      const label: string | null = anchor
        ? anchor.match(/#(fig|tbl):([a-zA-Z0-9_-]+)/)?.[2] ?? null
        : null;

      // Word-style caption prefix, e.g. "Figure 3: ..." -> "3"
      const figureNumber: string | null =
        caption.match(/^(?:Figure|Fig\.?|Table|Tbl\.?)\s+(\d+|S\d+)[:\.]?\s*/i)?.[1] ?? null;

      images.push({
        original: whole,
        placeholder,
        label,
        caption: caption.trim(),
        path,
        figureNumber,
      });

      return placeholder;
    },
  );

  return { text, images };
}
412
+
413
/**
 * Put protected images back in place of their placeholders.
 * Delegates to `restoreProtectedItems`.
 *
 * @param text - Text containing image placeholders
 * @param images - Images recorded when the placeholders were created
 * @returns The result of `restoreProtectedItems` for these images
 */
export function restoreImages(text: string, images: ProtectedImage[]): string {
  const restored = restoreProtectedItems(text, images);
  return restored;
}
419
+
420
/**
 * Match Word-extracted images to the original images.
 *
 * Word images are processed in order. For each one, every not-yet-claimed
 * original is scored and the highest scorer wins (ties go to the earlier
 * original). A pairing is only recorded when the best score is at least 40,
 * and each original can be claimed once.
 *
 * Scoring (additive):
 *  - 100: both images carry the same label
 *  -  90: the registry entry for the Word image's figure number has the
 *         original's label
 *  -  80: captions are equal after normalisation (Word caption prefix such as
 *         "Figure 2:" or "Figure S1:" removed, lower-cased, first 50 chars)
 *  -  40: otherwise, one caption contains the first 30 chars of the other
 *  -  30: same file name (last path segment, case-insensitive)
 *
 * @param originalImages - Images protected in the original markdown
 * @param wordImages - Images protected in the Word-derived markdown
 * @param registry - Optional registry used to resolve figure numbers to labels
 * @returns Map from Word image placeholder to original image placeholder
 */
export function matchWordImagesToOriginal(
  originalImages: ProtectedImage[],
  wordImages: ProtectedImage[],
  registry: ImageRegistry | null = null
): Map<string, string> {
  const mapping = new Map<string, string>();
  const usedOriginals = new Set<string>();

  // Same numbering forms that protectImages recognises (N or S<N>), so that
  // supplementary captions like "Figure S1: ..." are stripped as well.
  const captionPrefix = /^(?:Figure|Fig\.?|Table|Tbl\.?)\s+(?:\d+|S\d+)[:\.]?\s*/i;

  for (const wordImg of wordImages) {
    let bestMatch: ProtectedImage | null = null;
    let bestScore = 0;

    // These depend only on the Word image, so compute them once per image.
    const wordCaption = wordImg.caption.replace(captionPrefix, '').toLowerCase().slice(0, 50);
    const wordFile = wordImg.path.split('/').pop()?.toLowerCase() || '';
    const registryEntry =
      wordImg.figureNumber && registry
        ? registry.byNumber?.get(`fig:${wordImg.figureNumber}`)
        : undefined;

    for (const origImg of originalImages) {
      if (usedOriginals.has(origImg.placeholder)) continue;

      let score = 0;

      // Match by label (most reliable)
      if (wordImg.label && origImg.label && wordImg.label === origImg.label) {
        score += 100;
      }

      // Match by figure number via registry
      if (registryEntry && registryEntry.label === origImg.label) {
        score += 90;
      }

      // Match by caption similarity (first 50 chars, normalized)
      const origCaption = origImg.caption.toLowerCase().slice(0, 50);
      if (wordCaption && origCaption) {
        if (wordCaption === origCaption) {
          score += 80;
        } else if (
          wordCaption.includes(origCaption.slice(0, 30)) ||
          origCaption.includes(wordCaption.slice(0, 30))
        ) {
          score += 40;
        }
      }

      // Match by path similarity (filename)
      const origFile = origImg.path.split('/').pop()?.toLowerCase() || '';
      if (wordFile === origFile) {
        score += 30;
      }

      // Strict comparison: on a tie the earlier original is kept
      if (score > bestScore) {
        bestScore = score;
        bestMatch = origImg;
      }
    }

    if (bestMatch && bestScore >= 40) {
      mapping.set(wordImg.placeholder, bestMatch.placeholder);
      usedOriginals.add(bestMatch.placeholder);
    }
  }

  return mapping;
}
484
+
485
/**
 * Protect markdown pipe tables before diffing by replacing each with a
 * placeholder, so a table is treated as one atomic block.
 *
 * A candidate is an optional caption line starting with `Table` or
 * `**Table` (optionally followed by one blank line) and then one or more
 * consecutive lines that start and end with `|`. The final row may end at
 * the end of the input without a trailing newline. A candidate is only
 * protected when one of its lines is a separator row, i.e. every cell holds
 * only dashes with optional alignment colons (`|---|`, `| :-- |`, `| :-: |`,
 * or `+`-joined as in `|---+---|`); other runs of pipe-delimited lines are
 * left untouched.
 *
 * The placeholder is inserted surrounded by blank lines; the stored
 * `original` and `placeholder` are trimmed. `cellCount` is the number of `|`
 * characters in the matched block (an approximation used for matching).
 *
 * @param md - Markdown source
 * @returns The rewritten text and the protected tables in order of appearance
 */
export function protectTables(md: string): ProtectTablesResult {
  const tables: ProtectedTable[] = [];

  // Rows end with a newline or at true end of input. `(?![\s\S])` is used
  // instead of `$` because, with the `m` flag, `$` would also match before
  // other line terminators such as `\r`.
  const tablePattern = /(?:^(?:\*\*)?Table[^\n]*\n\n?)?(?:^\|[^\n]+\|(?:\n|(?![\s\S])))+/gm;

  // A line consisting solely of dash cells with optional alignment colons
  const separatorRow = /^\|(?:[ \t]*:?-+:?[ \t]*[|+])*[ \t]*:?-+:?[ \t]*\|$/m;

  const text = md.replace(tablePattern, (match: string) => {
    // Verify it's actually a table (has a separator row)
    if (!separatorRow.test(match)) {
      return match; // Not a real table, just lines with pipes
    }

    const idx = tables.length;
    const placeholder = `\n\nTABLEBLOCK${idx}ENDTABLE\n\n`;

    // Count pipes for matching in Word (approximate)
    const cellCount = (match.match(/\|/g) || []).length;

    tables.push({ original: match.trim(), placeholder: placeholder.trim(), cellCount });
    return placeholder;
  });

  return { text, tables };
}
514
+
515
/**
 * Put protected tables back in place of their placeholders.
 * Delegates to `restoreProtectedItems`.
 *
 * @param text - Text containing table placeholders
 * @param tables - Tables recorded when the placeholders were created
 * @returns The result of `restoreProtectedItems` for these tables
 */
export function restoreTables(text: string, tables: ProtectedTable[]): string {
  const restored = restoreProtectedItems(text, tables);
  return restored;
}