docrev 0.9.18 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. package/.gitattributes +1 -1
  2. package/CHANGELOG.md +173 -149
  3. package/PLAN-tables-and-postprocess.md +850 -850
  4. package/README.md +431 -406
  5. package/bin/rev.js +11 -11
  6. package/bin/rev.ts +145 -145
  7. package/completions/rev.bash +127 -127
  8. package/completions/rev.ps1 +210 -210
  9. package/completions/rev.zsh +207 -207
  10. package/dist/lib/build.d.ts +8 -0
  11. package/dist/lib/build.d.ts.map +1 -1
  12. package/dist/lib/build.js +62 -6
  13. package/dist/lib/build.js.map +1 -1
  14. package/dist/lib/commands/context.d.ts +1 -1
  15. package/dist/lib/commands/context.d.ts.map +1 -1
  16. package/dist/lib/commands/context.js +1 -1
  17. package/dist/lib/commands/context.js.map +1 -1
  18. package/dist/lib/commands/sections.js +7 -7
  19. package/dist/lib/commands/sections.js.map +1 -1
  20. package/dist/lib/commands/sync.d.ts.map +1 -1
  21. package/dist/lib/commands/sync.js +15 -14
  22. package/dist/lib/commands/sync.js.map +1 -1
  23. package/dist/lib/commands/utilities.js +164 -164
  24. package/dist/lib/commands/verify-anchors.js +6 -6
  25. package/dist/lib/commands/verify-anchors.js.map +1 -1
  26. package/dist/lib/commands/word-tools.js +8 -8
  27. package/dist/lib/grammar.js +3 -3
  28. package/dist/lib/macro-filter.lua +201 -0
  29. package/dist/lib/macros.d.ts +102 -0
  30. package/dist/lib/macros.d.ts.map +1 -0
  31. package/dist/lib/macros.js +218 -0
  32. package/dist/lib/macros.js.map +1 -0
  33. package/dist/lib/pdf-comments.js +44 -44
  34. package/dist/lib/plugins.js +57 -57
  35. package/dist/lib/pptx-color-filter.lua +37 -0
  36. package/dist/lib/pptx-themes.js +115 -115
  37. package/dist/lib/schema.d.ts.map +1 -1
  38. package/dist/lib/schema.js +34 -0
  39. package/dist/lib/schema.js.map +1 -1
  40. package/dist/lib/sections.d.ts +35 -0
  41. package/dist/lib/sections.d.ts.map +1 -1
  42. package/dist/lib/sections.js +81 -0
  43. package/dist/lib/sections.js.map +1 -1
  44. package/dist/lib/spelling.js +2 -2
  45. package/dist/lib/templates.js +387 -387
  46. package/dist/lib/themes.js +51 -51
  47. package/eslint.config.js +27 -27
  48. package/lib/anchor-match.ts +276 -276
  49. package/lib/annotations.ts +644 -644
  50. package/lib/build.ts +1766 -1694
  51. package/lib/citations.ts +160 -160
  52. package/lib/commands/build.ts +855 -855
  53. package/lib/commands/citations.ts +515 -515
  54. package/lib/commands/comments.ts +1050 -1050
  55. package/lib/commands/context.ts +176 -174
  56. package/lib/commands/core.ts +309 -309
  57. package/lib/commands/doi.ts +435 -435
  58. package/lib/commands/file-ops.ts +372 -372
  59. package/lib/commands/history.ts +320 -320
  60. package/lib/commands/index.ts +87 -87
  61. package/lib/commands/init.ts +259 -259
  62. package/lib/commands/merge-resolve.ts +378 -378
  63. package/lib/commands/preview.ts +178 -178
  64. package/lib/commands/project-info.ts +244 -244
  65. package/lib/commands/quality.ts +517 -517
  66. package/lib/commands/response.ts +454 -454
  67. package/lib/commands/section-boundaries.ts +82 -82
  68. package/lib/commands/sections.ts +451 -451
  69. package/lib/commands/sync.ts +709 -706
  70. package/lib/commands/text-ops.ts +449 -449
  71. package/lib/commands/utilities.ts +448 -448
  72. package/lib/commands/verify-anchors.ts +272 -272
  73. package/lib/commands/word-tools.ts +340 -340
  74. package/lib/comment-realign.ts +517 -517
  75. package/lib/config.ts +84 -84
  76. package/lib/crossref.ts +781 -781
  77. package/lib/csl.ts +191 -191
  78. package/lib/dependencies.ts +98 -98
  79. package/lib/diff-engine.ts +465 -465
  80. package/lib/doi-cache.ts +115 -115
  81. package/lib/doi.ts +897 -897
  82. package/lib/equations.ts +506 -506
  83. package/lib/errors.ts +346 -346
  84. package/lib/format.ts +541 -541
  85. package/lib/git.ts +326 -326
  86. package/lib/grammar.ts +303 -303
  87. package/lib/image-registry.ts +180 -180
  88. package/lib/import.ts +911 -911
  89. package/lib/journals.ts +543 -543
  90. package/lib/macro-filter.lua +201 -0
  91. package/lib/macros.ts +273 -0
  92. package/lib/merge.ts +633 -633
  93. package/lib/orcid.ts +144 -144
  94. package/lib/pdf-comments.ts +263 -263
  95. package/lib/pdf-import.ts +524 -524
  96. package/lib/plugins.ts +362 -362
  97. package/lib/postprocess.ts +188 -188
  98. package/lib/pptx-color-filter.lua +37 -37
  99. package/lib/pptx-template.ts +469 -469
  100. package/lib/pptx-themes.ts +483 -483
  101. package/lib/protect-restore.ts +520 -520
  102. package/lib/rate-limiter.ts +94 -94
  103. package/lib/response.ts +197 -197
  104. package/lib/restore-references.ts +240 -240
  105. package/lib/review.ts +327 -327
  106. package/lib/schema.ts +488 -454
  107. package/lib/scientific-words.ts +73 -73
  108. package/lib/sections.ts +425 -335
  109. package/lib/slides.ts +756 -756
  110. package/lib/spelling.ts +334 -334
  111. package/lib/templates.ts +526 -526
  112. package/lib/themes.ts +742 -742
  113. package/lib/trackchanges.ts +247 -247
  114. package/lib/tui.ts +450 -450
  115. package/lib/types.ts +550 -550
  116. package/lib/undo.ts +250 -250
  117. package/lib/utils.ts +69 -69
  118. package/lib/variables.ts +179 -179
  119. package/lib/word-extraction.ts +806 -806
  120. package/lib/word.ts +643 -643
  121. package/lib/wordcomments.ts +840 -840
  122. package/package.json +137 -137
  123. package/scripts/postbuild.js +47 -28
  124. package/skill/REFERENCE.md +539 -539
  125. package/skill/SKILL.md +295 -295
  126. package/tsconfig.json +26 -26
  127. package/types/index.d.ts +525 -525
  128. package/issues.md +0 -180
  129. package/site/assets/extra.css +0 -208
  130. package/site/commands.html +0 -926
  131. package/site/configuration.html +0 -469
  132. package/site/index.html +0 -288
  133. package/site/troubleshooting.html +0 -461
  134. package/site/workflow.html +0 -518
@@ -1,465 +1,465 @@
1
- /**
2
- * Diff engine - diffing and annotation processing for Word→Markdown import
3
- */
4
-
5
- import { diffWords, Change } from 'diff';
6
- import {
7
- extractMarkdownPrefix,
8
- protectAnchors,
9
- restoreAnchors,
10
- protectCrossrefs,
11
- restoreCrossrefs,
12
- protectMath,
13
- restoreMath,
14
- replaceRenderedMath,
15
- protectCitations,
16
- restoreCitations,
17
- replaceRenderedCitations,
18
- protectImages,
19
- restoreImages,
20
- matchWordImagesToOriginal,
21
- protectTables,
22
- restoreTables,
23
- } from './protect-restore.js';
24
- import { normalizeWhitespace } from './utils.js';
25
- import type { WordTable } from './word-extraction.js';
26
-
27
- // ============================================
28
- // Type Definitions
29
- // ============================================
30
-
31
- export interface GenerateSmartDiffOptions {
32
- wordTables?: WordTable[];
33
- imageRegistry?: any;
34
- }
35
-
36
- // ============================================
37
- // Functions
38
- // ============================================
39
-
40
- /**
41
- * Fix citation and math annotations by preserving original markdown syntax
42
- */
43
- export function fixCitationAnnotations(text: string, originalMd: string): string {
44
- // Fix math annotations - preserve inline and display math
45
- text = text.replace(/\{--(\$[^$]+\$)--\}/g, '$1');
46
- text = text.replace(/\{--(\$\$[^$]+\$\$)--\}/g, '$1');
47
-
48
- text = text.replace(/\{~~(\$[^$]+\$)~>[^~]+~~\}/g, '$1');
49
- text = text.replace(/\{~~(\$\$[^$]+\$\$)~>[^~]+~~\}/g, '$1');
50
-
51
- // Extract all citations from original markdown
52
- const citationPattern = /\[@[^\]]+\]/g;
53
- const originalCitations = [...originalMd.matchAll(citationPattern)].map(m => m[0]);
54
-
55
- // Fix substitutions where left side has markdown citation
56
- text = text.replace(/\{~~(\[@[^\]]+\])~>[^~]+~~\}/g, '$1');
57
-
58
- // Fix substitutions where left side STARTS with markdown citation
59
- text = text.replace(/\{~~(\[@[^\]]+\])\s*([^~]*)~>([^~]*)~~\}/g, (match, cite, oldText, newText) => {
60
- if (oldText.trim() === '' && newText.trim() === '') {
61
- return cite;
62
- }
63
- if (oldText.trim() || newText.trim()) {
64
- return cite + (oldText.trim() !== newText.trim() ? ` {~~${oldText.trim()}~>${newText.trim()}~~}` : ` ${newText}`);
65
- }
66
- return cite;
67
- });
68
-
69
- // Fix deletions of markdown citations
70
- text = text.replace(/\{--(\[@[^\]]+\])--\}/g, '$1');
71
-
72
- // Fix insertions of rendered citations
73
- text = text.replace(/\{\+\+\([A-Z][^)]*\d{4}[^)]*\)\+\+\}/g, '');
74
-
75
- // Clean up broken multi-part substitutions
76
- text = text.replace(/\{~~(@[A-Za-z]+\d{4})~>[^~]+~~\}/g, '[$1]');
77
-
78
- // Fix citations split across substitution boundaries
79
- text = text.replace(/\{~~\[@~>[^~]*~~\}([A-Za-z]+\d{4})\]/g, '[@$1]');
80
-
81
- // Clean up any remaining partial citations
82
- text = text.replace(/\{~~;\s*@([A-Za-z]+\d{4})\]~>[^~]*~~\}/g, '; [@$1]');
83
-
84
- // Remove rendered citation insertions (with Unicode support)
85
- text = text.replace(/\{\+\+\(\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\+\+\}/gu, '');
86
- text = text.replace(/\{\+\+\(\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');
87
-
88
- // Trailing citation fragments
89
- text = text.replace(/\{\+\+\d{4}[a-z]?(?:[;,]\s*(?:\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+)?\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');
90
- text = text.replace(/\{\+\+\d{4}[a-z]?(?:[;,]\s*(?:\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+)?\d{4}[a-z]?)*\)\s*\+\+\}/gu, '');
91
-
92
- // Just year with closing paren
93
- text = text.replace(/\{\+\+\d{4}[a-z]?\)\.\s*\+\+\}/g, '');
94
- text = text.replace(/\{\+\+\d{4}[a-z]?\)\s*\+\+\}/g, '');
95
-
96
- // Leading citation fragments
97
- text = text.replace(/\{\+\+\(?\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s*\+\+\}/gu, '');
98
-
99
- // Semicolon-separated fragments
100
- text = text.replace(/\{\+\+[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?\+\+\}/gu, '');
101
-
102
- // Year ranges with authors
103
- text = text.replace(/\{\+\+\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\s*\+\+\}/gu, '');
104
- text = text.replace(/\{\+\+\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');
105
-
106
- // Clean up double spaces and orphaned punctuation
107
- text = text.replace(/ +/g, ' ');
108
- text = text.replace(/\s+\./g, '.');
109
- text = text.replace(/\s+,/g, ',');
110
-
111
- // Final cleanup - remove empty annotations
112
- text = text.replace(/\{~~\s*~>\s*~~\}/g, '');
113
- text = text.replace(/\{\+\+\s*\+\+\}/g, '');
114
- text = text.replace(/\{--\s*--\}/g, '');
115
-
116
- return text;
117
- }
118
-
119
- /**
120
- * Strip markdown syntax to get plain text
121
- */
122
- function stripMarkdownSyntax(md: string): string {
123
- return md
124
- .replace(/^---[\s\S]*?---\n*/m, '')
125
- .replace(/^#{1,6}\s+/gm, '')
126
- .replace(/(\*\*|__)(.*?)\1/g, '$2')
127
- .replace(/(\*|_)(.*?)\1/g, '$2')
128
- .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
129
- .replace(/!\[([^\]]*)\]\([^)]+\)/g, '')
130
- .replace(/`([^`]+)`/g, '$1')
131
- .replace(/```[\s\S]*?```/g, '')
132
- .replace(/^>\s*/gm, '')
133
- .replace(/^[-*_]{3,}\s*$/gm, '')
134
- .replace(/^[\s]*[-*+]\s+/gm, '')
135
- .replace(/^[\s]*\d+\.\s+/gm, '')
136
- .replace(/\|/g, ' ')
137
- .replace(/^[-:]+$/gm, '')
138
- .replace(/\n{3,}/g, '\n\n')
139
- .trim();
140
- }
141
-
142
- /**
143
- * Inject Word tables (extracted from XML) into pandoc text output
144
- */
145
- function injectWordTables(pandocText: string, wordTables: WordTable[]): string {
146
- if (!wordTables || wordTables.length === 0) {
147
- return pandocText;
148
- }
149
-
150
- let result = pandocText;
151
-
152
- for (const table of wordTables) {
153
- const firstLine = table.markdown.split('\n')[0];
154
- const headerCells = firstLine
155
- .split('|')
156
- .map((c) => c.trim())
157
- .filter((c) => c.length > 0);
158
-
159
- if (headerCells.length === 0) continue;
160
-
161
- const firstCell = headerCells[0];
162
- const startIdx = result.indexOf(firstCell);
163
-
164
- if (startIdx === -1) continue;
165
-
166
- const lastLine = table.markdown.split('\n').pop();
167
- const lastCells = lastLine!
168
- .split('|')
169
- .map((c) => c.trim())
170
- .filter((c) => c.length > 0);
171
- const lastCell = lastCells[lastCells.length - 1] || lastCells[0];
172
-
173
- const endIdx = result.indexOf(lastCell, startIdx);
174
- if (endIdx === -1) continue;
175
-
176
- let regionStart = result.lastIndexOf('\n\n', startIdx);
177
- if (regionStart === -1) regionStart = 0;
178
- else regionStart += 2;
179
-
180
- let regionEnd = result.indexOf('\n\n', endIdx + lastCell.length);
181
- if (regionEnd === -1) regionEnd = result.length;
182
-
183
- result = result.slice(0, regionStart) + table.markdown + '\n\n' + result.slice(regionEnd);
184
- }
185
-
186
- return result;
187
- }
188
-
189
- /**
190
- * Generate annotated markdown by diffing original MD against Word text
191
- */
192
- export function generateAnnotatedDiff(originalMd: string, wordText: string, author: string = 'Reviewer'): string {
193
- const normalizedOriginal = normalizeWhitespace(originalMd);
194
- const normalizedWord = normalizeWhitespace(wordText);
195
-
196
- const changes = diffWords(normalizedOriginal, normalizedWord);
197
-
198
- let result = '';
199
-
200
- for (const part of changes) {
201
- if (part.added) {
202
- result += `{++${part.value}++}`;
203
- } else if (part.removed) {
204
- result += `{--${part.value}--}`;
205
- } else {
206
- result += part.value;
207
- }
208
- }
209
-
210
- return result;
211
- }
212
-
213
- /**
214
- * Smart paragraph-level diff that preserves markdown structure
215
- */
216
- export function generateSmartDiff(
217
- originalMd: string,
218
- wordText: string,
219
- author: string = 'Reviewer',
220
- options: GenerateSmartDiffOptions = {}
221
- ): string {
222
- const { wordTables = [], imageRegistry = null } = options;
223
-
224
- // Inject Word tables into pandoc output
225
- let wordTextWithTables = injectWordTables(wordText, wordTables);
226
-
227
- // Protect markdown tables
228
- const { text: mdWithTablesProtected, tables } = protectTables(originalMd);
229
-
230
- // Also protect tables in Word text
231
- const { text: wordWithTablesProtected, tables: wordTableBlocks } = protectTables(wordTextWithTables);
232
-
233
- // Protect images
234
- const { text: mdWithImagesProtected, images: origImages } = protectImages(mdWithTablesProtected, imageRegistry);
235
-
236
- const { text: wordWithImagesProtected, images: wordImages } = protectImages(wordWithTablesProtected, imageRegistry);
237
-
238
- // Match Word images to original images
239
- const imageMapping = matchWordImagesToOriginal(origImages, wordImages, imageRegistry);
240
-
241
- // Replace Word image placeholders with matching original placeholders
242
- let wordWithMappedImages = wordWithImagesProtected;
243
- for (const [wordPlaceholder, origPlaceholder] of imageMapping) {
244
- wordWithMappedImages = wordWithMappedImages.split(wordPlaceholder).join(origPlaceholder);
245
- }
246
-
247
- // Protect figure/table anchors
248
- const { text: mdWithAnchorsProtected, anchors: figAnchors } = protectAnchors(mdWithImagesProtected);
249
-
250
- // Protect cross-references
251
- const { text: mdWithXrefsProtected, crossrefs } = protectCrossrefs(mdWithAnchorsProtected);
252
-
253
- // Protect math
254
- const { text: mdWithMathProtected, mathBlocks } = protectMath(mdWithXrefsProtected);
255
-
256
- // Protect citations
257
- const { text: mdProtected, citations } = protectCitations(mdWithMathProtected);
258
-
259
- // Replace rendered elements in Word text
260
- let wordProtected = wordWithMappedImages;
261
- wordProtected = replaceRenderedMath(wordProtected, mathBlocks);
262
- wordProtected = replaceRenderedCitations(wordProtected, citations.length);
263
-
264
- // Split into paragraphs
265
- const originalParas = mdProtected.split(/\n\n+/);
266
- const wordParas = wordProtected.split(/\n\n+/);
267
-
268
- const result: string[] = [];
269
-
270
- // Try to match paragraphs intelligently
271
- let wordIdx = 0;
272
-
273
- for (let i = 0; i < originalParas.length; i++) {
274
- const orig = originalParas[i] || '';
275
- const { prefix: mdPrefix, content: origContent } = extractMarkdownPrefix(orig.split('\n')[0]);
276
-
277
- // Find best matching word paragraph
278
- let bestMatch = -1;
279
- let bestScore = 0;
280
-
281
- for (let j = wordIdx; j < Math.min(wordIdx + 3, wordParas.length); j++) {
282
- const wordPara = wordParas[j] || '';
283
- const origWords = new Set(origContent.toLowerCase().split(/\s+/));
284
- const wordWords = wordPara.toLowerCase().split(/\s+/);
285
- const common = wordWords.filter((w) => origWords.has(w)).length;
286
- const score = common / Math.max(origWords.size, wordWords.length);
287
-
288
- if (score > bestScore && score > 0.3) {
289
- bestScore = score;
290
- bestMatch = j;
291
- }
292
- }
293
-
294
- if (bestMatch === -1) {
295
- if (mdPrefix && wordIdx < wordParas.length) {
296
- const wordPara = wordParas[wordIdx];
297
- if (wordPara.toLowerCase().includes(origContent.toLowerCase().slice(0, 20))) {
298
- bestMatch = wordIdx;
299
- }
300
- }
301
- }
302
-
303
- if (bestMatch >= 0) {
304
- const word = wordParas[bestMatch];
305
-
306
- const origStripped = stripMarkdownSyntax(orig);
307
- const wordNormalized = normalizeWhitespace(word);
308
-
309
- if (origStripped === wordNormalized) {
310
- result.push(orig);
311
- } else {
312
- const changes = diffWords(origStripped, wordNormalized);
313
- let annotated = mdPrefix;
314
-
315
- for (const part of changes) {
316
- if (part.added) {
317
- annotated += `{++${part.value}++}`;
318
- } else if (part.removed) {
319
- annotated += `{--${part.value}--}`;
320
- } else {
321
- annotated += part.value;
322
- }
323
- }
324
-
325
- result.push(annotated);
326
- }
327
-
328
- wordIdx = bestMatch + 1;
329
- } else {
330
- // Paragraph deleted entirely
331
- if (mdPrefix && mdPrefix.match(/^#{1,6}\s+/)) {
332
- result.push(orig);
333
- } else {
334
- result.push(`{--${orig}--}`);
335
- }
336
- }
337
- }
338
-
339
- // Any remaining word paragraphs are additions
340
- for (let j = wordIdx; j < wordParas.length; j++) {
341
- const word = wordParas[j];
342
- if (word.trim()) {
343
- result.push(`{++${word}++}`);
344
- }
345
- }
346
-
347
- // Restore protected content
348
- let finalResult = result.join('\n\n');
349
- finalResult = restoreCitations(finalResult, citations);
350
- finalResult = restoreMath(finalResult, mathBlocks);
351
- finalResult = restoreCrossrefs(finalResult, crossrefs);
352
- finalResult = restoreAnchors(finalResult, figAnchors);
353
- finalResult = restoreImages(finalResult, origImages);
354
- finalResult = restoreImages(finalResult, wordImages);
355
- finalResult = restoreTables(finalResult, tables);
356
- finalResult = restoreTables(finalResult, wordTableBlocks);
357
-
358
- return finalResult;
359
- }
360
-
361
- /**
362
- * Clean up redundant adjacent annotations
363
- */
364
- export function cleanupAnnotations(text: string): string {
365
- // Convert adjacent delete+insert to substitution
366
- text = text.replace(/\{--(.+?)--\}\s*\{\+\+(.+?)\+\+\}/g, '{~~$1~>$2~~}');
367
-
368
- // Also handle insert+delete
369
- text = text.replace(/\{\+\+(.+?)\+\+\}\s*\{--(.+?)--\}/g, '{~~$2~>$1~~}');
370
-
371
- // Fix malformed patterns
372
- text = text.replace(/\{--([^}]+?)~>([^}]+?)~~\}/g, '{~~$1~>$2~~}');
373
-
374
- // Fix malformed substitutions that got split
375
- text = text.replace(/\{~~([^~]+)\s*--\}/g, '{--$1--}');
376
- text = text.replace(/\{\+\+([^+]+)~~\}/g, '{++$1++}');
377
-
378
- // Clean up empty annotations
379
- text = text.replace(/\{--\s*--\}/g, '');
380
- text = text.replace(/\{\+\+\s*\+\+\}/g, '');
381
-
382
- // Clean up double spaces in prose, but preserve table formatting
383
- const lines = text.split('\n');
384
- let inTable = false;
385
-
386
- const processedLines = lines.map((line, idx) => {
387
- const isSeparator = /^[-]+(\s+[-]+)+\s*$/.test(line.trim());
388
-
389
- const looksLikeTableRow = /\S+\s{2,}\S+/.test(line);
390
-
391
- if (isSeparator) {
392
- if (!inTable) {
393
- inTable = true;
394
- }
395
- return line;
396
- }
397
-
398
- if (inTable) {
399
- if (line.trim() === '') {
400
- let lookAhead = idx + 1;
401
- let foundTableContent = false;
402
- let foundEndSeparator = false;
403
-
404
- while (lookAhead < lines.length && lookAhead < idx + 20) {
405
- const nextLine = lines[lookAhead].trim();
406
-
407
- if (nextLine === '') {
408
- lookAhead++;
409
- continue;
410
- }
411
-
412
- if (/^[-]+(\s+[-]+)+\s*$/.test(nextLine)) {
413
- foundEndSeparator = true;
414
- break;
415
- }
416
-
417
- if (/\S+\s{2,}\S+/.test(nextLine)) {
418
- foundTableContent = true;
419
- break;
420
- }
421
-
422
- if (/^\*[^*]+\*\s*$/.test(nextLine)) {
423
- foundTableContent = true;
424
- break;
425
- }
426
-
427
- if (lines[lookAhead].startsWith(' ')) {
428
- lookAhead++;
429
- continue;
430
- }
431
-
432
- break;
433
- }
434
-
435
- if (foundTableContent || foundEndSeparator) {
436
- return line;
437
- }
438
-
439
- inTable = false;
440
- return line;
441
- }
442
-
443
- return line;
444
- }
445
-
446
- if (looksLikeTableRow) {
447
- let nextIdx = idx + 1;
448
- while (nextIdx < lines.length && lines[nextIdx].trim() === '') {
449
- nextIdx++;
450
- }
451
- if (nextIdx < lines.length && /^[-]+(\s+[-]+)+\s*$/.test(lines[nextIdx].trim())) {
452
- return line;
453
- }
454
- }
455
-
456
- if (line.trim().startsWith('|')) {
457
- return line;
458
- }
459
-
460
- return line.replace(/ +/g, ' ');
461
- });
462
- text = processedLines.join('\n');
463
-
464
- return text;
465
- }
1
+ /**
2
+ * Diff engine - diffing and annotation processing for Word→Markdown import
3
+ */
4
+
5
+ import { diffWords, Change } from 'diff';
6
+ import {
7
+ extractMarkdownPrefix,
8
+ protectAnchors,
9
+ restoreAnchors,
10
+ protectCrossrefs,
11
+ restoreCrossrefs,
12
+ protectMath,
13
+ restoreMath,
14
+ replaceRenderedMath,
15
+ protectCitations,
16
+ restoreCitations,
17
+ replaceRenderedCitations,
18
+ protectImages,
19
+ restoreImages,
20
+ matchWordImagesToOriginal,
21
+ protectTables,
22
+ restoreTables,
23
+ } from './protect-restore.js';
24
+ import { normalizeWhitespace } from './utils.js';
25
+ import type { WordTable } from './word-extraction.js';
26
+
27
+ // ============================================
28
+ // Type Definitions
29
+ // ============================================
30
+
31
+ export interface GenerateSmartDiffOptions {
32
+ wordTables?: WordTable[];
33
+ imageRegistry?: any;
34
+ }
35
+
36
+ // ============================================
37
+ // Functions
38
+ // ============================================
39
+
40
+ /**
41
+ * Fix citation and math annotations by preserving original markdown syntax
42
+ */
43
+ export function fixCitationAnnotations(text: string, originalMd: string): string {
44
+ // Fix math annotations - preserve inline and display math
45
+ text = text.replace(/\{--(\$[^$]+\$)--\}/g, '$1');
46
+ text = text.replace(/\{--(\$\$[^$]+\$\$)--\}/g, '$1');
47
+
48
+ text = text.replace(/\{~~(\$[^$]+\$)~>[^~]+~~\}/g, '$1');
49
+ text = text.replace(/\{~~(\$\$[^$]+\$\$)~>[^~]+~~\}/g, '$1');
50
+
51
+ // Extract all citations from original markdown
52
+ const citationPattern = /\[@[^\]]+\]/g;
53
+ const originalCitations = [...originalMd.matchAll(citationPattern)].map(m => m[0]);
54
+
55
+ // Fix substitutions where left side has markdown citation
56
+ text = text.replace(/\{~~(\[@[^\]]+\])~>[^~]+~~\}/g, '$1');
57
+
58
+ // Fix substitutions where left side STARTS with markdown citation
59
+ text = text.replace(/\{~~(\[@[^\]]+\])\s*([^~]*)~>([^~]*)~~\}/g, (match, cite, oldText, newText) => {
60
+ if (oldText.trim() === '' && newText.trim() === '') {
61
+ return cite;
62
+ }
63
+ if (oldText.trim() || newText.trim()) {
64
+ return cite + (oldText.trim() !== newText.trim() ? ` {~~${oldText.trim()}~>${newText.trim()}~~}` : ` ${newText}`);
65
+ }
66
+ return cite;
67
+ });
68
+
69
+ // Fix deletions of markdown citations
70
+ text = text.replace(/\{--(\[@[^\]]+\])--\}/g, '$1');
71
+
72
+ // Fix insertions of rendered citations
73
+ text = text.replace(/\{\+\+\([A-Z][^)]*\d{4}[^)]*\)\+\+\}/g, '');
74
+
75
+ // Clean up broken multi-part substitutions
76
+ text = text.replace(/\{~~(@[A-Za-z]+\d{4})~>[^~]+~~\}/g, '[$1]');
77
+
78
+ // Fix citations split across substitution boundaries
79
+ text = text.replace(/\{~~\[@~>[^~]*~~\}([A-Za-z]+\d{4})\]/g, '[@$1]');
80
+
81
+ // Clean up any remaining partial citations
82
+ text = text.replace(/\{~~;\s*@([A-Za-z]+\d{4})\]~>[^~]*~~\}/g, '; [@$1]');
83
+
84
+ // Remove rendered citation insertions (with Unicode support)
85
+ text = text.replace(/\{\+\+\(\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\+\+\}/gu, '');
86
+ text = text.replace(/\{\+\+\(\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');
87
+
88
+ // Trailing citation fragments
89
+ text = text.replace(/\{\+\+\d{4}[a-z]?(?:[;,]\s*(?:\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+)?\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');
90
+ text = text.replace(/\{\+\+\d{4}[a-z]?(?:[;,]\s*(?:\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+)?\d{4}[a-z]?)*\)\s*\+\+\}/gu, '');
91
+
92
+ // Just year with closing paren
93
+ text = text.replace(/\{\+\+\d{4}[a-z]?\)\.\s*\+\+\}/g, '');
94
+ text = text.replace(/\{\+\+\d{4}[a-z]?\)\s*\+\+\}/g, '');
95
+
96
+ // Leading citation fragments
97
+ text = text.replace(/\{\+\+\(?\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s*\+\+\}/gu, '');
98
+
99
+ // Semicolon-separated fragments
100
+ text = text.replace(/\{\+\+[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?\+\+\}/gu, '');
101
+
102
+ // Year ranges with authors
103
+ text = text.replace(/\{\+\+\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\s*\+\+\}/gu, '');
104
+ text = text.replace(/\{\+\+\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');
105
+
106
+ // Clean up double spaces and orphaned punctuation
107
+ text = text.replace(/ +/g, ' ');
108
+ text = text.replace(/\s+\./g, '.');
109
+ text = text.replace(/\s+,/g, ',');
110
+
111
+ // Final cleanup - remove empty annotations
112
+ text = text.replace(/\{~~\s*~>\s*~~\}/g, '');
113
+ text = text.replace(/\{\+\+\s*\+\+\}/g, '');
114
+ text = text.replace(/\{--\s*--\}/g, '');
115
+
116
+ return text;
117
+ }
118
+
119
+ /**
120
+ * Strip markdown syntax to get plain text
121
+ */
122
+ function stripMarkdownSyntax(md: string): string {
123
+ return md
124
+ .replace(/^---[\s\S]*?---\n*/m, '')
125
+ .replace(/^#{1,6}\s+/gm, '')
126
+ .replace(/(\*\*|__)(.*?)\1/g, '$2')
127
+ .replace(/(\*|_)(.*?)\1/g, '$2')
128
+ .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
129
+ .replace(/!\[([^\]]*)\]\([^)]+\)/g, '')
130
+ .replace(/`([^`]+)`/g, '$1')
131
+ .replace(/```[\s\S]*?```/g, '')
132
+ .replace(/^>\s*/gm, '')
133
+ .replace(/^[-*_]{3,}\s*$/gm, '')
134
+ .replace(/^[\s]*[-*+]\s+/gm, '')
135
+ .replace(/^[\s]*\d+\.\s+/gm, '')
136
+ .replace(/\|/g, ' ')
137
+ .replace(/^[-:]+$/gm, '')
138
+ .replace(/\n{3,}/g, '\n\n')
139
+ .trim();
140
+ }
141
+
142
+ /**
143
+ * Inject Word tables (extracted from XML) into pandoc text output
144
+ */
145
+ function injectWordTables(pandocText: string, wordTables: WordTable[]): string {
146
+ if (!wordTables || wordTables.length === 0) {
147
+ return pandocText;
148
+ }
149
+
150
+ let result = pandocText;
151
+
152
+ for (const table of wordTables) {
153
+ const firstLine = table.markdown.split('\n')[0];
154
+ const headerCells = firstLine
155
+ .split('|')
156
+ .map((c) => c.trim())
157
+ .filter((c) => c.length > 0);
158
+
159
+ if (headerCells.length === 0) continue;
160
+
161
+ const firstCell = headerCells[0];
162
+ const startIdx = result.indexOf(firstCell);
163
+
164
+ if (startIdx === -1) continue;
165
+
166
+ const lastLine = table.markdown.split('\n').pop();
167
+ const lastCells = lastLine!
168
+ .split('|')
169
+ .map((c) => c.trim())
170
+ .filter((c) => c.length > 0);
171
+ const lastCell = lastCells[lastCells.length - 1] || lastCells[0];
172
+
173
+ const endIdx = result.indexOf(lastCell, startIdx);
174
+ if (endIdx === -1) continue;
175
+
176
+ let regionStart = result.lastIndexOf('\n\n', startIdx);
177
+ if (regionStart === -1) regionStart = 0;
178
+ else regionStart += 2;
179
+
180
+ let regionEnd = result.indexOf('\n\n', endIdx + lastCell.length);
181
+ if (regionEnd === -1) regionEnd = result.length;
182
+
183
+ result = result.slice(0, regionStart) + table.markdown + '\n\n' + result.slice(regionEnd);
184
+ }
185
+
186
+ return result;
187
+ }
188
+
189
+ /**
190
+ * Generate annotated markdown by diffing original MD against Word text
191
+ */
192
+ export function generateAnnotatedDiff(originalMd: string, wordText: string, author: string = 'Reviewer'): string {
193
+ const normalizedOriginal = normalizeWhitespace(originalMd);
194
+ const normalizedWord = normalizeWhitespace(wordText);
195
+
196
+ const changes = diffWords(normalizedOriginal, normalizedWord);
197
+
198
+ let result = '';
199
+
200
+ for (const part of changes) {
201
+ if (part.added) {
202
+ result += `{++${part.value}++}`;
203
+ } else if (part.removed) {
204
+ result += `{--${part.value}--}`;
205
+ } else {
206
+ result += part.value;
207
+ }
208
+ }
209
+
210
+ return result;
211
+ }
212
+
213
/**
 * Smart paragraph-level diff that preserves markdown structure.
 *
 * The original markdown and the Word-derived text are first run through the
 * `protect*` helpers (tables, images, anchors, cross-references, math,
 * citations) so those constructs are replaced by placeholders before diffing.
 * Both texts are then split on blank lines and each original paragraph is
 * aligned against a small look-ahead window of Word paragraphs. Matched
 * paragraphs are word-diffed and annotated with CriticMarkup-style
 * `{++inserted++}` / `{--deleted--}` markers; protected content is restored
 * at the end.
 *
 * @param originalMd - Original markdown source.
 * @param wordText - Text extracted from the Word document.
 * @param author - Reviewer name. Not referenced in this function's body; kept
 *   for signature compatibility with callers.
 * @param options - Optional `wordTables` (injected into `wordText`) and
 *   `imageRegistry` (passed to the image protect/match helpers).
 * @returns The annotated markdown.
 */
export function generateSmartDiff(
  originalMd: string,
  wordText: string,
  author: string = 'Reviewer',
  options: GenerateSmartDiffOptions = {}
): string {
  const { wordTables = [], imageRegistry = null } = options;

  // Inject Word tables into pandoc output
  const wordTextWithTables = injectWordTables(wordText, wordTables);

  // Protect markdown tables
  const { text: mdWithTablesProtected, tables } = protectTables(originalMd);

  // Also protect tables in Word text
  const { text: wordWithTablesProtected, tables: wordTableBlocks } = protectTables(wordTextWithTables);

  // Protect images
  const { text: mdWithImagesProtected, images: origImages } = protectImages(mdWithTablesProtected, imageRegistry);
  const { text: wordWithImagesProtected, images: wordImages } = protectImages(wordWithTablesProtected, imageRegistry);

  // Match Word images to original images
  const imageMapping = matchWordImagesToOriginal(origImages, wordImages, imageRegistry);

  // Replace Word image placeholders with matching original placeholders
  let wordWithMappedImages = wordWithImagesProtected;
  for (const [wordPlaceholder, origPlaceholder] of imageMapping) {
    wordWithMappedImages = wordWithMappedImages.split(wordPlaceholder).join(origPlaceholder);
  }

  // Protect figure/table anchors
  const { text: mdWithAnchorsProtected, anchors: figAnchors } = protectAnchors(mdWithImagesProtected);

  // Protect cross-references
  const { text: mdWithXrefsProtected, crossrefs } = protectCrossrefs(mdWithAnchorsProtected);

  // Protect math
  const { text: mdWithMathProtected, mathBlocks } = protectMath(mdWithXrefsProtected);

  // Protect citations
  const { text: mdProtected, citations } = protectCitations(mdWithMathProtected);

  // Replace rendered elements in Word text
  let wordProtected = wordWithMappedImages;
  wordProtected = replaceRenderedMath(wordProtected, mathBlocks);
  wordProtected = replaceRenderedCitations(wordProtected, citations.length);

  // Split into paragraphs
  const originalParas = mdProtected.split(/\n\n+/);
  const wordParas = wordProtected.split(/\n\n+/);

  const result: string[] = [];

  // Index of the first Word paragraph not yet consumed by a match
  let wordIdx = 0;

  for (let i = 0; i < originalParas.length; i++) {
    const orig = originalParas[i] || '';
    const { prefix: mdPrefix, content: origContent } = extractMarkdownPrefix(orig.split('\n')[0]);

    // Token set of the original paragraph's first line; it does not depend on
    // the candidate, so build it once per original paragraph.
    const origWords = new Set(origContent.toLowerCase().split(/\s+/));

    // Find the best matching Word paragraph among the next three candidates
    let bestMatch = -1;
    let bestScore = 0;
    const windowEnd = Math.min(wordIdx + 3, wordParas.length);

    for (let j = wordIdx; j < windowEnd; j++) {
      const wordPara = wordParas[j] || '';
      const wordWords = wordPara.toLowerCase().split(/\s+/);
      const common = wordWords.filter((w) => origWords.has(w)).length;
      const score = common / Math.max(origWords.size, wordWords.length);

      // Require more than 30% overlap before accepting a candidate
      if (score > bestScore && score > 0.3) {
        bestScore = score;
        bestMatch = j;
      }
    }

    // Fallback for prefixed paragraphs (headings, list items, ...): accept the
    // next Word paragraph if it contains the start of the original content.
    if (bestMatch === -1 && mdPrefix && wordIdx < wordParas.length) {
      const probe = origContent.toLowerCase().slice(0, 20);
      // An empty (or blank) probe is contained in every string; without this
      // guard an empty heading/list marker would swallow an unrelated
      // paragraph and misalign everything after it.
      if (probe.trim() !== '' && wordParas[wordIdx].toLowerCase().includes(probe)) {
        bestMatch = wordIdx;
      }
    }

    if (bestMatch >= 0) {
      const word = wordParas[bestMatch];

      const origStripped = stripMarkdownSyntax(orig);
      const wordNormalized = normalizeWhitespace(word);

      if (origStripped === wordNormalized) {
        // No textual change: keep the original markdown verbatim
        result.push(orig);
      } else {
        const changes = diffWords(origStripped, wordNormalized);
        let annotated = mdPrefix;

        for (const part of changes) {
          if (part.added) {
            annotated += `{++${part.value}++}`;
          } else if (part.removed) {
            annotated += `{--${part.value}--}`;
          } else {
            annotated += part.value;
          }
        }

        result.push(annotated);
      }

      wordIdx = bestMatch + 1;
    } else if (mdPrefix && /^#{1,6}\s+/.test(mdPrefix)) {
      // Unmatched headings are kept as-is rather than marked deleted
      result.push(orig);
    } else {
      // Paragraph deleted entirely
      result.push(`{--${orig}--}`);
    }
  }

  // Any remaining word paragraphs are additions
  for (let j = wordIdx; j < wordParas.length; j++) {
    const word = wordParas[j];
    if (word.trim()) {
      result.push(`{++${word}++}`);
    }
  }

  // Restore protected content
  let finalResult = result.join('\n\n');
  finalResult = restoreCitations(finalResult, citations);
  finalResult = restoreMath(finalResult, mathBlocks);
  finalResult = restoreCrossrefs(finalResult, crossrefs);
  finalResult = restoreAnchors(finalResult, figAnchors);
  finalResult = restoreImages(finalResult, origImages);
  finalResult = restoreImages(finalResult, wordImages);
  finalResult = restoreTables(finalResult, tables);
  finalResult = restoreTables(finalResult, wordTableBlocks);

  return finalResult;
}
360
+
361
/**
 * Clean up redundant adjacent annotations.
 *
 * Steps, in order:
 *  1. Merge an adjacent deletion + insertion (either order, optionally
 *     separated by whitespace) into a `{~~old~>new~~}` substitution.
 *  2. Repair a few malformed annotation shapes.
 *  3. Drop empty `{----}` / `{++++}` annotations.
 *  4. Collapse runs of spaces on prose lines, leaving dash-separator tables
 *     and lines starting with `|` untouched.
 *
 * @param text - Annotated markdown.
 * @returns The cleaned-up text.
 */
export function cleanupAnnotations(text: string): string {
  // The captured bodies are "tempered": they may not run past their own
  // closing delimiter (`--}` / `++}`). A plain `.+?` can backtrack across it,
  // so `{--a--} x {--b--}{++c++}` would be merged starting at the FIRST
  // deletion and produce a malformed `{~~a--} x {--b~>c~~}`.

  // Convert adjacent delete+insert to substitution
  text = text.replace(
    /\{--((?:(?!--\}).)+)--\}\s*\{\+\+((?:(?!\+\+\}).)+)\+\+\}/g,
    '{~~$1~>$2~~}'
  );

  // Also handle insert+delete
  text = text.replace(
    /\{\+\+((?:(?!\+\+\}).)+)\+\+\}\s*\{--((?:(?!--\}).)+)--\}/g,
    '{~~$2~>$1~~}'
  );

  // Fix malformed patterns
  text = text.replace(/\{--([^}]+?)~>([^}]+?)~~\}/g, '{~~$1~>$2~~}');

  // Fix malformed substitutions that got split
  text = text.replace(/\{~~([^~]+)\s*--\}/g, '{--$1--}');
  text = text.replace(/\{\+\+([^+]+)~~\}/g, '{++$1++}');

  // Clean up empty annotations
  text = text.replace(/\{--\s*--\}/g, '');
  text = text.replace(/\{\+\+\s*\+\+\}/g, '');

  // Clean up double spaces in prose, but preserve table formatting
  const separatorRe = /^[-]+(\s+[-]+)+\s*$/; // e.g. "----  ----  ----"
  const columnGapRe = /\S+\s{2,}\S+/; // two cells separated by 2+ whitespace chars
  const captionRe = /^\*[^*]+\*\s*$/; // a line that is entirely *emphasised*

  const lines = text.split('\n');
  let inTable = false;

  // Decide whether the table continues past the blank line at `blankIdx` by
  // scanning at most the next 19 lines. Blank lines and lines starting with a
  // space are skipped; any other non-table line ends the table.
  const tableContinuesAfter = (blankIdx: number): boolean => {
    const limit = Math.min(lines.length, blankIdx + 20);
    for (let k = blankIdx + 1; k < limit; k++) {
      const candidate = lines[k].trim();
      if (candidate === '') {
        continue;
      }
      if (separatorRe.test(candidate) || columnGapRe.test(candidate) || captionRe.test(candidate)) {
        return true;
      }
      if (!lines[k].startsWith(' ')) {
        return false;
      }
    }
    return false;
  };

  const processedLines = lines.map((line, idx) => {
    const trimmed = line.trim();

    // A dash separator opens (or continues) a table and is kept verbatim
    if (separatorRe.test(trimmed)) {
      inTable = true;
      return line;
    }

    // Inside a table every line is kept verbatim; a blank line closes the
    // table unless more table content follows.
    if (inTable) {
      if (trimmed === '' && !tableContinuesAfter(idx)) {
        inTable = false;
      }
      return line;
    }

    // A header row sits above the first separator, i.e. before `inTable` is
    // set: keep it if the next non-blank line is a separator.
    if (columnGapRe.test(line)) {
      let nextIdx = idx + 1;
      while (nextIdx < lines.length && lines[nextIdx].trim() === '') {
        nextIdx++;
      }
      if (nextIdx < lines.length && separatorRe.test(lines[nextIdx].trim())) {
        return line;
      }
    }

    // Lines starting with a pipe are kept verbatim
    if (trimmed.startsWith('|')) {
      return line;
    }

    // Prose: collapse each run of spaces to a single space
    return line.replace(/ {2,}/g, ' ');
  });

  return processedLines.join('\n');
}