docrev 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +75 -0
- package/README.md +313 -0
- package/bin/rev.js +2645 -0
- package/lib/annotations.js +321 -0
- package/lib/build.js +486 -0
- package/lib/citations.js +149 -0
- package/lib/config.js +60 -0
- package/lib/crossref.js +426 -0
- package/lib/doi.js +823 -0
- package/lib/equations.js +258 -0
- package/lib/format.js +420 -0
- package/lib/import.js +1018 -0
- package/lib/response.js +182 -0
- package/lib/review.js +208 -0
- package/lib/sections.js +345 -0
- package/lib/templates.js +305 -0
- package/package.json +43 -0
package/lib/import.js
ADDED
@@ -0,0 +1,1018 @@

/**
 * Import functionality - convert Word docs to annotated Markdown
 */

import * as fs from 'fs';
import * as path from 'path';
import { diffWords } from 'diff';

/**
 * Extract comments directly from Word docx comments.xml
 * @param {string} docxPath
 * @returns {Promise<Array<{id: string, author: string, date: string, text: string}>>}
 */
export async function extractWordComments(docxPath) {
  const AdmZip = (await import('adm-zip')).default;
  const { parseStringPromise } = await import('xml2js');

  const comments = [];

  try {
    const zip = new AdmZip(docxPath);
    const commentsEntry = zip.getEntry('word/comments.xml');

    if (!commentsEntry) {
      return comments;
    }

    const commentsXml = commentsEntry.getData().toString('utf8');
    const parsed = await parseStringPromise(commentsXml, { explicitArray: false });

    const ns = 'w:';
    const commentsRoot = parsed['w:comments'];
    if (!commentsRoot || !commentsRoot['w:comment']) {
      return comments;
    }

    // Ensure it's an array
    const commentNodes = Array.isArray(commentsRoot['w:comment'])
      ? commentsRoot['w:comment']
      : [commentsRoot['w:comment']];

    for (const comment of commentNodes) {
      const id = comment.$?.['w:id'] || '';
      const author = comment.$?.['w:author'] || 'Unknown';
      const date = comment.$?.['w:date'] || '';

      // Extract text from nested w:p/w:r/w:t elements
      let text = '';
      const extractText = (node) => {
        if (!node) return;
        if (typeof node === 'string') {
          text += node;
          return;
        }
        if (node['w:t']) {
          const t = node['w:t'];
          text += typeof t === 'string' ? t : (t._ || t);
        }
        if (node['w:r']) {
          const runs = Array.isArray(node['w:r']) ? node['w:r'] : [node['w:r']];
          runs.forEach(extractText);
        }
        if (node['w:p']) {
          const paras = Array.isArray(node['w:p']) ? node['w:p'] : [node['w:p']];
          paras.forEach(extractText);
        }
      };
      extractText(comment);

      comments.push({ id, author, date: date.slice(0, 10), text: text.trim() });
    }
  } catch (err) {
    console.error('Error extracting comments:', err.message);
  }

  return comments;
}

/**
 * Extract comment anchor texts from document.xml with surrounding context
 * Returns map of comment ID -> {anchor, before, after} for better matching
 * @param {string} docxPath
 * @returns {Promise<Map<string, {anchor: string, before: string, after: string}>>}
 */
export async function extractCommentAnchors(docxPath) {
  const AdmZip = (await import('adm-zip')).default;
  const anchors = new Map();

  try {
    const zip = new AdmZip(docxPath);
    const docEntry = zip.getEntry('word/document.xml');

    if (!docEntry) {
      return anchors;
    }

    const docXml = docEntry.getData().toString('utf8');

    // Extract ALL text nodes in document order for context
    const allTextNodes = [...docXml.matchAll(/<w:t[^>]*>([^<]*)<\/w:t>/g)].map(m => m[1]);
    const fullDocText = allTextNodes.join('');

    // Find commentRangeStart...commentRangeEnd pairs
    const rangePattern = /<w:commentRangeStart[^>]*w:id="(\d+)"[^>]*\/?>[\s\S]*?<w:commentRangeEnd[^>]*w:id="\1"[^>]*\/?>/g;

    let match;
    while ((match = rangePattern.exec(docXml)) !== null) {
      const id = match[1];
      const rangeContent = match[0];

      // Extract all w:t text within this range
      const textMatches = rangeContent.matchAll(/<w:t[^>]*>([^<]*)<\/w:t>/g);
      let anchorText = '';
      for (const tm of textMatches) {
        anchorText += tm[1];
      }

      if (anchorText.trim()) {
        // Get surrounding context from full document
        const anchorPos = fullDocText.indexOf(anchorText.trim());
        let before = '';
        let after = '';

        if (anchorPos >= 0) {
          // Get ~100 chars before (up to sentence boundary)
          const beforeText = fullDocText.slice(Math.max(0, anchorPos - 150), anchorPos);
          const sentenceStart = beforeText.search(/[.!?]\s+[A-Z][^.!?]*$/);
          before = sentenceStart >= 0 ? beforeText.slice(sentenceStart + 2).trim() : beforeText.slice(-80).trim();

          // Get ~100 chars after (up to sentence boundary)
          const afterStart = anchorPos + anchorText.length;
          const afterText = fullDocText.slice(afterStart, afterStart + 150);
          const sentenceEnd = afterText.search(/[.!?]\s/);
          after = sentenceEnd >= 0 ? afterText.slice(0, sentenceEnd + 1).trim() : afterText.slice(0, 80).trim();
        }

        anchors.set(id, { anchor: anchorText.trim(), before, after });
      }
    }
  } catch (err) {
    console.error('Error extracting comment anchors:', err.message);
  }

  return anchors;
}

/**
 * Extract text from Word document using mammoth
 * @param {string} docxPath
 * @returns {Promise<{text: string, comments: Array, anchors: Map}>}
 */
export async function extractFromWord(docxPath) {
  const mammoth = await import('mammoth');

  // Extract raw text
  const textResult = await mammoth.extractRawText({ path: docxPath });

  // Try to extract with messages (may include comments info)
  const htmlResult = await mammoth.convertToHtml({ path: docxPath });

  // Extract comments directly from docx XML
  const comments = await extractWordComments(docxPath);

  // Extract comment anchor texts
  const anchors = await extractCommentAnchors(docxPath);

  return {
    text: textResult.value,
    comments,
    anchors,
    messages: [...textResult.messages, ...htmlResult.messages],
  };
}

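A minimal usage sketch for the extraction helpers above (illustrative only, not part of import.js; the .docx path and the relative import specifier are assumptions):

// Example: read text, comments, and comment anchors from a reviewed docx
import { extractFromWord } from './lib/import.js';

const { text, comments, anchors } = await extractFromWord('reviewed/draft.docx');
for (const c of comments) {
  const anchor = anchors.get(c.id);
  console.log(`${c.author} (${c.date}): ${c.text}`);
  if (anchor) console.log(`  on: "${anchor.anchor}"`);
}
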
/**
 * Insert comments into markdown text based on anchor texts with context
 * Uses sentence context for disambiguation and tie-breaks for duplicates
 * @param {string} markdown - The markdown text
 * @param {Array} comments - Array of {id, author, text}
 * @param {Map} anchors - Map of comment id -> {anchor, before, after} or string (legacy)
 * @param {object} options - Options {quiet: boolean}
 * @returns {string} - Markdown with comments inserted
 */
export function insertCommentsIntoMarkdown(markdown, comments, anchors, options = {}) {
  const { quiet = false } = options;
  let result = markdown;
  let unmatchedCount = 0;
  const duplicateWarnings = [];
  const usedPositions = new Set(); // For tie-breaking: track used positions

  // Get all positions in order (for sequential tie-breaking)
  const commentsWithPositions = comments.map((c) => {
    const anchorData = anchors.get(c.id);
    if (!anchorData) {
      unmatchedCount++;
      return { ...c, pos: -1, anchorText: null };
    }

    // Support both old format (string) and new format ({anchor, before, after})
    const anchor = typeof anchorData === 'string' ? anchorData : anchorData.anchor;
    const before = typeof anchorData === 'object' ? anchorData.before : '';
    const after = typeof anchorData === 'object' ? anchorData.after : '';

    const anchorLower = anchor.toLowerCase();
    const resultLower = result.toLowerCase();

    // Find ALL occurrences of anchor text
    const occurrences = [];
    let searchIdx = 0;
    while ((searchIdx = resultLower.indexOf(anchorLower, searchIdx)) !== -1) {
      occurrences.push(searchIdx);
      searchIdx += 1;
    }

    if (occurrences.length === 0) {
      // Try normalized whitespace match
      const normalizedAnchor = anchor.replace(/\s+/g, ' ').toLowerCase();
      const normalizedResult = result.replace(/\s+/g, ' ').toLowerCase();
      const normalizedIdx = normalizedResult.indexOf(normalizedAnchor);

      if (normalizedIdx !== -1) {
        return { ...c, pos: normalizedIdx + anchor.length, anchorText: anchor };
      }
      unmatchedCount++;
      return { ...c, pos: -1, anchorText: null };
    }

    if (occurrences.length === 1) {
      // Unique match - easy case
      return { ...c, pos: occurrences[0] + anchor.length, anchorText: anchor };
    }

    // Multiple occurrences - use context for disambiguation
    duplicateWarnings.push(`"${anchor.slice(0, 40)}${anchor.length > 40 ? '...' : ''}" appears ${occurrences.length} times`);

    // Score each occurrence based on context match
    // Initialize to first UNUSED occurrence (for tie-break correctness)
    let bestIdx = occurrences.find(p => !usedPositions.has(p)) ?? occurrences[0];
    let bestScore = -1; // Start at -1 so first valid candidate wins

    for (const pos of occurrences) {
      // Skip positions already used by previous comments
      if (usedPositions.has(pos)) continue;

      let score = 0;

      // Check context before
      if (before) {
        const contextBefore = result.slice(Math.max(0, pos - before.length - 20), pos).toLowerCase();
        const beforeLower = before.toLowerCase();
        // Check if context contains parts of 'before'
        const beforeWords = beforeLower.split(/\s+/).filter(w => w.length > 3);
        for (const word of beforeWords) {
          if (contextBefore.includes(word)) score += 2;
        }
        // Bonus for full match
        if (contextBefore.includes(beforeLower.slice(-30))) score += 5;
      }

      // Check context after
      if (after) {
        const contextAfter = result.slice(pos + anchor.length, pos + anchor.length + after.length + 20).toLowerCase();
        const afterLower = after.toLowerCase();
        // Check if context contains parts of 'after'
        const afterWords = afterLower.split(/\s+/).filter(w => w.length > 3);
        for (const word of afterWords) {
          if (contextAfter.includes(word)) score += 2;
        }
        // Bonus for full match
        if (contextAfter.includes(afterLower.slice(0, 30))) score += 5;
      }

      // Tie-break: prefer earlier unused occurrence (document order)
      if (score > bestScore || (score === bestScore && pos < bestIdx)) {
        bestScore = score;
        bestIdx = pos;
      }
    }

    // Mark this position as used for tie-breaking subsequent comments
    usedPositions.add(bestIdx);

    return { ...c, pos: bestIdx + anchor.length, anchorText: anchor };
  }).filter((c) => c.pos > 0);

  // Sort by position descending (insert from end to avoid offset issues)
  commentsWithPositions.sort((a, b) => b.pos - a.pos);

  // Insert each comment
  for (const c of commentsWithPositions) {
    const commentMark = ` {>>${c.author}: ${c.text}<<}`;
    result = result.slice(0, c.pos) + commentMark + result.slice(c.pos);
  }

  // Log warnings unless quiet mode
  if (!quiet) {
    if (unmatchedCount > 0) {
      console.warn(`Warning: ${unmatchedCount} comment(s) could not be matched to anchor text`);
    }
    if (duplicateWarnings.length > 0) {
      console.warn(`Warning: Duplicate anchor text found (using context & tie-breaks for placement):`);
      for (const w of duplicateWarnings) {
        console.warn(`  - ${w}`);
      }
    }
  }

  return result;
}

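A small sketch of how the comment helpers combine (illustrative; the file paths are hypothetical):

// Example: re-attach Word comments to the original Markdown as CriticMarkup
import * as fs from 'fs';
import { extractWordComments, extractCommentAnchors, insertCommentsIntoMarkdown } from './lib/import.js';

const docx = 'reviewed/draft.docx';
const md = fs.readFileSync('draft.md', 'utf-8');
const comments = await extractWordComments(docx);
const anchors = await extractCommentAnchors(docx);
const withComments = insertCommentsIntoMarkdown(md, comments, anchors, { quiet: true });
// Each matched comment now appears inline as {>>Author: comment text<<}
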
/**
 * Normalize text for comparison (handle whitespace differences)
 * @param {string} text
 * @returns {string}
 */
function normalizeText(text) {
  return text
    .replace(/\r\n/g, '\n') // Normalize line endings
    .replace(/\t/g, ' ')    // Tabs to spaces
    .replace(/ +/g, ' ')    // Collapse multiple spaces
    .trim();
}

/**
 * Fix citation and math annotations by preserving original markdown syntax
 * When Word renders [@Author2021] as "(Author et al. 2021)" or $p$ as "p", we preserve markdown
 * @param {string} text - Annotated text
 * @param {string} originalMd - Original markdown with proper citations and math
 * @returns {string}
 */
function fixCitationAnnotations(text, originalMd) {
  // Step 0: Fix math annotations - preserve inline and display math
  // Deletions of inline math should keep the math: {--$p$--} -> $p$
  text = text.replace(/\{--(\$[^$]+\$)--\}/g, '$1');
  text = text.replace(/\{--(\$\$[^$]+\$\$)--\}/g, '$1');

  // Substitutions where math was "changed" to rendered form: {~~$p$~>p~~} -> $p$
  text = text.replace(/\{~~(\$[^$]+\$)~>[^~]+~~\}/g, '$1');
  text = text.replace(/\{~~(\$\$[^$]+\$\$)~>[^~]+~~\}/g, '$1');

  // Extract all citations from original markdown with positions
  const citationPattern = /\[@[^\]]+\]/g;
  const originalCitations = [...originalMd.matchAll(citationPattern)].map(m => m[0]);

  // Step 1: Fix substitutions where left side has markdown citation
  // {~~[@Author]~>rendered~~} -> [@Author]
  text = text.replace(/\{~~(\[@[^\]]+\])~>[^~]+~~\}/g, '$1');

  // Step 2: Fix substitutions where left side STARTS with markdown citation
  // {~~[@Author] more text~>rendered more~~} -> [@Author] {~~more text~>more~~}
  text = text.replace(/\{~~(\[@[^\]]+\])\s*([^~]*)~>([^~]*)~~\}/g, (match, cite, oldText, newText) => {
    // If old and new text are similar (just whitespace/formatting), keep cite + new
    if (oldText.trim() === '' && newText.trim() === '') {
      return cite;
    }
    // Otherwise, keep citation and create substitution for the rest
    if (oldText.trim() || newText.trim()) {
      return cite + (oldText.trim() !== newText.trim() ? ` {~~${oldText.trim()}~>${newText.trim()}~~}` : ` ${newText}`);
    }
    return cite;
  });

  // Step 3: Fix deletions of markdown citations (should keep them)
  text = text.replace(/\{--(\[@[^\]]+\])--\}/g, '$1');

  // Step 4: Fix insertions of rendered citations (usually duplicates, remove)
  // {++(Author et al. 2021)++} or {++(Author 2021)++}
  text = text.replace(/\{\+\+\([A-Z][^)]*\d{4}[^)]*\)\+\+\}/g, '');

  // Step 5: Clean up broken multi-part substitutions involving citations
  // Pattern: {~~[@cite~>rendered~~} {~~text~>more~~} -> [@cite] {~~text~>more~~}
  text = text.replace(/\{~~(@[A-Za-z]+\d{4})~>[^~]+~~\}/g, '[$1]');

  // Step 6: Fix citations split across substitution boundaries
  // {~~[@~>something~~}Author2021] -> [@Author2021]
  text = text.replace(/\{~~\[@~>[^~]*~~\}([A-Za-z]+\d{4})\]/g, '[@$1]');

  // Step 7: Clean up any remaining partial citations in substitutions
  // {~~; @Author2021]~>something~~} -> ; [@Author2021]
  text = text.replace(/\{~~;\s*@([A-Za-z]+\d{4})\]~>[^~]*~~\}/g, '; [@$1]');

  // Step 8: Remove rendered citation insertions (fragments left over from citation matching)
  // These are leftover pieces of rendered citations that didn't match placeholders
  // Use \p{L} for Unicode letters to handle accented chars (š, é, ü, etc.)

  // Full rendered citations in parentheses: {++(Author et al. 2021)++}
  text = text.replace(/\{\+\+\(\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\+\+\}/gu, '');
  text = text.replace(/\{\+\+\(\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');

  // Trailing citation fragments: {++2019; IPBES 2023). ++} or {++2008b; Rouget et al. 2016). ++}
  text = text.replace(/\{\+\+\d{4}[a-z]?(?:[;,]\s*(?:\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+)?\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');
  text = text.replace(/\{\+\+\d{4}[a-z]?(?:[;,]\s*(?:\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+)?\d{4}[a-z]?)*\)\s*\+\+\}/gu, '');

  // Just year with closing paren: {++2021)++} or {++2021).++}
  text = text.replace(/\{\+\+\d{4}[a-z]?\)\.\s*\+\+\}/g, '');
  text = text.replace(/\{\+\+\d{4}[a-z]?\)\s*\+\+\}/g, '');

  // Leading citation fragments: {++Author et al.++} or {++(Author++}
  text = text.replace(/\{\+\+\(?\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s*\+\+\}/gu, '');

  // Semicolon-separated author-year fragments: {++; Author 2021++}
  text = text.replace(/\{\+\+[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?\+\+\}/gu, '');

  // Year ranges with authors: {++Author 2019; Other 2020)++}
  text = text.replace(/\{\+\+\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\s*\+\+\}/gu, '');
  text = text.replace(/\{\+\+\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');

  // Step 9: Clean up double spaces and orphaned punctuation
  text = text.replace(/ +/g, ' ');
  text = text.replace(/\s+\./g, '.');
  text = text.replace(/\s+,/g, ',');

  // Step 10: Final cleanup - remove empty annotations
  text = text.replace(/\{~~\s*~>\s*~~\}/g, '');
  text = text.replace(/\{\+\+\s*\+\+\}/g, '');
  text = text.replace(/\{--\s*--\}/g, '');

  return text;
}

/**
 * Strip markdown syntax to get plain text (for comparison with Word output)
 * @param {string} md
 * @returns {string}
 */
function stripMarkdownSyntax(md) {
  return md
    // Remove YAML front matter
    .replace(/^---[\s\S]*?---\n*/m, '')
    // Headers: # Title → Title
    .replace(/^#{1,6}\s+/gm, '')
    // Bold/italic: **text** or *text* or __text__ or _text_ → text
    .replace(/(\*\*|__)(.*?)\1/g, '$2')
    .replace(/(\*|_)(.*?)\1/g, '$2')
    // Links: [text](url) → text
    .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
    // Images: ![alt](url) → (remove entirely or keep alt)
    .replace(/!\[([^\]]*)\]\([^)]+\)/g, '')
    // Inline code: `code` → code
    .replace(/`([^`]+)`/g, '$1')
    // Code blocks: ```...``` → (remove)
    .replace(/```[\s\S]*?```/g, '')
    // Blockquotes: > text → text
    .replace(/^>\s*/gm, '')
    // Horizontal rules
    .replace(/^[-*_]{3,}\s*$/gm, '')
    // List markers: - item or * item or 1. item → item
    .replace(/^[\s]*[-*+]\s+/gm, '')
    .replace(/^[\s]*\d+\.\s+/gm, '')
    // Citations: [@Author2020] → (keep as-is, Word might have them)
    // Tables: simplified handling
    .replace(/\|/g, ' ')
    .replace(/^[-:]+$/gm, '')
    // Clean up extra whitespace
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}

/**
 * Generate annotated markdown by diffing original MD against Word text
 * @param {string} originalMd - Original markdown content
 * @param {string} wordText - Text extracted from Word
 * @param {string} author - Author name for the changes
 * @returns {string} Annotated markdown with CriticMarkup
 */
export function generateAnnotatedDiff(originalMd, wordText, author = 'Reviewer') {
  // Normalize both texts
  const normalizedOriginal = normalizeText(originalMd);
  const normalizedWord = normalizeText(wordText);

  // Compute word-level diff
  const changes = diffWords(normalizedOriginal, normalizedWord);

  let result = '';

  for (const part of changes) {
    if (part.added) {
      // Insertion
      result += `{++${part.value}++}`;
    } else if (part.removed) {
      // Deletion
      result += `{--${part.value}--}`;
    } else {
      // Unchanged
      result += part.value;
    }
  }

  return result;
}

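For a concrete sense of the output, a word-level diff of two short strings (illustrative sketch; exact whitespace in the markers depends on how diffWords splits the text):

// Example: word-level CriticMarkup diff
import { generateAnnotatedDiff } from './lib/import.js';

const original = 'The model performs well on large datasets.';
const revised  = 'The model performs poorly on small datasets.';
console.log(generateAnnotatedDiff(original, revised));
// → roughly: "The model performs {--well--}{++poorly++} on {--large--}{++small++} datasets."
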
/**
 * Extract markdown prefix (headers, list markers) from a line
 * @param {string} line
 * @returns {{prefix: string, content: string}}
 */
function extractMarkdownPrefix(line) {
  // Headers
  const headerMatch = line.match(/^(#{1,6}\s+)/);
  if (headerMatch) {
    return { prefix: headerMatch[1], content: line.slice(headerMatch[1].length) };
  }

  // List items
  const listMatch = line.match(/^(\s*[-*+]\s+|\s*\d+\.\s+)/);
  if (listMatch) {
    return { prefix: listMatch[1], content: line.slice(listMatch[1].length) };
  }

  // Blockquotes
  const quoteMatch = line.match(/^(>\s*)/);
  if (quoteMatch) {
    return { prefix: quoteMatch[1], content: line.slice(quoteMatch[1].length) };
  }

  return { prefix: '', content: line };
}

/**
 * Protect figure/table anchors before diffing
 * Anchors like {#fig:heatmap} and {#tbl:results} should never be deleted
 * @param {string} md
 * @returns {{text: string, anchors: Array<{original: string, placeholder: string}>}}
 */
function protectAnchors(md) {
  const anchors = [];

  // Match {#fig:label}, {#tbl:label}, {#eq:label}, {#sec:label} etc.
  // Also match with additional attributes like {#fig:label width=50%}
  const text = md.replace(/\{#(fig|tbl|eq|sec|lst):[^}]+\}/g, (match) => {
    const idx = anchors.length;
    const placeholder = `ANCHORBLOCK${idx}ENDANCHOR`;
    anchors.push({ original: match, placeholder });
    return placeholder;
  });

  return { text, anchors };
}

/**
 * Restore anchors from placeholders
 * @param {string} text
 * @param {Array} anchors
 * @returns {string}
 */
function restoreAnchors(text, anchors) {
  for (const anchor of anchors) {
    // Handle case where anchor is inside a deletion annotation
    // {--...ANCHORBLOCK0ENDANCHOR--} should become {--...--}{#fig:label}
    const deletionPattern = new RegExp(`\\{--([^}]*?)${anchor.placeholder}([^}]*?)--\\}`, 'g');
    text = text.replace(deletionPattern, (match, before, after) => {
      const cleanBefore = before.trim();
      const cleanAfter = after.trim();
      let result = '';
      if (cleanBefore) result += `{--${cleanBefore}--}`;
      result += anchor.original;
      if (cleanAfter) result += `{--${cleanAfter}--}`;
      return result;
    });

    // Handle case where anchor is inside a substitution
    // {~~old ANCHORBLOCK0ENDANCHOR~>new~~} -> {~~old~>new~~}{#fig:label}
    const substitutionPattern = new RegExp(`\\{~~([^~]*?)${anchor.placeholder}([^~]*?)~>([^~]*)~~\\}`, 'g');
    text = text.replace(substitutionPattern, (match, oldBefore, oldAfter, newText) => {
      const cleanOldBefore = oldBefore.trim();
      const cleanOldAfter = oldAfter.trim();
      const cleanNew = newText.trim();
      const oldText = (cleanOldBefore + ' ' + cleanOldAfter).trim();
      let result = '';
      if (oldText !== cleanNew) {
        result += `{~~${oldText}~>${cleanNew}~~}`;
      } else {
        result += cleanNew;
      }
      result += anchor.original;
      return result;
    });

    // Normal replacement
    text = text.split(anchor.placeholder).join(anchor.original);
  }
  return text;
}

/**
 * Protect cross-references before diffing
 * References like @fig:label, @tbl:label should be preserved
 * @param {string} md
 * @returns {{text: string, crossrefs: Array<{original: string, placeholder: string}>}}
 */
function protectCrossrefs(md) {
  const crossrefs = [];

  // Match @fig:label, @tbl:label, @eq:label, @sec:label
  // Can appear as @fig:label or (@fig:label) or [@fig:label]
  const text = md.replace(/@(fig|tbl|eq|sec|lst):[a-zA-Z0-9_-]+/g, (match) => {
    const idx = crossrefs.length;
    const placeholder = `XREFBLOCK${idx}ENDXREF`;
    crossrefs.push({ original: match, placeholder });
    return placeholder;
  });

  return { text, crossrefs };
}

/**
 * Restore cross-references from placeholders
 * @param {string} text
 * @param {Array} crossrefs
 * @returns {string}
 */
function restoreCrossrefs(text, crossrefs) {
  for (const xref of crossrefs) {
    // Handle deletions - restore the reference even if marked deleted
    const deletionPattern = new RegExp(`\\{--([^}]*?)${xref.placeholder}([^}]*?)--\\}`, 'g');
    text = text.replace(deletionPattern, (match, before, after) => {
      const cleanBefore = before.trim();
      const cleanAfter = after.trim();
      let result = '';
      if (cleanBefore) result += `{--${cleanBefore}--}`;
      result += xref.original;
      if (cleanAfter) result += `{--${cleanAfter}--}`;
      return result;
    });

    // Handle substitutions where rendered form (Figure 1) replaced the reference
    // {~~XREFBLOCK0ENDXREF~>Figure 1~~} -> @fig:label
    const substitutionPattern = new RegExp(`\\{~~${xref.placeholder}~>[^~]+~~\\}`, 'g');
    text = text.replace(substitutionPattern, xref.original);

    // Normal replacement
    text = text.split(xref.placeholder).join(xref.original);
  }
  return text;
}

/**
 * Protect mathematical notation before diffing by replacing with placeholders
 * Handles both inline $...$ and display $$...$$ math
 * @param {string} md
 * @returns {{text: string, mathBlocks: Array<{original: string, placeholder: string, type: string, simplified: string}>}}
 */
function protectMath(md) {
  const mathBlocks = [];

  // First protect display math ($$...$$) - must be done before inline math
  let text = md.replace(/\$\$([^$]+)\$\$/g, (match, content) => {
    const idx = mathBlocks.length;
    const placeholder = `MATHBLOCK${idx}ENDMATH`;
    // Create simplified version for matching in Word text
    const simplified = simplifyMathForMatching(content);
    mathBlocks.push({ original: match, placeholder, type: 'display', simplified });
    return placeholder;
  });

  // Then protect inline math ($...$)
  text = text.replace(/\$([^$\n]+)\$/g, (match, content) => {
    const idx = mathBlocks.length;
    const placeholder = `MATHBLOCK${idx}ENDMATH`;
    const simplified = simplifyMathForMatching(content);
    mathBlocks.push({ original: match, placeholder, type: 'inline', simplified });
    return placeholder;
  });

  return { text, mathBlocks };
}

/**
 * Simplify LaTeX math for fuzzy matching against Word text
 * Word renders math as text, so we need to match the rendered form
 * @param {string} latex
 * @returns {string}
 */
function simplifyMathForMatching(latex) {
  return latex
    // Remove common LaTeX commands
    .replace(/\\text\{([^}]+)\}/g, '$1')
    .replace(/\\hat\{([^}]+)\}/g, '$1')
    .replace(/\\bar\{([^}]+)\}/g, '$1')
    .replace(/\\frac\{([^}]+)\}\{([^}]+)\}/g, '$1/$2')
    .replace(/\\sum_([a-z])/g, 'Σ')
    .replace(/\\sum/g, 'Σ')
    .replace(/\\cdot/g, '·')
    .replace(/\\quad/g, ' ')
    .replace(/\\,/g, ' ')
    .replace(/\\_/g, '_')
    .replace(/\\{/g, '{')
    .replace(/\\}/g, '}')
    .replace(/\\/g, '')   // Remove remaining backslashes
    .replace(/[{}]/g, '') // Remove braces
    .replace(/\s+/g, ' ')
    .trim();
}

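For intuition, a couple of illustrative transformations produced by the module-private helper above (worked by hand from the replacement rules, shown as comments only since the function is not exported):

//   \frac{N_e}{N}        → "N_e/N"
//   \sum_i w_i \cdot x_i → "Σ w_i · x_i"
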
/**
 * Restore math from placeholders
 * @param {string} text
 * @param {Array} mathBlocks
 * @returns {string}
 */
function restoreMath(text, mathBlocks) {
  for (const block of mathBlocks) {
    text = text.split(block.placeholder).join(block.original);
  }
  return text;
}

/**
 * Replace rendered math in Word text with matching placeholders
 * This is heuristic-based since Word can render math in various ways
 * @param {string} wordText
 * @param {Array} mathBlocks
 * @returns {string}
 */
function replaceRenderedMath(wordText, mathBlocks) {
  let result = wordText;

  for (const block of mathBlocks) {
    // For inline math, try to find the simplified form in Word text
    if (block.simplified.length >= 2) {
      // Try exact match first
      if (result.includes(block.simplified)) {
        result = result.replace(block.simplified, block.placeholder);
      }
    }
  }

  return result;
}

/**
 * Protect citations before diffing by replacing with placeholders
 * @param {string} md
 * @returns {{text: string, citations: string[]}}
 */
function protectCitations(md) {
  const citations = [];
  const text = md.replace(/\[@[^\]]+\]/g, (match) => {
    const idx = citations.length;
    citations.push(match);
    return `CITEREF${idx}ENDCITE`;
  });
  return { text, citations };
}

/**
 * Restore citations from placeholders
 * @param {string} text
 * @param {string[]} citations
 * @returns {string}
 */
function restoreCitations(text, citations) {
  for (let i = 0; i < citations.length; i++) {
    // Handle cases where placeholder might be inside annotations
    const placeholder = `CITEREF${i}ENDCITE`;
    text = text.split(placeholder).join(citations[i]);
  }
  return text;
}

/**
 * Remove rendered citations from Word text (replace with matching placeholders)
 * @param {string} wordText
 * @param {number} count
 * @returns {string}
 */
function replaceRenderedCitations(wordText, count) {
  // Match rendered citation patterns: (Author 2021), (Author et al. 2021), etc.
  const pattern = /\((?:[A-Z][a-zé]+(?:\s+et\s+al\.?)?(?:\s*[&,;]\s*[A-Z][a-zé]+(?:\s+et\s+al\.?)?)*\s+\d{4}(?:[a-z])?(?:\s*[,;]\s*(?:[A-Z][a-zé]+(?:\s+et\s+al\.?)?\s+)?\d{4}(?:[a-z])?)*)\)/g;

  let idx = 0;
  return wordText.replace(pattern, (match) => {
    if (idx < count) {
      const placeholder = `CITEREF${idx}ENDCITE`;
      idx++;
      return placeholder;
    }
    return match;
  });
}

/**
 * Smart paragraph-level diff that preserves markdown structure
 * @param {string} originalMd
 * @param {string} wordText
 * @param {string} author
 * @returns {string}
 */
export function generateSmartDiff(originalMd, wordText, author = 'Reviewer') {
  // Protection order matters: anchors first, then crossrefs, math, citations

  // Protect figure/table anchors (CRITICAL - these must never be deleted)
  const { text: mdWithAnchorsProtected, anchors: figAnchors } = protectAnchors(originalMd);

  // Protect cross-references (@fig:label, @tbl:label)
  const { text: mdWithXrefsProtected, crossrefs } = protectCrossrefs(mdWithAnchorsProtected);

  // Protect math (before citations, since citations might be inside math)
  const { text: mdWithMathProtected, mathBlocks } = protectMath(mdWithXrefsProtected);

  // Then protect citations
  const { text: mdProtected, citations } = protectCitations(mdWithMathProtected);

  // Replace rendered math and citations in Word text with matching placeholders
  let wordProtected = replaceRenderedMath(wordText, mathBlocks);
  wordProtected = replaceRenderedCitations(wordProtected, citations.length);

  // Split into paragraphs
  const originalParas = mdProtected.split(/\n\n+/);
  const wordParas = wordProtected.split(/\n\n+/);

  const result = [];

  // Try to match paragraphs intelligently
  let wordIdx = 0;

  for (let i = 0; i < originalParas.length; i++) {
    const orig = originalParas[i] || '';
    const { prefix: mdPrefix, content: origContent } = extractMarkdownPrefix(orig.split('\n')[0]);

    // Find best matching word paragraph
    let bestMatch = -1;
    let bestScore = 0;

    for (let j = wordIdx; j < Math.min(wordIdx + 3, wordParas.length); j++) {
      const wordPara = wordParas[j] || '';
      // Simple similarity: count common words
      const origWords = new Set(origContent.toLowerCase().split(/\s+/));
      const wordWords = wordPara.toLowerCase().split(/\s+/);
      const common = wordWords.filter((w) => origWords.has(w)).length;
      const score = common / Math.max(origWords.size, wordWords.length);

      if (score > bestScore && score > 0.3) {
        bestScore = score;
        bestMatch = j;
      }
    }

    if (bestMatch === -1) {
      // No match found - paragraph was deleted or heavily modified
      // Check if it's just a header that Word converted
      if (mdPrefix && wordIdx < wordParas.length) {
        const wordPara = wordParas[wordIdx];
        if (wordPara.toLowerCase().includes(origContent.toLowerCase().slice(0, 20))) {
          // Word paragraph contains the header content - match them
          bestMatch = wordIdx;
        }
      }
    }

    if (bestMatch >= 0) {
      const word = wordParas[bestMatch];

      // Strip markdown from original for clean comparison
      const origStripped = stripMarkdownSyntax(orig);
      const wordNormalized = normalizeText(word);

      if (origStripped === wordNormalized) {
        // Unchanged (ignoring markdown syntax)
        result.push(orig);
      } else {
        // Modified - diff the content, preserve markdown prefix
        const changes = diffWords(origStripped, wordNormalized);
        let annotated = mdPrefix; // Preserve header/list marker

        for (const part of changes) {
          if (part.added) {
            annotated += `{++${part.value}++}`;
          } else if (part.removed) {
            annotated += `{--${part.value}--}`;
          } else {
            annotated += part.value;
          }
        }

        result.push(annotated);
      }

      wordIdx = bestMatch + 1;
    } else {
      // Paragraph deleted entirely
      result.push(`{--${orig}--}`);
    }
  }

  // Any remaining word paragraphs are additions
  for (let j = wordIdx; j < wordParas.length; j++) {
    const word = wordParas[j];
    if (word.trim()) {
      result.push(`{++${word}++}`);
    }
  }

  // Restore protected content (reverse order of protection)
  let finalResult = result.join('\n\n');
  finalResult = restoreCitations(finalResult, citations);
  finalResult = restoreMath(finalResult, mathBlocks);
  finalResult = restoreCrossrefs(finalResult, crossrefs);
  finalResult = restoreAnchors(finalResult, figAnchors);

  return finalResult;
}

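A sketch of calling the paragraph-level diff directly (illustrative; the input files are hypothetical):

// Example: annotate a reviewer's plain-text edits against the original Markdown
import * as fs from 'fs';
import { generateSmartDiff } from './lib/import.js';

const originalMd = fs.readFileSync('draft.md', 'utf-8');
const reviewedText = fs.readFileSync('reviewed-plain.txt', 'utf-8');
const annotated = generateSmartDiff(originalMd, reviewedText, 'Reviewer 2');
// Citations [@Key], math $...$, anchors {#fig:...} and crossrefs @fig:... are protected
// by placeholders during the diff and restored afterwards
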
/**
 * Clean up redundant adjacent annotations
 * e.g., {--old--}{++new++} → {~~old~>new~~}
 * @param {string} text
 * @returns {string}
 */
export function cleanupAnnotations(text) {
  // Convert adjacent delete+insert to substitution (with possible whitespace between)
  // Pattern: {--something--} {++something else++}
  text = text.replace(/\{--(.+?)--\}\s*\{\+\+(.+?)\+\+\}/g, '{~~$1~>$2~~}');

  // Also handle insert+delete (less common but possible)
  text = text.replace(/\{\+\+(.+?)\+\+\}\s*\{--(.+?)--\}/g, '{~~$2~>$1~~}');

  // Fix malformed patterns where {-- got merged with ~>
  // {--key~>critical~~} → {~~key~>critical~~}
  text = text.replace(/\{--([^}]+?)~>([^}]+?)~~\}/g, '{~~$1~>$2~~}');

  // Fix malformed substitutions that got split
  // {~~word --} ... {++other~~} patterns
  text = text.replace(/\{~~([^~]+)\s*--\}/g, '{--$1--}');
  text = text.replace(/\{\+\+([^+]+)~~\}/g, '{++$1++}');

  // Clean up empty annotations
  text = text.replace(/\{--\s*--\}/g, '');
  text = text.replace(/\{\+\+\s*\+\+\}/g, '');

  // Clean up double spaces
  text = text.replace(/ +/g, ' ');

  return text;
}

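The effect of the cleanup pass on adjacent markers, as a small illustrative call:

// Example: adjacent delete+insert collapses into a substitution
import { cleanupAnnotations } from './lib/import.js';

cleanupAnnotations('a {--quick--} {++swift++} fix');
// → 'a {~~quick~>swift~~} fix'
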
/**
 * Parse visible comment markers from Word text
 * Format: [Author: comment text]
 * @param {string} text
 * @returns {Array<{author: string, text: string, position: number}>}
 */
export function parseVisibleComments(text) {
  const comments = [];
  const pattern = /\[([^\]:]+):\s*([^\]]+)\]/g;

  let match;
  while ((match = pattern.exec(text)) !== null) {
    comments.push({
      author: match[1].trim(),
      text: match[2].trim(),
      position: match.index,
    });
  }

  return comments;
}

/**
 * Convert visible comments to CriticMarkup format
 * @param {string} text
 * @returns {string}
 */
export function convertVisibleComments(text) {
  return text.replace(/\[([^\]:]+):\s*([^\]]+)\]/g, '{>>$1: $2<<}');
}

/**
 * Full import pipeline: Word doc → annotated MD
 * @param {string} docxPath - Path to Word document
 * @param {string} originalMdPath - Path to original markdown
 * @param {{author?: string, sectionContent?: string}} options
 * @returns {Promise<{annotated: string, stats: object}>}
 */
export async function importFromWord(docxPath, originalMdPath, options = {}) {
  const { author = 'Reviewer', sectionContent } = options;

  // Use provided section content or extract from Word
  let wordText;
  if (sectionContent !== undefined) {
    wordText = sectionContent;
  } else {
    const extracted = await extractFromWord(docxPath);
    wordText = extracted.text;
  }

  // Read original markdown
  const originalMd = fs.readFileSync(originalMdPath, 'utf-8');

  // Generate diff
  let annotated = generateSmartDiff(originalMd, wordText, author);

  // Clean up adjacent del/ins to substitutions
  annotated = cleanupAnnotations(annotated);

  // Fix citation-related annotations (preserve markdown citations)
  annotated = fixCitationAnnotations(annotated, originalMd);

  // Convert any visible comments
  annotated = convertVisibleComments(annotated);

  // Count changes
  const insertions = (annotated.match(/\{\+\+/g) || []).length;
  const deletions = (annotated.match(/\{--/g) || []).length;
  const substitutions = (annotated.match(/\{~~/g) || []).length;
  const comments = (annotated.match(/\{>>/g) || []).length;

  return {
    annotated,
    stats: {
      insertions,
      deletions,
      substitutions,
      comments,
      total: insertions + deletions + substitutions + comments,
    },
  };
}
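End to end, the pipeline above can be driven roughly like this (illustrative sketch; the paths are hypothetical, and the packaged bin/rev.js CLI presumably wraps the same call):

// Example: full Word → annotated Markdown import
import * as fs from 'fs';
import { importFromWord } from './lib/import.js';

const { annotated, stats } = await importFromWord('reviewed/draft.docx', 'draft.md', {
  author: 'Reviewer 1',
});
console.log(`${stats.insertions} insertions, ${stats.deletions} deletions, ` +
            `${stats.substitutions} substitutions, ${stats.comments} comments`);
fs.writeFileSync('draft.annotated.md', annotated);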