docrev 0.9.11 → 0.9.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +9 -9
- package/.gitattributes +1 -1
- package/CHANGELOG.md +149 -149
- package/PLAN-tables-and-postprocess.md +850 -850
- package/README.md +391 -391
- package/bin/rev.js +11 -11
- package/bin/rev.ts +145 -145
- package/completions/rev.bash +127 -127
- package/completions/rev.ps1 +210 -210
- package/completions/rev.zsh +207 -207
- package/dev_notes/stress2/build_adversarial.ts +186 -186
- package/dev_notes/stress2/drift_matcher.ts +62 -62
- package/dev_notes/stress2/probe_anchors.ts +35 -35
- package/dev_notes/stress2/project/discussion.before.md +3 -3
- package/dev_notes/stress2/project/discussion.md +3 -3
- package/dev_notes/stress2/project/methods.before.md +20 -20
- package/dev_notes/stress2/project/methods.md +20 -20
- package/dev_notes/stress2/project/rev.yaml +5 -5
- package/dev_notes/stress2/project/sections.yaml +4 -4
- package/dev_notes/stress2/sections.yaml +5 -5
- package/dev_notes/stress2/trace_placement.ts +50 -50
- package/dev_notes/stresstest_boundaries.ts +27 -27
- package/dev_notes/stresstest_drift_apply.ts +43 -43
- package/dev_notes/stresstest_drift_compare.ts +43 -43
- package/dev_notes/stresstest_drift_v2.ts +54 -54
- package/dev_notes/stresstest_inspect.ts +54 -54
- package/dev_notes/stresstest_pstyle.ts +55 -55
- package/dev_notes/stresstest_section_debug.ts +23 -23
- package/dev_notes/stresstest_split.ts +70 -70
- package/dev_notes/stresstest_trace.ts +19 -19
- package/dev_notes/stresstest_verify_no_overwrite.ts +40 -40
- package/dist/lib/build.d.ts +50 -1
- package/dist/lib/build.d.ts.map +1 -1
- package/dist/lib/build.js +80 -30
- package/dist/lib/build.js.map +1 -1
- package/dist/lib/commands/build.d.ts.map +1 -1
- package/dist/lib/commands/build.js +38 -5
- package/dist/lib/commands/build.js.map +1 -1
- package/dist/lib/commands/utilities.js +164 -164
- package/dist/lib/commands/word-tools.js +8 -8
- package/dist/lib/grammar.js +3 -3
- package/dist/lib/import.d.ts.map +1 -1
- package/dist/lib/import.js +146 -24
- package/dist/lib/import.js.map +1 -1
- package/dist/lib/pdf-comments.js +44 -44
- package/dist/lib/plugins.js +57 -57
- package/dist/lib/pptx-themes.js +115 -115
- package/dist/lib/spelling.js +2 -2
- package/dist/lib/templates.js +387 -387
- package/dist/lib/themes.js +51 -51
- package/dist/lib/types.d.ts +20 -0
- package/dist/lib/types.d.ts.map +1 -1
- package/dist/lib/word-extraction.d.ts +6 -0
- package/dist/lib/word-extraction.d.ts.map +1 -1
- package/dist/lib/word-extraction.js +46 -3
- package/dist/lib/word-extraction.js.map +1 -1
- package/dist/lib/wordcomments.d.ts.map +1 -1
- package/dist/lib/wordcomments.js +23 -5
- package/dist/lib/wordcomments.js.map +1 -1
- package/eslint.config.js +27 -27
- package/lib/anchor-match.ts +276 -276
- package/lib/annotations.ts +644 -644
- package/lib/build.ts +1300 -1227
- package/lib/citations.ts +160 -160
- package/lib/commands/build.ts +833 -801
- package/lib/commands/citations.ts +515 -515
- package/lib/commands/comments.ts +1050 -1050
- package/lib/commands/context.ts +174 -174
- package/lib/commands/core.ts +309 -309
- package/lib/commands/doi.ts +435 -435
- package/lib/commands/file-ops.ts +372 -372
- package/lib/commands/history.ts +320 -320
- package/lib/commands/index.ts +87 -87
- package/lib/commands/init.ts +259 -259
- package/lib/commands/merge-resolve.ts +378 -378
- package/lib/commands/preview.ts +178 -178
- package/lib/commands/project-info.ts +244 -244
- package/lib/commands/quality.ts +517 -517
- package/lib/commands/response.ts +454 -454
- package/lib/commands/section-boundaries.ts +82 -82
- package/lib/commands/sections.ts +451 -451
- package/lib/commands/sync.ts +706 -706
- package/lib/commands/text-ops.ts +449 -449
- package/lib/commands/utilities.ts +448 -448
- package/lib/commands/verify-anchors.ts +272 -272
- package/lib/commands/word-tools.ts +340 -340
- package/lib/comment-realign.ts +517 -517
- package/lib/config.ts +84 -84
- package/lib/crossref.ts +781 -781
- package/lib/csl.ts +191 -191
- package/lib/dependencies.ts +98 -98
- package/lib/diff-engine.ts +465 -465
- package/lib/doi-cache.ts +115 -115
- package/lib/doi.ts +897 -897
- package/lib/equations.ts +506 -506
- package/lib/errors.ts +346 -346
- package/lib/format.ts +541 -541
- package/lib/git.ts +326 -326
- package/lib/grammar.ts +303 -303
- package/lib/image-registry.ts +180 -180
- package/lib/import.ts +911 -792
- package/lib/journals.ts +543 -543
- package/lib/merge.ts +633 -633
- package/lib/orcid.ts +144 -144
- package/lib/pdf-comments.ts +263 -263
- package/lib/pdf-import.ts +524 -524
- package/lib/plugins.ts +362 -362
- package/lib/postprocess.ts +188 -188
- package/lib/pptx-color-filter.lua +37 -37
- package/lib/pptx-template.ts +469 -469
- package/lib/pptx-themes.ts +483 -483
- package/lib/protect-restore.ts +520 -520
- package/lib/rate-limiter.ts +94 -94
- package/lib/response.ts +197 -197
- package/lib/restore-references.ts +240 -240
- package/lib/review.ts +327 -327
- package/lib/schema.ts +417 -417
- package/lib/scientific-words.ts +73 -73
- package/lib/sections.ts +335 -335
- package/lib/slides.ts +756 -756
- package/lib/spelling.ts +334 -334
- package/lib/templates.ts +526 -526
- package/lib/themes.ts +742 -742
- package/lib/trackchanges.ts +247 -247
- package/lib/tui.ts +450 -450
- package/lib/types.ts +550 -530
- package/lib/undo.ts +250 -250
- package/lib/utils.ts +69 -69
- package/lib/variables.ts +179 -179
- package/lib/word-extraction.ts +806 -759
- package/lib/word.ts +643 -643
- package/lib/wordcomments.ts +817 -798
- package/package.json +137 -137
- package/scripts/postbuild.js +28 -28
- package/skill/REFERENCE.md +431 -431
- package/skill/SKILL.md +258 -258
- package/tsconfig.json +26 -26
- package/types/index.d.ts +525 -525
package/lib/protect-restore.ts
CHANGED
|
@@ -1,520 +1,520 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Protection and restoration utilities for markdown elements during Word import
|
|
3
|
-
*
|
|
4
|
-
* These functions protect special markdown syntax (anchors, cross-refs, math, citations,
|
|
5
|
-
* images, tables) by replacing them with placeholders before diffing, then restore them after.
|
|
6
|
-
*/
|
|
7
|
-
|
|
8
|
-
// =============================================================================
|
|
9
|
-
// Interfaces
|
|
10
|
-
// =============================================================================
|
|
11
|
-
|
|
12
|
-
/** A line split into its leading markdown marker and the remaining text. */
interface MarkdownPrefix {
  /** Matched leading marker including its trailing whitespace; '' when nothing matched. */
  prefix: string;
  /** The rest of the line after `prefix`. */
  content: string;
}

/** A protected span: the text that was removed and the token that stands in for it. */
interface ProtectedItem {
  /** Exact text that was replaced. */
  original: string;
  /** Placeholder token inserted in its place. */
  placeholder: string;
}

/** A protected math span. */
interface ProtectedMath extends ProtectedItem {
  /** 'display' for `$$...$$` matches, 'inline' for `$...$` matches. */
  type: 'inline' | 'display';
  /** Result of simplifyMathForMatching() applied to the math content. */
  simplified: string;
}

/** A protected markdown image together with the metadata parsed from it. */
interface ProtectedImage extends ProtectedItem {
  /** Label captured from a `#fig:`/`#tbl:` attribute, or null when absent. */
  label: string | null;
  /** Trimmed text between the square brackets. */
  caption: string;
  /** Text between the parentheses. */
  path: string;
  /** Number captured from a leading "Figure N"/"Table N" caption prefix, or null. */
  figureNumber: string | null;
}

/** A protected markdown table. */
interface ProtectedTable extends ProtectedItem {
  /** Number of `|` characters in the matched table text (approximate cell count). */
  cellCount: number;
}

/** Result of protectAnchors(). */
interface ProtectAnchorsResult {
  text: string;
  anchors: ProtectedItem[];
}

/** Result of protectCrossrefs(). */
interface ProtectCrossrefsResult {
  text: string;
  crossrefs: ProtectedItem[];
}

/** Result of protectMath(). */
interface ProtectMathResult {
  text: string;
  mathBlocks: ProtectedMath[];
}

/** Result of protectCitations(); `citations[i]` corresponds to `CITEREF{i}ENDCITE`. */
interface ProtectCitationsResult {
  text: string;
  citations: string[];
}

/** Result of protectImages(). */
interface ProtectImagesResult {
  text: string;
  images: ProtectedImage[];
}

/** Result of protectTables(). */
interface ProtectTablesResult {
  text: string;
  tables: ProtectedTable[];
}

/** Minimal view of the image registry used by matchWordImagesToOriginal(). */
interface ImageRegistry {
  /** Looked up with keys of the form `fig:<number>`; each entry carries the figure label. */
  byNumber?: Map<string, { label: string }>;
}
|
|
71
|
-
|
|
72
|
-
// =============================================================================
|
|
73
|
-
// Shared Helpers
|
|
74
|
-
// =============================================================================
|
|
75
|
-
|
|
76
|
-
/**
 * Swap every match of `pattern` for a numbered placeholder and remember what was removed.
 *
 * Placeholders are built as `${prefix}${index}${suffix}`, where `index` is the
 * zero-based position of the match among all matches replaced so far.
 *
 * @param text    Input text to scan.
 * @param pattern Pattern whose matches are replaced (passed straight to String.replace).
 * @param prefix  Text placed before the index in each placeholder.
 * @param suffix  Text placed after the index in each placeholder.
 * @returns The rewritten text and the collected original/placeholder pairs, in match order.
 */
function collectAndReplace(
  text: string,
  pattern: RegExp,
  prefix: string,
  suffix: string,
): { text: string; items: ProtectedItem[] } {
  const collected: ProtectedItem[] = [];

  const swap = (original: string): string => {
    const placeholder = `${prefix}${collected.length}${suffix}`;
    collected.push({ original, placeholder });
    return placeholder;
  };

  const rewritten = text.replace(pattern, swap);
  return { text: rewritten, items: collected };
}
|
|
94
|
-
|
|
95
|
-
/**
 * Restore protected items from placeholders, handling annotation wrappers.
 *
 * For each item, in order:
 *  1. a deletion wrapper `{--...placeholder...--}` is replaced wholesale by the
 *     original (any other text inside the wrapper is dropped);
 *  2. an insertion wrapper `{++...placeholder...++}` is handled the same way;
 *  3. every remaining bare placeholder is replaced by the original.
 *
 * @param text  Text containing placeholders.
 * @param items Protected items. Each `placeholder` is interpolated into a RegExp
 *              without escaping, so it must not contain regex metacharacters.
 * @returns The text with every placeholder replaced by its original.
 */
function restoreProtectedItems(text: string, items: ProtectedItem[]): string {
  for (const item of items) {
    // Use a replacer function, not a replacement string: a string would have
    // "$$", "$&", "$`" and "$'" inside the original expanded as replacement
    // patterns, corrupting e.g. tables that contain `$$...$$` math.
    const restoreOriginal = (): string => item.original;

    const deletionPattern = new RegExp(`\\{--[^}]*?${item.placeholder}[^}]*?--\\}`, 'g');
    text = text.replace(deletionPattern, restoreOriginal);

    const insertionPattern = new RegExp(`\\{\\+\\+[^}]*?${item.placeholder}[^}]*?\\+\\+\\}`, 'g');
    text = text.replace(insertionPattern, restoreOriginal);

    // split/join performs a literal, global replacement of bare placeholders.
    text = text.split(item.placeholder).join(item.original);
  }
  return text;
}
|
|
111
|
-
|
|
112
|
-
// =============================================================================
|
|
113
|
-
// Public Functions
|
|
114
|
-
// =============================================================================
|
|
115
|
-
|
|
116
|
-
/**
 * Split a line into its leading markdown marker and the remaining content.
 *
 * Markers are tried in this order: ATX header (`#` to `######` followed by
 * whitespace), list item (`-`, `*`, `+` or `N.` with optional indentation,
 * followed by whitespace), blockquote (`>` plus optional whitespace).
 *
 * @param line A single line of markdown.
 * @returns The matched marker as `prefix` and the remainder as `content`;
 *          `prefix` is '' and `content` is the whole line when nothing matches.
 */
export function extractMarkdownPrefix(line: string): MarkdownPrefix {
  const markerPatterns: RegExp[] = [
    /^(#{1,6}\s+)/, // headers
    /^(\s*[-*+]\s+|\s*\d+\.\s+)/, // list items
    /^(>\s*)/, // blockquotes
  ];

  for (const pattern of markerPatterns) {
    const marker = pattern.exec(line)?.[1];
    if (marker) {
      return { prefix: marker, content: line.slice(marker.length) };
    }
  }

  return { prefix: '', content: line };
}
|
|
140
|
-
|
|
141
|
-
/**
 * Replace figure/table/equation/section/listing anchors with placeholders before diffing.
 *
 * Matches `{#fig:...}`, `{#tbl:...}`, `{#eq:...}`, `{#sec:...}` and `{#lst:...}`,
 * including anchors that carry extra attributes such as `{#fig:label width=50%}`.
 * Placeholders have the form `ANCHORBLOCK<n>ENDANCHOR`.
 *
 * @param md Markdown source.
 * @returns The rewritten text and the protected anchors in match order.
 */
export function protectAnchors(md: string): ProtectAnchorsResult {
  const anchorPattern = /\{#(fig|tbl|eq|sec|lst):[^}]+\}/g;
  const replaced = collectAndReplace(md, anchorPattern, 'ANCHORBLOCK', 'ENDANCHOR');
  return { text: replaced.text, anchors: replaced.items };
}
|
|
153
|
-
|
|
154
|
-
/**
 * Put anchors back in place of their placeholders.
 *
 * Anchors caught inside annotations are moved out of them so they survive:
 *  - `{--before PLACEHOLDER after--}` becomes `{--before--}ANCHOR{--after--}`
 *    (empty sides are omitted, surrounding whitespace is trimmed);
 *  - `{~~old PLACEHOLDER~>new~~}` becomes `{~~old~>new~~}ANCHOR`, or just
 *    `newANCHOR` when the old and new text are equal once the placeholder is removed.
 * Any remaining bare placeholder is replaced directly.
 *
 * @param text    Text containing anchor placeholders.
 * @param anchors Protected anchors as returned by protectAnchors().
 * @returns The text with all anchors restored.
 */
export function restoreAnchors(text: string, anchors: ProtectedItem[]): string {
  const asDeletion = (fragment: string): string => (fragment ? `{--${fragment}--}` : '');

  return anchors.reduce((current, { placeholder, original }) => {
    const insideDeletion = new RegExp(`\\{--([^}]*?)${placeholder}([^}]*?)--\\}`, 'g');
    const insideSubstitution = new RegExp(`\\{~~([^~]*?)${placeholder}([^~]*?)~>([^~]*)~~\\}`, 'g');

    return current
      .replace(insideDeletion, (_whole: string, lead: string, trail: string) =>
        asDeletion(lead.trim()) + original + asDeletion(trail.trim()))
      .replace(
        insideSubstitution,
        (_whole: string, oldLead: string, oldTrail: string, replacement: string) => {
          const oldText = `${(oldLead ?? '').trim()} ${(oldTrail ?? '').trim()}`.trim();
          const newText = (replacement ?? '').trim();
          // Drop the substitution markup when removing the anchor leaves no actual change.
          const body = oldText === newText ? newText : `{~~${oldText}~>${newText}~~}`;
          return body + original;
        },
      )
      .split(placeholder)
      .join(original);
  }, text);
}
|
|
195
|
-
|
|
196
|
-
/**
 * Replace cross-references with placeholders before diffing.
 *
 * Matches `@fig:label`, `@tbl:label`, `@eq:label`, `@sec:label` and `@lst:label`
 * (label characters: letters, digits, `_`, `-`) wherever they occur, so forms
 * like `(@fig:label)` and `[@fig:label]` keep their surrounding brackets.
 * Placeholders have the form `XREFBLOCK<n>ENDXREF`.
 *
 * @param md Markdown source.
 * @returns The rewritten text and the protected cross-references in match order.
 */
export function protectCrossrefs(md: string): ProtectCrossrefsResult {
  const crossrefPattern = /@(fig|tbl|eq|sec|lst):[a-zA-Z0-9_-]+/g;
  const replaced = collectAndReplace(md, crossrefPattern, 'XREFBLOCK', 'ENDXREF');
  return { text: replaced.text, crossrefs: replaced.items };
}
|
|
208
|
-
|
|
209
|
-
/**
 * Put cross-references back in place of their placeholders.
 *
 *  - A reference inside a deletion is kept: `{--a PLACEHOLDER b--}` becomes
 *    `{--a--}REF{--b--}` (empty sides omitted, whitespace trimmed).
 *  - A substitution of the whole placeholder by other text, e.g.
 *    `{~~PLACEHOLDER~>Figure 1~~}`, collapses back to the reference.
 *  - Remaining bare placeholders are replaced directly.
 *
 * @param text      Text containing cross-reference placeholders.
 * @param crossrefs Protected references as returned by protectCrossrefs().
 * @returns The text with all cross-references restored.
 */
export function restoreCrossrefs(text: string, crossrefs: ProtectedItem[]): string {
  const wrapDeleted = (fragment: string): string => (fragment ? `{--${fragment}--}` : '');

  let restored = text;
  for (const { placeholder, original } of crossrefs) {
    const deleted = new RegExp(`\\{--([^}]*?)${placeholder}([^}]*?)--\\}`, 'g');
    const substituted = new RegExp(`\\{~~${placeholder}~>[^~]+~~\\}`, 'g');

    restored = restored
      .replace(deleted, (_whole: string, lead: string, trail: string) =>
        wrapDeleted(lead.trim()) + original + wrapDeleted(trail.trim()))
      .replace(substituted, original)
      .split(placeholder)
      .join(original);
  }
  return restored;
}
|
|
236
|
-
|
|
237
|
-
/**
 * Reduce LaTeX math to an approximation of how it reads as plain text.
 *
 * Applies an ordered list of rewrites: unwraps `\text{}`, `\hat{}`, `\bar{}`;
 * turns `\frac{a}{b}` into `a/b`; maps `\sum` (with an optional single-letter
 * subscript) to `Σ` and `\cdot` to `·`; turns `\quad` and `\,` into spaces;
 * unescapes `\_`, `\{`, `\}`; then strips remaining backslashes and braces,
 * collapses whitespace runs and trims.
 *
 * @param latex LaTeX math content.
 * @returns The simplified text.
 */
export function simplifyMathForMatching(latex: string): string {
  // Order matters: command-specific rewrites must run before the generic
  // backslash and brace removal at the end.
  const rewriteRules: ReadonlyArray<readonly [RegExp, string]> = [
    [/\\text\{([^}]+)\}/g, '$1'],
    [/\\hat\{([^}]+)\}/g, '$1'],
    [/\\bar\{([^}]+)\}/g, '$1'],
    [/\\frac\{([^}]+)\}\{([^}]+)\}/g, '$1/$2'],
    [/\\sum_([a-z])/g, 'Σ'],
    [/\\sum/g, 'Σ'],
    [/\\cdot/g, '·'],
    [/\\quad/g, ' '],
    [/\\,/g, ' '],
    [/\\_/g, '_'],
    [/\\{/g, '{'],
    [/\\}/g, '}'],
    [/\\/g, ''],
    [/[{}]/g, ''],
    [/\s+/g, ' '],
  ];

  let simplified = latex;
  for (const [pattern, replacement] of rewriteRules) {
    simplified = simplified.replace(pattern, replacement);
  }
  return simplified.trim();
}
|
|
261
|
-
|
|
262
|
-
/**
 * Replace math with placeholders before diffing.
 *
 * Display math (`$$...$$`) is handled first so its delimiters are not picked
 * up by the inline pass; inline math (`$...$`, single line) follows. Both
 * share one counter, giving placeholders `MATHBLOCK<n>ENDMATH`.
 *
 * @param md Markdown source.
 * @returns The rewritten text and the protected math blocks (display blocks
 *          first, then inline blocks), each with a simplified form for matching.
 */
export function protectMath(md: string): ProtectMathResult {
  const mathBlocks: ProtectedMath[] = [];

  // Build a replacer that records the match under the given kind.
  const stash = (type: ProtectedMath['type']) => (original: string, latex: string): string => {
    const placeholder = `MATHBLOCK${mathBlocks.length}ENDMATH`;
    const simplified = simplifyMathForMatching(latex);
    mathBlocks.push({ original, placeholder, type, simplified });
    return placeholder;
  };

  const text = md
    .replace(/\$\$([^$]+)\$\$/g, stash('display'))
    .replace(/\$([^$\n]+)\$/g, stash('inline'));

  return { text, mathBlocks };
}
|
|
290
|
-
|
|
291
|
-
/**
 * Put math back in place of its placeholders.
 *
 * @param text       Text containing `MATHBLOCK<n>ENDMATH` placeholders.
 * @param mathBlocks Protected math blocks as returned by protectMath().
 * @returns The text with every occurrence of each placeholder replaced by its original.
 */
export function restoreMath(text: string, mathBlocks: ProtectedMath[]): string {
  return mathBlocks.reduce(
    (restored, { placeholder, original }) => restored.split(placeholder).join(original),
    text,
  );
}
|
|
300
|
-
|
|
301
|
-
/**
 * Swap rendered math in Word text for the matching placeholders.
 *
 * Heuristic: for each block whose simplified form is at least two characters
 * long, the first exact occurrence of that simplified form in the text is
 * replaced by the block's placeholder. Blocks are processed in array order.
 *
 * @param wordText   Text extracted from Word.
 * @param mathBlocks Protected math blocks as returned by protectMath().
 * @returns The text with matched math replaced by placeholders.
 */
export function replaceRenderedMath(wordText: string, mathBlocks: ProtectedMath[]): string {
  return mathBlocks
    .filter(({ simplified }) => simplified.length >= 2)
    .reduce(
      // A string pattern replaces only the first occurrence and is a no-op when absent.
      (current, { simplified, placeholder }) => current.replace(simplified, placeholder),
      wordText,
    );
}
|
|
320
|
-
|
|
321
|
-
/**
 * Replace bracketed citations (`[@key]`, `[@a; @b]`, ...) with placeholders before diffing.
 *
 * Placeholders have the form `CITEREF<n>ENDCITE`, where `n` is the index of
 * the citation in the returned `citations` array.
 *
 * @param md Markdown source.
 * @returns The rewritten text and the original citation strings in match order.
 */
export function protectCitations(md: string): ProtectCitationsResult {
  const citations: string[] = [];
  const text = md.replace(
    /\[@[^\]]+\]/g,
    // push() returns the new length, so the citation's own index is length - 1.
    (citation) => `CITEREF${citations.push(citation) - 1}ENDCITE`,
  );
  return { text, citations };
}
|
|
333
|
-
|
|
334
|
-
/**
 * Put citations back in place of their placeholders.
 *
 * Every occurrence of `CITEREF<i>ENDCITE` is replaced by `citations[i]`,
 * including occurrences that ended up inside annotations.
 *
 * @param text      Text containing citation placeholders.
 * @param citations Original citation strings as returned by protectCitations().
 * @returns The text with all citations restored.
 */
export function restoreCitations(text: string, citations: string[]): string {
  let restored = text;
  for (const [index, citation] of citations.entries()) {
    restored = restored.split(`CITEREF${index}ENDCITE`).join(citation);
  }
  return restored;
}
|
|
345
|
-
|
|
346
|
-
/**
 * Swap rendered author-year citations in Word text for citation placeholders.
 *
 * Recognises parenthesised forms such as `(Author 2021)`, `(Author et al. 2021)`
 * and lists joined by `&`, `,` or `;`. The first `count` matches become
 * `CITEREF0ENDCITE`, `CITEREF1ENDCITE`, ... in order of appearance; any
 * further matches are left as they are.
 *
 * @param wordText Text extracted from Word.
 * @param count    Maximum number of matches to replace.
 * @returns The text with up to `count` rendered citations replaced.
 */
export function replaceRenderedCitations(wordText: string, count: number): string {
  const renderedCitation = /\((?:[A-Z][a-zé]+(?:\s+et\s+al\.?)?(?:\s*[&,;]\s*[A-Z][a-zé]+(?:\s+et\s+al\.?)?)*\s+\d{4}(?:[a-z])?(?:\s*[,;]\s*(?:[A-Z][a-zé]+(?:\s+et\s+al\.?)?\s+)?\d{4}(?:[a-z])?)*)\)/g;

  let nextIndex = 0;
  return wordText.replace(renderedCitation, (rendered) =>
    nextIndex < count ? `CITEREF${nextIndex++}ENDCITE` : rendered);
}
|
|
363
|
-
|
|
364
|
-
/**
 * Replace markdown images with placeholders before diffing, so each image is
 * treated as one atomic block.
 *
 * Matches `![caption](path)` optionally followed by an attribute block, e.g.
 * `![caption](path){#fig:label}`. For each image it records:
 *  - `label`: the name after `#fig:` or `#tbl:` in the attribute block, else null;
 *  - `figureNumber`: the number from a leading "Figure N" / "Fig. N" /
 *    "Table N" / "Tbl. N" caption prefix (N is digits or `S` + digits), else null;
 *  - the trimmed caption and the path.
 * Placeholders have the form `IMAGEBLOCK<n>ENDIMAGE`.
 *
 * @param md       Markdown source.
 * @param registry Accepted for interface compatibility; not read by this function.
 * @returns The rewritten text and the protected images in match order.
 */
export function protectImages(md: string, registry: ImageRegistry | null = null): ProtectImagesResult {
  const images: ProtectedImage[] = [];

  const text = md.replace(
    /!\[([^\]]*)\]\(([^)]+)\)(?:\{([^}]+)\})?/g,
    (original: string, caption: string, path: string, attributes: string | undefined) => {
      const placeholder = `IMAGEBLOCK${images.length}ENDIMAGE`;

      // e.g. "#fig:map width=50%" -> "map"
      const label = attributes?.match(/#(fig|tbl):([a-zA-Z0-9_-]+)/)?.[2] ?? null;

      // Word-style caption prefix, e.g. "Figure 3: ..." -> "3"
      const figureNumber =
        caption.match(/^(?:Figure|Fig\.?|Table|Tbl\.?)\s+(\d+|S\d+)[:\.]?\s*/i)?.[1] ?? null;

      images.push({ original, placeholder, label, caption: caption.trim(), path, figureNumber });
      return placeholder;
    },
  );

  return { text, images };
}
|
|
412
|
-
|
|
413
|
-
/**
 * Put images back in place of their placeholders.
 *
 * Delegates to restoreProtectedItems(), which also unwraps deletion and
 * insertion annotations surrounding a placeholder.
 *
 * @param text   Text containing image placeholders.
 * @param images Protected images as returned by protectImages().
 * @returns The text with all images restored.
 */
export function restoreImages(text: string, images: ProtectedImage[]): string {
  const restored = restoreProtectedItems(text, images);
  return restored;
}
|
|
419
|
-
|
|
420
|
-
/**
 * Match Word-extracted images to the original images.
 *
 * Each Word image is compared against every original not yet claimed and
 * scored by summing:
 *  - 100 when both labels are set and equal;
 *  - 90 when the registry entry for `fig:<figureNumber>` has the original's label;
 *  - 80 when the captions are equal (lower-cased, first 50 chars, with a leading
 *    "Figure N:" / "Table N:" prefix removed from the Word caption), or 40 when
 *    one caption contains the first 30 chars of the other;
 *  - 30 when the file names (last path segment, lower-cased) are equal.
 * The best-scoring original is mapped when its score is at least 40; ties go
 * to the earliest original. Each original is used at most once.
 *
 * @param originalImages Images protected from the original markdown.
 * @param wordImages     Images protected from the Word-derived markdown.
 * @param registry       Optional registry used for the figure-number lookup.
 * @returns Map from Word image placeholder to original image placeholder.
 */
export function matchWordImagesToOriginal(
  originalImages: ProtectedImage[],
  wordImages: ProtectedImage[],
  registry: ImageRegistry | null = null
): Map<string, string> {
  // Accept supplementary numbers ("Figure S2: ...") as well as plain ones, in
  // line with the figure-number pattern used by protectImages().
  const captionNumberPrefix = /^(?:Figure|Fig\.?|Table|Tbl\.?)\s+(?:\d+|S\d+)[:\.]?\s*/i;
  const fileNameOf = (path: string): string => path.split('/').pop()?.toLowerCase() || '';

  const mapping = new Map<string, string>();
  const usedOriginals = new Set<string>();

  for (const wordImg of wordImages) {
    // Everything derived from the Word image alone is computed once per image.
    const wordCaption = wordImg.caption.replace(captionNumberPrefix, '').toLowerCase().slice(0, 50);
    const wordFile = fileNameOf(wordImg.path);
    const registryEntry = wordImg.figureNumber && registry
      ? registry.byNumber?.get(`fig:${wordImg.figureNumber}`)
      : undefined;

    let bestMatch: ProtectedImage | null = null;
    let bestScore = 0;

    for (const origImg of originalImages) {
      if (usedOriginals.has(origImg.placeholder)) continue;

      let score = 0;

      // Match by label (most reliable)
      if (wordImg.label && origImg.label && wordImg.label === origImg.label) {
        score += 100;
      }

      // Match by figure number via registry
      if (registryEntry && registryEntry.label === origImg.label) {
        score += 90;
      }

      // Match by caption similarity (first 50 chars, normalized)
      const origCaption = origImg.caption.toLowerCase().slice(0, 50);
      if (wordCaption && origCaption) {
        if (wordCaption === origCaption) {
          score += 80;
        } else if (
          wordCaption.includes(origCaption.slice(0, 30)) ||
          origCaption.includes(wordCaption.slice(0, 30))
        ) {
          score += 40;
        }
      }

      // Match by path similarity (filename)
      if (wordFile === fileNameOf(origImg.path)) {
        score += 30;
      }

      // Strict comparison keeps the earliest original on ties.
      if (score > bestScore) {
        bestScore = score;
        bestMatch = origImg;
      }
    }

    if (bestMatch && bestScore >= 40) {
      mapping.set(wordImg.placeholder, bestMatch.placeholder);
      usedOriginals.add(bestMatch.placeholder);
    }
  }

  return mapping;
}
|
|
484
|
-
|
|
485
|
-
/**
 * Replace markdown pipe tables with placeholders before diffing, so each table
 * is treated as one atomic block.
 *
 * A candidate is an optional caption line starting with `Table` or `**Table`
 * (optionally followed by a blank line) plus one or more consecutive lines
 * that start and end with `|`. A candidate only counts as a table when it
 * contains a separator marker (`|---`, `| ---`, `|:--` or `| :--`); other
 * runs of piped lines are left untouched.
 *
 * The replacement text is `\n\nTABLEBLOCK<n>ENDTABLE\n\n`; the stored
 * `placeholder` and `original` are the trimmed forms.
 *
 * @param md Markdown source.
 * @returns The rewritten text and the protected tables in match order.
 */
export function protectTables(md: string): ProtectTablesResult {
  const tables: ProtectedTable[] = [];

  // A row ends at a newline or at the very end of the input. `(?![\s\S])` is
  // used instead of `$` because, with the `m` flag, `$` also matches before
  // `\r`, which would wrongly match single rows in CRLF text.
  const tablePattern = /(?:^(?:\*\*)?Table[^\n]*\n\n?)?(?:^\|[^\n]+\|(?:\n|(?![\s\S])))+/gm;
  const separatorMarkers = ['|---', '| ---', '|:--', '| :--'];

  const text = md.replace(tablePattern, (match) => {
    // Verify it's actually a table (has a separator row with dashes)
    if (!separatorMarkers.some((marker) => match.includes(marker))) {
      return match; // Not a real table, just lines with pipes
    }

    const idx = tables.length;
    const placeholder = `\n\nTABLEBLOCK${idx}ENDTABLE\n\n`;

    // Count pipes as an approximate cell count for matching in Word
    const cellCount = (match.match(/\|/g) || []).length;

    tables.push({ original: match.trim(), placeholder: placeholder.trim(), cellCount });
    return placeholder;
  });

  return { text, tables };
}
|
|
514
|
-
|
|
515
|
-
/**
 * Put tables back in place of their placeholders.
 *
 * Delegates to restoreProtectedItems(), which also unwraps deletion and
 * insertion annotations surrounding a placeholder.
 *
 * @param text   Text containing table placeholders.
 * @param tables Protected tables as returned by protectTables().
 * @returns The text with all tables restored.
 */
export function restoreTables(text: string, tables: ProtectedTable[]): string {
  const restored = restoreProtectedItems(text, tables);
  return restored;
}
|
|
1
|
+
/**
|
|
2
|
+
* Protection and restoration utilities for markdown elements during Word import
|
|
3
|
+
*
|
|
4
|
+
* These functions protect special markdown syntax (anchors, cross-refs, math, citations,
|
|
5
|
+
* images, tables) by replacing them with placeholders before diffing, then restore them after.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
// =============================================================================
|
|
9
|
+
// Interfaces
|
|
10
|
+
// =============================================================================
|
|
11
|
+
|
|
12
|
+
/** A line split into its leading markdown marker and the remaining text. */
interface MarkdownPrefix {
  /** Matched leading marker including its trailing whitespace; '' when nothing matched. */
  prefix: string;
  /** The rest of the line after `prefix`. */
  content: string;
}

/** A protected span: the text that was removed and the token that stands in for it. */
interface ProtectedItem {
  /** Exact text that was replaced. */
  original: string;
  /** Placeholder token inserted in its place. */
  placeholder: string;
}

/** A protected math span. */
interface ProtectedMath extends ProtectedItem {
  /** 'display' for `$$...$$` matches, 'inline' for `$...$` matches. */
  type: 'inline' | 'display';
  /** Result of simplifyMathForMatching() applied to the math content. */
  simplified: string;
}

/** A protected markdown image together with the metadata parsed from it. */
interface ProtectedImage extends ProtectedItem {
  /** Label captured from a `#fig:`/`#tbl:` attribute, or null when absent. */
  label: string | null;
  /** Trimmed text between the square brackets. */
  caption: string;
  /** Text between the parentheses. */
  path: string;
  /** Number captured from a leading "Figure N"/"Table N" caption prefix, or null. */
  figureNumber: string | null;
}

/** A protected markdown table. */
interface ProtectedTable extends ProtectedItem {
  /** Number of `|` characters in the matched table text (approximate cell count). */
  cellCount: number;
}

/** Result of protectAnchors(). */
interface ProtectAnchorsResult {
  text: string;
  anchors: ProtectedItem[];
}

/** Result of protectCrossrefs(). */
interface ProtectCrossrefsResult {
  text: string;
  crossrefs: ProtectedItem[];
}

/** Result of protectMath(). */
interface ProtectMathResult {
  text: string;
  mathBlocks: ProtectedMath[];
}

/** Result of protectCitations(); `citations[i]` corresponds to `CITEREF{i}ENDCITE`. */
interface ProtectCitationsResult {
  text: string;
  citations: string[];
}

/** Result of protectImages(). */
interface ProtectImagesResult {
  text: string;
  images: ProtectedImage[];
}

/** Result of protectTables(). */
interface ProtectTablesResult {
  text: string;
  tables: ProtectedTable[];
}

/** Minimal view of the image registry used by matchWordImagesToOriginal(). */
interface ImageRegistry {
  /** Looked up with keys of the form `fig:<number>`; each entry carries the figure label. */
  byNumber?: Map<string, { label: string }>;
}
|
|
71
|
+
|
|
72
|
+
// =============================================================================
|
|
73
|
+
// Shared Helpers
|
|
74
|
+
// =============================================================================
|
|
75
|
+
|
|
76
|
+
/**
 * Swap every match of `pattern` for a numbered placeholder and remember what was removed.
 *
 * Placeholders are built as `${prefix}${index}${suffix}`, where `index` is the
 * zero-based position of the match among all matches replaced so far.
 *
 * @param text    Input text to scan.
 * @param pattern Pattern whose matches are replaced (passed straight to String.replace).
 * @param prefix  Text placed before the index in each placeholder.
 * @param suffix  Text placed after the index in each placeholder.
 * @returns The rewritten text and the collected original/placeholder pairs, in match order.
 */
function collectAndReplace(
  text: string,
  pattern: RegExp,
  prefix: string,
  suffix: string,
): { text: string; items: ProtectedItem[] } {
  const collected: ProtectedItem[] = [];

  const swap = (original: string): string => {
    const placeholder = `${prefix}${collected.length}${suffix}`;
    collected.push({ original, placeholder });
    return placeholder;
  };

  const rewritten = text.replace(pattern, swap);
  return { text: rewritten, items: collected };
}
|
|
94
|
+
|
|
95
|
+
/**
 * Restore protected items from placeholders, handling annotation wrappers.
 *
 * For each item, in order:
 *  1. a deletion wrapper `{--...placeholder...--}` is replaced wholesale by the
 *     original (any other text inside the wrapper is dropped);
 *  2. an insertion wrapper `{++...placeholder...++}` is handled the same way;
 *  3. every remaining bare placeholder is replaced by the original.
 *
 * @param text  Text containing placeholders.
 * @param items Protected items. Each `placeholder` is interpolated into a RegExp
 *              without escaping, so it must not contain regex metacharacters.
 * @returns The text with every placeholder replaced by its original.
 */
function restoreProtectedItems(text: string, items: ProtectedItem[]): string {
  for (const item of items) {
    // Use a replacer function, not a replacement string: a string would have
    // "$$", "$&", "$`" and "$'" inside the original expanded as replacement
    // patterns, corrupting e.g. tables that contain `$$...$$` math.
    const restoreOriginal = (): string => item.original;

    const deletionPattern = new RegExp(`\\{--[^}]*?${item.placeholder}[^}]*?--\\}`, 'g');
    text = text.replace(deletionPattern, restoreOriginal);

    const insertionPattern = new RegExp(`\\{\\+\\+[^}]*?${item.placeholder}[^}]*?\\+\\+\\}`, 'g');
    text = text.replace(insertionPattern, restoreOriginal);

    // split/join performs a literal, global replacement of bare placeholders.
    text = text.split(item.placeholder).join(item.original);
  }
  return text;
}
|
|
111
|
+
|
|
112
|
+
// =============================================================================
|
|
113
|
+
// Public Functions
|
|
114
|
+
// =============================================================================
|
|
115
|
+
|
|
116
|
+
/**
 * Split a line into its leading markdown marker and the remaining content.
 *
 * Markers are tried in this order: ATX header (`#` to `######` followed by
 * whitespace), list item (`-`, `*`, `+` or `N.` with optional indentation,
 * followed by whitespace), blockquote (`>` plus optional whitespace).
 *
 * @param line A single line of markdown.
 * @returns The matched marker as `prefix` and the remainder as `content`;
 *          `prefix` is '' and `content` is the whole line when nothing matches.
 */
export function extractMarkdownPrefix(line: string): MarkdownPrefix {
  const markerPatterns: RegExp[] = [
    /^(#{1,6}\s+)/, // headers
    /^(\s*[-*+]\s+|\s*\d+\.\s+)/, // list items
    /^(>\s*)/, // blockquotes
  ];

  for (const pattern of markerPatterns) {
    const marker = pattern.exec(line)?.[1];
    if (marker) {
      return { prefix: marker, content: line.slice(marker.length) };
    }
  }

  return { prefix: '', content: line };
}
|
|
140
|
+
|
|
141
|
+
/**
 * Hide figure/table/equation/section/listing anchors behind placeholders so a
 * later diff cannot delete or split them.
 *
 * Matches `{#fig:…}`, `{#tbl:…}`, `{#eq:…}`, `{#sec:…}` and `{#lst:…}`,
 * including forms with extra attributes such as `{#fig:label width=50%}`
 * (everything up to the closing brace is captured). Each anchor becomes
 * `ANCHORBLOCK<n>ENDANCHOR`.
 *
 * @param md - Markdown source.
 * @returns The rewritten text and the list of protected anchors.
 */
export function protectAnchors(md: string): ProtectAnchorsResult {
  const anchorPattern = /\{#(fig|tbl|eq|sec|lst):[^}]+\}/g;
  const replaced = collectAndReplace(md, anchorPattern, 'ANCHORBLOCK', 'ENDANCHOR');
  return { text: replaced.text, anchors: replaced.items };
}
|
|
153
|
+
|
|
154
|
+
/**
 * Put anchors back in place of their placeholders, keeping each anchor alive
 * even when the diff wrapped it in a change annotation.
 *
 * Per anchor, in order:
 *  - Inside a deletion: `{--before PLACEHOLDER after--}` becomes
 *    `{--before--}{#anchor}{--after--}`; empty sides are omitted.
 *  - Inside the old side of a substitution:
 *    `{~~old PLACEHOLDER~>new~~}` becomes `{~~old~>new~~}{#anchor}`. If the
 *    old text (without the placeholder) equals the new text, the substitution
 *    collapses to the plain new text followed by the anchor.
 *  - Any remaining placeholder is replaced directly.
 *
 * Surrounding fragments are whitespace-trimmed before being re-wrapped.
 *
 * @param text - Text containing anchor placeholders.
 * @param anchors - Items produced when the anchors were protected.
 * @returns Text with all anchors restored.
 */
export function restoreAnchors(text: string, anchors: ProtectedItem[]): string {
  // Re-wrap a leftover fragment as a deletion, or emit nothing if it is blank.
  const asDeletion = (fragment: string): string => {
    const trimmed = fragment.trim();
    return trimmed ? `{--${trimmed}--}` : '';
  };

  let restored = text;

  for (const { placeholder, original } of anchors) {
    // {--...ANCHORBLOCK0ENDANCHOR...--}
    const insideDeletion = new RegExp(`\\{--([^}]*?)${placeholder}([^}]*?)--\\}`, 'g');
    restored = restored.replace(
      insideDeletion,
      (_whole: string, lead: string, tail: string) => asDeletion(lead) + original + asDeletion(tail),
    );

    // {~~old ANCHORBLOCK0ENDANCHOR~>new~~}
    const insideSubstitution = new RegExp(`\\{~~([^~]*?)${placeholder}([^~]*?)~>([^~]*)~~\\}`, 'g');
    restored = restored.replace(
      insideSubstitution,
      (_whole: string, oldLead: string, oldTail: string, replacement: string) => {
        const previous = `${(oldLead ?? '').trim()} ${(oldTail ?? '').trim()}`.trim();
        const next = (replacement ?? '').trim();
        // Nothing but the anchor changed: drop the substitution markup.
        const body = previous === next ? next : `{~~${previous}~>${next}~~}`;
        return body + original;
      },
    );

    // Plain occurrences.
    restored = restored.split(placeholder).join(original);
  }

  return restored;
}
|
|
195
|
+
|
|
196
|
+
/**
 * Hide cross-references behind placeholders before diffing.
 *
 * Matches `@fig:label`, `@tbl:label`, `@eq:label`, `@sec:label` and
 * `@lst:label` where the label consists of letters, digits, `_` or `-`.
 * Only the `@type:label` token itself is replaced, so surrounding brackets or
 * parentheses (`[@fig:x]`, `(@fig:x)`) stay in the text. Each reference
 * becomes `XREFBLOCK<n>ENDXREF`.
 *
 * @param md - Markdown source.
 * @returns The rewritten text and the list of protected cross-references.
 */
export function protectCrossrefs(md: string): ProtectCrossrefsResult {
  const crossrefPattern = /@(fig|tbl|eq|sec|lst):[a-zA-Z0-9_-]+/g;
  const replaced = collectAndReplace(md, crossrefPattern, 'XREFBLOCK', 'ENDXREF');
  return { text: replaced.text, crossrefs: replaced.items };
}
|
|
208
|
+
|
|
209
|
+
/**
 * Put cross-references back in place of their placeholders.
 *
 * Per reference, in order:
 *  - Inside a deletion: `{--before PLACEHOLDER after--}` becomes
 *    `{--before--}@ref{--after--}` (blank sides omitted), so the reference
 *    survives even though the diff marked it deleted.
 *  - A substitution whose old side is exactly the placeholder, e.g.
 *    `{~~XREFBLOCK0ENDXREF~>Figure 1~~}`, collapses to the original reference.
 *  - Any remaining placeholder is replaced directly.
 *
 * @param text - Text containing cross-reference placeholders.
 * @param crossrefs - Items produced when the references were protected.
 * @returns Text with all cross-references restored.
 */
export function restoreCrossrefs(text: string, crossrefs: ProtectedItem[]): string {
  // Re-wrap a leftover fragment as a deletion, or emit nothing if it is blank.
  const asDeletion = (fragment: string): string => {
    const trimmed = fragment.trim();
    return trimmed ? `{--${trimmed}--}` : '';
  };

  return crossrefs.reduce((current, { placeholder, original }) => {
    const insideDeletion = new RegExp(`\\{--([^}]*?)${placeholder}([^}]*?)--\\}`, 'g');
    const renderedSubstitution = new RegExp(`\\{~~${placeholder}~>[^~]+~~\\}`, 'g');

    return current
      .replace(
        insideDeletion,
        (_whole: string, lead: string, tail: string) => asDeletion(lead) + original + asDeletion(tail),
      )
      .replace(renderedSubstitution, original)
      .split(placeholder)
      .join(original);
  }, text);
}
|
|
236
|
+
|
|
237
|
+
/**
 * Reduce a LaTeX math snippet to a rough plain-text form for fuzzy matching
 * against text extracted from Word, where math shows up as ordinary text.
 *
 * The rewrites are applied in the listed order; the result has all remaining
 * backslashes and braces removed, whitespace runs collapsed to one space, and
 * leading/trailing whitespace trimmed.
 *
 * @param latex - LaTeX math content (without the `$` delimiters).
 * @returns The simplified text.
 */
export function simplifyMathForMatching(latex: string): string {
  // Ordered [pattern, replacement] pairs. Order matters: command-specific
  // rewrites must run before the catch-all backslash/brace removal.
  const rewrites: Array<[RegExp, string]> = [
    [/\\text\{([^}]+)\}/g, '$1'],
    [/\\hat\{([^}]+)\}/g, '$1'],
    [/\\bar\{([^}]+)\}/g, '$1'],
    [/\\frac\{([^}]+)\}\{([^}]+)\}/g, '$1/$2'],
    [/\\sum_([a-z])/g, 'Σ'],
    [/\\sum/g, 'Σ'],
    [/\\cdot/g, '·'],
    [/\\quad/g, ' '],
    [/\\,/g, ' '],
    [/\\_/g, '_'],
    [/\\{/g, '{'],
    [/\\}/g, '}'],
    [/\\/g, ''], // Remove remaining backslashes
    [/[{}]/g, ''], // Remove braces
    [/\s+/g, ' '],
  ];

  let simplified = latex;
  for (const [pattern, replacement] of rewrites) {
    simplified = simplified.replace(pattern, replacement);
  }
  return simplified.trim();
}
|
|
261
|
+
|
|
262
|
+
/**
 * Hide math behind placeholders before diffing.
 *
 * Display math (`$$...$$`) is replaced first so its delimiters are not
 * mistaken for inline math; inline math (`$...$`, single line) follows.
 * Both kinds share one counter, so placeholders are
 * `MATHBLOCK<n>ENDMATH` with `n` increasing across display and inline blocks.
 * Each recorded block also stores a simplified rendering of its content for
 * matching against Word text.
 *
 * @param md - Markdown source.
 * @returns The rewritten text and the protected math blocks.
 */
export function protectMath(md: string): ProtectMathResult {
  const mathBlocks: ProtectedMath[] = [];

  // Builds the replace-callback for one kind of math.
  const stash = (type: 'display' | 'inline') => (original: string, body: string): string => {
    const placeholder = `MATHBLOCK${mathBlocks.length}ENDMATH`;
    mathBlocks.push({
      original,
      placeholder,
      type,
      simplified: simplifyMathForMatching(body),
    });
    return placeholder;
  };

  const withoutDisplay = md.replace(/\$\$([^$]+)\$\$/g, stash('display'));
  const text = withoutDisplay.replace(/\$([^$\n]+)\$/g, stash('inline'));

  return { text, mathBlocks };
}
|
|
290
|
+
|
|
291
|
+
/**
 * Replace every math placeholder with the original math source.
 *
 * Replacement is literal (split/join), so `$` characters in the original are
 * inserted as-is.
 *
 * @param text - Text containing math placeholders.
 * @param mathBlocks - Blocks recorded when the math was protected.
 * @returns Text with all math restored.
 */
export function restoreMath(text: string, mathBlocks: ProtectedMath[]): string {
  return mathBlocks.reduce(
    (current, { placeholder, original }) => current.split(placeholder).join(original),
    text,
  );
}
|
|
300
|
+
|
|
301
|
+
/**
 * Replace rendered math in Word text with the matching placeholders.
 *
 * Heuristic: for every protected block (display or inline) whose simplified
 * form is at least two characters long, the FIRST exact occurrence of that
 * simplified form in the running text is replaced by the block's placeholder.
 * Blocks are processed in list order; blocks whose simplified form is not
 * found leave the text untouched.
 *
 * @param wordText - Text extracted from Word.
 * @param mathBlocks - Blocks recorded when the math was protected.
 * @returns The Word text with matched math replaced by placeholders.
 */
export function replaceRenderedMath(wordText: string, mathBlocks: ProtectedMath[]): string {
  let current = wordText;

  for (const { simplified, placeholder } of mathBlocks) {
    // Single-character forms are too ambiguous to match safely.
    if (simplified.length < 2) continue;
    // A string pattern replaces only the first occurrence and is a no-op when absent.
    current = current.replace(simplified, placeholder);
  }

  return current;
}
|
|
320
|
+
|
|
321
|
+
/**
 * Hide bracketed citations behind placeholders before diffing.
 *
 * Every `[@...]` group (anything from `[@` up to the next `]`) is replaced by
 * `CITEREF<n>ENDCITE`, where `n` is the citation's index in the returned list.
 *
 * @param md - Markdown source.
 * @returns The rewritten text and the original citation strings in order.
 */
export function protectCitations(md: string): ProtectCitationsResult {
  const citations: string[] = [];

  const text = md.replace(/\[@[^\]]+\]/g, (citation) => {
    // push() returns the new length, so the index of this entry is length - 1.
    const position = citations.push(citation) - 1;
    return `CITEREF${position}ENDCITE`;
  });

  return { text, citations };
}
|
|
333
|
+
|
|
334
|
+
/**
 * Replace every `CITEREF<n>ENDCITE` placeholder with `citations[n]`.
 *
 * Replacement is a literal replace-all, so placeholders that ended up inside
 * change annotations are restored in place without touching the annotation.
 *
 * @param text - Text containing citation placeholders.
 * @param citations - Original citation strings, indexed by placeholder number.
 * @returns Text with all citations restored.
 */
export function restoreCitations(text: string, citations: string[]): string {
  let restored = text;
  for (const [position, citation] of citations.entries()) {
    restored = restored.split(`CITEREF${position}ENDCITE`).join(citation);
  }
  return restored;
}
|
|
345
|
+
|
|
346
|
+
/**
 * Replace rendered author–year citations in Word text with citation
 * placeholders.
 *
 * Parenthesised citations such as `(Author 2021)`, `(Author et al. 2021)` or
 * `(Smith & Doe 2019; Lee 2021)` are matched left to right. The first `count`
 * matches become `CITEREF0ENDCITE`, `CITEREF1ENDCITE`, … in order of
 * appearance; any further matches are left unchanged. Placeholders are
 * assigned purely by position, not by comparing citation content.
 *
 * @param wordText - Text extracted from Word.
 * @param count - Number of placeholders available (number of protected citations).
 * @returns The Word text with rendered citations replaced by placeholders.
 */
export function replaceRenderedCitations(wordText: string, count: number): string {
  const renderedCitation = /\((?:[A-Z][a-zé]+(?:\s+et\s+al\.?)?(?:\s*[&,;]\s*[A-Z][a-zé]+(?:\s+et\s+al\.?)?)*\s+\d{4}(?:[a-z])?(?:\s*[,;]\s*(?:[A-Z][a-zé]+(?:\s+et\s+al\.?)?\s+)?\d{4}(?:[a-z])?)*)\)/g;

  let assigned = 0;
  return wordText.replace(renderedCitation, (rendered) => {
    if (assigned >= count) {
      // Out of placeholders: keep the rendered citation as-is.
      return rendered;
    }
    const placeholder = `CITEREF${assigned}ENDCITE`;
    assigned += 1;
    return placeholder;
  });
}
|
|
363
|
+
|
|
364
|
+
/**
 * Hide markdown images behind placeholders before diffing, so each image is
 * treated as one atomic block.
 *
 * Matches `![caption](path)` optionally followed by an attribute block, e.g.
 * `![caption](path){#fig:label}`; Word-style captions such as
 * `![Figure 1: caption](path)` are matched by the same pattern.
 * Each image becomes `IMAGEBLOCK<n>ENDIMAGE`.
 *
 * For every image the following is recorded:
 *  - `label`: the part after `#fig:` / `#tbl:` in the attribute block, or `null`;
 *  - `figureNumber`: the number from a leading "Figure N" / "Fig. N" /
 *    "Table N" / "Tbl. N" caption prefix (digits or `S` + digits), or `null`;
 *  - `caption` (trimmed), `path`, and the full original text.
 *
 * @param md - Markdown source.
 * @param registry - Accepted for interface compatibility; not read by this function.
 * @returns The rewritten text and the protected images.
 */
export function protectImages(md: string, registry: ImageRegistry | null = null): ProtectImagesResult {
  const images: ProtectedImage[] = [];

  const swapForPlaceholder = (
    whole: string,
    rawCaption: string,
    path: string,
    attrs?: string,
  ): string => {
    const placeholder = `IMAGEBLOCK${images.length}ENDIMAGE`;

    // "#fig:map" -> "map"; stays null without an attribute block or label.
    const label = attrs?.match(/#(fig|tbl):([a-zA-Z0-9_-]+)/)?.[2] ?? null;

    // "Figure 3: ..." -> "3"; "Fig. S2 ..." -> "S2".
    const figureNumber =
      rawCaption.match(/^(?:Figure|Fig\.?|Table|Tbl\.?)\s+(\d+|S\d+)[:\.]?\s*/i)?.[1] ?? null;

    images.push({
      original: whole,
      placeholder,
      label,
      caption: rawCaption.trim(),
      path,
      figureNumber,
    });

    return placeholder;
  };

  // The attribute block after the image is optional.
  const text = md.replace(/!\[([^\]]*)\]\(([^)]+)\)(?:\{([^}]+)\})?/g, swapForPlaceholder);

  return { text, images };
}
|
|
412
|
+
|
|
413
|
+
/**
 * Put images back in place of their `IMAGEBLOCK<n>ENDIMAGE` placeholders.
 *
 * Delegates to `restoreProtectedItems`, passing the images as the protected
 * items.
 *
 * @param text - Text containing image placeholders.
 * @param images - Images recorded when they were protected.
 * @returns Text with all images restored.
 */
export function restoreImages(text: string, images: ProtectedImage[]): string {
  const restored = restoreProtectedItems(text, images);
  return restored;
}
|
|
419
|
+
|
|
420
|
+
/**
 * Match Word-extracted images to the original images.
 *
 * Each Word image is compared against every original that has not been
 * claimed yet, and a score is accumulated from independent signals:
 *  - identical non-empty `label`: +100;
 *  - registry lookup of `fig:<figureNumber>` returns an entry whose label
 *    equals the original's label: +90;
 *  - captions equal (first 50 chars, lower-cased, Word's "Figure N:" style
 *    prefix stripped): +80, or one caption contains the first 30 chars of the
 *    other: +40;
 *  - identical file name (last path segment, case-insensitive): +30.
 *
 * The highest-scoring original wins (first one wins ties) and is accepted
 * only if its score is at least 40. Matching is greedy in the order of
 * `wordImages`; an original is used at most once.
 *
 * The caption prefix stripping accepts supplementary numbers (`S1`, `S2`, …)
 * as well as plain digits, consistent with how figure numbers are extracted
 * from captions when images are protected.
 *
 * @param originalImages - Images protected from the original markdown.
 * @param wordImages - Images protected from the Word-derived markdown.
 * @param registry - Optional registry with a `byNumber` map; skipped when null.
 * @returns Map from Word image placeholder to original image placeholder.
 */
export function matchWordImagesToOriginal(
  originalImages: ProtectedImage[],
  wordImages: ProtectedImage[],
  registry: ImageRegistry | null = null
): Map<string, string> {
  const mapping = new Map<string, string>();
  const usedOriginals = new Set<string>();

  // "Figure 3: ", "Fig. S2. ", "Table 1 " ... at the start of a Word caption.
  const captionNumberPrefix = /^(?:Figure|Fig\.?|Table|Tbl\.?)\s+(?:\d+|S\d+)[:\.]?\s*/i;
  const fileNameOf = (path: string): string => path.split('/').pop()?.toLowerCase() || '';

  for (const wordImg of wordImages) {
    let bestMatch: ProtectedImage | null = null;
    let bestScore = 0;

    // These depend only on the Word image, so compute them once per Word image.
    const wordCaption = wordImg.caption.replace(captionNumberPrefix, '').toLowerCase().slice(0, 50);
    const wordFile = fileNameOf(wordImg.path);
    const registryEntry =
      wordImg.figureNumber && registry
        ? registry.byNumber?.get(`fig:${wordImg.figureNumber}`)
        : undefined;

    for (const origImg of originalImages) {
      if (usedOriginals.has(origImg.placeholder)) continue;

      let score = 0;

      // Match by label (most reliable)
      if (wordImg.label && origImg.label && wordImg.label === origImg.label) {
        score += 100;
      }

      // Match by figure number via registry
      if (registryEntry && registryEntry.label === origImg.label) {
        score += 90;
      }

      // Match by caption similarity (first 50 chars, normalized)
      const origCaption = origImg.caption.toLowerCase().slice(0, 50);
      if (wordCaption && origCaption) {
        if (wordCaption === origCaption) {
          score += 80;
        } else if (
          wordCaption.includes(origCaption.slice(0, 30)) ||
          origCaption.includes(wordCaption.slice(0, 30))
        ) {
          score += 40;
        }
      }

      // Match by path similarity (filename)
      if (wordFile === fileNameOf(origImg.path)) {
        score += 30;
      }

      // Strictly greater: the earliest original wins a tie.
      if (score > bestScore) {
        bestScore = score;
        bestMatch = origImg;
      }
    }

    // A file-name match alone (30) is not enough evidence.
    if (bestMatch && bestScore >= 40) {
      mapping.set(wordImg.placeholder, bestMatch.placeholder);
      usedOriginals.add(bestMatch.placeholder);
    }
  }

  return mapping;
}
|
|
484
|
+
|
|
485
|
+
/**
 * Hide markdown pipe tables behind placeholders before diffing, so each table
 * is treated as one atomic block.
 *
 * A candidate is an optional caption line starting with `Table` (optionally
 * bold, optionally followed by one blank line) plus a run of consecutive lines
 * that start and end with `|`. The last row may end at the end of the input
 * instead of at a newline, so a table that closes a file without a trailing
 * newline is captured completely.
 *
 * A candidate is only treated as a table when it contains a separator row
 * (cells made of dashes with optional alignment colons and padding, e.g.
 * `|---|---|`, `| :--- | ---: |`, `|:-:|`). The older substring checks are
 * kept as a fallback so nothing that was protected before stops being
 * protected. Other runs of pipe lines are left untouched.
 *
 * The text receives the placeholder surrounded by blank lines
 * (`\n\nTABLEBLOCK<n>ENDTABLE\n\n`); the recorded item stores the trimmed
 * placeholder, the trimmed table source, and `cellCount`, the number of `|`
 * characters in the match (an approximation used for matching in Word).
 *
 * @param md - Markdown source.
 * @returns The rewritten text and the protected tables.
 */
export function protectTables(md: string): ProtectTablesResult {
  const tables: ProtectedTable[] = [];

  // `(?![\s\S])` is a true end-of-input assertion; `$` is avoided because with
  // the `m` flag it would also match before a `\r`, splitting CRLF tables.
  const tablePattern = /(?:^(?:\*\*)?Table[^\n]*\n\n?)?(?:^\|[^\n]+\|(?:\n|(?![\s\S])))+/gm;

  // One full line consisting only of dash cells with optional colons/padding.
  const separatorRow = /^\|[ \t]*:?-+:?[ \t]*(?:\|[ \t]*:?-+:?[ \t]*)*\|[ \t]*$/m;

  const text = md.replace(tablePattern, (match) => {
    const hasSeparator =
      separatorRow.test(match) ||
      // Legacy checks, kept for backward compatibility.
      match.includes('|---') ||
      match.includes('| ---') ||
      match.includes('|:--');
    if (!hasSeparator) {
      return match; // Not a real table, just lines with pipes
    }

    const idx = tables.length;
    const placeholder = `\n\nTABLEBLOCK${idx}ENDTABLE\n\n`;

    // Count cells for matching in Word (approximate)
    const cellCount = (match.match(/\|/g) || []).length;

    tables.push({ original: match.trim(), placeholder: placeholder.trim(), cellCount });
    return placeholder;
  });

  return { text, tables };
}
|
|
514
|
+
|
|
515
|
+
/**
 * Put tables back in place of their `TABLEBLOCK<n>ENDTABLE` placeholders.
 *
 * Delegates to `restoreProtectedItems`, passing the tables as the protected
 * items.
 *
 * @param text - Text containing table placeholders.
 * @param tables - Tables recorded when they were protected.
 * @returns Text with all tables restored.
 */
export function restoreTables(text: string, tables: ProtectedTable[]): string {
  const restored = restoreProtectedItems(text, tables);
  return restored;
}
|