docrev 0.9.5 → 0.9.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +20 -0
- package/dev_notes/bug_repro_comment_parser.md +71 -0
- package/dist/lib/anchor-match.d.ts +41 -0
- package/dist/lib/anchor-match.d.ts.map +1 -0
- package/dist/lib/anchor-match.js +192 -0
- package/dist/lib/anchor-match.js.map +1 -0
- package/dist/lib/annotations.d.ts.map +1 -1
- package/dist/lib/annotations.js +8 -5
- package/dist/lib/annotations.js.map +1 -1
- package/dist/lib/commands/file-ops.d.ts +11 -0
- package/dist/lib/commands/file-ops.d.ts.map +1 -0
- package/dist/lib/commands/file-ops.js +301 -0
- package/dist/lib/commands/file-ops.js.map +1 -0
- package/dist/lib/commands/index.d.ts +10 -1
- package/dist/lib/commands/index.d.ts.map +1 -1
- package/dist/lib/commands/index.js +19 -1
- package/dist/lib/commands/index.js.map +1 -1
- package/dist/lib/commands/merge-resolve.d.ts +12 -0
- package/dist/lib/commands/merge-resolve.d.ts.map +1 -0
- package/dist/lib/commands/merge-resolve.js +318 -0
- package/dist/lib/commands/merge-resolve.js.map +1 -0
- package/dist/lib/commands/preview.d.ts +11 -0
- package/dist/lib/commands/preview.d.ts.map +1 -0
- package/dist/lib/commands/preview.js +138 -0
- package/dist/lib/commands/preview.js.map +1 -0
- package/dist/lib/commands/project-info.d.ts +11 -0
- package/dist/lib/commands/project-info.d.ts.map +1 -0
- package/dist/lib/commands/project-info.js +187 -0
- package/dist/lib/commands/project-info.js.map +1 -0
- package/dist/lib/commands/quality.d.ts +11 -0
- package/dist/lib/commands/quality.d.ts.map +1 -0
- package/dist/lib/commands/quality.js +384 -0
- package/dist/lib/commands/quality.js.map +1 -0
- package/dist/lib/commands/section-boundaries.d.ts +22 -0
- package/dist/lib/commands/section-boundaries.d.ts.map +1 -0
- package/dist/lib/commands/section-boundaries.js +53 -0
- package/dist/lib/commands/section-boundaries.js.map +1 -0
- package/dist/lib/commands/sections.d.ts +3 -2
- package/dist/lib/commands/sections.d.ts.map +1 -1
- package/dist/lib/commands/sections.js +4 -736
- package/dist/lib/commands/sections.js.map +1 -1
- package/dist/lib/commands/sync.d.ts +11 -0
- package/dist/lib/commands/sync.d.ts.map +1 -0
- package/dist/lib/commands/sync.js +576 -0
- package/dist/lib/commands/sync.js.map +1 -0
- package/dist/lib/commands/text-ops.d.ts +11 -0
- package/dist/lib/commands/text-ops.d.ts.map +1 -0
- package/dist/lib/commands/text-ops.js +357 -0
- package/dist/lib/commands/text-ops.js.map +1 -0
- package/dist/lib/commands/utilities.d.ts +2 -4
- package/dist/lib/commands/utilities.d.ts.map +1 -1
- package/dist/lib/commands/utilities.js +3 -1572
- package/dist/lib/commands/utilities.js.map +1 -1
- package/dist/lib/commands/verify-anchors.d.ts +17 -0
- package/dist/lib/commands/verify-anchors.d.ts.map +1 -0
- package/dist/lib/commands/verify-anchors.js +215 -0
- package/dist/lib/commands/verify-anchors.js.map +1 -0
- package/dist/lib/commands/word-tools.d.ts +11 -0
- package/dist/lib/commands/word-tools.d.ts.map +1 -0
- package/dist/lib/commands/word-tools.js +272 -0
- package/dist/lib/commands/word-tools.js.map +1 -0
- package/dist/lib/diff-engine.d.ts +25 -0
- package/dist/lib/diff-engine.d.ts.map +1 -0
- package/dist/lib/diff-engine.js +354 -0
- package/dist/lib/diff-engine.js.map +1 -0
- package/dist/lib/import.d.ts +44 -118
- package/dist/lib/import.d.ts.map +1 -1
- package/dist/lib/import.js +25 -1173
- package/dist/lib/import.js.map +1 -1
- package/dist/lib/restore-references.d.ts +35 -0
- package/dist/lib/restore-references.d.ts.map +1 -0
- package/dist/lib/restore-references.js +188 -0
- package/dist/lib/restore-references.js.map +1 -0
- package/dist/lib/word-extraction.d.ts +100 -0
- package/dist/lib/word-extraction.d.ts.map +1 -0
- package/dist/lib/word-extraction.js +594 -0
- package/dist/lib/word-extraction.js.map +1 -0
- package/lib/anchor-match.ts +238 -0
- package/lib/annotations.ts +9 -5
- package/lib/commands/file-ops.ts +372 -0
- package/lib/commands/index.ts +27 -0
- package/lib/commands/merge-resolve.ts +378 -0
- package/lib/commands/preview.ts +178 -0
- package/lib/commands/project-info.ts +244 -0
- package/lib/commands/quality.ts +517 -0
- package/lib/commands/section-boundaries.ts +72 -0
- package/lib/commands/sections.ts +3 -870
- package/lib/commands/sync.ts +701 -0
- package/lib/commands/text-ops.ts +449 -0
- package/lib/commands/utilities.ts +62 -2043
- package/lib/commands/verify-anchors.ts +261 -0
- package/lib/commands/word-tools.ts +340 -0
- package/lib/diff-engine.ts +465 -0
- package/lib/import.ts +108 -1504
- package/lib/restore-references.ts +240 -0
- package/lib/word-extraction.ts +759 -0
- package/package.json +1 -1
- package/skill/REFERENCE.md +29 -2
- package/skill/SKILL.md +12 -2
package/lib/import.ts
CHANGED
|
@@ -1,108 +1,108 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Import functionality - convert Word docs to annotated Markdown
|
|
3
|
+
*
|
|
4
|
+
* Orchestration workflows + re-exports from extraction/diff/restore modules
|
|
3
5
|
*/
|
|
4
6
|
|
|
5
7
|
import * as fs from 'fs';
|
|
6
8
|
import * as path from 'path';
|
|
7
|
-
import { diffWords, Change } from 'diff';
|
|
8
9
|
import { stripAnnotations } from './annotations.js';
|
|
9
10
|
import { readImageRegistry } from './image-registry.js';
|
|
10
11
|
import { exec } from 'child_process';
|
|
11
12
|
import { promisify } from 'util';
|
|
13
|
+
|
|
14
|
+
// Import from split modules
|
|
15
|
+
import {
|
|
16
|
+
extractFromWord,
|
|
17
|
+
extractWordComments,
|
|
18
|
+
extractCommentAnchors,
|
|
19
|
+
extractWordTables,
|
|
20
|
+
} from './word-extraction.js';
|
|
21
|
+
import type {
|
|
22
|
+
WordComment,
|
|
23
|
+
CommentAnchorData,
|
|
24
|
+
WordTable,
|
|
25
|
+
ExtractFromWordResult,
|
|
26
|
+
} from './word-extraction.js';
|
|
12
27
|
import {
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
28
|
+
generateSmartDiff,
|
|
29
|
+
generateAnnotatedDiff,
|
|
30
|
+
cleanupAnnotations,
|
|
31
|
+
fixCitationAnnotations,
|
|
32
|
+
} from './diff-engine.js';
|
|
33
|
+
import {
|
|
34
|
+
restoreCrossrefFromWord,
|
|
35
|
+
restoreImagesFromRegistry,
|
|
36
|
+
parseVisibleComments,
|
|
37
|
+
convertVisibleComments,
|
|
38
|
+
} from './restore-references.js';
|
|
39
|
+
import { findAnchorInText } from './anchor-match.js';
|
|
40
|
+
|
|
41
|
+
// Re-export everything so existing imports from './import.js' still work
|
|
42
|
+
export {
|
|
43
|
+
extractFromWord,
|
|
44
|
+
extractWordComments,
|
|
45
|
+
extractCommentAnchors,
|
|
46
|
+
extractHeadings,
|
|
47
|
+
extractWordTables,
|
|
48
|
+
} from './word-extraction.js';
|
|
49
|
+
export type {
|
|
50
|
+
WordComment,
|
|
51
|
+
TextNode,
|
|
52
|
+
CommentAnchorData,
|
|
53
|
+
CommentAnchorsResult,
|
|
54
|
+
DocxHeading,
|
|
55
|
+
WordTable,
|
|
56
|
+
ParsedRow,
|
|
57
|
+
ExtractFromWordOptions,
|
|
58
|
+
ExtractMessage,
|
|
59
|
+
ExtractFromWordResult,
|
|
60
|
+
} from './word-extraction.js';
|
|
61
|
+
|
|
62
|
+
export {
|
|
63
|
+
generateSmartDiff,
|
|
64
|
+
generateAnnotatedDiff,
|
|
65
|
+
cleanupAnnotations,
|
|
66
|
+
fixCitationAnnotations,
|
|
67
|
+
} from './diff-engine.js';
|
|
68
|
+
export type {
|
|
69
|
+
GenerateSmartDiffOptions,
|
|
70
|
+
} from './diff-engine.js';
|
|
71
|
+
|
|
72
|
+
export {
|
|
73
|
+
restoreCrossrefFromWord,
|
|
74
|
+
restoreImagesFromRegistry,
|
|
75
|
+
parseVisibleComments,
|
|
76
|
+
convertVisibleComments,
|
|
77
|
+
} from './restore-references.js';
|
|
78
|
+
export type {
|
|
79
|
+
RestoreCrossrefResult,
|
|
80
|
+
RestoreImagesResult,
|
|
81
|
+
} from './restore-references.js';
|
|
32
82
|
|
|
33
83
|
const execAsync = promisify(exec);
|
|
34
84
|
|
|
35
85
|
// ============================================
|
|
36
|
-
// Type Definitions
|
|
86
|
+
// Type Definitions (orchestration-specific)
|
|
37
87
|
// ============================================
|
|
38
88
|
|
|
39
|
-
interface WordComment {
|
|
40
|
-
id: string;
|
|
41
|
-
author: string;
|
|
42
|
-
date: string;
|
|
43
|
-
text: string;
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
interface TextNode {
|
|
47
|
-
xmlStart: number;
|
|
48
|
-
xmlEnd: number;
|
|
49
|
-
textStart: number;
|
|
50
|
-
textEnd: number;
|
|
51
|
-
text: string;
|
|
52
|
-
}
|
|
53
|
-
|
|
54
|
-
interface CommentAnchorData {
|
|
55
|
-
anchor: string;
|
|
56
|
-
before: string;
|
|
57
|
-
after: string;
|
|
58
|
-
docPosition: number;
|
|
59
|
-
docLength: number;
|
|
60
|
-
isEmpty: boolean;
|
|
61
|
-
}
|
|
62
|
-
|
|
63
|
-
interface CommentAnchorsResult {
|
|
64
|
-
anchors: Map<string, CommentAnchorData>;
|
|
65
|
-
fullDocText: string;
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
interface WordTable {
|
|
69
|
-
markdown: string;
|
|
70
|
-
rowCount: number;
|
|
71
|
-
colCount: number;
|
|
72
|
-
}
|
|
73
|
-
|
|
74
|
-
interface ParsedRow {
|
|
75
|
-
cells: string[];
|
|
76
|
-
colSpans: number[];
|
|
77
|
-
}
|
|
78
|
-
|
|
79
|
-
interface ExtractFromWordOptions {
|
|
80
|
-
mediaDir?: string;
|
|
81
|
-
skipMediaExtraction?: boolean;
|
|
82
|
-
}
|
|
83
|
-
|
|
84
|
-
interface ExtractMessage {
|
|
85
|
-
type: 'info' | 'warning';
|
|
86
|
-
message: string;
|
|
87
|
-
}
|
|
88
|
-
|
|
89
|
-
interface ExtractFromWordResult {
|
|
90
|
-
text: string;
|
|
91
|
-
comments: WordComment[];
|
|
92
|
-
anchors: Map<string, CommentAnchorData>;
|
|
93
|
-
messages: ExtractMessage[];
|
|
94
|
-
extractedMedia: string[];
|
|
95
|
-
tables: WordTable[];
|
|
96
|
-
hasTrackChanges: boolean;
|
|
97
|
-
trackChangeStats: { insertions: number; deletions: number };
|
|
98
|
-
}
|
|
99
|
-
|
|
100
|
-
interface InsertCommentsOptions {
|
|
89
|
+
export interface InsertCommentsOptions {
|
|
101
90
|
quiet?: boolean;
|
|
102
91
|
sectionBoundary?: { start: number; end: number } | null;
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
92
|
+
/**
|
|
93
|
+
* When true (default), comments wrap their anchor text in `[anchor]{.mark}`
|
|
94
|
+
* so the rebuilt docx restores the original Word comment range. When false,
|
|
95
|
+
* comments are inserted as standalone `{>>...<<}` blocks adjacent to the
|
|
96
|
+
* anchor — the prose stays byte-identical except for the inserted blocks.
|
|
97
|
+
*
|
|
98
|
+
* Set to false from `sync --comments-only` so a draft revised after the
|
|
99
|
+
* docx was sent for review keeps its prose intact, and so multiple
|
|
100
|
+
* comments sharing one anchor don't produce nested broken markup.
|
|
101
|
+
*/
|
|
102
|
+
wrapAnchor?: boolean;
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
export interface CommentWithPos {
|
|
106
106
|
id: string;
|
|
107
107
|
author: string;
|
|
108
108
|
text: string;
|
|
@@ -114,42 +114,19 @@ interface CommentWithPos {
|
|
|
114
114
|
strategy?: string;
|
|
115
115
|
}
|
|
116
116
|
|
|
117
|
-
|
|
118
|
-
occurrences: number[];
|
|
119
|
-
matchedAnchor: string | null;
|
|
120
|
-
strategy: string;
|
|
121
|
-
stripped?: boolean;
|
|
122
|
-
}
|
|
117
|
+
export type { AnchorSearchResult } from './anchor-match.js';
|
|
123
118
|
|
|
124
|
-
interface MarkdownPrefixResult {
|
|
119
|
+
export interface MarkdownPrefixResult {
|
|
125
120
|
prefix: string;
|
|
126
121
|
content: string;
|
|
127
122
|
}
|
|
128
123
|
|
|
129
|
-
interface GenerateSmartDiffOptions {
|
|
130
|
-
wordTables?: WordTable[];
|
|
131
|
-
imageRegistry?: any;
|
|
132
|
-
}
|
|
133
|
-
|
|
134
|
-
interface RestoreCrossrefResult {
|
|
135
|
-
text: string;
|
|
136
|
-
restored: number;
|
|
137
|
-
messages: string[];
|
|
138
|
-
restoredLabels: Set<string>;
|
|
139
|
-
}
|
|
140
|
-
|
|
141
|
-
interface RestoreImagesResult {
|
|
142
|
-
text: string;
|
|
143
|
-
restored: number;
|
|
144
|
-
messages: string[];
|
|
145
|
-
}
|
|
146
|
-
|
|
147
|
-
interface ImportWordWithTrackChangesOptions {
|
|
124
|
+
export interface ImportWordWithTrackChangesOptions {
|
|
148
125
|
mediaDir?: string;
|
|
149
126
|
projectDir?: string;
|
|
150
127
|
}
|
|
151
128
|
|
|
152
|
-
interface ImportWordWithTrackChangesResult {
|
|
129
|
+
export interface ImportWordWithTrackChangesResult {
|
|
153
130
|
text: string;
|
|
154
131
|
stats: {
|
|
155
132
|
insertions: number;
|
|
@@ -164,14 +141,14 @@ interface ImportWordWithTrackChangesResult {
|
|
|
164
141
|
comments: WordComment[];
|
|
165
142
|
}
|
|
166
143
|
|
|
167
|
-
interface ImportFromWordOptions {
|
|
144
|
+
export interface ImportFromWordOptions {
|
|
168
145
|
author?: string;
|
|
169
146
|
sectionContent?: string;
|
|
170
147
|
figuresDir?: string;
|
|
171
148
|
wordTables?: WordTable[];
|
|
172
149
|
}
|
|
173
150
|
|
|
174
|
-
interface ImportFromWordResult {
|
|
151
|
+
export interface ImportFromWordResult {
|
|
175
152
|
annotated: string;
|
|
176
153
|
stats: {
|
|
177
154
|
insertions: number;
|
|
@@ -183,13 +160,13 @@ interface ImportFromWordResult {
|
|
|
183
160
|
extractedMedia: string[];
|
|
184
161
|
}
|
|
185
162
|
|
|
186
|
-
interface MovedFile {
|
|
163
|
+
export interface MovedFile {
|
|
187
164
|
from: string;
|
|
188
165
|
to: string;
|
|
189
166
|
name: string;
|
|
190
167
|
}
|
|
191
168
|
|
|
192
|
-
interface MoveExtractedMediaResult {
|
|
169
|
+
export interface MoveExtractedMediaResult {
|
|
193
170
|
moved: MovedFile[];
|
|
194
171
|
errors: string[];
|
|
195
172
|
}
|
|
@@ -198,593 +175,6 @@ interface MoveExtractedMediaResult {
|
|
|
198
175
|
// Functions
|
|
199
176
|
// ============================================
|
|
200
177
|
|
|
201
|
-
/**
|
|
202
|
-
* Extract comments directly from Word docx comments.xml
|
|
203
|
-
*/
|
|
204
|
-
export async function extractWordComments(docxPath: string): Promise<WordComment[]> {
|
|
205
|
-
const AdmZip = (await import('adm-zip')).default;
|
|
206
|
-
const { parseStringPromise } = await import('xml2js');
|
|
207
|
-
|
|
208
|
-
const comments: WordComment[] = [];
|
|
209
|
-
|
|
210
|
-
// Validate file exists
|
|
211
|
-
if (!fs.existsSync(docxPath)) {
|
|
212
|
-
throw new Error(`File not found: ${docxPath}`);
|
|
213
|
-
}
|
|
214
|
-
|
|
215
|
-
try {
|
|
216
|
-
let zip;
|
|
217
|
-
try {
|
|
218
|
-
zip = new AdmZip(docxPath);
|
|
219
|
-
} catch (err: any) {
|
|
220
|
-
throw new Error(`Invalid Word document (not a valid .docx file): ${err.message}`);
|
|
221
|
-
}
|
|
222
|
-
|
|
223
|
-
const commentsEntry = zip.getEntry('word/comments.xml');
|
|
224
|
-
|
|
225
|
-
if (!commentsEntry) {
|
|
226
|
-
return comments;
|
|
227
|
-
}
|
|
228
|
-
|
|
229
|
-
let commentsXml;
|
|
230
|
-
try {
|
|
231
|
-
commentsXml = commentsEntry.getData().toString('utf8');
|
|
232
|
-
} catch (err: any) {
|
|
233
|
-
throw new Error(`Failed to read comments from document: ${err.message}`);
|
|
234
|
-
}
|
|
235
|
-
|
|
236
|
-
const parsed = await parseStringPromise(commentsXml, { explicitArray: false });
|
|
237
|
-
|
|
238
|
-
const ns = 'w:';
|
|
239
|
-
const commentsRoot = parsed['w:comments'];
|
|
240
|
-
if (!commentsRoot || !commentsRoot['w:comment']) {
|
|
241
|
-
return comments;
|
|
242
|
-
}
|
|
243
|
-
|
|
244
|
-
// Ensure it's an array
|
|
245
|
-
const commentNodes = Array.isArray(commentsRoot['w:comment'])
|
|
246
|
-
? commentsRoot['w:comment']
|
|
247
|
-
: [commentsRoot['w:comment']];
|
|
248
|
-
|
|
249
|
-
for (const comment of commentNodes) {
|
|
250
|
-
const id = comment.$?.['w:id'] || '';
|
|
251
|
-
const author = comment.$?.['w:author'] || 'Unknown';
|
|
252
|
-
const date = comment.$?.['w:date'] || '';
|
|
253
|
-
|
|
254
|
-
// Extract text from nested w:p/w:r/w:t elements
|
|
255
|
-
let text = '';
|
|
256
|
-
const extractText = (node: any): void => {
|
|
257
|
-
if (!node) return;
|
|
258
|
-
if (typeof node === 'string') {
|
|
259
|
-
text += node;
|
|
260
|
-
return;
|
|
261
|
-
}
|
|
262
|
-
if (node['w:t']) {
|
|
263
|
-
const t = node['w:t'];
|
|
264
|
-
text += typeof t === 'string' ? t : (t._ || t);
|
|
265
|
-
}
|
|
266
|
-
if (node['w:r']) {
|
|
267
|
-
const runs = Array.isArray(node['w:r']) ? node['w:r'] : [node['w:r']];
|
|
268
|
-
runs.forEach(extractText);
|
|
269
|
-
}
|
|
270
|
-
if (node['w:p']) {
|
|
271
|
-
const paras = Array.isArray(node['w:p']) ? node['w:p'] : [node['w:p']];
|
|
272
|
-
paras.forEach(extractText);
|
|
273
|
-
}
|
|
274
|
-
};
|
|
275
|
-
extractText(comment);
|
|
276
|
-
|
|
277
|
-
comments.push({ id, author, date: date.slice(0, 10), text: text.trim() });
|
|
278
|
-
}
|
|
279
|
-
} catch (err: any) {
|
|
280
|
-
// Re-throw with more context if it's already an Error we created
|
|
281
|
-
if (err.message.includes('Invalid Word document') || err.message.includes('File not found')) {
|
|
282
|
-
throw err;
|
|
283
|
-
}
|
|
284
|
-
throw new Error(`Error extracting comments from ${path.basename(docxPath)}: ${err.message}`);
|
|
285
|
-
}
|
|
286
|
-
|
|
287
|
-
return comments;
|
|
288
|
-
}
|
|
289
|
-
|
|
290
|
-
/**
|
|
291
|
-
* Extract comment anchor texts from document.xml with surrounding context
|
|
292
|
-
* Returns map of comment ID -> {anchor, before, after, docPosition, isEmpty} for better matching
|
|
293
|
-
* Also returns fullDocText for section boundary matching
|
|
294
|
-
*/
|
|
295
|
-
export async function extractCommentAnchors(docxPath: string): Promise<CommentAnchorsResult> {
|
|
296
|
-
const AdmZip = (await import('adm-zip')).default;
|
|
297
|
-
const anchors = new Map<string, CommentAnchorData>();
|
|
298
|
-
let fullDocText = '';
|
|
299
|
-
|
|
300
|
-
try {
|
|
301
|
-
const zip = new AdmZip(docxPath);
|
|
302
|
-
const docEntry = zip.getEntry('word/document.xml');
|
|
303
|
-
|
|
304
|
-
if (!docEntry) {
|
|
305
|
-
return { anchors, fullDocText };
|
|
306
|
-
}
|
|
307
|
-
|
|
308
|
-
const docXml = docEntry.getData().toString('utf8');
|
|
309
|
-
|
|
310
|
-
// ========================================
|
|
311
|
-
// STEP 1: Build text position mapping
|
|
312
|
-
// ========================================
|
|
313
|
-
const textNodePattern = /<w:t[^>]*>([^<]*)<\/w:t>/g;
|
|
314
|
-
const textNodes: TextNode[] = [];
|
|
315
|
-
let textPosition = 0;
|
|
316
|
-
let nodeMatch;
|
|
317
|
-
|
|
318
|
-
while ((nodeMatch = textNodePattern.exec(docXml)) !== null) {
|
|
319
|
-
const rawText = nodeMatch[1] ?? '';
|
|
320
|
-
const decodedText = decodeXmlEntities(rawText);
|
|
321
|
-
textNodes.push({
|
|
322
|
-
xmlStart: nodeMatch.index,
|
|
323
|
-
xmlEnd: nodeMatch.index + nodeMatch[0].length,
|
|
324
|
-
textStart: textPosition,
|
|
325
|
-
textEnd: textPosition + decodedText.length,
|
|
326
|
-
text: decodedText
|
|
327
|
-
});
|
|
328
|
-
textPosition += decodedText.length;
|
|
329
|
-
}
|
|
330
|
-
|
|
331
|
-
fullDocText = textNodes.map(n => n.text).join('');
|
|
332
|
-
|
|
333
|
-
// Helper: convert XML position to text position
|
|
334
|
-
function xmlPosToTextPos(xmlPos: number): number {
|
|
335
|
-
for (let i = 0; i < textNodes.length; i++) {
|
|
336
|
-
const node = textNodes[i];
|
|
337
|
-
if (!node) continue;
|
|
338
|
-
if (xmlPos >= node.xmlStart && xmlPos < node.xmlEnd) {
|
|
339
|
-
return node.textStart;
|
|
340
|
-
}
|
|
341
|
-
if (xmlPos < node.xmlStart) {
|
|
342
|
-
return node.textStart;
|
|
343
|
-
}
|
|
344
|
-
}
|
|
345
|
-
const lastNode = textNodes[textNodes.length - 1];
|
|
346
|
-
return lastNode ? lastNode.textEnd : 0;
|
|
347
|
-
}
|
|
348
|
-
|
|
349
|
-
// Helper: extract context before a position
|
|
350
|
-
function getContextBefore(position: number, maxLength: number = 150): string {
|
|
351
|
-
const beforeText = fullDocText.slice(Math.max(0, position - maxLength), position);
|
|
352
|
-
const sentenceStart = beforeText.search(/[.!?]\s+[A-Z][^.!?]*$/);
|
|
353
|
-
return sentenceStart >= 0
|
|
354
|
-
? beforeText.slice(sentenceStart + 2).trim()
|
|
355
|
-
: beforeText.slice(-80).trim();
|
|
356
|
-
}
|
|
357
|
-
|
|
358
|
-
// Helper: extract context after a position
|
|
359
|
-
function getContextAfter(position: number, maxLength: number = 150): string {
|
|
360
|
-
const afterText = fullDocText.slice(position, position + maxLength);
|
|
361
|
-
const sentenceEnd = afterText.search(/[.!?]\s/);
|
|
362
|
-
return sentenceEnd >= 0
|
|
363
|
-
? afterText.slice(0, sentenceEnd + 1).trim()
|
|
364
|
-
: afterText.slice(0, 80).trim();
|
|
365
|
-
}
|
|
366
|
-
|
|
367
|
-
// ========================================
|
|
368
|
-
// STEP 2: Collect all start/end markers separately
|
|
369
|
-
// ========================================
|
|
370
|
-
const startPattern = /<w:commentRangeStart[^>]*w:id="(\d+)"[^>]*\/?>/g;
|
|
371
|
-
const endPattern = /<w:commentRangeEnd[^>]*w:id="(\d+)"[^>]*\/?>/g;
|
|
372
|
-
|
|
373
|
-
const starts = new Map<string, number>(); // id -> position after start tag
|
|
374
|
-
const ends = new Map<string, number>(); // id -> position before end tag
|
|
375
|
-
|
|
376
|
-
let match;
|
|
377
|
-
while ((match = startPattern.exec(docXml)) !== null) {
|
|
378
|
-
const id = match[1];
|
|
379
|
-
if (!starts.has(id)) {
|
|
380
|
-
starts.set(id, match.index + match[0].length);
|
|
381
|
-
}
|
|
382
|
-
}
|
|
383
|
-
|
|
384
|
-
while ((match = endPattern.exec(docXml)) !== null) {
|
|
385
|
-
const id = match[1];
|
|
386
|
-
if (!ends.has(id)) {
|
|
387
|
-
ends.set(id, match.index);
|
|
388
|
-
}
|
|
389
|
-
}
|
|
390
|
-
|
|
391
|
-
// ========================================
|
|
392
|
-
// STEP 3: Process each comment range by ID
|
|
393
|
-
// ========================================
|
|
394
|
-
for (const [id, startXmlPos] of starts) {
|
|
395
|
-
const endXmlPos = ends.get(id);
|
|
396
|
-
|
|
397
|
-
// Missing end marker - skip with warning
|
|
398
|
-
if (endXmlPos === undefined) {
|
|
399
|
-
console.warn(`Comment ${id}: missing end marker`);
|
|
400
|
-
continue;
|
|
401
|
-
}
|
|
402
|
-
|
|
403
|
-
// Calculate text position
|
|
404
|
-
const docPosition = xmlPosToTextPos(startXmlPos);
|
|
405
|
-
|
|
406
|
-
// Handle empty or inverted ranges
|
|
407
|
-
if (endXmlPos <= startXmlPos) {
|
|
408
|
-
anchors.set(id, {
|
|
409
|
-
anchor: '',
|
|
410
|
-
before: getContextBefore(docPosition),
|
|
411
|
-
after: getContextAfter(docPosition),
|
|
412
|
-
docPosition,
|
|
413
|
-
docLength: fullDocText.length,
|
|
414
|
-
isEmpty: true
|
|
415
|
-
});
|
|
416
|
-
continue;
|
|
417
|
-
}
|
|
418
|
-
|
|
419
|
-
// Extract XML segment between markers
|
|
420
|
-
const segment = docXml.slice(startXmlPos, endXmlPos);
|
|
421
|
-
|
|
422
|
-
// Extract text from w:t (regular) AND w:delText (deleted text in track changes)
|
|
423
|
-
const textInRangePattern = /<w:t[^>]*>([^<]*)<\/w:t>|<w:delText[^>]*>([^<]*)<\/w:delText>/g;
|
|
424
|
-
let anchorText = '';
|
|
425
|
-
let tm;
|
|
426
|
-
while ((tm = textInRangePattern.exec(segment)) !== null) {
|
|
427
|
-
anchorText += tm[1] || tm[2] || '';
|
|
428
|
-
}
|
|
429
|
-
anchorText = decodeXmlEntities(anchorText);
|
|
430
|
-
|
|
431
|
-
// Get context
|
|
432
|
-
const anchorLength = anchorText.length;
|
|
433
|
-
const before = getContextBefore(docPosition);
|
|
434
|
-
const after = getContextAfter(docPosition + anchorLength);
|
|
435
|
-
|
|
436
|
-
// ALWAYS add entry (even if anchor is empty)
|
|
437
|
-
anchors.set(id, {
|
|
438
|
-
anchor: anchorText.trim(),
|
|
439
|
-
before,
|
|
440
|
-
after,
|
|
441
|
-
docPosition,
|
|
442
|
-
docLength: fullDocText.length,
|
|
443
|
-
isEmpty: !anchorText.trim()
|
|
444
|
-
});
|
|
445
|
-
}
|
|
446
|
-
} catch (err: any) {
|
|
447
|
-
console.error('Error extracting comment anchors:', err.message);
|
|
448
|
-
return { anchors, fullDocText: '' };
|
|
449
|
-
}
|
|
450
|
-
|
|
451
|
-
return { anchors, fullDocText };
|
|
452
|
-
}
|
|
453
|
-
|
|
454
|
-
/**
|
|
455
|
-
* Decode XML entities in text
|
|
456
|
-
*/
|
|
457
|
-
function decodeXmlEntities(text: string): string {
|
|
458
|
-
return text
|
|
459
|
-
.replace(/&amp;/g, '&')
|
|
460
|
-
.replace(/&lt;/g, '<')
|
|
461
|
-
.replace(/&gt;/g, '>')
|
|
462
|
-
.replace(/&quot;/g, '"')
|
|
463
|
-
.replace(/&apos;/g, "'")
|
|
464
|
-
.replace(/&#(\d+);/g, (_, code) => String.fromCharCode(parseInt(code, 10)))
|
|
465
|
-
.replace(/&#x([0-9a-fA-F]+);/g, (_, code) => String.fromCharCode(parseInt(code, 16)));
|
|
466
|
-
}
|
|
467
|
-
|
|
468
|
-
/**
|
|
469
|
-
* Extract text content from a Word XML cell
|
|
470
|
-
*/
|
|
471
|
-
function extractCellText(cellXml: string): string {
|
|
472
|
-
const parts: string[] = [];
|
|
473
|
-
|
|
474
|
-
// Check for OMML math - replace with [math] placeholder
|
|
475
|
-
if (cellXml.includes('<m:oMath')) {
|
|
476
|
-
// Try to extract the text representation of math
|
|
477
|
-
const mathTextMatches = cellXml.match(/<m:t>([^<]*)<\/m:t>/g) || [];
|
|
478
|
-
if (mathTextMatches.length > 0) {
|
|
479
|
-
const mathText = mathTextMatches.map((t) => t.replace(/<[^>]+>/g, '')).join('');
|
|
480
|
-
parts.push(mathText);
|
|
481
|
-
} else {
|
|
482
|
-
parts.push('[math]');
|
|
483
|
-
}
|
|
484
|
-
}
|
|
485
|
-
|
|
486
|
-
// Extract regular text from w:t elements
|
|
487
|
-
const textMatches = cellXml.match(/<w:t[^>]*>([^<]*)<\/w:t>/g) || [];
|
|
488
|
-
for (const match of textMatches) {
|
|
489
|
-
const text = match.replace(/<[^>]+>/g, '');
|
|
490
|
-
if (text) {
|
|
491
|
-
parts.push(text);
|
|
492
|
-
}
|
|
493
|
-
}
|
|
494
|
-
|
|
495
|
-
let result = parts.join('').trim();
|
|
496
|
-
result = decodeXmlEntities(result);
|
|
497
|
-
|
|
498
|
-
// Escape pipe characters in cell content (would break table)
|
|
499
|
-
result = result.replace(/\|/g, '\\|');
|
|
500
|
-
|
|
501
|
-
return result;
|
|
502
|
-
}
|
|
503
|
-
|
|
504
|
-
/**
|
|
505
|
-
* Parse a table row, handling merged cells (gridSpan)
|
|
506
|
-
*/
|
|
507
|
-
function parseTableRow(rowXml: string, expectedCols: number): ParsedRow {
|
|
508
|
-
// Match cells - handle both <w:tc> and <w:tc ...>
|
|
509
|
-
const cellMatches = rowXml.match(/<w:tc(?:\s[^>]*)?>[\s\S]*?<\/w:tc>/g) || [];
|
|
510
|
-
const cells: string[] = [];
|
|
511
|
-
const colSpans: number[] = [];
|
|
512
|
-
|
|
513
|
-
for (const cellXml of cellMatches) {
|
|
514
|
-
// Check for horizontal merge (gridSpan)
|
|
515
|
-
const gridSpanMatch = cellXml.match(/<w:gridSpan\s+w:val="(\d+)"/);
|
|
516
|
-
const span = gridSpanMatch ? parseInt(gridSpanMatch[1], 10) : 1;
|
|
517
|
-
|
|
518
|
-
// Check for vertical merge continuation (vMerge without restart)
|
|
519
|
-
// If vMerge is present without w:val="restart", it's a continuation - use empty
|
|
520
|
-
const vMergeMatch = cellXml.match(/<w:vMerge(?:\s+w:val="([^"]+)")?/);
|
|
521
|
-
const isVMergeContinuation = vMergeMatch && vMergeMatch[1] !== 'restart';
|
|
522
|
-
|
|
523
|
-
const cellText = isVMergeContinuation ? '' : extractCellText(cellXml);
|
|
524
|
-
|
|
525
|
-
// Add the cell content
|
|
526
|
-
cells.push(cellText);
|
|
527
|
-
colSpans.push(span);
|
|
528
|
-
|
|
529
|
-
// For gridSpan > 1, add empty cells to maintain column alignment
|
|
530
|
-
for (let i = 1; i < span; i++) {
|
|
531
|
-
cells.push('');
|
|
532
|
-
colSpans.push(0); // 0 indicates this is a spanned cell
|
|
533
|
-
}
|
|
534
|
-
}
|
|
535
|
-
|
|
536
|
-
return { cells, colSpans };
|
|
537
|
-
}
|
|
538
|
-
|
|
539
|
-
/**
|
|
540
|
-
* Determine table grid column count from table XML
|
|
541
|
-
*/
|
|
542
|
-
function getTableGridCols(tableXml: string): number {
|
|
543
|
-
// Try to get from tblGrid
|
|
544
|
-
const gridColMatches = tableXml.match(/<w:gridCol/g) || [];
|
|
545
|
-
if (gridColMatches.length > 0) {
|
|
546
|
-
return gridColMatches.length;
|
|
547
|
-
}
|
|
548
|
-
|
|
549
|
-
// Fallback: count max cells in any row
|
|
550
|
-
const rowMatches = tableXml.match(/<w:tr[\s\S]*?<\/w:tr>/g) || [];
|
|
551
|
-
let maxCols = 0;
|
|
552
|
-
for (const rowXml of rowMatches) {
|
|
553
|
-
const { cells } = parseTableRow(rowXml, 0);
|
|
554
|
-
maxCols = Math.max(maxCols, cells.length);
|
|
555
|
-
}
|
|
556
|
-
return maxCols;
|
|
557
|
-
}
|
|
558
|
-
|
|
559
|
-
/**
|
|
560
|
-
* Extract tables directly from Word document XML and convert to markdown pipe tables
|
|
561
|
-
*/
|
|
562
|
-
export async function extractWordTables(docxPath: string): Promise<WordTable[]> {
|
|
563
|
-
const AdmZip = (await import('adm-zip')).default;
|
|
564
|
-
const tables: WordTable[] = [];
|
|
565
|
-
|
|
566
|
-
try {
|
|
567
|
-
const zip = new AdmZip(docxPath);
|
|
568
|
-
const docEntry = zip.getEntry('word/document.xml');
|
|
569
|
-
|
|
570
|
-
if (!docEntry) {
|
|
571
|
-
return tables;
|
|
572
|
-
}
|
|
573
|
-
|
|
574
|
-
const xml = docEntry.getData().toString('utf8');
|
|
575
|
-
|
|
576
|
-
// Find all table elements
|
|
577
|
-
const tableMatches = xml.match(/<w:tbl>[\s\S]*?<\/w:tbl>/g) || [];
|
|
578
|
-
|
|
579
|
-
for (const tableXml of tableMatches) {
|
|
580
|
-
// Determine expected column count from grid
|
|
581
|
-
const expectedCols = getTableGridCols(tableXml);
|
|
582
|
-
|
|
583
|
-
// Extract rows
|
|
584
|
-
const rowMatches = tableXml.match(/<w:tr[\s\S]*?<\/w:tr>/g) || [];
|
|
585
|
-
const rows: string[][] = [];
|
|
586
|
-
|
|
587
|
-
for (const rowXml of rowMatches) {
|
|
588
|
-
const { cells } = parseTableRow(rowXml, expectedCols);
|
|
589
|
-
if (cells.length > 0) {
|
|
590
|
-
rows.push(cells);
|
|
591
|
-
}
|
|
592
|
-
}
|
|
593
|
-
|
|
594
|
-
if (rows.length > 0) {
|
|
595
|
-
// Convert to markdown pipe table
|
|
596
|
-
const markdown = convertRowsToMarkdownTable(rows);
|
|
597
|
-
tables.push({ markdown, rowCount: rows.length, colCount: expectedCols || rows[0]?.length || 0 });
|
|
598
|
-
}
|
|
599
|
-
}
|
|
600
|
-
} catch (err: any) {
|
|
601
|
-
console.error('Error extracting tables from Word:', err.message);
|
|
602
|
-
}
|
|
603
|
-
|
|
604
|
-
return tables;
|
|
605
|
-
}
|
|
606
|
-
|
|
607
|
-
/**
|
|
608
|
-
* Convert array of rows (each row is array of cell strings) to markdown pipe table
|
|
609
|
-
*/
|
|
610
|
-
function convertRowsToMarkdownTable(rows: string[][]): string {
|
|
611
|
-
if (rows.length === 0) return '';
|
|
612
|
-
|
|
613
|
-
// Normalize column count (use max across all rows)
|
|
614
|
-
const colCount = Math.max(...rows.map((r) => r.length));
|
|
615
|
-
|
|
616
|
-
// Pad rows to have consistent column count
|
|
617
|
-
const normalizedRows = rows.map((row) => {
|
|
618
|
-
while (row.length < colCount) {
|
|
619
|
-
row.push('');
|
|
620
|
-
}
|
|
621
|
-
return row;
|
|
622
|
-
});
|
|
623
|
-
|
|
624
|
-
// Build markdown table
|
|
625
|
-
const lines: string[] = [];
|
|
626
|
-
|
|
627
|
-
// Header row
|
|
628
|
-
const header = normalizedRows[0];
|
|
629
|
-
lines.push('| ' + header.join(' | ') + ' |');
|
|
630
|
-
|
|
631
|
-
// Separator row
|
|
632
|
-
lines.push('|' + header.map(() => '---').join('|') + '|');
|
|
633
|
-
|
|
634
|
-
// Data rows
|
|
635
|
-
for (let i = 1; i < normalizedRows.length; i++) {
|
|
636
|
-
lines.push('| ' + normalizedRows[i].join(' | ') + ' |');
|
|
637
|
-
}
|
|
638
|
-
|
|
639
|
-
return lines.join('\n');
|
|
640
|
-
}
|
|
641
|
-
|
|
642
|
-
/**
|
|
643
|
-
* Extract text from Word document using pandoc with track changes preserved
|
|
644
|
-
*/
|
|
645
|
-
export async function extractFromWord(
|
|
646
|
-
docxPath: string,
|
|
647
|
-
options: ExtractFromWordOptions = {}
|
|
648
|
-
): Promise<ExtractFromWordResult> {
|
|
649
|
-
let text: string;
|
|
650
|
-
let messages: ExtractMessage[] = [];
|
|
651
|
-
let extractedMedia: string[] = [];
|
|
652
|
-
let hasTrackChanges = false;
|
|
653
|
-
let trackChangeStats = { insertions: 0, deletions: 0 };
|
|
654
|
-
|
|
655
|
-
// Determine media extraction directory
|
|
656
|
-
const docxDir = path.dirname(docxPath);
|
|
657
|
-
const mediaDir = options.mediaDir || path.join(docxDir, 'media');
|
|
658
|
-
|
|
659
|
-
// Skip media extraction if figures already exist (e.g., when re-importing with existing source)
|
|
660
|
-
const skipMediaExtraction = options.skipMediaExtraction || false;
|
|
661
|
-
|
|
662
|
-
// Extract tables directly from Word XML (reliable, no heuristics)
|
|
663
|
-
const wordTables = await extractWordTables(docxPath);
|
|
664
|
-
|
|
665
|
-
// Try pandoc first with --track-changes=all to preserve reviewer edits
|
|
666
|
-
try {
|
|
667
|
-
// Build pandoc command
|
|
668
|
-
let pandocCmd = `pandoc "${docxPath}" -t markdown --wrap=none --track-changes=all`;
|
|
669
|
-
if (!skipMediaExtraction) {
|
|
670
|
-
pandocCmd += ` --extract-media="${mediaDir}"`;
|
|
671
|
-
}
|
|
672
|
-
|
|
673
|
-
const { stdout } = await execAsync(pandocCmd, { maxBuffer: 50 * 1024 * 1024 });
|
|
674
|
-
text = stdout;
|
|
675
|
-
|
|
676
|
-
// Convert pandoc's track change format to CriticMarkup
|
|
677
|
-
const origLength = text.length;
|
|
678
|
-
|
|
679
|
-
// Use a more robust pattern that handles nested content
|
|
680
|
-
text = text.replace(/\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]\{\.insertion[^}]*\}/g, (match, content) => {
|
|
681
|
-
if (content.trim()) {
|
|
682
|
-
trackChangeStats.insertions++;
|
|
683
|
-
return `{++${content}++}`;
|
|
684
|
-
}
|
|
685
|
-
return ''; // Empty insertions are removed
|
|
686
|
-
});
|
|
687
|
-
|
|
688
|
-
text = text.replace(/\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]\{\.deletion[^}]*\}/g, (match, content) => {
|
|
689
|
-
if (content.trim()) {
|
|
690
|
-
trackChangeStats.deletions++;
|
|
691
|
-
return `{--${content}--}`;
|
|
692
|
-
}
|
|
693
|
-
return ''; // Empty deletions are removed
|
|
694
|
-
});
|
|
695
|
-
|
|
696
|
-
// Handle any remaining pandoc track change patterns
|
|
697
|
-
let prevText;
|
|
698
|
-
do {
|
|
699
|
-
prevText = text;
|
|
700
|
-
text = text.replace(/\[([^\]]*)\]\{\.insertion[^}]*\}/g, (match, content) => {
|
|
701
|
-
if (content.trim()) {
|
|
702
|
-
trackChangeStats.insertions++;
|
|
703
|
-
return `{++${content}++}`;
|
|
704
|
-
}
|
|
705
|
-
return '';
|
|
706
|
-
});
|
|
707
|
-
text = text.replace(/\[([^\]]*)\]\{\.deletion[^}]*\}/g, (match, content) => {
|
|
708
|
-
if (content.trim()) {
|
|
709
|
-
trackChangeStats.deletions++;
|
|
710
|
-
return `{--${content}--}`;
|
|
711
|
-
}
|
|
712
|
-
return '';
|
|
713
|
-
});
|
|
714
|
-
} while (text !== prevText);
|
|
715
|
-
|
|
716
|
-
// Handle pandoc comment patterns - remove comment text from body
|
|
717
|
-
text = text.replace(/\[[^\]]*\]\{\.comment-start[^}]*\}/g, '');
|
|
718
|
-
text = text.replace(/\[\]\{\.comment-end[^}]*\}/g, '');
|
|
719
|
-
|
|
720
|
-
// Also handle {.mark} spans
|
|
721
|
-
text = text.replace(/\[([^\]]*)\]\{\.mark\}/g, '$1');
|
|
722
|
-
|
|
723
|
-
hasTrackChanges = trackChangeStats.insertions > 0 || trackChangeStats.deletions > 0;
|
|
724
|
-
|
|
725
|
-
if (hasTrackChanges) {
|
|
726
|
-
messages.push({
|
|
727
|
-
type: 'info',
|
|
728
|
-
message: `Found ${trackChangeStats.insertions} insertion(s) and ${trackChangeStats.deletions} deletion(s) from track changes`
|
|
729
|
-
});
|
|
730
|
-
}
|
|
731
|
-
|
|
732
|
-
// Find extracted media files
|
|
733
|
-
const mediaSubdir = path.join(mediaDir, 'media');
|
|
734
|
-
if (fs.existsSync(mediaSubdir)) {
|
|
735
|
-
extractedMedia = fs.readdirSync(mediaSubdir)
|
|
736
|
-
.filter(f => /\.(png|jpg|jpeg|gif|svg|emf|wmf|tiff?)$/i.test(f))
|
|
737
|
-
.map(f => path.join(mediaSubdir, f));
|
|
738
|
-
|
|
739
|
-
if (extractedMedia.length > 0) {
|
|
740
|
-
messages.push({
|
|
741
|
-
type: 'info',
|
|
742
|
-
message: `Extracted ${extractedMedia.length} image(s) to ${mediaSubdir}`
|
|
743
|
-
});
|
|
744
|
-
}
|
|
745
|
-
}
|
|
746
|
-
} catch (pandocErr: any) {
|
|
747
|
-
// Pandoc not available — use XML-based extraction with track change support
|
|
748
|
-
const { extractPlainTextWithTrackChanges } = await import('./word.js');
|
|
749
|
-
const { getInstallInstructions } = await import('./dependencies.js');
|
|
750
|
-
const installCmd = getInstallInstructions('pandoc');
|
|
751
|
-
|
|
752
|
-
const xmlResult = await extractPlainTextWithTrackChanges(docxPath);
|
|
753
|
-
text = xmlResult.text;
|
|
754
|
-
hasTrackChanges = xmlResult.hasTrackChanges;
|
|
755
|
-
trackChangeStats = xmlResult.stats;
|
|
756
|
-
|
|
757
|
-
if (hasTrackChanges) {
|
|
758
|
-
messages.push({
|
|
759
|
-
type: 'warning',
|
|
760
|
-
message: `Pandoc not installed. Using built-in XML extractor (${trackChangeStats.insertions} insertions, ${trackChangeStats.deletions} deletions preserved). Formatting may differ. Install pandoc for best results: ${installCmd}`
|
|
761
|
-
});
|
|
762
|
-
} else {
|
|
763
|
-
messages.push({
|
|
764
|
-
type: 'warning',
|
|
765
|
-
message: `Pandoc not installed. Using built-in XML extractor (no track changes found). Install pandoc for better formatting: ${installCmd}`
|
|
766
|
-
});
|
|
767
|
-
}
|
|
768
|
-
}
|
|
769
|
-
|
|
770
|
-
// Extract comments directly from docx XML
|
|
771
|
-
const comments = await extractWordComments(docxPath);
|
|
772
|
-
|
|
773
|
-
// Extract comment anchor texts
|
|
774
|
-
const { anchors } = await extractCommentAnchors(docxPath);
|
|
775
|
-
|
|
776
|
-
return {
|
|
777
|
-
text,
|
|
778
|
-
comments,
|
|
779
|
-
anchors,
|
|
780
|
-
messages,
|
|
781
|
-
extractedMedia,
|
|
782
|
-
tables: wordTables,
|
|
783
|
-
hasTrackChanges,
|
|
784
|
-
trackChangeStats,
|
|
785
|
-
};
|
|
786
|
-
}
|
|
787
|
-
|
|
788
178
|
/**
|
|
789
179
|
* Insert comments into markdown text based on anchor texts with context
|
|
790
180
|
*/
|
|
@@ -794,165 +184,14 @@ export function insertCommentsIntoMarkdown(
|
|
|
794
184
|
anchors: Map<string, CommentAnchorData | string>,
|
|
795
185
|
options: InsertCommentsOptions = {}
|
|
796
186
|
): string {
|
|
797
|
-
const { quiet = false, sectionBoundary = null } = options;
|
|
187
|
+
const { quiet = false, sectionBoundary = null, wrapAnchor = true } = options;
|
|
798
188
|
let result = markdown;
|
|
799
189
|
let unmatchedCount = 0;
|
|
800
190
|
const duplicateWarnings: string[] = [];
|
|
801
191
|
const usedPositions = new Set<number>(); // For tie-breaking: track used positions
|
|
802
192
|
|
|
803
|
-
//
|
|
804
|
-
|
|
805
|
-
return text
|
|
806
|
-
.replace(/\{\+\+([^+]*)\+\+\}/g, '$1') // insertions: keep inserted text
|
|
807
|
-
.replace(/\{--([^-]*)--\}/g, '') // deletions: remove deleted text
|
|
808
|
-
.replace(/\{~~([^~]*)~>([^~]*)~~\}/g, '$2') // substitutions: keep new text
|
|
809
|
-
.replace(/\{>>[^<]*<<\}/g, '') // comments: remove
|
|
810
|
-
.replace(/\[([^\]]*)\]\{\.mark\}/g, '$1'); // marked text: keep text
|
|
811
|
-
}
|
|
812
|
-
|
|
813
|
-
// Helper: Find anchor in text with multiple fallback strategies
|
|
814
|
-
function findAnchorInText(anchor: string, text: string, before: string = '', after: string = ''): AnchorSearchResult {
|
|
815
|
-
// If anchor is empty, skip directly to context-based matching
|
|
816
|
-
if (!anchor || anchor.trim().length === 0) {
|
|
817
|
-
// Jump to context-based strategies (Strategy 5)
|
|
818
|
-
if (before || after) {
|
|
819
|
-
const beforeLower = (before || '').toLowerCase();
|
|
820
|
-
const afterLower = (after || '').toLowerCase();
|
|
821
|
-
const textLower = text.toLowerCase();
|
|
822
|
-
|
|
823
|
-
if (before && after) {
|
|
824
|
-
const beforeIdx = textLower.indexOf(beforeLower.slice(-50));
|
|
825
|
-
if (beforeIdx !== -1) {
|
|
826
|
-
const searchStart = beforeIdx + beforeLower.slice(-50).length;
|
|
827
|
-
const afterIdx = textLower.indexOf(afterLower.slice(0, 50), searchStart);
|
|
828
|
-
if (afterIdx !== -1 && afterIdx - searchStart < 500) {
|
|
829
|
-
return { occurrences: [searchStart], matchedAnchor: null, strategy: 'context-both' };
|
|
830
|
-
}
|
|
831
|
-
}
|
|
832
|
-
}
|
|
833
|
-
|
|
834
|
-
if (before) {
|
|
835
|
-
const beforeIdx = textLower.lastIndexOf(beforeLower.slice(-30));
|
|
836
|
-
if (beforeIdx !== -1) {
|
|
837
|
-
return { occurrences: [beforeIdx + beforeLower.slice(-30).length], matchedAnchor: null, strategy: 'context-before' };
|
|
838
|
-
}
|
|
839
|
-
}
|
|
840
|
-
|
|
841
|
-
if (after) {
|
|
842
|
-
const afterIdx = textLower.indexOf(afterLower.slice(0, 30));
|
|
843
|
-
if (afterIdx !== -1) {
|
|
844
|
-
return { occurrences: [afterIdx], matchedAnchor: null, strategy: 'context-after' };
|
|
845
|
-
}
|
|
846
|
-
}
|
|
847
|
-
}
|
|
848
|
-
return { occurrences: [], matchedAnchor: null, strategy: 'empty-anchor' };
|
|
849
|
-
}
|
|
850
|
-
|
|
851
|
-
const anchorLower = anchor.toLowerCase();
|
|
852
|
-
const textLower = text.toLowerCase();
|
|
853
|
-
|
|
854
|
-
// Strategy 1: Direct match
|
|
855
|
-
let occurrences = findAllOccurrences(textLower, anchorLower);
|
|
856
|
-
if (occurrences.length > 0) {
|
|
857
|
-
return { occurrences, matchedAnchor: anchor, strategy: 'direct' };
|
|
858
|
-
}
|
|
859
|
-
|
|
860
|
-
// Strategy 2: Normalized whitespace
|
|
861
|
-
const normalizedAnchor = anchor.replace(/\s+/g, ' ').toLowerCase();
|
|
862
|
-
const normalizedText = text.replace(/\s+/g, ' ').toLowerCase();
|
|
863
|
-
let idx = normalizedText.indexOf(normalizedAnchor);
|
|
864
|
-
if (idx !== -1) {
|
|
865
|
-
return { occurrences: [idx], matchedAnchor: anchor, strategy: 'normalized' };
|
|
866
|
-
}
|
|
867
|
-
|
|
868
|
-
// Strategy 3: Try matching in stripped CriticMarkup version
|
|
869
|
-
const strippedText = stripCriticMarkup(text);
|
|
870
|
-
const strippedLower = strippedText.toLowerCase();
|
|
871
|
-
occurrences = findAllOccurrences(strippedLower, anchorLower);
|
|
872
|
-
if (occurrences.length > 0) {
|
|
873
|
-
return { occurrences, matchedAnchor: anchor, strategy: 'stripped', stripped: true };
|
|
874
|
-
}
|
|
875
|
-
|
|
876
|
-
// Strategy 4: First N words of anchor (for long anchors)
|
|
877
|
-
const words = anchor.split(/\s+/);
|
|
878
|
-
if (words.length > 3) {
|
|
879
|
-
for (let n = Math.min(6, words.length); n >= 3; n--) {
|
|
880
|
-
const partialAnchor = words.slice(0, n).join(' ').toLowerCase();
|
|
881
|
-
if (partialAnchor.length >= 15) {
|
|
882
|
-
occurrences = findAllOccurrences(textLower, partialAnchor);
|
|
883
|
-
if (occurrences.length > 0) {
|
|
884
|
-
return { occurrences, matchedAnchor: words.slice(0, n).join(' '), strategy: 'partial-start' };
|
|
885
|
-
}
|
|
886
|
-
occurrences = findAllOccurrences(strippedLower, partialAnchor);
|
|
887
|
-
if (occurrences.length > 0) {
|
|
888
|
-
return { occurrences, matchedAnchor: words.slice(0, n).join(' '), strategy: 'partial-start-stripped', stripped: true };
|
|
889
|
-
}
|
|
890
|
-
}
|
|
891
|
-
}
|
|
892
|
-
}
|
|
893
|
-
|
|
894
|
-
// Strategy 5: Use context (before/after) to find approximate position
|
|
895
|
-
if (before || after) {
|
|
896
|
-
const beforeLower = before.toLowerCase();
|
|
897
|
-
const afterLower = after.toLowerCase();
|
|
898
|
-
|
|
899
|
-
if (before && after) {
|
|
900
|
-
const beforeIdx = textLower.indexOf(beforeLower.slice(-50));
|
|
901
|
-
if (beforeIdx !== -1) {
|
|
902
|
-
const searchStart = beforeIdx + beforeLower.slice(-50).length;
|
|
903
|
-
const afterIdx = textLower.indexOf(afterLower.slice(0, 50), searchStart);
|
|
904
|
-
if (afterIdx !== -1 && afterIdx - searchStart < 500) {
|
|
905
|
-
return { occurrences: [searchStart], matchedAnchor: null, strategy: 'context-both' };
|
|
906
|
-
}
|
|
907
|
-
}
|
|
908
|
-
}
|
|
909
|
-
|
|
910
|
-
if (before) {
|
|
911
|
-
const beforeIdx = textLower.lastIndexOf(beforeLower.slice(-30));
|
|
912
|
-
if (beforeIdx !== -1) {
|
|
913
|
-
return { occurrences: [beforeIdx + beforeLower.slice(-30).length], matchedAnchor: null, strategy: 'context-before' };
|
|
914
|
-
}
|
|
915
|
-
}
|
|
916
|
-
|
|
917
|
-
if (after) {
|
|
918
|
-
const afterIdx = textLower.indexOf(afterLower.slice(0, 30));
|
|
919
|
-
if (afterIdx !== -1) {
|
|
920
|
-
return { occurrences: [afterIdx], matchedAnchor: null, strategy: 'context-after' };
|
|
921
|
-
}
|
|
922
|
-
}
|
|
923
|
-
}
|
|
924
|
-
|
|
925
|
-
// Strategy 6: Try splitting anchor on common transition words
|
|
926
|
-
const splitPatterns = [' ', ', ', '. ', ' - ', ' – '];
|
|
927
|
-
for (const sep of splitPatterns) {
|
|
928
|
-
if (anchor.includes(sep)) {
|
|
929
|
-
const parts = anchor.split(sep).filter(p => p.length >= 4);
|
|
930
|
-
for (const part of parts) {
|
|
931
|
-
const partLower = part.toLowerCase();
|
|
932
|
-
occurrences = findAllOccurrences(textLower, partLower);
|
|
933
|
-
if (occurrences.length > 0 && occurrences.length < 5) {
|
|
934
|
-
return { occurrences, matchedAnchor: part, strategy: 'split-match' };
|
|
935
|
-
}
|
|
936
|
-
}
|
|
937
|
-
}
|
|
938
|
-
}
|
|
939
|
-
|
|
940
|
-
return { occurrences: [], matchedAnchor: null, strategy: 'failed' };
|
|
941
|
-
}
|
|
942
|
-
|
|
943
|
-
// Helper: Find all occurrences of needle in haystack
|
|
944
|
-
function findAllOccurrences(haystack: string, needle: string): number[] {
|
|
945
|
-
if (!needle || needle.length === 0) {
|
|
946
|
-
return [];
|
|
947
|
-
}
|
|
948
|
-
const occurrences: number[] = [];
|
|
949
|
-
let idx = 0;
|
|
950
|
-
while ((idx = haystack.indexOf(needle, idx)) !== -1) {
|
|
951
|
-
occurrences.push(idx);
|
|
952
|
-
idx += 1;
|
|
953
|
-
}
|
|
954
|
-
return occurrences;
|
|
955
|
-
}
|
|
193
|
+
// Anchor matching primitives live in lib/anchor-match.ts so that
|
|
194
|
+
// `rev verify-anchors` can use the same strategies for drift reporting.
|
|
956
195
|
|
|
957
196
|
// Get all positions in order (for sequential tie-breaking)
|
|
958
197
|
const commentsWithPositions = comments.map((c): CommentWithPos => {
|
|
@@ -1108,18 +347,24 @@ export function insertCommentsIntoMarkdown(
|
|
|
1108
347
|
// Sort by position descending (insert from end to avoid offset issues)
|
|
1109
348
|
matched.sort((a, b) => b.pos - a.pos);
|
|
1110
349
|
|
|
1111
|
-
// Insert each comment
|
|
350
|
+
// Insert each comment. With `wrapAnchor` (the default), the anchor text
|
|
351
|
+
// gets wrapped in `[anchor]{.mark}` so the rebuilt docx restores the
|
|
352
|
+
// original Word comment range. Without it, the comment block is inserted
|
|
353
|
+
// adjacent to the anchor and prose stays untouched — required for
|
|
354
|
+
// comments-only sync where multiple comments may share one anchor.
|
|
1112
355
|
for (const c of matched) {
|
|
1113
356
|
const comment = `{>>${c.author}: ${c.text}<<}`;
|
|
1114
|
-
if (c.anchorText && c.anchorEnd) {
|
|
1115
|
-
// Replace anchor text with: {>>comment<<}[anchor]{.mark}
|
|
357
|
+
if (wrapAnchor && c.anchorText && c.anchorEnd) {
|
|
1116
358
|
const before = result.slice(0, c.pos);
|
|
1117
359
|
const anchor = result.slice(c.pos, c.anchorEnd);
|
|
1118
360
|
const after = result.slice(c.anchorEnd);
|
|
1119
361
|
result = before + comment + `[${anchor}]{.mark}` + after;
|
|
1120
362
|
} else {
|
|
1121
|
-
//
|
|
1122
|
-
|
|
363
|
+
// Insert comment at the anchor position with no surrounding whitespace
|
|
364
|
+
// tweaks; CriticMarkup blocks are invisible to readers, and adding a
|
|
365
|
+
// leading space would shift prose byte-for-byte (relevant when callers
|
|
366
|
+
// verify that --comments-only didn't touch the original).
|
|
367
|
+
result = result.slice(0, c.pos) + comment + result.slice(c.pos);
|
|
1123
368
|
}
|
|
1124
369
|
}
|
|
1125
370
|
|
|
@@ -1139,647 +384,6 @@ export function insertCommentsIntoMarkdown(
|
|
|
1139
384
|
return result;
|
|
1140
385
|
}
|
|
1141
386
|
|
|
1142
|
-
/**
|
|
1143
|
-
* Fix citation and math annotations by preserving original markdown syntax
|
|
1144
|
-
*/
|
|
1145
|
-
function fixCitationAnnotations(text: string, originalMd: string): string {
|
|
1146
|
-
// Fix math annotations - preserve inline and display math
|
|
1147
|
-
text = text.replace(/\{--(\$[^$]+\$)--\}/g, '$1');
|
|
1148
|
-
text = text.replace(/\{--(\$\$[^$]+\$\$)--\}/g, '$1');
|
|
1149
|
-
|
|
1150
|
-
text = text.replace(/\{~~(\$[^$]+\$)~>[^~]+~~\}/g, '$1');
|
|
1151
|
-
text = text.replace(/\{~~(\$\$[^$]+\$\$)~>[^~]+~~\}/g, '$1');
|
|
1152
|
-
|
|
1153
|
-
// Extract all citations from original markdown
|
|
1154
|
-
const citationPattern = /\[@[^\]]+\]/g;
|
|
1155
|
-
const originalCitations = [...originalMd.matchAll(citationPattern)].map(m => m[0]);
|
|
1156
|
-
|
|
1157
|
-
// Fix substitutions where left side has markdown citation
|
|
1158
|
-
text = text.replace(/\{~~(\[@[^\]]+\])~>[^~]+~~\}/g, '$1');
|
|
1159
|
-
|
|
1160
|
-
// Fix substitutions where left side STARTS with markdown citation
|
|
1161
|
-
text = text.replace(/\{~~(\[@[^\]]+\])\s*([^~]*)~>([^~]*)~~\}/g, (match, cite, oldText, newText) => {
|
|
1162
|
-
if (oldText.trim() === '' && newText.trim() === '') {
|
|
1163
|
-
return cite;
|
|
1164
|
-
}
|
|
1165
|
-
if (oldText.trim() || newText.trim()) {
|
|
1166
|
-
return cite + (oldText.trim() !== newText.trim() ? ` {~~${oldText.trim()}~>${newText.trim()}~~}` : ` ${newText}`);
|
|
1167
|
-
}
|
|
1168
|
-
return cite;
|
|
1169
|
-
});
|
|
1170
|
-
|
|
1171
|
-
// Fix deletions of markdown citations
|
|
1172
|
-
text = text.replace(/\{--(\[@[^\]]+\])--\}/g, '$1');
|
|
1173
|
-
|
|
1174
|
-
// Fix insertions of rendered citations
|
|
1175
|
-
text = text.replace(/\{\+\+\([A-Z][^)]*\d{4}[^)]*\)\+\+\}/g, '');
|
|
1176
|
-
|
|
1177
|
-
// Clean up broken multi-part substitutions
|
|
1178
|
-
text = text.replace(/\{~~(@[A-Za-z]+\d{4})~>[^~]+~~\}/g, '[$1]');
|
|
1179
|
-
|
|
1180
|
-
// Fix citations split across substitution boundaries
|
|
1181
|
-
text = text.replace(/\{~~\[@~>[^~]*~~\}([A-Za-z]+\d{4})\]/g, '[@$1]');
|
|
1182
|
-
|
|
1183
|
-
// Clean up any remaining partial citations
|
|
1184
|
-
text = text.replace(/\{~~;\s*@([A-Za-z]+\d{4})\]~>[^~]*~~\}/g, '; [@$1]');
|
|
1185
|
-
|
|
1186
|
-
// Remove rendered citation insertions (with Unicode support)
|
|
1187
|
-
text = text.replace(/\{\+\+\(\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\+\+\}/gu, '');
|
|
1188
|
-
text = text.replace(/\{\+\+\(\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');
|
|
1189
|
-
|
|
1190
|
-
// Trailing citation fragments
|
|
1191
|
-
text = text.replace(/\{\+\+\d{4}[a-z]?(?:[;,]\s*(?:\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+)?\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');
|
|
1192
|
-
text = text.replace(/\{\+\+\d{4}[a-z]?(?:[;,]\s*(?:\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+)?\d{4}[a-z]?)*\)\s*\+\+\}/gu, '');
|
|
1193
|
-
|
|
1194
|
-
// Just year with closing paren
|
|
1195
|
-
text = text.replace(/\{\+\+\d{4}[a-z]?\)\.\s*\+\+\}/g, '');
|
|
1196
|
-
text = text.replace(/\{\+\+\d{4}[a-z]?\)\s*\+\+\}/g, '');
|
|
1197
|
-
|
|
1198
|
-
// Leading citation fragments
|
|
1199
|
-
text = text.replace(/\{\+\+\(?\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s*\+\+\}/gu, '');
|
|
1200
|
-
|
|
1201
|
-
// Semicolon-separated fragments
|
|
1202
|
-
text = text.replace(/\{\+\+[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?\+\+\}/gu, '');
|
|
1203
|
-
|
|
1204
|
-
// Year ranges with authors
|
|
1205
|
-
text = text.replace(/\{\+\+\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\s*\+\+\}/gu, '');
|
|
1206
|
-
text = text.replace(/\{\+\+\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');
|
|
1207
|
-
|
|
1208
|
-
// Clean up double spaces and orphaned punctuation
|
|
1209
|
-
text = text.replace(/ +/g, ' ');
|
|
1210
|
-
text = text.replace(/\s+\./g, '.');
|
|
1211
|
-
text = text.replace(/\s+,/g, ',');
|
|
1212
|
-
|
|
1213
|
-
// Final cleanup - remove empty annotations
|
|
1214
|
-
text = text.replace(/\{~~\s*~>\s*~~\}/g, '');
|
|
1215
|
-
text = text.replace(/\{\+\+\s*\+\+\}/g, '');
|
|
1216
|
-
text = text.replace(/\{--\s*--\}/g, '');
|
|
1217
|
-
|
|
1218
|
-
return text;
|
|
1219
|
-
}
|
|
1220
|
-
|
|
1221
|
-
/**
|
|
1222
|
-
* Strip markdown syntax to get plain text
|
|
1223
|
-
*/
|
|
1224
|
-
function stripMarkdownSyntax(md: string): string {
|
|
1225
|
-
return md
|
|
1226
|
-
.replace(/^---[\s\S]*?---\n*/m, '')
|
|
1227
|
-
.replace(/^#{1,6}\s+/gm, '')
|
|
1228
|
-
.replace(/(\*\*|__)(.*?)\1/g, '$2')
|
|
1229
|
-
.replace(/(\*|_)(.*?)\1/g, '$2')
|
|
1230
|
-
.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
|
|
1231
|
-
.replace(/!\[([^\]]*)\]\([^)]+\)/g, '')
|
|
1232
|
-
.replace(/`([^`]+)`/g, '$1')
|
|
1233
|
-
.replace(/```[\s\S]*?```/g, '')
|
|
1234
|
-
.replace(/^>\s*/gm, '')
|
|
1235
|
-
.replace(/^[-*_]{3,}\s*$/gm, '')
|
|
1236
|
-
.replace(/^[\s]*[-*+]\s+/gm, '')
|
|
1237
|
-
.replace(/^[\s]*\d+\.\s+/gm, '')
|
|
1238
|
-
.replace(/\|/g, ' ')
|
|
1239
|
-
.replace(/^[-:]+$/gm, '')
|
|
1240
|
-
.replace(/\n{3,}/g, '\n\n')
|
|
1241
|
-
.trim();
|
|
1242
|
-
}
|
|
1243
|
-
|
|
1244
|
-
/**
|
|
1245
|
-
* Generate annotated markdown by diffing original MD against Word text
|
|
1246
|
-
*/
|
|
1247
|
-
export function generateAnnotatedDiff(originalMd: string, wordText: string, author: string = 'Reviewer'): string {
|
|
1248
|
-
const normalizedOriginal = normalizeWhitespace(originalMd);
|
|
1249
|
-
const normalizedWord = normalizeWhitespace(wordText);
|
|
1250
|
-
|
|
1251
|
-
const changes = diffWords(normalizedOriginal, normalizedWord);
|
|
1252
|
-
|
|
1253
|
-
let result = '';
|
|
1254
|
-
|
|
1255
|
-
for (const part of changes) {
|
|
1256
|
-
if (part.added) {
|
|
1257
|
-
result += `{++${part.value}++}`;
|
|
1258
|
-
} else if (part.removed) {
|
|
1259
|
-
result += `{--${part.value}--}`;
|
|
1260
|
-
} else {
|
|
1261
|
-
result += part.value;
|
|
1262
|
-
}
|
|
1263
|
-
}
|
|
1264
|
-
|
|
1265
|
-
return result;
|
|
1266
|
-
}
|
|
1267
|
-
|
|
1268
|
-
/**
|
|
1269
|
-
* Inject Word tables (extracted from XML) into pandoc text output
|
|
1270
|
-
*/
|
|
1271
|
-
function injectWordTables(pandocText: string, wordTables: WordTable[]): string {
|
|
1272
|
-
if (!wordTables || wordTables.length === 0) {
|
|
1273
|
-
return pandocText;
|
|
1274
|
-
}
|
|
1275
|
-
|
|
1276
|
-
let result = pandocText;
|
|
1277
|
-
|
|
1278
|
-
for (const table of wordTables) {
|
|
1279
|
-
const firstLine = table.markdown.split('\n')[0];
|
|
1280
|
-
const headerCells = firstLine
|
|
1281
|
-
.split('|')
|
|
1282
|
-
.map((c) => c.trim())
|
|
1283
|
-
.filter((c) => c.length > 0);
|
|
1284
|
-
|
|
1285
|
-
if (headerCells.length === 0) continue;
|
|
1286
|
-
|
|
1287
|
-
const firstCell = headerCells[0];
|
|
1288
|
-
const startIdx = result.indexOf(firstCell);
|
|
1289
|
-
|
|
1290
|
-
if (startIdx === -1) continue;
|
|
1291
|
-
|
|
1292
|
-
const lastLine = table.markdown.split('\n').pop();
|
|
1293
|
-
const lastCells = lastLine!
|
|
1294
|
-
.split('|')
|
|
1295
|
-
.map((c) => c.trim())
|
|
1296
|
-
.filter((c) => c.length > 0);
|
|
1297
|
-
const lastCell = lastCells[lastCells.length - 1] || lastCells[0];
|
|
1298
|
-
|
|
1299
|
-
const endIdx = result.indexOf(lastCell, startIdx);
|
|
1300
|
-
if (endIdx === -1) continue;
|
|
1301
|
-
|
|
1302
|
-
let regionStart = result.lastIndexOf('\n\n', startIdx);
|
|
1303
|
-
if (regionStart === -1) regionStart = 0;
|
|
1304
|
-
else regionStart += 2;
|
|
1305
|
-
|
|
1306
|
-
let regionEnd = result.indexOf('\n\n', endIdx + lastCell.length);
|
|
1307
|
-
if (regionEnd === -1) regionEnd = result.length;
|
|
1308
|
-
|
|
1309
|
-
result = result.slice(0, regionStart) + table.markdown + '\n\n' + result.slice(regionEnd);
|
|
1310
|
-
}
|
|
1311
|
-
|
|
1312
|
-
return result;
|
|
1313
|
-
}
|
|
1314
|
-
|
|
1315
|
-
/**
|
|
1316
|
-
* Smart paragraph-level diff that preserves markdown structure
|
|
1317
|
-
*/
|
|
1318
|
-
export function generateSmartDiff(
|
|
1319
|
-
originalMd: string,
|
|
1320
|
-
wordText: string,
|
|
1321
|
-
author: string = 'Reviewer',
|
|
1322
|
-
options: GenerateSmartDiffOptions = {}
|
|
1323
|
-
): string {
|
|
1324
|
-
const { wordTables = [], imageRegistry = null } = options;
|
|
1325
|
-
|
|
1326
|
-
// Inject Word tables into pandoc output
|
|
1327
|
-
let wordTextWithTables = injectWordTables(wordText, wordTables);
|
|
1328
|
-
|
|
1329
|
-
// Protect markdown tables
|
|
1330
|
-
const { text: mdWithTablesProtected, tables } = protectTables(originalMd);
|
|
1331
|
-
|
|
1332
|
-
// Also protect tables in Word text
|
|
1333
|
-
const { text: wordWithTablesProtected, tables: wordTableBlocks } = protectTables(wordTextWithTables);
|
|
1334
|
-
|
|
1335
|
-
// Protect images
|
|
1336
|
-
const { text: mdWithImagesProtected, images: origImages } = protectImages(mdWithTablesProtected, imageRegistry);
|
|
1337
|
-
|
|
1338
|
-
const { text: wordWithImagesProtected, images: wordImages } = protectImages(wordWithTablesProtected, imageRegistry);
|
|
1339
|
-
|
|
1340
|
-
// Match Word images to original images
|
|
1341
|
-
const imageMapping = matchWordImagesToOriginal(origImages, wordImages, imageRegistry);
|
|
1342
|
-
|
|
1343
|
-
// Replace Word image placeholders with matching original placeholders
|
|
1344
|
-
let wordWithMappedImages = wordWithImagesProtected;
|
|
1345
|
-
for (const [wordPlaceholder, origPlaceholder] of imageMapping) {
|
|
1346
|
-
wordWithMappedImages = wordWithMappedImages.split(wordPlaceholder).join(origPlaceholder);
|
|
1347
|
-
}
|
|
1348
|
-
|
|
1349
|
-
// Protect figure/table anchors
|
|
1350
|
-
const { text: mdWithAnchorsProtected, anchors: figAnchors } = protectAnchors(mdWithImagesProtected);
|
|
1351
|
-
|
|
1352
|
-
// Protect cross-references
|
|
1353
|
-
const { text: mdWithXrefsProtected, crossrefs } = protectCrossrefs(mdWithAnchorsProtected);
|
|
1354
|
-
|
|
1355
|
-
// Protect math
|
|
1356
|
-
const { text: mdWithMathProtected, mathBlocks } = protectMath(mdWithXrefsProtected);
|
|
1357
|
-
|
|
1358
|
-
// Protect citations
|
|
1359
|
-
const { text: mdProtected, citations } = protectCitations(mdWithMathProtected);
|
|
1360
|
-
|
|
1361
|
-
// Replace rendered elements in Word text
|
|
1362
|
-
let wordProtected = wordWithMappedImages;
|
|
1363
|
-
wordProtected = replaceRenderedMath(wordProtected, mathBlocks);
|
|
1364
|
-
wordProtected = replaceRenderedCitations(wordProtected, citations.length);
|
|
1365
|
-
|
|
1366
|
-
// Split into paragraphs
|
|
1367
|
-
const originalParas = mdProtected.split(/\n\n+/);
|
|
1368
|
-
const wordParas = wordProtected.split(/\n\n+/);
|
|
1369
|
-
|
|
1370
|
-
const result: string[] = [];
|
|
1371
|
-
|
|
1372
|
-
// Try to match paragraphs intelligently
|
|
1373
|
-
let wordIdx = 0;
|
|
1374
|
-
|
|
1375
|
-
for (let i = 0; i < originalParas.length; i++) {
|
|
1376
|
-
const orig = originalParas[i] || '';
|
|
1377
|
-
const { prefix: mdPrefix, content: origContent } = extractMarkdownPrefix(orig.split('\n')[0]);
|
|
1378
|
-
|
|
1379
|
-
// Find best matching word paragraph
|
|
1380
|
-
let bestMatch = -1;
|
|
1381
|
-
let bestScore = 0;
|
|
1382
|
-
|
|
1383
|
-
for (let j = wordIdx; j < Math.min(wordIdx + 3, wordParas.length); j++) {
|
|
1384
|
-
const wordPara = wordParas[j] || '';
|
|
1385
|
-
const origWords = new Set(origContent.toLowerCase().split(/\s+/));
|
|
1386
|
-
const wordWords = wordPara.toLowerCase().split(/\s+/);
|
|
1387
|
-
const common = wordWords.filter((w) => origWords.has(w)).length;
|
|
1388
|
-
const score = common / Math.max(origWords.size, wordWords.length);
|
|
1389
|
-
|
|
1390
|
-
if (score > bestScore && score > 0.3) {
|
|
1391
|
-
bestScore = score;
|
|
1392
|
-
bestMatch = j;
|
|
1393
|
-
}
|
|
1394
|
-
}
|
|
1395
|
-
|
|
1396
|
-
if (bestMatch === -1) {
|
|
1397
|
-
if (mdPrefix && wordIdx < wordParas.length) {
|
|
1398
|
-
const wordPara = wordParas[wordIdx];
|
|
1399
|
-
if (wordPara.toLowerCase().includes(origContent.toLowerCase().slice(0, 20))) {
|
|
1400
|
-
bestMatch = wordIdx;
|
|
1401
|
-
}
|
|
1402
|
-
}
|
|
1403
|
-
}
|
|
1404
|
-
|
|
1405
|
-
if (bestMatch >= 0) {
|
|
1406
|
-
const word = wordParas[bestMatch];
|
|
1407
|
-
|
|
1408
|
-
const origStripped = stripMarkdownSyntax(orig);
|
|
1409
|
-
const wordNormalized = normalizeWhitespace(word);
|
|
1410
|
-
|
|
1411
|
-
if (origStripped === wordNormalized) {
|
|
1412
|
-
result.push(orig);
|
|
1413
|
-
} else {
|
|
1414
|
-
const changes = diffWords(origStripped, wordNormalized);
|
|
1415
|
-
let annotated = mdPrefix;
|
|
1416
|
-
|
|
1417
|
-
for (const part of changes) {
|
|
1418
|
-
if (part.added) {
|
|
1419
|
-
annotated += `{++${part.value}++}`;
|
|
1420
|
-
} else if (part.removed) {
|
|
1421
|
-
annotated += `{--${part.value}--}`;
|
|
1422
|
-
} else {
|
|
1423
|
-
annotated += part.value;
|
|
1424
|
-
}
|
|
1425
|
-
}
|
|
1426
|
-
|
|
1427
|
-
result.push(annotated);
|
|
1428
|
-
}
|
|
1429
|
-
|
|
1430
|
-
wordIdx = bestMatch + 1;
|
|
1431
|
-
} else {
|
|
1432
|
-
// Paragraph deleted entirely
|
|
1433
|
-
if (mdPrefix && mdPrefix.match(/^#{1,6}\s+/)) {
|
|
1434
|
-
result.push(orig);
|
|
1435
|
-
} else {
|
|
1436
|
-
result.push(`{--${orig}--}`);
|
|
1437
|
-
}
|
|
1438
|
-
}
|
|
1439
|
-
}
|
|
1440
|
-
|
|
1441
|
-
// Any remaining word paragraphs are additions
|
|
1442
|
-
for (let j = wordIdx; j < wordParas.length; j++) {
|
|
1443
|
-
const word = wordParas[j];
|
|
1444
|
-
if (word.trim()) {
|
|
1445
|
-
result.push(`{++${word}++}`);
|
|
1446
|
-
}
|
|
1447
|
-
}
|
|
1448
|
-
|
|
1449
|
-
// Restore protected content
|
|
1450
|
-
let finalResult = result.join('\n\n');
|
|
1451
|
-
finalResult = restoreCitations(finalResult, citations);
|
|
1452
|
-
finalResult = restoreMath(finalResult, mathBlocks);
|
|
1453
|
-
finalResult = restoreCrossrefs(finalResult, crossrefs);
|
|
1454
|
-
finalResult = restoreAnchors(finalResult, figAnchors);
|
|
1455
|
-
finalResult = restoreImages(finalResult, origImages);
|
|
1456
|
-
finalResult = restoreImages(finalResult, wordImages);
|
|
1457
|
-
finalResult = restoreTables(finalResult, tables);
|
|
1458
|
-
finalResult = restoreTables(finalResult, wordTableBlocks);
|
|
1459
|
-
|
|
1460
|
-
return finalResult;
|
|
1461
|
-
}
|
|
1462
|
-
|
|
1463
|
-
/**
 * Clean up redundant adjacent annotations
 *
 * Runs a fixed sequence of CriticMarkup fix-ups (merge adjacent delete/insert
 * pairs into substitutions, repair mismatched delimiters, drop empty
 * annotations) and then collapses runs of spaces on every line that is not
 * treated as part of a table.
 *
 * @param text - Markdown text containing CriticMarkup annotations
 * @returns The text with annotations normalised and prose spacing collapsed
 */
export function cleanupAnnotations(text: string): string {
  // A row made only of dash groups separated by whitespace, e.g. "----  -----"
  const separatorRule = /^[-]+(\s+[-]+)+\s*$/;
  // Two non-space runs separated by at least two whitespace characters
  const columnGap = /\S+\s{2,}\S+/;
  // A line consisting of a single *emphasised* span
  const italicLine = /^\*[^*]+\*\s*$/;

  // Applied strictly in this order; later rules see the output of earlier ones.
  const markupFixes: Array<[RegExp, string]> = [
    // Convert adjacent delete+insert to substitution
    [/\{--(.+?)--\}\s*\{\+\+(.+?)\+\+\}/g, '{~~$1~>$2~~}'],
    // Also handle insert+delete
    [/\{\+\+(.+?)\+\+\}\s*\{--(.+?)--\}/g, '{~~$2~>$1~~}'],
    // Fix malformed patterns
    [/\{--([^}]+?)~>([^}]+?)~~\}/g, '{~~$1~>$2~~}'],
    // Fix malformed substitutions that got split
    [/\{~~([^~]+)\s*--\}/g, '{--$1--}'],
    [/\{\+\+([^+]+)~~\}/g, '{++$1++}'],
    // Clean up empty annotations
    [/\{--\s*--\}/g, ''],
    [/\{\+\+\s*\+\+\}/g, ''],
  ];

  let cleaned = text;
  for (const [pattern, replacement] of markupFixes) {
    cleaned = cleaned.replace(pattern, replacement);
  }

  const rows = cleaned.split('\n');

  // From a blank line inside a table, scan at most the next 19 lines and
  // report whether the table appears to continue (separator, column-like row
  // or emphasised line). Blank lines and lines starting with a space are
  // skipped; any other line ends the scan with "no".
  const tableResumesAfter = (blankAt: number): boolean => {
    const limit = Math.min(rows.length, blankAt + 20);
    for (let cursor = blankAt + 1; cursor < limit; cursor++) {
      const raw = rows[cursor];
      const candidate = raw.trim();
      if (candidate === '') {
        continue;
      }
      if (separatorRule.test(candidate) || columnGap.test(candidate) || italicLine.test(candidate)) {
        return true;
      }
      if (!raw.startsWith(' ')) {
        return false;
      }
    }
    return false;
  };

  // Clean up double spaces in prose, but preserve table formatting
  const output: string[] = [];
  let insideTable = false;

  for (let at = 0; at < rows.length; at++) {
    const row = rows[at];
    const stripped = row.trim();

    // Any separator row switches table mode on and is kept verbatim.
    if (separatorRule.test(stripped)) {
      insideTable = true;
      output.push(row);
      continue;
    }

    // Inside a table every row is kept verbatim; a blank row may end the table.
    if (insideTable) {
      if (stripped === '' && !tableResumesAfter(at)) {
        insideTable = false;
      }
      output.push(row);
      continue;
    }

    // A column-like row directly above a separator (ignoring blanks) is a header.
    if (columnGap.test(row)) {
      let next = at + 1;
      while (next < rows.length && rows[next].trim() === '') {
        next++;
      }
      if (next < rows.length && separatorRule.test(rows[next].trim())) {
        output.push(row);
        continue;
      }
    }

    // Pipe-table rows are left untouched; everything else is prose.
    output.push(stripped.startsWith('|') ? row : row.replace(/ +/g, ' '));
  }

  return output.join('\n');
}
|
|
1568
|
-
|
|
1569
|
-
/**
 * Parse visible comment markers from Word text
 *
 * Finds every bracketed `[author: text]` span and reports the trimmed author,
 * the trimmed comment text and the index of the opening bracket.
 *
 * @param text - Text to scan
 * @returns One record per match, in order of appearance
 */
export function parseVisibleComments(text: string): Array<{ author: string; text: string; position: number }> {
  const marker = /\[([^\]:]+):\s*([^\]]+)\]/g;
  const found: Array<{ author: string; text: string; position: number }> = [];

  // The global flag makes exec() resume after the previous hit on each call.
  for (let hit = marker.exec(text); hit !== null; hit = marker.exec(text)) {
    const [, rawAuthor, rawBody] = hit;
    found.push({ author: rawAuthor.trim(), text: rawBody.trim(), position: hit.index });
  }

  return found;
}
|
|
1587
|
-
|
|
1588
|
-
/**
 * Convert visible comments to CriticMarkup format
 *
 * Every bracketed `[author: text]` span is rewritten as `{>>author: text<<}`;
 * all other text is returned unchanged.
 *
 * @param text - Text containing bracketed comment markers
 * @returns The text with those markers rewritten as CriticMarkup comments
 */
export function convertVisibleComments(text: string): string {
  const visibleComment = /\[([^\]:]+):\s*([^\]]+)\]/g;
  const criticComment = '{>>$1: $2<<}';
  return text.replace(visibleComment, criticComment);
}
|
|
1594
|
-
|
|
1595
|
-
/**
 * Restore pandoc-crossref figure/table references from Word-rendered format
 *
 * Four passes run over `text`, in this order:
 *  1. `[Figure]{.mark} [N]{.mark}` pairs become `@fig:<label>` / `@tbl:<label>`;
 *     when the registry has no label for that number a placeholder
 *     `@<prefix>:<prefix>N` is written and a message is recorded.
 *  2. Plain "Figure N" / "Fig. N" / "Table N" / "Tbl. N" (not followed by a
 *     colon) become `@<prefix>:<label>` when the registry knows the number;
 *     otherwise the text is left as it was.
 *  3. A plain-text "Figure N ..." caption line directly after an image is removed.
 *  4. Images whose alt text starts with "Figure N:" get that prefix stripped
 *     from the caption and, on first restore, a `{#<prefix>:<label>}` anchor.
 *
 * @param text - Markdown extracted from Word
 * @param projectDir - Project directory handed to `readImageRegistry`
 * @param restoredLabels - Keys already restored by pass 4; a new set is created
 *   when null. A passed-in set is mutated and returned.
 * @returns The rewritten text, the number of restorations, the messages
 *   collected along the way and the set of restored keys
 */
export function restoreCrossrefFromWord(
  text: string,
  projectDir: string,
  restoredLabels: Set<string> | null = null
): RestoreCrossrefResult {
  const messages: string[] = [];
  let restored = 0;
  let result = text;

  const registry = readImageRegistry(projectDir);
  const seen: Set<string> = restoredLabels ?? new Set<string>();

  // "Tbl" / "Tbl." do not start with "tab", so classify on the first letter:
  // of the accepted words (Figure, Fig, Table, Tbl) only the table ones start with "t".
  const prefixFor = (type: string): string => (type.toLowerCase().startsWith('t') ? 'tbl' : 'fig');

  // Registry label for e.g. ("fig", "3"), or undefined when there is no
  // registry, no entry for that number, or the entry has no label.
  const lookupLabel = (prefix: string, num: string) => {
    const entry = registry?.byNumber?.get(`${prefix}:${num}`);
    return entry && entry.label ? entry.label : undefined;
  };

  // Pattern 1: [Figure]{.mark} [N]{.mark}
  result = result.replace(/\[(Figure|Table|Fig\.?)\]\{\.mark\}\s*\[(\d+|S\d+)\]\{\.mark\}/gi, (_match, type, num) => {
    const prefix = prefixFor(type);
    const label = lookupLabel(prefix, num);
    restored++;
    if (label) {
      return `@${prefix}:${label}`;
    }
    messages.push(`Restored ${type} ${num} (no label found, using placeholder)`);
    // Placeholder carries the matching prefix so a table is not named "figN".
    return `@${prefix}:${prefix}${num}`;
  });

  // Pattern 2: Plain "Figure N" or "Fig. N"
  result = result.replace(/(?<!!)\b(Figure|Fig\.?|Table|Tbl\.?)\s+(\d+|S\d+)\b(?!\s*:)/gi, (match, type, num) => {
    const prefix = prefixFor(type);
    const label = lookupLabel(prefix, num);
    if (label) {
      restored++;
      return `@${prefix}:${label}`;
    }
    return match;
  });

  // Pattern 3: Remove duplicate plain-text captions
  result = result.replace(/(\!\[[^\]]+\]\([^)]+\)(?:\{[^}]*\})?)\s*\n+\s*(?:Figure|Fig\.?|Table|Tbl\.?)\s+\d+[:\.]?\s*[^\n]+/gi, '$1');

  // Pattern 4: Clean up image captions that start with "Figure N: "
  result = result.replace(/!\[(Figure|Fig\.?|Table|Tbl\.?)\s+(\d+|S\d+)[:\.]?\s*([^\]]*)\]\(([^)]+)\)(?:\{[^}]*\})?/gi,
    (_match, type, num, caption, imgPath) => {
      const prefix = prefixFor(type);
      // NOTE(review): this key is number-based ("fig:3"), while
      // restoreImagesFromRegistry stores label-based keys in the same kind of
      // set. Confirm whether the two are meant to deduplicate against each other.
      const labelKey = `${prefix}:${num}`;

      // The image is always kept; only the "Figure N:" prefix is dropped from
      // its caption. Any attribute block that followed the image was consumed
      // by the pattern and is not re-emitted.
      const image = `![${caption.trim()}](${imgPath})`;

      const label = lookupLabel(prefix, num);
      if (!label) {
        return image;
      }

      if (seen.has(labelKey)) {
        messages.push(`Skipped duplicate ${prefix}:${label} (already restored)`);
        // No anchor here: a second anchor would duplicate the label.
        return image;
      }

      seen.add(labelKey);
      restored++;
      messages.push(`Restored image ${prefix}:${label} from Figure ${num}`);
      return `${image}{#${prefix}:${label}}`;
    });

  return { text: result, restored, messages, restoredLabels: seen };
}
|
|
1669
|
-
|
|
1670
|
-
/**
 * Restore proper markdown image syntax from Word-extracted text using image registry
 *
 * Returns the input untouched (with a single message) when the registry is
 * missing or lists no figures. Otherwise four passes run in order:
 *  1. `@fig:label: caption` / `@tbl:label: caption` lines whose label is in the registry
 *  2. the same pattern wrapped in a one-cell pipe table, followed by removal of
 *     the emptied table skeleton
 *  3. line-leading "Figure N: caption" lines whose number is in the registry
 *  4. images with a `media/...` path whose caption matches a registry entry
 *
 * A label is anchored only the first time it is seen; later occurrences are
 * reported as duplicates.
 *
 * @param text - Markdown extracted from Word
 * @param projectDir - Project directory handed to `readImageRegistry`
 * @param restoredLabels - Label keys (`type:label`) already restored; a new set
 *   is created when null. A passed-in set is mutated.
 * @returns The rewritten text, the number of restorations and collected messages
 */
export function restoreImagesFromRegistry(
  text: string,
  projectDir: string,
  restoredLabels: Set<string> | null = null
): RestoreImagesResult {
  const messages: string[] = [];
  let restored = 0;

  const registry = readImageRegistry(projectDir);
  if (!registry || !registry.figures || registry.figures.length === 0) {
    return { text, restored: 0, messages: ['No image registry found'] };
  }

  const seen: Set<string> = restoredLabels ?? new Set<string>();

  let result = text;

  // Pattern 1: Caption-like text
  const labelledCaption = /@(fig|tbl):([a-zA-Z0-9_-]+):\s*([^\n]+)/gi;
  const numberedCaption = /^(Figure|Fig\.?)\s+(\d+|S\d+)[.:]\s*([^\n]+)/gim;
  const tableWrappedCaption = /\|\s*@(fig|tbl):([a-zA-Z0-9_-]+):\s*([^|]+)\s*\|/gi;

  // NOTE(review): the three caption passes below replace the whole matched
  // caption line with just the anchor (or with nothing for a duplicate) and
  // never use the captured caption text. Confirm that dropping the caption is
  // intended; rebuilding an image here would need registry fields that are not
  // visible in this function.

  // Fix @fig:label: caption patterns
  result = result.replace(labelledCaption, (match, type, label) => {
    const key = `${type}:${label}`;
    if (!registry.byLabel.get(key)) {
      return match;
    }
    if (seen.has(key)) {
      messages.push(`Skipped duplicate ${key} (already restored)`);
      return '';
    }
    seen.add(key);
    restored++;
    messages.push(`Restored ${type}:${label} from registry`);
    return `{#${type}:${label}}`;
  });

  // Fix table-wrapped captions
  result = result.replace(tableWrappedCaption, (match, type, label) => {
    const key = `${type}:${label}`;
    if (!registry.byLabel.get(key)) {
      return match;
    }
    if (seen.has(key)) {
      messages.push(`Skipped duplicate ${key} from table wrapper`);
      return '';
    }
    seen.add(key);
    restored++;
    messages.push(`Restored ${type}:${label} from table wrapper`);
    return `{#${type}:${label}}`;
  });

  // Clean up empty table structures
  result = result.replace(/\|\s*\|\s*\n\|:--:\|\s*\n/g, '');

  // Fix "Figure N:" standalone lines
  result = result.replace(numberedCaption, (match, _word, num) => {
    const entry = registry.byNumber.get(`fig:${num}`);
    if (!entry) {
      return match;
    }
    const labelKey = `fig:${entry.label}`;
    if (seen.has(labelKey)) {
      messages.push(`Skipped duplicate Figure ${num} (already restored)`);
      return '';
    }
    seen.add(labelKey);
    restored++;
    messages.push(`Restored Figure ${num} by number lookup`);
    return `{#fig:${entry.label}}`;
  });

  // Match images with generic media paths to registry entries by caption text.
  // The matched image markup is always kept; an anchor is appended the first
  // time a labelled entry is seen.
  // NOTE(review): the media/ path itself is left as-is; swapping in the
  // registry's path would need an entry field not visible here.
  const genericImagePattern = /!\[([^\]]*)\]\(media\/[^)]+\)/g;
  result = result.replace(genericImagePattern, (match, caption) => {
    if (!caption || caption.trim() === '') {
      return match;
    }

    // Registry captions are looked up by their first 50 characters, lower-cased.
    const captionKey = caption.slice(0, 50).toLowerCase().trim();
    const entry = registry.byCaption.get(captionKey);
    if (!entry) {
      return match;
    }

    const labelKey = entry.label ? `${entry.type}:${entry.label}` : null;
    if (labelKey && seen.has(labelKey)) {
      messages.push(`Skipped duplicate by caption match: ${captionKey.slice(0, 30)}...`);
      return match;
    }

    // Decide on the anchor before recording the label; checking the set after
    // adding to it would always report the label as already present.
    const anchor = labelKey ? `{#${labelKey}}` : '';
    if (labelKey) {
      seen.add(labelKey);
    }
    restored++;
    messages.push(`Restored image by caption match: ${captionKey.slice(0, 30)}...`);
    return `${match}${anchor}`;
  });

  return { text: result, restored, messages };
}
|
|
1782
|
-
|
|
1783
387
|
/**
|
|
1784
388
|
* Import Word document with track changes directly as CriticMarkup
|
|
1785
389
|
*/
|