docrev 0.9.18 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. package/.gitattributes +1 -1
  2. package/CHANGELOG.md +173 -149
  3. package/PLAN-tables-and-postprocess.md +850 -850
  4. package/README.md +431 -406
  5. package/bin/rev.js +11 -11
  6. package/bin/rev.ts +145 -145
  7. package/completions/rev.bash +127 -127
  8. package/completions/rev.ps1 +210 -210
  9. package/completions/rev.zsh +207 -207
  10. package/dist/lib/build.d.ts +8 -0
  11. package/dist/lib/build.d.ts.map +1 -1
  12. package/dist/lib/build.js +62 -6
  13. package/dist/lib/build.js.map +1 -1
  14. package/dist/lib/commands/context.d.ts +1 -1
  15. package/dist/lib/commands/context.d.ts.map +1 -1
  16. package/dist/lib/commands/context.js +1 -1
  17. package/dist/lib/commands/context.js.map +1 -1
  18. package/dist/lib/commands/sections.js +7 -7
  19. package/dist/lib/commands/sections.js.map +1 -1
  20. package/dist/lib/commands/sync.d.ts.map +1 -1
  21. package/dist/lib/commands/sync.js +15 -14
  22. package/dist/lib/commands/sync.js.map +1 -1
  23. package/dist/lib/commands/utilities.js +164 -164
  24. package/dist/lib/commands/verify-anchors.js +6 -6
  25. package/dist/lib/commands/verify-anchors.js.map +1 -1
  26. package/dist/lib/commands/word-tools.js +8 -8
  27. package/dist/lib/grammar.js +3 -3
  28. package/dist/lib/macro-filter.lua +201 -0
  29. package/dist/lib/macros.d.ts +102 -0
  30. package/dist/lib/macros.d.ts.map +1 -0
  31. package/dist/lib/macros.js +218 -0
  32. package/dist/lib/macros.js.map +1 -0
  33. package/dist/lib/pdf-comments.js +44 -44
  34. package/dist/lib/plugins.js +57 -57
  35. package/dist/lib/pptx-color-filter.lua +37 -0
  36. package/dist/lib/pptx-themes.js +115 -115
  37. package/dist/lib/schema.d.ts.map +1 -1
  38. package/dist/lib/schema.js +34 -0
  39. package/dist/lib/schema.js.map +1 -1
  40. package/dist/lib/sections.d.ts +35 -0
  41. package/dist/lib/sections.d.ts.map +1 -1
  42. package/dist/lib/sections.js +81 -0
  43. package/dist/lib/sections.js.map +1 -1
  44. package/dist/lib/spelling.js +2 -2
  45. package/dist/lib/templates.js +387 -387
  46. package/dist/lib/themes.js +51 -51
  47. package/eslint.config.js +27 -27
  48. package/lib/anchor-match.ts +276 -276
  49. package/lib/annotations.ts +644 -644
  50. package/lib/build.ts +1766 -1694
  51. package/lib/citations.ts +160 -160
  52. package/lib/commands/build.ts +855 -855
  53. package/lib/commands/citations.ts +515 -515
  54. package/lib/commands/comments.ts +1050 -1050
  55. package/lib/commands/context.ts +176 -174
  56. package/lib/commands/core.ts +309 -309
  57. package/lib/commands/doi.ts +435 -435
  58. package/lib/commands/file-ops.ts +372 -372
  59. package/lib/commands/history.ts +320 -320
  60. package/lib/commands/index.ts +87 -87
  61. package/lib/commands/init.ts +259 -259
  62. package/lib/commands/merge-resolve.ts +378 -378
  63. package/lib/commands/preview.ts +178 -178
  64. package/lib/commands/project-info.ts +244 -244
  65. package/lib/commands/quality.ts +517 -517
  66. package/lib/commands/response.ts +454 -454
  67. package/lib/commands/section-boundaries.ts +82 -82
  68. package/lib/commands/sections.ts +451 -451
  69. package/lib/commands/sync.ts +709 -706
  70. package/lib/commands/text-ops.ts +449 -449
  71. package/lib/commands/utilities.ts +448 -448
  72. package/lib/commands/verify-anchors.ts +272 -272
  73. package/lib/commands/word-tools.ts +340 -340
  74. package/lib/comment-realign.ts +517 -517
  75. package/lib/config.ts +84 -84
  76. package/lib/crossref.ts +781 -781
  77. package/lib/csl.ts +191 -191
  78. package/lib/dependencies.ts +98 -98
  79. package/lib/diff-engine.ts +465 -465
  80. package/lib/doi-cache.ts +115 -115
  81. package/lib/doi.ts +897 -897
  82. package/lib/equations.ts +506 -506
  83. package/lib/errors.ts +346 -346
  84. package/lib/format.ts +541 -541
  85. package/lib/git.ts +326 -326
  86. package/lib/grammar.ts +303 -303
  87. package/lib/image-registry.ts +180 -180
  88. package/lib/import.ts +911 -911
  89. package/lib/journals.ts +543 -543
  90. package/lib/macro-filter.lua +201 -0
  91. package/lib/macros.ts +273 -0
  92. package/lib/merge.ts +633 -633
  93. package/lib/orcid.ts +144 -144
  94. package/lib/pdf-comments.ts +263 -263
  95. package/lib/pdf-import.ts +524 -524
  96. package/lib/plugins.ts +362 -362
  97. package/lib/postprocess.ts +188 -188
  98. package/lib/pptx-color-filter.lua +37 -37
  99. package/lib/pptx-template.ts +469 -469
  100. package/lib/pptx-themes.ts +483 -483
  101. package/lib/protect-restore.ts +520 -520
  102. package/lib/rate-limiter.ts +94 -94
  103. package/lib/response.ts +197 -197
  104. package/lib/restore-references.ts +240 -240
  105. package/lib/review.ts +327 -327
  106. package/lib/schema.ts +488 -454
  107. package/lib/scientific-words.ts +73 -73
  108. package/lib/sections.ts +425 -335
  109. package/lib/slides.ts +756 -756
  110. package/lib/spelling.ts +334 -334
  111. package/lib/templates.ts +526 -526
  112. package/lib/themes.ts +742 -742
  113. package/lib/trackchanges.ts +247 -247
  114. package/lib/tui.ts +450 -450
  115. package/lib/types.ts +550 -550
  116. package/lib/undo.ts +250 -250
  117. package/lib/utils.ts +69 -69
  118. package/lib/variables.ts +179 -179
  119. package/lib/word-extraction.ts +806 -806
  120. package/lib/word.ts +643 -643
  121. package/lib/wordcomments.ts +840 -840
  122. package/package.json +137 -137
  123. package/scripts/postbuild.js +47 -28
  124. package/skill/REFERENCE.md +539 -539
  125. package/skill/SKILL.md +295 -295
  126. package/tsconfig.json +26 -26
  127. package/types/index.d.ts +525 -525
  128. package/issues.md +0 -180
  129. package/site/assets/extra.css +0 -208
  130. package/site/commands.html +0 -926
  131. package/site/configuration.html +0 -469
  132. package/site/index.html +0 -288
  133. package/site/troubleshooting.html +0 -461
  134. package/site/workflow.html +0 -518
@@ -1,806 +1,806 @@
1
- /**
2
- * Word document data extraction - raw extraction from .docx files
3
- */
4
-
5
- import * as fs from 'fs';
6
- import * as path from 'path';
7
- import { exec } from 'child_process';
8
- import { promisify } from 'util';
9
-
10
- const execAsync = promisify(exec);
11
-
12
- // ============================================
13
- // Type Definitions
14
- // ============================================
15
-
/** One comment read from a .docx file's `word/comments.xml`. */
export interface WordComment {
  /** Value of the comment's `w:id` attribute (empty string when absent). */
  id: string;
  /** Value of `w:author`; the extractor substitutes "Unknown" when absent. */
  author: string;
  /** First ten characters of `w:date` (empty string when absent). */
  date: string;
  /** Trimmed concatenation of the comment's text runs. */
  text: string;
  /**
   * Id of the comment this one replies to, resolved through the
   * `w15:paraIdParent` attribute in `commentsExtended.xml`.
   * Left `undefined` for top-level comments.
   */
  parentId?: string;
}

/** A `<w:t>` element located both in the raw XML and in the decoded text. */
export interface TextNode {
  /** Offset of the element's first character in `document.xml`. */
  xmlStart: number;
  /** Offset just past the element's closing tag in `document.xml`. */
  xmlEnd: number;
  /** Offset of the element's text within the concatenated document text. */
  textStart: number;
  /** Offset just past the element's text within the concatenated document text. */
  textEnd: number;
  /** Entity-decoded text content of the element. */
  text: string;
}

/** Where a comment is attached, plus surrounding context for matching. */
export interface CommentAnchorData {
  /** Trimmed text enclosed by the comment range markers. */
  anchor: string;
  /** Context text preceding the anchor. */
  before: string;
  /** Context text following the anchor. */
  after: string;
  /** Offset of the anchor start within the concatenated document text. */
  docPosition: number;
  /** Total length of the concatenated document text. */
  docLength: number;
  /** True when the range encloses no (non-whitespace) text. */
  isEmpty: boolean;
}

/** Return value of `extractCommentAnchors`. */
export interface CommentAnchorsResult {
  /** Anchor data keyed by comment id. */
  anchors: Map<string, CommentAnchorData>;
  /** Concatenated text of every `<w:t>` element in the document. */
  fullDocText: string;
}

/** A heading paragraph found in `document.xml`. */
export interface DocxHeading {
  /** Heading style name from `<w:pStyle>`, e.g. "Heading1" */
  style: string;
  /** Heading depth: 1, 2, 3, ... (first number in the style name; 0 if none) */
  level: number;
  /** Trimmed, concatenated text of the heading paragraph */
  text: string;
  /** Offset in the concatenated document text (same coordinates as `CommentAnchorData.docPosition`) */
  docPosition: number;
}

/** A Word table rendered as a markdown pipe table. */
export interface WordTable {
  /** The table as markdown source. */
  markdown: string;
  /** Number of rows that contained at least one cell. */
  rowCount: number;
  /** Number of grid columns. */
  colCount: number;
}

/** Cells of one table row after expanding horizontal merges. */
export interface ParsedRow {
  /** Cell texts; merged-over grid columns appear as empty strings. */
  cells: string[];
  /** Span of each entry in `cells`; 0 marks a column covered by a preceding span. */
  colSpans: number[];
}

/** Options accepted by `extractFromWord`. */
export interface ExtractFromWordOptions {
  /** Directory handed to pandoc's `--extract-media`; defaults to `media` next to the document. */
  mediaDir?: string;
  /** When true, pandoc is run without `--extract-media`. */
  skipMediaExtraction?: boolean;
}

/** A diagnostic produced while extracting a document. */
export interface ExtractMessage {
  /** Severity of the message. */
  type: 'info' | 'warning';
  /** Human-readable message text. */
  message: string;
}

/** Everything `extractFromWord` gathers from a document. */
export interface ExtractFromWordResult {
  /** Document body text (markdown when pandoc produced it). */
  text: string;
  /** Comments read from `word/comments.xml`. */
  comments: WordComment[];
  /** Comment anchor data keyed by comment id. */
  anchors: Map<string, CommentAnchorData>;
  /** Diagnostics collected during extraction. */
  messages: ExtractMessage[];
  /** Paths of image files found in the media output directory. */
  extractedMedia: string[];
  /** Tables read directly from the document XML. */
  tables: WordTable[];
  /** True when at least one tracked insertion or deletion was found. */
  hasTrackChanges: boolean;
  /** Counts of tracked insertions and deletions. */
  trackChangeStats: { insertions: number; deletions: number };
}
93
-
94
- // ============================================
95
- // Functions
96
- // ============================================
97
-
/**
 * Read every comment stored in a .docx file's `word/comments.xml`.
 *
 * Reply threading is resolved from `word/commentsExtended.xml` when that part
 * exists: a reply's `parentId` is set to the id of the comment it answers.
 *
 * @param docxPath - Path to the .docx file.
 * @returns The comments in file order; an empty array when the document has
 *   no comments part or the part contains no comments.
 * @throws Error when the file does not exist, cannot be opened as a zip
 *   archive, or its comments cannot be read or parsed.
 */
export async function extractWordComments(docxPath: string): Promise<WordComment[]> {
  const AdmZip = (await import('adm-zip')).default;
  const { parseStringPromise } = await import('xml2js');

  const comments: WordComment[] = [];

  if (!fs.existsSync(docxPath)) {
    throw new Error(`File not found: ${docxPath}`);
  }

  // Thrown values are not guaranteed to be Error instances.
  const messageOf = (err: unknown): string => (err instanceof Error ? err.message : String(err));

  try {
    let zip;
    try {
      zip = new AdmZip(docxPath);
    } catch (err: unknown) {
      throw new Error(`Invalid Word document (not a valid .docx file): ${messageOf(err)}`);
    }

    const commentsEntry = zip.getEntry('word/comments.xml');
    if (!commentsEntry) {
      return comments;
    }

    let commentsXml;
    try {
      commentsXml = commentsEntry.getData().toString('utf8');
    } catch (err: unknown) {
      throw new Error(`Failed to read comments from document: ${messageOf(err)}`);
    }

    const parsed = await parseStringPromise(commentsXml, { explicitArray: false });

    const commentsRoot = parsed['w:comments'];
    if (!commentsRoot || !commentsRoot['w:comment']) {
      return comments;
    }

    // With explicitArray: false a single child is not wrapped in an array.
    const commentNodes = Array.isArray(commentsRoot['w:comment'])
      ? commentsRoot['w:comment']
      : [commentsRoot['w:comment']];

    // Map every paraId that lives inside a comment back to that comment's id.
    // commentsExtended.xml expresses threading through paragraph ids, and a
    // comment may hold several paragraphs, so it may contribute several ids.
    const paraIdToCommentId = new Map<string, string>();

    for (const comment of commentNodes) {
      const id = comment.$?.['w:id'] || '';
      const author = comment.$?.['w:author'] || 'Unknown';
      const date = comment.$?.['w:date'] || '';

      let text = '';

      // Append the character data of a parsed `w:t` value. The parser yields a
      // plain string, an object whose text sits under `_` (element with
      // attributes), or an array (several `w:t` in one run). Objects without
      // string text and other shapes contribute nothing, rather than being
      // string-coerced into the comment.
      const appendRunText = (value: unknown): void => {
        if (typeof value === 'string') {
          text += value;
        } else if (Array.isArray(value)) {
          value.forEach(appendRunText);
        } else if (value !== null && typeof value === 'object') {
          const inner = (value as { _?: unknown })._;
          if (typeof inner === 'string') text += inner;
        }
      };

      // Walk w:p / w:r / w:t, collecting text and recording paragraph ids.
      const extractText = (node: any): void => {
        if (!node) return;
        if (typeof node === 'string') {
          text += node;
          return;
        }
        if (node['w:t']) {
          appendRunText(node['w:t']);
        }
        if (node['w:r']) {
          const runs = Array.isArray(node['w:r']) ? node['w:r'] : [node['w:r']];
          runs.forEach(extractText);
        }
        if (node['w:p']) {
          const paras = Array.isArray(node['w:p']) ? node['w:p'] : [node['w:p']];
          for (const para of paras) {
            const paraId = para?.$?.['w14:paraId'];
            if (paraId && id) paraIdToCommentId.set(paraId, id);
            extractText(para);
          }
        }
      };
      extractText(comment);

      comments.push({ id, author, date: date.slice(0, 10), text: text.trim() });
    }

    // Resolve parent links from commentsExtended.xml. A missing part simply
    // means the document carries no threading metadata.
    const extendedEntry = zip.getEntry('word/commentsExtended.xml');
    if (extendedEntry && paraIdToCommentId.size > 0) {
      let extendedXml = '';
      try {
        extendedXml = extendedEntry.getData().toString('utf8');
      } catch {
        // Unreadable threading metadata is non-fatal; skip parent linking.
      }
      if (extendedXml) {
        const parentByCommentId = new Map<string, string>();
        for (const entry of extendedXml.matchAll(/<w15:commentEx\b([^>]*?)\/>/g)) {
          const attrs = entry[1] ?? '';
          const paraId = attrs.match(/w15:paraId="([^"]+)"/)?.[1];
          const parentParaId = attrs.match(/w15:paraIdParent="([^"]+)"/)?.[1];
          if (!paraId || !parentParaId) continue;
          const childCommentId = paraIdToCommentId.get(paraId);
          const parentCommentId = paraIdToCommentId.get(parentParaId);
          // Both paragraphs may belong to the same comment; that is not a reply.
          if (childCommentId && parentCommentId && childCommentId !== parentCommentId) {
            parentByCommentId.set(childCommentId, parentCommentId);
          }
        }
        for (const c of comments) {
          const parent = parentByCommentId.get(c.id);
          if (parent) c.parentId = parent;
        }
      }
    }
  } catch (err: unknown) {
    const message = messageOf(err);
    // Errors that already name the cause are passed through untouched.
    if (message.includes('Invalid Word document') || message.includes('File not found')) {
      throw err;
    }
    throw new Error(`Error extracting comments from ${path.basename(docxPath)}: ${message}`);
  }

  return comments;
}
227
-
/**
 * Locate the document text each Word comment is attached to.
 *
 * Reads `word/document.xml`, concatenates the decoded content of every
 * `<w:t>` element into `fullDocText`, and for each comment range
 * (`w:commentRangeStart` / `w:commentRangeEnd` pair with the same id) records
 * the enclosed text, the surrounding context and the range's offset in
 * `fullDocText`.
 *
 * @param docxPath - Path to the .docx file.
 * @returns `anchors` keyed by comment id, and `fullDocText`. A range without
 *   an end marker is skipped with a console warning. On any error the problem
 *   is logged and the anchors gathered so far are returned with an empty
 *   `fullDocText`.
 */
export async function extractCommentAnchors(docxPath: string): Promise<CommentAnchorsResult> {
  const AdmZip = (await import('adm-zip')).default;
  const anchors = new Map<string, CommentAnchorData>();
  let fullDocText = '';

  try {
    const zip = new AdmZip(docxPath);
    const docEntry = zip.getEntry('word/document.xml');

    if (!docEntry) {
      return { anchors, fullDocText };
    }

    const docXml = docEntry.getData().toString('utf8');

    // ========================================
    // STEP 1: Build text position mapping
    // ========================================
    const textNodes: TextNode[] = [];
    let textPosition = 0;

    for (const nodeMatch of docXml.matchAll(/<w:t[^>]*>([^<]*)<\/w:t>/g)) {
      const decodedText = decodeXmlEntities(nodeMatch[1] ?? '');
      const xmlStart = nodeMatch.index ?? 0;
      textNodes.push({
        xmlStart,
        xmlEnd: xmlStart + nodeMatch[0].length,
        textStart: textPosition,
        textEnd: textPosition + decodedText.length,
        text: decodedText,
      });
      textPosition += decodedText.length;
    }

    fullDocText = textNodes.map((n) => n.text).join('');
    const docText = fullDocText;

    // Convert an XML offset to a text offset: the start of the text node that
    // contains the offset or is the first one after it; the end of the text
    // when no node follows.
    const xmlPosToTextPos = (xmlPos: number): number => {
      for (const node of textNodes) {
        if (xmlPos < node.xmlEnd) return node.textStart;
      }
      const lastNode = textNodes[textNodes.length - 1];
      return lastNode ? lastNode.textEnd : 0;
    };

    // Context before a position: from the last sentence start inside the
    // window when one is found, otherwise the final 80 characters.
    const getContextBefore = (position: number, maxLength: number = 150): string => {
      const beforeText = docText.slice(Math.max(0, position - maxLength), position);
      const sentenceStart = beforeText.search(/[.!?]\s+[A-Z][^.!?]*$/);
      return sentenceStart >= 0
        ? beforeText.slice(sentenceStart + 2).trim()
        : beforeText.slice(-80).trim();
    };

    // Context after a position: up to the first sentence end inside the
    // window when one is found, otherwise the first 80 characters.
    const getContextAfter = (position: number, maxLength: number = 150): string => {
      const afterText = docText.slice(position, position + maxLength);
      const sentenceEnd = afterText.search(/[.!?]\s/);
      return sentenceEnd >= 0
        ? afterText.slice(0, sentenceEnd + 1).trim()
        : afterText.slice(0, 80).trim();
    };

    // ========================================
    // STEP 2: Collect all start/end markers separately
    // ========================================
    const starts = new Map<string, number>(); // id -> XML offset just after the start tag
    const ends = new Map<string, number>(); // id -> XML offset of the end tag

    for (const marker of docXml.matchAll(/<w:commentRangeStart[^>]*w:id="(\d+)"[^>]*\/?>/g)) {
      const id = marker[1] ?? '';
      // Only the first marker for an id is used.
      if (!starts.has(id)) {
        starts.set(id, (marker.index ?? 0) + marker[0].length);
      }
    }

    for (const marker of docXml.matchAll(/<w:commentRangeEnd[^>]*w:id="(\d+)"[^>]*\/?>/g)) {
      const id = marker[1] ?? '';
      if (!ends.has(id)) {
        ends.set(id, marker.index ?? 0);
      }
    }

    // ========================================
    // STEP 3: Process each comment range by ID
    // ========================================
    for (const [id, startXmlPos] of starts) {
      const endXmlPos = ends.get(id);

      // Missing end marker - skip with warning
      if (endXmlPos === undefined) {
        console.warn(`Comment ${id}: missing end marker`);
        continue;
      }

      const docPosition = xmlPosToTextPos(startXmlPos);

      // Empty or inverted range: record the position with an empty anchor.
      if (endXmlPos <= startXmlPos) {
        anchors.set(id, {
          anchor: '',
          before: getContextBefore(docPosition),
          after: getContextAfter(docPosition),
          docPosition,
          docLength: fullDocText.length,
          isEmpty: true,
        });
        continue;
      }

      const segment = docXml.slice(startXmlPos, endXmlPos);

      // Anchor text includes w:t (regular) AND w:delText (tracked deletions).
      let anchorText = '';
      for (const run of segment.matchAll(
        /<w:t[^>]*>([^<]*)<\/w:t>|<w:delText[^>]*>([^<]*)<\/w:delText>/g
      )) {
        anchorText += run[1] || run[2] || '';
      }
      anchorText = decodeXmlEntities(anchorText);

      // fullDocText holds only w:t content, so the anchor text (which may
      // include deleted text) cannot be used to measure where the range ends.
      // Map the end marker through the same XML -> text mapping instead.
      const anchorEndPosition = xmlPosToTextPos(endXmlPos);
      const before = getContextBefore(docPosition);
      const after = getContextAfter(anchorEndPosition);

      // ALWAYS add entry (even if anchor is empty)
      anchors.set(id, {
        anchor: anchorText.trim(),
        before,
        after,
        docPosition,
        docLength: fullDocText.length,
        isEmpty: !anchorText.trim(),
      });
    }
  } catch (err: any) {
    console.error('Error extracting comment anchors:', err.message);
    return { anchors, fullDocText: '' };
  }

  return { anchors, fullDocText };
}
391
-
/**
 * List the heading paragraphs of a .docx file.
 *
 * A paragraph counts as a heading when the value of its `<w:pStyle>` contains
 * "heading" (case-insensitive). Each heading's `docPosition` is an offset in
 * the concatenation of all decoded `<w:t>` contents, i.e. the coordinate
 * system used by `extractCommentAnchors` for `fullDocText` and
 * `CommentAnchorData.docPosition`.
 *
 * Paragraph styles are read rather than searching the flattened text for
 * section names, because the flattened text has no paragraph boundaries and a
 * section name can equally occur inside ordinary prose.
 *
 * @param docxPath - Path to the .docx file.
 * @returns Headings in document order; empty when the archive has no
 *   `word/document.xml`. Headings whose text is blank are omitted.
 * @throws Error when the file does not exist.
 */
export async function extractHeadings(docxPath: string): Promise<DocxHeading[]> {
  const AdmZip = (await import('adm-zip')).default;

  if (!fs.existsSync(docxPath)) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const documentEntry = new AdmZip(docxPath).getEntry('word/document.xml');
  if (!documentEntry) return [];
  const documentXml = documentEntry.getData().toString('utf8');

  // Index every <w:t> element: where it ends in the XML and which slice of the
  // flattened text it occupies.
  const spans: Array<{ xmlEnd: number; textStart: number; textEnd: number }> = [];
  let cursor = 0;
  for (const hit of documentXml.matchAll(/<w:t[^>]*>([^<]*)<\/w:t>/g)) {
    const length = decodeXmlEntities(hit[1] ?? '').length;
    const xmlStart = hit.index ?? 0;
    spans.push({ xmlEnd: xmlStart + hit[0].length, textStart: cursor, textEnd: cursor + length });
    cursor += length;
  }

  // Text offset for an XML offset: start of the text element containing it or
  // following it; end of the text when none follows.
  const toTextPos = (xmlPos: number): number => {
    const owner = spans.find((span) => xmlPos < span.xmlEnd);
    if (owner) return owner.textStart;
    const last = spans[spans.length - 1];
    return last ? last.textEnd : 0;
  };

  const headings: DocxHeading[] = [];
  for (const paragraph of documentXml.matchAll(/<w:p\b[^>]*>([\s\S]*?)<\/w:p>/g)) {
    const inner = paragraph[1] ?? '';

    const style = /<w:pStyle[^>]*w:val="([^"]+)"/.exec(inner)?.[1];
    if (!style || !/heading/i.test(style)) continue;

    // w:delText is included so a heading inside a tracked deletion still
    // shows up.
    let collected = '';
    for (const run of inner.matchAll(
      /<w:t[^>]*>([^<]*)<\/w:t>|<w:delText[^>]*>([^<]*)<\/w:delText>/g
    )) {
      collected += decodeXmlEntities(run[1] || run[2] || '');
    }
    const text = collected.trim();
    if (!text) continue;

    const digits = /(\d+)/.exec(style)?.[1];
    headings.push({
      style,
      level: digits === undefined ? 0 : parseInt(digits, 10),
      text,
      docPosition: toTextPos(paragraph.index ?? 0),
    });
  }

  return headings;
}
473
-
/**
 * Decode XML character and entity references in text.
 *
 * Handles the five predefined entities (`&amp;`, `&lt;`, `&gt;`, `&quot;`,
 * `&apos;`) and decimal / hexadecimal character references. All references
 * are resolved in a single pass, so the output of one replacement is never
 * decoded again (`&amp;lt;` yields the literal text `&lt;`, not `<`).
 * Character references are resolved as full Unicode code points, so values
 * above U+FFFF produce the intended character.
 *
 * @param text - Text that may contain XML references.
 * @returns The decoded text. Unknown entities and character references
 *   outside the Unicode range are left unchanged.
 */
function decodeXmlEntities(text: string): string {
  const predefined: Record<string, string> = {
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"',
    apos: "'",
  };

  return text.replace(
    /&(?:(amp|lt|gt|quot|apos)|#(\d+)|#x([0-9a-fA-F]+));/g,
    (reference: string, name?: string, decimal?: string, hex?: string): string => {
      if (name !== undefined) {
        return predefined[name] ?? reference;
      }
      const codePoint = decimal !== undefined ? parseInt(decimal, 10) : parseInt(hex ?? '', 16);
      // String.fromCodePoint throws on values outside 0..0x10FFFF.
      const isValid = Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10ffff;
      return isValid ? String.fromCodePoint(codePoint) : reference;
    }
  );
}
487
-
/**
 * Extract the text of one Word table cell for use in a markdown pipe table.
 *
 * When the cell contains OMML math (`<m:oMath`), the text of its `<m:t>`
 * elements is emitted first, or the placeholder `[math]` when there is none.
 * The content of the cell's `<w:t>` elements follows. `<m:t>` elements are
 * matched with or without attributes, so runs written as
 * `<m:t xml:space="preserve">` are included.
 *
 * @param cellXml - XML of a single `<w:tc>` element.
 * @returns The trimmed, entity-decoded cell text with `|` escaped as `\|`.
 */
function extractCellText(cellXml: string): string {
  const stripTags = (fragment: string): string => fragment.replace(/<[^>]+>/g, '');
  const parts: string[] = [];

  // NOTE(review): math text is always placed before the regular text,
  // whatever their order inside the cell.
  if (cellXml.includes('<m:oMath')) {
    const mathRuns = cellXml.match(/<m:t\b[^>]*>([^<]*)<\/m:t>/g) ?? [];
    parts.push(mathRuns.length > 0 ? mathRuns.map((run) => stripTags(run)).join('') : '[math]');
  }

  // Regular text from w:t elements; empty runs are skipped.
  for (const run of cellXml.match(/<w:t[^>]*>([^<]*)<\/w:t>/g) ?? []) {
    const runText = stripTags(run);
    if (runText) {
      parts.push(runText);
    }
  }

  const decoded = decodeXmlEntities(parts.join('').trim());

  // An unescaped pipe would be read as a column separator.
  return decoded.replace(/\|/g, '\\|');
}
523
-
/**
 * Split one table row into cell texts, expanding horizontally merged cells.
 *
 * A cell with `<w:gridSpan w:val="n">` yields its text followed by `n - 1`
 * empty cells so later cells stay in their grid columns. A cell carrying
 * `<w:vMerge>` without `w:val="restart"` continues a vertical merge from the
 * row above and yields an empty cell.
 *
 * @param rowXml - XML of a single `<w:tr>` element.
 * @param expectedCols - Grid column count of the table (not used by this function).
 * @returns The expanded cell texts and, in parallel, each entry's span
 *   (0 for entries created by expanding a span).
 */
function parseTableRow(rowXml: string, expectedCols: number): ParsedRow {
  const cells: string[] = [];
  const colSpans: number[] = [];

  // Matches both <w:tc> and <w:tc ...>.
  const cellElements = rowXml.match(/<w:tc(?:\s[^>]*)?>[\s\S]*?<\/w:tc>/g) ?? [];

  for (const cellXml of cellElements) {
    const spanAttr = /<w:gridSpan\s+w:val="(\d+)"/.exec(cellXml);
    const span = spanAttr === null ? 1 : parseInt(spanAttr[1] ?? '1', 10);

    const vMerge = /<w:vMerge(?:\s+w:val="([^"]+)")?/.exec(cellXml);
    const continuesMergeFromAbove = vMerge !== null && vMerge[1] !== 'restart';

    cells.push(continuesMergeFromAbove ? '' : extractCellText(cellXml));
    colSpans.push(span);

    // Placeholders for the grid columns covered by this cell's span.
    for (let covered = 1; covered < span; covered++) {
      cells.push('');
      colSpans.push(0);
    }
  }

  return { cells, colSpans };
}
558
-
/**
 * Work out how many grid columns a table has.
 *
 * @param tableXml - XML of a single `<w:tbl>` element.
 * @returns The number of `<w:gridCol` occurrences when there are any;
 *   otherwise the largest expanded cell count of any row (0 for a table
 *   without rows).
 */
function getTableGridCols(tableXml: string): number {
  const declaredCols = (tableXml.match(/<w:gridCol/g) ?? []).length;
  if (declaredCols > 0) {
    return declaredCols;
  }

  // No grid definition: fall back to the widest row.
  const rowElements = tableXml.match(/<w:tr[\s\S]*?<\/w:tr>/g) ?? [];
  return rowElements.reduce(
    (widest, rowXml) => Math.max(widest, parseTableRow(rowXml, 0).cells.length),
    0
  );
}
578
-
/**
 * Read the tables of a .docx file straight from `word/document.xml` and
 * render each one as a markdown pipe table.
 *
 * @param docxPath - Path to the .docx file.
 * @returns One entry per table that has at least one non-empty row, in
 *   document order. Errors are logged to the console and whatever was
 *   collected up to that point is returned.
 */
export async function extractWordTables(docxPath: string): Promise<WordTable[]> {
  const AdmZip = (await import('adm-zip')).default;
  const tables: WordTable[] = [];

  try {
    const documentEntry = new AdmZip(docxPath).getEntry('word/document.xml');
    if (!documentEntry) {
      return tables;
    }

    const documentXml = documentEntry.getData().toString('utf8');

    for (const tableXml of documentXml.match(/<w:tbl>[\s\S]*?<\/w:tbl>/g) ?? []) {
      const gridCols = getTableGridCols(tableXml);

      // Rows that produce no cells are dropped.
      const rows = (tableXml.match(/<w:tr[\s\S]*?<\/w:tr>/g) ?? [])
        .map((rowXml) => parseTableRow(rowXml, gridCols).cells)
        .filter((cells) => cells.length > 0);

      if (rows.length === 0) continue;

      const markdown = convertRowsToMarkdownTable(rows);
      tables.push({
        markdown,
        rowCount: rows.length,
        colCount: gridCols || rows[0]?.length || 0,
      });
    }
  } catch (err: any) {
    console.error('Error extracting tables from Word:', err.message);
  }

  return tables;
}
626
-
/**
 * Render rows of cell strings as a markdown pipe table.
 *
 * The first row becomes the header. Shorter rows are padded with empty cells
 * to the width of the widest row. The input rows are not modified.
 *
 * @param rows - Table rows, each an array of cell strings.
 * @returns The markdown table, or an empty string when `rows` is empty.
 */
function convertRowsToMarkdownTable(rows: string[][]): string {
  if (rows.length === 0) return '';

  // A reduce avoids spreading a potentially very large array into Math.max.
  const colCount = rows.reduce((widest, row) => Math.max(widest, row.length), 0);

  // Pad copies of the rows so the caller's arrays stay untouched.
  const padded = rows.map((row) => [
    ...row,
    ...new Array<string>(colCount - row.length).fill(''),
  ]);

  const renderRow = (cells: string[]): string => '| ' + cells.join(' | ') + ' |';
  const separator = '|' + new Array<string>(colCount).fill('---').join('|') + '|';

  const [header, ...body] = padded;
  return [renderRow(header ?? []), separator, ...body.map(renderRow)].join('\n');
}
661
-
/**
 * Extract body text, comments, comment anchors, tables and media from a Word
 * document.
 *
 * The body text is produced by pandoc with `--track-changes=all`; tracked
 * insertions and deletions are rewritten as CriticMarkup (`{++…++}` /
 * `{--…--}`) and pandoc's comment and `.mark` spans are removed from the
 * text. If running pandoc fails, the built-in XML extractor from `./word.js`
 * is used instead and a warning message is added.
 *
 * Pandoc is started without a shell and receives the paths as separate
 * arguments, so quotes or shell metacharacters in a path are harmless.
 *
 * @param docxPath - Path to the .docx file.
 * @param options - Media extraction settings (see `ExtractFromWordOptions`).
 * @returns The extracted text together with comments, anchors, tables,
 *   extracted media paths, diagnostics and track-change statistics.
 */
export async function extractFromWord(
  docxPath: string,
  options: ExtractFromWordOptions = {}
): Promise<ExtractFromWordResult> {
  let text: string;
  const messages: ExtractMessage[] = [];
  let extractedMedia: string[] = [];
  let hasTrackChanges = false;
  let trackChangeStats = { insertions: 0, deletions: 0 };

  // Media goes to the requested directory, or `media` next to the document.
  const mediaDir = options.mediaDir || path.join(path.dirname(docxPath), 'media');

  // Callers skip media extraction e.g. when re-importing with existing figures.
  const skipMediaExtraction = options.skipMediaExtraction || false;

  // Tables come straight from the Word XML.
  const wordTables = await extractWordTables(docxPath);

  try {
    const { execFile } = await import('child_process');
    const execFileAsync = promisify(execFile);

    const pandocArgs = [docxPath, '-t', 'markdown', '--wrap=none', '--track-changes=all'];
    if (!skipMediaExtraction) {
      pandocArgs.push(`--extract-media=${mediaDir}`);
    }

    const { stdout } = await execFileAsync('pandoc', pandocArgs, {
      maxBuffer: 50 * 1024 * 1024,
    });
    text = stdout;

    // Rewrite every pandoc span matched by `pattern` as CriticMarkup and count
    // it; spans whose content is blank are removed without being counted.
    const convertSpans = (
      source: string,
      pattern: RegExp,
      kind: 'insertions' | 'deletions'
    ): string => {
      const [open, close] = kind === 'insertions' ? ['{++', '++}'] : ['{--', '--}'];
      return source.replace(pattern, (_span: string, content: string) => {
        if (!content.trim()) return '';
        trackChangeStats[kind]++;
        return `${open}${content}${close}`;
      });
    };

    // First pass: patterns that tolerate one level of nested [...] content.
    text = convertSpans(
      text,
      /\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]\{\.insertion[^}]*\}/g,
      'insertions'
    );
    text = convertSpans(
      text,
      /\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]\{\.deletion[^}]*\}/g,
      'deletions'
    );

    // Second pass: simpler patterns, repeated until the text stops changing.
    let prevText;
    do {
      prevText = text;
      text = convertSpans(text, /\[([^\]]*)\]\{\.insertion[^}]*\}/g, 'insertions');
      text = convertSpans(text, /\[([^\]]*)\]\{\.deletion[^}]*\}/g, 'deletions');
    } while (text !== prevText);

    // Comments are extracted separately; drop pandoc's inline comment spans.
    text = text.replace(/\[[^\]]*\]\{\.comment-start[^}]*\}/g, '');
    text = text.replace(/\[\]\{\.comment-end[^}]*\}/g, '');

    // Unwrap {.mark} spans, keeping their content.
    text = text.replace(/\[([^\]]*)\]\{\.mark\}/g, '$1');

    hasTrackChanges = trackChangeStats.insertions > 0 || trackChangeStats.deletions > 0;

    if (hasTrackChanges) {
      messages.push({
        type: 'info',
        message: `Found ${trackChangeStats.insertions} insertion(s) and ${trackChangeStats.deletions} deletion(s) from track changes`
      });
    }

    // Image files are looked up in the `media` subdirectory of the target.
    const mediaSubdir = path.join(mediaDir, 'media');
    if (fs.existsSync(mediaSubdir)) {
      extractedMedia = fs.readdirSync(mediaSubdir)
        .filter((f) => /\.(png|jpg|jpeg|gif|svg|emf|wmf|tiff?)$/i.test(f))
        .map((f) => path.join(mediaSubdir, f));

      if (extractedMedia.length > 0) {
        messages.push({
          type: 'info',
          message: `Extracted ${extractedMedia.length} image(s) to ${mediaSubdir}`
        });
      }
    }
  } catch (pandocErr: any) {
    // Any failure above lands here and is reported as "Pandoc not installed".
    // NOTE(review): a pandoc run that fails for another reason gets the same
    // message — consider distinguishing the cases.
    const { extractPlainTextWithTrackChanges } = await import('./word.js');
    const { getInstallInstructions } = await import('./dependencies.js');
    const installCmd = getInstallInstructions('pandoc');

    const xmlResult = await extractPlainTextWithTrackChanges(docxPath);
    text = xmlResult.text;
    hasTrackChanges = xmlResult.hasTrackChanges;
    trackChangeStats = xmlResult.stats;

    if (hasTrackChanges) {
      messages.push({
        type: 'warning',
        message: `Pandoc not installed. Using built-in XML extractor (${trackChangeStats.insertions} insertions, ${trackChangeStats.deletions} deletions preserved). Formatting may differ. Install pandoc for best results: ${installCmd}`
      });
    } else {
      messages.push({
        type: 'warning',
        message: `Pandoc not installed. Using built-in XML extractor (no track changes found). Install pandoc for better formatting: ${installCmd}`
      });
    }
  }

  // Comments and their anchors are read directly from the docx XML.
  const comments = await extractWordComments(docxPath);
  const { anchors } = await extractCommentAnchors(docxPath);

  return {
    text,
    comments,
    anchors,
    messages,
    extractedMedia,
    tables: wordTables,
    hasTrackChanges,
    trackChangeStats,
  };
}
1
+ /**
2
+ * Word document data extraction - raw extraction from .docx files
3
+ */
4
+
5
+ import * as fs from 'fs';
6
+ import * as path from 'path';
7
+ import { exec } from 'child_process';
8
+ import { promisify } from 'util';
9
+
10
+ const execAsync = promisify(exec);
11
+
12
+ // ============================================
13
+ // Type Definitions
14
+ // ============================================
15
+
16
/** A single comment read from a Word document's `word/comments.xml`. */
export interface WordComment {
  /** Value of the comment's `w:id` attribute (empty string when absent). */
  id: string;
  /** Value of `w:author`; the extractor falls back to "Unknown". */
  author: string;
  /** First ten characters of `w:date` (empty string when absent). */
  date: string;
  /** Trimmed, concatenated text of the comment's runs. */
  text: string;
  /**
   * Parent comment id when this is a reply in a Word comment thread.
   * Resolved from `commentsExtended.xml`'s `w15:paraIdParent` field.
   * `undefined` for top-level comments.
   */
  parentId?: string;
}

/** One `<w:t>` element, located both in the raw XML and in the flattened text. */
export interface TextNode {
  /** Offset of the element's opening `<` in the XML string. */
  xmlStart: number;
  /** Offset just past the element's closing tag in the XML string. */
  xmlEnd: number;
  /** Offset of the node's first character in the concatenated document text. */
  textStart: number;
  /** Offset just past the node's last character in the concatenated document text. */
  textEnd: number;
  /** Entity-decoded text content of the node. */
  text: string;
}

/** Anchor text and surrounding context for one comment range. */
export interface CommentAnchorData {
  /** Trimmed text covered by the comment range (may be empty). */
  anchor: string;
  /** Context text preceding the range. */
  before: string;
  /** Context text following the range. */
  after: string;
  /** Offset of the range start in the concatenated document text. */
  docPosition: number;
  /** Total length of the concatenated document text. */
  docLength: number;
  /** True when the range covers no non-whitespace text. */
  isEmpty: boolean;
}

/** Result of scanning a document for comment anchors. */
export interface CommentAnchorsResult {
  /** Comment id -> anchor data. */
  anchors: Map<string, CommentAnchorData>;
  /** Concatenation of every `<w:t>` text node in the document. */
  fullDocText: string;
}

/** A heading paragraph found in the document body. */
export interface DocxHeading {
  /** Heading style name from `<w:pStyle>`, e.g. "Heading1" */
  style: string;
  /** Heading depth: 1, 2, 3, ... (parsed from style name; 0 if unknown) */
  level: number;
  /** Concatenated text content of the heading paragraph */
  text: string;
  /** Position in fullDocText (same coordinate system as CommentAnchorData.docPosition) */
  docPosition: number;
}

/** A Word table rendered as a markdown pipe table. */
export interface WordTable {
  /** The pipe-table markdown. */
  markdown: string;
  /** Number of rows that produced at least one cell. */
  rowCount: number;
  /** Number of grid columns. */
  colCount: number;
}

/** Cells of one table row after expanding horizontal merges. */
export interface ParsedRow {
  /** Cell texts; merged columns are represented by empty placeholder cells. */
  cells: string[];
  /** Span per entry of `cells`; 0 marks a placeholder created by a span. */
  colSpans: number[];
}

/** Options accepted by `extractFromWord`. */
export interface ExtractFromWordOptions {
  /** Directory handed to pandoc's `--extract-media`. */
  mediaDir?: string;
  /** When true, pandoc is not asked to extract media. */
  skipMediaExtraction?: boolean;
}

/** Informational or warning message produced during extraction. */
export interface ExtractMessage {
  type: 'info' | 'warning';
  message: string;
}

/** Everything `extractFromWord` returns for one document. */
export interface ExtractFromWordResult {
  /** Document body as markdown/plain text with CriticMarkup track changes. */
  text: string;
  /** Comments read from the docx. */
  comments: WordComment[];
  /** Comment id -> anchor data. */
  anchors: Map<string, CommentAnchorData>;
  /** Messages collected while extracting. */
  messages: ExtractMessage[];
  /** Paths of media files found after extraction. */
  extractedMedia: string[];
  /** Tables read directly from the document XML. */
  tables: WordTable[];
  /** True when at least one insertion or deletion was found. */
  hasTrackChanges: boolean;
  /** Counts of converted insertions and deletions. */
  trackChangeStats: { insertions: number; deletions: number };
}
93
+
94
+ // ============================================
95
+ // Functions
96
+ // ============================================
97
+
98
/**
 * Extract comments directly from a Word document's `word/comments.xml`.
 *
 * Each comment's runs are concatenated (paragraphs are joined without a
 * separator) and trimmed. When `word/commentsExtended.xml` is present, reply
 * comments additionally get `parentId` set to the id of the comment they
 * reply to.
 *
 * @param docxPath - Path to the .docx file.
 * @returns The comments in document order; an empty array when the document
 *   has no `comments.xml` or it contains no comments.
 * @throws Error when the file does not exist, is not a readable zip archive,
 *   or the comments XML cannot be read or parsed.
 */
export async function extractWordComments(docxPath: string): Promise<WordComment[]> {
  const AdmZip = (await import('adm-zip')).default;
  const { parseStringPromise } = await import('xml2js');

  const comments: WordComment[] = [];

  // Validate file exists
  if (!fs.existsSync(docxPath)) {
    throw new Error(`File not found: ${docxPath}`);
  }

  try {
    let zip;
    try {
      zip = new AdmZip(docxPath);
    } catch (err: any) {
      throw new Error(`Invalid Word document (not a valid .docx file): ${err.message}`);
    }

    const commentsEntry = zip.getEntry('word/comments.xml');
    if (!commentsEntry) {
      return comments;
    }

    let commentsXml;
    try {
      commentsXml = commentsEntry.getData().toString('utf8');
    } catch (err: any) {
      throw new Error(`Failed to read comments from document: ${err.message}`);
    }

    const parsed = await parseStringPromise(commentsXml, { explicitArray: false });

    const commentsRoot = parsed['w:comments'];
    if (!commentsRoot || !commentsRoot['w:comment']) {
      return comments;
    }

    // With explicitArray: false a single child is an object, several are an array.
    const commentNodes = Array.isArray(commentsRoot['w:comment'])
      ? commentsRoot['w:comment']
      : [commentsRoot['w:comment']];

    // Map every paraId that lives inside a comment back to that comment's id.
    // Word's commentsExtended.xml expresses threading via w15:paraIdParent,
    // which references the parent's first <w:p>. Replies use a secondary
    // (often-empty) <w:p>, so each comment may contribute multiple paraIds.
    const paraIdToCommentId = new Map<string, string>();

    for (const comment of commentNodes) {
      const id = comment.$?.['w:id'] || '';
      const author = comment.$?.['w:author'] || 'Unknown';
      const date = comment.$?.['w:date'] || '';

      let text = '';

      // A parsed <w:t> can be a plain string, an object ({ _: text, $: attrs }),
      // an attribute-only object with no text at all (e.g. a whitespace-only
      // run carrying xml:space), or an array of those. Only real text is
      // appended, so objects never leak in as "[object Object]" and arrays
      // are not comma-joined.
      const appendRunText = (t: unknown): void => {
        if (t == null) return;
        if (typeof t === 'string') {
          text += t;
          return;
        }
        if (Array.isArray(t)) {
          t.forEach(appendRunText);
          return;
        }
        if (typeof t === 'object') {
          const inner = (t as { _?: unknown })._;
          if (typeof inner === 'string') text += inner;
        }
      };

      // Walk nested w:p / w:r / w:t elements, collecting text and paraIds.
      const extractText = (node: any): void => {
        if (!node) return;
        if (typeof node === 'string') {
          text += node;
          return;
        }
        if (node['w:t']) {
          appendRunText(node['w:t']);
        }
        if (node['w:r']) {
          const runs = Array.isArray(node['w:r']) ? node['w:r'] : [node['w:r']];
          runs.forEach(extractText);
        }
        if (node['w:p']) {
          const paras = Array.isArray(node['w:p']) ? node['w:p'] : [node['w:p']];
          for (const para of paras) {
            const paraId = para?.$?.['w14:paraId'];
            if (paraId && id) paraIdToCommentId.set(paraId, id);
            extractText(para);
          }
        }
      };
      extractText(comment);

      comments.push({ id, author, date: date.slice(0, 10), text: text.trim() });
    }

    // Resolve parent links from commentsExtended.xml. Missing entry just
    // means the docx has no threading metadata (e.g. legacy/non-Word source).
    const extendedEntry = zip.getEntry('word/commentsExtended.xml');
    if (extendedEntry && paraIdToCommentId.size > 0) {
      let extendedXml = '';
      try {
        extendedXml = extendedEntry.getData().toString('utf8');
      } catch {
        // Unreadable threading metadata is non-fatal; skip parent linking.
      }
      if (extendedXml) {
        const parentByCommentId = new Map<string, string>();
        const exPattern = /<w15:commentEx\b([^>]*?)\/>/g;
        let m: RegExpExecArray | null;
        while ((m = exPattern.exec(extendedXml)) !== null) {
          const attrs = m[1] ?? '';
          const paraIdMatch = attrs.match(/w15:paraId="([^"]+)"/);
          const parentMatch = attrs.match(/w15:paraIdParent="([^"]+)"/);
          if (!paraIdMatch || !parentMatch) continue;
          const childCommentId = paraIdToCommentId.get(paraIdMatch[1]);
          const parentCommentId = paraIdToCommentId.get(parentMatch[1]);
          // A comment's own secondary paragraph must not make it its own parent.
          if (childCommentId && parentCommentId && childCommentId !== parentCommentId) {
            parentByCommentId.set(childCommentId, parentCommentId);
          }
        }
        for (const c of comments) {
          const parent = parentByCommentId.get(c.id);
          if (parent) c.parentId = parent;
        }
      }
    }
  } catch (err: any) {
    // Tolerate non-Error throwables instead of crashing on `err.message`.
    const message: string = err instanceof Error ? err.message : String(err);
    // Errors we created above already carry enough context; pass them through.
    if (message.includes('Invalid Word document') || message.includes('File not found')) {
      throw err;
    }
    throw new Error(`Error extracting comments from ${path.basename(docxPath)}: ${message}`);
  }

  return comments;
}
227
+
228
/**
 * Extract comment anchor texts from `word/document.xml` with surrounding context.
 *
 * Returns a map of comment id -> { anchor, before, after, docPosition,
 * docLength, isEmpty } plus `fullDocText`, the concatenation of every `<w:t>`
 * node, which defines the coordinate system of `docPosition`.
 *
 * Anchor text includes `<w:delText>` (tracked deletions) found inside the
 * range, but `fullDocText` does not. The `after` context is therefore located
 * by mapping the end marker's XML offset into text coordinates rather than by
 * adding the anchor length to the start position.
 *
 * Errors are logged and yield whatever anchors were collected so far together
 * with an empty `fullDocText`; this function does not throw.
 *
 * @param docxPath - Path to the .docx file.
 */
export async function extractCommentAnchors(docxPath: string): Promise<CommentAnchorsResult> {
  const AdmZip = (await import('adm-zip')).default;
  const anchors = new Map<string, CommentAnchorData>();
  let fullDocText = '';

  try {
    const zip = new AdmZip(docxPath);
    const docEntry = zip.getEntry('word/document.xml');

    if (!docEntry) {
      return { anchors, fullDocText };
    }

    const docXml = docEntry.getData().toString('utf8');

    // ========================================
    // STEP 1: Build text position mapping
    // ========================================
    const textNodePattern = /<w:t[^>]*>([^<]*)<\/w:t>/g;
    const textNodes: TextNode[] = [];
    let textPosition = 0;
    let nodeMatch: RegExpExecArray | null;

    while ((nodeMatch = textNodePattern.exec(docXml)) !== null) {
      const decodedText = decodeXmlEntities(nodeMatch[1] ?? '');
      textNodes.push({
        xmlStart: nodeMatch.index,
        xmlEnd: nodeMatch.index + nodeMatch[0].length,
        textStart: textPosition,
        textEnd: textPosition + decodedText.length,
        text: decodedText,
      });
      textPosition += decodedText.length;
    }

    fullDocText = textNodes.map((n) => n.text).join('');

    // Map an XML offset to the text offset of the text node containing it,
    // or of the next text node after it; past the last node -> end of text.
    const xmlPosToTextPos = (xmlPos: number): number => {
      for (const node of textNodes) {
        if (xmlPos < node.xmlEnd) {
          return node.textStart;
        }
      }
      const lastNode = textNodes[textNodes.length - 1];
      return lastNode ? lastNode.textEnd : 0;
    };

    // Context before a position: from the last sentence start inside the
    // window when one is found, otherwise the final 80 characters.
    const getContextBefore = (position: number, maxLength: number = 150): string => {
      const beforeText = fullDocText.slice(Math.max(0, position - maxLength), position);
      const sentenceStart = beforeText.search(/[.!?]\s+[A-Z][^.!?]*$/);
      return sentenceStart >= 0
        ? beforeText.slice(sentenceStart + 2).trim()
        : beforeText.slice(-80).trim();
    };

    // Context after a position: up to the first sentence end inside the
    // window when one is found, otherwise the first 80 characters.
    const getContextAfter = (position: number, maxLength: number = 150): string => {
      const afterText = fullDocText.slice(position, position + maxLength);
      const sentenceEnd = afterText.search(/[.!?]\s/);
      return sentenceEnd >= 0
        ? afterText.slice(0, sentenceEnd + 1).trim()
        : afterText.slice(0, 80).trim();
    };

    // ========================================
    // STEP 2: Collect all start/end markers separately
    // ========================================
    const startPattern = /<w:commentRangeStart[^>]*w:id="(\d+)"[^>]*\/?>/g;
    const endPattern = /<w:commentRangeEnd[^>]*w:id="(\d+)"[^>]*\/?>/g;

    const starts = new Map<string, number>(); // id -> position after start tag
    const ends = new Map<string, number>(); // id -> position before end tag

    let match: RegExpExecArray | null;
    while ((match = startPattern.exec(docXml)) !== null) {
      const id = match[1];
      // Only the first marker for a given id is kept.
      if (id !== undefined && !starts.has(id)) {
        starts.set(id, match.index + match[0].length);
      }
    }

    while ((match = endPattern.exec(docXml)) !== null) {
      const id = match[1];
      if (id !== undefined && !ends.has(id)) {
        ends.set(id, match.index);
      }
    }

    // ========================================
    // STEP 3: Process each comment range by ID
    // ========================================
    for (const [id, startXmlPos] of starts) {
      const endXmlPos = ends.get(id);

      // Missing end marker - skip with warning
      if (endXmlPos === undefined) {
        console.warn(`Comment ${id}: missing end marker`);
        continue;
      }

      const docPosition = xmlPosToTextPos(startXmlPos);

      // Handle empty or inverted ranges
      if (endXmlPos <= startXmlPos) {
        anchors.set(id, {
          anchor: '',
          before: getContextBefore(docPosition),
          after: getContextAfter(docPosition),
          docPosition,
          docLength: fullDocText.length,
          isEmpty: true,
        });
        continue;
      }

      // Extract XML segment between markers
      const segment = docXml.slice(startXmlPos, endXmlPos);

      // Extract text from w:t (regular) AND w:delText (deleted text in track changes)
      const textInRangePattern = /<w:t[^>]*>([^<]*)<\/w:t>|<w:delText[^>]*>([^<]*)<\/w:delText>/g;
      let anchorText = '';
      let tm: RegExpExecArray | null;
      while ((tm = textInRangePattern.exec(segment)) !== null) {
        anchorText += tm[1] || tm[2] || '';
      }
      anchorText = decodeXmlEntities(anchorText);

      // The range end in fullDocText coordinates. Deleted text is part of the
      // anchor but not of fullDocText, so `docPosition + anchorText.length`
      // would overshoot; mapping the end marker's offset is exact and equals
      // the old value whenever the range holds no deletions.
      const rangeEndPosition = xmlPosToTextPos(endXmlPos);
      const before = getContextBefore(docPosition);
      const after = getContextAfter(rangeEndPosition);

      // ALWAYS add entry (even if anchor is empty)
      const trimmedAnchor = anchorText.trim();
      anchors.set(id, {
        anchor: trimmedAnchor,
        before,
        after,
        docPosition,
        docLength: fullDocText.length,
        isEmpty: !trimmedAnchor,
      });
    }
  } catch (err: any) {
    console.error('Error extracting comment anchors:', err.message);
    return { anchors, fullDocText: '' };
  }

  return { anchors, fullDocText };
}
391
+
392
/**
 * Collect the heading paragraphs of a docx together with their positions in
 * the coordinate system used by `extractCommentAnchors` (`fullDocText` /
 * `CommentAnchorData.docPosition`).
 *
 * A paragraph counts as a heading when its `<w:pStyle>` value contains
 * "heading" (case-insensitive). Reading styles is more reliable than
 * keyword-matching the concatenated body text, where paragraph boundaries are
 * gone: a word such as "Methods" can appear inside prose, and a
 * structured-abstract label like "Methods:" can lose its colon when text runs
 * are concatenated.
 *
 * @param docxPath - Path to the .docx file.
 * @returns Headings in document order; empty when `word/document.xml` is missing.
 * @throws Error when the file does not exist.
 */
export async function extractHeadings(docxPath: string): Promise<DocxHeading[]> {
  const { default: AdmZip } = await import('adm-zip');

  if (!fs.existsSync(docxPath)) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const documentEntry = new AdmZip(docxPath).getEntry('word/document.xml');
  if (!documentEntry) return [];
  const xml = documentEntry.getData().toString('utf8');

  // Same xml-offset -> text-offset mapping as extractCommentAnchors: every
  // <w:t> node contributes its decoded length to the running text offset.
  const spans: Array<{ xmlEnd: number; textStart: number }> = [];
  let runningLength = 0;
  for (const hit of xml.matchAll(/<w:t[^>]*>([^<]*)<\/w:t>/g)) {
    const xmlStart = hit.index ?? 0;
    spans.push({ xmlEnd: xmlStart + hit[0].length, textStart: runningLength });
    runningLength += decodeXmlEntities(hit[1] ?? '').length;
  }

  // An offset inside or before a node maps to that node's start; an offset
  // past every node maps to the end of the text (0 when there are no nodes).
  const toTextPos = (xmlPos: number): number => {
    const owner = spans.find((span) => xmlPos < span.xmlEnd);
    return owner ? owner.textStart : runningLength;
  };

  const headings: DocxHeading[] = [];
  for (const paragraph of xml.matchAll(/<w:p\b[^>]*>([\s\S]*?)<\/w:p>/g)) {
    const body = paragraph[1] ?? '';

    const style = /<w:pStyle[^>]*w:val="([^"]+)"/.exec(body)?.[1];
    if (style === undefined || !/heading/i.test(style)) continue;

    // w:delText is included so a heading inside a tracked deletion is still
    // surfaced (useful when verifying anchors against an original draft).
    let raw = '';
    const runPattern = /<w:t[^>]*>([^<]*)<\/w:t>|<w:delText[^>]*>([^<]*)<\/w:delText>/g;
    for (const run of body.matchAll(runPattern)) {
      raw += decodeXmlEntities(run[1] || run[2] || '');
    }
    const text = raw.trim();
    if (text === '') continue;

    // First run of digits in the style name is the depth; 0 when there is none.
    const digits = /(\d+)/.exec(style);
    headings.push({
      style,
      level: digits ? parseInt(digits[1], 10) : 0,
      text,
      docPosition: toTextPos(paragraph.index ?? 0),
    });
  }

  return headings;
}
473
+
474
/**
 * Decode XML entities in text.
 *
 * Handles the five predefined entities (`&amp;`, `&lt;`, `&gt;`, `&quot;`,
 * `&apos;`) plus decimal (`&#65;`) and hexadecimal (`&#x41;`) character
 * references.
 *
 * Decoding is done in a single pass so already-decoded output is never
 * decoded again (`&amp;lt;` yields the literal text `&lt;`, not `<`).
 * Character references are decoded as full Unicode code points, so
 * characters outside the Basic Multilingual Plane survive intact. References
 * beyond U+10FFFF are not valid code points and are left untouched.
 *
 * @param text - Raw XML character data.
 * @returns The decoded text.
 */
function decodeXmlEntities(text: string): string {
  const named: Record<string, string> = {
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"',
    apos: "'",
  };

  return text.replace(
    /&(?:#(\d+)|#x([0-9a-fA-F]+)|(amp|lt|gt|quot|apos));/g,
    (entity: string, dec?: string, hex?: string, name?: string): string => {
      if (name !== undefined) {
        return named[name] ?? entity;
      }
      const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? '', 16);
      // String.fromCodePoint throws a RangeError outside the Unicode range.
      if (!Number.isFinite(codePoint) || codePoint > 0x10ffff) {
        return entity;
      }
      return String.fromCodePoint(codePoint);
    }
  );
}
487
+
488
/**
 * Produce the text of one Word table cell, ready to drop into a pipe table.
 *
 * When the cell contains OMML math, the text of its attribute-less `<m:t>`
 * elements comes first (or the placeholder `[math]` when there are none),
 * followed by the text of every `<w:t>` element. The result is trimmed,
 * entity-decoded, and has `|` escaped so it cannot break the table.
 *
 * @param cellXml - Raw XML of a single `<w:tc>` element.
 */
function extractCellText(cellXml: string): string {
  const stripTags = (fragment: string): string => fragment.replace(/<[^>]+>/g, '');

  let combined = '';

  // OMML math: prefer its textual representation, else a placeholder.
  if (cellXml.includes('<m:oMath')) {
    const mathRuns = cellXml.match(/<m:t>([^<]*)<\/m:t>/g);
    combined +=
      mathRuns !== null && mathRuns.length > 0
        ? mathRuns.map((run) => stripTags(run)).join('')
        : '[math]';
  }

  // Regular text runs.
  for (const run of cellXml.match(/<w:t[^>]*>([^<]*)<\/w:t>/g) ?? []) {
    combined += stripTags(run);
  }

  const decoded = decodeXmlEntities(combined.trim());

  // A literal pipe would be read as a column separator.
  return decoded.replace(/\|/g, '\\|');
}
523
+
524
/**
 * Split one `<w:tr>` into cell texts, expanding horizontal merges.
 *
 * A cell with `gridSpan` N yields its text followed by N-1 empty placeholder
 * cells (span 0 in `colSpans`) so columns stay aligned. A cell carrying
 * `<w:vMerge>` whose value is anything other than "restart" (including no
 * value) continues a vertical merge and is emitted as an empty cell.
 *
 * @param rowXml - Raw XML of the row.
 * @param expectedCols - Grid column count of the table (currently not used
 *   by the parsing logic).
 */
function parseTableRow(rowXml: string, expectedCols: number): ParsedRow {
  const cells: string[] = [];
  const colSpans: number[] = [];

  // Matches both <w:tc> and <w:tc ...>, but not <w:tcPr> and friends.
  const cellPattern = /<w:tc(?:\s[^>]*)?>[\s\S]*?<\/w:tc>/g;

  for (const [cellXml] of rowXml.matchAll(cellPattern)) {
    const spanAttr = /<w:gridSpan\s+w:val="(\d+)"/.exec(cellXml);
    const span = spanAttr === null ? 1 : parseInt(spanAttr[1], 10);

    const vMerge = /<w:vMerge(?:\s+w:val="([^"]+)")?/.exec(cellXml);
    const continuesMerge = vMerge !== null && vMerge[1] !== 'restart';

    cells.push(continuesMerge ? '' : extractCellText(cellXml));
    colSpans.push(span);

    // Placeholders for the extra grid columns this cell covers.
    for (let remaining = span - 1; remaining > 0; remaining--) {
      cells.push('');
      colSpans.push(0); // 0 indicates this is a spanned cell
    }
  }

  return { cells, colSpans };
}
558
+
559
/**
 * Work out how many grid columns a table has.
 *
 * The number of `<w:gridCol` occurrences wins when there are any; otherwise
 * the widest parsed row (after span expansion) decides.
 *
 * @param tableXml - Raw XML of a `<w:tbl>` element.
 */
function getTableGridCols(tableXml: string): number {
  const declaredCols = (tableXml.match(/<w:gridCol/g) ?? []).length;
  if (declaredCols > 0) {
    return declaredCols;
  }

  // No grid definition: fall back to the widest row.
  const rowXmls = tableXml.match(/<w:tr[\s\S]*?<\/w:tr>/g) ?? [];
  return rowXmls.reduce(
    (widest, rowXml) => Math.max(widest, parseTableRow(rowXml, 0).cells.length),
    0
  );
}
578
+
579
/**
 * Read every `<w:tbl>` from `word/document.xml` and render it as a markdown
 * pipe table.
 *
 * Failures are logged with `console.error` and the tables collected so far
 * are returned; this function does not throw for unreadable documents.
 *
 * @param docxPath - Path to the .docx file.
 * @returns One entry per table that produced at least one non-empty row.
 */
export async function extractWordTables(docxPath: string): Promise<WordTable[]> {
  const { default: AdmZip } = await import('adm-zip');
  const tables: WordTable[] = [];

  try {
    const docEntry = new AdmZip(docxPath).getEntry('word/document.xml');
    if (!docEntry) {
      return tables;
    }

    const xml = docEntry.getData().toString('utf8');

    for (const tableXml of xml.match(/<w:tbl>[\s\S]*?<\/w:tbl>/g) ?? []) {
      // Column count comes from the table grid when one is declared.
      const expectedCols = getTableGridCols(tableXml);

      const rows: string[][] = (tableXml.match(/<w:tr[\s\S]*?<\/w:tr>/g) ?? [])
        .map((rowXml) => parseTableRow(rowXml, expectedCols).cells)
        .filter((cells) => cells.length > 0);

      if (rows.length === 0) continue;

      // The markdown is built first; colCount is evaluated afterwards.
      tables.push({
        markdown: convertRowsToMarkdownTable(rows),
        rowCount: rows.length,
        colCount: expectedCols || rows[0]?.length || 0,
      });
    }
  } catch (err: any) {
    console.error('Error extracting tables from Word:', err.message);
  }

  return tables;
}
626
+
627
/**
 * Convert an array of rows (each row an array of cell strings) to a markdown
 * pipe table.
 *
 * The first row becomes the header. Shorter rows are padded with empty cells
 * up to the widest row. The input arrays are not modified: padding is applied
 * to copies, so callers can keep using `rows` afterwards.
 *
 * @param rows - Table rows; cell text must already be pipe-escaped.
 * @returns The table as newline-joined lines, or '' when `rows` is empty.
 */
function convertRowsToMarkdownTable(rows: string[][]): string {
  if (rows.length === 0) return '';

  // reduce instead of Math.max(...spread): no argument-count limit on huge tables.
  const colCount = rows.reduce((widest, row) => Math.max(widest, row.length), 0);

  const formatRow = (row: readonly string[]): string => {
    const padding = new Array<string>(colCount - row.length).fill('');
    return '| ' + [...row, ...padding].join(' | ') + ' |';
  };

  const separator = '|' + Array.from({ length: colCount }, () => '---').join('|') + '|';

  const lines = rows.map(formatRow);
  // The separator sits between the header (first row) and the data rows.
  lines.splice(1, 0, separator);

  return lines.join('\n');
}
661
+
662
/**
 * Extract text, comments, comment anchors, tables and media from a Word
 * document.
 *
 * Pandoc is tried first with `--track-changes=all`; its insertion/deletion
 * spans are rewritten to CriticMarkup (`{++…++}` / `{--…--}`), and comment
 * and `.mark` spans are stripped from the body. If running pandoc fails for
 * any reason, the built-in XML extractor from `./word.js` is used instead and
 * a warning is added to `messages`.
 *
 * Pandoc is started with an argument array (no shell), so paths containing
 * quotes, `$`, backticks or other shell metacharacters are passed through
 * verbatim.
 *
 * @param docxPath - Path to the .docx file.
 * @param options - `mediaDir` overrides the media extraction directory
 *   (default: `<docx dir>/media`); `skipMediaExtraction` omits
 *   `--extract-media`.
 * @returns Body text, comments, anchors, messages, extracted media paths,
 *   tables and track-change statistics.
 * @throws Whatever `extractWordComments` throws (e.g. file not found).
 */
export async function extractFromWord(
  docxPath: string,
  options: ExtractFromWordOptions = {}
): Promise<ExtractFromWordResult> {
  let text: string;
  const messages: ExtractMessage[] = [];
  let extractedMedia: string[] = [];
  let hasTrackChanges = false;
  let trackChangeStats = { insertions: 0, deletions: 0 };

  // Determine media extraction directory
  const docxDir = path.dirname(docxPath);
  const mediaDir = options.mediaDir || path.join(docxDir, 'media');

  // Skip media extraction if figures already exist (e.g., when re-importing with existing source)
  const skipMediaExtraction = options.skipMediaExtraction || false;

  // Extract tables directly from Word XML (reliable, no heuristics)
  const wordTables = await extractWordTables(docxPath);

  // Try pandoc first with --track-changes=all to preserve reviewer edits
  try {
    const { execFile } = await import('child_process');

    // Argument array instead of a shell string: nothing in the paths is
    // interpreted by a shell.
    const pandocArgs = [docxPath, '-t', 'markdown', '--wrap=none', '--track-changes=all'];
    if (!skipMediaExtraction) {
      pandocArgs.push(`--extract-media=${mediaDir}`);
    }

    const { stdout } = await promisify(execFile)('pandoc', pandocArgs, {
      maxBuffer: 50 * 1024 * 1024,
    });
    text = stdout;

    // Convert pandoc's track change format to CriticMarkup.
    // First pass: pattern that tolerates one level of nested [...] in the content.
    text = text.replace(/\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]\{\.insertion[^}]*\}/g, (_match, content) => {
      if (content.trim()) {
        trackChangeStats.insertions++;
        return `{++${content}++}`;
      }
      return ''; // Empty insertions are removed
    });

    text = text.replace(/\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]\{\.deletion[^}]*\}/g, (_match, content) => {
      if (content.trim()) {
        trackChangeStats.deletions++;
        return `{--${content}--}`;
      }
      return ''; // Empty deletions are removed
    });

    // Handle any remaining pandoc track change patterns; repeat until the
    // text stops changing so spans exposed by an earlier pass are converted too.
    let prevText;
    do {
      prevText = text;
      text = text.replace(/\[([^\]]*)\]\{\.insertion[^}]*\}/g, (_match, content) => {
        if (content.trim()) {
          trackChangeStats.insertions++;
          return `{++${content}++}`;
        }
        return '';
      });
      text = text.replace(/\[([^\]]*)\]\{\.deletion[^}]*\}/g, (_match, content) => {
        if (content.trim()) {
          trackChangeStats.deletions++;
          return `{--${content}--}`;
        }
        return '';
      });
    } while (text !== prevText);

    // Handle pandoc comment patterns - remove comment text from body
    text = text.replace(/\[[^\]]*\]\{\.comment-start[^}]*\}/g, '');
    text = text.replace(/\[\]\{\.comment-end[^}]*\}/g, '');

    // Also handle {.mark} spans (keep the text, drop the wrapper)
    text = text.replace(/\[([^\]]*)\]\{\.mark\}/g, '$1');

    hasTrackChanges = trackChangeStats.insertions > 0 || trackChangeStats.deletions > 0;

    if (hasTrackChanges) {
      messages.push({
        type: 'info',
        message: `Found ${trackChangeStats.insertions} insertion(s) and ${trackChangeStats.deletions} deletion(s) from track changes`
      });
    }

    // Find extracted media files in the `media` subdirectory of mediaDir
    const mediaSubdir = path.join(mediaDir, 'media');
    if (fs.existsSync(mediaSubdir)) {
      extractedMedia = fs.readdirSync(mediaSubdir)
        .filter(f => /\.(png|jpg|jpeg|gif|svg|emf|wmf|tiff?)$/i.test(f))
        .map(f => path.join(mediaSubdir, f));

      if (extractedMedia.length > 0) {
        messages.push({
          type: 'info',
          message: `Extracted ${extractedMedia.length} image(s) to ${mediaSubdir}`
        });
      }
    }
  } catch {
    // Pandoc not available — use XML-based extraction with track change support.
    // NOTE(review): every pandoc failure lands here, not only a missing binary,
    // so the "not installed" wording below can be inaccurate.
    const { extractPlainTextWithTrackChanges } = await import('./word.js');
    const { getInstallInstructions } = await import('./dependencies.js');
    const installCmd = getInstallInstructions('pandoc');

    const xmlResult = await extractPlainTextWithTrackChanges(docxPath);
    text = xmlResult.text;
    hasTrackChanges = xmlResult.hasTrackChanges;
    trackChangeStats = xmlResult.stats;

    if (hasTrackChanges) {
      messages.push({
        type: 'warning',
        message: `Pandoc not installed. Using built-in XML extractor (${trackChangeStats.insertions} insertions, ${trackChangeStats.deletions} deletions preserved). Formatting may differ. Install pandoc for best results: ${installCmd}`
      });
    } else {
      messages.push({
        type: 'warning',
        message: `Pandoc not installed. Using built-in XML extractor (no track changes found). Install pandoc for better formatting: ${installCmd}`
      });
    }
  }

  // Extract comments directly from docx XML
  const comments = await extractWordComments(docxPath);

  // Extract comment anchor texts
  const { anchors } = await extractCommentAnchors(docxPath);

  return {
    text,
    comments,
    anchors,
    messages,
    extractedMedia,
    tables: wordTables,
    hasTrackChanges,
    trackChangeStats,
  };
}