docrev 0.9.11 → 0.9.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. package/.claude/settings.local.json +9 -9
  2. package/.gitattributes +1 -1
  3. package/CHANGELOG.md +149 -149
  4. package/PLAN-tables-and-postprocess.md +850 -850
  5. package/README.md +391 -391
  6. package/bin/rev.js +11 -11
  7. package/bin/rev.ts +145 -145
  8. package/completions/rev.bash +127 -127
  9. package/completions/rev.ps1 +210 -210
  10. package/completions/rev.zsh +207 -207
  11. package/dev_notes/stress2/build_adversarial.ts +186 -186
  12. package/dev_notes/stress2/drift_matcher.ts +62 -62
  13. package/dev_notes/stress2/probe_anchors.ts +35 -35
  14. package/dev_notes/stress2/project/discussion.before.md +3 -3
  15. package/dev_notes/stress2/project/discussion.md +3 -3
  16. package/dev_notes/stress2/project/methods.before.md +20 -20
  17. package/dev_notes/stress2/project/methods.md +20 -20
  18. package/dev_notes/stress2/project/rev.yaml +5 -5
  19. package/dev_notes/stress2/project/sections.yaml +4 -4
  20. package/dev_notes/stress2/sections.yaml +5 -5
  21. package/dev_notes/stress2/trace_placement.ts +50 -50
  22. package/dev_notes/stresstest_boundaries.ts +27 -27
  23. package/dev_notes/stresstest_drift_apply.ts +43 -43
  24. package/dev_notes/stresstest_drift_compare.ts +43 -43
  25. package/dev_notes/stresstest_drift_v2.ts +54 -54
  26. package/dev_notes/stresstest_inspect.ts +54 -54
  27. package/dev_notes/stresstest_pstyle.ts +55 -55
  28. package/dev_notes/stresstest_section_debug.ts +23 -23
  29. package/dev_notes/stresstest_split.ts +70 -70
  30. package/dev_notes/stresstest_trace.ts +19 -19
  31. package/dev_notes/stresstest_verify_no_overwrite.ts +40 -40
  32. package/dist/lib/build.d.ts +50 -1
  33. package/dist/lib/build.d.ts.map +1 -1
  34. package/dist/lib/build.js +80 -30
  35. package/dist/lib/build.js.map +1 -1
  36. package/dist/lib/commands/build.d.ts.map +1 -1
  37. package/dist/lib/commands/build.js +38 -5
  38. package/dist/lib/commands/build.js.map +1 -1
  39. package/dist/lib/commands/utilities.js +164 -164
  40. package/dist/lib/commands/word-tools.js +8 -8
  41. package/dist/lib/grammar.js +3 -3
  42. package/dist/lib/import.d.ts.map +1 -1
  43. package/dist/lib/import.js +146 -24
  44. package/dist/lib/import.js.map +1 -1
  45. package/dist/lib/pdf-comments.js +44 -44
  46. package/dist/lib/plugins.js +57 -57
  47. package/dist/lib/pptx-themes.js +115 -115
  48. package/dist/lib/spelling.js +2 -2
  49. package/dist/lib/templates.js +387 -387
  50. package/dist/lib/themes.js +51 -51
  51. package/dist/lib/types.d.ts +20 -0
  52. package/dist/lib/types.d.ts.map +1 -1
  53. package/dist/lib/word-extraction.d.ts +6 -0
  54. package/dist/lib/word-extraction.d.ts.map +1 -1
  55. package/dist/lib/word-extraction.js +46 -3
  56. package/dist/lib/word-extraction.js.map +1 -1
  57. package/dist/lib/wordcomments.d.ts.map +1 -1
  58. package/dist/lib/wordcomments.js +23 -5
  59. package/dist/lib/wordcomments.js.map +1 -1
  60. package/eslint.config.js +27 -27
  61. package/lib/anchor-match.ts +276 -276
  62. package/lib/annotations.ts +644 -644
  63. package/lib/build.ts +1300 -1227
  64. package/lib/citations.ts +160 -160
  65. package/lib/commands/build.ts +833 -801
  66. package/lib/commands/citations.ts +515 -515
  67. package/lib/commands/comments.ts +1050 -1050
  68. package/lib/commands/context.ts +174 -174
  69. package/lib/commands/core.ts +309 -309
  70. package/lib/commands/doi.ts +435 -435
  71. package/lib/commands/file-ops.ts +372 -372
  72. package/lib/commands/history.ts +320 -320
  73. package/lib/commands/index.ts +87 -87
  74. package/lib/commands/init.ts +259 -259
  75. package/lib/commands/merge-resolve.ts +378 -378
  76. package/lib/commands/preview.ts +178 -178
  77. package/lib/commands/project-info.ts +244 -244
  78. package/lib/commands/quality.ts +517 -517
  79. package/lib/commands/response.ts +454 -454
  80. package/lib/commands/section-boundaries.ts +82 -82
  81. package/lib/commands/sections.ts +451 -451
  82. package/lib/commands/sync.ts +706 -706
  83. package/lib/commands/text-ops.ts +449 -449
  84. package/lib/commands/utilities.ts +448 -448
  85. package/lib/commands/verify-anchors.ts +272 -272
  86. package/lib/commands/word-tools.ts +340 -340
  87. package/lib/comment-realign.ts +517 -517
  88. package/lib/config.ts +84 -84
  89. package/lib/crossref.ts +781 -781
  90. package/lib/csl.ts +191 -191
  91. package/lib/dependencies.ts +98 -98
  92. package/lib/diff-engine.ts +465 -465
  93. package/lib/doi-cache.ts +115 -115
  94. package/lib/doi.ts +897 -897
  95. package/lib/equations.ts +506 -506
  96. package/lib/errors.ts +346 -346
  97. package/lib/format.ts +541 -541
  98. package/lib/git.ts +326 -326
  99. package/lib/grammar.ts +303 -303
  100. package/lib/image-registry.ts +180 -180
  101. package/lib/import.ts +911 -792
  102. package/lib/journals.ts +543 -543
  103. package/lib/merge.ts +633 -633
  104. package/lib/orcid.ts +144 -144
  105. package/lib/pdf-comments.ts +263 -263
  106. package/lib/pdf-import.ts +524 -524
  107. package/lib/plugins.ts +362 -362
  108. package/lib/postprocess.ts +188 -188
  109. package/lib/pptx-color-filter.lua +37 -37
  110. package/lib/pptx-template.ts +469 -469
  111. package/lib/pptx-themes.ts +483 -483
  112. package/lib/protect-restore.ts +520 -520
  113. package/lib/rate-limiter.ts +94 -94
  114. package/lib/response.ts +197 -197
  115. package/lib/restore-references.ts +240 -240
  116. package/lib/review.ts +327 -327
  117. package/lib/schema.ts +417 -417
  118. package/lib/scientific-words.ts +73 -73
  119. package/lib/sections.ts +335 -335
  120. package/lib/slides.ts +756 -756
  121. package/lib/spelling.ts +334 -334
  122. package/lib/templates.ts +526 -526
  123. package/lib/themes.ts +742 -742
  124. package/lib/trackchanges.ts +247 -247
  125. package/lib/tui.ts +450 -450
  126. package/lib/types.ts +550 -530
  127. package/lib/undo.ts +250 -250
  128. package/lib/utils.ts +69 -69
  129. package/lib/variables.ts +179 -179
  130. package/lib/word-extraction.ts +806 -759
  131. package/lib/word.ts +643 -643
  132. package/lib/wordcomments.ts +817 -798
  133. package/package.json +137 -137
  134. package/scripts/postbuild.js +28 -28
  135. package/skill/REFERENCE.md +431 -431
  136. package/skill/SKILL.md +258 -258
  137. package/tsconfig.json +26 -26
  138. package/types/index.d.ts +525 -525
package/lib/word.ts CHANGED
@@ -1,643 +1,643 @@
1
- /**
2
- * Word document extraction utilities
3
- * Handle reading text, comments, and anchors from .docx files
4
- */
5
-
6
- import * as fs from 'fs';
7
- import * as path from 'path';
8
- import AdmZip from 'adm-zip';
9
- import { parseString } from 'xml2js';
10
- import { promisify } from 'util';
11
- import type { WordComment, CommentAnchor, WordMetadata, TrackChangesResult } from './types.js';
12
-
13
- const parseXml = promisify(parseString);
14
-
15
- // =============================================================================
16
- // Constants
17
- // =============================================================================
18
-
19
- /** Characters of context to extract around comment anchors */
20
- const ANCHOR_CONTEXT_SIZE = 100;
21
-
22
- /** Characters of context before comment range start */
23
- const CONTEXT_BEFORE_SIZE = 500;
24
-
25
- // =============================================================================
26
- // Public API
27
- // =============================================================================
28
-
29
- /**
30
- * Extract comments from Word document's comments.xml
31
- * @param docxPath - Path to .docx file
32
- * @returns Array of extracted comments
33
- * @throws {TypeError} If docxPath is not a string
34
- * @throws {Error} If file not found or invalid docx
35
- */
36
- export async function extractWordComments(docxPath: string): Promise<WordComment[]> {
37
- if (typeof docxPath !== 'string') {
38
- throw new TypeError(`docxPath must be a string, got ${typeof docxPath}`);
39
- }
40
- if (!fs.existsSync(docxPath)) {
41
- throw new Error(`File not found: ${docxPath}`);
42
- }
43
-
44
- const zip = new AdmZip(docxPath);
45
- const commentsEntry = zip.getEntry('word/comments.xml');
46
-
47
- if (!commentsEntry) {
48
- return []; // No comments in document
49
- }
50
-
51
- const commentsXml = zip.readAsText(commentsEntry);
52
- const parsed = await parseXml(commentsXml) as any;
53
-
54
- if (!parsed?.['w:comments'] || !parsed['w:comments']['w:comment']) {
55
- return [];
56
- }
57
-
58
- const comments: WordComment[] = [];
59
- const rawComments = parsed['w:comments']['w:comment'];
60
-
61
- for (const comment of rawComments) {
62
- const id = comment.$?.['w:id'];
63
- const author = comment.$?.['w:author'] || 'Unknown';
64
- const date = comment.$?.['w:date'];
65
-
66
- // Extract text from all paragraphs in comment
67
- let text = '';
68
- const paragraphs = comment['w:p'] || [];
69
- for (const para of paragraphs) {
70
- const runs = para['w:r'] || [];
71
- for (const run of runs) {
72
- const texts = run['w:t'] || [];
73
- for (const t of texts) {
74
- text += typeof t === 'string' ? t : (t._ || '');
75
- }
76
- }
77
- }
78
-
79
- if (id && text.trim()) {
80
- comments.push({
81
- id,
82
- author,
83
- date,
84
- text: text.trim(),
85
- });
86
- }
87
- }
88
-
89
- return comments;
90
- }
91
-
92
- /**
93
- * Extract comment anchors (where comments are attached) from document.xml
94
- * Returns mapping of comment ID to the text they're anchored to
95
- * @param docxPath - Path to .docx file
96
- * @returns Map of comment ID to anchor info
97
- * @throws {TypeError} If docxPath is not a string
98
- * @throws {Error} If invalid docx structure
99
- */
100
- export async function extractCommentAnchors(docxPath: string): Promise<Map<string, CommentAnchor>> {
101
- if (typeof docxPath !== 'string') {
102
- throw new TypeError(`docxPath must be a string, got ${typeof docxPath}`);
103
- }
104
-
105
- const zip = new AdmZip(docxPath);
106
- const documentEntry = zip.getEntry('word/document.xml');
107
-
108
- if (!documentEntry) {
109
- throw new Error('Invalid docx: no document.xml');
110
- }
111
-
112
- const documentXml = zip.readAsText(documentEntry);
113
- const anchors = new Map<string, CommentAnchor>();
114
-
115
- // Find commentRangeStart and commentRangeEnd pairs
116
- // The text between them is what the comment is anchored to
117
- const startPattern = /<w:commentRangeStart w:id="(\d+)"\/>/g;
118
- const endPattern = /<w:commentRangeEnd w:id="(\d+)"\/>/g;
119
-
120
- let match: RegExpExecArray | null;
121
- const starts = new Map<string, number>();
122
- const ends = new Map<string, number>();
123
-
124
- while ((match = startPattern.exec(documentXml)) !== null) {
125
- if (match[1]) {
126
- starts.set(match[1], match.index);
127
- }
128
- }
129
-
130
- while ((match = endPattern.exec(documentXml)) !== null) {
131
- if (match[1]) {
132
- ends.set(match[1], match.index);
133
- }
134
- }
135
-
136
- // For each comment, extract the text between start and end
137
- for (const [id, startPos] of starts) {
138
- const endPos = ends.get(id);
139
- if (!endPos) continue;
140
-
141
- const segment = documentXml.slice(startPos, endPos);
142
-
143
- // Extract all text content from the segment
144
- const textPattern = /<w:t[^>]*>([^<]*)<\/w:t>/g;
145
- let text = '';
146
- let textMatch: RegExpExecArray | null;
147
- while ((textMatch = textPattern.exec(segment)) !== null) {
148
- text += textMatch[1] ?? '';
149
- }
150
-
151
- // Get surrounding context (text before the anchor)
152
- const contextStart = Math.max(0, startPos - CONTEXT_BEFORE_SIZE);
153
- const contextSegment = documentXml.slice(contextStart, startPos);
154
- let context = '';
155
- while ((textMatch = textPattern.exec(contextSegment)) !== null) {
156
- context += textMatch[1] ?? '';
157
- }
158
-
159
- anchors.set(id, {
160
- text: text.trim(),
161
- context: context.slice(-ANCHOR_CONTEXT_SIZE),
162
- });
163
- }
164
-
165
- return anchors;
166
- }
167
-
168
- /**
169
- * Extract plain text from Word document (strips track change markup)
170
- * @param docxPath - Path to .docx file
171
- * @returns Extracted plain text (accepted changes applied)
172
- * @throws {TypeError} If docxPath is not a string
173
- * @throws {Error} If file not found
174
- */
175
- export async function extractTextFromWord(docxPath: string): Promise<string> {
176
- if (typeof docxPath !== 'string') {
177
- throw new TypeError(`docxPath must be a string, got ${typeof docxPath}`);
178
- }
179
- const result = await extractPlainTextWithTrackChanges(docxPath);
180
- // Strip CriticMarkup: accept insertions, remove deletions, apply substitutions
181
- let text = result.text;
182
- text = text.replace(/\{~~[^~]*~>([^~]*)~~\}/g, '$1'); // substitutions → new
183
- text = text.replace(/\{\+\+([^+]*)\+\+\}/g, '$1'); // insertions → keep
184
- text = text.replace(/\{--[^}]*--\}/g, ''); // deletions → remove
185
- return text;
186
- }
187
-
188
- /**
189
- * Get document metadata from Word file
190
- * @param docxPath - Path to .docx file
191
- * @returns Document metadata
192
- * @throws {TypeError} If docxPath is not a string
193
- */
194
- export async function getWordMetadata(docxPath: string): Promise<WordMetadata> {
195
- if (typeof docxPath !== 'string') {
196
- throw new TypeError(`docxPath must be a string, got ${typeof docxPath}`);
197
- }
198
-
199
- const zip = new AdmZip(docxPath);
200
- const coreEntry = zip.getEntry('docProps/core.xml');
201
-
202
- if (!coreEntry) {
203
- return {};
204
- }
205
-
206
- const coreXml = zip.readAsText(coreEntry);
207
- const metadata: WordMetadata = {};
208
-
209
- // Extract common metadata fields
210
- const patterns: Record<string, RegExp> = {
211
- title: /<dc:title>([^<]*)<\/dc:title>/,
212
- author: /<dc:creator>([^<]*)<\/dc:creator>/,
213
- created: /<dcterms:created[^>]*>([^<]*)<\/dcterms:created>/,
214
- modified: /<dcterms:modified[^>]*>([^<]*)<\/dcterms:modified>/,
215
- };
216
-
217
- for (const [key, pattern] of Object.entries(patterns)) {
218
- const match = coreXml.match(pattern);
219
- if (match) {
220
- (metadata as any)[key] = match[1];
221
- }
222
- }
223
-
224
- return metadata;
225
- }
226
-
227
- /**
228
- * Check if file is a valid Word document
229
- * @param filePath - Path to file to check
230
- * @returns True if valid .docx file
231
- */
232
- export function isWordDocument(filePath: string): boolean {
233
- if (typeof filePath !== 'string') return false;
234
- if (!fs.existsSync(filePath)) return false;
235
- if (!filePath.toLowerCase().endsWith('.docx')) return false;
236
-
237
- try {
238
- const zip = new AdmZip(filePath);
239
- return zip.getEntry('word/document.xml') !== null;
240
- } catch {
241
- return false;
242
- }
243
- }
244
-
245
- /**
246
- * Extract text content from XML element, handling nested elements
247
- * @param xml - XML string
248
- * @returns Plain text content
249
- */
250
- function extractTextFromXml(xml: string): string {
251
- let text = '';
252
- // Match w:t elements (regular text)
253
- const textPattern = /<w:t[^>]*>([^<]*)<\/w:t>/g;
254
- let match: RegExpExecArray | null;
255
- while ((match = textPattern.exec(xml)) !== null) {
256
- text += match[1];
257
- }
258
- // Also match w:delText (deleted text)
259
- const delTextPattern = /<w:delText[^>]*>([^<]*)<\/w:delText>/g;
260
- while ((match = delTextPattern.exec(xml)) !== null) {
261
- text += match[1];
262
- }
263
- return text;
264
- }
265
-
266
- /**
267
- * Extract track changes (insertions and deletions) from Word document
268
- * Converts Word's w:ins and w:del elements to CriticMarkup format
269
- *
270
- * @param docxPath - Path to Word document
271
- * @returns Track changes result with content and stats
272
- */
273
- export async function extractTrackChanges(docxPath: string): Promise<TrackChangesResult> {
274
- if (!fs.existsSync(docxPath)) {
275
- throw new Error(`File not found: ${docxPath}`);
276
- }
277
-
278
- const zip = new AdmZip(docxPath);
279
- const documentEntry = zip.getEntry('word/document.xml');
280
-
281
- if (!documentEntry) {
282
- throw new Error('Invalid docx: no document.xml');
283
- }
284
-
285
- let xml = zip.readAsText(documentEntry);
286
- let insertions = 0;
287
- let deletions = 0;
288
-
289
- // Check if there are any track changes
290
- const hasInsertions = xml.includes('<w:ins ');
291
- const hasDeletions = xml.includes('<w:del ');
292
-
293
- if (!hasInsertions && !hasDeletions) {
294
- return { hasTrackChanges: false, content: null, stats: { insertions: 0, deletions: 0 } };
295
- }
296
-
297
- // Process insertions: <w:ins ...>...</w:ins> -> {++...++}
298
- // Match the full w:ins element including nested content
299
- xml = xml.replace(/<w:ins\b[^>]*>([\s\S]*?)<\/w:ins>/g, (match, content) => {
300
- const text = extractTextFromXml(content);
301
- if (text.trim()) {
302
- insertions++;
303
- return `{++${text}++}`;
304
- }
305
- return text;
306
- });
307
-
308
- // Process deletions: <w:del ...>...</w:del> -> {--...--}
309
- xml = xml.replace(/<w:del\b[^>]*>([\s\S]*?)<\/w:del>/g, (match, content) => {
310
- const text = extractTextFromXml(content);
311
- if (text.trim()) {
312
- deletions++;
313
- return `{--${text}--}`;
314
- }
315
- return '';
316
- });
317
-
318
- return {
319
- hasTrackChanges: true,
320
- content: xml,
321
- stats: { insertions, deletions },
322
- };
323
- }
324
-
325
- /**
326
- * Extract a single marker's content starting at position i.
327
- * Returns { content, end } where end is the position after the closing marker,
328
- * or null if no valid closing marker found.
329
- */
330
- function extractMarker(text: string, i: number, open: string, close: string): { content: string; end: number } | null {
331
- if (!text.startsWith(open, i)) return null;
332
- const start = i + open.length;
333
- const closeIdx = text.indexOf(close, start);
334
- if (closeIdx === -1) return null;
335
- return { content: text.slice(start, closeIdx), end: closeIdx + close.length };
336
- }
337
-
338
- /**
339
- * Greedily collect consecutive markers of the same type.
340
- * E.g. {++a++}{++b++}{++c++} → "abc", advancing past all three.
341
- */
342
- function collectConsecutive(text: string, i: number, open: string, close: string): { content: string; end: number } | null {
343
- const first = extractMarker(text, i, open, close);
344
- if (!first) return null;
345
-
346
- let content = first.content;
347
- let end = first.end;
348
-
349
- while (end < text.length) {
350
- const next = extractMarker(text, end, open, close);
351
- if (!next) break;
352
- content += next.content;
353
- end = next.end;
354
- }
355
-
356
- return { content, end };
357
- }
358
-
359
- /**
360
- * Scan text for adjacent CriticMarkup markers and:
361
- * 1. Merge consecutive same-type markers: {++a++}{++b++} → {++ab++}
362
- * 2. Merge adjacent del+ins or ins+del into substitutions: {--old--}{++new++} → {~~old~>new~~}
363
- *
364
- * Uses a linear scanner — no regex backtracking, no ambiguity.
365
- */
366
- function mergeAdjacentMarkers(text: string): string {
367
- let result = '';
368
- let i = 0;
369
-
370
- while (i < text.length) {
371
- // --- Deletion block ---
372
- if (text.startsWith('{--', i)) {
373
- const del = collectConsecutive(text, i, '{--', '--}');
374
- if (!del) { result += text[i]; i++; continue; }
375
-
376
- // Skip spaces, then check for adjacent insertion
377
- let j = del.end;
378
- while (j < text.length && text[j] === ' ') j++;
379
-
380
- const ins = collectConsecutive(text, j, '{++', '++}');
381
- if (ins) {
382
- // Merge into substitution
383
- const trailing = del.content.endsWith(' ') || ins.content.endsWith(' ');
384
- result += `{~~${del.content.trimEnd()}~>${ins.content.trimEnd()}~~}${trailing ? ' ' : ''}`;
385
- i = ins.end;
386
- } else {
387
- // Emit merged deletion
388
- result += `{--${del.content}--}`;
389
- i = del.end;
390
- }
391
- continue;
392
- }
393
-
394
- // --- Insertion block ---
395
- if (text.startsWith('{++', i)) {
396
- const ins = collectConsecutive(text, i, '{++', '++}');
397
- if (!ins) { result += text[i]; i++; continue; }
398
-
399
- // Skip spaces, then check for adjacent deletion
400
- let j = ins.end;
401
- while (j < text.length && text[j] === ' ') j++;
402
-
403
- const del = collectConsecutive(text, j, '{--', '--}');
404
- if (del) {
405
- // Merge into substitution (del → ins order in output)
406
- const trailing = del.content.endsWith(' ') || ins.content.endsWith(' ');
407
- result += `{~~${del.content.trimEnd()}~>${ins.content.trimEnd()}~~}${trailing ? ' ' : ''}`;
408
- i = del.end;
409
- } else {
410
- // Emit merged insertion
411
- result += `{++${ins.content}++}`;
412
- i = ins.end;
413
- }
414
- continue;
415
- }
416
-
417
- result += text[i];
418
- i++;
419
- }
420
-
421
- return result;
422
- }
423
-
424
- /**
425
- * Extract plain text from Word XML with track changes preserved as CriticMarkup.
426
- * This is a pandoc-free fallback that reads document.xml directly.
427
- *
428
- * Converts:
429
- * <w:ins> content </w:ins> → {++text++}
430
- * <w:del> content </w:del> → {--text--}
431
- *
432
- * Also detects headings (w:pStyle Heading1-6) and outputs markdown # syntax.
433
- *
434
- * @param docxPath - Path to Word document
435
- * @returns Plain text with CriticMarkup and stats
436
- */
437
- export async function extractPlainTextWithTrackChanges(docxPath: string): Promise<{
438
- text: string;
439
- hasTrackChanges: boolean;
440
- stats: { insertions: number; deletions: number };
441
- }> {
442
- if (!fs.existsSync(docxPath)) {
443
- throw new Error(`File not found: ${docxPath}`);
444
- }
445
-
446
- const zip = new AdmZip(docxPath);
447
- const docEntry = zip.getEntry('word/document.xml');
448
-
449
- if (!docEntry) {
450
- throw new Error('Invalid docx: no document.xml');
451
- }
452
-
453
- let xml = docEntry.getData().toString('utf8');
454
- let insertions = 0;
455
- let deletions = 0;
456
-
457
- // Use unique markers (null bytes) that won't appear in normal text
458
- const INS_S = '\x00IS\x00';
459
- const INS_E = '\x00IE\x00';
460
- const DEL_S = '\x00DS\x00';
461
- const DEL_E = '\x00DE\x00';
462
-
463
- // Step 1: Replace <w:ins> with marker-wrapped text injected as <w:t>
464
- // Whitespace-only insertions are kept as plain text (not markers) to preserve spacing.
465
- xml = xml.replace(/<w:ins\b[^>]*>([\s\S]*?)<\/w:ins>/g, (_match, content: string) => {
466
- const texts: string[] = [];
467
- const tPat = /<w:t[^>]*>([^<]*)<\/w:t>/g;
468
- let m: RegExpExecArray | null;
469
- while ((m = tPat.exec(content)) !== null) {
470
- texts.push(m[1] || '');
471
- }
472
- const text = texts.join('');
473
- if (text.trim()) {
474
- insertions++;
475
- return `<w:r><w:t>${INS_S}${text}${INS_E}</w:t></w:r>`;
476
- }
477
- // Whitespace-only: preserve as plain text for spacing
478
- if (text.length > 0) {
479
- return `<w:r><w:t>${text}</w:t></w:r>`;
480
- }
481
- return '';
482
- });
483
-
484
- // Step 2: Replace <w:del> similarly (uses w:delText inside)
485
- // Whitespace-only deletions are kept as plain text to preserve spacing.
486
- xml = xml.replace(/<w:del\b[^>]*>([\s\S]*?)<\/w:del>/g, (_match, content: string) => {
487
- const texts: string[] = [];
488
- const tPat = /<w:delText[^>]*>([^<]*)<\/w:delText>|<w:t[^>]*>([^<]*)<\/w:t>/g;
489
- let m: RegExpExecArray | null;
490
- while ((m = tPat.exec(content)) !== null) {
491
- texts.push(m[1] || m[2] || '');
492
- }
493
- const text = texts.join('');
494
- if (text.trim()) {
495
- deletions++;
496
- return `<w:r><w:t>${DEL_S}${text}${DEL_E}</w:t></w:r>`;
497
- }
498
- // Whitespace-only: preserve as plain text for spacing
499
- if (text.length > 0) {
500
- return `<w:r><w:t>${text}</w:t></w:r>`;
501
- }
502
- return '';
503
- });
504
-
505
- // Step 3: Extract text paragraph by paragraph
506
- const paragraphs: string[] = [];
507
- const paraPattern = /<w:p\b[^>]*>([\s\S]*?)<\/w:p>/g;
508
- let pm: RegExpExecArray | null;
509
-
510
- while ((pm = paraPattern.exec(xml)) !== null) {
511
- const paraXml = pm[1];
512
-
513
- // Detect heading level from paragraph style
514
- let headingLevel = 0;
515
- const styleMatch = paraXml.match(/<w:pStyle\s+w:val="Heading(\d)"/i);
516
- if (styleMatch && styleMatch[1]) {
517
- headingLevel = parseInt(styleMatch[1], 10);
518
- }
519
-
520
- // Extract all <w:t> text in order
521
- const texts: string[] = [];
522
- const tPat = /<w:t[^>]*>([^<]*)<\/w:t>/g;
523
- let tm: RegExpExecArray | null;
524
- while ((tm = tPat.exec(paraXml)) !== null) {
525
- texts.push(tm[1] || '');
526
- }
527
-
528
- let paraText = texts.join('');
529
-
530
- // Decode XML entities
531
- paraText = paraText
532
- .replace(/&amp;/g, '&')
533
- .replace(/&lt;/g, '<')
534
- .replace(/&gt;/g, '>')
535
- .replace(/&quot;/g, '"')
536
- .replace(/&apos;/g, "'");
537
-
538
- // Convert markers to CriticMarkup
539
- paraText = paraText
540
- .split(INS_S).join('{++')
541
- .split(INS_E).join('++}')
542
- .split(DEL_S).join('{--')
543
- .split(DEL_E).join('--}');
544
-
545
- // Merge adjacent del+ins (or ins+del) into substitutions.
546
- // Uses a scanner instead of regex to avoid backtracking across marker boundaries.
547
- paraText = mergeAdjacentMarkers(paraText);
548
-
549
- // Collapse runs of multiple spaces into single space
550
- paraText = paraText.replace(/ {2,}/g, ' ');
551
-
552
- if (paraText.trim()) {
553
- if (headingLevel > 0 && headingLevel <= 6) {
554
- paragraphs.push('#'.repeat(headingLevel) + ' ' + paraText.trim());
555
- } else {
556
- paragraphs.push(paraText);
557
- }
558
- }
559
- }
560
-
561
- return {
562
- text: paragraphs.join('\n\n'),
563
- hasTrackChanges: insertions > 0 || deletions > 0,
564
- stats: { insertions, deletions },
565
- };
566
- }
567
-
568
- interface ExtractWithTrackChangesOptions {
569
- mediaDir?: string;
570
- }
571
-
572
- /**
573
- * Extract Word document content with track changes preserved as CriticMarkup
574
- * Uses pandoc with track-changes=all option to preserve insertions/deletions
575
- *
576
- * @param docxPath - Path to Word document
577
- * @param options - Options
578
- * @returns Track changes result with text and stats
579
- */
580
- export async function extractWithTrackChanges(
581
- docxPath: string,
582
- options: ExtractWithTrackChangesOptions = {}
583
- ): Promise<{ text: string; hasTrackChanges: boolean; stats: { insertions: number; deletions: number } }> {
584
- const { mediaDir } = options;
585
-
586
- if (!fs.existsSync(docxPath)) {
587
- throw new Error(`File not found: ${docxPath}`);
588
- }
589
-
590
- const { execSync } = await import('child_process');
591
-
592
- // Use pandoc with --track-changes=all to preserve track changes
593
- // This outputs insertions as [insertion]{.insertion} and deletions as [deletion]{.deletion}
594
- let pandocArgs = `"${docxPath}" -t markdown --wrap=none --track-changes=all`;
595
- if (mediaDir) {
596
- pandocArgs += ` --extract-media="${mediaDir}"`;
597
- }
598
-
599
- let text: string;
600
- try {
601
- text = execSync(`pandoc ${pandocArgs}`, {
602
- encoding: 'utf-8',
603
- maxBuffer: 50 * 1024 * 1024,
604
- });
605
- } catch (err: any) {
606
- throw new Error(`Pandoc extraction failed: ${err.message}`);
607
- }
608
-
609
- // Count track changes from pandoc output
610
- let insertions = 0;
611
- let deletions = 0;
612
-
613
- // Pandoc outputs track changes as:
614
- // [inserted text]{.insertion author="..."}
615
- // [deleted text]{.deletion author="..."}
616
-
617
- // Convert pandoc's track change format to CriticMarkup
618
- // Insertions: [text]{.insertion ...} -> {++text++}
619
- text = text.replace(/\[([^\]]*)\]\{\.insertion[^}]*\}/g, (match, content) => {
620
- if (content.trim()) {
621
- insertions++;
622
- return `{++${content}++}`;
623
- }
624
- return '';
625
- });
626
-
627
- // Deletions: [text]{.deletion ...} -> {--text--}
628
- text = text.replace(/\[([^\]]*)\]\{\.deletion[^}]*\}/g, (match, content) => {
629
- if (content.trim()) {
630
- deletions++;
631
- return `{--${content}--}`;
632
- }
633
- return '';
634
- });
635
-
636
- const hasTrackChanges = insertions > 0 || deletions > 0;
637
-
638
- return {
639
- text,
640
- hasTrackChanges,
641
- stats: { insertions, deletions },
642
- };
643
- }
1
+ /**
2
+ * Word document extraction utilities
3
+ * Handle reading text, comments, and anchors from .docx files
4
+ */
5
+
6
+ import * as fs from 'fs';
7
+ import * as path from 'path';
8
+ import AdmZip from 'adm-zip';
9
+ import { parseString } from 'xml2js';
10
+ import { promisify } from 'util';
11
+ import type { WordComment, CommentAnchor, WordMetadata, TrackChangesResult } from './types.js';
12
+
13
+ const parseXml = promisify(parseString);
14
+
15
+ // =============================================================================
16
+ // Constants
17
+ // =============================================================================
18
+
19
+ /** Characters of context to extract around comment anchors */
20
+ const ANCHOR_CONTEXT_SIZE = 100;
21
+
22
+ /** Characters of context before comment range start */
23
+ const CONTEXT_BEFORE_SIZE = 500;
24
+
25
+ // =============================================================================
26
+ // Public API
27
+ // =============================================================================
28
+
29
+ /**
30
+ * Extract comments from Word document's comments.xml
31
+ * @param docxPath - Path to .docx file
32
+ * @returns Array of extracted comments
33
+ * @throws {TypeError} If docxPath is not a string
34
+ * @throws {Error} If file not found or invalid docx
35
+ */
36
+ export async function extractWordComments(docxPath: string): Promise<WordComment[]> {
37
+ if (typeof docxPath !== 'string') {
38
+ throw new TypeError(`docxPath must be a string, got ${typeof docxPath}`);
39
+ }
40
+ if (!fs.existsSync(docxPath)) {
41
+ throw new Error(`File not found: ${docxPath}`);
42
+ }
43
+
44
+ const zip = new AdmZip(docxPath);
45
+ const commentsEntry = zip.getEntry('word/comments.xml');
46
+
47
+ if (!commentsEntry) {
48
+ return []; // No comments in document
49
+ }
50
+
51
+ const commentsXml = zip.readAsText(commentsEntry);
52
+ const parsed = await parseXml(commentsXml) as any;
53
+
54
+ if (!parsed?.['w:comments'] || !parsed['w:comments']['w:comment']) {
55
+ return [];
56
+ }
57
+
58
+ const comments: WordComment[] = [];
59
+ const rawComments = parsed['w:comments']['w:comment'];
60
+
61
+ for (const comment of rawComments) {
62
+ const id = comment.$?.['w:id'];
63
+ const author = comment.$?.['w:author'] || 'Unknown';
64
+ const date = comment.$?.['w:date'];
65
+
66
+ // Extract text from all paragraphs in comment
67
+ let text = '';
68
+ const paragraphs = comment['w:p'] || [];
69
+ for (const para of paragraphs) {
70
+ const runs = para['w:r'] || [];
71
+ for (const run of runs) {
72
+ const texts = run['w:t'] || [];
73
+ for (const t of texts) {
74
+ text += typeof t === 'string' ? t : (t._ || '');
75
+ }
76
+ }
77
+ }
78
+
79
+ if (id && text.trim()) {
80
+ comments.push({
81
+ id,
82
+ author,
83
+ date,
84
+ text: text.trim(),
85
+ });
86
+ }
87
+ }
88
+
89
+ return comments;
90
+ }
91
+
92
/**
 * Extract comment anchors (the document text each comment is attached to)
 * from word/document.xml.
 *
 * For every `w:commentRangeStart` / `w:commentRangeEnd` pair sharing an id,
 * the `w:t` text between the two markers becomes the anchor text, and the
 * `w:t` text found in the CONTEXT_BEFORE_SIZE characters of raw XML preceding
 * the start marker (cut to its last ANCHOR_CONTEXT_SIZE characters) becomes
 * the context. Text is returned as it appears in the XML (entities are not
 * decoded).
 *
 * @param docxPath - Path to .docx file
 * @returns Map of comment ID to anchor info
 * @throws {TypeError} If docxPath is not a string
 * @throws {Error} If the file does not exist or has no word/document.xml
 */
export async function extractCommentAnchors(docxPath: string): Promise<Map<string, CommentAnchor>> {
  if (typeof docxPath !== 'string') {
    throw new TypeError(`docxPath must be a string, got ${typeof docxPath}`);
  }
  // Same explicit not-found error as the other extractors in this module.
  if (!fs.existsSync(docxPath)) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const zip = new AdmZip(docxPath);
  const documentEntry = zip.getEntry('word/document.xml');

  if (!documentEntry) {
    throw new Error('Invalid docx: no document.xml');
  }

  const documentXml = zip.readAsText(documentEntry);
  const anchors = new Map<string, CommentAnchor>();

  // Concatenate the contents of every w:t element in an XML fragment.
  // A fresh regex per call avoids sharing lastIndex state between scans.
  const collectText = (xml: string): string => {
    let out = '';
    for (const hit of xml.matchAll(/<w:t[^>]*>([^<]*)<\/w:t>/g)) {
      out += hit[1] ?? '';
    }
    return out;
  };

  // Map each marker id to its offset in the XML; a later duplicate id wins.
  const indexMarkers = (pattern: RegExp): Map<string, number> => {
    const positions = new Map<string, number>();
    for (const hit of documentXml.matchAll(pattern)) {
      const id = hit[1];
      if (id && hit.index !== undefined) {
        positions.set(id, hit.index);
      }
    }
    return positions;
  };

  // Tolerate extra attributes and whitespace around w:id instead of requiring
  // the exact `<w:commentRangeStart w:id="N"/>` spelling.
  const starts = indexMarkers(/<w:commentRangeStart\b[^>]*\bw:id="(\d+)"[^>]*\/>/g);
  const ends = indexMarkers(/<w:commentRangeEnd\b[^>]*\bw:id="(\d+)"[^>]*\/>/g);

  for (const [id, startPos] of starts) {
    const endPos = ends.get(id);
    // Compare against undefined: an offset is a number and 0 must not be
    // mistaken for "no end marker".
    if (endPos === undefined) continue;

    const text = collectText(documentXml.slice(startPos, endPos));

    // Context is the text immediately before the anchor.
    const contextStart = Math.max(0, startPos - CONTEXT_BEFORE_SIZE);
    const context = collectText(documentXml.slice(contextStart, startPos));

    anchors.set(id, {
      text: text.trim(),
      context: context.slice(-ANCHOR_CONTEXT_SIZE),
    });
  }

  return anchors;
}
167
+
168
/**
 * Extract plain text from a Word document with all tracked changes accepted.
 *
 * The text is obtained from extractPlainTextWithTrackChanges and its
 * CriticMarkup is then resolved: substitutions keep the new text, insertions
 * are kept, deletions are removed.
 *
 * @param docxPath - Path to .docx file
 * @returns Extracted plain text (accepted changes applied)
 * @throws {TypeError} If docxPath is not a string
 * @throws {Error} If file not found
 */
export async function extractTextFromWord(docxPath: string): Promise<string> {
  if (typeof docxPath !== 'string') {
    throw new TypeError(`docxPath must be a string, got ${typeof docxPath}`);
  }
  const { text } = await extractPlainTextWithTrackChanges(docxPath);

  // Lazy "match anything up to the closing delimiter" is used instead of
  // excluded-character classes, so changed text that itself contains '~', '+'
  // or '}' (e.g. "C++", "~5%") is still recognised and stripped.
  return text
    .replace(/\{~~[\s\S]*?~>([\s\S]*?)~~\}/g, '$1') // substitutions → new
    .replace(/\{\+\+([\s\S]*?)\+\+\}/g, '$1') // insertions → keep
    .replace(/\{--[\s\S]*?--\}/g, ''); // deletions → remove
}
187
+
188
/**
 * Read basic document properties from the docProps/core.xml part of a Word file.
 * Only fields whose element is present in the XML are set on the result; a
 * document without docProps/core.xml yields an empty object.
 *
 * @param docxPath - Path to .docx file
 * @returns Document metadata
 * @throws {TypeError} If docxPath is not a string
 */
export async function getWordMetadata(docxPath: string): Promise<WordMetadata> {
  if (typeof docxPath !== 'string') {
    throw new TypeError(`docxPath must be a string, got ${typeof docxPath}`);
  }

  const archive = new AdmZip(docxPath);
  const coreEntry = archive.getEntry('docProps/core.xml');
  if (!coreEntry) {
    return {};
  }

  const coreXml = archive.readAsText(coreEntry);

  // Result key paired with the pattern whose first group holds its value.
  const fieldPatterns: Array<[string, RegExp]> = [
    ['title', /<dc:title>([^<]*)<\/dc:title>/],
    ['author', /<dc:creator>([^<]*)<\/dc:creator>/],
    ['created', /<dcterms:created[^>]*>([^<]*)<\/dcterms:created>/],
    ['modified', /<dcterms:modified[^>]*>([^<]*)<\/dcterms:modified>/],
  ];

  const metadata: WordMetadata = {};
  for (const [field, pattern] of fieldPatterns) {
    const found = pattern.exec(coreXml);
    if (found) {
      (metadata as any)[field] = found[1];
    }
  }

  return metadata;
}
226
+
227
/**
 * Decide whether a path points at a usable Word document: it must be a
 * string, exist on disk, end in ".docx" (case-insensitive) and be an archive
 * containing word/document.xml. Any failure while opening the archive counts
 * as "not a Word document".
 *
 * @param filePath - Path to file to check
 * @returns True if valid .docx file
 */
export function isWordDocument(filePath: string): boolean {
  const plausible =
    typeof filePath === 'string' &&
    fs.existsSync(filePath) &&
    filePath.toLowerCase().endsWith('.docx');
  if (!plausible) {
    return false;
  }

  try {
    return new AdmZip(filePath).getEntry('word/document.xml') !== null;
  } catch {
    return false;
  }
}
244
+
245
/**
 * Pull the text out of a fragment of Word XML.
 * All w:t (regular text) contents come first, in order of appearance,
 * followed by all w:delText (deleted text) contents.
 *
 * @param xml - XML string
 * @returns Plain text content
 */
function extractTextFromXml(xml: string): string {
  // Join the first capture group of every match of the given pattern.
  const gather = (pattern: RegExp): string =>
    Array.from(xml.matchAll(pattern), (hit) => hit[1] ?? '').join('');

  const regularText = gather(/<w:t[^>]*>([^<]*)<\/w:t>/g);
  const deletedText = gather(/<w:delText[^>]*>([^<]*)<\/w:delText>/g);
  return regularText + deletedText;
}
265
+
266
/**
 * Extract track changes (insertions and deletions) from a Word document.
 * Each w:ins / w:del element in word/document.xml is replaced by the
 * CriticMarkup form of its text ({++...++} / {--...--}); the rest of the XML
 * is left as is.
 *
 * A document is treated as having track changes when its XML contains
 * "<w:ins " or "<w:del ". Whitespace-only insertions are replaced by their
 * bare text and whitespace-only deletions by nothing; neither is counted.
 *
 * @param docxPath - Path to Word document
 * @returns Track changes result with content and stats
 * @throws {Error} If the file does not exist or has no word/document.xml
 */
export async function extractTrackChanges(docxPath: string): Promise<TrackChangesResult> {
  if (!fs.existsSync(docxPath)) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const zip = new AdmZip(docxPath);
  const documentEntry = zip.getEntry('word/document.xml');

  if (!documentEntry) {
    throw new Error('Invalid docx: no document.xml');
  }

  let xml = zip.readAsText(documentEntry);
  let insertions = 0;
  let deletions = 0;

  const hasInsertions = xml.includes('<w:ins ');
  const hasDeletions = xml.includes('<w:del ');

  if (!hasInsertions && !hasDeletions) {
    return { hasTrackChanges: false, content: null, stats: { insertions: 0, deletions: 0 } };
  }

  // The `(?<!\/)>` lookbehind keeps the opening-tag match from accepting a
  // self-closing marker such as `<w:ins w:id="1" .../>` (used for tracked
  // paragraph marks). Without it such a marker is taken as an opening tag and
  // the lazy body swallows everything up to the next, unrelated closing tag.

  // Insertions: <w:ins ...>...</w:ins> -> {++...++}
  xml = xml.replace(/<w:ins\b[^>]*(?<!\/)>([\s\S]*?)<\/w:ins>/g, (_whole: string, inner: string) => {
    const text = extractTextFromXml(inner);
    if (text.trim()) {
      insertions++;
      return `{++${text}++}`;
    }
    // Whitespace-only (or empty) insertion: keep the raw text, uncounted.
    return text;
  });

  // Deletions: <w:del ...>...</w:del> -> {--...--}
  xml = xml.replace(/<w:del\b[^>]*(?<!\/)>([\s\S]*?)<\/w:del>/g, (_whole: string, inner: string) => {
    const text = extractTextFromXml(inner);
    if (text.trim()) {
      deletions++;
      return `{--${text}--}`;
    }
    return '';
  });

  return {
    hasTrackChanges: true,
    content: xml,
    stats: { insertions, deletions },
  };
}
324
+
325
/**
 * Read one delimited marker that begins exactly at offset `i`.
 *
 * Yields the text between `open` and the first following `close`, together
 * with the offset just past that `close`. Yields null when `open` is not at
 * offset `i` or when no `close` follows it.
 */
function extractMarker(text: string, i: number, open: string, close: string): { content: string; end: number } | null {
  const opensHere = text.startsWith(open, i);
  const bodyStart = i + open.length;
  const bodyEnd = opensHere ? text.indexOf(close, bodyStart) : -1;

  return bodyEnd === -1
    ? null
    : { content: text.slice(bodyStart, bodyEnd), end: bodyEnd + close.length };
}
337
+
338
/**
 * Consume a run of back-to-back markers of one type, starting at offset `i`.
 * The contents of all markers in the run are concatenated, e.g.
 * {++a++}{++b++}{++c++} → "abc", and `end` is the offset after the last one.
 * Yields null when no marker starts at `i`.
 */
function collectConsecutive(text: string, i: number, open: string, close: string): { content: string; end: number } | null {
  let current = extractMarker(text, i, open, close);
  if (current === null) {
    return null;
  }

  const pieces: string[] = [];
  let cursor = i;
  while (current !== null) {
    pieces.push(current.content);
    cursor = current.end;
    // Only look for another marker while there is text left to scan.
    current = cursor < text.length ? extractMarker(text, cursor, open, close) : null;
  }

  return { content: pieces.join(''), end: cursor };
}
358
+
359
/**
 * Normalise neighbouring CriticMarkup markers in a single left-to-right pass:
 *   1. A run of same-type markers collapses into one: {++a++}{++b++} → {++ab++}
 *   2. A deletion run and an insertion run that touch (optionally separated by
 *      spaces, in either order) become a substitution:
 *      {--old--}{++new++} → {~~old~>new~~}
 *
 * In a substitution both sides lose their trailing whitespace; if either side
 * ended with a space, one space is emitted after the substitution instead.
 * Everything else is copied through character by character.
 */
function mergeAdjacentMarkers(text: string): string {
  const DEL_OPEN = '{--';
  const DEL_CLOSE = '--}';
  const INS_OPEN = '{++';
  const INS_CLOSE = '++}';

  // Offset of the first non-space character at or after `from`.
  const skipSpaces = (from: number): number => {
    let k = from;
    while (k < text.length && text[k] === ' ') k++;
    return k;
  };

  let merged = '';
  let pos = 0;

  while (pos < text.length) {
    const atDeletion = text.startsWith(DEL_OPEN, pos);
    const atInsertion = !atDeletion && text.startsWith(INS_OPEN, pos);

    if (!atDeletion && !atInsertion) {
      merged += text.charAt(pos);
      pos++;
      continue;
    }

    // Gather the run that starts here; an unterminated opener is plain text.
    const lead = atDeletion
      ? collectConsecutive(text, pos, DEL_OPEN, DEL_CLOSE)
      : collectConsecutive(text, pos, INS_OPEN, INS_CLOSE);
    if (!lead) {
      merged += text.charAt(pos);
      pos++;
      continue;
    }

    // Look past any spaces for a run of the opposite type.
    const partnerStart = skipSpaces(lead.end);
    const partner = atDeletion
      ? collectConsecutive(text, partnerStart, INS_OPEN, INS_CLOSE)
      : collectConsecutive(text, partnerStart, DEL_OPEN, DEL_CLOSE);

    if (partner) {
      // Output order is always old → new, whichever side came first.
      const removed = atDeletion ? lead.content : partner.content;
      const added = atDeletion ? partner.content : lead.content;
      const pad = removed.endsWith(' ') || added.endsWith(' ') ? ' ' : '';
      merged += `{~~${removed.trimEnd()}~>${added.trimEnd()}~~}${pad}`;
      pos = partner.end;
    } else {
      // No partner: emit the collapsed run; the skipped spaces are rescanned.
      merged += atDeletion ? `{--${lead.content}--}` : `{++${lead.content}++}`;
      pos = lead.end;
    }
  }

  return merged;
}
423
+
424
/**
 * Extract plain text from Word XML with track changes preserved as CriticMarkup.
 * This is a pandoc-free fallback that reads word/document.xml directly.
 *
 * Converts:
 *   <w:ins> content </w:ins>  →  {++text++}
 *   <w:del> content </w:del>  →  {--text--}
 * Adjacent markers are then merged (see mergeAdjacentMarkers), so a deletion
 * next to an insertion comes out as a {~~old~>new~~} substitution.
 *
 * Paragraphs whose w:pStyle is Heading1-6 are emitted as markdown "#"
 * headings. Paragraphs with no visible text are skipped; the rest are joined
 * with blank lines.
 *
 * @param docxPath - Path to Word document
 * @returns Plain text with CriticMarkup and stats
 * @throws {Error} If the file does not exist or has no word/document.xml
 */
export async function extractPlainTextWithTrackChanges(docxPath: string): Promise<{
  text: string;
  hasTrackChanges: boolean;
  stats: { insertions: number; deletions: number };
}> {
  if (!fs.existsSync(docxPath)) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const zip = new AdmZip(docxPath);
  const docEntry = zip.getEntry('word/document.xml');

  if (!docEntry) {
    throw new Error('Invalid docx: no document.xml');
  }

  let xml = docEntry.getData().toString('utf8');
  let insertions = 0;
  let deletions = 0;

  // Use unique markers (null bytes) that won't appear in normal text
  const INS_S = '\x00IS\x00';
  const INS_E = '\x00IE\x00';
  const DEL_S = '\x00DS\x00';
  const DEL_E = '\x00DE\x00';

  // The `(?<!\/)>` lookbehind in the two patterns below keeps the opening-tag
  // match from accepting a self-closing marker such as `<w:ins .../>` (used
  // for tracked paragraph marks). Without it such a marker is taken as an
  // opening tag and the lazy body swallows everything — paragraph boundaries
  // included — up to the next, unrelated closing tag.

  // Step 1: Replace <w:ins> with marker-wrapped text injected as <w:t>.
  // Whitespace-only insertions are kept as plain text (not markers) to preserve spacing.
  xml = xml.replace(/<w:ins\b[^>]*(?<!\/)>([\s\S]*?)<\/w:ins>/g, (_match, content: string) => {
    const texts: string[] = [];
    const tPat = /<w:t[^>]*>([^<]*)<\/w:t>/g;
    let m: RegExpExecArray | null;
    while ((m = tPat.exec(content)) !== null) {
      texts.push(m[1] || '');
    }
    const text = texts.join('');
    if (text.trim()) {
      insertions++;
      return `<w:r><w:t>${INS_S}${text}${INS_E}</w:t></w:r>`;
    }
    if (text.length > 0) {
      return `<w:r><w:t>${text}</w:t></w:r>`;
    }
    return '';
  });

  // Step 2: Replace <w:del> similarly (deleted runs carry w:delText).
  // Whitespace-only deletions are kept as plain text to preserve spacing.
  xml = xml.replace(/<w:del\b[^>]*(?<!\/)>([\s\S]*?)<\/w:del>/g, (_match, content: string) => {
    const texts: string[] = [];
    const tPat = /<w:delText[^>]*>([^<]*)<\/w:delText>|<w:t[^>]*>([^<]*)<\/w:t>/g;
    let m: RegExpExecArray | null;
    while ((m = tPat.exec(content)) !== null) {
      texts.push(m[1] || m[2] || '');
    }
    const text = texts.join('');
    if (text.trim()) {
      deletions++;
      return `<w:r><w:t>${DEL_S}${text}${DEL_E}</w:t></w:r>`;
    }
    if (text.length > 0) {
      return `<w:r><w:t>${text}</w:t></w:r>`;
    }
    return '';
  });

  // The five predefined XML entities, keyed by name.
  const XML_ENTITIES: Record<string, string> = {
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"',
    apos: "'",
  };

  // Step 3: Extract text paragraph by paragraph
  const paragraphs: string[] = [];
  const paraPattern = /<w:p\b[^>]*>([\s\S]*?)<\/w:p>/g;
  let pm: RegExpExecArray | null;

  while ((pm = paraPattern.exec(xml)) !== null) {
    const paraXml = pm[1] ?? '';

    // Detect heading level from paragraph style
    let headingLevel = 0;
    const styleMatch = paraXml.match(/<w:pStyle\s+w:val="Heading(\d)"/i);
    if (styleMatch && styleMatch[1]) {
      headingLevel = parseInt(styleMatch[1], 10);
    }

    // Extract all <w:t> text in order
    const texts: string[] = [];
    const tPat = /<w:t[^>]*>([^<]*)<\/w:t>/g;
    let tm: RegExpExecArray | null;
    while ((tm = tPat.exec(paraXml)) !== null) {
      texts.push(tm[1] || '');
    }

    let paraText = texts.join('');

    // Decode XML entities in a single pass. Chained replaces that handle
    // &amp; first turn the literal text "&amp;lt;" into "<" instead of "&lt;".
    paraText = paraText.replace(
      /&(amp|lt|gt|quot|apos);/g,
      (whole: string, name: string) => XML_ENTITIES[name] ?? whole
    );

    // Convert markers to CriticMarkup
    paraText = paraText
      .split(INS_S).join('{++')
      .split(INS_E).join('++}')
      .split(DEL_S).join('{--')
      .split(DEL_E).join('--}');

    // Merge adjacent del+ins (or ins+del) into substitutions.
    paraText = mergeAdjacentMarkers(paraText);

    // Collapse runs of multiple spaces into single space
    paraText = paraText.replace(/ {2,}/g, ' ');

    if (paraText.trim()) {
      if (headingLevel > 0 && headingLevel <= 6) {
        paragraphs.push('#'.repeat(headingLevel) + ' ' + paraText.trim());
      } else {
        paragraphs.push(paraText);
      }
    }
  }

  return {
    text: paragraphs.join('\n\n'),
    hasTrackChanges: insertions > 0 || deletions > 0,
    stats: { insertions, deletions },
  };
}
567
+
568
/** Options accepted by extractWithTrackChanges. */
interface ExtractWithTrackChangesOptions {
  /** When set, passed to pandoc as the `--extract-media` directory. */
  mediaDir?: string;
}
571
+
572
/**
 * Extract Word document content with track changes preserved as CriticMarkup.
 * Runs pandoc with `--track-changes=all` and rewrites its
 * `[text]{.insertion ...}` / `[text]{.deletion ...}` spans as
 * `{++text++}` / `{--text--}`. Spans whose text is empty or whitespace-only
 * are removed and not counted.
 *
 * pandoc is invoked directly with an argument array (no shell), so paths
 * containing quotes, spaces or shell metacharacters are passed through
 * verbatim and cannot alter the command.
 *
 * @param docxPath - Path to Word document
 * @param options - Options
 * @returns Track changes result with text and stats
 * @throws {Error} If the file does not exist or pandoc fails
 */
export async function extractWithTrackChanges(
  docxPath: string,
  options: ExtractWithTrackChangesOptions = {}
): Promise<{ text: string; hasTrackChanges: boolean; stats: { insertions: number; deletions: number } }> {
  const { mediaDir } = options;

  if (!fs.existsSync(docxPath)) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const { execFileSync } = await import('child_process');

  // One array element per argument: nothing here is parsed by a shell.
  const pandocArgs = [docxPath, '-t', 'markdown', '--wrap=none', '--track-changes=all'];
  if (mediaDir) {
    pandocArgs.push(`--extract-media=${mediaDir}`);
  }

  let text: string;
  try {
    text = execFileSync('pandoc', pandocArgs, {
      encoding: 'utf-8',
      maxBuffer: 50 * 1024 * 1024,
    });
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`Pandoc extraction failed: ${reason}`);
  }

  let insertions = 0;
  let deletions = 0;

  // Insertions: [text]{.insertion ...} -> {++text++}
  text = text.replace(/\[([^\]]*)\]\{\.insertion[^}]*\}/g, (_span: string, content: string) => {
    if (content.trim()) {
      insertions++;
      return `{++${content}++}`;
    }
    return '';
  });

  // Deletions: [text]{.deletion ...} -> {--text--}
  text = text.replace(/\[([^\]]*)\]\{\.deletion[^}]*\}/g, (_span: string, content: string) => {
    if (content.trim()) {
      deletions++;
      return `{--${content}--}`;
    }
    return '';
  });

  const hasTrackChanges = insertions > 0 || deletions > 0;

  return {
    text,
    hasTrackChanges,
    stats: { insertions, deletions },
  };
}