docrev 0.9.11 → 0.9.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. package/.claude/settings.local.json +9 -9
  2. package/.gitattributes +1 -1
  3. package/CHANGELOG.md +149 -149
  4. package/PLAN-tables-and-postprocess.md +850 -850
  5. package/README.md +391 -391
  6. package/bin/rev.js +11 -11
  7. package/bin/rev.ts +145 -145
  8. package/completions/rev.bash +127 -127
  9. package/completions/rev.ps1 +210 -210
  10. package/completions/rev.zsh +207 -207
  11. package/dev_notes/stress2/build_adversarial.ts +186 -186
  12. package/dev_notes/stress2/drift_matcher.ts +62 -62
  13. package/dev_notes/stress2/probe_anchors.ts +35 -35
  14. package/dev_notes/stress2/project/discussion.before.md +3 -3
  15. package/dev_notes/stress2/project/discussion.md +3 -3
  16. package/dev_notes/stress2/project/methods.before.md +20 -20
  17. package/dev_notes/stress2/project/methods.md +20 -20
  18. package/dev_notes/stress2/project/rev.yaml +5 -5
  19. package/dev_notes/stress2/project/sections.yaml +4 -4
  20. package/dev_notes/stress2/sections.yaml +5 -5
  21. package/dev_notes/stress2/trace_placement.ts +50 -50
  22. package/dev_notes/stresstest_boundaries.ts +27 -27
  23. package/dev_notes/stresstest_drift_apply.ts +43 -43
  24. package/dev_notes/stresstest_drift_compare.ts +43 -43
  25. package/dev_notes/stresstest_drift_v2.ts +54 -54
  26. package/dev_notes/stresstest_inspect.ts +54 -54
  27. package/dev_notes/stresstest_pstyle.ts +55 -55
  28. package/dev_notes/stresstest_section_debug.ts +23 -23
  29. package/dev_notes/stresstest_split.ts +70 -70
  30. package/dev_notes/stresstest_trace.ts +19 -19
  31. package/dev_notes/stresstest_verify_no_overwrite.ts +40 -40
  32. package/dist/lib/build.d.ts +50 -1
  33. package/dist/lib/build.d.ts.map +1 -1
  34. package/dist/lib/build.js +80 -30
  35. package/dist/lib/build.js.map +1 -1
  36. package/dist/lib/commands/build.d.ts.map +1 -1
  37. package/dist/lib/commands/build.js +38 -5
  38. package/dist/lib/commands/build.js.map +1 -1
  39. package/dist/lib/commands/utilities.js +164 -164
  40. package/dist/lib/commands/word-tools.js +8 -8
  41. package/dist/lib/grammar.js +3 -3
  42. package/dist/lib/import.d.ts.map +1 -1
  43. package/dist/lib/import.js +146 -24
  44. package/dist/lib/import.js.map +1 -1
  45. package/dist/lib/pdf-comments.js +44 -44
  46. package/dist/lib/plugins.js +57 -57
  47. package/dist/lib/pptx-themes.js +115 -115
  48. package/dist/lib/spelling.js +2 -2
  49. package/dist/lib/templates.js +387 -387
  50. package/dist/lib/themes.js +51 -51
  51. package/dist/lib/types.d.ts +20 -0
  52. package/dist/lib/types.d.ts.map +1 -1
  53. package/dist/lib/word-extraction.d.ts +6 -0
  54. package/dist/lib/word-extraction.d.ts.map +1 -1
  55. package/dist/lib/word-extraction.js +46 -3
  56. package/dist/lib/word-extraction.js.map +1 -1
  57. package/dist/lib/wordcomments.d.ts.map +1 -1
  58. package/dist/lib/wordcomments.js +23 -5
  59. package/dist/lib/wordcomments.js.map +1 -1
  60. package/eslint.config.js +27 -27
  61. package/lib/anchor-match.ts +276 -276
  62. package/lib/annotations.ts +644 -644
  63. package/lib/build.ts +1300 -1227
  64. package/lib/citations.ts +160 -160
  65. package/lib/commands/build.ts +833 -801
  66. package/lib/commands/citations.ts +515 -515
  67. package/lib/commands/comments.ts +1050 -1050
  68. package/lib/commands/context.ts +174 -174
  69. package/lib/commands/core.ts +309 -309
  70. package/lib/commands/doi.ts +435 -435
  71. package/lib/commands/file-ops.ts +372 -372
  72. package/lib/commands/history.ts +320 -320
  73. package/lib/commands/index.ts +87 -87
  74. package/lib/commands/init.ts +259 -259
  75. package/lib/commands/merge-resolve.ts +378 -378
  76. package/lib/commands/preview.ts +178 -178
  77. package/lib/commands/project-info.ts +244 -244
  78. package/lib/commands/quality.ts +517 -517
  79. package/lib/commands/response.ts +454 -454
  80. package/lib/commands/section-boundaries.ts +82 -82
  81. package/lib/commands/sections.ts +451 -451
  82. package/lib/commands/sync.ts +706 -706
  83. package/lib/commands/text-ops.ts +449 -449
  84. package/lib/commands/utilities.ts +448 -448
  85. package/lib/commands/verify-anchors.ts +272 -272
  86. package/lib/commands/word-tools.ts +340 -340
  87. package/lib/comment-realign.ts +517 -517
  88. package/lib/config.ts +84 -84
  89. package/lib/crossref.ts +781 -781
  90. package/lib/csl.ts +191 -191
  91. package/lib/dependencies.ts +98 -98
  92. package/lib/diff-engine.ts +465 -465
  93. package/lib/doi-cache.ts +115 -115
  94. package/lib/doi.ts +897 -897
  95. package/lib/equations.ts +506 -506
  96. package/lib/errors.ts +346 -346
  97. package/lib/format.ts +541 -541
  98. package/lib/git.ts +326 -326
  99. package/lib/grammar.ts +303 -303
  100. package/lib/image-registry.ts +180 -180
  101. package/lib/import.ts +911 -792
  102. package/lib/journals.ts +543 -543
  103. package/lib/merge.ts +633 -633
  104. package/lib/orcid.ts +144 -144
  105. package/lib/pdf-comments.ts +263 -263
  106. package/lib/pdf-import.ts +524 -524
  107. package/lib/plugins.ts +362 -362
  108. package/lib/postprocess.ts +188 -188
  109. package/lib/pptx-color-filter.lua +37 -37
  110. package/lib/pptx-template.ts +469 -469
  111. package/lib/pptx-themes.ts +483 -483
  112. package/lib/protect-restore.ts +520 -520
  113. package/lib/rate-limiter.ts +94 -94
  114. package/lib/response.ts +197 -197
  115. package/lib/restore-references.ts +240 -240
  116. package/lib/review.ts +327 -327
  117. package/lib/schema.ts +417 -417
  118. package/lib/scientific-words.ts +73 -73
  119. package/lib/sections.ts +335 -335
  120. package/lib/slides.ts +756 -756
  121. package/lib/spelling.ts +334 -334
  122. package/lib/templates.ts +526 -526
  123. package/lib/themes.ts +742 -742
  124. package/lib/trackchanges.ts +247 -247
  125. package/lib/tui.ts +450 -450
  126. package/lib/types.ts +550 -530
  127. package/lib/undo.ts +250 -250
  128. package/lib/utils.ts +69 -69
  129. package/lib/variables.ts +179 -179
  130. package/lib/word-extraction.ts +806 -759
  131. package/lib/word.ts +643 -643
  132. package/lib/wordcomments.ts +817 -798
  133. package/package.json +137 -137
  134. package/scripts/postbuild.js +28 -28
  135. package/skill/REFERENCE.md +431 -431
  136. package/skill/SKILL.md +258 -258
  137. package/tsconfig.json +26 -26
  138. package/types/index.d.ts +525 -525
@@ -1,517 +1,517 @@
1
- /**
2
- * Realign comments from a reference DOCX to markdown
3
- * Uses paragraph-level matching with exact positions
4
- */
5
-
6
- import * as fs from 'fs';
7
- import AdmZip from 'adm-zip';
8
- import { parseStringPromise } from 'xml2js';
9
-
10
- interface CommentData {
11
- author: string;
12
- text: string;
13
- }
14
-
15
- interface CommentWithPosition {
16
- id: string;
17
- position: number;
18
- author: string;
19
- text: string;
20
- }
21
-
22
- interface Paragraph {
23
- text: string;
24
- comments: CommentWithPosition[];
25
- }
26
-
27
- interface MdParagraph {
28
- text: string;
29
- start: number;
30
- end: number;
31
- }
32
-
33
- interface ParagraphMatch {
34
- index: number;
35
- score: number;
36
- paragraph: MdParagraph;
37
- }
38
-
39
- interface WordContext {
40
- before: string[];
41
- after: string[];
42
- }
43
-
44
- interface CommentInsertion {
45
- position: number;
46
- text: string;
47
- commentText: string;
48
- hasReplies: boolean;
49
- debug: string;
50
- }
51
-
52
- interface RealignOptions {
53
- dryRun?: boolean;
54
- author?: string;
55
- replyAuthor?: string;
56
- }
57
-
58
- interface RealignResult {
59
- success: boolean;
60
- dryRun?: boolean;
61
- insertions: number;
62
- matched?: number;
63
- unmatched?: number;
64
- }
65
-
66
- interface RealignMarkdownOptions {
67
- author?: string;
68
- replyAuthor?: string;
69
- }
70
-
71
- interface RealignMarkdownResult {
72
- success: boolean;
73
- markdown: string;
74
- insertions: number;
75
- error?: string;
76
- }
77
-
78
- /**
79
- * Extract paragraphs with their full text and comment positions from DOCX
80
- */
81
- export async function extractParagraphsWithComments(docxPath: string): Promise<Paragraph[]> {
82
- const zip = new AdmZip(docxPath);
83
- const doc = zip.readAsText('word/document.xml');
84
- const commentsXml = zip.readAsText('word/comments.xml');
85
-
86
- // Parse comments to get authors and texts
87
- const parsed = await parseStringPromise(commentsXml, { explicitArray: false });
88
- const commentNodes = parsed['w:comments']?.['w:comment'];
89
- if (!commentNodes) return [];
90
-
91
- const nodes = Array.isArray(commentNodes) ? commentNodes : [commentNodes];
92
- const commentData: Record<string, CommentData> = {};
93
-
94
- for (const c of nodes) {
95
- const id = c.$?.['w:id'] ?? '';
96
- const author = c.$?.['w:author'] ?? 'Unknown';
97
- let text = '';
98
- const extractT = (n: any): void => {
99
- if (!n) return;
100
- if (n['w:t']) {
101
- const t = n['w:t'];
102
- text += typeof t === 'string' ? t : (t._ || t);
103
- }
104
- if (n['w:r']) {
105
- (Array.isArray(n['w:r']) ? n['w:r'] : [n['w:r']]).forEach(extractT);
106
- }
107
- if (n['w:p']) {
108
- (Array.isArray(n['w:p']) ? n['w:p'] : [n['w:p']]).forEach(extractT);
109
- }
110
- };
111
- extractT(c);
112
- commentData[id] = { author, text: text.trim() };
113
- }
114
-
115
- // Extract paragraphs with comments
116
- const paragraphs: Paragraph[] = [];
117
- const paraPattern = /<w:p\b[^>]*>([\s\S]*?)<\/w:p>/g;
118
- let match;
119
-
120
- while ((match = paraPattern.exec(doc)) !== null) {
121
- const paraContent = match[1];
122
- const hasComments = /commentRangeStart/.test(paraContent);
123
-
124
- // Build paragraph text and track comment positions
125
- let text = '';
126
- const comments: CommentWithPosition[] = [];
127
-
128
- const tokenPattern = /<w:t[^>]*>([^<]*)<\/w:t>|<w:commentRangeStart[^>]*w:id="(\d+)"[^>]*\/?>/g;
129
- let tokenMatch;
130
-
131
- while ((tokenMatch = tokenPattern.exec(paraContent)) !== null) {
132
- if (tokenMatch[1] !== undefined) {
133
- text += tokenMatch[1];
134
- } else if (tokenMatch[2] !== undefined) {
135
- const cid = tokenMatch[2];
136
- const data = commentData[cid];
137
- if (data) {
138
- comments.push({
139
- id: cid,
140
- position: text.length,
141
- author: data.author,
142
- text: data.text,
143
- });
144
- }
145
- }
146
- }
147
-
148
- if (text.trim() || hasComments) {
149
- paragraphs.push({ text: text.trim(), comments });
150
- }
151
- }
152
-
153
- return paragraphs;
154
- }
155
-
156
- /**
157
- * Find best matching paragraph in markdown for a reference paragraph
158
- */
159
- function findMatchingParagraph(refText: string, mdParagraphs: MdParagraph[]): ParagraphMatch | null {
160
- // Normalize for comparison
161
- const normalize = (s: string): string => s.toLowerCase().replace(/\s+/g, ' ').trim();
162
- const refNorm = normalize(refText);
163
-
164
- if (refNorm.length < 20) return null;
165
-
166
- let bestMatch: ParagraphMatch | null = null;
167
- let bestScore = 0;
168
-
169
- for (let i = 0; i < mdParagraphs.length; i++) {
170
- const mdNorm = normalize(mdParagraphs[i].text);
171
-
172
- // Calculate word overlap
173
- const refWords = new Set(refNorm.split(' ').filter((w) => w.length > 3));
174
- const mdWords = mdNorm.split(' ').filter((w) => w.length > 3);
175
- const overlap = mdWords.filter((w) => refWords.has(w)).length;
176
- const score = overlap / Math.max(refWords.size, 1);
177
-
178
- // Also check for substring containment (for section headers)
179
- const containsStart = mdNorm.includes(refNorm.slice(0, 50));
180
-
181
- if (score > bestScore || (containsStart && score > 0.3)) {
182
- bestScore = Math.max(score, containsStart ? 0.8 : score);
183
- bestMatch = { index: i, score: bestScore, paragraph: mdParagraphs[i] };
184
- }
185
- }
186
-
187
- return bestScore > 0.4 ? bestMatch : null;
188
- }
189
-
190
- /**
191
- * Extract paragraphs from markdown (split by blank lines)
192
- */
193
- function parseMdParagraphs(markdown: string): MdParagraph[] {
194
- const paragraphs: MdParagraph[] = [];
195
- const parts = markdown.split(/\n\n+/);
196
-
197
- let pos = 0;
198
- for (const part of parts) {
199
- const trimmed = part.trim();
200
- if (trimmed) {
201
- const partStart = markdown.indexOf(part, pos);
202
- if (partStart !== -1) {
203
- paragraphs.push({
204
- text: trimmed,
205
- start: partStart,
206
- end: partStart + part.length,
207
- });
208
- pos = partStart + part.length;
209
- }
210
- }
211
- }
212
-
213
- return paragraphs;
214
- }
215
-
216
- /**
217
- * Normalize text for matching (remove citations, extra whitespace)
218
- */
219
- function normalizeForMatching(text: string): string {
220
- return text
221
- // Remove Word citation placeholders
222
- .replace(/\(\s*\$+\s*\)/g, '')
223
- .replace(/\$+/g, '')
224
- // Remove markdown citations
225
- .replace(/\[@[^\]]+\]/g, '')
226
- .replace(/@[A-Z][a-z]+\d{4}/g, '')
227
- // Remove rendered citations like "(Author et al. 2021)"
228
- .replace(/\([A-Z][a-z]+(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*[A-Z][a-z]+(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)/g, '')
229
- // Remove figure references like "Fig. 1" or "(Fig. 1)"
230
- .replace(/\(?Fig\.?\s*\d+[a-z]?\)?/gi, '')
231
- // Normalize whitespace
232
- .replace(/\s+/g, ' ')
233
- .trim()
234
- .toLowerCase();
235
- }
236
-
237
- /**
238
- * Find the word at or near a position in text
239
- */
240
- function getWordAtPosition(text: string, pos: number): WordContext {
241
- const before = text.slice(Math.max(0, pos - 30), pos);
242
- const after = text.slice(pos, pos + 30);
243
-
244
- // Get the last complete word before position
245
- const beforeWords = before.split(/\s+/).filter(w => w.length > 2);
246
- const afterWords = after.split(/\s+/).filter(w => w.length > 2);
247
-
248
- return {
249
- before: beforeWords.slice(-3),
250
- after: afterWords.slice(0, 3)
251
- };
252
- }
253
-
254
- /**
255
- * Find position in markdown paragraph matching reference position
256
- * Uses the anchor word (word immediately before the comment) for precise matching
257
- */
258
- function findMdPosition(refText: string, refPos: number, mdText: string): number {
259
- // Get the word(s) immediately before the comment position in reference
260
- const refWords = getWordAtPosition(refText, refPos);
261
- const normalizedMd = normalizeForMatching(mdText);
262
-
263
- // The "anchor word" is the last word before the comment
264
- const anchorWords = refWords.before;
265
-
266
- if (anchorWords.length === 0) {
267
- const ratio = refPos / Math.max(refText.length, 1);
268
- return Math.round(ratio * mdText.length);
269
- }
270
-
271
- // Try to find the anchor word(s) in markdown
272
- // Start with the most specific (all words), fall back to fewer
273
- for (let numWords = anchorWords.length; numWords >= 1; numWords--) {
274
- const searchWords = anchorWords.slice(-numWords);
275
- const pattern = searchWords.map(w =>
276
- w.toLowerCase().replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
277
- ).join('\\s+');
278
-
279
- const regex = new RegExp(pattern, 'g');
280
- const matches = [...normalizedMd.matchAll(regex)];
281
-
282
- if (matches.length === 1) {
283
- // Unique match - use this position
284
- const matchEnd = matches[0].index! + matches[0][0].length;
285
- // Map back to original markdown position
286
- const ratio = matchEnd / Math.max(normalizedMd.length, 1);
287
- return Math.round(ratio * mdText.length);
288
- } else if (matches.length > 1) {
289
- // Multiple matches - use context after to disambiguate
290
- const afterWords = refWords.after;
291
- if (afterWords.length > 0) {
292
- const afterPattern = afterWords[0].toLowerCase().replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
293
- for (const match of matches) {
294
- const matchEnd = match.index! + match[0].length;
295
- const afterContext = normalizedMd.slice(matchEnd, matchEnd + 50);
296
- if (afterContext.includes(afterPattern)) {
297
- const ratio = matchEnd / Math.max(normalizedMd.length, 1);
298
- return Math.round(ratio * mdText.length);
299
- }
300
- }
301
- }
302
- // Fall back to first match
303
- const matchEnd = matches[0].index! + matches[0][0].length;
304
- const ratio = matchEnd / Math.max(normalizedMd.length, 1);
305
- return Math.round(ratio * mdText.length);
306
- }
307
- }
308
-
309
- // Fallback: proportional position
310
- const ratio = refPos / Math.max(refText.length, 1);
311
- return Math.round(ratio * mdText.length);
312
- }
313
-
314
- /**
315
- * Extract reply comments that follow a parent comment
316
- * Returns map of parent comment text -> array of reply texts
317
- */
318
- function extractReplies(markdown: string, parentAuthor: string, replyAuthor: string): Map<string, string[]> {
319
- const replies = new Map<string, string[]>();
320
- const pattern = new RegExp(
321
- `\\{>>${parentAuthor}:\\s*([^<]+)<<\\}((?:\\s*\\{>>${replyAuthor}:[^<]+<<\\})*)`,
322
- 'g'
323
- );
324
-
325
- let match;
326
- while ((match = pattern.exec(markdown)) !== null) {
327
- const parentText = match[1].trim();
328
- const replyBlock = match[2];
329
-
330
- if (replyBlock) {
331
- const replyPattern = new RegExp(`\\{>>${replyAuthor}:\\s*([^<]+)<<\\}`, 'g');
332
- const replyTexts: string[] = [];
333
- let replyMatch;
334
- while ((replyMatch = replyPattern.exec(replyBlock)) !== null) {
335
- replyTexts.push(replyMatch[1].trim());
336
- }
337
- if (replyTexts.length > 0) {
338
- replies.set(parentText.slice(0, 50), replyTexts); // Use first 50 chars as key
339
- }
340
- }
341
- }
342
-
343
- return replies;
344
- }
345
-
346
- /**
347
- * Realign comments from reference DOCX to markdown
348
- */
349
- export async function realignComments(
350
- docxPath: string,
351
- markdownPath: string,
352
- options: RealignOptions = {}
353
- ): Promise<RealignResult> {
354
- const { dryRun = false, author = 'Guy Colling', replyAuthor = 'Gilles Colling' } = options;
355
-
356
- // Read original markdown to extract replies before stripping
357
- const originalMarkdown = fs.readFileSync(markdownPath, 'utf-8');
358
-
359
- // Extract reply relationships
360
- const replies = extractReplies(originalMarkdown, author, replyAuthor);
361
- console.log(`Found ${replies.size} ${author} comments with ${replyAuthor} replies`);
362
-
363
- // Extract reference paragraphs with comments
364
- const refParagraphs = await extractParagraphsWithComments(docxPath);
365
- const refWithComments = refParagraphs.filter(
366
- (p) => p.comments.length > 0 && p.comments.some((c) => c.author === author)
367
- );
368
-
369
- console.log(`Found ${refWithComments.length} paragraphs with ${author} comments in reference`);
370
-
371
- // Strip ALL comments (both authors) from markdown to start fresh
372
- let markdown = originalMarkdown;
373
- markdown = markdown.replace(/\s*\{>>[\s\S]+?<<\}/g, '');
374
- console.log(`Stripped all comments from markdown`);
375
-
376
- // Parse markdown paragraphs
377
- const mdParagraphs = parseMdParagraphs(markdown);
378
-
379
- // Track insertions (position, text) - will insert from end to start
380
- const insertions: CommentInsertion[] = [];
381
- let matched = 0;
382
- let unmatched = 0;
383
-
384
- for (const refPara of refWithComments) {
385
- const match = findMatchingParagraph(refPara.text, mdParagraphs);
386
-
387
- if (!match) {
388
- console.log(` No match for: "${refPara.text.slice(0, 60)}..."`);
389
- unmatched++;
390
- continue;
391
- }
392
-
393
- matched++;
394
- const mdPara = match.paragraph;
395
-
396
- // Get author's comments in this paragraph
397
- const authorComments = refPara.comments.filter((c) => c.author === author);
398
-
399
- for (const comment of authorComments) {
400
- // Find corresponding position in markdown paragraph
401
- const mdPos = findMdPosition(refPara.text, comment.position, mdPara?.text ?? '');
402
- const absolutePos = (mdPara?.start ?? 0) + mdPos;
403
-
404
- // Build comment mark with any replies
405
- let commentMark = ` {>>${comment.author}: ${comment.text}<<}`;
406
-
407
- // Check for replies
408
- const replyKey = comment.text.trim().slice(0, 50);
409
- const replyTexts = replies.get(replyKey);
410
- if (replyTexts) {
411
- for (const replyText of replyTexts) {
412
- commentMark += ` {>>${replyAuthor}: ${replyText}<<}`;
413
- }
414
- }
415
-
416
- insertions.push({
417
- position: absolutePos,
418
- text: commentMark,
419
- commentText: comment.text.slice(0, 30),
420
- hasReplies: !!replyTexts,
421
- debug: `"${(mdPara?.text ?? '').slice(Math.max(0, mdPos - 20), mdPos)}|HERE|${(mdPara?.text ?? '').slice(mdPos, mdPos + 20)}"`,
422
- });
423
- }
424
- }
425
-
426
- console.log(`Matched ${matched} paragraphs, ${unmatched} unmatched`);
427
- console.log(`Inserting ${insertions.length} comments (${insertions.filter((i) => i.hasReplies).length} with replies)`);
428
-
429
- if (dryRun) {
430
- console.log('\nDry run - would insert:');
431
- for (const ins of insertions.slice(0, 10)) {
432
- console.log(` At ${ins.position}: ${ins.debug}`);
433
- console.log(` Comment: "${ins.commentText}..."${ins.hasReplies ? ' (+ replies)' : ''}`);
434
- }
435
- return { success: true, dryRun: true, insertions: insertions.length };
436
- }
437
-
438
- // Sort by position descending and insert
439
- insertions.sort((a, b) => b.position - a.position);
440
-
441
- for (const ins of insertions) {
442
- markdown = markdown.slice(0, ins.position) + ins.text + markdown.slice(ins.position);
443
- }
444
-
445
- // Write result
446
- fs.writeFileSync(markdownPath, markdown);
447
-
448
- return { success: true, insertions: insertions.length, matched, unmatched };
449
- }
450
-
451
- /**
452
- * Realign comments in markdown string (in-memory, doesn't write to file)
453
- */
454
- export async function realignMarkdown(
455
- docxPath: string,
456
- markdown: string,
457
- options: RealignMarkdownOptions = {}
458
- ): Promise<RealignMarkdownResult> {
459
- const { author = 'Guy Colling', replyAuthor = 'Gilles Colling' } = options;
460
-
461
- try {
462
- // Extract reply relationships from original markdown
463
- const replies = extractReplies(markdown, author, replyAuthor);
464
-
465
- // Extract reference paragraphs with comments
466
- const refParagraphs = await extractParagraphsWithComments(docxPath);
467
- const refWithComments = refParagraphs.filter(
468
- (p) => p.comments.length > 0 && p.comments.some((c) => c.author === author)
469
- );
470
-
471
- // Strip ALL comments from markdown
472
- let result = markdown.replace(/\s*\{>>[\s\S]+?<<\}/g, '');
473
-
474
- // Parse markdown paragraphs
475
- const mdParagraphs = parseMdParagraphs(result);
476
-
477
- // Track insertions
478
- const insertions: Array<{ position: number; text: string }> = [];
479
-
480
- for (const refPara of refWithComments) {
481
- const match = findMatchingParagraph(refPara.text, mdParagraphs);
482
- if (!match) continue;
483
-
484
- const mdPara = match.paragraph;
485
- const authorComments = refPara.comments.filter((c) => c.author === author);
486
-
487
- for (const comment of authorComments) {
488
- const mdPos = findMdPosition(refPara.text, comment.position, mdPara?.text ?? '');
489
- const absolutePos = (mdPara?.start ?? 0) + mdPos;
490
-
491
- let commentMark = ` {>>${comment.author}: ${comment.text}<<}`;
492
-
493
- // Check for replies
494
- const replyKey = comment.text.trim().slice(0, 50);
495
- const replyTexts = replies.get(replyKey);
496
- if (replyTexts) {
497
- for (const replyText of replyTexts) {
498
- commentMark += ` {>>${replyAuthor}: ${replyText}<<}`;
499
- }
500
- }
501
-
502
- insertions.push({ position: absolutePos, text: commentMark });
503
- }
504
- }
505
-
506
- // Sort by position descending and insert
507
- insertions.sort((a, b) => b.position - a.position);
508
-
509
- for (const ins of insertions) {
510
- result = result.slice(0, ins.position) + ins.text + result.slice(ins.position);
511
- }
512
-
513
- return { success: true, markdown: result, insertions: insertions.length };
514
- } catch (err: any) {
515
- return { success: false, markdown, insertions: 0, error: err.message };
516
- }
517
- }
1
+ /**
2
+ * Realign comments from a reference DOCX to markdown
3
+ * Uses paragraph-level matching with exact positions
4
+ */
5
+
6
+ import * as fs from 'fs';
7
+ import AdmZip from 'adm-zip';
8
+ import { parseStringPromise } from 'xml2js';
9
+
10
+ interface CommentData {
11
+ author: string;
12
+ text: string;
13
+ }
14
+
15
+ interface CommentWithPosition {
16
+ id: string;
17
+ position: number;
18
+ author: string;
19
+ text: string;
20
+ }
21
+
22
+ interface Paragraph {
23
+ text: string;
24
+ comments: CommentWithPosition[];
25
+ }
26
+
27
+ interface MdParagraph {
28
+ text: string;
29
+ start: number;
30
+ end: number;
31
+ }
32
+
33
+ interface ParagraphMatch {
34
+ index: number;
35
+ score: number;
36
+ paragraph: MdParagraph;
37
+ }
38
+
39
+ interface WordContext {
40
+ before: string[];
41
+ after: string[];
42
+ }
43
+
44
+ interface CommentInsertion {
45
+ position: number;
46
+ text: string;
47
+ commentText: string;
48
+ hasReplies: boolean;
49
+ debug: string;
50
+ }
51
+
52
+ interface RealignOptions {
53
+ dryRun?: boolean;
54
+ author?: string;
55
+ replyAuthor?: string;
56
+ }
57
+
58
+ interface RealignResult {
59
+ success: boolean;
60
+ dryRun?: boolean;
61
+ insertions: number;
62
+ matched?: number;
63
+ unmatched?: number;
64
+ }
65
+
66
+ interface RealignMarkdownOptions {
67
+ author?: string;
68
+ replyAuthor?: string;
69
+ }
70
+
71
+ interface RealignMarkdownResult {
72
+ success: boolean;
73
+ markdown: string;
74
+ insertions: number;
75
+ error?: string;
76
+ }
77
+
78
+ /**
79
+ * Extract paragraphs with their full text and comment positions from DOCX
80
+ */
81
+ export async function extractParagraphsWithComments(docxPath: string): Promise<Paragraph[]> {
82
+ const zip = new AdmZip(docxPath);
83
+ const doc = zip.readAsText('word/document.xml');
84
+ const commentsXml = zip.readAsText('word/comments.xml');
85
+
86
+ // Parse comments to get authors and texts
87
+ const parsed = await parseStringPromise(commentsXml, { explicitArray: false });
88
+ const commentNodes = parsed['w:comments']?.['w:comment'];
89
+ if (!commentNodes) return [];
90
+
91
+ const nodes = Array.isArray(commentNodes) ? commentNodes : [commentNodes];
92
+ const commentData: Record<string, CommentData> = {};
93
+
94
+ for (const c of nodes) {
95
+ const id = c.$?.['w:id'] ?? '';
96
+ const author = c.$?.['w:author'] ?? 'Unknown';
97
+ let text = '';
98
+ const extractT = (n: any): void => {
99
+ if (!n) return;
100
+ if (n['w:t']) {
101
+ const t = n['w:t'];
102
+ text += typeof t === 'string' ? t : (t._ || t);
103
+ }
104
+ if (n['w:r']) {
105
+ (Array.isArray(n['w:r']) ? n['w:r'] : [n['w:r']]).forEach(extractT);
106
+ }
107
+ if (n['w:p']) {
108
+ (Array.isArray(n['w:p']) ? n['w:p'] : [n['w:p']]).forEach(extractT);
109
+ }
110
+ };
111
+ extractT(c);
112
+ commentData[id] = { author, text: text.trim() };
113
+ }
114
+
115
+ // Extract paragraphs with comments
116
+ const paragraphs: Paragraph[] = [];
117
+ const paraPattern = /<w:p\b[^>]*>([\s\S]*?)<\/w:p>/g;
118
+ let match;
119
+
120
+ while ((match = paraPattern.exec(doc)) !== null) {
121
+ const paraContent = match[1];
122
+ const hasComments = /commentRangeStart/.test(paraContent);
123
+
124
+ // Build paragraph text and track comment positions
125
+ let text = '';
126
+ const comments: CommentWithPosition[] = [];
127
+
128
+ const tokenPattern = /<w:t[^>]*>([^<]*)<\/w:t>|<w:commentRangeStart[^>]*w:id="(\d+)"[^>]*\/?>/g;
129
+ let tokenMatch;
130
+
131
+ while ((tokenMatch = tokenPattern.exec(paraContent)) !== null) {
132
+ if (tokenMatch[1] !== undefined) {
133
+ text += tokenMatch[1];
134
+ } else if (tokenMatch[2] !== undefined) {
135
+ const cid = tokenMatch[2];
136
+ const data = commentData[cid];
137
+ if (data) {
138
+ comments.push({
139
+ id: cid,
140
+ position: text.length,
141
+ author: data.author,
142
+ text: data.text,
143
+ });
144
+ }
145
+ }
146
+ }
147
+
148
+ if (text.trim() || hasComments) {
149
+ paragraphs.push({ text: text.trim(), comments });
150
+ }
151
+ }
152
+
153
+ return paragraphs;
154
+ }
155
+
156
+ /**
157
+ * Find best matching paragraph in markdown for a reference paragraph
158
+ */
159
+ function findMatchingParagraph(refText: string, mdParagraphs: MdParagraph[]): ParagraphMatch | null {
160
+ // Normalize for comparison
161
+ const normalize = (s: string): string => s.toLowerCase().replace(/\s+/g, ' ').trim();
162
+ const refNorm = normalize(refText);
163
+
164
+ if (refNorm.length < 20) return null;
165
+
166
+ let bestMatch: ParagraphMatch | null = null;
167
+ let bestScore = 0;
168
+
169
+ for (let i = 0; i < mdParagraphs.length; i++) {
170
+ const mdNorm = normalize(mdParagraphs[i].text);
171
+
172
+ // Calculate word overlap
173
+ const refWords = new Set(refNorm.split(' ').filter((w) => w.length > 3));
174
+ const mdWords = mdNorm.split(' ').filter((w) => w.length > 3);
175
+ const overlap = mdWords.filter((w) => refWords.has(w)).length;
176
+ const score = overlap / Math.max(refWords.size, 1);
177
+
178
+ // Also check for substring containment (for section headers)
179
+ const containsStart = mdNorm.includes(refNorm.slice(0, 50));
180
+
181
+ if (score > bestScore || (containsStart && score > 0.3)) {
182
+ bestScore = Math.max(score, containsStart ? 0.8 : score);
183
+ bestMatch = { index: i, score: bestScore, paragraph: mdParagraphs[i] };
184
+ }
185
+ }
186
+
187
+ return bestScore > 0.4 ? bestMatch : null;
188
+ }
189
+
190
+ /**
191
+ * Extract paragraphs from markdown (split by blank lines)
192
+ */
193
+ function parseMdParagraphs(markdown: string): MdParagraph[] {
194
+ const paragraphs: MdParagraph[] = [];
195
+ const parts = markdown.split(/\n\n+/);
196
+
197
+ let pos = 0;
198
+ for (const part of parts) {
199
+ const trimmed = part.trim();
200
+ if (trimmed) {
201
+ const partStart = markdown.indexOf(part, pos);
202
+ if (partStart !== -1) {
203
+ paragraphs.push({
204
+ text: trimmed,
205
+ start: partStart,
206
+ end: partStart + part.length,
207
+ });
208
+ pos = partStart + part.length;
209
+ }
210
+ }
211
+ }
212
+
213
+ return paragraphs;
214
+ }
215
+
216
+ /**
217
+ * Normalize text for matching (remove citations, extra whitespace)
218
+ */
219
+ function normalizeForMatching(text: string): string {
220
+ return text
221
+ // Remove Word citation placeholders
222
+ .replace(/\(\s*\$+\s*\)/g, '')
223
+ .replace(/\$+/g, '')
224
+ // Remove markdown citations
225
+ .replace(/\[@[^\]]+\]/g, '')
226
+ .replace(/@[A-Z][a-z]+\d{4}/g, '')
227
+ // Remove rendered citations like "(Author et al. 2021)"
228
+ .replace(/\([A-Z][a-z]+(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*[A-Z][a-z]+(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)/g, '')
229
+ // Remove figure references like "Fig. 1" or "(Fig. 1)"
230
+ .replace(/\(?Fig\.?\s*\d+[a-z]?\)?/gi, '')
231
+ // Normalize whitespace
232
+ .replace(/\s+/g, ' ')
233
+ .trim()
234
+ .toLowerCase();
235
+ }
236
+
237
+ /**
238
+ * Find the word at or near a position in text
239
+ */
240
+ function getWordAtPosition(text: string, pos: number): WordContext {
241
+ const before = text.slice(Math.max(0, pos - 30), pos);
242
+ const after = text.slice(pos, pos + 30);
243
+
244
+ // Get the last complete word before position
245
+ const beforeWords = before.split(/\s+/).filter(w => w.length > 2);
246
+ const afterWords = after.split(/\s+/).filter(w => w.length > 2);
247
+
248
+ return {
249
+ before: beforeWords.slice(-3),
250
+ after: afterWords.slice(0, 3)
251
+ };
252
+ }
253
+
254
/**
 * Estimate where a position in a reference paragraph falls in a markdown paragraph.
 *
 * The words just before `refPos` in the reference act as an anchor. The anchor
 * is searched for in the normalized markdown, first with all anchor words and
 * then with progressively fewer (always keeping the words nearest the
 * position). When the anchor occurs more than once, the first word after
 * `refPos` is used to pick the occurrence that is followed by that word within
 * the next 50 normalized characters; if none qualifies, the first occurrence
 * wins. With no anchor words, or no occurrence at all, the relative offset
 * (`refPos / refText.length`) is applied to the markdown length.
 *
 * The search runs on normalized text, so the end of the match is mapped back
 * to `mdText` proportionally; the returned offset is therefore approximate.
 *
 * @param refText - Paragraph text from the reference document.
 * @param refPos - Character offset of the comment within `refText`.
 * @param mdText - Text of the matched markdown paragraph.
 * @returns Estimated character offset within `mdText`.
 */
function findMdPosition(refText: string, refPos: number, mdText: string): number {
  const escapeRegExp = (s: string): string => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  // Fallback estimate: same relative offset in the markdown as in the reference.
  const proportional = (): number => {
    const ratio = refPos / Math.max(refText.length, 1);
    return Math.round(ratio * mdText.length);
  };

  const refWords = getWordAtPosition(refText, refPos);
  const anchorWords = refWords.before;
  if (anchorWords.length === 0) {
    return proportional();
  }

  const normalizedMd = normalizeForMatching(mdText);

  // Map an offset in the normalized markdown back onto the original markdown.
  const toMdOffset = (normalizedOffset: number): number => {
    const ratio = normalizedOffset / Math.max(normalizedMd.length, 1);
    return Math.round(ratio * mdText.length);
  };
  const endOf = (m: RegExpMatchArray): number => (m.index ?? 0) + m[0].length;

  // Plain (unescaped) word: it is compared with `includes`, which matches
  // literally, so regex escaping here would prevent words containing
  // punctuation from ever matching.
  const nextWord = refWords.after[0]?.toLowerCase();

  // Most specific anchor first (all words), then fall back to fewer.
  for (let numWords = anchorWords.length; numWords >= 1; numWords--) {
    const pattern = anchorWords
      .slice(-numWords)
      .map((w) => escapeRegExp(w.toLowerCase()))
      .join('\\s+');
    const matches = [...normalizedMd.matchAll(new RegExp(pattern, 'g'))];

    const first = matches[0];
    if (first === undefined) {
      continue;
    }

    if (matches.length > 1 && nextWord !== undefined) {
      // Ambiguous anchor: prefer the occurrence followed by the next reference word.
      const confirmed = matches.find((m) => {
        const end = endOf(m);
        return normalizedMd.slice(end, end + 50).includes(nextWord);
      });
      if (confirmed !== undefined) {
        return toMdOffset(endOf(confirmed));
      }
    }

    // Unique match, or no occurrence could be confirmed: use the first one.
    return toMdOffset(endOf(first));
  }

  return proportional();
}
313
+
314
/**
 * Collect reply comments that directly follow a parent author's comment.
 *
 * Scans `markdown` for `{>>parentAuthor: ...<<}` marks that are immediately
 * followed (whitespace allowed) by one or more `{>>replyAuthor: ...<<}` marks.
 * Author names are matched literally, and comment text may contain `<`
 * (e.g. "p < 0.05"); a comment ends at the first `<<}`.
 *
 * @param markdown - Markdown containing `{>>author: text<<}` comment marks.
 * @param parentAuthor - Author whose comments start a thread.
 * @param replyAuthor - Author whose comments count as replies.
 * @returns Map from the first 50 characters of the trimmed parent comment text
 *          to the trimmed reply texts in document order. Parents without
 *          replies are omitted; a later parent with the same 50-character
 *          prefix replaces an earlier one.
 */
function extractReplies(markdown: string, parentAuthor: string, replyAuthor: string): Map<string, string[]> {
  // Author names are user data: escape them so characters such as "." or "("
  // are matched literally instead of being interpreted as regex syntax.
  const escapeRegExp = (s: string): string => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const parent = escapeRegExp(parentAuthor);
  const reply = escapeRegExp(replyAuthor);

  // Lazy "anything up to the first <<}" so a lone "<" inside a comment is allowed.
  const commentBody = '[\\s\\S]+?';

  const threadPattern = new RegExp(
    `\\{>>${parent}:\\s*(${commentBody})<<\\}((?:\\s*\\{>>${reply}:${commentBody}<<\\})*)`,
    'g'
  );
  const replyPattern = new RegExp(`\\{>>${reply}:\\s*(${commentBody})<<\\}`, 'g');

  const replies = new Map<string, string[]>();

  for (const thread of markdown.matchAll(threadPattern)) {
    const parentText = (thread[1] ?? '').trim();
    const replyBlock = thread[2] ?? '';

    const replyTexts = [...replyBlock.matchAll(replyPattern)].map((m) => (m[1] ?? '').trim());
    if (replyTexts.length > 0) {
      replies.set(parentText.slice(0, 50), replyTexts); // first 50 chars act as the key
    }
  }

  return replies;
}
345
+
346
/**
 * Re-anchor one author's comments from a reference DOCX into a markdown file.
 *
 * Steps:
 *  1. Read the markdown file and remember which of `author`'s comments are
 *     followed by `replyAuthor` replies.
 *  2. Extract the reference paragraphs that carry comments by `author`.
 *  3. Strip every `{>>...<<}` mark (with preceding whitespace) from the markdown.
 *  4. For each reference paragraph, find the matching markdown paragraph and
 *     estimate an insertion offset for each of the author's comments.
 *  5. Unless `dryRun` is set, insert the comment marks (each followed by its
 *     remembered replies) and overwrite the markdown file.
 *
 * Progress is logged with `console.log`. In a dry run only the first ten
 * planned insertions are printed and the file is left untouched.
 *
 * @param docxPath - Path to the reference DOCX containing the comments.
 * @param markdownPath - Path to the markdown file to read and (unless dry run) rewrite.
 * @param options - `dryRun` (default `false`), `author` and `replyAuthor`
 *                  (defaults `'Guy Colling'` / `'Gilles Colling'`).
 * @returns Insertion count, plus matched/unmatched paragraph counts for a real run
 *          or `dryRun: true` for a dry run.
 */
export async function realignComments(
  docxPath: string,
  markdownPath: string,
  options: RealignOptions = {}
): Promise<RealignResult> {
  const { dryRun = false, author = 'Guy Colling', replyAuthor = 'Gilles Colling' } = options;

  // Replies must be captured from the untouched markdown, before stripping.
  const originalMarkdown = fs.readFileSync(markdownPath, 'utf-8');
  const replies = extractReplies(originalMarkdown, author, replyAuthor);
  console.log(`Found ${replies.size} ${author} comments with ${replyAuthor} replies`);

  const refParagraphs = await extractParagraphsWithComments(docxPath);
  const refWithComments = refParagraphs.filter(
    (p) => p.comments.length > 0 && p.comments.some((c) => c.author === author)
  );
  console.log(`Found ${refWithComments.length} paragraphs with ${author} comments in reference`);

  // Strip ALL comments (both authors) from markdown to start fresh
  let markdown = originalMarkdown.replace(/\s*\{>>[\s\S]+?<<\}/g, '');
  console.log(`Stripped all comments from markdown`);

  const mdParagraphs = parseMdParagraphs(markdown);

  const insertions: CommentInsertion[] = [];
  let matched = 0;
  let unmatched = 0;

  for (const refPara of refWithComments) {
    const match = findMatchingParagraph(refPara.text, mdParagraphs);

    if (!match) {
      console.log(`  No match for: "${refPara.text.slice(0, 60)}..."`);
      unmatched++;
      continue;
    }

    matched++;
    const mdPara = match.paragraph;
    const mdParaText = mdPara?.text ?? '';
    const mdParaStart = mdPara?.start ?? 0;

    const authorComments = refPara.comments.filter((c) => c.author === author);

    for (const comment of authorComments) {
      // Offset inside the paragraph, then shifted to an offset in the whole document.
      const mdPos = findMdPosition(refPara.text, comment.position, mdParaText);
      const absolutePos = mdParaStart + mdPos;

      let commentMark = ` {>>${comment.author}: ${comment.text}<<}`;

      // Re-attach replies that followed this comment in the original markdown.
      const replyKey = comment.text.trim().slice(0, 50);
      const replyTexts = replies.get(replyKey);
      if (replyTexts) {
        for (const replyText of replyTexts) {
          commentMark += ` {>>${replyAuthor}: ${replyText}<<}`;
        }
      }

      insertions.push({
        position: absolutePos,
        text: commentMark,
        commentText: comment.text.slice(0, 30),
        hasReplies: !!replyTexts,
        debug: `"${mdParaText.slice(Math.max(0, mdPos - 20), mdPos)}|HERE|${mdParaText.slice(mdPos, mdPos + 20)}"`,
      });
    }
  }

  console.log(`Matched ${matched} paragraphs, ${unmatched} unmatched`);
  console.log(`Inserting ${insertions.length} comments (${insertions.filter((i) => i.hasReplies).length} with replies)`);

  if (dryRun) {
    console.log('\nDry run - would insert:');
    for (const ins of insertions.slice(0, 10)) {
      console.log(`  At ${ins.position}: ${ins.debug}`);
      console.log(`    Comment: "${ins.commentText}..."${ins.hasReplies ? ' (+ replies)' : ''}`);
    }
    return { success: true, dryRun: true, insertions: insertions.length };
  }

  // Insert from the end of the document backwards so earlier offsets stay valid.
  // Ties are broken by reverse collection order: when several comments share an
  // offset, the one collected last is inserted first, so the final text keeps
  // them in the order they were collected instead of reversing them.
  const ordered = insertions
    .map((insertion, order) => ({ insertion, order }))
    .sort((a, b) => b.insertion.position - a.insertion.position || b.order - a.order);

  for (const { insertion } of ordered) {
    markdown = markdown.slice(0, insertion.position) + insertion.text + markdown.slice(insertion.position);
  }

  fs.writeFileSync(markdownPath, markdown);

  return { success: true, insertions: insertions.length, matched, unmatched };
}
450
+
451
/**
 * Re-anchor one author's comments from a reference DOCX into a markdown string.
 *
 * In-memory counterpart of the file-based realignment: nothing is written to
 * disk and nothing is logged. All existing `{>>...<<}` marks are stripped from
 * the input, then the author's comments from the DOCX (each followed by the
 * replies that followed it in the input markdown) are inserted at estimated
 * positions.
 *
 * Errors raised while extracting or realigning are caught and reported through
 * the result rather than thrown; in that case the input markdown is returned
 * unchanged.
 *
 * @param docxPath - Path to the reference DOCX containing the comments.
 * @param markdown - Markdown text to realign.
 * @param options - `author` and `replyAuthor`
 *                  (defaults `'Guy Colling'` / `'Gilles Colling'`).
 * @returns `{ success: true, markdown, insertions }` on success, or
 *          `{ success: false, markdown, insertions: 0, error }` on failure.
 */
export async function realignMarkdown(
  docxPath: string,
  markdown: string,
  options: RealignMarkdownOptions = {}
): Promise<RealignMarkdownResult> {
  const { author = 'Guy Colling', replyAuthor = 'Gilles Colling' } = options;

  try {
    // Replies must be captured from the untouched markdown, before stripping.
    const replies = extractReplies(markdown, author, replyAuthor);

    const refParagraphs = await extractParagraphsWithComments(docxPath);
    const refWithComments = refParagraphs.filter(
      (p) => p.comments.length > 0 && p.comments.some((c) => c.author === author)
    );

    // Strip ALL comments from markdown
    let result = markdown.replace(/\s*\{>>[\s\S]+?<<\}/g, '');

    const mdParagraphs = parseMdParagraphs(result);

    const insertions: Array<{ position: number; text: string }> = [];

    for (const refPara of refWithComments) {
      const match = findMatchingParagraph(refPara.text, mdParagraphs);
      if (!match) continue;

      const mdPara = match.paragraph;
      const authorComments = refPara.comments.filter((c) => c.author === author);

      for (const comment of authorComments) {
        // Offset inside the paragraph, then shifted to an offset in the whole document.
        const mdPos = findMdPosition(refPara.text, comment.position, mdPara?.text ?? '');
        const absolutePos = (mdPara?.start ?? 0) + mdPos;

        let commentMark = ` {>>${comment.author}: ${comment.text}<<}`;

        // Re-attach replies that followed this comment in the input markdown.
        const replyTexts = replies.get(comment.text.trim().slice(0, 50));
        if (replyTexts) {
          for (const replyText of replyTexts) {
            commentMark += ` {>>${replyAuthor}: ${replyText}<<}`;
          }
        }

        insertions.push({ position: absolutePos, text: commentMark });
      }
    }

    // Insert from the end of the document backwards so earlier offsets stay valid.
    // Ties are broken by reverse collection order: when several comments share an
    // offset, the one collected last is inserted first, so the final text keeps
    // them in the order they were collected instead of reversing them.
    const ordered = insertions
      .map((insertion, order) => ({ insertion, order }))
      .sort((a, b) => b.insertion.position - a.insertion.position || b.order - a.order);

    for (const { insertion } of ordered) {
      result = result.slice(0, insertion.position) + insertion.text + result.slice(insertion.position);
    }

    return { success: true, markdown: result, insertions: insertions.length };
  } catch (err: unknown) {
    // Anything can be thrown in JavaScript; only Error instances carry `.message`.
    const message = err instanceof Error ? err.message : String(err);
    return { success: false, markdown, insertions: 0, error: message };
  }
}