docrev 0.9.13 → 0.9.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +9 -9
- package/.gitattributes +1 -1
- package/CHANGELOG.md +149 -149
- package/PLAN-tables-and-postprocess.md +850 -850
- package/README.md +411 -391
- package/bin/rev.js +11 -11
- package/bin/rev.ts +145 -145
- package/completions/rev.bash +127 -127
- package/completions/rev.ps1 +210 -210
- package/completions/rev.zsh +207 -207
- package/dev_notes/stress2/build_adversarial.ts +186 -186
- package/dev_notes/stress2/drift_matcher.ts +62 -62
- package/dev_notes/stress2/probe_anchors.ts +35 -35
- package/dev_notes/stress2/project/discussion.before.md +3 -3
- package/dev_notes/stress2/project/discussion.md +3 -3
- package/dev_notes/stress2/project/methods.before.md +20 -20
- package/dev_notes/stress2/project/methods.md +20 -20
- package/dev_notes/stress2/project/rev.yaml +5 -5
- package/dev_notes/stress2/project/sections.yaml +4 -4
- package/dev_notes/stress2/sections.yaml +5 -5
- package/dev_notes/stress2/trace_placement.ts +50 -50
- package/dev_notes/stresstest_boundaries.ts +27 -27
- package/dev_notes/stresstest_drift_apply.ts +43 -43
- package/dev_notes/stresstest_drift_compare.ts +43 -43
- package/dev_notes/stresstest_drift_v2.ts +54 -54
- package/dev_notes/stresstest_inspect.ts +54 -54
- package/dev_notes/stresstest_pstyle.ts +55 -55
- package/dev_notes/stresstest_section_debug.ts +23 -23
- package/dev_notes/stresstest_split.ts +70 -70
- package/dev_notes/stresstest_trace.ts +19 -19
- package/dev_notes/stresstest_verify_no_overwrite.ts +40 -40
- package/dist/lib/build.d.ts +38 -1
- package/dist/lib/build.d.ts.map +1 -1
- package/dist/lib/build.js +68 -30
- package/dist/lib/build.js.map +1 -1
- package/dist/lib/commands/build.d.ts.map +1 -1
- package/dist/lib/commands/build.js +38 -5
- package/dist/lib/commands/build.js.map +1 -1
- package/dist/lib/commands/utilities.js +164 -164
- package/dist/lib/commands/word-tools.js +8 -8
- package/dist/lib/grammar.js +3 -3
- package/dist/lib/pdf-comments.js +44 -44
- package/dist/lib/plugins.js +57 -57
- package/dist/lib/pptx-themes.js +115 -115
- package/dist/lib/spelling.js +2 -2
- package/dist/lib/templates.js +387 -387
- package/dist/lib/themes.js +51 -51
- package/eslint.config.js +27 -27
- package/lib/anchor-match.ts +276 -276
- package/lib/annotations.ts +644 -644
- package/lib/build.ts +1300 -1251
- package/lib/citations.ts +160 -160
- package/lib/commands/build.ts +833 -801
- package/lib/commands/citations.ts +515 -515
- package/lib/commands/comments.ts +1050 -1050
- package/lib/commands/context.ts +174 -174
- package/lib/commands/core.ts +309 -309
- package/lib/commands/doi.ts +435 -435
- package/lib/commands/file-ops.ts +372 -372
- package/lib/commands/history.ts +320 -320
- package/lib/commands/index.ts +87 -87
- package/lib/commands/init.ts +259 -259
- package/lib/commands/merge-resolve.ts +378 -378
- package/lib/commands/preview.ts +178 -178
- package/lib/commands/project-info.ts +244 -244
- package/lib/commands/quality.ts +517 -517
- package/lib/commands/response.ts +454 -454
- package/lib/commands/section-boundaries.ts +82 -82
- package/lib/commands/sections.ts +451 -451
- package/lib/commands/sync.ts +706 -706
- package/lib/commands/text-ops.ts +449 -449
- package/lib/commands/utilities.ts +448 -448
- package/lib/commands/verify-anchors.ts +272 -272
- package/lib/commands/word-tools.ts +340 -340
- package/lib/comment-realign.ts +517 -517
- package/lib/config.ts +84 -84
- package/lib/crossref.ts +781 -781
- package/lib/csl.ts +191 -191
- package/lib/dependencies.ts +98 -98
- package/lib/diff-engine.ts +465 -465
- package/lib/doi-cache.ts +115 -115
- package/lib/doi.ts +897 -897
- package/lib/equations.ts +506 -506
- package/lib/errors.ts +346 -346
- package/lib/format.ts +541 -541
- package/lib/git.ts +326 -326
- package/lib/grammar.ts +303 -303
- package/lib/image-registry.ts +180 -180
- package/lib/import.ts +911 -911
- package/lib/journals.ts +543 -543
- package/lib/merge.ts +633 -633
- package/lib/orcid.ts +144 -144
- package/lib/pdf-comments.ts +263 -263
- package/lib/pdf-import.ts +524 -524
- package/lib/plugins.ts +362 -362
- package/lib/postprocess.ts +188 -188
- package/lib/pptx-color-filter.lua +37 -37
- package/lib/pptx-template.ts +469 -469
- package/lib/pptx-themes.ts +483 -483
- package/lib/protect-restore.ts +520 -520
- package/lib/rate-limiter.ts +94 -94
- package/lib/response.ts +197 -197
- package/lib/restore-references.ts +240 -240
- package/lib/review.ts +327 -327
- package/lib/schema.ts +417 -417
- package/lib/scientific-words.ts +73 -73
- package/lib/sections.ts +335 -335
- package/lib/slides.ts +756 -756
- package/lib/spelling.ts +334 -334
- package/lib/templates.ts +526 -526
- package/lib/themes.ts +742 -742
- package/lib/trackchanges.ts +247 -247
- package/lib/tui.ts +450 -450
- package/lib/types.ts +550 -550
- package/lib/undo.ts +250 -250
- package/lib/utils.ts +69 -69
- package/lib/variables.ts +179 -179
- package/lib/word-extraction.ts +806 -806
- package/lib/word.ts +643 -643
- package/lib/wordcomments.ts +817 -817
- package/package.json +137 -137
- package/scripts/postbuild.js +28 -28
- package/skill/REFERENCE.md +473 -431
- package/skill/SKILL.md +274 -258
- package/tsconfig.json +26 -26
- package/types/index.d.ts +525 -525
package/lib/comment-realign.ts
CHANGED
|
@@ -1,517 +1,517 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Realign comments from a reference DOCX to markdown
|
|
3
|
-
* Uses paragraph-level matching with exact positions
|
|
4
|
-
*/
|
|
5
|
-
|
|
6
|
-
import * as fs from 'fs';
|
|
7
|
-
import AdmZip from 'adm-zip';
|
|
8
|
-
import { parseStringPromise } from 'xml2js';
|
|
9
|
-
|
|
10
|
-
interface CommentData {
|
|
11
|
-
author: string;
|
|
12
|
-
text: string;
|
|
13
|
-
}
|
|
14
|
-
|
|
15
|
-
interface CommentWithPosition {
|
|
16
|
-
id: string;
|
|
17
|
-
position: number;
|
|
18
|
-
author: string;
|
|
19
|
-
text: string;
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
interface Paragraph {
|
|
23
|
-
text: string;
|
|
24
|
-
comments: CommentWithPosition[];
|
|
25
|
-
}
|
|
26
|
-
|
|
27
|
-
interface MdParagraph {
|
|
28
|
-
text: string;
|
|
29
|
-
start: number;
|
|
30
|
-
end: number;
|
|
31
|
-
}
|
|
32
|
-
|
|
33
|
-
interface ParagraphMatch {
|
|
34
|
-
index: number;
|
|
35
|
-
score: number;
|
|
36
|
-
paragraph: MdParagraph;
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
interface WordContext {
|
|
40
|
-
before: string[];
|
|
41
|
-
after: string[];
|
|
42
|
-
}
|
|
43
|
-
|
|
44
|
-
interface CommentInsertion {
|
|
45
|
-
position: number;
|
|
46
|
-
text: string;
|
|
47
|
-
commentText: string;
|
|
48
|
-
hasReplies: boolean;
|
|
49
|
-
debug: string;
|
|
50
|
-
}
|
|
51
|
-
|
|
52
|
-
interface RealignOptions {
|
|
53
|
-
dryRun?: boolean;
|
|
54
|
-
author?: string;
|
|
55
|
-
replyAuthor?: string;
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
interface RealignResult {
|
|
59
|
-
success: boolean;
|
|
60
|
-
dryRun?: boolean;
|
|
61
|
-
insertions: number;
|
|
62
|
-
matched?: number;
|
|
63
|
-
unmatched?: number;
|
|
64
|
-
}
|
|
65
|
-
|
|
66
|
-
interface RealignMarkdownOptions {
|
|
67
|
-
author?: string;
|
|
68
|
-
replyAuthor?: string;
|
|
69
|
-
}
|
|
70
|
-
|
|
71
|
-
interface RealignMarkdownResult {
|
|
72
|
-
success: boolean;
|
|
73
|
-
markdown: string;
|
|
74
|
-
insertions: number;
|
|
75
|
-
error?: string;
|
|
76
|
-
}
|
|
77
|
-
|
|
78
|
-
/**
|
|
79
|
-
* Extract paragraphs with their full text and comment positions from DOCX
|
|
80
|
-
*/
|
|
81
|
-
export async function extractParagraphsWithComments(docxPath: string): Promise<Paragraph[]> {
|
|
82
|
-
const zip = new AdmZip(docxPath);
|
|
83
|
-
const doc = zip.readAsText('word/document.xml');
|
|
84
|
-
const commentsXml = zip.readAsText('word/comments.xml');
|
|
85
|
-
|
|
86
|
-
// Parse comments to get authors and texts
|
|
87
|
-
const parsed = await parseStringPromise(commentsXml, { explicitArray: false });
|
|
88
|
-
const commentNodes = parsed['w:comments']?.['w:comment'];
|
|
89
|
-
if (!commentNodes) return [];
|
|
90
|
-
|
|
91
|
-
const nodes = Array.isArray(commentNodes) ? commentNodes : [commentNodes];
|
|
92
|
-
const commentData: Record<string, CommentData> = {};
|
|
93
|
-
|
|
94
|
-
for (const c of nodes) {
|
|
95
|
-
const id = c.$?.['w:id'] ?? '';
|
|
96
|
-
const author = c.$?.['w:author'] ?? 'Unknown';
|
|
97
|
-
let text = '';
|
|
98
|
-
const extractT = (n: any): void => {
|
|
99
|
-
if (!n) return;
|
|
100
|
-
if (n['w:t']) {
|
|
101
|
-
const t = n['w:t'];
|
|
102
|
-
text += typeof t === 'string' ? t : (t._ || t);
|
|
103
|
-
}
|
|
104
|
-
if (n['w:r']) {
|
|
105
|
-
(Array.isArray(n['w:r']) ? n['w:r'] : [n['w:r']]).forEach(extractT);
|
|
106
|
-
}
|
|
107
|
-
if (n['w:p']) {
|
|
108
|
-
(Array.isArray(n['w:p']) ? n['w:p'] : [n['w:p']]).forEach(extractT);
|
|
109
|
-
}
|
|
110
|
-
};
|
|
111
|
-
extractT(c);
|
|
112
|
-
commentData[id] = { author, text: text.trim() };
|
|
113
|
-
}
|
|
114
|
-
|
|
115
|
-
// Extract paragraphs with comments
|
|
116
|
-
const paragraphs: Paragraph[] = [];
|
|
117
|
-
const paraPattern = /<w:p\b[^>]*>([\s\S]*?)<\/w:p>/g;
|
|
118
|
-
let match;
|
|
119
|
-
|
|
120
|
-
while ((match = paraPattern.exec(doc)) !== null) {
|
|
121
|
-
const paraContent = match[1];
|
|
122
|
-
const hasComments = /commentRangeStart/.test(paraContent);
|
|
123
|
-
|
|
124
|
-
// Build paragraph text and track comment positions
|
|
125
|
-
let text = '';
|
|
126
|
-
const comments: CommentWithPosition[] = [];
|
|
127
|
-
|
|
128
|
-
const tokenPattern = /<w:t[^>]*>([^<]*)<\/w:t>|<w:commentRangeStart[^>]*w:id="(\d+)"[^>]*\/?>/g;
|
|
129
|
-
let tokenMatch;
|
|
130
|
-
|
|
131
|
-
while ((tokenMatch = tokenPattern.exec(paraContent)) !== null) {
|
|
132
|
-
if (tokenMatch[1] !== undefined) {
|
|
133
|
-
text += tokenMatch[1];
|
|
134
|
-
} else if (tokenMatch[2] !== undefined) {
|
|
135
|
-
const cid = tokenMatch[2];
|
|
136
|
-
const data = commentData[cid];
|
|
137
|
-
if (data) {
|
|
138
|
-
comments.push({
|
|
139
|
-
id: cid,
|
|
140
|
-
position: text.length,
|
|
141
|
-
author: data.author,
|
|
142
|
-
text: data.text,
|
|
143
|
-
});
|
|
144
|
-
}
|
|
145
|
-
}
|
|
146
|
-
}
|
|
147
|
-
|
|
148
|
-
if (text.trim() || hasComments) {
|
|
149
|
-
paragraphs.push({ text: text.trim(), comments });
|
|
150
|
-
}
|
|
151
|
-
}
|
|
152
|
-
|
|
153
|
-
return paragraphs;
|
|
154
|
-
}
|
|
155
|
-
|
|
156
|
-
/**
|
|
157
|
-
* Find best matching paragraph in markdown for a reference paragraph
|
|
158
|
-
*/
|
|
159
|
-
function findMatchingParagraph(refText: string, mdParagraphs: MdParagraph[]): ParagraphMatch | null {
|
|
160
|
-
// Normalize for comparison
|
|
161
|
-
const normalize = (s: string): string => s.toLowerCase().replace(/\s+/g, ' ').trim();
|
|
162
|
-
const refNorm = normalize(refText);
|
|
163
|
-
|
|
164
|
-
if (refNorm.length < 20) return null;
|
|
165
|
-
|
|
166
|
-
let bestMatch: ParagraphMatch | null = null;
|
|
167
|
-
let bestScore = 0;
|
|
168
|
-
|
|
169
|
-
for (let i = 0; i < mdParagraphs.length; i++) {
|
|
170
|
-
const mdNorm = normalize(mdParagraphs[i].text);
|
|
171
|
-
|
|
172
|
-
// Calculate word overlap
|
|
173
|
-
const refWords = new Set(refNorm.split(' ').filter((w) => w.length > 3));
|
|
174
|
-
const mdWords = mdNorm.split(' ').filter((w) => w.length > 3);
|
|
175
|
-
const overlap = mdWords.filter((w) => refWords.has(w)).length;
|
|
176
|
-
const score = overlap / Math.max(refWords.size, 1);
|
|
177
|
-
|
|
178
|
-
// Also check for substring containment (for section headers)
|
|
179
|
-
const containsStart = mdNorm.includes(refNorm.slice(0, 50));
|
|
180
|
-
|
|
181
|
-
if (score > bestScore || (containsStart && score > 0.3)) {
|
|
182
|
-
bestScore = Math.max(score, containsStart ? 0.8 : score);
|
|
183
|
-
bestMatch = { index: i, score: bestScore, paragraph: mdParagraphs[i] };
|
|
184
|
-
}
|
|
185
|
-
}
|
|
186
|
-
|
|
187
|
-
return bestScore > 0.4 ? bestMatch : null;
|
|
188
|
-
}
|
|
189
|
-
|
|
190
|
-
/**
|
|
191
|
-
* Extract paragraphs from markdown (split by blank lines)
|
|
192
|
-
*/
|
|
193
|
-
function parseMdParagraphs(markdown: string): MdParagraph[] {
|
|
194
|
-
const paragraphs: MdParagraph[] = [];
|
|
195
|
-
const parts = markdown.split(/\n\n+/);
|
|
196
|
-
|
|
197
|
-
let pos = 0;
|
|
198
|
-
for (const part of parts) {
|
|
199
|
-
const trimmed = part.trim();
|
|
200
|
-
if (trimmed) {
|
|
201
|
-
const partStart = markdown.indexOf(part, pos);
|
|
202
|
-
if (partStart !== -1) {
|
|
203
|
-
paragraphs.push({
|
|
204
|
-
text: trimmed,
|
|
205
|
-
start: partStart,
|
|
206
|
-
end: partStart + part.length,
|
|
207
|
-
});
|
|
208
|
-
pos = partStart + part.length;
|
|
209
|
-
}
|
|
210
|
-
}
|
|
211
|
-
}
|
|
212
|
-
|
|
213
|
-
return paragraphs;
|
|
214
|
-
}
|
|
215
|
-
|
|
216
|
-
/**
|
|
217
|
-
* Normalize text for matching (remove citations, extra whitespace)
|
|
218
|
-
*/
|
|
219
|
-
function normalizeForMatching(text: string): string {
|
|
220
|
-
return text
|
|
221
|
-
// Remove Word citation placeholders
|
|
222
|
-
.replace(/\(\s*\$+\s*\)/g, '')
|
|
223
|
-
.replace(/\$+/g, '')
|
|
224
|
-
// Remove markdown citations
|
|
225
|
-
.replace(/\[@[^\]]+\]/g, '')
|
|
226
|
-
.replace(/@[A-Z][a-z]+\d{4}/g, '')
|
|
227
|
-
// Remove rendered citations like "(Author et al. 2021)"
|
|
228
|
-
.replace(/\([A-Z][a-z]+(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*[A-Z][a-z]+(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)/g, '')
|
|
229
|
-
// Remove figure references like "Fig. 1" or "(Fig. 1)"
|
|
230
|
-
.replace(/\(?Fig\.?\s*\d+[a-z]?\)?/gi, '')
|
|
231
|
-
// Normalize whitespace
|
|
232
|
-
.replace(/\s+/g, ' ')
|
|
233
|
-
.trim()
|
|
234
|
-
.toLowerCase();
|
|
235
|
-
}
|
|
236
|
-
|
|
237
|
-
/**
|
|
238
|
-
* Find the word at or near a position in text
|
|
239
|
-
*/
|
|
240
|
-
function getWordAtPosition(text: string, pos: number): WordContext {
|
|
241
|
-
const before = text.slice(Math.max(0, pos - 30), pos);
|
|
242
|
-
const after = text.slice(pos, pos + 30);
|
|
243
|
-
|
|
244
|
-
// Get the last complete word before position
|
|
245
|
-
const beforeWords = before.split(/\s+/).filter(w => w.length > 2);
|
|
246
|
-
const afterWords = after.split(/\s+/).filter(w => w.length > 2);
|
|
247
|
-
|
|
248
|
-
return {
|
|
249
|
-
before: beforeWords.slice(-3),
|
|
250
|
-
after: afterWords.slice(0, 3)
|
|
251
|
-
};
|
|
252
|
-
}
|
|
253
|
-
|
|
254
|
-
/**
|
|
255
|
-
* Find position in markdown paragraph matching reference position
|
|
256
|
-
* Uses the anchor word (word immediately before the comment) for precise matching
|
|
257
|
-
*/
|
|
258
|
-
function findMdPosition(refText: string, refPos: number, mdText: string): number {
|
|
259
|
-
// Get the word(s) immediately before the comment position in reference
|
|
260
|
-
const refWords = getWordAtPosition(refText, refPos);
|
|
261
|
-
const normalizedMd = normalizeForMatching(mdText);
|
|
262
|
-
|
|
263
|
-
// The "anchor word" is the last word before the comment
|
|
264
|
-
const anchorWords = refWords.before;
|
|
265
|
-
|
|
266
|
-
if (anchorWords.length === 0) {
|
|
267
|
-
const ratio = refPos / Math.max(refText.length, 1);
|
|
268
|
-
return Math.round(ratio * mdText.length);
|
|
269
|
-
}
|
|
270
|
-
|
|
271
|
-
// Try to find the anchor word(s) in markdown
|
|
272
|
-
// Start with the most specific (all words), fall back to fewer
|
|
273
|
-
for (let numWords = anchorWords.length; numWords >= 1; numWords--) {
|
|
274
|
-
const searchWords = anchorWords.slice(-numWords);
|
|
275
|
-
const pattern = searchWords.map(w =>
|
|
276
|
-
w.toLowerCase().replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
|
|
277
|
-
).join('\\s+');
|
|
278
|
-
|
|
279
|
-
const regex = new RegExp(pattern, 'g');
|
|
280
|
-
const matches = [...normalizedMd.matchAll(regex)];
|
|
281
|
-
|
|
282
|
-
if (matches.length === 1) {
|
|
283
|
-
// Unique match - use this position
|
|
284
|
-
const matchEnd = matches[0].index! + matches[0][0].length;
|
|
285
|
-
// Map back to original markdown position
|
|
286
|
-
const ratio = matchEnd / Math.max(normalizedMd.length, 1);
|
|
287
|
-
return Math.round(ratio * mdText.length);
|
|
288
|
-
} else if (matches.length > 1) {
|
|
289
|
-
// Multiple matches - use context after to disambiguate
|
|
290
|
-
const afterWords = refWords.after;
|
|
291
|
-
if (afterWords.length > 0) {
|
|
292
|
-
const afterPattern = afterWords[0].toLowerCase().replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
|
|
293
|
-
for (const match of matches) {
|
|
294
|
-
const matchEnd = match.index! + match[0].length;
|
|
295
|
-
const afterContext = normalizedMd.slice(matchEnd, matchEnd + 50);
|
|
296
|
-
if (afterContext.includes(afterPattern)) {
|
|
297
|
-
const ratio = matchEnd / Math.max(normalizedMd.length, 1);
|
|
298
|
-
return Math.round(ratio * mdText.length);
|
|
299
|
-
}
|
|
300
|
-
}
|
|
301
|
-
}
|
|
302
|
-
// Fall back to first match
|
|
303
|
-
const matchEnd = matches[0].index! + matches[0][0].length;
|
|
304
|
-
const ratio = matchEnd / Math.max(normalizedMd.length, 1);
|
|
305
|
-
return Math.round(ratio * mdText.length);
|
|
306
|
-
}
|
|
307
|
-
}
|
|
308
|
-
|
|
309
|
-
// Fallback: proportional position
|
|
310
|
-
const ratio = refPos / Math.max(refText.length, 1);
|
|
311
|
-
return Math.round(ratio * mdText.length);
|
|
312
|
-
}
|
|
313
|
-
|
|
314
|
-
/**
|
|
315
|
-
* Extract reply comments that follow a parent comment
|
|
316
|
-
* Returns map of parent comment text -> array of reply texts
|
|
317
|
-
*/
|
|
318
|
-
function extractReplies(markdown: string, parentAuthor: string, replyAuthor: string): Map<string, string[]> {
|
|
319
|
-
const replies = new Map<string, string[]>();
|
|
320
|
-
const pattern = new RegExp(
|
|
321
|
-
`\\{>>${parentAuthor}:\\s*([^<]+)<<\\}((?:\\s*\\{>>${replyAuthor}:[^<]+<<\\})*)`,
|
|
322
|
-
'g'
|
|
323
|
-
);
|
|
324
|
-
|
|
325
|
-
let match;
|
|
326
|
-
while ((match = pattern.exec(markdown)) !== null) {
|
|
327
|
-
const parentText = match[1].trim();
|
|
328
|
-
const replyBlock = match[2];
|
|
329
|
-
|
|
330
|
-
if (replyBlock) {
|
|
331
|
-
const replyPattern = new RegExp(`\\{>>${replyAuthor}:\\s*([^<]+)<<\\}`, 'g');
|
|
332
|
-
const replyTexts: string[] = [];
|
|
333
|
-
let replyMatch;
|
|
334
|
-
while ((replyMatch = replyPattern.exec(replyBlock)) !== null) {
|
|
335
|
-
replyTexts.push(replyMatch[1].trim());
|
|
336
|
-
}
|
|
337
|
-
if (replyTexts.length > 0) {
|
|
338
|
-
replies.set(parentText.slice(0, 50), replyTexts); // Use first 50 chars as key
|
|
339
|
-
}
|
|
340
|
-
}
|
|
341
|
-
}
|
|
342
|
-
|
|
343
|
-
return replies;
|
|
344
|
-
}
|
|
345
|
-
|
|
346
|
-
/**
|
|
347
|
-
* Realign comments from reference DOCX to markdown
|
|
348
|
-
*/
|
|
349
|
-
export async function realignComments(
|
|
350
|
-
docxPath: string,
|
|
351
|
-
markdownPath: string,
|
|
352
|
-
options: RealignOptions = {}
|
|
353
|
-
): Promise<RealignResult> {
|
|
354
|
-
const { dryRun = false, author = 'Guy Colling', replyAuthor = 'Gilles Colling' } = options;
|
|
355
|
-
|
|
356
|
-
// Read original markdown to extract replies before stripping
|
|
357
|
-
const originalMarkdown = fs.readFileSync(markdownPath, 'utf-8');
|
|
358
|
-
|
|
359
|
-
// Extract reply relationships
|
|
360
|
-
const replies = extractReplies(originalMarkdown, author, replyAuthor);
|
|
361
|
-
console.log(`Found ${replies.size} ${author} comments with ${replyAuthor} replies`);
|
|
362
|
-
|
|
363
|
-
// Extract reference paragraphs with comments
|
|
364
|
-
const refParagraphs = await extractParagraphsWithComments(docxPath);
|
|
365
|
-
const refWithComments = refParagraphs.filter(
|
|
366
|
-
(p) => p.comments.length > 0 && p.comments.some((c) => c.author === author)
|
|
367
|
-
);
|
|
368
|
-
|
|
369
|
-
console.log(`Found ${refWithComments.length} paragraphs with ${author} comments in reference`);
|
|
370
|
-
|
|
371
|
-
// Strip ALL comments (both authors) from markdown to start fresh
|
|
372
|
-
let markdown = originalMarkdown;
|
|
373
|
-
markdown = markdown.replace(/\s*\{>>[\s\S]+?<<\}/g, '');
|
|
374
|
-
console.log(`Stripped all comments from markdown`);
|
|
375
|
-
|
|
376
|
-
// Parse markdown paragraphs
|
|
377
|
-
const mdParagraphs = parseMdParagraphs(markdown);
|
|
378
|
-
|
|
379
|
-
// Track insertions (position, text) - will insert from end to start
|
|
380
|
-
const insertions: CommentInsertion[] = [];
|
|
381
|
-
let matched = 0;
|
|
382
|
-
let unmatched = 0;
|
|
383
|
-
|
|
384
|
-
for (const refPara of refWithComments) {
|
|
385
|
-
const match = findMatchingParagraph(refPara.text, mdParagraphs);
|
|
386
|
-
|
|
387
|
-
if (!match) {
|
|
388
|
-
console.log(` No match for: "${refPara.text.slice(0, 60)}..."`);
|
|
389
|
-
unmatched++;
|
|
390
|
-
continue;
|
|
391
|
-
}
|
|
392
|
-
|
|
393
|
-
matched++;
|
|
394
|
-
const mdPara = match.paragraph;
|
|
395
|
-
|
|
396
|
-
// Get author's comments in this paragraph
|
|
397
|
-
const authorComments = refPara.comments.filter((c) => c.author === author);
|
|
398
|
-
|
|
399
|
-
for (const comment of authorComments) {
|
|
400
|
-
// Find corresponding position in markdown paragraph
|
|
401
|
-
const mdPos = findMdPosition(refPara.text, comment.position, mdPara?.text ?? '');
|
|
402
|
-
const absolutePos = (mdPara?.start ?? 0) + mdPos;
|
|
403
|
-
|
|
404
|
-
// Build comment mark with any replies
|
|
405
|
-
let commentMark = ` {>>${comment.author}: ${comment.text}<<}`;
|
|
406
|
-
|
|
407
|
-
// Check for replies
|
|
408
|
-
const replyKey = comment.text.trim().slice(0, 50);
|
|
409
|
-
const replyTexts = replies.get(replyKey);
|
|
410
|
-
if (replyTexts) {
|
|
411
|
-
for (const replyText of replyTexts) {
|
|
412
|
-
commentMark += ` {>>${replyAuthor}: ${replyText}<<}`;
|
|
413
|
-
}
|
|
414
|
-
}
|
|
415
|
-
|
|
416
|
-
insertions.push({
|
|
417
|
-
position: absolutePos,
|
|
418
|
-
text: commentMark,
|
|
419
|
-
commentText: comment.text.slice(0, 30),
|
|
420
|
-
hasReplies: !!replyTexts,
|
|
421
|
-
debug: `"${(mdPara?.text ?? '').slice(Math.max(0, mdPos - 20), mdPos)}|HERE|${(mdPara?.text ?? '').slice(mdPos, mdPos + 20)}"`,
|
|
422
|
-
});
|
|
423
|
-
}
|
|
424
|
-
}
|
|
425
|
-
|
|
426
|
-
console.log(`Matched ${matched} paragraphs, ${unmatched} unmatched`);
|
|
427
|
-
console.log(`Inserting ${insertions.length} comments (${insertions.filter((i) => i.hasReplies).length} with replies)`);
|
|
428
|
-
|
|
429
|
-
if (dryRun) {
|
|
430
|
-
console.log('\nDry run - would insert:');
|
|
431
|
-
for (const ins of insertions.slice(0, 10)) {
|
|
432
|
-
console.log(` At ${ins.position}: ${ins.debug}`);
|
|
433
|
-
console.log(` Comment: "${ins.commentText}..."${ins.hasReplies ? ' (+ replies)' : ''}`);
|
|
434
|
-
}
|
|
435
|
-
return { success: true, dryRun: true, insertions: insertions.length };
|
|
436
|
-
}
|
|
437
|
-
|
|
438
|
-
// Sort by position descending and insert
|
|
439
|
-
insertions.sort((a, b) => b.position - a.position);
|
|
440
|
-
|
|
441
|
-
for (const ins of insertions) {
|
|
442
|
-
markdown = markdown.slice(0, ins.position) + ins.text + markdown.slice(ins.position);
|
|
443
|
-
}
|
|
444
|
-
|
|
445
|
-
// Write result
|
|
446
|
-
fs.writeFileSync(markdownPath, markdown);
|
|
447
|
-
|
|
448
|
-
return { success: true, insertions: insertions.length, matched, unmatched };
|
|
449
|
-
}
|
|
450
|
-
|
|
451
|
-
/**
|
|
452
|
-
* Realign comments in markdown string (in-memory, doesn't write to file)
|
|
453
|
-
*/
|
|
454
|
-
export async function realignMarkdown(
|
|
455
|
-
docxPath: string,
|
|
456
|
-
markdown: string,
|
|
457
|
-
options: RealignMarkdownOptions = {}
|
|
458
|
-
): Promise<RealignMarkdownResult> {
|
|
459
|
-
const { author = 'Guy Colling', replyAuthor = 'Gilles Colling' } = options;
|
|
460
|
-
|
|
461
|
-
try {
|
|
462
|
-
// Extract reply relationships from original markdown
|
|
463
|
-
const replies = extractReplies(markdown, author, replyAuthor);
|
|
464
|
-
|
|
465
|
-
// Extract reference paragraphs with comments
|
|
466
|
-
const refParagraphs = await extractParagraphsWithComments(docxPath);
|
|
467
|
-
const refWithComments = refParagraphs.filter(
|
|
468
|
-
(p) => p.comments.length > 0 && p.comments.some((c) => c.author === author)
|
|
469
|
-
);
|
|
470
|
-
|
|
471
|
-
// Strip ALL comments from markdown
|
|
472
|
-
let result = markdown.replace(/\s*\{>>[\s\S]+?<<\}/g, '');
|
|
473
|
-
|
|
474
|
-
// Parse markdown paragraphs
|
|
475
|
-
const mdParagraphs = parseMdParagraphs(result);
|
|
476
|
-
|
|
477
|
-
// Track insertions
|
|
478
|
-
const insertions: Array<{ position: number; text: string }> = [];
|
|
479
|
-
|
|
480
|
-
for (const refPara of refWithComments) {
|
|
481
|
-
const match = findMatchingParagraph(refPara.text, mdParagraphs);
|
|
482
|
-
if (!match) continue;
|
|
483
|
-
|
|
484
|
-
const mdPara = match.paragraph;
|
|
485
|
-
const authorComments = refPara.comments.filter((c) => c.author === author);
|
|
486
|
-
|
|
487
|
-
for (const comment of authorComments) {
|
|
488
|
-
const mdPos = findMdPosition(refPara.text, comment.position, mdPara?.text ?? '');
|
|
489
|
-
const absolutePos = (mdPara?.start ?? 0) + mdPos;
|
|
490
|
-
|
|
491
|
-
let commentMark = ` {>>${comment.author}: ${comment.text}<<}`;
|
|
492
|
-
|
|
493
|
-
// Check for replies
|
|
494
|
-
const replyKey = comment.text.trim().slice(0, 50);
|
|
495
|
-
const replyTexts = replies.get(replyKey);
|
|
496
|
-
if (replyTexts) {
|
|
497
|
-
for (const replyText of replyTexts) {
|
|
498
|
-
commentMark += ` {>>${replyAuthor}: ${replyText}<<}`;
|
|
499
|
-
}
|
|
500
|
-
}
|
|
501
|
-
|
|
502
|
-
insertions.push({ position: absolutePos, text: commentMark });
|
|
503
|
-
}
|
|
504
|
-
}
|
|
505
|
-
|
|
506
|
-
// Sort by position descending and insert
|
|
507
|
-
insertions.sort((a, b) => b.position - a.position);
|
|
508
|
-
|
|
509
|
-
for (const ins of insertions) {
|
|
510
|
-
result = result.slice(0, ins.position) + ins.text + result.slice(ins.position);
|
|
511
|
-
}
|
|
512
|
-
|
|
513
|
-
return { success: true, markdown: result, insertions: insertions.length };
|
|
514
|
-
} catch (err: any) {
|
|
515
|
-
return { success: false, markdown, insertions: 0, error: err.message };
|
|
516
|
-
}
|
|
517
|
-
}
|
|
1
|
+
/**
|
|
2
|
+
* Realign comments from a reference DOCX to markdown
|
|
3
|
+
* Uses paragraph-level matching with exact positions
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import * as fs from 'fs';
|
|
7
|
+
import AdmZip from 'adm-zip';
|
|
8
|
+
import { parseStringPromise } from 'xml2js';
|
|
9
|
+
|
|
10
|
+
interface CommentData {
|
|
11
|
+
author: string;
|
|
12
|
+
text: string;
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
interface CommentWithPosition {
|
|
16
|
+
id: string;
|
|
17
|
+
position: number;
|
|
18
|
+
author: string;
|
|
19
|
+
text: string;
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
interface Paragraph {
|
|
23
|
+
text: string;
|
|
24
|
+
comments: CommentWithPosition[];
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
interface MdParagraph {
|
|
28
|
+
text: string;
|
|
29
|
+
start: number;
|
|
30
|
+
end: number;
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
interface ParagraphMatch {
|
|
34
|
+
index: number;
|
|
35
|
+
score: number;
|
|
36
|
+
paragraph: MdParagraph;
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
interface WordContext {
|
|
40
|
+
before: string[];
|
|
41
|
+
after: string[];
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
interface CommentInsertion {
|
|
45
|
+
position: number;
|
|
46
|
+
text: string;
|
|
47
|
+
commentText: string;
|
|
48
|
+
hasReplies: boolean;
|
|
49
|
+
debug: string;
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
interface RealignOptions {
|
|
53
|
+
dryRun?: boolean;
|
|
54
|
+
author?: string;
|
|
55
|
+
replyAuthor?: string;
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
interface RealignResult {
|
|
59
|
+
success: boolean;
|
|
60
|
+
dryRun?: boolean;
|
|
61
|
+
insertions: number;
|
|
62
|
+
matched?: number;
|
|
63
|
+
unmatched?: number;
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
interface RealignMarkdownOptions {
|
|
67
|
+
author?: string;
|
|
68
|
+
replyAuthor?: string;
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
interface RealignMarkdownResult {
|
|
72
|
+
success: boolean;
|
|
73
|
+
markdown: string;
|
|
74
|
+
insertions: number;
|
|
75
|
+
error?: string;
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
/**
|
|
79
|
+
* Extract paragraphs with their full text and comment positions from DOCX
|
|
80
|
+
*/
|
|
81
|
+
export async function extractParagraphsWithComments(docxPath: string): Promise<Paragraph[]> {
|
|
82
|
+
const zip = new AdmZip(docxPath);
|
|
83
|
+
const doc = zip.readAsText('word/document.xml');
|
|
84
|
+
const commentsXml = zip.readAsText('word/comments.xml');
|
|
85
|
+
|
|
86
|
+
// Parse comments to get authors and texts
|
|
87
|
+
const parsed = await parseStringPromise(commentsXml, { explicitArray: false });
|
|
88
|
+
const commentNodes = parsed['w:comments']?.['w:comment'];
|
|
89
|
+
if (!commentNodes) return [];
|
|
90
|
+
|
|
91
|
+
const nodes = Array.isArray(commentNodes) ? commentNodes : [commentNodes];
|
|
92
|
+
const commentData: Record<string, CommentData> = {};
|
|
93
|
+
|
|
94
|
+
for (const c of nodes) {
|
|
95
|
+
const id = c.$?.['w:id'] ?? '';
|
|
96
|
+
const author = c.$?.['w:author'] ?? 'Unknown';
|
|
97
|
+
let text = '';
|
|
98
|
+
const extractT = (n: any): void => {
|
|
99
|
+
if (!n) return;
|
|
100
|
+
if (n['w:t']) {
|
|
101
|
+
const t = n['w:t'];
|
|
102
|
+
text += typeof t === 'string' ? t : (t._ || t);
|
|
103
|
+
}
|
|
104
|
+
if (n['w:r']) {
|
|
105
|
+
(Array.isArray(n['w:r']) ? n['w:r'] : [n['w:r']]).forEach(extractT);
|
|
106
|
+
}
|
|
107
|
+
if (n['w:p']) {
|
|
108
|
+
(Array.isArray(n['w:p']) ? n['w:p'] : [n['w:p']]).forEach(extractT);
|
|
109
|
+
}
|
|
110
|
+
};
|
|
111
|
+
extractT(c);
|
|
112
|
+
commentData[id] = { author, text: text.trim() };
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
// Extract paragraphs with comments
|
|
116
|
+
const paragraphs: Paragraph[] = [];
|
|
117
|
+
const paraPattern = /<w:p\b[^>]*>([\s\S]*?)<\/w:p>/g;
|
|
118
|
+
let match;
|
|
119
|
+
|
|
120
|
+
while ((match = paraPattern.exec(doc)) !== null) {
|
|
121
|
+
const paraContent = match[1];
|
|
122
|
+
const hasComments = /commentRangeStart/.test(paraContent);
|
|
123
|
+
|
|
124
|
+
// Build paragraph text and track comment positions
|
|
125
|
+
let text = '';
|
|
126
|
+
const comments: CommentWithPosition[] = [];
|
|
127
|
+
|
|
128
|
+
const tokenPattern = /<w:t[^>]*>([^<]*)<\/w:t>|<w:commentRangeStart[^>]*w:id="(\d+)"[^>]*\/?>/g;
|
|
129
|
+
let tokenMatch;
|
|
130
|
+
|
|
131
|
+
while ((tokenMatch = tokenPattern.exec(paraContent)) !== null) {
|
|
132
|
+
if (tokenMatch[1] !== undefined) {
|
|
133
|
+
text += tokenMatch[1];
|
|
134
|
+
} else if (tokenMatch[2] !== undefined) {
|
|
135
|
+
const cid = tokenMatch[2];
|
|
136
|
+
const data = commentData[cid];
|
|
137
|
+
if (data) {
|
|
138
|
+
comments.push({
|
|
139
|
+
id: cid,
|
|
140
|
+
position: text.length,
|
|
141
|
+
author: data.author,
|
|
142
|
+
text: data.text,
|
|
143
|
+
});
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
if (text.trim() || hasComments) {
|
|
149
|
+
paragraphs.push({ text: text.trim(), comments });
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
return paragraphs;
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
/**
 * Find the markdown paragraph that best matches a reference paragraph.
 *
 * Both texts are lower-cased and whitespace-collapsed before comparison.
 * The base score is the fraction of distinct reference words longer than
 * three characters that also occur in the markdown paragraph. When the
 * paragraph additionally contains the first 50 characters of the reference
 * and the base score exceeds 0.3, the score is raised to at least 0.8.
 *
 * @param refText - Text of the reference paragraph.
 * @param mdParagraphs - Candidate markdown paragraphs.
 * @returns The highest-scoring candidate (the earliest one on ties) when its
 *   score is above 0.4; `null` otherwise, or when the normalized reference is
 *   shorter than 20 characters.
 */
function findMatchingParagraph(refText: string, mdParagraphs: MdParagraph[]): ParagraphMatch | null {
  const normalize = (s: string): string => s.toLowerCase().replace(/\s+/g, ' ').trim();
  const significantWords = (s: string): Set<string> =>
    new Set(s.split(' ').filter((w) => w.length > 3));

  const refNorm = normalize(refText);
  if (refNorm.length < 20) return null;

  // Loop-invariant: computed once instead of once per candidate.
  const refWords = significantWords(refNorm);
  const refPrefix = refNorm.slice(0, 50);

  let bestMatch: ParagraphMatch | null = null;
  let bestScore = 0;

  for (const [index, paragraph] of mdParagraphs.entries()) {
    const mdNorm = normalize(paragraph.text);

    // Count distinct shared words so repeated words cannot push the score above 1.
    let shared = 0;
    for (const word of significantWords(mdNorm)) {
      if (refWords.has(word)) shared++;
    }
    const overlapScore = shared / Math.max(refWords.size, 1);

    // Prefix containment boosts the score (useful for short, header-like text).
    const containsStart = mdNorm.includes(refPrefix);
    const score = containsStart && overlapScore > 0.3 ? Math.max(overlapScore, 0.8) : overlapScore;

    // Only a strictly better candidate replaces the current best, so a
    // prefix-boosted paragraph can no longer displace a stronger earlier match.
    if (score > bestScore) {
      bestScore = score;
      bestMatch = { index, score, paragraph };
    }
  }

  return bestScore > 0.4 ? bestMatch : null;
}
|
|
189
|
+
|
|
190
|
+
/**
 * Split markdown into paragraphs separated by blank lines.
 *
 * A blank line is an empty line or one containing only spaces/tabs; both LF
 * and CRLF line endings are recognised. Chunks that are empty after trimming
 * are skipped.
 *
 * @param markdown - Full markdown source.
 * @returns One entry per paragraph. `text` is the trimmed paragraph, and
 *   `start`/`end` are offsets into `markdown` such that
 *   `markdown.slice(start, end) === text`, so offsets computed inside `text`
 *   can be added to `start` directly.
 */
function parseMdParagraphs(markdown: string): MdParagraph[] {
  const paragraphs: MdParagraph[] = [];
  const separator = /\r?\n(?:[ \t]*\r?\n)+/g;

  // Record the trimmed content of markdown[chunkStart, chunkEnd), if any.
  const pushChunk = (chunkStart: number, chunkEnd: number): void => {
    const chunk = markdown.slice(chunkStart, chunkEnd);
    const text = chunk.trim();
    if (!text) return;
    // Skip leading whitespace so `start` lines up with the first char of `text`.
    const start = chunkStart + (chunk.length - chunk.trimStart().length);
    paragraphs.push({ text, start, end: start + text.length });
  };

  let cursor = 0;
  for (const sep of markdown.matchAll(separator)) {
    const sepStart = sep.index ?? 0;
    pushChunk(cursor, sepStart);
    cursor = sepStart + sep[0].length;
  }
  pushChunk(cursor, markdown.length);

  return paragraphs;
}
|
|
215
|
+
|
|
216
|
+
/**
 * Normalize text for fuzzy matching between reference and markdown text.
 *
 * Removes, in order: `$` citation placeholders (with or without surrounding
 * parentheses), bracketed markdown citations (`[@key]`), bare `@Author2020`
 * keys, rendered citations such as "(Author et al. 2021)", and figure
 * references such as "Fig. 1" / "(Fig. 1)". Whitespace is then collapsed and
 * the result is trimmed and lower-cased.
 *
 * @param text - Raw paragraph text.
 * @returns The normalized, lower-cased text.
 */
function normalizeForMatching(text: string): string {
  return text
    // Remove Word citation placeholders
    .replace(/\(\s*\$+\s*\)/g, '')
    .replace(/\$+/g, '')
    // Remove markdown citations
    .replace(/\[@[^\]]+\]/g, '')
    .replace(/@[A-Z][a-z]+\d{4}/g, '')
    // Remove rendered citations like "(Author et al. 2021)"
    .replace(/\([A-Z][a-z]+(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*[A-Z][a-z]+(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)/g, '')
    // Remove figure references like "Fig. 1" or "(Fig. 1)". The word boundary
    // keeps words that merely end in "fig" (e.g. "config 3") intact.
    .replace(/\(?\bFig\.?\s*\d+[a-z]?\)?/gi, '')
    // Normalize whitespace
    .replace(/\s+/g, ' ')
    .trim()
    .toLowerCase();
}
|
|
236
|
+
|
|
237
|
+
/**
 * Collect the words surrounding a character offset.
 *
 * Looks at up to 30 characters on each side of `pos`, splits each window on
 * whitespace and keeps tokens longer than two characters.
 *
 * @param text - Text to inspect.
 * @param pos - Character offset into `text`.
 * @returns `before`: up to the last three kept tokens preceding `pos`;
 *   `after`: up to the first three kept tokens from `pos` onward.
 */
function getWordAtPosition(text: string, pos: number): WordContext {
  const WINDOW = 30;
  const MAX_WORDS = 3;

  // Tokens of a window that are long enough to serve as anchors.
  const wordsIn = (fragment: string): string[] =>
    fragment.split(/\s+/).filter((token) => token.length > 2);

  const leading = wordsIn(text.slice(Math.max(0, pos - WINDOW), pos));
  const trailing = wordsIn(text.slice(pos, pos + WINDOW));

  return {
    before: leading.slice(-MAX_WORDS),
    after: trailing.slice(0, MAX_WORDS),
  };
}
|
|
253
|
+
|
|
254
|
+
/**
 * Find the offset in a markdown paragraph that corresponds to an offset in
 * the reference paragraph.
 *
 * The up-to-three words immediately before `refPos` are used as an anchor.
 * They are searched for in the normalized markdown text, trying all anchor
 * words first and then progressively fewer (always keeping the words closest
 * to `refPos`). A unique hit is used directly. With several hits, the first
 * one followed within 50 characters by the word after `refPos` wins, else the
 * first hit. With no anchor words or no hit at all, the offset is estimated
 * proportionally from `refPos / refText.length`.
 *
 * Offsets found in the normalized text are mapped back to `mdText`
 * proportionally, so the result is approximate. It is clamped to
 * `[0, mdText.length]` and moved forward to the end of the current
 * non-whitespace token so an insertion never splits a word.
 *
 * @param refText - Reference paragraph text.
 * @param refPos - Offset of the comment within `refText`.
 * @param mdText - Markdown paragraph text.
 * @returns Offset into `mdText`.
 */
function findMdPosition(refText: string, refPos: number, mdText: string): number {
  const escapeRegExp = (s: string): string => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  const context = getWordAtPosition(refText, refPos);
  const normalizedMd = normalizeForMatching(mdText);

  // Clamp into the paragraph, then advance past the rest of a token if the
  // offset landed between two non-whitespace characters.
  const snapToTokenEnd = (pos: number): number => {
    let p = Math.min(Math.max(pos, 0), mdText.length);
    while (p > 0 && p < mdText.length && /\S/.test(mdText.charAt(p - 1)) && /\S/.test(mdText.charAt(p))) {
      p++;
    }
    return p;
  };

  // Map an offset in the normalized text back to the original markdown text.
  const fromNormalizedOffset = (offset: number): number =>
    snapToTokenEnd(Math.round((offset / Math.max(normalizedMd.length, 1)) * mdText.length));

  // Estimate the offset from the relative position within the reference text.
  const proportional = (): number =>
    snapToTokenEnd(Math.round((refPos / Math.max(refText.length, 1)) * mdText.length));

  const anchorWords = context.before;
  if (anchorWords.length === 0) {
    return proportional();
  }

  // Compared with String.includes, so it must stay a plain (unescaped) string.
  const firstAfter = context.after[0]?.toLowerCase();

  // Start with the most specific anchor (all words), fall back to fewer.
  for (let numWords = anchorWords.length; numWords >= 1; numWords--) {
    const pattern = anchorWords
      .slice(-numWords)
      .map((w) => escapeRegExp(w.toLowerCase()))
      .join('\\s+');

    const matchEnds = [...normalizedMd.matchAll(new RegExp(pattern, 'g'))].map(
      (m) => (m.index ?? 0) + m[0].length
    );
    if (matchEnds.length === 0) continue;

    let chosen = matchEnds[0] ?? 0;
    if (matchEnds.length > 1 && firstAfter) {
      // Several candidates: prefer the first one followed by the next reference word.
      const disambiguated = matchEnds.find((end) =>
        normalizedMd.slice(end, end + 50).includes(firstAfter)
      );
      if (disambiguated !== undefined) chosen = disambiguated;
    }
    return fromNormalizedOffset(chosen);
  }

  return proportional();
}
|
|
313
|
+
|
|
314
|
+
/**
 * Extract reply comments that directly follow a parent comment.
 *
 * Scans for `{>>parentAuthor: text<<}` immediately followed (whitespace
 * allowed) by one or more `{>>replyAuthor: text<<}` marks. Author names are
 * matched literally, and comment text may contain any characters up to the
 * closing `<<}`.
 *
 * @param markdown - Markdown source containing comment marks.
 * @param parentAuthor - Author name of the parent comments.
 * @param replyAuthor - Author name of the replies.
 * @returns Map from the first 50 characters of the trimmed parent comment
 *   text to the trimmed reply texts, in order. Parents without replies are
 *   omitted; a later parent with the same key overwrites an earlier one.
 */
function extractReplies(markdown: string, parentAuthor: string, replyAuthor: string): Map<string, string[]> {
  const escapeRegExp = (s: string): string => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  // Escape the names so characters such as "." or "(" are not read as regex syntax.
  const parent = escapeRegExp(parentAuthor);
  const reply = escapeRegExp(replyAuthor);

  // One or more characters that do not begin the closing "<<}" marker.
  const body = '(?:(?!<<\\})[\\s\\S])+';

  const threadPattern = new RegExp(
    `\\{>>${parent}:\\s*(${body})<<\\}((?:\\s*\\{>>${reply}:${body}<<\\})*)`,
    'g'
  );
  const replyPattern = new RegExp(`\\{>>${reply}:\\s*(${body})<<\\}`, 'g');

  const replies = new Map<string, string[]>();

  for (const thread of markdown.matchAll(threadPattern)) {
    const parentText = (thread[1] ?? '').trim();
    const replyBlock = thread[2] ?? '';
    if (!replyBlock) continue;

    const replyTexts = [...replyBlock.matchAll(replyPattern)].map((m) => (m[1] ?? '').trim());
    if (replyTexts.length > 0) {
      replies.set(parentText.slice(0, 50), replyTexts); // Use first 50 chars as key
    }
  }

  return replies;
}
|
|
345
|
+
|
|
346
|
+
/**
 * Realign comments from a reference DOCX onto a markdown file.
 *
 * Steps: read the markdown, remember which `author` comments have
 * `replyAuthor` replies, strip every `{>>...<<}` mark, match each commented
 * reference paragraph to a markdown paragraph, and re-insert the `author`
 * comments (followed by their remembered replies) at the estimated offsets.
 * Progress is reported through `console.log`.
 *
 * Note that stripping removes every comment mark, but only `author` comments
 * found in the DOCX and their remembered replies are re-inserted.
 *
 * @param docxPath - Path of the reference DOCX.
 * @param markdownPath - Path of the markdown file; overwritten unless `dryRun`.
 * @param options - `dryRun` (default `false`) previews up to ten insertions
 *   without writing; `author` / `replyAuthor` select the comment authors.
 * @returns Summary of the run: the insertion count, plus `dryRun: true` for
 *   a dry run or the matched/unmatched paragraph counts otherwise.
 */
export async function realignComments(
  docxPath: string,
  markdownPath: string,
  options: RealignOptions = {}
): Promise<RealignResult> {
  const { dryRun = false, author = 'Guy Colling', replyAuthor = 'Gilles Colling' } = options;

  // Read original markdown to extract replies before stripping
  const originalMarkdown = fs.readFileSync(markdownPath, 'utf-8');

  // Extract reply relationships
  const replies = extractReplies(originalMarkdown, author, replyAuthor);
  console.log(`Found ${replies.size} ${author} comments with ${replyAuthor} replies`);

  // Extract reference paragraphs that carry at least one comment by `author`
  const refParagraphs = await extractParagraphsWithComments(docxPath);
  const refWithComments = refParagraphs.filter((p) => p.comments.some((c) => c.author === author));

  console.log(`Found ${refWithComments.length} paragraphs with ${author} comments in reference`);

  // Strip ALL comments (both authors) from markdown to start fresh
  let markdown = originalMarkdown.replace(/\s*\{>>[\s\S]+?<<\}/g, '');
  console.log(`Stripped all comments from markdown`);

  // Parse markdown paragraphs
  const mdParagraphs = parseMdParagraphs(markdown);

  // Planned insertions, collected in document order
  const insertions: CommentInsertion[] = [];
  let matched = 0;
  let unmatched = 0;

  for (const refPara of refWithComments) {
    const match = findMatchingParagraph(refPara.text, mdParagraphs);

    if (!match) {
      console.log(` No match for: "${refPara.text.slice(0, 60)}..."`);
      unmatched++;
      continue;
    }

    matched++;
    const mdPara = match.paragraph;
    const mdText = mdPara?.text ?? '';

    // Get author's comments in this paragraph
    const authorComments = refPara.comments.filter((c) => c.author === author);

    for (const comment of authorComments) {
      // Find corresponding position in markdown paragraph
      const mdPos = findMdPosition(refPara.text, comment.position, mdText);
      const absolutePos = (mdPara?.start ?? 0) + mdPos;

      // Build comment mark with any replies
      let commentMark = ` {>>${comment.author}: ${comment.text}<<}`;

      // Replies are looked up by the first 50 chars of the parent comment text
      const replyTexts = replies.get(comment.text.trim().slice(0, 50));
      if (replyTexts) {
        for (const replyText of replyTexts) {
          commentMark += ` {>>${replyAuthor}: ${replyText}<<}`;
        }
      }

      insertions.push({
        position: absolutePos,
        text: commentMark,
        commentText: comment.text.slice(0, 30),
        hasReplies: !!replyTexts,
        debug: `"${mdText.slice(Math.max(0, mdPos - 20), mdPos)}|HERE|${mdText.slice(mdPos, mdPos + 20)}"`,
      });
    }
  }

  console.log(`Matched ${matched} paragraphs, ${unmatched} unmatched`);
  console.log(`Inserting ${insertions.length} comments (${insertions.filter((i) => i.hasReplies).length} with replies)`);

  if (dryRun) {
    console.log('\nDry run - would insert:');
    for (const ins of insertions.slice(0, 10)) {
      console.log(` At ${ins.position}: ${ins.debug}`);
      console.log(` Comment: "${ins.commentText}..."${ins.hasReplies ? ' (+ replies)' : ''}`);
    }
    return { success: true, dryRun: true, insertions: insertions.length };
  }

  // Rebuild the text in one ascending pass. Array.prototype.sort is stable,
  // so comments that share a position keep their document order (repeated
  // splicing from the end would reverse them).
  const ordered = [...insertions].sort((a, b) => a.position - b.position);
  const pieces: string[] = [];
  let cursor = 0;
  for (const ins of ordered) {
    const at = Math.min(Math.max(ins.position, cursor), markdown.length);
    pieces.push(markdown.slice(cursor, at), ins.text);
    cursor = at;
  }
  pieces.push(markdown.slice(cursor));
  markdown = pieces.join('');

  // Write result
  fs.writeFileSync(markdownPath, markdown);

  return { success: true, insertions: insertions.length, matched, unmatched };
}
|
|
450
|
+
|
|
451
|
+
/**
 * Realign comments from a reference DOCX onto a markdown string (in-memory,
 * nothing is written to disk by this function).
 *
 * Remembers which `author` comments have `replyAuthor` replies, strips every
 * `{>>...<<}` mark from the markdown, matches each commented reference
 * paragraph to a markdown paragraph, and re-inserts the `author` comments
 * (followed by their remembered replies) at the estimated offsets.
 *
 * @param docxPath - Path of the reference DOCX.
 * @param markdown - Markdown source to realign.
 * @param options - `author` / `replyAuthor` select the comment authors.
 * @returns On success, the realigned markdown and the number of inserted
 *   comments. If anything throws, `success` is `false`, `markdown` is the
 *   unmodified input, and `error` carries the failure message.
 */
export async function realignMarkdown(
  docxPath: string,
  markdown: string,
  options: RealignMarkdownOptions = {}
): Promise<RealignMarkdownResult> {
  const { author = 'Guy Colling', replyAuthor = 'Gilles Colling' } = options;

  try {
    // Extract reply relationships from original markdown
    const replies = extractReplies(markdown, author, replyAuthor);

    // Extract reference paragraphs that carry at least one comment by `author`
    const refParagraphs = await extractParagraphsWithComments(docxPath);
    const refWithComments = refParagraphs.filter((p) => p.comments.some((c) => c.author === author));

    // Strip ALL comments from markdown
    const stripped = markdown.replace(/\s*\{>>[\s\S]+?<<\}/g, '');

    // Parse markdown paragraphs
    const mdParagraphs = parseMdParagraphs(stripped);

    // Planned insertions, collected in document order
    const insertions: Array<{ position: number; text: string }> = [];

    for (const refPara of refWithComments) {
      const match = findMatchingParagraph(refPara.text, mdParagraphs);
      if (!match) continue;

      const mdPara = match.paragraph;
      const authorComments = refPara.comments.filter((c) => c.author === author);

      for (const comment of authorComments) {
        const mdPos = findMdPosition(refPara.text, comment.position, mdPara?.text ?? '');
        const absolutePos = (mdPara?.start ?? 0) + mdPos;

        let commentMark = ` {>>${comment.author}: ${comment.text}<<}`;

        // Replies are looked up by the first 50 chars of the parent comment text
        const replyTexts = replies.get(comment.text.trim().slice(0, 50));
        if (replyTexts) {
          for (const replyText of replyTexts) {
            commentMark += ` {>>${replyAuthor}: ${replyText}<<}`;
          }
        }

        insertions.push({ position: absolutePos, text: commentMark });
      }
    }

    // Rebuild the text in one ascending pass. Array.prototype.sort is stable,
    // so comments that share a position keep their document order (repeated
    // splicing from the end would reverse them).
    const ordered = [...insertions].sort((a, b) => a.position - b.position);
    const pieces: string[] = [];
    let cursor = 0;
    for (const ins of ordered) {
      const at = Math.min(Math.max(ins.position, cursor), stripped.length);
      pieces.push(stripped.slice(cursor, at), ins.text);
      cursor = at;
    }
    pieces.push(stripped.slice(cursor));

    return { success: true, markdown: pieces.join(''), insertions: insertions.length };
  } catch (err: unknown) {
    // Thrown values are not guaranteed to be Error instances.
    const message = err instanceof Error ? err.message : String(err);
    return { success: false, markdown, insertions: 0, error: message };
  }
}
|