docrev 0.9.11 → 0.9.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/import.ts CHANGED
@@ -269,6 +269,43 @@ function pushPastSectionHeading(text: string, pos: number): number {
269
269
  return after;
270
270
  }
271
271
 
272
/**
 * Snap a position to the nearest whitespace character within `maxDistance`
 * chars on either side, so a proportional fallback insertion avoids landing
 * mid-word whenever a boundary is in reach.
 *
 * @param text - Text the position refers to.
 * @param pos - Candidate offset into `text`. Fractional values are truncated
 *   toward zero; `NaN` is treated as 0. Values at or below 0 return 0 and
 *   values at or past the end return `text.length`.
 * @param maxDistance - How far (in chars) to search in each direction.
 *   Defaults to 50.
 * @returns The index of the closest whitespace character; on a tie the
 *   forward candidate wins. If no whitespace lies within `maxDistance`, the
 *   (truncated) position is returned unchanged.
 */
function snapToWordBoundary(text: string, pos: number, maxDistance = 50): number {
  // A NaN offset would make every comparison below false and leak NaN to the
  // caller; pin it to the start of the text instead.
  if (Number.isNaN(pos)) return 0;

  // String indexing only works with integer offsets; a fractional position
  // would never match any character.
  const start = Math.trunc(pos);
  if (start <= 0) return 0;
  if (start >= text.length) return text.length;

  // charAt yields '' for out-of-range indices, which never matches \s.
  const whitespace = /\s/;
  const isSpaceAt = (index: number): boolean => whitespace.test(text.charAt(index));

  if (isSpaceAt(start)) return start;

  // Widen the search one char at a time; forward is probed before backward so
  // equidistant candidates resolve to the later offset.
  for (let distance = 1; distance <= maxDistance; distance++) {
    const ahead = start + distance;
    if (ahead < text.length && isSpaceAt(ahead)) return ahead;
    const behind = start - distance;
    if (behind >= 0 && isSpaceAt(behind)) return behind;
  }

  // No boundary in range: keep the position rather than jumping far away.
  return start;
}
286
+
287
/**
 * Final-resort placement when every text-matching strategy failed. The docx
 * carries a real `<w:commentRangeStart w:id="N">` marker at a known offset
 * inside its body text — that's a structural anchor, even if the anchored
 * span itself is empty and the surrounding context drifted in the target.
 *
 * Map docPosition into the target markdown proportionally and snap to a word
 * boundary. This is approximate when the document was heavily restructured,
 * but it's strictly better than silently dropping a reviewer's comment: the
 * comment lands in roughly the right neighborhood and the reviewer can
 * relocate it during their next pass.
 *
 * @param anchorData - Anchor record; only `docPosition` and `docLength` are
 *   read here.
 * @param target - Markdown text the comment will be inserted into.
 * @returns An insertion offset into `target`, or `null` when the anchor has
 *   no usable offset (non-positive or non-finite `docLength`, or non-finite
 *   `docPosition`) so the caller can fall through to its unmatched handling.
 */
function proportionalFallback(
  anchorData: CommentAnchorData,
  target: string,
): number | null {
  const { docPosition, docLength } = anchorData;

  // NaN/undefined/Infinity would otherwise flow through the arithmetic below
  // and surface as a NaN insertion offset.
  if (!Number.isFinite(docPosition) || !Number.isFinite(docLength)) return null;
  if (docLength <= 0) return null;

  // Clamp to [0, 1] so an offset outside the recorded document length still
  // maps inside the target.
  const proportion = Math.min(Math.max(docPosition / docLength, 0), 1);
  const rawPos = Math.floor(proportion * target.length);

  // NOTE(review): pushPastSectionHeading is defined outside this view; by its
  // name it presumably moves an offset that falls on a section heading to
  // after that heading — confirm against its definition.
  return pushPastSectionHeading(target, snapToWordBoundary(target, rawPos));
}
308
+
272
309
  /**
273
310
  * Insert comments into markdown text based on anchor texts with context
274
311
  */
@@ -285,11 +322,41 @@ export function insertCommentsIntoMarkdown(
285
322
  const duplicateWarnings: string[] = [];
286
323
  const usedPositions = new Set<number>(); // For tie-breaking: track used positions
287
324
 
325
+ // Resolve threading: replies share their parent's anchor in Word, so they
326
+ // must inherit the parent's position and ride alongside it as one cluster.
327
+ // Letting each reply run through anchor scoring scatters the cluster (the
328
+ // same docPosition forces `usedPositions` to push later replies onto a
329
+ // different occurrence), which on re-build looks like independent comments
330
+ // and loses the paraIdParent threading. See gcol33/docrev issue #2.
331
+ const inputById = new Map<string, WordComment>();
332
+ for (const c of comments) inputById.set(c.id, c);
333
+ function rootIdOf(c: WordComment): string {
334
+ let cur: WordComment = c;
335
+ const seen = new Set<string>();
336
+ while (cur.parentId && !seen.has(cur.id)) {
337
+ seen.add(cur.id);
338
+ const parent = inputById.get(cur.parentId);
339
+ if (!parent || parent === cur) break;
340
+ cur = parent;
341
+ }
342
+ return cur.id;
343
+ }
344
+ const replyRootId = new Map<string, string>();
345
+ for (const c of comments) {
346
+ const root = rootIdOf(c);
347
+ if (root !== c.id) replyRootId.set(c.id, root);
348
+ }
349
+
288
350
  // Anchor matching primitives live in lib/anchor-match.ts so that
289
351
  // `rev verify-anchors` can use the same strategies for drift reporting.
290
352
 
291
- // Get all positions in order (for sequential tie-breaking)
353
+ // Get all positions in order (for sequential tie-breaking).
354
+ // Replies skip scoring entirely — they piggyback on their root's position
355
+ // in the emit pass below.
292
356
  const commentsWithPositions = comments.map((c): CommentWithPos => {
357
+ if (replyRootId.has(c.id)) {
358
+ return { ...c, pos: -1, anchorText: null, strategy: 'reply' };
359
+ }
293
360
  const anchorData = anchors.get(c.id);
294
361
  if (!anchorData) {
295
362
  unmatchedCount++;
@@ -396,6 +463,14 @@ export function insertCommentsIntoMarkdown(
396
463
  return { ...c, pos: occurrences[0], anchorText: null, isEmpty: true };
397
464
  }
398
465
  }
466
+ // Last resort: docx carried a structural marker at docPosition; map
467
+ // it proportionally into the target so the comment isn't dropped.
468
+ if (typeof anchorData === 'object') {
469
+ const fallback = proportionalFallback(anchorData, result);
470
+ if (fallback !== null) {
471
+ return { ...c, pos: fallback, anchorText: null, isEmpty: true, strategy: 'proportional-fallback' };
472
+ }
473
+ }
399
474
  unmatchedCount++;
400
475
  return { ...c, pos: -1, anchorText: null, isEmpty: true };
401
476
  }
@@ -404,6 +479,14 @@ export function insertCommentsIntoMarkdown(
404
479
  const { occurrences, matchedAnchor, strategy, stripped } = findAnchorInText(anchor, result, before, after);
405
480
 
406
481
  if (occurrences.length === 0) {
482
+ // Same last-resort as the empty-anchor path: anchor text is gone from
483
+ // the target, but the marker's text-offset survived extraction.
484
+ if (typeof anchorData === 'object') {
485
+ const fallback = proportionalFallback(anchorData, result);
486
+ if (fallback !== null) {
487
+ return { ...c, pos: fallback, anchorText: null, strategy: 'proportional-fallback' };
488
+ }
489
+ }
407
490
  unmatchedCount++;
408
491
  return { ...c, pos: -1, anchorText: null };
409
492
  }
@@ -434,53 +517,89 @@ export function insertCommentsIntoMarkdown(
434
517
  }
435
518
  });
436
519
 
437
- // Log any unmatched comments for debugging
438
- const unmatched = commentsWithPositions.filter((c) => c.pos < 0);
520
+ // Group comments into clusters (root + ordered replies). The root carries
521
+ // the resolved position; replies inherit it and ride along in input order
522
+ // so the rebuilt CriticMarkup looks like `{>>p<<}{>>r1<<}{>>r2<<}[anchor]`
523
+ // and adjacency-based reply detection picks the cluster up again.
524
+ const byId = new Map<string, CommentWithPos>();
525
+ for (const cwp of commentsWithPositions) byId.set(cwp.id, cwp);
526
+ const repliesByRoot = new Map<string, CommentWithPos[]>();
527
+ for (const c of comments) {
528
+ const rootId = replyRootId.get(c.id);
529
+ if (!rootId) continue;
530
+ const cwp = byId.get(c.id);
531
+ if (!cwp) continue;
532
+ const list = repliesByRoot.get(rootId);
533
+ if (list) list.push(cwp);
534
+ else repliesByRoot.set(rootId, [cwp]);
535
+ }
536
+
537
+ // Replies whose root never resolved (parent missing from the input slice or
538
+ // parent unmatched) count as unmatched too — there's no position to attach
539
+ // them to.
540
+ for (const [rootId, replies] of repliesByRoot) {
541
+ const root = byId.get(rootId);
542
+ if (!root || root.pos < 0) {
543
+ unmatchedCount += replies.length;
544
+ }
545
+ }
546
+
547
+ // Roots only — replies attach during emission.
548
+ const rootsWithPos = commentsWithPositions.filter(
549
+ c => !replyRootId.has(c.id)
550
+ );
551
+
552
+ // Log any unmatched roots for debugging
553
+ const unmatched = rootsWithPos.filter((c) => c.pos < 0);
439
554
  if (process.env.DEBUG) {
440
- console.log(`[DEBUG] insertComments: ${comments.length} input, ${commentsWithPositions.length} processed, ${unmatched.length} unmatched`);
555
+ console.log(`[DEBUG] insertComments: ${comments.length} input, ${rootsWithPos.length} roots, ${unmatched.length} unmatched roots, ${replyRootId.size} replies`);
441
556
  if (unmatched.length > 0) {
442
557
  unmatched.forEach(c => console.log(`[DEBUG] Unmatched ID=${c.id}: anchor="${(c.anchorText || 'none').slice(0,30)}"`));
443
558
  }
444
559
  }
445
560
 
446
- const matched = commentsWithPositions.filter((c) => c.pos >= 0);
561
+ const matchedRoots = rootsWithPos.filter((c) => c.pos >= 0);
447
562
 
448
563
  // Sort by position descending (insert from end to avoid offset issues)
449
- matched.sort((a, b) => b.pos - a.pos);
564
+ matchedRoots.sort((a, b) => b.pos - a.pos);
450
565
 
451
- // Insert each comment. With `wrapAnchor` (the default), the anchor text
566
+ // Insert each cluster. With `wrapAnchor` (the default), the anchor text
452
567
  // gets wrapped in `[anchor]{.mark}` so the rebuilt docx restores the
453
568
  // original Word comment range. Without it, the comment block is inserted
454
569
  // adjacent to the anchor and prose stays untouched — required for
455
570
  // comments-only sync where multiple comments may share one anchor.
456
- // Skip insertion when an identical comment already lives near the target.
457
- // Re-running sync against the same docx would otherwise stack duplicate
458
- // CriticMarkup blocks (`{>>R1: ...<<}{>>R1: ...<<}...`) on each invocation.
459
- // A 200-char window catches both wrapped (`{>>...<<}[anchor]{.mark}`) and
460
- // bare (`{>>...<<}anchor`) forms while ignoring incidental matches farther
461
- // away.
571
+ // Skip insertion when the parent's CriticMarkup already lives near the
572
+ // target — re-running sync against the same docx would otherwise stack
573
+ // duplicates. A 200-char window catches both wrapped
574
+ // (`{>>...<<}[anchor]{.mark}`) and bare (`{>>...<<}anchor`) forms while
575
+ // ignoring incidental matches farther away.
462
576
  let dedupedCount = 0;
463
- for (const c of matched) {
464
- const comment = `{>>${c.author}: ${c.text}<<}`;
577
+ for (const c of matchedRoots) {
578
+ const parentBlock = `{>>${c.author}: ${c.text}<<}`;
579
+ const replies = repliesByRoot.get(c.id) ?? [];
465
580
  const windowStart = Math.max(0, c.pos - 200);
466
581
  const windowEnd = Math.min(result.length, c.pos + 200);
467
- if (result.slice(windowStart, windowEnd).includes(comment)) {
468
- dedupedCount++;
582
+ if (result.slice(windowStart, windowEnd).includes(parentBlock)) {
583
+ // Cluster already synced; treat all members as deduped.
584
+ dedupedCount += 1 + replies.length;
469
585
  continue;
470
586
  }
587
+ // Replies carry an explicit `↪ ` author prefix so the round-trip does not
588
+ // depend on positional adjacency in the markdown. On dense reviewer docs
589
+ // distinct clusters frequently land at the same anchor position; without
590
+ // the prefix the re-parse would misthread them. The injection side strips
591
+ // `↪ ` back off the author so Word renders the original name.
592
+ const replyBlocks = replies.map(r => `{>>↪ ${r.author}: ${r.text}<<}`);
593
+ const combined = parentBlock + replyBlocks.join('');
471
594
  if (wrapAnchor && c.anchorText && c.anchorEnd) {
472
595
  const before = result.slice(0, c.pos);
473
596
  const anchor = result.slice(c.pos, c.anchorEnd);
474
597
  const after = result.slice(c.anchorEnd);
475
- result = before + comment + `[${anchor}]{.mark}` + after;
598
+ result = before + combined + `[${anchor}]{.mark}` + after;
476
599
  } else {
477
- // Insert comment at the anchor position with no surrounding whitespace
478
- // tweaks; CriticMarkup blocks are invisible to readers, and adding a
479
- // leading space would shift prose byte-for-byte (relevant when callers
480
- // verify that --comments-only didn't touch the original).
481
- result = result.slice(0, c.pos) + comment + result.slice(c.pos);
600
+ result = result.slice(0, c.pos) + combined + result.slice(c.pos);
482
601
  }
483
- placedCount++;
602
+ placedCount += 1 + replies.length;
484
603
  }
485
604
 
486
605
  if (outStats) {
package/lib/types.ts CHANGED
@@ -69,6 +69,22 @@ export interface PdfConfig {
69
69
  geometry?: string;
70
70
  linestretch?: number;
71
71
  toc?: boolean;
72
+ /**
73
+ * LaTeX engine to use for PDF output. One of `pdflatex` (default),
74
+ * `xelatex`, `lualatex`, `tectonic`, etc. xelatex/lualatex are required
75
+ * for native UTF-8 rendering of diacritics in author names, place
76
+ * names, and species epithets.
77
+ */
78
+ engine?: string;
79
+ /** Roman/serif main font (xelatex/lualatex only — uses fontspec). */
80
+ mainfont?: string;
81
+ /** Sans-serif font (xelatex/lualatex only). */
82
+ sansfont?: string;
83
+ /** Monospace font (xelatex/lualatex only). */
84
+ monofont?: string;
85
+ numbersections?: boolean;
86
+ template?: string;
87
+ headerIncludes?: string;
72
88
  }
73
89
 
74
90
  export interface DocxConfig {
@@ -338,6 +354,10 @@ export interface JournalFormatting {
338
354
  linestretch?: number;
339
355
  template?: string;
340
356
  numbersections?: boolean;
357
+ engine?: string;
358
+ mainfont?: string;
359
+ sansfont?: string;
360
+ monofont?: string;
341
361
  };
342
362
  docx?: {
343
363
  reference?: string;
@@ -18,6 +18,12 @@ export interface WordComment {
18
18
  author: string;
19
19
  date: string;
20
20
  text: string;
21
+ /**
22
+ * Parent comment id when this is a reply in a Word comment thread.
23
+ * Resolved from `commentsExtended.xml`'s `w15:paraIdParent` field.
24
+ * `undefined` for top-level comments.
25
+ */
26
+ parentId?: string;
21
27
  }
22
28
 
23
29
  export interface TextNode {
@@ -126,7 +132,6 @@ export async function extractWordComments(docxPath: string): Promise<WordComment
126
132
 
127
133
  const parsed = await parseStringPromise(commentsXml, { explicitArray: false });
128
134
 
129
- const ns = 'w:';
130
135
  const commentsRoot = parsed['w:comments'];
131
136
  if (!commentsRoot || !commentsRoot['w:comment']) {
132
137
  return comments;
@@ -137,12 +142,18 @@ export async function extractWordComments(docxPath: string): Promise<WordComment
137
142
  ? commentsRoot['w:comment']
138
143
  : [commentsRoot['w:comment']];
139
144
 
145
+ // Map every paraId that lives inside a comment back to that comment's id.
146
+ // Word's commentsExtended.xml expresses threading via w15:paraIdParent,
147
+ // which references the parent's first <w:p>. Replies use a secondary
148
+ // (often-empty) <w:p>, so each comment may contribute multiple paraIds.
149
+ const paraIdToCommentId = new Map<string, string>();
150
+
140
151
  for (const comment of commentNodes) {
141
152
  const id = comment.$?.['w:id'] || '';
142
153
  const author = comment.$?.['w:author'] || 'Unknown';
143
154
  const date = comment.$?.['w:date'] || '';
144
155
 
145
- // Extract text from nested w:p/w:r/w:t elements
156
+ // Extract text from nested w:p/w:r/w:t elements and record paraIds.
146
157
  let text = '';
147
158
  const extractText = (node: any): void => {
148
159
  if (!node) return;
@@ -160,13 +171,49 @@ export async function extractWordComments(docxPath: string): Promise<WordComment
160
171
  }
161
172
  if (node['w:p']) {
162
173
  const paras = Array.isArray(node['w:p']) ? node['w:p'] : [node['w:p']];
163
- paras.forEach(extractText);
174
+ for (const para of paras) {
175
+ const paraId = para?.$?.['w14:paraId'];
176
+ if (paraId && id) paraIdToCommentId.set(paraId, id);
177
+ extractText(para);
178
+ }
164
179
  }
165
180
  };
166
181
  extractText(comment);
167
182
 
168
183
  comments.push({ id, author, date: date.slice(0, 10), text: text.trim() });
169
184
  }
185
+
186
+ // Resolve parent links from commentsExtended.xml. Missing entry just
187
+ // means the docx has no threading metadata (e.g. legacy/non-Word source).
188
+ const extendedEntry = zip.getEntry('word/commentsExtended.xml');
189
+ if (extendedEntry && paraIdToCommentId.size > 0) {
190
+ let extendedXml = '';
191
+ try {
192
+ extendedXml = extendedEntry.getData().toString('utf8');
193
+ } catch {
194
+ // Unreadable threading metadata is non-fatal; skip parent linking.
195
+ }
196
+ if (extendedXml) {
197
+ const parentByCommentId = new Map<string, string>();
198
+ const exPattern = /<w15:commentEx\b([^>]*?)\/>/g;
199
+ let m: RegExpExecArray | null;
200
+ while ((m = exPattern.exec(extendedXml)) !== null) {
201
+ const attrs = m[1] ?? '';
202
+ const paraIdMatch = attrs.match(/w15:paraId="([^"]+)"/);
203
+ const parentMatch = attrs.match(/w15:paraIdParent="([^"]+)"/);
204
+ if (!paraIdMatch || !parentMatch) continue;
205
+ const childCommentId = paraIdToCommentId.get(paraIdMatch[1]);
206
+ const parentCommentId = paraIdToCommentId.get(parentMatch[1]);
207
+ if (childCommentId && parentCommentId && childCommentId !== parentCommentId) {
208
+ parentByCommentId.set(childCommentId, parentCommentId);
209
+ }
210
+ }
211
+ for (const c of comments) {
212
+ const parent = parentByCommentId.get(c.id);
213
+ if (parent) c.parentId = parent;
214
+ }
215
+ }
216
+ }
170
217
  } catch (err: any) {
171
218
  // Re-throw with more context if it's already an Error we created
172
219
  if (err.message.includes('Invalid Word document') || err.message.includes('File not found')) {
@@ -102,7 +102,8 @@ export function prepareMarkdownWithMarkers(markdown: string): PrepareResult {
102
102
  return { anchor: text.slice(i + 1, j), endIdx: j + 8 };
103
103
  }
104
104
 
105
- const rawMatches: ParsedComment[] = [];
105
+ const REPLY_PREFIX = '↪ ';
106
+ const rawMatches: (ParsedComment & { explicitReply: boolean })[] = [];
106
107
  let match: RegExpExecArray | null;
107
108
  while ((match = commentPattern.exec(markdown)) !== null) {
108
109
  const content = match[1] ?? '';
@@ -114,6 +115,15 @@ export function prepareMarkdownWithMarkers(markdown: string): PrepareResult {
114
115
  text = content.slice(colonIdx + 1).trim();
115
116
  }
116
117
 
118
+ // The `↪ ` prefix is the authoritative reply signal emitted by
119
+ // `insertCommentsIntoMarkdown`. Strip it from the author before injection
120
+ // so Word displays the real name.
121
+ let explicitReply = false;
122
+ if (author.startsWith(REPLY_PREFIX)) {
123
+ explicitReply = true;
124
+ author = author.slice(REPLY_PREFIX.length).trim();
125
+ }
126
+
117
127
  const commentEnd = match.index + match[0].length;
118
128
  const trailing = tryParseTrailingAnchor(markdown, commentEnd);
119
129
 
@@ -124,6 +134,7 @@ export function prepareMarkdownWithMarkers(markdown: string): PrepareResult {
124
134
  start: match.index,
125
135
  end: trailing ? trailing.endIdx : commentEnd,
126
136
  fullMatch: markdown.slice(match.index, trailing ? trailing.endIdx : commentEnd),
137
+ explicitReply,
127
138
  });
128
139
 
129
140
  // Advance regex lastIndex past the consumed anchor so the next iteration
@@ -139,10 +150,17 @@ export function prepareMarkdownWithMarkers(markdown: string): PrepareResult {
139
150
  return { markedMarkdown: markdown, comments: [] };
140
151
  }
141
152
 
142
- // Detect reply relationships based on adjacency
143
- // First comment in a cluster = parent, all subsequent = replies to that parent
144
- // Comments are "adjacent" if there's minimal text between them (< 10 chars)
153
+ // Two-mode reply detection driven by the markdown itself:
154
+ // - If any comment carries the `↪ ` author prefix, the markdown came
155
+ // through `insertCommentsIntoMarkdown` and we use prefix-only mode.
156
+ // Distinct clusters that happen to land at gap=0 (a real failure
157
+ // mode on dense reviewer docs — 298-comment paper produced 9 such
158
+ // collisions) are not misthreaded.
159
+ // - If no comment carries the prefix, the markdown was hand-typed.
160
+ // Fall back to gap < 10 adjacency for backward compat with users
161
+ // who write CriticMarkup directly.
145
162
  const ADJACENT_THRESHOLD = 10;
163
+ const useExplicitMode = rawMatches.some(m => m.explicitReply);
146
164
  const comments: PreparedComment[] = [];
147
165
  let clusterParentIdx = -1; // Index of first comment in current cluster
148
166
  let lastCommentEnd = -1;
@@ -151,9 +169,10 @@ export function prepareMarkdownWithMarkers(markdown: string): PrepareResult {
151
169
  const m = rawMatches[i];
152
170
  if (!m) continue;
153
171
 
154
- // Check if this comment is adjacent to the previous one
155
172
  const gap = lastCommentEnd >= 0 ? m.start - lastCommentEnd : Infinity;
156
- const isAdjacent = gap < ADJACENT_THRESHOLD;
173
+ const isAdjacent = useExplicitMode
174
+ ? m.explicitReply
175
+ : gap < ADJACENT_THRESHOLD;
157
176
 
158
177
  // Reset cluster if there's a gap (comments not in same cluster)
159
178
  if (!isAdjacent) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "docrev",
3
- "version": "0.9.11",
3
+ "version": "0.9.13",
4
4
  "description": "Academic paper revision workflow: Word ↔ Markdown round-trips, DOI validation, reviewer comments",
5
5
  "type": "module",
6
6
  "types": "dist/lib/types.d.ts",