npm - docrev - Versions diffs - 0.9.11 → 0.9.13 - Mend

docrev 0.9.11 → 0.9.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

package/dist/lib/build.d.ts +12 -0
package/dist/lib/build.d.ts.map +1 -1
package/dist/lib/build.js +12 -0
package/dist/lib/build.js.map +1 -1
package/dist/lib/import.d.ts.map +1 -1
package/dist/lib/import.js +146 -24
package/dist/lib/import.js.map +1 -1
package/dist/lib/types.d.ts +20 -0
package/dist/lib/types.d.ts.map +1 -1
package/dist/lib/word-extraction.d.ts +6 -0
package/dist/lib/word-extraction.d.ts.map +1 -1
package/dist/lib/word-extraction.js +46 -3
package/dist/lib/word-extraction.js.map +1 -1
package/dist/lib/wordcomments.d.ts.map +1 -1
package/dist/lib/wordcomments.js +23 -5
package/dist/lib/wordcomments.js.map +1 -1
package/lib/build.ts +24 -0
package/lib/import.ts +143 -24
package/lib/types.ts +20 -0
package/lib/word-extraction.ts +50 -3
package/lib/wordcomments.ts +25 -6
package/package.json +1 -1

package/lib/import.ts CHANGED Viewed

@@ -269,6 +269,43 @@ function pushPastSectionHeading(text: string, pos: number): number {
   return after;
 }
+/**
+ * Snap a position to the nearest whitespace character within ±50 chars so a
+ * proportional fallback insertion avoids landing mid-word.
+ *
+ * Positions at or before 0 clamp to 0, and positions at or past the end clamp
+ * to `text.length`. Otherwise the search widens one character at a time,
+ * testing the forward offset before the backward one, so an equidistant tie
+ * resolves to the later index. The returned index is that of the whitespace
+ * character itself.
+ *
+ * @param text - Text the position refers to.
+ * @param pos - Candidate index into `text`.
+ * @returns The snapped index, or `pos` unchanged when no whitespace lies
+ *   within 50 characters on either side.
+ */
+function snapToWordBoundary(text: string, pos: number): number {
+  if (pos <= 0) return 0;
+  if (pos >= text.length) return text.length;
+  if (/\s/.test(text[pos] ?? '')) return pos; // already sitting on whitespace
+  for (let d = 1; d <= 50; d++) {
+    if (pos + d < text.length && /\s/.test(text[pos + d] ?? '')) return pos + d; // forward is checked first
+    if (pos - d >= 0 && /\s/.test(text[pos - d] ?? '')) return pos - d;
+  }
+  return pos; // no whitespace in range: leave the position as given
+}
+/**
+ * Final-resort placement when every text-matching strategy failed. The docx
+ * carries a real `<w:commentRangeStart w:id="N">` marker at a known offset
+ * inside its body text — that's a structural anchor, even if the anchored
+ * span itself is empty and the surrounding context drifted in the target.
+ *
+ * Map docPosition into the target markdown proportionally and snap to a word
+ * boundary. This is approximate when the document was heavily restructured,
+ * but it's strictly better than silently dropping a reviewer's comment: the
+ * comment lands in roughly the right neighborhood and the reviewer can
+ * relocate it during their next pass.
+ *
+ * @param anchorData - Anchor record; only `docPosition` and `docLength` are
+ *   read here.
+ * @param target - Markdown text the comment will be inserted into.
+ * @returns An insertion index for `target`, or `null` when
+ *   `anchorData.docLength` is zero or negative and no proportion can be
+ *   computed.
+ */
+function proportionalFallback(
+  anchorData: CommentAnchorData,
+  target: string,
+): number | null {
+  if (anchorData.docLength <= 0) return null;
+  // Capped at 1.0 so a docPosition beyond docLength maps to the end of target.
+  const proportion = Math.min(anchorData.docPosition / anchorData.docLength, 1.0);
+  const rawPos = Math.floor(proportion * target.length);
+  // NOTE(review): pushPastSectionHeading is defined outside this hunk; presumably it moves the index off a heading line — confirm.
+  return pushPastSectionHeading(target, snapToWordBoundary(target, rawPos));
+}
 /**
  * Insert comments into markdown text based on anchor texts with context
  */
@@ -285,11 +322,41 @@ export function insertCommentsIntoMarkdown(
   const duplicateWarnings: string[] = [];
   const usedPositions = new Set<number>(); // For tie-breaking: track used positions
+  // Resolve threading: replies share their parent's anchor in Word, so they
+  // must inherit the parent's position and ride alongside it as one cluster.
+  // Letting each reply run through anchor scoring scatters the cluster (the
+  // same docPosition forces `usedPositions` to push later replies onto a
+  // different occurrence), which on re-build looks like independent comments
+  // and loses the paraIdParent threading. See gcol33/docrev issue #2.
+  const inputById = new Map<string, WordComment>();
+  for (const c of comments) inputById.set(c.id, c);
+  function rootIdOf(c: WordComment): string { // follows parentId links from c up to the top of its thread
+    let cur: WordComment = c;
+    const seen = new Set<string>(); // ids already walked; ends the walk if parent links form a cycle
+    while (cur.parentId && !seen.has(cur.id)) {
+      seen.add(cur.id);
+      const parent = inputById.get(cur.parentId);
+      if (!parent || parent === cur) break; // parent absent from `comments`, or self-parented: cur is the effective root
+      cur = parent;
+    }
+    return cur.id; // equals c.id when c has no resolvable parent
+  }
+  const replyRootId = new Map<string, string>();
+  for (const c of comments) {
+    const root = rootIdOf(c);
+    if (root !== c.id) replyRootId.set(c.id, root);
+  }
   // Anchor matching primitives live in lib/anchor-match.ts so that
   // `rev verify-anchors` can use the same strategies for drift reporting.
-  // Get all positions in order (for sequential tie-breaking)
+  // Get all positions in order (for sequential tie-breaking).
+  // Replies skip scoring entirely — they piggyback on their root's position
+  // in the emit pass below.
   const commentsWithPositions = comments.map((c): CommentWithPos => {
+    if (replyRootId.has(c.id)) {
+      return { ...c, pos: -1, anchorText: null, strategy: 'reply' };
+    }
     const anchorData = anchors.get(c.id);
     if (!anchorData) {
       unmatchedCount++;
@@ -396,6 +463,14 @@ export function insertCommentsIntoMarkdown(
           return { ...c, pos: occurrences[0], anchorText: null, isEmpty: true };
         }
       }
+      // Last resort: docx carried a structural marker at docPosition; map
+      // it proportionally into the target so the comment isn't dropped.
+      if (typeof anchorData === 'object') {
+        const fallback = proportionalFallback(anchorData, result);
+        if (fallback !== null) {
+          return { ...c, pos: fallback, anchorText: null, isEmpty: true, strategy: 'proportional-fallback' };
+        }
+      }
       unmatchedCount++;
       return { ...c, pos: -1, anchorText: null, isEmpty: true };
     }
@@ -404,6 +479,14 @@ export function insertCommentsIntoMarkdown(
     const { occurrences, matchedAnchor, strategy, stripped } = findAnchorInText(anchor, result, before, after);
     if (occurrences.length === 0) {
+      // Same last-resort as the empty-anchor path: anchor text is gone from
+      // the target, but the marker's text-offset survived extraction.
+      if (typeof anchorData === 'object') {
+        const fallback = proportionalFallback(anchorData, result);
+        if (fallback !== null) {
+          return { ...c, pos: fallback, anchorText: null, strategy: 'proportional-fallback' };
+        }
+      }
       unmatchedCount++;
       return { ...c, pos: -1, anchorText: null };
     }
@@ -434,53 +517,89 @@ export function insertCommentsIntoMarkdown(
     }
   });
-  // Log any unmatched comments for debugging
-  const unmatched = commentsWithPositions.filter((c) => c.pos < 0);
+  // Group comments into clusters (root + ordered replies). The root carries
+  // the resolved position; replies inherit it and ride along in input order
+  // so the rebuilt CriticMarkup looks like `{>>p<<}{>>↪ r1<<}{>>↪ r2<<}[anchor]`
+  // and the `↪ ` author prefix lets the re-parse pick the cluster up again.
+  const byId = new Map<string, CommentWithPos>();
+  for (const cwp of commentsWithPositions) byId.set(cwp.id, cwp);
+  const repliesByRoot = new Map<string, CommentWithPos[]>();
+  for (const c of comments) {
+    const rootId = replyRootId.get(c.id);
+    if (!rootId) continue;
+    const cwp = byId.get(c.id);
+    if (!cwp) continue;
+    const list = repliesByRoot.get(rootId);
+    if (list) list.push(cwp);
+    else repliesByRoot.set(rootId, [cwp]);
+  }
+  // Replies whose root did not resolve to a position count as unmatched too —
+  // there's no position to attach them to. A comment whose parent is absent
+  // from the input never reaches here: `rootIdOf` makes it its own root.
+  for (const [rootId, replies] of repliesByRoot) {
+    const root = byId.get(rootId);
+    if (!root || root.pos < 0) {
+      unmatchedCount += replies.length;
+    }
+  }
+  // Roots only — replies attach during emission.
+  const rootsWithPos = commentsWithPositions.filter(
+    c => !replyRootId.has(c.id)
+  );
+  // Log any unmatched roots for debugging
+  const unmatched = rootsWithPos.filter((c) => c.pos < 0);
   if (process.env.DEBUG) {
-    console.log(`[DEBUG] insertComments: ${comments.length} input, ${commentsWithPositions.length} processed, ${unmatched.length} unmatched`);
+    console.log(`[DEBUG] insertComments: ${comments.length} input, ${rootsWithPos.length} roots, ${unmatched.length} unmatched roots, ${replyRootId.size} replies`);
     if (unmatched.length > 0) {
       unmatched.forEach(c => console.log(`[DEBUG]   Unmatched ID=${c.id}: anchor="${(c.anchorText || 'none').slice(0,30)}"`));
     }
   }
-  const matched = commentsWithPositions.filter((c) => c.pos >= 0);
+  const matchedRoots = rootsWithPos.filter((c) => c.pos >= 0);
   // Sort by position descending (insert from end to avoid offset issues)
-  matched.sort((a, b) => b.pos - a.pos);
+  matchedRoots.sort((a, b) => b.pos - a.pos);
-  // Insert each comment. With `wrapAnchor` (the default), the anchor text
+  // Insert each cluster. With `wrapAnchor` (the default), the anchor text
   // gets wrapped in `[anchor]{.mark}` so the rebuilt docx restores the
   // original Word comment range. Without it, the comment block is inserted
   // adjacent to the anchor and prose stays untouched — required for
   // comments-only sync where multiple comments may share one anchor.
-  // Skip insertion when an identical comment already lives near the target.
-  // Re-running sync against the same docx would otherwise stack duplicate
-  // CriticMarkup blocks (`{>>R1: ...<<}{>>R1: ...<<}...`) on each invocation.
-  // A 200-char window catches both wrapped (`{>>...<<}[anchor]{.mark}`) and
-  // bare (`{>>...<<}anchor`) forms while ignoring incidental matches farther
-  // away.
+  // Skip insertion when the parent's CriticMarkup already lives near the
+  // target — re-running sync against the same docx would otherwise stack
+  // duplicates. A 200-char window catches both wrapped
+  // (`{>>...<<}[anchor]{.mark}`) and bare (`{>>...<<}anchor`) forms while
+  // ignoring incidental matches farther away.
   let dedupedCount = 0;
-  for (const c of matched) {
-    const comment = `{>>${c.author}: ${c.text}<<}`;
+  for (const c of matchedRoots) {
+    const parentBlock = `{>>${c.author}: ${c.text}<<}`;
+    const replies = repliesByRoot.get(c.id) ?? [];
     const windowStart = Math.max(0, c.pos - 200);
     const windowEnd = Math.min(result.length, c.pos + 200);
-    if (result.slice(windowStart, windowEnd).includes(comment)) {
-      dedupedCount++;
+    if (result.slice(windowStart, windowEnd).includes(parentBlock)) {
+      // Cluster already synced; treat all members as deduped.
+      dedupedCount += 1 + replies.length;
       continue;
     }
+    // Replies carry an explicit `↪ ` author prefix so the round-trip does not
+    // depend on positional adjacency in the markdown. On dense reviewer docs
+    // distinct clusters frequently land at the same anchor position; without
+    // the prefix the re-parse would misthread them. The injection side strips
+    // `↪ ` back off the author so Word renders the original name.
+    const replyBlocks = replies.map(r => `{>>↪ ${r.author}: ${r.text}<<}`);
+    const combined = parentBlock + replyBlocks.join('');
     if (wrapAnchor && c.anchorText && c.anchorEnd) {
       const before = result.slice(0, c.pos);
       const anchor = result.slice(c.pos, c.anchorEnd);
       const after = result.slice(c.anchorEnd);
-      result = before + comment + `[${anchor}]{.mark}` + after;
+      result = before + combined + `[${anchor}]{.mark}` + after;
     } else {
-      // Insert comment at the anchor position with no surrounding whitespace
-      // tweaks; CriticMarkup blocks are invisible to readers, and adding a
-      // leading space would shift prose byte-for-byte (relevant when callers
-      // verify that --comments-only didn't touch the original).
-      result = result.slice(0, c.pos) + comment + result.slice(c.pos);
+      result = result.slice(0, c.pos) + combined + result.slice(c.pos);
     }
-    placedCount++;
+    placedCount += 1 + replies.length;
   }
   if (outStats) {

package/lib/types.ts CHANGED Viewed

@@ -69,6 +69,22 @@ export interface PdfConfig {
   geometry?: string;
   linestretch?: number;
   toc?: boolean;
+  /**
+   * LaTeX engine to use for PDF output. One of `pdflatex` (default),
+   * `xelatex`, `lualatex`, `tectonic`, etc. xelatex/lualatex are required
+   * for native UTF-8 rendering of diacritics in author names, place
+   * names, and species epithets.
+   */
+  engine?: string;
+  /** Roman/serif main font (xelatex/lualatex only — uses fontspec). */
+  mainfont?: string;
+  /** Sans-serif font (xelatex/lualatex only). */
+  sansfont?: string;
+  /** Monospace font (xelatex/lualatex only). */
+  monofont?: string;
+  numbersections?: boolean;
+  template?: string;
+  headerIncludes?: string;
 }
 export interface DocxConfig {
@@ -338,6 +354,10 @@ export interface JournalFormatting {
     linestretch?: number;
     template?: string;
     numbersections?: boolean;
+    engine?: string;
+    mainfont?: string;
+    sansfont?: string;
+    monofont?: string;
   };
   docx?: {
     reference?: string;

package/lib/word-extraction.ts CHANGED Viewed

@@ -18,6 +18,12 @@ export interface WordComment {
   author: string;
   date: string;
   text: string;
+  /**
+   * Parent comment id when this is a reply in a Word comment thread.
+   * Resolved from `commentsExtended.xml`'s `w15:paraIdParent` field.
+   * `undefined` for top-level comments.
+   */
+  parentId?: string;
 }
 export interface TextNode {
@@ -126,7 +132,6 @@ export async function extractWordComments(docxPath: string): Promise<WordComment
     const parsed = await parseStringPromise(commentsXml, { explicitArray: false });
-    const ns = 'w:';
     const commentsRoot = parsed['w:comments'];
     if (!commentsRoot || !commentsRoot['w:comment']) {
       return comments;
@@ -137,12 +142,18 @@ export async function extractWordComments(docxPath: string): Promise<WordComment
       ? commentsRoot['w:comment']
       : [commentsRoot['w:comment']];
+    // Map every paraId that lives inside a comment back to that comment's id.
+    // Word's commentsExtended.xml expresses threading via w15:paraIdParent,
+    // which references the parent's first <w:p>. Replies use a secondary
+    // (often-empty) <w:p>, so each comment may contribute multiple paraIds.
+    const paraIdToCommentId = new Map<string, string>();
     for (const comment of commentNodes) {
       const id = comment.$?.['w:id'] || '';
       const author = comment.$?.['w:author'] || 'Unknown';
       const date = comment.$?.['w:date'] || '';
-      // Extract text from nested w:p/w:r/w:t elements
+      // Extract text from nested w:p/w:r/w:t elements and record paraIds.
       let text = '';
       const extractText = (node: any): void => {
         if (!node) return;
@@ -160,13 +171,49 @@ export async function extractWordComments(docxPath: string): Promise<WordComment
         }
         if (node['w:p']) {
           const paras = Array.isArray(node['w:p']) ? node['w:p'] : [node['w:p']];
-          paras.forEach(extractText);
+          for (const para of paras) {
+            const paraId = para?.$?.['w14:paraId'];
+            if (paraId && id) paraIdToCommentId.set(paraId, id);
+            extractText(para);
+          }
         }
       };
       extractText(comment);
       comments.push({ id, author, date: date.slice(0, 10), text: text.trim() });
     }
+    // Resolve parent links from commentsExtended.xml. Missing entry just
+    // means the docx has no threading metadata (e.g. legacy/non-Word source).
+    const extendedEntry = zip.getEntry('word/commentsExtended.xml');
+    if (extendedEntry && paraIdToCommentId.size > 0) {
+      let extendedXml = '';
+      try {
+        extendedXml = extendedEntry.getData().toString('utf8');
+      } catch {
+        // Unreadable threading metadata is non-fatal; skip parent linking.
+      }
+      if (extendedXml) {
+        const parentByCommentId = new Map<string, string>();
+        const exPattern = /<w15:commentEx\b([^>]*?)\/>/g;
+        let m: RegExpExecArray | null;
+        while ((m = exPattern.exec(extendedXml)) !== null) {
+          const attrs = m[1] ?? '';
+          const paraIdMatch = attrs.match(/w15:paraId="([^"]+)"/);
+          const parentMatch = attrs.match(/w15:paraIdParent="([^"]+)"/);
+          if (!paraIdMatch || !parentMatch) continue;
+          const childCommentId = paraIdToCommentId.get(paraIdMatch[1]);
+          const parentCommentId = paraIdToCommentId.get(parentMatch[1]);
+          if (childCommentId && parentCommentId && childCommentId !== parentCommentId) {
+            parentByCommentId.set(childCommentId, parentCommentId);
+          }
+        }
+        for (const c of comments) {
+          const parent = parentByCommentId.get(c.id);
+          if (parent) c.parentId = parent;
+        }
+      }
+    }
   } catch (err: any) {
     // Re-throw with more context if it's already an Error we created
     if (err.message.includes('Invalid Word document') || err.message.includes('File not found')) {

package/lib/wordcomments.ts CHANGED Viewed

@@ -102,7 +102,8 @@ export function prepareMarkdownWithMarkers(markdown: string): PrepareResult {
     return { anchor: text.slice(i + 1, j), endIdx: j + 8 };
   }
-  const rawMatches: ParsedComment[] = [];
+  const REPLY_PREFIX = '↪ ';
+  const rawMatches: (ParsedComment & { explicitReply: boolean })[] = [];
   let match: RegExpExecArray | null;
   while ((match = commentPattern.exec(markdown)) !== null) {
     const content = match[1] ?? '';
@@ -114,6 +115,15 @@ export function prepareMarkdownWithMarkers(markdown: string): PrepareResult {
       text = content.slice(colonIdx + 1).trim();
     }
+    // The `↪ ` prefix is the authoritative reply signal emitted by
+    // `insertCommentsIntoMarkdown`. Strip it from the author before injection
+    // so Word displays the real name.
+    let explicitReply = false;
+    if (author.startsWith(REPLY_PREFIX)) {
+      explicitReply = true;
+      author = author.slice(REPLY_PREFIX.length).trim();
+    }
     const commentEnd = match.index + match[0].length;
     const trailing = tryParseTrailingAnchor(markdown, commentEnd);
@@ -124,6 +134,7 @@ export function prepareMarkdownWithMarkers(markdown: string): PrepareResult {
       start: match.index,
       end: trailing ? trailing.endIdx : commentEnd,
       fullMatch: markdown.slice(match.index, trailing ? trailing.endIdx : commentEnd),
+      explicitReply,
     });
     // Advance regex lastIndex past the consumed anchor so the next iteration
@@ -139,10 +150,17 @@ export function prepareMarkdownWithMarkers(markdown: string): PrepareResult {
     return { markedMarkdown: markdown, comments: [] };
   }
-  // Detect reply relationships based on adjacency
-  // First comment in a cluster = parent, all subsequent = replies to that parent
-  // Comments are "adjacent" if there's minimal text between them (< 10 chars)
+  // Two-mode reply detection driven by the markdown itself:
+  //   - If any comment carries the `↪ ` author prefix, the markdown came
+  //     through `insertCommentsIntoMarkdown` and we use prefix-only mode.
+  //     Distinct clusters that happen to land at gap=0 (a real failure
+  //     mode on dense reviewer docs — 298-comment paper produced 9 such
+  //     collisions) are not misthreaded.
+  //   - If no comment carries the prefix, the markdown was hand-typed.
+  //     Fall back to gap < 10 adjacency for backward compat with users
+  //     who write CriticMarkup directly.
   const ADJACENT_THRESHOLD = 10;
+  const useExplicitMode = rawMatches.some(m => m.explicitReply);
   const comments: PreparedComment[] = [];
   let clusterParentIdx = -1;  // Index of first comment in current cluster
   let lastCommentEnd = -1;
@@ -151,9 +169,10 @@ export function prepareMarkdownWithMarkers(markdown: string): PrepareResult {
     const m = rawMatches[i];
     if (!m) continue;
-    // Check if this comment is adjacent to the previous one
     const gap = lastCommentEnd >= 0 ? m.start - lastCommentEnd : Infinity;
-    const isAdjacent = gap < ADJACENT_THRESHOLD;
+    const isAdjacent = useExplicitMode
+      ? m.explicitReply
+      : gap < ADJACENT_THRESHOLD;
     // Reset cluster if there's a gap (comments not in same cluster)
     if (!isAdjacent) {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "docrev",
-  "version": "0.9.11",
+  "version": "0.9.13",
   "description": "Academic paper revision workflow: Word ↔ Markdown round-trips, DOI validation, reviewer comments",
   "type": "module",
   "types": "dist/lib/types.d.ts",