docrev 0.9.13 → 0.9.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126) hide show
  1. package/.claude/settings.local.json +9 -9
  2. package/.gitattributes +1 -1
  3. package/CHANGELOG.md +149 -149
  4. package/PLAN-tables-and-postprocess.md +850 -850
  5. package/README.md +391 -391
  6. package/bin/rev.js +11 -11
  7. package/bin/rev.ts +145 -145
  8. package/completions/rev.bash +127 -127
  9. package/completions/rev.ps1 +210 -210
  10. package/completions/rev.zsh +207 -207
  11. package/dev_notes/stress2/build_adversarial.ts +186 -186
  12. package/dev_notes/stress2/drift_matcher.ts +62 -62
  13. package/dev_notes/stress2/probe_anchors.ts +35 -35
  14. package/dev_notes/stress2/project/discussion.before.md +3 -3
  15. package/dev_notes/stress2/project/discussion.md +3 -3
  16. package/dev_notes/stress2/project/methods.before.md +20 -20
  17. package/dev_notes/stress2/project/methods.md +20 -20
  18. package/dev_notes/stress2/project/rev.yaml +5 -5
  19. package/dev_notes/stress2/project/sections.yaml +4 -4
  20. package/dev_notes/stress2/sections.yaml +5 -5
  21. package/dev_notes/stress2/trace_placement.ts +50 -50
  22. package/dev_notes/stresstest_boundaries.ts +27 -27
  23. package/dev_notes/stresstest_drift_apply.ts +43 -43
  24. package/dev_notes/stresstest_drift_compare.ts +43 -43
  25. package/dev_notes/stresstest_drift_v2.ts +54 -54
  26. package/dev_notes/stresstest_inspect.ts +54 -54
  27. package/dev_notes/stresstest_pstyle.ts +55 -55
  28. package/dev_notes/stresstest_section_debug.ts +23 -23
  29. package/dev_notes/stresstest_split.ts +70 -70
  30. package/dev_notes/stresstest_trace.ts +19 -19
  31. package/dev_notes/stresstest_verify_no_overwrite.ts +40 -40
  32. package/dist/lib/build.d.ts +38 -1
  33. package/dist/lib/build.d.ts.map +1 -1
  34. package/dist/lib/build.js +68 -30
  35. package/dist/lib/build.js.map +1 -1
  36. package/dist/lib/commands/build.d.ts.map +1 -1
  37. package/dist/lib/commands/build.js +38 -5
  38. package/dist/lib/commands/build.js.map +1 -1
  39. package/dist/lib/commands/utilities.js +164 -164
  40. package/dist/lib/commands/word-tools.js +8 -8
  41. package/dist/lib/grammar.js +3 -3
  42. package/dist/lib/pdf-comments.js +44 -44
  43. package/dist/lib/plugins.js +57 -57
  44. package/dist/lib/pptx-themes.js +115 -115
  45. package/dist/lib/spelling.js +2 -2
  46. package/dist/lib/templates.js +387 -387
  47. package/dist/lib/themes.js +51 -51
  48. package/eslint.config.js +27 -27
  49. package/lib/anchor-match.ts +276 -276
  50. package/lib/annotations.ts +644 -644
  51. package/lib/build.ts +1300 -1251
  52. package/lib/citations.ts +160 -160
  53. package/lib/commands/build.ts +833 -801
  54. package/lib/commands/citations.ts +515 -515
  55. package/lib/commands/comments.ts +1050 -1050
  56. package/lib/commands/context.ts +174 -174
  57. package/lib/commands/core.ts +309 -309
  58. package/lib/commands/doi.ts +435 -435
  59. package/lib/commands/file-ops.ts +372 -372
  60. package/lib/commands/history.ts +320 -320
  61. package/lib/commands/index.ts +87 -87
  62. package/lib/commands/init.ts +259 -259
  63. package/lib/commands/merge-resolve.ts +378 -378
  64. package/lib/commands/preview.ts +178 -178
  65. package/lib/commands/project-info.ts +244 -244
  66. package/lib/commands/quality.ts +517 -517
  67. package/lib/commands/response.ts +454 -454
  68. package/lib/commands/section-boundaries.ts +82 -82
  69. package/lib/commands/sections.ts +451 -451
  70. package/lib/commands/sync.ts +706 -706
  71. package/lib/commands/text-ops.ts +449 -449
  72. package/lib/commands/utilities.ts +448 -448
  73. package/lib/commands/verify-anchors.ts +272 -272
  74. package/lib/commands/word-tools.ts +340 -340
  75. package/lib/comment-realign.ts +517 -517
  76. package/lib/config.ts +84 -84
  77. package/lib/crossref.ts +781 -781
  78. package/lib/csl.ts +191 -191
  79. package/lib/dependencies.ts +98 -98
  80. package/lib/diff-engine.ts +465 -465
  81. package/lib/doi-cache.ts +115 -115
  82. package/lib/doi.ts +897 -897
  83. package/lib/equations.ts +506 -506
  84. package/lib/errors.ts +346 -346
  85. package/lib/format.ts +541 -541
  86. package/lib/git.ts +326 -326
  87. package/lib/grammar.ts +303 -303
  88. package/lib/image-registry.ts +180 -180
  89. package/lib/import.ts +911 -911
  90. package/lib/journals.ts +543 -543
  91. package/lib/merge.ts +633 -633
  92. package/lib/orcid.ts +144 -144
  93. package/lib/pdf-comments.ts +263 -263
  94. package/lib/pdf-import.ts +524 -524
  95. package/lib/plugins.ts +362 -362
  96. package/lib/postprocess.ts +188 -188
  97. package/lib/pptx-color-filter.lua +37 -37
  98. package/lib/pptx-template.ts +469 -469
  99. package/lib/pptx-themes.ts +483 -483
  100. package/lib/protect-restore.ts +520 -520
  101. package/lib/rate-limiter.ts +94 -94
  102. package/lib/response.ts +197 -197
  103. package/lib/restore-references.ts +240 -240
  104. package/lib/review.ts +327 -327
  105. package/lib/schema.ts +417 -417
  106. package/lib/scientific-words.ts +73 -73
  107. package/lib/sections.ts +335 -335
  108. package/lib/slides.ts +756 -756
  109. package/lib/spelling.ts +334 -334
  110. package/lib/templates.ts +526 -526
  111. package/lib/themes.ts +742 -742
  112. package/lib/trackchanges.ts +247 -247
  113. package/lib/tui.ts +450 -450
  114. package/lib/types.ts +550 -550
  115. package/lib/undo.ts +250 -250
  116. package/lib/utils.ts +69 -69
  117. package/lib/variables.ts +179 -179
  118. package/lib/word-extraction.ts +806 -806
  119. package/lib/word.ts +643 -643
  120. package/lib/wordcomments.ts +817 -817
  121. package/package.json +137 -137
  122. package/scripts/postbuild.js +28 -28
  123. package/skill/REFERENCE.md +431 -431
  124. package/skill/SKILL.md +258 -258
  125. package/tsconfig.json +26 -26
  126. package/types/index.d.ts +525 -525
@@ -1,806 +1,806 @@
1
/**
 * Word document data extraction - raw extraction from .docx files
 */

import * as fs from 'fs';
import * as path from 'path';
import { exec } from 'child_process';
import { promisify } from 'util';

// Promisified child_process.exec — used to shell out to pandoc in extractFromWord.
const execAsync = promisify(exec);
11
-
12
- // ============================================
13
- // Type Definitions
14
- // ============================================
15
-
16
/** A single reviewer comment parsed from word/comments.xml. */
export interface WordComment {
  // Word's w:id attribute for the comment ('' when absent).
  id: string;
  // w:author attribute; 'Unknown' when absent.
  author: string;
  // w:date attribute truncated to YYYY-MM-DD ('' when absent).
  date: string;
  // Concatenated, trimmed text of all runs in the comment body.
  text: string;
  /**
   * Parent comment id when this is a reply in a Word comment thread.
   * Resolved from `commentsExtended.xml`'s `w15:paraIdParent` field.
   * `undefined` for top-level comments.
   */
  parentId?: string;
}

/**
 * Maps one `<w:t>` node's location in the raw document.xml to the span
 * its decoded text occupies in the concatenated document text.
 */
export interface TextNode {
  // Offset of the node's opening tag in document.xml.
  xmlStart: number;
  // Offset just past the node's closing tag in document.xml.
  xmlEnd: number;
  // Start offset of this node's text within fullDocText.
  textStart: number;
  // End offset (exclusive) of this node's text within fullDocText.
  textEnd: number;
  // Entity-decoded text content of the node.
  text: string;
}

/** Anchor text and surrounding context for one comment range in the body. */
export interface CommentAnchorData {
  // Entity-decoded, trimmed text spanned by the comment range ('' when empty).
  anchor: string;
  // Context text immediately preceding the anchor.
  before: string;
  // Context text immediately following the anchor.
  after: string;
  // Start position of the anchor within the concatenated document text.
  docPosition: number;
  // Total length of the concatenated document text at extraction time.
  docLength: number;
  // True when the comment range spans no text.
  isEmpty: boolean;
}

/** Result of extractCommentAnchors: per-comment anchors plus the full body text. */
export interface CommentAnchorsResult {
  // Comment id -> anchor/context data.
  anchors: Map<string, CommentAnchorData>;
  // Concatenation of every <w:t> node's decoded text (section matching).
  fullDocText: string;
}

/** A heading paragraph located in the document body. */
export interface DocxHeading {
  /** Heading style name from `<w:pStyle>`, e.g. "Heading1" */
  style: string;
  /** Heading depth: 1, 2, 3, ... (parsed from style name; 0 if unknown) */
  level: number;
  /** Concatenated text content of the heading paragraph */
  text: string;
  /** Position in fullDocText (same coordinate system as CommentAnchorData.docPosition) */
  docPosition: number;
}

/** One document table rendered as a markdown pipe table. */
export interface WordTable {
  // Markdown pipe-table rendering of the table.
  markdown: string;
  // Number of rows found in the table.
  rowCount: number;
  // Grid column count (from <w:tblGrid>, or widest row as fallback).
  colCount: number;
}

/** A parsed table row: cell texts plus the grid span of each cell. */
export interface ParsedRow {
  // Cell texts, padded so horizontally merged columns keep their slots.
  cells: string[];
  // Grid span per cell; 0 marks a padding slot created by a gridSpan.
  colSpans: number[];
}

/** Options for extractFromWord. */
export interface ExtractFromWordOptions {
  // Directory passed to pandoc --extract-media; defaults to <docx dir>/media.
  mediaDir?: string;
  // When true, pandoc runs without --extract-media.
  skipMediaExtraction?: boolean;
}

/** A diagnostic message produced during extraction. */
export interface ExtractMessage {
  type: 'info' | 'warning';
  message: string;
}

/** Aggregate result of extractFromWord. */
export interface ExtractFromWordResult {
  // Body text with track changes converted to CriticMarkup.
  text: string;
  // Comments read from word/comments.xml.
  comments: WordComment[];
  // Anchor/context data per comment id.
  anchors: Map<string, CommentAnchorData>;
  // Informational / warning messages generated along the way.
  messages: ExtractMessage[];
  // Paths of images pandoc extracted (empty when skipped or unavailable).
  extractedMedia: string[];
  // Tables extracted directly from the document XML.
  tables: WordTable[];
  // True when any tracked insertion or deletion was found.
  hasTrackChanges: boolean;
  // Counts of tracked insertions and deletions found.
  trackChangeStats: { insertions: number; deletions: number };
}
93
-
94
- // ============================================
95
- // Functions
96
- // ============================================
97
-
98
/**
 * Extract comments directly from Word docx comments.xml.
 *
 * @param docxPath Path to the .docx file.
 * @returns All comments found (empty array when the archive has no
 *   word/comments.xml entry), with `parentId` set on replies when
 *   word/commentsExtended.xml provides threading metadata.
 * @throws Error when the file does not exist, is not a valid zip/docx,
 *   or its comment XML cannot be read/parsed.
 */
export async function extractWordComments(docxPath: string): Promise<WordComment[]> {
  // Heavy deps are loaded lazily so importing this module stays cheap.
  const AdmZip = (await import('adm-zip')).default;
  const { parseStringPromise } = await import('xml2js');

  const comments: WordComment[] = [];

  // Validate file exists
  if (!fs.existsSync(docxPath)) {
    throw new Error(`File not found: ${docxPath}`);
  }

  try {
    let zip;
    try {
      zip = new AdmZip(docxPath);
    } catch (err: any) {
      throw new Error(`Invalid Word document (not a valid .docx file): ${err.message}`);
    }

    const commentsEntry = zip.getEntry('word/comments.xml');

    // No comments part at all -> document simply has no comments.
    if (!commentsEntry) {
      return comments;
    }

    let commentsXml;
    try {
      commentsXml = commentsEntry.getData().toString('utf8');
    } catch (err: any) {
      throw new Error(`Failed to read comments from document: ${err.message}`);
    }

    const parsed = await parseStringPromise(commentsXml, { explicitArray: false });

    const commentsRoot = parsed['w:comments'];
    if (!commentsRoot || !commentsRoot['w:comment']) {
      return comments;
    }

    // Ensure it's an array (xml2js collapses a single child to an object).
    const commentNodes = Array.isArray(commentsRoot['w:comment'])
      ? commentsRoot['w:comment']
      : [commentsRoot['w:comment']];

    // Map every paraId that lives inside a comment back to that comment's id.
    // Word's commentsExtended.xml expresses threading via w15:paraIdParent,
    // which references the parent's first <w:p>. Replies use a secondary
    // (often-empty) <w:p>, so each comment may contribute multiple paraIds.
    const paraIdToCommentId = new Map<string, string>();

    for (const comment of commentNodes) {
      const id = comment.$?.['w:id'] || '';
      const author = comment.$?.['w:author'] || 'Unknown';
      const date = comment.$?.['w:date'] || '';

      // Extract text from nested w:p/w:r/w:t elements and record paraIds.
      // Depth-first walk; appends to the enclosing `text` accumulator.
      let text = '';
      const extractText = (node: any): void => {
        if (!node) return;
        if (typeof node === 'string') {
          text += node;
          return;
        }
        if (node['w:t']) {
          const t = node['w:t'];
          // xml2js puts text in `_` when the element carries attributes.
          text += typeof t === 'string' ? t : (t._ || t);
        }
        if (node['w:r']) {
          const runs = Array.isArray(node['w:r']) ? node['w:r'] : [node['w:r']];
          runs.forEach(extractText);
        }
        if (node['w:p']) {
          const paras = Array.isArray(node['w:p']) ? node['w:p'] : [node['w:p']];
          for (const para of paras) {
            const paraId = para?.$?.['w14:paraId'];
            if (paraId && id) paraIdToCommentId.set(paraId, id);
            extractText(para);
          }
        }
      };
      extractText(comment);

      // Date is truncated to YYYY-MM-DD; w:date is typically a full ISO stamp.
      comments.push({ id, author, date: date.slice(0, 10), text: text.trim() });
    }

    // Resolve parent links from commentsExtended.xml. Missing entry just
    // means the docx has no threading metadata (e.g. legacy/non-Word source).
    const extendedEntry = zip.getEntry('word/commentsExtended.xml');
    if (extendedEntry && paraIdToCommentId.size > 0) {
      let extendedXml = '';
      try {
        extendedXml = extendedEntry.getData().toString('utf8');
      } catch {
        // Unreadable threading metadata is non-fatal; skip parent linking.
      }
      if (extendedXml) {
        const parentByCommentId = new Map<string, string>();
        // Self-closing <w15:commentEx .../> elements carry the attributes we need.
        const exPattern = /<w15:commentEx\b([^>]*?)\/>/g;
        let m: RegExpExecArray | null;
        while ((m = exPattern.exec(extendedXml)) !== null) {
          const attrs = m[1] ?? '';
          const paraIdMatch = attrs.match(/w15:paraId="([^"]+)"/);
          const parentMatch = attrs.match(/w15:paraIdParent="([^"]+)"/);
          if (!paraIdMatch || !parentMatch) continue;
          const childCommentId = paraIdToCommentId.get(paraIdMatch[1]);
          const parentCommentId = paraIdToCommentId.get(parentMatch[1]);
          // Guard against self-links and unknown paraIds.
          if (childCommentId && parentCommentId && childCommentId !== parentCommentId) {
            parentByCommentId.set(childCommentId, parentCommentId);
          }
        }
        for (const c of comments) {
          const parent = parentByCommentId.get(c.id);
          if (parent) c.parentId = parent;
        }
      }
    }
  } catch (err: any) {
    // Re-throw with more context if it's already an Error we created
    if (err.message.includes('Invalid Word document') || err.message.includes('File not found')) {
      throw err;
    }
    throw new Error(`Error extracting comments from ${path.basename(docxPath)}: ${err.message}`);
  }

  return comments;
}
227
-
228
/**
 * Extract comment anchor texts from document.xml with surrounding context.
 *
 * Returns map of comment ID -> {anchor, before, after, docPosition, isEmpty}
 * for better matching. Also returns fullDocText for section boundary matching.
 *
 * @param docxPath Path to the .docx file.
 * @returns Anchors keyed by comment id plus the concatenated body text.
 *   On any error the function logs and returns whatever was collected with
 *   an empty fullDocText (best-effort; never throws).
 */
export async function extractCommentAnchors(docxPath: string): Promise<CommentAnchorsResult> {
  const AdmZip = (await import('adm-zip')).default;
  const anchors = new Map<string, CommentAnchorData>();
  let fullDocText = '';

  try {
    const zip = new AdmZip(docxPath);
    const docEntry = zip.getEntry('word/document.xml');

    if (!docEntry) {
      return { anchors, fullDocText };
    }

    const docXml = docEntry.getData().toString('utf8');

    // ========================================
    // STEP 1: Build text position mapping
    // ========================================
    // Every <w:t> node is recorded with both its XML offsets and the span
    // its decoded text occupies in the concatenated document text.
    const textNodePattern = /<w:t[^>]*>([^<]*)<\/w:t>/g;
    const textNodes: TextNode[] = [];
    let textPosition = 0;
    let nodeMatch;

    while ((nodeMatch = textNodePattern.exec(docXml)) !== null) {
      const rawText = nodeMatch[1] ?? '';
      const decodedText = decodeXmlEntities(rawText);
      textNodes.push({
        xmlStart: nodeMatch.index,
        xmlEnd: nodeMatch.index + nodeMatch[0].length,
        textStart: textPosition,
        textEnd: textPosition + decodedText.length,
        text: decodedText
      });
      textPosition += decodedText.length;
    }

    fullDocText = textNodes.map(n => n.text).join('');

    // Helper: convert XML position to text position.
    // A position inside or before a node maps to that node's textStart;
    // past the last node it maps to the end of the text.
    function xmlPosToTextPos(xmlPos: number): number {
      for (let i = 0; i < textNodes.length; i++) {
        const node = textNodes[i];
        if (!node) continue;
        if (xmlPos >= node.xmlStart && xmlPos < node.xmlEnd) {
          return node.textStart;
        }
        if (xmlPos < node.xmlStart) {
          return node.textStart;
        }
      }
      const lastNode = textNodes[textNodes.length - 1];
      return lastNode ? lastNode.textEnd : 0;
    }

    // Helper: extract context before a position.
    // Prefers the start of the last sentence in the window; otherwise the
    // trailing 80 characters.
    function getContextBefore(position: number, maxLength: number = 150): string {
      const beforeText = fullDocText.slice(Math.max(0, position - maxLength), position);
      const sentenceStart = beforeText.search(/[.!?]\s+[A-Z][^.!?]*$/);
      return sentenceStart >= 0
        ? beforeText.slice(sentenceStart + 2).trim()
        : beforeText.slice(-80).trim();
    }

    // Helper: extract context after a position.
    // Prefers up to the first sentence end; otherwise the first 80 characters.
    function getContextAfter(position: number, maxLength: number = 150): string {
      const afterText = fullDocText.slice(position, position + maxLength);
      const sentenceEnd = afterText.search(/[.!?]\s/);
      return sentenceEnd >= 0
        ? afterText.slice(0, sentenceEnd + 1).trim()
        : afterText.slice(0, 80).trim();
    }

    // ========================================
    // STEP 2: Collect all start/end markers separately
    // ========================================
    const startPattern = /<w:commentRangeStart[^>]*w:id="(\d+)"[^>]*\/?>/g;
    const endPattern = /<w:commentRangeEnd[^>]*w:id="(\d+)"[^>]*\/?>/g;

    const starts = new Map<string, number>(); // id -> position after start tag
    const ends = new Map<string, number>(); // id -> position before end tag

    // First occurrence wins for duplicate ids (has() guards below).
    let match;
    while ((match = startPattern.exec(docXml)) !== null) {
      const id = match[1];
      if (!starts.has(id)) {
        starts.set(id, match.index + match[0].length);
      }
    }

    while ((match = endPattern.exec(docXml)) !== null) {
      const id = match[1];
      if (!ends.has(id)) {
        ends.set(id, match.index);
      }
    }

    // ========================================
    // STEP 3: Process each comment range by ID
    // ========================================
    for (const [id, startXmlPos] of starts) {
      const endXmlPos = ends.get(id);

      // Missing end marker - skip with warning
      if (endXmlPos === undefined) {
        console.warn(`Comment ${id}: missing end marker`);
        continue;
      }

      // Calculate text position
      const docPosition = xmlPosToTextPos(startXmlPos);

      // Handle empty or inverted ranges
      if (endXmlPos <= startXmlPos) {
        anchors.set(id, {
          anchor: '',
          before: getContextBefore(docPosition),
          after: getContextAfter(docPosition),
          docPosition,
          docLength: fullDocText.length,
          isEmpty: true
        });
        continue;
      }

      // Extract XML segment between markers
      const segment = docXml.slice(startXmlPos, endXmlPos);

      // Extract text from w:t (regular) AND w:delText (deleted text in track changes)
      const textInRangePattern = /<w:t[^>]*>([^<]*)<\/w:t>|<w:delText[^>]*>([^<]*)<\/w:delText>/g;
      let anchorText = '';
      let tm;
      while ((tm = textInRangePattern.exec(segment)) !== null) {
        anchorText += tm[1] || tm[2] || '';
      }
      anchorText = decodeXmlEntities(anchorText);

      // Get context
      const anchorLength = anchorText.length;
      const before = getContextBefore(docPosition);
      const after = getContextAfter(docPosition + anchorLength);

      // ALWAYS add entry (even if anchor is empty)
      anchors.set(id, {
        anchor: anchorText.trim(),
        before,
        after,
        docPosition,
        docLength: fullDocText.length,
        isEmpty: !anchorText.trim()
      });
    }
  } catch (err: any) {
    // Best-effort contract: report and return partial data rather than throw.
    console.error('Error extracting comment anchors:', err.message);
    return { anchors, fullDocText: '' };
  }

  return { anchors, fullDocText };
}
391
-
392
/**
 * Extract heading paragraphs from a docx, with their text positions in the
 * same coordinate system as `extractCommentAnchors`'s `fullDocText` and
 * `CommentAnchorData.docPosition`.
 *
 * Headings are paragraphs whose `<w:pStyle>` is a Heading style. Reading
 * styles directly is more reliable than keyword-matching the concatenated
 * body text — there, paragraph boundaries are gone, so the literal string
 * "Methods" can appear inside prose ("results across countries") and the
 * structured-abstract label "Methods:" loses its colon when text runs are
 * concatenated.
 *
 * @param docxPath Path to the .docx file.
 * @returns Headings in document order; empty when document.xml is missing.
 * @throws Error when the file does not exist.
 */
export async function extractHeadings(docxPath: string): Promise<DocxHeading[]> {
  const AdmZip = (await import('adm-zip')).default;

  if (!fs.existsSync(docxPath)) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const zip = new AdmZip(docxPath);
  const docEntry = zip.getEntry('word/document.xml');
  if (!docEntry) return [];
  const xml = docEntry.getData().toString('utf8');

  // Build the same xml-pos → text-pos mapping that extractCommentAnchors does
  const textNodePattern = /<w:t[^>]*>([^<]*)<\/w:t>/g;
  const nodes: Array<{ xmlStart: number; xmlEnd: number; textStart: number; textEnd: number }> = [];
  let textPos = 0;
  let m;
  while ((m = textNodePattern.exec(xml)) !== null) {
    const decoded = decodeXmlEntities(m[1] ?? '');
    nodes.push({
      xmlStart: m.index,
      xmlEnd: m.index + m[0].length,
      textStart: textPos,
      textEnd: textPos + decoded.length,
    });
    textPos += decoded.length;
  }

  // Positions inside or before a node resolve to that node's textStart;
  // positions past the last node resolve to the end of the text.
  function xmlToTextPos(xmlPos: number): number {
    for (const n of nodes) {
      if (xmlPos >= n.xmlStart && xmlPos < n.xmlEnd) return n.textStart;
      if (xmlPos < n.xmlStart) return n.textStart;
    }
    return nodes.length ? nodes[nodes.length - 1].textEnd : 0;
  }

  const headings: DocxHeading[] = [];
  const paraPattern = /<w:p\b[^>]*>([\s\S]*?)<\/w:p>/g;
  let pm;
  while ((pm = paraPattern.exec(xml)) !== null) {
    const inner = pm[1];
    const styleMatch = inner.match(/<w:pStyle[^>]*w:val="([^"]+)"/);
    if (!styleMatch) continue;
    const style = styleMatch[1];
    // Any style containing "heading" (case-insensitive) counts as a heading.
    if (!/heading/i.test(style)) continue;

    // Concatenate text runs; include w:delText so a heading inside a tracked
    // deletion is still surfaced (verifying anchors against an original draft)
    const textInRange = /<w:t[^>]*>([^<]*)<\/w:t>|<w:delText[^>]*>([^<]*)<\/w:delText>/g;
    let txt = '';
    let tm;
    while ((tm = textInRange.exec(inner)) !== null) {
      txt += decodeXmlEntities(tm[1] || tm[2] || '');
    }
    const trimmed = txt.trim();
    if (!trimmed) continue;

    // Level comes from the first digit run in the style name ("Heading2" -> 2).
    const levelMatch = style.match(/(\d+)/);
    const level = levelMatch ? parseInt(levelMatch[1], 10) : 0;
    headings.push({
      style,
      level,
      text: trimmed,
      docPosition: xmlToTextPos(pm.index),
    });
  }

  return headings;
}
473
-
474
- /**
475
- * Decode XML entities in text
476
- */
477
- function decodeXmlEntities(text: string): string {
478
- return text
479
- .replace(/&amp;/g, '&')
480
- .replace(/&lt;/g, '<')
481
- .replace(/&gt;/g, '>')
482
- .replace(/&quot;/g, '"')
483
- .replace(/&apos;/g, "'")
484
- .replace(/&#(\d+);/g, (_, code) => String.fromCharCode(parseInt(code, 10)))
485
- .replace(/&#x([0-9a-fA-F]+);/g, (_, code) => String.fromCharCode(parseInt(code, 16)));
486
- }
487
-
488
- /**
489
- * Extract text content from a Word XML cell
490
- */
491
- function extractCellText(cellXml: string): string {
492
- const parts: string[] = [];
493
-
494
- // Check for OMML math - replace with [math] placeholder
495
- if (cellXml.includes('<m:oMath')) {
496
- // Try to extract the text representation of math
497
- const mathTextMatches = cellXml.match(/<m:t>([^<]*)<\/m:t>/g) || [];
498
- if (mathTextMatches.length > 0) {
499
- const mathText = mathTextMatches.map((t) => t.replace(/<[^>]+>/g, '')).join('');
500
- parts.push(mathText);
501
- } else {
502
- parts.push('[math]');
503
- }
504
- }
505
-
506
- // Extract regular text from w:t elements
507
- const textMatches = cellXml.match(/<w:t[^>]*>([^<]*)<\/w:t>/g) || [];
508
- for (const match of textMatches) {
509
- const text = match.replace(/<[^>]+>/g, '');
510
- if (text) {
511
- parts.push(text);
512
- }
513
- }
514
-
515
- let result = parts.join('').trim();
516
- result = decodeXmlEntities(result);
517
-
518
- // Escape pipe characters in cell content (would break table)
519
- result = result.replace(/\|/g, '\\|');
520
-
521
- return result;
522
- }
523
-
524
- /**
525
- * Parse a table row, handling merged cells (gridSpan)
526
- */
527
- function parseTableRow(rowXml: string, expectedCols: number): ParsedRow {
528
- // Match cells - handle both <w:tc> and <w:tc ...>
529
- const cellMatches = rowXml.match(/<w:tc(?:\s[^>]*)?>[\s\S]*?<\/w:tc>/g) || [];
530
- const cells: string[] = [];
531
- const colSpans: number[] = [];
532
-
533
- for (const cellXml of cellMatches) {
534
- // Check for horizontal merge (gridSpan)
535
- const gridSpanMatch = cellXml.match(/<w:gridSpan\s+w:val="(\d+)"/);
536
- const span = gridSpanMatch ? parseInt(gridSpanMatch[1], 10) : 1;
537
-
538
- // Check for vertical merge continuation (vMerge without restart)
539
- // If vMerge is present without w:val="restart", it's a continuation - use empty
540
- const vMergeMatch = cellXml.match(/<w:vMerge(?:\s+w:val="([^"]+)")?/);
541
- const isVMergeContinuation = vMergeMatch && vMergeMatch[1] !== 'restart';
542
-
543
- const cellText = isVMergeContinuation ? '' : extractCellText(cellXml);
544
-
545
- // Add the cell content
546
- cells.push(cellText);
547
- colSpans.push(span);
548
-
549
- // For gridSpan > 1, add empty cells to maintain column alignment
550
- for (let i = 1; i < span; i++) {
551
- cells.push('');
552
- colSpans.push(0); // 0 indicates this is a spanned cell
553
- }
554
- }
555
-
556
- return { cells, colSpans };
557
- }
558
-
559
- /**
560
- * Determine table grid column count from table XML
561
- */
562
- function getTableGridCols(tableXml: string): number {
563
- // Try to get from tblGrid
564
- const gridColMatches = tableXml.match(/<w:gridCol/g) || [];
565
- if (gridColMatches.length > 0) {
566
- return gridColMatches.length;
567
- }
568
-
569
- // Fallback: count max cells in any row
570
- const rowMatches = tableXml.match(/<w:tr[\s\S]*?<\/w:tr>/g) || [];
571
- let maxCols = 0;
572
- for (const rowXml of rowMatches) {
573
- const { cells } = parseTableRow(rowXml, 0);
574
- maxCols = Math.max(maxCols, cells.length);
575
- }
576
- return maxCols;
577
- }
578
-
579
/**
 * Extract tables directly from Word document XML and convert to markdown
 * pipe tables.
 *
 * @param docxPath Path to the .docx file.
 * @returns One WordTable per <w:tbl> element found; empty on error
 *   (extraction is best-effort and never throws).
 *
 * NOTE(review): the non-greedy <w:tbl> match stops at the first </w:tbl>,
 * so a table nested inside another would truncate the outer one — confirm
 * nested tables are out of scope for the supported documents.
 */
export async function extractWordTables(docxPath: string): Promise<WordTable[]> {
  const AdmZip = (await import('adm-zip')).default;
  const tables: WordTable[] = [];

  try {
    const zip = new AdmZip(docxPath);
    const docEntry = zip.getEntry('word/document.xml');

    if (!docEntry) {
      return tables;
    }

    const xml = docEntry.getData().toString('utf8');

    // Find all table elements
    const tableMatches = xml.match(/<w:tbl>[\s\S]*?<\/w:tbl>/g) || [];

    for (const tableXml of tableMatches) {
      // Determine expected column count from grid
      const expectedCols = getTableGridCols(tableXml);

      // Extract rows
      const rowMatches = tableXml.match(/<w:tr[\s\S]*?<\/w:tr>/g) || [];
      const rows: string[][] = [];

      for (const rowXml of rowMatches) {
        const { cells } = parseTableRow(rowXml, expectedCols);
        if (cells.length > 0) {
          rows.push(cells);
        }
      }

      // Tables with no parseable rows are dropped entirely.
      if (rows.length > 0) {
        // Convert to markdown pipe table
        const markdown = convertRowsToMarkdownTable(rows);
        tables.push({ markdown, rowCount: rows.length, colCount: expectedCols || rows[0]?.length || 0 });
      }
    }
  } catch (err: any) {
    // Best-effort: log and return whatever was collected so far.
    console.error('Error extracting tables from Word:', err.message);
  }

  return tables;
}
626
-
627
- /**
628
- * Convert array of rows (each row is array of cell strings) to markdown pipe table
629
- */
630
- function convertRowsToMarkdownTable(rows: string[][]): string {
631
- if (rows.length === 0) return '';
632
-
633
- // Normalize column count (use max across all rows)
634
- const colCount = Math.max(...rows.map((r) => r.length));
635
-
636
- // Pad rows to have consistent column count
637
- const normalizedRows = rows.map((row) => {
638
- while (row.length < colCount) {
639
- row.push('');
640
- }
641
- return row;
642
- });
643
-
644
- // Build markdown table
645
- const lines: string[] = [];
646
-
647
- // Header row
648
- const header = normalizedRows[0];
649
- lines.push('| ' + header.join(' | ') + ' |');
650
-
651
- // Separator row
652
- lines.push('|' + header.map(() => '---').join('|') + '|');
653
-
654
- // Data rows
655
- for (let i = 1; i < normalizedRows.length; i++) {
656
- lines.push('| ' + normalizedRows[i].join(' | ') + ' |');
657
- }
658
-
659
- return lines.join('\n');
660
- }
661
-
662
/**
 * Extract text from Word document using pandoc with track changes preserved.
 *
 * Tries `pandoc --track-changes=all` first and converts pandoc's
 * `[...]{.insertion}` / `[...]{.deletion}` spans to CriticMarkup
 * (`{++...++}` / `{--...--}`). When pandoc is unavailable, falls back to
 * the built-in XML extractor from ./word.js. Comments, anchors, and
 * tables are always read directly from the docx XML regardless of which
 * text path was used.
 *
 * @param docxPath Path to the .docx file.
 * @param options Media-extraction directory and skip flag.
 * @returns Text, comments, anchors, tables, media paths, messages, and
 *   track-change statistics.
 */
export async function extractFromWord(
  docxPath: string,
  options: ExtractFromWordOptions = {}
): Promise<ExtractFromWordResult> {
  let text: string;
  let messages: ExtractMessage[] = [];
  let extractedMedia: string[] = [];
  let hasTrackChanges = false;
  let trackChangeStats = { insertions: 0, deletions: 0 };

  // Determine media extraction directory
  const docxDir = path.dirname(docxPath);
  const mediaDir = options.mediaDir || path.join(docxDir, 'media');

  // Skip media extraction if figures already exist (e.g., when re-importing with existing source)
  const skipMediaExtraction = options.skipMediaExtraction || false;

  // Extract tables directly from Word XML (reliable, no heuristics)
  const wordTables = await extractWordTables(docxPath);

  // Try pandoc first with --track-changes=all to preserve reviewer edits
  try {
    // Build pandoc command
    let pandocCmd = `pandoc "${docxPath}" -t markdown --wrap=none --track-changes=all`;
    if (!skipMediaExtraction) {
      pandocCmd += ` --extract-media="${mediaDir}"`;
    }

    // Large maxBuffer: pandoc output for big manuscripts can exceed the default.
    const { stdout } = await execAsync(pandocCmd, { maxBuffer: 50 * 1024 * 1024 });
    text = stdout;

    // Convert pandoc's track change format to CriticMarkup
    // NOTE(review): origLength is never read afterwards — looks vestigial.
    const origLength = text.length;

    // Use a more robust pattern that handles nested content
    // (allows one level of balanced [...] inside the span text).
    text = text.replace(/\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]\{\.insertion[^}]*\}/g, (match, content) => {
      if (content.trim()) {
        trackChangeStats.insertions++;
        return `{++${content}++}`;
      }
      return ''; // Empty insertions are removed
    });

    text = text.replace(/\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]\{\.deletion[^}]*\}/g, (match, content) => {
      if (content.trim()) {
        trackChangeStats.deletions++;
        return `{--${content}--}`;
      }
      return ''; // Empty deletions are removed
    });

    // Handle any remaining pandoc track change patterns.
    // Loops until a fixed point so spans uncovered by an earlier pass
    // (e.g. after inner spans were rewritten) still get converted.
    let prevText;
    do {
      prevText = text;
      text = text.replace(/\[([^\]]*)\]\{\.insertion[^}]*\}/g, (match, content) => {
        if (content.trim()) {
          trackChangeStats.insertions++;
          return `{++${content}++}`;
        }
        return '';
      });
      text = text.replace(/\[([^\]]*)\]\{\.deletion[^}]*\}/g, (match, content) => {
        if (content.trim()) {
          trackChangeStats.deletions++;
          return `{--${content}--}`;
        }
        return '';
      });
    } while (text !== prevText);

    // Handle pandoc comment patterns - remove comment text from body
    text = text.replace(/\[[^\]]*\]\{\.comment-start[^}]*\}/g, '');
    text = text.replace(/\[\]\{\.comment-end[^}]*\}/g, '');

    // Also handle {.mark} spans
    text = text.replace(/\[([^\]]*)\]\{\.mark\}/g, '$1');

    hasTrackChanges = trackChangeStats.insertions > 0 || trackChangeStats.deletions > 0;

    if (hasTrackChanges) {
      messages.push({
        type: 'info',
        message: `Found ${trackChangeStats.insertions} insertion(s) and ${trackChangeStats.deletions} deletion(s) from track changes`
      });
    }

    // Find extracted media files (pandoc nests them under <mediaDir>/media)
    const mediaSubdir = path.join(mediaDir, 'media');
    if (fs.existsSync(mediaSubdir)) {
      extractedMedia = fs.readdirSync(mediaSubdir)
        .filter(f => /\.(png|jpg|jpeg|gif|svg|emf|wmf|tiff?)$/i.test(f))
        .map(f => path.join(mediaSubdir, f));

      if (extractedMedia.length > 0) {
        messages.push({
          type: 'info',
          message: `Extracted ${extractedMedia.length} image(s) to ${mediaSubdir}`
        });
      }
    }
  } catch (pandocErr: any) {
    // Pandoc not available — use XML-based extraction with track change support
    const { extractPlainTextWithTrackChanges } = await import('./word.js');
    const { getInstallInstructions } = await import('./dependencies.js');
    const installCmd = getInstallInstructions('pandoc');

    const xmlResult = await extractPlainTextWithTrackChanges(docxPath);
    text = xmlResult.text;
    hasTrackChanges = xmlResult.hasTrackChanges;
    trackChangeStats = xmlResult.stats;

    if (hasTrackChanges) {
      messages.push({
        type: 'warning',
        message: `Pandoc not installed. Using built-in XML extractor (${trackChangeStats.insertions} insertions, ${trackChangeStats.deletions} deletions preserved). Formatting may differ. Install pandoc for best results: ${installCmd}`
      });
    } else {
      messages.push({
        type: 'warning',
        message: `Pandoc not installed. Using built-in XML extractor (no track changes found). Install pandoc for better formatting: ${installCmd}`
      });
    }
  }

  // Extract comments directly from docx XML
  const comments = await extractWordComments(docxPath);

  // Extract comment anchor texts
  const { anchors } = await extractCommentAnchors(docxPath);

  return {
    text,
    comments,
    anchors,
    messages,
    extractedMedia,
    tables: wordTables,
    hasTrackChanges,
    trackChangeStats,
  };
}
1
+ /**
2
+ * Word document data extraction - raw extraction from .docx files
3
+ */
4
+
5
+ import * as fs from 'fs';
6
+ import * as path from 'path';
7
+ import { exec } from 'child_process';
8
+ import { promisify } from 'util';
9
+
10
+ const execAsync = promisify(exec);
11
+
12
+ // ============================================
13
+ // Type Definitions
14
+ // ============================================
15
+
16
/** A single comment parsed from `word/comments.xml`. */
export interface WordComment {
  /** `w:id` attribute of the `<w:comment>` element (kept as a string). */
  id: string;
  /** `w:author` attribute; falls back to 'Unknown' when absent. */
  author: string;
  /** `w:date` attribute trimmed to its date part (first 10 chars, YYYY-MM-DD). */
  date: string;
  /** Concatenated, trimmed text of the comment's paragraphs and runs. */
  text: string;
  /**
   * Parent comment id when this is a reply in a Word comment thread.
   * Resolved from `commentsExtended.xml`'s `w15:paraIdParent` field.
   * `undefined` for top-level comments.
   */
  parentId?: string;
}

/**
 * Maps one `<w:t>` element's character span in the raw document XML
 * (`xmlStart`/`xmlEnd`) to its span in the concatenated document text
 * (`textStart`/`textEnd`).
 */
export interface TextNode {
  xmlStart: number;
  xmlEnd: number;
  textStart: number;
  textEnd: number;
  /** Entity-decoded text content of the node. */
  text: string;
}

/** Anchor text and surrounding context for one comment range. */
export interface CommentAnchorData {
  /** Trimmed text covered by the comment range ('' for empty ranges). */
  anchor: string;
  /** Context immediately preceding the anchor (sentence-ish, <= ~150 chars). */
  before: string;
  /** Context immediately following the anchor (sentence-ish, <= ~150 chars). */
  after: string;
  /** Anchor start offset within the concatenated document text. */
  docPosition: number;
  /** Total length of the concatenated document text. */
  docLength: number;
  /** True when the range contains no non-whitespace text. */
  isEmpty: boolean;
}

/** Result of `extractCommentAnchors`. */
export interface CommentAnchorsResult {
  /** Comment id -> anchor/context data. */
  anchors: Map<string, CommentAnchorData>;
  /** Concatenation of every `<w:t>` node's decoded text, for position matching. */
  fullDocText: string;
}

export interface DocxHeading {
  /** Heading style name from `<w:pStyle>`, e.g. "Heading1" */
  style: string;
  /** Heading depth: 1, 2, 3, ... (parsed from style name; 0 if unknown) */
  level: number;
  /** Concatenated text content of the heading paragraph */
  text: string;
  /** Position in fullDocText (same coordinate system as CommentAnchorData.docPosition) */
  docPosition: number;
}

/** One table extracted from document XML. */
export interface WordTable {
  /** Markdown pipe-table rendering of the table. */
  markdown: string;
  /** Number of `<w:tr>` rows captured. */
  rowCount: number;
  /** Grid column count (from `<w:tblGrid>`, else the widest row). */
  colCount: number;
}

/** One parsed table row; horizontal merges are expanded with filler slots. */
export interface ParsedRow {
  /** Cell texts; a gridSpan of N contributes N-1 trailing '' filler cells. */
  cells: string[];
  /** Span per cell; 0 marks a filler slot created by a gridSpan expansion. */
  colSpans: number[];
}

/** Options for `extractFromWord`. */
export interface ExtractFromWordOptions {
  /** Directory for pandoc's --extract-media (default: `<docx dir>/media`). */
  mediaDir?: string;
  /** When true, do not pass --extract-media to pandoc at all. */
  skipMediaExtraction?: boolean;
}

/** A user-facing notice produced during extraction. */
export interface ExtractMessage {
  type: 'info' | 'warning';
  message: string;
}

/** Aggregate result of `extractFromWord`. */
export interface ExtractFromWordResult {
  /** Extracted body text (markdown with CriticMarkup when pandoc is used). */
  text: string;
  comments: WordComment[];
  anchors: Map<string, CommentAnchorData>;
  messages: ExtractMessage[];
  /** Paths of images extracted by pandoc's --extract-media. */
  extractedMedia: string[];
  tables: WordTable[];
  hasTrackChanges: boolean;
  trackChangeStats: { insertions: number; deletions: number };
}
93
+
94
+ // ============================================
95
+ // Functions
96
+ // ============================================
97
+
98
/**
 * Extract comments directly from Word docx comments.xml.
 *
 * Opens the .docx as a zip, parses `word/comments.xml`, and collects each
 * comment's id/author/date/text. When `word/commentsExtended.xml` exists,
 * reply comments are linked to their parents via `parentId`.
 *
 * @param docxPath Path to the .docx file on disk.
 * @returns Comments in file order; empty array when the document has none.
 * @throws Error when the file is missing, is not a valid zip, or
 *         comments.xml cannot be read or parsed.
 */
export async function extractWordComments(docxPath: string): Promise<WordComment[]> {
  const AdmZip = (await import('adm-zip')).default;
  const { parseStringPromise } = await import('xml2js');

  const comments: WordComment[] = [];

  // Validate file exists
  if (!fs.existsSync(docxPath)) {
    throw new Error(`File not found: ${docxPath}`);
  }

  try {
    let zip;
    try {
      zip = new AdmZip(docxPath);
    } catch (err: any) {
      throw new Error(`Invalid Word document (not a valid .docx file): ${err.message}`);
    }

    const commentsEntry = zip.getEntry('word/comments.xml');

    // A missing comments.xml simply means the document has no comments.
    if (!commentsEntry) {
      return comments;
    }

    let commentsXml;
    try {
      commentsXml = commentsEntry.getData().toString('utf8');
    } catch (err: any) {
      throw new Error(`Failed to read comments from document: ${err.message}`);
    }

    const parsed = await parseStringPromise(commentsXml, { explicitArray: false });

    const commentsRoot = parsed['w:comments'];
    if (!commentsRoot || !commentsRoot['w:comment']) {
      return comments;
    }

    // Ensure it's an array (explicitArray: false collapses single children)
    const commentNodes = Array.isArray(commentsRoot['w:comment'])
      ? commentsRoot['w:comment']
      : [commentsRoot['w:comment']];

    // Map every paraId that lives inside a comment back to that comment's id.
    // Word's commentsExtended.xml expresses threading via w15:paraIdParent,
    // which references the parent's first <w:p>. Replies use a secondary
    // (often-empty) <w:p>, so each comment may contribute multiple paraIds.
    const paraIdToCommentId = new Map<string, string>();

    for (const comment of commentNodes) {
      const id = comment.$?.['w:id'] || '';
      const author = comment.$?.['w:author'] || 'Unknown';
      const date = comment.$?.['w:date'] || '';

      // Extract text from nested w:p/w:r/w:t elements and record paraIds.
      // NOTE(review): assumes xml2js yields a <w:t> either as a plain string
      // or as an object whose text sits under `_` — confirm against xml2js
      // output for attribute-bearing text nodes.
      let text = '';
      const extractText = (node: any): void => {
        if (!node) return;
        if (typeof node === 'string') {
          text += node;
          return;
        }
        if (node['w:t']) {
          const t = node['w:t'];
          text += typeof t === 'string' ? t : (t._ || t);
        }
        if (node['w:r']) {
          const runs = Array.isArray(node['w:r']) ? node['w:r'] : [node['w:r']];
          runs.forEach(extractText);
        }
        if (node['w:p']) {
          const paras = Array.isArray(node['w:p']) ? node['w:p'] : [node['w:p']];
          for (const para of paras) {
            // Record this paragraph's paraId so commentsExtended.xml entries
            // (keyed by paraId) can be resolved back to comment ids below.
            const paraId = para?.$?.['w14:paraId'];
            if (paraId && id) paraIdToCommentId.set(paraId, id);
            extractText(para);
          }
        }
      };
      extractText(comment);

      // Date is truncated to YYYY-MM-DD; parentId is filled in afterwards.
      comments.push({ id, author, date: date.slice(0, 10), text: text.trim() });
    }

    // Resolve parent links from commentsExtended.xml. Missing entry just
    // means the docx has no threading metadata (e.g. legacy/non-Word source).
    const extendedEntry = zip.getEntry('word/commentsExtended.xml');
    if (extendedEntry && paraIdToCommentId.size > 0) {
      let extendedXml = '';
      try {
        extendedXml = extendedEntry.getData().toString('utf8');
      } catch {
        // Unreadable threading metadata is non-fatal; skip parent linking.
      }
      if (extendedXml) {
        const parentByCommentId = new Map<string, string>();
        // Each self-closing <w15:commentEx .../> carries a paraId and,
        // for replies, a paraIdParent attribute.
        const exPattern = /<w15:commentEx\b([^>]*?)\/>/g;
        let m: RegExpExecArray | null;
        while ((m = exPattern.exec(extendedXml)) !== null) {
          const attrs = m[1] ?? '';
          const paraIdMatch = attrs.match(/w15:paraId="([^"]+)"/);
          const parentMatch = attrs.match(/w15:paraIdParent="([^"]+)"/);
          if (!paraIdMatch || !parentMatch) continue;
          const childCommentId = paraIdToCommentId.get(paraIdMatch[1]);
          const parentCommentId = paraIdToCommentId.get(parentMatch[1]);
          // Guard against self-links and unresolvable paraIds.
          if (childCommentId && parentCommentId && childCommentId !== parentCommentId) {
            parentByCommentId.set(childCommentId, parentCommentId);
          }
        }
        for (const c of comments) {
          const parent = parentByCommentId.get(c.id);
          if (parent) c.parentId = parent;
        }
      }
    }
  } catch (err: any) {
    // Re-throw with more context if it's already an Error we created
    if (err.message.includes('Invalid Word document') || err.message.includes('File not found')) {
      throw err;
    }
    throw new Error(`Error extracting comments from ${path.basename(docxPath)}: ${err.message}`);
  }

  return comments;
}
227
+
228
/**
 * Extract comment anchor texts from document.xml with surrounding context.
 *
 * Returns a map of comment ID -> {anchor, before, after, docPosition, isEmpty}
 * for better matching, plus `fullDocText` (the concatenation of every decoded
 * `<w:t>` node) for section boundary matching. All positions are offsets into
 * `fullDocText`, not into the raw XML.
 *
 * @param docxPath Path to the .docx file.
 * @returns Anchors map and the concatenated document text. On any error the
 *          function logs and returns whatever was collected (empty text).
 */
export async function extractCommentAnchors(docxPath: string): Promise<CommentAnchorsResult> {
  const AdmZip = (await import('adm-zip')).default;
  const anchors = new Map<string, CommentAnchorData>();
  let fullDocText = '';

  try {
    const zip = new AdmZip(docxPath);
    const docEntry = zip.getEntry('word/document.xml');

    if (!docEntry) {
      return { anchors, fullDocText };
    }

    const docXml = docEntry.getData().toString('utf8');

    // ========================================
    // STEP 1: Build text position mapping
    // ========================================
    // Record where every <w:t> node lives in the XML and where its decoded
    // text lands in the concatenated document text.
    const textNodePattern = /<w:t[^>]*>([^<]*)<\/w:t>/g;
    const textNodes: TextNode[] = [];
    let textPosition = 0;
    let nodeMatch;

    while ((nodeMatch = textNodePattern.exec(docXml)) !== null) {
      const rawText = nodeMatch[1] ?? '';
      const decodedText = decodeXmlEntities(rawText);
      textNodes.push({
        xmlStart: nodeMatch.index,
        xmlEnd: nodeMatch.index + nodeMatch[0].length,
        textStart: textPosition,
        textEnd: textPosition + decodedText.length,
        text: decodedText
      });
      textPosition += decodedText.length;
    }

    fullDocText = textNodes.map(n => n.text).join('');

    // Helper: convert XML position to text position.
    // A position inside or before a node maps to that node's textStart;
    // a position past the last node maps to the end of the document text.
    function xmlPosToTextPos(xmlPos: number): number {
      for (let i = 0; i < textNodes.length; i++) {
        const node = textNodes[i];
        if (!node) continue;
        if (xmlPos >= node.xmlStart && xmlPos < node.xmlEnd) {
          return node.textStart;
        }
        if (xmlPos < node.xmlStart) {
          return node.textStart;
        }
      }
      const lastNode = textNodes[textNodes.length - 1];
      return lastNode ? lastNode.textEnd : 0;
    }

    // Helper: extract context before a position.
    // Prefers the start of the current sentence; otherwise the last 80 chars.
    function getContextBefore(position: number, maxLength: number = 150): string {
      const beforeText = fullDocText.slice(Math.max(0, position - maxLength), position);
      const sentenceStart = beforeText.search(/[.!?]\s+[A-Z][^.!?]*$/);
      return sentenceStart >= 0
        ? beforeText.slice(sentenceStart + 2).trim()
        : beforeText.slice(-80).trim();
    }

    // Helper: extract context after a position.
    // Prefers up to the end of the current sentence; otherwise 80 chars.
    function getContextAfter(position: number, maxLength: number = 150): string {
      const afterText = fullDocText.slice(position, position + maxLength);
      const sentenceEnd = afterText.search(/[.!?]\s/);
      return sentenceEnd >= 0
        ? afterText.slice(0, sentenceEnd + 1).trim()
        : afterText.slice(0, 80).trim();
    }

    // ========================================
    // STEP 2: Collect all start/end markers separately
    // ========================================
    const startPattern = /<w:commentRangeStart[^>]*w:id="(\d+)"[^>]*\/?>/g;
    const endPattern = /<w:commentRangeEnd[^>]*w:id="(\d+)"[^>]*\/?>/g;

    const starts = new Map<string, number>(); // id -> position after start tag
    const ends = new Map<string, number>();   // id -> position before end tag

    // Only the first occurrence of each id is kept (duplicates are ignored).
    let match;
    while ((match = startPattern.exec(docXml)) !== null) {
      const id = match[1];
      if (!starts.has(id)) {
        starts.set(id, match.index + match[0].length);
      }
    }

    while ((match = endPattern.exec(docXml)) !== null) {
      const id = match[1];
      if (!ends.has(id)) {
        ends.set(id, match.index);
      }
    }

    // ========================================
    // STEP 3: Process each comment range by ID
    // ========================================
    for (const [id, startXmlPos] of starts) {
      const endXmlPos = ends.get(id);

      // Missing end marker - skip with warning
      if (endXmlPos === undefined) {
        console.warn(`Comment ${id}: missing end marker`);
        continue;
      }

      // Calculate text position
      const docPosition = xmlPosToTextPos(startXmlPos);

      // Handle empty or inverted ranges
      if (endXmlPos <= startXmlPos) {
        anchors.set(id, {
          anchor: '',
          before: getContextBefore(docPosition),
          after: getContextAfter(docPosition),
          docPosition,
          docLength: fullDocText.length,
          isEmpty: true
        });
        continue;
      }

      // Extract XML segment between markers
      const segment = docXml.slice(startXmlPos, endXmlPos);

      // Extract text from w:t (regular) AND w:delText (deleted text in track changes)
      const textInRangePattern = /<w:t[^>]*>([^<]*)<\/w:t>|<w:delText[^>]*>([^<]*)<\/w:delText>/g;
      let anchorText = '';
      let tm;
      while ((tm = textInRangePattern.exec(segment)) !== null) {
        anchorText += tm[1] || tm[2] || '';
      }
      anchorText = decodeXmlEntities(anchorText);

      // Get context
      const anchorLength = anchorText.length;
      const before = getContextBefore(docPosition);
      const after = getContextAfter(docPosition + anchorLength);

      // ALWAYS add entry (even if anchor is empty)
      anchors.set(id, {
        anchor: anchorText.trim(),
        before,
        after,
        docPosition,
        docLength: fullDocText.length,
        isEmpty: !anchorText.trim()
      });
    }
  } catch (err: any) {
    // Best-effort API: log and return an empty result instead of throwing.
    console.error('Error extracting comment anchors:', err.message);
    return { anchors, fullDocText: '' };
  }

  return { anchors, fullDocText };
}
391
+
392
+ /**
393
+ * Extract heading paragraphs from a docx, with their text positions in the
394
+ * same coordinate system as `extractCommentAnchors`'s `fullDocText` and
395
+ * `CommentAnchorData.docPosition`.
396
+ *
397
+ * Headings are paragraphs whose `<w:pStyle>` is a Heading style. Reading
398
+ * styles directly is more reliable than keyword-matching the concatenated
399
+ * body text — there, paragraph boundaries are gone, so the literal string
400
+ * "Methods" can appear inside prose ("results across countries") and the
401
+ * structured-abstract label "Methods:" loses its colon when text runs are
402
+ * concatenated.
403
+ */
404
+ export async function extractHeadings(docxPath: string): Promise<DocxHeading[]> {
405
+ const AdmZip = (await import('adm-zip')).default;
406
+
407
+ if (!fs.existsSync(docxPath)) {
408
+ throw new Error(`File not found: ${docxPath}`);
409
+ }
410
+
411
+ const zip = new AdmZip(docxPath);
412
+ const docEntry = zip.getEntry('word/document.xml');
413
+ if (!docEntry) return [];
414
+ const xml = docEntry.getData().toString('utf8');
415
+
416
+ // Build the same xml-pos → text-pos mapping that extractCommentAnchors does
417
+ const textNodePattern = /<w:t[^>]*>([^<]*)<\/w:t>/g;
418
+ const nodes: Array<{ xmlStart: number; xmlEnd: number; textStart: number; textEnd: number }> = [];
419
+ let textPos = 0;
420
+ let m;
421
+ while ((m = textNodePattern.exec(xml)) !== null) {
422
+ const decoded = decodeXmlEntities(m[1] ?? '');
423
+ nodes.push({
424
+ xmlStart: m.index,
425
+ xmlEnd: m.index + m[0].length,
426
+ textStart: textPos,
427
+ textEnd: textPos + decoded.length,
428
+ });
429
+ textPos += decoded.length;
430
+ }
431
+
432
+ function xmlToTextPos(xmlPos: number): number {
433
+ for (const n of nodes) {
434
+ if (xmlPos >= n.xmlStart && xmlPos < n.xmlEnd) return n.textStart;
435
+ if (xmlPos < n.xmlStart) return n.textStart;
436
+ }
437
+ return nodes.length ? nodes[nodes.length - 1].textEnd : 0;
438
+ }
439
+
440
+ const headings: DocxHeading[] = [];
441
+ const paraPattern = /<w:p\b[^>]*>([\s\S]*?)<\/w:p>/g;
442
+ let pm;
443
+ while ((pm = paraPattern.exec(xml)) !== null) {
444
+ const inner = pm[1];
445
+ const styleMatch = inner.match(/<w:pStyle[^>]*w:val="([^"]+)"/);
446
+ if (!styleMatch) continue;
447
+ const style = styleMatch[1];
448
+ if (!/heading/i.test(style)) continue;
449
+
450
+ // Concatenate text runs; include w:delText so a heading inside a tracked
451
+ // deletion is still surfaced (verifying anchors against an original draft)
452
+ const textInRange = /<w:t[^>]*>([^<]*)<\/w:t>|<w:delText[^>]*>([^<]*)<\/w:delText>/g;
453
+ let txt = '';
454
+ let tm;
455
+ while ((tm = textInRange.exec(inner)) !== null) {
456
+ txt += decodeXmlEntities(tm[1] || tm[2] || '');
457
+ }
458
+ const trimmed = txt.trim();
459
+ if (!trimmed) continue;
460
+
461
+ const levelMatch = style.match(/(\d+)/);
462
+ const level = levelMatch ? parseInt(levelMatch[1], 10) : 0;
463
+ headings.push({
464
+ style,
465
+ level,
466
+ text: trimmed,
467
+ docPosition: xmlToTextPos(pm.index),
468
+ });
469
+ }
470
+
471
+ return headings;
472
+ }
473
+
474
+ /**
475
+ * Decode XML entities in text
476
+ */
477
+ function decodeXmlEntities(text: string): string {
478
+ return text
479
+ .replace(/&amp;/g, '&')
480
+ .replace(/&lt;/g, '<')
481
+ .replace(/&gt;/g, '>')
482
+ .replace(/&quot;/g, '"')
483
+ .replace(/&apos;/g, "'")
484
+ .replace(/&#(\d+);/g, (_, code) => String.fromCharCode(parseInt(code, 10)))
485
+ .replace(/&#x([0-9a-fA-F]+);/g, (_, code) => String.fromCharCode(parseInt(code, 16)));
486
+ }
487
+
488
+ /**
489
+ * Extract text content from a Word XML cell
490
+ */
491
+ function extractCellText(cellXml: string): string {
492
+ const parts: string[] = [];
493
+
494
+ // Check for OMML math - replace with [math] placeholder
495
+ if (cellXml.includes('<m:oMath')) {
496
+ // Try to extract the text representation of math
497
+ const mathTextMatches = cellXml.match(/<m:t>([^<]*)<\/m:t>/g) || [];
498
+ if (mathTextMatches.length > 0) {
499
+ const mathText = mathTextMatches.map((t) => t.replace(/<[^>]+>/g, '')).join('');
500
+ parts.push(mathText);
501
+ } else {
502
+ parts.push('[math]');
503
+ }
504
+ }
505
+
506
+ // Extract regular text from w:t elements
507
+ const textMatches = cellXml.match(/<w:t[^>]*>([^<]*)<\/w:t>/g) || [];
508
+ for (const match of textMatches) {
509
+ const text = match.replace(/<[^>]+>/g, '');
510
+ if (text) {
511
+ parts.push(text);
512
+ }
513
+ }
514
+
515
+ let result = parts.join('').trim();
516
+ result = decodeXmlEntities(result);
517
+
518
+ // Escape pipe characters in cell content (would break table)
519
+ result = result.replace(/\|/g, '\\|');
520
+
521
+ return result;
522
+ }
523
+
524
+ /**
525
+ * Parse a table row, handling merged cells (gridSpan)
526
+ */
527
+ function parseTableRow(rowXml: string, expectedCols: number): ParsedRow {
528
+ // Match cells - handle both <w:tc> and <w:tc ...>
529
+ const cellMatches = rowXml.match(/<w:tc(?:\s[^>]*)?>[\s\S]*?<\/w:tc>/g) || [];
530
+ const cells: string[] = [];
531
+ const colSpans: number[] = [];
532
+
533
+ for (const cellXml of cellMatches) {
534
+ // Check for horizontal merge (gridSpan)
535
+ const gridSpanMatch = cellXml.match(/<w:gridSpan\s+w:val="(\d+)"/);
536
+ const span = gridSpanMatch ? parseInt(gridSpanMatch[1], 10) : 1;
537
+
538
+ // Check for vertical merge continuation (vMerge without restart)
539
+ // If vMerge is present without w:val="restart", it's a continuation - use empty
540
+ const vMergeMatch = cellXml.match(/<w:vMerge(?:\s+w:val="([^"]+)")?/);
541
+ const isVMergeContinuation = vMergeMatch && vMergeMatch[1] !== 'restart';
542
+
543
+ const cellText = isVMergeContinuation ? '' : extractCellText(cellXml);
544
+
545
+ // Add the cell content
546
+ cells.push(cellText);
547
+ colSpans.push(span);
548
+
549
+ // For gridSpan > 1, add empty cells to maintain column alignment
550
+ for (let i = 1; i < span; i++) {
551
+ cells.push('');
552
+ colSpans.push(0); // 0 indicates this is a spanned cell
553
+ }
554
+ }
555
+
556
+ return { cells, colSpans };
557
+ }
558
+
559
+ /**
560
+ * Determine table grid column count from table XML
561
+ */
562
+ function getTableGridCols(tableXml: string): number {
563
+ // Try to get from tblGrid
564
+ const gridColMatches = tableXml.match(/<w:gridCol/g) || [];
565
+ if (gridColMatches.length > 0) {
566
+ return gridColMatches.length;
567
+ }
568
+
569
+ // Fallback: count max cells in any row
570
+ const rowMatches = tableXml.match(/<w:tr[\s\S]*?<\/w:tr>/g) || [];
571
+ let maxCols = 0;
572
+ for (const rowXml of rowMatches) {
573
+ const { cells } = parseTableRow(rowXml, 0);
574
+ maxCols = Math.max(maxCols, cells.length);
575
+ }
576
+ return maxCols;
577
+ }
578
+
579
+ /**
580
+ * Extract tables directly from Word document XML and convert to markdown pipe tables
581
+ */
582
+ export async function extractWordTables(docxPath: string): Promise<WordTable[]> {
583
+ const AdmZip = (await import('adm-zip')).default;
584
+ const tables: WordTable[] = [];
585
+
586
+ try {
587
+ const zip = new AdmZip(docxPath);
588
+ const docEntry = zip.getEntry('word/document.xml');
589
+
590
+ if (!docEntry) {
591
+ return tables;
592
+ }
593
+
594
+ const xml = docEntry.getData().toString('utf8');
595
+
596
+ // Find all table elements
597
+ const tableMatches = xml.match(/<w:tbl>[\s\S]*?<\/w:tbl>/g) || [];
598
+
599
+ for (const tableXml of tableMatches) {
600
+ // Determine expected column count from grid
601
+ const expectedCols = getTableGridCols(tableXml);
602
+
603
+ // Extract rows
604
+ const rowMatches = tableXml.match(/<w:tr[\s\S]*?<\/w:tr>/g) || [];
605
+ const rows: string[][] = [];
606
+
607
+ for (const rowXml of rowMatches) {
608
+ const { cells } = parseTableRow(rowXml, expectedCols);
609
+ if (cells.length > 0) {
610
+ rows.push(cells);
611
+ }
612
+ }
613
+
614
+ if (rows.length > 0) {
615
+ // Convert to markdown pipe table
616
+ const markdown = convertRowsToMarkdownTable(rows);
617
+ tables.push({ markdown, rowCount: rows.length, colCount: expectedCols || rows[0]?.length || 0 });
618
+ }
619
+ }
620
+ } catch (err: any) {
621
+ console.error('Error extracting tables from Word:', err.message);
622
+ }
623
+
624
+ return tables;
625
+ }
626
+
627
+ /**
628
+ * Convert array of rows (each row is array of cell strings) to markdown pipe table
629
+ */
630
+ function convertRowsToMarkdownTable(rows: string[][]): string {
631
+ if (rows.length === 0) return '';
632
+
633
+ // Normalize column count (use max across all rows)
634
+ const colCount = Math.max(...rows.map((r) => r.length));
635
+
636
+ // Pad rows to have consistent column count
637
+ const normalizedRows = rows.map((row) => {
638
+ while (row.length < colCount) {
639
+ row.push('');
640
+ }
641
+ return row;
642
+ });
643
+
644
+ // Build markdown table
645
+ const lines: string[] = [];
646
+
647
+ // Header row
648
+ const header = normalizedRows[0];
649
+ lines.push('| ' + header.join(' | ') + ' |');
650
+
651
+ // Separator row
652
+ lines.push('|' + header.map(() => '---').join('|') + '|');
653
+
654
+ // Data rows
655
+ for (let i = 1; i < normalizedRows.length; i++) {
656
+ lines.push('| ' + normalizedRows[i].join(' | ') + ' |');
657
+ }
658
+
659
+ return lines.join('\n');
660
+ }
661
+
662
+ /**
663
+ * Extract text from Word document using pandoc with track changes preserved
664
+ */
665
+ export async function extractFromWord(
666
+ docxPath: string,
667
+ options: ExtractFromWordOptions = {}
668
+ ): Promise<ExtractFromWordResult> {
669
+ let text: string;
670
+ let messages: ExtractMessage[] = [];
671
+ let extractedMedia: string[] = [];
672
+ let hasTrackChanges = false;
673
+ let trackChangeStats = { insertions: 0, deletions: 0 };
674
+
675
+ // Determine media extraction directory
676
+ const docxDir = path.dirname(docxPath);
677
+ const mediaDir = options.mediaDir || path.join(docxDir, 'media');
678
+
679
+ // Skip media extraction if figures already exist (e.g., when re-importing with existing source)
680
+ const skipMediaExtraction = options.skipMediaExtraction || false;
681
+
682
+ // Extract tables directly from Word XML (reliable, no heuristics)
683
+ const wordTables = await extractWordTables(docxPath);
684
+
685
+ // Try pandoc first with --track-changes=all to preserve reviewer edits
686
+ try {
687
+ // Build pandoc command
688
+ let pandocCmd = `pandoc "${docxPath}" -t markdown --wrap=none --track-changes=all`;
689
+ if (!skipMediaExtraction) {
690
+ pandocCmd += ` --extract-media="${mediaDir}"`;
691
+ }
692
+
693
+ const { stdout } = await execAsync(pandocCmd, { maxBuffer: 50 * 1024 * 1024 });
694
+ text = stdout;
695
+
696
+ // Convert pandoc's track change format to CriticMarkup
697
+ const origLength = text.length;
698
+
699
+ // Use a more robust pattern that handles nested content
700
+ text = text.replace(/\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]\{\.insertion[^}]*\}/g, (match, content) => {
701
+ if (content.trim()) {
702
+ trackChangeStats.insertions++;
703
+ return `{++${content}++}`;
704
+ }
705
+ return ''; // Empty insertions are removed
706
+ });
707
+
708
+ text = text.replace(/\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]\{\.deletion[^}]*\}/g, (match, content) => {
709
+ if (content.trim()) {
710
+ trackChangeStats.deletions++;
711
+ return `{--${content}--}`;
712
+ }
713
+ return ''; // Empty deletions are removed
714
+ });
715
+
716
+ // Handle any remaining pandoc track change patterns
717
+ let prevText;
718
+ do {
719
+ prevText = text;
720
+ text = text.replace(/\[([^\]]*)\]\{\.insertion[^}]*\}/g, (match, content) => {
721
+ if (content.trim()) {
722
+ trackChangeStats.insertions++;
723
+ return `{++${content}++}`;
724
+ }
725
+ return '';
726
+ });
727
+ text = text.replace(/\[([^\]]*)\]\{\.deletion[^}]*\}/g, (match, content) => {
728
+ if (content.trim()) {
729
+ trackChangeStats.deletions++;
730
+ return `{--${content}--}`;
731
+ }
732
+ return '';
733
+ });
734
+ } while (text !== prevText);
735
+
736
+ // Handle pandoc comment patterns - remove comment text from body
737
+ text = text.replace(/\[[^\]]*\]\{\.comment-start[^}]*\}/g, '');
738
+ text = text.replace(/\[\]\{\.comment-end[^}]*\}/g, '');
739
+
740
+ // Also handle {.mark} spans
741
+ text = text.replace(/\[([^\]]*)\]\{\.mark\}/g, '$1');
742
+
743
+ hasTrackChanges = trackChangeStats.insertions > 0 || trackChangeStats.deletions > 0;
744
+
745
+ if (hasTrackChanges) {
746
+ messages.push({
747
+ type: 'info',
748
+ message: `Found ${trackChangeStats.insertions} insertion(s) and ${trackChangeStats.deletions} deletion(s) from track changes`
749
+ });
750
+ }
751
+
752
+ // Find extracted media files
753
+ const mediaSubdir = path.join(mediaDir, 'media');
754
+ if (fs.existsSync(mediaSubdir)) {
755
+ extractedMedia = fs.readdirSync(mediaSubdir)
756
+ .filter(f => /\.(png|jpg|jpeg|gif|svg|emf|wmf|tiff?)$/i.test(f))
757
+ .map(f => path.join(mediaSubdir, f));
758
+
759
+ if (extractedMedia.length > 0) {
760
+ messages.push({
761
+ type: 'info',
762
+ message: `Extracted ${extractedMedia.length} image(s) to ${mediaSubdir}`
763
+ });
764
+ }
765
+ }
766
+ } catch (pandocErr: any) {
767
+ // Pandoc not available — use XML-based extraction with track change support
768
+ const { extractPlainTextWithTrackChanges } = await import('./word.js');
769
+ const { getInstallInstructions } = await import('./dependencies.js');
770
+ const installCmd = getInstallInstructions('pandoc');
771
+
772
+ const xmlResult = await extractPlainTextWithTrackChanges(docxPath);
773
+ text = xmlResult.text;
774
+ hasTrackChanges = xmlResult.hasTrackChanges;
775
+ trackChangeStats = xmlResult.stats;
776
+
777
+ if (hasTrackChanges) {
778
+ messages.push({
779
+ type: 'warning',
780
+ message: `Pandoc not installed. Using built-in XML extractor (${trackChangeStats.insertions} insertions, ${trackChangeStats.deletions} deletions preserved). Formatting may differ. Install pandoc for best results: ${installCmd}`
781
+ });
782
+ } else {
783
+ messages.push({
784
+ type: 'warning',
785
+ message: `Pandoc not installed. Using built-in XML extractor (no track changes found). Install pandoc for better formatting: ${installCmd}`
786
+ });
787
+ }
788
+ }
789
+
790
+ // Extract comments directly from docx XML
791
+ const comments = await extractWordComments(docxPath);
792
+
793
+ // Extract comment anchor texts
794
+ const { anchors } = await extractCommentAnchors(docxPath);
795
+
796
+ return {
797
+ text,
798
+ comments,
799
+ anchors,
800
+ messages,
801
+ extractedMedia,
802
+ tables: wordTables,
803
+ hasTrackChanges,
804
+ trackChangeStats,
805
+ };
806
+ }