npm - docrev - Versions diffs - 0.9.6 → 0.9.7 - Mend

docrev 0.9.6 → 0.9.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43)

package/CHANGELOG.md +20 -0
package/dev_notes/bug_repro_comment_parser.md +71 -0
package/dist/lib/anchor-match.d.ts +41 -0
package/dist/lib/anchor-match.d.ts.map +1 -0
package/dist/lib/anchor-match.js +192 -0
package/dist/lib/anchor-match.js.map +1 -0
package/dist/lib/annotations.d.ts.map +1 -1
package/dist/lib/annotations.js +8 -5
package/dist/lib/annotations.js.map +1 -1
package/dist/lib/commands/index.d.ts +2 -1
package/dist/lib/commands/index.d.ts.map +1 -1
package/dist/lib/commands/index.js +3 -1
package/dist/lib/commands/index.js.map +1 -1
package/dist/lib/commands/section-boundaries.d.ts +22 -0
package/dist/lib/commands/section-boundaries.d.ts.map +1 -0
package/dist/lib/commands/section-boundaries.js +53 -0
package/dist/lib/commands/section-boundaries.js.map +1 -0
package/dist/lib/commands/sync.d.ts.map +1 -1
package/dist/lib/commands/sync.js +135 -0
package/dist/lib/commands/sync.js.map +1 -1
package/dist/lib/commands/verify-anchors.d.ts +17 -0
package/dist/lib/commands/verify-anchors.d.ts.map +1 -0
package/dist/lib/commands/verify-anchors.js +215 -0
package/dist/lib/commands/verify-anchors.js.map +1 -0
package/dist/lib/import.d.ts +14 -8
package/dist/lib/import.d.ts.map +1 -1
package/dist/lib/import.js +16 -144
package/dist/lib/import.js.map +1 -1
package/dist/lib/word-extraction.d.ts +23 -0
package/dist/lib/word-extraction.d.ts.map +1 -1
package/dist/lib/word-extraction.js +79 -0
package/dist/lib/word-extraction.js.map +1 -1
package/lib/anchor-match.ts +238 -0
package/lib/annotations.ts +9 -5
package/lib/commands/index.ts +3 -0
package/lib/commands/section-boundaries.ts +72 -0
package/lib/commands/sync.ts +165 -0
package/lib/commands/verify-anchors.ts +261 -0
package/lib/import.ts +29 -165
package/lib/word-extraction.ts +93 -0
package/package.json +1 -1
package/skill/REFERENCE.md +29 -2
package/skill/SKILL.md +12 -2

package/lib/anchor-match.ts ADDED Viewed

@@ -0,0 +1,238 @@
+/**
+ * Anchor matching primitives shared between sync (insertion) and
+ * verify-anchors (drift reporting). The functions are pure: given an
+ * anchor string and surrounding context, locate candidate positions in
+ * a target text using progressively looser strategies.
+ */
+/** Identifies which matching strategy in `findAnchorInText` produced a result. */
+export type AnchorStrategy =
+  | 'direct'                  // case-insensitive substring match of the whole anchor
+  | 'normalized'              // matched after collapsing whitespace runs
+  | 'stripped'                // matched against the CriticMarkup-stripped text
+  | 'partial-start'           // only the first few words of the anchor matched
+  | 'partial-start-stripped'  // first few words matched in the stripped text
+  | 'context-both'            // located via both `before` and `after` context
+  | 'context-before'          // located via `before` context only
+  | 'context-after'           // located via `after` context only
+  | 'split-match'             // a fragment of the anchor (split on a separator) matched
+  | 'empty-anchor'            // anchor was blank and context matching found nothing
+  | 'failed';                 // every strategy was exhausted without a hit
+/** Outcome of an anchor search performed by `findAnchorInText`. */
+export interface AnchorSearchResult {
+  /** Candidate start indices; empty when nothing matched. */
+  occurrences: number[];
+  /** The anchor text (or the prefix/fragment of it) that matched; null for context-only and unmatched results. */
+  matchedAnchor: string | null;
+  /** Strategy that produced `occurrences`. */
+  strategy: AnchorStrategy;
+  /** Set to true by the strategies that search the CriticMarkup-stripped text. */
+  stripped?: boolean;
+}
+/**
+ * Strip CriticMarkup so the matcher sees plain prose instead of
+ * `{++inserted++}`/`{--deleted--}`/etc. Used when an anchor lives
+ * underneath previously imported track changes.
+ *
+ * Every pattern uses a lazy `[\s\S]*?` body rather than a negated
+ * character class, so marked-up text may itself contain the delimiter
+ * character (`{--well-known--}`, `{++C++ code++}`, `{~~~5~>~10~~}`) and
+ * may span multiple lines.
+ *
+ * @param text Markdown that may contain CriticMarkup annotations.
+ * @returns The text as it reads once all changes are accepted and
+ *   comments are dropped.
+ */
+export function stripCriticMarkup(text: string): string {
+  return text
+    .replace(/\{\+\+([\s\S]*?)\+\+\}/g, '$1')            // insertions: keep new text
+    .replace(/\{--[\s\S]*?--\}/g, '')                    // deletions: remove old text
+    .replace(/\{~~[\s\S]*?~>([\s\S]*?)~~\}/g, '$1')      // substitutions: keep new text
+    .replace(/\{>>[\s\S]*?<<\}/g, '')                    // comments: remove (comment text may contain '<')
+    .replace(/\[([^\]]*)\]\{\.mark\}/g, '$1');           // marked text: keep text
+}
+/**
+ * Collect the start index of every occurrence of `needle` inside
+ * `haystack`, overlapping occurrences included. An empty needle yields
+ * an empty list, since an empty match cannot place an anchor.
+ */
+export function findAllOccurrences(haystack: string, needle: string): number[] {
+  const hits: number[] = [];
+  if (!needle) return hits;
+  // Advance one character past each hit so overlapping matches are found too.
+  for (let at = haystack.indexOf(needle); at !== -1; at = haystack.indexOf(needle, at + 1)) {
+    hits.push(at);
+  }
+  return hits;
+}
+/** Escape regex metacharacters so `s` matches literally inside a RegExp. */
+function escapeAnchorForRegExp(s: string): string {
+  return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+}
+/**
+ * Locate an insertion point from surrounding context alone.
+ *
+ * Tries, in order: the tail of `before` followed within 500 characters
+ * by the head of `after`; the last occurrence of the tail of `before`;
+ * the first occurrence of the head of `after`. Returns null when no
+ * context was supplied or none of it is present in `textLower`.
+ */
+function findAnchorByContext(
+  textLower: string,
+  before: string,
+  after: string
+): AnchorSearchResult | null {
+  const beforeLower = (before || '').toLowerCase();
+  const afterLower = (after || '').toLowerCase();
+  if (beforeLower && afterLower) {
+    const beforeTail = beforeLower.slice(-50);
+    const beforeIdx = textLower.indexOf(beforeTail);
+    if (beforeIdx !== -1) {
+      const searchStart = beforeIdx + beforeTail.length;
+      const afterIdx = textLower.indexOf(afterLower.slice(0, 50), searchStart);
+      if (afterIdx !== -1 && afterIdx - searchStart < 500) {
+        return { occurrences: [searchStart], matchedAnchor: null, strategy: 'context-both' };
+      }
+    }
+  }
+  if (beforeLower) {
+    const beforeTail = beforeLower.slice(-30);
+    const beforeIdx = textLower.lastIndexOf(beforeTail);
+    if (beforeIdx !== -1) {
+      return {
+        occurrences: [beforeIdx + beforeTail.length],
+        matchedAnchor: null,
+        strategy: 'context-before',
+      };
+    }
+  }
+  if (afterLower) {
+    const afterIdx = textLower.indexOf(afterLower.slice(0, 30));
+    if (afterIdx !== -1) {
+      return { occurrences: [afterIdx], matchedAnchor: null, strategy: 'context-after' };
+    }
+  }
+  return null;
+}
+/**
+ * Find candidate positions for `anchor` in `text`, falling back through
+ * progressively looser strategies (whitespace normalization, stripped
+ * CriticMarkup, partial-prefix, surrounding context, word splitting).
+ *
+ * The returned `strategy` lets callers distinguish a clean direct hit
+ * from a fuzzy approximation — useful for drift reporting.
+ *
+ * Positions index into `text`, except for results flagged
+ * `stripped: true`, which index into `stripCriticMarkup(text)`. The
+ * 'normalized' strategy reports the offset in the original text, not in
+ * a whitespace-collapsed copy.
+ *
+ * @param anchor Text the comment was attached to; may be empty.
+ * @param text Target text to search.
+ * @param before Context preceding the anchor in the source document.
+ * @param after Context following the anchor in the source document.
+ * @returns Candidate positions plus the strategy that found them;
+ *   `occurrences` is empty for 'empty-anchor' and 'failed'.
+ */
+export function findAnchorInText(
+  anchor: string,
+  text: string,
+  before: string = '',
+  after: string = ''
+): AnchorSearchResult {
+  const textLower = text.toLowerCase();
+  // Empty anchor: skip directly to context-based matching
+  if (!anchor || anchor.trim().length === 0) {
+    const byContext = findAnchorByContext(textLower, before, after);
+    if (byContext) return byContext;
+    return { occurrences: [], matchedAnchor: null, strategy: 'empty-anchor' };
+  }
+  const anchorLower = anchor.toLowerCase();
+  // Strategy 1: direct match
+  let occurrences = findAllOccurrences(textLower, anchorLower);
+  if (occurrences.length > 0) {
+    return { occurrences, matchedAnchor: anchor, strategy: 'direct' };
+  }
+  // Strategy 2: normalized whitespace. Any whitespace run in the anchor may
+  // match any whitespace run in the text; searching the original text keeps
+  // the reported index in the same coordinate system as the other strategies.
+  const flexiblePattern = anchor.split(/\s+/).map(escapeAnchorForRegExp).join('\\s+');
+  const normalizedHit = new RegExp(flexiblePattern, 'i').exec(text);
+  if (normalizedHit) {
+    return { occurrences: [normalizedHit.index], matchedAnchor: anchor, strategy: 'normalized' };
+  }
+  // Strategy 3: match in stripped CriticMarkup version
+  const strippedLower = stripCriticMarkup(text).toLowerCase();
+  occurrences = findAllOccurrences(strippedLower, anchorLower);
+  if (occurrences.length > 0) {
+    return { occurrences, matchedAnchor: anchor, strategy: 'stripped', stripped: true };
+  }
+  // Strategy 4: first N words of anchor (long anchors), longest prefix first
+  const words = anchor.split(/\s+/);
+  if (words.length > 3) {
+    for (let n = Math.min(6, words.length); n >= 3; n--) {
+      const prefix = words.slice(0, n).join(' ');
+      const prefixLower = prefix.toLowerCase();
+      // Very short prefixes are too ambiguous to be useful.
+      if (prefixLower.length < 15) continue;
+      occurrences = findAllOccurrences(textLower, prefixLower);
+      if (occurrences.length > 0) {
+        return { occurrences, matchedAnchor: prefix, strategy: 'partial-start' };
+      }
+      occurrences = findAllOccurrences(strippedLower, prefixLower);
+      if (occurrences.length > 0) {
+        return {
+          occurrences,
+          matchedAnchor: prefix,
+          strategy: 'partial-start-stripped',
+          stripped: true,
+        };
+      }
+    }
+  }
+  // Strategy 5: context (before/after) only
+  const byContext = findAnchorByContext(textLower, before, after);
+  if (byContext) return byContext;
+  // Strategy 6: split anchor on transition characters
+  const splitPatterns = [' ', ', ', '. ', ' - ', ' – '];
+  for (const sep of splitPatterns) {
+    if (anchor.includes(sep)) {
+      const parts = anchor.split(sep).filter(p => p.length >= 4);
+      for (const part of parts) {
+        occurrences = findAllOccurrences(textLower, part.toLowerCase());
+        // Accept only fragments rare enough (fewer than 5 hits) to be meaningful.
+        if (occurrences.length > 0 && occurrences.length < 5) {
+          return { occurrences, matchedAnchor: part, strategy: 'split-match' };
+        }
+      }
+    }
+  }
+  return { occurrences: [], matchedAnchor: null, strategy: 'failed' };
+}
+/**
+ * Map a match strategy onto a coarse quality bucket: a clean hit, a
+ * fuzzy/drifted hit, a context-only placement, or no hit at all.
+ * `verify-anchors` uses this to summarize per-comment match quality.
+ */
+export type AnchorMatchQuality = 'clean' | 'drift' | 'context-only' | 'unmatched';
+export function classifyStrategy(strategy: AnchorStrategy, occurrences: number): AnchorMatchQuality {
+  // No positions means no match, whatever strategy was reported.
+  if (occurrences === 0) return 'unmatched';
+  if (strategy === 'direct' || strategy === 'normalized') return 'clean';
+  const drifted: AnchorStrategy[] = ['stripped', 'partial-start', 'partial-start-stripped', 'split-match'];
+  if (drifted.includes(strategy)) return 'drift';
+  const contextual: AnchorStrategy[] = ['context-both', 'context-before', 'context-after'];
+  if (contextual.includes(strategy)) return 'context-only';
+  // 'empty-anchor', 'failed', and anything unrecognized
+  return 'unmatched';
+}

package/lib/annotations.ts CHANGED Viewed

@@ -91,16 +91,20 @@ function isCommentFalsePositive(commentContent: string, fullText: string, positi
   // Contains markdown figure reference syntax
   if (/\{#fig:|!\[/.test(commentContent)) return true;
-  // Contains URL patterns (likely a link, not a comment)
-  if (/https?:\/\/|www\./i.test(commentContent) && commentContent.length < 150) return true;
+  // Real comments typically have "Author:" at start. Accept hyphens, apostrophes,
+  // periods, and Unicode letters so names like "Jens-Christian Svenning" or
+  // "Camilla T Colding-Jørgensen" don't get rejected. See gcol33/docrev#1.
+  const hasAuthorPrefix = /^[\p{L}][\p{L}\s\-'.]{0,30}:\s/u.test(commentContent.trim());
+  const hasResolvedMark = /^[✓✔]\s/.test(commentContent.trim());
+  // Contains URL patterns (likely a link, not a comment) — only filter when
+  // there is no real author prefix, since reviewers legitimately cite URLs/DOIs.
+  if (!hasAuthorPrefix && /https?:\/\/|www\./i.test(commentContent) && commentContent.length < 150) return true;
   // Looks like code (contains programming patterns)
   if (/function\s*\(|=>|import\s+|export\s+|const\s+|let\s+|var\s+/.test(commentContent)) return true;
   // Very long without clear author pattern (likely caption, not comment)
-  // Real comments typically have "Author:" at start and are shorter
-  const hasAuthorPrefix = /^[A-Za-z][A-Za-z\s]{0,20}:\s/.test(commentContent.trim());
-  const hasResolvedMark = /^[✓✔]\s/.test(commentContent.trim());
   if (!hasAuthorPrefix && !hasResolvedMark && commentContent.length > MAX_COMMENT_CONTENT_LENGTH) return true;
   // Looks like a figure caption (starts with "Fig" or contains typical caption words)

package/lib/commands/index.ts CHANGED Viewed

@@ -11,6 +11,7 @@ import { register as registerCommentCommands } from './comments.js';
 import { register as registerInitCommands } from './init.js';
 import { register as registerSectionCommands } from './sections.js';
 import { register as registerSyncCommands } from './sync.js';
+import { register as registerVerifyAnchorsCommands } from './verify-anchors.js';
 import { register as registerMergeResolveCommands } from './merge-resolve.js';
 import { register as registerBuildCommands } from './build.js';
 import { register as registerResponseCommands } from './response.js';
@@ -31,6 +32,7 @@ export {
   registerInitCommands,
   registerSectionCommands,
   registerSyncCommands,
+  registerVerifyAnchorsCommands,
   registerMergeResolveCommands,
   registerBuildCommands,
   registerResponseCommands,
@@ -68,6 +70,7 @@ export function registerAllCommands(program: Command, pkg?: PackageJson): void {
   registerInitCommands(program);
   registerSectionCommands(program);
   registerSyncCommands(program);
+  registerVerifyAnchorsCommands(program);
   registerMergeResolveCommands(program);
   registerBuildCommands(program, pkg || {});
   registerResponseCommands(program);

package/lib/commands/section-boundaries.ts ADDED Viewed

@@ -0,0 +1,72 @@
+/**
+ * Compute section boundaries in a DOCX from its real heading paragraphs.
+ *
+ * Given the configured `sections.yaml` and the headings extracted via
+ * `extractHeadings()`, return one boundary per section file with text
+ * positions in the same coordinate system as `CommentAnchorData.docPosition`.
+ *
+ * Matching is by heading text (primary header + aliases, case-insensitive).
+ * This replaces the older keyword-search-in-body-text approach which would
+ * pick up section names that happen to appear inside prose ("results across
+ * countries") or in structured-abstract labels where paragraph boundaries
+ * are lost in concatenation.
+ */
+import type { DocxHeading } from '../word-extraction.js';
+import type { SectionConfig } from '../types.js';
+/** Position range of one section file within the DOCX text. */
+export interface SectionBoundary {
+  /** Section file name (the key from the sections config). */
+  file: string;
+  /** `docPosition` of the heading that matched this section. */
+  start: number;
+  /** Start of the next matched section, or Number.MAX_SAFE_INTEGER for the last one. */
+  end: number;
+}
+/**
+ * Build one boundary per configured section whose header (or an alias)
+ * equals a DOCX heading, compared case-insensitively after trimming.
+ * Sections without a matching heading are omitted. The result is ordered
+ * by start position; each boundary ends where the next begins and the
+ * last one ends at Number.MAX_SAFE_INTEGER.
+ */
+export function computeSectionBoundaries(
+  sections: Record<string, SectionConfig>,
+  headings: DocxHeading[],
+): SectionBoundary[] {
+  // Prefer top-level (level 1) headings when any heading carries level info;
+  // when every level is 0 (unparseable style), consider all headings.
+  const haveLevels = headings.some(h => h.level > 0);
+  const topLevel = haveLevels ? headings.filter(h => h.level === 1) : headings;
+  const result: SectionBoundary[] = [];
+  for (const [file, cfg] of Object.entries(sections)) {
+    const wanted = [cfg.header, ...(cfg.aliases || [])]
+      .filter(Boolean)
+      .map(s => s.toLowerCase().trim());
+    // Position of the first heading in `pool` whose text is a wanted name, else -1.
+    const locate = (pool: DocxHeading[]): number => {
+      const hit = pool.find(h => wanted.includes(h.text.toLowerCase().trim()));
+      return hit ? hit.docPosition : -1;
+    };
+    let start = locate(topLevel);
+    // Fallback: no level-1 hit, so accept a heading of any level.
+    if (start < 0 && haveLevels) {
+      start = locate(headings);
+    }
+    if (start >= 0) {
+      result.push({ file, start, end: Number.MAX_SAFE_INTEGER });
+    }
+  }
+  // Order by position, then close each section at its successor's start.
+  result.sort((a, b) => a.start - b.start);
+  result.forEach((boundary, i) => {
+    if (i + 1 < result.length) {
+      boundary.end = result[i + 1].start;
+    }
+  });
+  return result;
+}

package/lib/commands/sync.ts CHANGED Viewed

@@ -35,6 +35,10 @@ interface SyncOptions {
   diff?: boolean;
   force?: boolean;
   dryRun?: boolean;
+  /** Commander maps `--comments-only` (a positive flag) cleanly. `--no-overwrite`
+   * conflicts with the existing `overwrite` semantics in `--force`-style flags
+   * and Commander's `--no-X` convention assigns `options.x === false`. */
+  commentsOnly?: boolean;
 }
 /**
@@ -57,6 +61,7 @@ export function register(program: Command): void {
     .option('--no-diff', 'Skip showing diff preview')
     .option('--force', 'Overwrite files without conflict warning')
     .option('--dry-run', 'Preview without writing files')
+    .option('--comments-only', 'Insert comments at fuzzy-matched anchors only; never modify existing prose or apply track changes (use when markdown was revised after the docx was sent for review)')
     .action(async (docx: string | undefined, sections: string[], options: SyncOptions) => {
       // Auto-detect most recent docx or pdf if not provided
       if (!docx) {
@@ -137,6 +142,14 @@ export function register(program: Command): void {
         process.exit(1);
       }
+      // --comments-only: import comments only, never modify existing prose.
+      // Use this when the markdown has been revised since the docx was sent
+      // out — track changes from a stale draft would clobber newer edits.
+      if (options.commentsOnly) {
+        await syncCommentsOnly(docx, sections, options, configPath);
+        return;
+      }
       // Check pandoc availability upfront and warn
       const { hasPandoc, getInstallInstructions } = await import('../dependencies.js');
       if (!hasPandoc()) {
@@ -534,3 +547,155 @@ export function register(program: Command): void {
       }
     });
 }
+/**
+ * `sync --comments-only`: import only Word comments at fuzzy-matched anchors.
+ *
+ * Skips the Word→Markdown diff entirely (no track changes, no pandoc, no
+ * prose modifications). Useful when the markdown has been edited after the
+ * docx was sent for review — applying track changes from a stale draft
+ * would overwrite newer edits.
+ *
+ * Exits the process with code 1 when the docx cannot be read, when no
+ * section heading is detected, or when the section filter matches nothing.
+ *
+ * @param docx Path to the Word document holding the review comments.
+ * @param sectionFilter Optional section names from the CLI; a section is kept
+ *   when its file name (without `.md`) equals or contains one of them.
+ * @param options Parsed CLI options; `dir` and `dryRun` are read here.
+ * @param configPath Path handed to `loadConfig` to obtain the section config.
+ */
+async function syncCommentsOnly(
+  docx: string,
+  sectionFilter: string[] | undefined,
+  options: SyncOptions,
+  configPath: string,
+): Promise<void> {
+  const config = loadConfig(configPath);
+  const { extractWordComments, extractCommentAnchors, extractHeadings, insertCommentsIntoMarkdown } = await import('../import.js');
+  const { computeSectionBoundaries } = await import('./section-boundaries.js');
+  const spin = fmt.spinner(`Reading comments from ${path.basename(docx)}...`).start();
+  let comments;
+  let anchors;
+  let headings;
+  try {
+    comments = await extractWordComments(docx);
+    const result = await extractCommentAnchors(docx);
+    anchors = result.anchors;
+    headings = await extractHeadings(docx);
+    spin.stop();
+  } catch (err) {
+    spin.stop();
+    const error = err as Error;
+    console.error(fmt.status('error', error.message));
+    process.exit(1);
+  }
+  console.log(fmt.header(`Comments from ${path.basename(docx)} (comments-only)`));
+  console.log();
+  if (comments.length === 0) {
+    console.log(fmt.status('info', 'No comments found in document.'));
+    return;
+  }
+  const boundaries = computeSectionBoundaries(config.sections, headings);
+  if (boundaries.length === 0) {
+    console.error(fmt.status('warning', 'No section headings detected in Word document.'));
+    console.error(chalk.dim('  Check that headers in sections.yaml match heading paragraphs in the docx.'));
+    process.exit(1);
+  }
+  // Apply optional section filter from CLI
+  let activeBoundaries = boundaries;
+  if (sectionFilter && sectionFilter.length > 0) {
+    const wanted = sectionFilter.map(s => s.trim().toLowerCase());
+    activeBoundaries = boundaries.filter(b => {
+      const base = b.file.replace(/\.md$/i, '').toLowerCase();
+      return wanted.some(name => base === name || base.includes(name));
+    });
+    if (activeBoundaries.length === 0) {
+      console.error(fmt.status('error', `No sections matched: ${sectionFilter.join(', ')}`));
+      process.exit(1);
+    }
+  }
+  const firstBoundary = boundaries[0];
+  const firstBoundaryStart = firstBoundary.start;
+  const results: Array<{ file: string; placed: number; unmatched: number; skipped: boolean }> = [];
+  for (const boundary of activeBoundaries) {
+    const sectionPath = path.join(options.dir, boundary.file);
+    if (!fs.existsSync(sectionPath)) {
+      results.push({ file: boundary.file, placed: 0, unmatched: 0, skipped: true });
+      continue;
+    }
+    // Compare against the first section of the whole document, not the first
+    // one that survived the CLI filter; otherwise comments located before the
+    // first heading would be assigned to an unrelated filtered section.
+    const isFirstSection = boundary === firstBoundary;
+    const sectionComments = comments.filter((c: { id: string }) => {
+      const anchor = anchors.get(c.id);
+      if (!anchor || anchor.docPosition === undefined) return false;
+      if (anchor.docPosition >= boundary.start && anchor.docPosition < boundary.end) return true;
+      // Comments before the first heading land in the first matched section
+      if (isFirstSection && anchor.docPosition < firstBoundaryStart) return true;
+      return false;
+    });
+    if (sectionComments.length === 0) {
+      results.push({ file: boundary.file, placed: 0, unmatched: 0, skipped: false });
+      continue;
+    }
+    const original = fs.readFileSync(sectionPath, 'utf-8');
+    // Count CriticMarkup comments before and after insertion; the difference
+    // is the number of comments actually placed.
+    const commentPattern = /\{>>.*?<<\}/gs;
+    const beforeCount = (original.match(commentPattern) || []).length;
+    const annotated = insertCommentsIntoMarkdown(original, sectionComments, anchors, {
+      quiet: !process.env.DEBUG,
+      sectionBoundary: { start: boundary.start, end: boundary.end },
+      wrapAnchor: false,
+    });
+    const afterCount = (annotated.match(commentPattern) || []).length;
+    const placed = afterCount - beforeCount;
+    const unmatched = sectionComments.length - placed;
+    // Leave the file untouched on a dry run or when nothing was inserted.
+    if (!options.dryRun && placed > 0) {
+      fs.writeFileSync(sectionPath, annotated, 'utf-8');
+    }
+    results.push({ file: boundary.file, placed, unmatched, skipped: false });
+  }
+  const tableRows = results.map(r => {
+    if (r.skipped) {
+      return [chalk.dim(r.file), chalk.yellow('missing'), '', ''];
+    }
+    return [
+      chalk.bold(r.file),
+      chalk.green(`${r.placed}`),
+      r.unmatched > 0 ? chalk.yellow(`${r.unmatched}`) : chalk.dim('-'),
+      chalk.dim('comments only'),
+    ];
+  });
+  console.log(fmt.table(
+    ['File', 'Placed', 'Unmatched', 'Mode'],
+    tableRows,
+    { align: ['left', 'right', 'right', 'left'] },
+  ));
+  console.log();
+  const totalPlaced = results.reduce((s, r) => s + r.placed, 0);
+  const totalUnmatched = results.reduce((s, r) => s + r.unmatched, 0);
+  const lines: string[] = [];
+  lines.push(`${chalk.bold(comments.length)} comments in document`);
+  lines.push(`${chalk.bold(totalPlaced)} placed at fuzzy-matched anchors`);
+  if (totalUnmatched > 0) {
+    lines.push(`${chalk.yellow(totalUnmatched)} unmatched (no anchor in current prose)`);
+  }
+  if (options.dryRun) {
+    lines.push(chalk.yellow('Dry run — no files written'));
+  } else if (totalPlaced > 0) {
+    lines.push(chalk.dim('Existing prose unchanged.'));
+  }
+  console.log(fmt.box(lines.join('\n'), { title: 'Summary', padding: 0 }));
+  if (totalUnmatched > 0) {
+    console.log();
+    console.log(chalk.dim('Tip: run "rev verify-anchors" to see which comments drifted.'));
+  }
+}