npm - docrev - Versions diffs - 0.8.5 → 0.9.3 - Mend

docrev 0.8.5 → 0.9.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60)

package/.gitattributes +1 -0
package/README.md +25 -1
package/dist/lib/annotations.d.ts.map +1 -1
package/dist/lib/annotations.js +6 -0
package/dist/lib/annotations.js.map +1 -1
package/dist/lib/build.d.ts +6 -1
package/dist/lib/build.d.ts.map +1 -1
package/dist/lib/build.js +67 -1
package/dist/lib/build.js.map +1 -1
package/dist/lib/commands/build.d.ts.map +1 -1
package/dist/lib/commands/build.js +26 -7
package/dist/lib/commands/build.js.map +1 -1
package/dist/lib/commands/response.d.ts.map +1 -1
package/dist/lib/commands/response.js +50 -2
package/dist/lib/commands/response.js.map +1 -1
package/dist/lib/commands/sections.d.ts.map +1 -1
package/dist/lib/commands/sections.js +28 -9
package/dist/lib/commands/sections.js.map +1 -1
package/dist/lib/csl.d.ts +38 -0
package/dist/lib/csl.d.ts.map +1 -0
package/dist/lib/csl.js +170 -0
package/dist/lib/csl.js.map +1 -0
package/dist/lib/import.d.ts.map +1 -1
package/dist/lib/import.js +20 -7
package/dist/lib/import.js.map +1 -1
package/dist/lib/journals.d.ts.map +1 -1
package/dist/lib/journals.js +24 -0
package/dist/lib/journals.js.map +1 -1
package/dist/lib/plugins.d.ts +11 -0
package/dist/lib/plugins.d.ts.map +1 -1
package/dist/lib/plugins.js +21 -1
package/dist/lib/plugins.js.map +1 -1
package/dist/lib/pptx-template.d.ts +17 -22
package/dist/lib/pptx-template.d.ts.map +1 -1
package/dist/lib/pptx-template.js +296 -552
package/dist/lib/pptx-template.js.map +1 -1
package/dist/lib/schema.d.ts.map +1 -1
package/dist/lib/schema.js +4 -0
package/dist/lib/schema.js.map +1 -1
package/dist/lib/types.d.ts +19 -1
package/dist/lib/types.d.ts.map +1 -1
package/dist/lib/word.d.ts +24 -11
package/dist/lib/word.d.ts.map +1 -1
package/dist/lib/word.js +233 -32
package/dist/lib/word.js.map +1 -1
package/lib/annotations.ts +8 -0
package/lib/build.ts +75 -2
package/lib/commands/build.ts +25 -7
package/lib/commands/response.ts +55 -2
package/lib/commands/sections.ts +31 -9
package/lib/csl.ts +191 -0
package/lib/import.ts +21 -7
package/lib/journals.ts +25 -1
package/lib/plugins.ts +35 -1
package/lib/pptx-template.ts +346 -502
package/lib/schema.ts +4 -0
package/lib/types.ts +20 -1
package/lib/word.ts +253 -38
package/package.json +1 -2
package/lib/apply-buildup-colors.py +0 -88

package/lib/schema.ts CHANGED

@@ -87,6 +87,10 @@ export const revYamlSchema: Schema = {
         ],
       },
     },
+    journal: {
+      type: 'string',
+      description: 'Journal profile name for formatting defaults and validation',
+    },
     sections: {
       type: 'array',
       description: 'Ordered list of section files to include',

package/lib/types.ts CHANGED

@@ -325,13 +325,32 @@ export interface JournalRequirements {
   figures?: { max?: number };
   tables?: { max?: number };
   sections?: string[];
-  formatting?: object;
+}
+export interface JournalFormatting {
+  csl?: string;
+  pdf?: {
+    documentclass?: string;
+    fontsize?: string;
+    geometry?: string;
+    linestretch?: number;
+    template?: string;
+    numbersections?: boolean;
+  };
+  docx?: {
+    reference?: string;
+  };
+  crossref?: {
+    figPrefix?: string | string[];
+    tblPrefix?: string | string[];
+  };
 }
 export interface JournalProfile {
   name: string;
   url: string;
   requirements: JournalRequirements;
+  formatting?: JournalFormatting;
 }
 export interface ValidationResult {

package/lib/word.ts CHANGED

@@ -8,7 +8,7 @@ import * as path from 'path';
 import AdmZip from 'adm-zip';
 import { parseString } from 'xml2js';
 import { promisify } from 'util';
-import type { WordComment, CommentAnchor, WordContent, WordMetadata, TrackChangesResult } from './types.js';
+import type { WordComment, CommentAnchor, WordMetadata, TrackChangesResult } from './types.js';
 const parseXml = promisify(parseString);
@@ -166,9 +166,9 @@ export async function extractCommentAnchors(docxPath: string): Promise<Map<strin
 }
 /**
- * Extract plain text from Word document using mammoth
+ * Extract plain text from Word document (strips track change markup)
  * @param docxPath - Path to .docx file
- * @returns Extracted plain text
+ * @returns Extracted plain text (accepted changes applied)
  * @throws {TypeError} If docxPath is not a string
  * @throws {Error} If file not found
  */
@@ -176,41 +176,13 @@ export async function extractTextFromWord(docxPath: string): Promise<string> {
   if (typeof docxPath !== 'string') {
     throw new TypeError(`docxPath must be a string, got ${typeof docxPath}`);
   }
-  if (!fs.existsSync(docxPath)) {
-    throw new Error(`File not found: ${docxPath}`);
-  }
-  const mammoth = await import('mammoth');
-  const result = await mammoth.extractRawText({ path: docxPath });
-  return result.value;
-}
-/**
- * Extract rich content from Word with basic formatting
- * @param docxPath - Path to .docx file
- * @returns Text and HTML content
- * @throws {TypeError} If docxPath is not a string
- * @throws {Error} If file not found
- */
-export async function extractFromWord(docxPath: string): Promise<WordContent> {
-  if (typeof docxPath !== 'string') {
-    throw new TypeError(`docxPath must be a string, got ${typeof docxPath}`);
-  }
-  if (!fs.existsSync(docxPath)) {
-    throw new Error(`File not found: ${docxPath}`);
-  }
-  const mammoth = await import('mammoth');
-  const [textResult, htmlResult] = await Promise.all([
-    mammoth.extractRawText({ path: docxPath }),
-    mammoth.convertToHtml({ path: docxPath }),
-  ]);
-  return {
-    text: textResult.value,
-    html: htmlResult.value,
-  };
+  const result = await extractPlainTextWithTrackChanges(docxPath);
+  // Strip CriticMarkup: accept insertions, remove deletions, apply substitutions
+  let text = result.text;
+  text = text.replace(/\{~~[^~]*~>([^~]*)~~\}/g, '$1');  // substitutions → new
+  text = text.replace(/\{\+\+([^+]*)\+\+\}/g, '$1');      // insertions → keep
+  text = text.replace(/\{--[^}]*--\}/g, '');               // deletions → remove
+  return text;
 }
 /**
@@ -350,6 +322,249 @@ export async function extractTrackChanges(docxPath: string): Promise<TrackChange
   };
 }
+/**
+ * Extract a single marker's content starting at position i.
+ * Returns { content, end } where end is the position after the closing marker,
+ * or null if no valid closing marker found.
+ */
+function extractMarker(text: string, i: number, open: string, close: string): { content: string; end: number } | null {
+  if (!text.startsWith(open, i)) return null;
+  const start = i + open.length;
+  const closeIdx = text.indexOf(close, start);
+  if (closeIdx === -1) return null;
+  return { content: text.slice(start, closeIdx), end: closeIdx + close.length };
+}
+/**
+ * Greedily collect consecutive markers of the same type.
+ * E.g. {++a++}{++b++}{++c++} → "abc", advancing past all three.
+ */
+function collectConsecutive(text: string, i: number, open: string, close: string): { content: string; end: number } | null {
+  const first = extractMarker(text, i, open, close);
+  if (!first) return null;
+  let content = first.content;
+  let end = first.end;
+  while (end < text.length) {
+    const next = extractMarker(text, end, open, close);
+    if (!next) break;
+    content += next.content;
+    end = next.end;
+  }
+  return { content, end };
+}
+/**
+ * Scan text for adjacent CriticMarkup markers and:
+ * 1. Merge consecutive same-type markers: {++a++}{++b++} → {++ab++}
+ * 2. Merge adjacent del+ins or ins+del into substitutions: {--old--}{++new++} → {~~old~>new~~}
+ *
+ * Uses a linear scanner — no regex backtracking, no ambiguity.
+ */
+function mergeAdjacentMarkers(text: string): string {
+  let result = '';
+  let i = 0;
+  while (i < text.length) {
+    // --- Deletion block ---
+    if (text.startsWith('{--', i)) {
+      const del = collectConsecutive(text, i, '{--', '--}');
+      if (!del) { result += text[i]; i++; continue; }
+      // Skip spaces, then check for adjacent insertion
+      let j = del.end;
+      while (j < text.length && text[j] === ' ') j++;
+      const ins = collectConsecutive(text, j, '{++', '++}');
+      if (ins) {
+        // Merge into substitution
+        const trailing = del.content.endsWith(' ') || ins.content.endsWith(' ');
+        result += `{~~${del.content.trimEnd()}~>${ins.content.trimEnd()}~~}${trailing ? ' ' : ''}`;
+        i = ins.end;
+      } else {
+        // Emit merged deletion
+        result += `{--${del.content}--}`;
+        i = del.end;
+      }
+      continue;
+    }
+    // --- Insertion block ---
+    if (text.startsWith('{++', i)) {
+      const ins = collectConsecutive(text, i, '{++', '++}');
+      if (!ins) { result += text[i]; i++; continue; }
+      // Skip spaces, then check for adjacent deletion
+      let j = ins.end;
+      while (j < text.length && text[j] === ' ') j++;
+      const del = collectConsecutive(text, j, '{--', '--}');
+      if (del) {
+        // Merge into substitution (del → ins order in output)
+        const trailing = del.content.endsWith(' ') || ins.content.endsWith(' ');
+        result += `{~~${del.content.trimEnd()}~>${ins.content.trimEnd()}~~}${trailing ? ' ' : ''}`;
+        i = del.end;
+      } else {
+        // Emit merged insertion
+        result += `{++${ins.content}++}`;
+        i = ins.end;
+      }
+      continue;
+    }
+    result += text[i];
+    i++;
+  }
+  return result;
+}
+/**
+ * Extract plain text from Word XML with track changes preserved as CriticMarkup.
+ * This is a pandoc-free fallback that reads document.xml directly.
+ *
+ * Converts:
+ *   <w:ins> content </w:ins>  →  {++text++}
+ *   <w:del> content </w:del>  →  {--text--}
+ *
+ * Also detects headings (w:pStyle Heading1-6) and outputs markdown # syntax.
+ *
+ * @param docxPath - Path to Word document
+ * @returns Plain text with CriticMarkup and stats
+ */
+export async function extractPlainTextWithTrackChanges(docxPath: string): Promise<{
+  text: string;
+  hasTrackChanges: boolean;
+  stats: { insertions: number; deletions: number };
+}> {
+  if (!fs.existsSync(docxPath)) {
+    throw new Error(`File not found: ${docxPath}`);
+  }
+  const zip = new AdmZip(docxPath);
+  const docEntry = zip.getEntry('word/document.xml');
+  if (!docEntry) {
+    throw new Error('Invalid docx: no document.xml');
+  }
+  let xml = docEntry.getData().toString('utf8');
+  let insertions = 0;
+  let deletions = 0;
+  // Use unique markers (null bytes) that won't appear in normal text
+  const INS_S = '\x00IS\x00';
+  const INS_E = '\x00IE\x00';
+  const DEL_S = '\x00DS\x00';
+  const DEL_E = '\x00DE\x00';
+  // Step 1: Replace <w:ins> with marker-wrapped text injected as <w:t>
+  // Whitespace-only insertions are kept as plain text (not markers) to preserve spacing.
+  xml = xml.replace(/<w:ins\b[^>]*>([\s\S]*?)<\/w:ins>/g, (_match, content: string) => {
+    const texts: string[] = [];
+    const tPat = /<w:t[^>]*>([^<]*)<\/w:t>/g;
+    let m: RegExpExecArray | null;
+    while ((m = tPat.exec(content)) !== null) {
+      texts.push(m[1] || '');
+    }
+    const text = texts.join('');
+    if (text.trim()) {
+      insertions++;
+      return `<w:r><w:t>${INS_S}${text}${INS_E}</w:t></w:r>`;
+    }
+    // Whitespace-only: preserve as plain text for spacing
+    if (text.length > 0) {
+      return `<w:r><w:t>${text}</w:t></w:r>`;
+    }
+    return '';
+  });
+  // Step 2: Replace <w:del> similarly (uses w:delText inside)
+  // Whitespace-only deletions are kept as plain text to preserve spacing.
+  xml = xml.replace(/<w:del\b[^>]*>([\s\S]*?)<\/w:del>/g, (_match, content: string) => {
+    const texts: string[] = [];
+    const tPat = /<w:delText[^>]*>([^<]*)<\/w:delText>|<w:t[^>]*>([^<]*)<\/w:t>/g;
+    let m: RegExpExecArray | null;
+    while ((m = tPat.exec(content)) !== null) {
+      texts.push(m[1] || m[2] || '');
+    }
+    const text = texts.join('');
+    if (text.trim()) {
+      deletions++;
+      return `<w:r><w:t>${DEL_S}${text}${DEL_E}</w:t></w:r>`;
+    }
+    // Whitespace-only: preserve as plain text for spacing
+    if (text.length > 0) {
+      return `<w:r><w:t>${text}</w:t></w:r>`;
+    }
+    return '';
+  });
+  // Step 3: Extract text paragraph by paragraph
+  const paragraphs: string[] = [];
+  const paraPattern = /<w:p\b[^>]*>([\s\S]*?)<\/w:p>/g;
+  let pm: RegExpExecArray | null;
+  while ((pm = paraPattern.exec(xml)) !== null) {
+    const paraXml = pm[1];
+    // Detect heading level from paragraph style
+    let headingLevel = 0;
+    const styleMatch = paraXml.match(/<w:pStyle\s+w:val="Heading(\d)"/i);
+    if (styleMatch && styleMatch[1]) {
+      headingLevel = parseInt(styleMatch[1], 10);
+    }
+    // Extract all <w:t> text in order
+    const texts: string[] = [];
+    const tPat = /<w:t[^>]*>([^<]*)<\/w:t>/g;
+    let tm: RegExpExecArray | null;
+    while ((tm = tPat.exec(paraXml)) !== null) {
+      texts.push(tm[1] || '');
+    }
+    let paraText = texts.join('');
+    // Decode XML entities
+    paraText = paraText
+      .replace(/&amp;/g, '&')
+      .replace(/&lt;/g, '<')
+      .replace(/&gt;/g, '>')
+      .replace(/&quot;/g, '"')
+      .replace(/&apos;/g, "'");
+    // Convert markers to CriticMarkup
+    paraText = paraText
+      .split(INS_S).join('{++')
+      .split(INS_E).join('++}')
+      .split(DEL_S).join('{--')
+      .split(DEL_E).join('--}');
+    // Merge adjacent del+ins (or ins+del) into substitutions.
+    // Uses a scanner instead of regex to avoid backtracking across marker boundaries.
+    paraText = mergeAdjacentMarkers(paraText);
+    // Collapse runs of multiple spaces into single space
+    paraText = paraText.replace(/ {2,}/g, ' ');
+    if (paraText.trim()) {
+      if (headingLevel > 0 && headingLevel <= 6) {
+        paragraphs.push('#'.repeat(headingLevel) + ' ' + paraText.trim());
+      } else {
+        paragraphs.push(paraText);
+      }
+    }
+  }
+  return {
+    text: paragraphs.join('\n\n'),
+    hasTrackChanges: insertions > 0 || deletions > 0,
+    stats: { insertions, deletions },
+  };
+}
 interface ExtractWithTrackChangesOptions {
   mediaDir?: string;
 }

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "docrev",
-  "version": "0.8.5",
+  "version": "0.9.3",
   "description": "Academic paper revision workflow: Word ↔ Markdown round-trips, DOI validation, reviewer comments",
   "type": "module",
   "types": "types/index.d.ts",
@@ -119,7 +119,6 @@
     "dictionary-en": "^4.0.0",
     "dictionary-en-gb": "^3.0.0",
     "diff": "^8.0.2",
-    "mammoth": "^1.6.0",
     "mathml-to-latex": "^1.5.0",
     "nspell": "^2.1.5",
     "pdf-lib": "^1.17.1",

package/lib/apply-buildup-colors.py DELETED

@@ -1,88 +0,0 @@
-"""
-Apply buildup greying to PPTX slides.
-Greys out all bullet items except the last one in each content placeholder.
-"""
-import zipfile
-import sys
-import re
-import os
-pptx_path = sys.argv[1]
-temp_path = pptx_path + '.tmp'
-GREY_COLOR = '888888'
-def apply_grey_to_content_placeholder(text):
-    """Find content placeholder and grey all paragraphs except the last"""
-    # Find content placeholder (idx="1") shape
-    content_match = re.search(r'(<p:sp>.*?<p:ph\s+idx="1"[^>]*/>.*?<p:txBody>)(.*?)(</p:txBody></p:sp>)', text, re.DOTALL)
-    if not content_match:
-        return text
-    before = content_match.group(1)
-    body_content = content_match.group(2)
-    after = content_match.group(3)
-    # Find all paragraphs in the body
-    para_pattern = r'(<a:p>.*?</a:p>)'
-    paras = list(re.finditer(para_pattern, body_content, re.DOTALL))
-    if len(paras) <= 1:
-        return text  # Nothing to grey if 0 or 1 paragraph
-    # Grey out all but the last paragraph
-    new_body = body_content
-    offset = 0
-    for match in paras[:-1]:  # All but last
-        start = match.start() + offset
-        end = match.end() + offset
-        para = match.group(0)
-        # Add grey color to all <a:r> (run) elements
-        def add_grey_to_run(run_match):
-            run = run_match.group(0)
-            # Find <a:rPr> and add solidFill
-            if '<a:solidFill>' in run:
-                # Replace existing color
-                run = re.sub(r'<a:srgbClr val="[^"]*"/>', f'<a:srgbClr val="{GREY_COLOR}"/>', run)
-            elif '<a:rPr />' in run:
-                # Replace self-closing rPr with one that has color
-                run = run.replace('<a:rPr />', f'<a:rPr><a:solidFill><a:srgbClr val="{GREY_COLOR}"/></a:solidFill></a:rPr>')
-            elif '<a:rPr>' in run:
-                # Add solidFill after opening rPr tag
-                run = re.sub(r'(<a:rPr[^>]*>)', r'\1<a:solidFill><a:srgbClr val="' + GREY_COLOR + r'"/></a:solidFill>', run)
-            elif '</a:rPr>' in run:
-                # Insert before closing rPr
-                run = run.replace('</a:rPr>', f'<a:solidFill><a:srgbClr val="{GREY_COLOR}"/></a:solidFill></a:rPr>')
-            else:
-                # No rPr at all, add it after <a:r>
-                run = run.replace('<a:r>', f'<a:r><a:rPr><a:solidFill><a:srgbClr val="{GREY_COLOR}"/></a:solidFill></a:rPr>')
-            return run
-        new_para = re.sub(r'<a:r>.*?</a:r>', add_grey_to_run, para, flags=re.DOTALL)
-        new_body = new_body[:start] + new_para + new_body[end:]
-        offset += len(new_para) - len(para)
-    # Reconstruct the full text
-    full_start = content_match.start()
-    full_end = content_match.end()
-    return text[:full_start] + before + new_body + after + text[full_end:]
-with zipfile.ZipFile(pptx_path, 'r') as zin:
-    with zipfile.ZipFile(temp_path, 'w') as zout:
-        for item in zin.infolist():
-            content = zin.read(item.filename)
-            # Process slide XML files
-            if item.filename.startswith('ppt/slides/slide') and item.filename.endswith('.xml'):
-                text = content.decode('utf-8')
-                text = apply_grey_to_content_placeholder(text)
-                content = text.encode('utf-8')
-            zout.writestr(item, content)
-os.replace(temp_path, pptx_path)
-print('Buildup colors applied')