docrev 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/build.js CHANGED
@@ -39,10 +39,12 @@ export const DEFAULT_CONFIG = {
39
39
  geometry: 'margin=1in',
40
40
  linestretch: 1.5,
41
41
  numbersections: false,
42
+ toc: false,
42
43
  },
43
44
  docx: {
44
45
  reference: null,
45
46
  keepComments: true,
47
+ toc: false,
46
48
  },
47
49
  tex: {
48
50
  standalone: true,
@@ -317,10 +319,16 @@ export function buildPandocArgs(format, config, outputPath) {
317
319
  if (config.pdf.numbersections) {
318
320
  args.push('--number-sections');
319
321
  }
322
+ if (config.pdf.toc) {
323
+ args.push('--toc');
324
+ }
320
325
  } else if (format === 'docx') {
321
326
  if (config.docx.reference) {
322
327
  args.push('--reference-doc', config.docx.reference);
323
328
  }
329
+ if (config.docx.toc) {
330
+ args.push('--toc');
331
+ }
324
332
  }
325
333
 
326
334
  return args;
@@ -434,8 +442,8 @@ export async function build(directory, formats = ['pdf', 'docx'], options = {})
434
442
  throw new Error('pandoc not found. Run `rev install` to install dependencies.');
435
443
  }
436
444
 
437
- // Load config
438
- const config = loadConfig(directory);
445
+ // Load config (use passed config if provided, otherwise load from file)
446
+ const config = options.config || loadConfig(directory);
439
447
 
440
448
  // Combine sections → paper.md
441
449
  const paperPath = combineSections(directory, config, options);
package/lib/crossref.js CHANGED
@@ -12,20 +12,26 @@ import * as path from 'path';
12
12
 
13
13
  /**
14
14
  * Patterns for detecting hardcoded references
15
- * Matches: Figure 1, Fig. 1a, fig 1b-c, Figs. 1-3, Table S1, etc.
16
- * Includes optional letter suffixes for sub-panels (a, b, c, etc.)
15
+ * Matches complex patterns including:
16
+ * - Simple: "Figure 1", "Fig. 2a", "Table S1"
17
+ * - Ranges: "Figures 1-3", "Fig. 1a-c", "Figs. 1a-3b"
18
+ * - Lists: "Figures 1, 2, and 3", "Fig. 1a, b, c", "Tables 1 & 2"
19
+ * - Mixed: "Figs. 1, 3-5, and 7"
20
+ *
21
+ * Uses a simpler base pattern and parses the full match for lists
17
22
  */
18
23
  const DETECTION_PATTERNS = {
19
- // Figures: Fig, Fig., fig, figure, Figure, FIGURE, Figs, Figures (plural)
20
- // With optional letter suffix: 1a, 1b, 2a-c, etc.
21
- figure: /\b(Figures?|Figs?\.?)\s*(\d+|S\d+)([a-z])?(?:\s*[-–—&,]\s*(\d+|S\d+)?([a-z])?)?\b/gi,
24
+ // Captures the full reference including lists with "and"
25
+ // Group 1: type prefix (Figure, Fig., etc.)
26
+ // Group 2: reference list (parsed by parseReferenceList())
27
+ // Matches: "1", "1a", "1-3", "1a-c", "1, 2, 3", "1 and 2", "1, 2 and 3", "1, 2, and 3"
28
+ // Separator: comma/dash/ampersand, optionally followed by "and"
29
+ // Standalone letters must be followed by separator, punctuation, or word boundary
30
+ figure: /\b(Figures?|Figs?\.?)\s+((?:\d+|S\d+)[a-z]?(?:(?:\s*[-–—,&]\s*(?:and\s+)?|\s+and\s+)(?:(?:\d+|S\d+)[a-z]?|[a-z]\b))*)/gi,
22
31
 
23
- // Tables: Tab, Tab., tab, table, Table, TABLE, Tabs, Tables (plural)
24
- // With optional letter suffix for sub-tables
25
- table: /\b(Tables?|Tabs?\.?)\s*(\d+|S\d+)([a-z])?(?:\s*[-–—&,]\s*(\d+|S\d+)?([a-z])?)?\b/gi,
32
+ table: /\b(Tables?|Tabs?\.?)\s+((?:\d+|S\d+)[a-z]?(?:(?:\s*[-–—,&]\s*(?:and\s+)?|\s+and\s+)(?:(?:\d+|S\d+)[a-z]?|[a-z]\b))*)/gi,
26
33
 
27
- // Equations: Eq, Eq., eq, equation, Equation
28
- equation: /\b(Equations?|Eqs?\.?)\s*(\d+)([a-z])?(?:\s*[-–—&,]\s*(\d+)?([a-z])?)?\b/gi,
34
+ equation: /\b(Equations?|Eqs?\.?)\s+((?:\d+)[a-z]?(?:(?:\s*[-–—,&]\s*(?:and\s+)?|\s+and\s+)(?:(?:\d+)[a-z]?|[a-z]\b))*)/gi,
29
35
  };
30
36
 
31
37
  /**
@@ -68,6 +74,120 @@ export function parseRefNumber(numStr, suffix = null) {
68
74
  return { isSupp, num, suffix: extractedSuffix ? extractedSuffix.toLowerCase() : null };
69
75
  }
70
76
 
77
+ /**
78
+ * Parse a reference list string like "1, 2, and 3" or "1a-c" or "1a-3b"
79
+ * Returns an array of {num, isSupp, suffix} objects
80
+ *
81
+ * @param {string} listStr - e.g., "1, 2, and 3", "1a-c", "1a-3b", "1a, b, c"
82
+ * @returns {Array<{num: number, isSupp: boolean, suffix: string|null}>}
83
+ */
84
+ export function parseReferenceList(listStr) {
85
+ const results = [];
86
+ if (!listStr) return results;
87
+
88
+ // Normalize: replace "and" with comma, normalize dashes
89
+ let normalized = listStr
90
+ .replace(/\s+and\s+/gi, ', ')
91
+ .replace(/[–—]/g, '-') // en-dash, em-dash → hyphen
92
+ .replace(/&/g, ', '); // & → comma
93
+
94
+ // Split by comma (but not by dash, which indicates ranges)
95
+ const parts = normalized.split(/\s*,\s*/).filter(p => p.trim());
96
+
97
+ let lastFullRef = null; // Track the last full reference for implicit prefixes
98
+
99
+ for (const part of parts) {
100
+ const trimmed = part.trim();
101
+ if (!trimmed) continue;
102
+
103
+ // Check if this is a range (contains -)
104
+ if (trimmed.includes('-')) {
105
+ const [start, end] = trimmed.split('-').map(s => s.trim());
106
+
107
+ // Check if end is just a letter (e.g., "1a-c" where end is "c")
108
+ const endIsLetterOnly = /^[a-z]$/i.test(end);
109
+
110
+ const startRef = parseRefNumber(start);
111
+ // For letter-only end, don't parse as number
112
+ const endRef = endIsLetterOnly
113
+ ? { num: startRef.num, isSupp: startRef.isSupp, suffix: end.toLowerCase() }
114
+ : parseRefNumber(end);
115
+
116
+ // Handle different range types:
117
+ // 1. Suffix-only range on same number: "1a-c" → 1a, 1b, 1c
118
+ // 2. Number range: "1-3" → 1, 2, 3
119
+ // 3. Cross-number suffix range: "1a-3b" → 1a, 1b, 2a, 2b, 3a, 3b (limited: letters never run past the later of the two suffixes)
120
+
121
+ if (startRef.suffix && endRef.suffix && startRef.num !== endRef.num) {
122
+ // Cross-number suffix range: "1a-3b"
123
+ // For academic papers, limit intermediate figures to same suffix range
124
+ // e.g., "1a-3b" typically means 1a, 1b, 2a, 2b, 3a, 3b
125
+ const maxSuffix = Math.max(
126
+ startRef.suffix.charCodeAt(0),
127
+ endRef.suffix.charCodeAt(0)
128
+ );
129
+
130
+ for (let n = startRef.num; n <= endRef.num; n++) {
131
+ const suffixStart = (n === startRef.num) ? startRef.suffix.charCodeAt(0) : 'a'.charCodeAt(0);
132
+ const suffixEnd = (n === endRef.num) ? endRef.suffix.charCodeAt(0) : maxSuffix;
133
+
134
+ for (let s = suffixStart; s <= suffixEnd; s++) {
135
+ results.push({
136
+ num: n,
137
+ isSupp: startRef.isSupp,
138
+ suffix: String.fromCharCode(s)
139
+ });
140
+ }
141
+ }
142
+ lastFullRef = { num: endRef.num, isSupp: startRef.isSupp };
143
+ } else if (startRef.suffix || endRef.suffix) {
144
+ // Suffix range on same number: "1a-c"
145
+ const num = startRef.num || (lastFullRef ? lastFullRef.num : 1);
146
+ const isSupp = startRef.isSupp || (lastFullRef ? lastFullRef.isSupp : false);
147
+ const startCode = (startRef.suffix || 'a').charCodeAt(0);
148
+ const endCode = (endRef.suffix || 'a').charCodeAt(0);
149
+
150
+ for (let code = startCode; code <= endCode; code++) {
151
+ results.push({
152
+ num,
153
+ isSupp,
154
+ suffix: String.fromCharCode(code)
155
+ });
156
+ }
157
+ lastFullRef = { num, isSupp };
158
+ } else {
159
+ // Pure number range: "1-3"
160
+ for (let n = startRef.num; n <= endRef.num; n++) {
161
+ results.push({
162
+ num: n,
163
+ isSupp: startRef.isSupp,
164
+ suffix: null
165
+ });
166
+ }
167
+ lastFullRef = { num: endRef.num, isSupp: startRef.isSupp };
168
+ }
169
+ } else {
170
+ // Single reference or implicit suffix
171
+ // Check if it's just a letter (implicit prefix from previous number)
172
+ if (/^[a-z]$/i.test(trimmed) && lastFullRef) {
173
+ // Implicit prefix: "b" after "1a" means "1b"
174
+ results.push({
175
+ num: lastFullRef.num,
176
+ isSupp: lastFullRef.isSupp,
177
+ suffix: trimmed.toLowerCase()
178
+ });
179
+ } else {
180
+ // Full reference: "1", "1a", "S1", "S1a"
181
+ const ref = parseRefNumber(trimmed);
182
+ results.push(ref);
183
+ lastFullRef = { num: ref.num, isSupp: ref.isSupp };
184
+ }
185
+ }
186
+ }
187
+
188
+ return results;
189
+ }
190
+
71
191
  /**
72
192
  * Build a registry of figure/table labels from .md files
73
193
  * Scans for {#fig:label} and {#tbl:label} anchors
@@ -232,46 +352,15 @@ export function detectHardcodedRefs(text) {
232
352
  let match;
233
353
 
234
354
  while ((match = pattern.exec(text)) !== null) {
235
- const numbers = [];
236
-
237
355
  // Pattern groups:
238
- // [1] = type (Figure, Fig., etc.)
239
- // [2] = first number (1, S1)
240
- // [3] = first suffix (a, b) - optional
241
- // [4] = second number for range (2, S2) - optional
242
- // [5] = second suffix (b, c) - optional
243
-
244
- // Parse first number with optional suffix
245
- const first = parseRefNumber(match[2], match[3]);
246
- numbers.push(first);
247
-
248
- // Parse second number/suffix if present (range like 1a-c or 1-3)
249
- if (match[4] || match[5]) {
250
- const second = parseRefNumber(match[4] || match[2], match[5]);
251
-
252
- // Handle suffix-only ranges (e.g., "1a-c" means 1a, 1b, 1c)
253
- if (!match[4] && match[5] && first.suffix) {
254
- // Expand letter range: a-c → a, b, c
255
- const startCode = first.suffix.charCodeAt(0);
256
- const endCode = match[5].charCodeAt(0);
257
- for (let code = startCode + 1; code <= endCode; code++) {
258
- numbers.push({
259
- num: first.num,
260
- isSupp: first.isSupp,
261
- suffix: String.fromCharCode(code)
262
- });
263
- }
264
- } else if (match[4]) {
265
- // Expand number range
266
- if (first.isSupp === second.isSupp && !first.suffix && !second.suffix) {
267
- for (let n = first.num + 1; n <= second.num; n++) {
268
- numbers.push({ num: n, isSupp: first.isSupp, suffix: null });
269
- }
270
- } else {
271
- numbers.push(second);
272
- }
273
- }
274
- }
356
+ // [1] = type prefix (Figure, Fig., etc.)
357
+ // [2] = reference list string (e.g., "1, 2, and 3" or "1a-3b")
358
+
359
+ const listStr = match[2];
360
+ const numbers = parseReferenceList(listStr);
361
+
362
+ // Skip if no valid numbers were parsed
363
+ if (numbers.length === 0) continue;
275
364
 
276
365
  refs.push({
277
366
  type: normalizeType(type),
package/lib/equations.js CHANGED
@@ -1,14 +1,36 @@
1
1
  /**
2
2
  * Equation extraction and conversion utilities
3
3
  * Handle LaTeX math in Markdown ↔ Word workflows
4
+ *
5
+ * Supports:
6
+ * - Extract LaTeX equations from Markdown
7
+ * - Extract equations from Word documents (OMML → LaTeX via Pandoc)
8
+ * - Convert Markdown with equations to Word (LaTeX → MathML)
4
9
  */
5
10
 
6
11
  import * as fs from 'fs';
7
12
  import * as path from 'path';
8
13
  import { exec } from 'child_process';
9
14
  import { promisify } from 'util';
15
+ import AdmZip from 'adm-zip';
16
+ import { parseString } from 'xml2js';
10
17
 
11
18
  const execAsync = promisify(exec);
19
+ const parseXml = promisify(parseString);
20
+
21
+ // Dynamic import for mathml-to-latex (ESM)
22
+ let MathMLToLaTeX = null;
23
+ async function getMathMLConverter() {
24
+ if (!MathMLToLaTeX) {
25
+ try {
26
+ const module = await import('mathml-to-latex');
27
+ MathMLToLaTeX = module.MathMLToLaTeX;
28
+ } catch {
29
+ return null;
30
+ }
31
+ }
32
+ return MathMLToLaTeX;
33
+ }
12
34
 
13
35
  /**
14
36
  * Extract all equations from markdown text
@@ -256,3 +278,216 @@ export function getEquationStats(files) {
256
278
  byFile,
257
279
  };
258
280
  }
281
+
282
+ /**
283
+ * Extract equations from a Word document using Pandoc
284
+ * Converts OMML (Office Math Markup) to LaTeX
285
+ *
286
+ * @param {string} docxPath - Path to Word document
287
+ * @returns {Promise<{success: boolean, equations: Array<{type: string, latex: string, position: number}>, error?: string}>}
288
+ */
289
+ export async function extractEquationsFromWord(docxPath) {
290
+ if (!fs.existsSync(docxPath)) {
291
+ return { success: false, equations: [], error: `File not found: ${docxPath}` };
292
+ }
293
+
294
+ // Method 1: Use Pandoc to convert docx to markdown with LaTeX math
295
+ try {
296
+ const { stdout } = await execAsync(
297
+ `pandoc "${docxPath}" -t markdown --wrap=none`,
298
+ { maxBuffer: 50 * 1024 * 1024 }
299
+ );
300
+
301
+ // Extract equations from the markdown output
302
+ const equations = extractEquations(stdout, path.basename(docxPath));
303
+
304
+ return {
305
+ success: true,
306
+ equations: equations.map((eq, i) => ({
307
+ type: eq.type,
308
+ latex: eq.content,
309
+ position: i,
310
+ line: eq.line,
311
+ })),
312
+ };
313
+ } catch (err) {
314
+ // Pandoc failed, try fallback method
315
+ return extractEquationsFromWordDirect(docxPath);
316
+ }
317
+ }
318
+
319
+ /**
320
+ * Direct OMML extraction from Word document (fallback if Pandoc fails)
321
+ * Parses document.xml for <m:oMath> elements and attempts conversion
322
+ *
323
+ * @param {string} docxPath
324
+ * @returns {Promise<{success: boolean, equations: Array, error?: string}>}
325
+ */
326
+ async function extractEquationsFromWordDirect(docxPath) {
327
+ try {
328
+ const zip = new AdmZip(docxPath);
329
+ const documentEntry = zip.getEntry('word/document.xml');
330
+
331
+ if (!documentEntry) {
332
+ return { success: false, equations: [], error: 'Invalid docx: no document.xml' };
333
+ }
334
+
335
+ const documentXml = zip.readAsText(documentEntry);
336
+
337
+ // Find all OMML equations (<m:oMath> or <m:oMathPara>)
338
+ const ommlPattern = /<m:oMath[^>]*>[\s\S]*?<\/m:oMath>/gi;
339
+ const matches = documentXml.match(ommlPattern) || [];
340
+
341
+ if (matches.length === 0) {
342
+ return { success: true, equations: [], message: 'No equations found' };
343
+ }
344
+
345
+ // Try to convert OMML to LaTeX via MathML intermediate
346
+ const Converter = await getMathMLConverter();
347
+ const equations = [];
348
+
349
+ for (let i = 0; i < matches.length; i++) {
350
+ const omml = matches[i];
351
+
352
+ // Attempt OMML → MathML → LaTeX conversion
353
+ // Note: This is a simplified approach; full OMML→MathML requires XSLT
354
+ try {
355
+ const latex = await ommlToLatex(omml, Converter);
356
+ if (latex) {
357
+ equations.push({
358
+ type: isDisplayMath(omml) ? 'display' : 'inline',
359
+ latex,
360
+ position: i,
361
+ raw: omml.substring(0, 100) + '...',
362
+ });
363
+ }
364
+ } catch {
365
+ // Keep raw OMML reference if conversion fails
366
+ equations.push({
367
+ type: 'unknown',
368
+ latex: null,
369
+ position: i,
370
+ raw: omml.substring(0, 100) + '...',
371
+ error: 'Conversion failed',
372
+ });
373
+ }
374
+ }
375
+
376
+ return { success: true, equations };
377
+ } catch (err) {
378
+ return { success: false, equations: [], error: err.message };
379
+ }
380
+ }
381
+
382
+ /**
383
+ * Check if OMML represents display math (equation on its own line)
384
+ */
385
+ function isDisplayMath(omml) {
386
+ return omml.includes('<m:oMathPara') || omml.includes('m:jc');
387
+ }
388
+
389
+ /**
390
+ * Convert OMML to LaTeX (simplified approach)
391
+ * For complex equations, Pandoc method is more reliable
392
+ *
393
+ * @param {string} omml - OMML XML string
394
+ * @param {Function} Converter - MathMLToLaTeX converter
395
+ * @returns {Promise<string|null>}
396
+ */
397
+ async function ommlToLatex(omml, Converter) {
398
+ if (!Converter) return null;
399
+
400
+ // Extract key elements from OMML and build approximate MathML
401
+ // This is a simplified conversion - not all OMML features are supported
402
+ try {
403
+ // Build basic MathML from OMML structure
404
+ const mathml = ommlToMathML(omml);
405
+ if (!mathml) return null;
406
+
407
+ // Convert MathML to LaTeX
408
+ const latex = Converter.convert(mathml);
409
+ return latex;
410
+ } catch {
411
+ return null;
412
+ }
413
+ }
414
+
415
+ /**
416
+ * Convert OMML to MathML (simplified)
417
+ * Maps common OMML elements to MathML equivalents
418
+ */
419
+ function ommlToMathML(omml) {
420
+ // Remove namespace prefixes for easier parsing
421
+ let xml = omml
422
+ .replace(/<m:/g, '<')
423
+ .replace(/<\/m:/g, '</')
424
+ .replace(/<w:/g, '<w_')
425
+ .replace(/<\/w:/g, '</w_');
426
+
427
+ // Map OMML elements to MathML
428
+ const mappings = [
429
+ [/<oMath[^>]*>/gi, '<math xmlns="http://www.w3.org/1998/Math/MathML">'],
430
+ [/<\/oMath>/gi, '</math>'],
431
+ [/<r>/gi, '<mi>'],
432
+ [/<\/r>/gi, '</mi>'],
433
+ [/<t>/gi, ''],
434
+ [/<\/t>/gi, ''],
435
+ [/<f>/gi, '<mfrac>'],
436
+ [/<\/f>/gi, '</mfrac>'],
437
+ [/<num>/gi, '<mrow>'],
438
+ [/<\/num>/gi, '</mrow>'],
439
+ [/<den>/gi, '<mrow>'],
440
+ [/<\/den>/gi, '</mrow>'],
441
+ [/<sup>/gi, '<msup><mrow>'],
442
+ [/<\/sup>/gi, '</mrow></msup>'],
443
+ [/<sub>/gi, '<msub><mrow>'],
444
+ [/<\/sub>/gi, '</mrow></msub>'],
445
+ [/<rad>/gi, '<msqrt>'],
446
+ [/<\/rad>/gi, '</msqrt>'],
447
+ [/<e>/gi, '<mrow>'],
448
+ [/<\/e>/gi, '</mrow>'],
449
+ // Remove elements we don't map
450
+ [/<rPr>[\s\S]*?<\/rPr>/gi, ''],
451
+ [/<ctrlPr>[\s\S]*?<\/ctrlPr>/gi, ''],
452
+ [/<w_[^>]*>[\s\S]*?<\/w_[^>]*>/gi, ''],
453
+ [/<[^>]*\/>/gi, ''], // Self-closing tags
454
+ ];
455
+
456
+ for (const [pattern, replacement] of mappings) {
457
+ xml = xml.replace(pattern, replacement);
458
+ }
459
+
460
+ // Clean up any remaining unrecognized tags
461
+ xml = xml.replace(/<[a-zA-Z][^>]*>/g, '').replace(/<\/[a-zA-Z]+>/g, '');
462
+
463
+ // Wrap in math if not already
464
+ if (!xml.includes('<math')) {
465
+ xml = `<math xmlns="http://www.w3.org/1998/Math/MathML">${xml}</math>`;
466
+ }
467
+
468
+ return xml;
469
+ }
470
+
471
+ /**
472
+ * Get equation summary from Word document
473
+ * @param {string} docxPath
474
+ * @returns {Promise<{count: number, display: number, inline: number, converted: number, error?: string}>}
475
+ */
476
+ export async function getWordEquationStats(docxPath) {
477
+ const result = await extractEquationsFromWord(docxPath);
478
+
479
+ if (!result.success) {
480
+ return { count: 0, display: 0, inline: 0, converted: 0, error: result.error };
481
+ }
482
+
483
+ const display = result.equations.filter(e => e.type === 'display').length;
484
+ const inline = result.equations.filter(e => e.type === 'inline').length;
485
+ const converted = result.equations.filter(e => e.latex).length;
486
+
487
+ return {
488
+ count: result.equations.length,
489
+ display,
490
+ inline,
491
+ converted,
492
+ };
493
+ }