docrev 0.8.5 → 0.9.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. package/.gitattributes +1 -0
  2. package/README.md +25 -1
  3. package/dist/lib/annotations.d.ts.map +1 -1
  4. package/dist/lib/annotations.js +6 -0
  5. package/dist/lib/annotations.js.map +1 -1
  6. package/dist/lib/build.d.ts +6 -1
  7. package/dist/lib/build.d.ts.map +1 -1
  8. package/dist/lib/build.js +67 -1
  9. package/dist/lib/build.js.map +1 -1
  10. package/dist/lib/commands/build.d.ts.map +1 -1
  11. package/dist/lib/commands/build.js +26 -7
  12. package/dist/lib/commands/build.js.map +1 -1
  13. package/dist/lib/commands/response.d.ts.map +1 -1
  14. package/dist/lib/commands/response.js +50 -2
  15. package/dist/lib/commands/response.js.map +1 -1
  16. package/dist/lib/commands/sections.d.ts.map +1 -1
  17. package/dist/lib/commands/sections.js +28 -9
  18. package/dist/lib/commands/sections.js.map +1 -1
  19. package/dist/lib/csl.d.ts +38 -0
  20. package/dist/lib/csl.d.ts.map +1 -0
  21. package/dist/lib/csl.js +170 -0
  22. package/dist/lib/csl.js.map +1 -0
  23. package/dist/lib/import.d.ts.map +1 -1
  24. package/dist/lib/import.js +20 -7
  25. package/dist/lib/import.js.map +1 -1
  26. package/dist/lib/journals.d.ts.map +1 -1
  27. package/dist/lib/journals.js +24 -0
  28. package/dist/lib/journals.js.map +1 -1
  29. package/dist/lib/plugins.d.ts +11 -0
  30. package/dist/lib/plugins.d.ts.map +1 -1
  31. package/dist/lib/plugins.js +21 -1
  32. package/dist/lib/plugins.js.map +1 -1
  33. package/dist/lib/pptx-template.d.ts +17 -22
  34. package/dist/lib/pptx-template.d.ts.map +1 -1
  35. package/dist/lib/pptx-template.js +296 -552
  36. package/dist/lib/pptx-template.js.map +1 -1
  37. package/dist/lib/schema.d.ts.map +1 -1
  38. package/dist/lib/schema.js +4 -0
  39. package/dist/lib/schema.js.map +1 -1
  40. package/dist/lib/types.d.ts +19 -1
  41. package/dist/lib/types.d.ts.map +1 -1
  42. package/dist/lib/word.d.ts +24 -11
  43. package/dist/lib/word.d.ts.map +1 -1
  44. package/dist/lib/word.js +233 -32
  45. package/dist/lib/word.js.map +1 -1
  46. package/lib/annotations.ts +8 -0
  47. package/lib/build.ts +75 -2
  48. package/lib/commands/build.ts +25 -7
  49. package/lib/commands/response.ts +55 -2
  50. package/lib/commands/sections.ts +31 -9
  51. package/lib/csl.ts +191 -0
  52. package/lib/import.ts +21 -7
  53. package/lib/journals.ts +25 -1
  54. package/lib/plugins.ts +35 -1
  55. package/lib/pptx-template.ts +346 -502
  56. package/lib/schema.ts +4 -0
  57. package/lib/types.ts +20 -1
  58. package/lib/word.ts +253 -38
  59. package/package.json +1 -2
  60. package/lib/apply-buildup-colors.py +0 -88
package/lib/schema.ts CHANGED
@@ -87,6 +87,10 @@ export const revYamlSchema: Schema = {
87
87
  ],
88
88
  },
89
89
  },
90
+ journal: {
91
+ type: 'string',
92
+ description: 'Journal profile name for formatting defaults and validation',
93
+ },
90
94
  sections: {
91
95
  type: 'array',
92
96
  description: 'Ordered list of section files to include',
package/lib/types.ts CHANGED
@@ -325,13 +325,32 @@ export interface JournalRequirements {
325
325
  figures?: { max?: number };
326
326
  tables?: { max?: number };
327
327
  sections?: string[];
328
- formatting?: object;
328
+ }
329
+
330
/**
 * Optional formatting defaults attached to a journal profile, grouped by
 * output target. Every group and every field is optional.
 *
 * NOTE(review): the field names look like pandoc / pandoc-crossref options;
 * how they are consumed is not visible from this declaration — confirm
 * against the build code before relying on that.
 */
export interface JournalFormatting {
  /** Citation style for the journal — presumably a CSL style name or path; verify against the CSL resolver. */
  csl?: string;

  /** Settings that apply to PDF output. */
  pdf?: {
    documentclass?: string;
    fontsize?: string;
    geometry?: string;
    linestretch?: number;
    template?: string;
    numbersections?: boolean;
  };

  /** Settings that apply to Word (.docx) output. */
  docx?: {
    /** Presumably the reference document used for styling — TODO confirm. */
    reference?: string;
  };

  /** Cross-reference label prefixes; each accepts one string or a list of strings. */
  crossref?: {
    figPrefix?: string | string[];
    tblPrefix?: string | string[];
  };
}
330
348
 
331
349
  export interface JournalProfile {
332
350
  name: string;
333
351
  url: string;
334
352
  requirements: JournalRequirements;
353
+ formatting?: JournalFormatting;
335
354
  }
336
355
 
337
356
  export interface ValidationResult {
package/lib/word.ts CHANGED
@@ -8,7 +8,7 @@ import * as path from 'path';
8
8
  import AdmZip from 'adm-zip';
9
9
  import { parseString } from 'xml2js';
10
10
  import { promisify } from 'util';
11
- import type { WordComment, CommentAnchor, WordContent, WordMetadata, TrackChangesResult } from './types.js';
11
+ import type { WordComment, CommentAnchor, WordMetadata, TrackChangesResult } from './types.js';
12
12
 
13
13
  const parseXml = promisify(parseString);
14
14
 
@@ -166,9 +166,9 @@ export async function extractCommentAnchors(docxPath: string): Promise<Map<strin
166
166
  }
167
167
 
168
168
  /**
169
- * Extract plain text from Word document using mammoth
169
+ * Extract plain text from Word document (strips track change markup)
170
170
  * @param docxPath - Path to .docx file
171
- * @returns Extracted plain text
171
+ * @returns Extracted plain text (accepted changes applied)
172
172
  * @throws {TypeError} If docxPath is not a string
173
173
  * @throws {Error} If file not found
174
174
  */
@@ -176,41 +176,13 @@ export async function extractTextFromWord(docxPath: string): Promise<string> {
176
176
  if (typeof docxPath !== 'string') {
177
177
  throw new TypeError(`docxPath must be a string, got ${typeof docxPath}`);
178
178
  }
179
- if (!fs.existsSync(docxPath)) {
180
- throw new Error(`File not found: ${docxPath}`);
181
- }
182
-
183
- const mammoth = await import('mammoth');
184
- const result = await mammoth.extractRawText({ path: docxPath });
185
- return result.value;
186
- }
187
-
188
- /**
189
- * Extract rich content from Word with basic formatting
190
- * @param docxPath - Path to .docx file
191
- * @returns Text and HTML content
192
- * @throws {TypeError} If docxPath is not a string
193
- * @throws {Error} If file not found
194
- */
195
- export async function extractFromWord(docxPath: string): Promise<WordContent> {
196
- if (typeof docxPath !== 'string') {
197
- throw new TypeError(`docxPath must be a string, got ${typeof docxPath}`);
198
- }
199
- if (!fs.existsSync(docxPath)) {
200
- throw new Error(`File not found: ${docxPath}`);
201
- }
202
-
203
- const mammoth = await import('mammoth');
204
-
205
- const [textResult, htmlResult] = await Promise.all([
206
- mammoth.extractRawText({ path: docxPath }),
207
- mammoth.convertToHtml({ path: docxPath }),
208
- ]);
209
-
210
- return {
211
- text: textResult.value,
212
- html: htmlResult.value,
213
- };
179
+ const result = await extractPlainTextWithTrackChanges(docxPath);
180
+ // Strip CriticMarkup: accept insertions, remove deletions, apply substitutions
181
+ let text = result.text;
182
+ text = text.replace(/\{~~[^~]*~>([^~]*)~~\}/g, '$1'); // substitutions → new
183
+ text = text.replace(/\{\+\+([^+]*)\+\+\}/g, '$1'); // insertions → keep
184
+ text = text.replace(/\{--[^}]*--\}/g, ''); // deletions → remove
185
+ return text;
214
186
  }
215
187
 
216
188
  /**
@@ -350,6 +322,249 @@ export async function extractTrackChanges(docxPath: string): Promise<TrackChange
350
322
  };
351
323
  }
352
324
 
325
/**
 * Read one delimited marker that begins exactly at index `i`.
 *
 * @param text - String being scanned
 * @param i - Index at which `open` is expected to start
 * @param open - Opening delimiter (e.g. `{++`)
 * @param close - Closing delimiter (e.g. `++}`)
 * @returns The text between the delimiters together with the index just past
 *   the closing delimiter, or `null` when `open` is not at `i` or no `close`
 *   follows it.
 */
function extractMarker(text: string, i: number, open: string, close: string): { content: string; end: number } | null {
  const bodyStart = i + open.length;

  // Only search for the closing delimiter when the opener really sits at `i`.
  const closeAt = text.startsWith(open, i) ? text.indexOf(close, bodyStart) : -1;
  if (closeAt < 0) {
    return null;
  }

  return {
    content: text.slice(bodyStart, closeAt),
    end: closeAt + close.length,
  };
}
337
+
338
/**
 * Read a run of back-to-back markers of one kind, starting at index `i`,
 * and concatenate their contents.
 * Example: `{++a++}{++b++}{++c++}` yields content `"abc"` with `end` just
 * past the third marker.
 *
 * @param text - String being scanned
 * @param i - Index at which the first marker must start
 * @param open - Opening delimiter
 * @param close - Closing delimiter
 * @returns Combined content and the index after the last marker in the run,
 *   or `null` when no marker starts at `i`.
 */
function collectConsecutive(text: string, i: number, open: string, close: string): { content: string; end: number } | null {
  const head = extractMarker(text, i, open, close);
  if (head === null) {
    return null;
  }

  const parts: string[] = [head.content];
  let cursor = head.end;

  // Keep absorbing markers that start exactly where the previous one ended.
  for (;;) {
    if (cursor >= text.length) break;
    const following = extractMarker(text, cursor, open, close);
    if (following === null) break;
    parts.push(following.content);
    cursor = following.end;
  }

  return { content: parts.join(''), end: cursor };
}
358
+
359
/**
 * Normalize neighbouring CriticMarkup insertion/deletion markers in one
 * left-to-right pass (no regular expressions involved):
 *
 * 1. Back-to-back markers of the same kind are fused:
 *    `{++a++}{++b++}` becomes `{++ab++}`.
 * 2. A deletion run and an insertion run that touch (optionally separated
 *    only by spaces), in either order, become a substitution:
 *    `{--old--}{++new++}` becomes `{~~old~>new~~}`.
 *
 * In a substitution both sides are right-trimmed; a single space is appended
 * after the marker when either side originally ended with a space.
 *
 * @param text - Text containing `{++…++}` / `{--…--}` markers
 * @returns The text with adjacent markers merged
 */
function mergeAdjacentMarkers(text: string): string {
  const DELETION = { open: '{--', close: '--}' };
  const INSERTION = { open: '{++', close: '++}' };

  // Index of the first non-space character at or after `from`.
  const pastSpaces = (from: number): number => {
    let k = from;
    while (k < text.length && text[k] === ' ') k++;
    return k;
  };

  // Render a substitution marker from the removed and added text.
  const substitution = (removed: string, added: string): string => {
    const keepSpace = removed.endsWith(' ') || added.endsWith(' ');
    return `{~~${removed.trimEnd()}~>${added.trimEnd()}~~}${keepSpace ? ' ' : ''}`;
  };

  const pieces: string[] = [];
  let pos = 0;

  while (pos < text.length) {
    const startsDeletion = text.startsWith(DELETION.open, pos);
    const startsInsertion = !startsDeletion && text.startsWith(INSERTION.open, pos);

    // Ordinary character: copy through.
    if (!startsDeletion && !startsInsertion) {
      pieces.push(text.charAt(pos));
      pos += 1;
      continue;
    }

    const leadKind = startsDeletion ? DELETION : INSERTION;
    const followKind = startsDeletion ? INSERTION : DELETION;

    const lead = collectConsecutive(text, pos, leadKind.open, leadKind.close);
    if (lead === null) {
      // Opener without a closer: treat the opener's first character as plain text.
      pieces.push(text.charAt(pos));
      pos += 1;
      continue;
    }

    // Spaces between the two runs are skipped only for the look-ahead; they are
    // consumed solely when a substitution is actually produced.
    const follow = collectConsecutive(text, pastSpaces(lead.end), followKind.open, followKind.close);
    if (follow === null) {
      pieces.push(`${leadKind.open}${lead.content}${leadKind.close}`);
      pos = lead.end;
      continue;
    }

    // Output is always old → new, whichever run came first in the input.
    const removed = startsDeletion ? lead : follow;
    const added = startsDeletion ? follow : lead;
    pieces.push(substitution(removed.content, added.content));
    pos = follow.end;
  }

  return pieces.join('');
}
423
+
424
/**
 * Extract plain text from Word XML with track changes preserved as CriticMarkup.
 * This is a pandoc-free fallback that reads `word/document.xml` directly.
 *
 * Converts:
 *   `<w:ins> content </w:ins>` → `{++text++}`
 *   `<w:del> content </w:del>` → `{--text--}`
 * and then merges adjacent markers (see `mergeAdjacentMarkers`).
 *
 * Paragraphs whose style is `Heading1`–`Heading6` are emitted with a markdown
 * `#` prefix. Paragraphs are joined with a blank line; paragraphs with no
 * visible text are dropped.
 *
 * @param docxPath - Path to Word document
 * @returns Text with CriticMarkup, whether any tracked change was found, and
 *   counts of non-whitespace insertions and deletions
 * @throws {Error} If the file does not exist or contains no `word/document.xml`
 */
export async function extractPlainTextWithTrackChanges(docxPath: string): Promise<{
  text: string;
  hasTrackChanges: boolean;
  stats: { insertions: number; deletions: number };
}> {
  if (!fs.existsSync(docxPath)) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const zip = new AdmZip(docxPath);
  const docEntry = zip.getEntry('word/document.xml');

  if (!docEntry) {
    throw new Error('Invalid docx: no document.xml');
  }

  let xml = docEntry.getData().toString('utf8');
  let insertions = 0;
  let deletions = 0;

  // Use unique markers (null bytes) that won't appear in normal text
  const INS_S = '\x00IS\x00';
  const INS_E = '\x00IE\x00';
  const DEL_S = '\x00DS\x00';
  const DEL_E = '\x00DE\x00';

  // `<w:t>` must be followed by whitespace or `>` so that other elements whose
  // names merely start with "w:t" (w:tab, w:tbl, w:tc, ...) can never open a match.
  const runText = /<w:t(?:\s[^>]*)?>([^<]*)<\/w:t>/g;
  const deletedRunText = /<w:delText(?:\s[^>]*)?>([^<]*)<\/w:delText>|<w:t(?:\s[^>]*)?>([^<]*)<\/w:t>/g;

  // Concatenate the captured text of every match of `pattern` in `fragment`.
  const collectText = (fragment: string, pattern: RegExp): string => {
    let out = '';
    let m: RegExpExecArray | null;
    pattern.lastIndex = 0;
    while ((m = pattern.exec(fragment)) !== null) {
      out += m[1] ?? m[2] ?? '';
    }
    return out;
  };

  // Step 0: Drop self-closing revision markers (`<w:ins .../>`, `<w:del .../>`).
  // These empty elements carry no text, but they would otherwise match the
  // opening-tag patterns below and swallow everything up to the next real
  // closing tag.
  xml = xml.replace(/<w:(?:ins|del)\b[^>]*\/>/g, '');

  // Step 1: Replace <w:ins> with marker-wrapped text injected as <w:t>
  // Whitespace-only insertions are kept as plain text (not markers) to preserve spacing.
  xml = xml.replace(/<w:ins\b[^>]*>([\s\S]*?)<\/w:ins>/g, (_match, content: string) => {
    const text = collectText(content, runText);
    if (text.trim()) {
      insertions++;
      return `<w:r><w:t>${INS_S}${text}${INS_E}</w:t></w:r>`;
    }
    // Whitespace-only: preserve as plain text for spacing
    if (text.length > 0) {
      return `<w:r><w:t>${text}</w:t></w:r>`;
    }
    return '';
  });

  // Step 2: Replace <w:del> similarly (uses w:delText inside)
  // Whitespace-only deletions are kept as plain text to preserve spacing.
  xml = xml.replace(/<w:del\b[^>]*>([\s\S]*?)<\/w:del>/g, (_match, content: string) => {
    const text = collectText(content, deletedRunText);
    if (text.trim()) {
      deletions++;
      return `<w:r><w:t>${DEL_S}${text}${DEL_E}</w:t></w:r>`;
    }
    // Whitespace-only: preserve as plain text for spacing
    if (text.length > 0) {
      return `<w:r><w:t>${text}</w:t></w:r>`;
    }
    return '';
  });

  // Step 3: Extract text paragraph by paragraph
  const paragraphs: string[] = [];
  const paraPattern = /<w:p\b[^>]*>([\s\S]*?)<\/w:p>/g;
  let pm: RegExpExecArray | null;

  while ((pm = paraPattern.exec(xml)) !== null) {
    const paraXml = pm[1] ?? '';

    // Detect heading level from paragraph style
    let headingLevel = 0;
    const styleMatch = paraXml.match(/<w:pStyle\s+w:val="Heading(\d)"/i);
    if (styleMatch && styleMatch[1]) {
      headingLevel = parseInt(styleMatch[1], 10);
    }

    // Extract all <w:t> text in order
    let paraText = collectText(paraXml, runText);

    // Decode XML entities. `&amp;` must be decoded last: decoding it first
    // would turn an escaped entity such as `&amp;lt;` into `&lt;` and then
    // wrongly into `<`.
    paraText = paraText
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&quot;/g, '"')
      .replace(/&apos;/g, "'")
      .replace(/&amp;/g, '&');

    // Convert markers to CriticMarkup
    paraText = paraText
      .split(INS_S).join('{++')
      .split(INS_E).join('++}')
      .split(DEL_S).join('{--')
      .split(DEL_E).join('--}');

    // Merge adjacent del+ins (or ins+del) into substitutions.
    // Uses a scanner instead of regex to avoid backtracking across marker boundaries.
    paraText = mergeAdjacentMarkers(paraText);

    // Collapse runs of multiple spaces into single space
    paraText = paraText.replace(/ {2,}/g, ' ');

    if (paraText.trim()) {
      if (headingLevel > 0 && headingLevel <= 6) {
        paragraphs.push('#'.repeat(headingLevel) + ' ' + paraText.trim());
      } else {
        paragraphs.push(paraText);
      }
    }
  }

  return {
    text: paragraphs.join('\n\n'),
    hasTrackChanges: insertions > 0 || deletions > 0,
    stats: { insertions, deletions },
  };
}
567
+
353
568
  interface ExtractWithTrackChangesOptions {
354
569
  mediaDir?: string;
355
570
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "docrev",
3
- "version": "0.8.5",
3
+ "version": "0.9.3",
4
4
  "description": "Academic paper revision workflow: Word ↔ Markdown round-trips, DOI validation, reviewer comments",
5
5
  "type": "module",
6
6
  "types": "types/index.d.ts",
@@ -119,7 +119,6 @@
119
119
  "dictionary-en": "^4.0.0",
120
120
  "dictionary-en-gb": "^3.0.0",
121
121
  "diff": "^8.0.2",
122
- "mammoth": "^1.6.0",
123
122
  "mathml-to-latex": "^1.5.0",
124
123
  "nspell": "^2.1.5",
125
124
  "pdf-lib": "^1.17.1",
@@ -1,88 +0,0 @@
1
- """
2
- Apply buildup greying to PPTX slides.
3
- Greys out all bullet items except the last one in each content placeholder.
4
- """
5
- import zipfile
6
- import sys
7
- import re
8
- import os
9
-
10
- pptx_path = sys.argv[1]
11
- temp_path = pptx_path + '.tmp'
12
-
13
- GREY_COLOR = '888888'
14
-
15
- def apply_grey_to_content_placeholder(text):
16
- """Find content placeholder and grey all paragraphs except the last"""
17
- # Find content placeholder (idx="1") shape
18
- content_match = re.search(r'(<p:sp>.*?<p:ph\s+idx="1"[^>]*/>.*?<p:txBody>)(.*?)(</p:txBody></p:sp>)', text, re.DOTALL)
19
-
20
- if not content_match:
21
- return text
22
-
23
- before = content_match.group(1)
24
- body_content = content_match.group(2)
25
- after = content_match.group(3)
26
-
27
- # Find all paragraphs in the body
28
- para_pattern = r'(<a:p>.*?</a:p>)'
29
- paras = list(re.finditer(para_pattern, body_content, re.DOTALL))
30
-
31
- if len(paras) <= 1:
32
- return text # Nothing to grey if 0 or 1 paragraph
33
-
34
- # Grey out all but the last paragraph
35
- new_body = body_content
36
- offset = 0
37
-
38
- for match in paras[:-1]: # All but last
39
- start = match.start() + offset
40
- end = match.end() + offset
41
- para = match.group(0)
42
-
43
- # Add grey color to all <a:r> (run) elements
44
- def add_grey_to_run(run_match):
45
- run = run_match.group(0)
46
- # Find <a:rPr> and add solidFill
47
- if '<a:solidFill>' in run:
48
- # Replace existing color
49
- run = re.sub(r'<a:srgbClr val="[^"]*"/>', f'<a:srgbClr val="{GREY_COLOR}"/>', run)
50
- elif '<a:rPr />' in run:
51
- # Replace self-closing rPr with one that has color
52
- run = run.replace('<a:rPr />', f'<a:rPr><a:solidFill><a:srgbClr val="{GREY_COLOR}"/></a:solidFill></a:rPr>')
53
- elif '<a:rPr>' in run:
54
- # Add solidFill after opening rPr tag
55
- run = re.sub(r'(<a:rPr[^>]*>)', r'\1<a:solidFill><a:srgbClr val="' + GREY_COLOR + r'"/></a:solidFill>', run)
56
- elif '</a:rPr>' in run:
57
- # Insert before closing rPr
58
- run = run.replace('</a:rPr>', f'<a:solidFill><a:srgbClr val="{GREY_COLOR}"/></a:solidFill></a:rPr>')
59
- else:
60
- # No rPr at all, add it after <a:r>
61
- run = run.replace('<a:r>', f'<a:r><a:rPr><a:solidFill><a:srgbClr val="{GREY_COLOR}"/></a:solidFill></a:rPr>')
62
- return run
63
-
64
- new_para = re.sub(r'<a:r>.*?</a:r>', add_grey_to_run, para, flags=re.DOTALL)
65
-
66
- new_body = new_body[:start] + new_para + new_body[end:]
67
- offset += len(new_para) - len(para)
68
-
69
- # Reconstruct the full text
70
- full_start = content_match.start()
71
- full_end = content_match.end()
72
- return text[:full_start] + before + new_body + after + text[full_end:]
73
-
74
- with zipfile.ZipFile(pptx_path, 'r') as zin:
75
- with zipfile.ZipFile(temp_path, 'w') as zout:
76
- for item in zin.infolist():
77
- content = zin.read(item.filename)
78
-
79
- # Process slide XML files
80
- if item.filename.startswith('ppt/slides/slide') and item.filename.endswith('.xml'):
81
- text = content.decode('utf-8')
82
- text = apply_grey_to_content_placeholder(text)
83
- content = text.encode('utf-8')
84
-
85
- zout.writestr(item, content)
86
-
87
- os.replace(temp_path, pptx_path)
88
- print('Buildup colors applied')