docrev 0.2.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,173 @@
1
+ /**
2
+ * Template variable substitution for rev
3
+ *
4
+ * Supported variables:
5
+ * {{date}} - Current date (YYYY-MM-DD)
6
+ * {{date:format}} - Custom date format (e.g., {{date:MMMM D, YYYY}})
7
+ * {{version}} - Version from rev.yaml
8
+ * {{word_count}} - Total word count
9
+ * {{author}} - First author name
10
+ * {{authors}} - All authors (comma-separated)
11
+ * {{title}} - Document title
12
+ * {{year}} - Current year
13
+ */
14
+
15
+ import * as fs from 'fs';
16
+
17
/**
 * Format a date using a simple token pattern.
 *
 * Supported tokens: YYYY (year), MMMM (full month name), MMM (short month
 * name), MM (zero-padded month number), DD (zero-padded day of month) and
 * D (unpadded day of month, only when it stands alone as a word).
 * Every occurrence of a token is substituted, not just the first one.
 *
 * @param {Date} date - Date to format (local time getters are used)
 * @param {string} format - Pattern (YYYY, MM, DD, MMMM, MMM, D)
 * @returns {string} The pattern with all tokens replaced
 */
function formatDate(date, format = 'YYYY-MM-DD') {
  const months = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December'
  ];
  const monthsShort = [
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'
  ];

  const year = date.getFullYear();
  const month = date.getMonth();
  const day = date.getDate();

  // Phase 1 swaps tokens for NUL-delimited placeholders; phase 2 swaps the
  // placeholders for values. This keeps inserted text (month names, digits)
  // from ever being rescanned as a token. Longer tokens go first so that
  // MMMM is not consumed as MMM + M, and DD is not consumed by D.
  return format
    .replaceAll('YYYY', '\x00YEAR\x00')
    .replaceAll('MMMM', '\x00MONTHFULL\x00')
    .replaceAll('MMM', '\x00MONTHSHORT\x00')
    .replaceAll('MM', '\x00MONTHNUM\x00')
    .replaceAll('DD', '\x00DAYPAD\x00')
    .replace(/\bD\b/g, '\x00DAY\x00')
    .replaceAll('\x00YEAR\x00', year.toString())
    .replaceAll('\x00MONTHFULL\x00', months[month])
    .replaceAll('\x00MONTHSHORT\x00', monthsShort[month])
    .replaceAll('\x00MONTHNUM\x00', (month + 1).toString().padStart(2, '0'))
    .replaceAll('\x00DAYPAD\x00', day.toString().padStart(2, '0'))
    .replaceAll('\x00DAY\x00', day.toString());
}
52
+
53
/**
 * Count words in text (excluding markdown syntax).
 *
 * Markup is stripped with a sequence of regex passes, then the remainder is
 * split on whitespace. Fenced code blocks are removed before inline
 * formatting characters; otherwise the backtick-stripping pass would destroy
 * the fences and the code inside them would be counted as prose.
 *
 * @param {string} text - Markdown source
 * @returns {number} Number of whitespace-separated words left after stripping
 */
function countWords(text) {
  return text
    .replace(/^---[\s\S]*?---/m, '')          // Remove frontmatter
    .replace(/```[\s\S]*?```/g, '')           // Remove code blocks (must precede backtick stripping)
    .replace(/!\[.*?\]\(.*?\)/g, '')          // Remove images
    .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')  // Keep link text
    .replace(/#+\s*/g, '')                    // Remove headers
    .replace(/\*\*|__|[*_`]/g, '')            // Remove formatting
    .replace(/\{[^}]+\}/g, '')                // Remove annotations
    .replace(/@\w+:\w+/g, '')                 // Remove refs
    .replace(/@\w+/g, '')                     // Remove citations
    .replace(/\|[^|]+\|/g, ' ')               // Remove tables
    .replace(/\n+/g, ' ')
    .trim()
    .split(/\s+/)
    .filter(w => w.length > 0).length;
}
75
+
76
/**
 * Get the first author's name.
 *
 * Accepts an array of authors, a single author string, or a single author
 * object. Each author may be a plain string or an object with a `name`
 * property. Missing, empty or malformed input (including a null first
 * entry) yields an empty string instead of throwing.
 *
 * @param {Array|string} authors
 * @returns {string} The first author's name, or '' when none is available
 */
function getFirstAuthor(authors) {
  if (!authors || authors.length === 0) return '';

  const first = Array.isArray(authors) ? authors[0] : authors;

  if (typeof first === 'string') return first;

  // Optional chaining guards against a null/undefined first entry.
  return first?.name || '';
}
91
+
92
/**
 * Get all author names as a comma-separated string.
 *
 * Accepts an array of authors, a single author string (returned unchanged),
 * or a single author object. Array entries may be strings or objects with a
 * `name` property; entries without a usable name (including null) are
 * skipped.
 *
 * @param {Array|string} authors
 * @returns {string} Names joined with ', ', or '' when there are none
 */
function getAllAuthors(authors) {
  if (!authors) return '';
  if (typeof authors === 'string') return authors;

  // A lone author object is treated as a one-element list, matching what
  // getFirstAuthor accepts.
  const list = Array.isArray(authors) ? authors : [authors];

  return list
    .map(a => (typeof a === 'string' ? a : a?.name))
    .filter(Boolean)
    .join(', ');
}
106
+
107
/**
 * Process template variables in text.
 *
 * Replaces {{date}}, {{date:format}}, {{year}}, {{version}}, {{title}},
 * {{author}}, {{authors}} and {{word_count}} placeholders. Substituted
 * values are inserted verbatim: characters such as `$&` or `$$` in a title
 * or author name are NOT interpreted as replacement patterns.
 *
 * @param {string} text - Text with {{variable}} placeholders
 * @param {object} config - rev.yaml config (version, title, authors are read)
 * @param {object} options - Additional options
 * @param {string[]} options.sectionContents - Section file contents for word count
 * @returns {string} Text with variables replaced
 */
export function processVariables(text, config = {}, options = {}) {
  const now = new Date();

  // Calculate word count from sections if provided
  let wordCount = 0;
  if (options.sectionContents) {
    for (const content of options.sectionContents) {
      wordCount += countWords(content);
    }
  }

  const today = formatDate(now);
  const year = now.getFullYear().toString();
  // `??` keeps legitimate falsy values such as a numeric version 0.
  const version = String(config.version ?? '');
  const title = String(config.title ?? '');
  const firstAuthor = getFirstAuthor(config.authors);
  const allAuthors = getAllAuthors(config.authors);
  const formattedWordCount = wordCount.toLocaleString();

  // Function replacers are used throughout so that `$` sequences in the
  // substituted values are treated as literal text.
  return text
    .replace(/\{\{date\}\}/g, () => today)
    .replace(/\{\{date:([^}]+)\}\}/g, (match, format) => formatDate(now, format))
    .replace(/\{\{year\}\}/g, () => year)
    .replace(/\{\{version\}\}/g, () => version)
    .replace(/\{\{title\}\}/g, () => title)
    .replace(/\{\{author\}\}/g, () => firstAuthor)
    .replace(/\{\{authors\}\}/g, () => allAuthors)
    .replace(/\{\{word_count\}\}/g, () => formattedWordCount);
}
155
+
156
/**
 * Report whether the text contains at least one {{...}} placeholder.
 * @param {string} text
 * @returns {boolean} true when a non-empty {{...}} placeholder is present
 */
export function hasVariables(text) {
  const placeholder = /\{\{[^}]+\}\}/;
  return placeholder.exec(text) !== null;
}
164
+
165
/**
 * Collect the distinct variable expressions used in the text.
 * Each entry is the content between `{{` and `}}`, in order of first
 * appearance.
 * @param {string} text
 * @returns {string[]}
 */
export function findVariables(text) {
  const seen = new Set();
  for (const found of text.matchAll(/\{\{([^}]+)\}\}/g)) {
    seen.add(found[1]);
  }
  return Array.from(seen);
}
package/lib/word.js ADDED
@@ -0,0 +1,225 @@
1
+ /**
2
+ * Word document extraction utilities
3
+ * Handle reading text, comments, and anchors from .docx files
4
+ */
5
+
6
+ import * as fs from 'fs';
7
+ import * as path from 'path';
8
+ import AdmZip from 'adm-zip';
9
+ import { parseString } from 'xml2js';
10
+ import { promisify } from 'util';
11
+
12
+ const parseXml = promisify(parseString);
13
+
14
/**
 * Read reviewer comments out of a .docx file's word/comments.xml part.
 *
 * Comments lacking an id, or whose text is empty after trimming, are
 * omitted. A document without a comments part yields an empty array.
 *
 * @param {string} docxPath
 * @returns {Promise<Array<{id: string, author: string, date: string, text: string}>>}
 * @throws {Error} When docxPath does not exist
 */
export async function extractWordComments(docxPath) {
  if (!fs.existsSync(docxPath)) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const archive = new AdmZip(docxPath);
  const entry = archive.getEntry('word/comments.xml');
  if (!entry) {
    return []; // No comments in document
  }

  const parsed = await parseXml(archive.readAsText(entry));
  const rawComments = parsed['w:comments']?.['w:comment'];
  if (!rawComments) {
    return [];
  }

  const collected = [];
  for (const raw of rawComments) {
    const attrs = raw.$;

    // Flatten paragraphs -> runs -> text nodes and concatenate them in order.
    const body = (raw['w:p'] || [])
      .flatMap((para) => para['w:r'] || [])
      .flatMap((run) => run['w:t'] || [])
      .map((node) => (typeof node === 'string' ? node : (node._ || '')))
      .join('')
      .trim();

    const id = attrs?.['w:id'];
    if (!id || !body) continue;

    collected.push({
      id,
      author: attrs?.['w:author'] || 'Unknown',
      date: attrs?.['w:date'],
      text: body,
    });
  }

  return collected;
}
71
+
72
/**
 * Extract comment anchors (where comments are attached) from document.xml.
 *
 * Pairs each <w:commentRangeStart> with the <w:commentRangeEnd> of the same
 * id and returns the text between them, plus a short stretch of the text
 * that precedes the anchor.
 *
 * NOTE(review): text is taken from the raw XML with a regex, so XML entities
 * (e.g. &amp;) appear undecoded in the result — confirm that callers expect
 * this.
 *
 * @param {string} docxPath
 * @returns {Promise<Map<string, {text: string, context: string}>>}
 *   Map from comment id to its anchored text and up to 100 characters of
 *   preceding context
 * @throws {Error} When docxPath does not exist or has no word/document.xml
 */
export async function extractCommentAnchors(docxPath) {
  // Same guard and message as the other extractors in this module.
  if (!fs.existsSync(docxPath)) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const zip = new AdmZip(docxPath);
  const documentEntry = zip.getEntry('word/document.xml');

  if (!documentEntry) {
    throw new Error('Invalid docx: no document.xml');
  }

  const documentXml = zip.readAsText(documentEntry);

  // Map each marker's comment id to its offset in the XML. A repeated id
  // keeps the last offset seen.
  const indexMarkers = (pattern) => {
    const positions = new Map();
    for (const match of documentXml.matchAll(pattern)) {
      positions.set(match[1], match.index);
    }
    return positions;
  };

  // Concatenate the contents of every <w:t> element in an XML fragment.
  const collectText = (xml) => {
    let text = '';
    for (const match of xml.matchAll(/<w:t[^>]*>([^<]*)<\/w:t>/g)) {
      text += match[1];
    }
    return text;
  };

  const starts = indexMarkers(/<w:commentRangeStart w:id="(\d+)"\/>/g);
  const ends = indexMarkers(/<w:commentRangeEnd w:id="(\d+)"\/>/g);

  const anchors = new Map();

  for (const [id, startPos] of starts) {
    const endPos = ends.get(id);
    // Skip starts with no matching end; test for absence, not falsiness.
    if (endPos === undefined) continue;

    const text = collectText(documentXml.slice(startPos, endPos));

    // Surrounding context: text found in the 500 characters of XML before the anchor
    const contextStart = Math.max(0, startPos - 500);
    const context = collectText(documentXml.slice(contextStart, startPos));

    anchors.set(id, {
      text: text.trim(),
      context: context.slice(-100), // Last 100 chars of context
    });
  }

  return anchors;
}
137
+
138
/**
 * Return the raw text of a Word document, as produced by mammoth.
 * mammoth is loaded lazily, on first use.
 * @param {string} docxPath
 * @returns {Promise<string>}
 * @throws {Error} When docxPath does not exist
 */
export async function extractTextFromWord(docxPath) {
  const exists = fs.existsSync(docxPath);
  if (!exists) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const mammoth = await import('mammoth');
  const { value } = await mammoth.extractRawText({ path: docxPath });
  return value;
}
152
+
153
/**
 * Extract both the raw text and an HTML rendering of a Word document.
 * The two mammoth conversions are started together and awaited in parallel.
 * @param {string} docxPath
 * @returns {Promise<{text: string, html: string}>}
 * @throws {Error} When docxPath does not exist
 */
export async function extractFromWord(docxPath) {
  const exists = fs.existsSync(docxPath);
  if (!exists) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const mammoth = await import('mammoth');

  const pendingText = mammoth.extractRawText({ path: docxPath });
  const pendingHtml = mammoth.convertToHtml({ path: docxPath });
  const [text, html] = (await Promise.all([pendingText, pendingHtml]))
    .map((conversion) => conversion.value);

  return { text, html };
}
175
+
176
/**
 * Read basic metadata from a Word file's docProps/core.xml part.
 * Only fields actually present in the XML appear on the result; a document
 * without a core properties part yields an empty object.
 * @param {string} docxPath
 * @returns {Promise<{title?: string, author?: string, created?: string, modified?: string}>}
 */
export async function getWordMetadata(docxPath) {
  const archive = new AdmZip(docxPath);
  const entry = archive.getEntry('docProps/core.xml');
  if (!entry) {
    return {};
  }

  const xml = archive.readAsText(entry);

  // [result key, pattern capturing the element's text content]
  const fieldPatterns = [
    ['title', /<dc:title>([^<]*)<\/dc:title>/],
    ['author', /<dc:creator>([^<]*)<\/dc:creator>/],
    ['created', /<dcterms:created[^>]*>([^<]*)<\/dcterms:created>/],
    ['modified', /<dcterms:modified[^>]*>([^<]*)<\/dcterms:modified>/],
  ];

  const metadata = {};
  fieldPatterns.forEach(([field, pattern]) => {
    const found = pattern.exec(xml);
    if (found !== null) {
      metadata[field] = found[1];
    }
  });

  return metadata;
}
209
+
210
/**
 * Decide whether a path points at a usable Word document: the file must
 * exist, carry a .docx extension (case-insensitive) and open as a zip
 * archive containing word/document.xml.
 * @param {string} filePath
 * @returns {boolean}
 */
export function isWordDocument(filePath) {
  const looksLikeDocx =
    fs.existsSync(filePath) && filePath.toLowerCase().endsWith('.docx');
  if (!looksLikeDocx) {
    return false;
  }

  try {
    const archive = new AdmZip(filePath);
    const mainPart = archive.getEntry('word/document.xml');
    return mainPart !== null;
  } catch {
    // An unreadable or corrupt archive is simply "not a Word document".
    return false;
  }
}
package/package.json CHANGED
@@ -1,14 +1,86 @@
1
1
  {
2
2
  "name": "docrev",
3
- "version": "0.2.1",
3
+ "version": "0.5.0",
4
4
  "description": "Academic paper revision workflow: Word ↔ Markdown round-trips, DOI validation, reviewer comments",
5
5
  "type": "module",
6
+ "types": "types/index.d.ts",
7
+ "exports": {
8
+ ".": {
9
+ "types": "./types/index.d.ts",
10
+ "import": "./lib/annotations.js"
11
+ },
12
+ "./annotations": {
13
+ "types": "./types/index.d.ts",
14
+ "import": "./lib/annotations.js"
15
+ },
16
+ "./build": {
17
+ "types": "./types/index.d.ts",
18
+ "import": "./lib/build.js"
19
+ },
20
+ "./citations": {
21
+ "types": "./types/index.d.ts",
22
+ "import": "./lib/citations.js"
23
+ },
24
+ "./crossref": {
25
+ "types": "./types/index.d.ts",
26
+ "import": "./lib/crossref.js"
27
+ },
28
+ "./doi": {
29
+ "types": "./types/index.d.ts",
30
+ "import": "./lib/doi.js"
31
+ },
32
+ "./equations": {
33
+ "types": "./types/index.d.ts",
34
+ "import": "./lib/equations.js"
35
+ },
36
+ "./git": {
37
+ "types": "./types/index.d.ts",
38
+ "import": "./lib/git.js"
39
+ },
40
+ "./journals": {
41
+ "types": "./types/index.d.ts",
42
+ "import": "./lib/journals.js"
43
+ },
44
+ "./merge": {
45
+ "types": "./types/index.d.ts",
46
+ "import": "./lib/merge.js"
47
+ },
48
+ "./sections": {
49
+ "types": "./types/index.d.ts",
50
+ "import": "./lib/sections.js"
51
+ },
52
+ "./word": {
53
+ "types": "./types/index.d.ts",
54
+ "import": "./lib/word.js"
55
+ },
56
+ "./variables": {
57
+ "types": "./types/index.d.ts",
58
+ "import": "./lib/variables.js"
59
+ },
60
+ "./grammar": {
61
+ "types": "./types/index.d.ts",
62
+ "import": "./lib/grammar.js"
63
+ },
64
+ "./trackchanges": {
65
+ "types": "./types/index.d.ts",
66
+ "import": "./lib/trackchanges.js"
67
+ },
68
+ "./spelling": {
69
+ "types": "./types/index.d.ts",
70
+ "import": "./lib/spelling.js"
71
+ }
72
+ },
73
+ "engines": {
74
+ "node": ">=18.0.0"
75
+ },
6
76
  "bin": {
7
77
  "rev": "bin/rev.js"
8
78
  },
9
79
  "scripts": {
10
80
  "build": "echo 'No build needed'",
11
- "test": "node bin/rev.js --help"
81
+ "test": "node --test test/*.test.js",
82
+ "test:watch": "node --test --watch test/*.test.js",
83
+ "test:coverage": "c8 --reporter=text --reporter=lcov node --test test/*.test.js"
12
84
  },
13
85
  "repository": {
14
86
  "type": "git",
@@ -35,10 +107,16 @@
35
107
  "adm-zip": "^0.5.16",
36
108
  "chalk": "^5.3.0",
37
109
  "commander": "^12.0.0",
110
+ "dictionary-en": "^4.0.0",
111
+ "dictionary-en-gb": "^3.0.0",
38
112
  "diff": "^8.0.2",
39
113
  "js-yaml": "^4.1.1",
40
114
  "mammoth": "^1.6.0",
41
115
  "mathml-to-latex": "^1.5.0",
116
+ "nspell": "^2.1.5",
42
117
  "xml2js": "^0.6.2"
118
+ },
119
+ "devDependencies": {
120
+ "c8": "^10.1.2"
43
121
  }
44
122
  }