docrev 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +2 -2
- package/README.md +35 -2
- package/bin/rev.js +696 -5
- package/lib/build.js +10 -2
- package/lib/crossref.js +138 -49
- package/lib/equations.js +235 -0
- package/lib/git.js +238 -0
- package/lib/journals.js +420 -0
- package/lib/merge.js +365 -0
- package/lib/trackchanges.js +273 -0
- package/lib/word.js +225 -0
- package/package.json +7 -5
package/lib/word.js
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Word document extraction utilities
|
|
3
|
+
* Handle reading text, comments, and anchors from .docx files
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import * as fs from 'fs';
|
|
7
|
+
import * as path from 'path';
|
|
8
|
+
import AdmZip from 'adm-zip';
|
|
9
|
+
import { parseString } from 'xml2js';
|
|
10
|
+
import { promisify } from 'util';
|
|
11
|
+
|
|
12
|
+
const parseXml = promisify(parseString);
|
|
13
|
+
|
|
14
|
+
/**
 * Extract reviewer comments from a Word document's `word/comments.xml` part.
 *
 * Within one paragraph the text of every `w:t` run is concatenated as-is.
 * Separate paragraphs of the same comment are joined with a single space so
 * that words on either side of a paragraph break do not run together.
 * Comments without an id, or without any text, are skipped.
 *
 * @param {string} docxPath - Path to the .docx file
 * @returns {Promise<Array<{id: string, author: string, date: string, text: string}>>}
 *   Empty array when the document has no comments part or no comments in it.
 *   `author` falls back to 'Unknown'; `date` is undefined when the attribute is absent.
 * @throws {Error} If `docxPath` does not exist
 */
export async function extractWordComments(docxPath) {
  if (!fs.existsSync(docxPath)) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const zip = new AdmZip(docxPath);
  const commentsEntry = zip.getEntry('word/comments.xml');

  if (!commentsEntry) {
    return []; // No comments in document
  }

  const parsed = await parseXml(zip.readAsText(commentsEntry));

  // Null-safe lookup: an empty or childless <w:comments/> yields no array here.
  const rawComments = parsed?.['w:comments']?.['w:comment'];
  if (!rawComments) {
    return [];
  }

  const comments = [];

  for (const comment of rawComments) {
    const id = comment.$?.['w:id'];
    const author = comment.$?.['w:author'] || 'Unknown';
    const date = comment.$?.['w:date'];

    // One string per paragraph. A text node is either a plain string or an
    // object carrying attributes (e.g. xml:space) with the text under `_`.
    const paragraphTexts = (comment['w:p'] || []).map((para) =>
      (para['w:r'] || [])
        .flatMap((run) => run['w:t'] || [])
        .map((t) => (typeof t === 'string' ? t : (t._ || '')))
        .join('')
        .trim()
    );

    // Drop empty paragraphs (e.g. ones holding only the annotation marker)
    // and keep a word boundary between the remaining ones.
    const text = paragraphTexts.filter((paragraph) => paragraph !== '').join(' ');

    if (id && text) {
      comments.push({
        id,
        author,
        date,
        text,
      });
    }
  }

  return comments;
}
|
|
71
|
+
|
|
72
|
+
/**
 * Concatenate the text of every `<w:t>` element in an XML fragment, decoding
 * the five predefined XML entities and numeric character references so the
 * result is plain text rather than XML-escaped text.
 * @param {string} xmlFragment
 * @returns {string}
 */
function collectRunText(xmlFragment) {
  const namedEntities = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'" };
  let text = '';

  // `(?:\s[^>]*)?` keeps the match to <w:t> itself, not <w:tab/>, <w:tbl>, ...
  for (const [, raw] of xmlFragment.matchAll(/<w:t(?:\s[^>]*)?>([^<]*)<\/w:t>/g)) {
    // Single pass, so "&amp;lt;" decodes to "&lt;" and not to "<".
    text += raw.replace(
      /&(?:#(\d+)|#x([0-9a-fA-F]+)|(amp|lt|gt|quot|apos));/g,
      (whole, dec, hex, named) => {
        if (named) return namedEntities[named];
        const codePoint = dec ? Number.parseInt(dec, 10) : Number.parseInt(hex, 16);
        // Leave out-of-range references untouched instead of throwing.
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
      }
    );
  }

  return text;
}

/**
 * Extract comment anchors (where comments are attached) from document.xml.
 *
 * For each comment id that has both a commentRangeStart and a commentRangeEnd
 * marker, returns the text between the markers plus a short piece of the text
 * that precedes the start marker.
 *
 * @param {string} docxPath - Path to the .docx file
 * @returns {Promise<Map<string, {text: string, context: string}>>}
 *   Map from comment id to its anchored `text` (trimmed) and `context`
 *   (at most the last 100 characters of text found in the 500 XML characters
 *   before the anchor).
 * @throws {Error} If `docxPath` does not exist or the archive has no word/document.xml
 */
export async function extractCommentAnchors(docxPath) {
  if (!fs.existsSync(docxPath)) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const zip = new AdmZip(docxPath);
  const documentEntry = zip.getEntry('word/document.xml');

  if (!documentEntry) {
    throw new Error('Invalid docx: no document.xml');
  }

  const documentXml = zip.readAsText(documentEntry);

  // Map each comment id to the offset of its marker; a later duplicate wins.
  const indexMarkers = (pattern) => {
    const positions = new Map();
    for (const marker of documentXml.matchAll(pattern)) {
      positions.set(marker[1], marker.index);
    }
    return positions;
  };

  // Tolerate extra attributes and whitespace before the closing "/>".
  const starts = indexMarkers(/<w:commentRangeStart\b[^>]*?\sw:id="(\d+)"[^>]*>/g);
  const ends = indexMarkers(/<w:commentRangeEnd\b[^>]*?\sw:id="(\d+)"[^>]*>/g);

  const anchors = new Map();

  for (const [id, startPos] of starts) {
    const endPos = ends.get(id);
    if (endPos === undefined) continue; // start marker without a matching end

    // The text between the markers is what the comment is anchored to.
    const text = collectRunText(documentXml.slice(startPos, endPos));

    // Surrounding context: text found in the XML just before the anchor.
    const contextStart = Math.max(0, startPos - 500);
    const context = collectRunText(documentXml.slice(contextStart, startPos));

    anchors.set(id, {
      text: text.trim(),
      context: context.slice(-100), // Last 100 chars of context
    });
  }

  return anchors;
}
|
|
137
|
+
|
|
138
|
+
/**
 * Read a Word document and return its raw text as produced by mammoth's
 * `extractRawText`. mammoth is loaded on demand via dynamic import.
 * @param {string} docxPath - Path to the .docx file
 * @returns {Promise<string>} The `value` of mammoth's raw-text result
 * @throws {Error} If `docxPath` does not exist
 */
export async function extractTextFromWord(docxPath) {
  const fileExists = fs.existsSync(docxPath);
  if (!fileExists) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const mammothLib = await import('mammoth');
  const { value } = await mammothLib.extractRawText({ path: docxPath });

  return value;
}
|
|
152
|
+
|
|
153
|
+
/**
 * Read a Word document twice through mammoth — once as raw text and once as
 * HTML — and return both results. The two conversions run concurrently.
 * @param {string} docxPath - Path to the .docx file
 * @returns {Promise<{text: string, html: string}>}
 * @throws {Error} If `docxPath` does not exist
 */
export async function extractFromWord(docxPath) {
  const fileExists = fs.existsSync(docxPath);
  if (!fileExists) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const mammothLib = await import('mammoth');

  // Start both conversions before awaiting either one.
  const rawTextJob = mammothLib.extractRawText({ path: docxPath });
  const htmlJob = mammothLib.convertToHtml({ path: docxPath });
  const [{ value: text }, { value: html }] = await Promise.all([rawTextJob, htmlJob]);

  return { text, html };
}
|
|
175
|
+
|
|
176
|
+
/**
 * Get document metadata from a Word file's `docProps/core.xml` part.
 *
 * Reads the first `dc:title`, `dc:creator`, `dcterms:created` and
 * `dcterms:modified` elements. Values are returned as plain text: XML entities
 * and numeric character references are decoded. Fields whose element is absent
 * are omitted from the result.
 *
 * @param {string} docxPath - Path to the .docx file
 * @returns {Promise<{title?: string, author?: string, created?: string, modified?: string}>}
 *   Empty object when the archive has no docProps/core.xml.
 * @throws {Error} If `docxPath` does not exist
 */
export async function getWordMetadata(docxPath) {
  if (!fs.existsSync(docxPath)) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const zip = new AdmZip(docxPath);
  const coreEntry = zip.getEntry('docProps/core.xml');

  if (!coreEntry) {
    return {};
  }

  const coreXml = zip.readAsText(coreEntry);

  const namedEntities = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'" };
  // Single pass, so "&amp;lt;" decodes to "&lt;" and not to "<".
  const decodeEntities = (value) =>
    value.replace(
      /&(?:#(\d+)|#x([0-9a-fA-F]+)|(amp|lt|gt|quot|apos));/g,
      (whole, dec, hex, named) => {
        if (named) return namedEntities[named];
        const codePoint = dec ? Number.parseInt(dec, 10) : Number.parseInt(hex, 16);
        // Leave out-of-range references untouched instead of throwing.
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
      }
    );

  // Result key -> element name in core.xml.
  const fields = {
    title: 'dc:title',
    author: 'dc:creator',
    created: 'dcterms:created',
    modified: 'dcterms:modified',
  };

  const metadata = {};

  for (const [key, tag] of Object.entries(fields)) {
    // Every element may carry attributes (e.g. xsi:type on the dates).
    const match = coreXml.match(new RegExp(`<${tag}(?:\\s[^>]*)?>([^<]*)</${tag}>`));
    if (match) {
      metadata[key] = decodeEntities(match[1]);
    }
  }

  return metadata;
}
|
|
209
|
+
|
|
210
|
+
/**
 * Check if a file is a valid Word document.
 *
 * A file qualifies when it exists, its name ends in ".docx" (case-insensitive)
 * and it opens as a zip archive containing `word/document.xml`. Any error while
 * opening the archive is deliberately treated as "not a Word document".
 *
 * @param {string} filePath
 * @returns {boolean}
 */
export function isWordDocument(filePath) {
  // Existence is checked first so a non-string path yields false, not a TypeError.
  if (!fs.existsSync(filePath)) return false;
  if (!filePath.toLowerCase().endsWith('.docx')) return false;

  try {
    const zip = new AdmZip(filePath);
    // Treat both null and undefined as "entry missing".
    return Boolean(zip.getEntry('word/document.xml'));
  } catch {
    return false;
  }
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "docrev",
|
|
3
|
-
"version": "0.2.0",
|
|
3
|
+
"version": "0.3.0",
|
|
4
4
|
"description": "Academic paper revision workflow: Word ↔ Markdown round-trips, DOI validation, reviewer comments",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -8,16 +8,17 @@
|
|
|
8
8
|
},
|
|
9
9
|
"scripts": {
|
|
10
10
|
"build": "echo 'No build needed'",
|
|
11
|
-
"test": "node
|
|
11
|
+
"test": "node --test test/*.test.js",
|
|
12
|
+
"test:watch": "node --test --watch test/*.test.js"
|
|
12
13
|
},
|
|
13
14
|
"repository": {
|
|
14
15
|
"type": "git",
|
|
15
|
-
"url": "git+https://github.com/gcol33/
|
|
16
|
+
"url": "git+https://github.com/gcol33/docrev.git"
|
|
16
17
|
},
|
|
17
18
|
"bugs": {
|
|
18
|
-
"url": "https://github.com/gcol33/
|
|
19
|
+
"url": "https://github.com/gcol33/docrev/issues"
|
|
19
20
|
},
|
|
20
|
-
"homepage": "https://github.com/gcol33/
|
|
21
|
+
"homepage": "https://github.com/gcol33/docrev#readme",
|
|
21
22
|
"keywords": [
|
|
22
23
|
"markdown",
|
|
23
24
|
"word",
|
|
@@ -38,6 +39,7 @@
|
|
|
38
39
|
"diff": "^8.0.2",
|
|
39
40
|
"js-yaml": "^4.1.1",
|
|
40
41
|
"mammoth": "^1.6.0",
|
|
42
|
+
"mathml-to-latex": "^1.5.0",
|
|
41
43
|
"xml2js": "^0.6.2"
|
|
42
44
|
}
|
|
43
45
|
}
|