docrev 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/word.js ADDED
@@ -0,0 +1,225 @@
1
+ /**
2
+ * Word document extraction utilities
3
+ * Handle reading text, comments, and anchors from .docx files
4
+ */
5
+
6
+ import * as fs from 'fs';
7
+ import * as path from 'path';
8
+ import AdmZip from 'adm-zip';
9
+ import { parseString } from 'xml2js';
10
+ import { promisify } from 'util';
11
+
12
+ const parseXml = promisify(parseString);
13
+
/**
 * Extract reviewer comments from a Word document's `word/comments.xml` part.
 *
 * Comments that have no `w:id` attribute or no text are skipped. When a
 * comment spans several paragraphs, the non-empty paragraphs are joined with
 * a single space so that words on either side of a paragraph break do not
 * run together.
 *
 * @param {string} docxPath - Path to the .docx file
 * @returns {Promise<Array<{id: string, author: string, date: string, text: string}>>}
 *   One entry per non-empty comment; `[]` when the document has no comments part.
 * @throws {Error} If `docxPath` does not exist
 */
export async function extractWordComments(docxPath) {
  if (!fs.existsSync(docxPath)) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const zip = new AdmZip(docxPath);
  const commentsEntry = zip.getEntry('word/comments.xml');
  if (!commentsEntry) {
    return []; // No comments in document
  }

  const parsed = await parseXml(zip.readAsText(commentsEntry));

  // Optional chaining also covers a null parse result and an empty
  // <w:comments/> element, both of which mean "no comments".
  const rawComments = parsed?.['w:comments']?.['w:comment'];
  if (!rawComments) {
    return [];
  }

  // Concatenate every w:t inside the direct w:r children of one paragraph.
  // A w:t is either a plain string or an object whose text is under `_`
  // (the case when the element carries attributes).
  const paragraphText = (para) =>
    (para['w:r'] ?? [])
      .flatMap((run) => run['w:t'] ?? [])
      .map((t) => (typeof t === 'string' ? t : (t._ ?? '')))
      .join('');

  const comments = [];
  for (const comment of rawComments) {
    const attrs = comment.$ ?? {};
    const id = attrs['w:id'];
    // `||` is intentional: an empty author attribute also falls back.
    const author = attrs['w:author'] || 'Unknown';
    const date = attrs['w:date'];

    const text = (comment['w:p'] ?? [])
      .map(paragraphText)
      .filter((part) => part !== '')
      .join(' ')
      .trim();

    if (id && text) {
      comments.push({ id, author, date, text });
    }
  }

  return comments;
}
71
+
/**
 * Extract comment anchors (the text each comment is attached to) from
 * `word/document.xml`.
 *
 * For every `w:commentRangeStart` that has a `w:commentRangeEnd` with the
 * same id, the `w:t` text between the two markers becomes the anchor text,
 * and the `w:t` text found in the 500 XML characters before the start marker
 * (cut to its last 100 characters) becomes the context. XML entities in both
 * values are decoded.
 *
 * @param {string} docxPath - Path to the .docx file
 * @returns {Promise<Map<string, {text: string, context: string}>>}
 *   Map from comment id to its anchor text and preceding context.
 * @throws {Error} If `docxPath` does not exist or the archive has no
 *   `word/document.xml`
 */
export async function extractCommentAnchors(docxPath) {
  if (!fs.existsSync(docxPath)) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const zip = new AdmZip(docxPath);
  const documentEntry = zip.getEntry('word/document.xml');
  if (!documentEntry) {
    throw new Error('Invalid docx: no document.xml');
  }

  const documentXml = zip.readAsText(documentEntry);

  const CONTEXT_XML_WINDOW = 500; // XML characters scanned before the anchor
  const CONTEXT_CHARS = 100; // characters of decoded context that are kept

  // Decode the five predefined XML entities and numeric character references
  // in one pass, so that "&amp;lt;" becomes "&lt;" and not "<".
  const namedEntities = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'" };
  const decodeEntities = (value) =>
    value.replace(/&(#x[0-9a-fA-F]+|#\d+|amp|lt|gt|quot|apos);/g, (whole, body) => {
      if (body[0] !== '#') {
        return namedEntities[body];
      }
      const codePoint =
        body[1] === 'x'
          ? Number.parseInt(body.slice(2), 16)
          : Number.parseInt(body.slice(1), 10);
      // Leave out-of-range references untouched instead of throwing.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
    });

  // Concatenate the content of every <w:t> element in an XML fragment.
  // The `(?:\s[^>]*)?` part keeps other elements whose name merely starts
  // with "w:t" from being treated as text runs.
  const collectText = (xml) => {
    let collected = '';
    for (const found of xml.matchAll(/<w:t(?:\s[^>]*)?>([^<]*)<\/w:t>/g)) {
      collected += found[1];
    }
    return decodeEntities(collected);
  };

  // Map each comment id to the offset of its marker element. The w:id
  // attribute is accepted anywhere inside the self-closing tag.
  const markerOffsets = (tagName) => {
    const pattern = new RegExp(`<w:${tagName}\\b[^>]*\\bw:id="(\\d+)"[^>]*/>`, 'g');
    const offsets = new Map();
    for (const found of documentXml.matchAll(pattern)) {
      offsets.set(found[1], found.index);
    }
    return offsets;
  };

  const starts = markerOffsets('commentRangeStart');
  const ends = markerOffsets('commentRangeEnd');

  const anchors = new Map();
  for (const [id, startPos] of starts) {
    const endPos = ends.get(id);
    // Compare against undefined: a falsy test would also reject offset 0.
    if (endPos === undefined) continue;

    const text = collectText(documentXml.slice(startPos, endPos)).trim();

    const contextStart = Math.max(0, startPos - CONTEXT_XML_WINDOW);
    const context = collectText(documentXml.slice(contextStart, startPos)).slice(-CONTEXT_CHARS);

    anchors.set(id, { text, context });
  }

  return anchors;
}
137
+
/**
 * Read a Word document and return its raw text via mammoth.
 *
 * mammoth is loaded with a dynamic import only after the path has been
 * checked, so a missing file fails before the library is loaded.
 *
 * @param {string} docxPath - Path to the .docx file
 * @returns {Promise<string>} The `value` of mammoth's raw-text result
 * @throws {Error} If `docxPath` does not exist
 */
export async function extractTextFromWord(docxPath) {
  const isPresent = fs.existsSync(docxPath);
  if (isPresent === false) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const mammoth = await import('mammoth');
  const { value } = await mammoth.extractRawText({ path: docxPath });
  return value;
}
152
+
/**
 * Read a Word document and return both its raw text and an HTML rendering.
 *
 * Both mammoth conversions are started before either is awaited, so they
 * run concurrently.
 *
 * @param {string} docxPath - Path to the .docx file
 * @returns {Promise<{text: string, html: string}>} The `value` of mammoth's
 *   raw-text result and of its HTML result
 * @throws {Error} If `docxPath` does not exist
 */
export async function extractFromWord(docxPath) {
  const isPresent = fs.existsSync(docxPath);
  if (isPresent === false) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const mammoth = await import('mammoth');

  const rawTextJob = mammoth.extractRawText({ path: docxPath });
  const htmlJob = mammoth.convertToHtml({ path: docxPath });
  const [rawText, markup] = await Promise.all([rawTextJob, htmlJob]);

  return { text: rawText.value, html: markup.value };
}
175
+
/**
 * Read document metadata from the `docProps/core.xml` part of a Word file.
 *
 * Only fields present in the XML appear on the result. Values have XML
 * entities decoded, so a title stored as `A &amp; B` is returned as `A & B`.
 *
 * @param {string} docxPath - Path to the .docx file
 * @returns {Promise<{title?: string, author?: string, created?: string, modified?: string}>}
 *   `{}` when the archive has no `docProps/core.xml`.
 * @throws {Error} If `docxPath` does not exist
 */
export async function getWordMetadata(docxPath) {
  if (!fs.existsSync(docxPath)) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const zip = new AdmZip(docxPath);
  const coreEntry = zip.getEntry('docProps/core.xml');
  if (!coreEntry) {
    return {};
  }

  const coreXml = zip.readAsText(coreEntry);

  // Decode the five predefined XML entities and numeric character references
  // in one pass, so that "&amp;lt;" becomes "&lt;" and not "<".
  const namedEntities = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'" };
  const decodeEntities = (value) =>
    value.replace(/&(#x[0-9a-fA-F]+|#\d+|amp|lt|gt|quot|apos);/g, (whole, body) => {
      if (body[0] !== '#') {
        return namedEntities[body];
      }
      const codePoint =
        body[1] === 'x'
          ? Number.parseInt(body.slice(2), 16)
          : Number.parseInt(body.slice(1), 10);
      // Leave out-of-range references untouched instead of throwing.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
    });

  // Each pattern captures the element's text content. Attributes on the
  // opening tag are tolerated for all four fields.
  const fieldPatterns = {
    title: /<dc:title(?:\s[^>]*)?>([^<]*)<\/dc:title>/,
    author: /<dc:creator(?:\s[^>]*)?>([^<]*)<\/dc:creator>/,
    created: /<dcterms:created[^>]*>([^<]*)<\/dcterms:created>/,
    modified: /<dcterms:modified[^>]*>([^<]*)<\/dcterms:modified>/,
  };

  const metadata = {};
  for (const [key, pattern] of Object.entries(fieldPatterns)) {
    const match = coreXml.match(pattern);
    if (match) {
      metadata[key] = decodeEntities(match[1]);
    }
  }

  return metadata;
}
209
+
/**
 * Report whether a path points at a usable Word document.
 *
 * The path must exist, end in `.docx` (case-insensitive), and open as a zip
 * archive that contains `word/document.xml`. Any error raised while opening
 * the archive is treated as "not a Word document".
 *
 * @param {string} filePath - Path to test
 * @returns {boolean} `true` only when all of the checks above pass
 */
export function isWordDocument(filePath) {
  const looksLikeDocx =
    fs.existsSync(filePath) && filePath.toLowerCase().endsWith('.docx');
  if (!looksLikeDocx) {
    return false;
  }

  try {
    const archive = new AdmZip(filePath);
    const mainPart = archive.getEntry('word/document.xml');
    return mainPart !== null;
  } catch {
    // Best-effort probe: an unreadable archive simply is not a document.
    return false;
  }
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "docrev",
3
- "version": "0.2.0",
3
+ "version": "0.3.0",
4
4
  "description": "Academic paper revision workflow: Word ↔ Markdown round-trips, DOI validation, reviewer comments",
5
5
  "type": "module",
6
6
  "bin": {
@@ -8,16 +8,17 @@
8
8
  },
9
9
  "scripts": {
10
10
  "build": "echo 'No build needed'",
11
- "test": "node bin/rev.js --help"
11
+ "test": "node --test test/*.test.js",
12
+ "test:watch": "node --test --watch test/*.test.js"
12
13
  },
13
14
  "repository": {
14
15
  "type": "git",
15
- "url": "git+https://github.com/gcol33/rev.git"
16
+ "url": "git+https://github.com/gcol33/docrev.git"
16
17
  },
17
18
  "bugs": {
18
- "url": "https://github.com/gcol33/rev/issues"
19
+ "url": "https://github.com/gcol33/docrev/issues"
19
20
  },
20
- "homepage": "https://github.com/gcol33/rev#readme",
21
+ "homepage": "https://github.com/gcol33/docrev#readme",
21
22
  "keywords": [
22
23
  "markdown",
23
24
  "word",
@@ -38,6 +39,7 @@
38
39
  "diff": "^8.0.2",
39
40
  "js-yaml": "^4.1.1",
40
41
  "mammoth": "^1.6.0",
42
+ "mathml-to-latex": "^1.5.0",
41
43
  "xml2js": "^0.6.2"
42
44
  }
43
45
  }