docrev 0.8.1 → 0.8.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (306) hide show
  1. package/.claude/settings.local.json +9 -0
  2. package/PLAN-tables-and-postprocess.md +850 -0
  3. package/README.md +33 -0
  4. package/bin/rev.js +12 -131
  5. package/bin/rev.ts +145 -0
  6. package/dist/bin/rev.d.ts +9 -0
  7. package/dist/bin/rev.d.ts.map +1 -0
  8. package/dist/bin/rev.js +118 -0
  9. package/dist/bin/rev.js.map +1 -0
  10. package/dist/lib/annotations.d.ts +91 -0
  11. package/dist/lib/annotations.d.ts.map +1 -0
  12. package/dist/lib/annotations.js +554 -0
  13. package/dist/lib/annotations.js.map +1 -0
  14. package/dist/lib/build.d.ts +171 -0
  15. package/dist/lib/build.d.ts.map +1 -0
  16. package/dist/lib/build.js +755 -0
  17. package/dist/lib/build.js.map +1 -0
  18. package/dist/lib/citations.d.ts +34 -0
  19. package/dist/lib/citations.d.ts.map +1 -0
  20. package/dist/lib/citations.js +140 -0
  21. package/dist/lib/citations.js.map +1 -0
  22. package/dist/lib/commands/build.d.ts +13 -0
  23. package/dist/lib/commands/build.d.ts.map +1 -0
  24. package/dist/lib/commands/build.js +678 -0
  25. package/dist/lib/commands/build.js.map +1 -0
  26. package/dist/lib/commands/citations.d.ts +11 -0
  27. package/dist/lib/commands/citations.d.ts.map +1 -0
  28. package/dist/lib/commands/citations.js +428 -0
  29. package/dist/lib/commands/citations.js.map +1 -0
  30. package/dist/lib/commands/comments.d.ts +11 -0
  31. package/dist/lib/commands/comments.d.ts.map +1 -0
  32. package/dist/lib/commands/comments.js +883 -0
  33. package/dist/lib/commands/comments.js.map +1 -0
  34. package/dist/lib/commands/context.d.ts +35 -0
  35. package/dist/lib/commands/context.d.ts.map +1 -0
  36. package/dist/lib/commands/context.js +59 -0
  37. package/dist/lib/commands/context.js.map +1 -0
  38. package/dist/lib/commands/core.d.ts +11 -0
  39. package/dist/lib/commands/core.d.ts.map +1 -0
  40. package/dist/lib/commands/core.js +246 -0
  41. package/dist/lib/commands/core.js.map +1 -0
  42. package/dist/lib/commands/doi.d.ts +11 -0
  43. package/dist/lib/commands/doi.d.ts.map +1 -0
  44. package/dist/lib/commands/doi.js +373 -0
  45. package/dist/lib/commands/doi.js.map +1 -0
  46. package/dist/lib/commands/history.d.ts +11 -0
  47. package/dist/lib/commands/history.d.ts.map +1 -0
  48. package/dist/lib/commands/history.js +245 -0
  49. package/dist/lib/commands/history.js.map +1 -0
  50. package/dist/lib/commands/index.d.ts +28 -0
  51. package/dist/lib/commands/index.d.ts.map +1 -0
  52. package/dist/lib/commands/index.js +35 -0
  53. package/dist/lib/commands/index.js.map +1 -0
  54. package/dist/lib/commands/init.d.ts +11 -0
  55. package/dist/lib/commands/init.d.ts.map +1 -0
  56. package/dist/lib/commands/init.js +209 -0
  57. package/dist/lib/commands/init.js.map +1 -0
  58. package/dist/lib/commands/response.d.ts +11 -0
  59. package/dist/lib/commands/response.d.ts.map +1 -0
  60. package/dist/lib/commands/response.js +317 -0
  61. package/dist/lib/commands/response.js.map +1 -0
  62. package/dist/lib/commands/sections.d.ts +11 -0
  63. package/dist/lib/commands/sections.d.ts.map +1 -0
  64. package/dist/lib/commands/sections.js +1071 -0
  65. package/dist/lib/commands/sections.js.map +1 -0
  66. package/dist/lib/commands/utilities.d.ts +19 -0
  67. package/dist/lib/commands/utilities.d.ts.map +1 -0
  68. package/dist/lib/commands/utilities.js +2009 -0
  69. package/dist/lib/commands/utilities.js.map +1 -0
  70. package/dist/lib/comment-realign.d.ts +50 -0
  71. package/dist/lib/comment-realign.d.ts.map +1 -0
  72. package/dist/lib/comment-realign.js +372 -0
  73. package/dist/lib/comment-realign.js.map +1 -0
  74. package/dist/lib/config.d.ts +41 -0
  75. package/dist/lib/config.d.ts.map +1 -0
  76. package/dist/lib/config.js +76 -0
  77. package/dist/lib/config.js.map +1 -0
  78. package/dist/lib/crossref.d.ts +108 -0
  79. package/dist/lib/crossref.d.ts.map +1 -0
  80. package/dist/lib/crossref.js +597 -0
  81. package/dist/lib/crossref.js.map +1 -0
  82. package/dist/lib/dependencies.d.ts +30 -0
  83. package/dist/lib/dependencies.d.ts.map +1 -0
  84. package/dist/lib/dependencies.js +95 -0
  85. package/dist/lib/dependencies.js.map +1 -0
  86. package/dist/lib/doi-cache.d.ts +29 -0
  87. package/dist/lib/doi-cache.d.ts.map +1 -0
  88. package/dist/lib/doi-cache.js +104 -0
  89. package/dist/lib/doi-cache.js.map +1 -0
  90. package/dist/lib/doi.d.ts +65 -0
  91. package/dist/lib/doi.d.ts.map +1 -0
  92. package/dist/lib/doi.js +710 -0
  93. package/dist/lib/doi.js.map +1 -0
  94. package/dist/lib/equations.d.ts +61 -0
  95. package/dist/lib/equations.d.ts.map +1 -0
  96. package/dist/lib/equations.js +445 -0
  97. package/dist/lib/equations.js.map +1 -0
  98. package/dist/lib/errors.d.ts +60 -0
  99. package/dist/lib/errors.d.ts.map +1 -0
  100. package/dist/lib/errors.js +303 -0
  101. package/dist/lib/errors.js.map +1 -0
  102. package/dist/lib/format.d.ts +104 -0
  103. package/dist/lib/format.d.ts.map +1 -0
  104. package/dist/lib/format.js +416 -0
  105. package/dist/lib/format.js.map +1 -0
  106. package/dist/lib/git.d.ts +88 -0
  107. package/dist/lib/git.d.ts.map +1 -0
  108. package/dist/lib/git.js +304 -0
  109. package/dist/lib/git.js.map +1 -0
  110. package/dist/lib/grammar.d.ts +62 -0
  111. package/dist/lib/grammar.d.ts.map +1 -0
  112. package/dist/lib/grammar.js +244 -0
  113. package/dist/lib/grammar.js.map +1 -0
  114. package/dist/lib/image-registry.d.ts +68 -0
  115. package/dist/lib/image-registry.d.ts.map +1 -0
  116. package/dist/lib/image-registry.js +112 -0
  117. package/dist/lib/image-registry.js.map +1 -0
  118. package/dist/lib/import.d.ts +184 -0
  119. package/dist/lib/import.d.ts.map +1 -0
  120. package/dist/lib/import.js +1581 -0
  121. package/dist/lib/import.js.map +1 -0
  122. package/dist/lib/journals.d.ts +55 -0
  123. package/dist/lib/journals.d.ts.map +1 -0
  124. package/dist/lib/journals.js +417 -0
  125. package/dist/lib/journals.js.map +1 -0
  126. package/dist/lib/merge.d.ts +138 -0
  127. package/dist/lib/merge.d.ts.map +1 -0
  128. package/dist/lib/merge.js +603 -0
  129. package/dist/lib/merge.js.map +1 -0
  130. package/dist/lib/orcid.d.ts +36 -0
  131. package/dist/lib/orcid.d.ts.map +1 -0
  132. package/dist/lib/orcid.js +117 -0
  133. package/dist/lib/orcid.js.map +1 -0
  134. package/dist/lib/pdf-comments.d.ts +95 -0
  135. package/dist/lib/pdf-comments.d.ts.map +1 -0
  136. package/dist/lib/pdf-comments.js +192 -0
  137. package/dist/lib/pdf-comments.js.map +1 -0
  138. package/dist/lib/pdf-import.d.ts +118 -0
  139. package/dist/lib/pdf-import.d.ts.map +1 -0
  140. package/dist/lib/pdf-import.js +397 -0
  141. package/dist/lib/pdf-import.js.map +1 -0
  142. package/dist/lib/plugins.d.ts +76 -0
  143. package/dist/lib/plugins.d.ts.map +1 -0
  144. package/dist/lib/plugins.js +235 -0
  145. package/dist/lib/plugins.js.map +1 -0
  146. package/dist/lib/postprocess.d.ts +42 -0
  147. package/dist/lib/postprocess.d.ts.map +1 -0
  148. package/dist/lib/postprocess.js +138 -0
  149. package/dist/lib/postprocess.js.map +1 -0
  150. package/dist/lib/pptx-template.d.ts +59 -0
  151. package/dist/lib/pptx-template.d.ts.map +1 -0
  152. package/dist/lib/pptx-template.js +613 -0
  153. package/dist/lib/pptx-template.js.map +1 -0
  154. package/dist/lib/pptx-themes.d.ts +80 -0
  155. package/dist/lib/pptx-themes.d.ts.map +1 -0
  156. package/dist/lib/pptx-themes.js +818 -0
  157. package/dist/lib/pptx-themes.js.map +1 -0
  158. package/dist/lib/protect-restore.d.ts +137 -0
  159. package/dist/lib/protect-restore.d.ts.map +1 -0
  160. package/dist/lib/protect-restore.js +394 -0
  161. package/dist/lib/protect-restore.js.map +1 -0
  162. package/dist/lib/rate-limiter.d.ts +27 -0
  163. package/dist/lib/rate-limiter.d.ts.map +1 -0
  164. package/dist/lib/rate-limiter.js +79 -0
  165. package/dist/lib/rate-limiter.js.map +1 -0
  166. package/dist/lib/response.d.ts +41 -0
  167. package/dist/lib/response.d.ts.map +1 -0
  168. package/dist/lib/response.js +150 -0
  169. package/dist/lib/response.js.map +1 -0
  170. package/dist/lib/review.d.ts +35 -0
  171. package/dist/lib/review.d.ts.map +1 -0
  172. package/dist/lib/review.js +263 -0
  173. package/dist/lib/review.js.map +1 -0
  174. package/dist/lib/schema.d.ts +66 -0
  175. package/dist/lib/schema.d.ts.map +1 -0
  176. package/dist/lib/schema.js +339 -0
  177. package/dist/lib/schema.js.map +1 -0
  178. package/dist/lib/scientific-words.d.ts +6 -0
  179. package/dist/lib/scientific-words.d.ts.map +1 -0
  180. package/dist/lib/scientific-words.js +66 -0
  181. package/dist/lib/scientific-words.js.map +1 -0
  182. package/dist/lib/sections.d.ts +40 -0
  183. package/dist/lib/sections.d.ts.map +1 -0
  184. package/dist/lib/sections.js +288 -0
  185. package/dist/lib/sections.js.map +1 -0
  186. package/dist/lib/slides.d.ts +86 -0
  187. package/dist/lib/slides.d.ts.map +1 -0
  188. package/dist/lib/slides.js +676 -0
  189. package/dist/lib/slides.js.map +1 -0
  190. package/dist/lib/spelling.d.ts +76 -0
  191. package/dist/lib/spelling.d.ts.map +1 -0
  192. package/dist/lib/spelling.js +272 -0
  193. package/dist/lib/spelling.js.map +1 -0
  194. package/dist/lib/templates.d.ts +30 -0
  195. package/dist/lib/templates.d.ts.map +1 -0
  196. package/dist/lib/templates.js +504 -0
  197. package/dist/lib/templates.js.map +1 -0
  198. package/dist/lib/themes.d.ts +85 -0
  199. package/dist/lib/themes.d.ts.map +1 -0
  200. package/dist/lib/themes.js +652 -0
  201. package/dist/lib/themes.js.map +1 -0
  202. package/dist/lib/trackchanges.d.ts +51 -0
  203. package/dist/lib/trackchanges.d.ts.map +1 -0
  204. package/dist/lib/trackchanges.js +202 -0
  205. package/dist/lib/trackchanges.js.map +1 -0
  206. package/dist/lib/tui.d.ts +76 -0
  207. package/dist/lib/tui.d.ts.map +1 -0
  208. package/dist/lib/tui.js +377 -0
  209. package/dist/lib/tui.js.map +1 -0
  210. package/dist/lib/types.d.ts +447 -0
  211. package/dist/lib/types.d.ts.map +1 -0
  212. package/dist/lib/types.js +6 -0
  213. package/dist/lib/types.js.map +1 -0
  214. package/dist/lib/undo.d.ts +57 -0
  215. package/dist/lib/undo.d.ts.map +1 -0
  216. package/dist/lib/undo.js +185 -0
  217. package/dist/lib/undo.js.map +1 -0
  218. package/dist/lib/utils.d.ts +16 -0
  219. package/dist/lib/utils.d.ts.map +1 -0
  220. package/dist/lib/utils.js +40 -0
  221. package/dist/lib/utils.js.map +1 -0
  222. package/dist/lib/variables.d.ts +42 -0
  223. package/dist/lib/variables.d.ts.map +1 -0
  224. package/dist/lib/variables.js +141 -0
  225. package/dist/lib/variables.js.map +1 -0
  226. package/dist/lib/word.d.ts +80 -0
  227. package/dist/lib/word.d.ts.map +1 -0
  228. package/dist/lib/word.js +360 -0
  229. package/dist/lib/word.js.map +1 -0
  230. package/dist/lib/wordcomments.d.ts +51 -0
  231. package/dist/lib/wordcomments.d.ts.map +1 -0
  232. package/dist/lib/wordcomments.js +587 -0
  233. package/dist/lib/wordcomments.js.map +1 -0
  234. package/eslint.config.js +27 -0
  235. package/lib/annotations.ts +622 -0
  236. package/lib/apply-buildup-colors.py +88 -0
  237. package/lib/build.ts +1013 -0
  238. package/lib/{citations.js → citations.ts} +38 -27
  239. package/lib/commands/{build.js → build.ts} +80 -27
  240. package/lib/commands/{citations.js → citations.ts} +36 -18
  241. package/lib/commands/{comments.js → comments.ts} +187 -54
  242. package/lib/commands/{context.js → context.ts} +18 -8
  243. package/lib/commands/{core.js → core.ts} +34 -20
  244. package/lib/commands/{doi.js → doi.ts} +32 -16
  245. package/lib/commands/{history.js → history.ts} +25 -12
  246. package/lib/commands/{index.js → index.ts} +9 -5
  247. package/lib/commands/{init.js → init.ts} +20 -8
  248. package/lib/commands/{response.js → response.ts} +47 -20
  249. package/lib/commands/{sections.js → sections.ts} +273 -68
  250. package/lib/commands/{utilities.js → utilities.ts} +338 -158
  251. package/lib/{comment-realign.js → comment-realign.ts} +117 -45
  252. package/lib/config.ts +84 -0
  253. package/lib/{crossref.js → crossref.ts} +213 -138
  254. package/lib/dependencies.ts +106 -0
  255. package/lib/doi-cache.ts +115 -0
  256. package/lib/{doi.js → doi.ts} +115 -281
  257. package/lib/{equations.js → equations.ts} +60 -64
  258. package/lib/{errors.js → errors.ts} +56 -48
  259. package/lib/{format.js → format.ts} +137 -63
  260. package/lib/{git.js → git.ts} +66 -63
  261. package/lib/{grammar.js → grammar.ts} +45 -32
  262. package/lib/image-registry.ts +180 -0
  263. package/lib/import.ts +2060 -0
  264. package/lib/journals.ts +505 -0
  265. package/lib/{merge.js → merge.ts} +185 -135
  266. package/lib/{orcid.js → orcid.ts} +17 -22
  267. package/lib/{pdf-comments.js → pdf-comments.ts} +76 -18
  268. package/lib/{pdf-import.js → pdf-import.ts} +148 -70
  269. package/lib/{plugins.js → plugins.ts} +82 -39
  270. package/lib/postprocess.ts +188 -0
  271. package/lib/pptx-color-filter.lua +37 -0
  272. package/lib/pptx-template.ts +625 -0
  273. package/lib/pptx-themes/academic.pptx +0 -0
  274. package/lib/pptx-themes/corporate.pptx +0 -0
  275. package/lib/pptx-themes/dark.pptx +0 -0
  276. package/lib/pptx-themes/default.pptx +0 -0
  277. package/lib/pptx-themes/minimal.pptx +0 -0
  278. package/lib/pptx-themes/plant.pptx +0 -0
  279. package/lib/pptx-themes.ts +896 -0
  280. package/lib/protect-restore.ts +516 -0
  281. package/lib/rate-limiter.ts +94 -0
  282. package/lib/{response.js → response.ts} +36 -21
  283. package/lib/{review.js → review.ts} +53 -43
  284. package/lib/{schema.js → schema.ts} +70 -25
  285. package/lib/{sections.js → sections.ts} +71 -76
  286. package/lib/slides.ts +793 -0
  287. package/lib/{spelling.js → spelling.ts} +43 -59
  288. package/lib/{templates.js → templates.ts} +20 -17
  289. package/lib/themes.ts +742 -0
  290. package/lib/{trackchanges.js → trackchanges.ts} +52 -23
  291. package/lib/types.ts +509 -0
  292. package/lib/{undo.js → undo.ts} +75 -52
  293. package/lib/utils.ts +41 -0
  294. package/lib/{variables.js → variables.ts} +60 -54
  295. package/lib/word.ts +428 -0
  296. package/lib/{wordcomments.js → wordcomments.ts} +94 -40
  297. package/package.json +15 -5
  298. package/skill/REFERENCE.md +67 -0
  299. package/tsconfig.json +26 -0
  300. package/lib/annotations.js +0 -414
  301. package/lib/build.js +0 -639
  302. package/lib/config.js +0 -79
  303. package/lib/import.js +0 -1145
  304. package/lib/journals.js +0 -629
  305. package/lib/word.js +0 -225
  306. /package/lib/{scientific-words.js → scientific-words.ts} +0 -0
package/lib/word.ts ADDED
@@ -0,0 +1,428 @@
1
+ /**
2
+ * Word document extraction utilities
3
+ * Handle reading text, comments, and anchors from .docx files
4
+ */
5
+
6
+ import * as fs from 'fs';
7
+ import * as path from 'path';
8
+ import AdmZip from 'adm-zip';
9
+ import { parseString } from 'xml2js';
10
+ import { promisify } from 'util';
11
+ import type { WordComment, CommentAnchor, WordContent, WordMetadata, TrackChangesResult } from './types.js';
12
+
13
/**
 * Promise-returning form of xml2js's callback-style `parseString`,
 * so the XML parts of a .docx can be parsed with `await`.
 */
const parseXml = promisify(parseString);
14
+
15
+ // =============================================================================
16
+ // Constants
17
+ // =============================================================================
18
+
19
/** Maximum number of trailing context characters kept for each comment anchor */
const ANCHOR_CONTEXT_SIZE = 100;

/** Number of raw document.xml characters scanned before a comment range start when gathering context */
const CONTEXT_BEFORE_SIZE = 500;
24
+
25
+ // =============================================================================
26
+ // Public API
27
+ // =============================================================================
28
+
29
/**
 * Read the reviewer comments stored in a .docx file's `word/comments.xml`.
 *
 * A comment's text is the concatenation of every `w:t` found under its
 * `w:p` > `w:r` children; no separator is inserted between paragraphs.
 * Comments without a `w:id`, or whose text is blank after trimming, are
 * left out of the result. A missing `w:author` is reported as 'Unknown'.
 *
 * @param docxPath - Path to .docx file
 * @returns Comments in the order they appear in comments.xml; an empty
 *          array when the archive has no comments part
 * @throws {TypeError} If docxPath is not a string
 * @throws {Error} If the file does not exist
 */
export async function extractWordComments(docxPath: string): Promise<WordComment[]> {
  if (typeof docxPath !== 'string') {
    throw new TypeError(`docxPath must be a string, got ${typeof docxPath}`);
  }
  if (!fs.existsSync(docxPath)) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const archive = new AdmZip(docxPath);
  const entry = archive.getEntry('word/comments.xml');
  if (!entry) {
    // Documents without comments simply lack this part.
    return [];
  }

  const tree = (await parseXml(archive.readAsText(entry))) as any;
  const nodes = tree?.['w:comments']?.['w:comment'];
  if (!nodes) {
    return [];
  }

  const collected: WordComment[] = [];
  for (const node of nodes) {
    const attrs = node.$;
    const id = attrs?.['w:id'];
    const author = attrs?.['w:author'] || 'Unknown';
    const date = attrs?.['w:date'];

    // Walk paragraph -> run -> text; a text node is either a bare string
    // or an object carrying its character data in `_`.
    const pieces: string[] = [];
    for (const para of node['w:p'] || []) {
      for (const run of para['w:r'] || []) {
        for (const t of run['w:t'] || []) {
          pieces.push(typeof t === 'string' ? t : (t._ || ''));
        }
      }
    }

    const body = pieces.join('').trim();
    if (!id || !body) {
      continue;
    }
    collected.push({ id, author, date, text: body });
  }

  return collected;
}
91
+
92
/**
 * Locate the text each comment is attached to inside `word/document.xml`.
 *
 * For every `w:commentRangeStart` that has a matching `w:commentRangeEnd`,
 * the `w:t` text between the two markers becomes the anchor text. The
 * context is the `w:t` text found in the CONTEXT_BEFORE_SIZE raw XML
 * characters preceding the start marker, cut down to its last
 * ANCHOR_CONTEXT_SIZE characters.
 *
 * Text is taken verbatim from the XML, so escapes such as `&amp;` are
 * returned undecoded.
 *
 * @param docxPath - Path to .docx file
 * @returns Map of comment ID to anchor info
 * @throws {TypeError} If docxPath is not a string
 * @throws {Error} If the archive has no document.xml
 */
export async function extractCommentAnchors(docxPath: string): Promise<Map<string, CommentAnchor>> {
  if (typeof docxPath !== 'string') {
    throw new TypeError(`docxPath must be a string, got ${typeof docxPath}`);
  }

  const archive = new AdmZip(docxPath);
  const entry = archive.getEntry('word/document.xml');
  if (!entry) {
    throw new Error('Invalid docx: no document.xml');
  }
  const documentXml = archive.readAsText(entry);

  // Record the offset of each range marker, keyed by comment id.
  const locate = (marker: RegExp): Map<string, number> => {
    const offsets = new Map<string, number>();
    for (let hit = marker.exec(documentXml); hit !== null; hit = marker.exec(documentXml)) {
      if (hit[1]) {
        offsets.set(hit[1], hit.index);
      }
    }
    return offsets;
  };

  // Concatenate the contents of every w:t element in an XML fragment.
  const runText = (fragment: string): string =>
    Array.from(fragment.matchAll(/<w:t[^>]*>([^<]*)<\/w:t>/g), (hit) => hit[1] ?? '').join('');

  const starts = locate(/<w:commentRangeStart w:id="(\d+)"\/>/g);
  const ends = locate(/<w:commentRangeEnd w:id="(\d+)"\/>/g);

  const anchors = new Map<string, CommentAnchor>();
  for (const [id, startPos] of starts) {
    const endPos = ends.get(id);
    if (!endPos) {
      // No closing marker for this comment: nothing to anchor to.
      continue;
    }

    const anchorText = runText(documentXml.slice(startPos, endPos));
    const before = runText(
      documentXml.slice(Math.max(0, startPos - CONTEXT_BEFORE_SIZE), startPos)
    );

    anchors.set(id, {
      text: anchorText.trim(),
      context: before.slice(-ANCHOR_CONTEXT_SIZE),
    });
  }

  return anchors;
}
167
+
168
/**
 * Return the raw text of a Word document as produced by mammoth's
 * `extractRawText`.
 *
 * @param docxPath - Path to .docx file
 * @returns Extracted plain text
 * @throws {TypeError} If docxPath is not a string
 * @throws {Error} If the file does not exist
 */
export async function extractTextFromWord(docxPath: string): Promise<string> {
  if (typeof docxPath !== 'string') {
    throw new TypeError(`docxPath must be a string, got ${typeof docxPath}`);
  }
  if (!fs.existsSync(docxPath)) {
    throw new Error(`File not found: ${docxPath}`);
  }

  // mammoth is loaded on demand rather than at module load time.
  const mammothLib = await import('mammoth');
  const { value } = await mammothLib.extractRawText({ path: docxPath });
  return value;
}
187
+
188
/**
 * Extract a Word document both as raw text and as HTML using mammoth.
 * The two conversions are started together and awaited as a pair.
 *
 * @param docxPath - Path to .docx file
 * @returns Object with the raw `text` and the converted `html`
 * @throws {TypeError} If docxPath is not a string
 * @throws {Error} If the file does not exist
 */
export async function extractFromWord(docxPath: string): Promise<WordContent> {
  if (typeof docxPath !== 'string') {
    throw new TypeError(`docxPath must be a string, got ${typeof docxPath}`);
  }
  if (!fs.existsSync(docxPath)) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const mammothLib = await import('mammoth');
  const source = { path: docxPath };

  const rawTextJob = mammothLib.extractRawText(source);
  const htmlJob = mammothLib.convertToHtml(source);
  const [rawText, html] = await Promise.all([rawTextJob, htmlJob]);

  return { text: rawText.value, html: html.value };
}
215
+
216
/**
 * Read title, author and created/modified timestamps from a .docx file's
 * `docProps/core.xml`.
 *
 * Each field is picked out with a regular expression; only fields whose
 * element is present are set on the result. Values are returned exactly as
 * they appear in the XML.
 *
 * @param docxPath - Path to .docx file
 * @returns Document metadata; an empty object when core.xml is absent
 * @throws {TypeError} If docxPath is not a string
 */
export async function getWordMetadata(docxPath: string): Promise<WordMetadata> {
  if (typeof docxPath !== 'string') {
    throw new TypeError(`docxPath must be a string, got ${typeof docxPath}`);
  }

  const archive = new AdmZip(docxPath);
  const entry = archive.getEntry('docProps/core.xml');
  if (!entry) {
    return {};
  }
  const coreXml = archive.readAsText(entry);

  // Metadata field name paired with the pattern that captures its value.
  const fieldPatterns: Array<[string, RegExp]> = [
    ['title', /<dc:title>([^<]*)<\/dc:title>/],
    ['author', /<dc:creator>([^<]*)<\/dc:creator>/],
    ['created', /<dcterms:created[^>]*>([^<]*)<\/dcterms:created>/],
    ['modified', /<dcterms:modified[^>]*>([^<]*)<\/dcterms:modified>/],
  ];

  const metadata: WordMetadata = {};
  for (const [field, pattern] of fieldPatterns) {
    const hit = pattern.exec(coreXml);
    if (hit) {
      Object.assign(metadata, { [field]: hit[1] });
    }
  }
  return metadata;
}
254
+
255
/**
 * Tell whether a path points at a usable .docx file.
 *
 * The path must be a string, exist on disk, end in `.docx`
 * (case-insensitive) and open as a zip archive containing
 * `word/document.xml`. Any failure while opening the archive counts as
 * "not a Word document" rather than being thrown.
 *
 * @param filePath - Path to file to check
 * @returns True if valid .docx file
 */
export function isWordDocument(filePath: string): boolean {
  const looksLikeDocx =
    typeof filePath === 'string' &&
    fs.existsSync(filePath) &&
    filePath.toLowerCase().endsWith('.docx');
  if (!looksLikeDocx) {
    return false;
  }

  try {
    const mainPart = new AdmZip(filePath).getEntry('word/document.xml');
    return mainPart !== null;
  } catch {
    return false;
  }
}
272
+
273
/**
 * Pull the character data out of a WordprocessingML fragment.
 *
 * The result is the text of every `w:t` element (regular text) followed by
 * the text of every `w:delText` element (deleted text); the two groups are
 * not interleaved by document position.
 *
 * @param xml - XML string
 * @returns Plain text content
 */
function extractTextFromXml(xml: string): string {
  const gather = (pattern: RegExp): string =>
    Array.from(xml.matchAll(pattern), (hit) => hit[1]).join('');

  const regularText = gather(/<w:t[^>]*>([^<]*)<\/w:t>/g);
  const deletedText = gather(/<w:delText[^>]*>([^<]*)<\/w:delText>/g);
  return regularText + deletedText;
}
293
+
294
/**
 * Extract track changes (insertions and deletions) from Word document
 * Converts Word's w:ins and w:del elements to CriticMarkup format
 *
 * The returned `content` is the document.xml markup with every paired
 * `<w:ins ...>...</w:ins>` replaced by `{++text++}` and every paired
 * `<w:del ...>...</w:del>` replaced by `{--text--}`. Whitespace-only
 * insertions keep their text without markup; whitespace-only deletions are
 * dropped. Self-closing `<w:ins .../>` / `<w:del .../>` markers carry no
 * text and are left untouched.
 *
 * @param docxPath - Path to Word document
 * @returns Track changes result with content and stats
 * @throws {Error} If the file does not exist or the archive has no document.xml
 */
export async function extractTrackChanges(docxPath: string): Promise<TrackChangesResult> {
  if (!fs.existsSync(docxPath)) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const zip = new AdmZip(docxPath);
  const documentEntry = zip.getEntry('word/document.xml');

  if (!documentEntry) {
    throw new Error('Invalid docx: no document.xml');
  }

  let xml = zip.readAsText(documentEntry);
  let insertions = 0;
  let deletions = 0;

  // Cheap pre-check so untouched documents skip the regex passes entirely.
  const hasInsertions = xml.includes('<w:ins ');
  const hasDeletions = xml.includes('<w:del ');

  if (!hasInsertions && !hasDeletions) {
    return { hasTrackChanges: false, content: null, stats: { insertions: 0, deletions: 0 } };
  }

  // The `(?<!\/)` lookbehind rejects self-closing tags. Without it a marker
  // such as `<w:ins .../>` would be taken as an opening tag and the lazy body
  // would run on to the next unrelated `</w:ins>`, wrapping ordinary text as
  // an insertion and inflating the counts.
  const insertionPattern = /<w:ins\b[^>]*(?<!\/)>([\s\S]*?)<\/w:ins>/g;
  const deletionPattern = /<w:del\b[^>]*(?<!\/)>([\s\S]*?)<\/w:del>/g;

  // Process insertions: <w:ins ...>...</w:ins> -> {++...++}
  xml = xml.replace(insertionPattern, (_match: string, content: string) => {
    const text = extractTextFromXml(content);
    if (text.trim()) {
      insertions++;
      return `{++${text}++}`;
    }
    // Whitespace-only insertion: keep the whitespace, but do not count it.
    return text;
  });

  // Process deletions: <w:del ...>...</w:del> -> {--...--}
  xml = xml.replace(deletionPattern, (_match: string, content: string) => {
    const text = extractTextFromXml(content);
    if (text.trim()) {
      deletions++;
      return `{--${text}--}`;
    }
    return '';
  });

  return {
    hasTrackChanges: true,
    content: xml,
    stats: { insertions, deletions },
  };
}
352
+
353
/** Options accepted by extractWithTrackChanges. */
interface ExtractWithTrackChangesOptions {
  /** When set, passed to pandoc as `--extract-media=<mediaDir>`. */
  mediaDir?: string;
}

/**
 * Extract Word document content with track changes preserved as CriticMarkup
 * Uses pandoc with track-changes=all option to preserve insertions/deletions
 *
 * pandoc is invoked directly with an argument array (no shell), so paths
 * containing spaces, quotes, `$` or backticks are passed through verbatim
 * instead of being interpreted by a shell.
 *
 * @param docxPath - Path to Word document
 * @param options - Options
 * @returns Track changes result with text and stats
 * @throws {Error} If the file does not exist or pandoc fails
 */
export async function extractWithTrackChanges(
  docxPath: string,
  options: ExtractWithTrackChangesOptions = {}
): Promise<{ text: string; hasTrackChanges: boolean; stats: { insertions: number; deletions: number } }> {
  const { mediaDir } = options;

  if (!fs.existsSync(docxPath)) {
    throw new Error(`File not found: ${docxPath}`);
  }

  const { execFileSync } = await import('child_process');

  // Use pandoc with --track-changes=all to preserve track changes
  // This outputs insertions as [insertion]{.insertion} and deletions as [deletion]{.deletion}
  const pandocArgs = [docxPath, '-t', 'markdown', '--wrap=none', '--track-changes=all'];
  if (mediaDir) {
    pandocArgs.push(`--extract-media=${mediaDir}`);
  }

  let text: string;
  try {
    text = execFileSync('pandoc', pandocArgs, {
      encoding: 'utf-8',
      maxBuffer: 50 * 1024 * 1024,
    });
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`Pandoc extraction failed: ${reason}`);
  }

  // Count track changes from pandoc output
  let insertions = 0;
  let deletions = 0;

  // Pandoc outputs track changes as:
  //   [inserted text]{.insertion author="..."}
  //   [deleted text]{.deletion author="..."}

  // Insertions: [text]{.insertion ...} -> {++text++}; blank spans are dropped.
  text = text.replace(/\[([^\]]*)\]\{\.insertion[^}]*\}/g, (_match: string, content: string) => {
    if (content.trim()) {
      insertions++;
      return `{++${content}++}`;
    }
    return '';
  });

  // Deletions: [text]{.deletion ...} -> {--text--}; blank spans are dropped.
  text = text.replace(/\[([^\]]*)\]\{\.deletion[^}]*\}/g, (_match: string, content: string) => {
    if (content.trim()) {
      deletions++;
      return `{--${content}--}`;
    }
    return '';
  });

  const hasTrackChanges = insertions > 0 || deletions > 0;

  return {
    text,
    hasTrackChanges,
    stats: { insertions, deletions },
  };
}