docrev 0.8.1 → 0.8.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (306) hide show
  1. package/.claude/settings.local.json +9 -0
  2. package/PLAN-tables-and-postprocess.md +850 -0
  3. package/README.md +33 -0
  4. package/bin/rev.js +12 -131
  5. package/bin/rev.ts +145 -0
  6. package/dist/bin/rev.d.ts +9 -0
  7. package/dist/bin/rev.d.ts.map +1 -0
  8. package/dist/bin/rev.js +118 -0
  9. package/dist/bin/rev.js.map +1 -0
  10. package/dist/lib/annotations.d.ts +91 -0
  11. package/dist/lib/annotations.d.ts.map +1 -0
  12. package/dist/lib/annotations.js +554 -0
  13. package/dist/lib/annotations.js.map +1 -0
  14. package/dist/lib/build.d.ts +171 -0
  15. package/dist/lib/build.d.ts.map +1 -0
  16. package/dist/lib/build.js +755 -0
  17. package/dist/lib/build.js.map +1 -0
  18. package/dist/lib/citations.d.ts +34 -0
  19. package/dist/lib/citations.d.ts.map +1 -0
  20. package/dist/lib/citations.js +140 -0
  21. package/dist/lib/citations.js.map +1 -0
  22. package/dist/lib/commands/build.d.ts +13 -0
  23. package/dist/lib/commands/build.d.ts.map +1 -0
  24. package/dist/lib/commands/build.js +678 -0
  25. package/dist/lib/commands/build.js.map +1 -0
  26. package/dist/lib/commands/citations.d.ts +11 -0
  27. package/dist/lib/commands/citations.d.ts.map +1 -0
  28. package/dist/lib/commands/citations.js +428 -0
  29. package/dist/lib/commands/citations.js.map +1 -0
  30. package/dist/lib/commands/comments.d.ts +11 -0
  31. package/dist/lib/commands/comments.d.ts.map +1 -0
  32. package/dist/lib/commands/comments.js +883 -0
  33. package/dist/lib/commands/comments.js.map +1 -0
  34. package/dist/lib/commands/context.d.ts +35 -0
  35. package/dist/lib/commands/context.d.ts.map +1 -0
  36. package/dist/lib/commands/context.js +59 -0
  37. package/dist/lib/commands/context.js.map +1 -0
  38. package/dist/lib/commands/core.d.ts +11 -0
  39. package/dist/lib/commands/core.d.ts.map +1 -0
  40. package/dist/lib/commands/core.js +246 -0
  41. package/dist/lib/commands/core.js.map +1 -0
  42. package/dist/lib/commands/doi.d.ts +11 -0
  43. package/dist/lib/commands/doi.d.ts.map +1 -0
  44. package/dist/lib/commands/doi.js +373 -0
  45. package/dist/lib/commands/doi.js.map +1 -0
  46. package/dist/lib/commands/history.d.ts +11 -0
  47. package/dist/lib/commands/history.d.ts.map +1 -0
  48. package/dist/lib/commands/history.js +245 -0
  49. package/dist/lib/commands/history.js.map +1 -0
  50. package/dist/lib/commands/index.d.ts +28 -0
  51. package/dist/lib/commands/index.d.ts.map +1 -0
  52. package/dist/lib/commands/index.js +35 -0
  53. package/dist/lib/commands/index.js.map +1 -0
  54. package/dist/lib/commands/init.d.ts +11 -0
  55. package/dist/lib/commands/init.d.ts.map +1 -0
  56. package/dist/lib/commands/init.js +209 -0
  57. package/dist/lib/commands/init.js.map +1 -0
  58. package/dist/lib/commands/response.d.ts +11 -0
  59. package/dist/lib/commands/response.d.ts.map +1 -0
  60. package/dist/lib/commands/response.js +317 -0
  61. package/dist/lib/commands/response.js.map +1 -0
  62. package/dist/lib/commands/sections.d.ts +11 -0
  63. package/dist/lib/commands/sections.d.ts.map +1 -0
  64. package/dist/lib/commands/sections.js +1071 -0
  65. package/dist/lib/commands/sections.js.map +1 -0
  66. package/dist/lib/commands/utilities.d.ts +19 -0
  67. package/dist/lib/commands/utilities.d.ts.map +1 -0
  68. package/dist/lib/commands/utilities.js +2009 -0
  69. package/dist/lib/commands/utilities.js.map +1 -0
  70. package/dist/lib/comment-realign.d.ts +50 -0
  71. package/dist/lib/comment-realign.d.ts.map +1 -0
  72. package/dist/lib/comment-realign.js +372 -0
  73. package/dist/lib/comment-realign.js.map +1 -0
  74. package/dist/lib/config.d.ts +41 -0
  75. package/dist/lib/config.d.ts.map +1 -0
  76. package/dist/lib/config.js +76 -0
  77. package/dist/lib/config.js.map +1 -0
  78. package/dist/lib/crossref.d.ts +108 -0
  79. package/dist/lib/crossref.d.ts.map +1 -0
  80. package/dist/lib/crossref.js +597 -0
  81. package/dist/lib/crossref.js.map +1 -0
  82. package/dist/lib/dependencies.d.ts +30 -0
  83. package/dist/lib/dependencies.d.ts.map +1 -0
  84. package/dist/lib/dependencies.js +95 -0
  85. package/dist/lib/dependencies.js.map +1 -0
  86. package/dist/lib/doi-cache.d.ts +29 -0
  87. package/dist/lib/doi-cache.d.ts.map +1 -0
  88. package/dist/lib/doi-cache.js +104 -0
  89. package/dist/lib/doi-cache.js.map +1 -0
  90. package/dist/lib/doi.d.ts +65 -0
  91. package/dist/lib/doi.d.ts.map +1 -0
  92. package/dist/lib/doi.js +710 -0
  93. package/dist/lib/doi.js.map +1 -0
  94. package/dist/lib/equations.d.ts +61 -0
  95. package/dist/lib/equations.d.ts.map +1 -0
  96. package/dist/lib/equations.js +445 -0
  97. package/dist/lib/equations.js.map +1 -0
  98. package/dist/lib/errors.d.ts +60 -0
  99. package/dist/lib/errors.d.ts.map +1 -0
  100. package/dist/lib/errors.js +303 -0
  101. package/dist/lib/errors.js.map +1 -0
  102. package/dist/lib/format.d.ts +104 -0
  103. package/dist/lib/format.d.ts.map +1 -0
  104. package/dist/lib/format.js +416 -0
  105. package/dist/lib/format.js.map +1 -0
  106. package/dist/lib/git.d.ts +88 -0
  107. package/dist/lib/git.d.ts.map +1 -0
  108. package/dist/lib/git.js +304 -0
  109. package/dist/lib/git.js.map +1 -0
  110. package/dist/lib/grammar.d.ts +62 -0
  111. package/dist/lib/grammar.d.ts.map +1 -0
  112. package/dist/lib/grammar.js +244 -0
  113. package/dist/lib/grammar.js.map +1 -0
  114. package/dist/lib/image-registry.d.ts +68 -0
  115. package/dist/lib/image-registry.d.ts.map +1 -0
  116. package/dist/lib/image-registry.js +112 -0
  117. package/dist/lib/image-registry.js.map +1 -0
  118. package/dist/lib/import.d.ts +184 -0
  119. package/dist/lib/import.d.ts.map +1 -0
  120. package/dist/lib/import.js +1581 -0
  121. package/dist/lib/import.js.map +1 -0
  122. package/dist/lib/journals.d.ts +55 -0
  123. package/dist/lib/journals.d.ts.map +1 -0
  124. package/dist/lib/journals.js +417 -0
  125. package/dist/lib/journals.js.map +1 -0
  126. package/dist/lib/merge.d.ts +138 -0
  127. package/dist/lib/merge.d.ts.map +1 -0
  128. package/dist/lib/merge.js +603 -0
  129. package/dist/lib/merge.js.map +1 -0
  130. package/dist/lib/orcid.d.ts +36 -0
  131. package/dist/lib/orcid.d.ts.map +1 -0
  132. package/dist/lib/orcid.js +117 -0
  133. package/dist/lib/orcid.js.map +1 -0
  134. package/dist/lib/pdf-comments.d.ts +95 -0
  135. package/dist/lib/pdf-comments.d.ts.map +1 -0
  136. package/dist/lib/pdf-comments.js +192 -0
  137. package/dist/lib/pdf-comments.js.map +1 -0
  138. package/dist/lib/pdf-import.d.ts +118 -0
  139. package/dist/lib/pdf-import.d.ts.map +1 -0
  140. package/dist/lib/pdf-import.js +397 -0
  141. package/dist/lib/pdf-import.js.map +1 -0
  142. package/dist/lib/plugins.d.ts +76 -0
  143. package/dist/lib/plugins.d.ts.map +1 -0
  144. package/dist/lib/plugins.js +235 -0
  145. package/dist/lib/plugins.js.map +1 -0
  146. package/dist/lib/postprocess.d.ts +42 -0
  147. package/dist/lib/postprocess.d.ts.map +1 -0
  148. package/dist/lib/postprocess.js +138 -0
  149. package/dist/lib/postprocess.js.map +1 -0
  150. package/dist/lib/pptx-template.d.ts +59 -0
  151. package/dist/lib/pptx-template.d.ts.map +1 -0
  152. package/dist/lib/pptx-template.js +613 -0
  153. package/dist/lib/pptx-template.js.map +1 -0
  154. package/dist/lib/pptx-themes.d.ts +80 -0
  155. package/dist/lib/pptx-themes.d.ts.map +1 -0
  156. package/dist/lib/pptx-themes.js +818 -0
  157. package/dist/lib/pptx-themes.js.map +1 -0
  158. package/dist/lib/protect-restore.d.ts +137 -0
  159. package/dist/lib/protect-restore.d.ts.map +1 -0
  160. package/dist/lib/protect-restore.js +394 -0
  161. package/dist/lib/protect-restore.js.map +1 -0
  162. package/dist/lib/rate-limiter.d.ts +27 -0
  163. package/dist/lib/rate-limiter.d.ts.map +1 -0
  164. package/dist/lib/rate-limiter.js +79 -0
  165. package/dist/lib/rate-limiter.js.map +1 -0
  166. package/dist/lib/response.d.ts +41 -0
  167. package/dist/lib/response.d.ts.map +1 -0
  168. package/dist/lib/response.js +150 -0
  169. package/dist/lib/response.js.map +1 -0
  170. package/dist/lib/review.d.ts +35 -0
  171. package/dist/lib/review.d.ts.map +1 -0
  172. package/dist/lib/review.js +263 -0
  173. package/dist/lib/review.js.map +1 -0
  174. package/dist/lib/schema.d.ts +66 -0
  175. package/dist/lib/schema.d.ts.map +1 -0
  176. package/dist/lib/schema.js +339 -0
  177. package/dist/lib/schema.js.map +1 -0
  178. package/dist/lib/scientific-words.d.ts +6 -0
  179. package/dist/lib/scientific-words.d.ts.map +1 -0
  180. package/dist/lib/scientific-words.js +66 -0
  181. package/dist/lib/scientific-words.js.map +1 -0
  182. package/dist/lib/sections.d.ts +40 -0
  183. package/dist/lib/sections.d.ts.map +1 -0
  184. package/dist/lib/sections.js +288 -0
  185. package/dist/lib/sections.js.map +1 -0
  186. package/dist/lib/slides.d.ts +86 -0
  187. package/dist/lib/slides.d.ts.map +1 -0
  188. package/dist/lib/slides.js +676 -0
  189. package/dist/lib/slides.js.map +1 -0
  190. package/dist/lib/spelling.d.ts +76 -0
  191. package/dist/lib/spelling.d.ts.map +1 -0
  192. package/dist/lib/spelling.js +272 -0
  193. package/dist/lib/spelling.js.map +1 -0
  194. package/dist/lib/templates.d.ts +30 -0
  195. package/dist/lib/templates.d.ts.map +1 -0
  196. package/dist/lib/templates.js +504 -0
  197. package/dist/lib/templates.js.map +1 -0
  198. package/dist/lib/themes.d.ts +85 -0
  199. package/dist/lib/themes.d.ts.map +1 -0
  200. package/dist/lib/themes.js +652 -0
  201. package/dist/lib/themes.js.map +1 -0
  202. package/dist/lib/trackchanges.d.ts +51 -0
  203. package/dist/lib/trackchanges.d.ts.map +1 -0
  204. package/dist/lib/trackchanges.js +202 -0
  205. package/dist/lib/trackchanges.js.map +1 -0
  206. package/dist/lib/tui.d.ts +76 -0
  207. package/dist/lib/tui.d.ts.map +1 -0
  208. package/dist/lib/tui.js +377 -0
  209. package/dist/lib/tui.js.map +1 -0
  210. package/dist/lib/types.d.ts +447 -0
  211. package/dist/lib/types.d.ts.map +1 -0
  212. package/dist/lib/types.js +6 -0
  213. package/dist/lib/types.js.map +1 -0
  214. package/dist/lib/undo.d.ts +57 -0
  215. package/dist/lib/undo.d.ts.map +1 -0
  216. package/dist/lib/undo.js +185 -0
  217. package/dist/lib/undo.js.map +1 -0
  218. package/dist/lib/utils.d.ts +16 -0
  219. package/dist/lib/utils.d.ts.map +1 -0
  220. package/dist/lib/utils.js +40 -0
  221. package/dist/lib/utils.js.map +1 -0
  222. package/dist/lib/variables.d.ts +42 -0
  223. package/dist/lib/variables.d.ts.map +1 -0
  224. package/dist/lib/variables.js +141 -0
  225. package/dist/lib/variables.js.map +1 -0
  226. package/dist/lib/word.d.ts +80 -0
  227. package/dist/lib/word.d.ts.map +1 -0
  228. package/dist/lib/word.js +360 -0
  229. package/dist/lib/word.js.map +1 -0
  230. package/dist/lib/wordcomments.d.ts +51 -0
  231. package/dist/lib/wordcomments.d.ts.map +1 -0
  232. package/dist/lib/wordcomments.js +587 -0
  233. package/dist/lib/wordcomments.js.map +1 -0
  234. package/eslint.config.js +27 -0
  235. package/lib/annotations.ts +622 -0
  236. package/lib/apply-buildup-colors.py +88 -0
  237. package/lib/build.ts +1013 -0
  238. package/lib/{citations.js → citations.ts} +38 -27
  239. package/lib/commands/{build.js → build.ts} +80 -27
  240. package/lib/commands/{citations.js → citations.ts} +36 -18
  241. package/lib/commands/{comments.js → comments.ts} +187 -54
  242. package/lib/commands/{context.js → context.ts} +18 -8
  243. package/lib/commands/{core.js → core.ts} +34 -20
  244. package/lib/commands/{doi.js → doi.ts} +32 -16
  245. package/lib/commands/{history.js → history.ts} +25 -12
  246. package/lib/commands/{index.js → index.ts} +9 -5
  247. package/lib/commands/{init.js → init.ts} +20 -8
  248. package/lib/commands/{response.js → response.ts} +47 -20
  249. package/lib/commands/{sections.js → sections.ts} +273 -68
  250. package/lib/commands/{utilities.js → utilities.ts} +338 -158
  251. package/lib/{comment-realign.js → comment-realign.ts} +117 -45
  252. package/lib/config.ts +84 -0
  253. package/lib/{crossref.js → crossref.ts} +213 -138
  254. package/lib/dependencies.ts +106 -0
  255. package/lib/doi-cache.ts +115 -0
  256. package/lib/{doi.js → doi.ts} +115 -281
  257. package/lib/{equations.js → equations.ts} +60 -64
  258. package/lib/{errors.js → errors.ts} +56 -48
  259. package/lib/{format.js → format.ts} +137 -63
  260. package/lib/{git.js → git.ts} +66 -63
  261. package/lib/{grammar.js → grammar.ts} +45 -32
  262. package/lib/image-registry.ts +180 -0
  263. package/lib/import.ts +2060 -0
  264. package/lib/journals.ts +505 -0
  265. package/lib/{merge.js → merge.ts} +185 -135
  266. package/lib/{orcid.js → orcid.ts} +17 -22
  267. package/lib/{pdf-comments.js → pdf-comments.ts} +76 -18
  268. package/lib/{pdf-import.js → pdf-import.ts} +148 -70
  269. package/lib/{plugins.js → plugins.ts} +82 -39
  270. package/lib/postprocess.ts +188 -0
  271. package/lib/pptx-color-filter.lua +37 -0
  272. package/lib/pptx-template.ts +625 -0
  273. package/lib/pptx-themes/academic.pptx +0 -0
  274. package/lib/pptx-themes/corporate.pptx +0 -0
  275. package/lib/pptx-themes/dark.pptx +0 -0
  276. package/lib/pptx-themes/default.pptx +0 -0
  277. package/lib/pptx-themes/minimal.pptx +0 -0
  278. package/lib/pptx-themes/plant.pptx +0 -0
  279. package/lib/pptx-themes.ts +896 -0
  280. package/lib/protect-restore.ts +516 -0
  281. package/lib/rate-limiter.ts +94 -0
  282. package/lib/{response.js → response.ts} +36 -21
  283. package/lib/{review.js → review.ts} +53 -43
  284. package/lib/{schema.js → schema.ts} +70 -25
  285. package/lib/{sections.js → sections.ts} +71 -76
  286. package/lib/slides.ts +793 -0
  287. package/lib/{spelling.js → spelling.ts} +43 -59
  288. package/lib/{templates.js → templates.ts} +20 -17
  289. package/lib/themes.ts +742 -0
  290. package/lib/{trackchanges.js → trackchanges.ts} +52 -23
  291. package/lib/types.ts +509 -0
  292. package/lib/{undo.js → undo.ts} +75 -52
  293. package/lib/utils.ts +41 -0
  294. package/lib/{variables.js → variables.ts} +60 -54
  295. package/lib/word.ts +428 -0
  296. package/lib/{wordcomments.js → wordcomments.ts} +94 -40
  297. package/package.json +15 -5
  298. package/skill/REFERENCE.md +67 -0
  299. package/tsconfig.json +26 -0
  300. package/lib/annotations.js +0 -414
  301. package/lib/build.js +0 -639
  302. package/lib/config.js +0 -79
  303. package/lib/import.js +0 -1145
  304. package/lib/journals.js +0 -629
  305. package/lib/word.js +0 -225
  306. /package/lib/{scientific-words.js → scientific-words.ts} +0 -0
package/lib/import.js DELETED
@@ -1,1145 +0,0 @@
1
- /**
2
- * Import functionality - convert Word docs to annotated Markdown
3
- */
4
-
5
- import * as fs from 'fs';
6
- import * as path from 'path';
7
- import { diffWords } from 'diff';
8
- import { stripAnnotations } from './annotations.js';
9
- import { exec } from 'child_process';
10
- import { promisify } from 'util';
11
-
12
- const execAsync = promisify(exec);
13
-
14
- /**
15
- * Extract comments directly from Word docx comments.xml
16
- * @param {string} docxPath
17
- * @returns {Promise<Array<{id: string, author: string, date: string, text: string}>>}
18
- */
19
- export async function extractWordComments(docxPath) {
20
- const AdmZip = (await import('adm-zip')).default;
21
- const { parseStringPromise } = await import('xml2js');
22
-
23
- const comments = [];
24
-
25
- // Validate file exists
26
- if (!fs.existsSync(docxPath)) {
27
- throw new Error(`File not found: ${docxPath}`);
28
- }
29
-
30
- try {
31
- let zip;
32
- try {
33
- zip = new AdmZip(docxPath);
34
- } catch (err) {
35
- throw new Error(`Invalid Word document (not a valid .docx file): ${err.message}`);
36
- }
37
-
38
- const commentsEntry = zip.getEntry('word/comments.xml');
39
-
40
- if (!commentsEntry) {
41
- return comments;
42
- }
43
-
44
- let commentsXml;
45
- try {
46
- commentsXml = commentsEntry.getData().toString('utf8');
47
- } catch (err) {
48
- throw new Error(`Failed to read comments from document: ${err.message}`);
49
- }
50
-
51
- const parsed = await parseStringPromise(commentsXml, { explicitArray: false });
52
-
53
- const ns = 'w:';
54
- const commentsRoot = parsed['w:comments'];
55
- if (!commentsRoot || !commentsRoot['w:comment']) {
56
- return comments;
57
- }
58
-
59
- // Ensure it's an array
60
- const commentNodes = Array.isArray(commentsRoot['w:comment'])
61
- ? commentsRoot['w:comment']
62
- : [commentsRoot['w:comment']];
63
-
64
- for (const comment of commentNodes) {
65
- const id = comment.$?.['w:id'] || '';
66
- const author = comment.$?.['w:author'] || 'Unknown';
67
- const date = comment.$?.['w:date'] || '';
68
-
69
- // Extract text from nested w:p/w:r/w:t elements
70
- let text = '';
71
- const extractText = (node) => {
72
- if (!node) return;
73
- if (typeof node === 'string') {
74
- text += node;
75
- return;
76
- }
77
- if (node['w:t']) {
78
- const t = node['w:t'];
79
- text += typeof t === 'string' ? t : (t._ || t);
80
- }
81
- if (node['w:r']) {
82
- const runs = Array.isArray(node['w:r']) ? node['w:r'] : [node['w:r']];
83
- runs.forEach(extractText);
84
- }
85
- if (node['w:p']) {
86
- const paras = Array.isArray(node['w:p']) ? node['w:p'] : [node['w:p']];
87
- paras.forEach(extractText);
88
- }
89
- };
90
- extractText(comment);
91
-
92
- comments.push({ id, author, date: date.slice(0, 10), text: text.trim() });
93
- }
94
- } catch (err) {
95
- // Re-throw with more context if it's already an Error we created
96
- if (err.message.includes('Invalid Word document') || err.message.includes('File not found')) {
97
- throw err;
98
- }
99
- throw new Error(`Error extracting comments from ${path.basename(docxPath)}: ${err.message}`);
100
- }
101
-
102
- return comments;
103
- }
104
-
105
- /**
106
- * Extract comment anchor texts from document.xml with surrounding context
107
- * Returns map of comment ID -> {anchor, before, after} for better matching
108
- * @param {string} docxPath
109
- * @returns {Promise<Map<string, {anchor: string, before: string, after: string}>>}
110
- */
111
- export async function extractCommentAnchors(docxPath) {
112
- const AdmZip = (await import('adm-zip')).default;
113
- const anchors = new Map();
114
-
115
- try {
116
- const zip = new AdmZip(docxPath);
117
- const docEntry = zip.getEntry('word/document.xml');
118
-
119
- if (!docEntry) {
120
- return anchors;
121
- }
122
-
123
- const docXml = docEntry.getData().toString('utf8');
124
-
125
- // Extract ALL text nodes in document order for context
126
- const allTextNodes = [...docXml.matchAll(/<w:t[^>]*>([^<]*)<\/w:t>/g)].map(m => m[1]);
127
- const fullDocText = allTextNodes.join('');
128
-
129
- // Find commentRangeStart...commentRangeEnd pairs
130
- const rangePattern = /<w:commentRangeStart[^>]*w:id="(\d+)"[^>]*\/?>[\s\S]*?<w:commentRangeEnd[^>]*w:id="\1"[^>]*\/?>/g;
131
-
132
- let match;
133
- while ((match = rangePattern.exec(docXml)) !== null) {
134
- const id = match[1];
135
- const rangeContent = match[0];
136
-
137
- // Extract all w:t text within this range
138
- const textMatches = rangeContent.matchAll(/<w:t[^>]*>([^<]*)<\/w:t>/g);
139
- let anchorText = '';
140
- for (const tm of textMatches) {
141
- anchorText += tm[1];
142
- }
143
-
144
- if (anchorText.trim()) {
145
- // Get surrounding context from full document
146
- const anchorPos = fullDocText.indexOf(anchorText.trim());
147
- let before = '';
148
- let after = '';
149
-
150
- if (anchorPos >= 0) {
151
- // Get ~100 chars before (up to sentence boundary)
152
- const beforeText = fullDocText.slice(Math.max(0, anchorPos - 150), anchorPos);
153
- const sentenceStart = beforeText.search(/[.!?]\s+[A-Z][^.!?]*$/);
154
- before = sentenceStart >= 0 ? beforeText.slice(sentenceStart + 2).trim() : beforeText.slice(-80).trim();
155
-
156
- // Get ~100 chars after (up to sentence boundary)
157
- const afterStart = anchorPos + anchorText.length;
158
- const afterText = fullDocText.slice(afterStart, afterStart + 150);
159
- const sentenceEnd = afterText.search(/[.!?]\s/);
160
- after = sentenceEnd >= 0 ? afterText.slice(0, sentenceEnd + 1).trim() : afterText.slice(0, 80).trim();
161
- }
162
-
163
- anchors.set(id, { anchor: anchorText.trim(), before, after });
164
- }
165
- }
166
- } catch (err) {
167
- console.error('Error extracting comment anchors:', err.message);
168
- }
169
-
170
- return anchors;
171
- }
172
-
173
- /**
174
- * Extract text from Word document using pandoc (preserves equations as LaTeX)
175
- * Falls back to mammoth if pandoc fails
176
- * @param {string} docxPath
177
- * @param {object} options - { mediaDir: string } - Directory to extract images to
178
- * @returns {Promise<{text: string, comments: Array, anchors: Map, extractedMedia: string[]}>}
179
- */
180
- export async function extractFromWord(docxPath, options = {}) {
181
- let text;
182
- let messages = [];
183
- let extractedMedia = [];
184
-
185
- // Determine media extraction directory
186
- const docxDir = path.dirname(docxPath);
187
- const mediaDir = options.mediaDir || path.join(docxDir, 'media');
188
-
189
- // Try pandoc first (preserves OMML equations as LaTeX and extracts images)
190
- try {
191
- // Use --extract-media to save images from the Word document
192
- const { stdout } = await execAsync(
193
- `pandoc "${docxPath}" -t markdown --wrap=none --extract-media="${mediaDir}"`,
194
- { maxBuffer: 50 * 1024 * 1024 }
195
- );
196
- text = stdout;
197
-
198
- // Find extracted media files
199
- const mediaSubdir = path.join(mediaDir, 'media');
200
- if (fs.existsSync(mediaSubdir)) {
201
- extractedMedia = fs.readdirSync(mediaSubdir)
202
- .filter(f => /\.(png|jpg|jpeg|gif|svg|emf|wmf|tiff?)$/i.test(f))
203
- .map(f => path.join(mediaSubdir, f));
204
-
205
- if (extractedMedia.length > 0) {
206
- messages.push({
207
- type: 'info',
208
- message: `Extracted ${extractedMedia.length} image(s) to ${mediaSubdir}`
209
- });
210
- }
211
- }
212
- } catch (pandocErr) {
213
- // Fall back to mammoth if pandoc fails
214
- messages.push({ type: 'warning', message: 'Pandoc failed, using mammoth (equations and images may not be preserved)' });
215
- const mammoth = await import('mammoth');
216
- const textResult = await mammoth.extractRawText({ path: docxPath });
217
- const htmlResult = await mammoth.convertToHtml({ path: docxPath });
218
- text = textResult.value;
219
- messages = [...textResult.messages, ...htmlResult.messages];
220
- }
221
-
222
- // Extract comments directly from docx XML
223
- const comments = await extractWordComments(docxPath);
224
-
225
- // Extract comment anchor texts
226
- const anchors = await extractCommentAnchors(docxPath);
227
-
228
- return {
229
- text,
230
- comments,
231
- anchors,
232
- messages,
233
- extractedMedia,
234
- };
235
- }
236
-
237
- /**
238
- * Insert comments into markdown text based on anchor texts with context
239
- * Uses sentence context for disambiguation and tie-breaks for duplicates
240
- * @param {string} markdown - The markdown text
241
- * @param {Array} comments - Array of {id, author, text}
242
- * @param {Map} anchors - Map of comment id -> {anchor, before, after} or string (legacy)
243
- * @param {object} options - Options {quiet: boolean}
244
- * @returns {string} - Markdown with comments inserted
245
- */
246
- export function insertCommentsIntoMarkdown(markdown, comments, anchors, options = {}) {
247
- const { quiet = false } = options;
248
- let result = markdown;
249
- let unmatchedCount = 0;
250
- const duplicateWarnings = [];
251
- const usedPositions = new Set(); // For tie-breaking: track used positions
252
-
253
- // Get all positions in order (for sequential tie-breaking)
254
- const commentsWithPositions = comments.map((c) => {
255
- const anchorData = anchors.get(c.id);
256
- if (!anchorData) {
257
- unmatchedCount++;
258
- return { ...c, pos: -1, anchorText: null };
259
- }
260
-
261
- // Support both old format (string) and new format ({anchor, before, after})
262
- const anchor = typeof anchorData === 'string' ? anchorData : anchorData.anchor;
263
- const before = typeof anchorData === 'object' ? anchorData.before : '';
264
- const after = typeof anchorData === 'object' ? anchorData.after : '';
265
-
266
- const anchorLower = anchor.toLowerCase();
267
- const resultLower = result.toLowerCase();
268
-
269
- // Find ALL occurrences of anchor text
270
- const occurrences = [];
271
- let searchIdx = 0;
272
- while ((searchIdx = resultLower.indexOf(anchorLower, searchIdx)) !== -1) {
273
- occurrences.push(searchIdx);
274
- searchIdx += 1;
275
- }
276
-
277
- if (occurrences.length === 0) {
278
- // Try normalized whitespace match
279
- const normalizedAnchor = anchor.replace(/\s+/g, ' ').toLowerCase();
280
- const normalizedResult = result.replace(/\s+/g, ' ').toLowerCase();
281
- const normalizedIdx = normalizedResult.indexOf(normalizedAnchor);
282
-
283
- if (normalizedIdx !== -1) {
284
- return { ...c, pos: normalizedIdx + anchor.length, anchorText: anchor };
285
- }
286
- unmatchedCount++;
287
- return { ...c, pos: -1, anchorText: null };
288
- }
289
-
290
- if (occurrences.length === 1) {
291
- // Unique match - easy case
292
- // Position at START of anchor (comment goes before, anchor gets marked)
293
- return { ...c, pos: occurrences[0], anchorText: anchor, anchorEnd: occurrences[0] + anchor.length };
294
- }
295
-
296
- // Multiple occurrences - use context for disambiguation
297
- duplicateWarnings.push(`"${anchor.slice(0, 40)}${anchor.length > 40 ? '...' : ''}" appears ${occurrences.length} times`);
298
-
299
- // Score each occurrence based on context match
300
- // Initialize to first UNUSED occurrence (for tie-break correctness)
301
- let bestIdx = occurrences.find(p => !usedPositions.has(p)) ?? occurrences[0];
302
- let bestScore = -1; // Start at -1 so first valid candidate wins
303
-
304
- for (const pos of occurrences) {
305
- // Skip positions already used by previous comments
306
- if (usedPositions.has(pos)) continue;
307
-
308
- let score = 0;
309
-
310
- // Check context before
311
- if (before) {
312
- const contextBefore = result.slice(Math.max(0, pos - before.length - 20), pos).toLowerCase();
313
- const beforeLower = before.toLowerCase();
314
- // Check if context contains parts of 'before'
315
- const beforeWords = beforeLower.split(/\s+/).filter(w => w.length > 3);
316
- for (const word of beforeWords) {
317
- if (contextBefore.includes(word)) score += 2;
318
- }
319
- // Bonus for full match
320
- if (contextBefore.includes(beforeLower.slice(-30))) score += 5;
321
- }
322
-
323
- // Check context after
324
- if (after) {
325
- const contextAfter = result.slice(pos + anchor.length, pos + anchor.length + after.length + 20).toLowerCase();
326
- const afterLower = after.toLowerCase();
327
- // Check if context contains parts of 'after'
328
- const afterWords = afterLower.split(/\s+/).filter(w => w.length > 3);
329
- for (const word of afterWords) {
330
- if (contextAfter.includes(word)) score += 2;
331
- }
332
- // Bonus for full match
333
- if (contextAfter.includes(afterLower.slice(0, 30))) score += 5;
334
- }
335
-
336
- // Tie-break: prefer earlier unused occurrence (document order)
337
- if (score > bestScore || (score === bestScore && pos < bestIdx)) {
338
- bestScore = score;
339
- bestIdx = pos;
340
- }
341
- }
342
-
343
- // Mark this position as used for tie-breaking subsequent comments
344
- usedPositions.add(bestIdx);
345
-
346
- // Position at START of anchor (comment goes before, anchor gets marked)
347
- return { ...c, pos: bestIdx, anchorText: anchor, anchorEnd: bestIdx + anchor.length };
348
- }).filter((c) => c.pos >= 0);
349
-
350
- // Sort by position descending (insert from end to avoid offset issues)
351
- commentsWithPositions.sort((a, b) => b.pos - a.pos);
352
-
353
- // Insert each comment with anchor marking
354
- for (const c of commentsWithPositions) {
355
- const comment = `{>>${c.author}: ${c.text}<<}`;
356
- if (c.anchorText && c.anchorEnd) {
357
- // Replace anchor text with: {>>comment<<}[anchor]{.mark}
358
- const before = result.slice(0, c.pos);
359
- const anchor = result.slice(c.pos, c.anchorEnd);
360
- const after = result.slice(c.anchorEnd);
361
- result = before + comment + `[${anchor}]{.mark}` + after;
362
- } else {
363
- // No anchor - just insert comment at position
364
- result = result.slice(0, c.pos) + ` ${comment}` + result.slice(c.pos);
365
- }
366
- }
367
-
368
- // Log warnings unless quiet mode
369
- if (!quiet) {
370
- if (unmatchedCount > 0) {
371
- console.warn(`Warning: ${unmatchedCount} comment(s) could not be matched to anchor text`);
372
- }
373
- if (duplicateWarnings.length > 0) {
374
- console.warn(`Warning: Duplicate anchor text found (using context & tie-breaks for placement):`);
375
- for (const w of duplicateWarnings) {
376
- console.warn(` - ${w}`);
377
- }
378
- }
379
- }
380
-
381
- return result;
382
- }
383
-
384
- /**
385
- * Normalize text for comparison (handle whitespace differences)
386
- * @param {string} text
387
- * @returns {string}
388
- */
389
- function normalizeText(text) {
390
- return text
391
- .replace(/\r\n/g, '\n') // Normalize line endings
392
- .replace(/\t/g, ' ') // Tabs to spaces
393
- .replace(/ +/g, ' ') // Collapse multiple spaces
394
- .trim();
395
- }
396
-
397
- /**
398
- * Fix citation and math annotations by preserving original markdown syntax
399
- * When Word renders [@Author2021] as "(Author et al. 2021)" or $p$ as "p", we preserve markdown
400
- * @param {string} text - Annotated text
401
- * @param {string} originalMd - Original markdown with proper citations and math
402
- * @returns {string}
403
- */
404
/**
 * Repair CriticMarkup annotations that the word-level diff produced around
 * math and citations, so the markdown source forms (`$...$`, `[@key]`) are
 * kept instead of being marked as deleted or replaced by their rendered form.
 *
 * The steps run in a fixed order; later steps clean up what earlier ones
 * leave behind (double spaces, empty annotations).
 *
 * @param {string} text - Annotated markdown containing CriticMarkup
 * @param {string} [originalMd] - Original markdown. Not consulted by any step;
 *   accepted only so existing call sites keep working.
 * @returns {string} The annotated text with citation/math annotations repaired
 */
function fixCitationAnnotations(text, originalMd) {
  // Step 0: Fix math annotations - preserve inline and display math
  // Deletions of math should keep the math: {--$p$--} -> $p$
  text = text.replace(/\{--(\$[^$]+\$)--\}/g, '$1');
  text = text.replace(/\{--(\$\$[^$]+\$\$)--\}/g, '$1');

  // Substitutions where math was "changed" to rendered form: {~~$p$~>p~~} -> $p$
  text = text.replace(/\{~~(\$[^$]+\$)~>[^~]+~~\}/g, '$1');
  text = text.replace(/\{~~(\$\$[^$]+\$\$)~>[^~]+~~\}/g, '$1');

  // Step 1: Substitution whose old side is exactly a markdown citation
  // {~~[@Author]~>rendered~~} -> [@Author]
  text = text.replace(/\{~~(\[@[^\]]+\])~>[^~]+~~\}/g, '$1');

  // Step 2: Substitution whose old side STARTS with a markdown citation
  // {~~[@Author] more text~>rendered more~~} -> [@Author] {~~more text~>more~~}
  text = text.replace(/\{~~(\[@[^\]]+\])\s*([^~]*)~>([^~]*)~~\}/g, (match, cite, oldText, newText) => {
    const oldTrimmed = oldText.trim();
    const newTrimmed = newText.trim();
    // Nothing but the citation on either side: keep only the citation
    if (oldTrimmed === '' && newTrimmed === '') {
      return cite;
    }
    // Keep the citation; re-wrap the remainder only when it really changed
    return cite + (oldTrimmed !== newTrimmed ? ` {~~${oldTrimmed}~>${newTrimmed}~~}` : ` ${newText}`);
  });

  // Step 3: Deletions of markdown citations should keep the citation
  text = text.replace(/\{--(\[@[^\]]+\])--\}/g, '$1');

  // Step 4: Insertions of rendered citations (usually duplicates) are removed
  // {++(Author et al. 2021)++} or {++(Author 2021)++}
  text = text.replace(/\{\+\+\([A-Z][^)]*\d{4}[^)]*\)\+\+\}/g, '');

  // Step 5: Substitution whose old side is a bare citation key without brackets
  // {~~@Author2021~>rendered~~} -> [@Author2021]
  text = text.replace(/\{~~(@[A-Za-z]+\d{4})~>[^~]+~~\}/g, '[$1]');

  // Step 6: Citations split across substitution boundaries
  // {~~[@~>something~~}Author2021] -> [@Author2021]
  text = text.replace(/\{~~\[@~>[^~]*~~\}([A-Za-z]+\d{4})\]/g, '[@$1]');

  // Step 7: Remaining partial citations inside substitutions
  // {~~; @Author2021]~>something~~} -> ; [@Author2021]
  text = text.replace(/\{~~;\s*@([A-Za-z]+\d{4})\]~>[^~]*~~\}/g, '; [@$1]');

  // Step 8: Remove inserted fragments of rendered citations.
  // \p{L} / \p{Lu} (with the u flag) cover accented letters (š, é, ü, ...).

  // Full rendered citations in parentheses: {++(Author et al. 2021)++}
  text = text.replace(/\{\+\+\(\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\+\+\}/gu, '');
  text = text.replace(/\{\+\+\(\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');

  // Trailing citation fragments: {++2019; IPBES 2023). ++} or {++2008b; Rouget et al. 2016). ++}
  text = text.replace(/\{\+\+\d{4}[a-z]?(?:[;,]\s*(?:\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+)?\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');
  text = text.replace(/\{\+\+\d{4}[a-z]?(?:[;,]\s*(?:\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+)?\d{4}[a-z]?)*\)\s*\+\+\}/gu, '');

  // Just year with closing paren: {++2021)++} or {++2021).++}
  text = text.replace(/\{\+\+\d{4}[a-z]?\)\.\s*\+\+\}/g, '');
  text = text.replace(/\{\+\+\d{4}[a-z]?\)\s*\+\+\}/g, '');

  // Leading citation fragments: {++Author et al.++} or {++(Author++}
  // NOTE(review): this also removes any inserted single capitalised word
  // (e.g. {++However++}); confirm that is acceptable for reviewer edits.
  text = text.replace(/\{\+\+\(?\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s*\+\+\}/gu, '');

  // Semicolon-separated author-year fragments: {++; Author 2021++}
  text = text.replace(/\{\+\+[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?\+\+\}/gu, '');

  // Author-year lists with a closing paren: {++Author 2019; Other 2020)++}
  text = text.replace(/\{\+\+\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\s*\+\+\}/gu, '');
  text = text.replace(/\{\+\+\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, '');

  // Step 9: Collapse runs of spaces and remove whitespace left before . and ,
  text = text.replace(/ +/g, ' ');
  text = text.replace(/\s+\./g, '.');
  text = text.replace(/\s+,/g, ',');

  // Step 10: Final cleanup - remove annotations that ended up empty
  text = text.replace(/\{~~\s*~>\s*~~\}/g, '');
  text = text.replace(/\{\+\+\s*\+\+\}/g, '');
  text = text.replace(/\{--\s*--\}/g, '');

  return text;
}
493
-
494
/**
 * Strip markdown syntax to get plain text (for comparison with Word output).
 *
 * Order matters:
 *  - fenced code blocks are removed before inline code, otherwise the inline
 *    rule consumes the fence backticks and the block survives;
 *  - images are removed before links, otherwise `![alt](url)` is reduced to
 *    `!alt` by the link rule;
 *  - table separator rows are removed before pipes are turned into spaces.
 *
 * Citations such as `[@Author2020]` are intentionally left untouched.
 *
 * @param {string} md - Markdown source
 * @returns {string} Text with markdown markers removed and trimmed
 */
function stripMarkdownSyntax(md) {
  return md
    // YAML front matter: only at the very start of the input (no `m` flag,
    // so a pair of horizontal rules later in the text is not mistaken for it)
    .replace(/^---[ \t]*\n(?:[\s\S]*?\n)?---[ \t]*(?:\n+|$)/, '')
    // Code blocks: ```...``` → (remove)
    .replace(/```[\s\S]*?```/g, '')
    // Headers: # Title → Title
    .replace(/^#{1,6}\s+/gm, '')
    // Bold/italic: **text** or *text* or __text__ or _text_ → text
    .replace(/(\*\*|__)(.*?)\1/g, '$2')
    .replace(/(\*|_)(.*?)\1/g, '$2')
    // Images: ![alt](url) → (remove entirely)
    .replace(/!\[([^\]]*)\]\([^)]+\)/g, '')
    // Links: [text](url) → text
    .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
    // Inline code: `code` → code
    .replace(/`([^`]+)`/g, '$1')
    // Blockquotes: > text → text
    .replace(/^>\s*/gm, '')
    // Horizontal rules
    .replace(/^[-*_]{3,}\s*$/gm, '')
    // List markers: - item or * item or 1. item → item
    .replace(/^[\s]*[-*+]\s+/gm, '')
    .replace(/^[\s]*\d+\.\s+/gm, '')
    // Tables: drop separator rows (|---|:--:|), then turn pipes into spaces
    .replace(/^[ \t|:-]*-[ \t|:-]*$/gm, '')
    .replace(/\|/g, ' ')
    // Clean up extra blank lines
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
531
-
532
/**
 * Produce CriticMarkup by word-diffing the original markdown against the
 * text extracted from Word. Both inputs are passed through `normalizeText`
 * before diffing.
 *
 * @param {string} originalMd - Original markdown content
 * @param {string} wordText - Text extracted from Word
 * @param {string} author - Author name for the changes (accepted but not used here)
 * @returns {string} Annotated markdown: added parts wrapped in `{++ ++}`,
 *   removed parts in `{-- --}`, unchanged parts copied verbatim
 */
export function generateAnnotatedDiff(originalMd, wordText, author = 'Reviewer') {
  const before = normalizeText(originalMd);
  const after = normalizeText(wordText);

  return diffWords(before, after)
    .map(({ added, removed, value }) => {
      if (added) return `{++${value}++}`;
      if (removed) return `{--${value}--}`;
      return value;
    })
    .join('');
}
564
-
565
/**
 * Split a line into its leading markdown marker and the remaining content.
 * Recognised markers, tried in this order: ATX header (`#`..`######` plus
 * whitespace), list item (`-`, `*`, `+` or `N.` with optional indentation),
 * blockquote (`>`).
 *
 * @param {string} line
 * @returns {{prefix: string, content: string}} `prefix` is '' when no marker matched
 */
function extractMarkdownPrefix(line) {
  const markerPatterns = [
    /^(#{1,6}\s+)/, // headers
    /^(\s*[-*+]\s+|\s*\d+\.\s+)/, // list items
    /^(>\s*)/, // blockquotes
  ];

  for (const pattern of markerPatterns) {
    const found = pattern.exec(line);
    if (found) {
      const marker = found[1];
      return { prefix: marker, content: line.slice(marker.length) };
    }
  }

  return { prefix: '', content: line };
}
591
-
592
/**
 * Swap figure/table/equation/section/listing anchors for opaque placeholders
 * before diffing, so the diff cannot split or delete them.
 * Matches `{#fig:...}`, `{#tbl:...}`, `{#eq:...}`, `{#sec:...}`, `{#lst:...}`,
 * including any extra attributes up to the closing brace.
 *
 * @param {string} md
 * @returns {{text: string, anchors: Array<{original: string, placeholder: string}>}}
 *   `text` with each anchor replaced by `ANCHORBLOCK<n>ENDANCHOR`, and the
 *   list needed to restore them (in order of appearance)
 */
function protectAnchors(md) {
  const anchors = [];

  // Records one anchor and hands back the token that stands in for it
  const stash = (original) => {
    const placeholder = `ANCHORBLOCK${anchors.length}ENDANCHOR`;
    anchors.push({ original, placeholder });
    return placeholder;
  };

  const text = md.replace(/\{#(fig|tbl|eq|sec|lst):[^}]+\}/g, (found) => stash(found));
  return { text, anchors };
}
612
-
613
/**
 * Put the original anchors back in place of their placeholders.
 *
 * An anchor that ended up inside a deletion or on the old side of a
 * substitution is pulled out of the annotation so it is never lost:
 *   {--a PLACEHOLDER b--}       -> {--a--}{#fig:x}{--b--}
 *   {~~old PLACEHOLDER~>new~~}  -> {~~old~>new~~}{#fig:x}
 * Any other occurrence is replaced directly.
 *
 * @param {string} text
 * @param {Array} anchors - Entries of `{original, placeholder}`
 * @returns {string}
 */
function restoreAnchors(text, anchors) {
  // Wrap a non-empty fragment as a deletion; empty fragments vanish
  const asDeletion = (fragment) => (fragment ? `{--${fragment}--}` : '');

  let restored = text;
  for (const { original, placeholder } of anchors) {
    const insideDeletion = new RegExp(`\\{--([^}]*?)${placeholder}([^}]*?)--\\}`, 'g');
    const insideSubstitution = new RegExp(`\\{~~([^~]*?)${placeholder}([^~]*?)~>([^~]*)~~\\}`, 'g');

    restored = restored
      .replace(insideDeletion, (whole, lead, trail) =>
        asDeletion(lead.trim()) + original + asDeletion(trail.trim()))
      .replace(insideSubstitution, (whole, oldLead, oldTrail, replacement) => {
        const previous = (oldLead.trim() + ' ' + oldTrail.trim()).trim();
        const current = replacement.trim();
        // If removing the anchor leaves old and new identical, there is no change to mark
        const body = previous !== current ? `{~~${previous}~>${current}~~}` : current;
        return body + original;
      })
      // Everything else: plain substitution (split/join avoids `$` handling)
      .split(placeholder)
      .join(original);
  }
  return restored;
}
657
-
658
/**
 * Swap cross-references (`@fig:label`, `@tbl:label`, `@eq:label`,
 * `@sec:label`, `@lst:label`) for opaque placeholders before diffing.
 * Only the `@type:label` part is replaced, so surrounding brackets or
 * parentheses stay in the text.
 *
 * @param {string} md
 * @returns {{text: string, crossrefs: Array<{original: string, placeholder: string}>}}
 *   `text` with each reference replaced by `XREFBLOCK<n>ENDXREF`, plus the
 *   list needed to restore them (in order of appearance)
 */
function protectCrossrefs(md) {
  const crossrefs = [];

  const stash = (original) => {
    const placeholder = `XREFBLOCK${crossrefs.length}ENDXREF`;
    crossrefs.push({ original, placeholder });
    return placeholder;
  };

  const text = md.replace(/@(fig|tbl|eq|sec|lst):[a-zA-Z0-9_-]+/g, (found) => stash(found));
  return { text, crossrefs };
}
678
-
679
/**
 * Put the original cross-references back in place of their placeholders.
 *
 * A reference caught inside a deletion is pulled out and kept:
 *   {--see PLACEHOLDER--}            -> {--see--}@fig:x
 * A substitution that replaced exactly the reference (e.g. by its rendered
 * form "Figure 1") collapses back to the reference:
 *   {~~PLACEHOLDER~>Figure 1~~}      -> @fig:x
 * Any other occurrence is replaced directly.
 *
 * @param {string} text
 * @param {Array} crossrefs - Entries of `{original, placeholder}`
 * @returns {string}
 */
function restoreCrossrefs(text, crossrefs) {
  const asDeletion = (fragment) => (fragment ? `{--${fragment}--}` : '');

  let restored = text;
  for (const { original, placeholder } of crossrefs) {
    const insideDeletion = new RegExp(`\\{--([^}]*?)${placeholder}([^}]*?)--\\}`, 'g');
    const renderedSubstitution = new RegExp(`\\{~~${placeholder}~>[^~]+~~\\}`, 'g');

    restored = restored
      .replace(insideDeletion, (whole, lead, trail) =>
        asDeletion(lead.trim()) + original + asDeletion(trail.trim()))
      .replace(renderedSubstitution, original)
      .split(placeholder)
      .join(original);
  }
  return restored;
}
709
-
710
/**
 * Swap math for opaque placeholders before diffing.
 * Display math (`$$...$$`) is handled first so that the inline rule
 * (`$...$`, single line) does not pick up pieces of it.
 *
 * @param {string} md
 * @returns {{text: string, mathBlocks: Array<{original: string, placeholder: string, type: string, simplified: string}>}}
 *   `text` with each math span replaced by `MATHBLOCK<n>ENDMATH`; every entry
 *   also carries `type` ('display' | 'inline') and `simplified`, the result of
 *   `simplifyMathForMatching` on the math content
 */
function protectMath(md) {
  const mathBlocks = [];

  // Builds the replace-callback for one kind of math
  const stashAs = (type) => (original, content) => {
    const placeholder = `MATHBLOCK${mathBlocks.length}ENDMATH`;
    const simplified = simplifyMathForMatching(content);
    mathBlocks.push({ original, placeholder, type, simplified });
    return placeholder;
  };

  const text = md
    .replace(/\$\$([^$]+)\$\$/g, stashAs('display'))
    .replace(/\$([^$\n]+)\$/g, stashAs('inline'));

  return { text, mathBlocks };
}
740
-
741
/**
 * Reduce LaTeX math to an approximation of the plain text Word shows for it,
 * so the rendered form can be searched for in the Word text.
 * The rewrite rules are applied in order; whitespace is collapsed and
 * trimmed at the end.
 *
 * @param {string} latex
 * @returns {string}
 */
function simplifyMathForMatching(latex) {
  const rewriteRules = [
    // Unwrap / translate common LaTeX commands
    [/\\text\{([^}]+)\}/g, '$1'],
    [/\\hat\{([^}]+)\}/g, '$1'],
    [/\\bar\{([^}]+)\}/g, '$1'],
    [/\\frac\{([^}]+)\}\{([^}]+)\}/g, '$1/$2'],
    [/\\sum_([a-z])/g, 'Σ'],
    [/\\sum/g, 'Σ'],
    [/\\cdot/g, '·'],
    [/\\quad/g, ' '],
    [/\\,/g, ' '],
    [/\\_/g, '_'],
    [/\\{/g, '{'],
    [/\\}/g, '}'],
    // Whatever is left: drop backslashes and braces
    [/\\/g, ''],
    [/[{}]/g, ''],
    [/\s+/g, ' '],
  ];

  return rewriteRules
    .reduce((acc, [pattern, replacement]) => acc.replace(pattern, replacement), latex)
    .trim();
}
767
-
768
/**
 * Put the original math back in place of its placeholders.
 * Uses split/join so `$` characters in the math are inserted literally.
 *
 * @param {string} text
 * @param {Array} mathBlocks - Entries carrying `placeholder` and `original`
 * @returns {string}
 */
function restoreMath(text, mathBlocks) {
  let restored = text;
  for (const { placeholder, original } of mathBlocks) {
    restored = restored.split(placeholder).join(original);
  }
  return restored;
}
780
-
781
/**
 * Replace rendered math in the Word text with the placeholder of the
 * matching math block. Heuristic: for each block, the first exact occurrence
 * of its `simplified` form is replaced; simplified forms shorter than two
 * characters are skipped.
 *
 * @param {string} wordText
 * @param {Array} mathBlocks - Entries carrying `simplified` and `placeholder`
 * @returns {string}
 */
function replaceRenderedMath(wordText, mathBlocks) {
  let swapped = wordText;

  for (const { simplified, placeholder } of mathBlocks) {
    const searchable = simplified.length >= 2 && swapped.includes(simplified);
    if (searchable) {
      swapped = swapped.replace(simplified, placeholder);
    }
  }

  return swapped;
}
803
-
804
/**
 * Swap bracketed markdown citations (`[@...]`) for opaque placeholders
 * before diffing.
 *
 * @param {string} md
 * @returns {{text: string, citations: string[]}} `text` with the n-th
 *   citation replaced by `CITEREF<n>ENDCITE`; `citations[n]` is its original text
 */
function protectCitations(md) {
  const citations = [];

  const stash = (original) => {
    const token = `CITEREF${citations.length}ENDCITE`;
    citations.push(original);
    return token;
  };

  return {
    text: md.replace(/\[@[^\]]+\]/g, (found) => stash(found)),
    citations,
  };
}
818
-
819
/**
 * Put the original citations back in place of their placeholders.
 * Every occurrence is replaced, including placeholders that sit inside
 * annotations.
 *
 * @param {string} text
 * @param {string[]} citations - `citations[n]` replaces `CITEREF<n>ENDCITE`
 * @returns {string}
 */
function restoreCitations(text, citations) {
  let restored = text;
  let position = 0;
  for (const citation of citations) {
    restored = restored.split(`CITEREF${position}ENDCITE`).join(citation);
    position += 1;
  }
  return restored;
}
833
-
834
/**
 * Replace rendered citations in the Word text with citation placeholders.
 *
 * Placeholders are assigned by position: the n-th rendered citation found
 * becomes `CITEREF<n>ENDCITE`, for the first `count` matches only. Because
 * of that, a rendered citation the pattern fails to recognise shifts every
 * later placeholder, so author names are matched with Unicode letter classes
 * (`\p{Lu}\p{L}+`) rather than ASCII ranges; this covers accented names
 * (Müller, Šmíd) and all-caps names (IPBES).
 *
 * Recognised shapes: (Author 2021), (Author et al. 2021),
 * (Author & Other 2021), (Author 2020; Other 2021), (Author 2020a, 2021).
 *
 * @param {string} wordText
 * @param {number} count - Number of placeholders available
 * @returns {string}
 */
function replaceRenderedCitations(wordText, count) {
  const pattern = /\((?:\p{Lu}\p{L}+(?:\s+et\s+al\.?)?(?:\s*[&,;]\s*\p{Lu}\p{L}+(?:\s+et\s+al\.?)?)*\s+\d{4}(?:[a-z])?(?:\s*[,;]\s*(?:\p{Lu}\p{L}+(?:\s+et\s+al\.?)?\s+)?\d{4}(?:[a-z])?)*)\)/gu;

  let idx = 0;
  return wordText.replace(pattern, (match) => {
    if (idx < count) {
      const placeholder = `CITEREF${idx}ENDCITE`;
      idx++;
      return placeholder;
    }
    // More rendered citations than placeholders: leave the rest untouched
    return match;
  });
}
854
-
855
/**
 * Paragraph-level diff between the original markdown and the Word text that
 * keeps markdown structure intact.
 *
 * Pipeline:
 *  1. Protect anchors, cross-references, math and citations in the markdown
 *     with placeholders (in that order), and map rendered math/citations in
 *     the Word text onto the same placeholders.
 *  2. Split both texts into paragraphs (blank-line separated) and, for each
 *     original paragraph, look for the best match among the next three Word
 *     paragraphs using a shared-word ratio (must exceed 0.3).
 *  3. Matched paragraphs are word-diffed (markdown stripped, leading
 *     header/list/quote marker preserved); unmatched originals become
 *     deletions; Word paragraphs that were skipped over or left at the end
 *     become insertions.
 *  4. Restore the protected content in reverse order of protection.
 *
 * @param {string} originalMd
 * @param {string} wordText
 * @param {string} author - Accepted but not used here
 * @returns {string} Annotated markdown with CriticMarkup
 */
export function generateSmartDiff(originalMd, wordText, author = 'Reviewer') {
  // Protection order matters: anchors first, then crossrefs, math, citations
  const { text: mdWithAnchorsProtected, anchors: figAnchors } = protectAnchors(originalMd);
  const { text: mdWithXrefsProtected, crossrefs } = protectCrossrefs(mdWithAnchorsProtected);
  const { text: mdWithMathProtected, mathBlocks } = protectMath(mdWithXrefsProtected);
  const { text: mdProtected, citations } = protectCitations(mdWithMathProtected);

  // Replace rendered math and citations in Word text with matching placeholders
  let wordProtected = replaceRenderedMath(wordText, mathBlocks);
  wordProtected = replaceRenderedCitations(wordProtected, citations.length);

  // Split into paragraphs
  const originalParas = mdProtected.split(/\n\n+/);
  const wordParas = wordProtected.split(/\n\n+/);

  const result = [];

  // Index of the first Word paragraph not yet consumed
  let wordIdx = 0;

  for (let i = 0; i < originalParas.length; i++) {
    const orig = originalParas[i] || '';
    const { prefix: mdPrefix, content: origContent } = extractMarkdownPrefix(orig.split('\n')[0]);

    // Word set of the original's first line; invariant across candidates
    const origWords = new Set(origContent.toLowerCase().split(/\s+/));

    // Find best matching word paragraph within a window of three
    let bestMatch = -1;
    let bestScore = 0;

    for (let j = wordIdx; j < Math.min(wordIdx + 3, wordParas.length); j++) {
      const wordPara = wordParas[j] || '';
      // Simple similarity: share of Word tokens that also occur in the original
      const wordWords = wordPara.toLowerCase().split(/\s+/);
      const common = wordWords.filter((w) => origWords.has(w)).length;
      const score = common / Math.max(origWords.size, wordWords.length);

      if (score > bestScore && score > 0.3) {
        bestScore = score;
        bestMatch = j;
      }
    }

    if (bestMatch === -1) {
      // No match found - paragraph was deleted or heavily modified.
      // For lines with a markdown marker (header, list item, quote), accept
      // the next Word paragraph if it contains the start of the content.
      if (mdPrefix && wordIdx < wordParas.length) {
        const wordPara = wordParas[wordIdx];
        if (wordPara.toLowerCase().includes(origContent.toLowerCase().slice(0, 20))) {
          bestMatch = wordIdx;
        }
      }
    }

    if (bestMatch >= 0) {
      // Word paragraphs skipped over to reach the match have no counterpart
      // in the original: report them as insertions instead of dropping them.
      for (let j = wordIdx; j < bestMatch; j++) {
        const skipped = wordParas[j];
        if (skipped.trim()) {
          result.push(`{++${skipped}++}`);
        }
      }

      const word = wordParas[bestMatch];

      // Strip markdown from original for clean comparison
      const origStripped = stripMarkdownSyntax(orig);
      const wordNormalized = normalizeText(word);

      if (origStripped === wordNormalized) {
        // Unchanged (ignoring markdown syntax): keep the original markdown
        result.push(orig);
      } else {
        // Modified - diff the content, preserve markdown prefix
        const changes = diffWords(origStripped, wordNormalized);
        let annotated = mdPrefix;

        for (const part of changes) {
          if (part.added) {
            annotated += `{++${part.value}++}`;
          } else if (part.removed) {
            annotated += `{--${part.value}--}`;
          } else {
            annotated += part.value;
          }
        }

        result.push(annotated);
      }

      wordIdx = bestMatch + 1;
    } else {
      // Paragraph deleted entirely
      result.push(`{--${orig}--}`);
    }
  }

  // Any remaining word paragraphs are additions
  for (let j = wordIdx; j < wordParas.length; j++) {
    const word = wordParas[j];
    if (word.trim()) {
      result.push(`{++${word}++}`);
    }
  }

  // Restore protected content (reverse order of protection)
  let finalResult = result.join('\n\n');
  finalResult = restoreCitations(finalResult, citations);
  finalResult = restoreMath(finalResult, mathBlocks);
  finalResult = restoreCrossrefs(finalResult, crossrefs);
  finalResult = restoreAnchors(finalResult, figAnchors);

  return finalResult;
}
976
-
977
/**
 * Tidy up CriticMarkup produced by the diff.
 * Adjacent delete/insert pairs (optionally separated by whitespace) are
 * merged into substitutions, a few malformed shapes are repaired, empty
 * annotations are dropped and runs of spaces are collapsed.
 * e.g., {--old--}{++new++} → {~~old~>new~~}
 *
 * @param {string} text
 * @returns {string}
 */
export function cleanupAnnotations(text) {
  // Applied strictly in this order
  const passes = [
    // {--something--} {++something else++} → substitution
    [/\{--(.+?)--\}\s*\{\+\+(.+?)\+\+\}/g, '{~~$1~>$2~~}'],
    // {++new++} {--old--} (less common) → substitution
    [/\{\+\+(.+?)\+\+\}\s*\{--(.+?)--\}/g, '{~~$2~>$1~~}'],
    // Deletion opener fused with a substitution tail: {--key~>critical~~}
    [/\{--([^}]+?)~>([^}]+?)~~\}/g, '{~~$1~>$2~~}'],
    // Split substitutions: {~~word --} ... {++other~~}
    [/\{~~([^~]+)\s*--\}/g, '{--$1--}'],
    [/\{\+\+([^+]+)~~\}/g, '{++$1++}'],
    // Empty annotations
    [/\{--\s*--\}/g, ''],
    [/\{\+\+\s*\+\+\}/g, ''],
    // Runs of spaces
    [/ +/g, ' '],
  ];

  let cleaned = text;
  for (const [pattern, replacement] of passes) {
    cleaned = cleaned.replace(pattern, replacement);
  }
  return cleaned;
}
1009
-
1010
/**
 * Parse visible comment markers from Word text.
 * Format: [Author: comment text]
 *
 * The author part may not contain `@`, so bracketed citations and
 * cross-references such as `[@fig:map]` or `[see @tbl:x]` are not mistaken
 * for comments.
 *
 * @param {string} text
 * @returns {Array<{author: string, text: string, position: number}>}
 *   One entry per marker, in order of appearance; `position` is the index of
 *   the opening bracket in `text`
 */
export function parseVisibleComments(text) {
  const pattern = /\[([^\]:@]+):\s*([^\]]+)\]/g;

  return Array.from(text.matchAll(pattern), (match) => ({
    author: match[1].trim(),
    text: match[2].trim(),
    position: match.index,
  }));
}
1031
-
1032
/**
 * Convert visible comments (`[Author: comment text]`) to CriticMarkup
 * comments (`{>>Author: comment text<<}`).
 *
 * The author part may not contain `@`, so bracketed citations and
 * cross-references such as `[@fig:map]` or `[see @tbl:x]` are left as they
 * are instead of being rewritten into comments.
 *
 * @param {string} text
 * @returns {string}
 */
export function convertVisibleComments(text) {
  return text.replace(/\[([^\]:@]+):\s*([^\]]+)\]/g, '{>>$1: $2<<}');
}
1040
-
1041
/**
 * Full import pipeline: Word doc → annotated MD.
 *
 * When `options.sectionContent` is given it is used as the Word text and the
 * document is not opened; otherwise the text (and media) are extracted from
 * `docxPath`, with media written to `options.figuresDir` or, failing that,
 * the directory of the .docx. Extraction messages of type 'info' and
 * 'warning' are echoed to the console.
 *
 * The original markdown is read from disk and stripped of existing
 * annotations (comments included) before diffing, so annotations never nest.
 *
 * @param {string} docxPath - Path to Word document
 * @param {string} originalMdPath - Path to original markdown
 * @param {{author?: string, sectionContent?: string, figuresDir?: string}} options
 * @returns {Promise<{annotated: string, stats: object, extractedMedia: string[]}>}
 *   `stats` holds counts of insertion, deletion, substitution and comment
 *   markers found in `annotated`, plus their `total`
 */
export async function importFromWord(docxPath, originalMdPath, options = {}) {
  const { author = 'Reviewer', sectionContent, figuresDir } = options;

  let wordText = sectionContent;
  let extractedMedia = [];

  if (sectionContent === undefined) {
    // Media goes to figuresDir when provided, otherwise next to the docx
    const docxDir = path.dirname(docxPath);
    const extracted = await extractFromWord(docxPath, { mediaDir: figuresDir || docxDir });

    wordText = extracted.text;
    extractedMedia = extracted.extractedMedia || [];

    for (const msg of extracted.messages || []) {
      switch (msg.type) {
        case 'info':
          console.log(msg.message);
          break;
        case 'warning':
          console.warn(`Warning: ${msg.message}`);
          break;
        default:
          break;
      }
    }
  }

  // Always diff clean text: drop annotations already present in the markdown
  const originalMd = stripAnnotations(fs.readFileSync(originalMdPath, 'utf-8'), { keepComments: false });

  // diff → merge del/ins pairs → repair citation annotations → convert visible comments
  const diffed = generateSmartDiff(originalMd, wordText, author);
  const merged = cleanupAnnotations(diffed);
  const citationsFixed = fixCitationAnnotations(merged, originalMd);
  const annotated = convertVisibleComments(citationsFixed);

  // Count opening markers of each annotation kind
  const countOf = (opener) => (annotated.match(opener) || []).length;
  const insertions = countOf(/\{\+\+/g);
  const deletions = countOf(/\{--/g);
  const substitutions = countOf(/\{~~/g);
  const comments = countOf(/\{>>/g);

  return {
    annotated,
    stats: {
      insertions,
      deletions,
      substitutions,
      comments,
      total: insertions + deletions + substitutions + comments,
    },
    extractedMedia,
  };
}
1113
-
1114
/**
 * Copy extracted media files into a figures directory under sequential names
 * (`<prefix>1.<ext>`, `<prefix>2.<ext>`, ...; extension lower-cased).
 * The source files are copied, not removed. The target directory is created
 * (recursively) when it does not exist. A failed copy is recorded in
 * `errors` and does not stop the remaining files; numbering follows the
 * position in `mediaFiles`, so a failure leaves a gap.
 *
 * @param {string[]} mediaFiles - Paths to extracted media files
 * @param {string} figuresDir - Target figures directory
 * @param {string} prefix - Prefix for renamed files (e.g., 'fig')
 * @returns {{moved: Array<{from: string, to: string, name: string}>, errors: string[]}}
 */
export function moveExtractedMedia(mediaFiles, figuresDir, prefix = 'figure') {
  const moved = [];
  const errors = [];

  if (!fs.existsSync(figuresDir)) {
    fs.mkdirSync(figuresDir, { recursive: true });
  }

  let ordinal = 0;
  for (const src of mediaFiles) {
    ordinal += 1;
    const name = `${prefix}${ordinal}${path.extname(src).toLowerCase()}`;
    const to = path.join(figuresDir, name);

    try {
      fs.copyFileSync(src, to);
      moved.push({ from: src, to, name });
    } catch (err) {
      errors.push(`Failed to copy ${src}: ${err.message}`);
    }
  }

  return { moved, errors };
}