docrev 0.8.1 → 0.8.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (306) hide show
  1. package/.claude/settings.local.json +9 -0
  2. package/PLAN-tables-and-postprocess.md +850 -0
  3. package/README.md +33 -0
  4. package/bin/rev.js +12 -131
  5. package/bin/rev.ts +145 -0
  6. package/dist/bin/rev.d.ts +9 -0
  7. package/dist/bin/rev.d.ts.map +1 -0
  8. package/dist/bin/rev.js +118 -0
  9. package/dist/bin/rev.js.map +1 -0
  10. package/dist/lib/annotations.d.ts +91 -0
  11. package/dist/lib/annotations.d.ts.map +1 -0
  12. package/dist/lib/annotations.js +554 -0
  13. package/dist/lib/annotations.js.map +1 -0
  14. package/dist/lib/build.d.ts +171 -0
  15. package/dist/lib/build.d.ts.map +1 -0
  16. package/dist/lib/build.js +755 -0
  17. package/dist/lib/build.js.map +1 -0
  18. package/dist/lib/citations.d.ts +34 -0
  19. package/dist/lib/citations.d.ts.map +1 -0
  20. package/dist/lib/citations.js +140 -0
  21. package/dist/lib/citations.js.map +1 -0
  22. package/dist/lib/commands/build.d.ts +13 -0
  23. package/dist/lib/commands/build.d.ts.map +1 -0
  24. package/dist/lib/commands/build.js +678 -0
  25. package/dist/lib/commands/build.js.map +1 -0
  26. package/dist/lib/commands/citations.d.ts +11 -0
  27. package/dist/lib/commands/citations.d.ts.map +1 -0
  28. package/dist/lib/commands/citations.js +428 -0
  29. package/dist/lib/commands/citations.js.map +1 -0
  30. package/dist/lib/commands/comments.d.ts +11 -0
  31. package/dist/lib/commands/comments.d.ts.map +1 -0
  32. package/dist/lib/commands/comments.js +883 -0
  33. package/dist/lib/commands/comments.js.map +1 -0
  34. package/dist/lib/commands/context.d.ts +35 -0
  35. package/dist/lib/commands/context.d.ts.map +1 -0
  36. package/dist/lib/commands/context.js +59 -0
  37. package/dist/lib/commands/context.js.map +1 -0
  38. package/dist/lib/commands/core.d.ts +11 -0
  39. package/dist/lib/commands/core.d.ts.map +1 -0
  40. package/dist/lib/commands/core.js +246 -0
  41. package/dist/lib/commands/core.js.map +1 -0
  42. package/dist/lib/commands/doi.d.ts +11 -0
  43. package/dist/lib/commands/doi.d.ts.map +1 -0
  44. package/dist/lib/commands/doi.js +373 -0
  45. package/dist/lib/commands/doi.js.map +1 -0
  46. package/dist/lib/commands/history.d.ts +11 -0
  47. package/dist/lib/commands/history.d.ts.map +1 -0
  48. package/dist/lib/commands/history.js +245 -0
  49. package/dist/lib/commands/history.js.map +1 -0
  50. package/dist/lib/commands/index.d.ts +28 -0
  51. package/dist/lib/commands/index.d.ts.map +1 -0
  52. package/dist/lib/commands/index.js +35 -0
  53. package/dist/lib/commands/index.js.map +1 -0
  54. package/dist/lib/commands/init.d.ts +11 -0
  55. package/dist/lib/commands/init.d.ts.map +1 -0
  56. package/dist/lib/commands/init.js +209 -0
  57. package/dist/lib/commands/init.js.map +1 -0
  58. package/dist/lib/commands/response.d.ts +11 -0
  59. package/dist/lib/commands/response.d.ts.map +1 -0
  60. package/dist/lib/commands/response.js +317 -0
  61. package/dist/lib/commands/response.js.map +1 -0
  62. package/dist/lib/commands/sections.d.ts +11 -0
  63. package/dist/lib/commands/sections.d.ts.map +1 -0
  64. package/dist/lib/commands/sections.js +1071 -0
  65. package/dist/lib/commands/sections.js.map +1 -0
  66. package/dist/lib/commands/utilities.d.ts +19 -0
  67. package/dist/lib/commands/utilities.d.ts.map +1 -0
  68. package/dist/lib/commands/utilities.js +2009 -0
  69. package/dist/lib/commands/utilities.js.map +1 -0
  70. package/dist/lib/comment-realign.d.ts +50 -0
  71. package/dist/lib/comment-realign.d.ts.map +1 -0
  72. package/dist/lib/comment-realign.js +372 -0
  73. package/dist/lib/comment-realign.js.map +1 -0
  74. package/dist/lib/config.d.ts +41 -0
  75. package/dist/lib/config.d.ts.map +1 -0
  76. package/dist/lib/config.js +76 -0
  77. package/dist/lib/config.js.map +1 -0
  78. package/dist/lib/crossref.d.ts +108 -0
  79. package/dist/lib/crossref.d.ts.map +1 -0
  80. package/dist/lib/crossref.js +597 -0
  81. package/dist/lib/crossref.js.map +1 -0
  82. package/dist/lib/dependencies.d.ts +30 -0
  83. package/dist/lib/dependencies.d.ts.map +1 -0
  84. package/dist/lib/dependencies.js +95 -0
  85. package/dist/lib/dependencies.js.map +1 -0
  86. package/dist/lib/doi-cache.d.ts +29 -0
  87. package/dist/lib/doi-cache.d.ts.map +1 -0
  88. package/dist/lib/doi-cache.js +104 -0
  89. package/dist/lib/doi-cache.js.map +1 -0
  90. package/dist/lib/doi.d.ts +65 -0
  91. package/dist/lib/doi.d.ts.map +1 -0
  92. package/dist/lib/doi.js +710 -0
  93. package/dist/lib/doi.js.map +1 -0
  94. package/dist/lib/equations.d.ts +61 -0
  95. package/dist/lib/equations.d.ts.map +1 -0
  96. package/dist/lib/equations.js +445 -0
  97. package/dist/lib/equations.js.map +1 -0
  98. package/dist/lib/errors.d.ts +60 -0
  99. package/dist/lib/errors.d.ts.map +1 -0
  100. package/dist/lib/errors.js +303 -0
  101. package/dist/lib/errors.js.map +1 -0
  102. package/dist/lib/format.d.ts +104 -0
  103. package/dist/lib/format.d.ts.map +1 -0
  104. package/dist/lib/format.js +416 -0
  105. package/dist/lib/format.js.map +1 -0
  106. package/dist/lib/git.d.ts +88 -0
  107. package/dist/lib/git.d.ts.map +1 -0
  108. package/dist/lib/git.js +304 -0
  109. package/dist/lib/git.js.map +1 -0
  110. package/dist/lib/grammar.d.ts +62 -0
  111. package/dist/lib/grammar.d.ts.map +1 -0
  112. package/dist/lib/grammar.js +244 -0
  113. package/dist/lib/grammar.js.map +1 -0
  114. package/dist/lib/image-registry.d.ts +68 -0
  115. package/dist/lib/image-registry.d.ts.map +1 -0
  116. package/dist/lib/image-registry.js +112 -0
  117. package/dist/lib/image-registry.js.map +1 -0
  118. package/dist/lib/import.d.ts +184 -0
  119. package/dist/lib/import.d.ts.map +1 -0
  120. package/dist/lib/import.js +1581 -0
  121. package/dist/lib/import.js.map +1 -0
  122. package/dist/lib/journals.d.ts +55 -0
  123. package/dist/lib/journals.d.ts.map +1 -0
  124. package/dist/lib/journals.js +417 -0
  125. package/dist/lib/journals.js.map +1 -0
  126. package/dist/lib/merge.d.ts +138 -0
  127. package/dist/lib/merge.d.ts.map +1 -0
  128. package/dist/lib/merge.js +603 -0
  129. package/dist/lib/merge.js.map +1 -0
  130. package/dist/lib/orcid.d.ts +36 -0
  131. package/dist/lib/orcid.d.ts.map +1 -0
  132. package/dist/lib/orcid.js +117 -0
  133. package/dist/lib/orcid.js.map +1 -0
  134. package/dist/lib/pdf-comments.d.ts +95 -0
  135. package/dist/lib/pdf-comments.d.ts.map +1 -0
  136. package/dist/lib/pdf-comments.js +192 -0
  137. package/dist/lib/pdf-comments.js.map +1 -0
  138. package/dist/lib/pdf-import.d.ts +118 -0
  139. package/dist/lib/pdf-import.d.ts.map +1 -0
  140. package/dist/lib/pdf-import.js +397 -0
  141. package/dist/lib/pdf-import.js.map +1 -0
  142. package/dist/lib/plugins.d.ts +76 -0
  143. package/dist/lib/plugins.d.ts.map +1 -0
  144. package/dist/lib/plugins.js +235 -0
  145. package/dist/lib/plugins.js.map +1 -0
  146. package/dist/lib/postprocess.d.ts +42 -0
  147. package/dist/lib/postprocess.d.ts.map +1 -0
  148. package/dist/lib/postprocess.js +138 -0
  149. package/dist/lib/postprocess.js.map +1 -0
  150. package/dist/lib/pptx-template.d.ts +59 -0
  151. package/dist/lib/pptx-template.d.ts.map +1 -0
  152. package/dist/lib/pptx-template.js +613 -0
  153. package/dist/lib/pptx-template.js.map +1 -0
  154. package/dist/lib/pptx-themes.d.ts +80 -0
  155. package/dist/lib/pptx-themes.d.ts.map +1 -0
  156. package/dist/lib/pptx-themes.js +818 -0
  157. package/dist/lib/pptx-themes.js.map +1 -0
  158. package/dist/lib/protect-restore.d.ts +137 -0
  159. package/dist/lib/protect-restore.d.ts.map +1 -0
  160. package/dist/lib/protect-restore.js +394 -0
  161. package/dist/lib/protect-restore.js.map +1 -0
  162. package/dist/lib/rate-limiter.d.ts +27 -0
  163. package/dist/lib/rate-limiter.d.ts.map +1 -0
  164. package/dist/lib/rate-limiter.js +79 -0
  165. package/dist/lib/rate-limiter.js.map +1 -0
  166. package/dist/lib/response.d.ts +41 -0
  167. package/dist/lib/response.d.ts.map +1 -0
  168. package/dist/lib/response.js +150 -0
  169. package/dist/lib/response.js.map +1 -0
  170. package/dist/lib/review.d.ts +35 -0
  171. package/dist/lib/review.d.ts.map +1 -0
  172. package/dist/lib/review.js +263 -0
  173. package/dist/lib/review.js.map +1 -0
  174. package/dist/lib/schema.d.ts +66 -0
  175. package/dist/lib/schema.d.ts.map +1 -0
  176. package/dist/lib/schema.js +339 -0
  177. package/dist/lib/schema.js.map +1 -0
  178. package/dist/lib/scientific-words.d.ts +6 -0
  179. package/dist/lib/scientific-words.d.ts.map +1 -0
  180. package/dist/lib/scientific-words.js +66 -0
  181. package/dist/lib/scientific-words.js.map +1 -0
  182. package/dist/lib/sections.d.ts +40 -0
  183. package/dist/lib/sections.d.ts.map +1 -0
  184. package/dist/lib/sections.js +288 -0
  185. package/dist/lib/sections.js.map +1 -0
  186. package/dist/lib/slides.d.ts +86 -0
  187. package/dist/lib/slides.d.ts.map +1 -0
  188. package/dist/lib/slides.js +676 -0
  189. package/dist/lib/slides.js.map +1 -0
  190. package/dist/lib/spelling.d.ts +76 -0
  191. package/dist/lib/spelling.d.ts.map +1 -0
  192. package/dist/lib/spelling.js +272 -0
  193. package/dist/lib/spelling.js.map +1 -0
  194. package/dist/lib/templates.d.ts +30 -0
  195. package/dist/lib/templates.d.ts.map +1 -0
  196. package/dist/lib/templates.js +504 -0
  197. package/dist/lib/templates.js.map +1 -0
  198. package/dist/lib/themes.d.ts +85 -0
  199. package/dist/lib/themes.d.ts.map +1 -0
  200. package/dist/lib/themes.js +652 -0
  201. package/dist/lib/themes.js.map +1 -0
  202. package/dist/lib/trackchanges.d.ts +51 -0
  203. package/dist/lib/trackchanges.d.ts.map +1 -0
  204. package/dist/lib/trackchanges.js +202 -0
  205. package/dist/lib/trackchanges.js.map +1 -0
  206. package/dist/lib/tui.d.ts +76 -0
  207. package/dist/lib/tui.d.ts.map +1 -0
  208. package/dist/lib/tui.js +377 -0
  209. package/dist/lib/tui.js.map +1 -0
  210. package/dist/lib/types.d.ts +447 -0
  211. package/dist/lib/types.d.ts.map +1 -0
  212. package/dist/lib/types.js +6 -0
  213. package/dist/lib/types.js.map +1 -0
  214. package/dist/lib/undo.d.ts +57 -0
  215. package/dist/lib/undo.d.ts.map +1 -0
  216. package/dist/lib/undo.js +185 -0
  217. package/dist/lib/undo.js.map +1 -0
  218. package/dist/lib/utils.d.ts +16 -0
  219. package/dist/lib/utils.d.ts.map +1 -0
  220. package/dist/lib/utils.js +40 -0
  221. package/dist/lib/utils.js.map +1 -0
  222. package/dist/lib/variables.d.ts +42 -0
  223. package/dist/lib/variables.d.ts.map +1 -0
  224. package/dist/lib/variables.js +141 -0
  225. package/dist/lib/variables.js.map +1 -0
  226. package/dist/lib/word.d.ts +80 -0
  227. package/dist/lib/word.d.ts.map +1 -0
  228. package/dist/lib/word.js +360 -0
  229. package/dist/lib/word.js.map +1 -0
  230. package/dist/lib/wordcomments.d.ts +51 -0
  231. package/dist/lib/wordcomments.d.ts.map +1 -0
  232. package/dist/lib/wordcomments.js +587 -0
  233. package/dist/lib/wordcomments.js.map +1 -0
  234. package/eslint.config.js +27 -0
  235. package/lib/annotations.ts +622 -0
  236. package/lib/apply-buildup-colors.py +88 -0
  237. package/lib/build.ts +1013 -0
  238. package/lib/{citations.js → citations.ts} +38 -27
  239. package/lib/commands/{build.js → build.ts} +80 -27
  240. package/lib/commands/{citations.js → citations.ts} +36 -18
  241. package/lib/commands/{comments.js → comments.ts} +187 -54
  242. package/lib/commands/{context.js → context.ts} +18 -8
  243. package/lib/commands/{core.js → core.ts} +34 -20
  244. package/lib/commands/{doi.js → doi.ts} +32 -16
  245. package/lib/commands/{history.js → history.ts} +25 -12
  246. package/lib/commands/{index.js → index.ts} +9 -5
  247. package/lib/commands/{init.js → init.ts} +20 -8
  248. package/lib/commands/{response.js → response.ts} +47 -20
  249. package/lib/commands/{sections.js → sections.ts} +273 -68
  250. package/lib/commands/{utilities.js → utilities.ts} +338 -158
  251. package/lib/{comment-realign.js → comment-realign.ts} +117 -45
  252. package/lib/config.ts +84 -0
  253. package/lib/{crossref.js → crossref.ts} +213 -138
  254. package/lib/dependencies.ts +106 -0
  255. package/lib/doi-cache.ts +115 -0
  256. package/lib/{doi.js → doi.ts} +115 -281
  257. package/lib/{equations.js → equations.ts} +60 -64
  258. package/lib/{errors.js → errors.ts} +56 -48
  259. package/lib/{format.js → format.ts} +137 -63
  260. package/lib/{git.js → git.ts} +66 -63
  261. package/lib/{grammar.js → grammar.ts} +45 -32
  262. package/lib/image-registry.ts +180 -0
  263. package/lib/import.ts +2060 -0
  264. package/lib/journals.ts +505 -0
  265. package/lib/{merge.js → merge.ts} +185 -135
  266. package/lib/{orcid.js → orcid.ts} +17 -22
  267. package/lib/{pdf-comments.js → pdf-comments.ts} +76 -18
  268. package/lib/{pdf-import.js → pdf-import.ts} +148 -70
  269. package/lib/{plugins.js → plugins.ts} +82 -39
  270. package/lib/postprocess.ts +188 -0
  271. package/lib/pptx-color-filter.lua +37 -0
  272. package/lib/pptx-template.ts +625 -0
  273. package/lib/pptx-themes/academic.pptx +0 -0
  274. package/lib/pptx-themes/corporate.pptx +0 -0
  275. package/lib/pptx-themes/dark.pptx +0 -0
  276. package/lib/pptx-themes/default.pptx +0 -0
  277. package/lib/pptx-themes/minimal.pptx +0 -0
  278. package/lib/pptx-themes/plant.pptx +0 -0
  279. package/lib/pptx-themes.ts +896 -0
  280. package/lib/protect-restore.ts +516 -0
  281. package/lib/rate-limiter.ts +94 -0
  282. package/lib/{response.js → response.ts} +36 -21
  283. package/lib/{review.js → review.ts} +53 -43
  284. package/lib/{schema.js → schema.ts} +70 -25
  285. package/lib/{sections.js → sections.ts} +71 -76
  286. package/lib/slides.ts +793 -0
  287. package/lib/{spelling.js → spelling.ts} +43 -59
  288. package/lib/{templates.js → templates.ts} +20 -17
  289. package/lib/themes.ts +742 -0
  290. package/lib/{trackchanges.js → trackchanges.ts} +52 -23
  291. package/lib/types.ts +509 -0
  292. package/lib/{undo.js → undo.ts} +75 -52
  293. package/lib/utils.ts +41 -0
  294. package/lib/{variables.js → variables.ts} +60 -54
  295. package/lib/word.ts +428 -0
  296. package/lib/{wordcomments.js → wordcomments.ts} +94 -40
  297. package/package.json +15 -5
  298. package/skill/REFERENCE.md +67 -0
  299. package/tsconfig.json +26 -0
  300. package/lib/annotations.js +0 -414
  301. package/lib/build.js +0 -639
  302. package/lib/config.js +0 -79
  303. package/lib/import.js +0 -1145
  304. package/lib/journals.js +0 -629
  305. package/lib/word.js +0 -225
  306. /package/lib/{scientific-words.js → scientific-words.ts} +0 -0
package/lib/import.ts ADDED
@@ -0,0 +1,2060 @@
1
+ /**
2
+ * Import functionality - convert Word docs to annotated Markdown
3
+ */
4
+
5
+ import * as fs from 'fs';
6
+ import * as path from 'path';
7
+ import { diffWords, Change } from 'diff';
8
+ import { stripAnnotations } from './annotations.js';
9
+ import { readImageRegistry } from './image-registry.js';
10
+ import { exec } from 'child_process';
11
+ import { promisify } from 'util';
12
+ import {
13
+ extractMarkdownPrefix,
14
+ protectAnchors,
15
+ restoreAnchors,
16
+ protectCrossrefs,
17
+ restoreCrossrefs,
18
+ simplifyMathForMatching,
19
+ protectMath,
20
+ restoreMath,
21
+ replaceRenderedMath,
22
+ protectCitations,
23
+ restoreCitations,
24
+ replaceRenderedCitations,
25
+ protectImages,
26
+ restoreImages,
27
+ matchWordImagesToOriginal,
28
+ protectTables,
29
+ restoreTables,
30
+ } from './protect-restore.js';
31
+
/** Promise-returning form of `child_process.exec`, built with `util.promisify`. */
const execAsync = promisify(exec);
33
+
34
+ // ============================================
35
+ // Type Definitions
36
+ // ============================================
37
+
/** A single comment read from a .docx `word/comments.xml` part. */
interface WordComment {
  /** Value of the comment's `w:id` attribute (empty string when absent). */
  id: string;
  /** Value of `w:author`, or `'Unknown'` when absent. */
  author: string;
  /** First 10 characters of the `w:date` attribute. */
  date: string;
  /** Trimmed, concatenated text of the comment body. */
  text: string;
}

/** Location of one `<w:t>` element in both the raw XML and the plain text. */
interface TextNode {
  /** Offset of the element's opening `<` in the document XML. */
  xmlStart: number;
  /** Offset just past the element's closing tag in the document XML. */
  xmlEnd: number;
  /** Offset of the element's decoded text within the concatenated plain text. */
  textStart: number;
  /** `textStart` plus the decoded text length. */
  textEnd: number;
  /** Entity-decoded text content. */
  text: string;
}

/** Anchor information recorded for one comment range in the document. */
interface CommentAnchorData {
  /** Trimmed text found between the comment's range markers. */
  anchor: string;
  /** Snippet of document text preceding the anchor. */
  before: string;
  /** Snippet of document text following the anchor. */
  after: string;
  /** Offset of the anchor within the document's plain text. */
  docPosition: number;
  /** Total length of the document's plain text. */
  docLength: number;
  /** True when the range contains no (non-whitespace) text. */
  isEmpty: boolean;
}

/** Return value of `extractCommentAnchors`. */
interface CommentAnchorsResult {
  /** Anchor data keyed by comment id. */
  anchors: Map<string, CommentAnchorData>;
  /** Concatenated plain text of the document's `<w:t>` elements. */
  fullDocText: string;
}

/** A Word table converted to a markdown pipe table. */
interface WordTable {
  markdown: string;
  rowCount: number;
  colCount: number;
}

/** Cells of one table row after expanding horizontal merges. */
interface ParsedRow {
  /** Cell texts; merged columns contribute extra empty strings. */
  cells: string[];
  /** Parallel to `cells`: the span of a real cell, or 0 for a filler cell. */
  colSpans: number[];
}

/** Options for Word extraction. NOTE(review): consumer is outside this excerpt. */
interface ExtractFromWordOptions {
  mediaDir?: string;
  skipMediaExtraction?: boolean;
}

/** A diagnostic message with a severity tag. */
interface ExtractMessage {
  type: 'info' | 'warning';
  message: string;
}

/** Result of Word extraction. NOTE(review): producer is outside this excerpt. */
interface ExtractFromWordResult {
  text: string;
  comments: WordComment[];
  anchors: Map<string, CommentAnchorData>;
  messages: ExtractMessage[];
  extractedMedia: string[];
  tables: WordTable[];
  hasTrackChanges: boolean;
  trackChangeStats: { insertions: number; deletions: number };
}

/** Options for comment insertion. NOTE(review): consumer is outside this excerpt. */
interface InsertCommentsOptions {
  quiet?: boolean;
  sectionBoundary?: { start: number; end: number } | null;
}

/** A comment paired with a resolved position. NOTE(review): semantics of `pos`/`strategy` are defined by code outside this excerpt. */
interface CommentWithPos {
  id: string;
  author: string;
  text: string;
  date: string;
  pos: number;
  anchorText: string | null;
  anchorEnd?: number;
  isEmpty?: boolean;
  strategy?: string;
}

/** Outcome of an anchor search. NOTE(review): producer is outside this excerpt. */
interface AnchorSearchResult {
  occurrences: number[];
  matchedAnchor: string | null;
  strategy: string;
  stripped?: boolean;
}

/** A markdown prefix split from its remaining content. */
interface MarkdownPrefixResult {
  prefix: string;
  content: string;
}

/** Options for smart-diff generation. NOTE(review): consumer is outside this excerpt. */
interface GenerateSmartDiffOptions {
  wordTables?: WordTable[];
  imageRegistry?: any;
}

/** Result of restoring cross-references. NOTE(review): producer is outside this excerpt. */
interface RestoreCrossrefResult {
  text: string;
  restored: number;
  messages: string[];
  restoredLabels: Set<string>;
}

/** Result of restoring images. NOTE(review): producer is outside this excerpt. */
interface RestoreImagesResult {
  text: string;
  restored: number;
  messages: string[];
}

/** Options for track-changes import. NOTE(review): consumer is outside this excerpt. */
interface ImportWordWithTrackChangesOptions {
  mediaDir?: string;
  projectDir?: string;
}

/** Result of track-changes import. NOTE(review): producer is outside this excerpt. */
interface ImportWordWithTrackChangesResult {
  text: string;
  stats: {
    insertions: number;
    deletions: number;
    substitutions: number;
    comments: number;
    total: number;
    hasTrackChanges: boolean;
    trackChangeStats: { insertions: number; deletions: number };
  };
  extractedMedia: string[];
  comments: WordComment[];
}

/** Options for Word import. NOTE(review): consumer is outside this excerpt. */
interface ImportFromWordOptions {
  author?: string;
  sectionContent?: string;
  figuresDir?: string;
  wordTables?: WordTable[];
}

/** Result of Word import. NOTE(review): producer is outside this excerpt. */
interface ImportFromWordResult {
  annotated: string;
  stats: {
    insertions: number;
    deletions: number;
    substitutions: number;
    comments: number;
    total: number;
  };
  extractedMedia: string[];
}

/** Record of one relocated file. */
interface MovedFile {
  from: string;
  to: string;
  name: string;
}

/** Result of moving extracted media: the files moved and any error messages. */
interface MoveExtractedMediaResult {
  moved: MovedFile[];
  errors: string[];
}
195
+
196
+ // ============================================
197
+ // Functions
198
+ // ============================================
199
+
200
+ /**
201
+ * Extract comments directly from Word docx comments.xml
202
+ */
203
+ export async function extractWordComments(docxPath: string): Promise<WordComment[]> {
204
+ const AdmZip = (await import('adm-zip')).default;
205
+ const { parseStringPromise } = await import('xml2js');
206
+
207
+ const comments: WordComment[] = [];
208
+
209
+ // Validate file exists
210
+ if (!fs.existsSync(docxPath)) {
211
+ throw new Error(`File not found: ${docxPath}`);
212
+ }
213
+
214
+ try {
215
+ let zip;
216
+ try {
217
+ zip = new AdmZip(docxPath);
218
+ } catch (err: any) {
219
+ throw new Error(`Invalid Word document (not a valid .docx file): ${err.message}`);
220
+ }
221
+
222
+ const commentsEntry = zip.getEntry('word/comments.xml');
223
+
224
+ if (!commentsEntry) {
225
+ return comments;
226
+ }
227
+
228
+ let commentsXml;
229
+ try {
230
+ commentsXml = commentsEntry.getData().toString('utf8');
231
+ } catch (err: any) {
232
+ throw new Error(`Failed to read comments from document: ${err.message}`);
233
+ }
234
+
235
+ const parsed = await parseStringPromise(commentsXml, { explicitArray: false });
236
+
237
+ const ns = 'w:';
238
+ const commentsRoot = parsed['w:comments'];
239
+ if (!commentsRoot || !commentsRoot['w:comment']) {
240
+ return comments;
241
+ }
242
+
243
+ // Ensure it's an array
244
+ const commentNodes = Array.isArray(commentsRoot['w:comment'])
245
+ ? commentsRoot['w:comment']
246
+ : [commentsRoot['w:comment']];
247
+
248
+ for (const comment of commentNodes) {
249
+ const id = comment.$?.['w:id'] || '';
250
+ const author = comment.$?.['w:author'] || 'Unknown';
251
+ const date = comment.$?.['w:date'] || '';
252
+
253
+ // Extract text from nested w:p/w:r/w:t elements
254
+ let text = '';
255
+ const extractText = (node: any): void => {
256
+ if (!node) return;
257
+ if (typeof node === 'string') {
258
+ text += node;
259
+ return;
260
+ }
261
+ if (node['w:t']) {
262
+ const t = node['w:t'];
263
+ text += typeof t === 'string' ? t : (t._ || t);
264
+ }
265
+ if (node['w:r']) {
266
+ const runs = Array.isArray(node['w:r']) ? node['w:r'] : [node['w:r']];
267
+ runs.forEach(extractText);
268
+ }
269
+ if (node['w:p']) {
270
+ const paras = Array.isArray(node['w:p']) ? node['w:p'] : [node['w:p']];
271
+ paras.forEach(extractText);
272
+ }
273
+ };
274
+ extractText(comment);
275
+
276
+ comments.push({ id, author, date: date.slice(0, 10), text: text.trim() });
277
+ }
278
+ } catch (err: any) {
279
+ // Re-throw with more context if it's already an Error we created
280
+ if (err.message.includes('Invalid Word document') || err.message.includes('File not found')) {
281
+ throw err;
282
+ }
283
+ throw new Error(`Error extracting comments from ${path.basename(docxPath)}: ${err.message}`);
284
+ }
285
+
286
+ return comments;
287
+ }
288
+
/**
 * Extract comment anchor texts from `word/document.xml` with surrounding context.
 *
 * Comment ranges are located by pairing `w:commentRangeStart` and
 * `w:commentRangeEnd` markers that share a `w:id`. For each pair the result
 * map holds the anchored text, context before and after it, the anchor's
 * offset in the document's plain text, the plain-text length and an
 * `isEmpty` flag. Ranges without an end marker are skipped with a warning.
 *
 * @param docxPath - Path to the Word document.
 * @returns The anchors keyed by comment id, plus `fullDocText` (the
 *   concatenated, entity-decoded `<w:t>` text) for section boundary matching.
 *   On failure the error is logged and `fullDocText` is returned as `''`.
 */
export async function extractCommentAnchors(docxPath: string): Promise<CommentAnchorsResult> {
  const AdmZip = (await import('adm-zip')).default;
  const anchors = new Map<string, CommentAnchorData>();
  let fullDocText = '';

  try {
    const zip = new AdmZip(docxPath);
    const docEntry = zip.getEntry('word/document.xml');

    if (!docEntry) {
      return { anchors, fullDocText };
    }

    const docXml = docEntry.getData().toString('utf8');

    // ========================================
    // STEP 1: Build text position mapping
    // ========================================
    const textNodePattern = /<w:t[^>]*>([^<]*)<\/w:t>/g;
    const textNodes: TextNode[] = [];
    let textPosition = 0;
    let nodeMatch: RegExpExecArray | null;

    while ((nodeMatch = textNodePattern.exec(docXml)) !== null) {
      const decodedText = decodeXmlEntities(nodeMatch[1] ?? '');
      textNodes.push({
        xmlStart: nodeMatch.index,
        xmlEnd: nodeMatch.index + nodeMatch[0].length,
        textStart: textPosition,
        textEnd: textPosition + decodedText.length,
        text: decodedText,
      });
      textPosition += decodedText.length;
    }

    fullDocText = textNodes.map((n) => n.text).join('');

    // Convert an XML offset to a plain-text offset: the start of the first
    // text node that contains the offset or begins after it; the end of the
    // text when no such node exists.
    const xmlPosToTextPos = (xmlPos: number): number => {
      for (const node of textNodes) {
        if (xmlPos < node.xmlEnd) {
          return node.textStart;
        }
      }
      const lastNode = textNodes[textNodes.length - 1];
      return lastNode ? lastNode.textEnd : 0;
    };

    // Context before a position: from the last sentence start inside the
    // window when one is found, otherwise the final 80 characters.
    const getContextBefore = (position: number, maxLength: number = 150): string => {
      const beforeText = fullDocText.slice(Math.max(0, position - maxLength), position);
      const sentenceStart = beforeText.search(/[.!?]\s+[A-Z][^.!?]*$/);
      return sentenceStart >= 0
        ? beforeText.slice(sentenceStart + 2).trim()
        : beforeText.slice(-80).trim();
    };

    // Context after a position: up to the first sentence end inside the
    // window when one is found, otherwise the first 80 characters.
    const getContextAfter = (position: number, maxLength: number = 150): string => {
      const afterText = fullDocText.slice(position, position + maxLength);
      const sentenceEnd = afterText.search(/[.!?]\s/);
      return sentenceEnd >= 0
        ? afterText.slice(0, sentenceEnd + 1).trim()
        : afterText.slice(0, 80).trim();
    };

    // ========================================
    // STEP 2: Collect all start/end markers separately
    // ========================================
    const startPattern = /<w:commentRangeStart[^>]*w:id="(\d+)"[^>]*\/?>/g;
    const endPattern = /<w:commentRangeEnd[^>]*w:id="(\d+)"[^>]*\/?>/g;

    const starts = new Map<string, number>(); // id -> position after start tag
    const ends = new Map<string, number>(); // id -> position before end tag

    let match: RegExpExecArray | null;
    while ((match = startPattern.exec(docXml)) !== null) {
      const id = match[1] ?? '';
      // Keep the first occurrence only.
      if (!starts.has(id)) {
        starts.set(id, match.index + match[0].length);
      }
    }

    while ((match = endPattern.exec(docXml)) !== null) {
      const id = match[1] ?? '';
      if (!ends.has(id)) {
        ends.set(id, match.index);
      }
    }

    // ========================================
    // STEP 3: Process each comment range by ID
    // ========================================
    for (const [id, startXmlPos] of starts) {
      const endXmlPos = ends.get(id);

      // Missing end marker - skip with warning
      if (endXmlPos === undefined) {
        console.warn(`Comment ${id}: missing end marker`);
        continue;
      }

      const docPosition = xmlPosToTextPos(startXmlPos);

      // Handle empty or inverted ranges
      if (endXmlPos <= startXmlPos) {
        anchors.set(id, {
          anchor: '',
          before: getContextBefore(docPosition),
          after: getContextAfter(docPosition),
          docPosition,
          docLength: fullDocText.length,
          isEmpty: true,
        });
        continue;
      }

      // Extract XML segment between markers
      const segment = docXml.slice(startXmlPos, endXmlPos);

      // Anchor text comes from w:t (regular) AND w:delText (tracked deletions).
      // Only the w:t part exists in fullDocText, so it is tracked separately
      // to measure how far the anchor extends in the plain text.
      const textInRangePattern = /<w:t[^>]*>([^<]*)<\/w:t>|<w:delText[^>]*>([^<]*)<\/w:delText>/g;
      let anchorText = '';
      let liveText = '';
      let tm: RegExpExecArray | null;
      while ((tm = textInRangePattern.exec(segment)) !== null) {
        anchorText += tm[1] || tm[2] || '';
        liveText += tm[1] || '';
      }
      anchorText = decodeXmlEntities(anchorText);
      const liveLength = decodeXmlEntities(liveText).length;

      const before = getContextBefore(docPosition);
      // Offset by the live text only; deleted text would overshoot the anchor's end.
      const after = getContextAfter(docPosition + liveLength);

      // ALWAYS add entry (even if anchor is empty)
      anchors.set(id, {
        anchor: anchorText.trim(),
        before,
        after,
        docPosition,
        docLength: fullDocText.length,
        isEmpty: !anchorText.trim(),
      });
    }
  } catch (err: unknown) {
    console.error('Error extracting comment anchors:', err instanceof Error ? err.message : String(err));
    return { anchors, fullDocText: '' };
  }

  return { anchors, fullDocText };
}
452
+
/**
 * Decode XML entity and character references in text.
 *
 * Handles the five predefined entities (`&amp;` `&lt;` `&gt;` `&quot;`
 * `&apos;`) plus decimal (`&#65;`) and hexadecimal (`&#x41;`) references.
 * Each reference is decoded exactly once in a single left-to-right pass, so
 * escaped input such as `&amp;lt;` yields the literal text `&lt;`, not `<`.
 * Numeric references are decoded as full Unicode code points. Unknown
 * entities and out-of-range code points are left unchanged.
 */
function decodeXmlEntities(text: string): string {
  const named: Record<string, string> = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'" };

  return text.replace(
    /&(?:(amp|lt|gt|quot|apos)|#(\d+)|#x([0-9a-fA-F]+));/g,
    (whole: string, name?: string, dec?: string, hex?: string): string => {
      if (name !== undefined) {
        return named[name] ?? whole;
      }
      const codePoint = dec !== undefined ? parseInt(dec, 10) : parseInt(hex ?? '', 16);
      // String.fromCodePoint throws on values outside the Unicode range.
      if (!Number.isInteger(codePoint) || codePoint > 0x10ffff) {
        return whole;
      }
      return String.fromCodePoint(codePoint);
    }
  );
}
466
+
/**
 * Extract the text content of a Word table cell's XML.
 *
 * Collects the contents of `<w:t>` (regular text) and `<m:t>` (OMML math
 * text) elements in the order they appear in the cell, then trims, decodes
 * XML entities and escapes `|` so the text cannot break a markdown pipe
 * table. When the cell contains an `<m:oMath` element but no `<m:t>` element
 * is found, a `[math]` placeholder is placed at the start of the cell text.
 */
function extractCellText(cellXml: string): string {
  const parts: string[] = [];
  let sawMathText = false;

  // One pass over both element kinds keeps math and prose in document order.
  // The tag name must be followed by whitespace or '>' so longer names do not match.
  const textPattern = /<(w:t|m:t)(?:\s[^>]*)?>([^<]*)<\/\1>/g;
  let found: RegExpExecArray | null;
  while ((found = textPattern.exec(cellXml)) !== null) {
    if (found[1] === 'm:t') {
      sawMathText = true;
    }
    const content = found[2] ?? '';
    if (content) {
      parts.push(content);
    }
  }

  // Math present but nothing extractable: fall back to a placeholder.
  if (cellXml.includes('<m:oMath') && !sawMathText) {
    parts.unshift('[math]');
  }

  const decoded = decodeXmlEntities(parts.join('').trim());

  // Escape pipe characters in cell content (would break table)
  return decoded.replace(/\|/g, '\\|');
}
502
+
/**
 * Split a `<w:tr>` row into cell texts, expanding horizontal merges.
 *
 * A cell with `w:gridSpan` N contributes its text followed by N-1 empty
 * filler cells so columns stay aligned. A cell carrying `w:vMerge` whose
 * value is anything other than `restart` (including no value) is treated as
 * the continuation of a vertical merge and yields an empty string.
 *
 * @param rowXml - XML of a single table row.
 * @param expectedCols - Accepted for callers; not used by the body.
 * @returns `cells` and a parallel `colSpans` array (the span for a real
 *   cell, 0 for a filler cell).
 */
function parseTableRow(rowXml: string, expectedCols: number): ParsedRow {
  const cells: string[] = [];
  const colSpans: number[] = [];

  // `<w:tc>` or `<w:tc ...>`; the whitespace requirement keeps `<w:tcPr>` out.
  const cellPattern = /<w:tc(?:\s[^>]*)?>[\s\S]*?<\/w:tc>/g;

  for (const cellXml of rowXml.match(cellPattern) || []) {
    const spanAttr = cellXml.match(/<w:gridSpan\s+w:val="(\d+)"/);
    const span = spanAttr ? parseInt(spanAttr[1], 10) : 1;

    const vMerge = cellXml.match(/<w:vMerge(?:\s+w:val="([^"]+)")?/);
    const continuesMergeFromAbove = vMerge && vMerge[1] !== 'restart';

    cells.push(continuesMergeFromAbove ? '' : extractCellText(cellXml));
    colSpans.push(span);

    // Filler cells for the extra columns covered by this cell.
    for (let remaining = span - 1; remaining > 0; remaining--) {
      cells.push('');
      colSpans.push(0);
    }
  }

  return { cells, colSpans };
}
537
+
/**
 * Work out how many grid columns a table has.
 *
 * Uses the number of `<w:gridCol` occurrences when there are any; otherwise
 * falls back to the largest cell count of any parsed row (0 for no rows).
 */
function getTableGridCols(tableXml: string): number {
  const declaredCols = (tableXml.match(/<w:gridCol/g) || []).length;
  if (declaredCols > 0) {
    return declaredCols;
  }

  const rowXmls = tableXml.match(/<w:tr[\s\S]*?<\/w:tr>/g) || [];
  return rowXmls.reduce(
    (widest, rowXml) => Math.max(widest, parseTableRow(rowXml, 0).cells.length),
    0
  );
}
557
+
/**
 * Read `word/document.xml` from a .docx file and convert each `<w:tbl>`
 * element to a markdown pipe table.
 *
 * Tables with no non-empty rows are omitted. Failures while reading or
 * converting are logged with `console.error`, and whatever was collected up
 * to that point is returned.
 *
 * @param docxPath - Path to the Word document.
 * @returns The converted tables with their row and column counts.
 */
export async function extractWordTables(docxPath: string): Promise<WordTable[]> {
  const { default: AdmZip } = await import('adm-zip');
  const tables: WordTable[] = [];

  try {
    const documentEntry = new AdmZip(docxPath).getEntry('word/document.xml');
    if (!documentEntry) {
      return tables;
    }

    const documentXml = documentEntry.getData().toString('utf8');

    for (const tableXml of documentXml.match(/<w:tbl>[\s\S]*?<\/w:tbl>/g) || []) {
      // Column count as declared by the table grid (or inferred from rows).
      const expectedCols = getTableGridCols(tableXml);

      const rows: string[][] = (tableXml.match(/<w:tr[\s\S]*?<\/w:tr>/g) || [])
        .map((rowXml) => parseTableRow(rowXml, expectedCols).cells)
        .filter((cells) => cells.length > 0);

      if (rows.length === 0) {
        continue;
      }

      // `markdown` is evaluated first, as in the statement order it replaces.
      tables.push({
        markdown: convertRowsToMarkdownTable(rows),
        rowCount: rows.length,
        colCount: expectedCols || rows[0]?.length || 0,
      });
    }
  } catch (err: any) {
    console.error('Error extracting tables from Word:', err.message);
  }

  return tables;
}
605
+
606
/**
 * Render a list of rows (each an array of cell strings) as a markdown pipe table.
 *
 * The first row becomes the header. Rows shorter than the widest row are
 * padded with empty cells; the padding is applied to the passed-in arrays.
 *
 * @param rows - table rows, header first
 * @returns the pipe table, or an empty string when there are no rows
 */
function convertRowsToMarkdownTable(rows: string[][]): string {
  if (rows.length === 0) return '';

  // Every row is widened to the widest row so the table is rectangular.
  const width = Math.max(...rows.map((cells) => cells.length));
  for (const cells of rows) {
    while (cells.length < width) {
      cells.push('');
    }
  }

  const renderRow = (cells: string[]): string => '| ' + cells.join(' | ') + ' |';

  const [headerCells, ...bodyRows] = rows;
  const separator = '|' + headerCells.map(() => '---').join('|') + '|';

  return [renderRow(headerCells), separator, ...bodyRows.map(renderRow)].join('\n');
}
640
+
641
/**
 * Extract text from a Word document using pandoc with track changes preserved.
 *
 * Steps:
 *  1. Tables are read straight from the document XML via `extractWordTables`.
 *  2. pandoc converts the document to markdown with `--track-changes=all`;
 *     its insertion/deletion spans are rewritten as CriticMarkup
 *     (`{++…++}` / `{--…--}`) and comment/mark spans are removed from the body.
 *  3. If anything in step 2 throws, mammoth's raw-text extraction is used instead.
 *  4. Comments and their anchors are read via `extractWordComments` and
 *     `extractCommentAnchors`.
 *
 * @param docxPath - path to the .docx file
 * @param options - `mediaDir` overrides the media extraction directory
 *   (default: `media` next to the document); `skipMediaExtraction` omits
 *   pandoc's `--extract-media` flag
 * @returns the extracted text together with comments, anchors, messages,
 *   media paths, tables and track-change statistics
 */
export async function extractFromWord(
  docxPath: string,
  options: ExtractFromWordOptions = {}
): Promise<ExtractFromWordResult> {
  let text: string;
  let messages: ExtractMessage[] = [];
  let extractedMedia: string[] = [];
  let hasTrackChanges = false;
  const trackChangeStats = { insertions: 0, deletions: 0 };

  // Determine media extraction directory
  const docxDir = path.dirname(docxPath);
  const mediaDir = options.mediaDir || path.join(docxDir, 'media');

  // Skip media extraction if figures already exist (e.g., when re-importing with existing source)
  const skipMediaExtraction = options.skipMediaExtraction || false;

  // Extract tables directly from Word XML (reliable, no heuristics)
  const wordTables = await extractWordTables(docxPath);

  // Rewrites every pandoc span matched by `pattern` as CriticMarkup of the
  // given kind, counting non-empty spans and dropping empty ones.
  const toCriticMarkup = (
    input: string,
    pattern: RegExp,
    kind: 'insertions' | 'deletions'
  ): string => {
    const [open, close] = kind === 'insertions' ? ['{++', '++}'] : ['{--', '--}'];
    return input.replace(pattern, (_match: string, content: string) => {
      if (!content.trim()) {
        return '';
      }
      trackChangeStats[kind]++;
      return `${open}${content}${close}`;
    });
  };

  // Try pandoc first with --track-changes=all to preserve reviewer edits
  try {
    // Run pandoc without a shell: arguments are passed as an array, so paths
    // containing quotes, `$`, backticks or spaces cannot alter the command.
    const { execFile } = await import('node:child_process');
    const { promisify } = await import('node:util');

    const pandocArgs = [docxPath, '-t', 'markdown', '--wrap=none', '--track-changes=all'];
    if (!skipMediaExtraction) {
      pandocArgs.push(`--extract-media=${mediaDir}`);
    }

    const { stdout } = await promisify(execFile)('pandoc', pandocArgs, {
      encoding: 'utf8',
      maxBuffer: 50 * 1024 * 1024,
    });
    text = stdout;

    // First pass: patterns that tolerate one level of nested [brackets].
    text = toCriticMarkup(text, /\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]\{\.insertion[^}]*\}/g, 'insertions');
    text = toCriticMarkup(text, /\[([^\[\]]*(?:\[[^\[\]]*\][^\[\]]*)*)\]\{\.deletion[^}]*\}/g, 'deletions');

    // Second pass: simpler patterns, repeated until the text stops changing.
    let prevText: string;
    do {
      prevText = text;
      text = toCriticMarkup(text, /\[([^\]]*)\]\{\.insertion[^}]*\}/g, 'insertions');
      text = toCriticMarkup(text, /\[([^\]]*)\]\{\.deletion[^}]*\}/g, 'deletions');
    } while (text !== prevText);

    // Handle pandoc comment patterns - remove comment text from body
    text = text.replace(/\[[^\]]*\]\{\.comment-start[^}]*\}/g, '');
    text = text.replace(/\[\]\{\.comment-end[^}]*\}/g, '');

    // Also handle {.mark} spans
    text = text.replace(/\[([^\]]*)\]\{\.mark\}/g, '$1');

    hasTrackChanges = trackChangeStats.insertions > 0 || trackChangeStats.deletions > 0;

    if (hasTrackChanges) {
      messages.push({
        type: 'info',
        message: `Found ${trackChangeStats.insertions} insertion(s) and ${trackChangeStats.deletions} deletion(s) from track changes`
      });
    }

    // Find extracted media files (looked up in the `media` folder inside mediaDir)
    const mediaSubdir = path.join(mediaDir, 'media');
    if (fs.existsSync(mediaSubdir)) {
      extractedMedia = fs.readdirSync(mediaSubdir)
        .filter(f => /\.(png|jpg|jpeg|gif|svg|emf|wmf|tiff?)$/i.test(f))
        .map(f => path.join(mediaSubdir, f));

      if (extractedMedia.length > 0) {
        messages.push({
          type: 'info',
          message: `Extracted ${extractedMedia.length} image(s) to ${mediaSubdir}`
        });
      }
    }
  } catch (pandocErr: unknown) {
    // Fall back to mammoth if pandoc fails
    const mammoth = await import('mammoth');
    const textResult = await mammoth.extractRawText({ path: docxPath });
    const htmlResult = await mammoth.convertToHtml({ path: docxPath });
    text = textResult.value;

    // Keep the fallback warning in front of mammoth's own messages, and use
    // each message's text rather than stringifying the message object.
    messages = [
      { type: 'warning', message: 'Pandoc failed, using mammoth (equations and images may not be preserved)' },
      ...[...textResult.messages, ...htmlResult.messages].map(m => ({ type: 'warning' as const, message: m.message })),
    ];
  }

  // Extract comments directly from docx XML
  const comments = await extractWordComments(docxPath);

  // Extract comment anchor texts
  const { anchors } = await extractCommentAnchors(docxPath);

  return {
    text,
    comments,
    anchors,
    messages,
    extractedMedia,
    tables: wordTables,
    hasTrackChanges,
    trackChangeStats,
  };
}
772
+
773
/**
 * Insert comments into markdown text based on anchor texts with context.
 *
 * Each comment is placed as `{>>author: text<<}`. When the anchor text is
 * located verbatim in the markdown, the anchor is additionally wrapped as
 * `[anchor]{.mark}`; otherwise the comment is inserted at the best position
 * found, preceded by a space.
 *
 * Placement is attempted in this order:
 *  - proportional position inside `options.sectionBoundary` (when both the
 *    boundary and the anchor's `docPosition` are available), refined by a
 *    local text search;
 *  - context-only search for empty anchors;
 *  - text search with several fallbacks (direct, whitespace-normalized,
 *    CriticMarkup-stripped, leading words, surrounding context, anchor parts).
 *
 * @param markdown - markdown text to annotate
 * @param comments - comments to insert (`id`, `author` and `text` are read)
 * @param anchors - anchor data per comment id: either the anchor string or an
 *   object with `anchor`, `before`, `after`, `isEmpty`, `docPosition`
 * @param options - `quiet` suppresses warnings; `sectionBoundary` enables
 *   position-based placement
 * @returns the markdown with comments inserted
 */
export function insertCommentsIntoMarkdown(
  markdown: string,
  comments: WordComment[],
  anchors: Map<string, CommentAnchorData | string>,
  options: InsertCommentsOptions = {}
): string {
  const { quiet = false, sectionBoundary = null } = options;
  let result = markdown;
  let unmatchedCount = 0;
  const duplicateWarnings: string[] = [];
  const usedPositions = new Set<number>(); // For tie-breaking: track used positions

  // Helper: lowercase only when that keeps string indices aligned with the input
  function lowerSameLength(text: string): string {
    const lowered = text.toLowerCase();
    return lowered.length === text.length ? lowered : text;
  }

  // Helper: Find all occurrences of needle in haystack
  function findAllOccurrences(haystack: string, needle: string): number[] {
    if (!needle || needle.length === 0) {
      return [];
    }
    const occurrences: number[] = [];
    let idx = 0;
    while ((idx = haystack.indexOf(needle, idx)) !== -1) {
      occurrences.push(idx);
      idx += 1;
    }
    return occurrences;
  }

  // Helper: collapse each whitespace run to one space. `origin[i]` is the index
  // in `text` where collapsed character i starts; a final sentinel entry holds
  // `text.length` so an exclusive end can be looked up too.
  function collapseWhitespace(text: string): { collapsed: string; origin: number[] } {
    const whitespace = /\s/;
    const pieces: string[] = [];
    const origin: number[] = [];
    let i = 0;
    while (i < text.length) {
      origin.push(i);
      if (whitespace.test(text[i])) {
        pieces.push(' ');
        while (i < text.length && whitespace.test(text[i])) i++;
      } else {
        pieces.push(text[i]);
        i++;
      }
    }
    origin.push(text.length);
    return { collapsed: pieces.join(''), origin };
  }

  // Helper: Strip CriticMarkup from text to get "clean" version for matching.
  // `origin[i]` is the index in `text` of stripped character i, so matches in
  // the stripped text can be translated back to real positions.
  function stripCriticMarkup(text: string): { stripped: string; origin: number[] } {
    // For each pattern: which slice of the match is kept ([offset, length]), or null to drop it all.
    type KeptSlice = (match: RegExpMatchArray) => [number, number] | null;
    const passes: Array<[RegExp, KeptSlice]> = [
      [/\{\+\+([^+]*)\+\+\}/g, (m) => [3, m[1].length]], // insertions: keep inserted text
      [/\{--([^-]*)--\}/g, () => null], // deletions: remove deleted text
      [/\{~~([^~]*)~>([^~]*)~~\}/g, (m) => [3 + m[1].length + 2, m[2].length]], // substitutions: keep new text
      [/\{>>[^<]*<<\}/g, () => null], // comments: remove
      [/\[([^\]]*)\]\{\.mark\}/g, (m) => [1, m[1].length]], // marked text: keep text
    ];

    let current = text;
    let origin = Array.from({ length: text.length }, (_, i) => i);

    // The patterns are applied one after another, like chained replace() calls.
    for (const [pattern, keptSlice] of passes) {
      let next = '';
      const nextOrigin: number[] = [];
      let cursor = 0;
      const copy = (from: number, to: number): void => {
        next += current.slice(from, to);
        for (let k = from; k < to; k++) nextOrigin.push(origin[k]);
      };
      for (const match of current.matchAll(pattern)) {
        const start = match.index ?? 0;
        copy(cursor, start);
        const kept = keptSlice(match);
        if (kept) copy(start + kept[0], start + kept[0] + kept[1]);
        cursor = start + match[0].length;
      }
      copy(cursor, current.length);
      current = next;
      origin = nextOrigin;
    }

    return { stripped: current, origin };
  }

  // Helper: if `pos` lies inside a CriticMarkup span, return the span's start
  // so that an insertion at the returned position does not split the span.
  function moveOutsideCriticMarkup(text: string, pos: number): number {
    const delimiters: Array<[string, string]> = [
      ['{++', '++}'],
      ['{--', '--}'],
      ['{~~', '~~}'],
      ['{>>', '<<}'],
    ];
    let adjusted = pos;
    for (const [open, close] of delimiters) {
      const openIdx = text.lastIndexOf(open, pos);
      if (openIdx === -1 || openIdx === pos) continue;
      const closeIdx = text.indexOf(close, openIdx + open.length);
      if (closeIdx !== -1 && pos < closeIdx + close.length) {
        adjusted = Math.min(adjusted, openIdx);
      }
    }
    return adjusted;
  }

  // Helper: locate a position from the text surrounding the anchor.
  // Returns null when neither context string leads to a position.
  function findByContext(text: string, before: string, after: string): AnchorSearchResult | null {
    const beforeLower = (before || '').toLowerCase();
    const afterLower = (after || '').toLowerCase();
    const textLower = text.toLowerCase();

    if (before && after) {
      const beforeTail = beforeLower.slice(-50);
      const beforeIdx = textLower.indexOf(beforeTail);
      if (beforeIdx !== -1) {
        const searchStart = beforeIdx + beforeTail.length;
        const afterIdx = textLower.indexOf(afterLower.slice(0, 50), searchStart);
        if (afterIdx !== -1 && afterIdx - searchStart < 500) {
          return { occurrences: [searchStart], matchedAnchor: null, strategy: 'context-both' };
        }
      }
    }

    if (before) {
      const beforeTail = beforeLower.slice(-30);
      const beforeIdx = textLower.lastIndexOf(beforeTail);
      if (beforeIdx !== -1) {
        return { occurrences: [beforeIdx + beforeTail.length], matchedAnchor: null, strategy: 'context-before' };
      }
    }

    if (after) {
      const afterIdx = textLower.indexOf(afterLower.slice(0, 30));
      if (afterIdx !== -1) {
        return { occurrences: [afterIdx], matchedAnchor: null, strategy: 'context-after' };
      }
    }

    return null;
  }

  // Helper: Find anchor in text with multiple fallback strategies.
  // All returned occurrences are indices into `text` itself.
  function findAnchorInText(anchor: string, text: string, before: string = '', after: string = ''): AnchorSearchResult {
    // If anchor is empty, only context-based matching can work
    if (!anchor || anchor.trim().length === 0) {
      return findByContext(text, before, after) ?? { occurrences: [], matchedAnchor: null, strategy: 'empty-anchor' };
    }

    const anchorLower = anchor.toLowerCase();
    const textLower = text.toLowerCase();

    // Strategy 1: Direct match
    let occurrences = findAllOccurrences(textLower, anchorLower);
    if (occurrences.length > 0) {
      return { occurrences, matchedAnchor: anchor, strategy: 'direct' };
    }

    // Strategy 2: Normalized whitespace. The hit is found in the collapsed text
    // and translated back; the matched anchor is the real span of `text`, whose
    // length can differ from the anchor's.
    const normalizedAnchor = anchor.replace(/\s+/g, ' ').toLowerCase();
    const { collapsed, origin: collapsedOrigin } = collapseWhitespace(text);
    const normalizedIdx = lowerSameLength(collapsed).indexOf(normalizedAnchor);
    if (normalizedIdx !== -1) {
      const start = collapsedOrigin[normalizedIdx];
      const end = collapsedOrigin[normalizedIdx + normalizedAnchor.length] ?? text.length;
      return { occurrences: [start], matchedAnchor: text.slice(start, end), strategy: 'normalized' };
    }

    // Strategy 3: Try matching in stripped CriticMarkup version
    const { stripped: strippedText, origin: strippedOrigin } = stripCriticMarkup(text);
    const strippedLower = lowerSameLength(strippedText);
    // Translate hits in the stripped text to positions in `text`, moved to the
    // start of any CriticMarkup span they fall into; duplicates are merged.
    const toOriginal = (hits: number[]): number[] => [
      ...new Set(hits.map((hit) => moveOutsideCriticMarkup(text, strippedOrigin[hit]))),
    ];
    occurrences = findAllOccurrences(strippedLower, anchorLower);
    if (occurrences.length > 0) {
      return { occurrences: toOriginal(occurrences), matchedAnchor: anchor, strategy: 'stripped', stripped: true };
    }

    // Strategy 4: First N words of anchor (for long anchors)
    const words = anchor.split(/\s+/);
    if (words.length > 3) {
      for (let n = Math.min(6, words.length); n >= 3; n--) {
        const partial = words.slice(0, n).join(' ');
        const partialAnchor = partial.toLowerCase();
        if (partialAnchor.length >= 15) {
          occurrences = findAllOccurrences(textLower, partialAnchor);
          if (occurrences.length > 0) {
            return { occurrences, matchedAnchor: partial, strategy: 'partial-start' };
          }
          occurrences = findAllOccurrences(strippedLower, partialAnchor);
          if (occurrences.length > 0) {
            return { occurrences: toOriginal(occurrences), matchedAnchor: partial, strategy: 'partial-start-stripped', stripped: true };
          }
        }
      }
    }

    // Strategy 5: Use context (before/after) to find approximate position
    const byContext = findByContext(text, before, after);
    if (byContext) {
      return byContext;
    }

    // Strategy 6: Try splitting the anchor on separators and matching a part
    const splitPatterns = [' ', ', ', '. ', ' - ', ' – '];
    for (const sep of splitPatterns) {
      if (anchor.includes(sep)) {
        const parts = anchor.split(sep).filter(p => p.length >= 4);
        for (const part of parts) {
          const partLower = part.toLowerCase();
          occurrences = findAllOccurrences(textLower, partLower);
          // Only accept parts that are reasonably specific (fewer than 5 hits)
          if (occurrences.length > 0 && occurrences.length < 5) {
            return { occurrences, matchedAnchor: part, strategy: 'split-match' };
          }
        }
      }
    }

    return { occurrences: [], matchedAnchor: null, strategy: 'failed' };
  }

  // Get all positions in order (for sequential tie-breaking)
  const commentsWithPositions = comments.map((c): CommentWithPos => {
    const anchorData = anchors.get(c.id);
    if (!anchorData) {
      unmatchedCount++;
      return { ...c, pos: -1, anchorText: null };
    }

    // Support both old format (string) and new format ({anchor, before, after})
    const anchor = typeof anchorData === 'string' ? anchorData : anchorData.anchor;
    const before = typeof anchorData === 'object' ? anchorData.before : '';
    const after = typeof anchorData === 'object' ? anchorData.after : '';
    const isEmpty = typeof anchorData === 'object' && anchorData.isEmpty;
    const docPosition = typeof anchorData === 'object' ? anchorData.docPosition : undefined;

    // Position-based insertion (most reliable)
    if (sectionBoundary && docPosition !== undefined) {
      const sectionLength = sectionBoundary.end - sectionBoundary.start;
      if (sectionLength > 0) {
        const relativePos = docPosition < sectionBoundary.start ? 0 : docPosition - sectionBoundary.start;
        const proportion = Math.min(relativePos / sectionLength, 1.0);
        const markdownPos = Math.floor(proportion * result.length);

        // Prefer the next space within 25 characters as the insertion point
        let insertPos = markdownPos;
        const spaceIdx = result.indexOf(' ', markdownPos);
        if (spaceIdx !== -1 && spaceIdx - markdownPos < 25) {
          insertPos = spaceIdx;
        }

        // If we have anchor text, try to find it near this position
        if (anchor && !isEmpty) {
          const searchStart = Math.max(0, insertPos - 200);
          const searchEnd = Math.min(result.length, insertPos + 200);
          const localSearch = result.slice(searchStart, searchEnd).toLowerCase();
          const localIdx = localSearch.indexOf(anchor.toLowerCase());
          if (localIdx !== -1) {
            return { ...c, pos: searchStart + localIdx, anchorText: anchor, anchorEnd: searchStart + localIdx + anchor.length, strategy: 'position+text' };
          }
          // Try first few words
          const leadingWords = anchor.split(/\s+/).slice(0, 4).join(' ').toLowerCase();
          if (leadingWords.length >= 10) {
            const partialIdx = localSearch.indexOf(leadingWords);
            if (partialIdx !== -1) {
              return { ...c, pos: searchStart + partialIdx, anchorText: leadingWords, anchorEnd: searchStart + partialIdx + leadingWords.length, strategy: 'position+partial' };
            }
          }
        }

        return { ...c, pos: insertPos, anchorText: null, strategy: 'position-only' };
      }
    }

    // Handle empty anchors
    if (!anchor || isEmpty) {
      if (before || after) {
        const { occurrences } = findAnchorInText('', result, before, after);
        if (occurrences.length > 0) {
          return { ...c, pos: occurrences[0], anchorText: null, isEmpty: true };
        }
      }
      unmatchedCount++;
      return { ...c, pos: -1, anchorText: null, isEmpty: true };
    }

    // Text-based matching strategies
    const { occurrences, matchedAnchor, stripped } = findAnchorInText(anchor, result, before, after);

    if (occurrences.length === 0) {
      unmatchedCount++;
      return { ...c, pos: -1, anchorText: null };
    }

    // A match found only after stripping CriticMarkup spans markup in the real
    // text, so it is used for placement only and is not wrapped as a mark.
    const markable = Boolean(matchedAnchor) && !stripped;
    const anchorLen = matchedAnchor ? matchedAnchor.length : 0;
    const placeAt = (pos: number): CommentWithPos =>
      markable
        ? { ...c, pos, anchorText: matchedAnchor, anchorEnd: pos + anchorLen }
        : { ...c, pos, anchorText: null };

    if (occurrences.length === 1) {
      return placeAt(occurrences[0]);
    }

    // Multiple occurrences - use context for disambiguation
    if (matchedAnchor) {
      duplicateWarnings.push(`"${matchedAnchor.slice(0, 40)}${matchedAnchor.length > 40 ? '...' : ''}" appears ${occurrences.length} times`);
    }

    let bestIdx = occurrences.find(p => !usedPositions.has(p)) ?? occurrences[0];
    let bestScore = -1;

    for (const pos of occurrences) {
      if (usedPositions.has(pos)) continue;

      let score = 0;

      if (before) {
        const contextBefore = result.slice(Math.max(0, pos - before.length - 20), pos).toLowerCase();
        const beforeLower = before.toLowerCase();
        const beforeWords = beforeLower.split(/\s+/).filter(w => w.length > 3);
        for (const word of beforeWords) {
          if (contextBefore.includes(word)) score += 2;
        }
        if (contextBefore.includes(beforeLower.slice(-30))) score += 5;
      }

      if (after) {
        const contextAfter = result.slice(pos + anchorLen, pos + anchorLen + after.length + 20).toLowerCase();
        const afterLower = after.toLowerCase();
        const afterWords = afterLower.split(/\s+/).filter(w => w.length > 3);
        for (const word of afterWords) {
          if (contextAfter.includes(word)) score += 2;
        }
        if (contextAfter.includes(afterLower.slice(0, 30))) score += 5;
      }

      // Highest score wins; on a tie the earlier position is kept
      if (score > bestScore || (score === bestScore && pos < bestIdx)) {
        bestScore = score;
        bestIdx = pos;
      }
    }

    usedPositions.add(bestIdx);

    return placeAt(bestIdx);
  });

  // Log any unmatched comments for debugging
  const unmatched = commentsWithPositions.filter((c) => c.pos < 0);
  if (process.env.DEBUG) {
    console.log(`[DEBUG] insertComments: ${comments.length} input, ${commentsWithPositions.length} processed, ${unmatched.length} unmatched`);
    if (unmatched.length > 0) {
      unmatched.forEach(c => console.log(`[DEBUG] Unmatched ID=${c.id}: anchor="${(c.anchorText || 'none').slice(0,30)}"`));
    }
  }

  const matched = commentsWithPositions.filter((c) => c.pos >= 0);

  // Sort by position descending (insert from end to avoid offset issues)
  matched.sort((a, b) => b.pos - a.pos);

  // Insert each comment with anchor marking
  for (const c of matched) {
    const comment = `{>>${c.author}: ${c.text}<<}`;
    if (c.anchorText && c.anchorEnd) {
      // Replace anchor text with: {>>comment<<}[anchor]{.mark}
      const head = result.slice(0, c.pos);
      const marked = result.slice(c.pos, c.anchorEnd);
      const tail = result.slice(c.anchorEnd);
      result = head + comment + `[${marked}]{.mark}` + tail;
    } else {
      // No anchor - just insert comment at position
      result = result.slice(0, c.pos) + ` ${comment}` + result.slice(c.pos);
    }
  }

  // Log warnings unless quiet mode
  if (!quiet) {
    if (unmatchedCount > 0) {
      console.warn(`Warning: ${unmatchedCount} comment(s) could not be matched to anchor text`);
    }
    if (duplicateWarnings.length > 0) {
      console.warn(`Warning: Duplicate anchor text found (using context & tie-breaks for placement):`);
      for (const w of duplicateWarnings) {
        console.warn(` - ${w}`);
      }
    }
  }

  return result;
}
1126
+
1127
/**
 * Bring whitespace into a canonical form so two texts can be compared:
 * CRLF becomes LF, each tab becomes a space, runs of spaces shrink to one
 * space, and leading/trailing whitespace is trimmed.
 */
function normalizeWhitespace(text: string): string {
  const unixNewlines = text.replace(/\r\n/g, '\n');
  const tabsAsSpaces = unixNewlines.replace(/\t/g, ' ');
  const singleSpaced = tabsAsSpaces.replace(/ +/g, ' ');
  return singleSpaced.trim();
}
1137
+
1138
/**
 * Fix citation and math annotations by preserving original markdown syntax.
 *
 * Applies an ordered series of regex rewrites to annotated text:
 * math and `[@key]` citations wrapped in deletion/substitution markup are
 * restored, inserted rendered citations such as `(Smith 2020)` and their
 * fragments are dropped, and leftover spacing and empty annotations are
 * cleaned up. The order of the rules matters.
 *
 * @param text - annotated text containing CriticMarkup
 * @param originalMd - original markdown; kept for interface compatibility,
 *   the rewrites below do not read it
 * @returns the cleaned-up annotated text
 */
function fixCitationAnnotations(text: string, originalMd: string): string {
  type Rule = [pattern: RegExp, replacement: string];
  const applyRules = (input: string, rules: Rule[]): string =>
    rules.reduce((acc, [pattern, replacement]) => acc.replace(pattern, replacement), input);

  // Rules applied before the "starts with a citation" substitution handler.
  const restoreRules: Rule[] = [
    // Fix math annotations - preserve inline and display math
    [/\{--(\$[^$]+\$)--\}/g, '$1'],
    [/\{--(\$\$[^$]+\$\$)--\}/g, '$1'],
    [/\{~~(\$[^$]+\$)~>[^~]+~~\}/g, '$1'],
    [/\{~~(\$\$[^$]+\$\$)~>[^~]+~~\}/g, '$1'],
    // Fix substitutions where left side has markdown citation
    [/\{~~(\[@[^\]]+\])~>[^~]+~~\}/g, '$1'],
  ];

  // Rules applied after it.
  const cleanupRules: Rule[] = [
    // Fix deletions of markdown citations
    [/\{--(\[@[^\]]+\])--\}/g, '$1'],
    // Fix insertions of rendered citations
    [/\{\+\+\([A-Z][^)]*\d{4}[^)]*\)\+\+\}/g, ''],
    // Clean up broken multi-part substitutions
    [/\{~~(@[A-Za-z]+\d{4})~>[^~]+~~\}/g, '[$1]'],
    // Fix citations split across substitution boundaries
    [/\{~~\[@~>[^~]*~~\}([A-Za-z]+\d{4})\]/g, '[@$1]'],
    // Clean up any remaining partial citations
    [/\{~~;\s*@([A-Za-z]+\d{4})\]~>[^~]*~~\}/g, '; [@$1]'],
    // Remove rendered citation insertions (with Unicode support)
    [/\{\+\+\(\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\+\+\}/gu, ''],
    [/\{\+\+\(\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, ''],
    // Trailing citation fragments
    [/\{\+\+\d{4}[a-z]?(?:[;,]\s*(?:\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+)?\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, ''],
    [/\{\+\+\d{4}[a-z]?(?:[;,]\s*(?:\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+)?\d{4}[a-z]?)*\)\s*\+\+\}/gu, ''],
    // Just year with closing paren
    [/\{\+\+\d{4}[a-z]?\)\.\s*\+\+\}/g, ''],
    [/\{\+\+\d{4}[a-z]?\)\s*\+\+\}/g, ''],
    // Leading citation fragments
    [/\{\+\+\(?\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s*\+\+\}/gu, ''],
    // Semicolon-separated fragments
    [/\{\+\+[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?\+\+\}/gu, ''],
    // Year ranges with authors
    [/\{\+\+\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\s*\+\+\}/gu, ''],
    [/\{\+\+\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?(?:[;,]\s*\p{Lu}\p{L}*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)*\)\.\s*\+\+\}/gu, ''],
    // Clean up double spaces and orphaned punctuation
    [/ +/g, ' '],
    [/\s+\./g, '.'],
    [/\s+,/g, ','],
    // Final cleanup - remove empty annotations
    [/\{~~\s*~>\s*~~\}/g, ''],
    [/\{\+\+\s*\+\+\}/g, ''],
    [/\{--\s*--\}/g, ''],
  ];

  let fixed = applyRules(text, restoreRules);

  // Fix substitutions where left side STARTS with markdown citation: keep the
  // citation and re-emit only the trailing text as a substitution.
  fixed = fixed.replace(
    /\{~~(\[@[^\]]+\])\s*([^~]*)~>([^~]*)~~\}/g,
    (_match: string, cite: string, oldText: string, newText: string) => {
      const oldTrimmed = oldText.trim();
      const newTrimmed = newText.trim();
      if (oldTrimmed === '' && newTrimmed === '') {
        return cite;
      }
      return oldTrimmed !== newTrimmed
        ? `${cite} {~~${oldTrimmed}~>${newTrimmed}~~}`
        : `${cite} ${newText}`;
    }
  );

  return applyRules(fixed, cleanupRules);
}
1216
+
1217
/**
 * Strip markdown syntax to get plain text.
 *
 * Block-level constructs are removed first and inline constructs afterwards,
 * so that one rule cannot consume the markers another rule relies on:
 *  - fenced code is removed before inline code (otherwise the fence's
 *    backticks are taken for inline code delimiters);
 *  - images are removed before links (otherwise the link rule turns
 *    `![alt](src)` into `!alt`);
 *  - list bullets and horizontal rules are removed before emphasis
 *    (otherwise a `*` bullet pairs up with a later `*emphasis*` marker).
 *
 * @param md - markdown text
 * @returns the text with markdown syntax removed and trimmed
 */
function stripMarkdownSyntax(md: string): string {
  return md
    // Front matter: only at the very start of the text, so two horizontal
    // rules further down can never be mistaken for a front matter block.
    .replace(/^\s*---[\s\S]*?---\n*/, '')
    // Block-level constructs
    .replace(/```[\s\S]*?```/g, '')
    .replace(/^#{1,6}\s+/gm, '')
    .replace(/^>\s*/gm, '')
    .replace(/^[-*_]{3,}\s*$/gm, '')
    .replace(/^[\s]*[-*+]\s+/gm, '')
    .replace(/^[\s]*\d+\.\s+/gm, '')
    // Inline constructs
    .replace(/!\[([^\]]*)\]\([^)]+\)/g, '')
    .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1')
    .replace(/(\*\*|__)(.*?)\1/g, '$2')
    .replace(/(\*|_)(.*?)\1/g, '$2')
    .replace(/`([^`]+)`/g, '$1')
    // Table pipes and separator rows
    .replace(/\|/g, ' ')
    .replace(/^[-:]+$/gm, '')
    // Collapse the blank lines left behind
    .replace(/\n{3,}/g, '\n\n')
    .trim();
}
1239
+
1240
/**
 * Produce annotated markdown from a word-level diff of the original markdown
 * against the Word text.
 *
 * Both inputs are whitespace-normalized first. Added parts are emitted as
 * `{++…++}`, removed parts as `{--…--}`, unchanged parts verbatim.
 *
 * @param originalMd - original markdown
 * @param wordText - text extracted from Word
 * @param author - not read by this function
 * @returns the annotated text
 */
export function generateAnnotatedDiff(originalMd: string, wordText: string, author: string = 'Reviewer'): string {
  const before = normalizeWhitespace(originalMd);
  const after = normalizeWhitespace(wordText);

  return diffWords(before, after)
    .map((part) => {
      if (part.added) return `{++${part.value}++}`;
      if (part.removed) return `{--${part.value}--}`;
      return part.value;
    })
    .join('');
}
1263
+
1264
/**
 * Inject Word tables (extracted from XML) into pandoc text output.
 *
 * For each table, the region of `pandocText` that holds the table is located
 * heuristically: it starts at the blank-line-delimited block containing the
 * first header cell's text and ends at the block containing the last
 * non-empty cell's text. That region is replaced by the table's markdown.
 *
 * Tables are processed in the given order and each search starts after the
 * previously injected table, so tables that share a first header cell do not
 * overwrite one another. Tables that cannot be located are skipped.
 *
 * @param pandocText - markdown produced by pandoc
 * @param wordTables - tables in document order
 * @returns the text with located tables replaced by their pipe-table markdown
 */
function injectWordTables(pandocText: string, wordTables: WordTable[]): string {
  if (!wordTables || wordTables.length === 0) {
    return pandocText;
  }

  // Non-empty, trimmed cell texts of one pipe-table line
  const cellsOf = (line: string): string[] =>
    line
      .split('|')
      .map((c) => c.trim())
      .filter((c) => c.length > 0);

  let result = pandocText;
  let searchFrom = 0; // end of the most recently injected table

  for (const table of wordTables) {
    const lines = table.markdown.split('\n');
    const headerCells = cellsOf(lines[0]);

    if (headerCells.length === 0) continue;

    const firstCell = headerCells[0];
    const startIdx = result.indexOf(firstCell, searchFrom);

    if (startIdx === -1) continue;

    // Last non-empty cell of the last data row that has one. Line 1 is the
    // `---` separator row, so data rows start at index 2; a table without
    // usable data rows falls back to the last header cell.
    let lastCell = headerCells[headerCells.length - 1];
    for (let i = lines.length - 1; i >= 2; i--) {
      const rowCells = cellsOf(lines[i]);
      if (rowCells.length > 0) {
        lastCell = rowCells[rowCells.length - 1];
        break;
      }
    }

    const endIdx = result.indexOf(lastCell, startIdx);
    if (endIdx === -1) continue;

    // Widen the region to the surrounding blank lines
    let regionStart = result.lastIndexOf('\n\n', startIdx);
    if (regionStart === -1) regionStart = 0;
    else regionStart += 2;

    let regionEnd = result.indexOf('\n\n', endIdx + lastCell.length);
    if (regionEnd === -1) regionEnd = result.length;

    result = result.slice(0, regionStart) + table.markdown + '\n\n' + result.slice(regionEnd);
    searchFrom = regionStart + table.markdown.length;
  }

  return result;
}
1310
+
1311
+ /**
1312
+ * Smart paragraph-level diff that preserves markdown structure
1313
+ */
1314
+ export function generateSmartDiff(
1315
+ originalMd: string,
1316
+ wordText: string,
1317
+ author: string = 'Reviewer',
1318
+ options: GenerateSmartDiffOptions = {}
1319
+ ): string {
1320
+ const { wordTables = [], imageRegistry = null } = options;
1321
+
1322
+ // Inject Word tables into pandoc output
1323
+ let wordTextWithTables = injectWordTables(wordText, wordTables);
1324
+
1325
+ // Protect markdown tables
1326
+ const { text: mdWithTablesProtected, tables } = protectTables(originalMd);
1327
+
1328
+ // Also protect tables in Word text
1329
+ const { text: wordWithTablesProtected, tables: wordTableBlocks } = protectTables(wordTextWithTables);
1330
+
1331
+ // Protect images
1332
+ const { text: mdWithImagesProtected, images: origImages } = protectImages(mdWithTablesProtected, imageRegistry);
1333
+
1334
+ const { text: wordWithImagesProtected, images: wordImages } = protectImages(wordWithTablesProtected, imageRegistry);
1335
+
1336
+ // Match Word images to original images
1337
+ const imageMapping = matchWordImagesToOriginal(origImages, wordImages, imageRegistry);
1338
+
1339
+ // Replace Word image placeholders with matching original placeholders
1340
+ let wordWithMappedImages = wordWithImagesProtected;
1341
+ for (const [wordPlaceholder, origPlaceholder] of imageMapping) {
1342
+ wordWithMappedImages = wordWithMappedImages.split(wordPlaceholder).join(origPlaceholder);
1343
+ }
1344
+
1345
+ // Protect figure/table anchors
1346
+ const { text: mdWithAnchorsProtected, anchors: figAnchors } = protectAnchors(mdWithImagesProtected);
1347
+
1348
+ // Protect cross-references
1349
+ const { text: mdWithXrefsProtected, crossrefs } = protectCrossrefs(mdWithAnchorsProtected);
1350
+
1351
+ // Protect math
1352
+ const { text: mdWithMathProtected, mathBlocks } = protectMath(mdWithXrefsProtected);
1353
+
1354
+ // Protect citations
1355
+ const { text: mdProtected, citations } = protectCitations(mdWithMathProtected);
1356
+
1357
+ // Replace rendered elements in Word text
1358
+ let wordProtected = wordWithMappedImages;
1359
+ wordProtected = replaceRenderedMath(wordProtected, mathBlocks);
1360
+ wordProtected = replaceRenderedCitations(wordProtected, citations.length);
1361
+
1362
+ // Split into paragraphs
1363
+ const originalParas = mdProtected.split(/\n\n+/);
1364
+ const wordParas = wordProtected.split(/\n\n+/);
1365
+
1366
+ const result: string[] = [];
1367
+
1368
+ // Try to match paragraphs intelligently
1369
+ let wordIdx = 0;
1370
+
1371
+ for (let i = 0; i < originalParas.length; i++) {
1372
+ const orig = originalParas[i] || '';
1373
+ const { prefix: mdPrefix, content: origContent } = extractMarkdownPrefix(orig.split('\n')[0]);
1374
+
1375
+ // Find best matching word paragraph
1376
+ let bestMatch = -1;
1377
+ let bestScore = 0;
1378
+
1379
+ for (let j = wordIdx; j < Math.min(wordIdx + 3, wordParas.length); j++) {
1380
+ const wordPara = wordParas[j] || '';
1381
+ const origWords = new Set(origContent.toLowerCase().split(/\s+/));
1382
+ const wordWords = wordPara.toLowerCase().split(/\s+/);
1383
+ const common = wordWords.filter((w) => origWords.has(w)).length;
1384
+ const score = common / Math.max(origWords.size, wordWords.length);
1385
+
1386
+ if (score > bestScore && score > 0.3) {
1387
+ bestScore = score;
1388
+ bestMatch = j;
1389
+ }
1390
+ }
1391
+
1392
+ if (bestMatch === -1) {
1393
+ if (mdPrefix && wordIdx < wordParas.length) {
1394
+ const wordPara = wordParas[wordIdx];
1395
+ if (wordPara.toLowerCase().includes(origContent.toLowerCase().slice(0, 20))) {
1396
+ bestMatch = wordIdx;
1397
+ }
1398
+ }
1399
+ }
1400
+
1401
+ if (bestMatch >= 0) {
1402
+ const word = wordParas[bestMatch];
1403
+
1404
+ const origStripped = stripMarkdownSyntax(orig);
1405
+ const wordNormalized = normalizeWhitespace(word);
1406
+
1407
+ if (origStripped === wordNormalized) {
1408
+ result.push(orig);
1409
+ } else {
1410
+ const changes = diffWords(origStripped, wordNormalized);
1411
+ let annotated = mdPrefix;
1412
+
1413
+ for (const part of changes) {
1414
+ if (part.added) {
1415
+ annotated += `{++${part.value}++}`;
1416
+ } else if (part.removed) {
1417
+ annotated += `{--${part.value}--}`;
1418
+ } else {
1419
+ annotated += part.value;
1420
+ }
1421
+ }
1422
+
1423
+ result.push(annotated);
1424
+ }
1425
+
1426
+ wordIdx = bestMatch + 1;
1427
+ } else {
1428
+ // Paragraph deleted entirely
1429
+ if (mdPrefix && mdPrefix.match(/^#{1,6}\s+/)) {
1430
+ result.push(orig);
1431
+ } else {
1432
+ result.push(`{--${orig}--}`);
1433
+ }
1434
+ }
1435
+ }
1436
+
1437
+ // Any remaining word paragraphs are additions
1438
+ for (let j = wordIdx; j < wordParas.length; j++) {
1439
+ const word = wordParas[j];
1440
+ if (word.trim()) {
1441
+ result.push(`{++${word}++}`);
1442
+ }
1443
+ }
1444
+
1445
+ // Restore protected content
1446
+ let finalResult = result.join('\n\n');
1447
+ finalResult = restoreCitations(finalResult, citations);
1448
+ finalResult = restoreMath(finalResult, mathBlocks);
1449
+ finalResult = restoreCrossrefs(finalResult, crossrefs);
1450
+ finalResult = restoreAnchors(finalResult, figAnchors);
1451
+ finalResult = restoreImages(finalResult, origImages);
1452
+ finalResult = restoreImages(finalResult, wordImages);
1453
+ finalResult = restoreTables(finalResult, tables);
1454
+ finalResult = restoreTables(finalResult, wordTableBlocks);
1455
+
1456
+ return finalResult;
1457
+ }
1458
+
1459
/**
 * Tidy CriticMarkup annotations and collapse repeated spaces in prose.
 *
 * Two phases:
 *  1. A fixed sequence of regex rewrites merges adjacent delete/insert pairs
 *     into substitutions, repairs a few malformed marker combinations and
 *     drops empty `{-- --}` / `{++ ++}` annotations.
 *  2. A line-by-line pass collapses runs of spaces to a single space, except
 *     on lines that belong to a dash-separated table, on a column-aligned
 *     line that directly precedes a dash separator, and on lines starting
 *     with `|`.
 *
 * @param text Annotated markdown.
 * @returns The cleaned text.
 */
export function cleanupAnnotations(text: string): string {
  // Order matters: later rules operate on the output of earlier ones.
  const rewrites: Array<[RegExp, string]> = [
    // delete followed by insert -> substitution
    [/\{--(.+?)--\}\s*\{\+\+(.+?)\+\+\}/g, '{~~$1~>$2~~}'],
    // insert followed by delete -> substitution
    [/\{\+\+(.+?)\+\+\}\s*\{--(.+?)--\}/g, '{~~$2~>$1~~}'],
    // substitution that was opened with a deletion marker
    [/\{--([^}]+?)~>([^}]+?)~~\}/g, '{~~$1~>$2~~}'],
    // substitutions that got split into mismatched open/close markers
    [/\{~~([^~]+)\s*--\}/g, '{--$1--}'],
    [/\{\+\+([^+]+)~~\}/g, '{++$1++}'],
    // empty annotations
    [/\{--\s*--\}/g, ''],
    [/\{\+\+\s*\+\+\}/g, ''],
  ];

  let cleaned = text;
  for (const [pattern, replacement] of rewrites) {
    cleaned = cleaned.replace(pattern, replacement);
  }

  const separatorRe = /^[-]+(\s+[-]+)+\s*$/;
  const columnGapRe = /\S+\s{2,}\S+/;
  const emphasisLineRe = /^\*[^*]+\*\s*$/;

  const rows = cleaned.split('\n');

  // Starting after a blank line inside a table, scan at most the next 19
  // lines to decide whether the table carries on past the blank line.
  const tableContinuesAfter = (blankIdx: number): boolean => {
    const limit = Math.min(rows.length, blankIdx + 20);
    for (let k = blankIdx + 1; k < limit; k++) {
      const raw = rows[k];
      const trimmed = raw.trim();
      if (trimmed === '') {
        continue;
      }
      if (separatorRe.test(trimmed) || columnGapRe.test(trimmed) || emphasisLineRe.test(trimmed)) {
        return true;
      }
      if (raw.startsWith(' ')) {
        // Indented non-table line: keep scanning.
        continue;
      }
      return false;
    }
    return false;
  };

  const output: string[] = [];
  let insideTable = false;

  for (let idx = 0; idx < rows.length; idx++) {
    const line = rows[idx];
    const trimmed = line.trim();

    if (separatorRe.test(trimmed)) {
      insideTable = true;
      output.push(line);
      continue;
    }

    if (insideTable) {
      // A blank line ends the table unless more table content follows.
      if (trimmed === '' && !tableContinuesAfter(idx)) {
        insideTable = false;
      }
      output.push(line);
      continue;
    }

    if (columnGapRe.test(line)) {
      // Possible header row: keep it verbatim when the next non-blank line
      // is a dash separator.
      let next = idx + 1;
      while (next < rows.length && rows[next].trim() === '') {
        next++;
      }
      if (next < rows.length && separatorRe.test(rows[next].trim())) {
        output.push(line);
        continue;
      }
    }

    if (trimmed.startsWith('|')) {
      output.push(line);
      continue;
    }

    output.push(line.replace(/ +/g, ' '));
  }

  return output.join('\n');
}
1564
+
1565
/**
 * Collect inline `[Author: comment]` markers from Word text.
 *
 * A marker is a bracketed span whose first colon separates the author from
 * the comment body; neither part may contain `]`, and the author may not
 * contain `:`.
 *
 * @param text Text to scan.
 * @returns One entry per marker, in order of appearance, with the trimmed
 *          author, the trimmed comment text and the index of the opening `[`.
 */
export function parseVisibleComments(text: string): Array<{ author: string; text: string; position: number }> {
  const markerRe = /\[([^\]:]+):\s*([^\]]+)\]/g;
  const found: Array<{ author: string; text: string; position: number }> = [];

  for (let hit = markerRe.exec(text); hit !== null; hit = markerRe.exec(text)) {
    const [, rawAuthor, rawBody] = hit;
    found.push({
      author: rawAuthor.trim(),
      text: rawBody.trim(),
      position: hit.index,
    });
  }

  return found;
}
1583
+
1584
/**
 * Rewrite inline `[Author: comment]` markers as CriticMarkup comments
 * (`{>>Author: comment<<}`).
 *
 * Whitespace directly after the colon is normalised to a single space; the
 * author and comment text are otherwise copied unchanged.
 *
 * @param text Text containing visible comment markers.
 * @returns The text with every marker converted.
 */
export function convertVisibleComments(text: string): string {
  const markerRe = /\[([^\]:]+):\s*([^\]]+)\]/g;
  return text.replace(
    markerRe,
    (_whole: string, author: string, body: string) => `{>>${author}: ${body}<<}`
  );
}
1590
+
1591
/**
 * Restore pandoc-crossref figure/table references from Word-rendered format.
 *
 * Four passes run over `text`, in this order:
 *  1. `[Figure]{.mark} [N]{.mark}` pairs become `@fig:<label>` /
 *     `@tbl:<label>`. When the registry has no labelled entry for the number
 *     a `@<prefix>:figN` placeholder is emitted and a message is recorded.
 *  2. Plain "Figure N" / "Table N" mentions (not preceded by `!`, not
 *     followed by `:`) become `@<prefix>:<label>` when the registry knows the
 *     number; otherwise they are left untouched.
 *  3. A plain-text "Figure N ..." caption line that directly follows an image
 *     is removed.
 *  4. Images whose alt text starts with "Figure N:" are rewritten from the
 *     registry entry (caption, path, anchor); without a registry entry only
 *     the "Figure N:" lead-in is stripped from the alt text.
 *
 * @param text Markdown extracted from Word.
 * @param projectDir Directory passed to `readImageRegistry`.
 * @param restoredLabels Keys (`fig:<label>` / `tbl:<label>`) of images that
 *        already carry an anchor. A new set is created when null. The set is
 *        mutated and returned so later passes can avoid duplicate anchors.
 * @returns The rewritten text, the number of restored references, log
 *          messages and the (possibly new) label set.
 */
export function restoreCrossrefFromWord(
  text: string,
  projectDir: string,
  restoredLabels: Set<string> | null = null
): RestoreCrossrefResult {
  const messages: string[] = [];
  let restored = 0;
  let result = text;

  const registry = readImageRegistry(projectDir);
  const seenLabels = restoredLabels ?? new Set<string>();

  // The patterns below accept "Table", "Tbl" and "Tbl." for tables. Testing
  // for a leading "tab" misclassified the "Tbl" spellings as figures, so the
  // first letter is used instead (every figure spelling starts with "F").
  const prefixFor = (type: string): 'tbl' | 'fig' => (/^t/i.test(type) ? 'tbl' : 'fig');

  // Pattern 1: [Figure]{.mark} [N]{.mark}
  result = result.replace(
    /\[(Figure|Table|Fig\.?)\]\{\.mark\}\s*\[(\d+|S\d+)\]\{\.mark\}/gi,
    (_match: string, type: string, num: string) => {
      const prefix = prefixFor(type);
      const entry = registry?.byNumber?.get(`${prefix}:${num}`);
      if (entry && entry.label) {
        restored++;
        return `@${prefix}:${entry.label}`;
      }
      restored++;
      messages.push(`Restored ${type} ${num} (no label found, using placeholder)`);
      return `@${prefix}:fig${num}`;
    }
  );

  // Pattern 2: Plain "Figure N" or "Fig. N"
  result = result.replace(
    /(?<!!)\b(Figure|Fig\.?|Table|Tbl\.?)\s+(\d+|S\d+)\b(?!\s*:)/gi,
    (match: string, type: string, num: string) => {
      const prefix = prefixFor(type);
      const entry = registry?.byNumber?.get(`${prefix}:${num}`);
      if (entry && entry.label) {
        restored++;
        return `@${prefix}:${entry.label}`;
      }
      return match;
    }
  );

  // Pattern 3: Remove duplicate plain-text captions
  result = result.replace(
    /(\!\[[^\]]+\]\([^)]+\)(?:\{[^}]*\})?)\s*\n+\s*(?:Figure|Fig\.?|Table|Tbl\.?)\s+\d+[:\.]?\s*[^\n]+/gi,
    '$1'
  );

  // Pattern 4: Clean up image captions that start with "Figure N: "
  result = result.replace(
    /!\[(Figure|Fig\.?|Table|Tbl\.?)\s+(\d+|S\d+)[:\.]?\s*([^\]]*)\]\(([^)]+)\)(?:\{[^}]*\})?/gi,
    (_match: string, type: string, num: string, caption: string, imgPath: string) => {
      const prefix = prefixFor(type);
      const numberKey = `${prefix}:${num}`;
      const entry = registry?.byNumber?.get(numberKey);

      if (entry) {
        // Track by label so the set agrees with the `type:label` keys that
        // restoreImagesFromRegistry checks; fall back to the number key only
        // when the entry has no label.
        const dedupKey = entry.label ? `${prefix}:${entry.label}` : numberKey;
        if (seenLabels.has(dedupKey)) {
          messages.push(`Skipped duplicate ${prefix}:${entry.label} (already restored)`);
          return `![${entry.caption}](${entry.path})`;
        }
        seenLabels.add(dedupKey);
        restored++;
        messages.push(`Restored image ${prefix}:${entry.label} from Figure ${num}`);
        // Do not emit a `{#fig:undefined}` anchor for unlabelled entries.
        const anchor = entry.label ? `{#${prefix}:${entry.label}}` : '';
        return `![${entry.caption}](${entry.path})${anchor}`;
      }

      return `![${caption.trim()}](${imgPath})`;
    }
  );

  return { text: result, restored, messages, restoredLabels: seenLabels };
}
1665
+
1666
/**
 * Restore proper markdown image syntax from Word-extracted text using the
 * image registry.
 *
 * Passes, in order:
 *  1. `@fig:label: caption` / `@tbl:label: caption` text becomes an image.
 *  2. The same pattern wrapped in a one-cell table (`| @fig:label: ... |`).
 *  3. Leftover empty one-cell table skeletons are removed.
 *  4. Standalone "Figure N: caption" lines are resolved by figure number.
 *  5. Images pointing at `media/...` are matched to registry entries by the
 *     first 50 characters of their caption (lower-cased, trimmed).
 *
 * The first occurrence of a label gets a `{#type:label}` anchor; later
 * occurrences are emitted without one and logged as duplicates.
 *
 * @param text Markdown extracted from Word.
 * @param projectDir Directory passed to `readImageRegistry`.
 * @param restoredLabels Keys (`type:label`) that already carry an anchor;
 *        mutated in place. A new set is created when null.
 * @returns The rewritten text, the number of restored images and log
 *          messages. When the registry is missing or has no figures the input
 *          is returned unchanged with a single "No image registry found"
 *          message.
 */
export function restoreImagesFromRegistry(
  text: string,
  projectDir: string,
  restoredLabels: Set<string> | null = null
): RestoreImagesResult {
  const messages: string[] = [];
  let restored = 0;

  const registry = readImageRegistry(projectDir);
  if (!registry || !registry.figures || registry.figures.length === 0) {
    return { text, restored: 0, messages: ['No image registry found'] };
  }

  const seenLabels = restoredLabels ?? new Set<string>();

  const labelledCaptionRe = /@(fig|tbl):([a-zA-Z0-9_-]+):\s*([^\n]+)/gi;
  const numberedCaptionRe = /^(Figure|Fig\.?)\s+(\d+|S\d+)[.:]\s*([^\n]+)/gim;
  const tableWrappedCaptionRe = /\|\s*@(fig|tbl):([a-zA-Z0-9_-]+):\s*([^|]+)\s*\|/gi;
  const genericImageRe = /!\[([^\]]*)\]\(media\/[^)]+\)/g;

  let result = text;

  // Fix @fig:label: caption patterns
  result = result.replace(labelledCaptionRe, (match: string, type: string, label: string) => {
    const key = `${type}:${label}`;
    const entry = registry.byLabel.get(key);
    if (!entry) {
      return match;
    }
    if (seenLabels.has(key)) {
      messages.push(`Skipped duplicate ${key} (already restored)`);
      return `![${entry.caption}](${entry.path})`;
    }
    seenLabels.add(key);
    restored++;
    messages.push(`Restored ${type}:${label} from registry`);
    return `![${entry.caption}](${entry.path}){#${type}:${label}}`;
  });

  // Fix table-wrapped captions
  result = result.replace(tableWrappedCaptionRe, (match: string, type: string, label: string) => {
    const key = `${type}:${label}`;
    const entry = registry.byLabel.get(key);
    if (!entry) {
      return match;
    }
    if (seenLabels.has(key)) {
      messages.push(`Skipped duplicate ${key} from table wrapper`);
      return `![${entry.caption}](${entry.path})`;
    }
    seenLabels.add(key);
    restored++;
    messages.push(`Restored ${type}:${label} from table wrapper`);
    return `![${entry.caption}](${entry.path}){#${type}:${label}}`;
  });

  // Clean up empty table structures
  result = result.replace(/\|\s*\|\s*\n\|:--:\|\s*\n/g, '');

  // Fix "Figure N:" standalone lines
  result = result.replace(numberedCaptionRe, (match: string, _prefix: string, num: string) => {
    const entry = registry.byNumber.get(`fig:${num}`);
    if (!entry) {
      return match;
    }
    const labelKey = `fig:${entry.label}`;
    if (seenLabels.has(labelKey)) {
      messages.push(`Skipped duplicate Figure ${num} (already restored)`);
      return `![${entry.caption}](${entry.path})`;
    }
    seenLabels.add(labelKey);
    restored++;
    messages.push(`Restored Figure ${num} by number lookup`);
    return `![${entry.caption}](${entry.path}){#fig:${entry.label}}`;
  });

  // Fix generic media paths by matching caption text
  result = result.replace(genericImageRe, (match: string, caption: string) => {
    if (!caption || caption.trim() === '') {
      return match;
    }

    const captionKey = caption.slice(0, 50).toLowerCase().trim();
    const entry = registry.byCaption.get(captionKey);
    if (!entry) {
      return match;
    }

    const labelKey = entry.label ? `${entry.type}:${entry.label}` : null;
    if (labelKey && seenLabels.has(labelKey)) {
      messages.push(`Skipped duplicate by caption match: ${captionKey.slice(0, 30)}...`);
      return `![${entry.caption}](${entry.path})`;
    }

    // Reaching this point means the label has not been anchored yet, so the
    // anchor is always emitted for labelled entries. It used to be computed
    // after the label was added to the set, which made the "not yet restored"
    // test always false and dropped the anchor.
    const anchor = labelKey ? `{#${labelKey}}` : '';
    if (labelKey) {
      seenLabels.add(labelKey);
    }
    restored++;
    messages.push(`Restored image by caption match: ${captionKey.slice(0, 30)}...`);
    return `![${entry.caption}](${entry.path})${anchor}`;
  });

  return { text: result, restored, messages };
}
1778
+
1779
/**
 * Import a Word document with track changes directly as CriticMarkup.
 *
 * Steps:
 *  1. Run pandoc once to count `media/` images in the document and compare
 *     that with the number of figures in the image registry; media is only
 *     extracted when the document has more images than the registry.
 *  2. Extract text, comments and anchors via `extractFromWord`.
 *  3. Restore cross-references and images from the registry.
 *  4. Insert comments, clean up annotations and count the resulting markers.
 *
 * @param docxPath Path to the .docx file.
 * @param options `mediaDir` defaults to `<docx dir>/media`; `projectDir`
 *        defaults to the directory of the .docx file.
 * @returns Annotated text, marker statistics, extracted media and comments.
 */
export async function importWordWithTrackChanges(
  docxPath: string,
  options: ImportWordWithTrackChangesOptions = {}
): Promise<ImportWordWithTrackChangesResult> {
  const docxDir = path.dirname(docxPath);
  const targetMediaDir = options.mediaDir || path.join(docxDir, 'media');
  const targetProjectDir = options.projectDir || docxDir;

  const registry = readImageRegistry(targetProjectDir);
  const hasRegistry = registry && registry.figures && registry.figures.length > 0;

  // First pass: count images.
  // NOTE(review): docxPath is interpolated into a shell command line; a path
  // containing `"` or shell metacharacters would break or alter the command.
  // Consider an argument-array API (e.g. execFile) — confirm against callers.
  const pandocRun = await execAsync(
    `pandoc "${docxPath}" -t markdown --wrap=none --track-changes=all`,
    { maxBuffer: 50 * 1024 * 1024 }
  );
  const rawText = pandocRun.stdout;

  const wordImageCount = (rawText.match(/!\[[^\]]*\]\(media\/[^)]+\)/g) || []).length;
  const registryCount = hasRegistry ? registry.figures.length : 0;
  const needsMediaExtraction = wordImageCount > registryCount;

  if (hasRegistry) {
    console.log(`Registry has ${registryCount} figures, Word doc has ${wordImageCount} images`);
    console.log(
      needsMediaExtraction
        ? `Extracting media (${wordImageCount - registryCount} new image(s) detected)`
        : `Using existing figures from registry`
    );
  }

  // Extract from Word
  const extracted = await extractFromWord(docxPath, {
    mediaDir: targetMediaDir,
    skipMediaExtraction: !needsMediaExtraction,
  });

  const extractedMedia = extracted.extractedMedia || [];
  const comments = extracted.comments || [];
  const anchors = extracted.anchors || new Map();

  // Relay extractor messages; types other than info/warning are ignored.
  for (const msg of extracted.messages || []) {
    switch (msg.type) {
      case 'info':
        console.log(msg.message);
        break;
      case 'warning':
        console.warn(`Warning: ${msg.message}`);
        break;
      default:
        break;
    }
  }

  // Restore crossref
  const crossrefResult = restoreCrossrefFromWord(extracted.text, targetProjectDir);
  if (crossrefResult.restored > 0) {
    console.log(`Restored ${crossrefResult.restored} crossref reference(s)`);
  }

  // Restore images, sharing the label set so anchors are not duplicated
  const imageRestoreResult = restoreImagesFromRegistry(
    crossrefResult.text,
    targetProjectDir,
    crossrefResult.restoredLabels
  );
  if (imageRestoreResult.restored > 0) {
    console.log(`Restored ${imageRestoreResult.restored} image(s) from registry`);
  }

  let text = imageRestoreResult.text;

  // Insert comments
  if (comments.length > 0) {
    text = insertCommentsIntoMarkdown(text, comments, anchors);
    console.log(`Inserted ${comments.length} comment(s)`);
  }

  // Clean up
  text = cleanupAnnotations(text);

  // Count final changes by their opening markers
  const occurrences = (marker: RegExp): number => (text.match(marker) || []).length;
  const insertions = occurrences(/\{\+\+/g);
  const deletions = occurrences(/\{--/g);
  const substitutions = occurrences(/\{~~/g);
  const commentCount = occurrences(/\{>>/g);

  return {
    text,
    stats: {
      insertions,
      deletions,
      substitutions,
      comments: commentCount,
      total: insertions + deletions + substitutions + commentCount,
      hasTrackChanges: extracted.hasTrackChanges,
      trackChangeStats: extracted.trackChangeStats,
    },
    extractedMedia,
    comments,
  };
}
1878
+
1879
/**
 * Legacy import function: Word doc → annotated MD via diff.
 *
 * Three paths:
 *  - `options.sectionContent` is provided: it is treated as already annotated
 *    text; only cleanup and marker counting happen, and no file is read.
 *  - The document has track changes: the extracted text is post-processed
 *    (cross-references, images, comments, cleanup) and returned directly.
 *  - No track changes: a warning is printed and the extracted text is diffed
 *    against the original markdown file to generate annotations.
 *
 * @param docxPath Path to the .docx file (unused when `sectionContent` is set).
 * @param originalMdPath Path to the original markdown; its directory is used
 *        as the project directory for the image registry.
 * @param options `author` (default 'Reviewer'), `sectionContent`,
 *        `figuresDir` (media output directory, defaults to the .docx
 *        directory) and `wordTables` (used when extraction yields no tables).
 * @returns Annotated markdown, marker statistics and extracted media paths.
 */
export async function importFromWord(
  docxPath: string,
  originalMdPath: string,
  options: ImportFromWordOptions = {}
): Promise<ImportFromWordResult> {
  const { author = 'Reviewer', sectionContent, figuresDir } = options;
  const projectDir = path.dirname(originalMdPath);

  // Count CriticMarkup opening markers in `annotatedText`.
  const tally = (annotatedText: string) => {
    const occurrences = (marker: RegExp): number => (annotatedText.match(marker) || []).length;
    const insertions = occurrences(/\{\+\+/g);
    const deletions = occurrences(/\{--/g);
    const substitutions = occurrences(/\{~~/g);
    const comments = occurrences(/\{>>/g);
    return {
      insertions,
      deletions,
      substitutions,
      comments,
      total: insertions + deletions + substitutions + comments,
    };
  };

  // Restore cross-references, then images, sharing the label set between the
  // two passes so the same anchor is not emitted twice.
  const restoreFromRegistry = (input: string): string => {
    const crossrefResult = restoreCrossrefFromWord(input, projectDir);
    if (crossrefResult.restored > 0) {
      console.log(`Restored ${crossrefResult.restored} crossref reference(s)`);
    }

    const imageRestoreResult = restoreImagesFromRegistry(
      crossrefResult.text,
      projectDir,
      crossrefResult.restoredLabels
    );
    if (imageRestoreResult.restored > 0) {
      console.log(`Restored ${imageRestoreResult.restored} image(s) from registry`);
    }
    return imageRestoreResult.text;
  };

  // Path 1: caller supplied pre-annotated section content.
  if (sectionContent !== undefined) {
    const annotated = cleanupAnnotations(sectionContent);
    return {
      annotated,
      stats: tally(annotated),
      extractedMedia: [],
    };
  }

  const mediaDir = figuresDir || path.dirname(docxPath);
  const extracted = await extractFromWord(docxPath, { mediaDir });

  let wordText: string = extracted.text;
  const extractedMedia: string[] = extracted.extractedMedia || [];
  // Prefer tables found by the extractor; otherwise fall back to tables the
  // caller supplied (previously these were always discarded).
  const wordTables: WordTable[] = extracted.tables || options.wordTables || [];
  const hasTrackChanges: boolean = extracted.hasTrackChanges || false;

  for (const msg of extracted.messages || []) {
    if (msg.type === 'info') {
      console.log(msg.message);
    } else if (msg.type === 'warning') {
      console.warn(`Warning: ${msg.message}`);
    }
  }

  // Path 2: track changes are already in the extracted text.
  if (hasTrackChanges) {
    wordText = restoreFromRegistry(wordText);

    const docComments = extracted.comments || [];
    const anchors = extracted.anchors || new Map();
    if (docComments.length > 0) {
      wordText = insertCommentsIntoMarkdown(wordText, docComments, anchors);
      console.log(`Inserted ${docComments.length} comment(s)`);
    }

    wordText = cleanupAnnotations(wordText);

    return {
      annotated: wordText,
      stats: tally(wordText),
      extractedMedia,
    };
  }

  // Path 3: diff against the original markdown.
  console.warn('Warning: No track changes detected in Word document.');
  console.warn(' For best results, reviewers should use Track Changes in Word.');
  console.warn(' Falling back to diff-based import (comparing against original MD).');
  console.warn(' This approach may produce less accurate change annotations.');

  wordText = restoreFromRegistry(wordText);

  // Read original markdown
  let originalMd = fs.readFileSync(originalMdPath, 'utf-8');

  // Strip existing annotations
  originalMd = stripAnnotations(originalMd, { keepComments: false });

  // Load image registry
  const imageRegistry = readImageRegistry(projectDir);

  // Generate diff
  let annotated = generateSmartDiff(originalMd, wordText, author, { wordTables, imageRegistry });

  // Clean up
  annotated = cleanupAnnotations(annotated);

  // Fix citation annotations
  annotated = fixCitationAnnotations(annotated, originalMd);

  // Convert visible comments
  annotated = convertVisibleComments(annotated);

  return {
    annotated,
    stats: tally(annotated),
    extractedMedia,
  };
}
2029
+
2030
/**
 * Copy extracted media files into a figures directory under sequential names.
 *
 * Each file is copied (the source is left in place) to
 * `<figuresDir>/<prefix><n><ext>`, where `n` is the 1-based position of the
 * file in `mediaFiles` and `ext` is the lower-cased source extension. A file
 * that fails to copy is reported in `errors` and does not stop the remaining
 * copies; its number is not reused.
 *
 * @param mediaFiles Paths of the files to copy.
 * @param figuresDir Destination directory; created (recursively) if missing.
 * @param prefix File-name prefix, `'figure'` by default.
 * @returns `moved` — one `{ from, to, name }` record per successful copy;
 *          `errors` — one message per failed copy.
 */
export function moveExtractedMedia(
  mediaFiles: string[],
  figuresDir: string,
  prefix: string = 'figure'
): MoveExtractedMediaResult {
  const moved: MovedFile[] = [];
  const errors: string[] = [];

  if (!fs.existsSync(figuresDir)) {
    fs.mkdirSync(figuresDir, { recursive: true });
  }

  mediaFiles.forEach((src, index) => {
    const ext = path.extname(src).toLowerCase();
    const newName = `${prefix}${index + 1}${ext}`;
    const dest = path.join(figuresDir, newName);

    try {
      fs.copyFileSync(src, dest);
      moved.push({ from: src, to: dest, name: newName });
    } catch (err: unknown) {
      // Anything can be thrown; only Error instances carry `.message`.
      const reason = err instanceof Error ? err.message : String(err);
      errors.push(`Failed to copy ${src}: ${reason}`);
    }
  });

  return { moved, errors };
}