@wonderwhy-er/desktop-commander 0.2.34 → 0.2.36

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (179) hide show
  1. package/README.md +2 -0
  2. package/dist/handlers/filesystem-handlers.js +58 -11
  3. package/dist/handlers/history-handlers.d.ts +7 -0
  4. package/dist/handlers/history-handlers.js +33 -1
  5. package/dist/server.js +30 -4
  6. package/dist/tools/docx/builders/html-builder.d.ts +17 -0
  7. package/dist/tools/docx/builders/html-builder.js +92 -0
  8. package/dist/tools/docx/builders/image.d.ts +14 -0
  9. package/dist/tools/docx/builders/image.js +84 -0
  10. package/dist/tools/docx/builders/index.d.ts +11 -0
  11. package/dist/tools/docx/builders/index.js +11 -0
  12. package/dist/tools/docx/builders/markdown-builder.d.ts +2 -0
  13. package/dist/tools/docx/builders/markdown-builder.js +260 -0
  14. package/dist/tools/docx/builders/paragraph.d.ts +12 -0
  15. package/dist/tools/docx/builders/paragraph.js +29 -0
  16. package/dist/tools/docx/builders/table.d.ts +8 -0
  17. package/dist/tools/docx/builders/table.js +94 -0
  18. package/dist/tools/docx/builders/utils.d.ts +5 -0
  19. package/dist/tools/docx/builders/utils.js +18 -0
  20. package/dist/tools/docx/constants.d.ts +32 -0
  21. package/dist/tools/docx/constants.js +61 -0
  22. package/dist/tools/docx/converters/markdown-to-html.d.ts +17 -0
  23. package/dist/tools/docx/converters/markdown-to-html.js +111 -0
  24. package/dist/tools/docx/create.d.ts +21 -0
  25. package/dist/tools/docx/create.js +386 -0
  26. package/dist/tools/docx/dom.d.ts +66 -0
  27. package/dist/tools/docx/dom.js +228 -0
  28. package/dist/tools/docx/errors.d.ts +28 -0
  29. package/dist/tools/docx/errors.js +48 -0
  30. package/dist/tools/docx/extractors/images.d.ts +14 -0
  31. package/dist/tools/docx/extractors/images.js +40 -0
  32. package/dist/tools/docx/extractors/metadata.d.ts +14 -0
  33. package/dist/tools/docx/extractors/metadata.js +64 -0
  34. package/dist/tools/docx/extractors/sections.d.ts +14 -0
  35. package/dist/tools/docx/extractors/sections.js +61 -0
  36. package/dist/tools/docx/html.d.ts +17 -0
  37. package/dist/tools/docx/html.js +111 -0
  38. package/dist/tools/docx/index.d.ts +10 -0
  39. package/dist/tools/docx/index.js +10 -0
  40. package/dist/tools/docx/markdown.d.ts +84 -0
  41. package/dist/tools/docx/markdown.js +507 -0
  42. package/dist/tools/docx/modify.d.ts +28 -0
  43. package/dist/tools/docx/modify.js +271 -0
  44. package/dist/tools/docx/operations/handlers/index.d.ts +39 -0
  45. package/dist/tools/docx/operations/handlers/index.js +152 -0
  46. package/dist/tools/docx/operations/html-manipulator.d.ts +24 -0
  47. package/dist/tools/docx/operations/html-manipulator.js +352 -0
  48. package/dist/tools/docx/operations/index.d.ts +14 -0
  49. package/dist/tools/docx/operations/index.js +61 -0
  50. package/dist/tools/docx/operations/operation-handlers.d.ts +3 -0
  51. package/dist/tools/docx/operations/operation-handlers.js +67 -0
  52. package/dist/tools/docx/operations/preprocessor.d.ts +14 -0
  53. package/dist/tools/docx/operations/preprocessor.js +44 -0
  54. package/dist/tools/docx/operations/xml-replacer.d.ts +9 -0
  55. package/dist/tools/docx/operations/xml-replacer.js +35 -0
  56. package/dist/tools/docx/operations.d.ts +13 -0
  57. package/dist/tools/docx/operations.js +13 -0
  58. package/dist/tools/docx/ops/delete-paragraph-at-body-index.d.ts +11 -0
  59. package/dist/tools/docx/ops/delete-paragraph-at-body-index.js +23 -0
  60. package/dist/tools/docx/ops/header-replace-text-exact.d.ts +13 -0
  61. package/dist/tools/docx/ops/header-replace-text-exact.js +55 -0
  62. package/dist/tools/docx/ops/index.d.ts +17 -0
  63. package/dist/tools/docx/ops/index.js +67 -0
  64. package/dist/tools/docx/ops/insert-image-after-text.d.ts +24 -0
  65. package/dist/tools/docx/ops/insert-image-after-text.js +128 -0
  66. package/dist/tools/docx/ops/insert-paragraph-after-text.d.ts +12 -0
  67. package/dist/tools/docx/ops/insert-paragraph-after-text.js +74 -0
  68. package/dist/tools/docx/ops/insert-table-after-text.d.ts +19 -0
  69. package/dist/tools/docx/ops/insert-table-after-text.js +57 -0
  70. package/dist/tools/docx/ops/replace-hyperlink-url.d.ts +12 -0
  71. package/dist/tools/docx/ops/replace-hyperlink-url.js +37 -0
  72. package/dist/tools/docx/ops/replace-paragraph-at-body-index.d.ts +9 -0
  73. package/dist/tools/docx/ops/replace-paragraph-at-body-index.js +25 -0
  74. package/dist/tools/docx/ops/replace-paragraph-text-exact.d.ts +9 -0
  75. package/dist/tools/docx/ops/replace-paragraph-text-exact.js +21 -0
  76. package/dist/tools/docx/ops/set-color-for-paragraph-exact.d.ts +8 -0
  77. package/dist/tools/docx/ops/set-color-for-paragraph-exact.js +23 -0
  78. package/dist/tools/docx/ops/set-color-for-style.d.ts +9 -0
  79. package/dist/tools/docx/ops/set-color-for-style.js +27 -0
  80. package/dist/tools/docx/ops/set-paragraph-style-at-body-index.d.ts +8 -0
  81. package/dist/tools/docx/ops/set-paragraph-style-at-body-index.js +57 -0
  82. package/dist/tools/docx/ops/table-set-cell-text.d.ts +9 -0
  83. package/dist/tools/docx/ops/table-set-cell-text.js +72 -0
  84. package/dist/tools/docx/parsers/image-extractor.d.ts +18 -0
  85. package/dist/tools/docx/parsers/image-extractor.js +61 -0
  86. package/dist/tools/docx/parsers/index.d.ts +9 -0
  87. package/dist/tools/docx/parsers/index.js +9 -0
  88. package/dist/tools/docx/parsers/paragraph-parser.d.ts +2 -0
  89. package/dist/tools/docx/parsers/paragraph-parser.js +88 -0
  90. package/dist/tools/docx/parsers/table-parser.d.ts +9 -0
  91. package/dist/tools/docx/parsers/table-parser.js +72 -0
  92. package/dist/tools/docx/parsers/xml-parser.d.ts +25 -0
  93. package/dist/tools/docx/parsers/xml-parser.js +71 -0
  94. package/dist/tools/docx/parsers/zip-reader.d.ts +23 -0
  95. package/dist/tools/docx/parsers/zip-reader.js +52 -0
  96. package/dist/tools/docx/read.d.ts +27 -0
  97. package/dist/tools/docx/read.js +188 -0
  98. package/dist/tools/docx/relationships.d.ts +22 -0
  99. package/dist/tools/docx/relationships.js +76 -0
  100. package/dist/tools/docx/structure.d.ts +25 -0
  101. package/dist/tools/docx/structure.js +102 -0
  102. package/dist/tools/docx/styled-html-parser.d.ts +23 -0
  103. package/dist/tools/docx/styled-html-parser.js +1262 -0
  104. package/dist/tools/docx/types.d.ts +184 -0
  105. package/dist/tools/docx/types.js +5 -0
  106. package/dist/tools/docx/utils/escaping.d.ts +13 -0
  107. package/dist/tools/docx/utils/escaping.js +26 -0
  108. package/dist/tools/docx/utils/images.d.ts +9 -0
  109. package/dist/tools/docx/utils/images.js +26 -0
  110. package/dist/tools/docx/utils/index.d.ts +12 -0
  111. package/dist/tools/docx/utils/index.js +17 -0
  112. package/dist/tools/docx/utils/markdown.d.ts +13 -0
  113. package/dist/tools/docx/utils/markdown.js +32 -0
  114. package/dist/tools/docx/utils/paths.d.ts +15 -0
  115. package/dist/tools/docx/utils/paths.js +27 -0
  116. package/dist/tools/docx/utils/versioning.d.ts +25 -0
  117. package/dist/tools/docx/utils/versioning.js +55 -0
  118. package/dist/tools/docx/utils.d.ts +101 -0
  119. package/dist/tools/docx/utils.js +299 -0
  120. package/dist/tools/docx/validate.d.ts +33 -0
  121. package/dist/tools/docx/validate.js +49 -0
  122. package/dist/tools/docx/validators.d.ts +13 -0
  123. package/dist/tools/docx/validators.js +40 -0
  124. package/dist/tools/docx/write.d.ts +17 -0
  125. package/dist/tools/docx/write.js +88 -0
  126. package/dist/tools/docx/zip.d.ts +21 -0
  127. package/dist/tools/docx/zip.js +35 -0
  128. package/dist/tools/schemas.d.ts +13 -0
  129. package/dist/tools/schemas.js +5 -0
  130. package/dist/types.d.ts +10 -0
  131. package/dist/ui/contracts.d.ts +14 -0
  132. package/dist/ui/contracts.js +18 -0
  133. package/dist/ui/file-preview/index.html +16 -0
  134. package/dist/ui/file-preview/preview-runtime.js +13977 -0
  135. package/dist/ui/file-preview/shared/preview-file-types.d.ts +5 -0
  136. package/dist/ui/file-preview/shared/preview-file-types.js +57 -0
  137. package/dist/ui/file-preview/src/app.d.ts +4 -0
  138. package/dist/ui/file-preview/src/app.js +800 -0
  139. package/dist/ui/file-preview/src/components/code-viewer.d.ts +6 -0
  140. package/dist/ui/file-preview/src/components/code-viewer.js +73 -0
  141. package/dist/ui/file-preview/src/components/highlighting.d.ts +2 -0
  142. package/dist/ui/file-preview/src/components/highlighting.js +54 -0
  143. package/dist/ui/file-preview/src/components/html-renderer.d.ts +9 -0
  144. package/dist/ui/file-preview/src/components/html-renderer.js +63 -0
  145. package/dist/ui/file-preview/src/components/markdown-renderer.d.ts +1 -0
  146. package/dist/ui/file-preview/src/components/markdown-renderer.js +21 -0
  147. package/dist/ui/file-preview/src/components/toolbar.d.ts +6 -0
  148. package/dist/ui/file-preview/src/components/toolbar.js +75 -0
  149. package/dist/ui/file-preview/src/image-preview.d.ts +3 -0
  150. package/dist/ui/file-preview/src/image-preview.js +21 -0
  151. package/dist/ui/file-preview/src/main.d.ts +1 -0
  152. package/dist/ui/file-preview/src/main.js +5 -0
  153. package/dist/ui/file-preview/src/types.d.ts +1 -0
  154. package/dist/ui/file-preview/src/types.js +1 -0
  155. package/dist/ui/file-preview/styles.css +764 -0
  156. package/dist/ui/resources.d.ts +21 -0
  157. package/dist/ui/resources.js +72 -0
  158. package/dist/ui/shared/escape-html.d.ts +4 -0
  159. package/dist/ui/shared/escape-html.js +11 -0
  160. package/dist/ui/shared/host-lifecycle.d.ts +16 -0
  161. package/dist/ui/shared/host-lifecycle.js +35 -0
  162. package/dist/ui/shared/rpc-client.d.ts +14 -0
  163. package/dist/ui/shared/rpc-client.js +72 -0
  164. package/dist/ui/shared/theme-adaptation.d.ts +10 -0
  165. package/dist/ui/shared/theme-adaptation.js +118 -0
  166. package/dist/ui/shared/tool-header.d.ts +9 -0
  167. package/dist/ui/shared/tool-header.js +25 -0
  168. package/dist/ui/shared/tool-shell.d.ts +16 -0
  169. package/dist/ui/shared/tool-shell.js +65 -0
  170. package/dist/ui/shared/widget-state.d.ts +28 -0
  171. package/dist/ui/shared/widget-state.js +60 -0
  172. package/dist/utils/capture.d.ts +1 -0
  173. package/dist/utils/capture.js +10 -4
  174. package/dist/utils/files/docx.d.ts +34 -0
  175. package/dist/utils/files/docx.js +145 -0
  176. package/dist/utils/files/text.js +9 -1
  177. package/dist/version.d.ts +1 -1
  178. package/dist/version.js +1 -1
  179. package/package.json +5 -2
@@ -0,0 +1,84 @@
1
+ /**
2
+ * DOCX to Markdown Conversion
3
+ * Reads Word documents by unzipping them with PizZip and parsing the XML with @xmldom/xmldom (Docxtemplater is only instantiated as a sanity check)
4
+ */
5
+ /**
6
+ * DOCX metadata structure
7
+ */
8
+ export interface DocxMetadata {
9
+ /** Document title (dc:title in docProps/core.xml) */
10
+ title?: string;
11
+ /** Document author (read from the dc:creator core property) */
12
+ author?: string;
13
+ /** Document creator. NOTE(review): not populated by extractMetadata in markdown.js - confirm whether other producers set it */
14
+ creator?: string;
15
+ /** Document subject (dc:subject) */
16
+ subject?: string;
17
+ /** Document description (dc:description) */
18
+ description?: string;
19
+ /** Creation date (parsed from dcterms:created) */
20
+ creationDate?: Date;
21
+ /** Last modification date (parsed from dcterms:modified) */
22
+ modificationDate?: Date;
23
+ /** Last modified by (cp:lastModifiedBy) */
24
+ lastModifiedBy?: string;
25
+ /** Document revision number (cp:revision), kept as the raw string */
26
+ revision?: string;
27
+ /** File size in bytes; parseDocxToMarkdown only fills this for local paths, not URLs */
28
+ fileSize?: number;
29
+ }
30
+ /**
31
+ * Embedded image information
32
+ */
33
+ export interface DocxImage {
34
+ /** Unique identifier for the image (the relationship id when produced by parseDocxToMarkdown) */
35
+ id: string;
36
+ /** Base64-encoded image data */
37
+ data: string;
38
+ /** MIME type (e.g., "image/png", "image/jpeg"); "application/octet-stream" when the extension is not recognised */
39
+ mimeType: string;
40
+ /** Alt text if available. NOTE(review): not set by the image resolver in markdown.js */
41
+ altText?: string;
42
+ /** Size in bytes of the image file before base64 encoding */
43
+ originalSize?: number;
44
+ }
45
+ /**
46
+ * DOCX section/paragraph structure
47
+ */
48
+ export interface DocxSection {
49
+ /** Section type: heading, paragraph, list, table, or image */
50
+ type: 'heading' | 'paragraph' | 'list' | 'table' | 'image';
51
+ /** Section content as markdown */
52
+ content: string;
53
+ /** Heading level (1-6) if type is heading */
54
+ level?: number;
55
+ /** Associated images if any. NOTE(review): not set by parseIntoSections in markdown.js */
56
+ images?: DocxImage[];
57
+ }
58
+ /**
59
+ * Complete DOCX parse result
60
+ */
61
+ export interface DocxParseResult {
62
+ /** Document content as markdown (after post-processing) */
63
+ markdown: string;
64
+ /** Document metadata */
65
+ metadata: DocxMetadata;
66
+ /** Extracted images; stays empty when includeImages is false */
67
+ images: DocxImage[];
68
+ /** Structured sections derived from the markdown text (optional, for advanced parsing) */
69
+ sections?: DocxSection[];
70
+ }
71
+ /**
72
+ * Convert DOCX to Markdown by unzipping the file and parsing word/document.xml
73
+ * @param source Path to DOCX file or http(s) URL
74
+ * @param options Conversion options
75
+ * @returns Parsed DOCX result with markdown, metadata, images and sections
76
+ */
77
+ export declare function parseDocxToMarkdown(source: string, options?: {
78
+ /** Extract images as base64 and inline them as data URIs (defaults to true in markdown.js) */
79
+ includeImages?: boolean;
80
+ /** Preserve inline formatting (bold, italic). NOTE(review): accepted but not read by the implementation in markdown.js */
81
+ preserveFormatting?: boolean;
82
+ /** Custom style mapping. NOTE(review): accepted but not read by the implementation in markdown.js */
83
+ styleMap?: string[];
84
+ }): Promise<DocxParseResult>;
@@ -0,0 +1,507 @@
1
+ /**
2
+ * DOCX to Markdown Conversion
3
+ * Reads Word documents by unzipping them with PizZip and parsing the XML with @xmldom/xmldom (Docxtemplater is only instantiated as a sanity check)
4
+ */
5
+ import fs from 'fs/promises';
6
+ import path from 'path';
7
+ import { createRequire } from 'module';
8
+ const require = createRequire(import.meta.url);
9
+ const PizZip = require('pizzip');
10
+ const Docxtemplater = require('docxtemplater');
11
+ const { DOMParser } = require('@xmldom/xmldom');
12
/**
 * Tell whether `source` names a remote resource instead of a local path.
 * Only the lowercase `http://` and `https://` prefixes are recognised.
 */
const isUrl = (source) => ['http://', 'https://'].some((scheme) => source.startsWith(scheme));
16
/**
 * Load the raw bytes of a DOCX document.
 *
 * @param source Local file path, or an http(s) URL (as decided by isUrl)
 * @returns Buffer holding the document bytes
 * @throws Error when a URL answers with a non-2xx status; file-system and
 *   network errors propagate unchanged
 */
async function loadDocxToBuffer(source) {
    if (!isUrl(source)) {
        return fs.readFile(source);
    }
    const response = await fetch(source);
    if (!response.ok) {
        // Without this check an HTML error page would be handed to the ZIP
        // parser and fail later with an unrelated "corrupt archive" message.
        throw new Error(`Failed to download DOCX from ${source}: HTTP ${response.status} ${response.statusText}`);
    }
    return Buffer.from(await response.arrayBuffer());
}
29
+ function readZipFileText(zip, filePath) {
30
+ const file = zip.file(filePath);
31
+ if (!file)
32
+ return null;
33
+ if (typeof file.asText === 'function') {
34
+ return file.asText();
35
+ }
36
+ if (typeof file.asBinary === 'function') {
37
+ return Buffer.from(file.asBinary(), 'binary').toString('utf8');
38
+ }
39
+ return null;
40
+ }
41
+ function readZipFileBuffer(zip, filePath) {
42
+ const file = zip.file(filePath);
43
+ if (!file)
44
+ return null;
45
+ if (typeof file.asUint8Array === 'function') {
46
+ return Buffer.from(file.asUint8Array());
47
+ }
48
+ if (typeof file.asNodeBuffer === 'function') {
49
+ return file.asNodeBuffer();
50
+ }
51
+ if (typeof file.asBinary === 'function') {
52
+ return Buffer.from(file.asBinary(), 'binary');
53
+ }
54
+ return null;
55
+ }
56
/**
 * Map an image path to a MIME type using its (case-insensitive) extension.
 *
 * @param target Path or file name of the image part
 * @returns The matching image MIME type, or `application/octet-stream` for
 *   unknown or missing extensions
 */
function getMimeTypeForTarget(target) {
    switch (path.extname(target).toLowerCase()) {
        case '.png':
            return 'image/png';
        case '.jpg':
        case '.jpeg':
            return 'image/jpeg';
        case '.gif':
            return 'image/gif';
        case '.bmp':
            return 'image/bmp';
        case '.webp':
            return 'image/webp';
        case '.svg':
            return 'image/svg+xml';
        default:
            return 'application/octet-stream';
    }
}
69
/**
 * Make text safe for a Markdown table cell: pipes are backslash-escaped and
 * line breaks (LF or CRLF) become `<br>`.
 */
function escapeTableCell(text) {
    const pipesEscaped = text.split('|').join('\\|');
    return pipesEscaped.replace(/\r?\n/g, '<br>');
}
72
/**
 * Return the direct children of `node` that are element nodes
 * (nodeType 1), in document order.
 */
function getElementChildren(node) {
    const ELEMENT_NODE = 1;
    const elements = [];
    let index = 0;
    while (index < node.childNodes.length) {
        const candidate = node.childNodes[index];
        if (candidate.nodeType === ELEMENT_NODE) {
            elements.push(candidate);
        }
        index += 1;
    }
    return elements;
}
82
+ function getAttributeValue(node, name) {
83
+ return node.getAttribute(name) || node.getAttribute(`w:${name}`) || null;
84
+ }
85
/**
 * Derive a heading level from a paragraph's style.
 *
 * Looks up the first `w:pPr` / `w:pStyle` under the paragraph and matches its
 * `val` against "heading" followed by a digit 1-6 (case-insensitive,
 * optional whitespace between them).
 *
 * @returns The level 1-6, or null when the paragraph has no such style
 */
function getHeadingLevelFromParagraph(paragraph) {
    const paragraphProps = paragraph.getElementsByTagName('w:pPr')[0];
    const styleNode = paragraphProps
        ? paragraphProps.getElementsByTagName('w:pStyle')[0]
        : undefined;
    const styleName = styleNode ? getAttributeValue(styleNode, 'val') : null;
    const found = styleName ? styleName.match(/heading\s*([1-6])/i) : null;
    return found ? Number(found[1]) : null;
}
100
/**
 * Parse a `.rels` XML document into a lookup keyed by relationship id.
 *
 * @param relsXml Relationship XML text; a falsy value yields an empty map
 * @returns Map of `Id` -> `{ target, type }`. Relationships missing an Id or a
 *   Target are skipped; a missing Type is stored as an empty string.
 */
function extractRelationshipMap(relsXml) {
    const byId = new Map();
    if (relsXml) {
        const parsed = new DOMParser().parseFromString(relsXml, 'application/xml');
        const nodes = parsed.getElementsByTagName('Relationship');
        let index = 0;
        while (index < nodes.length) {
            const node = nodes[index];
            index += 1;
            const id = node.getAttribute('Id');
            const type = node.getAttribute('Type') || '';
            const target = node.getAttribute('Target') || '';
            if (id && target) {
                byId.set(id, { target, type });
            }
        }
    }
    return byId;
}
117
/**
 * Build a function that turns an image relationship id into inline Markdown.
 *
 * @param zip Archive the image parts are read from
 * @param relMap Map of relationship id -> `{ target, type }`
 * @param images Output array; each image is appended once, on first use
 * @param includeImages When false the resolver always returns ''
 * @returns `(relId) => string` giving `![image](data:<mime>;base64,<data>)`,
 *   or '' when the id is unknown, is not an image relationship, or the image
 *   part cannot be read
 */
function buildImageResolver(zip, relMap, images, includeImages) {
    const cache = new Map();
    const toMarkdown = (image) => `![image](data:${image.mimeType};base64,${image.data})`;
    /**
     * Candidate ZIP entry names for a relationship target, most likely first.
     * The first candidate is the historical lookup; the second follows the
     * packaging rules: a leading "/" means package root, anything else is
     * relative to the folder of word/document.xml, with "./" and "../"
     * segments collapsed.
     */
    const candidatePaths = (target) => {
        const legacy = target.startsWith('word/')
            ? target
            : `word/${target.replace(/^\/?/, '')}`;
        const resolved = target.startsWith('/')
            ? path.posix.normalize(target.slice(1))
            : path.posix.normalize(`word/${target}`);
        return legacy === resolved ? [legacy] : [legacy, resolved];
    };
    return (relId) => {
        if (!includeImages || !relId)
            return '';
        const rel = relMap.get(relId);
        if (!rel || !rel.type.includes('/image'))
            return '';
        if (cache.has(relId)) {
            return toMarkdown(cache.get(relId));
        }
        let buffer = null;
        for (const candidate of candidatePaths(rel.target)) {
            buffer = readZipFileBuffer(zip, candidate);
            if (buffer)
                break;
        }
        if (!buffer)
            return '';
        const image = {
            id: relId,
            data: buffer.toString('base64'),
            mimeType: getMimeTypeForTarget(rel.target),
            originalSize: buffer.length,
        };
        images.push(image);
        cache.set(relId, image);
        return toMarkdown(image);
    };
}
148
/**
 * Flatten a `w:r` run into text.
 *
 * `w:t` contributes its text, `w:tab` a tab, `w:br` a newline. Pictures
 * contribute whatever `resolveImage` returns for their relationship id:
 * DrawingML pictures reference it on `a:blip` (`r:embed`), legacy VML
 * pictures inside `w:pict` reference it on `v:imagedata` (`r:id`).
 *
 * @param run The `w:r` element
 * @param resolveImage `(relId) => string` producing image Markdown or ''
 * @returns The concatenated text of the run
 */
function extractTextFromRun(run, resolveImage) {
    let text = '';
    // Append the Markdown for every `tagName` descendant of `host` that
    // carries one of the given relationship-id attributes.
    const appendImages = (host, tagName, attributeNames) => {
        const refs = host.getElementsByTagName(tagName);
        for (let i = 0; i < refs.length; i++) {
            let relId = null;
            for (const attributeName of attributeNames) {
                relId = refs[i].getAttribute(attributeName);
                if (relId)
                    break;
            }
            const imageMarkdown = resolveImage(relId);
            if (imageMarkdown) {
                text += imageMarkdown;
            }
        }
    };
    for (const child of getElementChildren(run)) {
        switch (child.nodeName) {
            case 'w:t':
                text += child.textContent || '';
                break;
            case 'w:tab':
                text += '\t';
                break;
            case 'w:br':
                text += '\n';
                break;
            case 'w:drawing':
            case 'w:pict':
                appendImages(child, 'a:blip', ['r:embed', 'embed']);
                // VML pictures never contain a:blip; without this lookup they
                // were silently dropped.
                appendImages(child, 'v:imagedata', ['r:id']);
                break;
            default:
                break;
        }
    }
    return text;
}
179
/**
 * Concatenate the text of a paragraph.
 *
 * Only direct `w:r` children and the runs found inside direct `w:hyperlink`
 * children are read; hyperlink targets are not emitted, only their text.
 *
 * @param paragraph The `w:p` element
 * @param resolveImage Forwarded to extractTextFromRun for embedded pictures
 * @returns The paragraph text (untrimmed)
 */
function extractParagraphText(paragraph, resolveImage) {
    const pieces = [];
    for (const element of getElementChildren(paragraph)) {
        if (element.nodeName === 'w:r') {
            pieces.push(extractTextFromRun(element, resolveImage));
        }
        else if (element.nodeName === 'w:hyperlink') {
            const linkedRuns = element.getElementsByTagName('w:r');
            for (let r = 0; r < linkedRuns.length; r++) {
                pieces.push(extractTextFromRun(linkedRuns[r], resolveImage));
            }
        }
    }
    return pieces.join('');
}
198
/**
 * Render a `w:tbl` element as a Markdown pipe table.
 *
 * The first row becomes the header. Shorter rows are padded with blank cells
 * so every row has the same width. Paragraphs inside a cell are joined with
 * `<br>`; this includes the paragraphs of any table nested in that cell.
 *
 * Rows and cells are collected from this table only. A plain descendant
 * search would also return the rows/cells of nested tables and emit them a
 * second time as rows/cells of the outer table.
 *
 * @param table The `w:tbl` element
 * @param resolveImage Forwarded to extractParagraphText
 * @returns The Markdown table, or null when the table has no cells
 */
function convertTableToMarkdown(table, resolveImage) {
    // Nearest descendants named `tagName`: matched elements are not searched
    // further and nested tables are skipped entirely.
    const collectNearest = (root, tagName) => {
        const found = [];
        const visit = (node) => {
            for (const child of getElementChildren(node)) {
                if (child.nodeName === tagName) {
                    found.push(child);
                }
                else if (child.nodeName !== 'w:tbl') {
                    visit(child);
                }
            }
        };
        visit(root);
        return found;
    };
    const rows = [];
    for (const row of collectNearest(table, 'w:tr')) {
        const rowCells = [];
        for (const cell of collectNearest(row, 'w:tc')) {
            const paragraphs = cell.getElementsByTagName('w:p');
            const cellTexts = [];
            for (let k = 0; k < paragraphs.length; k++) {
                const text = extractParagraphText(paragraphs[k], resolveImage).trim();
                if (text) {
                    cellTexts.push(text);
                }
            }
            const combined = cellTexts.length > 0 ? cellTexts.join('<br>') : ' ';
            rowCells.push(escapeTableCell(combined));
        }
        if (rowCells.length > 0) {
            rows.push(rowCells);
        }
    }
    if (rows.length === 0)
        return null;
    const maxCols = Math.max(...rows.map(row => row.length));
    for (const row of rows) {
        while (row.length < maxCols) {
            row.push(' ');
        }
    }
    const [header, ...bodyRows] = rows;
    const toLine = (cells) => `| ${cells.join(' | ')} |`;
    const separatorLine = toLine(header.map(() => '---'));
    return [toLine(header), separatorLine, ...bodyRows.map(toLine)].join('\n');
}
237
/**
 * Render the direct children of `w:body` as Markdown blocks.
 *
 * Paragraphs with a heading style become `#`-prefixed headings, other
 * non-empty paragraphs become plain text, and tables become pipe tables.
 * Any other child element is ignored. Blocks are separated by a blank line.
 *
 * @param body The `w:body` element
 * @param resolveImage Forwarded to the paragraph and table converters
 * @returns The Markdown for the whole body
 */
function convertBodyToMarkdown(body, resolveImage) {
    const blocks = [];
    for (const element of getElementChildren(body)) {
        switch (element.nodeName) {
            case 'w:p': {
                const text = extractParagraphText(element, resolveImage).trim();
                if (text) {
                    const level = getHeadingLevelFromParagraph(element);
                    const isHeading = Boolean(level) && level >= 1 && level <= 6;
                    blocks.push(isHeading ? `${'#'.repeat(level)} ${text}` : text);
                }
                break;
            }
            case 'w:tbl': {
                const tableMarkdown = convertTableToMarkdown(element, resolveImage);
                if (tableMarkdown) {
                    blocks.push(tableMarkdown);
                }
                break;
            }
            default:
                break;
        }
    }
    return blocks.join('\n\n');
}
265
/**
 * Convert a DOCX document to Markdown.
 *
 * The archive is opened with PizZip, `word/document.xml` is parsed with
 * xmldom, and the body is rendered block by block. Docxtemplater is only
 * instantiated as a sanity check; if it rejects the file a warning is logged
 * and raw XML parsing continues.
 *
 * @param source Path to DOCX file or http(s) URL
 * @param options Conversion options. Only `includeImages` (default true) is
 *   read here; `preserveFormatting` and `styleMap` are accepted for API
 *   compatibility but have no effect in this implementation.
 * @returns Parsed DOCX result with markdown, metadata, images and sections
 * @throws Error with the prefix "Failed to parse DOCX file:" on any failure
 */
export async function parseDocxToMarkdown(source, options = {}) {
    const { includeImages = true } = options;
    try {
        // Load DOCX file
        const buffer = await loadDocxToBuffer(source);
        // File size is only reported for local files
        let fileSize;
        if (!isUrl(source)) {
            try {
                const stats = await fs.stat(source);
                fileSize = stats.size;
            }
            catch {
                // Size is optional metadata: leave it undefined if stat fails
            }
        }
        const zip = new PizZip(buffer);
        try {
            new Docxtemplater(zip, { paragraphLoop: true, linebreaks: true });
        }
        catch (error) {
            console.warn('Docxtemplater validation failed, continuing with raw XML parsing:', error);
        }
        const documentXml = readZipFileText(zip, 'word/document.xml');
        if (!documentXml) {
            throw new Error('Invalid DOCX file: word/document.xml not found');
        }
        const relsXml = readZipFileText(zip, 'word/_rels/document.xml.rels');
        const relMap = extractRelationshipMap(relsXml);
        // `images` is filled as a side effect of resolving pictures while the
        // body is rendered.
        const images = [];
        const resolveImage = buildImageResolver(zip, relMap, images, includeImages);
        const doc = new DOMParser().parseFromString(documentXml, 'application/xml');
        const body = doc.getElementsByTagName('w:body')[0];
        if (!body) {
            throw new Error('Invalid DOCX file: <w:body> not found');
        }
        const rawMarkdown = convertBodyToMarkdown(body, resolveImage);
        // Extract metadata from DOCX
        const metadata = await extractMetadata(source, buffer, fileSize);
        // Post-process markdown for better formatting
        const markdown = postProcessMarkdown(rawMarkdown);
        // Parse into sections (optional advanced feature)
        const sections = parseIntoSections(markdown, images);
        return {
            markdown,
            metadata,
            images,
            sections
        };
    }
    catch (error) {
        console.error('Error converting DOCX to Markdown:', error);
        throw new Error(`Failed to parse DOCX file: ${error instanceof Error ? error.message : String(error)}`);
    }
}
326
/**
 * Extract core-property metadata from a DOCX file.
 *
 * Reads `docProps/core.xml` from the archive (via JSZip) and pulls out the
 * Dublin Core / core-properties fields with simple tag regexes. Extraction is
 * best-effort: on any failure a warning is logged and whatever was gathered
 * so far (at least `fileSize`) is returned.
 *
 * @param source Original path/URL (not used by this function)
 * @param buffer Raw DOCX bytes
 * @param fileSize Size to report in the result; may be undefined
 * @returns Metadata object; text fields are undefined when the tag is
 *   absent, date fields are undefined when absent or unparseable
 */
async function extractMetadata(source, buffer, fileSize) {
    const metadata = {
        fileSize
    };
    try {
        // DOCX is a ZIP file; core properties live in docProps/core.xml
        const JSZip = require('jszip');
        const zip = await JSZip.loadAsync(buffer);
        const corePropsFile = zip.file('docProps/core.xml');
        if (corePropsFile) {
            const corePropsXml = await corePropsFile.async('string');
            const namedEntities = { lt: '<', gt: '>', quot: '"', apos: "'", amp: '&' };
            // Decode XML character/entity references in one pass so that
            // "&amp;lt;" becomes "&lt;" rather than "<".
            const decodeXmlText = (value) => value.replace(/&(#x[0-9a-fA-F]+|#[0-9]+|lt|gt|quot|apos|amp);/g, (whole, ref) => {
                if (ref[0] !== '#')
                    return namedEntities[ref];
                const code = ref[1] === 'x' ? parseInt(ref.slice(2), 16) : parseInt(ref.slice(1), 10);
                // Leave out-of-range references untouched instead of throwing
                return code <= 0x10ffff ? String.fromCodePoint(code) : whole;
            });
            // Text of the first <prefix:tag> element, or undefined
            const matchTag = (xml, prefix, tag) => {
                const regex = new RegExp(`<${prefix}:${tag}[^>]*>([^<]*)</${prefix}:${tag}>`, 'i');
                const match = xml.match(regex);
                return match ? match[1] : undefined;
            };
            // Try the dc: namespace first, then cp:
            const extractTag = (xml, tag) => {
                const raw = matchTag(xml, 'dc', tag) ?? matchTag(xml, 'cp', tag);
                return raw === undefined ? undefined : decodeXmlText(raw);
            };
            const extractDcmiTerms = (xml, tag) => {
                const raw = matchTag(xml, 'dcterms', tag);
                if (raw === undefined)
                    return undefined;
                // new Date() never throws; an unparseable value yields an
                // Invalid Date, which must not be handed to callers.
                const parsed = new Date(raw);
                return Number.isNaN(parsed.getTime()) ? undefined : parsed;
            };
            metadata.title = extractTag(corePropsXml, 'title');
            metadata.author = extractTag(corePropsXml, 'creator');
            metadata.subject = extractTag(corePropsXml, 'subject');
            metadata.description = extractTag(corePropsXml, 'description');
            metadata.lastModifiedBy = extractTag(corePropsXml, 'lastModifiedBy');
            metadata.revision = extractTag(corePropsXml, 'revision');
            metadata.creationDate = extractDcmiTerms(corePropsXml, 'created');
            metadata.modificationDate = extractDcmiTerms(corePropsXml, 'modified');
        }
    }
    catch (metaError) {
        // Metadata extraction is optional, don't fail if it doesn't work
        console.warn('Could not extract detailed metadata:', metaError);
    }
    return metadata;
}
392
/**
 * Normalise spacing in generated Markdown.
 *
 * - collapses runs of three or more newlines into one blank line
 * - puts a blank line before and after heading lines
 * - puts a blank line before and after fenced-code markers
 * - puts a blank line before the first row and after the last row of a table
 * - trims leading/trailing whitespace
 *
 * @param markdown Raw Markdown text
 * @returns The normalised Markdown
 */
function postProcessMarkdown(markdown) {
    // Clean up excessive newlines
    markdown = markdown.replace(/\n{3,}/g, '\n\n');
    // Ensure proper spacing around headings. The "after" rule is anchored to
    // the start of a line so a "# " in the middle of a sentence does not
    // split the paragraph.
    markdown = markdown.replace(/([^\n])\n(#+\s)/g, '$1\n\n$2');
    markdown = markdown.replace(/^(#+\s[^\n]+)\n([^\n])/gm, '$1\n\n$2');
    // Ensure proper spacing around code blocks
    markdown = markdown.replace(/([^\n])\n```/g, '$1\n\n```');
    markdown = markdown.replace(/```\n([^\n])/g, '```\n\n$1');
    // Ensure proper spacing around tables. The line before the table must not
    // itself start with "|": otherwise the header row would be separated from
    // its "| --- |" row by a blank line and the table would stop rendering.
    markdown = markdown.replace(/^(?!\|)([^\n]+)\n(\|[^\n]+\|)/gm, '$1\n\n$2');
    markdown = markdown.replace(/(\|[^\n]+\|)\n([^\n|])/g, '$1\n\n$2');
    // Trim leading/trailing whitespace
    return markdown.trim();
}
413
/**
 * Split Markdown text into coarse sections, line by line.
 *
 * - A heading line (`#` to `######`) opens a 'heading' section; the
 *   non-blank lines that follow it, up to the next blank line, are kept in
 *   the same section.
 * - A line containing an image reference becomes its own 'image' section.
 * - Consecutive list-item lines form one 'list' section.
 * - Any other non-blank lines are grouped into 'paragraph' sections (table
 *   rows included; no 'table' section is produced here).
 * - A blank line closes the section in progress.
 *
 * @param markdown Markdown text to split
 * @param images Extracted images (not used by this function)
 * @returns Sections in document order
 */
function parseIntoSections(markdown, images) {
    const sections = [];
    let currentSection = null;
    let currentContent = [];
    // Emit the section in progress (if any) and reset the accumulator
    const flush = () => {
        if (currentSection && currentContent.length > 0) {
            currentSection.content = currentContent.join('\n').trim();
            sections.push(currentSection);
        }
        currentSection = null;
        currentContent = [];
    };
    // Close the section in progress and start a new one
    const open = (section, initialLines = []) => {
        flush();
        currentSection = section;
        currentContent = initialLines;
    };
    for (const line of markdown.split('\n')) {
        // Detect headings
        const headingMatch = line.match(/^(#{1,6})\s+(.+)$/);
        if (headingMatch) {
            // The heading line itself is part of the section content
            open({ type: 'heading', level: headingMatch[1].length, content: '' }, [line]);
            continue;
        }
        // Detect images
        if (line.match(/!\[([^\]]*)\]\(([^)]+)\)/)) {
            flush();
            sections.push({ type: 'image', content: line });
            continue;
        }
        // Detect lists
        if (line.match(/^[*\-+]\s/) || line.match(/^\d+\.\s/)) {
            if (!currentSection || currentSection.type !== 'list') {
                open({ type: 'list', content: '' });
            }
            currentContent.push(line);
            continue;
        }
        // Regular paragraph content
        if (line.trim()) {
            if (!currentSection || (currentSection.type !== 'paragraph' && currentSection.type !== 'heading')) {
                open({ type: 'paragraph', content: '' });
            }
            currentContent.push(line);
        }
        else {
            // Empty line - finalize current section
            flush();
        }
    }
    // Save final section
    flush();
    return sections;
}
@@ -0,0 +1,28 @@
1
+ /**
2
+ * Legacy DOCX modification operations.
3
+ *
4
+ * These functions support the older write_file / edit_block paths that
5
+ * modify DOCX via simple operations (replace, insert, delete, style).
6
+ * They are distinct from the new patch-based writeDocxPatched pipeline.
7
+ *
8
+ * Single Responsibility: create / modify DOCX content using the legacy
9
+ * DocxModification interface. Delegates XML parsing and element
10
+ * manipulation to the shared dom.ts module.
11
+ */
12
+ import type { DocxModification } from './types.js';
13
+ /**
14
+ * Open an existing DOCX, apply an ordered list of modifications to
15
+ * word/document.xml, and write the result to outputPath.
16
+ * Every other file in the ZIP (styles, images, rels, …) is preserved.
17
+ */
18
+ export declare function modifyDocxContent(inputPath: string, outputPath: string, modifications: DocxModification[]): Promise<void>;
19
+ /**
20
+ * Replace the entire w:body content of a DOCX with new body XML.
21
+ * Used by the body-XML replacement mode of write_file.
22
+ */
23
+ export declare function replaceBodyXml(inputPath: string, outputPath: string, newBodyXml: string): Promise<void>;
24
+ /**
25
+ * Create a brand-new minimal DOCX from a plain-text string.
26
+ * Double-newlines are treated as paragraph separators. NOTE(review): `content` may also be a DocxModification[]; that form is not described here - confirm against the implementation.
27
+ */
28
+ export declare function writeDocx(outputPath: string, content: string | DocxModification[]): Promise<void>;