@wonderwhy-er/desktop-commander 0.2.34 → 0.2.36
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/dist/handlers/filesystem-handlers.js +58 -11
- package/dist/handlers/history-handlers.d.ts +7 -0
- package/dist/handlers/history-handlers.js +33 -1
- package/dist/server.js +30 -4
- package/dist/tools/docx/builders/html-builder.d.ts +17 -0
- package/dist/tools/docx/builders/html-builder.js +92 -0
- package/dist/tools/docx/builders/image.d.ts +14 -0
- package/dist/tools/docx/builders/image.js +84 -0
- package/dist/tools/docx/builders/index.d.ts +11 -0
- package/dist/tools/docx/builders/index.js +11 -0
- package/dist/tools/docx/builders/markdown-builder.d.ts +2 -0
- package/dist/tools/docx/builders/markdown-builder.js +260 -0
- package/dist/tools/docx/builders/paragraph.d.ts +12 -0
- package/dist/tools/docx/builders/paragraph.js +29 -0
- package/dist/tools/docx/builders/table.d.ts +8 -0
- package/dist/tools/docx/builders/table.js +94 -0
- package/dist/tools/docx/builders/utils.d.ts +5 -0
- package/dist/tools/docx/builders/utils.js +18 -0
- package/dist/tools/docx/constants.d.ts +32 -0
- package/dist/tools/docx/constants.js +61 -0
- package/dist/tools/docx/converters/markdown-to-html.d.ts +17 -0
- package/dist/tools/docx/converters/markdown-to-html.js +111 -0
- package/dist/tools/docx/create.d.ts +21 -0
- package/dist/tools/docx/create.js +386 -0
- package/dist/tools/docx/dom.d.ts +66 -0
- package/dist/tools/docx/dom.js +228 -0
- package/dist/tools/docx/errors.d.ts +28 -0
- package/dist/tools/docx/errors.js +48 -0
- package/dist/tools/docx/extractors/images.d.ts +14 -0
- package/dist/tools/docx/extractors/images.js +40 -0
- package/dist/tools/docx/extractors/metadata.d.ts +14 -0
- package/dist/tools/docx/extractors/metadata.js +64 -0
- package/dist/tools/docx/extractors/sections.d.ts +14 -0
- package/dist/tools/docx/extractors/sections.js +61 -0
- package/dist/tools/docx/html.d.ts +17 -0
- package/dist/tools/docx/html.js +111 -0
- package/dist/tools/docx/index.d.ts +10 -0
- package/dist/tools/docx/index.js +10 -0
- package/dist/tools/docx/markdown.d.ts +84 -0
- package/dist/tools/docx/markdown.js +507 -0
- package/dist/tools/docx/modify.d.ts +28 -0
- package/dist/tools/docx/modify.js +271 -0
- package/dist/tools/docx/operations/handlers/index.d.ts +39 -0
- package/dist/tools/docx/operations/handlers/index.js +152 -0
- package/dist/tools/docx/operations/html-manipulator.d.ts +24 -0
- package/dist/tools/docx/operations/html-manipulator.js +352 -0
- package/dist/tools/docx/operations/index.d.ts +14 -0
- package/dist/tools/docx/operations/index.js +61 -0
- package/dist/tools/docx/operations/operation-handlers.d.ts +3 -0
- package/dist/tools/docx/operations/operation-handlers.js +67 -0
- package/dist/tools/docx/operations/preprocessor.d.ts +14 -0
- package/dist/tools/docx/operations/preprocessor.js +44 -0
- package/dist/tools/docx/operations/xml-replacer.d.ts +9 -0
- package/dist/tools/docx/operations/xml-replacer.js +35 -0
- package/dist/tools/docx/operations.d.ts +13 -0
- package/dist/tools/docx/operations.js +13 -0
- package/dist/tools/docx/ops/delete-paragraph-at-body-index.d.ts +11 -0
- package/dist/tools/docx/ops/delete-paragraph-at-body-index.js +23 -0
- package/dist/tools/docx/ops/header-replace-text-exact.d.ts +13 -0
- package/dist/tools/docx/ops/header-replace-text-exact.js +55 -0
- package/dist/tools/docx/ops/index.d.ts +17 -0
- package/dist/tools/docx/ops/index.js +67 -0
- package/dist/tools/docx/ops/insert-image-after-text.d.ts +24 -0
- package/dist/tools/docx/ops/insert-image-after-text.js +128 -0
- package/dist/tools/docx/ops/insert-paragraph-after-text.d.ts +12 -0
- package/dist/tools/docx/ops/insert-paragraph-after-text.js +74 -0
- package/dist/tools/docx/ops/insert-table-after-text.d.ts +19 -0
- package/dist/tools/docx/ops/insert-table-after-text.js +57 -0
- package/dist/tools/docx/ops/replace-hyperlink-url.d.ts +12 -0
- package/dist/tools/docx/ops/replace-hyperlink-url.js +37 -0
- package/dist/tools/docx/ops/replace-paragraph-at-body-index.d.ts +9 -0
- package/dist/tools/docx/ops/replace-paragraph-at-body-index.js +25 -0
- package/dist/tools/docx/ops/replace-paragraph-text-exact.d.ts +9 -0
- package/dist/tools/docx/ops/replace-paragraph-text-exact.js +21 -0
- package/dist/tools/docx/ops/set-color-for-paragraph-exact.d.ts +8 -0
- package/dist/tools/docx/ops/set-color-for-paragraph-exact.js +23 -0
- package/dist/tools/docx/ops/set-color-for-style.d.ts +9 -0
- package/dist/tools/docx/ops/set-color-for-style.js +27 -0
- package/dist/tools/docx/ops/set-paragraph-style-at-body-index.d.ts +8 -0
- package/dist/tools/docx/ops/set-paragraph-style-at-body-index.js +57 -0
- package/dist/tools/docx/ops/table-set-cell-text.d.ts +9 -0
- package/dist/tools/docx/ops/table-set-cell-text.js +72 -0
- package/dist/tools/docx/parsers/image-extractor.d.ts +18 -0
- package/dist/tools/docx/parsers/image-extractor.js +61 -0
- package/dist/tools/docx/parsers/index.d.ts +9 -0
- package/dist/tools/docx/parsers/index.js +9 -0
- package/dist/tools/docx/parsers/paragraph-parser.d.ts +2 -0
- package/dist/tools/docx/parsers/paragraph-parser.js +88 -0
- package/dist/tools/docx/parsers/table-parser.d.ts +9 -0
- package/dist/tools/docx/parsers/table-parser.js +72 -0
- package/dist/tools/docx/parsers/xml-parser.d.ts +25 -0
- package/dist/tools/docx/parsers/xml-parser.js +71 -0
- package/dist/tools/docx/parsers/zip-reader.d.ts +23 -0
- package/dist/tools/docx/parsers/zip-reader.js +52 -0
- package/dist/tools/docx/read.d.ts +27 -0
- package/dist/tools/docx/read.js +188 -0
- package/dist/tools/docx/relationships.d.ts +22 -0
- package/dist/tools/docx/relationships.js +76 -0
- package/dist/tools/docx/structure.d.ts +25 -0
- package/dist/tools/docx/structure.js +102 -0
- package/dist/tools/docx/styled-html-parser.d.ts +23 -0
- package/dist/tools/docx/styled-html-parser.js +1262 -0
- package/dist/tools/docx/types.d.ts +184 -0
- package/dist/tools/docx/types.js +5 -0
- package/dist/tools/docx/utils/escaping.d.ts +13 -0
- package/dist/tools/docx/utils/escaping.js +26 -0
- package/dist/tools/docx/utils/images.d.ts +9 -0
- package/dist/tools/docx/utils/images.js +26 -0
- package/dist/tools/docx/utils/index.d.ts +12 -0
- package/dist/tools/docx/utils/index.js +17 -0
- package/dist/tools/docx/utils/markdown.d.ts +13 -0
- package/dist/tools/docx/utils/markdown.js +32 -0
- package/dist/tools/docx/utils/paths.d.ts +15 -0
- package/dist/tools/docx/utils/paths.js +27 -0
- package/dist/tools/docx/utils/versioning.d.ts +25 -0
- package/dist/tools/docx/utils/versioning.js +55 -0
- package/dist/tools/docx/utils.d.ts +101 -0
- package/dist/tools/docx/utils.js +299 -0
- package/dist/tools/docx/validate.d.ts +33 -0
- package/dist/tools/docx/validate.js +49 -0
- package/dist/tools/docx/validators.d.ts +13 -0
- package/dist/tools/docx/validators.js +40 -0
- package/dist/tools/docx/write.d.ts +17 -0
- package/dist/tools/docx/write.js +88 -0
- package/dist/tools/docx/zip.d.ts +21 -0
- package/dist/tools/docx/zip.js +35 -0
- package/dist/tools/schemas.d.ts +13 -0
- package/dist/tools/schemas.js +5 -0
- package/dist/types.d.ts +10 -0
- package/dist/ui/contracts.d.ts +14 -0
- package/dist/ui/contracts.js +18 -0
- package/dist/ui/file-preview/index.html +16 -0
- package/dist/ui/file-preview/preview-runtime.js +13977 -0
- package/dist/ui/file-preview/shared/preview-file-types.d.ts +5 -0
- package/dist/ui/file-preview/shared/preview-file-types.js +57 -0
- package/dist/ui/file-preview/src/app.d.ts +4 -0
- package/dist/ui/file-preview/src/app.js +800 -0
- package/dist/ui/file-preview/src/components/code-viewer.d.ts +6 -0
- package/dist/ui/file-preview/src/components/code-viewer.js +73 -0
- package/dist/ui/file-preview/src/components/highlighting.d.ts +2 -0
- package/dist/ui/file-preview/src/components/highlighting.js +54 -0
- package/dist/ui/file-preview/src/components/html-renderer.d.ts +9 -0
- package/dist/ui/file-preview/src/components/html-renderer.js +63 -0
- package/dist/ui/file-preview/src/components/markdown-renderer.d.ts +1 -0
- package/dist/ui/file-preview/src/components/markdown-renderer.js +21 -0
- package/dist/ui/file-preview/src/components/toolbar.d.ts +6 -0
- package/dist/ui/file-preview/src/components/toolbar.js +75 -0
- package/dist/ui/file-preview/src/image-preview.d.ts +3 -0
- package/dist/ui/file-preview/src/image-preview.js +21 -0
- package/dist/ui/file-preview/src/main.d.ts +1 -0
- package/dist/ui/file-preview/src/main.js +5 -0
- package/dist/ui/file-preview/src/types.d.ts +1 -0
- package/dist/ui/file-preview/src/types.js +1 -0
- package/dist/ui/file-preview/styles.css +764 -0
- package/dist/ui/resources.d.ts +21 -0
- package/dist/ui/resources.js +72 -0
- package/dist/ui/shared/escape-html.d.ts +4 -0
- package/dist/ui/shared/escape-html.js +11 -0
- package/dist/ui/shared/host-lifecycle.d.ts +16 -0
- package/dist/ui/shared/host-lifecycle.js +35 -0
- package/dist/ui/shared/rpc-client.d.ts +14 -0
- package/dist/ui/shared/rpc-client.js +72 -0
- package/dist/ui/shared/theme-adaptation.d.ts +10 -0
- package/dist/ui/shared/theme-adaptation.js +118 -0
- package/dist/ui/shared/tool-header.d.ts +9 -0
- package/dist/ui/shared/tool-header.js +25 -0
- package/dist/ui/shared/tool-shell.d.ts +16 -0
- package/dist/ui/shared/tool-shell.js +65 -0
- package/dist/ui/shared/widget-state.d.ts +28 -0
- package/dist/ui/shared/widget-state.js +60 -0
- package/dist/utils/capture.d.ts +1 -0
- package/dist/utils/capture.js +10 -4
- package/dist/utils/files/docx.d.ts +34 -0
- package/dist/utils/files/docx.js +145 -0
- package/dist/utils/files/text.js +9 -1
- package/dist/version.d.ts +1 -1
- package/dist/version.js +1 -1
- package/package.json +5 -2
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* DOCX to Markdown Conversion
|
|
3
|
+
* Uses Docxtemplater + XML parsing for reading Word documents
|
|
4
|
+
*/
|
|
5
|
+
/**
|
|
6
|
+
* DOCX metadata structure
|
|
7
|
+
*/
|
|
8
|
+
export interface DocxMetadata {
|
|
9
|
+
/** Document title from core properties */
|
|
10
|
+
title?: string;
|
|
11
|
+
/** Document author */
|
|
12
|
+
author?: string;
|
|
13
|
+
/** Document creator */
|
|
14
|
+
creator?: string;
|
|
15
|
+
/** Document subject */
|
|
16
|
+
subject?: string;
|
|
17
|
+
/** Document description */
|
|
18
|
+
description?: string;
|
|
19
|
+
/** Creation date */
|
|
20
|
+
creationDate?: Date;
|
|
21
|
+
/** Last modification date */
|
|
22
|
+
modificationDate?: Date;
|
|
23
|
+
/** Last modified by */
|
|
24
|
+
lastModifiedBy?: string;
|
|
25
|
+
/** Document revision number */
|
|
26
|
+
revision?: string;
|
|
27
|
+
/** File size in bytes */
|
|
28
|
+
fileSize?: number;
|
|
29
|
+
}
|
|
30
|
+
/**
|
|
31
|
+
* Embedded image information
|
|
32
|
+
*/
|
|
33
|
+
export interface DocxImage {
|
|
34
|
+
/** Unique identifier for the image */
|
|
35
|
+
id: string;
|
|
36
|
+
/** Base64-encoded image data */
|
|
37
|
+
data: string;
|
|
38
|
+
/** MIME type (e.g., "image/png", "image/jpeg") */
|
|
39
|
+
mimeType: string;
|
|
40
|
+
/** Alt text if available */
|
|
41
|
+
altText?: string;
|
|
42
|
+
/** Original size in bytes */
|
|
43
|
+
originalSize?: number;
|
|
44
|
+
}
|
|
45
|
+
/**
|
|
46
|
+
* DOCX section/paragraph structure
|
|
47
|
+
*/
|
|
48
|
+
export interface DocxSection {
|
|
49
|
+
/** Section type: heading, paragraph, list, table, or image */
|
|
50
|
+
type: 'heading' | 'paragraph' | 'list' | 'table' | 'image';
|
|
51
|
+
/** Section content as markdown */
|
|
52
|
+
content: string;
|
|
53
|
+
/** Heading level if type is heading */
|
|
54
|
+
level?: number;
|
|
55
|
+
/** Associated images if any */
|
|
56
|
+
images?: DocxImage[];
|
|
57
|
+
}
|
|
58
|
+
/**
|
|
59
|
+
* Complete DOCX parse result
|
|
60
|
+
*/
|
|
61
|
+
export interface DocxParseResult {
|
|
62
|
+
/** Document content as markdown */
|
|
63
|
+
markdown: string;
|
|
64
|
+
/** Document metadata */
|
|
65
|
+
metadata: DocxMetadata;
|
|
66
|
+
/** Extracted images */
|
|
67
|
+
images: DocxImage[];
|
|
68
|
+
/** Structured sections (optional, for advanced parsing) */
|
|
69
|
+
sections?: DocxSection[];
|
|
70
|
+
}
|
|
71
|
+
/**
|
|
72
|
+
* Convert DOCX to Markdown using Docxtemplater + XML parsing
|
|
73
|
+
* @param source Path to DOCX file or URL
|
|
74
|
+
* @param options Conversion options
|
|
75
|
+
* @returns Parsed DOCX result with markdown and metadata
|
|
76
|
+
*/
|
|
77
|
+
export declare function parseDocxToMarkdown(source: string, options?: {
|
|
78
|
+
/** Extract images as base64 */
|
|
79
|
+
includeImages?: boolean;
|
|
80
|
+
/** Preserve inline formatting (bold, italic). NOTE(review): accepted, but the implementation visible in markdown.js never reads it. */
|
|
81
|
+
preserveFormatting?: boolean;
|
|
82
|
+
/** Custom style mapping. NOTE(review): accepted, but the implementation visible in markdown.js never reads it. */
|
|
83
|
+
styleMap?: string[];
|
|
84
|
+
}): Promise<DocxParseResult>;
|
|
@@ -0,0 +1,507 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* DOCX to Markdown Conversion
|
|
3
|
+
* Uses Docxtemplater + XML parsing for reading Word documents
|
|
4
|
+
*/
|
|
5
|
+
import fs from 'fs/promises';
|
|
6
|
+
import path from 'path';
|
|
7
|
+
import { createRequire } from 'module';
|
|
8
|
+
const require = createRequire(import.meta.url);
|
|
9
|
+
const PizZip = require('pizzip');
|
|
10
|
+
const Docxtemplater = require('docxtemplater');
|
|
11
|
+
const { DOMParser } = require('@xmldom/xmldom');
|
|
12
|
+
/**
 * Report whether `source` names a remote resource, i.e. whether it begins
 * with the `http://` or `https://` scheme prefix (comparison is case-sensitive).
 */
const isUrl = (source) => ['http://', 'https://'].some((scheme) => source.startsWith(scheme));
|
|
16
|
+
/**
 * Load a DOCX document into a Buffer.
 *
 * @param source Local file path, or an http(s) URL (as decided by isUrl).
 * @returns Buffer holding the raw bytes of the document.
 * @throws Error when a URL answers with a non-2xx HTTP status. File-system
 *         and network errors propagate unchanged.
 */
async function loadDocxToBuffer(source) {
    if (!isUrl(source)) {
        return fs.readFile(source);
    }
    const response = await fetch(source);
    if (!response.ok) {
        // Without this check the body of an error page (404, 500, ...) would be
        // handed to the ZIP parser and fail later with a confusing message.
        throw new Error(`Failed to download DOCX from ${source}: HTTP ${response.status} ${response.statusText}`);
    }
    return Buffer.from(await response.arrayBuffer());
}
|
|
29
|
+
function readZipFileText(zip, filePath) {
|
|
30
|
+
const file = zip.file(filePath);
|
|
31
|
+
if (!file)
|
|
32
|
+
return null;
|
|
33
|
+
if (typeof file.asText === 'function') {
|
|
34
|
+
return file.asText();
|
|
35
|
+
}
|
|
36
|
+
if (typeof file.asBinary === 'function') {
|
|
37
|
+
return Buffer.from(file.asBinary(), 'binary').toString('utf8');
|
|
38
|
+
}
|
|
39
|
+
return null;
|
|
40
|
+
}
|
|
41
|
+
function readZipFileBuffer(zip, filePath) {
|
|
42
|
+
const file = zip.file(filePath);
|
|
43
|
+
if (!file)
|
|
44
|
+
return null;
|
|
45
|
+
if (typeof file.asUint8Array === 'function') {
|
|
46
|
+
return Buffer.from(file.asUint8Array());
|
|
47
|
+
}
|
|
48
|
+
if (typeof file.asNodeBuffer === 'function') {
|
|
49
|
+
return file.asNodeBuffer();
|
|
50
|
+
}
|
|
51
|
+
if (typeof file.asBinary === 'function') {
|
|
52
|
+
return Buffer.from(file.asBinary(), 'binary');
|
|
53
|
+
}
|
|
54
|
+
return null;
|
|
55
|
+
}
|
|
56
|
+
/**
 * Map a relationship target to an image MIME type using its file extension.
 *
 * @param target Path or file name; only the (lower-cased) extension matters.
 * @returns The matching image MIME type, or 'application/octet-stream' for
 *          unknown or missing extensions.
 */
function getMimeTypeForTarget(target) {
    const extension = path.extname(target).toLowerCase();
    switch (extension) {
        case '.png':
            return 'image/png';
        case '.jpg':
        case '.jpeg':
            return 'image/jpeg';
        case '.gif':
            return 'image/gif';
        case '.bmp':
            return 'image/bmp';
        case '.webp':
            return 'image/webp';
        case '.svg':
            return 'image/svg+xml';
        default:
            return 'application/octet-stream';
    }
}
|
|
69
|
+
/**
 * Make text safe for a single markdown table cell: every pipe is
 * backslash-escaped and every line break (LF or CRLF) becomes `<br>`.
 */
function escapeTableCell(text) {
    const pipesEscaped = text.split('|').join('\\|');
    return pipesEscaped.split(/\r?\n/).join('<br>');
}
|
|
72
|
+
/**
 * Collect the direct children of `node` that are element nodes
 * (nodeType === 1), in document order. Text, comment and other node kinds
 * are skipped.
 */
function getElementChildren(node) {
    const ELEMENT_NODE = 1;
    // childNodes is only required to be array-like (length + numeric indices).
    return Array.prototype.filter.call(node.childNodes, (candidate) => candidate.nodeType === ELEMENT_NODE);
}
|
|
82
|
+
function getAttributeValue(node, name) {
|
|
83
|
+
return node.getAttribute(name) || node.getAttribute(`w:${name}`) || null;
|
|
84
|
+
}
|
|
85
|
+
/**
 * Derive a markdown heading level from a paragraph's style.
 *
 * Looks up the first `w:pPr` / `w:pStyle` under the paragraph and matches
 * its `val` attribute against "heading" followed by a digit 1-6
 * (case-insensitive, optional whitespace in between).
 *
 * @returns The level (1-6), or null when the paragraph has no such style.
 */
function getHeadingLevelFromParagraph(paragraph) {
    const properties = paragraph.getElementsByTagName('w:pPr')[0];
    const styleNode = properties ? properties.getElementsByTagName('w:pStyle')[0] : undefined;
    const styleName = styleNode ? getAttributeValue(styleNode, 'val') : null;
    const found = styleName ? /heading\s*([1-6])/i.exec(styleName) : null;
    return found ? Number(found[1]) : null;
}
|
|
100
|
+
/**
 * Parse a relationships part (`*.rels` XML) into a lookup table.
 *
 * @param relsXml XML text of the relationships part; may be null or empty.
 * @returns Map from relationship Id to `{ target, type }`. Relationships
 *          lacking an Id or a Target are skipped. A missing Type is stored
 *          as an empty string.
 */
function extractRelationshipMap(relsXml) {
    const byId = new Map();
    if (relsXml) {
        const parsed = new DOMParser().parseFromString(relsXml, 'application/xml');
        const nodes = parsed.getElementsByTagName('Relationship');
        for (let index = 0; index < nodes.length; index += 1) {
            const node = nodes[index];
            const id = node.getAttribute('Id');
            const type = node.getAttribute('Type') || '';
            const target = node.getAttribute('Target') || '';
            if (!id || !target) {
                continue;
            }
            byId.set(id, { target, type });
        }
    }
    return byId;
}
|
|
117
|
+
/**
 * Build a function that turns an image relationship id into inline markdown.
 *
 * The returned resolver:
 *  - returns '' when images are disabled, the id is empty, the relationship
 *    is unknown or is not an image relationship, or the image bytes cannot
 *    be found in the archive;
 *  - otherwise returns a markdown image whose URL is a base64 data URI, and
 *    records the image in `images` (once per relationship id).
 *
 * @param zip Archive object passed through to readZipFileBuffer.
 * @param relMap Map of relationship id -> { target, type }.
 * @param images Output array; each newly resolved image is appended to it.
 * @param includeImages When false the resolver always returns ''.
 * @returns (relId) => markdown string
 */
function buildImageResolver(zip, relMap, images, includeImages) {
    const cache = new Map();
    // Previously both success paths returned an empty string, so resolved
    // images were collected but never appeared in the markdown.
    const toMarkdown = (image) => `![image](data:${image.mimeType};base64,${image.data})`;
    // Candidate archive paths for a relationship target, most specific first.
    const candidatePaths = (target) => {
        const legacy = target.startsWith('word/')
            ? target
            : `word/${target.replace(/^\/?/, '')}`;
        // A leading '/' means "relative to the package root"; anything else is
        // relative to the word/ folder that holds document.xml (may contain '..').
        const resolved = target.startsWith('/')
            ? target.slice(1)
            : path.posix.normalize(`word/${target}`);
        return resolved === legacy ? [legacy] : [resolved, legacy];
    };
    return (relId) => {
        if (!includeImages || !relId)
            return '';
        const rel = relMap.get(relId);
        if (!rel || !rel.type.includes('/image'))
            return '';
        const cached = cache.get(relId);
        if (cached) {
            return toMarkdown(cached);
        }
        let buffer = null;
        for (const candidate of candidatePaths(rel.target)) {
            buffer = readZipFileBuffer(zip, candidate);
            if (buffer)
                break;
        }
        if (!buffer)
            return '';
        const image = {
            id: relId,
            data: buffer.toString('base64'),
            mimeType: getMimeTypeForTarget(rel.target),
            originalSize: buffer.length,
        };
        images.push(image);
        cache.set(relId, image);
        return toMarkdown(image);
    };
}
|
|
148
|
+
/**
 * Flatten one run (`w:r`) into text.
 *
 * Handles, in document order:
 *  - `w:t`            -> its text content
 *  - `w:tab`          -> a tab character
 *  - `w:br`, `w:cr`   -> a newline
 *  - `w:drawing`, `w:pict` -> markdown for every referenced image, as
 *    produced by `resolveImage`. DrawingML images are referenced by
 *    `a:blip` (r:embed); VML pictures by `v:imagedata` (r:id).
 * Other child elements are ignored.
 *
 * @param run The run element.
 * @param resolveImage (relId) => markdown string, '' when not resolvable.
 * @returns The run's text with image markdown inlined.
 */
function extractTextFromRun(run, resolveImage) {
    // Gather relationship ids from all descendants with the given tag name.
    const collectRelIds = (container, tagName, attributeNames) => {
        const ids = [];
        const nodes = container.getElementsByTagName(tagName);
        for (let i = 0; i < nodes.length; i++) {
            let relId = null;
            for (const attributeName of attributeNames) {
                relId = nodes[i].getAttribute(attributeName);
                if (relId)
                    break;
            }
            ids.push(relId);
        }
        return ids;
    };
    let text = '';
    for (const child of getElementChildren(run)) {
        switch (child.nodeName) {
            case 'w:t':
                text += child.textContent || '';
                break;
            case 'w:tab':
                text += '\t';
                break;
            case 'w:br':
            case 'w:cr':
                text += '\n';
                break;
            case 'w:drawing':
            case 'w:pict': {
                const relIds = [
                    ...collectRelIds(child, 'a:blip', ['r:embed', 'embed']),
                    // VML pictures never contain a:blip; they point at the image part here.
                    ...collectRelIds(child, 'v:imagedata', ['r:id', 'id']),
                ];
                for (const relId of relIds) {
                    const imageMarkdown = resolveImage(relId);
                    if (imageMarkdown) {
                        text += imageMarkdown;
                    }
                }
                break;
            }
            default:
                break;
        }
    }
    return text;
}
|
|
179
|
+
/**
 * Flatten a paragraph (`w:p`) into text.
 *
 * Only two kinds of direct children contribute: runs (`w:r`) and hyperlinks
 * (`w:hyperlink`, whose descendant runs are flattened; the link target is
 * not emitted). Everything else is skipped.
 *
 * @param paragraph The paragraph element.
 * @param resolveImage Image resolver forwarded to extractTextFromRun.
 * @returns Concatenated text of the contributing runs, untrimmed.
 */
function extractParagraphText(paragraph, resolveImage) {
    const pieces = [];
    for (const node of getElementChildren(paragraph)) {
        if (node.nodeName === 'w:r') {
            pieces.push(extractTextFromRun(node, resolveImage));
        }
        else if (node.nodeName === 'w:hyperlink') {
            const linkRuns = node.getElementsByTagName('w:r');
            for (let index = 0; index < linkRuns.length; index += 1) {
                pieces.push(extractTextFromRun(linkRuns[index], resolveImage));
            }
        }
    }
    return pieces.join('');
}
|
|
198
|
+
/**
 * Render a table (`w:tbl`) as a markdown pipe table.
 *
 * The first row becomes the header. Shorter rows are padded with blank
 * cells so every row has the same column count. Paragraphs inside a cell
 * are joined with `<br>`.
 *
 * Nested tables: only rows and cells that belong to THIS table form the
 * grid. The text of a table nested inside a cell is flattened into that
 * cell instead of leaking out as extra rows and columns of the outer table.
 *
 * @param table The table element.
 * @param resolveImage Image resolver forwarded to extractParagraphText.
 * @returns Markdown text, or null when the table has no rows with cells.
 */
function convertTableToMarkdown(table, resolveImage) {
    // Nearest ancestor of `node` with the given tag name, or null.
    const nearestAncestor = (node, tagName) => {
        let current = node.parentNode;
        while (current && current.nodeName !== tagName) {
            current = current.parentNode;
        }
        return current || null;
    };
    // Descendants named `tagName` whose nearest `ownerTag` ancestor is `owner`.
    // This keeps rows/cells wrapped in content controls and excludes nested tables.
    const ownDescendants = (owner, tagName, ownerTag) => {
        const matches = owner.getElementsByTagName(tagName);
        const own = [];
        for (let i = 0; i < matches.length; i++) {
            if (nearestAncestor(matches[i], ownerTag) === owner) {
                own.push(matches[i]);
            }
        }
        return own;
    };
    const rows = [];
    for (const row of ownDescendants(table, 'w:tr', 'w:tbl')) {
        const rowCells = [];
        for (const cell of ownDescendants(row, 'w:tc', 'w:tr')) {
            // Deliberately descendant-wide: text of nested tables stays in this cell.
            const paragraphs = cell.getElementsByTagName('w:p');
            const cellTexts = [];
            for (let k = 0; k < paragraphs.length; k++) {
                const text = extractParagraphText(paragraphs[k], resolveImage).trim();
                if (text) {
                    cellTexts.push(text);
                }
            }
            const combined = cellTexts.length > 0 ? cellTexts.join('<br>') : ' ';
            rowCells.push(escapeTableCell(combined));
        }
        if (rowCells.length > 0) {
            rows.push(rowCells);
        }
    }
    if (rows.length === 0)
        return null;
    const maxCols = Math.max(...rows.map(row => row.length));
    for (const row of rows) {
        while (row.length < maxCols) {
            row.push(' ');
        }
    }
    const [header, ...bodyRows] = rows;
    const toLine = (cells) => `| ${cells.join(' | ')} |`;
    const separatorLine = toLine(header.map(() => '---'));
    return [toLine(header), separatorLine, ...bodyRows.map(toLine)].join('\n');
}
|
|
237
|
+
/**
 * Render the document body (`w:body`) as markdown.
 *
 * Only direct children are visited: paragraphs become plain text or `#`
 * headings (when their style yields a level 1-6), tables become pipe
 * tables. Empty paragraphs and empty tables are dropped. Blocks are
 * separated by one blank line.
 *
 * @param body The body element.
 * @param resolveImage Image resolver forwarded to the text extractors.
 * @returns The markdown for the whole body.
 */
function convertBodyToMarkdown(body, resolveImage) {
    const rendered = [];
    getElementChildren(body).forEach((element) => {
        if (element.nodeName === 'w:p') {
            const content = extractParagraphText(element, resolveImage).trim();
            if (!content) {
                return;
            }
            const level = getHeadingLevelFromParagraph(element);
            const isHeading = Boolean(level) && level >= 1 && level <= 6;
            rendered.push(isHeading ? `${'#'.repeat(level)} ${content}` : content);
        }
        else if (element.nodeName === 'w:tbl') {
            const tableMarkdown = convertTableToMarkdown(element, resolveImage);
            if (tableMarkdown) {
                rendered.push(tableMarkdown);
            }
        }
    });
    return rendered.join('\n\n');
}
|
|
265
|
+
/**
 * Convert a DOCX document to Markdown by parsing word/document.xml.
 *
 * @param source Path to a DOCX file, or an http(s) URL.
 * @param options Conversion options. Only `includeImages` (default true) is
 *        read here; `preserveFormatting` and `styleMap` are accepted for
 *        interface compatibility but have no effect in this function.
 * @returns `{ markdown, metadata, images, sections }`.
 * @throws Error prefixed with "Failed to parse DOCX file: " for any failure
 *         (load error, missing word/document.xml, missing <w:body>, ...).
 */
export async function parseDocxToMarkdown(source, options = {}) {
    const { includeImages = true } = options;
    try {
        const buffer = await loadDocxToBuffer(source);
        // The loaded byte count is the document size for files and URLs alike,
        // so no separate stat call is needed.
        const fileSize = buffer.length;
        const zip = new PizZip(buffer);
        try {
            // Constructed only as a validation pass; the instance itself is not used.
            new Docxtemplater(zip, { paragraphLoop: true, linebreaks: true });
        }
        catch (error) {
            console.warn('Docxtemplater validation failed, continuing with raw XML parsing:', error);
        }
        const documentXml = readZipFileText(zip, 'word/document.xml');
        if (!documentXml) {
            throw new Error('Invalid DOCX file: word/document.xml not found');
        }
        const relsXml = readZipFileText(zip, 'word/_rels/document.xml.rels');
        const relMap = extractRelationshipMap(relsXml);
        const images = [];
        const resolveImage = buildImageResolver(zip, relMap, images, includeImages);
        const doc = new DOMParser().parseFromString(documentXml, 'application/xml');
        const body = doc.getElementsByTagName('w:body')[0];
        if (!body) {
            throw new Error('Invalid DOCX file: <w:body> not found');
        }
        const rawMarkdown = convertBodyToMarkdown(body, resolveImage);
        const metadata = await extractMetadata(source, buffer, fileSize);
        // Normalise blank lines around headings, code fences and tables.
        const markdown = postProcessMarkdown(rawMarkdown);
        const sections = parseIntoSections(markdown, images);
        return {
            markdown,
            metadata,
            images,
            sections
        };
    }
    catch (error) {
        console.error('Error converting DOCX to Markdown:', error);
        throw new Error(`Failed to parse DOCX file: ${error instanceof Error ? error.message : String(error)}`);
    }
}
|
|
326
|
+
/**
 * Extract document metadata from docProps/core.xml.
 *
 * Extraction is best-effort: if the archive cannot be opened or the core
 * properties are missing, a warning is logged (on failure) and the result
 * contains only `fileSize`.
 *
 * @param source Original path/URL; currently unused, kept for the signature.
 * @param buffer Raw bytes of the DOCX archive.
 * @param fileSize Size in bytes, copied into the result as-is.
 * @returns Metadata object. String fields are XML-entity decoded; date
 *          fields are set only when the stored value parses to a valid date.
 */
async function extractMetadata(source, buffer, fileSize) {
    const metadata = { fileSize };
    const NAMED_ENTITIES = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'" };
    // Decode the predefined XML entities and numeric character references.
    const decodeXmlText = (value) => value.replace(/&(#x[0-9a-fA-F]+|#[0-9]+|amp|lt|gt|quot|apos);/g, (whole, entity) => {
        if (entity[0] !== '#') {
            return NAMED_ENTITIES[entity];
        }
        const codePoint = entity[1] === 'x'
            ? parseInt(entity.slice(2), 16)
            : parseInt(entity.slice(1), 10);
        // Leave out-of-range references untouched instead of throwing.
        return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
    });
    // Raw text of <prefix:tag>...</prefix:tag>, or undefined when absent.
    const readElement = (xml, prefix, tag) => {
        const match = xml.match(new RegExp(`<${prefix}:${tag}[^>]*>([^<]*)</${prefix}:${tag}>`, 'i'));
        return match ? match[1] : undefined;
    };
    // Text property: looked up in the dc: namespace first, then cp:.
    const extractTag = (xml, tag) => {
        const raw = readElement(xml, 'dc', tag) ?? readElement(xml, 'cp', tag);
        return raw === undefined ? undefined : decodeXmlText(raw);
    };
    // Date property from the dcterms: namespace.
    const extractDcmiTerms = (xml, tag) => {
        const raw = readElement(xml, 'dcterms', tag);
        if (raw === undefined) {
            return undefined;
        }
        // new Date() never throws on bad input; it yields an Invalid Date instead.
        const parsed = new Date(raw);
        return Number.isNaN(parsed.getTime()) ? undefined : parsed;
    };
    try {
        // DOCX is a ZIP archive; the core properties live in docProps/core.xml.
        const JSZip = require('jszip');
        const zip = await JSZip.loadAsync(buffer);
        const corePropsFile = zip.file('docProps/core.xml');
        if (corePropsFile) {
            const corePropsXml = await corePropsFile.async('string');
            metadata.title = extractTag(corePropsXml, 'title');
            metadata.author = extractTag(corePropsXml, 'creator');
            // The core properties store the author as dc:creator; expose it under both names.
            metadata.creator = metadata.author;
            metadata.subject = extractTag(corePropsXml, 'subject');
            metadata.description = extractTag(corePropsXml, 'description');
            metadata.lastModifiedBy = extractTag(corePropsXml, 'lastModifiedBy');
            metadata.revision = extractTag(corePropsXml, 'revision');
            metadata.creationDate = extractDcmiTerms(corePropsXml, 'created');
            metadata.modificationDate = extractDcmiTerms(corePropsXml, 'modified');
        }
    }
    catch (metaError) {
        // Metadata extraction is optional, don't fail if it doesn't work
        console.warn('Could not extract detailed metadata:', metaError);
    }
    return metadata;
}
|
|
392
|
+
/**
 * Normalise blank-line spacing in generated markdown.
 *
 * Steps, in order:
 *  1. collapse runs of three or more newlines to one blank line;
 *  2. ensure a blank line before and after heading lines;
 *  3. ensure a blank line around ``` fences (regex-based, as before);
 *  4. ensure a blank line between a table and adjacent non-table text,
 *     without ever separating the rows of one table;
 *  5. trim leading and trailing whitespace.
 *
 * @param markdown Markdown text.
 * @returns The normalised markdown.
 */
function postProcessMarkdown(markdown) {
    // Clean up excessive newlines
    markdown = markdown.replace(/\n{3,}/g, '\n\n');
    // Ensure proper spacing around headings
    markdown = markdown.replace(/([^\n])\n(#+\s)/g, '$1\n\n$2');
    markdown = markdown.replace(/(#+\s[^\n]+)\n([^\n])/g, '$1\n\n$2');
    // Ensure proper spacing around code blocks
    markdown = markdown.replace(/([^\n])\n```/g, '$1\n\n```');
    markdown = markdown.replace(/```\n([^\n])/g, '```\n\n$1');
    // Ensure proper spacing around tables. This is done line by line: a regex
    // of the form "<any char>\n<table row>" also matches between two rows of
    // the same table and would split the header from its separator row.
    const isTableRow = (line) => /^\|.+\|\s*$/.test(line);
    const spaced = [];
    for (const line of markdown.split('\n')) {
        const previous = spaced.length > 0 ? spaced[spaced.length - 1] : '';
        const atTableBoundary = isTableRow(previous) !== isTableRow(line);
        if (atTableBoundary && previous.trim() !== '' && line.trim() !== '') {
            spaced.push('');
        }
        spaced.push(line);
    }
    // Trim leading/trailing whitespace
    return spaced.join('\n').trim();
}
|
|
413
|
+
/**
 * Split markdown into a flat list of typed sections.
 *
 * Lines are classified in this priority order:
 *  - heading (`#` .. `######`): starts a 'heading' section; following
 *    non-blank plain lines are appended to it;
 *  - a line containing markdown image syntax: becomes its own 'image' section;
 *  - list item (`*`, `-`, `+` or `1.`): grouped into a 'list' section;
 *  - any other non-blank line: grouped into a 'paragraph' section;
 *  - blank line: closes the section being collected.
 *
 * @param markdown Markdown text to split.
 * @param images Extracted images; not consulted by this function.
 * @returns Sections in document order, each with trimmed `content`.
 */
function parseIntoSections(markdown, images) {
    const result = [];
    let active = null;
    let buffered = [];
    // Write the buffered lines into the active section and emit it.
    const commit = () => {
        active.content = buffered.join('\n').trim();
        result.push(active);
    };
    const commitIfFilled = () => {
        if (active && buffered.length > 0) {
            commit();
        }
    };
    const reset = () => {
        active = null;
        buffered = [];
    };
    markdown.split('\n').forEach((line) => {
        const heading = /^(#{1,6})\s+(.+)$/.exec(line);
        if (heading) {
            if (active) {
                commit();
            }
            active = {
                type: 'heading',
                level: heading[1].length,
                content: '' // filled in on commit
            };
            buffered = [line];
            return;
        }
        if (/!\[([^\]]*)\]\(([^)]+)\)/.test(line)) {
            commitIfFilled();
            result.push({
                type: 'image',
                content: line
            });
            reset();
            return;
        }
        const isListItem = /^[*\-+]\s/.test(line) || /^\d+\.\s/.test(line);
        if (isListItem) {
            if (!active || active.type !== 'list') {
                commitIfFilled();
                active = {
                    type: 'list',
                    content: ''
                };
                buffered = [];
            }
            buffered.push(line);
            return;
        }
        if (line.trim()) {
            const continuesText = active && (active.type === 'paragraph' || active.type === 'heading');
            if (!continuesText) {
                commitIfFilled();
                active = {
                    type: 'paragraph',
                    content: ''
                };
                buffered = [];
            }
            buffered.push(line);
            return;
        }
        // Blank line: close whatever is being collected.
        if (buffered.length > 0) {
            if (active) {
                commit();
            }
            reset();
        }
    });
    commitIfFilled();
    return result;
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Legacy DOCX modification operations.
|
|
3
|
+
*
|
|
4
|
+
* These functions support the older write_file / edit_block paths that
|
|
5
|
+
* modify DOCX via simple operations (replace, insert, delete, style).
|
|
6
|
+
* They are distinct from the new patch-based writeDocxPatched pipeline.
|
|
7
|
+
*
|
|
8
|
+
* Single Responsibility: create / modify DOCX content using the legacy
|
|
9
|
+
* DocxModification interface. Delegates XML parsing and element
|
|
10
|
+
* manipulation to the shared dom.ts module.
|
|
11
|
+
*/
|
|
12
|
+
import type { DocxModification } from './types.js';
|
|
13
|
+
/**
|
|
14
|
+
* Open an existing DOCX, apply an ordered list of modifications to
|
|
15
|
+
* word/document.xml, and write the result to outputPath.
|
|
16
|
+
* Every other file in the ZIP (styles, images, rels, …) is preserved.
|
|
17
|
+
*/
|
|
18
|
+
export declare function modifyDocxContent(inputPath: string, outputPath: string, modifications: DocxModification[]): Promise<void>;
|
|
19
|
+
/**
|
|
20
|
+
* Replace the entire w:body content of a DOCX with new body XML.
|
|
21
|
+
* Used by the body-XML replacement mode of write_file.
|
|
22
|
+
*/
|
|
23
|
+
export declare function replaceBodyXml(inputPath: string, outputPath: string, newBodyXml: string): Promise<void>;
|
|
24
|
+
/**
|
|
25
|
+
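* Create a brand-new minimal DOCX from `content`: a plain-text string or, per the signature, a DocxModification[] list (NOTE(review): handling of the list form is not described here — confirm against the implementation).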
|
|
26
|
+
* Double-newlines are treated as paragraph separators.
|
|
27
|
+
*/
|
|
28
|
+
export declare function writeDocx(outputPath: string, content: string | DocxModification[]): Promise<void>;
|