docx-to-html-mathml-v2 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/LICENSE +22 -0
  2. package/OMML2MML.XSL +1819 -0
  3. package/README.md +29 -0
  4. package/bin/mammoth +38 -0
  5. package/index.js +3 -0
  6. package/lib/document-to-html.js +526 -0
  7. package/lib/documents.js +266 -0
  8. package/lib/docx/body-reader.js +931 -0
  9. package/lib/docx/comments-reader.js +31 -0
  10. package/lib/docx/content-types-reader.js +58 -0
  11. package/lib/docx/document-xml-reader.js +30 -0
  12. package/lib/docx/docx-reader.js +226 -0
  13. package/lib/docx/files.js +80 -0
  14. package/lib/docx/notes-reader.js +28 -0
  15. package/lib/docx/numbering-xml.js +111 -0
  16. package/lib/docx/office-xml-reader.js +73 -0
  17. package/lib/docx/relationships-reader.js +43 -0
  18. package/lib/docx/style-map.js +75 -0
  19. package/lib/docx/styles-reader.js +90 -0
  20. package/lib/docx/uris.js +21 -0
  21. package/lib/docx-to-html-mathml.js +193 -0
  22. package/lib/html/ast.js +51 -0
  23. package/lib/html/index.js +49 -0
  24. package/lib/html/simplify.js +99 -0
  25. package/lib/images.js +31 -0
  26. package/lib/index.d.ts +15 -0
  27. package/lib/index.js +111 -0
  28. package/lib/main.js +63 -0
  29. package/lib/mammoth-core.js +3 -0
  30. package/lib/mathtype_batch.rb +58 -0
  31. package/lib/omml_to_mathml.cjs +97 -0
  32. package/lib/options-reader.js +107 -0
  33. package/lib/promises.js +42 -0
  34. package/lib/raw-text.js +14 -0
  35. package/lib/results.js +72 -0
  36. package/lib/style-reader.js +365 -0
  37. package/lib/styles/document-matchers.js +100 -0
  38. package/lib/styles/html-paths.js +75 -0
  39. package/lib/styles/parser/tokeniser.js +30 -0
  40. package/lib/transforms.js +62 -0
  41. package/lib/underline.js +11 -0
  42. package/lib/unzip.js +20 -0
  43. package/lib/writers/html-writer.js +167 -0
  44. package/lib/writers/index.js +14 -0
  45. package/lib/writers/markdown-writer.js +163 -0
  46. package/lib/xml/index.js +8 -0
  47. package/lib/xml/nodes.js +70 -0
  48. package/lib/xml/reader.js +75 -0
  49. package/lib/xml/writer.js +61 -0
  50. package/lib/xml/xmldom.js +23 -0
  51. package/lib/zipfile.js +72 -0
  52. package/mathtype_to_mathml_plus.rb +139 -0
  53. package/package.json +74 -0
package/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # docx-to-html-mathml
2
+
3
+ Convert **DOCX → HTML** with full **MathML support** (OMML & MathType).
4
+
5
+ This library is a patched and extended version of Mammoth that adds **proper math handling**:
6
+
7
+ - Word built-in equations (OMML)
8
+ - MathType equations (OLE objects)
9
+
10
+ The output is **pure HTML + MathML**, ready to render with **MathJax**.
11
+
12
+ ---
13
+
14
+ ## ✨ Features
15
+
16
+ - 📄 Convert DOCX to HTML
17
+ - ➗ Convert **OMML (Word Equation)** → MathML
18
+ - ➕ Convert **MathType (OLE)** → MathML (via Ruby)
19
+ - 🧠 Keeps original document structure (paragraphs, tables, images, styles)
20
+ - ⚡ Returns a simple **HTML string**
21
+ - 🟢 Node.js friendly (>= 12)
22
+
23
+ ---
24
+
25
+ ## 📦 Installation
26
+
27
+ ```bash
28
+ npm install docx-to-html-mathml
29
+ ```
package/bin/mammoth ADDED
@@ -0,0 +1,38 @@
1
#!/usr/bin/env node

// Command-line entry point: declares the CLI arguments with argparse and
// passes the parsed result to ../lib/main.

var ArgumentParser = require("argparse").ArgumentParser;
var main = require("../lib/main");

// Builds the option object for a string-typed argument; `extra` carries any
// additional argparse settings (e.g. nargs).
function stringArgument(help, extra) {
    return Object.assign({ type: "string", help: help }, extra);
}

var parser = new ArgumentParser({ addHelp: true });

parser.addArgument(
    ["docx-path"],
    stringArgument("Path to the .docx file to convert.")
);

// output-path and --output-dir are registered in one mutually exclusive group.
var outputGroup = parser.addMutuallyExclusiveGroup();
outputGroup.addArgument(
    ["output-path"],
    stringArgument(
        "Output path for the generated document. Images will be stored inline in the output document. Output is written to stdout if not set.",
        { nargs: "?" }
    )
);
outputGroup.addArgument(
    ["--output-dir"],
    stringArgument(
        "Output directory for generated HTML and images. Images will be stored in separate files. Mutually exclusive with output-path."
    )
);

parser.addArgument(["--output-format"], {
    defaultValue: "html",
    choices: ["html", "markdown"],
    help: "Output format."
});

parser.addArgument(
    ["--style-map"],
    stringArgument("File containg a style map.")
);

main(parser.parseArgs());
package/index.js ADDED
@@ -0,0 +1,3 @@
1
+ "use strict";
2
+
3
+ module.exports = require("./lib/docx-to-html-mathml");
@@ -0,0 +1,526 @@
1
+ var _ = require("underscore");
2
+
3
+ var promises = require("./promises");
4
+ var documents = require("./documents");
5
+ var htmlPaths = require("./styles/html-paths");
6
+ var results = require("./results");
7
+ var images = require("./images");
8
+ var Html = require("./html");
9
+ var writers = require("./writers");
10
+
11
+ exports.DocumentConverter = DocumentConverter;
12
+
13
+
14
/**
 * Creates a converter object exposing `convertToHtml(element)`.
 *
 * Each call indexes the element's comments by `commentId` (only when the
 * element is a document; otherwise an empty index is used) and runs a fresh
 * DocumentConversion over the element.
 *
 * @param {Object} options - conversion options forwarded to DocumentConversion.
 * @returns {{convertToHtml: function(Object): *}}
 */
function DocumentConverter(options) {
    function convertToHtml(element) {
        var isDocument = element.type === documents.types.document;
        var commentsById = _.indexBy(isDocument ? element.comments : [], "commentId");
        return new DocumentConversion(options, commentsById).convertToHtml(element);
    }

    return { convertToHtml: convertToHtml };
}
26
+
27
+ function DocumentConversion(options, comments) {
28
+ var noteNumber = 1;
29
+
30
+ var noteReferences = [];
31
+
32
+ var referencedComments = [];
33
+
34
+ options = _.extend({ ignoreEmptyParagraphs: true }, options);
35
+ var idPrefix = options.idPrefix === undefined ? "" : options.idPrefix;
36
+ var ignoreEmptyParagraphs = options.ignoreEmptyParagraphs;
37
+
38
+ var defaultParagraphStyle = htmlPaths.topLevelElement("p");
39
+
40
+ var styleMap = options.styleMap || [];
41
+
42
/**
 * Converts a document element to output text.
 *
 * The element tree is first converted synchronously into HTML nodes. Any
 * node of type "deferred" is then resolved one at a time through
 * promises.mapSeries, substituted by its resolved value, and the resulting
 * tree is simplified and written out.
 *
 * @param {Object} document - root element to convert.
 * @returns {Promise} resolves to a results.Result holding the written string
 *     and the collected messages.
 */
function convertToHtml(document) {
    var messages = [];
    var html = elementToHtml(document, messages, {});

    // Gather every deferred node so its value can be awaited before writing.
    var pending = [];
    walkHtml(html, function (node) {
        if (node.type === "deferred") {
            pending.push(node);
        }
    });

    // Resolved values keyed by deferred node id.
    var resolved = {};

    // Returns the replacement node list for a single node.
    function substitute(node) {
        if (node.type === "deferred") {
            return resolved[node.id];
        }
        if (node.children) {
            return [
                _.extend({}, node, { children: replaceDeferred(node.children) })
            ];
        }
        return [node];
    }

    function replaceDeferred(nodes) {
        return flatMap(nodes, function (node) {
            return substitute(node);
        });
    }

    function resolveOne(deferred) {
        return deferred.value().then(function (value) {
            resolved[deferred.id] = value;
        });
    }

    return promises.mapSeries(pending, resolveOne).then(function () {
        var writer = writers.writer({
            prettyPrint: options.prettyPrint,
            outputFormat: options.outputFormat
        });
        Html.write(writer, Html.simplify(replaceDeferred(html)));
        return new results.Result(writer.asString(), messages);
    });
}
82
+
83
/**
 * Converts each element with elementToHtml and concatenates the resulting
 * node lists into one flat list.
 */
function convertElements(elements, messages, options) {
    function convertOne(element) {
        return elementToHtml(element, messages, options);
    }
    return flatMap(elements, convertOne);
}
88
+
89
/**
 * Dispatches an element to the converter registered for its `type`.
 *
 * @returns the converter's result, or an empty array when no converter is
 *     registered for the element type.
 * @throws {Error} when `options` is falsy.
 */
function elementToHtml(element, messages, options) {
    if (!options) {
        throw new Error("options not set");
    }
    var convert = elementConverters[element.type];
    return convert ? convert(element, messages, options) : [];
}
100
+
101
/**
 * Converts a paragraph: its children are converted and wrapped in the HTML
 * path chosen for the paragraph. When empty paragraphs are not ignored,
 * Html.forceWrite is prepended to the content.
 */
function convertParagraph(element, messages, options) {
    var paragraphPath = htmlPathForParagraph(element, messages);
    return paragraphPath.wrap(function () {
        var content = convertElements(element.children, messages, options);
        return ignoreEmptyParagraphs ? content : [Html.forceWrite].concat(content);
    });
}
111
+
112
/**
 * Picks the HTML path for a paragraph: the target of the first matching
 * style mapping, otherwise the default paragraph path. A warning is pushed
 * onto `messages` when the paragraph has a styleId but no mapping matched.
 */
function htmlPathForParagraph(element, messages) {
    var mapping = findStyle(element);
    if (mapping) {
        return mapping.to;
    }
    if (element.styleId) {
        messages.push(unrecognisedStyleWarning("paragraph", element));
    }
    return defaultParagraphStyle;
}
124
+
125
/**
 * Converts a run to HTML nodes.
 *
 * Wrappers are applied from the inside out in this order:
 *   1. an inline highlight <span> (followed by a zero-width space),
 *   2. the highlight style-map path (if any), then small caps, all caps,
 *      strikethrough, underline, sub/superscript, italic, bold, and the
 *      run's mapped style,
 *   3. an outermost <span> carrying inline CSS when the run has a colour,
 *      background, font or font size.
 *
 * A highlight is only applied when `run.highlight` is a non-empty value, so
 * runs whose highlight is `undefined` or "" no longer produce a
 * `background-color:undefined;` span and a stray zero-width space.
 *
 * @param {Object} run - run element; reads children, highlight, the is*
 *     flags, verticalAlignment, styleId, color, background, font, fontSize.
 * @param {Array} messages - receives a warning for an unmapped run style.
 * @param {Object} options - passed through to child conversion.
 * @returns {Array} HTML nodes for the run.
 */
function convertRun(run, messages, options) {
    var applyChildren = function () {
        return convertElements(run.children, messages, options);
    };

    var paths = [];

    // Single source of truth for "this run is highlighted"; null, undefined
    // and "" all mean no highlight.
    var highlight = run.highlight;
    var hasHighlight = Boolean(highlight);

    // ---- collect Mammoth builtin wrappers ----
    if (hasHighlight) {
        var highlightPath = findHtmlPath({ type: "highlight", color: highlight });
        if (highlightPath) {
            paths.push(highlightPath);
        }

        var convertChildren = applyChildren;
        applyChildren = function () {
            return [
                // Wrap the content in a coloured span.
                Html.freshElement(
                    "span",
                    { style: "background-color:" + highlight + ";" },
                    convertChildren()
                ),
                // Insert a hidden character to stop Mammoth from merging the
                // next run into this one.
                Html.text("\u200B")
            ];
        };
    }
    if (run.isSmallCaps) {
        paths.push(findHtmlPathForRunProperty("smallCaps"));
    }
    if (run.isAllCaps) {
        paths.push(findHtmlPathForRunProperty("allCaps"));
    }
    if (run.isStrikethrough) {
        paths.push(findHtmlPathForRunProperty("strikethrough", "s"));
    }
    if (run.isUnderline) {
        paths.push(findHtmlPathForRunProperty("underline"));
    }
    if (run.verticalAlignment === documents.verticalAlignment.subscript) {
        paths.push(htmlPaths.element("sub", {}, { fresh: false }));
    }
    if (run.verticalAlignment === documents.verticalAlignment.superscript) {
        paths.push(htmlPaths.element("sup", {}, { fresh: false }));
    }
    if (run.isItalic) {
        paths.push(findHtmlPathForRunProperty("italic", "em"));
    }
    if (run.isBold) {
        paths.push(findHtmlPathForRunProperty("bold", "strong"));
    }

    var stylePath = htmlPaths.empty;
    var style = findStyle(run);

    if (style) {
        stylePath = style.to;
    } else if (run.styleId) {
        messages.push(unrecognisedStyleWarning("run", run));
    }
    paths.push(stylePath);

    // ---- APPLY built-in paths first ----
    // Each path wraps the previous result, so later paths end up further out.
    paths.forEach(function (path) {
        applyChildren = path.wrap.bind(path, applyChildren);
    });

    // ---- NOW apply custom inline CSS LAST (outermost) ----
    if (run.color || run.background || run.font || run.fontSize) {
        var css = [];

        if (run.color) {
            css.push("color:#" + run.color);
        }
        if (run.background) {
            css.push("background-color:#" + run.background);
        }
        if (hasHighlight) {
            css.push("background-color:" + highlight);
        }
        if (run.font) {
            css.push("font-family:" + run.font);
        }
        if (run.fontSize) {
            css.push("font-size:" + run.fontSize + "pt");
        }

        var cssString = css.join(";");

        var styledChildren = applyChildren;
        applyChildren = function () {
            return [
                Html.nonFreshElement("span", { style: cssString }, styledChildren())
            ];
        };
    }

    return applyChildren();
}
207
+
208
+
209
/**
 * Resolves the HTML path for a run property such as "bold".
 *
 * Uses the style-map path for `{type: elementType}` when one matches;
 * otherwise a non-fresh element named `defaultTagName` if given, else the
 * empty path.
 */
function findHtmlPathForRunProperty(elementType, defaultTagName) {
    var mappedPath = findHtmlPath({ type: elementType });
    if (mappedPath) {
        return mappedPath;
    }
    return defaultTagName
        ? htmlPaths.element(defaultTagName, {}, { fresh: false })
        : htmlPaths.empty;
}
219
+
220
/**
 * Returns the target path of the first style mapping matching `element`,
 * or `defaultPath` when nothing matches.
 */
function findHtmlPath(element, defaultPath) {
    var mapping = findStyle(element);
    if (mapping) {
        return mapping.to;
    }
    return defaultPath;
}
224
+
225
/**
 * Returns the first entry of the style map whose `from` matcher accepts
 * `element`, or undefined when none does.
 */
function findStyle(element) {
    return _.find(styleMap, function (mapping) {
        return mapping.from.matches(element);
    });
}
232
+
233
/**
 * Wraps an image converter so that a failure is recorded in `messages` as
 * an error and the image converts to an empty node list instead.
 */
function recoveringConvertImage(convertImage) {
    return function (image, messages) {
        function convert() {
            return convertImage(image, messages);
        }
        function recover(error) {
            messages.push(results.error(error));
            return [];
        }
        return promises.attempt(convert).caught(recover);
    };
}
243
/**
 * Wraps a math converter so that a failure is recorded in `messages` as an
 * error and the result of defaultConvertMath is used instead.
 */
function recoveringConvertMath(convertMath) {
    return function (math, messages, options) {
        function convert() {
            return convertMath(math, messages, options);
        }
        function recover(error) {
            messages.push(results.error(error));
            return defaultConvertMath(math, messages, options);
        }
        return promises.attempt(convert).caught(recover);
    };
}
253
+
254
/**
 * Fallback math conversion that never throws: renders the math element's
 * `altText` inside a `<span class="math-fallback">`, or nothing when there
 * is no math element or no alt text.
 */
function defaultConvertMath(math, messages, options) {
    var hasAltText = Boolean(math && math.altText);
    if (!hasAltText) {
        return [];
    }
    var label = Html.text(math.altText);
    return [Html.nonFreshElement("span", { class: "math-fallback" }, [label])];
}
265
+
266
+
267
/** HTML id of the note itself, built from its noteType and noteId. */
function noteHtmlId(note) {
    var noteType = note.noteType;
    var noteId = note.noteId;
    return referentHtmlId(noteType, noteId);
}
270
+
271
/** HTML id of the in-text reference to a note, built from its noteType and noteId. */
function noteRefHtmlId(note) {
    var noteType = note.noteType;
    var noteId = note.noteId;
    return referenceHtmlId(noteType, noteId);
}
274
+
275
/** Prefixed HTML id of a referenced item: "<type>-<id>". */
function referentHtmlId(referenceType, referenceId) {
    var suffix = referenceType + "-" + referenceId;
    return htmlId(suffix);
}
278
+
279
/** Prefixed HTML id of a reference to an item: "<type>-ref-<id>". */
function referenceHtmlId(referenceType, referenceId) {
    var suffix = referenceType + "-ref-" + referenceId;
    return htmlId(suffix);
}
282
+
283
/** Prepends the configured id prefix to `suffix`. */
function htmlId(suffix) {
    var prefixedId = idPrefix + suffix;
    return prefixedId;
}
286
+
287
+ var defaultTablePath = htmlPaths.elements([
288
+ htmlPaths.element("table", {}, { fresh: true })
289
+ ]);
290
+
291
/**
 * Converts a table: its children are converted and wrapped in the mapped
 * HTML path for the table, or the default table path when none is mapped.
 */
function convertTable(element, messages, options) {
    var tablePath = findHtmlPath(element, defaultTablePath);
    function convertBody() {
        return convertTableChildren(element, messages, options);
    }
    return tablePath.wrap(convertBody);
}
296
+
297
/**
 * Converts the children of a table, splitting leading header rows into a
 * <thead> and the remainder into a <tbody>.
 *
 * The body starts at the first child that is not a table row or is not
 * marked as a header. When the very first child already belongs to the
 * body, all children are converted directly with no thead/tbody wrappers.
 *
 * Fix: the original test `!child.type === documents.types.tableRow` negated
 * `child.type` before comparing, so the type check was always false and
 * non-row children were never recognised as the start of the body.
 *
 * @param {Object} element - table element; reads `children`.
 * @param {Array} messages - passed through to child conversion.
 * @param {Object} options - copied with `isTableHeader` set per section.
 * @returns {Array} Html.forceWrite followed by the converted children.
 */
function convertTableChildren(element, messages, options) {
    var rows = element.children;

    var bodyIndex = _.findIndex(rows, function (child) {
        return child.type !== documents.types.tableRow || !child.isHeader;
    });
    if (bodyIndex === -1) {
        // Every child is a header row.
        bodyIndex = rows.length;
    }

    // Converts a slice of rows with the header flag set for that section.
    function convertSection(sectionRows, isTableHeader) {
        return convertElements(
            sectionRows,
            messages,
            _.extend({}, options, { isTableHeader: isTableHeader })
        );
    }

    var children;
    if (bodyIndex === 0) {
        children = convertSection(rows, false);
    } else {
        var headRows = convertSection(rows.slice(0, bodyIndex), true);
        var bodyRows = convertSection(rows.slice(bodyIndex), false);
        children = [
            Html.freshElement("thead", {}, headRows),
            Html.freshElement("tbody", {}, bodyRows)
        ];
    }
    return [Html.forceWrite].concat(children);
}
329
+
330
/**
 * Converts a table row into a single fresh <tr> whose children are
 * Html.forceWrite followed by the converted cells.
 */
function convertTableRow(element, messages, options) {
    var cells = convertElements(element.children, messages, options);
    var row = Html.freshElement("tr", {}, [Html.forceWrite].concat(cells));
    return [row];
}
336
+
337
/**
 * Converts a table cell into a fresh <th> (when options.isTableHeader is
 * truthy) or <td>. `colspan` / `rowspan` attributes are emitted only for
 * spans other than 1.
 */
function convertTableCell(element, messages, options) {
    var tagName = options.isTableHeader ? "th" : "td";
    var children = convertElements(element.children, messages, options);

    var spans = { colspan: element.colSpan, rowspan: element.rowSpan };
    var attributes = {};
    Object.keys(spans).forEach(function (name) {
        if (spans[name] !== 1) {
            attributes[name] = spans[name].toString();
        }
    });

    var cell = Html.freshElement(tagName, attributes, [Html.forceWrite].concat(children));
    return [cell];
}
352
+
353
/**
 * Converts a comment reference into a link to the comment, wrapped in the
 * mapped HTML path for the reference (htmlPaths.ignore when none is mapped).
 *
 * The link label is "[<author initials><running number>]"; the referenced
 * comment is recorded so it can be rendered later.
 */
function convertCommentReference(reference, messages, options) {
    var referencePath = findHtmlPath(reference, htmlPaths.ignore);
    return referencePath.wrap(function () {
        var commentId = reference.commentId;
        var comment = comments[commentId];
        var label = "[" + commentAuthorLabel(comment) + (referencedComments.length + 1) + "]";
        referencedComments.push({ label: label, comment: comment });

        // TODO: remove duplication with note references
        var link = Html.freshElement(
            "a",
            {
                href: "#" + referentHtmlId("comment", commentId),
                id: referenceHtmlId("comment", commentId)
            },
            [Html.text(label)]
        );
        return [link];
    });
}
368
+
369
/**
 * Renders a referenced comment as a <dt>/<dd> pair: the term carries the
 * comment's id and label, the definition holds the converted comment body
 * followed by a paragraph with a back-link to the reference.
 */
function convertComment(referencedComment, messages, options) {
    // TODO: remove duplication with note references
    var label = referencedComment.label;
    var comment = referencedComment.comment;
    var commentId = comment.commentId;

    var content = convertElements(comment.body, messages, options);
    var backLink = Html.nonFreshElement("p", {}, [
        Html.text(" "),
        Html.freshElement("a", { "href": "#" + referenceHtmlId("comment", commentId) }, [
            Html.text("↑")
        ])
    ]);

    var term = Html.freshElement(
        "dt",
        { "id": referentHtmlId("comment", commentId) },
        [Html.text("Comment " + label)]
    );
    var definition = Html.freshElement("dd", {}, content.concat([backLink]));

    return [term, definition];
}
392
+
393
/**
 * Converts a break by wrapping an empty node list in the HTML path chosen
 * for the break.
 */
function convertBreak(element, messages, options) {
    var breakPath = htmlPathForBreak(element);
    function noChildren() {
        return [];
    }
    return breakPath.wrap(noChildren);
}
398
+
399
/**
 * Picks the HTML path for a break: the mapped style target if any,
 * otherwise a top-level <br> for line breaks and the empty path for every
 * other break type.
 */
function htmlPathForBreak(element) {
    var mapping = findStyle(element);
    if (mapping) {
        return mapping.to;
    }
    return element.breakType === "line"
        ? htmlPaths.topLevelElement("br")
        : htmlPaths.empty;
}
409
+
410
// Dispatch table used by elementToHtml: maps a document element type to the
// function that converts it. Every converter takes (element, messages,
// options). "math" and "image" are deferred, so they produce placeholder
// nodes whose values are resolved later.
var elementConverters = {
    "math": deferredConversion(recoveringConvertMath(options.convertMath || defaultConvertMath)),

    // Body content, then an <ol> of the referenced notes and a <dl> of the
    // referenced comments. Notes are converted before comments are rendered.
    "document": function (document, messages, options) {
        var bodyNodes = convertElements(document.children, messages, options);
        var resolvedNotes = noteReferences.map(function (reference) {
            return document.notes.resolve(reference);
        });
        var noteNodes = convertElements(resolvedNotes, messages, options);
        var noteList = Html.freshElement("ol", {}, noteNodes);
        var commentList = Html.freshElement("dl", {}, flatMap(referencedComments, function (entry) {
            return convertComment(entry, messages, options);
        }));
        return bodyNodes.concat([noteList, commentList]);
    },

    "paragraph": convertParagraph,
    "run": convertRun,

    "text": function (element, messages, options) {
        var textNode = Html.text(element.value);
        return [textNode];
    },

    "tab": function (element, messages, options) {
        var tabNode = Html.text("\t");
        return [tabNode];
    },

    // Internal anchors link to the prefixed id; otherwise the href is used.
    "hyperlink": function (element, messages, options) {
        var attributes = {
            href: element.anchor ? "#" + htmlId(element.anchor) : element.href
        };
        if (element.targetFrame != null) {
            attributes.target = element.targetFrame;
        }
        var linkChildren = convertElements(element.children, messages, options);
        return [Html.nonFreshElement("a", attributes, linkChildren)];
    },

    "checkbox": function (element) {
        var attributes = { type: "checkbox" };
        if (element.checked) {
            attributes["checked"] = "checked";
        }
        return [Html.freshElement("input", attributes)];
    },

    "bookmarkStart": function (element, messages, options) {
        var attributes = { id: htmlId(element.name) };
        return [Html.freshElement("a", attributes, [Html.forceWrite])];
    },

    // Records the reference and emits a superscript link labelled with the
    // running note number.
    "noteReference": function (element, messages, options) {
        noteReferences.push(element);
        var attributes = {
            href: "#" + noteHtmlId(element),
            id: noteRefHtmlId(element)
        };
        var label = Html.text("[" + (noteNumber++) + "]");
        var link = Html.freshElement("a", attributes, [label]);
        return [Html.freshElement("sup", {}, [link])];
    },

    // Note body followed by a back-link paragraph, as a single <li> node.
    "note": function (element, messages, options) {
        var content = convertElements(element.body, messages, options);
        var backLinkPath = htmlPaths.element("p", {}, { fresh: false });
        var backLink = Html.elementWithTag(backLinkPath, [
            Html.text(" "),
            Html.freshElement("a", { href: "#" + noteRefHtmlId(element) }, [Html.text("↑")])
        ]);
        return Html.freshElement("li", { id: noteHtmlId(element) }, content.concat([backLink]));
    },

    "commentReference": convertCommentReference,
    "comment": convertComment,
    "image": deferredConversion(recoveringConvertImage(options.convertImage || images.dataUri)),
    "table": convertTable,
    "tableRow": convertTableRow,
    "tableCell": convertTableCell,
    "break": convertBreak
};
483
+ return {
484
+ convertToHtml: convertToHtml
485
+ };
486
+ }
487
+
488
+ var deferredId = 1;
489
+
490
/**
 * Turns a converter into one that returns a single placeholder node of type
 * "deferred". The placeholder gets the next value of the module-level
 * `deferredId` counter and a `value()` function that runs the wrapped
 * converter with the original arguments.
 */
function deferredConversion(func) {
    return function (element, messages, options) {
        var placeholder = {
            type: "deferred",
            id: deferredId++,
            value: function () {
                return func(element, messages, options);
            }
        };
        return [placeholder];
    };
}
503
+
504
/**
 * Builds the warning reported when an element uses a style that has no
 * entry in the style map.
 *
 * @param {string} type - kind of element, e.g. "paragraph" or "run".
 * @param {Object} element - reads `styleName` and `styleId`.
 */
function unrecognisedStyleWarning(type, element) {
    var message = `Unrecognised ${type} style: '${element.styleName}' (Style ID: ${element.styleId})`;
    return results.warning(message);
}
510
+
511
/** Maps `func` over `values` and flattens the results by one level. */
function flatMap(values, func) {
    var mapped = values.map(func);
    var shallow = true;
    return _.flatten(mapped, shallow);
}
514
+
515
/**
 * Calls `callback` on every node of the tree in pre-order: each node is
 * visited before its `children`.
 */
function walkHtml(nodes, callback) {
    function visit(node) {
        callback(node);
        if (node.children) {
            node.children.forEach(function (child) {
                visit(child);
            });
        }
    }
    nodes.forEach(function (node) {
        visit(node);
    });
}
523
+
524
/** Returns the comment author's initials, or "" when they are missing or empty. */
var commentAuthorLabel = exports.commentAuthorLabel = function commentAuthorLabel(comment) {
    var initials = comment.authorInitials;
    return initials ? initials : "";
};