@hyperlex/mammoth 1.4.9-beta

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. package/.eslintrc.json +77 -0
  2. package/.github/ISSUE_TEMPLATE.md +12 -0
  3. package/.idea/mammoth.js.iml +12 -0
  4. package/.idea/modules.xml +8 -0
  5. package/.idea/vcs.xml +6 -0
  6. package/.travis.yml +10 -0
  7. package/LICENSE +22 -0
  8. package/NEWS +373 -0
  9. package/README.md +883 -0
  10. package/bin/mammoth +38 -0
  11. package/browser/docx/files.js +14 -0
  12. package/browser/unzip.js +12 -0
  13. package/lib/document-to-html.js +453 -0
  14. package/lib/documents.js +238 -0
  15. package/lib/docx/body-reader.js +636 -0
  16. package/lib/docx/comments-reader.js +31 -0
  17. package/lib/docx/content-types-reader.js +58 -0
  18. package/lib/docx/document-xml-reader.js +26 -0
  19. package/lib/docx/docx-reader.js +222 -0
  20. package/lib/docx/files.js +67 -0
  21. package/lib/docx/notes-reader.js +28 -0
  22. package/lib/docx/numbering-xml.js +69 -0
  23. package/lib/docx/office-xml-reader.js +58 -0
  24. package/lib/docx/relationships-reader.js +43 -0
  25. package/lib/docx/style-map.js +75 -0
  26. package/lib/docx/styles-reader.js +70 -0
  27. package/lib/docx/uris.js +21 -0
  28. package/lib/html/ast.js +50 -0
  29. package/lib/html/index.js +41 -0
  30. package/lib/html/simplify.js +88 -0
  31. package/lib/images.js +29 -0
  32. package/lib/index.js +115 -0
  33. package/lib/main.js +63 -0
  34. package/lib/options-reader.js +98 -0
  35. package/lib/promises.js +42 -0
  36. package/lib/results.js +72 -0
  37. package/lib/style-reader.js +321 -0
  38. package/lib/styles/document-matchers.js +74 -0
  39. package/lib/styles/html-paths.js +81 -0
  40. package/lib/styles/parser/tokeniser.js +30 -0
  41. package/lib/transforms.js +61 -0
  42. package/lib/underline.js +11 -0
  43. package/lib/unzip.js +22 -0
  44. package/lib/writers/html-writer.js +160 -0
  45. package/lib/writers/index.js +14 -0
  46. package/lib/writers/markdown-writer.js +163 -0
  47. package/lib/xml/index.js +7 -0
  48. package/lib/xml/nodes.js +69 -0
  49. package/lib/xml/reader.js +83 -0
  50. package/lib/xml/writer.js +61 -0
  51. package/lib/zipfile.js +77 -0
  52. package/mammoth.browser.js +32950 -0
  53. package/mammoth.browser.min.js +18 -0
  54. package/package.json +65 -0
  55. package/test/.eslintrc.json +7 -0
  56. package/test/document-to-html.tests.js +834 -0
  57. package/test/docx/body-reader.tests.js +1342 -0
  58. package/test/docx/comments-reader.tests.js +52 -0
  59. package/test/docx/content-types-reader.tests.js +45 -0
  60. package/test/docx/document-matchers.js +37 -0
  61. package/test/docx/docx-reader.tests.js +179 -0
  62. package/test/docx/files.tests.js +94 -0
  63. package/test/docx/notes-reader.tests.js +35 -0
  64. package/test/docx/numbering-xml.tests.js +65 -0
  65. package/test/docx/office-xml-reader.tests.js +24 -0
  66. package/test/docx/relationships-reader.tests.js +65 -0
  67. package/test/docx/style-map.tests.js +112 -0
  68. package/test/docx/styles-reader.tests.js +133 -0
  69. package/test/docx/uris.tests.js +22 -0
  70. package/test/html/simplify.tests.js +134 -0
  71. package/test/html/write.tests.js +42 -0
  72. package/test/images.tests.js +34 -0
  73. package/test/main.tests.js +89 -0
  74. package/test/mammoth.tests.js +429 -0
  75. package/test/mocha.opts +1 -0
  76. package/test/options-reader.tests.js +63 -0
  77. package/test/results.tests.js +15 -0
  78. package/test/style-reader.tests.js +256 -0
  79. package/test/styles/document-matchers.tests.js +71 -0
  80. package/test/styles/html-paths.tests.js +20 -0
  81. package/test/styles/parser/tokeniser.tests.js +104 -0
  82. package/test/test-data/comments.docx +0 -0
  83. package/test/test-data/embedded-style-map.docx +0 -0
  84. package/test/test-data/empty.docx +0 -0
  85. package/test/test-data/empty.zip +0 -0
  86. package/test/test-data/endnotes.docx +0 -0
  87. package/test/test-data/external-picture.docx +0 -0
  88. package/test/test-data/footnote-hyperlink.docx +0 -0
  89. package/test/test-data/footnotes.docx +0 -0
  90. package/test/test-data/hello.zip +0 -0
  91. package/test/test-data/hyperlinks/word/_rels/document.xml.rels +10 -0
  92. package/test/test-data/hyperlinks/word/document.xml +18 -0
  93. package/test/test-data/simple/word/document.xml +18 -0
  94. package/test/test-data/simple-list.docx +0 -0
  95. package/test/test-data/single-paragraph.docx +0 -0
  96. package/test/test-data/strikethrough.docx +0 -0
  97. package/test/test-data/tables.docx +0 -0
  98. package/test/test-data/text-box.docx +0 -0
  99. package/test/test-data/tiny-picture-target-base-relative.docx +0 -0
  100. package/test/test-data/tiny-picture.docx +0 -0
  101. package/test/test-data/tiny-picture.png +0 -0
  102. package/test/test-data/underline.docx +0 -0
  103. package/test/test-data/utf8-bom.docx +0 -0
  104. package/test/test.js +11 -0
  105. package/test/testing.js +55 -0
  106. package/test/transforms.tests.js +125 -0
  107. package/test/unzip.tests.js +38 -0
  108. package/test/writers/html-writer.tests.js +133 -0
  109. package/test/writers/markdown-writer.tests.js +304 -0
  110. package/test/xml/reader.tests.js +85 -0
  111. package/test/xml/writer.tests.js +81 -0
  112. package/test/zipfile.tests.js +59 -0
package/bin/mammoth ADDED
@@ -0,0 +1,38 @@
1
+ #!/usr/bin/env node
2
+
3
+ var ArgumentParser = require("argparse").ArgumentParser;
4
+ var main = require("../lib/main");
5
+
6
// Command-line interface for the converter: declares the accepted arguments
// and passes the parsed result to the entry point required from ../lib/main.
var parser = new ArgumentParser({
    addHelp: true
});

// Positional argument naming the document to convert.
parser.addArgument(["docx-path"], {
    type: "string",
    help: "Path to the .docx file to convert."
});

// The output file and the output directory are alternative destinations, so
// both are registered on the same mutually exclusive group.
var outputGroup = parser.addMutuallyExclusiveGroup();
outputGroup.addArgument(["output-path"], {
    type: "string",
    nargs: "?",
    help: "Output path for the generated document. Images will be stored inline in the output document. Output is written to stdout if not set."
});
outputGroup.addArgument(["--output-dir"], {
    type: "string",
    help: "Output directory for generated HTML and images. Images will be stored in separate files. Mutually exclusive with output-path."
});

parser.addArgument(["--output-format"], {
    defaultValue: "html",
    choices: ["html", "markdown"],
    help: "Output format."
});

parser.addArgument(["--style-map"], {
    type: "string",
    // Help text previously read "containg".
    help: "File containing a style map."
});

main(parser.parseArgs());
package/browser/docx/files.js ADDED
@@ -0,0 +1,14 @@
1
+ var promises = require("../../lib/promises");
2
+
3
+ exports.Files = Files;
4
+
5
+
6
/**
 * Browser variant of the file reader. Linked files cannot be opened here, so
 * every read is rejected with an explanatory error.
 *
 * @returns {{read: Function}} Object whose read(uri) returns a rejected promise.
 */
function Files() {
    return {
        read: function(uri) {
            var message = "could not open external image: '" + uri + "'\ncannot open linked files from a web browser";
            return promises.reject(new Error(message));
        }
    };
}
package/browser/unzip.js ADDED
@@ -0,0 +1,12 @@
1
+ var promises = require("../lib/promises");
2
+ var zipfile = require("../lib/zipfile");
3
+
4
+ exports.openZip = openZip;
5
+
6
/**
 * Opens a zip archive from the supplied options.
 *
 * @param {Object} options Must carry an `arrayBuffer` property holding the archive.
 * @returns {Promise} Resolves with the result of zipfile.openArrayBuffer, or
 *     rejects when no arrayBuffer was supplied.
 */
function openZip(options) {
    // Guard clause: an array buffer is the only input handled here.
    if (!options.arrayBuffer) {
        return promises.reject(new Error("Could not find file in options"));
    }
    return promises.resolve(zipfile.openArrayBuffer(options.arrayBuffer));
}
package/lib/document-to-html.js ADDED
@@ -0,0 +1,453 @@
1
+ var _ = require("underscore");
2
+
3
+ var promises = require("./promises");
4
+ var documents = require("./documents");
5
+ var htmlPaths = require("./styles/html-paths");
6
+ var results = require("./results");
7
+ var images = require("./images");
8
+ var Html = require("./html");
9
+ var writers = require("./writers");
10
+
11
+ exports.DocumentConverter = DocumentConverter;
12
+
13
+
14
/**
 * Creates a converter that turns document elements into HTML.
 *
 * @param {Object} options Options forwarded unchanged to DocumentConversion.
 * @returns {{convertToHtml: Function}} Converter object.
 */
function DocumentConverter(options) {
    function convertToHtml(element) {
        // Only whole documents carry comments; any other element gets none.
        var isDocument = element.type === documents.types.document;
        var commentList = isDocument ? element.comments : [];
        var commentsById = _.indexBy(commentList, "commentId");
        // A fresh conversion per call keeps the numbering state separate.
        return new DocumentConversion(options, commentsById).convertToHtml(element);
    }

    return {convertToHtml: convertToHtml};
}
26
+
27
/**
 * State and logic for one conversion of a document tree into HTML.
 *
 * @param {Object} options Conversion options. Properties read here:
 *     ignoreEmptyParagraphs (defaults to true), idPrefix (defaults to ""),
 *     styleMap (defaults to []), convertImage (defaults to images.dataUri),
 *     and prettyPrint / outputFormat, which are passed to the writer.
 * @param {Object} comments Comments looked up by commentId.
 * @returns {{convertToHtml: Function}} Object exposing convertToHtml(document).
 */
function DocumentConversion(options, comments) {
    // Number shown in the next note reference, e.g. "[1]".
    var noteNumber = 1;

    // Note references in the order they were encountered.
    var noteReferences = [];

    // Comments referenced so far, each stored with the label given to its reference.
    var referencedComments = [];

    options = _.extend({ignoreEmptyParagraphs: true}, options);
    var idPrefix = options.idPrefix === undefined ? "" : options.idPrefix;
    var ignoreEmptyParagraphs = options.ignoreEmptyParagraphs;

    var defaultParagraphStyle = htmlPaths.topLevelElement("p");

    var styleMap = options.styleMap || [];

    /**
     * Converts an element tree and writes it out as a string.
     *
     * @param {Object} document Root element to convert.
     * @returns {Promise} Resolves with a results.Result holding the written
     *     output and the messages collected during conversion.
     */
    function convertToHtml(document) {
        var messages = [];

        var html = elementToHtml(document, messages, {});

        // Collect the placeholder nodes so their values can be resolved
        // before anything is written.
        var deferredNodes = [];
        walkHtml(html, function(node) {
            if (node.type === "deferred") {
                deferredNodes.push(node);
            }
        });
        var deferredValues = {};
        return promises.mapSeries(deferredNodes, function(deferred) {
            return deferred.value().then(function(value) {
                deferredValues[deferred.id] = value;
            });
        }).then(function() {
            // Rebuilds the tree with every placeholder swapped for its
            // resolved nodes; other nodes are copied rather than mutated.
            function replaceDeferred(nodes) {
                return flatMap(nodes, function(node) {
                    if (node.type === "deferred") {
                        return deferredValues[node.id];
                    } else if (node.children) {
                        return [
                            _.extend({}, node, {
                                children: replaceDeferred(node.children)
                            })
                        ];
                    } else {
                        return [node];
                    }
                });
            }
            var writer = writers.writer({
                prettyPrint: options.prettyPrint,
                outputFormat: options.outputFormat
            });
            Html.write(writer, Html.simplify(replaceDeferred(html)));
            return new results.Result(writer.asString(), messages);
        });
    }

    // Converts each element and concatenates the resulting node lists.
    function convertElements(elements, messages, options) {
        return flatMap(elements, function(element) {
            return elementToHtml(element, messages, options);
        });
    }

    // Dispatches on element.type; element types without a converter yield no nodes.
    function elementToHtml(element, messages, options) {
        if (!options) {
            throw new Error("options not set");
        }
        var handler = elementConverters[element.type];
        if (handler) {
            return handler(element, messages, options);
        } else {
            return [];
        }
    }

    function convertParagraph(element, messages, options) {
        return htmlPathForParagraph(element, messages).wrap(function() {
            var content = convertElements(element.children, messages, options);
            if (ignoreEmptyParagraphs) {
                return content;
            } else {
                // Prepend the force-write marker when empty paragraphs must be kept.
                return [Html.forceWrite].concat(content);
            }
        }, element);
    }

    // Returns the mapped path for a paragraph, or the default <p> path. A
    // warning is recorded when the paragraph has a style ID that no mapping matches.
    function htmlPathForParagraph(element, messages) {
        var style = findStyle(element);

        if (style) {
            return style.to;
        } else {
            if (element.styleId) {
                messages.push(unrecognisedStyleWarning("paragraph", element));
            }
            return defaultParagraphStyle;
        }
    }

    function convertRun(run, messages, options) {
        var nodes = function() {
            return convertElements(run.children, messages, options);
        };
        var paths = [];
        if (run.isSmallCaps) {
            paths.push(findHtmlPathForRunProperty("smallCaps"));
        }
        if (run.isStrikethrough) {
            paths.push(findHtmlPathForRunProperty("strikethrough", "s"));
        }
        if (run.isUnderline) {
            paths.push(findHtmlPathForRunProperty("underline"));
        }
        if (run.verticalAlignment === documents.verticalAlignment.subscript) {
            paths.push(htmlPaths.element("sub", {}, {fresh: false}));
        }
        if (run.verticalAlignment === documents.verticalAlignment.superscript) {
            paths.push(htmlPaths.element("sup", {}, {fresh: false}));
        }
        if (run.isItalic) {
            paths.push(findHtmlPathForRunProperty("italic", "em"));
        }
        if (run.isBold) {
            paths.push(findHtmlPathForRunProperty("bold", "strong"));
        }
        var stylePath = htmlPaths.empty;
        var style = findStyle(run);
        if (style) {
            stylePath = style.to;
        } else if (run.styleId) {
            messages.push(unrecognisedStyleWarning("run", run));
        }
        paths.push(stylePath);

        // Each path wraps the generator built so far, so later paths in the
        // list end up outermost.
        paths.forEach(function(path) {
            nodes = path.wrap.bind(path, nodes, run);
        });

        return nodes();
    }

    // Looks up a style mapping for a run property such as "bold". Falls back
    // to a non-fresh element with defaultTagName, or to the empty path when
    // no default tag is given.
    function findHtmlPathForRunProperty(elementType, defaultTagName) {
        var path = findHtmlPath({type: elementType});
        if (path) {
            return path;
        } else if (defaultTagName) {
            return htmlPaths.element(defaultTagName, {}, {fresh: false});
        } else {
            return htmlPaths.empty;
        }
    }

    function findHtmlPath(element, defaultPath) {
        var style = findStyle(element);
        return style ? style.to : defaultPath;
    }

    // Returns the first style-map entry whose matcher accepts the element,
    // or undefined when none does.
    function findStyle(element) {
        for (var i = 0; i < styleMap.length; i++) {
            if (styleMap[i].from.matches(element)) {
                return styleMap[i];
            }
        }
    }

    // Wraps an image converter so that a failure is recorded as an error
    // message and produces no nodes instead of failing the conversion.
    function recoveringConvertImage(convertImage) {
        return function(image, messages) {
            return promises.attempt(function() {
                return convertImage(image, messages);
            }).caught(function(error) {
                messages.push(results.error(error));
                return [];
            });
        };
    }

    function noteHtmlId(note) {
        return referentHtmlId(note.noteType, note.noteId);
    }

    function noteRefHtmlId(note) {
        return referenceHtmlId(note.noteType, note.noteId);
    }

    // ID of the thing being referred to (the note or comment itself).
    function referentHtmlId(referenceType, referenceId) {
        return htmlId(referenceType + "-" + referenceId);
    }

    // ID of the in-text reference pointing at the note or comment.
    function referenceHtmlId(referenceType, referenceId) {
        return htmlId(referenceType + "-ref-" + referenceId);
    }

    function htmlId(suffix) {
        return idPrefix + suffix;
    }

    var defaultTablePath = htmlPaths.elements([
        htmlPaths.element("table", {}, {fresh: true})
    ]);

    function convertTable(element, messages, options) {
        return findHtmlPath(element, defaultTablePath).wrap(function() {
            return convertTableChildren(element, messages, options);
        });
    }

    // Splits the table children into leading header rows and the remaining
    // body. thead/tbody are only emitted when there is at least one header row.
    function convertTableChildren(element, messages, options) {
        // Index of the first child that is not a header row. The comparison
        // must be `!==`: `!child.type === x` negates child.type first and
        // then compares a boolean with the type value.
        var bodyIndex = _.findIndex(element.children, function(child) {
            return child.type !== documents.types.tableRow || !child.isHeader;
        });
        if (bodyIndex === -1) {
            // Every child is a header row.
            bodyIndex = element.children.length;
        }
        var children;
        if (bodyIndex === 0) {
            children = convertElements(
                element.children,
                messages,
                _.extend({}, options, {isTableHeader: false})
            );
        } else {
            var headRows = convertElements(
                element.children.slice(0, bodyIndex),
                messages,
                _.extend({}, options, {isTableHeader: true})
            );
            var bodyRows = convertElements(
                element.children.slice(bodyIndex),
                messages,
                _.extend({}, options, {isTableHeader: false})
            );
            children = [
                Html.freshElement("thead", {}, headRows),
                Html.freshElement("tbody", {}, bodyRows)
            ];
        }
        return [Html.forceWrite].concat(children);
    }

    function convertTableRow(element, messages, options) {
        var children = convertElements(element.children, messages, options);
        return [
            Html.freshElement("tr", {}, [Html.forceWrite].concat(children))
        ];
    }

    // Emits <th> inside header rows and <td> otherwise. Span attributes are
    // only written when they differ from 1.
    function convertTableCell(element, messages, options) {
        var tagName = options.isTableHeader ? "th" : "td";
        var children = convertElements(element.children, messages, options);
        var attributes = {};
        if (element.colSpan !== 1) {
            attributes.colspan = element.colSpan.toString();
        }
        if (element.rowSpan !== 1) {
            attributes.rowspan = element.rowSpan.toString();
        }

        return [
            Html.freshElement(tagName, attributes, [Html.forceWrite].concat(children))
        ];
    }

    // Comment references use the htmlPaths.ignore path unless a style mapping
    // matches. When converted, the comment is queued together with its label
    // (author initials followed by a running count).
    function convertCommentReference(reference, messages, options) {
        return findHtmlPath(reference, htmlPaths.ignore).wrap(function() {
            var comment = comments[reference.commentId];
            var count = referencedComments.length + 1;
            var label = "[" + commentAuthorLabel(comment) + count + "]";
            referencedComments.push({label: label, comment: comment});
            // TODO: remove duplication with note references
            return [
                Html.freshElement("a", {
                    href: "#" + referentHtmlId("comment", reference.commentId),
                    id: referenceHtmlId("comment", reference.commentId)
                }, [Html.text(label)])
            ];
        });
    }

    // Renders a queued comment as a dt/dd pair; the dd ends with a link back
    // to the reference in the text.
    function convertComment(referencedComment, messages, options) {
        // TODO: remove duplication with note references

        var label = referencedComment.label;
        var comment = referencedComment.comment;
        var body = convertElements(comment.body, messages, options).concat([
            Html.nonFreshElement("p", {}, [
                Html.text(" "),
                Html.freshElement("a", {"href": "#" + referenceHtmlId("comment", comment.commentId)}, [
                    Html.text("↑")
                ])
            ])
        ]);

        return [
            Html.freshElement(
                "dt",
                {"id": referentHtmlId("comment", comment.commentId)},
                [Html.text("Comment " + label)]
            ),
            Html.freshElement("dd", {}, body)
        ];
    }

    function convertBreak(element, messages, options) {
        return htmlPathForBreak(element).wrap(function() {
            return [];
        });
    }

    // A mapped style wins. Otherwise line breaks become <br> and every other
    // break type uses the empty path.
    function htmlPathForBreak(element) {
        var style = findStyle(element);
        if (style) {
            return style.to;
        } else if (element.breakType === "line") {
            return htmlPaths.topLevelElement("br");
        } else {
            return htmlPaths.empty;
        }
    }

    // Converters keyed by element.type; each receives (element, messages, options).
    var elementConverters = {
        "document": function(document, messages, options) {
            // Children are converted first so that noteReferences and
            // referencedComments are populated before the lists are built.
            var children = convertElements(document.children, messages, options);
            var notes = noteReferences.map(function(noteReference) {
                return document.notes.resolve(noteReference);
            });
            var notesNodes = convertElements(notes, messages, options);
            return children.concat([
                Html.freshElement("ol", {}, notesNodes),
                Html.freshElement("dl", {}, flatMap(referencedComments, function(referencedComment) {
                    return convertComment(referencedComment, messages, options);
                }))
            ]);
        },
        "paragraph": convertParagraph,
        "run": convertRun,
        "text": function(element, messages, options) {
            return [Html.text(element.value)];
        },
        "tab": function(element, messages, options) {
            return [Html.text("\t")];
        },
        "hyperlink": function(element, messages, options) {
            // Internal anchors get the ID prefix so they match bookmark IDs.
            var href = element.anchor ? "#" + htmlId(element.anchor) : element.href;
            var attributes = {href: href};
            if (element.targetFrame != null) {
                attributes.target = element.targetFrame;
            }

            var children = convertElements(element.children, messages, options);
            return [Html.nonFreshElement("a", attributes, children)];
        },
        "bookmarkStart": function(element, messages, options) {
            var anchor = Html.freshElement("a", {
                id: htmlId(element.name)
            }, [Html.forceWrite]);
            return [anchor];
        },
        "noteReference": function(element, messages, options) {
            noteReferences.push(element);
            var anchor = Html.freshElement("a", {
                href: "#" + noteHtmlId(element),
                id: noteRefHtmlId(element)
            }, [Html.text("[" + (noteNumber++) + "]")]);

            return [Html.freshElement("sup", {}, [anchor])];
        },
        "note": function(element, messages, options) {
            var children = convertElements(element.body, messages, options);
            var backLink = Html.elementWithTag(htmlPaths.element("p", {}, {fresh: false}), [
                Html.text(" "),
                Html.freshElement("a", {href: "#" + noteRefHtmlId(element)}, [Html.text("↑")])
            ]);
            var body = children.concat([backLink]);

            // Returned as a single node rather than a one-element list, as before.
            return Html.freshElement("li", {id: noteHtmlId(element)}, body);
        },
        "commentReference": convertCommentReference,
        "comment": convertComment,
        "image": deferredConversion(recoveringConvertImage(options.convertImage || images.dataUri)),
        "table": convertTable,
        "tableRow": convertTableRow,
        "tableCell": convertTableCell,
        "break": convertBreak
    };
    return {
        convertToHtml: convertToHtml
    };
}
414
+
415
// Counter shared by every deferred node created in this module.
var deferredId = 1;

/**
 * Wraps a converter so that calling it only produces a placeholder node.
 *
 * @param {Function} func Converter called as func(element, messages, options).
 * @returns {Function} Converter returning a one-element list holding a
 *     "deferred" node; the node's value() runs func with the captured arguments.
 */
function deferredConversion(func) {
    return function(element, messages, options) {
        var placeholder = {
            type: "deferred",
            id: deferredId++
        };
        placeholder.value = function() {
            return func(element, messages, options);
        };
        return [placeholder];
    };
}
430
+
431
/**
 * Builds the warning reported when an element's style has no mapping.
 *
 * @param {string} type Kind of element, e.g. "paragraph" or "run".
 * @param {Object} element Element whose styleName and styleId are reported.
 * @returns {*} The value produced by results.warning for the message.
 */
function unrecognisedStyleWarning(type, element) {
    var styleDescription = "Unrecognised " + type + " style: '" + element.styleName + "'";
    var idDescription = " (Style ID: " + element.styleId + ")";
    return results.warning(styleDescription + idDescription);
}
437
+
438
/**
 * Maps every value through func and flattens the results by one level.
 *
 * @param {Array} values Values to map.
 * @param {Function} func Mapping callback, invoked by Array.prototype.map.
 * @returns {Array} The mapped results, flattened one level.
 */
function flatMap(values, func) {
    var mapped = values.map(func);
    // `true` requests a shallow flatten, so nested lists inside results survive.
    return _.flatten(mapped, true);
}
441
+
442
/**
 * Visits every node of a tree in pre-order (parent before its children).
 *
 * @param {Array} nodes Nodes to visit; a node may carry a `children` array.
 * @param {Function} callback Called once with each node.
 */
function walkHtml(nodes, callback) {
    nodes.forEach(function visit(current) {
        callback(current);
        var descendants = current.children;
        if (descendants) {
            walkHtml(descendants, callback);
        }
    });
}
450
+
451
/**
 * Returns the label prefix identifying a comment's author.
 *
 * @param {Object} comment Comment whose authorInitials property is read.
 * @returns {string} The author's initials, or "" when they are missing or empty.
 */
var commentAuthorLabel = function commentAuthorLabel(comment) {
    var initials = comment.authorInitials;
    return initials ? initials : "";
};
exports.commentAuthorLabel = commentAuthorLabel;