@adobe/helix-docx2md 1.1.3 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,3 +1,17 @@
1
+ # [1.2.0](https://github.com/adobe/helix-docx2md/compare/v1.1.4...v1.2.0) (2022-10-25)
2
+
3
+
4
+ ### Features
5
+
6
+ * add support for document internal links and bookmarks ([#140](https://github.com/adobe/helix-docx2md/issues/140)) ([90bc3af](https://github.com/adobe/helix-docx2md/commit/90bc3afb3c05819ed285bb6deedf0fff89ba52b3)), closes [#134](https://github.com/adobe/helix-docx2md/issues/134)
7
+
8
+ ## [1.1.4](https://github.com/adobe/helix-docx2md/compare/v1.1.3...v1.1.4) (2022-10-05)
9
+
10
+
11
+ ### Bug Fixes
12
+
13
+ * **deps:** update dependency @adobe/helix-markdown-support to v5.0.10 ([#129](https://github.com/adobe/helix-docx2md/issues/129)) ([33fa129](https://github.com/adobe/helix-docx2md/commit/33fa129f80e61ca62e6db4bdac41d87673bd8bfb))
14
+
1
15
  ## [1.1.3](https://github.com/adobe/helix-docx2md/compare/v1.1.2...v1.1.3) (2022-09-28)
2
16
 
3
17
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@adobe/helix-docx2md",
3
- "version": "1.1.3",
3
+ "version": "1.2.0",
4
4
  "description": "Helix library that converts word documents to markdown",
5
5
  "type": "module",
6
6
  "main": "./src/index.js",
@@ -33,10 +33,11 @@
33
33
  },
34
34
  "homepage": "https://github.com/adobe/helix-docx2md#readme",
35
35
  "dependencies": {
36
- "@adobe/helix-markdown-support": "5.0.9",
36
+ "@adobe/helix-markdown-support": "5.0.10",
37
37
  "@adobe/helix-shared-process-queue": "1.1.5",
38
38
  "@adobe/mammoth": "1.5.1-bleeding.1",
39
39
  "dirname-filename-esm": "1.1.1",
40
+ "github-slugger": "1.4.0",
40
41
  "mdast-util-to-markdown": "1.3.0",
41
42
  "mdast-util-to-string": "3.1.0",
42
43
  "remark-gfm": "3.0.1",
@@ -49,20 +50,20 @@
49
50
  },
50
51
  "devDependencies": {
51
52
  "@adobe/eslint-config-helix": "1.3.2",
52
- "@adobe/helix-mediahandler": "1.2.5",
53
+ "@adobe/helix-mediahandler": "1.2.12",
53
54
  "@semantic-release/changelog": "6.0.1",
54
55
  "@semantic-release/exec": "6.0.3",
55
56
  "@semantic-release/git": "10.0.1",
56
57
  "c8": "7.12.0",
57
- "dotenv": "16.0.2",
58
- "eslint": "8.24.0",
58
+ "dotenv": "16.0.3",
59
+ "eslint": "8.26.0",
59
60
  "eslint-import-resolver-exports": "1.0.0-beta.3",
60
61
  "eslint-plugin-header": "3.1.1",
61
62
  "eslint-plugin-import": "2.26.0",
62
63
  "husky": "8.0.1",
63
64
  "junit-report-builder": "3.0.1",
64
65
  "lint-staged": "13.0.3",
65
- "mocha": "10.0.0",
66
+ "mocha": "10.1.0",
66
67
  "mocha-multi-reporters": "1.5.1",
67
68
  "semantic-release": "19.0.5",
68
69
  "unist-util-inspect": "7.0.1"
@@ -11,8 +11,10 @@
11
11
  */
12
12
 
13
13
  /* eslint-disable no-param-reassign */
14
+ import { toString } from 'mdast-util-to-string';
14
15
  import one from './one.js';
15
16
  import handlers from './handlers/index.js';
17
+ import IDSlugger from './id-slugger.js';
16
18
 
17
19
  /**
18
20
  * @typedef {Node} List
@@ -28,6 +30,11 @@ import handlers from './handlers/index.js';
28
30
  * Note: that the stack is reversed, i.e. the first is the deepest one.
29
31
  * @typedef {ListStack[]} ListContainers
30
32
  *
33
+ * @typedef Bookmark
34
+ * @property {string} name
35
+ * @property {Node} target
36
+ * @property {Node[]} links
37
+ *
31
38
  * Converts the docx AST to markdown ast.
32
39
  * @param {object} tree the docx ast
33
40
  * @param {object} opts options
@@ -62,9 +69,57 @@ export default async function dast2mdast(tree, opts = {}) {
62
69
  h.handlers = handlers;
63
70
  h.numbering = {};
64
71
  h.gridtables = opts.gridtables;
72
+ h.bookmarks = {};
65
73
 
66
74
  /** @type {ListContainers} */
67
75
  h.listContainers = [[]];
68
76
 
69
- return one(h, tree, null);
77
+ /**
78
+ * @param {string} name
79
+ * @returns {Bookmark}
80
+ */
81
+ h.getBookmark = (name) => {
82
+ let bm = h.bookmarks[name];
83
+ if (!bm) {
84
+ bm = {
85
+ name,
86
+ target: null,
87
+ links: [],
88
+ };
89
+ h.bookmarks[name] = bm;
90
+ }
91
+ return bm;
92
+ };
93
+
94
+ const mdast = one(h, tree, null);
95
+
96
+ // process bookmarks. note that we _should_ re-slug them after the headings are sanitized in
97
+ // mdast2md. another option would be to keep the `bookmark` nodes in the mdast and only
98
+ // process them in mdast2md. but then, the dast2mdast would produce non standard mdast.
99
+ let slugger;
100
+ for (const bm of Object.values(h.bookmarks)) {
101
+ if (!bm.target) {
102
+ // eslint-disable-next-line no-continue
103
+ continue;
104
+ }
105
+ if (!slugger) {
106
+ slugger = new IDSlugger();
107
+ }
108
+ // if heading, create an ID from its text
109
+ if (bm.target.type === 'heading') {
110
+ const text = toString(bm.target).trim();
111
+ bm.id = slugger.slug(text || 'heading');
112
+ bm.target.id = bm.id;
113
+ } else {
114
+ // create an anchor node for non-heading bookmarks
115
+ bm.id = slugger.slug('bookmark');
116
+ bm.target.type = 'html';
117
+ bm.target.value = `<a id="${bm.id}"></a>`;
118
+ }
119
+ // adjust all links uris to the id
120
+ for (const link of bm.links) {
121
+ link.url = `#${bm.id}`;
122
+ }
123
+ }
124
+ return mdast;
70
125
  }
@@ -0,0 +1,24 @@
1
+ /*
2
+ * Copyright 2019 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+
13
/**
 * Handler for bookmark nodes of the docx AST. Word inserts a bookmark wherever a
 * document internal link points to. The handler only records the bookmark: it creates a
 * placeholder `bookmark` node and cross-links it with the bookmark record obtained from
 * `h.getBookmark()`. The placeholder is resolved in later conversion steps, outside of this
 * handler.
 *
 * @param {Function} h node builder; also provides `getBookmark(name)`
 * @param {object} node the docx bookmark node; only its `name` is read
 * @returns {object} the placeholder node, carrying a back reference in `.bookmark`
 */
export default function bookmark(h, node) {
  const record = h.getBookmark(node.name);
  const placeholder = h('bookmark', node.name);
  // link both directions so that the record can be found again from the placeholder node
  record.target = placeholder;
  placeholder.bookmark = record;
  return placeholder;
}
@@ -12,8 +12,18 @@
12
12
  import all from '../all.js';
13
13
 
14
14
  export default function hyperlink(h, node) {
15
- return h('link', {
15
+ const link = h('link', {
16
16
  url: node.href || '',
17
17
  title: node.title,
18
18
  }, all(h, node));
19
+
20
+ // document internal links will have an `anchor` property and are managed as bookmarks
21
+ // after the document is processed, the uris will be adjusted to point to the correct bookmark id.
22
+ if (node.anchor) {
23
+ // only bookmark links have anchors
24
+ const bm = h.getBookmark(node.anchor);
25
+ bm.links.push(link);
26
+ }
27
+
28
+ return link;
19
29
  }
@@ -20,6 +20,7 @@ import tableRow from './table-row.js';
20
20
  import tableCell from './table-cell.js';
21
21
  import text from './text.js';
22
22
  import run from './run.js';
23
+ import bookmarkStart from './bookmark.js';
23
24
 
24
25
  export default {
25
26
  document,
@@ -32,4 +33,5 @@ export default {
32
33
  tableRow,
33
34
  tableCell,
34
35
  image,
36
+ bookmarkStart,
35
37
  };
@@ -172,8 +172,15 @@ export default function paragraph(h, node, parent, siblings) {
172
172
  if (toString(nodes).trim() === '---') {
173
173
  return h('thematicBreak');
174
174
  }
175
- // sanitize children
176
- return h('heading', { depth }, nodes);
175
+ const heading = h('heading', { depth }, nodes);
176
+ // check if any of the children is a bookmark
177
+ const idx = nodes.findIndex((n) => n.type === 'bookmark');
178
+ if (idx >= 0) {
179
+ // replace the bookmark node with this heading and remove the child
180
+ nodes[idx].bookmark.target = heading;
181
+ nodes.splice(idx, 1);
182
+ }
183
+ return heading;
177
184
  }
178
185
 
179
186
  // check for codeblock
@@ -0,0 +1,38 @@
1
+ /*
2
+ * Copyright 2022 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+ import GithubSlugger from 'github-slugger';
13
+
14
/**
 * Creates unique IDs from arbitrary text. The text is slugged with the static
 * `GithubSlugger.slug()`, a leading "digits followed by dashes" prefix is removed, and
 * collisions with previously generated IDs are resolved by appending a `-<n>` suffix.
 */
export default class IDSlugger {
  constructor() {
    // Registry of all IDs handed out so far, mapped to the last suffix counter used for them.
    // A null-prototype object is required here: with a plain `{}`, the `in` check below also
    // matches inherited members, so a slug like `constructor` or `valueof` would be treated
    // as a collision and produce a garbage ID.
    this.occurrences = Object.create(null);
  }

  /**
   * Generate a unique slug.
   * @param {string} value String of text to slugify
   * @return {string} A unique slug string
   */
  slug(value) {
    const original = GithubSlugger.slug(value)
      // remove leading numbers
      .replace(/^\d+-+/, '');

    // resolve collisions: bump the counter of the base slug until the candidate is unused
    let id = original;
    while (id in this.occurrences) {
      this.occurrences[original] += 1;
      id = `${original}-${this.occurrences[original]}`;
    }
    this.occurrences[id] = 0;
    return id;
  }
}