@adobe/helix-importer 2.9.28 → 2.9.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,3 +1,17 @@
1
+ ## [2.9.30](https://github.com/adobe/helix-importer/compare/v2.9.29...v2.9.30) (2023-09-13)
2
+
3
+
4
+ ### Bug Fixes
5
+
6
+ * **deps:** update adobe fixes ([#226](https://github.com/adobe/helix-importer/issues/226)) ([bf68111](https://github.com/adobe/helix-importer/commit/bf68111284ac5bfe25879022c25f55bcee94386e))
7
+
8
+ ## [2.9.29](https://github.com/adobe/helix-importer/compare/v2.9.28...v2.9.29) (2023-09-07)
9
+
10
+
11
+ ### Bug Fixes
12
+
13
+ * cleanup mdast tree before writing as md ([#223](https://github.com/adobe/helix-importer/issues/223)) ([49f88ce](https://github.com/adobe/helix-importer/commit/49f88cec0bbc30cbf8e5806b1467a21f05ab9814)), closes [#217](https://github.com/adobe/helix-importer/issues/217) [#214](https://github.com/adobe/helix-importer/issues/214) [#166](https://github.com/adobe/helix-importer/issues/166) [#213](https://github.com/adobe/helix-importer/issues/213)
14
+
1
15
  ## [2.9.28](https://github.com/adobe/helix-importer/compare/v2.9.27...v2.9.28) (2023-09-02)
2
16
 
3
17
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@adobe/helix-importer",
3
- "version": "2.9.28",
3
+ "version": "2.9.30",
4
4
  "description": "Helix Importer tool: create md / docx from html",
5
5
  "main": "src/index.js",
6
6
  "type": "module",
@@ -27,24 +27,26 @@
27
27
  "devDependencies": {
28
28
  "@adobe/eslint-config-helix": "2.0.3",
29
29
  "@adobe/helix-docx2md": "1.4.15",
30
- "@adobe/helix-mediahandler": "2.2.12",
30
+ "@adobe/helix-mediahandler": "2.2.14",
31
31
  "@semantic-release/changelog": "6.0.3",
32
32
  "@semantic-release/exec": "6.0.3",
33
33
  "@semantic-release/git": "10.0.1",
34
34
  "c8": "8.0.1",
35
35
  "dirname-filename-esm": "1.1.1",
36
- "eslint": "8.48.0",
36
+ "eslint": "8.49.0",
37
37
  "husky": "8.0.3",
38
38
  "lint-staged": "14.0.1",
39
39
  "mocha": "10.2.0",
40
40
  "mocha-multi-reporters": "1.5.1",
41
41
  "mock-fs": "5.2.0",
42
- "semantic-release": "21.1.1"
42
+ "remark-parse": "10.0.2",
43
+ "semantic-release": "21.1.1",
44
+ "unist-util-inspect": "8.0.0"
43
45
  },
44
46
  "license": "Apache-2.0",
45
47
  "dependencies": {
46
48
  "@adobe/helix-markdown-support": "6.3.1",
47
- "@adobe/helix-md2docx": "2.1.24",
49
+ "@adobe/helix-md2docx": "2.1.25",
48
50
  "@adobe/mdast-util-gridtables": "2.0.2",
49
51
  "@adobe/remark-gridtables": "1.0.4",
50
52
  "form-data": "4.0.0",
@@ -17,35 +17,33 @@ import { JSDOM } from 'jsdom';
17
17
  import path from 'path';
18
18
  import { unified } from 'unified';
19
19
  import parse from 'rehype-parse';
20
- import rehype2remark from 'rehype-remark';
20
+ import { defaultHandlers, toMdast } from 'hast-util-to-mdast';
21
21
  import stringify from 'remark-stringify';
22
22
  import fs from 'fs-extra';
23
23
  import { md2docx } from '@adobe/helix-md2docx';
24
24
  import remarkGridTable from '@adobe/remark-gridtables';
25
- import { imageReferences, remarkGfmNoLink } from '@adobe/helix-markdown-support';
25
+ import {
26
+ imageReferences,
27
+ remarkGfmNoLink,
28
+ sanitizeHeading,
29
+ sanitizeLinks,
30
+ sanitizeTextAndFormats,
31
+ suppressSpaceCode,
32
+ } from '@adobe/helix-markdown-support';
26
33
  import gridtableHandlers from './hast-to-mdast-gridtable-handlers.js';
27
34
  import Utils from '../utils/Utils.js';
28
35
  import DOMUtils from '../utils/DOMUtils.js';
29
36
  import FileUtils from '../utils/FileUtils.js';
30
37
  import MDUtils from '../utils/MDUtils.js';
31
-
32
- function remarkImageReferences() {
33
- return imageReferences;
34
- }
35
-
36
- function htmlElementNode(element, state, node) {
37
- if (node.children && node.children.length > 0) {
38
- return [{
39
- type: 'html',
40
- value: `<${element}>`,
41
- },
42
- ...state.all(node),
43
- {
44
- type: 'html',
45
- value: `</${element}>`,
46
- }];
47
- }
48
- return '';
38
+ import formatPlugin from './mdast-to-md-format-plugin.js';
39
+
40
/**
 * Converts a hast element into an mdast parent node of the given type.
 * The element's children are converted through `state.all` and the result
 * is passed to `state.patch` together with the originating hast node.
 *
 * @param {string} type mdast node type to create (e.g. `underline`)
 * @param {object} state hast-to-mdast state providing `all` and `patch`
 * @param {object} node the hast element being converted
 * @returns {object} the new mdast node
 */
function formatNode(type, state, node) {
  const children = state.all(node);
  const mdastNode = { type, children };
  state.patch(node, mdastNode);
  return mdastNode;
}
50
48
 
51
49
  export default class PageImporter {
@@ -76,20 +74,32 @@ export default class PageImporter {
76
74
  const sanitizedName = FileUtils.sanitizeFilename(name);
77
75
  this.logger.log(`Computing Markdown for ${directory}/${sanitizedName}`);
78
76
 
79
- const processor = unified()
77
+ const html = resource.document.innerHTML;
78
+ const hast = await unified()
80
79
  .use(parse, { emitParseErrors: true })
81
- .use(rehype2remark, {
82
- handlers: {
83
- u: (state, node) => htmlElementNode('u', state, node),
84
- sub: (state, node) => htmlElementNode('sub', state, node),
85
- sup: (state, node) => htmlElementNode('sup', state, node),
86
- ...gridtableHandlers,
87
- },
88
- })
89
- .use(remarkImageReferences)
90
- .use(remarkGridTable)
91
- .use(remarkGfmNoLink)
80
+ .parse(html);
81
+
82
+ const mdast = toMdast(hast, {
83
+ handlers: {
84
+ ...defaultHandlers,
85
+ u: (state, node) => formatNode('underline', state, node),
86
+ sub: (state, node) => formatNode('subscript', state, node),
87
+ sup: (state, node) => formatNode('superscript', state, node),
88
+ ...gridtableHandlers,
89
+ },
90
+ });
91
+
92
+ // cleanup mdast similar to docx2md
93
+ await sanitizeHeading(mdast);
94
+ await sanitizeLinks(mdast);
95
+ await sanitizeTextAndFormats(mdast);
96
+ await suppressSpaceCode(mdast);
97
+ await imageReferences(mdast);
98
+
99
+ let md = await unified()
92
100
  .use(stringify, {
101
+ strong: '*',
102
+ emphasis: '_',
93
103
  bullet: '-',
94
104
  fence: '`',
95
105
  fences: true,
@@ -97,20 +107,21 @@ export default class PageImporter {
97
107
  rule: '-',
98
108
  ruleRepetition: 3,
99
109
  ruleSpaces: false,
100
- });
101
-
102
- const html = resource.document.innerHTML;
103
- const file = await processor.process(html);
104
- let contents = String(file);
110
+ })
111
+ .use(remarkGridTable)
112
+ .use(remarkGfmNoLink)
113
+ .use(formatPlugin) // this converts the `underline`, `subscript` and `superscript` nodes back to html tags in the md.
114
+ .stringify(mdast);
105
115
 
106
116
  // process image links
117
+ // TODO: this can be done more easily in the MDAST tree
107
118
  const { document } = resource;
108
119
  const assets = [];
109
120
  const imgs = document.querySelectorAll('img');
110
121
  imgs.forEach((img) => {
111
122
  const { src } = img;
112
123
  const isEmbed = img.classList.contains('hlx-embed');
113
- if (!isEmbed && src && src !== '' && (contents.indexOf(src) !== -1 || contents.indexOf(decodeURI(src)) !== -1)) {
124
+ if (!isEmbed && src && src !== '' && (md.indexOf(src) !== -1 || md.indexOf(decodeURI(src)) !== -1)) {
114
125
  assets.push({
115
126
  url: src,
116
127
  append: '#image.png',
@@ -122,7 +133,7 @@ export default class PageImporter {
122
133
  as.forEach((a) => {
123
134
  const { href } = a;
124
135
  try {
125
- if ((href && href !== '' && contents.indexOf(href) !== -1) || contents.indexOf(decodeURI(href)) !== -1) {
136
+ if ((href && href !== '' && md.indexOf(href) !== -1) || md.indexOf(decodeURI(href)) !== -1) {
126
137
  const u = new URL(href, url);
127
138
  const ext = path.extname(u.href);
128
139
  if (ext === '.mp4') {
@@ -141,7 +152,7 @@ export default class PageImporter {
141
152
  const vs = document.querySelectorAll('video source');
142
153
  vs.forEach((s) => {
143
154
  const { src } = s;
144
- if ((src && src !== '' && contents.indexOf(src) !== -1) || contents.indexOf(decodeURI(src)) !== -1) {
155
+ if ((src && src !== '' && md.indexOf(src) !== -1) || md.indexOf(decodeURI(src)) !== -1) {
145
156
  try {
146
157
  const u = new URL(src, url);
147
158
  const ext = path.extname(u.href);
@@ -167,18 +178,18 @@ export default class PageImporter {
167
178
  // adjust assets url (from relative to absolute)
168
179
  assets.forEach((asset) => {
169
180
  const u = new URL(decodeURI(asset.url), url);
170
- contents = MDUtils.replaceSrcInMarkdown(contents, asset.url, u.toString());
181
+ md = MDUtils.replaceSrcInMarkdown(md, asset.url, u.toString());
171
182
  });
172
183
 
173
184
  if (resource.prepend) {
174
- contents = resource.prepend + contents;
185
+ md = resource.prepend + md;
175
186
  }
176
187
 
177
- contents = this.postProcessMD(contents);
188
+ md = this.postProcessMD(md);
178
189
 
179
190
  return {
180
191
  path: path.join(directory, sanitizedName),
181
- content: contents,
192
+ content: md,
182
193
  };
183
194
  }
184
195
 
@@ -0,0 +1,68 @@
1
+ /*
2
+ * Copyright 2023 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
/**
 * Creates a markdown handler that renders a node as an inline html element:
 * the serialized phrasing children are wrapped in `<tagName>` and `</tagName>`.
 *
 * @param {string} tagName name of the html tag to emit (e.g. `u`, `sub`, `sup`)
 * @returns {function} handler taking `(node, parent, state, info)` and returning
 *   the serialized string
 */
function format(tagName) {
  const tagOpen = `<${tagName}>`;
  const tagClose = `</${tagName}>`;

  /**
   * @param {Node} node
   * @param {Parents | undefined} _
   * @param {State} state
   * @param {Info} info
   * @returns {string}
   */
  return (node, _, state, info) => {
    const exit = state.enter('html');
    const tracker = state.createTracker(info);
    let value = tracker.move(tagOpen);
    value += tracker.move(
      state.containerPhrasing(node, {
        before: value,
        // the children are followed by the closing tag, not by another opening tag
        after: tagClose,
        ...tracker.current(),
      }),
    );
    value += tracker.move(tagClose);
    exit();
    return value;
  };
}
42
+
43
/**
 * Builds the to-markdown extension object that maps the custom node types
 * `subscript`, `superscript` and `underline` to handlers created by `format`.
 *
 * @returns {{handlers: object}} extension object with one handler per node type
 */
function toMarkdown() {
  const handlers = {};
  handlers.subscript = format('sub');
  handlers.superscript = format('sup');
  handlers.underline = format('u');
  return { handlers };
}
52
+
53
/**
 * Plugin that registers the format handlers as a `toMarkdownExtensions` entry
 * in the data object returned by `this.data()`. The entry is appended when the
 * list already exists, otherwise a new list is created.
 *
 * Only the to-markdown direction is registered; no micromark syntax or
 * from-markdown extension is added here.
 *
 * @param {object} [options] forwarded to `toMarkdown`
 */
export default function formatPlugin(options) {
  const data = this.data();
  const field = 'toMarkdownExtensions';
  const extension = toMarkdown(options);

  /* c8 ignore next 2 */
  if (data[field]) {
    data[field].push(extension);
  } else {
    data[field] = [extension];
  }
}
@@ -21,6 +21,11 @@ import { dirname } from 'dirname-filename-esm';
21
21
 
22
22
  import { docx2md } from '@adobe/helix-docx2md';
23
23
 
24
+ import { unified } from 'unified';
25
+ import remarkParse from 'remark-parse';
26
+ import remarkGridTable from '@adobe/remark-gridtables';
27
+ // eslint-disable-next-line no-unused-vars
28
+ import { inspect, inspectNoColor } from 'unist-util-inspect';
24
29
  import PageImporter from '../../src/importer/PageImporter.js';
25
30
  import PageImporterResource from '../../src/importer/PageImporterResource.js';
26
31
  import MemoryHandler from '../../src/storage/MemoryHandler.js';
@@ -148,6 +153,22 @@ describe('PageImporter tests - fixtures', () => {
148
153
  const md = await storageHandler.get(results[0].md);
149
154
  const expectedMD = await fs.readFile(path.resolve(__dirname, 'fixtures', `${feature}.spec.md`), 'utf-8');
150
155
  strictEqual(md.trim(), expectedMD.trim(), 'imported md is expected one');
156
+
157
+ // parse md to verify mdast
158
+ const mdast = unified()
159
+ .use(remarkParse)
160
+ .use(remarkGridTable)
161
+ .use()
162
+ .parse(md);
163
+
164
+ // process.stdout.write(inspect(mdast, { showPositions: false }));
165
+ // process.stdout.write('\n');
166
+
167
+ if (await fs.pathExistsSync(path.resolve(__dirname, 'fixtures', `${feature}.spec.mdast`))) {
168
+ const actualMdast = inspectNoColor(mdast, { showPositions: false });
169
+ const expectedMdast = await fs.readFile(path.resolve(__dirname, 'fixtures', `${feature}.spec.mdast`), 'utf-8');
170
+ strictEqual(actualMdast.trim(), expectedMdast.trim(), 'imported mdast is expected one');
171
+ }
151
172
  };
152
173
 
153
174
  it('import - tables', async () => {
@@ -4,5 +4,6 @@
4
4
  <p><strong><em>&nbsp;</em></strong></p><p>usefull</p>
5
5
  <p><strong><em>emphasis</em></strong><strong><em> space </em></strong><strong><em>another emphasis</em></strong> <strong><em>last emphasis</em></strong></p>
6
6
  <p><a href="https://www.sample.com">linkcontent</a><i>. </i></p>
7
+ <p><strong><em>Side shuffle </em></strong><em>– </em>Quadriceps, glutes, hamstrings, calves</p>
7
8
  </body>
8
- </html>
9
+ </html>
@@ -2,6 +2,8 @@
2
2
 
3
3
  usefull
4
4
 
5
- ***emphasis* *space* *another emphasis* *last emphasis***
5
+ **_emphasis_ _space_ _another emphasis_ _last emphasis_**
6
6
 
7
- [linkcontent](https://www.sample.com).
7
+ [linkcontent](https://www.sample.com).
8
+
9
+ **_Side shuffle_** _–_ Quadriceps, glutes, hamstrings, calves
@@ -0,0 +1,33 @@
1
+ root[5]
2
+ ├─0 heading[1]
3
+ │ │ depth: 1
4
+ │ └─0 text "EM sample"
5
+ ├─1 paragraph[1]
6
+ │ └─0 text "usefull"
7
+ ├─2 paragraph[1]
8
+ │ └─0 strong[7]
9
+ │ ├─0 emphasis[1]
10
+ │ │ └─0 text "emphasis"
11
+ │ ├─1 text " "
12
+ │ ├─2 emphasis[1]
13
+ │ │ └─0 text "space"
14
+ │ ├─3 text " "
15
+ │ ├─4 emphasis[1]
16
+ │ │ └─0 text "another emphasis"
17
+ │ ├─5 text " "
18
+ │ └─6 emphasis[1]
19
+ │ └─0 text "last emphasis"
20
+ ├─3 paragraph[2]
21
+ │ ├─0 link[1]
22
+ │ │ │ title: null
23
+ │ │ │ url: "https://www.sample.com"
24
+ │ │ └─0 text "linkcontent"
25
+ │ └─1 text "."
26
+ └─4 paragraph[4]
27
+ ├─0 strong[1]
28
+ │ └─0 emphasis[1]
29
+ │ └─0 text "Side shuffle"
30
+ ├─1 text " "
31
+ ├─2 emphasis[1]
32
+ │ └─0 text "–"
33
+ └─3 text " Quadriceps, glutes, hamstrings, calves"
@@ -11,9 +11,6 @@ A paragraph with a br at the end and \&nbsp; " ".
11
11
 
12
12
  A paragraph followed by a br
13
13
 
14
- \
15
-
16
-
17
14
  A paragraph after the br
18
15
 
19
- A paragraph after the nbsp;
16
+ A paragraph after the nbsp;
@@ -8,20 +8,14 @@ Some normal text with random <u>underline</u> or <u>span with underline</u> or <
8
8
 
9
9
  **<u>Underline 3</u>**
10
10
 
11
- - <u>
12
- li underline 1
13
- </u>
14
- - <u>
15
- li underline 2
16
- </u>
11
+ - <u>li underline 1</u>
12
+ - <u>li underline 2</u>
17
13
  also may have text here
18
- - <u>
19
- li underline 3
20
- </u>
14
+ - <u>li underline 3</u>
21
15
 
22
16
  [Unlined link](https:/www.sample.com/a) or [<u>Linked underline</u>](https:/www.sample.com/b) ?
23
17
 
24
- <u>**Some underline and strong text**</u>
18
+ **<u>Some underline and strong text</u>**
25
19
 
26
20
  [U and A are not friends](https://www.austinparks.org/)[Boys & Girls Clubs of the Austin Area](http://www.bgcaustin.org/)[The First Tee of Greater Austin](http://www.thefirstteeaustin.org/club/scripts/public/public.asp)
27
21
 
@@ -0,0 +1,102 @@
1
+ root[10]
2
+ ├─0 heading[1]
3
+ │ │ depth: 1
4
+ │ └─0 text "Underline combo sample"
5
+ ├─1 paragraph[3]
6
+ │ ├─0 html "<u>"
7
+ │ ├─1 text "Underline 1"
8
+ │ └─2 html "</u>"
9
+ ├─2 paragraph[13]
10
+ │ ├─0 text "Some normal text with random "
11
+ │ ├─1 html "<u>"
12
+ │ ├─2 text "underline"
13
+ │ ├─3 html "</u>"
14
+ │ ├─4 text " or "
15
+ │ ├─5 html "<u>"
16
+ │ ├─6 text "span with underline"
17
+ │ ├─7 html "</u>"
18
+ │ ├─8 text " or "
19
+ │ ├─9 html "<u>"
20
+ │ ├─10 text "underline with span"
21
+ │ ├─11 html "</u>"
22
+ │ └─12 text "..."
23
+ ├─3 paragraph[1]
24
+ │ └─0 strong[3]
25
+ │ ├─0 html "<u>"
26
+ │ ├─1 text "Underline 2"
27
+ │ └─2 html "</u>"
28
+ ├─4 paragraph[1]
29
+ │ └─0 strong[3]
30
+ │ ├─0 html "<u>"
31
+ │ ├─1 text "Underline 3"
32
+ │ └─2 html "</u>"
33
+ ├─5 list[3]
34
+ │ │ ordered: false
35
+ │ │ start: null
36
+ │ │ spread: false
37
+ │ ├─0 listItem[1]
38
+ │ │ │ spread: false
39
+ │ │ │ checked: null
40
+ │ │ └─0 paragraph[3]
41
+ │ │ ├─0 html "<u>"
42
+ │ │ ├─1 text "li underline 1"
43
+ │ │ └─2 html "</u>"
44
+ │ ├─1 listItem[1]
45
+ │ │ │ spread: false
46
+ │ │ │ checked: null
47
+ │ │ └─0 paragraph[4]
48
+ │ │ ├─0 html "<u>"
49
+ │ │ ├─1 text "li underline 2"
50
+ │ │ ├─2 html "</u>"
51
+ │ │ └─3 text "\nalso may have text here"
52
+ │ └─2 listItem[1]
53
+ │ │ spread: false
54
+ │ │ checked: null
55
+ │ └─0 paragraph[3]
56
+ │ ├─0 html "<u>"
57
+ │ ├─1 text "li underline 3"
58
+ │ └─2 html "</u>"
59
+ ├─6 paragraph[4]
60
+ │ ├─0 link[1]
61
+ │ │ │ title: null
62
+ │ │ │ url: "https:/www.sample.com/a"
63
+ │ │ └─0 text "Unlined link"
64
+ │ ├─1 text " or "
65
+ │ ├─2 link[3]
66
+ │ │ │ title: null
67
+ │ │ │ url: "https:/www.sample.com/b"
68
+ │ │ ├─0 html "<u>"
69
+ │ │ ├─1 text "Linked underline"
70
+ │ │ └─2 html "</u>"
71
+ │ └─3 text " ?"
72
+ ├─7 paragraph[1]
73
+ │ └─0 strong[3]
74
+ │ ├─0 html "<u>"
75
+ │ ├─1 text "Some underline and strong text"
76
+ │ └─2 html "</u>"
77
+ ├─8 paragraph[3]
78
+ │ ├─0 link[1]
79
+ │ │ │ title: null
80
+ │ │ │ url: "https://www.austinparks.org/"
81
+ │ │ └─0 text "U and A are not friends"
82
+ │ ├─1 link[1]
83
+ │ │ │ title: null
84
+ │ │ │ url: "http://www.bgcaustin.org/"
85
+ │ │ └─0 text "Boys & Girls Clubs of the Austin Area"
86
+ │ └─2 link[1]
87
+ │ │ title: null
88
+ │ │ url: "http://www.thefirstteeaustin.org/club/scripts/public/public.asp"
89
+ │ └─0 text "The First Tee of Greater Austin"
90
+ └─9 paragraph[3]
91
+ ├─0 link[1]
92
+ │ │ title: null
93
+ │ │ url: "https://www.austinparks.org/"
94
+ │ └─0 text "U and A are not friends"
95
+ ├─1 link[1]
96
+ │ │ title: null
97
+ │ │ url: "http://www.bgcaustin.org/"
98
+ │ └─0 text "Boys & Girls Clubs of the Austin Area"
99
+ └─2 link[1]
100
+ │ title: null
101
+ │ url: "http://www.thefirstteeaustin.org/club/scripts/public/public.asp"
102
+ └─0 text "The First Tee of Greater Austin"