@adobe/helix-importer 3.0.0 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. package/CHANGELOG.md +7 -0
  2. package/package.json +1 -1
  3. package/src/importer/HTML2x.js +2 -21
  4. package/src/importer/defaults/generateDocumentPath.js +24 -0
  5. package/src/importer/defaults/rules/adjustImageUrls.js +28 -0
  6. package/src/importer/defaults/rules/convertIcons.js +25 -0
  7. package/src/importer/defaults/rules/createMetadata.js +102 -0
  8. package/src/importer/defaults/rules/transformBackgroundImages.js +21 -0
  9. package/src/importer/defaults/transformDOM.js +42 -0
  10. package/src/index.js +13 -0
  11. package/src/utils/DOMUtils.js +4 -2
  12. package/test/TestUtils.js +21 -0
  13. package/test/browser/DOMUtils.test.js +67 -0
  14. package/test/importers/HTML2x.spec.js +6 -35
  15. package/test/importers/PageImporter.spec.js +4 -0
  16. package/test/importers/defaults/fixtures/adjust-image-urls.expected.html +7 -0
  17. package/test/importers/defaults/fixtures/adjust-image-urls.input.html +10 -0
  18. package/test/importers/defaults/fixtures/background-image.expected.html +13 -0
  19. package/test/importers/defaults/fixtures/background-image.input.html +10 -0
  20. package/test/importers/defaults/fixtures/cleanup.expected.html +5 -0
  21. package/test/importers/defaults/fixtures/cleanup.input.html +11 -0
  22. package/test/importers/defaults/fixtures/default.expected.html +4 -0
  23. package/test/importers/defaults/fixtures/default.input.html +6 -0
  24. package/test/importers/defaults/fixtures/icons.expected.html +4 -0
  25. package/test/importers/defaults/fixtures/icons.input.html +6 -0
  26. package/test/importers/defaults/fixtures/metadata.all.diff.expected.html +40 -0
  27. package/test/importers/defaults/fixtures/metadata.all.diff.input.html +17 -0
  28. package/test/importers/defaults/fixtures/metadata.all.same.expected.html +20 -0
  29. package/test/importers/defaults/fixtures/metadata.all.same.input.html +17 -0
  30. package/test/importers/defaults/fixtures/metadata.basic.expected.html +16 -0
  31. package/test/importers/defaults/fixtures/metadata.basic.input.html +9 -0
  32. package/test/importers/defaults/fixtures/metadata.image.expected.html +12 -0
  33. package/test/importers/defaults/fixtures/metadata.image.input.html +9 -0
  34. package/test/importers/defaults/fixtures/metadata.og.expected.html +16 -0
  35. package/test/importers/defaults/fixtures/metadata.og.input.html +9 -0
  36. package/test/importers/defaults/fixtures/metadata.twitter.expected.html +16 -0
  37. package/test/importers/defaults/fixtures/metadata.twitter.input.html +9 -0
  38. package/test/importers/defaults/generateDocumentPath.spec.js +32 -0
  39. package/test/importers/defaults/transformDOM.spec.js +94 -0
  40. package/test/importers/fixtures/video.spec.html +11 -0
  41. package/test/importers/fixtures/video.spec.md +7 -0
  42. package/test/utils/DOMUtils.spec.js +3 -4
package/CHANGELOG.md CHANGED
@@ -1,3 +1,10 @@
1
+ # [3.1.0](https://github.com/adobe/helix-importer/compare/v3.0.0...v3.1.0) (2023-11-13)
2
+
3
+
4
+ ### Features
5
+
6
+ * improve default import ([#261](https://github.com/adobe/helix-importer/issues/261)) ([251cfcd](https://github.com/adobe/helix-importer/commit/251cfcdde7ac54525eef49341a9ede52d368cf71))
7
+
1
8
  # [3.0.0](https://github.com/adobe/helix-importer/compare/v2.9.41...v3.0.0) (2023-11-13)
2
9
 
3
10
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@adobe/helix-importer",
3
- "version": "3.0.0",
3
+ "version": "3.1.0",
4
4
  "description": "Helix Importer tool: create md / docx from html",
5
5
  "main": "src/index.js",
6
6
  "type": "module",
@@ -17,6 +17,8 @@ import PageImporterResource from './PageImporterResource.js';
17
17
  import MemoryHandler from '../storage/MemoryHandler.js';
18
18
  import Utils from '../utils/Utils.js';
19
19
  import BrowserUtils from '../utils/BrowserUtils.js';
20
+ import defaultTransformDOM from './defaults/transformDOM.js';
21
+ import defaultGenerateDocumentPath from './defaults/generateDocumentPath.js';
20
22
 
21
23
  // import docxStylesXML from '../resources/styles.xml';
22
24
 
@@ -35,27 +37,6 @@ function setBackgroundImagesFromCSS(document) {
35
37
  }
36
38
  }
37
39
 
38
- async function defaultTransformDOM({
39
- // eslint-disable-next-line no-unused-vars
40
- url, document, html, params,
41
- }) {
42
- return document.body;
43
- }
44
-
45
- async function defaultGenerateDocumentPath({
46
- // eslint-disable-next-line no-unused-vars
47
- url, document, html, params,
48
- }) {
49
- let p = new URL(url).pathname;
50
- if (p.endsWith('/')) {
51
- p = `${p}index`;
52
- }
53
- return decodeURIComponent(p)
54
- .toLowerCase()
55
- .replace(/\.html$/, '')
56
- .replace(/[^a-z0-9/]/gm, '-');
57
- }
58
-
59
40
  async function html2x(
60
41
  url,
61
42
  doc,
@@ -0,0 +1,24 @@
1
+ /*
2
+ * Copyright 2023 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
/**
 * Default document path generator: derives the target document path from the
 * pathname of the page URL.
 *
 * - a pathname ending with `/` is completed with `index`
 * - percent-encoded characters are decoded; when the pathname contains a
 *   malformed escape sequence (e.g. a literal `%`), the raw pathname is used
 *   instead of failing the whole import
 * - the result is lower-cased, a trailing `.html` is dropped and every
 *   character other than `a-z`, `0-9` and `/` is replaced by `-`
 *
 * @param {object} options
 * @param {string} options.url URL of the page being imported
 * @returns {Promise<string>} The sanitized document path
 * @throws {TypeError} If `url` cannot be parsed as an absolute URL
 */
export default async function generateDocumentPath({
  // eslint-disable-next-line no-unused-vars
  url, document, html, params,
}) {
  let p = new URL(url).pathname;
  if (p.endsWith('/')) {
    p = `${p}index`;
  }

  let decoded;
  try {
    decoded = decodeURIComponent(p);
  } catch (e) {
    if (!(e instanceof URIError)) {
      throw e;
    }
    // malformed percent-encoding: keep the raw pathname, the sanitizing
    // below turns the offending characters into dashes anyway
    decoded = p;
  }

  return decoded
    .toLowerCase()
    .replace(/\.html$/, '')
    .replace(/[^a-z0-9/]/g, '-');
}
@@ -0,0 +1,28 @@
1
+ /*
2
+ * Copyright 2023 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+
13
/**
 * Makes the `src` of the images found below `main` absolute.
 *
 * Every source that does not start with a URL scheme (`https:`, `data:`,
 * `blob:`, ...) is treated as a relative reference and resolved against the
 * page URL. This covers `/path`, `./path`, `../path`, protocol-relative
 * `//host/path` and bare references such as `assets/image.png`.
 * Images whose source cannot be resolved are removed from the DOM.
 *
 * @param {Element} main Root element whose images are adjusted in place
 * @param {string} url URL of the page, used as base for the resolution
 */
export default function adjustImageUrls(main, url) {
  // a source starting with a scheme is already absolute (or inline data)
  const hasScheme = /^[a-z][a-z0-9+.-]*:/i;

  [...main.querySelectorAll('img')].forEach((img) => {
    const src = img.getAttribute('src');
    if (!src || hasScheme.test(src)) {
      return;
    }
    try {
      const u = new URL(src, url);
      // eslint-disable-next-line no-param-reassign
      img.src = u.toString();
    } catch (e) {
      // eslint-disable-next-line no-console
      console.log(`Unable to adjust image URL ${img.src} - removing image`);
      img.remove();
    }
  });
}
@@ -0,0 +1,25 @@
1
+ /*
2
+ * Copyright 2023 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+
13
/**
 * Replaces the SVG images found below `main` by a `<span>:name:</span>` icon
 * placeholder.
 *
 * The icon name is the file name without its `.svg` extension, lower-cased,
 * trimmed, with every character other than `a-z` and `0-9` replaced by `-`.
 * The extension check is case-insensitive and ignores a query string or
 * fragment; only the extension is stripped, so `arrow.right.svg` becomes
 * `:arrow-right:` and stays distinct from `arrow.left.svg`.
 * Images for which no name can be computed are left untouched.
 *
 * @param {Element} main Root element whose images are converted in place
 * @param {Document} document Document used to create the replacement spans
 */
export default function convertIcons(main, document) {
  const extension = '.svg';

  [...main.querySelectorAll('img')].forEach((img) => {
    const src = img.getAttribute('src');
    if (!src) {
      return;
    }

    // the query string / fragment is not part of the file name
    const [path] = src.split(/[?#]/);
    if (!path.toLowerCase().endsWith(extension)) {
      return;
    }

    const name = path
      .split('/')
      .pop()
      .slice(0, -extension.length)
      .toLowerCase()
      .trim()
      .replace(/[^a-z0-9]/g, '-');

    if (name) {
      const span = document.createElement('span');
      span.textContent = `:${name}:`;
      img.replaceWith(span);
    }
  });
}
@@ -0,0 +1,102 @@
1
+ /*
2
+ * Copyright 2023 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+
13
+ import Blocks from '../../../utils/Blocks.js';
14
+
15
/**
 * Reads the content of the `<meta>` elements of the document head declared
 * with the given name.
 *
 * Both the `property` and the `name` attributes are looked up: Open Graph tags
 * use `property`, while Twitter card tags are commonly declared with `name`.
 * Empty and duplicate values are ignored; the remaining ones are joined with
 * `, ` in document order.
 *
 * @param {string} name Metadata name, e.g. `description` or `og:title`
 * @param {Document} document Document to read from
 * @returns {string} The joined content values, or an empty string if none
 */
function getMetadata(name, document) {
  if (!name) {
    return '';
  }
  const selector = `meta[property="${name}"], meta[name="${name}"]`;
  const values = [...document.head.querySelectorAll(selector)]
    .map((m) => m.content)
    .filter((content) => content);
  return [...new Set(values)].join(', ');
}
22
+
23
/**
 * Collects the page metadata (title, description, image and their Open Graph /
 * Twitter variants) and, when at least one entry was found, appends the
 * corresponding metadata block to `main`.
 *
 * An Open Graph / Twitter title or description is used as `Title` /
 * `Description` when none was found so far; otherwise it is stored under its
 * own name, but only if it differs from the value already collected.
 * The Twitter image is stored only if it differs from the Open Graph image.
 *
 * @param {Element} main Element the metadata block is appended to
 * @param {Document} document Document to read the metadata from
 * @returns {object} The collected metadata, keyed by metadata name
 */
export default function createMetadata(main, document) {
  const meta = {};

  // creates the <img> for an image metadata and its alternative text metadata
  const toImage = (src, altName) => {
    const el = document.createElement('img');
    el.src = src;
    const alt = getMetadata(altName, document);
    if (alt) {
      el.alt = alt;
    }
    return el;
  };

  const titleEl = document.querySelector('title');
  if (titleEl) {
    meta.Title = titleEl.textContent.replace(/[\n\t]/gm, '');
  }

  const description = getMetadata('description', document);
  if (description) {
    meta.Description = description;
  }

  const ogImage = getMetadata('og:image', document);
  if (ogImage) {
    meta.Image = toImage(ogImage, 'og:image:alt');
  }

  // [metadata name, main entry it falls back to] - order defines the row order
  const variants = [
    ['og:title', 'Title'],
    ['og:description', 'Description'],
    ['twitter:title', 'Title'],
    ['twitter:description', 'Description'],
  ];
  variants.forEach(([name, mainEntry]) => {
    const value = getMetadata(name, document);
    if (!value || value === meta[mainEntry]) {
      return;
    }
    if (meta[mainEntry]) {
      meta[name] = value;
    } else {
      meta[mainEntry] = value;
    }
  });

  const twitterImage = getMetadata('twitter:image', document);
  if (twitterImage && twitterImage !== ogImage) {
    meta['twitter:image'] = toImage(twitterImage, 'twitter:image:alt');
  }

  if (Object.keys(meta).length > 0) {
    main.append(Blocks.getMetadataBlock(document, meta));
  }

  return meta;
}
@@ -0,0 +1,21 @@
1
+ /*
2
+ * Copyright 2023 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+
13
+ import DOMUtils from '../../../utils/DOMUtils.js';
14
+
15
/**
 * Converts inline CSS background images into real `<img>` elements.
 *
 * For every element below `main` whose `style` attribute declares a
 * `background-image: url(...)` (with or without a space after the colon), the
 * image is extracted, prepended to the element and the `background-image`
 * style property is removed. Elements for which no image can be extracted are
 * left untouched.
 *
 * @param {Element} main Root element whose descendants are transformed in place
 * @param {Document} document Document used to create the `<img>` elements
 */
export default function transformBackgroundImages(main, document) {
  const selector = '[style*="background-image: url"], [style*="background-image:url"]';

  [...main.querySelectorAll(selector)].forEach((element) => {
    const img = DOMUtils.getImgFromBackground(element, document);
    if (!img) {
      // prepend(null) would insert the text "null" into the element
      return;
    }
    element.prepend(img);
    element.style.removeProperty('background-image');
  });
}
@@ -0,0 +1,42 @@
1
+ /*
2
+ * Copyright 2023 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+ import DOMUtils from '../../utils/DOMUtils.js';
13
+ import createMetadata from './rules/createMetadata.js';
14
+ import adjustImageUrls from './rules/adjustImageUrls.js';
15
+ import convertIcons from './rules/convertIcons.js';
16
+ import transformBackgroundImages from './rules/transformBackgroundImages.js';
17
+
18
/**
 * Default DOM transformation: uses the document body as main element, removes
 * the elements that usually are not content and applies the default rules
 * (metadata, background images, image urls, icons) in that order.
 *
 * @param {object} options
 * @param {string} options.url URL of the page being imported
 * @param {Document} options.document Document to transform (modified in place)
 * @returns {Promise<Element>} The transformed body element
 */
export default async function transformDOM({
  // eslint-disable-next-line no-unused-vars
  url, document, html, params,
}) {
  const { body } = document;

  // attempt to remove non-content elements
  const nonContentSelectors = ['header', '.header', 'nav', '.nav', 'footer', '.footer', 'iframe', 'noscript'];
  DOMUtils.remove(body, nonContentSelectors);

  createMetadata(body, document);
  transformBackgroundImages(body, document);
  adjustImageUrls(body, url);
  convertIcons(body, document);

  return body;
}
package/src/index.js CHANGED
@@ -27,6 +27,18 @@ import WPUtils from './wp/WPUtils.js';
27
27
 
28
28
  import { html2md, html2docx } from './importer/HTML2x.js';
29
29
 
30
+ import createMetadata from './importer/defaults/rules/createMetadata.js';
31
+ import adjustImageUrls from './importer/defaults/rules/adjustImageUrls.js';
32
+ import convertIcons from './importer/defaults/rules/convertIcons.js';
33
+ import transformBackgroundImages from './importer/defaults/rules/transformBackgroundImages.js';
34
+
35
+ const rules = {
36
+ createMetadata,
37
+ adjustImageUrls,
38
+ convertIcons,
39
+ transformBackgroundImages,
40
+ };
41
+
30
42
  export {
31
43
  PageImporter,
32
44
  PageImporterParams,
@@ -42,4 +54,5 @@ export {
42
54
  WPUtils,
43
55
  html2md,
44
56
  html2docx,
57
+ rules,
45
58
  };
@@ -258,7 +258,9 @@ export default class DOMUtils {
258
258
  const styleAttr = element?.getAttribute('style')?.split(';');
259
259
  if (styleAttr) {
260
260
  styleAttr.forEach((style) => {
261
- const [prop, value] = style.split(':');
261
+ const split = style.split(':');
262
+ const prop = split.shift();
263
+ const value = split.join(':').trim();
262
264
  if (prop === 'background-image') {
263
265
  const trimmedValue = value.replace(/\s/g, '');
264
266
  const elStyle = element.style;
@@ -267,7 +269,7 @@ export default class DOMUtils {
267
269
  });
268
270
  const url = element.style.backgroundImage;
269
271
  if (url && url.toLowerCase() !== 'none') {
270
- const src = url.replace(/url\(/gm, '').replace(/'/gm, '').replace(/\)/gm, '');
272
+ const src = url.replace(/url\(/gm, '').replace(/'/gm, '').replace(/"/gm, '').replace(/\)/gm, '');
271
273
  const img = document.createElement('img');
272
274
  img.src = src;
273
275
  return img;
@@ -0,0 +1,21 @@
1
+ /*
2
+ * Copyright 2023 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+
13
+ import { JSDOM } from 'jsdom';
14
+
15
/**
 * Helpers shared by the tests.
 */
export default class TestUtils {
  /**
   * Test environment version of `createDocumentFromString`, backed by JSDOM.
   * @param {string} html HTML to parse
   * @returns {Document} The document of the JSDOM window created for the HTML
   */
  static createDocumentFromString(html) {
    const dom = new JSDOM(html, { runScripts: undefined });
    return dom.window.document;
  }
}
+ }
@@ -0,0 +1,67 @@
1
+ /*
2
+ * Copyright 2023 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+ /* eslint-env mocha */
13
+ /* eslint-disable no-unused-expressions */
14
+
15
+ import { expect } from '@esm-bundle/chai';
16
+ import BrowserUtils from '../../src/utils/BrowserUtils.js';
17
+ import DOMUtils from '../../src/utils/DOMUtils.js';
18
+
19
/**
 * Creates an element for the tests.
 *
 * Only the own enumerable entries of `attrs` and `styles` are applied, and a
 * missing `innerHTML` results in an empty element rather than in the text
 * "undefined".
 *
 * @param {Document} document Document used to create the element
 * @param {string} tag Tag name of the element
 * @param {object} [attrs] Attributes to set, keyed by attribute name
 * @param {object} [styles] Inline styles to set, keyed by CSS property name
 * @param {string} [innerHTML] Markup assigned to the element content
 * @returns {Element} The created element
 */
const createElement = (document, tag, attrs, styles, innerHTML = '') => {
  const element = document.createElement(tag);
  Object.entries(attrs ?? {}).forEach(([name, value]) => {
    element.setAttribute(name, value);
  });
  Object.entries(styles ?? {}).forEach(([prop, value]) => {
    element.style[prop] = value;
  });
  element.innerHTML = innerHTML;
  return element;
};
32
+
33
describe('DOMUtils#element', () => {
  // Creates the element in a fresh document, extracts the image from its
  // background and compares it to the expected markup (null: no image expected).
  const test = (tag, attrs, styles, innerHTML, expected) => {
    const document = BrowserUtils.createDocumentFromString('<html><body></body></html>');
    const img = DOMUtils.getImgFromBackground(
      createElement(document, tag, attrs, styles, innerHTML),
      document,
    );
    if (!expected) {
      expect(img).to.be.null;
      return;
    }
    expect(img).to.not.be.null;
    expect(img.outerHTML).to.equal(expected);
  };

  it('no background-image style', () => {
    test('p', {}, {}, 'Some content', null);
    test('img', { src: 'https://www.server.com/image.jpg', title: 'Some title' }, {}, '', null);
    test('p', {}, { 'background-image': 'none' }, 'Some content', null);
  });

  it('with background-image style', () => {
    // unquoted, double-quoted and single-quoted urls give the same image
    [
      'url(https://www.server.com/image.jpg)',
      'url("https://www.server.com/image.jpg")',
      'url(\'https://www.server.com/image.jpg\')',
    ].forEach((backgroundImage) => {
      test('p', {}, { 'background-image': backgroundImage }, 'Some content', '<img src="https://www.server.com/image.jpg">');
    });
    test('p', {}, { 'background-image': 'url(http://localhost:3001/image.jpg)' }, 'Some content', '<img src="http://localhost:3001/image.jpg">');
  });

  // `createElement` uses JSDOM to create the test-DOM
  // the workaround in DOMUtils#getImgFromBackground exists _precisely_
  // because of a potential bug in JSDOM due to which it doesn't
  // parse `url()` with whitespaces correctly
  // browser specific version of the test
  it('with background-image style containing whitespace in url()', () => {
    test('p', {}, { 'background-image': 'url( /image.jpg )' }, 'Some content', '<img src="/image.jpg">');
  });
});
@@ -14,7 +14,6 @@ import {
14
14
  deepStrictEqual, ok, strictEqual, fail,
15
15
  } from 'assert';
16
16
  import { describe, it } from 'mocha';
17
- import { JSDOM } from 'jsdom';
18
17
  import { docx2md } from '@adobe/helix-docx2md';
19
18
  import MockMediaHandler from '../mocks/MockMediaHandler.js';
20
19
 
@@ -22,39 +21,11 @@ import DOMUtils from '../../src/utils/DOMUtils.js';
22
21
  import {
23
22
  html2md,
24
23
  html2docx,
25
- defaultGenerateDocumentPath,
26
- defaultTransformDOM,
27
24
  } from '../../src/importer/HTML2x.js';
28
25
 
29
- // test environment createDocumentFromString version using JSDOM
30
- const createDocumentFromString = (html) => {
31
- const { document } = new JSDOM(html, { runScripts: undefined }).window;
32
- return document;
33
- };
34
-
35
- describe('defaultTransformDOM tests', () => {
36
- it('default transformation', async () => {
37
- const document = createDocumentFromString('<html><body><h1>Hello World</h1></body></html>');
38
- const out = await defaultTransformDOM({ document });
39
- strictEqual(out.outerHTML, '<body><h1>Hello World</h1></body>');
40
- });
41
- });
26
+ import TestUtils from '../TestUtils.js';
42
27
 
43
- describe('defaultGenerateDocumentPath tests', () => {
44
- it('default paths', async () => {
45
- strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com' }), '/index');
46
- strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/' }), '/index');
47
- strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/index.html' }), '/index');
48
- strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/index' }), '/index');
49
- strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/page' }), '/page');
50
- strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/page.html' }), '/page');
51
- strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/folder/page' }), '/folder/page');
52
- strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/folder/page.html' }), '/folder/page');
53
- strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/folder/page/' }), '/folder/page/index');
54
- strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/folder/page with spaces.html' }), '/folder/page-with-spaces');
55
- strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/folder/PagE_with_3xtr4_charactére.html' }), '/folder/page-with-3xtr4-charact-re');
56
- });
57
- });
28
+ const { createDocumentFromString } = TestUtils;
58
29
 
59
30
  describe('html2x parameters', () => {
60
31
  const URL = 'https://www.sample.com/page.html';
@@ -408,8 +379,8 @@ describe('html2md tests', () => {
408
379
  const out = await html2md('https://www.sample.com/page.html', '<html><body><img src="data:abc" data-src="./image.jpg"></body></html>', null, {
409
380
  createDocumentFromString,
410
381
  });
411
- strictEqual(out.html.trim(), '<body><img src="./image.jpg" data-src="./image.jpg"></body>');
412
- strictEqual(out.md.trim(), '![][image0]\n\n[image0]: ./image.jpg');
382
+ strictEqual(out.html.trim(), '<body><img src="https://www.sample.com/image.jpg" data-src="./image.jpg"></body>');
383
+ strictEqual(out.md.trim(), '![][image0]\n\n[image0]: https://www.sample.com/image.jpg');
413
384
  });
414
385
 
415
386
  it('html2md allows to preprocess the document', async () => {
@@ -422,8 +393,8 @@ describe('html2md tests', () => {
422
393
  }, {
423
394
  createDocumentFromString,
424
395
  });
425
- strictEqual(out.html.trim(), '<body><img src="./image.jpg"></body>');
426
- strictEqual(out.md.trim(), '![][image0]\n\n[image0]: ./image.jpg');
396
+ strictEqual(out.html.trim(), '<body><img src="https://www.sample.com/image.jpg"></body>');
397
+ strictEqual(out.md.trim(), '![][image0]\n\n[image0]: https://www.sample.com/image.jpg');
427
398
  });
428
399
 
429
400
  it('html2md removes original hrs but keeps md section breaks', async () => {
@@ -237,4 +237,8 @@ describe('PageImporter tests - fixtures', () => {
237
237
  it('import - sub and sup', async () => {
238
238
  await featureTest('subsup');
239
239
  });
240
+
241
+ it('import - video', async () => {
242
+ await featureTest('video');
243
+ });
240
244
  });
@@ -0,0 +1,7 @@
1
+ <body>
2
+ <h1>Hello World</h1>
3
+ <img src="https://wwww.sample.com/image1.png">
4
+ <img src="https://wwww.sample.com/path/image2.png">
5
+ <img src="https://wwww.sample.com/image3.png">
6
+ <img src="https://wwww.anotherhost.com/image4.png">
7
+ </body>
@@ -0,0 +1,10 @@
1
+ <html>
2
+ <body>
3
+ <h1>Hello World</h1>
4
+ <img src="/image1.png">
5
+ <img src="./image2.png">
6
+ <img src="https://wwww.sample.com/image3.png">
7
+ <img src="https://wwww.anotherhost.com/image4.png">
8
+ <img src="/\/: #brokenlink#?!$$">
9
+ </body>
10
+ </html>
@@ -0,0 +1,13 @@
1
+ <body>
2
+ <h1>Hello World</h1>
3
+ <div style="">
4
+ <img src="https://wwww.sample.com/image1.png">
5
+ <span>some content here</span>
6
+ </div>
7
+ <div style="">
8
+ <img src="https://wwww.sample.com/path/image2.png">
9
+ </div>
10
+ <div style="width: 100%;">
11
+ <img src="https://www.anotherhost.com/image3.png">
12
+ </div>
13
+ </body>
@@ -0,0 +1,10 @@
1
+ <html>
2
+ <body>
3
+ <h1>Hello World</h1>
4
+ <div style="background-image: url('/image1.png')">
5
+ <span>some content here</span>
6
+ </div>
7
+ <div style='background-image: url("./image2.png")'></div>
8
+ <div style="width: 100%; background-image: url('https://www.anotherhost.com/image3.png')"></div>
9
+ </body>
10
+ </html>
@@ -0,0 +1,5 @@
1
+ <body>
2
+ <main>
3
+ <h1>Hello World</h1>
4
+ </main>
5
+ </body>
@@ -0,0 +1,11 @@
1
+ <html>
2
+ <body>
3
+ <header>Top header</header>
4
+ <nav>Nav might be here</nav>
5
+ <main>
6
+ <noscript>Some no script here</noscript>
7
+ <h1>Hello World</h1><iframe src="iframe.html"></iframe>
8
+ </main>
9
+ <footer>Bottom footer</footer>
10
+ </body>
11
+ </html>
@@ -0,0 +1,4 @@
1
+ <body>
2
+ <h1>Hello World</h1>
3
+ <p>Some text with a <span>span</span>, a link <a href="./anotherpage.html">anotherpage</a> and a <sub>sub</sub>.</p>
4
+ </body>
@@ -0,0 +1,6 @@
1
+ <html>
2
+ <body>
3
+ <h1>Hello World</h1>
4
+ <p>Some text with a <span>span</span>, a link <a href="./anotherpage.html">anotherpage</a> and a <sub>sub</sub>.</p>
5
+ </body>
6
+ </html>
@@ -0,0 +1,4 @@
1
+ <body>
2
+ <h1>Hello World</h1><span>:icon1:</span>
3
+ <p>This is text with an icon <span>:icon2:</span></p>
4
+ </body>
@@ -0,0 +1,6 @@
1
+ <html>
2
+ <body>
3
+ <h1>Hello World</h1><img src="/icon1.svg">
4
+ <p>This is text with an icon <img src="./icon2.svg"></p>
5
+ </body>
6
+ </html>
@@ -0,0 +1,40 @@
1
+ <body>
2
+ <h1>Hello World</h1>
3
+ <table>
4
+ <tr>
5
+ <th colspan="2">Metadata</th>
6
+ </tr>
7
+ <tr>
8
+ <td>Title</td>
9
+ <td>Page title - tite element</td>
10
+ </tr>
11
+ <tr>
12
+ <td>Description</td>
13
+ <td>Page description - description meta</td>
14
+ </tr>
15
+ <tr>
16
+ <td>Image</td>
17
+ <td><img src="https://wwww.sample.com/path/img-og.png" alt="This is the image alt text - og:image:alt meta"></td>
18
+ </tr>
19
+ <tr>
20
+ <td>og:title</td>
21
+ <td>Page title - og:title meta</td>
22
+ </tr>
23
+ <tr>
24
+ <td>og:description</td>
25
+ <td>Page description - og:description meta</td>
26
+ </tr>
27
+ <tr>
28
+ <td>twitter:title</td>
29
+ <td>Page title - twitter:title meta</td>
30
+ </tr>
31
+ <tr>
32
+ <td>twitter:description</td>
33
+ <td>Page description - twitter:description meta</td>
34
+ </tr>
35
+ <tr>
36
+ <td>twitter:image</td>
37
+ <td><img src="https://wwww.sample.com/path/img-twitter.png" alt="This is the image alt text - twitter:image:alt meta"></td>
38
+ </tr>
39
+ </table>
40
+ </body>
@@ -0,0 +1,17 @@
1
+ <html>
2
+ <head>
3
+ <title>Page title - tite element</title>
4
+ <meta name="description" content="Page description - description meta">
5
+ <meta property="og:title" content="Page title - og:title meta">
6
+ <meta property="og:description" content="Page description - og:description meta">
7
+ <meta property="og:image" content="./img-og.png">
8
+ <meta property="og:image:alt" content="This is the image alt text - og:image:alt meta">
9
+ <meta property="twitter:title" content="Page title - twitter:title meta">
10
+ <meta property="twitter:description" content="Page description - twitter:description meta">
11
+ <meta property="twitter:image" content="./img-twitter.png">
12
+ <meta property="twitter:image:alt" content="This is the image alt text - twitter:image:alt meta">
13
+ </head>
14
+ <body>
15
+ <h1>Hello World</h1>
16
+ </body>
17
+ </html>
@@ -0,0 +1,20 @@
1
+ <body>
2
+ <h1>Hello World</h1>
3
+ <table>
4
+ <tr>
5
+ <th colspan="2">Metadata</th>
6
+ </tr>
7
+ <tr>
8
+ <td>Title</td>
9
+ <td>Page title</td>
10
+ </tr>
11
+ <tr>
12
+ <td>Description</td>
13
+ <td>Page description</td>
14
+ </tr>
15
+ <tr>
16
+ <td>Image</td>
17
+ <td><img src="https://wwww.sample.com/path/img.png" alt="This is the image alt text"></td>
18
+ </tr>
19
+ </table>
20
+ </body>
@@ -0,0 +1,17 @@
1
+ <html>
2
+ <head>
3
+ <title>Page title</title>
4
+ <meta name="description" content="Page description">
5
+ <meta property="og:title" content="Page title">
6
+ <meta property="og:description" content="Page description">
7
+ <meta property="og:image" content="./img.png">
8
+ <meta property="og:image:alt" content="This is the image alt text">
9
+ <meta property="twitter:title" content="Page title">
10
+ <meta property="twitter:description" content="Page description">
11
+ <meta property="twitter:image" content="./img.png">
12
+ <meta property="twitter:image:alt" content="This is the image alt text">
13
+ </head>
14
+ <body>
15
+ <h1>Hello World</h1>
16
+ </body>
17
+ </html>
@@ -0,0 +1,16 @@
1
+ <body>
2
+ <h1>Hello World</h1>
3
+ <table>
4
+ <tr>
5
+ <th colspan="2">Metadata</th>
6
+ </tr>
7
+ <tr>
8
+ <td>Title</td>
9
+ <td>Page title</td>
10
+ </tr>
11
+ <tr>
12
+ <td>Description</td>
13
+ <td>Page description</td>
14
+ </tr>
15
+ </table>
16
+ </body>
@@ -0,0 +1,9 @@
1
+ <html>
2
+ <head>
3
+ <title>Page title</title>
4
+ <meta name="description" content="Page description">
5
+ </head>
6
+ <body>
7
+ <h1>Hello World</h1>
8
+ </body>
9
+ </html>
@@ -0,0 +1,12 @@
1
+ <body>
2
+ <h1>Hello World</h1>
3
+ <table>
4
+ <tr>
5
+ <th colspan="2">Metadata</th>
6
+ </tr>
7
+ <tr>
8
+ <td>Image</td>
9
+ <td><img src="https://wwww.sample.com/img.png" alt="This is the image alt text"></td>
10
+ </tr>
11
+ </table>
12
+ </body>
@@ -0,0 +1,9 @@
1
+ <html>
2
+ <head>
3
+ <meta property="og:image" content="/img.png">
4
+ <meta property="og:image:alt" content="This is the image alt text">
5
+ </head>
6
+ <body>
7
+ <h1>Hello World</h1>
8
+ </body>
9
+ </html>
@@ -0,0 +1,16 @@
1
+ <body>
2
+ <h1>Hello World</h1>
3
+ <table>
4
+ <tr>
5
+ <th colspan="2">Metadata</th>
6
+ </tr>
7
+ <tr>
8
+ <td>Title</td>
9
+ <td>Page title - og:title meta</td>
10
+ </tr>
11
+ <tr>
12
+ <td>Description</td>
13
+ <td>Page description - og:description meta</td>
14
+ </tr>
15
+ </table>
16
+ </body>
@@ -0,0 +1,9 @@
1
+ <html>
2
+ <head>
3
+ <meta property="og:title" content="Page title - og:title meta">
4
+ <meta property="og:description" content="Page description - og:description meta">
5
+ </head>
6
+ <body>
7
+ <h1>Hello World</h1>
8
+ </body>
9
+ </html>
@@ -0,0 +1,16 @@
1
+ <body>
2
+ <h1>Hello World</h1>
3
+ <table>
4
+ <tr>
5
+ <th colspan="2">Metadata</th>
6
+ </tr>
7
+ <tr>
8
+ <td>Title</td>
9
+ <td>Page title - twitter:title meta</td>
10
+ </tr>
11
+ <tr>
12
+ <td>Description</td>
13
+ <td>Page description - twitter:description meta</td>
14
+ </tr>
15
+ </table>
16
+ </body>
@@ -0,0 +1,9 @@
1
+ <html>
2
+ <head>
3
+ <meta property="twitter:title" content="Page title - twitter:title meta">
4
+ <meta property="twitter:description" content="Page description - twitter:description meta">
5
+ </head>
6
+ <body>
7
+ <h1>Hello World</h1>
8
+ </body>
9
+ </html>
@@ -0,0 +1,32 @@
1
+ /*
2
+ * Copyright 2023 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+
13
+ import { strictEqual } from 'assert';
14
+ import { describe, it } from 'mocha';
15
+
16
+ import defaultGenerateDocumentPath from '../../../src/importer/defaults/generateDocumentPath.js';
17
+
18
+ describe('defaultGenerateDocumentPath tests', () => {
19
+ it('default paths', async () => {
20
+ strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com' }), '/index');
21
+ strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/' }), '/index');
22
+ strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/index.html' }), '/index');
23
+ strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/index' }), '/index');
24
+ strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/page' }), '/page');
25
+ strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/page.html' }), '/page');
26
+ strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/folder/page' }), '/folder/page');
27
+ strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/folder/page.html' }), '/folder/page');
28
+ strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/folder/page/' }), '/folder/page/index');
29
+ strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/folder/page with spaces.html' }), '/folder/page-with-spaces');
30
+ strictEqual(await defaultGenerateDocumentPath({ url: 'https://wwww.sample.com/folder/PagE_with_3xtr4_charactére.html' }), '/folder/page-with-3xtr4-charact-re');
31
+ });
32
+ });
@@ -0,0 +1,94 @@
1
+ /*
2
+ * Copyright 2023 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+ import path from 'path';
13
+ import fs from 'fs-extra';
14
+ import { dirname } from 'dirname-filename-esm';
15
+ import { strictEqual } from 'assert';
16
+ import { describe, it } from 'mocha';
17
+
18
+ import defaultTransformDOM from '../../../src/importer/defaults/transformDOM.js';
19
+ import TestUtils from '../../TestUtils.js';
20
+
21
+ // eslint-disable-next-line no-underscore-dangle
22
+ const __dirname = dirname(import.meta);
23
+
24
+ const { createDocumentFromString } = TestUtils;
25
+
26
+ describe('defaultTransformDOM tests', () => {
27
+ const runTest = async (feature, config) => {
28
+ const spec = await fs.readFile(path.resolve(__dirname, 'fixtures', `${feature}.input.html`), 'utf-8');
29
+ const document = createDocumentFromString(spec);
30
+ const out = await defaultTransformDOM({ document, ...config });
31
+ const expected = await fs.readFile(path.resolve(__dirname, 'fixtures', `${feature}.expected.html`), 'utf-8');
32
+ strictEqual(out.outerHTML.replace(/(?:\r\n|\r|\n|\s\s)/g, ''), expected.replace(/(?:\r\n|\r|\n|\s\s)/g, ''));
33
+ };
34
+
35
+ it('default transformation', async () => {
36
+ await runTest('default');
37
+ });
38
+
39
+ it('default transformation handles basic metadata', async () => {
40
+ await runTest('metadata.basic');
41
+ });
42
+
43
+ it('default transformation handles img and alt metadata', async () => {
44
+ await runTest('metadata.image', {
45
+ url: 'https://wwww.sample.com/path/page.html',
46
+ });
47
+ });
48
+
49
+ it('default transformation handles identical metadata', async () => {
50
+ await runTest('metadata.all.same', {
51
+ url: 'https://wwww.sample.com/path/page.html',
52
+ });
53
+ });
54
+
55
+ it('default transformation handles different metadata', async () => {
56
+ await runTest('metadata.all.diff', {
57
+ url: 'https://wwww.sample.com/path/page.html',
58
+ });
59
+ });
60
+
61
+ it('default transformation handles falls back to og metadata', async () => {
62
+ await runTest('metadata.og', {
63
+ url: 'https://wwww.sample.com/path/page.html',
64
+ });
65
+ });
66
+
67
+ it('default transformation handles falls back to twitter metadata', async () => {
68
+ await runTest('metadata.twitter', {
69
+ url: 'https://wwww.sample.com/path/page.html',
70
+ });
71
+ });
72
+
73
+ it('default transformation removes non content elements', async () => {
74
+ await runTest('cleanup');
75
+ });
76
+
77
+ it('default transformation adjusts image urls', async () => {
78
+ await runTest('adjust-image-urls', {
79
+ url: 'https://wwww.sample.com/path/page.html',
80
+ });
81
+ });
82
+
83
+ it('default transformation converts icons', async () => {
84
+ await runTest('icons', {
85
+ url: 'https://wwww.sample.com/path/page.html',
86
+ });
87
+ });
88
+
89
+ it('default transformation converts background-image styles into image element', async () => {
90
+ await runTest('background-image', {
91
+ url: 'https://wwww.sample.com/path/page.html',
92
+ });
93
+ });
94
+ });
@@ -0,0 +1,11 @@
1
+ <html>
2
+ <body>
3
+ <h1>videos</h1>
4
+ <p><a href="./video.mp4">video link</a></p>
5
+ <video src="./video.mp4" poster="./poster.png">
6
+ <source src="./video.mp4" type="video/mp4">
7
+ <source src="./video.webm" type="video/webm">
8
+ <source src="./video.ogg" type="video/ogg">
9
+ </video>
10
+ </body>
11
+ </html>
@@ -0,0 +1,7 @@
1
+ # videos
2
+
3
+ [video link](https://www.sample.com/video.mp4)
4
+
5
+ [![][image0]](https://www.sample.com/video.mp4)
6
+
7
+ [image0]: ./poster.png
@@ -408,15 +408,15 @@ describe('DOMUtils#getImgFromBackground', () => {
408
408
 
409
409
  it('no background-image style', () => {
410
410
  test(createElement('p', {}, {}, 'Some content'), null);
411
-
412
411
  test(createElement('img', { src: 'https://www.server.com/image.jpg', title: 'Some title' }, {}, ''), null);
413
-
414
412
  test(createElement('p', {}, { 'background-image': 'none' }, 'Some content'), null);
415
413
  });
416
414
 
417
415
  it('with background-image style', () => {
418
416
  test(createElement('p', {}, { 'background-image': 'url(https://www.server.com/image.jpg)' }, 'Some content'), '<img src="https://www.server.com/image.jpg">');
419
- test(createElement('div', { class: 'someclass' }, { 'background-image': 'url("https://www.server.com/image.jpg")', background: 'rgb(0, 0, 0) none repeat scroll 0% 0% / auto padding-box border-box' }, '<div><div>Some divs</div><div>More divs</div></div>'), '<img src="https://www.server.com/image.jpg">');
417
+ test(createElement('p', {}, { 'background-image': 'url("https://www.server.com/image.jpg")' }, 'Some content'), '<img src="https://www.server.com/image.jpg">');
418
+ test(createElement('p', {}, { 'background-image': 'url(\'https://www.server.com/image.jpg\')' }, 'Some content'), '<img src="https://www.server.com/image.jpg">');
419
+ test(createElement('p', {}, { 'background-image': 'url(http://localhost:3001/image.jpg)' }, 'Some content'), '<img src="http://localhost:3001/image.jpg">');
420
420
  });
421
421
 
422
422
  // `createElement` uses JSDOM to create the test-DOM
@@ -426,6 +426,5 @@ describe('DOMUtils#getImgFromBackground', () => {
426
426
  // disabling the test, keeping it as a reference
427
427
  xit('with background-image style containing whitespace in url()', () => {
428
428
  test(createElement('p', {}, { 'background-image': 'url( /image.jpg )' }, 'Some content'), '<img src="/image.jpg">');
429
- test(createElement('div', { class: 'someclass' }, { 'background-image': 'url( https://www.server.com/image.jpg )', background: 'rgb(0, 0, 0) none repeat scroll 0% 0% / auto padding-box border-box' }, '<div><div>Some divs</div><div>More divs</div></div>'), '<img src="https://www.server.com/image.jpg">');
430
429
  });
431
430
  });