@adobe/helix-importer 2.9.41 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,3 +1,15 @@
1
+ # [3.0.0](https://github.com/adobe/helix-importer/compare/v2.9.41...v3.0.0) (2023-11-13)
2
+
3
+
4
+ ### Features
5
+
6
+ * get rid of JSDOM ([#260](https://github.com/adobe/helix-importer/issues/260)) ([2530363](https://github.com/adobe/helix-importer/commit/2530363c328958b2363f0f178b9b1ef7d7bb73b8))
7
+
8
+
9
+ ### BREAKING CHANGES
10
+
11
+ * removing JSDOM
12
+
1
13
  ## [2.9.41](https://github.com/adobe/helix-importer/compare/v2.9.40...v2.9.41) (2023-11-05)
2
14
 
3
15
 
package/README.md CHANGED
@@ -2,43 +2,7 @@
2
2
 
3
3
  Foundation tools for importing website content into a format that can be consumed in a Helix project.
4
4
 
5
- helix-importer is composed of 2 main building blocks:
6
-
7
- 1. explorer: crawl a website to construct a list of urls to be importer
8
- 2. importer: construct an importer - for an input url, transform the DOM and convert it into a Markdown file
9
-
10
- The folder [./src/wp](./src/wp) contains WordPress specific utils and explorer methods.
11
-
12
- ## Explorer
13
-
14
- Idea of an explorer is to crawl the site in order to collect a list of urls. This list of urls can then be imported.
15
-
16
- Here is a basic sample:
17
-
18
- ```js
19
-
20
- import { WPContentPager, FSHandler, CSV } from '@adobe/helix-importer';
21
-
22
- async function main() {
23
- const pager = new WPContentPager({
24
- nbMaxPages: 1000,
25
- url: 'url to a WordPress site'
26
- });
27
-
28
- const entries = await pager.explore();
29
-
30
- const csv = CSV.toCSV(entries);
31
-
32
- const handler = new FSHandler('output', console);
33
- await handler.put('explorer_results.csv', csv);
34
- }
35
- ```
36
-
37
- In this example, the [WPContentPager](./src/wp/explorers/WPContentPager.ts) extends the [PagingExplorer](src/explorer/PagingExplorer.ts) which implements the 2 methods:
38
- - `fetch` which defines how to fetch one page on results
39
- - `explore` which extracts the list of urls present on that page
40
-
41
- The final result is a list of urls that could be found on list of paged results given by the WordPress API `/page/${page_number}`.
5
+ Basic concept of the importer: for an input url, transform the DOM and convert it into a Markdown / docx file.
42
6
 
43
7
  ## Importer
44
8
 
@@ -50,10 +14,13 @@ Goal of the importer is to get rid of the generic DOM elements like the header /
50
14
 
51
15
  [HTML2x](src/importer/HTML2x.js) methods (`HTML2md` and `HTML2docx`) are convenience methods to run an import. As input, they take:
52
16
  - `URL`: URL of the page to import
53
- - `document`: the DOM element to import
17
+ - `document`: the DOM element to import - a Document object or a string (see `createDocumentFromString` for the string case)
54
18
  - `transformerCfg`: object with the transformation "rules". Object can be either:
55
19
  - `{ transformDOM: ({ url, document, html, params }) => { ... return element-to-convert }, generateDocumentPath: ({ url, document, html, params }) => { ... return path-to-target; }}` for a single mapping between one input document / one output file
56
20
  - `{ transform: ({ url, document, html, params }) => { ... return [{ element: first-element-to-convert, path: first-path-to-target }, ...] }` for a mapping one input document / multiple output files (useful to generate multiple docx from a single web page)
21
+ - `config`: object with several config properties
22
+ - `createDocumentFromString`: this config is required if you use the methods in a non-browser context and want to pass the `document` param as a string. This method receives the HTML to parse as a string and must return a Document object.
23
+ - `setBackgroundImagesFromCSS`: set to false to disable the `background-image` inlining in the DOM.
57
24
 
58
25
  ### Importer UI
59
26
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@adobe/helix-importer",
3
- "version": "2.9.41",
3
+ "version": "3.0.0",
4
4
  "description": "Helix Importer tool: create md / docx from html",
5
5
  "main": "src/index.js",
6
6
  "type": "module",
@@ -10,8 +10,10 @@
10
10
  },
11
11
  "scripts": {
12
12
  "lint": "eslint .",
13
- "test": "c8 mocha",
14
- "test-ci": "c8 mocha",
13
+ "test:web": "web-test-runner test/browser/*.test.js --node-resolve",
14
+ "test:web:watch": "web-test-runner test/browser/*.test.js --node-resolve --watch",
15
+ "test": "c8 mocha && npm run test:web",
16
+ "test-ci": "c8 mocha && npm run test:web",
15
17
  "semantic-release": "semantic-release",
16
18
  "prepare": "npx husky install"
17
19
  },
@@ -27,15 +29,21 @@
27
29
  "devDependencies": {
28
30
  "@adobe/eslint-config-helix": "2.0.4",
29
31
  "@adobe/helix-docx2md": "1.5.0",
30
- "@adobe/helix-mediahandler": "2.3.1",
32
+ "@adobe/helix-mediahandler": "2.3.3",
33
+ "@esm-bundle/chai": "4.3.4-fix.0",
31
34
  "@semantic-release/changelog": "6.0.3",
32
35
  "@semantic-release/exec": "6.0.3",
33
36
  "@semantic-release/git": "10.0.1",
37
+ "@web/test-runner": "0.15.1",
38
+ "@web/test-runner-commands": "0.6.5",
39
+ "@web/test-runner-mocha": "0.8.1",
34
40
  "c8": "8.0.1",
41
+ "chai": "4.3.7",
35
42
  "dirname-filename-esm": "1.1.1",
36
43
  "eslint": "8.53.0",
37
44
  "husky": "8.0.3",
38
- "lint-staged": "15.0.2",
45
+ "jsdom": "22.1.0",
46
+ "lint-staged": "15.1.0",
39
47
  "mocha": "10.2.0",
40
48
  "mocha-multi-reporters": "1.5.1",
41
49
  "mock-fs": "5.2.0",
@@ -52,7 +60,6 @@
52
60
  "form-data": "4.0.0",
53
61
  "fs-extra": "11.1.1",
54
62
  "hast-util-to-mdast": "10.1.0",
55
- "jsdom": "22.1.0",
56
63
  "node-fetch": "3.3.2",
57
64
  "rehype-parse": "9.0.0",
58
65
  "rehype-remark": "10.0.0",
@@ -12,12 +12,11 @@
12
12
  /* eslint-disable class-methods-use-this, no-console */
13
13
 
14
14
  import path from 'path';
15
- import { Response } from 'node-fetch';
16
- import { JSDOM } from 'jsdom';
17
15
  import PageImporter from './PageImporter.js';
18
16
  import PageImporterResource from './PageImporterResource.js';
19
17
  import MemoryHandler from '../storage/MemoryHandler.js';
20
18
  import Utils from '../utils/Utils.js';
19
+ import BrowserUtils from '../utils/BrowserUtils.js';
21
20
 
22
21
  // import docxStylesXML from '../resources/styles.xml';
23
22
 
@@ -93,8 +92,8 @@ async function html2x(
93
92
 
94
93
  const html = doc.documentElement.outerHTML;
95
94
  class InternalImporter extends PageImporter {
96
- async fetch() {
97
- return new Response(html);
95
+ async get() {
96
+ return { document: doc, html };
98
97
  }
99
98
 
100
99
  async process(document) {
@@ -181,6 +180,7 @@ async function html2x(
181
180
  stylesXML: config.docxStylesXML,
182
181
  image2png: config.image2png,
183
182
  },
183
+ createDocumentFromString: config.createDocumentFromString,
184
184
  });
185
185
 
186
186
  const pirs = await importer.import(url);
@@ -224,10 +224,18 @@ async function html2x(
224
224
  }
225
225
  }
226
226
 
227
+ const parseStringDocument = (html, config) => {
228
+ if (config?.createDocumentFromString) {
229
+ return config.createDocumentFromString(html);
230
+ } else {
231
+ return BrowserUtils.createDocumentFromString(html);
232
+ }
233
+ };
234
+
227
235
  /**
228
236
  * Returns the result of the conversion from html to md.
229
237
  * @param {string} url URL of the document to convert
230
- * @param {HTMLElement|string} document Document to convert
238
+ * @param {Document|string} document Document to convert, or an HTML string that is parsed into a Document first
231
239
  * @param {Object} transformCfg Conversion configuration
232
240
  * @param {Object} config Conversion configuration.
233
241
  * @param {Object} params Conversion params. Object will be passed to the transformer functions.
@@ -235,8 +243,8 @@ async function html2x(
235
243
  */
236
244
  async function html2md(url, document, transformCfg, config, params = {}) {
237
245
  let doc = document;
238
- if (typeof document === 'string') {
239
- doc = new JSDOM(document, { runScripts: undefined }).window.document;
246
+ if (typeof doc === 'string') {
247
+ doc = parseStringDocument(document, config);
240
248
  }
241
249
  return html2x(url, doc, transformCfg, { ...config, toMd: true, toDocx: false }, params);
242
250
  }
@@ -252,8 +260,8 @@ async function html2md(url, document, transformCfg, config, params = {}) {
252
260
  */
253
261
  async function html2docx(url, document, transformCfg, config, params = {}) {
254
262
  let doc = document;
255
- if (typeof document === 'string') {
256
- doc = new JSDOM(document, { runScripts: undefined }).window.document;
263
+ if (typeof doc === 'string') {
264
+ doc = parseStringDocument(document, config);
257
265
  }
258
266
  return html2x(url, doc, transformCfg, { ...config, toMd: true, toDocx: true }, params);
259
267
  }
@@ -12,8 +12,6 @@
12
12
 
13
13
  /* eslint-disable class-methods-use-this */
14
14
 
15
- import { JSDOM } from 'jsdom';
16
-
17
15
  import path from 'path';
18
16
  import { unified } from 'unified';
19
17
  import parse from 'rehype-parse';
@@ -36,6 +34,7 @@ import DOMUtils from '../utils/DOMUtils.js';
36
34
  import FileUtils from '../utils/FileUtils.js';
37
35
  import MDUtils from '../utils/MDUtils.js';
38
36
  import formatPlugin from './mdast-to-md-format-plugin.js';
37
+ import BrowserUtils from '../utils/BrowserUtils.js';
39
38
 
40
39
  function formatNode(type, state, node) {
41
40
  const result = {
@@ -55,6 +54,12 @@ export default class PageImporter {
55
54
 
56
55
  constructor(params) {
57
56
  this.params = params;
57
+
58
+ if (!this.params.createDocumentFromString) {
59
+ // default the string parsing using the browser DOMParser
60
+ this.params.createDocumentFromString = BrowserUtils.createDocumentFromString;
61
+ }
62
+
58
63
  this.logger = params.logger || console;
59
64
 
60
65
  this.useCache = !!params.cache;
@@ -297,8 +302,9 @@ export default class PageImporter {
297
302
  const html = await this.download(url);
298
303
 
299
304
  if (html) {
300
- const { document } = new JSDOM(DOMUtils.removeNoscripts(html.toString())).window;
301
- this.preProcess(document);
305
+ const cleanedHTML = DOMUtils.removeNoscripts(html.toString());
306
+
307
+ const document = this.params.createDocumentFromString(cleanedHTML);
302
308
  return {
303
309
  document,
304
310
  html,
@@ -315,6 +321,8 @@ export default class PageImporter {
315
321
 
316
322
  const results = [];
317
323
  if (document) {
324
+ this.preProcess(document);
325
+
318
326
  const entries = await this.process(document, url, entryParams, html);
319
327
 
320
328
  this.postProcess(document);
@@ -22,4 +22,6 @@ export default class PageImporterParams {
22
22
  logger;
23
23
 
24
24
  mdast2docxOptions;
25
+
26
+ createDocumentFromString;
25
27
  }
package/src/index.js CHANGED
@@ -9,9 +9,6 @@
9
9
  * OF ANY KIND, either express or implied. See the License for the specific language
10
10
  * governing permissions and limitations under the License.
11
11
  */
12
- import PagingExplorer from './explorer/PagingExplorer.js';
13
- import PagingExplorerParams from './explorer/PagingExplorerParams.js';
14
-
15
12
  import PageImporter from './importer/PageImporter.js';
16
13
  import PageImporterParams from './importer/PageImporterParams.js';
17
14
  import PageImporterResource from './importer/PageImporterResource.js';
@@ -27,15 +24,10 @@ import Loader from './utils/Loader.js';
27
24
  import Utils from './utils/Utils.js';
28
25
 
29
26
  import WPUtils from './wp/WPUtils.js';
30
- import WPAdminAjaxPager from './wp/explorers/WPAdminAjaxPager.js';
31
- import WPContentPager from './wp/explorers/WPContentPager.js';
32
- import WPPostWrapPager from './wp/explorers/WPPostWrapPager.js';
33
27
 
34
28
  import { html2md, html2docx } from './importer/HTML2x.js';
35
29
 
36
30
  export {
37
- PagingExplorer,
38
- PagingExplorerParams,
39
31
  PageImporter,
40
32
  PageImporterParams,
41
33
  PageImporterResource,
@@ -48,9 +40,6 @@ export {
48
40
  Loader,
49
41
  Utils,
50
42
  WPUtils,
51
- WPAdminAjaxPager,
52
- WPContentPager,
53
- WPPostWrapPager,
54
43
  html2md,
55
44
  html2docx,
56
45
  };
@@ -0,0 +1,29 @@
1
+ /*
2
+ * Copyright 2023 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+
13
+ export default class BrowserUtils {
14
+ /**
15
+ * Creates a document from an HTML string. This function uses DOMParser
16
+ * which should be available in execution context, i.e. a browser.
17
+ * @param {String} html The html to parse
18
+ * @returns Document The parsed document
19
+ */
20
+ static createDocumentFromString(html) {
21
+ try {
22
+ // eslint-disable-next-line no-undef
23
+ const parser = new DOMParser();
24
+ return parser.parseFromString(html, 'text/html');
25
+ } catch (e) {
26
+ throw new Error('Unable to parse HTML using default createDocumentFromString function and global DOMParser. Please provide a custom createDocumentFromString.');
27
+ }
28
+ }
29
+ }
@@ -10,11 +10,15 @@
10
10
  * governing permissions and limitations under the License.
11
11
  */
12
12
 
13
- import { JSDOM } from 'jsdom';
14
-
15
13
  export default class DOMUtils {
16
14
  static EMPTY_TAGS_TO_PRESERVE = ['img', 'video', 'iframe', 'div', 'picture'];
17
15
 
16
+ static fragment(document, string) {
17
+ const tpl = document.createElement('template');
18
+ tpl.innerHTML = string;
19
+ return tpl.content;
20
+ }
21
+
18
22
  static reviewInlineElement(document, tagName) {
19
23
  let tags = [...document.querySelectorAll(tagName)];
20
24
  // first pass, remove empty nodes
@@ -48,7 +52,7 @@ export default class DOMUtils {
48
52
  for (let i = tags.length - 1; i >= 0; i -= 1) {
49
53
  const tag = tags[i];
50
54
  if (tag.innerHTML === '.' || tag.innerHTML === '. ' || tag.innerHTML === ':' || tag.innerHTML === ': ') {
51
- tag.replaceWith(JSDOM.fragment(tag.innerHTML));
55
+ tag.replaceWith(DOMUtils.fragment(document, tag.innerHTML));
52
56
  } else {
53
57
  const { innerHTML } = tag;
54
58
  if (tag.previousSibling) {
@@ -82,13 +86,13 @@ export default class DOMUtils {
82
86
  // move trailing space to a new text node outside of current element
83
87
  tag.innerHTML = innerHTML.slice(0, innerHTML.length - 1);
84
88
  ({ innerHTML } = tag);
85
- tag.after(JSDOM.fragment('<span> </span>'));
89
+ tag.after(DOMUtils.fragment(document, '<span> </span>'));
86
90
  }
87
91
 
88
92
  if (innerHTML.indexOf(' ') === 0) {
89
93
  // move leading space to a new text node outside of current element
90
94
  tag.innerHTML = innerHTML.slice(1);
91
- tag.before(JSDOM.fragment('<span> </span>'));
95
+ tag.before(DOMUtils.fragment(document, '<span> </span>'));
92
96
  }
93
97
  }
94
98
  }
@@ -146,7 +150,7 @@ export default class DOMUtils {
146
150
  if (span.textContent === '') {
147
151
  span.remove();
148
152
  } else {
149
- span.replaceWith(JSDOM.fragment(span.innerHTML));
153
+ span.replaceWith(DOMUtils.fragment(document, span.innerHTML));
150
154
  }
151
155
  }
152
156
  });
@@ -156,7 +160,7 @@ export default class DOMUtils {
156
160
  selectors.forEach((selector) => {
157
161
  document.querySelectorAll(selector).forEach((elem) => {
158
162
  const captionText = elem.textContent.trim();
159
- elem.parentNode.insertBefore(JSDOM.fragment(`<p><em>${captionText}</em></p>`), elem);
163
+ elem.parentNode.insertBefore(DOMUtils.fragment(document, `<p><em>${captionText}</em></p>`), elem);
160
164
  elem.remove();
161
165
  });
162
166
  });
@@ -198,8 +202,8 @@ export default class DOMUtils {
198
202
  return table;
199
203
  }
200
204
 
201
- static generateEmbed(url) {
202
- return JSDOM.fragment(`<table><tr><th>Embed</th></tr><tr><td><a href="${url}">${url}</a></td></tr></table>`);
205
+ static generateEmbed(document, url) {
206
+ return DOMUtils.fragment(document, `<table><tr><th>Embed</th></tr><tr><td><a href="${url}">${url}</a></td></tr></table>`);
203
207
  }
204
208
 
205
209
  static replaceEmbeds(document) {
@@ -208,7 +212,7 @@ export default class DOMUtils {
208
212
  const dataSrc = iframe.getAttribute('data-src');
209
213
  const url = dataSrc || src;
210
214
  if (url) {
211
- iframe.after(DOMUtils.generateEmbed(url));
215
+ iframe.after(DOMUtils.generateEmbed(document, url));
212
216
  }
213
217
  iframe.remove();
214
218
  });
@@ -218,7 +222,7 @@ export default class DOMUtils {
218
222
  if (video.autoplay) {
219
223
  blockType = 'Animation';
220
224
  }
221
- const anim = JSDOM.fragment(`<table><tr><th>${blockType}</th></tr><tr><td>${video.outerHTML}</td></tr></table>`);
225
+ const anim = DOMUtils.fragment(document, `<table><tr><th>${blockType}</th></tr><tr><td>${video.outerHTML}</td></tr></table>`);
222
226
  video.replaceWith(anim);
223
227
  });
224
228
  }
package/src/wp/WPUtils.js CHANGED
@@ -9,8 +9,6 @@
9
9
  * OF ANY KIND, either express or implied. See the License for the specific language
10
10
  * governing permissions and limitations under the License.
11
11
  */
12
- import { JSDOM } from 'jsdom';
13
-
14
12
  import DOMUtils from '../utils/DOMUtils.js';
15
13
 
16
14
  export default class WPUtils {
@@ -27,7 +25,7 @@ export default class WPUtils {
27
25
  ? item.parentNode.nextElementSibling
28
26
  : item.nextElementSibling;
29
27
  const captionText = elem.textContent.trim();
30
- elem.parentNode.insertBefore(JSDOM.fragment(`<p><em>${captionText}</em><p>`), elem);
28
+ elem.parentNode.insertBefore(DOMUtils.fragment(document, `<p><em>${captionText}</em><p>`), elem);
31
29
  elem.remove();
32
30
  }
33
31
  });
@@ -0,0 +1,42 @@
1
+ /*
2
+ * Copyright 2023 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+ /* eslint-env mocha */
13
+ /* global Document, HTMLElement */
14
+
15
+ import { expect } from '@esm-bundle/chai';
16
+ import BrowserUtils from '../../src/utils/BrowserUtils.js';
17
+
18
+ describe('BrowserUtils#createDocumentFromString', () => {
19
+ it('createDocumentFromString can parse a simple string', () => {
20
+ const document = BrowserUtils.createDocumentFromString('<html><head><title>Test</title></head><body><h1>Hello World</h1></body></html>');
21
+ expect(document).to.be.an.instanceof(Document);
22
+ expect(document.documentElement).to.be.an.instanceof(HTMLElement);
23
+
24
+ const title = document.querySelector('title');
25
+ expect(title).to.be.an.instanceof(HTMLElement);
26
+ expect(title.textContent).to.equal('Test');
27
+
28
+ const h1 = document.querySelector('h1');
29
+ expect(h1).to.be.an.instanceof(HTMLElement);
30
+ expect(h1.textContent).to.equal('Hello World');
31
+ });
32
+
33
+ it('createDocumentFromString can parse a non document string', () => {
34
+ const document = BrowserUtils.createDocumentFromString('<h1>Hello World</h1>');
35
+ expect(document).to.be.an.instanceof(Document);
36
+ expect(document.documentElement).to.be.an.instanceof(HTMLElement);
37
+
38
+ const h1 = document.querySelector('h1');
39
+ expect(h1).to.be.an.instanceof(HTMLElement);
40
+ expect(h1.textContent).to.equal('Hello World');
41
+ });
42
+ });