npm - @adobe/helix-importer - Versions diffs - 1.13.1 → 1.14.0 - Mend

@adobe/helix-importer 1.13.1 → 1.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/CHANGELOG.md +7 -0
package/README.md +15 -4
package/package.json +1 -1
package/src/importer/HTML2x.js +128 -51
package/src/importer/PageImporter.js +3 -1
package/test/importers/HTML2x.spec.js +111 -1
package/test/importers/PageImporter.spec.js +6 -1
package/test/importers/fixtures/space.spec.html +13 -0
package/test/importers/fixtures/space.spec.md +19 -0

package/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,10 @@
+# [1.14.0](https://github.com/adobe/helix-importer/compare/v1.13.1...v1.14.0) (2022-07-06)
+### Features
+* import one document -> multiple output ([f0b688e](https://github.com/adobe/helix-importer/commit/f0b688ebc9dbb68a9981c45c99bad9fa106d4376))
 ## [1.13.1](https://github.com/adobe/helix-importer/compare/v1.13.0...v1.13.1) (2022-05-23)

package/README.md CHANGED Viewed

@@ -15,7 +15,7 @@ Idea of an explorer is to crawl the site in order to collect a list of urls. Thi
 Here is a basic sample:
-```typescript
+```js
 import { WPContentPager, FSHandler, CSV } from '@adobe/helix-importer';
@@ -42,11 +42,22 @@ The final result is a list of urls that could be found on list of paged results
 ## Importer
-An importer must extends [PageImporter](src/importer/PageImporter.ts) and implement the `fetch` and `process` method. The general idea is that `fetch` receives the url to import and is responsible to return the HTML. `process` receives the corresponding Document in order to filter / rearrange / reshuffle the DOM before it gets processed by the Markdown transformer. `process` computes and defines the list of [PageImporterResource](src/importer/PageImporterResource.ts) (could be more than one), each resource being transformed as a Markdown document.
+An importer must extend [PageImporter](src/importer/PageImporter.js) and implement the `fetch` and `process` methods. The general idea is that `fetch` receives the url to import and is responsible for returning the HTML. `process` receives the corresponding Document in order to filter / rearrange / reshuffle the DOM before it gets processed by the Markdown transformer. `process` computes and defines the list of [PageImporterResource](src/importer/PageImporterResource.ts) (could be more than one), each resource being transformed as a Markdown document.
 Goal of the importer is to get rid of the generic DOM elements like the header / footer, the nav... and all elements that are common to all pages in order to get the unique piece(s) of content per page.
-You can find a large collection of importer examples in repo: https://github.com/kptdobe/helix-importer-projects
+### HTML2x helpers
+[HTML2x](src/importer/HTML2x.js) methods (`html2md` and `html2docx`) are convenience methods to run an import. As input, they take:
+- `URL`: URL of the page to import
+- `document`: the DOM element to import
+- `transformerCfg`: object with the transformation "rules". Object can be either:
+  - `{ transformDOM: ({ url, document, html, params }) => { ... return element-to-convert  }, generateDocumentPath: ({ url, document, html, params }) => { ... return path-to-target; }}` for a single mapping between one input document / one output file
+  - `{ transform: ({ url, document, html, params }) => { ... return [{ element: first-element-to-convert, path: first-path-to-target }, ...]  }}` for a mapping between one input document / multiple output files (useful to generate multiple docx from a single web page)
+### Importer UI
+The Helix Importer has a dedicated browser UI: see https://github.com/adobe/helix-importer-ui
 ## Installation
@@ -58,6 +69,6 @@ TODO: publish npm module
 ## Usage
-```typescript
+```js
 import { ... } from '@adobe/helix-importer';
 ```

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@adobe/helix-importer",
-  "version": "1.13.1",
+  "version": "1.14.0",
   "description": "Helix Importer tool: create md / docx from html",
   "main": "src/index.js",
   "type": "module",

package/src/importer/HTML2x.js CHANGED Viewed

@@ -17,6 +17,7 @@ import { JSDOM } from 'jsdom';
 import PageImporter from './PageImporter.js';
 import PageImporterResource from './PageImporterResource.js';
 import MemoryHandler from '../storage/MemoryHandler.js';
+import Utils from '../utils/Utils.js';
 // import docxStylesXML from '../resources/styles.xml';
@@ -35,13 +36,17 @@ function preprocessDOM(document) {
   }
 }
-// eslint-disable-next-line no-unused-vars
-async function defaultTransformDOM({ url, document, html }) {
+async function defaultTransformDOM({
+  // eslint-disable-next-line no-unused-vars
+  url, document, html, params,
+}) {
   return document.body;
 }
-// eslint-disable-next-line no-unused-vars
-async function defaultGenerateDocumentPath({ url, document }) {
+async function defaultGenerateDocumentPath({
+  // eslint-disable-next-line no-unused-vars
+  url, document, html, params,
+}) {
   let p = new URL(url).pathname;
   if (p.endsWith('/')) {
     p = `${p}index`;
@@ -52,21 +57,26 @@ async function defaultGenerateDocumentPath({ url, document }) {
     .replace(/[^a-z0-9/]/gm, '-');
 }
-async function html2x(url, doc, transformCfg, toMd, toDocx, options = {}) {
-  let name = 'index';
-  let dirname = '';
-  const transfrom = transformCfg || {};
-  if (!transfrom.transformDOM) {
-    transfrom.transformDOM = defaultTransformDOM;
-  }
+async function html2x(
+  url,
+  doc,
+  transformCfg,
+  config = { toMd: true, toDocx: false },
+  params = {},
+) {
+  const transformer = transformCfg || {};
+  if (!transformer.transform) {
+    if (!transformer.transformDOM) {
+      transformer.transformDOM = defaultTransformDOM;
+    }
-  if (!transfrom.generateDocumentPath) {
-    transfrom.generateDocumentPath = defaultGenerateDocumentPath;
+    if (!transformer.generateDocumentPath) {
+      transformer.generateDocumentPath = defaultGenerateDocumentPath;
+    }
   }
-  if (options.preprocess !== false) {
+  if (config.preprocess !== false) {
     preprocessDOM(doc);
   }
@@ -77,22 +87,59 @@ async function html2x(url, doc, transformCfg, toMd, toDocx, options = {}) {
     }
     async process(document) {
-      let output = await transfrom.transformDOM({ url, document, html });
-      output = output || document.body;
-      let p = await transfrom.generateDocumentPath({ url, document });
-      if (!p) {
-        // provided function returns null -> apply default
-        p = await defaultGenerateDocumentPath({ url, document });
+      if (transformer.transform) {
+        let results = transformer.transform({
+          url,
+          document,
+          html,
+          params,
+        });
+        if (!results) return null;
+        const pirs = [];
+        if (!Array.isArray(results)) {
+          // single element with transform function
+          results = [results];
+        }
+        results.forEach((result) => {
+          const name = path.basename(result.path);
+          const dirname = path.dirname(result.path);
+          const pir = new PageImporterResource(name, dirname, result.element, null, {
+            html: result.element.outerHTML,
+          });
+          pirs.push(pir);
+        });
+        return pirs;
+      } else {
+        let output = await transformer.transformDOM({
+          url,
+          document,
+          html,
+          params,
+        });
+        output = output || document.body;
+        let p = await transformer.generateDocumentPath({
+          url,
+          document,
+          html,
+          params,
+        });
+        if (!p) {
+          // provided function returns null -> apply default
+          p = await defaultGenerateDocumentPath({ url, document });
+        }
+        const name = path.basename(p);
+        const dirname = path.dirname(p);
+        const pir = new PageImporterResource(name, dirname, output, null, {
+          html: output.outerHTML,
+        });
+        return [pir];
       }
-      name = path.basename(p);
-      dirname = path.dirname(p);
-      const pir = new PageImporterResource(name, dirname, output, null, {
-        html: output.outerHTML,
-      });
-      return [pir];
     }
   }
@@ -107,48 +154,78 @@ async function html2x(url, doc, transformCfg, toMd, toDocx, options = {}) {
   const storageHandler = new MemoryHandler(logger);
   const importer = new InternalImporter({
     storageHandler,
-    skipDocxConversion: !toDocx,
-    skipMDFileCreation: !toMd,
+    skipDocxConversion: !config.toDocx,
+    skipMDFileCreation: !config.toMd,
     logger,
     mdast2docxOptions: {
-      stylesXML: options.docxStylesXML,
-      svg2png: options.svg2png,
+      stylesXML: config.docxStylesXML,
+      svg2png: config.svg2png,
     },
   });
   const pirs = await importer.import(url);
-  const res = {
-    html: pirs[0].extra.html,
-  };
+  const getResponseObjectFromPIR = async (pir) => {
+    const res = {
+      html: pir.extra.html,
+    };
-  res.path = path.resolve(dirname, name);
+    res.path = path.resolve(pir.directory, pir.name);
-  if (toMd) {
-    const md = await storageHandler.get(pirs[0].md);
-    res.md = md;
-  }
-  if (toDocx) {
-    const docx = await storageHandler.get(pirs[0].docx);
-    res.docx = docx;
+    if (config.toMd) {
+      const md = await storageHandler.get(pir.md);
+      res.md = md;
+    }
+    if (config.toDocx) {
+      const docx = await storageHandler.get(pir.docx);
+      res.docx = docx;
+    }
+    return res;
+  };
+  if (pirs.length === 1) {
+    return getResponseObjectFromPIR(pirs[0]);
+  } else {
+    const res = [];
+    await Utils.asyncForEach(pirs, async (pir) => {
+      res.push(await getResponseObjectFromPIR(pir));
+    });
+    return res;
   }
-  return res;
 }
-async function html2md(url, document, transformCfg, options = {}) {
+/**
+ * Returns the result of the conversion from html to md.
+ * @param {string} url URL of the document to convert
+ * @param {HTMLElement|string} document Document to convert
+ * @param {Object} transformCfg Conversion configuration
+ * @param {Object} config Conversion configuration.
+ * @param {Object} params Conversion params. Object will be passed to the transformer functions.
+ * @returns {Object|Array} Result(s) of the conversion
+ */
+async function html2md(url, document, transformCfg, config, params = {}) {
   let doc = document;
   if (typeof document === 'string') {
     doc = new JSDOM(document, { runScripts: undefined }).window.document;
   }
-  return html2x(url, doc, transformCfg, true, false, options);
+  return html2x(url, doc, transformCfg, { ...config, toMd: true, toDocx: false }, params);
 }
-async function html2docx(url, document, transformCfg, options = {}) {
+/**
+ * Returns the result of the conversion from html to docx.
+ * @param {string} url URL of the document to convert
+ * @param {HTMLElement|string} document Document to convert
+ * @param {Object} transformCfg Conversion configuration
+ * @param {Object} config Conversion configuration.
+ * @param {Object} params Conversion params. Object will be passed to the transformer functions.
+ * @returns {Object|Array} Result(s) of the conversion
+ */
+async function html2docx(url, document, transformCfg, config, params = {}) {
   let doc = document;
   if (typeof document === 'string') {
     doc = new JSDOM(document, { runScripts: undefined }).window.document;
   }
-  return html2x(url, doc, transformCfg, true, true, options);
+  return html2x(url, doc, transformCfg, { ...config, toMd: true, toDocx: true }, params);
 }
 export {

package/src/importer/PageImporter.js CHANGED Viewed

@@ -160,7 +160,7 @@ export default class PageImporter {
     contents = this.postProcessMD(contents);
     return {
-      path: `${directory}/${sanitizedName}`,
+      path: path.join(directory, sanitizedName),
       content: contents,
     };
   }
@@ -296,6 +296,8 @@ export default class PageImporter {
           // eslint-disable-next-line no-param-reassign
           entry.source = url;
           // eslint-disable-next-line no-param-reassign
+          entry.path = res.path;
+          // eslint-disable-next-line no-param-reassign
           entry.markdown = res.content;
           if (!this.params.skipMDFileCreation) {

package/test/importers/HTML2x.spec.js CHANGED Viewed

@@ -48,6 +48,57 @@ describe('defaultGenerateDocumentPath tests', () => {
   });
 });
+describe('html2x parameters', () => {
+  const URL = 'https://www.sample.com/page.html';
+  const ORIGNAL_URL = 'https://www.notproxyurl.com/folder/page.html';
+  const HTML = '<html><head></head><body><h1>Hello World</h1></body></html>';
+  const testParams = ({
+    url,
+    document,
+    html,
+    params,
+  }) => {
+    strictEqual(url, URL);
+    strictEqual(params.originalURL, ORIGNAL_URL);
+    strictEqual(html, HTML);
+    const h1 = document.querySelector('h1');
+    ok(h1);
+    strictEqual(h1.textContent, 'Hello World');
+  };
+  it('parameters are correctly passed in single mode', async () => {
+    await html2md(URL, HTML, {
+      transformDOM: testParams,
+      generateDocumentPath: testParams,
+    }, null, {
+      originalURL: ORIGNAL_URL,
+    });
+    await html2docx(URL, HTML, {
+      transformDOM: testParams,
+      generateDocumentPath: testParams,
+    }, null, {
+      originalURL: ORIGNAL_URL,
+    });
+  });
+  it('parameters are correctly passed in multi mode', async () => {
+    await html2md(URL, HTML, {
+      transform: testParams,
+    }, null, {
+      originalURL: ORIGNAL_URL,
+    });
+    await html2docx(URL, HTML, {
+      transform: testParams,
+    }, null, {
+      originalURL: ORIGNAL_URL,
+    });
+  });
+});
 describe('html2md tests', () => {
   it('html2md provides a default transformation', async () => {
     const out = await html2md('https://www.sample.com/page.html', '<html><body><h1>Hello World</h1></body></html>');
@@ -56,7 +107,7 @@ describe('html2md tests', () => {
     strictEqual(out.path, '/page');
   });
-  it('html2md handles a custom transformations', async () => {
+  it('html2md handles a custom transformation', async () => {
     const out = await html2md('https://www.sample.com/page.html', '<html><body><h1>Hello World</h1></body></html>', {
       transformDOM: ({ document }) => {
         const p = document.createElement('p');
@@ -70,6 +121,65 @@ describe('html2md tests', () => {
     strictEqual(out.path, '/folder/my-custom-path');
   });
+  it('html2md handles multiple transform', async () => {
+    const out = await html2md('https://www.sample.com/page.html', '<html><body><h1>Hello World</h1></body></html>', {
+      transform: ({ document }) => {
+        const p1 = document.createElement('p');
+        p1.innerHTML = 'My Hello to the World 1';
+        const p2 = document.createElement('p');
+        p2.innerHTML = 'My Hello to the World 2';
+        return [{
+          element: p1,
+          path: '/my-custom-path-p1',
+        }, {
+          element: p2,
+          path: '/folder/my-custom-path-p2',
+        }];
+      },
+    });
+    const out1 = out[0];
+    strictEqual(out1.html.trim(), '<p>My Hello to the World 1</p>');
+    strictEqual(out1.md.trim(), 'My Hello to the World 1');
+    strictEqual(out1.path, '/my-custom-path-p1');
+    const out2 = out[1];
+    strictEqual(out2.html.trim(), '<p>My Hello to the World 2</p>');
+    strictEqual(out2.md.trim(), 'My Hello to the World 2');
+    strictEqual(out2.path, '/folder/my-custom-path-p2');
+  });
+  it('html2md handles multiple transform', async () => {
+    const out = await html2md('https://www.sample.com/page.html', '<html><body><h1>Hello World</h1></body></html>', {
+      transform: ({ document }) => {
+        const p1 = document.createElement('p');
+        p1.innerHTML = 'My Hello to the World 1';
+        const p2 = document.createElement('p');
+        p2.innerHTML = 'My Hello to the World 2';
+        return {
+          element: p1,
+          path: '/my-custom-path-p1',
+        };
+      },
+    });
+    strictEqual(out.html.trim(), '<p>My Hello to the World 1</p>');
+    strictEqual(out.md.trim(), 'My Hello to the World 1');
+    strictEqual(out.path, '/my-custom-path-p1');
+  });
+  it('html2md does not crash if transform returns null', async () => {
+    const out = await html2md('https://www.sample.com/page.html', '<html><body><h1>Hello World</h1></body></html>', {
+      transform: () => null,
+    });
+    strictEqual(out.length, 0);
+  });
   it('html2md can deal with null returning transformation', async () => {
     const out = await html2md('https://www.sample.com/page.html', '<html><body><h1>Hello World</h1></body></html>', {
       transformDOM: () => null,

package/test/importers/PageImporter.spec.js CHANGED Viewed

@@ -76,6 +76,7 @@ describe('PageImporter tests - various options', () => {
     const results = await se.import('/someurl');
     strictEqual(results.length, 1, 'expect no result');
+    strictEqual(results[0].path, '/someurl/somecomputedpath/resource1', 'expect no result');
     ok(await storageHandler.exists('/someurl/somecomputedpath/resource1.md'), 'md has been stored');
     ok(await storageHandler.exists('/someurl/somecomputedpath/resource1.docx'), 'docx has been stored');
@@ -144,7 +145,7 @@ describe('PageImporter tests - fixtures', () => {
     strictEqual(results.length, 1, 'expect one result');
-    const md = await storageHandler.get(`/${feature}.md`);
+    const md = await storageHandler.get(results[0].md);
     const expectedMD = await fs.readFile(path.resolve(__dirname, 'fixtures', `${feature}.spec.md`), 'utf-8');
     strictEqual(md.trim(), expectedMD.trim(), 'inported md is expected one');
   };
@@ -172,4 +173,8 @@ describe('PageImporter tests - fixtures', () => {
   it('import - complex', async () => {
     await featureTest('complex');
   });
+  it('import - spaces', async () => {
+    await featureTest('space');
+  });
 });

package/test/importers/fixtures/space.spec.html ADDED Viewed

@@ -0,0 +1,13 @@
+<html>
+  <body>
+    <h1>Space sample</h1>
+    <p>A simple paragraph</p>
+    <p>A paragraph with a br inside.<br> This should be next line.</p>
+    <p>A paragraph with a br at the end.<br></p>
+    <p>A paragraph followed by a br</p>
+    <br>
+    <p>A paragraph after the br</p>
+    &nbsp;
+    <p>A paragraph after the nbsp;</p>
+  </body>
+</html>

package/test/importers/fixtures/space.spec.md ADDED Viewed

@@ -0,0 +1,19 @@
+# Space sample
+A simple paragraph
+A paragraph with a br inside.\
+This should be next line.
+A paragraph with a br at the end.
+A paragraph followed by a br
+\
+A paragraph after the br
+A paragraph after the nbsp;