@adobe/helix-importer 1.13.1 → 1.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,3 +1,10 @@
1
+ # [1.14.0](https://github.com/adobe/helix-importer/compare/v1.13.1...v1.14.0) (2022-07-06)
2
+
3
+
4
+ ### Features
5
+
6
+ * import one document -> multiple output ([f0b688e](https://github.com/adobe/helix-importer/commit/f0b688ebc9dbb68a9981c45c99bad9fa106d4376))
7
+
1
8
  ## [1.13.1](https://github.com/adobe/helix-importer/compare/v1.13.0...v1.13.1) (2022-05-23)
2
9
 
3
10
 
package/README.md CHANGED
@@ -15,7 +15,7 @@ Idea of an explorer is to crawl the site in order to collect a list of urls. Thi
15
15
 
16
16
  Here is a basic sample:
17
17
 
18
- ```typescript
18
+ ```js
19
19
 
20
20
  import { WPContentPager, FSHandler, CSV } from '@adobe/helix-importer';
21
21
 
@@ -42,11 +42,22 @@ The final result is a list of urls that could be found on list of paged results
42
42
 
43
43
  ## Importer
44
44
 
45
- An importer must extends [PageImporter](src/importer/PageImporter.ts) and implement the `fetch` and `process` method. The general idea is that `fetch` receives the url to import and is responsible to return the HTML. `process` receives the corresponding Document in order to filter / rearrange / reshuffle the DOM before it gets processed by the Markdown transformer. `process` computes and defines the list of [PageImporterResource](src/importer/PageImporterResource.ts) (could be more than one), each resource being transformed as a Markdown document.
45
+ An importer must extend [PageImporter](src/importer/PageImporter.js) and implement the `fetch` and `process` methods. The general idea is that `fetch` receives the url to import and is responsible for returning the HTML. `process` receives the corresponding Document in order to filter / rearrange / reshuffle the DOM before it gets processed by the Markdown transformer. `process` computes and defines the list of [PageImporterResource](src/importer/PageImporterResource.ts) (could be more than one), each resource being transformed as a Markdown document.
46
46
 
47
47
  Goal of the importer is to get rid of the generic DOM elements like the header / footer, the nav... and all elements that are common to all pages in order to get the unique piece(s) of content per page.
48
48
 
49
- You can find a large collection of importer examples in repo: https://github.com/kptdobe/helix-importer-projects
49
+ ### HTML2x helpers
50
+
51
+ [HTML2x](src/importer/HTML2x.js) methods (`html2md` and `html2docx`) are convenience methods to run an import. As input, they take:
52
+ - `URL`: URL of the page to import
53
+ - `document`: the DOM element to import
54
+ - `transformerCfg`: object with the transformation "rules". Object can be either:
55
+ - `{ transformDOM: ({ url, document, html, params }) => { ... return element-to-convert }, generateDocumentPath: ({ url, document, html, params }) => { ... return path-to-target; }}` for a single mapping between one input document / one output file
56
+ - `{ transform: ({ url, document, html, params }) => { ... return [{ element: first-element-to-convert, path: first-path-to-target }, ...] }}` for a mapping between one input document and multiple output files (useful to generate multiple docx from a single web page)
57
+
58
+ ### Importer UI
59
+
60
+ The Helix Importer has a dedicated browser UI: see https://github.com/adobe/helix-importer-ui
50
61
 
51
62
  ## Installation
52
63
 
@@ -58,6 +69,6 @@ TODO: publish npm module
58
69
 
59
70
  ## Usage
60
71
 
61
- ```typescript
72
+ ```js
62
73
  import { ... } from '@adobe/helix-importer';
63
74
  ```
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@adobe/helix-importer",
3
- "version": "1.13.1",
3
+ "version": "1.14.0",
4
4
  "description": "Helix Importer tool: create md / docx from html",
5
5
  "main": "src/index.js",
6
6
  "type": "module",
@@ -17,6 +17,7 @@ import { JSDOM } from 'jsdom';
17
17
  import PageImporter from './PageImporter.js';
18
18
  import PageImporterResource from './PageImporterResource.js';
19
19
  import MemoryHandler from '../storage/MemoryHandler.js';
20
+ import Utils from '../utils/Utils.js';
20
21
 
21
22
  // import docxStylesXML from '../resources/styles.xml';
22
23
 
@@ -35,13 +36,17 @@ function preprocessDOM(document) {
35
36
  }
36
37
  }
37
38
 
38
- // eslint-disable-next-line no-unused-vars
39
- async function defaultTransformDOM({ url, document, html }) {
39
+ async function defaultTransformDOM({
40
+ // eslint-disable-next-line no-unused-vars
41
+ url, document, html, params,
42
+ }) {
40
43
  return document.body;
41
44
  }
42
45
 
43
- // eslint-disable-next-line no-unused-vars
44
- async function defaultGenerateDocumentPath({ url, document }) {
46
+ async function defaultGenerateDocumentPath({
47
+ // eslint-disable-next-line no-unused-vars
48
+ url, document, html, params,
49
+ }) {
45
50
  let p = new URL(url).pathname;
46
51
  if (p.endsWith('/')) {
47
52
  p = `${p}index`;
@@ -52,21 +57,26 @@ async function defaultGenerateDocumentPath({ url, document }) {
52
57
  .replace(/[^a-z0-9/]/gm, '-');
53
58
  }
54
59
 
55
- async function html2x(url, doc, transformCfg, toMd, toDocx, options = {}) {
56
- let name = 'index';
57
- let dirname = '';
58
-
59
- const transfrom = transformCfg || {};
60
-
61
- if (!transfrom.transformDOM) {
62
- transfrom.transformDOM = defaultTransformDOM;
63
- }
60
+ async function html2x(
61
+ url,
62
+ doc,
63
+ transformCfg,
64
+ config = { toMd: true, toDocx: false },
65
+ params = {},
66
+ ) {
67
+ const transformer = transformCfg || {};
68
+
69
+ if (!transformer.transform) {
70
+ if (!transformer.transformDOM) {
71
+ transformer.transformDOM = defaultTransformDOM;
72
+ }
64
73
 
65
- if (!transfrom.generateDocumentPath) {
66
- transfrom.generateDocumentPath = defaultGenerateDocumentPath;
74
+ if (!transformer.generateDocumentPath) {
75
+ transformer.generateDocumentPath = defaultGenerateDocumentPath;
76
+ }
67
77
  }
68
78
 
69
- if (options.preprocess !== false) {
79
+ if (config.preprocess !== false) {
70
80
  preprocessDOM(doc);
71
81
  }
72
82
 
@@ -77,22 +87,59 @@ async function html2x(url, doc, transformCfg, toMd, toDocx, options = {}) {
77
87
  }
78
88
 
79
89
  async process(document) {
80
- let output = await transfrom.transformDOM({ url, document, html });
81
- output = output || document.body;
82
-
83
- let p = await transfrom.generateDocumentPath({ url, document });
84
- if (!p) {
85
- // provided function returns null -> apply default
86
- p = await defaultGenerateDocumentPath({ url, document });
90
+ if (transformer.transform) {
91
+ let results = transformer.transform({
92
+ url,
93
+ document,
94
+ html,
95
+ params,
96
+ });
97
+ if (!results) return null;
98
+ const pirs = [];
99
+
100
+ if (!Array.isArray(results)) {
101
+ // single element with transform function
102
+ results = [results];
103
+ }
104
+
105
+ results.forEach((result) => {
106
+ const name = path.basename(result.path);
107
+ const dirname = path.dirname(result.path);
108
+
109
+ const pir = new PageImporterResource(name, dirname, result.element, null, {
110
+ html: result.element.outerHTML,
111
+ });
112
+ pirs.push(pir);
113
+ });
114
+ return pirs;
115
+ } else {
116
+ let output = await transformer.transformDOM({
117
+ url,
118
+ document,
119
+ html,
120
+ params,
121
+ });
122
+ output = output || document.body;
123
+
124
+ let p = await transformer.generateDocumentPath({
125
+ url,
126
+ document,
127
+ html,
128
+ params,
129
+ });
130
+ if (!p) {
131
+ // provided function returns null -> apply default
132
+ p = await defaultGenerateDocumentPath({ url, document });
133
+ }
134
+
135
+ const name = path.basename(p);
136
+ const dirname = path.dirname(p);
137
+
138
+ const pir = new PageImporterResource(name, dirname, output, null, {
139
+ html: output.outerHTML,
140
+ });
141
+ return [pir];
87
142
  }
88
-
89
- name = path.basename(p);
90
- dirname = path.dirname(p);
91
-
92
- const pir = new PageImporterResource(name, dirname, output, null, {
93
- html: output.outerHTML,
94
- });
95
- return [pir];
96
143
  }
97
144
  }
98
145
 
@@ -107,48 +154,78 @@ async function html2x(url, doc, transformCfg, toMd, toDocx, options = {}) {
107
154
  const storageHandler = new MemoryHandler(logger);
108
155
  const importer = new InternalImporter({
109
156
  storageHandler,
110
- skipDocxConversion: !toDocx,
111
- skipMDFileCreation: !toMd,
157
+ skipDocxConversion: !config.toDocx,
158
+ skipMDFileCreation: !config.toMd,
112
159
  logger,
113
160
  mdast2docxOptions: {
114
- stylesXML: options.docxStylesXML,
115
- svg2png: options.svg2png,
161
+ stylesXML: config.docxStylesXML,
162
+ svg2png: config.svg2png,
116
163
  },
117
164
  });
118
165
 
119
166
  const pirs = await importer.import(url);
120
167
 
121
- const res = {
122
- html: pirs[0].extra.html,
123
- };
168
+ const getResponseObjectFromPIR = async (pir) => {
169
+ const res = {
170
+ html: pir.extra.html,
171
+ };
124
172
 
125
- res.path = path.resolve(dirname, name);
173
+ res.path = path.resolve(pir.directory, pir.name);
126
174
 
127
- if (toMd) {
128
- const md = await storageHandler.get(pirs[0].md);
129
- res.md = md;
130
- }
131
- if (toDocx) {
132
- const docx = await storageHandler.get(pirs[0].docx);
133
- res.docx = docx;
175
+ if (config.toMd) {
176
+ const md = await storageHandler.get(pir.md);
177
+ res.md = md;
178
+ }
179
+ if (config.toDocx) {
180
+ const docx = await storageHandler.get(pir.docx);
181
+ res.docx = docx;
182
+ }
183
+ return res;
184
+ };
185
+
186
+ if (pirs.length === 1) {
187
+ return getResponseObjectFromPIR(pirs[0]);
188
+ } else {
189
+ const res = [];
190
+ await Utils.asyncForEach(pirs, async (pir) => {
191
+ res.push(await getResponseObjectFromPIR(pir));
192
+ });
193
+ return res;
134
194
  }
135
- return res;
136
195
  }
137
196
 
138
- async function html2md(url, document, transformCfg, options = {}) {
197
+ /**
198
+ * Returns the result of the conversion from html to md.
199
+ * @param {string} url URL of the document to convert
200
+ * @param {HTMLElement|string} document Document to convert
201
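+ * @param {Object} transformCfg Transformation rules: either `transformDOM` / `generateDocumentPath` functions, or a single `transform` function
202
+ * @param {Object} config Conversion options (e.g. `preprocess`, `docxStylesXML`, `svg2png`).
203
+ * @param {Object} params Conversion params. Object will be passed to the transformer functions.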
204
+ * @returns {Object|Array} Result(s) of the conversion
205
+ */
206
+ async function html2md(url, document, transformCfg, config, params = {}) {
139
207
  let doc = document;
140
208
  if (typeof document === 'string') {
141
209
  doc = new JSDOM(document, { runScripts: undefined }).window.document;
142
210
  }
143
- return html2x(url, doc, transformCfg, true, false, options);
211
+ return html2x(url, doc, transformCfg, { ...config, toMd: true, toDocx: false }, params);
144
212
  }
145
213
 
146
- async function html2docx(url, document, transformCfg, options = {}) {
214
+ /**
215
+ * Returns the result of the conversion from html to docx.
216
+ * @param {string} url URL of the document to convert
217
+ * @param {HTMLElement|string} document Document to convert
218
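+ * @param {Object} transformCfg Transformation rules: either `transformDOM` / `generateDocumentPath` functions, or a single `transform` function
219
+ * @param {Object} config Conversion options (e.g. `preprocess`, `docxStylesXML`, `svg2png`).
220
+ * @param {Object} params Conversion params. Object will be passed to the transformer functions.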
221
+ * @returns {Object|Array} Result(s) of the conversion
222
+ */
223
+ async function html2docx(url, document, transformCfg, config, params = {}) {
147
224
  let doc = document;
148
225
  if (typeof document === 'string') {
149
226
  doc = new JSDOM(document, { runScripts: undefined }).window.document;
150
227
  }
151
- return html2x(url, doc, transformCfg, true, true, options);
228
+ return html2x(url, doc, transformCfg, { ...config, toMd: true, toDocx: true }, params);
152
229
  }
153
230
 
154
231
  export {
@@ -160,7 +160,7 @@ export default class PageImporter {
160
160
  contents = this.postProcessMD(contents);
161
161
 
162
162
  return {
163
- path: `${directory}/${sanitizedName}`,
163
+ path: path.join(directory, sanitizedName),
164
164
  content: contents,
165
165
  };
166
166
  }
@@ -296,6 +296,8 @@ export default class PageImporter {
296
296
  // eslint-disable-next-line no-param-reassign
297
297
  entry.source = url;
298
298
  // eslint-disable-next-line no-param-reassign
299
+ entry.path = res.path;
300
+ // eslint-disable-next-line no-param-reassign
299
301
  entry.markdown = res.content;
300
302
 
301
303
  if (!this.params.skipMDFileCreation) {
@@ -48,6 +48,57 @@ describe('defaultGenerateDocumentPath tests', () => {
48
48
  });
49
49
  });
50
50
 
51
+ describe('html2x parameters', () => {
52
+ const URL = 'https://www.sample.com/page.html';
53
+ const ORIGNAL_URL = 'https://www.notproxyurl.com/folder/page.html';
54
+ const HTML = '<html><head></head><body><h1>Hello World</h1></body></html>';
55
+
56
+ const testParams = ({
57
+ url,
58
+ document,
59
+ html,
60
+ params,
61
+ }) => {
62
+ strictEqual(url, URL);
63
+ strictEqual(params.originalURL, ORIGNAL_URL);
64
+ strictEqual(html, HTML);
65
+
66
+ const h1 = document.querySelector('h1');
67
+ ok(h1);
68
+ strictEqual(h1.textContent, 'Hello World');
69
+ };
70
+
71
+ it('parameters are correctly passed in single mode', async () => {
72
+ await html2md(URL, HTML, {
73
+ transformDOM: testParams,
74
+ generateDocumentPath: testParams,
75
+ }, null, {
76
+ originalURL: ORIGNAL_URL,
77
+ });
78
+
79
+ await html2docx(URL, HTML, {
80
+ transformDOM: testParams,
81
+ generateDocumentPath: testParams,
82
+ }, null, {
83
+ originalURL: ORIGNAL_URL,
84
+ });
85
+ });
86
+
87
+ it('parameters are correctly passed in multi mode', async () => {
88
+ await html2md(URL, HTML, {
89
+ transform: testParams,
90
+ }, null, {
91
+ originalURL: ORIGNAL_URL,
92
+ });
93
+
94
+ await html2docx(URL, HTML, {
95
+ transform: testParams,
96
+ }, null, {
97
+ originalURL: ORIGNAL_URL,
98
+ });
99
+ });
100
+ });
101
+
51
102
  describe('html2md tests', () => {
52
103
  it('html2md provides a default transformation', async () => {
53
104
  const out = await html2md('https://www.sample.com/page.html', '<html><body><h1>Hello World</h1></body></html>');
@@ -56,7 +107,7 @@ describe('html2md tests', () => {
56
107
  strictEqual(out.path, '/page');
57
108
  });
58
109
 
59
- it('html2md handles a custom transformations', async () => {
110
+ it('html2md handles a custom transformation', async () => {
60
111
  const out = await html2md('https://www.sample.com/page.html', '<html><body><h1>Hello World</h1></body></html>', {
61
112
  transformDOM: ({ document }) => {
62
113
  const p = document.createElement('p');
@@ -70,6 +121,65 @@ describe('html2md tests', () => {
70
121
  strictEqual(out.path, '/folder/my-custom-path');
71
122
  });
72
123
 
124
+ it('html2md handles multiple transform', async () => {
125
+ const out = await html2md('https://www.sample.com/page.html', '<html><body><h1>Hello World</h1></body></html>', {
126
+ transform: ({ document }) => {
127
+ const p1 = document.createElement('p');
128
+ p1.innerHTML = 'My Hello to the World 1';
129
+
130
+ const p2 = document.createElement('p');
131
+ p2.innerHTML = 'My Hello to the World 2';
132
+
133
+ return [{
134
+ element: p1,
135
+ path: '/my-custom-path-p1',
136
+ }, {
137
+ element: p2,
138
+ path: '/folder/my-custom-path-p2',
139
+ }];
140
+ },
141
+ });
142
+
143
+ const out1 = out[0];
144
+ strictEqual(out1.html.trim(), '<p>My Hello to the World 1</p>');
145
+ strictEqual(out1.md.trim(), 'My Hello to the World 1');
146
+ strictEqual(out1.path, '/my-custom-path-p1');
147
+
148
+ const out2 = out[1];
149
+ strictEqual(out2.html.trim(), '<p>My Hello to the World 2</p>');
150
+ strictEqual(out2.md.trim(), 'My Hello to the World 2');
151
+ strictEqual(out2.path, '/folder/my-custom-path-p2');
152
+ });
153
+
154
+ it('html2md handles multiple transform', async () => {
155
+ const out = await html2md('https://www.sample.com/page.html', '<html><body><h1>Hello World</h1></body></html>', {
156
+ transform: ({ document }) => {
157
+ const p1 = document.createElement('p');
158
+ p1.innerHTML = 'My Hello to the World 1';
159
+
160
+ const p2 = document.createElement('p');
161
+ p2.innerHTML = 'My Hello to the World 2';
162
+
163
+ return {
164
+ element: p1,
165
+ path: '/my-custom-path-p1',
166
+ };
167
+ },
168
+ });
169
+
170
+ strictEqual(out.html.trim(), '<p>My Hello to the World 1</p>');
171
+ strictEqual(out.md.trim(), 'My Hello to the World 1');
172
+ strictEqual(out.path, '/my-custom-path-p1');
173
+ });
174
+
175
+ it('html2md does not crash if transform returns null', async () => {
176
+ const out = await html2md('https://www.sample.com/page.html', '<html><body><h1>Hello World</h1></body></html>', {
177
+ transform: () => null,
178
+ });
179
+
180
+ strictEqual(out.length, 0);
181
+ });
182
+
73
183
  it('html2md can deal with null returning transformation', async () => {
74
184
  const out = await html2md('https://www.sample.com/page.html', '<html><body><h1>Hello World</h1></body></html>', {
75
185
  transformDOM: () => null,
@@ -76,6 +76,7 @@ describe('PageImporter tests - various options', () => {
76
76
  const results = await se.import('/someurl');
77
77
 
78
78
  strictEqual(results.length, 1, 'expect no result');
79
+ strictEqual(results[0].path, '/someurl/somecomputedpath/resource1', 'expect no result');
79
80
 
80
81
  ok(await storageHandler.exists('/someurl/somecomputedpath/resource1.md'), 'md has been stored');
81
82
  ok(await storageHandler.exists('/someurl/somecomputedpath/resource1.docx'), 'docx has been stored');
@@ -144,7 +145,7 @@ describe('PageImporter tests - fixtures', () => {
144
145
 
145
146
  strictEqual(results.length, 1, 'expect one result');
146
147
 
147
- const md = await storageHandler.get(`/${feature}.md`);
148
+ const md = await storageHandler.get(results[0].md);
148
149
  const expectedMD = await fs.readFile(path.resolve(__dirname, 'fixtures', `${feature}.spec.md`), 'utf-8');
149
150
  strictEqual(md.trim(), expectedMD.trim(), 'inported md is expected one');
150
151
  };
@@ -172,4 +173,8 @@ describe('PageImporter tests - fixtures', () => {
172
173
  it('import - complex', async () => {
173
174
  await featureTest('complex');
174
175
  });
176
+
177
+ it('import - spaces', async () => {
178
+ await featureTest('space');
179
+ });
175
180
  });
@@ -0,0 +1,13 @@
1
+ <html>
2
+ <body>
3
+ <h1>Space sample</h1>
4
+ <p>A simple paragraph</p>
5
+ <p>A paragraph with a br inside.<br> This should be next line.</p>
6
+ <p>A paragraph with a br at the end.<br></p>
7
+ <p>A paragraph followed by a br</p>
8
+ <br>
9
+ <p>A paragraph after the br</p>
10
+ &nbsp;
11
+ <p>A paragraph after the nbsp;</p>
12
+ </body>
13
+ </html>
@@ -0,0 +1,19 @@
1
+ # Space sample
2
+
3
+ A simple paragraph
4
+
5
+ A paragraph with a br inside.\
6
+ This should be next line.
7
+
8
+ A paragraph with a br at the end.
9
+
10
+ A paragraph followed by a br
11
+
12
+ \
13
+
14
+
15
+ A paragraph after the br
16
+
17
+
18
+
19
+ A paragraph after the nbsp;