@adobe/helix-importer 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,365 @@
1
+ /*
2
+ * Copyright 2021 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+
13
+ /* eslint-disable class-methods-use-this */
14
+
15
+ import { JSDOM } from 'jsdom';
16
+
17
+ import path from 'path';
18
+ import unified from 'unified';
19
+ import parse from 'rehype-parse';
20
+ import { toHtml } from 'hast-util-to-html';
21
+ import rehype2remark from 'rehype-remark';
22
+ import stringify from 'remark-stringify';
23
+ import { all } from 'hast-util-to-mdast/lib/all.js';
24
+ import fs from 'fs-extra';
25
+ import { md2docx } from '@adobe/helix-md2docx';
26
+ import Utils from '../utils/Utils.js';
27
+ import DOMUtils from '../utils/DOMUtils.js';
28
+ import FileUtils from '../utils/FileUtils.js';
29
+
30
/**
 * Base class for importing a web page: downloads the HTML (optionally through a local
 * file cache), cleans up the DOM, converts the resources returned by `process()` into
 * Markdown and, unless disabled through the params, stores `.md` and `.docx` outputs
 * through `params.storageHandler`.
 *
 * `fetch()` and `process()` are empty in this class while `download()` and `import()`
 * rely on their return values, so they are presumably meant to be overridden by
 * subclasses.
 */
export default class PageImporter {
  /** Import options as received by the constructor. */
  params;

  /** Logger used for all messages; falls back to `console`. */
  logger;

  /** True when `params.cache` is truthy: downloads go through a local cache folder. */
  useCache;

  /**
   * @param {object} params Import options. This class reads `logger`, `cache`,
   * `storageHandler`, `skipMDFileCreation` and `skipDocxConversion` from it.
   */
  constructor(params) {
    this.params = params;
    this.logger = params.logger || console;

    this.useCache = !!params.cache;
  }

  /**
   * Converts Markdown to docx with `md2docx` and stores the result.
   * @param {string} docxPath Storage path of the docx file
   * @param {string} content Markdown content
   * @returns {Promise<*>} Whatever `params.storageHandler.put` returns
   */
  async convertToDocx(docxPath, content) {
    const buffer = await md2docx(content, this.logger);
    return this.params.storageHandler.put(docxPath, buffer);
  }

  /**
   * Converts the DOM of a resource to Markdown and rewrites the urls of the referenced
   * images / mp4 videos / video posters to absolute urls (resolved against `url`).
   * @param {object} resource Object with `name`, `directory`, `document` and an optional
   * `prepend` string which is added in front of the Markdown
   * @param {string} url Url of the imported page, used as base for relative urls
   * @returns {Promise<{path: string, content: string}>} Extension-less output path and
   * the Markdown content
   */
  async createMarkdown(resource, url) {
    const { name } = resource;
    const { directory } = resource;
    const sanitizedName = FileUtils.sanitizeFilename(name);
    this.logger.log(`Computing Markdonw for ${directory}/${sanitizedName}`);

    const processor = unified()
      .use(parse, { emitParseErrors: true })
      .use(rehype2remark, {
        handlers: {
          hlxembed: (h, node) => h(node, 'hlxembed', node.children[0].value),
          u: (h, node) => h(node, 'u', all(h, node)),
          table: (h, node) => h(node, 'table', toHtml(node)),
        },
      })
      .use(stringify, {
        bullet: '-',
        fence: '`',
        fences: true,
        incrementListMarker: true,
        rule: '-',
        ruleRepetition: 3,
        ruleSpaces: false,
      })
      .use(() => {
        // use custom tag and rendering because text is always encoded by default
        // we need the raw url
        processor.Compiler.prototype.visitors.hlxembed = (node) => node.value;
      })
      .use(() => {
        // tables are emitted as the raw HTML computed by the `table` handler above
        processor.Compiler.prototype.visitors.table = (node) => node.value;
      })
      .use(() => {
        processor.Compiler.prototype.visitors.u = (node) => {
          // u handling: remove the u if the first element is a link
          if (node.children && node.children.length > 0) {
            const children = node.children.map((child) => processor.stringify(child));
            // join with an explicit empty separator: the default separator of
            // Array.prototype.join is "," which would inject commas in the text
            const inner = children.join('');
            if (node.children[0].type === 'link') {
              // first element in the <u> is a link: remove the <u> - unsupported case
              return inner;
            }
            return `<u>${inner}</u>`;
          }
          return '';
        };
      })
      .use(() => {
        const originalEmphasis = processor.Compiler.prototype.visitors.emphasis;
        processor.Compiler.prototype.visitors.emphasis = (node) => {
          // @ts-ignore
          const ori = originalEmphasis.apply(processor.Compiler(), [node]);
          return ori;
        };
      });

    const file = await processor.process(resource.document.innerHTML);
    let contents = file.contents.toString();

    // decodeURI throws a URIError on malformed escape sequences (e.g. a lone "%"):
    // fall back to the raw value instead of aborting the whole conversion
    const safeDecode = (value) => {
      try {
        return decodeURI(value);
      } catch (error) {
        return value;
      }
    };

    // true if the (non empty) reference appears in the Markdown, encoded or decoded
    const isReferenced = (ref) => Boolean(ref)
      && (contents.includes(ref) || contents.includes(safeDecode(ref)));

    // process image links
    const { document } = resource;
    const assets = [];
    document.querySelectorAll('img').forEach((img) => {
      const { src } = img;
      const isEmbed = img.classList.contains('hlx-embed');
      if (!isEmbed && isReferenced(src)) {
        assets.push({
          url: src,
          append: '#image.png',
        });
      }
    });

    document.querySelectorAll('a').forEach((a) => {
      const { href } = a;
      if (isReferenced(href)) {
        try {
          const u = new URL(href, url);
          const ext = path.extname(u.href);
          if (ext === '.mp4') {
            // upload mp4
            assets.push({
              url: href,
              append: '#image.mp4',
            });
          }
        } catch (error) {
          this.logger.warn(`Invalid link in the page: ${href}`);
        }
      }
    });

    document.querySelectorAll('video source').forEach((s) => {
      const { src } = s;
      if (isReferenced(src)) {
        try {
          const u = new URL(src, url);
          const ext = path.extname(u.href);
          if (ext === '.mp4') {
            const poster = s.parentNode.getAttribute('poster');
            if (poster) {
              assets.push({
                url: poster,
              });
            }
            // upload mp4
            assets.push({
              url: src,
              append: '#image.mp4',
            });
          }
        } catch (error) {
          this.logger.warn(`Invalid video in the page: ${src}`);
        }
      }
    });

    const escapeRegExp = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

    // Replaces, in a single pass, the encoded and decoded forms of oldSrc by newSrc.
    // A single pass matters: with 2 consecutive replacements, a relative oldSrc would be
    // matched again inside the absolute url which has just been inserted.
    const patchSrcInContent = (c, oldSrc, newSrc) => {
      const variants = [...new Set([oldSrc, safeDecode(oldSrc)])].filter((v) => v !== '');
      if (variants.length === 0) {
        return c;
      }
      const pattern = new RegExp(variants.map(escapeRegExp).join('|'), 'gm');
      // function replacer: "$" sequences in newSrc must not be interpreted
      return c.replace(pattern, () => newSrc);
    };

    // adjust assets url (from relative to absolute)
    const patched = new Set();
    assets.forEach((asset) => {
      if (patched.has(asset.url)) {
        // same url referenced several times: patching again would prefix it twice
        return;
      }
      patched.add(asset.url);

      let absolute;
      try {
        absolute = new URL(safeDecode(asset.url), url).toString();
      } catch (error) {
        this.logger.warn(`Invalid asset url in the page: ${asset.url}`);
        return;
      }
      contents = patchSrcInContent(contents, asset.url, absolute);
    });

    if (resource.prepend) {
      contents = resource.prepend + contents;
    }

    contents = this.postProcessMD(contents);

    return {
      path: `${directory}/${sanitizedName}`,
      content: contents,
    };
  }

  /**
   * Removes `script` and `hr` elements, comments and spans through the DOMUtils helpers.
   * @param {Document} document DOM to clean up (modified in place)
   */
  cleanup(document) {
    DOMUtils.remove(document, ['script', 'hr']);
    DOMUtils.removeComments(document);
    DOMUtils.removeSpans(document);
  }

  /**
   * Prepares a freshly parsed DOM for the conversion: cleanup, DOMUtils reviews of
   * headings / paragraphs / inline elements, unwrapping of `<u><a>` combos and
   * normalization of the images (lazy loading, base64 sources, redundant titles).
   * @param {Document} document DOM to pre-process (modified in place)
   */
  preProcess(document) {
    this.cleanup(document);
    DOMUtils.reviewHeadings(document);
    DOMUtils.reviewParagraphs(document);
    DOMUtils.escapeSpecialCharacters(document);
    [
      'b',
      'a',
      'big',
      'code',
      'em',
      'i',
      'label',
      's',
      'small', /* , 'span' */
      'strong',
      'sub',
      'sup',
      'u',
      'var',
    ].forEach((tag) => DOMUtils.reviewInlineElement(document, tag));

    // u a tag combo is not handled properly by unified js and is discouraged anyway -> remove the u
    document.querySelectorAll('u > a').forEach((a) => {
      const p = a.parentNode;
      p.before(a);
      p.remove();
    });

    const imgs = document.querySelectorAll('img');
    imgs.forEach((img) => {
      let src = img.getAttribute('src');
      const dataSrc = img.getAttribute('data-src');
      if (!src && dataSrc) {
        // lazy loading case
        img.setAttribute('src', dataSrc);
      }

      if (dataSrc && src && src.indexOf('data:') === 0) {
        // b64 img, try replace with dataSrc
        img.setAttribute('src', dataSrc);
      }

      src = img.getAttribute('src');
      if (!src || src.indexOf('data:') === 0) {
        // we cannot handle b64 asset for now, remove
        img.remove();
      }

      const alt = img.getAttribute('alt');
      const title = img.getAttribute('title');
      if (title && title === alt) {
        // a11y: image title has little value if it's the same than the alt text.
        img.removeAttribute('title');
      }
    });
  }

  /**
   * Hook executed by `import()` once `process()` has run.
   * @param {Document} document DOM of the page (modified in place)
   */
  postProcess(document) {
    DOMUtils.encodeImagesForTable(document);
  }

  /**
   * Final clean up of the Markdown: un-escapes `\\~` into `\~` and turns every
   * `hlx_replaceTag(xyz)` placeholder into `<xyz>`.
   * @param {string} md Markdown to process
   * @returns {string} The processed Markdown
   */
  postProcessMD(md) {
    return md
      .replace(/\\\\~/gm, '\\~')
      // capture group + function replacer: the tag content is used literally, whatever
      // regular expression or "$" special characters it contains
      .replace(/hlx_replaceTag\((.*?)\)/gm, (match, by) => `<${by}>`);
  }

  /**
   * Retrieves the HTML of a page, through the local cache folder when the cache is
   * enabled, otherwise (or on cache miss) through `this.fetch(url)`.
   * @param {string} url Url of the page
   * @returns {Promise<*>} The response text; on cache hit, whatever `fs.readFile`
   * resolves to (no encoding is passed, so presumably a Buffer)
   * @throws {Error} If the response is missing or not `ok`
   */
  async download(url) {
    // cache file name: url pathname without leading / trailing slashes, "/" replaced by "_"
    const getLocalCacheName = (p) => path.resolve(p, `${new URL(url).pathname.replace(/^\/+|\/+$/g, '').replace(/\//gm, '_')}.html`);

    if (this.useCache) {
      const localPath = getLocalCacheName(this.params.cache);
      if (await fs.exists(localPath)) {
        return fs.readFile(localPath);
      }
    }

    const res = await this.fetch(url);
    if (!res || !res.ok) {
      this.logger.error(`${url}: Invalid response`, res);
      throw new Error(`${url}: Invalid response - ${res ? res.statusText : 'no response'}`);
    }

    const html = await res.text();

    if (this.useCache) {
      const localPath = getLocalCacheName(this.params.cache);
      await fs.mkdirs(path.dirname(localPath));
      await fs.writeFile(localPath, html);
    }

    return html;
  }

  /**
   * Downloads a page and parses it into a pre-processed DOM.
   * @param {string} url Url of the page
   * @returns {Promise<{document: Document, html: *}|null>} The DOM and the downloaded
   * HTML, or null if the download produced nothing
   */
  async get(url) {
    const html = await this.download(url);

    if (html) {
      const { document } = new JSDOM(DOMUtils.removeNoscripts(html.toString())).window;
      this.preProcess(document);
      return {
        document,
        html,
      };
    }

    return null;
  }

  /**
   * Imports a page: download, `process()`, Markdown creation and storage of the `.md` /
   * `.docx` files (each can be skipped with `params.skipMDFileCreation` /
   * `params.skipDocxConversion`).
   * @param {string} url Url of the page to import
   * @param {*} entryParams Passed through to `process()`
   * @returns {Promise<object[]>} The entries returned by `process()`, enriched with
   * `source`, `markdown` and, when created, `md` / `docx` paths
   */
  async import(url, entryParams) {
    const startTime = new Date().getTime();

    // get() resolves to null when nothing could be downloaded: do not destructure null
    const { document, html } = (await this.get(url)) || {};

    const results = [];
    if (document) {
      const entries = await this.process(document, url, entryParams, html);

      this.postProcess(document);

      if (entries) {
        await Utils.asyncForEach(entries, async (entry) => {
          const res = await this.createMarkdown(entry, url);
          // eslint-disable-next-line no-param-reassign
          entry.source = url;
          // eslint-disable-next-line no-param-reassign
          entry.markdown = res.content;

          if (!this.params.skipMDFileCreation) {
            const mdPath = `${res.path}.md`;
            await this.params.storageHandler.put(mdPath, res.content);
            this.logger.log(`MD file created: ${mdPath}`);

            // eslint-disable-next-line no-param-reassign
            entry.md = mdPath;
          }

          if (!this.params.skipDocxConversion) {
            const docxPath = `${res.path}.docx`;
            await this.convertToDocx(docxPath, res.content);
            // eslint-disable-next-line no-param-reassign
            entry.docx = docxPath;
          }

          results.push(entry);
        });
      }
    }

    this.logger.log('');
    this.logger.log(`${url}: Process took ${(new Date().getTime() - startTime) / 1000}s.`);

    return results;
  }

  /**
   * No-op in this class. `download()` calls it with the url and expects a response-like
   * object (`ok`, `statusText`, `text()`).
   */
  fetch() {}

  /**
   * No-op in this class. `import()` calls it with `(document, url, entryParams, html)`
   * and passes each entry of the returned list to `createMarkdown()`.
   */
  process() {}
}
@@ -0,0 +1,23 @@
1
+ /*
2
+ * Copyright 2021 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+
13
/**
 * Declares the options read by PageImporter. Every property is left undefined on
 * creation.
 */
export default class PageImporterParams {
  /** Storage backend; PageImporter calls its `put(path, content)` method. */
  storageHandler = undefined;

  /** When truthy, enables the download cache; the value is used as the cache folder. */
  cache = undefined;

  /** When truthy, no `.docx` file is produced. */
  skipDocxConversion = undefined;

  /** When truthy, no `.md` file is stored. */
  skipMDFileCreation = undefined;

  /** Logger; PageImporter falls back to `console` when it is not set. */
  logger = undefined;
}
@@ -0,0 +1,31 @@
1
+ /*
2
+ * Copyright 2021 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+
13
/**
 * Value holder describing one resource to convert: the DOM to transform plus the name
 * and directory of the output.
 */
export default class PageImporterResource {
  /** DOM (fragment) of the resource. */
  document;

  /** Name of the resource. */
  name;

  /** Directory of the resource. */
  directory;

  /** Optional content placed in front of the generated output. */
  prepend;

  /** Free-form extra data attached to the resource. */
  extra;

  /**
   * @param {*} name Name of the resource
   * @param {*} directory Directory of the resource
   * @param {*} document DOM of the resource
   * @param {*} prepend Content to prepend
   * @param {*} extra Extra data
   */
  constructor(name, directory, document, prepend, extra) {
    // the fields are already declared above: this only assigns their values
    Object.assign(this, {
      name,
      directory,
      document,
      prepend,
      extra,
    });
  }
}
package/src/index.js ADDED
@@ -0,0 +1,46 @@
1
+ /*
2
+ * Copyright 2021 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+ import PagingExplorer from './explorer/PagingExplorer.js';
13
+ import PagingExplorerParams from './explorer/PagingExplorerParams.js';
14
+
15
+ import PageImporter from './importer/PageImporter.js';
16
+ import PageImporterParams from './importer/PageImporterParams.js';
17
+ import PageImporterResource from './importer/PageImporterResource.js';
18
+
19
+ import FSHandler from './storage/FSHandler.js';
20
+
21
+ import CSV from './utils/CSV.js';
22
+ import DOMUtils from './utils/DOMUtils.js';
23
+ import FileUtils from './utils/FileUtils.js';
24
+ import Utils from './utils/Utils.js';
25
+
26
+ import WPUtils from './wp/WPUtils.js';
27
+ import WPAdminAjaxPager from './wp/explorers/WPAdminAjaxPager.js';
28
+ import WPContentPager from './wp/explorers/WPContentPager.js';
29
+ import WPPostWrapPager from './wp/explorers/WPPostWrapPager.js';
30
+
31
+ export {
32
+ PagingExplorer,
33
+ PagingExplorerParams,
34
+ PageImporter,
35
+ PageImporterParams,
36
+ PageImporterResource,
37
+ FSHandler,
38
+ CSV,
39
+ DOMUtils,
40
+ FileUtils,
41
+ Utils,
42
+ WPUtils,
43
+ WPAdminAjaxPager,
44
+ WPContentPager,
45
+ WPPostWrapPager,
46
+ };
@@ -0,0 +1,46 @@
1
+ /*
2
+ * Copyright 2021 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+
13
+ import fs from 'fs-extra';
14
+ import path from 'path';
15
+
16
/**
 * Storage handler backed by the file system: every path is located under the `target`
 * folder.
 */
export default class FSHandler {
  /** Root folder of the storage. */
  target;

  /** Logger used for all messages; falls back to `console`. */
  logger;

  /**
   * @param {string} target Root folder of the storage
   * @param {object} [logger] Logger, defaults to `console`
   */
  constructor(target, logger) {
    this.logger = logger || console;
    this.target = target;
  }

  /**
   * Writes a file, creating the parent folders if needed.
   * @param {string} filePath Path of the file, relative to the target folder
   * @param {*} content Content handed over to `fs.writeFile`
   */
  async put(filePath, content) {
    const local = path.resolve(path.join(this.target, filePath));
    this.logger.log(`Writting file to file system: ${local}`);
    await fs.mkdirs(path.dirname(local));
    await fs.writeFile(local, content);
  }

  /**
   * Reads a file.
   * @param {string} filePath Path of the file, relative to the target folder
   * @returns {Promise<*>} Whatever `fs.readFile` resolves to (no encoding is passed)
   */
  async get(filePath) {
    // join before resolve, like put(): path.resolve(target, '/a/b') would ignore the
    // target for a path with a leading slash and a file written by put() would not be found
    const local = path.resolve(path.join(this.target, filePath));
    this.logger.log(`Reading file from file system: ${local}`);

    return fs.readFile(local);
  }

  /**
   * Checks if a file exists.
   * @param {string} filePath Path of the file, relative to the target folder
   * @returns {Promise<*>} Whatever `fs.exists` resolves to
   */
  async exists(filePath) {
    // same path computation as put() and get()
    const local = path.resolve(path.join(this.target, filePath));
    this.logger.log(`Checking if file from file system exists: ${local}`);

    return fs.exists(local);
  }
}
@@ -0,0 +1,36 @@
1
+ /*
2
+ * Copyright 2021 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+
13
/**
 * Storage handler which keeps the content in memory, in the `storage` object (one
 * property per path).
 */
export default class MemoryHandler {
  /** Stored content, keyed by path. */
  storage = {};

  /** Logger used for all messages; falls back to `console`. */
  logger;

  /**
   * @param {object} [logger] Logger, defaults to `console`
   */
  constructor(logger) {
    this.logger = logger || console;
  }

  /**
   * Stores a content.
   * @param {string} path Path of the content
   * @param {*} content Content to store
   */
  async put(path, content) {
    this.logger.log('MemoryHandler#put', path, content);
    this.storage[path] = content;
  }

  /**
   * Retrieves a content.
   * @param {string} path Path of the content
   * @returns {Promise<*>} The stored content, undefined if nothing is stored at this path
   */
  async get(path) {
    this.logger.log('MemoryHandler#get', path);
    // own properties only: "toString", "constructor"... must not resolve to the
    // members inherited from Object.prototype
    return Object.prototype.hasOwnProperty.call(this.storage, path)
      ? this.storage[path]
      : undefined;
  }

  /**
   * Checks if a content has been stored.
   * @param {string} path Path of the content
   * @returns {Promise<boolean>} True if something (even an empty content) is stored at
   * this path
   */
  async exists(path) {
    this.logger.log('MemoryHandler#exists', path);
    // presence check instead of truthiness: an empty string is a valid content
    return Object.prototype.hasOwnProperty.call(this.storage, path);
  }
}
@@ -0,0 +1,96 @@
1
+ /*
2
+ * Copyright 2021 Adobe. All rights reserved.
3
+ * This file is licensed to you under the Apache License, Version 2.0 (the "License");
4
+ * you may not use this file except in compliance with the License. You may obtain a copy
5
+ * of the License at http://www.apache.org/licenses/LICENSE-2.0
6
+ *
7
+ * Unless required by applicable law or agreed to in writing, software distributed under
8
+ * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
9
+ * OF ANY KIND, either express or implied. See the License for the specific language
10
+ * governing permissions and limitations under the License.
11
+ */
12
+
13
+ import os from 'os';
14
+
15
export default class CSV {
  /**
   * Minimalistic CSV conversion of an array of objects:
   * the keys of the first object determine the CSV headers.
   * Every cell (headers included) is followed by the delimiter and every row by `os.EOL`.
   * null and undefined values give an empty cell; any other value (0 and false included)
   * is converted to a string.
   * Note: the delimiter character is not supported in values
   * @param {object[]} entries List of objects
   * @param {string} delimiter CSV delimiter
   * @param {boolean} skipHeaders True to skip the headers
   * @returns {string} CSV string, empty if there is no entry
   */
  static toCSV(entries, delimiter = ';', skipHeaders = false) {
    if (!entries || entries.length === 0) {
      return '';
    }

    const headers = Object.keys(entries[0]);
    const toRow = (cells) => cells.map((cell) => cell + delimiter).join('') + os.EOL;

    const rows = entries.map((entry) => toRow(headers.map((header) => {
      const value = entry[header];
      // only "no value" becomes an empty cell: `value || ''` would also drop 0 and false
      return value === undefined || value === null ? '' : value;
    })));

    if (!skipHeaders) {
      rows.unshift(toRow(headers));
    }

    return rows.join('');
  }

  /**
   * Converts a CSV string into an array of objects. The first row provides the property
   * names; names and values are trimmed. Rows can be separated by LF or CRLF, whatever
   * the platform the code runs on.
   * @param {string} csv The CSV string
   * @param {string} delimiter Delimiter string
   * @returns {object[]} An array of objects for which each CSV column is a property
   */
  static toArray(csv, delimiter = ';') {
    // accept both line endings: a file is not necessarily read on the OS which wrote it
    const rows = csv.split(/\r?\n/);

    if (rows[rows.length - 1] === '') {
      // remove the empty element produced by the trailing end of line
      rows.pop();
    }

    if (rows.length === 0) {
      return [];
    }

    const [headerRow, ...dataRows] = rows;
    const headers = headerRow.split(delimiter);
    if (headers[headers.length - 1] === '') {
      // remove the empty element produced by the trailing delimiter
      headers.pop();
    }
    const names = headers.map((header) => header.trim());

    return dataRows.map((row) => {
      const obj = {};
      // ignore the cell(s) beyond the known headers (trailing delimiter)
      row.split(delimiter).slice(0, names.length).forEach((value, index) => {
        obj[names[index]] = value ? value.trim() : '';
      });
      return obj;
    });
  }
}