@adobe/helix-importer 2.9.41 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +19 -0
- package/README.md +5 -38
- package/package.json +13 -6
- package/src/importer/HTML2x.js +19 -30
- package/src/importer/PageImporter.js +12 -4
- package/src/importer/PageImporterParams.js +2 -0
- package/src/importer/defaults/generateDocumentPath.js +24 -0
- package/src/importer/defaults/rules/adjustImageUrls.js +28 -0
- package/src/importer/defaults/rules/convertIcons.js +25 -0
- package/src/importer/defaults/rules/createMetadata.js +102 -0
- package/src/importer/defaults/rules/transformBackgroundImages.js +21 -0
- package/src/importer/defaults/transformDOM.js +42 -0
- package/src/index.js +13 -11
- package/src/utils/BrowserUtils.js +29 -0
- package/src/utils/DOMUtils.js +19 -13
- package/src/wp/WPUtils.js +1 -3
- package/{src/explorer/PagingExplorerParams.js → test/TestUtils.js} +8 -4
- package/test/browser/BrowserUtils.test.js +42 -0
- package/test/browser/DOMUtils.test.js +67 -0
- package/test/importers/HTML2x.spec.js +122 -38
- package/test/importers/PageImporter.spec.js +37 -2
- package/test/importers/defaults/fixtures/adjust-image-urls.expected.html +7 -0
- package/test/importers/defaults/fixtures/adjust-image-urls.input.html +10 -0
- package/test/importers/defaults/fixtures/background-image.expected.html +13 -0
- package/test/importers/defaults/fixtures/background-image.input.html +10 -0
- package/test/importers/defaults/fixtures/cleanup.expected.html +5 -0
- package/test/importers/defaults/fixtures/cleanup.input.html +11 -0
- package/test/importers/defaults/fixtures/default.expected.html +4 -0
- package/test/importers/defaults/fixtures/default.input.html +6 -0
- package/test/importers/defaults/fixtures/icons.expected.html +4 -0
- package/test/importers/defaults/fixtures/icons.input.html +6 -0
- package/test/importers/defaults/fixtures/metadata.all.diff.expected.html +40 -0
- package/test/importers/defaults/fixtures/metadata.all.diff.input.html +17 -0
- package/test/importers/defaults/fixtures/metadata.all.same.expected.html +20 -0
- package/test/importers/defaults/fixtures/metadata.all.same.input.html +17 -0
- package/test/importers/defaults/fixtures/metadata.basic.expected.html +16 -0
- package/test/importers/defaults/fixtures/metadata.basic.input.html +9 -0
- package/test/importers/defaults/fixtures/metadata.image.expected.html +12 -0
- package/test/importers/defaults/fixtures/metadata.image.input.html +9 -0
- package/test/importers/defaults/fixtures/metadata.og.expected.html +16 -0
- package/test/importers/defaults/fixtures/metadata.og.input.html +9 -0
- package/test/importers/defaults/fixtures/metadata.twitter.expected.html +16 -0
- package/test/importers/defaults/fixtures/metadata.twitter.input.html +9 -0
- package/test/importers/defaults/generateDocumentPath.spec.js +32 -0
- package/test/importers/defaults/transformDOM.spec.js +94 -0
- package/test/importers/fixtures/video.spec.html +11 -0
- package/test/importers/fixtures/video.spec.md +7 -0
- package/test/utils/DOMUtils.spec.js +23 -4
- package/src/explorer/PagingExplorer.js +0 -81
- package/src/wp/explorers/WPAdminAjaxPager.js +0 -51
- package/src/wp/explorers/WPContentPager.js +0 -48
- package/src/wp/explorers/WPPostWrapPager.js +0 -43
- package/test/explorers/PagingExplorer.spec.js +0 -280
package/CHANGELOG.md
CHANGED
|
@@ -1,3 +1,22 @@
|
|
|
1
|
+
# [3.1.0](https://github.com/adobe/helix-importer/compare/v3.0.0...v3.1.0) (2023-11-13)
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
### Features
|
|
5
|
+
|
|
6
|
+
* improve default import ([#261](https://github.com/adobe/helix-importer/issues/261)) ([251cfcd](https://github.com/adobe/helix-importer/commit/251cfcdde7ac54525eef49341a9ede52d368cf71))
|
|
7
|
+
|
|
8
|
+
# [3.0.0](https://github.com/adobe/helix-importer/compare/v2.9.41...v3.0.0) (2023-11-13)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
### Features
|
|
12
|
+
|
|
13
|
+
* get rid of JSDOM ([#260](https://github.com/adobe/helix-importer/issues/260)) ([2530363](https://github.com/adobe/helix-importer/commit/2530363c328958b2363f0f178b9b1ef7d7bb73b8))
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
### BREAKING CHANGES
|
|
17
|
+
|
|
18
|
+
* removing JSDOM
|
|
19
|
+
|
|
1
20
|
## [2.9.41](https://github.com/adobe/helix-importer/compare/v2.9.40...v2.9.41) (2023-11-05)
|
|
2
21
|
|
|
3
22
|
|
package/README.md
CHANGED
|
@@ -2,43 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
Foundation tools for importing website content into a format that can be consumed in a Helix project.
|
|
4
4
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
1. explorer: crawl a website to construct a list of urls to be importer
|
|
8
|
-
2. importer: construct an importer - for an input url, transform the DOM and convert it into a Markdown file
|
|
9
|
-
|
|
10
|
-
The folder [./src/wp](./src/wp) contains WordPress specific utils and explorer methods.
|
|
11
|
-
|
|
12
|
-
## Explorer
|
|
13
|
-
|
|
14
|
-
Idea of an explorer is to crawl the site in order to collect a list of urls. This list of urls can then be imported.
|
|
15
|
-
|
|
16
|
-
Here is a basic sample:
|
|
17
|
-
|
|
18
|
-
```js
|
|
19
|
-
|
|
20
|
-
import { WPContentPager, FSHandler, CSV } from '@adobe/helix-importer';
|
|
21
|
-
|
|
22
|
-
async function main() {
|
|
23
|
-
const pager = new WPContentPager({
|
|
24
|
-
nbMaxPages: 1000,
|
|
25
|
-
url: 'url to a WordPress site'
|
|
26
|
-
});
|
|
27
|
-
|
|
28
|
-
const entries = await pager.explore();
|
|
29
|
-
|
|
30
|
-
const csv = CSV.toCSV(entries);
|
|
31
|
-
|
|
32
|
-
const handler = new FSHandler('output', console);
|
|
33
|
-
await handler.put('explorer_results.csv', csv);
|
|
34
|
-
}
|
|
35
|
-
```
|
|
36
|
-
|
|
37
|
-
In this example, the [WPContentPager](./src/wp/explorers/WPContentPager.ts) extends the [PagingExplorer](src/explorer/PagingExplorer.ts) which implements the 2 methods:
|
|
38
|
-
- `fetch` which defines how to fetch one page on results
|
|
39
|
-
- `explore` which extracts the list of urls present on that page
|
|
40
|
-
|
|
41
|
-
The final result is a list of urls that could be found on list of paged results given by the WordPress API `/page/${page_number}`.
|
|
5
|
+
Basic concept of the importer: for an input url, transform the DOM and convert it into a Markdown / docx file.
|
|
42
6
|
|
|
43
7
|
## Importer
|
|
44
8
|
|
|
@@ -50,10 +14,13 @@ Goal of the importer is to get rid of the generic DOM elements like the header /
|
|
|
50
14
|
|
|
51
15
|
[HTML2x](src/importer/HTML2x.js) methods (`HTML2md` and `HTML2docx`) are convenience methods to run an import. As input, they take:
|
|
52
16
|
- `URL`: URL of the page to import
|
|
53
|
-
- `document`: the DOM element to import
|
|
17
|
+
- `document`: the DOM element to import - a Document object or a string (see `createDocumentFromString` for the string case)
|
|
54
18
|
- `transformerCfg`: object with the transformation "rules". Object can be either:
|
|
55
19
|
- `{ transformDOM: ({ url, document, html, params }) => { ... return element-to-convert }, generateDocumentPath: ({ url, document, html, params }) => { ... return path-to-target; }}` for a single mapping between one input document / one output file
|
|
56
20
|
- `{ transform: ({ url, document, html, params }) => { ... return [{ element: first-element-to-convert, path: first-path-to-target }, ...] }` for a mapping one input document / multiple output files (useful to generate multiple docx from a single web page)
|
|
21
|
+
- `config`: object with several config properties
|
|
22
|
+
- `createDocumentFromString`: this config is required if you use the methods in a non-browser context and want to pass `document` param as string. This method receives the HTML to parse as a string and must return a Document object.
|
|
23
|
+
- `setBackgroundImagesFromCSS`: set to false to disable the `background-image` inlining in the DOM.
|
|
57
24
|
|
|
58
25
|
### Importer UI
|
|
59
26
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@adobe/helix-importer",
|
|
3
|
-
"version": "
|
|
3
|
+
"version": "3.1.0",
|
|
4
4
|
"description": "Helix Importer tool: create md / docx from html",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"type": "module",
|
|
@@ -10,8 +10,10 @@
|
|
|
10
10
|
},
|
|
11
11
|
"scripts": {
|
|
12
12
|
"lint": "eslint .",
|
|
13
|
-
"test": "
|
|
14
|
-
"test
|
|
13
|
+
"test:web": "web-test-runner test/browser/*.test.js --node-resolve",
|
|
14
|
+
"test:web:watch": "web-test-runner test/browser/*.test.js --node-resolve --watch",
|
|
15
|
+
"test": "c8 mocha && npm run test:web",
|
|
16
|
+
"test-ci": "c8 mocha && npm run test:web",
|
|
15
17
|
"semantic-release": "semantic-release",
|
|
16
18
|
"prepare": "npx husky install"
|
|
17
19
|
},
|
|
@@ -27,15 +29,21 @@
|
|
|
27
29
|
"devDependencies": {
|
|
28
30
|
"@adobe/eslint-config-helix": "2.0.4",
|
|
29
31
|
"@adobe/helix-docx2md": "1.5.0",
|
|
30
|
-
"@adobe/helix-mediahandler": "2.3.
|
|
32
|
+
"@adobe/helix-mediahandler": "2.3.3",
|
|
33
|
+
"@esm-bundle/chai": "4.3.4-fix.0",
|
|
31
34
|
"@semantic-release/changelog": "6.0.3",
|
|
32
35
|
"@semantic-release/exec": "6.0.3",
|
|
33
36
|
"@semantic-release/git": "10.0.1",
|
|
37
|
+
"@web/test-runner": "0.15.1",
|
|
38
|
+
"@web/test-runner-commands": "0.6.5",
|
|
39
|
+
"@web/test-runner-mocha": "0.8.1",
|
|
34
40
|
"c8": "8.0.1",
|
|
41
|
+
"chai": "4.3.7",
|
|
35
42
|
"dirname-filename-esm": "1.1.1",
|
|
36
43
|
"eslint": "8.53.0",
|
|
37
44
|
"husky": "8.0.3",
|
|
38
|
-
"
|
|
45
|
+
"jsdom": "22.1.0",
|
|
46
|
+
"lint-staged": "15.1.0",
|
|
39
47
|
"mocha": "10.2.0",
|
|
40
48
|
"mocha-multi-reporters": "1.5.1",
|
|
41
49
|
"mock-fs": "5.2.0",
|
|
@@ -52,7 +60,6 @@
|
|
|
52
60
|
"form-data": "4.0.0",
|
|
53
61
|
"fs-extra": "11.1.1",
|
|
54
62
|
"hast-util-to-mdast": "10.1.0",
|
|
55
|
-
"jsdom": "22.1.0",
|
|
56
63
|
"node-fetch": "3.3.2",
|
|
57
64
|
"rehype-parse": "9.0.0",
|
|
58
65
|
"rehype-remark": "10.0.0",
|
package/src/importer/HTML2x.js
CHANGED
|
@@ -12,12 +12,13 @@
|
|
|
12
12
|
/* eslint-disable class-methods-use-this, no-console */
|
|
13
13
|
|
|
14
14
|
import path from 'path';
|
|
15
|
-
import { Response } from 'node-fetch';
|
|
16
|
-
import { JSDOM } from 'jsdom';
|
|
17
15
|
import PageImporter from './PageImporter.js';
|
|
18
16
|
import PageImporterResource from './PageImporterResource.js';
|
|
19
17
|
import MemoryHandler from '../storage/MemoryHandler.js';
|
|
20
18
|
import Utils from '../utils/Utils.js';
|
|
19
|
+
import BrowserUtils from '../utils/BrowserUtils.js';
|
|
20
|
+
import defaultTransformDOM from './defaults/transformDOM.js';
|
|
21
|
+
import defaultGenerateDocumentPath from './defaults/generateDocumentPath.js';
|
|
21
22
|
|
|
22
23
|
// import docxStylesXML from '../resources/styles.xml';
|
|
23
24
|
|
|
@@ -36,27 +37,6 @@ function setBackgroundImagesFromCSS(document) {
|
|
|
36
37
|
}
|
|
37
38
|
}
|
|
38
39
|
|
|
39
|
-
async function defaultTransformDOM({
|
|
40
|
-
// eslint-disable-next-line no-unused-vars
|
|
41
|
-
url, document, html, params,
|
|
42
|
-
}) {
|
|
43
|
-
return document.body;
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
async function defaultGenerateDocumentPath({
|
|
47
|
-
// eslint-disable-next-line no-unused-vars
|
|
48
|
-
url, document, html, params,
|
|
49
|
-
}) {
|
|
50
|
-
let p = new URL(url).pathname;
|
|
51
|
-
if (p.endsWith('/')) {
|
|
52
|
-
p = `${p}index`;
|
|
53
|
-
}
|
|
54
|
-
return decodeURIComponent(p)
|
|
55
|
-
.toLowerCase()
|
|
56
|
-
.replace(/\.html$/, '')
|
|
57
|
-
.replace(/[^a-z0-9/]/gm, '-');
|
|
58
|
-
}
|
|
59
|
-
|
|
60
40
|
async function html2x(
|
|
61
41
|
url,
|
|
62
42
|
doc,
|
|
@@ -93,8 +73,8 @@ async function html2x(
|
|
|
93
73
|
|
|
94
74
|
const html = doc.documentElement.outerHTML;
|
|
95
75
|
class InternalImporter extends PageImporter {
|
|
96
|
-
async
|
|
97
|
-
return
|
|
76
|
+
async get() {
|
|
77
|
+
return { document: doc, html };
|
|
98
78
|
}
|
|
99
79
|
|
|
100
80
|
async process(document) {
|
|
@@ -181,6 +161,7 @@ async function html2x(
|
|
|
181
161
|
stylesXML: config.docxStylesXML,
|
|
182
162
|
image2png: config.image2png,
|
|
183
163
|
},
|
|
164
|
+
createDocumentFromString: config.createDocumentFromString,
|
|
184
165
|
});
|
|
185
166
|
|
|
186
167
|
const pirs = await importer.import(url);
|
|
@@ -224,10 +205,18 @@ async function html2x(
|
|
|
224
205
|
}
|
|
225
206
|
}
|
|
226
207
|
|
|
208
|
+
/**
 * Turns an HTML string into a document object.
 *
 * When `config.createDocumentFromString` is set (truthy) it is invoked as a method
 * of `config`; otherwise parsing is delegated to `BrowserUtils.createDocumentFromString`.
 *
 * @param {string} html The HTML markup to parse
 * @param {Object} [config] Conversion configuration, may be undefined
 * @returns {*} Whatever the selected parser returns
 */
const parseStringDocument = (html, config) => (
  config?.createDocumentFromString
    ? config.createDocumentFromString(html)
    : BrowserUtils.createDocumentFromString(html)
);
|
|
215
|
+
|
|
227
216
|
/**
|
|
228
217
|
* Returns the result of the conversion from html to md.
|
|
229
218
|
* @param {string} url URL of the document to convert
|
|
230
|
-
* @param {
|
|
219
|
+
* @param {Document} document Document to convert
|
|
231
220
|
* @param {Object} transformCfg Conversion configuration
|
|
232
221
|
* @param {Object} config Conversion configuration.
|
|
233
222
|
* @param {Object} params Conversion params. Object will be passed to the transformer functions.
|
|
@@ -235,8 +224,8 @@ async function html2x(
|
|
|
235
224
|
*/
|
|
236
225
|
async function html2md(url, document, transformCfg, config, params = {}) {
|
|
237
226
|
let doc = document;
|
|
238
|
-
if (typeof
|
|
239
|
-
doc =
|
|
227
|
+
if (typeof doc === 'string') {
|
|
228
|
+
doc = parseStringDocument(document, config);
|
|
240
229
|
}
|
|
241
230
|
return html2x(url, doc, transformCfg, { ...config, toMd: true, toDocx: false }, params);
|
|
242
231
|
}
|
|
@@ -252,8 +241,8 @@ async function html2md(url, document, transformCfg, config, params = {}) {
|
|
|
252
241
|
*/
|
|
253
242
|
async function html2docx(url, document, transformCfg, config, params = {}) {
|
|
254
243
|
let doc = document;
|
|
255
|
-
if (typeof
|
|
256
|
-
doc =
|
|
244
|
+
if (typeof doc === 'string') {
|
|
245
|
+
doc = parseStringDocument(document, config);
|
|
257
246
|
}
|
|
258
247
|
return html2x(url, doc, transformCfg, { ...config, toMd: true, toDocx: true }, params);
|
|
259
248
|
}
|
|
@@ -12,8 +12,6 @@
|
|
|
12
12
|
|
|
13
13
|
/* eslint-disable class-methods-use-this */
|
|
14
14
|
|
|
15
|
-
import { JSDOM } from 'jsdom';
|
|
16
|
-
|
|
17
15
|
import path from 'path';
|
|
18
16
|
import { unified } from 'unified';
|
|
19
17
|
import parse from 'rehype-parse';
|
|
@@ -36,6 +34,7 @@ import DOMUtils from '../utils/DOMUtils.js';
|
|
|
36
34
|
import FileUtils from '../utils/FileUtils.js';
|
|
37
35
|
import MDUtils from '../utils/MDUtils.js';
|
|
38
36
|
import formatPlugin from './mdast-to-md-format-plugin.js';
|
|
37
|
+
import BrowserUtils from '../utils/BrowserUtils.js';
|
|
39
38
|
|
|
40
39
|
function formatNode(type, state, node) {
|
|
41
40
|
const result = {
|
|
@@ -55,6 +54,12 @@ export default class PageImporter {
|
|
|
55
54
|
|
|
56
55
|
constructor(params) {
|
|
57
56
|
this.params = params;
|
|
57
|
+
|
|
58
|
+
if (!this.params.createDocumentFromString) {
|
|
59
|
+
// default the string parsing using the browser DOMParser
|
|
60
|
+
this.params.createDocumentFromString = BrowserUtils.createDocumentFromString;
|
|
61
|
+
}
|
|
62
|
+
|
|
58
63
|
this.logger = params.logger || console;
|
|
59
64
|
|
|
60
65
|
this.useCache = !!params.cache;
|
|
@@ -297,8 +302,9 @@ export default class PageImporter {
|
|
|
297
302
|
const html = await this.download(url);
|
|
298
303
|
|
|
299
304
|
if (html) {
|
|
300
|
-
const
|
|
301
|
-
|
|
305
|
+
const cleanedHTML = DOMUtils.removeNoscripts(html.toString());
|
|
306
|
+
|
|
307
|
+
const document = this.params.createDocumentFromString(cleanedHTML);
|
|
302
308
|
return {
|
|
303
309
|
document,
|
|
304
310
|
html,
|
|
@@ -315,6 +321,8 @@ export default class PageImporter {
|
|
|
315
321
|
|
|
316
322
|
const results = [];
|
|
317
323
|
if (document) {
|
|
324
|
+
this.preProcess(document);
|
|
325
|
+
|
|
318
326
|
const entries = await this.process(document, url, entryParams, html);
|
|
319
327
|
|
|
320
328
|
this.postProcess(document);
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2023 Adobe. All rights reserved.
|
|
3
|
+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
* you may not use this file except in compliance with the License. You may obtain a copy
|
|
5
|
+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
*
|
|
7
|
+
* Unless required by applicable law or agreed to in writing, software distributed under
|
|
8
|
+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
|
|
9
|
+
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
|
+
* governing permissions and limitations under the License.
|
|
11
|
+
*/
|
|
12
|
+
/**
 * Default document path generator: derives the target document path from the
 * pathname of the page URL.
 *
 * - a pathname ending with `/` gets `index` appended
 * - the pathname is percent-decoded and lower-cased
 * - a trailing `.html` extension is dropped
 * - every character other than `a-z`, `0-9` and `/` is replaced by `-`
 *
 * Only `url` is read; the other properties are accepted to match the shape of
 * the object the function is declared to receive.
 *
 * @param {Object} args
 * @param {string} args.url URL of the page being imported
 * @returns {Promise<string>} The sanitized document path
 * @throws {TypeError} If `url` cannot be parsed by the `URL` constructor
 */
export default async function generateDocumentPath({
  // eslint-disable-next-line no-unused-vars
  url, document, html, params,
}) {
  const { pathname } = new URL(url);
  const rawPath = pathname.endsWith('/') ? `${pathname}index` : pathname;

  let decodedPath;
  try {
    decodedPath = decodeURIComponent(rawPath);
  } catch (e) {
    // Malformed percent-encoding (e.g. "/50%-off") makes decodeURIComponent throw
    // a URIError. Keep the raw pathname instead: the stray characters are turned
    // into "-" by the sanitizing step below.
    decodedPath = rawPath;
  }

  return decodedPath
    .toLowerCase()
    .replace(/\.html$/, '')
    .replace(/[^a-z0-9/]/g, '-');
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2023 Adobe. All rights reserved.
|
|
3
|
+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
* you may not use this file except in compliance with the License. You may obtain a copy
|
|
5
|
+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
*
|
|
7
|
+
* Unless required by applicable law or agreed to in writing, software distributed under
|
|
8
|
+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
|
|
9
|
+
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
|
+
* governing permissions and limitations under the License.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
/**
 * Tells whether the given value parses as an absolute URL on its own,
 * i.e. without the help of a base URL.
 * @param {string} value The candidate URL
 * @returns {boolean} true when `new URL(value)` succeeds
 */
function isAbsoluteUrl(value) {
  try {
    // eslint-disable-next-line no-new
    new URL(value);
    return true;
  } catch (e) {
    return false;
  }
}

/**
 * Rewrites the `src` of every `img` found under `main` that is not already an
 * absolute URL into an absolute URL resolved against `url`.
 *
 * This covers `./x`, `/x`, `../x`, protocol-relative `//host/x` and bare
 * relative paths such as `img/x.png`. Images whose `src` is missing, blank or
 * already absolute (including `data:` URIs) are left untouched.
 * An image whose `src` cannot be resolved (for example because `url` itself is
 * not a valid URL) is logged and removed.
 *
 * @param {Element} main Root element to search for images
 * @param {string} url URL of the page, used as base for the resolution
 */
export default function adjustImageUrls(main, url) {
  [...main.querySelectorAll('img')].forEach((img) => {
    const src = img.getAttribute('src');
    if (!src || !src.trim() || isAbsoluteUrl(src)) {
      return;
    }

    try {
      const resolved = new URL(src, url);
      // eslint-disable-next-line no-param-reassign
      img.src = resolved.toString();
    } catch (e) {
      // eslint-disable-next-line no-console
      console.log(`Unable to adjust image URL ${img.src} - removing image`);
      img.remove();
    }
  });
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2023 Adobe. All rights reserved.
|
|
3
|
+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
* you may not use this file except in compliance with the License. You may obtain a copy
|
|
5
|
+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
*
|
|
7
|
+
* Unless required by applicable law or agreed to in writing, software distributed under
|
|
8
|
+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
|
|
9
|
+
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
|
+
* governing permissions and limitations under the License.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
/**
 * Replaces every `img` under `main` whose `src` ends with `.svg` by a `span`
 * containing the icon token `:name:`.
 *
 * The name is the part of the file name before its first dot, lower-cased,
 * trimmed, with every character other than `a-z` / `0-9` turned into `-`.
 * Images yielding an empty name are left in place.
 *
 * @param {Element} main Root element to search for images
 * @param {Document} document Document used to create the replacement spans
 */
export default function convertIcons(main, document) {
  const images = Array.from(main.querySelectorAll('img'));

  images.forEach((img) => {
    const src = img.getAttribute('src');
    if (!src || !src.endsWith('.svg')) {
      return;
    }

    const span = document.createElement('span');
    const fileName = src.split('/').pop();
    const [baseName] = fileName.split('.');
    const name = baseName.toLowerCase().trim().replace(/[^a-z0-9]/g, '-');
    if (!name) {
      return;
    }

    span.innerHTML = `:${name}:`;
    img.replaceWith(span);
  });
}
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2023 Adobe. All rights reserved.
|
|
3
|
+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
* you may not use this file except in compliance with the License. You may obtain a copy
|
|
5
|
+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
*
|
|
7
|
+
* Unless required by applicable law or agreed to in writing, software distributed under
|
|
8
|
+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
|
|
9
|
+
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
|
+
* governing permissions and limitations under the License.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import Blocks from '../../../utils/Blocks.js';
|
|
14
|
+
|
|
15
|
+
/**
 * Reads the `content` of all `meta` tags with the given name from the document head.
 *
 * Names containing a colon (e.g. `og:title`) are matched on the `property`
 * attribute, all others on the `name` attribute.
 *
 * @param {string} name The metadata name to look up
 * @param {Document} document The document whose head is searched
 * @returns {string} The contents joined with `, `, or an empty string if none matched
 */
function getMetadata(name, document) {
  const attribute = name && name.includes(':') ? 'property' : 'name';
  const selector = `meta[${attribute}="${name}"]`;
  const contents = Array.from(document.head.querySelectorAll(selector), (tag) => tag.content);
  return contents.join(', ');
}
|
|
22
|
+
|
|
23
|
+
/**
 * Collects page metadata from the document and, when at least one entry was
 * found, appends the block returned by `Blocks.getMetadataBlock` to `main`.
 *
 * Entries are gathered in this order:
 * - `Title` from the `title` element (newlines and tabs removed)
 * - `Description` from the `description` meta tag
 * - `Image` as an `img` element built from `og:image` (alt from `og:image:alt`)
 * - `og:title`, `og:description`, `twitter:title`, `twitter:description`: each
 *   value fills `Title` / `Description` when that entry is still empty, and is
 *   otherwise stored under its own name only if it differs from that entry
 * - `twitter:image` as an `img` element when it differs from `og:image`
 *   (alt from `twitter:image:alt`)
 *
 * @param {Element} main Element the metadata block is appended to
 * @param {Document} document Document to read the metadata from
 * @returns {Object} The collected metadata
 */
export default function createMetadata(main, document) {
  const meta = {};
  const read = (name) => getMetadata(name, document);

  // Fills the primary entry if it is still empty; otherwise keeps the value under
  // its own name, but only when it differs from the primary entry.
  const record = (primaryKey, ownKey, value) => {
    if (!value || value === meta[primaryKey]) {
      return;
    }
    const targetKey = meta[primaryKey] ? ownKey : primaryKey;
    meta[targetKey] = value;
  };

  // Builds an img element for the given source, with the alt text read from the
  // given meta name when present.
  const buildImage = (src, altName) => {
    const el = document.createElement('img');
    el.src = src;
    const alt = read(altName);
    if (alt) {
      el.alt = alt;
    }
    return el;
  };

  const titleEl = document.querySelector('title');
  if (titleEl) {
    meta.Title = titleEl.textContent.replace(/[\n\t]/gm, '');
  }

  const description = read('description');
  if (description) {
    meta.Description = description;
  }

  const ogImage = read('og:image');
  if (ogImage) {
    meta.Image = buildImage(ogImage, 'og:image:alt');
  }

  record('Title', 'og:title', read('og:title'));
  record('Description', 'og:description', read('og:description'));
  record('Title', 'twitter:title', read('twitter:title'));
  record('Description', 'twitter:description', read('twitter:description'));

  const twitterImage = read('twitter:image');
  if (twitterImage && twitterImage !== ogImage) {
    meta['twitter:image'] = buildImage(twitterImage, 'twitter:image:alt');
  }

  if (Object.keys(meta).length > 0) {
    main.append(Blocks.getMetadataBlock(document, meta));
  }

  return meta;
}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2023 Adobe. All rights reserved.
|
|
3
|
+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
* you may not use this file except in compliance with the License. You may obtain a copy
|
|
5
|
+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
*
|
|
7
|
+
* Unless required by applicable law or agreed to in writing, software distributed under
|
|
8
|
+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
|
|
9
|
+
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
|
+
* governing permissions and limitations under the License.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import DOMUtils from '../../../utils/DOMUtils.js';
|
|
14
|
+
|
|
15
|
+
/**
 * For every element under `main` whose inline style contains
 * `background-image: url`, prepends the value returned by
 * `DOMUtils.getImgFromBackground` to the element and removes the
 * `background-image` property from its inline style.
 *
 * @param {Element} main Root element to search
 * @param {Document} document Document passed on to `DOMUtils.getImgFromBackground`
 */
export default function transformBackgroundImages(main, document) {
  const withBackground = Array.from(main.querySelectorAll('[style*="background-image: url"]'));

  withBackground.forEach((element) => {
    element.prepend(DOMUtils.getImgFromBackground(element, document));
    element.style.removeProperty('background-image');
  });
}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2023 Adobe. All rights reserved.
|
|
3
|
+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
* you may not use this file except in compliance with the License. You may obtain a copy
|
|
5
|
+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
*
|
|
7
|
+
* Unless required by applicable law or agreed to in writing, software distributed under
|
|
8
|
+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
|
|
9
|
+
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
|
+
* governing permissions and limitations under the License.
|
|
11
|
+
*/
|
|
12
|
+
import DOMUtils from '../../utils/DOMUtils.js';
|
|
13
|
+
import createMetadata from './rules/createMetadata.js';
|
|
14
|
+
import adjustImageUrls from './rules/adjustImageUrls.js';
|
|
15
|
+
import convertIcons from './rules/convertIcons.js';
|
|
16
|
+
import transformBackgroundImages from './rules/transformBackgroundImages.js';
|
|
17
|
+
|
|
18
|
+
/**
 * Default DOM transformation: works on the document body and returns it as the
 * element to convert.
 *
 * Steps, in order:
 * 1. attempt to remove non-content elements (header, nav, footer, iframe, noscript)
 * 2. create the metadata block
 * 3. transform inline background images
 * 4. adjust image urls against the page url
 * 5. convert svg images to icons
 *
 * @param {Object} args
 * @param {string} args.url URL of the page being imported
 * @param {Document} args.document Document to transform
 * @returns {Promise<Element>} The transformed document body
 */
export default async function transformDOM({
  // eslint-disable-next-line no-unused-vars
  url, document, html, params,
}) {
  const { body } = document;

  // attempt to remove non-content elements
  const nonContentSelectors = [
    'header',
    '.header',
    'nav',
    '.nav',
    'footer',
    '.footer',
    'iframe',
    'noscript',
  ];
  DOMUtils.remove(body, nonContentSelectors);

  createMetadata(body, document);
  transformBackgroundImages(body, document);
  adjustImageUrls(body, url);
  convertIcons(body, document);

  return body;
}
|
package/src/index.js
CHANGED
|
@@ -9,9 +9,6 @@
|
|
|
9
9
|
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
10
|
* governing permissions and limitations under the License.
|
|
11
11
|
*/
|
|
12
|
-
import PagingExplorer from './explorer/PagingExplorer.js';
|
|
13
|
-
import PagingExplorerParams from './explorer/PagingExplorerParams.js';
|
|
14
|
-
|
|
15
12
|
import PageImporter from './importer/PageImporter.js';
|
|
16
13
|
import PageImporterParams from './importer/PageImporterParams.js';
|
|
17
14
|
import PageImporterResource from './importer/PageImporterResource.js';
|
|
@@ -27,15 +24,22 @@ import Loader from './utils/Loader.js';
|
|
|
27
24
|
import Utils from './utils/Utils.js';
|
|
28
25
|
|
|
29
26
|
import WPUtils from './wp/WPUtils.js';
|
|
30
|
-
import WPAdminAjaxPager from './wp/explorers/WPAdminAjaxPager.js';
|
|
31
|
-
import WPContentPager from './wp/explorers/WPContentPager.js';
|
|
32
|
-
import WPPostWrapPager from './wp/explorers/WPPostWrapPager.js';
|
|
33
27
|
|
|
34
28
|
import { html2md, html2docx } from './importer/HTML2x.js';
|
|
35
29
|
|
|
30
|
+
import createMetadata from './importer/defaults/rules/createMetadata.js';
|
|
31
|
+
import adjustImageUrls from './importer/defaults/rules/adjustImageUrls.js';
|
|
32
|
+
import convertIcons from './importer/defaults/rules/convertIcons.js';
|
|
33
|
+
import transformBackgroundImages from './importer/defaults/rules/transformBackgroundImages.js';
|
|
34
|
+
|
|
35
|
+
const rules = {
|
|
36
|
+
createMetadata,
|
|
37
|
+
adjustImageUrls,
|
|
38
|
+
convertIcons,
|
|
39
|
+
transformBackgroundImages,
|
|
40
|
+
};
|
|
41
|
+
|
|
36
42
|
export {
|
|
37
|
-
PagingExplorer,
|
|
38
|
-
PagingExplorerParams,
|
|
39
43
|
PageImporter,
|
|
40
44
|
PageImporterParams,
|
|
41
45
|
PageImporterResource,
|
|
@@ -48,9 +52,7 @@ export {
|
|
|
48
52
|
Loader,
|
|
49
53
|
Utils,
|
|
50
54
|
WPUtils,
|
|
51
|
-
WPAdminAjaxPager,
|
|
52
|
-
WPContentPager,
|
|
53
|
-
WPPostWrapPager,
|
|
54
55
|
html2md,
|
|
55
56
|
html2docx,
|
|
57
|
+
rules,
|
|
56
58
|
};
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2023 Adobe. All rights reserved.
|
|
3
|
+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
* you may not use this file except in compliance with the License. You may obtain a copy
|
|
5
|
+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
*
|
|
7
|
+
* Unless required by applicable law or agreed to in writing, software distributed under
|
|
8
|
+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
|
|
9
|
+
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
|
+
* governing permissions and limitations under the License.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
export default class BrowserUtils {
  /**
   * Creates a document from an HTML string. This function uses the global
   * DOMParser, which must be available in the execution context, i.e. a browser.
   * @param {String} html The html to parse
   * @returns {Document} The parsed document
   * @throws {Error} If the HTML cannot be parsed with the global DOMParser (for
   * example because DOMParser is not defined). The underlying error is exposed
   * as `cause`.
   */
  static createDocumentFromString(html) {
    try {
      // eslint-disable-next-line no-undef
      const parser = new DOMParser();
      return parser.parseFromString(html, 'text/html');
    } catch (e) {
      // keep the original failure reachable for debugging instead of discarding it
      throw new Error(
        'Unable to parse HTML using default createDocumentFromString function and global DOMParser. Please provide a custom createDocumentFromString.',
        { cause: e },
      );
    }
  }
}
|