@adobe/helix-importer 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +33 -0
- package/CODE_OF_CONDUCT.md +74 -0
- package/CONTRIBUTING.md +74 -0
- package/LICENSE.txt +264 -0
- package/README.md +63 -0
- package/package.json +46 -0
- package/src/explorer/PagingExplorer.js +81 -0
- package/src/explorer/PagingExplorerParams.js +17 -0
- package/src/importer/PageImporter.js +365 -0
- package/src/importer/PageImporterParams.js +23 -0
- package/src/importer/PageImporterResource.js +31 -0
- package/src/index.js +46 -0
- package/src/storage/FSHandler.js +46 -0
- package/src/storage/MemoryHandler.js +36 -0
- package/src/utils/CSV.js +96 -0
- package/src/utils/DOMUtils.js +207 -0
- package/src/utils/FileUtils.js +25 -0
- package/src/utils/Utils.js +20 -0
- package/src/wp/WPUtils.js +68 -0
- package/src/wp/explorers/WPAdminAjaxPager.js +51 -0
- package/src/wp/explorers/WPContentPager.js +48 -0
- package/src/wp/explorers/WPPostWrapPager.js +43 -0
- package/test/explorers/PagingExplorer.spec.js +205 -0
- package/test/importers/PageImporter.spec.js +86 -0
- package/test/mocks/MockMediaHandler.js +41 -0
- package/test/mocks/NoopLogger.js +26 -0
- package/test/storage/FSHandler.spec.js +52 -0
- package/test/storage/MemoryHandler.spec.js +33 -0
- package/test/utils/CSV.spec.js +60 -0
- package/test/utils/DOMUtils.spec.js +270 -0
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2021 Adobe. All rights reserved.
|
|
3
|
+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
* you may not use this file except in compliance with the License. You may obtain a copy
|
|
5
|
+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
*
|
|
7
|
+
* Unless required by applicable law or agreed to in writing, software distributed under
|
|
8
|
+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
|
|
9
|
+
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
|
+
* governing permissions and limitations under the License.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { JSDOM } from 'jsdom';
|
|
14
|
+
|
|
15
|
+
/**
 * Collection of static helpers that clean up / normalize a DOM (or an HTML string)
 * before it is converted to another format.
 */
export default class DOMUtils {
  /**
   * Tag names whose presence inside an element without text content prevents
   * that element from being considered empty (and removed).
   */
  static EMPTY_TAGS_TO_PRESERVE = ['img', 'video', 'iframe', 'div', 'picture'];

  /**
   * Normalizes all elements matching `tagName` in four passes:
   * 1. removes the empty ones and replaces non-breaking spaces by plain spaces,
   * 2. moves a single space located between two consecutive `tagName` elements
   *    inside the first one,
   * 3. unwraps elements only containing `.` or `:` and merges consecutive
   *    `tagName` elements,
   * 4. moves leading / trailing spaces out of the element, into a dedicated span.
   *
   * Each pass re-queries the document and walks the matches backwards because
   * the passes remove / replace nodes while iterating.
   *
   * @param {Document} document The document to clean up (modified in place)
   * @param {string} tagName Lower case tag name; it is used as selector and is
   * compared to `Element.tagName.toLowerCase()`
   */
  static reviewInlineElement(document, tagName) {
    const preserved = DOMUtils.EMPTY_TAGS_TO_PRESERVE.join(',');

    let tags = [...document.querySelectorAll(tagName)];
    // first pass, remove empty nodes
    for (let i = tags.length - 1; i >= 0; i -= 1) {
      const tag = tags[i];
      if (tag.textContent === '' && !tag.querySelector(preserved)) {
        tag.remove();
      } else {
        // non-breaking spaces are serialized as "&nbsp;" in innerHTML;
        // the raw U+00A0 character is handled too, to be safe
        tag.innerHTML = tag.innerHTML.replace(/&nbsp;|\u00a0/g, ' ');
      }
    }

    tags = [...document.querySelectorAll(tagName)];
    // make a first pass to find <tag>x</tag> <tag>y</tag> and move the space
    for (let i = tags.length - 1; i >= 0; i -= 1) {
      const tag = tags[i];
      if (tag.nextSibling && tag.nextSibling.textContent === ' ') {
        // next is a space, check next next
        const nextNext = tag.nextSibling.nextSibling;
        if (nextNext && nextNext.tagName && nextNext.tagName.toLowerCase() === tagName) {
          // same tag
          tag.nextSibling.remove();
          tag.innerHTML = `${tag.innerHTML} `;
        }
      }
    }

    tags = [...document.querySelectorAll(tagName)];
    // collapse consecutive <tag>
    // and make sure element does not start / end with spaces while it is before / after some text
    for (let i = tags.length - 1; i >= 0; i -= 1) {
      const tag = tags[i];
      if (tag.innerHTML === '.' || tag.innerHTML === '. ' || tag.innerHTML === ':' || tag.innerHTML === ': ') {
        // punctuation only: no need for the wrapping element
        tag.replaceWith(JSDOM.fragment(tag.innerHTML));
      } else {
        const { innerHTML } = tag;
        if (tag.previousSibling) {
          const previous = tag.previousSibling;
          if (
            previous.tagName
            && previous.tagName.toLowerCase() === tagName
            // for links, only merge when both point to the same target
            && (!previous.href || previous.href === tag.href)
          ) {
            if (tag.hasChildNodes()) {
              // previous sibling is a <tag>, move current children inside the previous one
              [...tag.childNodes].forEach((child) => {
                previous.append(child);
              });
            } else {
              previous.append(innerHTML);
            }
            tag.remove();
          }
        }
      }
    }

    tags = [...document.querySelectorAll(tagName)];
    // extract leading and trailing spaces into a dedicated span
    for (let i = tags.length - 1; i >= 0; i -= 1) {
      const tag = tags[i];
      let { innerHTML } = tag;
      if (innerHTML) {
        if (innerHTML.endsWith(' ')) {
          // move trailing space to a new span outside of current element
          tag.innerHTML = innerHTML.slice(0, innerHTML.length - 1);
          ({ innerHTML } = tag);
          tag.after(JSDOM.fragment('<span> </span>'));
        }

        if (innerHTML.startsWith(' ')) {
          // move leading space to a new span outside of current element
          tag.innerHTML = innerHTML.slice(1);
          tag.before(JSDOM.fragment('<span> </span>'));
        }
      }
    }
  }

  /**
   * Removes the paragraphs that carry no content: text is empty, is a single
   * space or starts with a non-breaking space, and no element listed in
   * EMPTY_TAGS_TO_PRESERVE is nested inside.
   *
   * @param {Document} document The document to clean up (modified in place)
   */
  static reviewParagraphs(document) {
    const preserved = DOMUtils.EMPTY_TAGS_TO_PRESERVE.join(',');
    const tags = [...document.querySelectorAll('p')];
    for (let i = tags.length - 1; i >= 0; i -= 1) {
      const tag = tags[i];
      const text = tag.textContent;
      // NOTE(review): the last test matches any paragraph *starting* with a
      // non-breaking space, even if some text follows - confirm this is wanted
      const useless = text === '' || text === ' ' || text.charCodeAt(0) === 160;
      // remove useless paragraphs
      if (useless && !tag.querySelector(preserved)) {
        tag.remove();
      }
    }
  }

  /**
   * Escapes every `~` found in the serialized body with a backslash.
   *
   * @param {Document} document The document to update (modified in place)
   */
  static escapeSpecialCharacters(document) {
    // eslint-disable-next-line no-param-reassign
    document.body.innerHTML = document.body.innerHTML.replace(/~/g, '\\~');
  }

  /**
   * Strips the `<strong>` tags found in headings (their content is kept) and
   * removes the headings left empty.
   *
   * @param {Document} document The document to clean up (modified in place)
   */
  static reviewHeadings(document) {
    const tags = [...document.querySelectorAll('h1, h2, h3, h4, h5, h6')];
    for (let i = tags.length - 1; i >= 0; i -= 1) {
      const tag = tags[i];
      // remove useless strong tags: opening (with or without attributes) and closing ones
      tag.innerHTML = tag.innerHTML.replace(/<\/?strong\b[^>]*>/gi, '');
      if (tag.innerHTML === '') {
        tag.remove();
      }
    }
  }

  /**
   * Removes all the elements matching one of the given selectors.
   *
   * @param {Document} document The document to clean up (modified in place)
   * @param {string[]} selectors CSS selectors of the elements to remove
   */
  static remove(document, selectors) {
    selectors.forEach((s) => {
      document.querySelectorAll(s).forEach((n) => n.remove());
    });
  }

  /**
   * Removes the HTML comments from the serialized body.
   *
   * @param {Document} document The document to clean up (modified in place)
   */
  static removeComments(document) {
    // eslint-disable-next-line no-param-reassign
    document.body.innerHTML = document.body.innerHTML
      // remove html comments
      .replace(/<!--(?!>)[\S\s]*?-->/g, '');
  }

  /**
   * Unwraps all the spans: a span is replaced by its own child nodes. Spans
   * without text are removed, unless they contain an element listed in
   * EMPTY_TAGS_TO_PRESERVE.
   *
   * @param {Document} document The document to clean up (modified in place)
   */
  static removeSpans(document) {
    const preserved = DOMUtils.EMPTY_TAGS_TO_PRESERVE.join(',');
    document.querySelectorAll('span').forEach((span) => {
      if (span.textContent === '' && !span.querySelector(preserved)) {
        span.remove();
      } else {
        // move the existing children instead of re-parsing innerHTML: nested
        // spans stay attached and get unwrapped when their turn comes
        span.replaceWith(...span.childNodes);
      }
    });
  }

  /**
   * Replaces every element matching one of the selectors by a caption, i.e.
   * `<p><em>trimmed text of the element</em></p>`.
   *
   * @param {Document} document The document to update (modified in place)
   * @param {string[]} selectors CSS selectors of the elements to replace
   */
  static replaceByCaptions(document, selectors) {
    selectors.forEach((selector) => {
      document.querySelectorAll(selector).forEach((elem) => {
        // build the nodes instead of parsing a string: the caption is text, not markup
        const emphasis = elem.ownerDocument.createElement('em');
        emphasis.textContent = elem.textContent.trim();
        const paragraph = elem.ownerDocument.createElement('p');
        paragraph.append(emphasis);
        elem.replaceWith(paragraph);
      });
    });
  }

  /**
   * Creates an "Embed" table: a header cell followed by a cell linking to the url.
   *
   * @param {string} url The url to embed
   * @returns {DocumentFragment} A fragment containing the table
   */
  static generateEmbed(url) {
    const embed = JSDOM.fragment('<table><tr><th>Embed</th></tr><tr><td><a></a></td></tr></table>');
    // set the url through the DOM API so that quotes or markup characters
    // cannot break out of the attribute / link text
    const link = embed.querySelector('a');
    link.setAttribute('href', url);
    link.textContent = url;
    return embed;
  }

  /**
   * Replaces the iframes by "Embed" tables (`data-src` wins over `src`; iframes
   * without url are simply removed) and wraps the videos in a "Video" table -
   * or "Animation" table if the video autoplays.
   *
   * @param {Document} document The document to update (modified in place)
   */
  static replaceEmbeds(document) {
    document.querySelectorAll('iframe').forEach((iframe) => {
      const src = iframe.getAttribute('src');
      const dataSrc = iframe.getAttribute('data-src');
      const url = dataSrc || src;
      if (url) {
        iframe.after(DOMUtils.generateEmbed(url));
      }
      iframe.remove();
    });

    document.querySelectorAll('video').forEach((video) => {
      const blockType = video.autoplay ? 'Animation' : 'Video';
      const anim = JSDOM.fragment(`<table><tr><th>${blockType}</th></tr><tr><td>${video.outerHTML}</td></tr></table>`);
      video.replaceWith(anim);
    });
  }

  /**
   * Removes the `<noscript>` elements from an HTML string.
   *
   * @param {string} html The HTML to clean up
   * @returns {string} The HTML without the noscript elements
   */
  static removeNoscripts(html) {
    // [\s\S] also matches carriage returns, which "." and "\n" both miss
    return html.replace(/<noscript\b[^>]*>[\s\S]*?<\/noscript\s*>/gi, '');
  }

  /**
   * Escapes the pipes found in the title of the images located in a table.
   *
   * @param {Document} document The document to update (modified in place)
   */
  static encodeImagesForTable(document) {
    const imgs = document.querySelectorAll('img');
    imgs.forEach((img) => {
      // only the images in a table are concerned
      if (img.closest('table') && img.title && img.title.includes('|')) {
        // pipes in title do not get encoded
        // eslint-disable-next-line no-param-reassign
        img.title = img.title.replace(/\|/g, '\\|');
      }
    });
  }
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2021 Adobe. All rights reserved.
|
|
3
|
+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
* you may not use this file except in compliance with the License. You may obtain a copy
|
|
5
|
+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
*
|
|
7
|
+
* Unless required by applicable law or agreed to in writing, software distributed under
|
|
8
|
+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
|
|
9
|
+
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
|
+
* governing permissions and limitations under the License.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import sanitize from 'sanitize-filename';
|
|
14
|
+
|
|
15
|
+
/**
 * File related helpers.
 */
export default class FileUtils {
  /**
   * Computes a file system and url friendly name: the name is URI-decoded,
   * sanitized, trimmed and lower cased; dots and ampersands are dropped, each
   * whitespace becomes a dash and consecutive dashes are collapsed.
   *
   * @param {string} name The raw (potentially URI-encoded) name
   * @returns {string} The sanitized name
   */
  static sanitizeFilename(name) {
    let decoded;
    try {
      decoded = decodeURIComponent(name);
    } catch (error) {
      if (!(error instanceof URIError)) {
        throw error;
      }
      // not a valid URI-encoded string (e.g. "100% off"): use the name as is
      decoded = String(name);
    }

    return sanitize(decoded)
      .trim()
      .toLowerCase()
      .replace(/\./g, '')
      .replace(/&/g, '')
      .replace(/\s/g, '-')
      .replace(/-{2,}/g, '-');
  }
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2021 Adobe. All rights reserved.
|
|
3
|
+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
* you may not use this file except in compliance with the License. You may obtain a copy
|
|
5
|
+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
*
|
|
7
|
+
* Unless required by applicable law or agreed to in writing, software distributed under
|
|
8
|
+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
|
|
9
|
+
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
|
+
* governing permissions and limitations under the License.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
/**
 * Generic helpers.
 */
export default class Utils {
  /**
   * Sequential, promise-aware counterpart of `Array.prototype.forEach`: the
   * callback is invoked for one item at a time and the next item is only
   * handled once the promise returned for the current one has settled.
   *
   * @param {Array} array The items to iterate over
   * @param {Function} callback Invoked with (item, index, array); may be async
   * @returns {Promise<void>} Resolves once all the items have been handled
   */
  static async asyncForEach(array, callback) {
    let position = 0;
    while (position < array.length) {
      // items must be handled one after the other
      // eslint-disable-next-line no-await-in-loop
      await callback(array[position], position, array);
      position += 1;
    }
  }
}
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2021 Adobe. All rights reserved.
|
|
3
|
+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
* you may not use this file except in compliance with the License. You may obtain a copy
|
|
5
|
+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
*
|
|
7
|
+
* Unless required by applicable law or agreed to in writing, software distributed under
|
|
8
|
+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
|
|
9
|
+
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
|
+
* governing permissions and limitations under the License.
|
|
11
|
+
*/
|
|
12
|
+
import { JSDOM } from 'jsdom';
|
|
13
|
+
|
|
14
|
+
import DOMUtils from '../utils/DOMUtils.js';
|
|
15
|
+
|
|
16
|
+
/**
 * DOM helpers dedicated to the markup produced by WordPress.
 */
export default class WPUtils {
  /**
   * Converts the WordPress captions into `<p><em>caption</em></p>`:
   * - elements matching `.wp-caption-text` or `figcaption`,
   * - an h5 following an image (i.e. following the parent of the image) or a video.
   *
   * @param {Document} document The document to update (modified in place)
   */
  static handleCaptions(document) {
    DOMUtils.replaceByCaptions(document, ['.wp-caption-text', 'figcaption']);

    const isH5 = (node) => Boolean(node) && node.tagName === 'H5';

    // an h5 following an image / video is a caption
    document.querySelectorAll('p img, video').forEach((item) => {
      // the sibling of the parent wins over the sibling of the item itself
      const elem = [item.parentNode.nextElementSibling, item.nextElementSibling].find(isH5);
      if (!elem) {
        return;
      }

      // build the nodes instead of parsing a string: the caption is text, not markup
      const emphasis = elem.ownerDocument.createElement('em');
      emphasis.textContent = elem.textContent.trim();
      const paragraph = elem.ownerDocument.createElement('p');
      paragraph.append(emphasis);
      elem.replaceWith(paragraph);
    });
  }

  /**
   * Applies generic cleanups:
   * - `<a><strong>text</strong></a>` becomes `<strong><a>text</a></strong>` when
   *   the strong is the only child and its text contains a `.` or a `:`,
   * - images located in a heading are moved right after the heading,
   * - headings are reduced to their plain text.
   *
   * @param {Document} document The document to clean up (modified in place)
   */
  static genericDOMCleanup(document) {
    // extract "emphasis" from links
    // see https://github.com/adobe/helix-pipeline/issues/895
    document.querySelectorAll('a strong').forEach((elem) => {
      const parent = elem.parentNode;
      // only cover case with 1 child
      if (parent.childNodes.length !== 1) {
        return;
      }

      const txt = elem.textContent;
      // only treat links
      if (txt && (txt.includes('.') || txt.includes(':'))) {
        // eslint-disable-next-line no-param-reassign
        elem.innerHTML = '';
        // take out of parent
        parent.parentNode.insertBefore(elem, parent.nextSibling);
        elem.appendChild(parent);
        // assigned as text: characters like "<" or "&" must not be parsed as markup
        parent.textContent = txt;
      }
    });

    // some images are in headings...
    document.querySelectorAll('h1 img, h2 img, h3 img, h4 img, h5 img, h6 img').forEach((img) => {
      // move image after its parent heading
      img.parentNode.parentNode.insertBefore(img, img.parentNode.nextSibling);
    });

    // heading could be full of tags
    document.querySelectorAll('h1, h2, h3, h4, h5, h6').forEach((h) => {
      // re-assigning the text drops the nested tags without re-parsing the text as HTML
      const text = h.textContent;
      // eslint-disable-next-line no-param-reassign
      h.textContent = text;
    });
  }
}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2021 Adobe. All rights reserved.
|
|
3
|
+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
* you may not use this file except in compliance with the License. You may obtain a copy
|
|
5
|
+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
*
|
|
7
|
+
* Unless required by applicable law or agreed to in writing, software distributed under
|
|
8
|
+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
|
|
9
|
+
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
|
+
* governing permissions and limitations under the License.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import FormData from 'form-data';
|
|
14
|
+
import fetch from 'node-fetch';
|
|
15
|
+
import PagingExplorer from '../../explorer/PagingExplorer.js';
|
|
16
|
+
|
|
17
|
+
// path of the WordPress admin ajax endpoint, appended to the site url
const API = 'wp-admin/admin-ajax.php';

/**
 * Paging explorer retrieving the pages of entries through the WordPress
 * admin ajax endpoint (`cardsFilter` action).
 */
export default class WPAdminAjaxPager extends PagingExplorer {
  /**
   * Requests one page of results, sorted by "latest".
   *
   * @param {number} page The number of the page to request
   * @returns {Promise<Response>} The response of the endpoint
   */
  async fetch(page) {
    // params.url is concatenated as is: it is expected to end with a slash
    const api = `${this.params.url}${API}`;
    const form = new FormData();
    form.append('action', 'cardsFilter');
    form.append('filterBy', 'latest');
    form.append('paged', `${page}`);
    return fetch(api, {
      method: 'POST',
      body: form,
    });
  }

  /**
   * Extracts the entries (url and date) from the `.card-item` elements of a
   * page. Cards without link are skipped, a missing date becomes an empty
   * string and urls already found on the previous pages are ignored.
   *
   * @param {Document} document The DOM of the fetched page
   * @param {Array<{url: string}>} all The entries collected on the previous pages
   * @returns {Array<{date: string, url: string}>} The new entries
   */
  // eslint-disable-next-line class-methods-use-this
  process(document, all) {
    const entries = [];
    document.querySelectorAll('.card-item').forEach((el) => {
      const link = el.querySelector('h4 a');
      if (!link) {
        // not a card we know how to read
        return;
      }
      const url = link.getAttribute('href');

      const entryDate = el.querySelector('.date');
      const date = entryDate ? entryDate.textContent.trim() : '';

      if (!all.some((entry) => entry.url === url)) {
        entries.push({
          date,
          url,
        });
      }
    });
    return entries;
  }
}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2021 Adobe. All rights reserved.
|
|
3
|
+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
* you may not use this file except in compliance with the License. You may obtain a copy
|
|
5
|
+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
*
|
|
7
|
+
* Unless required by applicable law or agreed to in writing, software distributed under
|
|
8
|
+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
|
|
9
|
+
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
|
+
* governing permissions and limitations under the License.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import fetch from 'node-fetch';
|
|
14
|
+
import PagingExplorer from '../../explorer/PagingExplorer.js';
|
|
15
|
+
|
|
16
|
+
// path segment preceding the page number, appended to the site url
const API = 'page/';

/**
 * Paging explorer for WordPress sites listing their entries under
 * `<url>page/<number>`, inside `.entry` elements.
 */
export default class WPContentPager extends PagingExplorer {
  /**
   * Requests the given page of the listing.
   *
   * @param {number} page The number of the page to request
   * @returns {Promise<Response>} The response for that page
   */
  async fetch(page) {
    return fetch(`${this.params.url}${API}${page}`);
  }

  /**
   * Collects the entries (url and date) listed on a page. Entries without a
   * `h2 a` link are skipped, the date falls back to an empty string and urls
   * already present in `all` are left out.
   *
   * @param {Document} document The DOM of the fetched page
   * @param {Array<{url: string}>} all The entries collected on the previous pages
   * @returns {Array<{date: string, url: string}>} The new entries
   */
  // eslint-disable-next-line class-methods-use-this
  process(document, all) {
    const found = [];
    const candidates = document.querySelectorAll('main .content .entry, main .entries .entry, article .entries .entry');

    candidates.forEach((candidate) => {
      const anchor = candidate.querySelector('h2 a');
      if (!anchor) {
        return;
      }

      const url = anchor.getAttribute('href');
      const dateHolder = candidate.querySelector('.date') || candidate.querySelector('.entry_footer');
      const date = dateHolder ? dateHolder.textContent.trim() : '';

      const alreadyKnown = all.findIndex((entry) => entry.url === url) !== -1;
      if (!alreadyKnown) {
        found.push({ date, url });
      }
    });

    return found;
  }
}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2021 Adobe. All rights reserved.
|
|
3
|
+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
* you may not use this file except in compliance with the License. You may obtain a copy
|
|
5
|
+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
*
|
|
7
|
+
* Unless required by applicable law or agreed to in writing, software distributed under
|
|
8
|
+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
|
|
9
|
+
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
|
+
* governing permissions and limitations under the License.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import fetch from 'node-fetch';
|
|
14
|
+
import PagingExplorer from '../../explorer/PagingExplorer.js';
|
|
15
|
+
|
|
16
|
+
// path segment preceding the page number, appended to the site url
const API = 'page/';

/**
 * Paging explorer for WordPress sites listing their entries under
 * `<url>page/<number>`, inside `.post-meta-wrap` elements.
 */
export default class WPPostWrapPager extends PagingExplorer {
  /**
   * Requests the given page of the listing.
   *
   * @param {number} page The number of the page to request
   * @returns {Promise<Response>} The response for that page
   */
  async fetch(page) {
    // params.url is concatenated as is: it is expected to end with a slash
    const api = `${this.params.url}${API}${page}`;
    return fetch(api);
  }

  /**
   * Extracts the entries (url and date) from the `.post-meta-wrap` elements of
   * a page. Elements without link are skipped, a missing date becomes an empty
   * string and urls already found on the previous pages are ignored.
   *
   * @param {Document} document The DOM of the fetched page
   * @param {Array<{url: string}>} all The entries collected on the previous pages
   * @returns {Array<{date: string, url: string}>} The new entries
   */
  // eslint-disable-next-line class-methods-use-this
  process(document, all) {
    const entries = [];
    document.querySelectorAll('.post-meta-wrap').forEach((el) => {
      const link = el.querySelector('.post-item > a');
      if (!link) {
        // not a post we know how to read
        return;
      }
      const url = link.getAttribute('href');

      const entryDate = el.querySelector('.post-date');
      const date = entryDate ? entryDate.textContent.trim() : '';

      if (!all.some((entry) => entry.url === url)) {
        entries.push({
          date,
          url,
        });
      }
    });
    return entries;
  }
}
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright 2020 Adobe. All rights reserved.
|
|
3
|
+
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
* you may not use this file except in compliance with the License. You may obtain a copy
|
|
5
|
+
* of the License at http://www.apache.org/licenses/LICENSE-2.0
|
|
6
|
+
*
|
|
7
|
+
* Unless required by applicable law or agreed to in writing, software distributed under
|
|
8
|
+
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
|
|
9
|
+
* OF ANY KIND, either express or implied. See the License for the specific language
|
|
10
|
+
* governing permissions and limitations under the License.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
/* eslint-disable max-classes-per-file, class-methods-use-this */
|
|
14
|
+
|
|
15
|
+
import { deepStrictEqual, strictEqual } from 'assert';
|
|
16
|
+
import { describe, it } from 'mocha';
|
|
17
|
+
|
|
18
|
+
import { Response } from 'node-fetch';
|
|
19
|
+
|
|
20
|
+
import PagingExplorer from '../../src/explorer/PagingExplorer.js';
|
|
21
|
+
|
|
22
|
+
// Each test subclasses PagingExplorer with stubbed fetch / process
// implementations and checks how explore() drives them.
describe('PagingExplorer tests', () => {
  // shared by all the tests: explore() must not go beyond 3 pages
  const params = {
    url: 'testdest',
    nbMaxPages: 3,
  };

  it('fetch and process are called 3 times if end not reached', async () => {
    let fetchCalled = 0;
    let processCalled = 0;

    class Test extends PagingExplorer {
      async fetch() {
        fetchCalled += 1;
        return new Response('test');
      }

      process() {
        processCalled += 1;
        return [{
          a: 1,
        }];
      }
    }

    const se = new Test(params);
    await se.explore();

    strictEqual(fetchCalled, 3, 'fetch is called 3 times');
    strictEqual(processCalled, 3, 'process is called 3 times');
  });

  it('fetch stops the process when reaches the end', async () => {
    let fetchCalled = 0;
    let processCalled = 0;

    class Test extends PagingExplorer {
      async fetch() {
        fetchCalled += 1;
        if (fetchCalled > 1) {
          // a non-ok response signals the end of the paging
          return new Response('reached the end', { status: 404 });
        }
        return new Response('test');
      }

      process() {
        processCalled += 1;
        return [{
          a: 1,
        }];
      }
    }

    const se = new Test(params);
    await se.explore();

    strictEqual(fetchCalled, 2, 'fetch is called 2 times');
    strictEqual(processCalled, 1, 'process is called 1 time');
  });

  it('explore returns the expected result set', async () => {
    let processCalled = 0;
    class Test extends PagingExplorer {
      async fetch() {
        return new Response('test');
      }

      process() {
        processCalled += 1;
        return [{
          a: processCalled,
        }];
      }
    }

    const se = new Test(params);
    const results = await se.explore();

    deepStrictEqual(results, [{ a: 1 }, { a: 2 }, { a: 3 }], 'result is correct');
  });

  it('explore returns the expected result set when number of pages is not the max one', async () => {
    let fetchCalled = 0;
    let processCalled = 0;
    class Test extends PagingExplorer {
      async fetch() {
        fetchCalled += 1;
        if (fetchCalled > 2) {
          return new Response('reached the end', { status: 404 });
        }
        return new Response('test');
      }

      process() {
        processCalled += 1;
        return [{
          a: processCalled,
        }];
      }
    }

    const se = new Test(params);
    const results = await se.explore();

    deepStrictEqual(results, [{ a: 1 }, { a: 2 }], 'result is correct');
  });

  it('explore, fetch and process can be used to retrieve multipage results', async () => {
    let fetchCalled = 0;
    class Test extends PagingExplorer {
      async fetch() {
        fetchCalled += 1;
        if (fetchCalled > 2) {
          return new Response('reached the end', { status: 404 });
        }
        return new Response(`<html>
          <body>
            <a href="a${fetchCalled}.html">a${fetchCalled}</a>
            <a href="b${fetchCalled}.html">b${fetchCalled}</a>
            <a href="c${fetchCalled}.html">c${fetchCalled}</a>
          </body>
        </html>`);
      }

      process(document) {
        const entries = [];
        document.querySelectorAll('a').forEach((el) => {
          entries.push({
            link: el.getAttribute('href'),
          });
        });
        return entries;
      }
    }

    const se = new Test(params);
    const results = await se.explore();

    deepStrictEqual(results, [
      { link: 'a1.html' },
      { link: 'b1.html' },
      { link: 'c1.html' },
      { link: 'a2.html' },
      { link: 'b2.html' },
      { link: 'c2.html' },
    ], 'result is correct');
  });

  it('process receives the entry set from previous pages', async () => {
    let fetchCalled = 0;
    class Test extends PagingExplorer {
      async fetch() {
        fetchCalled += 1;
        return new Response(`<html>
          <body>
            <a href="a${fetchCalled}.html">a${fetchCalled}</a>
            <a href="b${fetchCalled}.html">b${fetchCalled}</a>
            <a href="c${fetchCalled}.html">c${fetchCalled}</a>
          </body>
        </html>`);
      }

      process(document, all) {
        // entries expected from the pages fetched before the current one
        const testResult = [];
        for (let i = 1; i < fetchCalled; i += 1) {
          testResult.push({ link: `a${i}.html` });
          testResult.push({ link: `b${i}.html` });
          testResult.push({ link: `c${i}.html` });
        }
        deepStrictEqual(all, testResult, 'all entries argument contains previous entries from previous pages');

        const entries = [];
        document.querySelectorAll('a').forEach((el) => {
          entries.push({
            link: el.getAttribute('href'),
          });
        });
        return entries;
      }
    }

    const se = new Test(params);
    await se.explore();
  });
});
|