@ckeditor/ckeditor5-dev-docs 31.1.13 → 32.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/{docs/index.js → build.js} +0 -0
- package/lib/index.js +3 -2
- package/package.json +3 -7
- package/lib/web-crawler/constants.js +0 -71
- package/lib/web-crawler/index.js +0 -729
- package/lib/web-crawler/spinner.js +0 -57
- package/lib/web-crawler/utils.js +0 -51
|
File without changes
|
package/lib/index.js
CHANGED
package/package.json
CHANGED
|
@@ -1,18 +1,14 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ckeditor/ckeditor5-dev-docs",
|
|
3
|
-
"version": "31.1.13",
|
|
3
|
+
"version": "32.0.0",
|
|
4
4
|
"description": "Tasks used to build and verify the documentation for CKEditor 5.",
|
|
5
5
|
"keywords": [],
|
|
6
6
|
"main": "lib/index.js",
|
|
7
7
|
"dependencies": {
|
|
8
|
-
"@ckeditor/ckeditor5-dev-utils": "^31.1.13",
|
|
9
|
-
"@ckeditor/jsdoc-plugins": "^31.1.13",
|
|
8
|
+
"@ckeditor/ckeditor5-dev-utils": "^32.0.0",
|
|
9
|
+
"@ckeditor/jsdoc-plugins": "^32.0.0",
|
|
10
10
|
"fast-glob": "^3.2.4",
|
|
11
11
|
"fs-extra": "^9.0.0",
|
|
12
|
-
"puppeteer": "^13.1.3",
|
|
13
|
-
"chalk": "^4.1.0",
|
|
14
|
-
"strip-ansi": "^6.0.0",
|
|
15
|
-
"ora": "^5.2.0",
|
|
16
12
|
"jsdoc": "ckeditor/jsdoc#fixed-trailing-comment-doclets",
|
|
17
13
|
"tmp": "^0.2.1"
|
|
18
14
|
},
|
|
package/lib/web-crawler/constants.js
DELETED
|
@@ -1,71 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
|
|
3
|
-
/**
|
|
4
|
-
* @license Copyright (c) 2003-2022, CKSource Holding sp. z o.o. All rights reserved.
|
|
5
|
-
* For licensing, see LICENSE.md.
|
|
6
|
-
*/
|
|
7
|
-
|
|
8
|
-
/* eslint-env node */
|
|
9
|
-
|
|
10
|
-
const DEFAULT_CONCURRENCY = require( 'os' ).cpus().length / 2;
|
|
11
|
-
|
|
12
|
-
const DEFAULT_TIMEOUT = 15 * 1000;
|
|
13
|
-
|
|
14
|
-
const DEFAULT_RESPONSIVENESS_CHECK_TIMEOUT = 1000;
|
|
15
|
-
|
|
16
|
-
const DEFAULT_REMAINING_ATTEMPTS = 3;
|
|
17
|
-
|
|
18
|
-
const ERROR_TYPES = {
|
|
19
|
-
PAGE_CRASH: {
|
|
20
|
-
event: 'error',
|
|
21
|
-
description: 'Page crash'
|
|
22
|
-
},
|
|
23
|
-
UNCAUGHT_EXCEPTION: {
|
|
24
|
-
event: 'pageerror',
|
|
25
|
-
description: 'Uncaught exception'
|
|
26
|
-
},
|
|
27
|
-
REQUEST_FAILURE: {
|
|
28
|
-
event: 'requestfailed',
|
|
29
|
-
description: 'Request failure'
|
|
30
|
-
},
|
|
31
|
-
RESPONSE_FAILURE: {
|
|
32
|
-
event: 'response',
|
|
33
|
-
description: 'Response failure'
|
|
34
|
-
},
|
|
35
|
-
CONSOLE_ERROR: {
|
|
36
|
-
event: 'console',
|
|
37
|
-
description: 'Console error'
|
|
38
|
-
},
|
|
39
|
-
NAVIGATION_ERROR: {
|
|
40
|
-
// Navigation error does not have the `event` property, because this error is not emitted by page.on() method as
|
|
41
|
-
// event, but it is thrown as exception from page.goto() method.
|
|
42
|
-
description: 'Navigation error'
|
|
43
|
-
}
|
|
44
|
-
};
|
|
45
|
-
|
|
46
|
-
const PATTERN_TYPE_TO_ERROR_TYPE_MAP = {
|
|
47
|
-
'page-crash': ERROR_TYPES.PAGE_CRASH,
|
|
48
|
-
'uncaught-exception': ERROR_TYPES.UNCAUGHT_EXCEPTION,
|
|
49
|
-
'request-failure': ERROR_TYPES.REQUEST_FAILURE,
|
|
50
|
-
'response-failure': ERROR_TYPES.RESPONSE_FAILURE,
|
|
51
|
-
'console-error': ERROR_TYPES.CONSOLE_ERROR,
|
|
52
|
-
'navigation-error': ERROR_TYPES.NAVIGATION_ERROR
|
|
53
|
-
};
|
|
54
|
-
|
|
55
|
-
const IGNORE_ALL_ERRORS_WILDCARD = '*';
|
|
56
|
-
|
|
57
|
-
const META_TAG_NAME = 'x-cke-crawler-ignore-patterns';
|
|
58
|
-
|
|
59
|
-
const DATA_ATTRIBUTE_NAME = 'data-cke-crawler-skip';
|
|
60
|
-
|
|
61
|
-
module.exports = {
|
|
62
|
-
DEFAULT_CONCURRENCY,
|
|
63
|
-
DEFAULT_TIMEOUT,
|
|
64
|
-
DEFAULT_RESPONSIVENESS_CHECK_TIMEOUT,
|
|
65
|
-
DEFAULT_REMAINING_ATTEMPTS,
|
|
66
|
-
ERROR_TYPES,
|
|
67
|
-
PATTERN_TYPE_TO_ERROR_TYPE_MAP,
|
|
68
|
-
IGNORE_ALL_ERRORS_WILDCARD,
|
|
69
|
-
META_TAG_NAME,
|
|
70
|
-
DATA_ATTRIBUTE_NAME
|
|
71
|
-
};
|
package/lib/web-crawler/index.js
DELETED
|
@@ -1,729 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
|
|
3
|
-
/**
|
|
4
|
-
* @license Copyright (c) 2003-2022, CKSource Holding sp. z o.o. All rights reserved.
|
|
5
|
-
* For licensing, see LICENSE.md.
|
|
6
|
-
*/
|
|
7
|
-
|
|
8
|
-
/* eslint-env node */
|
|
9
|
-
|
|
10
|
-
const puppeteer = require( 'puppeteer' );
|
|
11
|
-
const chalk = require( 'chalk' );
|
|
12
|
-
const util = require( 'util' );
|
|
13
|
-
const stripAnsiEscapeCodes = require( 'strip-ansi' );
|
|
14
|
-
const { getBaseUrl, toArray } = require( './utils' );
|
|
15
|
-
const { createSpinner, getProgressHandler } = require( './spinner' );
|
|
16
|
-
|
|
17
|
-
const {
|
|
18
|
-
DEFAULT_TIMEOUT,
|
|
19
|
-
DEFAULT_RESPONSIVENESS_CHECK_TIMEOUT,
|
|
20
|
-
DEFAULT_REMAINING_ATTEMPTS,
|
|
21
|
-
ERROR_TYPES,
|
|
22
|
-
PATTERN_TYPE_TO_ERROR_TYPE_MAP,
|
|
23
|
-
IGNORE_ALL_ERRORS_WILDCARD,
|
|
24
|
-
META_TAG_NAME,
|
|
25
|
-
DATA_ATTRIBUTE_NAME
|
|
26
|
-
} = require( './constants' );
|
|
27
|
-
|
|
28
|
-
/**
|
|
29
|
-
* Main crawler function. Its purpose is to:
|
|
30
|
-
* - create Puppeteer's browser instance,
|
|
31
|
-
* - open simultaneously (up to concurrency limit) links from the provided URL in a dedicated Puppeteer's page for each link,
|
|
32
|
-
* - show error summary after all links have been visited.
|
|
33
|
-
*
|
|
34
|
-
* @param {Object} options Parsed CLI arguments.
|
|
35
|
-
* @param {String} options.url The URL to start crawling. This argument is required.
|
|
36
|
-
* @param {Number} [options.depth=Infinity] Defines how many nested page levels should be examined. Infinity by default.
|
|
37
|
-
* @param {Array.<String>} [options.exclusions=[]] An array of patterns to exclude links. Empty array by default to not exclude anything.
|
|
38
|
-
* @param {Number} [options.concurrency=1] Number of concurrent pages (browser tabs) to be used during crawling. One by default.
|
|
39
|
-
* @param {Boolean} [options.quit=false] Terminates the scan as soon as an error is found. False (off) by default.
|
|
40
|
-
* @param {Boolean} [options.disableBrowserSandbox=false] Whether the browser should be created with the `--no-sandbox` flag.
|
|
41
|
-
* @param {Boolean} [options.noSpinner=false] Whether to display the spinner with progress or a raw message with current progress.
|
|
42
|
-
* @param {Boolean} [options.ignoreHTTPSErrors=false] Whether the browser should ignore invalid (self-signed) certificates.
|
|
43
|
-
* @returns {Promise} Promise is resolved, when the crawler has finished the whole crawling procedure.
|
|
44
|
-
*/
|
|
45
|
-
module.exports = async function verify( options ) {
|
|
46
|
-
const {
|
|
47
|
-
url,
|
|
48
|
-
depth = Infinity,
|
|
49
|
-
exclusions = [],
|
|
50
|
-
concurrency = 1,
|
|
51
|
-
quit = false,
|
|
52
|
-
disableBrowserSandbox = false,
|
|
53
|
-
noSpinner = false,
|
|
54
|
-
ignoreHTTPSErrors = false
|
|
55
|
-
} = options;
|
|
56
|
-
|
|
57
|
-
console.log( chalk.bold( '\n🔎 Starting the Crawler\n' ) );
|
|
58
|
-
|
|
59
|
-
process.on( 'unhandledRejection', reason => {
|
|
60
|
-
const error = util.inspect( reason, {
|
|
61
|
-
breakLength: Infinity,
|
|
62
|
-
compact: true
|
|
63
|
-
} );
|
|
64
|
-
|
|
65
|
-
console.log( chalk.red.bold( `\n🔥 Caught the \`unhandledRejection\` error: ${ error }\n` ) );
|
|
66
|
-
|
|
67
|
-
process.exit( 1 );
|
|
68
|
-
} );
|
|
69
|
-
|
|
70
|
-
const spinner = createSpinner( { noSpinner } );
|
|
71
|
-
const errors = new Map();
|
|
72
|
-
const browser = await createBrowser( { disableBrowserSandbox, ignoreHTTPSErrors } );
|
|
73
|
-
|
|
74
|
-
spinner.start( 'Checking pages…' );
|
|
75
|
-
|
|
76
|
-
let status = 'Done';
|
|
77
|
-
|
|
78
|
-
await openLinks( browser, {
|
|
79
|
-
baseUrl: getBaseUrl( url ),
|
|
80
|
-
linksQueue: [ {
|
|
81
|
-
url,
|
|
82
|
-
parentUrl: '(none)',
|
|
83
|
-
remainingNestedLevels: depth,
|
|
84
|
-
remainingAttempts: DEFAULT_REMAINING_ATTEMPTS
|
|
85
|
-
} ],
|
|
86
|
-
foundLinks: [ url ],
|
|
87
|
-
exclusions,
|
|
88
|
-
concurrency,
|
|
89
|
-
quit,
|
|
90
|
-
onError: getErrorHandler( errors ),
|
|
91
|
-
onProgress: getProgressHandler( spinner, { verbose: noSpinner } )
|
|
92
|
-
} ).catch( () => {
|
|
93
|
-
status = 'Terminated on first error';
|
|
94
|
-
} );
|
|
95
|
-
|
|
96
|
-
spinner.succeed( `Checking pages… ${ chalk.bold( status ) }` );
|
|
97
|
-
|
|
98
|
-
await browser.close();
|
|
99
|
-
|
|
100
|
-
logErrors( errors );
|
|
101
|
-
|
|
102
|
-
// Always exit the script because `spinner` can freeze the process of the crawler if it is executed in the `noSpinner:true` mode.
|
|
103
|
-
process.exit( errors.size ? 1 : 0 );
|
|
104
|
-
};
|
|
105
|
-
|
|
106
|
-
/**
|
|
107
|
-
* Creates a new browser instance and closes the default blank page.
|
|
108
|
-
*
|
|
109
|
-
* @param {Object} options
|
|
110
|
-
* @param {Boolean} [options.disableBrowserSandbox] Whether the browser should be created with the `--no-sandbox` flag.
|
|
111
|
-
* @param {Boolean} [options.ignoreHTTPSErrors] Whether the browser should ignore invalid (self-signed) certificates.
|
|
112
|
-
*
|
|
113
|
-
* @returns {Promise.<Object>} A promise, which resolves to the Puppeteer browser instance.
|
|
114
|
-
*/
|
|
115
|
-
async function createBrowser( options ) {
|
|
116
|
-
const browserOptions = {
|
|
117
|
-
args: []
|
|
118
|
-
};
|
|
119
|
-
|
|
120
|
-
if ( options.disableBrowserSandbox ) {
|
|
121
|
-
browserOptions.args.push( '--no-sandbox' );
|
|
122
|
-
browserOptions.args.push( '--disable-setuid-sandbox' );
|
|
123
|
-
}
|
|
124
|
-
|
|
125
|
-
if ( options.ignoreHTTPSErrors ) {
|
|
126
|
-
browserOptions.ignoreHTTPSErrors = options.ignoreHTTPSErrors;
|
|
127
|
-
}
|
|
128
|
-
|
|
129
|
-
const browser = await puppeteer.launch( browserOptions );
|
|
130
|
-
|
|
131
|
-
const [ defaultBlankPage ] = await browser.pages();
|
|
132
|
-
|
|
133
|
-
if ( defaultBlankPage ) {
|
|
134
|
-
await defaultBlankPage.close();
|
|
135
|
-
}
|
|
136
|
-
|
|
137
|
-
return browser;
|
|
138
|
-
}
|
|
139
|
-
|
|
140
|
-
/**
|
|
141
|
-
* Returns an error handler, which is called every time new error is found.
|
|
142
|
-
*
|
|
143
|
-
* @param {Map.<ErrorType, ErrorCollection>} errors All errors grouped by their type.
|
|
144
|
-
* @returns {Function} Error handler.
|
|
145
|
-
*/
|
|
146
|
-
function getErrorHandler( errors ) {
|
|
147
|
-
return error => {
|
|
148
|
-
if ( !errors.has( error.type ) ) {
|
|
149
|
-
errors.set( error.type, new Map() );
|
|
150
|
-
}
|
|
151
|
-
|
|
152
|
-
// Split the message into the first line and all the rest. The first line is the key by which the errors are grouped together.
|
|
153
|
-
// All errors are grouped together only by the first message line (without the error call stack and other details, that could
|
|
154
|
-
// possibly exist after the first line), because there is a good chance that the same error can be triggered in a different
|
|
155
|
-
// contexts (so in a different call stacks). In order not to duplicate almost the same errors, we need to determine their common
|
|
156
|
-
// part.
|
|
157
|
-
const messageLines = error.message.split( '\n' );
|
|
158
|
-
const firstMessageLine = messageLines.shift();
|
|
159
|
-
const nextMessageLines = messageLines.join( '\n' );
|
|
160
|
-
|
|
161
|
-
const errorCollection = errors.get( error.type );
|
|
162
|
-
|
|
163
|
-
if ( !errorCollection.has( firstMessageLine ) ) {
|
|
164
|
-
errorCollection.set( firstMessageLine, {
|
|
165
|
-
// Store only unique pages, because given error can occur multiple times on the same page.
|
|
166
|
-
pages: new Set(),
|
|
167
|
-
details: nextMessageLines
|
|
168
|
-
} );
|
|
169
|
-
}
|
|
170
|
-
|
|
171
|
-
errorCollection.get( firstMessageLine ).pages.add( error.pageUrl );
|
|
172
|
-
};
|
|
173
|
-
}
|
|
174
|
-
|
|
175
|
-
/**
|
|
176
|
-
* Searches and opens all found links in the document body from requested URL, recursively.
|
|
177
|
-
*
|
|
178
|
-
* @param {Object} browser The headless browser instance from Puppeteer.
|
|
179
|
-
* @param {Object} data All data needed for crawling the links.
|
|
180
|
-
* @param {String} data.baseUrl The base URL from the initial page URL.
|
|
181
|
-
* @param {Array.<Link>} data.linksQueue An array of link to crawl.
|
|
182
|
-
* @param {Array.<String>} data.foundLinks An array of all links, which have been already discovered.
|
|
183
|
-
* @param {Array.<String>} data.exclusions An array of patterns to exclude links. Empty array by default to not exclude anything.
|
|
184
|
-
* @param {Number} data.concurrency Number of concurrent pages (browser tabs) to be used during crawling.
|
|
185
|
-
* @param {Boolean} data.quit Terminates the scan as soon as an error is found.
|
|
186
|
-
* @param {Function} data.onError Callback called ever time an error has been found.
|
|
187
|
-
* @param {Function} data.onProgress Callback called every time just before opening a new link.
|
|
188
|
-
* @returns {Promise} Promise is resolved, when all links have been visited.
|
|
189
|
-
*/
|
|
190
|
-
async function openLinks( browser, { baseUrl, linksQueue, foundLinks, exclusions, concurrency, quit, onError, onProgress } ) {
|
|
191
|
-
const numberOfOpenPages = ( await browser.pages() ).length;
|
|
192
|
-
|
|
193
|
-
// Check if the limit of simultaneously opened pages in the browser has been reached.
|
|
194
|
-
if ( numberOfOpenPages >= concurrency ) {
|
|
195
|
-
return;
|
|
196
|
-
}
|
|
197
|
-
|
|
198
|
-
return Promise.all(
|
|
199
|
-
linksQueue
|
|
200
|
-
// Get links from the queue, up to the concurrency limit...
|
|
201
|
-
.splice( 0, concurrency - numberOfOpenPages )
|
|
202
|
-
// ...and open each of them in a dedicated page to collect nested links and errors (if any) they contain.
|
|
203
|
-
.map( async link => {
|
|
204
|
-
let newErrors = [];
|
|
205
|
-
let newLinks = [];
|
|
206
|
-
|
|
207
|
-
onProgress( {
|
|
208
|
-
total: foundLinks.length
|
|
209
|
-
} );
|
|
210
|
-
|
|
211
|
-
// If opening a given link causes an error, try opening it again until the limit of remaining attempts is reached.
|
|
212
|
-
do {
|
|
213
|
-
const { errors, links } = await openLink( browser, { baseUrl, link, foundLinks, exclusions } );
|
|
214
|
-
|
|
215
|
-
link.remainingAttempts--;
|
|
216
|
-
|
|
217
|
-
newErrors = [ ...errors ];
|
|
218
|
-
newLinks = [ ...links ];
|
|
219
|
-
} while ( newErrors.length && link.remainingAttempts );
|
|
220
|
-
|
|
221
|
-
newErrors.forEach( newError => onError( newError ) );
|
|
222
|
-
|
|
223
|
-
newLinks.forEach( newLink => {
|
|
224
|
-
foundLinks.push( newLink );
|
|
225
|
-
|
|
226
|
-
linksQueue.push( {
|
|
227
|
-
url: newLink,
|
|
228
|
-
parentUrl: link.url,
|
|
229
|
-
remainingNestedLevels: link.remainingNestedLevels - 1,
|
|
230
|
-
remainingAttempts: DEFAULT_REMAINING_ATTEMPTS
|
|
231
|
-
} );
|
|
232
|
-
} );
|
|
233
|
-
|
|
234
|
-
// Terminate the scan as soon as an error is found, if `--quit` or `-q` CLI argument has been set.
|
|
235
|
-
if ( newErrors.length > 0 && quit ) {
|
|
236
|
-
return Promise.reject();
|
|
237
|
-
}
|
|
238
|
-
|
|
239
|
-
// When currently examined link has been checked, try to open new links up to the concurrency limit.
|
|
240
|
-
return openLinks( browser, { baseUrl, linksQueue, foundLinks, exclusions, concurrency, quit, onError, onProgress } );
|
|
241
|
-
} )
|
|
242
|
-
);
|
|
243
|
-
}
|
|
244
|
-
|
|
245
|
-
/**
|
|
246
|
-
* Creates a dedicated Puppeteer's page for URL to be tested and collects all links from it. Only links from the same base URL
|
|
247
|
-
* as the tested URL are collected. Only the base URL part consisting of a protocol, a host, a port, and a path is stored, without
|
|
248
|
-
* a hash and search parts. Duplicated links, which were already found and enqueued, are skipped to avoid loops. Explicitly
|
|
249
|
-
* excluded links are also skipped. If the requested traversing depth has been reached, nested links from this URL are not collected
|
|
250
|
-
* anymore.
|
|
251
|
-
*
|
|
252
|
-
* @param {Object} browser The headless browser instance from Puppeteer.
|
|
253
|
-
* @param {Object} data All data needed for crawling the link.
|
|
254
|
-
* @param {String} data.baseUrl The base URL from the initial page URL.
|
|
255
|
-
* @param {Link} data.link A link to crawl.
|
|
256
|
-
* @param {Array.<String>} data.foundLinks An array of all links, which have been already discovered.
|
|
257
|
-
* @param {Array.<String>} data.exclusions An array of patterns to exclude links. Empty array by default to not exclude anything.
|
|
258
|
-
* @returns {Promise.<ErrorsAndLinks>} A promise, which resolves to a collection of unique errors and links.
|
|
259
|
-
*/
|
|
260
|
-
async function openLink( browser, { baseUrl, link, foundLinks, exclusions } ) {
|
|
261
|
-
const errors = [];
|
|
262
|
-
|
|
263
|
-
const onError = error => errors.push( error );
|
|
264
|
-
|
|
265
|
-
// Create dedicated page for current link.
|
|
266
|
-
const page = await createPage( browser, { link, onError } );
|
|
267
|
-
|
|
268
|
-
try {
|
|
269
|
-
// Consider navigation to be finished when the `load` event is fired and there are no network connections for at least 500 ms.
|
|
270
|
-
await page.goto( link.url, { waitUntil: [ 'load', 'networkidle0' ] } );
|
|
271
|
-
} catch ( error ) {
|
|
272
|
-
const errorMessage = error.message || '(empty message)';
|
|
273
|
-
|
|
274
|
-
// All navigation errors starting with the `net::` prefix are already covered by the "request" error handler, so it should
|
|
275
|
-
// not be also reported as the "navigation error".
|
|
276
|
-
const ignoredMessage = 'net::';
|
|
277
|
-
|
|
278
|
-
if ( !errorMessage.startsWith( ignoredMessage ) ) {
|
|
279
|
-
onError( {
|
|
280
|
-
pageUrl: link.url,
|
|
281
|
-
type: ERROR_TYPES.NAVIGATION_ERROR,
|
|
282
|
-
message: errorMessage
|
|
283
|
-
} );
|
|
284
|
-
}
|
|
285
|
-
|
|
286
|
-
const isResponding = await isPageResponding( page );
|
|
287
|
-
|
|
288
|
-
// Exit immediately and do not try to call any function in the context of the page, that is not responding or if it has not been
|
|
289
|
-
// opened. However, once the page has been opened (its URL is the same as the one requested), continue as usual and do not close
|
|
290
|
-
// the page yet, because the page may contain error exclusions, that should be taken into account. Such a case can happen when,
|
|
291
|
-
// for example, the `load` event was not fired because the external resource was not loaded yet.
|
|
292
|
-
if ( !isResponding || page.url() !== link.url ) {
|
|
293
|
-
page.removeAllListeners();
|
|
294
|
-
|
|
295
|
-
await page.close();
|
|
296
|
-
|
|
297
|
-
return {
|
|
298
|
-
errors,
|
|
299
|
-
links: []
|
|
300
|
-
};
|
|
301
|
-
}
|
|
302
|
-
}
|
|
303
|
-
|
|
304
|
-
// Create patterns from meta tags to ignore errors.
|
|
305
|
-
const errorIgnorePatterns = await getErrorIgnorePatternsFromPage( page );
|
|
306
|
-
|
|
307
|
-
// Iterates over recently found errors to mark them as ignored ones, if they match the patterns.
|
|
308
|
-
markErrorsAsIgnored( errors, errorIgnorePatterns );
|
|
309
|
-
|
|
310
|
-
// Skip crawling deeper, if the bottom has been reached, or get all unique links from the page body otherwise.
|
|
311
|
-
const links = link.remainingNestedLevels === 0 ?
|
|
312
|
-
[] :
|
|
313
|
-
await getLinksFromPage( page, { baseUrl, foundLinks, exclusions } );
|
|
314
|
-
|
|
315
|
-
page.removeAllListeners();
|
|
316
|
-
|
|
317
|
-
await page.close();
|
|
318
|
-
|
|
319
|
-
return {
|
|
320
|
-
errors: errors.filter( error => !error.ignored ),
|
|
321
|
-
links
|
|
322
|
-
};
|
|
323
|
-
}
|
|
324
|
-
|
|
325
|
-
/**
|
|
326
|
-
* Finds all links in opened page and filters out external, already discovered and exlicitly excluded ones.
|
|
327
|
-
*
|
|
328
|
-
* @param {Object} page The page instance from Puppeteer.
|
|
329
|
-
* @param {Object} data All data needed for crawling the link.
|
|
330
|
-
* @param {String} data.baseUrl The base URL from the initial page URL.
|
|
331
|
-
* @param {Array.<String>} data.foundLinks An array of all links, which have been already discovered.
|
|
332
|
-
* @param {Array.<String>} data.exclusions An array patterns to exclude links. Empty array by default to not exclude anything.
|
|
333
|
-
* @returns {Promise.<Array.<String>>} A promise, which resolves to an array of unique links.
|
|
334
|
-
*/
|
|
335
|
-
async function getLinksFromPage( page, { baseUrl, foundLinks, exclusions } ) {
|
|
336
|
-
const evaluatePage = anchors => [ ...new Set( anchors
|
|
337
|
-
.filter( anchor => /http(s)?:/.test( anchor.protocol ) )
|
|
338
|
-
.map( anchor => `${ anchor.origin }${ anchor.pathname }` ) )
|
|
339
|
-
];
|
|
340
|
-
|
|
341
|
-
return ( await page.$$eval( `body a[href]:not([${ DATA_ATTRIBUTE_NAME }])`, evaluatePage ) )
|
|
342
|
-
.filter( link => {
|
|
343
|
-
// Skip external link.
|
|
344
|
-
if ( !link.startsWith( baseUrl ) ) {
|
|
345
|
-
return false;
|
|
346
|
-
}
|
|
347
|
-
|
|
348
|
-
// Skip already discovered link.
|
|
349
|
-
if ( foundLinks.includes( link ) ) {
|
|
350
|
-
return false;
|
|
351
|
-
}
|
|
352
|
-
|
|
353
|
-
// Skip explicitly excluded link.
|
|
354
|
-
if ( exclusions.some( exclusion => link.includes( exclusion ) ) ) {
|
|
355
|
-
return false;
|
|
356
|
-
}
|
|
357
|
-
|
|
358
|
-
return true;
|
|
359
|
-
} );
|
|
360
|
-
}
|
|
361
|
-
|
|
362
|
-
/**
|
|
363
|
-
* Finds all meta tags, that contain a pattern to ignore errors, and then returns a map between error type and these patterns.
|
|
364
|
-
*
|
|
365
|
-
* @param {Object} page The page instance from Puppeteer.
|
|
366
|
-
* @returns {Promise.<Map.<ErrorType, Set.<String>>>} A promise, which resolves to a map between an error type and a set of patterns.
|
|
367
|
-
*/
|
|
368
|
-
async function getErrorIgnorePatternsFromPage( page ) {
|
|
369
|
-
const metaTag = await page.$( `head > meta[name=${ META_TAG_NAME }]` );
|
|
370
|
-
|
|
371
|
-
const patterns = new Map();
|
|
372
|
-
|
|
373
|
-
// If meta tag is not defined, return an empty map.
|
|
374
|
-
if ( !metaTag ) {
|
|
375
|
-
return patterns;
|
|
376
|
-
}
|
|
377
|
-
|
|
378
|
-
const contentString = await metaTag.evaluate( metaTag => metaTag.getAttribute( 'content' ) );
|
|
379
|
-
|
|
380
|
-
let content;
|
|
381
|
-
|
|
382
|
-
try {
|
|
383
|
-
// Try to parse value from meta tag...
|
|
384
|
-
content = JSON.parse( contentString );
|
|
385
|
-
} catch ( error ) {
|
|
386
|
-
// ...but if it is not a valid JSON, return an empty map.
|
|
387
|
-
return patterns;
|
|
388
|
-
}
|
|
389
|
-
|
|
390
|
-
Object.entries( content ).forEach( ( [ type, pattern ] ) => {
|
|
391
|
-
const patternCollection = new Set( toArray( pattern )
|
|
392
|
-
// Only string patterns are supported, as the error message produced by the crawler is always a string.
|
|
393
|
-
.filter( pattern => typeof pattern === 'string' )
|
|
394
|
-
// Only non-empty patterns are supported, because an empty pattern would cause all errors in a given type to be ignored.
|
|
395
|
-
.filter( pattern => pattern.length > 0 )
|
|
396
|
-
);
|
|
397
|
-
|
|
398
|
-
if ( !patternCollection.size ) {
|
|
399
|
-
return;
|
|
400
|
-
}
|
|
401
|
-
|
|
402
|
-
const errorType = PATTERN_TYPE_TO_ERROR_TYPE_MAP[ type ];
|
|
403
|
-
|
|
404
|
-
patterns.set( errorType, patternCollection );
|
|
405
|
-
} );
|
|
406
|
-
|
|
407
|
-
return patterns;
|
|
408
|
-
}
|
|
409
|
-
|
|
410
|
-
/**
|
|
411
|
-
* Iterates over all found errors from given link and marks errors as ingored, if their message match the ignore pattern.
|
|
412
|
-
*
|
|
413
|
-
* @param {Array.<Error>} errors An array of errors to check.
|
|
414
|
-
* @param {Map.<ErrorType, Set.<String>>} errorIgnorePatterns A map between an error type and a set of patterns.
|
|
415
|
-
*/
|
|
416
|
-
function markErrorsAsIgnored( errors, errorIgnorePatterns ) {
|
|
417
|
-
errors.forEach( error => {
|
|
418
|
-
// Skip, if there is no pattern defined for currently examined error type.
|
|
419
|
-
if ( !errorIgnorePatterns.has( error.type ) ) {
|
|
420
|
-
return;
|
|
421
|
-
}
|
|
422
|
-
|
|
423
|
-
const patterns = [ ...errorIgnorePatterns.get( error.type ) ];
|
|
424
|
-
|
|
425
|
-
const isPatternMatched = pattern => {
|
|
426
|
-
if ( pattern === IGNORE_ALL_ERRORS_WILDCARD ) {
|
|
427
|
-
return true;
|
|
428
|
-
}
|
|
429
|
-
|
|
430
|
-
if ( stripAnsiEscapeCodes( error.message ).includes( pattern ) ) {
|
|
431
|
-
return true;
|
|
432
|
-
}
|
|
433
|
-
|
|
434
|
-
if ( error.failedResourceUrl && error.failedResourceUrl.includes( pattern ) ) {
|
|
435
|
-
return true;
|
|
436
|
-
}
|
|
437
|
-
|
|
438
|
-
return false;
|
|
439
|
-
};
|
|
440
|
-
|
|
441
|
-
// If at least one pattern matches the error message, mark currently examined error as ignored.
|
|
442
|
-
if ( patterns.some( isPatternMatched ) ) {
|
|
443
|
-
error.ignored = true;
|
|
444
|
-
}
|
|
445
|
-
} );
|
|
446
|
-
}
|
|
447
|
-
|
|
448
|
-
/**
|
|
449
|
-
* Creates a new page in Puppeteer's browser instance.
|
|
450
|
-
*
|
|
451
|
-
* @param {Object} browser The headless browser instance from Puppeteer.
|
|
452
|
-
* @param {Object} data All data needed for creating a new page.
|
|
453
|
-
* @param {Link} data.link A link to crawl.
|
|
454
|
-
* @param {Function} data.onError Callback called every time just before opening a new link.
|
|
455
|
-
* @returns {Promise.<Object>} A promise, which resolves to the page instance from Puppeteer.
|
|
456
|
-
*/
|
|
457
|
-
async function createPage( browser, { link, onError } ) {
|
|
458
|
-
const page = await browser.newPage();
|
|
459
|
-
|
|
460
|
-
await page.setDefaultTimeout( DEFAULT_TIMEOUT );
|
|
461
|
-
|
|
462
|
-
await page.setCacheEnabled( false );
|
|
463
|
-
|
|
464
|
-
dismissDialogs( page );
|
|
465
|
-
|
|
466
|
-
registerErrorHandlers( page, { link, onError } );
|
|
467
|
-
|
|
468
|
-
await registerRequestInterception( page );
|
|
469
|
-
|
|
470
|
-
return page;
|
|
471
|
-
}
|
|
472
|
-
|
|
473
|
-
/**
|
|
474
|
-
* Dismisses any dialogs (alert, prompt, confirm, beforeunload) that could be displayed on page load.
|
|
475
|
-
*
|
|
476
|
-
* @param {Object} page The page instance from Puppeteer.
|
|
477
|
-
*/
|
|
478
|
-
function dismissDialogs( page ) {
|
|
479
|
-
page.on( 'dialog', async dialog => {
|
|
480
|
-
await dialog.dismiss();
|
|
481
|
-
} );
|
|
482
|
-
}
|
|
483
|
-
|
|
484
|
-
/**
|
|
485
|
-
* Registers all error handlers on given page instance.
|
|
486
|
-
*
|
|
487
|
-
* @param {Object} page The page instance from Puppeteer.
|
|
488
|
-
* @param {Object} data All data needed for registering error handlers.
|
|
489
|
-
* @param {Link} data.link A link to crawl associated with Puppeteer's page.
|
|
490
|
-
* @param {Function} data.onError Called each time an error has been found.
|
|
491
|
-
*/
|
|
492
|
-
function registerErrorHandlers( page, { link, onError } ) {
|
|
493
|
-
page.on( ERROR_TYPES.PAGE_CRASH.event, error => onError( {
|
|
494
|
-
pageUrl: page.url(),
|
|
495
|
-
type: ERROR_TYPES.PAGE_CRASH,
|
|
496
|
-
message: error.message || '(empty message)'
|
|
497
|
-
} ) );
|
|
498
|
-
|
|
499
|
-
page.on( ERROR_TYPES.UNCAUGHT_EXCEPTION.event, error => onError( {
|
|
500
|
-
pageUrl: page.url(),
|
|
501
|
-
type: ERROR_TYPES.UNCAUGHT_EXCEPTION,
|
|
502
|
-
message: error.message || '(empty message)'
|
|
503
|
-
} ) );
|
|
504
|
-
|
|
505
|
-
page.on( ERROR_TYPES.REQUEST_FAILURE.event, request => {
|
|
506
|
-
const errorText = request.failure().errorText;
|
|
507
|
-
|
|
508
|
-
// Do not log errors explicitly aborted by the crawler.
|
|
509
|
-
if ( errorText !== 'net::ERR_BLOCKED_BY_CLIENT.Inspector' ) {
|
|
510
|
-
const url = request.url();
|
|
511
|
-
const host = new URL( url ).host;
|
|
512
|
-
const isNavigation = isNavigationRequest( request );
|
|
513
|
-
const message = isNavigation ?
|
|
514
|
-
`Failed to open link ${ chalk.bold( url ) }` :
|
|
515
|
-
`Failed to load resource from ${ chalk.bold( host ) }`;
|
|
516
|
-
|
|
517
|
-
onError( {
|
|
518
|
-
pageUrl: isNavigation ? link.parentUrl : page.url(),
|
|
519
|
-
type: ERROR_TYPES.REQUEST_FAILURE,
|
|
520
|
-
message: `${ message } (failure message: ${ chalk.bold( errorText ) })`,
|
|
521
|
-
failedResourceUrl: url
|
|
522
|
-
} );
|
|
523
|
-
}
|
|
524
|
-
} );
|
|
525
|
-
|
|
526
|
-
page.on( ERROR_TYPES.RESPONSE_FAILURE.event, response => {
|
|
527
|
-
const responseStatus = response.status();
|
|
528
|
-
|
|
529
|
-
if ( responseStatus > 399 ) {
|
|
530
|
-
const url = response.url();
|
|
531
|
-
const host = new URL( url ).host;
|
|
532
|
-
const isNavigation = isNavigationRequest( response.request() );
|
|
533
|
-
const message = isNavigation ?
|
|
534
|
-
`Failed to open link ${ chalk.bold( url ) }` :
|
|
535
|
-
`Failed to load resource from ${ chalk.bold( host ) }`;
|
|
536
|
-
|
|
537
|
-
onError( {
|
|
538
|
-
pageUrl: isNavigation ? link.parentUrl : page.url(),
|
|
539
|
-
type: ERROR_TYPES.RESPONSE_FAILURE,
|
|
540
|
-
message: `${ message } (HTTP response status code: ${ chalk.bold( responseStatus ) })`,
|
|
541
|
-
failedResourceUrl: url
|
|
542
|
-
} );
|
|
543
|
-
}
|
|
544
|
-
} );
|
|
545
|
-
|
|
546
|
-
page.on( ERROR_TYPES.CONSOLE_ERROR.event, async message => {
|
|
547
|
-
// The resource loading failure is already covered by the "request" or "response" error handlers, so it should
|
|
548
|
-
// not be also reported as the "console error".
|
|
549
|
-
const ignoredMessage = 'Failed to load resource:';
|
|
550
|
-
|
|
551
|
-
if ( message.text().startsWith( ignoredMessage ) ) {
|
|
552
|
-
return;
|
|
553
|
-
}
|
|
554
|
-
|
|
555
|
-
if ( message.type() !== 'error' ) {
|
|
556
|
-
return;
|
|
557
|
-
}
|
|
558
|
-
|
|
559
|
-
const serializeArgumentInPageContext = argument => {
|
|
560
|
-
// Since errors are not serializable, return the message with the call stack as the output text.
|
|
561
|
-
if ( argument instanceof Error ) {
|
|
562
|
-
return argument.stack;
|
|
563
|
-
}
|
|
564
|
-
|
|
565
|
-
// Cast non-string iterable argument to an array.
|
|
566
|
-
if ( typeof argument !== 'string' && argument[ Symbol.iterator ] ) {
|
|
567
|
-
return [ ...argument ];
|
|
568
|
-
}
|
|
569
|
-
|
|
570
|
-
// Return argument right away. Since we use `executionContext().evaluate()`, it'll return JSON value of the
|
|
571
|
-
// argument if possible, or `undefined` if it fails to stringify it.
|
|
572
|
-
return argument;
|
|
573
|
-
};
|
|
574
|
-
|
|
575
|
-
const serializeArguments = argument => argument
|
|
576
|
-
.executionContext()
|
|
577
|
-
.evaluate( serializeArgumentInPageContext, argument );
|
|
578
|
-
|
|
579
|
-
const serializedArguments = await Promise.all( message.args().map( serializeArguments ) );
|
|
580
|
-
|
|
581
|
-
const serializedMessage = serializedArguments
|
|
582
|
-
.map( argument => {
|
|
583
|
-
// Do not wrap the string in additional quotes and just return it as is.
|
|
584
|
-
if ( typeof argument === 'string' ) {
|
|
585
|
-
return argument;
|
|
586
|
-
}
|
|
587
|
-
|
|
588
|
-
return util.inspect( argument, {
|
|
589
|
-
breakLength: Infinity,
|
|
590
|
-
compact: true
|
|
591
|
-
} );
|
|
592
|
-
} )
|
|
593
|
-
.join( ' ' );
|
|
594
|
-
|
|
595
|
-
onError( {
|
|
596
|
-
pageUrl: page.url(),
|
|
597
|
-
type: ERROR_TYPES.CONSOLE_ERROR,
|
|
598
|
-
message: serializedMessage || message.text() || '(empty message)'
|
|
599
|
-
} );
|
|
600
|
-
} );
|
|
601
|
-
}
|
|
602
|
-
|
|
603
|
-
/**
|
|
604
|
-
* Checks, if HTTP request was a navigation one, i.e. request that is driving frame's navigation. Requests sent from child frames
|
|
605
|
-
* (i.e. from <iframe>) are not treated as a navigation. Only a request from a top-level frame is navigation.
|
|
606
|
-
*
|
|
607
|
-
* @param {Object} request The Puppeteer's HTTP request instance.
|
|
608
|
-
* @returns {Boolean}
|
|
609
|
-
*/
|
|
610
|
-
function isNavigationRequest( request ) {
|
|
611
|
-
return request.isNavigationRequest() && request.frame().parentFrame() === null;
|
|
612
|
-
}
|
|
613
|
-
|
|
614
|
-
/**
|
|
615
|
-
* Checks, if the page is not hung by trying to evaluate a function within the page context in defined time.
|
|
616
|
-
*
|
|
617
|
-
* @param {Object} page The page instance from Puppeteer.
|
|
618
|
-
* @returns {Promise.<Boolean>}
|
|
619
|
-
*/
|
|
620
|
-
async function isPageResponding( page ) {
|
|
621
|
-
return Promise.race( [
|
|
622
|
-
page.title(),
|
|
623
|
-
new Promise( ( resolve, reject ) => setTimeout( () => reject(), DEFAULT_RESPONSIVENESS_CHECK_TIMEOUT ) )
|
|
624
|
-
] ).then( () => true ).catch( () => false );
|
|
625
|
-
}
|
|
626
|
-
|
|
627
|
-
/**
|
|
628
|
-
* Registers a request interception procedure to explicitly block all 'media' requests (resources loaded by a <video> or <audio> elements).
|
|
629
|
-
*
|
|
630
|
-
* @param {Object} page The page instance from Puppeteer.
|
|
631
|
-
* @returns {Promise} Promise is resolved, when the request interception procedure is registered.
|
|
632
|
-
*/
|
|
633
|
-
async function registerRequestInterception( page ) {
|
|
634
|
-
await page.setRequestInterception( true );
|
|
635
|
-
|
|
636
|
-
page.on( 'request', request => {
|
|
637
|
-
const resourceType = request.resourceType();
|
|
638
|
-
|
|
639
|
-
// Block all 'media' requests, as they are likely to fail anyway due to limitations in Puppeteer.
|
|
640
|
-
if ( resourceType === 'media' ) {
|
|
641
|
-
request.abort( 'blockedbyclient' );
|
|
642
|
-
} else {
|
|
643
|
-
request.continue();
|
|
644
|
-
}
|
|
645
|
-
} );
|
|
646
|
-
}
|
|
647
|
-
|
|
648
|
-
/**
|
|
649
|
-
* Analyzes collected errors and logs them in the console.
|
|
650
|
-
*
|
|
651
|
-
* @param {Map.<ErrorType, ErrorCollection>} errors All found errors grouped by their type.
|
|
652
|
-
*/
|
|
653
|
-
function logErrors( errors ) {
|
|
654
|
-
if ( !errors.size ) {
|
|
655
|
-
console.log( chalk.green.bold( '\n✨ No errors have been found.\n' ) );
|
|
656
|
-
return;
|
|
657
|
-
}
|
|
658
|
-
|
|
659
|
-
console.log( chalk.red.bold( '\n🔥 The following errors have been found:' ) );
|
|
660
|
-
|
|
661
|
-
errors.forEach( ( errorCollection, errorType ) => {
|
|
662
|
-
const numberOfErrors = errorCollection.size;
|
|
663
|
-
const separator = chalk.gray( ' ➜ ' );
|
|
664
|
-
const errorName = chalk.bgRed.white.bold( ` ${ errorType.description.toUpperCase() } ` );
|
|
665
|
-
const errorSummary = chalk.red( `${ chalk.bold( numberOfErrors ) } ${ numberOfErrors > 1 ? 'errors' : 'error' }` );
|
|
666
|
-
|
|
667
|
-
console.group( `\n${ errorName } ${ separator } ${ errorSummary }` );
|
|
668
|
-
|
|
669
|
-
errorCollection.forEach( ( error, message ) => {
|
|
670
|
-
console.group( `\n❌ ${ message }` );
|
|
671
|
-
|
|
672
|
-
if ( error.details ) {
|
|
673
|
-
console.log( error.details );
|
|
674
|
-
}
|
|
675
|
-
|
|
676
|
-
console.log( chalk.red( `\n…found on the following ${ error.pages.size > 1 ? 'pages' : 'page' }:` ) );
|
|
677
|
-
|
|
678
|
-
error.pages.forEach( pageUrl => console.log( chalk.gray( `➥ ${ pageUrl }` ) ) );
|
|
679
|
-
|
|
680
|
-
console.groupEnd();
|
|
681
|
-
} );
|
|
682
|
-
|
|
683
|
-
console.groupEnd();
|
|
684
|
-
} );
|
|
685
|
-
|
|
686
|
-
// Blank message only to separate the errors output log.
|
|
687
|
-
console.log();
|
|
688
|
-
}
|
|
689
|
-
|
|
690
|
-
/**
|
|
691
|
-
* @typedef {Object.<String, String|Number>} Link
|
|
692
|
-
* @property {String} url The URL associated with the link.
|
|
693
|
-
* @property {String} parentUrl The page on which the link was found.
|
|
694
|
-
* @property {Number} remainingNestedLevels The remaining number of nested levels to be checked. If this value is 0, the
|
|
695
|
-
* requested traversing depth has been reached and nested links from the URL associated with this link are not collected anymore.
|
|
696
|
-
* @property {Number} remainingAttempts The total number of reopenings allowed for the given link.
|
|
697
|
-
*/
|
|
698
|
-
|
|
699
|
-
/**
|
|
700
|
-
* @typedef {Object.<String, String>} ErrorType
|
|
701
|
-
* @property {String} [event] The event name emitted by Puppeteer.
|
|
702
|
-
* @property {String} description Human-readable description of the error.
|
|
703
|
-
*/
|
|
704
|
-
|
|
705
|
-
/**
|
|
706
|
-
* @typedef {Object.<String, String|Boolean|ErrorType>} Error
|
|
707
|
-
* @property {String} pageUrl The URL, where error has occurred.
|
|
708
|
-
* @property {ErrorType} type Error type.
|
|
709
|
-
* @property {String} message Error message.
|
|
710
|
-
* @property {String} [failedResourceUrl] Full resource URL, that has failed. Necessary for matching against exclusion patterns.
|
|
711
|
-
* @property {Boolean} [ignored] Indicates that error should be ignored, because its message matches the exclusion pattern.
|
|
712
|
-
*/
|
|
713
|
-
|
|
714
|
-
/**
|
|
715
|
-
* @typedef {Object.<String, Set.<String>>} ErrorOccurrence
|
|
716
|
-
* @property {Set.<String>} pages A set of unique pages, where error has been found.
|
|
717
|
-
* @property {Set.<String>} [details] Additional error details (i.e. an error stack).
|
|
718
|
-
*/
|
|
719
|
-
|
|
720
|
-
/**
|
|
721
|
-
* @typedef {Map.<String, ErrorOccurrence>} ErrorCollection
|
|
722
|
-
* @property {ErrorOccurrence} [*] Error message.
|
|
723
|
-
*/
|
|
724
|
-
|
|
725
|
-
/**
|
|
726
|
-
* @typedef {Object.<String, Array.<String>>} ErrorsAndLinks Collection of unique errors and links.
|
|
727
|
-
* @property {Array.<String>} errors An array of errors.
|
|
728
|
-
* @property {Array.<String>} links An array of links.
|
|
729
|
-
*/
|
|
@@ -1,57 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
|
|
3
|
-
/**
|
|
4
|
-
* @license Copyright (c) 2003-2022, CKSource Holding sp. z o.o. All rights reserved.
|
|
5
|
-
* For licensing, see LICENSE.md.
|
|
6
|
-
*/
|
|
7
|
-
|
|
8
|
-
/* eslint-env node */
|
|
9
|
-
|
|
10
|
-
const chalk = require( 'chalk' );
|
|
11
|
-
const ora = require( 'ora' );
|
|
12
|
-
|
|
13
|
-
/**
|
|
14
|
-
* Creates nice-looking CLI spinner.
|
|
15
|
-
* @param {Object} options
|
|
16
|
-
* @param {Boolean} [options.noSpinner=false] Whether to display the spinner with progress or a message with current progress.
|
|
17
|
-
*/
|
|
18
|
-
function createSpinner( { noSpinner } ) {
|
|
19
|
-
return ora( {
|
|
20
|
-
spinner: {
|
|
21
|
-
frames: [ '⣾', '⣷', '⣯', '⣟', '⡿', '⢿', '⣻', '⣽' ]
|
|
22
|
-
},
|
|
23
|
-
// Do not render the spinner if the `verbose` mode is enabled.
|
|
24
|
-
isSilent: noSpinner,
|
|
25
|
-
stream: noSpinner ? process.stdout : process.stderr
|
|
26
|
-
} );
|
|
27
|
-
}
|
|
28
|
-
|
|
29
|
-
/**
|
|
30
|
-
* Returns a progress handler, which is called every time just before opening a new link.
|
|
31
|
-
*
|
|
32
|
-
* @param {Object} spinner Spinner instance
|
|
33
|
-
* @param {Object} options
|
|
34
|
-
* @param {Boolean} [options.verbose] Whether to display raw log instead of modifying the spinner instance.
|
|
35
|
-
* @returns {Function} Progress handler.
|
|
36
|
-
*/
|
|
37
|
-
function getProgressHandler( spinner, { verbose } ) {
|
|
38
|
-
let current = 0;
|
|
39
|
-
|
|
40
|
-
return ( { total } ) => {
|
|
41
|
-
current++;
|
|
42
|
-
|
|
43
|
-
const progress = Math.round( current / total * 100 );
|
|
44
|
-
const logMessage = `Checking pages… ${ chalk.bold( `${ progress }% (${ current } of ${ total })` ) }`;
|
|
45
|
-
|
|
46
|
-
if ( verbose ) {
|
|
47
|
-
console.log( logMessage );
|
|
48
|
-
} else {
|
|
49
|
-
spinner.text = logMessage;
|
|
50
|
-
}
|
|
51
|
-
};
|
|
52
|
-
}
|
|
53
|
-
|
|
54
|
-
module.exports = {
|
|
55
|
-
createSpinner,
|
|
56
|
-
getProgressHandler
|
|
57
|
-
};
|
package/lib/web-crawler/utils.js
DELETED
|
@@ -1,51 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
|
|
3
|
-
/**
|
|
4
|
-
* @license Copyright (c) 2003-2022, CKSource Holding sp. z o.o. All rights reserved.
|
|
5
|
-
* For licensing, see LICENSE.md.
|
|
6
|
-
*/
|
|
7
|
-
|
|
8
|
-
/* eslint-env node */
|
|
9
|
-
|
|
10
|
-
/**
|
|
11
|
-
* Extracts base URL from the provided page URL. Base URL consists of a protocol, a host, a port, and a path.
|
|
12
|
-
* A hash and search parts are omitted, because they would have navigated to the same page if they were set.
|
|
13
|
-
*
|
|
14
|
-
* @param {String} url Page URL.
|
|
15
|
-
* @returns {String} Base URL from page URL.
|
|
16
|
-
*/
|
|
17
|
-
function getBaseUrl( url ) {
|
|
18
|
-
const { origin, pathname } = new URL( url );
|
|
19
|
-
|
|
20
|
-
return `${ origin }${ pathname }`;
|
|
21
|
-
}
|
|
22
|
-
|
|
23
|
-
/**
|
|
24
|
-
* Checks, if provided string is a valid URL utilizing the HTTP or HTTPS protocols.
|
|
25
|
-
*
|
|
26
|
-
* @param {String} url The URL to validate.
|
|
27
|
-
* @returns {Boolean}
|
|
28
|
-
*/
|
|
29
|
-
function isUrlValid( url ) {
|
|
30
|
-
try {
|
|
31
|
-
return [ 'http:', 'https:' ].includes( new URL( url ).protocol );
|
|
32
|
-
} catch ( error ) {
|
|
33
|
-
return false;
|
|
34
|
-
}
|
|
35
|
-
}
|
|
36
|
-
|
|
37
|
-
/**
|
|
38
|
-
* Transforms any value to an array. If the provided value is already an array, it is returned unchanged.
|
|
39
|
-
*
|
|
40
|
-
* @param {*} data The value to transform to an array.
|
|
41
|
-
* @returns {Array.<*>} An array created from data.
|
|
42
|
-
*/
|
|
43
|
-
function toArray( data ) {
|
|
44
|
-
return Array.isArray( data ) ? data : [ data ];
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
module.exports = {
|
|
48
|
-
getBaseUrl,
|
|
49
|
-
isUrlValid,
|
|
50
|
-
toArray
|
|
51
|
-
};
|