@uxf/scripts 11.61.4 → 11.62.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/uxf-sitemap-check/index.js +104 -24
package/package.json
CHANGED
package/src/uxf-sitemap-check/index.js
CHANGED
|
@@ -8,15 +8,23 @@ const robotsTxtParser = require("robots-txt-parser");
|
|
|
8
8
|
const { HTTP_USERNAME, HTTP_PASSWORD } = process.env;
|
|
9
9
|
|
|
10
10
|
/**
|
|
11
|
-
* @typedef {{parentUrl: (string | undefined), isImg: boolean, ttl: number, url: string, status: number, message: (string | undefined), skipped: boolean, html: (string | null)}} UrlCheckResponse
|
|
11
|
+
* @typedef {{parentUrl: (string | undefined), isImg: boolean, isWebPage: boolean, ttl: number, url: string, status: number, message: (string | undefined), skipped: boolean, html: (string | null), redirected: boolean}} UrlCheckResponse
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
/**
|
|
15
|
+
* @typedef {{url: string, parentUrl: (string | undefined), canonicalUrl: (string | null), isImg: boolean, isWebPage: boolean, ttl: number, status: number, message: (string | undefined), skipped: boolean, indexable: boolean, redirected: boolean}} TestedUrlDto
|
|
12
16
|
*/
|
|
13
17
|
|
|
14
18
|
const MAX_TTL = 3;
|
|
15
19
|
const IMAGES_LABEL = "🏞 Images:";
|
|
16
20
|
const URLS_LABEL = "🔗 Links:";
|
|
17
21
|
|
|
22
|
+
/**
|
|
23
|
+
* @type TestedUrlDto[]
|
|
24
|
+
*/
|
|
18
25
|
const TESTED_URLS = [];
|
|
19
26
|
const URLS_TO_CHECK = new Set();
|
|
27
|
+
const ERRORS = [];
|
|
20
28
|
|
|
21
29
|
const robotsParser = robotsTxtParser({ userAgent: "uxf-bot", allowOnNeutral: false });
|
|
22
30
|
|
|
@@ -73,6 +81,7 @@ function createErrorList(errors) {
|
|
|
73
81
|
function createErrorResult(errors) {
|
|
74
82
|
let parentPages = "";
|
|
75
83
|
let nestedPages = "";
|
|
84
|
+
let generalErrors = "";
|
|
76
85
|
|
|
77
86
|
const parentPagesErrors = errors.filter((url) => url.parentUrl === undefined);
|
|
78
87
|
if (parentPagesErrors.length > 0) {
|
|
@@ -103,7 +112,14 @@ function createErrorResult(errors) {
|
|
|
103
112
|
}
|
|
104
113
|
}
|
|
105
114
|
|
|
106
|
-
|
|
115
|
+
if (ERRORS.length > 0) {
|
|
116
|
+
generalErrors = `\n\nGeneral errors:\n`;
|
|
117
|
+
for (const error of ERRORS) {
|
|
118
|
+
generalErrors += `${createTabSpace(1)}${error}\n`;
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
return parentPages + nestedPages + generalErrors;
|
|
107
123
|
}
|
|
108
124
|
|
|
109
125
|
/**
|
|
@@ -163,11 +179,13 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
|
|
|
163
179
|
url,
|
|
164
180
|
parentUrl,
|
|
165
181
|
isImg: isImageUrl(url),
|
|
182
|
+
isWebPage: false,
|
|
166
183
|
ttl,
|
|
167
184
|
status: 0,
|
|
168
185
|
message: "invalid url: " + url,
|
|
169
186
|
skipped: false,
|
|
170
187
|
html: null,
|
|
188
|
+
redirected: false,
|
|
171
189
|
};
|
|
172
190
|
}
|
|
173
191
|
|
|
@@ -184,11 +202,13 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
|
|
|
184
202
|
url,
|
|
185
203
|
parentUrl,
|
|
186
204
|
isImg: isImageUrl(url),
|
|
205
|
+
isWebPage: true,
|
|
187
206
|
ttl,
|
|
188
207
|
status: 0,
|
|
189
208
|
message: "blocked by robots.txt",
|
|
190
209
|
skipped: true,
|
|
191
210
|
html: null,
|
|
211
|
+
redirected: false,
|
|
192
212
|
};
|
|
193
213
|
}
|
|
194
214
|
}
|
|
@@ -201,11 +221,13 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
|
|
|
201
221
|
url,
|
|
202
222
|
parentUrl,
|
|
203
223
|
isImg: isImageUrl(url),
|
|
224
|
+
isWebPage: true,
|
|
204
225
|
ttl,
|
|
205
226
|
status: errorStatus,
|
|
206
227
|
message: e.message,
|
|
207
228
|
skipped: errorStatus !== undefined,
|
|
208
229
|
html: null,
|
|
230
|
+
redirected: false,
|
|
209
231
|
};
|
|
210
232
|
}
|
|
211
233
|
|
|
@@ -220,10 +242,12 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
|
|
|
220
242
|
url,
|
|
221
243
|
parentUrl,
|
|
222
244
|
isImg: isImageUrl(url),
|
|
245
|
+
isWebPage: response.headers.get("content-type").includes("text/html"),
|
|
223
246
|
ttl,
|
|
224
247
|
status: response.status,
|
|
225
248
|
skipped: false,
|
|
226
249
|
html: await response.text(),
|
|
250
|
+
redirected: response.redirected,
|
|
227
251
|
};
|
|
228
252
|
} catch (e) {
|
|
229
253
|
const status = Number.parseInt((e && e.response && e.response.status) || -1, 10);
|
|
@@ -232,11 +256,13 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
|
|
|
232
256
|
url,
|
|
233
257
|
parentUrl,
|
|
234
258
|
isImg: isImageUrl(url),
|
|
259
|
+
isWebPage: true,
|
|
235
260
|
ttl,
|
|
236
261
|
status,
|
|
237
262
|
message: e.message,
|
|
238
263
|
skipped: false,
|
|
239
264
|
html: null,
|
|
265
|
+
redirected: false,
|
|
240
266
|
};
|
|
241
267
|
}
|
|
242
268
|
}
|
|
@@ -245,7 +271,7 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
|
|
|
245
271
|
* @param url {string}
|
|
246
272
|
* @param webUrl {string}
|
|
247
273
|
* @param parentUrl {string | undefined}
|
|
248
|
-
* @return {UrlCheckResponse}
|
|
274
|
+
* @return {result: UrlCheckResponse|TestedUrlDto, fromCache: boolean}
|
|
249
275
|
*/
|
|
250
276
|
async function testUrl(url, webUrl, parentUrl = undefined) {
|
|
251
277
|
const indexInChecked = TESTED_URLS.findIndex((result) => result.url === url);
|
|
@@ -253,16 +279,26 @@ async function testUrl(url, webUrl, parentUrl = undefined) {
|
|
|
253
279
|
const result = await fetchUrl(url, webUrl, parentUrl);
|
|
254
280
|
TESTED_URLS.push({
|
|
255
281
|
isImg: result.isImg,
|
|
282
|
+
isWebPage: result.isWebPage,
|
|
256
283
|
message: result.message,
|
|
257
284
|
parentUrl: result.parentUrl,
|
|
258
285
|
skipped: result.skipped,
|
|
259
286
|
status: result.status,
|
|
260
287
|
ttl: result.ttl,
|
|
261
288
|
url: result.url,
|
|
289
|
+
redirected: result.redirected,
|
|
290
|
+
indexable:
|
|
291
|
+
result.isWebPage && typeof result.html === "string"
|
|
292
|
+
? cheerio.load(result.html)("meta[name='robots']").attr("content") !== "noindex"
|
|
293
|
+
: false,
|
|
294
|
+
canonicalUrl:
|
|
295
|
+
result.isWebPage && typeof result.html === "string"
|
|
296
|
+
? (cheerio.load(result.html)("link[rel='canonical']").attr("href") ?? null)
|
|
297
|
+
: null,
|
|
262
298
|
});
|
|
263
|
-
return result;
|
|
299
|
+
return { result, fromCache: false };
|
|
264
300
|
}
|
|
265
|
-
return TESTED_URLS[indexInChecked];
|
|
301
|
+
return { result: TESTED_URLS[indexInChecked], fromCache: true };
|
|
266
302
|
}
|
|
267
303
|
|
|
268
304
|
/**
|
|
@@ -283,14 +319,14 @@ async function testSitemapUrls(urls, webUrl, sitemapUrl, skip, withNested, withI
|
|
|
283
319
|
const url = urls[i];
|
|
284
320
|
const changedUrl = webUrl ? `${webUrl}${new URL(url).pathname}` : url;
|
|
285
321
|
|
|
286
|
-
const result = await testUrl(changedUrl, webUrl);
|
|
322
|
+
const { result, fromCache } = await testUrl(changedUrl, webUrl);
|
|
287
323
|
printProgress();
|
|
288
324
|
|
|
289
|
-
if (withNested && result.status === 200) {
|
|
325
|
+
if (withNested && !fromCache && result.status === 200) {
|
|
290
326
|
await testNestedUrls(result.html, changedUrl, i, webUrl ?? sitemapUrl.split("/").slice(0, 3).join("/"));
|
|
291
327
|
}
|
|
292
328
|
|
|
293
|
-
if (withImages && result.status === 200) {
|
|
329
|
+
if (withImages && !fromCache && result.status === 200) {
|
|
294
330
|
await testNestedImages(result.html, changedUrl, i, webUrl ?? sitemapUrl.split("/").slice(0, 3).join("/"));
|
|
295
331
|
}
|
|
296
332
|
}
|
|
@@ -304,13 +340,20 @@ async function testSitemapUrls(urls, webUrl, sitemapUrl, skip, withNested, withI
|
|
|
304
340
|
* @return {Promise<void>}
|
|
305
341
|
*/
|
|
306
342
|
async function testNestedUrls(html, parentUrl, parentIndex, webUrl) {
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
343
|
+
try {
|
|
344
|
+
const $ = cheerio.load(html.toString());
|
|
345
|
+
let urls = createCorrectLinks(
|
|
346
|
+
$("a[href]").map((i, node) => $(node).attr("href")),
|
|
347
|
+
webUrl,
|
|
348
|
+
);
|
|
349
|
+
|
|
350
|
+
// FIXME
|
|
351
|
+
urls = urls.filter((url) => url.startsWith(webUrl) || url.startsWith("/"));
|
|
312
352
|
|
|
313
|
-
|
|
353
|
+
await testNested(urls, parentIndex, parentUrl, createTabSpace() + URLS_LABEL, webUrl);
|
|
354
|
+
} catch (e) {
|
|
355
|
+
ERRORS.push(`Can't test all nested pages for ${parentUrl} - ${e.message}`);
|
|
356
|
+
}
|
|
314
357
|
}
|
|
315
358
|
|
|
316
359
|
/**
|
|
@@ -321,13 +364,17 @@ async function testNestedUrls(html, parentUrl, parentIndex, webUrl) {
|
|
|
321
364
|
* @return {Promise<void>}
|
|
322
365
|
*/
|
|
323
366
|
async function testNestedImages(html, parentUrl, parentIndex, webUrl) {
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
367
|
+
try {
|
|
368
|
+
const $ = cheerio.load(html);
|
|
369
|
+
const images = createCorrectLinks(
|
|
370
|
+
$("img[src]").map((i, node) => $(node).attr("src")),
|
|
371
|
+
webUrl,
|
|
372
|
+
);
|
|
329
373
|
|
|
330
|
-
|
|
374
|
+
await testNested(images, parentIndex, parentUrl, createTabSpace() + IMAGES_LABEL, webUrl);
|
|
375
|
+
} catch (e) {
|
|
376
|
+
ERRORS.push(`Can't test all nested images for ${parentUrl} - ${e.message}`);
|
|
377
|
+
}
|
|
331
378
|
}
|
|
332
379
|
|
|
333
380
|
/**
|
|
@@ -426,6 +473,24 @@ async function sendGoogleChatMessage(resultErrors, webhookUrl) {
|
|
|
426
473
|
);
|
|
427
474
|
}
|
|
428
475
|
|
|
476
|
+
/**
|
|
477
|
+
* @param webUrl {string}
|
|
478
|
+
* @return {string[]}
|
|
479
|
+
*/
|
|
480
|
+
function getPagesShouldBeInSitemap(webUrl) {
|
|
481
|
+
return TESTED_URLS.filter((u) => u.indexable)
|
|
482
|
+
.filter(
|
|
483
|
+
(dto) =>
|
|
484
|
+
(dto.canonicalUrl ?? dto.url).startsWith(webUrl) &&
|
|
485
|
+
!(dto.canonicalUrl ?? dto.url).includes("?") &&
|
|
486
|
+
dto.status === 200 &&
|
|
487
|
+
dto.ttl <= 1 &&
|
|
488
|
+
!dto.redirected &&
|
|
489
|
+
dto.isWebPage,
|
|
490
|
+
)
|
|
491
|
+
.map((url) => (url.canonicalUrl ?? url.url).toLowerCase());
|
|
492
|
+
}
|
|
493
|
+
|
|
429
494
|
/**
|
|
430
495
|
* @param sitemapUrl {string}
|
|
431
496
|
* @param skip {number}
|
|
@@ -453,17 +518,32 @@ module.exports = async function run(sitemapUrl, skip, withNested, withImages, go
|
|
|
453
518
|
}
|
|
454
519
|
|
|
455
520
|
const startTime = performance.now();
|
|
456
|
-
|
|
521
|
+
const sitemapUrls = await Sitemap.getSitemap(sitemapUrl);
|
|
522
|
+
await testSitemapUrls(sitemapUrls, webUrl, sitemapUrl, skip, withNested, withImages);
|
|
457
523
|
const finishTime = performance.now();
|
|
458
524
|
|
|
525
|
+
const shouldBeInSitemap = getPagesShouldBeInSitemap(webUrl);
|
|
526
|
+
|
|
459
527
|
const errors = TESTED_URLS.filter((r) => r.status !== 200 && r.skipped === false);
|
|
528
|
+
const duplicates = [...new Set(sitemapUrls.filter((item, index, self) => self.indexOf(item) !== index))];
|
|
460
529
|
const skippedUrls = TESTED_URLS.filter((r) => r.status !== 200 && r.skipped === true);
|
|
461
530
|
const ok = TESTED_URLS.filter((r) => r.status === 200);
|
|
531
|
+
const missingInSitemap = shouldBeInSitemap.filter((testedUrl) => !sitemapUrls.includes(testedUrl));
|
|
462
532
|
|
|
463
|
-
if (errors.length > 0) {
|
|
533
|
+
if (missingInSitemap.length > 0 || duplicates.length > 0 || errors.length > 0 || ERRORS.length > 0) {
|
|
534
|
+
const duplicatesText = duplicates.map((url) => `${createTabSpace()}${url}`).join("\n");
|
|
535
|
+
const missingText = missingInSitemap.map((url) => `${createTabSpace()}${url}`).join("\n");
|
|
464
536
|
const errorText = createErrorResult(errors);
|
|
465
|
-
|
|
466
|
-
|
|
537
|
+
if (duplicatesText) {
|
|
538
|
+
logErrors(duplicatesText, "\n\n\nDuplicated pages in sitemap:\n");
|
|
539
|
+
}
|
|
540
|
+
if (missingText) {
|
|
541
|
+
logErrors(missingText, "\n\n\nMissing pages in sitemap:\n");
|
|
542
|
+
}
|
|
543
|
+
if (errorText) {
|
|
544
|
+
logErrors(errorText, "\n\n\nErrors:\n");
|
|
545
|
+
}
|
|
546
|
+
await sendGoogleChatMessage(duplicatesText + missingText + errorText, googleWebhookUrl);
|
|
467
547
|
}
|
|
468
548
|
|
|
469
549
|
if (skippedUrls.length > 0) {
|