@uxf/scripts 11.61.5 → 11.62.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/uxf-sitemap-check/index.js +85 -15
package/package.json
CHANGED
|
@@ -8,13 +8,24 @@ const robotsTxtParser = require("robots-txt-parser");
|
|
|
8
8
|
const { HTTP_USERNAME, HTTP_PASSWORD } = process.env;
|
|
9
9
|
|
|
10
10
|
/**
|
|
11
|
-
* @typedef {{parentUrl: (string | undefined), isImg: boolean, ttl: number, url: string, status: number, message: (string | undefined), skipped: boolean, html: (string | null)}} UrlCheckResponse
|
|
11
|
+
* @typedef {{parentUrl: (string | undefined), isImg: boolean, isWebPage: boolean, ttl: number, url: string, status: number, message: (string | undefined), skipped: boolean, html: (string | null), redirected: boolean}} UrlCheckResponse
|
|
12
12
|
*/
|
|
13
13
|
|
|
14
|
+
/**
|
|
15
|
+
* @typedef {{url: string, parentUrl: (string | undefined), canonicalUrl: (string | null), isImg: boolean, isWebPage: boolean, ttl: number, status: number, message: (string | undefined), skipped: boolean, indexable: boolean, redirected: boolean}} TestedUrlDto
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
const DUPLICATES_TITLE = "\n\n\nDuplicated pages in sitemap:\n";
|
|
19
|
+
const MISSING_TITLE = "\n\n\nMissing pages in sitemap:\n";
|
|
20
|
+
// Heading for the error section; passed to logErrors and prepended to the error text in the chat message.
const ERROR_TITLE = "\n\n\nErrors:\n";
|
|
21
|
+
|
|
14
22
|
const MAX_TTL = 3;
|
|
15
23
|
const IMAGES_LABEL = "🏞 Images:";
|
|
16
24
|
const URLS_LABEL = "🔗 Links:";
|
|
17
25
|
|
|
26
|
+
/**
|
|
27
|
+
* @type TestedUrlDto[]
|
|
28
|
+
*/
|
|
18
29
|
const TESTED_URLS = [];
|
|
19
30
|
const URLS_TO_CHECK = new Set();
|
|
20
31
|
const ERRORS = [];
|
|
@@ -68,7 +79,7 @@ function createErrorList(errors) {
|
|
|
68
79
|
}
|
|
69
80
|
|
|
70
81
|
/**
|
|
71
|
-
* @param errors {
|
|
82
|
+
* @param errors {TestedUrlDto[]}
|
|
72
83
|
* @return {string}
|
|
73
84
|
*/
|
|
74
85
|
function createErrorResult(errors) {
|
|
@@ -116,7 +127,7 @@ function createErrorResult(errors) {
|
|
|
116
127
|
}
|
|
117
128
|
|
|
118
129
|
/**
|
|
119
|
-
* @param skippedUrls {
|
|
130
|
+
* @param skippedUrls {TestedUrlDto[]}
|
|
120
131
|
* @return {string}
|
|
121
132
|
*/
|
|
122
133
|
function createSkippedResult(skippedUrls) {
|
|
@@ -172,11 +183,13 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
|
|
|
172
183
|
url,
|
|
173
184
|
parentUrl,
|
|
174
185
|
isImg: isImageUrl(url),
|
|
186
|
+
isWebPage: false,
|
|
175
187
|
ttl,
|
|
176
188
|
status: 0,
|
|
177
189
|
message: "invalid url: " + url,
|
|
178
190
|
skipped: false,
|
|
179
191
|
html: null,
|
|
192
|
+
redirected: false,
|
|
180
193
|
};
|
|
181
194
|
}
|
|
182
195
|
|
|
@@ -193,11 +206,13 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
|
|
|
193
206
|
url,
|
|
194
207
|
parentUrl,
|
|
195
208
|
isImg: isImageUrl(url),
|
|
209
|
+
isWebPage: true,
|
|
196
210
|
ttl,
|
|
197
211
|
status: 0,
|
|
198
212
|
message: "blocked by robots.txt",
|
|
199
213
|
skipped: true,
|
|
200
214
|
html: null,
|
|
215
|
+
redirected: false,
|
|
201
216
|
};
|
|
202
217
|
}
|
|
203
218
|
}
|
|
@@ -210,11 +225,13 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
|
|
|
210
225
|
url,
|
|
211
226
|
parentUrl,
|
|
212
227
|
isImg: isImageUrl(url),
|
|
228
|
+
isWebPage: true,
|
|
213
229
|
ttl,
|
|
214
230
|
status: errorStatus,
|
|
215
231
|
message: e.message,
|
|
216
232
|
skipped: errorStatus !== undefined,
|
|
217
233
|
html: null,
|
|
234
|
+
redirected: false,
|
|
218
235
|
};
|
|
219
236
|
}
|
|
220
237
|
|
|
@@ -229,10 +246,12 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
|
|
|
229
246
|
url,
|
|
230
247
|
parentUrl,
|
|
231
248
|
isImg: isImageUrl(url),
|
|
249
|
+
isWebPage: response.headers.get("content-type").includes("text/html"),
|
|
232
250
|
ttl,
|
|
233
251
|
status: response.status,
|
|
234
252
|
skipped: false,
|
|
235
253
|
html: await response.text(),
|
|
254
|
+
redirected: response.redirected,
|
|
236
255
|
};
|
|
237
256
|
} catch (e) {
|
|
238
257
|
const status = Number.parseInt((e && e.response && e.response.status) || -1, 10);
|
|
@@ -241,11 +260,13 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
|
|
|
241
260
|
url,
|
|
242
261
|
parentUrl,
|
|
243
262
|
isImg: isImageUrl(url),
|
|
263
|
+
isWebPage: true,
|
|
244
264
|
ttl,
|
|
245
265
|
status,
|
|
246
266
|
message: e.message,
|
|
247
267
|
skipped: false,
|
|
248
268
|
html: null,
|
|
269
|
+
redirected: false,
|
|
249
270
|
};
|
|
250
271
|
}
|
|
251
272
|
}
|
|
@@ -254,7 +275,7 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
|
|
|
254
275
|
* @param url {string}
|
|
255
276
|
* @param webUrl {string}
|
|
256
277
|
* @param parentUrl {string | undefined}
|
|
257
|
-
* @return {UrlCheckResponse}
|
|
278
|
+
* @return {Promise<{result: (UrlCheckResponse|TestedUrlDto), fromCache: boolean}>}
|
|
258
279
|
*/
|
|
259
280
|
async function testUrl(url, webUrl, parentUrl = undefined) {
|
|
260
281
|
const indexInChecked = TESTED_URLS.findIndex((result) => result.url === url);
|
|
@@ -262,16 +283,26 @@ async function testUrl(url, webUrl, parentUrl = undefined) {
|
|
|
262
283
|
const result = await fetchUrl(url, webUrl, parentUrl);
|
|
263
284
|
TESTED_URLS.push({
|
|
264
285
|
isImg: result.isImg,
|
|
286
|
+
isWebPage: result.isWebPage,
|
|
265
287
|
message: result.message,
|
|
266
288
|
parentUrl: result.parentUrl,
|
|
267
289
|
skipped: result.skipped,
|
|
268
290
|
status: result.status,
|
|
269
291
|
ttl: result.ttl,
|
|
270
292
|
url: result.url,
|
|
293
|
+
redirected: result.redirected,
|
|
294
|
+
indexable:
|
|
295
|
+
result.isWebPage && typeof result.html === "string"
|
|
296
|
+
? cheerio.load(result.html)("meta[name='robots']").attr("content") !== "noindex"
|
|
297
|
+
: false,
|
|
298
|
+
canonicalUrl:
|
|
299
|
+
result.isWebPage && typeof result.html === "string"
|
|
300
|
+
? (cheerio.load(result.html)("link[rel='canonical']").attr("href") ?? null)
|
|
301
|
+
: null,
|
|
271
302
|
});
|
|
272
|
-
return result;
|
|
303
|
+
return { result, fromCache: false };
|
|
273
304
|
}
|
|
274
|
-
return TESTED_URLS[indexInChecked];
|
|
305
|
+
return { result: TESTED_URLS[indexInChecked], fromCache: true };
|
|
275
306
|
}
|
|
276
307
|
|
|
277
308
|
/**
|
|
@@ -292,14 +323,14 @@ async function testSitemapUrls(urls, webUrl, sitemapUrl, skip, withNested, withI
|
|
|
292
323
|
const url = urls[i];
|
|
293
324
|
const changedUrl = webUrl ? `${webUrl}${new URL(url).pathname}` : url;
|
|
294
325
|
|
|
295
|
-
const result = await testUrl(changedUrl, webUrl);
|
|
326
|
+
const { result, fromCache } = await testUrl(changedUrl, webUrl);
|
|
296
327
|
printProgress();
|
|
297
328
|
|
|
298
|
-
if (withNested && result.status === 200) {
|
|
329
|
+
if (withNested && !fromCache && result.status === 200) {
|
|
299
330
|
await testNestedUrls(result.html, changedUrl, i, webUrl ?? sitemapUrl.split("/").slice(0, 3).join("/"));
|
|
300
331
|
}
|
|
301
332
|
|
|
302
|
-
if (withImages && result.status === 200) {
|
|
333
|
+
if (withImages && !fromCache && result.status === 200) {
|
|
303
334
|
await testNestedImages(result.html, changedUrl, i, webUrl ?? sitemapUrl.split("/").slice(0, 3).join("/"));
|
|
304
335
|
}
|
|
305
336
|
}
|
|
@@ -314,8 +345,8 @@ async function testSitemapUrls(urls, webUrl, sitemapUrl, skip, withNested, withI
|
|
|
314
345
|
*/
|
|
315
346
|
async function testNestedUrls(html, parentUrl, parentIndex, webUrl) {
|
|
316
347
|
try {
|
|
317
|
-
const $ = cheerio.load(html);
|
|
318
|
-
|
|
348
|
+
const $ = cheerio.load(html.toString());
|
|
349
|
+
let urls = createCorrectLinks(
|
|
319
350
|
$("a[href]").map((i, node) => $(node).attr("href")),
|
|
320
351
|
webUrl,
|
|
321
352
|
);
|
|
@@ -443,6 +474,24 @@ async function sendGoogleChatMessage(resultErrors, webhookUrl) {
|
|
|
443
474
|
);
|
|
444
475
|
}
|
|
445
476
|
|
|
477
|
+
/**
 * Collects the tested URLs that qualify as pages expected to be listed in the sitemap.
 *
 * An entry of TESTED_URLS qualifies when all of the following hold:
 *  - it is flagged `indexable` and `isWebPage`,
 *  - its status is 200, its `ttl` is at most 1 and it was not redirected,
 *  - its effective URL (the canonical URL when present, otherwise the tested URL)
 *    starts with `webUrl` and contains no `?`.
 *
 * Effective URLs are lower-cased. Several tested URLs can share one canonical URL,
 * so the result is de-duplicated (first-seen order is kept) to avoid reporting the
 * same page more than once.
 *
 * @param webUrl {string}
 * @return {string[]}
 */
function getPagesShouldBeInSitemap(webUrl) {
    const pages = new Set();

    for (const dto of TESTED_URLS) {
        const effectiveUrl = dto.canonicalUrl ?? dto.url;
        const qualifies =
            dto.indexable &&
            effectiveUrl.startsWith(webUrl) &&
            !effectiveUrl.includes("?") &&
            dto.status === 200 &&
            dto.ttl <= 1 &&
            !dto.redirected &&
            dto.isWebPage;

        if (qualifies) {
            // A Set keeps insertion order, so the first occurrence decides the position.
            pages.add(effectiveUrl.toLowerCase());
        }
    }

    return [...pages];
}
|
|
494
|
+
|
|
446
495
|
/**
|
|
447
496
|
* @param sitemapUrl {string}
|
|
448
497
|
* @param skip {number}
|
|
@@ -470,17 +519,38 @@ module.exports = async function run(sitemapUrl, skip, withNested, withImages, go
|
|
|
470
519
|
}
|
|
471
520
|
|
|
472
521
|
const startTime = performance.now();
|
|
473
|
-
|
|
522
|
+
const sitemapUrls = await Sitemap.getSitemap(sitemapUrl);
|
|
523
|
+
await testSitemapUrls(sitemapUrls, webUrl, sitemapUrl, skip, withNested, withImages);
|
|
474
524
|
const finishTime = performance.now();
|
|
475
525
|
|
|
526
|
+
const shouldBeInSitemap = getPagesShouldBeInSitemap(webUrl);
|
|
527
|
+
|
|
476
528
|
const errors = TESTED_URLS.filter((r) => r.status !== 200 && r.skipped === false);
|
|
529
|
+
const duplicates = [...new Set(sitemapUrls.filter((item, index, self) => self.indexOf(item) !== index))];
|
|
477
530
|
const skippedUrls = TESTED_URLS.filter((r) => r.status !== 200 && r.skipped === true);
|
|
478
531
|
const ok = TESTED_URLS.filter((r) => r.status === 200);
|
|
532
|
+
const missingInSitemap = shouldBeInSitemap.filter((testedUrl) => !sitemapUrls.includes(testedUrl));
|
|
479
533
|
|
|
480
|
-
if (errors.length > 0 || ERRORS.length > 0) {
|
|
534
|
+
if (missingInSitemap.length > 0 || duplicates.length > 0 || errors.length > 0 || ERRORS.length > 0) {
|
|
535
|
+
let chatMessage = "";
|
|
536
|
+
const duplicatesText = duplicates.map((url) => `${createTabSpace()}${url}`).join("\n");
|
|
537
|
+
const missingText = missingInSitemap.map((url) => `${createTabSpace()}${url}`).join("\n");
|
|
481
538
|
const errorText = createErrorResult(errors);
|
|
482
|
-
|
|
483
|
-
|
|
539
|
+
|
|
540
|
+
if (duplicatesText) {
|
|
541
|
+
logErrors(duplicatesText, DUPLICATES_TITLE);
|
|
542
|
+
chatMessage += DUPLICATES_TITLE + duplicatesText;
|
|
543
|
+
}
|
|
544
|
+
if (missingText) {
|
|
545
|
+
logErrors(missingText, MISSING_TITLE);
|
|
546
|
+
chatMessage += MISSING_TITLE + missingText;
|
|
547
|
+
}
|
|
548
|
+
if (errorText) {
|
|
549
|
+
logErrors(errorText, ERROR_TITLE);
|
|
550
|
+
chatMessage += ERROR_TITLE + errorText;
|
|
551
|
+
}
|
|
552
|
+
|
|
553
|
+
await sendGoogleChatMessage(chatMessage, googleWebhookUrl);
|
|
484
554
|
}
|
|
485
555
|
|
|
486
556
|
if (skippedUrls.length > 0) {
|