@uxf/scripts 11.61.5 → 11.62.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/uxf-sitemap-check/index.js +76 -13
package/package.json
CHANGED
|
@@ -8,13 +8,20 @@ const robotsTxtParser = require("robots-txt-parser");
|
|
|
8
8
|
const { HTTP_USERNAME, HTTP_PASSWORD } = process.env;
|
|
9
9
|
|
|
10
10
|
/**
|
|
11
|
-
* @typedef {{parentUrl: (string | undefined), isImg: boolean, ttl: number, url: string, status: number, message: (string | undefined), skipped: boolean, html: (string | null)}} UrlCheckResponse
|
|
11
|
+
* @typedef {{parentUrl: (string | undefined), isImg: boolean, isWebPage: boolean, ttl: number, url: string, status: number, message: (string | undefined), skipped: boolean, html: (string | null), redirected: boolean}} UrlCheckResponse
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
/**
|
|
15
|
+
* @typedef {{url: string, parentUrl: (string | undefined), canonicalUrl: (string | null), isImg: boolean, isWebPage: boolean, ttl: number, status: number, message: (string | undefined), skipped: boolean, indexable: boolean, redirected: boolean}} TestedUrlDto
|
|
12
16
|
*/
|
|
13
17
|
|
|
14
18
|
const MAX_TTL = 3;
|
|
15
19
|
const IMAGES_LABEL = "🏞 Images:";
|
|
16
20
|
const URLS_LABEL = "🔗 Links:";
|
|
17
21
|
|
|
22
|
+
/**
|
|
23
|
+
* @type TestedUrlDto[]
|
|
24
|
+
*/
|
|
18
25
|
const TESTED_URLS = [];
|
|
19
26
|
const URLS_TO_CHECK = new Set();
|
|
20
27
|
const ERRORS = [];
|
|
@@ -172,11 +179,13 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
|
|
|
172
179
|
url,
|
|
173
180
|
parentUrl,
|
|
174
181
|
isImg: isImageUrl(url),
|
|
182
|
+
isWebPage: false,
|
|
175
183
|
ttl,
|
|
176
184
|
status: 0,
|
|
177
185
|
message: "invalid url: " + url,
|
|
178
186
|
skipped: false,
|
|
179
187
|
html: null,
|
|
188
|
+
redirected: false,
|
|
180
189
|
};
|
|
181
190
|
}
|
|
182
191
|
|
|
@@ -193,11 +202,13 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
|
|
|
193
202
|
url,
|
|
194
203
|
parentUrl,
|
|
195
204
|
isImg: isImageUrl(url),
|
|
205
|
+
isWebPage: true,
|
|
196
206
|
ttl,
|
|
197
207
|
status: 0,
|
|
198
208
|
message: "blocked by robots.txt",
|
|
199
209
|
skipped: true,
|
|
200
210
|
html: null,
|
|
211
|
+
redirected: false,
|
|
201
212
|
};
|
|
202
213
|
}
|
|
203
214
|
}
|
|
@@ -210,11 +221,13 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
|
|
|
210
221
|
url,
|
|
211
222
|
parentUrl,
|
|
212
223
|
isImg: isImageUrl(url),
|
|
224
|
+
isWebPage: true,
|
|
213
225
|
ttl,
|
|
214
226
|
status: errorStatus,
|
|
215
227
|
message: e.message,
|
|
216
228
|
skipped: errorStatus !== undefined,
|
|
217
229
|
html: null,
|
|
230
|
+
redirected: false,
|
|
218
231
|
};
|
|
219
232
|
}
|
|
220
233
|
|
|
@@ -229,10 +242,12 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
|
|
|
229
242
|
url,
|
|
230
243
|
parentUrl,
|
|
231
244
|
isImg: isImageUrl(url),
|
|
245
|
+
isWebPage: response.headers.get("content-type").includes("text/html"),
|
|
232
246
|
ttl,
|
|
233
247
|
status: response.status,
|
|
234
248
|
skipped: false,
|
|
235
249
|
html: await response.text(),
|
|
250
|
+
redirected: response.redirected,
|
|
236
251
|
};
|
|
237
252
|
} catch (e) {
|
|
238
253
|
const status = Number.parseInt((e && e.response && e.response.status) || -1, 10);
|
|
@@ -241,11 +256,13 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
|
|
|
241
256
|
url,
|
|
242
257
|
parentUrl,
|
|
243
258
|
isImg: isImageUrl(url),
|
|
259
|
+
isWebPage: true,
|
|
244
260
|
ttl,
|
|
245
261
|
status,
|
|
246
262
|
message: e.message,
|
|
247
263
|
skipped: false,
|
|
248
264
|
html: null,
|
|
265
|
+
redirected: false,
|
|
249
266
|
};
|
|
250
267
|
}
|
|
251
268
|
}
|
|
@@ -254,7 +271,7 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
|
|
|
254
271
|
* @param url {string}
|
|
255
272
|
* @param webUrl {string}
|
|
256
273
|
* @param parentUrl {string | undefined}
|
|
257
|
-
* @return {UrlCheckResponse}
|
|
274
|
+
* @return {Promise<{result: (UrlCheckResponse | TestedUrlDto), fromCache: boolean}>}
|
|
258
275
|
*/
|
|
259
276
|
async function testUrl(url, webUrl, parentUrl = undefined) {
|
|
260
277
|
const indexInChecked = TESTED_URLS.findIndex((result) => result.url === url);
|
|
@@ -262,16 +279,26 @@ async function testUrl(url, webUrl, parentUrl = undefined) {
|
|
|
262
279
|
const result = await fetchUrl(url, webUrl, parentUrl);
|
|
263
280
|
TESTED_URLS.push({
|
|
264
281
|
isImg: result.isImg,
|
|
282
|
+
isWebPage: result.isWebPage,
|
|
265
283
|
message: result.message,
|
|
266
284
|
parentUrl: result.parentUrl,
|
|
267
285
|
skipped: result.skipped,
|
|
268
286
|
status: result.status,
|
|
269
287
|
ttl: result.ttl,
|
|
270
288
|
url: result.url,
|
|
289
|
+
redirected: result.redirected,
|
|
290
|
+
indexable:
|
|
291
|
+
result.isWebPage && typeof result.html === "string"
|
|
292
|
+
? cheerio.load(result.html)("meta[name='robots']").attr("content") !== "noindex"
|
|
293
|
+
: false,
|
|
294
|
+
canonicalUrl:
|
|
295
|
+
result.isWebPage && typeof result.html === "string"
|
|
296
|
+
? (cheerio.load(result.html)("link[rel='canonical']").attr("href") ?? null)
|
|
297
|
+
: null,
|
|
271
298
|
});
|
|
272
|
-
return result;
|
|
299
|
+
return { result, fromCache: false };
|
|
273
300
|
}
|
|
274
|
-
return TESTED_URLS[indexInChecked];
|
|
301
|
+
return { result: TESTED_URLS[indexInChecked], fromCache: true };
|
|
275
302
|
}
|
|
276
303
|
|
|
277
304
|
/**
|
|
@@ -292,14 +319,14 @@ async function testSitemapUrls(urls, webUrl, sitemapUrl, skip, withNested, withI
|
|
|
292
319
|
const url = urls[i];
|
|
293
320
|
const changedUrl = webUrl ? `${webUrl}${new URL(url).pathname}` : url;
|
|
294
321
|
|
|
295
|
-
const result = await testUrl(changedUrl, webUrl);
|
|
322
|
+
const { result, fromCache } = await testUrl(changedUrl, webUrl);
|
|
296
323
|
printProgress();
|
|
297
324
|
|
|
298
|
-
if (withNested && result.status === 200) {
|
|
325
|
+
if (withNested && !fromCache && result.status === 200) {
|
|
299
326
|
await testNestedUrls(result.html, changedUrl, i, webUrl ?? sitemapUrl.split("/").slice(0, 3).join("/"));
|
|
300
327
|
}
|
|
301
328
|
|
|
302
|
-
if (withImages && result.status === 200) {
|
|
329
|
+
if (withImages && !fromCache && result.status === 200) {
|
|
303
330
|
await testNestedImages(result.html, changedUrl, i, webUrl ?? sitemapUrl.split("/").slice(0, 3).join("/"));
|
|
304
331
|
}
|
|
305
332
|
}
|
|
@@ -314,12 +341,15 @@ async function testSitemapUrls(urls, webUrl, sitemapUrl, skip, withNested, withI
|
|
|
314
341
|
*/
|
|
315
342
|
async function testNestedUrls(html, parentUrl, parentIndex, webUrl) {
|
|
316
343
|
try {
|
|
317
|
-
const $ = cheerio.load(html);
|
|
318
|
-
|
|
344
|
+
const $ = cheerio.load(html.toString());
|
|
345
|
+
let urls = createCorrectLinks(
|
|
319
346
|
$("a[href]").map((i, node) => $(node).attr("href")),
|
|
320
347
|
webUrl,
|
|
321
348
|
);
|
|
322
349
|
|
|
350
|
+
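// FIXME: only links starting with webUrl or "/" are kept for nested checks - confirm whether other links should be tested as well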
|
|
351
|
+
urls = urls.filter((url) => url.startsWith(webUrl) || url.startsWith("/"));
|
|
352
|
+
|
|
323
353
|
await testNested(urls, parentIndex, parentUrl, createTabSpace() + URLS_LABEL, webUrl);
|
|
324
354
|
} catch (e) {
|
|
325
355
|
ERRORS.push(`Can't test all nested pages for ${parentUrl} - ${e.message}`);
|
|
@@ -443,6 +473,24 @@ async function sendGoogleChatMessage(resultErrors, webhookUrl) {
|
|
|
443
473
|
);
|
|
444
474
|
}
|
|
445
475
|
|
|
476
|
+
/**
 * Collects, from the URLs tested so far, the pages that qualify for the sitemap.
 *
 * A tested entry qualifies when it is indexable, is a web page, answered with
 * status 200, has `ttl <= 1`, was not redirected, and its effective URL (the
 * canonical URL when one is set, otherwise the tested URL) starts with `webUrl`
 * and contains no "?".
 *
 * @param webUrl {string} prefix the effective URL has to start with
 * @return {string[]} lower-cased effective URLs, each listed once, in first-seen order
 */
function getPagesShouldBeInSitemap(webUrl) {
    const pages = new Set();

    for (const dto of TESTED_URLS) {
        const effectiveUrl = dto.canonicalUrl ?? dto.url;
        const qualifies =
            dto.indexable &&
            effectiveUrl.startsWith(webUrl) &&
            !effectiveUrl.includes("?") &&
            dto.status === 200 &&
            dto.ttl <= 1 &&
            !dto.redirected &&
            dto.isWebPage;

        if (qualifies) {
            // Several tested pages can point at the same canonical URL; the Set keeps
            // one entry per URL so a page is never reported more than once.
            // NOTE(review): the caller matches these lower-cased values against the
            // sitemap entries with includes() - confirm those entries are lower-case too.
            pages.add(effectiveUrl.toLowerCase());
        }
    }

    return [...pages];
}
|
|
493
|
+
|
|
446
494
|
/**
|
|
447
495
|
* @param sitemapUrl {string}
|
|
448
496
|
* @param skip {number}
|
|
@@ -470,17 +518,32 @@ module.exports = async function run(sitemapUrl, skip, withNested, withImages, go
|
|
|
470
518
|
}
|
|
471
519
|
|
|
472
520
|
const startTime = performance.now();
|
|
473
|
-
|
|
521
|
+
const sitemapUrls = await Sitemap.getSitemap(sitemapUrl);
|
|
522
|
+
await testSitemapUrls(sitemapUrls, webUrl, sitemapUrl, skip, withNested, withImages);
|
|
474
523
|
const finishTime = performance.now();
|
|
475
524
|
|
|
525
|
+
const shouldBeInSitemap = getPagesShouldBeInSitemap(webUrl);
|
|
526
|
+
|
|
476
527
|
const errors = TESTED_URLS.filter((r) => r.status !== 200 && r.skipped === false);
|
|
528
|
+
const duplicates = [...new Set(sitemapUrls.filter((item, index, self) => self.indexOf(item) !== index))];
|
|
477
529
|
const skippedUrls = TESTED_URLS.filter((r) => r.status !== 200 && r.skipped === true);
|
|
478
530
|
const ok = TESTED_URLS.filter((r) => r.status === 200);
|
|
531
|
+
const missingInSitemap = shouldBeInSitemap.filter((testedUrl) => !sitemapUrls.includes(testedUrl));
|
|
479
532
|
|
|
480
|
-
if (errors.length > 0 || ERRORS.length > 0) {
|
|
533
|
+
if (missingInSitemap.length > 0 || duplicates.length > 0 || errors.length > 0 || ERRORS.length > 0) {
|
|
534
|
+
const duplicatesText = duplicates.map((url) => `${createTabSpace()}${url}`).join("\n");
|
|
535
|
+
const missingText = missingInSitemap.map((url) => `${createTabSpace()}${url}`).join("\n");
|
|
481
536
|
const errorText = createErrorResult(errors);
|
|
482
|
-
|
|
483
|
-
|
|
537
|
+
if (duplicatesText) {
|
|
538
|
+
logErrors(duplicatesText, "\n\n\nDuplicated pages in sitemap:\n");
|
|
539
|
+
}
|
|
540
|
+
if (missingText) {
|
|
541
|
+
logErrors(missingText, "\n\n\nMissing pages in sitemap:\n");
|
|
542
|
+
}
|
|
543
|
+
if (errorText) {
|
|
544
|
+
logErrors(errorText, "\n\n\nErrors:\n");
|
|
545
|
+
}
|
|
546
|
+
await sendGoogleChatMessage(duplicatesText + missingText + errorText, googleWebhookUrl);
|
|
484
547
|
}
|
|
485
548
|
|
|
486
549
|
if (skippedUrls.length > 0) {
|