@uxf/scripts 11.61.5 → 11.62.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@uxf/scripts",
3
- "version": "11.61.5",
3
+ "version": "11.62.1",
4
4
  "description": "",
5
5
  "main": "index.js",
6
6
  "bin": {
@@ -8,13 +8,24 @@ const robotsTxtParser = require("robots-txt-parser");
8
8
  const { HTTP_USERNAME, HTTP_PASSWORD } = process.env;
9
9
 
10
10
  /**
11
- * @typedef {{parentUrl: (string | undefined), isImg: boolean, ttl: number, url: string, status: number, message: (string | undefined), skipped: boolean, html: (string | null)}} UrlCheckResponse
11
+ * @typedef {{parentUrl: (string | undefined), isImg: boolean, isWebPage: boolean, ttl: number, url: string, status: number, message: (string | undefined), skipped: boolean, html: (string | null), redirected: boolean}} UrlCheckResponse
12
12
  */
13
13
 
14
+ /**
15
+ * @typedef {{url: string, parentUrl: (string | undefined), canonicalUrl: (string | null), isImg: boolean, isWebPage: boolean, ttl: number, status: number, message: (string | undefined), skipped: boolean, indexable: boolean, redirected: boolean}} TestedUrlDto
16
+ */
17
+
// Section headings; each is passed to logErrors and prepended to its section of the chat message.
const DUPLICATES_TITLE = "\n\n\nDuplicated pages in sitemap:\n";
const MISSING_TITLE = "\n\n\nMissing pages in sitemap:\n";
const ERROR_TITLE = "\n\n\nErrors:\n";
21
+
14
22
  const MAX_TTL = 3;
15
23
  const IMAGES_LABEL = "🏞 Images:";
16
24
  const URLS_LABEL = "🔗 Links:";
17
25
 
26
+ /**
27
+ * @type TestedUrlDto[]
28
+ */
18
29
  const TESTED_URLS = [];
19
30
  const URLS_TO_CHECK = new Set();
20
31
  const ERRORS = [];
@@ -68,7 +79,7 @@ function createErrorList(errors) {
68
79
  }
69
80
 
70
81
  /**
71
- * @param errors {UrlCheckResponse[]}
82
+ * @param errors {TestedUrlDto[]}
72
83
  * @return {string}
73
84
  */
74
85
  function createErrorResult(errors) {
@@ -116,7 +127,7 @@ function createErrorResult(errors) {
116
127
  }
117
128
 
118
129
  /**
119
- * @param skippedUrls {UrlCheckResponse[]}
130
+ * @param skippedUrls {TestedUrlDto[]}
120
131
  * @return {string}
121
132
  */
122
133
  function createSkippedResult(skippedUrls) {
@@ -172,11 +183,13 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
172
183
  url,
173
184
  parentUrl,
174
185
  isImg: isImageUrl(url),
186
+ isWebPage: false,
175
187
  ttl,
176
188
  status: 0,
177
189
  message: "invalid url: " + url,
178
190
  skipped: false,
179
191
  html: null,
192
+ redirected: false,
180
193
  };
181
194
  }
182
195
 
@@ -193,11 +206,13 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
193
206
  url,
194
207
  parentUrl,
195
208
  isImg: isImageUrl(url),
209
+ isWebPage: true,
196
210
  ttl,
197
211
  status: 0,
198
212
  message: "blocked by robots.txt",
199
213
  skipped: true,
200
214
  html: null,
215
+ redirected: false,
201
216
  };
202
217
  }
203
218
  }
@@ -210,11 +225,13 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
210
225
  url,
211
226
  parentUrl,
212
227
  isImg: isImageUrl(url),
228
+ isWebPage: true,
213
229
  ttl,
214
230
  status: errorStatus,
215
231
  message: e.message,
216
232
  skipped: errorStatus !== undefined,
217
233
  html: null,
234
+ redirected: false,
218
235
  };
219
236
  }
220
237
 
@@ -229,10 +246,12 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
229
246
  url,
230
247
  parentUrl,
231
248
  isImg: isImageUrl(url),
249
+ isWebPage: response.headers.get("content-type").includes("text/html"),
232
250
  ttl,
233
251
  status: response.status,
234
252
  skipped: false,
235
253
  html: await response.text(),
254
+ redirected: response.redirected,
236
255
  };
237
256
  } catch (e) {
238
257
  const status = Number.parseInt((e && e.response && e.response.status) || -1, 10);
@@ -241,11 +260,13 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
241
260
  url,
242
261
  parentUrl,
243
262
  isImg: isImageUrl(url),
263
+ isWebPage: true,
244
264
  ttl,
245
265
  status,
246
266
  message: e.message,
247
267
  skipped: false,
248
268
  html: null,
269
+ redirected: false,
249
270
  };
250
271
  }
251
272
  }
@@ -254,7 +275,7 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
254
275
  * @param url {string}
255
276
  * @param webUrl {string}
256
277
  * @param parentUrl {string | undefined}
257
- * @return {UrlCheckResponse}
278
+ * @return {Promise<{result: (UrlCheckResponse | TestedUrlDto), fromCache: boolean}>}
258
279
  */
259
280
  async function testUrl(url, webUrl, parentUrl = undefined) {
260
281
  const indexInChecked = TESTED_URLS.findIndex((result) => result.url === url);
@@ -262,16 +283,26 @@ async function testUrl(url, webUrl, parentUrl = undefined) {
262
283
  const result = await fetchUrl(url, webUrl, parentUrl);
263
284
  TESTED_URLS.push({
264
285
  isImg: result.isImg,
286
+ isWebPage: result.isWebPage,
265
287
  message: result.message,
266
288
  parentUrl: result.parentUrl,
267
289
  skipped: result.skipped,
268
290
  status: result.status,
269
291
  ttl: result.ttl,
270
292
  url: result.url,
293
+ redirected: result.redirected,
294
+ indexable:
295
+ result.isWebPage && typeof result.html === "string"
296
+ ? cheerio.load(result.html)("meta[name='robots']").attr("content") !== "noindex"
297
+ : false,
298
+ canonicalUrl:
299
+ result.isWebPage && typeof result.html === "string"
300
+ ? (cheerio.load(result.html)("link[rel='canonical']").attr("href") ?? null)
301
+ : null,
271
302
  });
272
- return result;
303
+ return { result, fromCache: false };
273
304
  }
274
- return TESTED_URLS[indexInChecked];
305
+ return { result: TESTED_URLS[indexInChecked], fromCache: true };
275
306
  }
276
307
 
277
308
  /**
@@ -292,14 +323,14 @@ async function testSitemapUrls(urls, webUrl, sitemapUrl, skip, withNested, withI
292
323
  const url = urls[i];
293
324
  const changedUrl = webUrl ? `${webUrl}${new URL(url).pathname}` : url;
294
325
 
295
- const result = await testUrl(changedUrl, webUrl);
326
+ const { result, fromCache } = await testUrl(changedUrl, webUrl);
296
327
  printProgress();
297
328
 
298
- if (withNested && result.status === 200) {
329
+ if (withNested && !fromCache && result.status === 200) {
299
330
  await testNestedUrls(result.html, changedUrl, i, webUrl ?? sitemapUrl.split("/").slice(0, 3).join("/"));
300
331
  }
301
332
 
302
- if (withImages && result.status === 200) {
333
+ if (withImages && !fromCache && result.status === 200) {
303
334
  await testNestedImages(result.html, changedUrl, i, webUrl ?? sitemapUrl.split("/").slice(0, 3).join("/"));
304
335
  }
305
336
  }
@@ -314,8 +345,8 @@ async function testSitemapUrls(urls, webUrl, sitemapUrl, skip, withNested, withI
314
345
  */
315
346
  async function testNestedUrls(html, parentUrl, parentIndex, webUrl) {
316
347
  try {
317
- const $ = cheerio.load(html);
318
- const urls = createCorrectLinks(
348
+ const $ = cheerio.load(html.toString());
349
+ let urls = createCorrectLinks(
319
350
  $("a[href]").map((i, node) => $(node).attr("href")),
320
351
  webUrl,
321
352
  );
@@ -443,6 +474,24 @@ async function sendGoogleChatMessage(resultErrors, webhookUrl) {
443
474
  );
444
475
  }
445
476
 
/**
 * Collects the crawled URLs that are expected to be listed in the sitemap.
 *
 * An entry of TESTED_URLS qualifies when it is flagged as indexable, is a web page, was
 * answered with status 200 without a redirect, has ttl <= 1, and its effective URL (the
 * canonical URL when one is set, otherwise the tested URL) starts with webUrl and
 * contains no "?".
 *
 * @param webUrl {string} prefix every reported URL has to start with
 * @return {string[]} lower-cased effective URLs, each listed once, in first-seen order
 */
function getPagesShouldBeInSitemap(webUrl) {
    // startsWith() would coerce a missing prefix to the text "undefined"/"null",
    // so without a prefix there is nothing meaningful to match against.
    if (webUrl === undefined || webUrl === null) {
        return [];
    }

    // A Set keeps insertion order and drops repeats: several tested URLs may share
    // one canonical URL and must be reported only once.
    const pages = new Set();

    for (const dto of TESTED_URLS) {
        const effectiveUrl = dto.canonicalUrl ?? dto.url;
        const belongsToSitemap =
            dto.indexable &&
            dto.isWebPage &&
            dto.status === 200 &&
            dto.ttl <= 1 &&
            !dto.redirected &&
            effectiveUrl.startsWith(webUrl) &&
            !effectiveUrl.includes("?");

        if (belongsToSitemap) {
            // NOTE(review): the result is lower-cased, while the visible caller compares it
            // against sitemap entries with Array.prototype.includes (exact match) - confirm
            // the sitemap URLs are normalized the same way.
            pages.add(effectiveUrl.toLowerCase());
        }
    }

    return [...pages];
}
494
+
446
495
  /**
447
496
  * @param sitemapUrl {string}
448
497
  * @param skip {number}
@@ -470,17 +519,38 @@ module.exports = async function run(sitemapUrl, skip, withNested, withImages, go
470
519
  }
471
520
 
472
521
  const startTime = performance.now();
473
- await testSitemapUrls(await Sitemap.getSitemap(sitemapUrl), webUrl, sitemapUrl, skip, withNested, withImages);
522
+ const sitemapUrls = await Sitemap.getSitemap(sitemapUrl);
523
+ await testSitemapUrls(sitemapUrls, webUrl, sitemapUrl, skip, withNested, withImages);
474
524
  const finishTime = performance.now();
475
525
 
526
+ const shouldBeInSitemap = getPagesShouldBeInSitemap(webUrl);
527
+
476
528
  const errors = TESTED_URLS.filter((r) => r.status !== 200 && r.skipped === false);
529
+ const duplicates = [...new Set(sitemapUrls.filter((item, index, self) => self.indexOf(item) !== index))];
477
530
  const skippedUrls = TESTED_URLS.filter((r) => r.status !== 200 && r.skipped === true);
478
531
  const ok = TESTED_URLS.filter((r) => r.status === 200);
532
+ const missingInSitemap = shouldBeInSitemap.filter((testedUrl) => !sitemapUrls.includes(testedUrl));
479
533
 
480
- if (errors.length > 0 || ERRORS.length > 0) {
534
+ if (missingInSitemap.length > 0 || duplicates.length > 0 || errors.length > 0 || ERRORS.length > 0) {
535
+ let chatMessage = "";
536
+ const duplicatesText = duplicates.map((url) => `${createTabSpace()}${url}`).join("\n");
537
+ const missingText = missingInSitemap.map((url) => `${createTabSpace()}${url}`).join("\n");
481
538
  const errorText = createErrorResult(errors);
482
- logErrors(errorText, "\n\n\nErrors:\n");
483
- await sendGoogleChatMessage(errorText, googleWebhookUrl);
539
+
540
+ if (duplicatesText) {
541
+ logErrors(duplicatesText, DUPLICATES_TITLE);
542
+ chatMessage += DUPLICATES_TITLE + duplicatesText;
543
+ }
544
+ if (missingText) {
545
+ logErrors(missingText, MISSING_TITLE);
546
+ chatMessage += MISSING_TITLE + missingText;
547
+ }
548
+ if (errorText) {
549
+ logErrors(errorText, ERROR_TITLE);
550
+ chatMessage += ERROR_TITLE + errorText;
551
+ }
552
+
553
+ await sendGoogleChatMessage(chatMessage, googleWebhookUrl);
484
554
  }
485
555
 
486
556
  if (skippedUrls.length > 0) {