@uxf/scripts 11.61.4 → 11.62.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@uxf/scripts",
3
- "version": "11.61.4",
3
+ "version": "11.62.0",
4
4
  "description": "",
5
5
  "main": "index.js",
6
6
  "bin": {
@@ -8,15 +8,23 @@ const robotsTxtParser = require("robots-txt-parser");
8
8
  const { HTTP_USERNAME, HTTP_PASSWORD } = process.env;
9
9
 
10
10
  /**
11
- * @typedef {{parentUrl: (string | undefined), isImg: boolean, ttl: number, url: string, status: number, message: (string | undefined), skipped: boolean, html: (string | null)}} UrlCheckResponse
11
+ * @typedef {{parentUrl: (string | undefined), isImg: boolean, isWebPage: boolean, ttl: number, url: string, status: number, message: (string | undefined), skipped: boolean, html: (string | null), redirected: boolean}} UrlCheckResponse
12
+ */
13
+
14
+ /**
15
+ * @typedef {{url: string, parentUrl: (string | undefined), canonicalUrl: (string | null), isImg: boolean, isWebPage: boolean, ttl: number, status: number, message: (string | undefined), skipped: boolean, indexable: boolean, redirected: boolean}} TestedUrlDto
12
16
  */
13
17
 
14
18
  const MAX_TTL = 3;
15
19
  const IMAGES_LABEL = "🏞 Images:";
16
20
  const URLS_LABEL = "🔗 Links:";
17
21
 
22
+ /**
23
+ * @type {TestedUrlDto[]}
24
+ */
18
25
  const TESTED_URLS = [];
19
26
  const URLS_TO_CHECK = new Set();
27
+ const ERRORS = [];
20
28
 
21
29
  const robotsParser = robotsTxtParser({ userAgent: "uxf-bot", allowOnNeutral: false });
22
30
 
@@ -73,6 +81,7 @@ function createErrorList(errors) {
73
81
  function createErrorResult(errors) {
74
82
  let parentPages = "";
75
83
  let nestedPages = "";
84
+ let generalErrors = "";
76
85
 
77
86
  const parentPagesErrors = errors.filter((url) => url.parentUrl === undefined);
78
87
  if (parentPagesErrors.length > 0) {
@@ -103,7 +112,14 @@ function createErrorResult(errors) {
103
112
  }
104
113
  }
105
114
 
106
- return parentPages + nestedPages;
115
+ if (ERRORS.length > 0) {
116
+ generalErrors = `\n\nGeneral errors:\n`;
117
+ for (const error of ERRORS) {
118
+ generalErrors += `${createTabSpace(1)}${error}\n`;
119
+ }
120
+ }
121
+
122
+ return parentPages + nestedPages + generalErrors;
107
123
  }
108
124
 
109
125
  /**
@@ -163,11 +179,13 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
163
179
  url,
164
180
  parentUrl,
165
181
  isImg: isImageUrl(url),
182
+ isWebPage: false,
166
183
  ttl,
167
184
  status: 0,
168
185
  message: "invalid url: " + url,
169
186
  skipped: false,
170
187
  html: null,
188
+ redirected: false,
171
189
  };
172
190
  }
173
191
 
@@ -184,11 +202,13 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
184
202
  url,
185
203
  parentUrl,
186
204
  isImg: isImageUrl(url),
205
+ isWebPage: true,
187
206
  ttl,
188
207
  status: 0,
189
208
  message: "blocked by robots.txt",
190
209
  skipped: true,
191
210
  html: null,
211
+ redirected: false,
192
212
  };
193
213
  }
194
214
  }
@@ -201,11 +221,13 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
201
221
  url,
202
222
  parentUrl,
203
223
  isImg: isImageUrl(url),
224
+ isWebPage: true,
204
225
  ttl,
205
226
  status: errorStatus,
206
227
  message: e.message,
207
228
  skipped: errorStatus !== undefined,
208
229
  html: null,
230
+ redirected: false,
209
231
  };
210
232
  }
211
233
 
@@ -220,10 +242,12 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
220
242
  url,
221
243
  parentUrl,
222
244
  isImg: isImageUrl(url),
245
+ isWebPage: response.headers.get("content-type").includes("text/html"),
223
246
  ttl,
224
247
  status: response.status,
225
248
  skipped: false,
226
249
  html: await response.text(),
250
+ redirected: response.redirected,
227
251
  };
228
252
  } catch (e) {
229
253
  const status = Number.parseInt((e && e.response && e.response.status) || -1, 10);
@@ -232,11 +256,13 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
232
256
  url,
233
257
  parentUrl,
234
258
  isImg: isImageUrl(url),
259
+ isWebPage: true,
235
260
  ttl,
236
261
  status,
237
262
  message: e.message,
238
263
  skipped: false,
239
264
  html: null,
265
+ redirected: false,
240
266
  };
241
267
  }
242
268
  }
@@ -245,7 +271,7 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
245
271
  * @param url {string}
246
272
  * @param webUrl {string}
247
273
  * @param parentUrl {string | undefined}
248
- * @return {UrlCheckResponse}
274
+ * @return {Promise<{result: (UrlCheckResponse | TestedUrlDto), fromCache: boolean}>}
249
275
  */
250
276
  async function testUrl(url, webUrl, parentUrl = undefined) {
251
277
  const indexInChecked = TESTED_URLS.findIndex((result) => result.url === url);
@@ -253,16 +279,26 @@ async function testUrl(url, webUrl, parentUrl = undefined) {
253
279
  const result = await fetchUrl(url, webUrl, parentUrl);
254
280
  TESTED_URLS.push({
255
281
  isImg: result.isImg,
282
+ isWebPage: result.isWebPage,
256
283
  message: result.message,
257
284
  parentUrl: result.parentUrl,
258
285
  skipped: result.skipped,
259
286
  status: result.status,
260
287
  ttl: result.ttl,
261
288
  url: result.url,
289
+ redirected: result.redirected,
290
+ indexable:
291
+ result.isWebPage && typeof result.html === "string"
292
+ ? cheerio.load(result.html)("meta[name='robots']").attr("content") !== "noindex"
293
+ : false,
294
+ canonicalUrl:
295
+ result.isWebPage && typeof result.html === "string"
296
+ ? (cheerio.load(result.html)("link[rel='canonical']").attr("href") ?? null)
297
+ : null,
262
298
  });
263
- return result;
299
+ return { result, fromCache: false };
264
300
  }
265
- return TESTED_URLS[indexInChecked];
301
+ return { result: TESTED_URLS[indexInChecked], fromCache: true };
266
302
  }
267
303
 
268
304
  /**
@@ -283,14 +319,14 @@ async function testSitemapUrls(urls, webUrl, sitemapUrl, skip, withNested, withI
283
319
  const url = urls[i];
284
320
  const changedUrl = webUrl ? `${webUrl}${new URL(url).pathname}` : url;
285
321
 
286
- const result = await testUrl(changedUrl, webUrl);
322
+ const { result, fromCache } = await testUrl(changedUrl, webUrl);
287
323
  printProgress();
288
324
 
289
- if (withNested && result.status === 200) {
325
+ if (withNested && !fromCache && result.status === 200) {
290
326
  await testNestedUrls(result.html, changedUrl, i, webUrl ?? sitemapUrl.split("/").slice(0, 3).join("/"));
291
327
  }
292
328
 
293
- if (withImages && result.status === 200) {
329
+ if (withImages && !fromCache && result.status === 200) {
294
330
  await testNestedImages(result.html, changedUrl, i, webUrl ?? sitemapUrl.split("/").slice(0, 3).join("/"));
295
331
  }
296
332
  }
@@ -304,13 +340,20 @@ async function testSitemapUrls(urls, webUrl, sitemapUrl, skip, withNested, withI
304
340
  * @return {Promise<void>}
305
341
  */
306
342
  async function testNestedUrls(html, parentUrl, parentIndex, webUrl) {
307
- const $ = cheerio.load(html);
308
- const urls = createCorrectLinks(
309
- $("a[href]").map((i, node) => $(node).attr("href")),
310
- webUrl,
311
- );
343
+ try {
344
+ const $ = cheerio.load(html.toString());
345
+ let urls = createCorrectLinks(
346
+ $("a[href]").map((i, node) => $(node).attr("href")),
347
+ webUrl,
348
+ );
349
+
350
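+ // FIXME(review): keeps only links starting with webUrl or "/", so all other links are never tested — confirm whether this filter is intentional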
351
+ urls = urls.filter((url) => url.startsWith(webUrl) || url.startsWith("/"));
312
352
 
313
- await testNested(urls, parentIndex, parentUrl, createTabSpace() + URLS_LABEL, webUrl);
353
+ await testNested(urls, parentIndex, parentUrl, createTabSpace() + URLS_LABEL, webUrl);
354
+ } catch (e) {
355
+ ERRORS.push(`Can't test all nested pages for ${parentUrl} - ${e.message}`);
356
+ }
314
357
  }
315
358
 
316
359
  /**
@@ -321,13 +364,17 @@ async function testNestedUrls(html, parentUrl, parentIndex, webUrl) {
321
364
  * @return {Promise<void>}
322
365
  */
323
366
  async function testNestedImages(html, parentUrl, parentIndex, webUrl) {
324
- const $ = cheerio.load(html);
325
- const images = createCorrectLinks(
326
- $("img[src]").map((i, node) => $(node).attr("src")),
327
- webUrl,
328
- );
367
+ try {
368
+ const $ = cheerio.load(html);
369
+ const images = createCorrectLinks(
370
+ $("img[src]").map((i, node) => $(node).attr("src")),
371
+ webUrl,
372
+ );
329
373
 
330
- await testNested(images, parentIndex, parentUrl, createTabSpace() + IMAGES_LABEL, webUrl);
374
+ await testNested(images, parentIndex, parentUrl, createTabSpace() + IMAGES_LABEL, webUrl);
375
+ } catch (e) {
376
+ ERRORS.push(`Can't test all nested images for ${parentUrl} - ${e.message}`);
377
+ }
331
378
  }
332
379
 
333
380
  /**
@@ -426,6 +473,24 @@ async function sendGoogleChatMessage(resultErrors, webhookUrl) {
426
473
  );
427
474
  }
428
475
 
476
+ /**
477
+ * @param webUrl {string}
478
+ * @return {string[]}
479
+ */
480
+ function getPagesShouldBeInSitemap(webUrl) {
481
+ return TESTED_URLS.filter((u) => u.indexable)
482
+ .filter(
483
+ (dto) =>
484
+ (dto.canonicalUrl ?? dto.url).startsWith(webUrl) &&
485
+ !(dto.canonicalUrl ?? dto.url).includes("?") &&
486
+ dto.status === 200 &&
487
+ dto.ttl <= 1 &&
488
+ !dto.redirected &&
489
+ dto.isWebPage,
490
+ )
491
+ .map((url) => (url.canonicalUrl ?? url.url).toLowerCase());
492
+ }
493
+
429
494
  /**
430
495
  * @param sitemapUrl {string}
431
496
  * @param skip {number}
@@ -453,17 +518,32 @@ module.exports = async function run(sitemapUrl, skip, withNested, withImages, go
453
518
  }
454
519
 
455
520
  const startTime = performance.now();
456
- await testSitemapUrls(await Sitemap.getSitemap(sitemapUrl), webUrl, sitemapUrl, skip, withNested, withImages);
521
+ const sitemapUrls = await Sitemap.getSitemap(sitemapUrl);
522
+ await testSitemapUrls(sitemapUrls, webUrl, sitemapUrl, skip, withNested, withImages);
457
523
  const finishTime = performance.now();
458
524
 
525
+ const shouldBeInSitemap = getPagesShouldBeInSitemap(webUrl);
526
+
459
527
  const errors = TESTED_URLS.filter((r) => r.status !== 200 && r.skipped === false);
528
+ const duplicates = [...new Set(sitemapUrls.filter((item, index, self) => self.indexOf(item) !== index))];
460
529
  const skippedUrls = TESTED_URLS.filter((r) => r.status !== 200 && r.skipped === true);
461
530
  const ok = TESTED_URLS.filter((r) => r.status === 200);
531
+ const missingInSitemap = shouldBeInSitemap.filter((testedUrl) => !sitemapUrls.includes(testedUrl));
462
532
 
463
- if (errors.length > 0) {
533
+ if (missingInSitemap.length > 0 || duplicates.length > 0 || errors.length > 0 || ERRORS.length > 0) {
534
+ const duplicatesText = duplicates.map((url) => `${createTabSpace()}${url}`).join("\n");
535
+ const missingText = missingInSitemap.map((url) => `${createTabSpace()}${url}`).join("\n");
464
536
  const errorText = createErrorResult(errors);
465
- logErrors(errorText, "\n\n\nErrors:\n");
466
- await sendGoogleChatMessage(errorText, googleWebhookUrl);
537
+ if (duplicatesText) {
538
+ logErrors(duplicatesText, "\n\n\nDuplicated pages in sitemap:\n");
539
+ }
540
+ if (missingText) {
541
+ logErrors(missingText, "\n\n\nMissing pages in sitemap:\n");
542
+ }
543
+ if (errorText) {
544
+ logErrors(errorText, "\n\n\nErrors:\n");
545
+ }
546
+ await sendGoogleChatMessage(duplicatesText + missingText + errorText, googleWebhookUrl);
467
547
  }
468
548
 
469
549
  if (skippedUrls.length > 0) {