@uxf/scripts 11.61.5 → 11.62.0

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@uxf/scripts",
3
- "version": "11.61.5",
3
+ "version": "11.62.0",
4
4
  "description": "",
5
5
  "main": "index.js",
6
6
  "bin": {
@@ -8,13 +8,20 @@ const robotsTxtParser = require("robots-txt-parser");
8
8
  const { HTTP_USERNAME, HTTP_PASSWORD } = process.env;
9
9
 
10
10
  /**
11
- * @typedef {{parentUrl: (string | undefined), isImg: boolean, ttl: number, url: string, status: number, message: (string | undefined), skipped: boolean, html: (string | null)}} UrlCheckResponse
11
+ * @typedef {{parentUrl: (string | undefined), isImg: boolean, isWebPage: boolean, ttl: number, url: string, status: number, message: (string | undefined), skipped: boolean, html: (string | null), redirected: boolean}} UrlCheckResponse
12
+ */
13
+
14
+ /**
15
+ * @typedef {{url: string, parentUrl: (string | undefined), canonicalUrl: (string | null), isImg: boolean, isWebPage: boolean, ttl: number, status: number, message: (string | undefined), skipped: boolean, indexable: boolean, redirected: boolean}} TestedUrlDto
12
16
  */
13
17
 
14
18
  const MAX_TTL = 3;
15
19
  const IMAGES_LABEL = "🏞 Images:";
16
20
  const URLS_LABEL = "🔗 Links:";
17
21
 
22
+ /**
23
+ * @type TestedUrlDto[]
24
+ */
18
25
  const TESTED_URLS = [];
19
26
  const URLS_TO_CHECK = new Set();
20
27
  const ERRORS = [];
@@ -172,11 +179,13 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
172
179
  url,
173
180
  parentUrl,
174
181
  isImg: isImageUrl(url),
182
+ isWebPage: false,
175
183
  ttl,
176
184
  status: 0,
177
185
  message: "invalid url: " + url,
178
186
  skipped: false,
179
187
  html: null,
188
+ redirected: false,
180
189
  };
181
190
  }
182
191
 
@@ -193,11 +202,13 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
193
202
  url,
194
203
  parentUrl,
195
204
  isImg: isImageUrl(url),
205
+ isWebPage: true,
196
206
  ttl,
197
207
  status: 0,
198
208
  message: "blocked by robots.txt",
199
209
  skipped: true,
200
210
  html: null,
211
+ redirected: false,
201
212
  };
202
213
  }
203
214
  }
@@ -210,11 +221,13 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
210
221
  url,
211
222
  parentUrl,
212
223
  isImg: isImageUrl(url),
224
+ isWebPage: true,
213
225
  ttl,
214
226
  status: errorStatus,
215
227
  message: e.message,
216
228
  skipped: errorStatus !== undefined,
217
229
  html: null,
230
+ redirected: false,
218
231
  };
219
232
  }
220
233
 
@@ -229,10 +242,12 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
229
242
  url,
230
243
  parentUrl,
231
244
  isImg: isImageUrl(url),
245
+ isWebPage: response.headers.get("content-type").includes("text/html"),
232
246
  ttl,
233
247
  status: response.status,
234
248
  skipped: false,
235
249
  html: await response.text(),
250
+ redirected: response.redirected,
236
251
  };
237
252
  } catch (e) {
238
253
  const status = Number.parseInt((e && e.response && e.response.status) || -1, 10);
@@ -241,11 +256,13 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
241
256
  url,
242
257
  parentUrl,
243
258
  isImg: isImageUrl(url),
259
+ isWebPage: true,
244
260
  ttl,
245
261
  status,
246
262
  message: e.message,
247
263
  skipped: false,
248
264
  html: null,
265
+ redirected: false,
249
266
  };
250
267
  }
251
268
  }
@@ -254,7 +271,7 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
254
271
  * @param url {string}
255
272
  * @param webUrl {string}
256
273
  * @param parentUrl {string | undefined}
257
- * @return {UrlCheckResponse}
274
+ * @return {result: UrlCheckResponse|TestedUrlDto, fromCache: boolean}
258
275
  */
259
276
  async function testUrl(url, webUrl, parentUrl = undefined) {
260
277
  const indexInChecked = TESTED_URLS.findIndex((result) => result.url === url);
@@ -262,16 +279,26 @@ async function testUrl(url, webUrl, parentUrl = undefined) {
262
279
  const result = await fetchUrl(url, webUrl, parentUrl);
263
280
  TESTED_URLS.push({
264
281
  isImg: result.isImg,
282
+ isWebPage: result.isWebPage,
265
283
  message: result.message,
266
284
  parentUrl: result.parentUrl,
267
285
  skipped: result.skipped,
268
286
  status: result.status,
269
287
  ttl: result.ttl,
270
288
  url: result.url,
289
+ redirected: result.redirected,
290
+ indexable:
291
+ result.isWebPage && typeof result.html === "string"
292
+ ? cheerio.load(result.html)("meta[name='robots']").attr("content") !== "noindex"
293
+ : false,
294
+ canonicalUrl:
295
+ result.isWebPage && typeof result.html === "string"
296
+ ? (cheerio.load(result.html)("link[rel='canonical']").attr("href") ?? null)
297
+ : null,
271
298
  });
272
- return result;
299
+ return { result, fromCache: false };
273
300
  }
274
- return TESTED_URLS[indexInChecked];
301
+ return { result: TESTED_URLS[indexInChecked], fromCache: true };
275
302
  }
276
303
 
277
304
  /**
@@ -292,14 +319,14 @@ async function testSitemapUrls(urls, webUrl, sitemapUrl, skip, withNested, withI
292
319
  const url = urls[i];
293
320
  const changedUrl = webUrl ? `${webUrl}${new URL(url).pathname}` : url;
294
321
 
295
- const result = await testUrl(changedUrl, webUrl);
322
+ const { result, fromCache } = await testUrl(changedUrl, webUrl);
296
323
  printProgress();
297
324
 
298
- if (withNested && result.status === 200) {
325
+ if (withNested && !fromCache && result.status === 200) {
299
326
  await testNestedUrls(result.html, changedUrl, i, webUrl ?? sitemapUrl.split("/").slice(0, 3).join("/"));
300
327
  }
301
328
 
302
- if (withImages && result.status === 200) {
329
+ if (withImages && !fromCache && result.status === 200) {
303
330
  await testNestedImages(result.html, changedUrl, i, webUrl ?? sitemapUrl.split("/").slice(0, 3).join("/"));
304
331
  }
305
332
  }
@@ -314,12 +341,15 @@ async function testSitemapUrls(urls, webUrl, sitemapUrl, skip, withNested, withI
314
341
  */
315
342
  async function testNestedUrls(html, parentUrl, parentIndex, webUrl) {
316
343
  try {
317
- const $ = cheerio.load(html);
318
- const urls = createCorrectLinks(
344
+ const $ = cheerio.load(html.toString());
345
+ let urls = createCorrectLinks(
319
346
  $("a[href]").map((i, node) => $(node).attr("href")),
320
347
  webUrl,
321
348
  );
322
349
 
350
+ // FIXME
351
+ urls = urls.filter((url) => url.startsWith(webUrl) || url.startsWith("/"));
352
+
323
353
  await testNested(urls, parentIndex, parentUrl, createTabSpace() + URLS_LABEL, webUrl);
324
354
  } catch (e) {
325
355
  ERRORS.push(`Can't test all nested pages for ${parentUrl} - ${e.message}`);
@@ -443,6 +473,24 @@ async function sendGoogleChatMessage(resultErrors, webhookUrl) {
443
473
  );
444
474
  }
445
475
 
476
+ /**
477
+ * @param webUrl {string}
478
+ * @return {string[]}
479
+ */
480
+ function getPagesShouldBeInSitemap(webUrl) {
481
+ return TESTED_URLS.filter((u) => u.indexable)
482
+ .filter(
483
+ (dto) =>
484
+ (dto.canonicalUrl ?? dto.url).startsWith(webUrl) &&
485
+ !(dto.canonicalUrl ?? dto.url).includes("?") &&
486
+ dto.status === 200 &&
487
+ dto.ttl <= 1 &&
488
+ !dto.redirected &&
489
+ dto.isWebPage,
490
+ )
491
+ .map((url) => (url.canonicalUrl ?? url.url).toLowerCase());
492
+ }
493
+
446
494
  /**
447
495
  * @param sitemapUrl {string}
448
496
  * @param skip {number}
@@ -470,17 +518,32 @@ module.exports = async function run(sitemapUrl, skip, withNested, withImages, go
470
518
  }
471
519
 
472
520
  const startTime = performance.now();
473
- await testSitemapUrls(await Sitemap.getSitemap(sitemapUrl), webUrl, sitemapUrl, skip, withNested, withImages);
521
+ const sitemapUrls = await Sitemap.getSitemap(sitemapUrl);
522
+ await testSitemapUrls(sitemapUrls, webUrl, sitemapUrl, skip, withNested, withImages);
474
523
  const finishTime = performance.now();
475
524
 
525
+ const shouldBeInSitemap = getPagesShouldBeInSitemap(webUrl);
526
+
476
527
  const errors = TESTED_URLS.filter((r) => r.status !== 200 && r.skipped === false);
528
+ const duplicates = [...new Set(sitemapUrls.filter((item, index, self) => self.indexOf(item) !== index))];
477
529
  const skippedUrls = TESTED_URLS.filter((r) => r.status !== 200 && r.skipped === true);
478
530
  const ok = TESTED_URLS.filter((r) => r.status === 200);
531
+ const missingInSitemap = shouldBeInSitemap.filter((testedUrl) => !sitemapUrls.includes(testedUrl));
479
532
 
480
- if (errors.length > 0 || ERRORS.length > 0) {
533
+ if (missingInSitemap.length > 0 || duplicates.length > 0 || errors.length > 0 || ERRORS.length > 0) {
534
+ const duplicatesText = duplicates.map((url) => `${createTabSpace()}${url}`).join("\n");
535
+ const missingText = missingInSitemap.map((url) => `${createTabSpace()}${url}`).join("\n");
481
536
  const errorText = createErrorResult(errors);
482
- logErrors(errorText, "\n\n\nErrors:\n");
483
- await sendGoogleChatMessage(errorText, googleWebhookUrl);
537
+ if (duplicatesText) {
538
+ logErrors(duplicatesText, "\n\n\nDuplicated pages in sitemap:\n");
539
+ }
540
+ if (missingText) {
541
+ logErrors(missingText, "\n\n\nMissing pages in sitemap:\n");
542
+ }
543
+ if (errorText) {
544
+ logErrors(errorText, "\n\n\nErrors:\n");
545
+ }
546
+ await sendGoogleChatMessage(duplicatesText + missingText + errorText, googleWebhookUrl);
484
547
  }
485
548
 
486
549
  if (skippedUrls.length > 0) {