@uxf/scripts 11.64.0 → 11.64.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -1
- package/src/uxf-sitemap-check/index.js +58 -18
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@uxf/scripts",
|
|
3
|
-
"version": "11.64.0",
|
|
3
|
+
"version": "11.64.2",
|
|
4
4
|
"description": "",
|
|
5
5
|
"main": "index.js",
|
|
6
6
|
"bin": {
|
|
@@ -33,6 +33,7 @@
|
|
|
33
33
|
"cheerio": "1.0.0",
|
|
34
34
|
"dayjs": "1.11.13",
|
|
35
35
|
"fast-glob": "3.3.2",
|
|
36
|
+
"got": "14.4.7",
|
|
36
37
|
"madge": "8.0.0",
|
|
37
38
|
"robots-txt-parser": "2.0.3",
|
|
38
39
|
"yargs": "17.7.2"
|
|
package/src/uxf-sitemap-check/index.js
CHANGED
|
@@ -5,6 +5,8 @@ const cheerio = require("cheerio");
|
|
|
5
5
|
const GoogleChat = require("../GoogleChat");
|
|
6
6
|
const robotsTxtParser = require("robots-txt-parser");
|
|
7
7
|
|
|
8
|
+
const got = (url, init) => import("got").then((mod) => mod.default(url, init));
|
|
9
|
+
|
|
8
10
|
const { HTTP_USERNAME, HTTP_PASSWORD } = process.env;
|
|
9
11
|
|
|
10
12
|
/**
|
|
@@ -17,7 +19,7 @@ const DUPLICATES_TITLE = "\n\n\nDuplicated pages in sitemap:\n";
|
|
|
17
19
|
const MISSING_TITLE = "\n\n\nMissing pages in sitemap:\n";
|
|
18
20
|
const ERROR_TITLE = "\n\n\nErrors:\n";
|
|
19
21
|
|
|
20
|
-
const MAX_TTL =
|
|
22
|
+
const MAX_TTL = 1;
|
|
21
23
|
const IMAGES_LABEL = "🏞 Images:";
|
|
22
24
|
const URLS_LABEL = "🔗 Links:";
|
|
23
25
|
|
|
@@ -50,17 +52,41 @@ function getUrlOrigin(url) {
|
|
|
50
52
|
/**
|
|
51
53
|
* @param url {string}
|
|
52
54
|
* @param options {{redirect: boolean, isExternal: boolean}}
|
|
53
|
-
* @returns {Promise<Response>}
|
|
55
|
+
* @returns {Promise<import('got').Response<string>>}
|
|
54
56
|
*/
|
|
55
57
|
function fetcher(url, options) {
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
58
|
+
const shouldUseBasicAuth = !options.isExternal && HTTP_USERNAME && HTTP_PASSWORD;
|
|
59
|
+
|
|
60
|
+
const headers = new Headers({
|
|
61
|
+
"User-Agent":
|
|
62
|
+
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36",
|
|
63
|
+
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
|
64
|
+
"Accept-Language": "en-US,en;q=0.9,cs-CZ;q=0.8,cs;q=0.7,de;q=0.6",
|
|
65
|
+
"Cache-Control": "no-cache",
|
|
66
|
+
Connection: "keep-alive",
|
|
67
|
+
Pragma: "no-cache",
|
|
68
|
+
"Sec-Ch-Ua": '"Google Chrome";v="111", "Not(A:Brand";v="8", "Chromium";v="111"',
|
|
69
|
+
"Sec-Ch-Ua-Arch": '"x86"',
|
|
70
|
+
"Sec-Ch-Ua-Mobile": "?0",
|
|
71
|
+
"Sec-Ch-Ua-Platform": '"Windows"',
|
|
72
|
+
"Sec-Fetch-Dest": "document",
|
|
73
|
+
"Sec-Fetch-Mode": "navigate",
|
|
74
|
+
"Sec-Fetch-Site": "cross-site",
|
|
75
|
+
"Sec-Fetch-User": "?1",
|
|
76
|
+
"Sec-Fetch-User-Agent": "?1",
|
|
77
|
+
});
|
|
78
|
+
|
|
79
|
+
return got(url, {
|
|
80
|
+
throwHttpErrors: false,
|
|
81
|
+
decompress: false,
|
|
82
|
+
https: {
|
|
83
|
+
rejectUnauthorized: false,
|
|
84
|
+
},
|
|
85
|
+
headers,
|
|
86
|
+
username: shouldUseBasicAuth ? HTTP_USERNAME : undefined,
|
|
87
|
+
password: shouldUseBasicAuth ? HTTP_PASSWORD : undefined,
|
|
88
|
+
followRedirect: options.redirect,
|
|
89
|
+
signal: AbortSignal.timeout(30_000),
|
|
64
90
|
});
|
|
65
91
|
}
|
|
66
92
|
|
|
@@ -226,7 +252,7 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
|
|
|
226
252
|
}
|
|
227
253
|
} catch (e) {
|
|
228
254
|
const errorStatus = await fetcher(new URL(url).origin + "/robots.txt", { isExternal: true, redirect: true })
|
|
229
|
-
.then((res) => (res.
|
|
255
|
+
.then((res) => (res.statusCode === 200 ? -1 : res.statusCode))
|
|
230
256
|
.catch((e) => e.response?.status);
|
|
231
257
|
|
|
232
258
|
return {
|
|
@@ -237,7 +263,7 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
|
|
|
237
263
|
ttl,
|
|
238
264
|
status: errorStatus,
|
|
239
265
|
message: e.message,
|
|
240
|
-
skipped: false,
|
|
266
|
+
skipped: true,
|
|
241
267
|
html: null,
|
|
242
268
|
redirected: false,
|
|
243
269
|
};
|
|
@@ -246,7 +272,22 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
|
|
|
246
272
|
try {
|
|
247
273
|
const response = await fetcher(url, { redirect: !!parentUrl, isExternal: !url.includes(webUrl) });
|
|
248
274
|
|
|
249
|
-
if (response.
|
|
275
|
+
if (response.statusCode === 403 && response.headers["server"] === "cloudflare") {
|
|
276
|
+
return {
|
|
277
|
+
url,
|
|
278
|
+
parentUrl,
|
|
279
|
+
isImg: isImageUrl(url),
|
|
280
|
+
isWebPage: true,
|
|
281
|
+
ttl,
|
|
282
|
+
status: 0,
|
|
283
|
+
message: "blocked by server",
|
|
284
|
+
skipped: true,
|
|
285
|
+
html: null,
|
|
286
|
+
redirected: false,
|
|
287
|
+
};
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
if (response.statusCode !== 200 && ttl < MAX_TTL) {
|
|
250
291
|
return await fetchUrl(url, webUrl, parentUrl, ttl + 1);
|
|
251
292
|
}
|
|
252
293
|
|
|
@@ -254,12 +295,12 @@ async function fetchUrl(url, webUrl, parentUrl = undefined, ttl = 1) {
|
|
|
254
295
|
url,
|
|
255
296
|
parentUrl,
|
|
256
297
|
isImg: isImageUrl(url),
|
|
257
|
-
isWebPage: response.headers
|
|
298
|
+
isWebPage: response.headers["content-type"]?.includes("text/html") ?? true,
|
|
258
299
|
ttl,
|
|
259
|
-
status: response.status,
|
|
300
|
+
status: response.statusCode,
|
|
260
301
|
skipped: false,
|
|
261
|
-
html:
|
|
262
|
-
redirected: response.redirected,
|
|
302
|
+
html: response.body,
|
|
303
|
+
redirected: response.redirectUrls.length > 0,
|
|
263
304
|
};
|
|
264
305
|
} catch (e) {
|
|
265
306
|
const status = Number.parseInt((e && e.response && e.response.status) || -1, 10);
|
|
@@ -619,5 +660,4 @@ module.exports = async function run(sitemapUrl, skip, withNested, withImages, ch
|
|
|
619
660
|
|
|
620
661
|
process.exit(1);
|
|
621
662
|
}
|
|
622
|
-
|
|
623
663
|
};
|