@govtechsg/oobee 0.10.42 → 0.10.45
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/REPORTS.md +71 -2
- package/package.json +4 -2
- package/src/cli.ts +2 -11
- package/src/constants/common.ts +216 -76
- package/src/constants/constants.ts +89 -1
- package/src/constants/oobeeAi.ts +6 -6
- package/src/constants/questions.ts +3 -2
- package/src/crawlers/commonCrawlerFunc.ts +16 -15
- package/src/crawlers/crawlDomain.ts +82 -84
- package/src/crawlers/crawlIntelligentSitemap.ts +21 -19
- package/src/crawlers/crawlSitemap.ts +120 -109
- package/src/crawlers/custom/findElementByCssSelector.ts +1 -1
- package/src/crawlers/custom/flagUnlabelledClickableElements.ts +8 -8
- package/src/crawlers/custom/xPathToCss.ts +10 -10
- package/src/crawlers/runCustom.ts +1 -1
- package/src/index.ts +3 -4
- package/src/logs.ts +1 -1
- package/src/mergeAxeResults.ts +126 -7
- package/src/npmIndex.ts +12 -8
- package/src/screenshotFunc/htmlScreenshotFunc.ts +8 -20
- package/src/types/text-readability.d.ts +3 -0
- package/src/types/types.ts +1 -1
- package/src/utils.ts +254 -114
- package/src/xPathToCss.ts +0 -186
- package/src/xPathToCssCypress.ts +0 -178
package/src/constants/oobeeAi.ts
CHANGED
@@ -24,7 +24,7 @@ export const oobeeAiRules = [
   'autocomplete-valid',
 ];
 
-export const oobeeAiHtmlETL = htmlSnippet => {
+export const oobeeAiHtmlETL = (htmlSnippet: string) => {
   // Whitelisted attributes (to not drop)
   // i.e. any other attribute will be dropped
   const whitelistedAttributes = [
@@ -60,12 +60,12 @@ export const oobeeAiHtmlETL = htmlSnippet => {
     `aria-labelledby`,
   ];
 
-  const sortAlphaAttributes = html => {
+  const sortAlphaAttributes = (html: string) => {
     let entireHtml = '';
     const htmlOpeningTagRegex = /<[^>]+/g;
     const htmlTagmatches = html.match(htmlOpeningTagRegex);
 
-    let sortedHtmlTag;
+    let sortedHtmlTag: string = '';
 
    htmlTagmatches.forEach(htmlTag => {
      const closingTag = htmlTag.trim().slice(-1) === '/' ? '/>' : '>';
@@ -112,7 +112,7 @@ export const oobeeAiHtmlETL = htmlSnippet => {
 
   // For all attributes within mutedAttributeValues array
   // replace their values with "something" while maintaining the attribute
-  const muteAttributeValues = html => {
+  const muteAttributeValues = (html: string) => {
     const regex = /(\s+)([\w-]+)(\s*=\s*")([^"]*)(")/g;
 
     // p1 is the whitespace before the attribute
@@ -120,7 +120,7 @@ export const oobeeAiHtmlETL = htmlSnippet => {
     // p3 is the attribute value before the replacement
     // p4 is the attribute value (replaced with "...")
     // p5 is the closing quote of the attribute value
-    return html.replace(regex, (match, p1, p2, p3,
+    return html.replace(regex, (match, p1, p2, p3, _p4, p5) => {
       if (mutedAttributeValues.includes(p2)) {
         return `${p1}${p2}${p3}...${p5}`;
       }
@@ -129,7 +129,7 @@ export const oobeeAiHtmlETL = htmlSnippet => {
   };
 
   // Drop all attributes from the HTML snippet except whitelisted
-  const dropAllExceptWhitelisted = html => {
+  const dropAllExceptWhitelisted = (html: string) => {
     const regex = new RegExp(
       `(\\s+)(?!${whitelistedAttributes.join(`|`)})([\\w-]+)(\\s*=\\s*"[^"]*")`,
       `g`,
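Note: muteAttributeValues keeps each attribute but blanks its value in a single regex pass. A minimal standalone sketch of that pattern (the muted-attribute list below is illustrative, not the package's actual list):

// Hypothetical muted-attribute list, for illustration only.
const mutedAttributeValues = ['href', 'src'];

const muteAttributeValuesSketch = (html: string): string => {
  // p1 = leading whitespace, p2 = attribute name, p3 = `="`, _p4 = value, p5 = closing quote
  const regex = /(\s+)([\w-]+)(\s*=\s*")([^"]*)(")/g;
  return html.replace(regex, (match, p1, p2, p3, _p4, p5) => {
    if (mutedAttributeValues.includes(p2)) {
      return `${p1}${p2}${p3}...${p5}`;
    }
    return match; // non-muted attributes pass through unchanged
  });
};

// muteAttributeValuesSketch('<a href="https://example.com" aria-label="Home">')
// => '<a href="..." aria-label="Home">'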
package/src/constants/questions.ts
CHANGED
@@ -12,12 +12,13 @@ import {
   validEmail,
   validName,
   validateCustomFlowLabel,
+  parseHeaders,
 } from './common.js';
 import constants, { BrowserTypes, ScannerTypes } from './constants.js';
 
 const userData = getUserDataTxt();
 
-const questions = [];
+const questions: Question[] = [];
 
 const startScanQuestions = [
   {
@@ -95,7 +96,7 @@ const startScanQuestions = [
         clonedBrowserDataDir,
         playwrightDeviceDetailsObject,
         answers.scanner === ScannerTypes.CUSTOM,
-        answers.header,
+        parseHeaders(answers.header),
       );
 
       deleteClonedProfiles(browserToRun);
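The CLI answer is now passed through parseHeaders before reaching the scan options. parseHeaders itself is defined in src/constants/common.ts and is not shown in this diff; the sketch below is a hypothetical illustration of the kind of "Name: value" parsing the call site implies, not the actual implementation:

// Hypothetical sketch only — the real parseHeaders may behave differently.
// Assumes headers are entered as "Name: value" pairs separated by newlines and
// returns the Record<string, string> shape used for extraHTTPHeaders elsewhere.
const parseHeadersSketch = (raw?: string): Record<string, string> => {
  const headers: Record<string, string> = {};
  if (!raw) return headers;
  for (const line of raw.split(/\r?\n/)) {
    const idx = line.indexOf(':');
    if (idx > 0) {
      headers[line.slice(0, idx).trim()] = line.slice(idx + 1).trim();
    }
  }
  return headers;
};

// parseHeadersSketch('Authorization: Bearer abc\nX-Trace: 1')
// => { Authorization: 'Bearer abc', 'X-Trace': '1' }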
package/src/crawlers/commonCrawlerFunc.ts
CHANGED
@@ -1,4 +1,4 @@
-import crawlee, { CrawlingContext, PlaywrightGotoOptions } from 'crawlee';
+import crawlee, { CrawlingContext, PlaywrightGotoOptions, Request } from 'crawlee';
 import axe, { AxeResults, ImpactValue, NodeResult, Result, resultGroups, TagValue } from 'axe-core';
 import { BrowserContext, Page } from 'playwright';
 import {
@@ -18,7 +18,7 @@ import { framesCheck } from './custom/framesCheck.js';
 import { findElementByCssSelector } from './custom/findElementByCssSelector.js';
 import { getAxeConfiguration } from './custom/getAxeConfiguration.js';
 import { flagUnlabelledClickableElements } from './custom/flagUnlabelledClickableElements.js';
-import
+import xPathToCss from './custom/xPathToCss.js';
 
 // types
 interface AxeResultsWithScreenshot extends AxeResults {
@@ -118,13 +118,13 @@ export const filterAxeResults = (
 
     if (conformance[0] !== 'best-practice' && !wcagRegex.test(conformance[0])) {
       conformance.sort((a, b) => {
-
-
-
-
-
-
-
+        if (wcagRegex.test(a) && !wcagRegex.test(b)) {
+          return -1;
+        }
+        if (!wcagRegex.test(a) && wcagRegex.test(b)) {
+          return 1;
+        }
+        return 0;
       });
     }
 
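The restored comparator simply moves WCAG tags ahead of other conformance tags while leaving relative order untouched. A minimal sketch of that behaviour; the exact wcagRegex is defined outside this hunk, so the pattern below is assumed for illustration only:

// Assumed pattern for illustration; the real wcagRegex lives elsewhere in this file.
const wcagRegex = /^wcag\d+a*$/;

const conformance = ['cat.keyboard', 'wcag211', 'best-practice'];
conformance.sort((a, b) => {
  if (wcagRegex.test(a) && !wcagRegex.test(b)) return -1;
  if (!wcagRegex.test(a) && wcagRegex.test(b)) return 1;
  return 0;
});

// WCAG tags move to the front; everything else keeps its relative order:
// => ['wcag211', 'cat.keyboard', 'best-practice']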
@@ -166,7 +166,6 @@ export const filterAxeResults = (
   };
 
   nodes.forEach(node => {
-    const { impact } = node;
     const hasWcagA = conformance.some(tag => /^wcag\d*a$/.test(tag));
     const hasWcagAA = conformance.some(tag => /^wcag\d*aa$/.test(tag));
     // const hasWcagAAA = conformance.some(tag => /^wcag\d*aaa$/.test(tag));
@@ -255,7 +254,7 @@ export const runAxeScript = async ({
      let mutationCount = 0;
      const MAX_MUTATIONS = 250;
      const MAX_SAME_MUTATION_LIMIT = 10;
-      const mutationHash = {};
+      const mutationHash: Record<string, number> = {};
 
      const observer = new MutationObserver(mutationsList => {
        clearTimeout(timeout);
@@ -309,6 +308,8 @@ export const runAxeScript = async ({
    silentLogger.warn(`Error while checking for DOM mutations: ${e}`);
  }
 
+  // Omit logging of browser console errors to reduce unnecessary verbosity
+  /*
  page.on('console', msg => {
    const type = msg.type();
    if (type === 'error') {
@@ -317,6 +318,7 @@ export const runAxeScript = async ({
      silentLogger.log({ level: 'info', message: msg.text() });
    }
  });
+  */
 
  const disableOobee = ruleset.includes(RuleFlags.DISABLE_OOBEE);
  const enableWcagAaa = ruleset.includes(RuleFlags.ENABLE_WCAG_AAA);
@@ -399,7 +401,7 @@ export const runAxeScript = async ({
        help: 'Clickable elements (i.e. elements with mouse-click interaction) must have accessible labels.',
        helpUrl: 'https://www.deque.com/blog/accessible-aria-buttons',
        nodes: escapedCssSelectors
-          .map(cssSelector => ({
+          .map((cssSelector: string): NodeResult => ({
            html: findElementByCssSelector(cssSelector),
            target: [cssSelector],
            impact: 'serious' as ImpactValue,
@@ -443,8 +445,7 @@ export const runAxeScript = async ({
      framesCheckFunctionString: framesCheck.toString(),
      findElementByCssSelectorFunctionString: findElementByCssSelector.toString(),
      getAxeConfigurationFunctionString: getAxeConfiguration.toString(),
-      flagUnlabelledClickableElementsFunctionString:
-        flagUnlabelledClickableElements.toString(),
+      flagUnlabelledClickableElementsFunctionString: flagUnlabelledClickableElements.toString(),
      xPathToCssFunctionString: xPathToCss.toString(),
    },
  );
@@ -495,7 +496,7 @@ export const postNavigationHooks = [
  },
 ];
 
-export const failedRequestHandler = async ({ request }) => {
+export const failedRequestHandler = async ({ request }: { request: Request }) => {
  guiInfoLog(guiInfoStatusTypes.ERROR, { numScanned: 0, urlScanned: request.url });
  crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
 };
package/src/crawlers/crawlDomain.ts
CHANGED
@@ -9,7 +9,6 @@ import https from 'https';
 import type { BatchAddRequestsResult } from '@crawlee/types';
 import {
   createCrawleeSubFolders,
-  preNavigationHooks,
   runAxeScript,
   isUrlPdf,
 } from './commonCrawlerFunc.js';
@@ -19,6 +18,7 @@ import constants, {
   guiInfoStatusTypes,
   cssQuerySelectors,
   RuleFlags,
+  STATUS_CODE_METADATA,
 } from '../constants/constants.js';
 import {
   getPlaywrightLaunchOptions,
@@ -26,7 +26,6 @@ import {
   isSkippedUrl,
   isDisallowedInRobotsTxt,
   getUrlsFromRobotsTxt,
-  getBlackListedPatterns,
   urlWithoutAuth,
   waitForPageLoaded,
   initModifiedUserAgent,
@@ -116,13 +115,12 @@ const crawlDomain = async ({
    fs.mkdirSync(randomToken);
  }
 
-  const pdfDownloads = [];
-  const uuidToPdfMapping = {};
+  const pdfDownloads: Promise<void>[] = [];
+  const uuidToPdfMapping: Record<string, string> = {};
  const isScanHtml = ['all', 'html-only'].includes(fileTypes);
  const isScanPdfs = ['all', 'pdf-only'].includes(fileTypes);
  const { maxConcurrency } = constants;
  const { playwrightDeviceDetailsObject } = viewportSettings;
-  const isBlacklistedUrl = isBlacklisted(url, blacklistedPatterns);
 
  const httpsAgent = new https.Agent({ rejectUnauthorized: false });
 
@@ -167,8 +165,8 @@ const crawlDomain = async ({
  const httpHeadCache = new Map<string, boolean>();
  const isProcessibleUrl = async (url: string): Promise<boolean> => {
    if (httpHeadCache.has(url)) {
-      silentLogger.info(
-      return false; // return false to avoid processing the url again
+      silentLogger.info(`Skipping request as URL has been processed before ${url}}`);
+      return false; // return false to avoid processing the same url again
    }
 
    try {
@@ -490,56 +488,35 @@ const crawlDomain = async ({
      return new Promise(resolve => {
        let timeout;
        let mutationCount = 0;
-        const MAX_MUTATIONS
-        const
-
-
-        const observer = new MutationObserver(mutationsList => {
+        const MAX_MUTATIONS = 250; // stop if things never quiet down
+        const OBSERVER_TIMEOUT = 5000; // hard cap on total wait
+
+        const observer = new MutationObserver(() => {
          clearTimeout(timeout);
-
-          mutationCount
-
+
+          mutationCount++;
          if (mutationCount > MAX_MUTATIONS) {
            observer.disconnect();
-            resolve('Too many mutations
+            resolve('Too many mutations, exiting.');
+            return;
          }
-
-          //
-          mutationsList.forEach(mutation => {
-            let mutationKey;
-
-            if (mutation.target instanceof Element) {
-              Array.from(mutation.target.attributes).forEach(attr => {
-                mutationKey = `${mutation.target.nodeName}-${attr.name}`;
-
-                if (mutationKey) {
-                  if (!mutationHash[mutationKey]) {
-                    mutationHash[mutationKey] = 1;
-                  } else {
-                    mutationHash[mutationKey]++;
-                  }
-
-                  if (mutationHash[mutationKey] >= MAX_SAME_MUTATION_LIMIT) {
-                    observer.disconnect();
-                    resolve(`Repeated mutation detected for ${mutationKey}`);
-                  }
-                }
-              });
-            }
-          });
-
+
+          // restart quiet-period timer
          timeout = setTimeout(() => {
            observer.disconnect();
-            resolve('DOM stabilized
+            resolve('DOM stabilized.');
          }, 1000);
        });
-
+
+        // overall timeout in case the page never settles
        timeout = setTimeout(() => {
          observer.disconnect();
-          resolve('
-          },
-
-
+          resolve('Observer timeout reached.');
+        }, OBSERVER_TIMEOUT);
+
+        // **HERE**: select the real DOM node inside evaluate
+        const root = document.documentElement;
+        observer.observe(root, { childList: true, subtree: true });
      });
    });
 
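The rewritten waiter drops the per-mutation bookkeeping (mutationHash and MAX_SAME_MUTATION_LIMIT) in favour of a quiet-period timer plus a hard cap. A self-contained sketch of that pattern, assuming it runs in a browser context (as it does here, inside page.evaluate, where MutationObserver and document are available):

// Standalone sketch of the simplified "wait for the DOM to settle" pattern above.
function waitForDomToSettle(quietMs = 1000, hardCapMs = 5000, maxMutations = 250): Promise<string> {
  return new Promise(resolve => {
    let timeout: ReturnType<typeof setTimeout> | undefined;
    let mutationCount = 0;

    const observer = new MutationObserver(() => {
      clearTimeout(timeout);
      mutationCount += 1;
      if (mutationCount > maxMutations) {
        observer.disconnect();
        resolve('Too many mutations, exiting.');
        return;
      }
      // restart the quiet-period timer on every burst of mutations
      timeout = setTimeout(() => {
        observer.disconnect();
        resolve('DOM stabilized.');
      }, quietMs);
    });

    // hard cap so the promise always resolves
    timeout = setTimeout(() => {
      observer.disconnect();
      resolve('Observer timeout reached.');
    }, hardCapMs);

    observer.observe(document.documentElement, { childList: true, subtree: true });
  });
}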
@@ -641,10 +618,12 @@ const crawlDomain = async ({
          numScanned: urlsCrawled.scanned.length,
          urlScanned: request.url,
        });
-        urlsCrawled.
+        urlsCrawled.userExcluded.push({
          url: request.url,
          pageTitle: request.url,
-          actualUrl:
+          actualUrl: request.url, // because about:blank is not useful
+          metadata: STATUS_CODE_METADATA[1],
+          httpStatusCode: 0,
        });
 
        return;
@@ -666,10 +645,12 @@ const crawlDomain = async ({
          numScanned: urlsCrawled.scanned.length,
          urlScanned: request.url,
        });
-        urlsCrawled.
+        urlsCrawled.userExcluded.push({
          url: request.url,
          pageTitle: request.url,
-          actualUrl: actualUrl, //
+          actualUrl: actualUrl, // because about:blank is not useful
+          metadata: STATUS_CODE_METADATA[1],
+          httpStatusCode: 0,
        });
 
        return;
@@ -680,38 +661,16 @@ const crawlDomain = async ({
          url: request.url,
          pageTitle: request.url,
          actualUrl: actualUrl,
+          metadata: STATUS_CODE_METADATA[0],
+          httpStatusCode: 0,
        });
-
-        await enqueueProcess(page, enqueueLinks, browserContext);
-        return;
-      }
-
-      if (response && response.status() === 403) {
-        guiInfoLog(guiInfoStatusTypes.SKIPPED, {
-          numScanned: urlsCrawled.scanned.length,
-          urlScanned: request.url,
-        });
-        urlsCrawled.forbidden.push({
-          url: request.url,
-          pageTitle: request.url,
-          actualUrl: actualUrl, // i.e. actualUrl
-        });
-
-        return;
-      }
-
-      if (response && response.status() !== 200) {
-
+
        guiInfoLog(guiInfoStatusTypes.SKIPPED, {
          numScanned: urlsCrawled.scanned.length,
          urlScanned: request.url,
        });
-        urlsCrawled.invalid.push({
-          url: request.url,
-          pageTitle: request.url,
-          actualUrl: actualUrl, // i.e. actualUrl
-        });
 
+        await enqueueProcess(page, enqueueLinks, browserContext);
        return;
      }
 
@@ -733,6 +692,22 @@ const crawlDomain = async ({
        return;
      }
 
+      const responseStatus = response?.status();
+      if (responseStatus && responseStatus >= 300) {
+        guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+          numScanned: urlsCrawled.scanned.length,
+          urlScanned: request.url,
+        });
+        urlsCrawled.userExcluded.push({
+          url: request.url,
+          pageTitle: request.url,
+          actualUrl,
+          metadata: STATUS_CODE_METADATA[responseStatus] || STATUS_CODE_METADATA[599],
+          httpStatusCode: responseStatus,
+        });
+        return;
+      }
+
      const results = await runAxeScript({ includeScreenshots, page, randomToken, ruleset });
 
      if (isRedirected) {
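The separate 403 and non-200 branches are collapsed into a single responseStatus >= 300 branch that records a description from STATUS_CODE_METADATA, falling back to the 599 entry for uncommon codes. The table itself lives in src/constants/constants.ts (also changed in this release) and is not shown here; the sketch below assumes a simple Record<number, string> shape purely to illustrate the lookup-with-fallback:

// Hypothetical shape for STATUS_CODE_METADATA, for illustration only; the real
// table in src/constants/constants.ts is not part of this excerpt.
const STATUS_CODE_METADATA_SKETCH: Record<number, string> = {
  2: 'Placeholder: metadata used when the crawler errors without a response',
  403: 'Placeholder: metadata for HTTP 403 responses',
  599: 'Placeholder: fallback metadata for uncommon status codes',
};

const describeResponse = (status?: number): string =>
  typeof status === 'number'
    ? STATUS_CODE_METADATA_SKETCH[status] || STATUS_CODE_METADATA_SKETCH[599]
    : STATUS_CODE_METADATA_SKETCH[2];

// describeResponse(403)       => the 403 entry
// describeResponse(418)       => falls back to the 599 entry
// describeResponse(undefined) => the no-response entry (key 2)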
@@ -790,10 +765,12 @@ const crawlDomain = async ({
          numScanned: urlsCrawled.scanned.length,
          urlScanned: request.url,
        });
-        urlsCrawled.
+        urlsCrawled.userExcluded.push({
          url: request.url,
          pageTitle: request.url,
-          actualUrl: actualUrl, //
+          actualUrl: actualUrl, // because about:blank is not useful
+          metadata: STATUS_CODE_METADATA[1],
+          httpStatusCode: 0,
        });
 
      }
@@ -833,18 +810,39 @@ const crawlDomain = async ({
      // when max pages have been scanned, scan will abort and all relevant pages still opened will close instantly.
      // a browser close error will then be flagged. Since this is an intended behaviour, this error will be excluded.
      if (!isAbortingScanNow) {
-
+        guiInfoLog(guiInfoStatusTypes.ERROR, {
+          numScanned: urlsCrawled.scanned.length,
+          urlScanned: request.url,
+        });
+
+        urlsCrawled.error.push({
+          url: request.url,
+          pageTitle: request.url,
+          actualUrl: request.url,
+          metadata: STATUS_CODE_METADATA[2]
+        });
      }
    }
  },
-  failedRequestHandler: async ({ request }) => {
+  failedRequestHandler: async ({ request, response }) => {
    guiInfoLog(guiInfoStatusTypes.ERROR, {
      numScanned: urlsCrawled.scanned.length,
      urlScanned: request.url,
    });
-
-
-
+
+    const status = response?.status();
+    const metadata = typeof status === 'number'
+      ? (STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599])
+      : STATUS_CODE_METADATA[2];
+
+    urlsCrawled.error.push({
+      url: request.url,
+      pageTitle: request.url,
+      actualUrl: request.url,
+      metadata,
+      httpStatusCode: typeof status === 'number' ? status : 0,
+    });
+
  },
  maxRequestsPerCrawl: Infinity,
  maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
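Both the failedRequestHandler above and the new userExcluded/error pushes earlier populate the same record fields. The authoritative type lives in src/types/types.ts (changed in this release but not shown in this excerpt); the sketch below merely restates the fields these call sites set, with assumed types:

// Sketch of the record shape pushed into urlsCrawled.* at the call sites above.
// Field types are assumptions based on the values used in these hunks.
interface CrawledPageEntrySketch {
  url: string;
  pageTitle: string;
  actualUrl: string;
  metadata?: string;       // description looked up from STATUS_CODE_METADATA
  httpStatusCode?: number; // 0 when no HTTP response was received
}

const exampleEntry: CrawledPageEntrySketch = {
  url: 'https://example.com/broken',
  pageTitle: 'https://example.com/broken',
  actualUrl: 'https://example.com/broken',
  metadata: 'Placeholder description for a 404 response',
  httpStatusCode: 404,
};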
package/src/crawlers/crawlIntelligentSitemap.ts
CHANGED
@@ -1,27 +1,29 @@
 import fs from 'fs';
-import { chromium } from 'playwright';
+import { chromium, Page } from 'playwright';
 import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
 import constants, { guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
 import { silentLogger, guiInfoLog } from '../logs.js';
 import crawlDomain from './crawlDomain.js';
 import crawlSitemap from './crawlSitemap.js';
+import { EnqueueStrategy } from 'crawlee';
+import { ViewportSettingsClass } from '../combine.js';
 
 const crawlIntelligentSitemap = async (
-  url,
-  randomToken,
-  host,
-  viewportSettings,
-  maxRequestsPerCrawl,
-  browser,
-  userDataDirectory,
-  strategy,
-  specifiedMaxConcurrency,
-  fileTypes,
-  blacklistedPatterns,
-  includeScreenshots,
-  followRobots,
-  extraHTTPHeaders,
-  safeMode,
+  url: string,
+  randomToken: string,
+  host: string,
+  viewportSettings: ViewportSettingsClass,
+  maxRequestsPerCrawl: number,
+  browser: string,
+  userDataDirectory: string,
+  strategy: EnqueueStrategy,
+  specifiedMaxConcurrency: number,
+  fileTypes: string,
+  blacklistedPatterns: string[],
+  includeScreenshots: boolean,
+  followRobots: boolean,
+  extraHTTPHeaders: Record<string, string>,
+  safeMode: boolean,
 ) => {
   let urlsCrawledFinal;
   let urlsCrawled;
@@ -37,7 +39,7 @@ const crawlIntelligentSitemap = async (
    fs.mkdirSync(randomToken);
  }
 
-  function getHomeUrl(parsedUrl) {
+  function getHomeUrl(parsedUrl: string) {
    const urlObject = new URL(parsedUrl);
    if (urlObject.username !== '' && urlObject.password !== '') {
      return `${urlObject.protocol}//${urlObject.username}:${urlObject.password}@${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
@@ -46,7 +48,7 @@ const crawlIntelligentSitemap = async (
    return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
  }
 
-  async function findSitemap(link) {
+  async function findSitemap(link: string) {
    const homeUrl = getHomeUrl(link);
    let sitemapLinkFound = false;
    let sitemapLink = '';
@@ -70,7 +72,7 @@ const crawlIntelligentSitemap = async (
    return sitemapExist ? sitemapLink : '';
  }
 
-  const checkUrlExists = async (page, parsedUrl) => {
+  const checkUrlExists = async (page: Page, parsedUrl: string) => {
    try {
      const response = await page.goto(parsedUrl);
      if (response.ok()) {
|