@govtechsg/oobee 0.10.39 → 0.10.43
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/docker-test.yml +1 -1
- package/README.md +2 -0
- package/REPORTS.md +431 -0
- package/package.json +3 -2
- package/src/cli.ts +2 -11
- package/src/constants/common.ts +68 -52
- package/src/constants/constants.ts +81 -1
- package/src/constants/oobeeAi.ts +6 -6
- package/src/constants/questions.ts +3 -2
- package/src/crawlers/commonCrawlerFunc.ts +45 -16
- package/src/crawlers/crawlDomain.ts +83 -102
- package/src/crawlers/crawlIntelligentSitemap.ts +21 -19
- package/src/crawlers/crawlSitemap.ts +121 -110
- package/src/crawlers/custom/findElementByCssSelector.ts +1 -1
- package/src/crawlers/custom/flagUnlabelledClickableElements.ts +593 -558
- package/src/crawlers/custom/xPathToCss.ts +10 -10
- package/src/crawlers/pdfScanFunc.ts +67 -26
- package/src/crawlers/runCustom.ts +1 -1
- package/src/index.ts +3 -4
- package/src/logs.ts +1 -1
- package/src/mergeAxeResults.ts +305 -242
- package/src/npmIndex.ts +12 -8
- package/src/screenshotFunc/htmlScreenshotFunc.ts +8 -20
- package/src/screenshotFunc/pdfScreenshotFunc.ts +34 -1
- package/src/types/text-readability.d.ts +3 -0
- package/src/types/types.ts +1 -1
- package/src/utils.ts +340 -50
- package/src/xPathToCss.ts +0 -186
- package/src/xPathToCssCypress.ts +0 -178
@@ -9,7 +9,6 @@ import https from 'https';
|
|
9
9
|
import type { BatchAddRequestsResult } from '@crawlee/types';
|
10
10
|
import {
|
11
11
|
createCrawleeSubFolders,
|
12
|
-
preNavigationHooks,
|
13
12
|
runAxeScript,
|
14
13
|
isUrlPdf,
|
15
14
|
} from './commonCrawlerFunc.js';
|
@@ -19,6 +18,7 @@ import constants, {
|
|
19
18
|
guiInfoStatusTypes,
|
20
19
|
cssQuerySelectors,
|
21
20
|
RuleFlags,
|
21
|
+
STATUS_CODE_METADATA,
|
22
22
|
} from '../constants/constants.js';
|
23
23
|
import {
|
24
24
|
getPlaywrightLaunchOptions,
|
@@ -26,7 +26,6 @@ import {
|
|
26
26
|
isSkippedUrl,
|
27
27
|
isDisallowedInRobotsTxt,
|
28
28
|
getUrlsFromRobotsTxt,
|
29
|
-
getBlackListedPatterns,
|
30
29
|
urlWithoutAuth,
|
31
30
|
waitForPageLoaded,
|
32
31
|
initModifiedUserAgent,
|
@@ -116,13 +115,12 @@ const crawlDomain = async ({
|
|
116
115
|
fs.mkdirSync(randomToken);
|
117
116
|
}
|
118
117
|
|
119
|
-
const pdfDownloads = [];
|
120
|
-
const uuidToPdfMapping = {};
|
118
|
+
const pdfDownloads: Promise<void>[] = [];
|
119
|
+
const uuidToPdfMapping: Record<string, string> = {};
|
121
120
|
const isScanHtml = ['all', 'html-only'].includes(fileTypes);
|
122
121
|
const isScanPdfs = ['all', 'pdf-only'].includes(fileTypes);
|
123
122
|
const { maxConcurrency } = constants;
|
124
123
|
const { playwrightDeviceDetailsObject } = viewportSettings;
|
125
|
-
const isBlacklistedUrl = isBlacklisted(url, blacklistedPatterns);
|
126
124
|
|
127
125
|
const httpsAgent = new https.Agent({ rejectUnauthorized: false });
|
128
126
|
|
@@ -167,8 +165,8 @@ const crawlDomain = async ({
|
|
167
165
|
const httpHeadCache = new Map<string, boolean>();
|
168
166
|
const isProcessibleUrl = async (url: string): Promise<boolean> => {
|
169
167
|
if (httpHeadCache.has(url)) {
|
170
|
-
silentLogger.info(
|
171
|
-
return false; // return false to avoid processing the url again
|
168
|
+
silentLogger.info(`Skipping request as URL has been processed before ${url}}`);
|
169
|
+
return false; // return false to avoid processing the same url again
|
172
170
|
}
|
173
171
|
|
174
172
|
try {
|
@@ -490,56 +488,35 @@ const crawlDomain = async ({
|
|
490
488
|
return new Promise(resolve => {
|
491
489
|
let timeout;
|
492
490
|
let mutationCount = 0;
|
493
|
-
const MAX_MUTATIONS
|
494
|
-
const
|
495
|
-
|
496
|
-
|
497
|
-
const observer = new MutationObserver(mutationsList => {
|
491
|
+
const MAX_MUTATIONS = 250; // stop if things never quiet down
|
492
|
+
const OBSERVER_TIMEOUT = 5000; // hard cap on total wait
|
493
|
+
|
494
|
+
const observer = new MutationObserver(() => {
|
498
495
|
clearTimeout(timeout);
|
499
|
-
|
500
|
-
mutationCount
|
501
|
-
|
496
|
+
|
497
|
+
mutationCount++;
|
502
498
|
if (mutationCount > MAX_MUTATIONS) {
|
503
499
|
observer.disconnect();
|
504
|
-
resolve('Too many mutations
|
500
|
+
resolve('Too many mutations, exiting.');
|
501
|
+
return;
|
505
502
|
}
|
506
|
-
|
507
|
-
//
|
508
|
-
mutationsList.forEach(mutation => {
|
509
|
-
let mutationKey;
|
510
|
-
|
511
|
-
if (mutation.target instanceof Element) {
|
512
|
-
Array.from(mutation.target.attributes).forEach(attr => {
|
513
|
-
mutationKey = `${mutation.target.nodeName}-${attr.name}`;
|
514
|
-
|
515
|
-
if (mutationKey) {
|
516
|
-
if (!mutationHash[mutationKey]) {
|
517
|
-
mutationHash[mutationKey] = 1;
|
518
|
-
} else {
|
519
|
-
mutationHash[mutationKey]++;
|
520
|
-
}
|
521
|
-
|
522
|
-
if (mutationHash[mutationKey] >= MAX_SAME_MUTATION_LIMIT) {
|
523
|
-
observer.disconnect();
|
524
|
-
resolve(`Repeated mutation detected for ${mutationKey}`);
|
525
|
-
}
|
526
|
-
}
|
527
|
-
});
|
528
|
-
}
|
529
|
-
});
|
530
|
-
|
503
|
+
|
504
|
+
// restart quiet‑period timer
|
531
505
|
timeout = setTimeout(() => {
|
532
506
|
observer.disconnect();
|
533
|
-
resolve('DOM stabilized
|
507
|
+
resolve('DOM stabilized.');
|
534
508
|
}, 1000);
|
535
509
|
});
|
536
|
-
|
510
|
+
|
511
|
+
// overall timeout in case the page never settles
|
537
512
|
timeout = setTimeout(() => {
|
538
513
|
observer.disconnect();
|
539
|
-
resolve('
|
540
|
-
},
|
541
|
-
|
542
|
-
|
514
|
+
resolve('Observer timeout reached.');
|
515
|
+
}, OBSERVER_TIMEOUT);
|
516
|
+
|
517
|
+
// **HERE**: select the real DOM node inside evaluate
|
518
|
+
const root = document.documentElement;
|
519
|
+
observer.observe(root, { childList: true, subtree: true });
|
543
520
|
});
|
544
521
|
});
|
545
522
|
|
@@ -635,16 +612,18 @@ const crawlDomain = async ({
|
|
635
612
|
}
|
636
613
|
|
637
614
|
// handle pdfs
|
638
|
-
if (request.skipNavigation &&
|
615
|
+
if (request.skipNavigation && actualUrl === "about:blank") {
|
639
616
|
if (!isScanPdfs) {
|
640
617
|
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
641
618
|
numScanned: urlsCrawled.scanned.length,
|
642
619
|
urlScanned: request.url,
|
643
620
|
});
|
644
|
-
urlsCrawled.
|
621
|
+
urlsCrawled.userExcluded.push({
|
645
622
|
url: request.url,
|
646
623
|
pageTitle: request.url,
|
647
|
-
actualUrl:
|
624
|
+
actualUrl: request.url, // because about:blank is not useful
|
625
|
+
metadata: STATUS_CODE_METADATA[1],
|
626
|
+
httpStatusCode: 0,
|
648
627
|
});
|
649
628
|
|
650
629
|
return;
|
@@ -661,33 +640,17 @@ const crawlDomain = async ({
|
|
661
640
|
return;
|
662
641
|
}
|
663
642
|
|
664
|
-
const resHeaders = response ? response.headers() : {}; // Safely access response headers
|
665
|
-
const contentType = resHeaders['content-type'] || ''; // Ensure contentType is defined
|
666
|
-
|
667
|
-
// Skip non-HTML and non-PDF URLs
|
668
|
-
if (!contentType.includes('text/html') && !contentType.includes('application/pdf')) {
|
669
|
-
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
670
|
-
numScanned: urlsCrawled.scanned.length,
|
671
|
-
urlScanned: request.url,
|
672
|
-
});
|
673
|
-
urlsCrawled.blacklisted.push({
|
674
|
-
url: request.url,
|
675
|
-
pageTitle: request.url,
|
676
|
-
actualUrl: actualUrl, // i.e. actualUrl
|
677
|
-
});
|
678
|
-
|
679
|
-
return;
|
680
|
-
}
|
681
|
-
|
682
643
|
if (isBlacklistedFileExtensions(actualUrl, blackListedFileExtensions)) {
|
683
644
|
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
684
645
|
numScanned: urlsCrawled.scanned.length,
|
685
646
|
urlScanned: request.url,
|
686
647
|
});
|
687
|
-
urlsCrawled.
|
648
|
+
urlsCrawled.userExcluded.push({
|
688
649
|
url: request.url,
|
689
650
|
pageTitle: request.url,
|
690
|
-
actualUrl: actualUrl, //
|
651
|
+
actualUrl: actualUrl, // because about:blank is not useful
|
652
|
+
metadata: STATUS_CODE_METADATA[1],
|
653
|
+
httpStatusCode: 0,
|
691
654
|
});
|
692
655
|
|
693
656
|
return;
|
@@ -698,37 +661,16 @@ const crawlDomain = async ({
|
|
698
661
|
url: request.url,
|
699
662
|
pageTitle: request.url,
|
700
663
|
actualUrl: actualUrl,
|
664
|
+
metadata: STATUS_CODE_METADATA[0],
|
665
|
+
httpStatusCode: 0,
|
701
666
|
});
|
702
|
-
|
703
|
-
await enqueueProcess(page, enqueueLinks, browserContext);
|
704
|
-
return;
|
705
|
-
}
|
706
|
-
|
707
|
-
if (response.status() === 403) {
|
667
|
+
|
708
668
|
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
709
669
|
numScanned: urlsCrawled.scanned.length,
|
710
670
|
urlScanned: request.url,
|
711
671
|
});
|
712
|
-
urlsCrawled.forbidden.push({
|
713
|
-
url: request.url,
|
714
|
-
pageTitle: request.url,
|
715
|
-
actualUrl: actualUrl, // i.e. actualUrl
|
716
|
-
});
|
717
|
-
|
718
|
-
return;
|
719
|
-
}
|
720
|
-
|
721
|
-
if (response.status() !== 200) {
|
722
|
-
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
723
|
-
numScanned: urlsCrawled.scanned.length,
|
724
|
-
urlScanned: request.url,
|
725
|
-
});
|
726
|
-
urlsCrawled.invalid.push({
|
727
|
-
url: request.url,
|
728
|
-
pageTitle: request.url,
|
729
|
-
actualUrl: actualUrl, // i.e. actualUrl
|
730
|
-
});
|
731
672
|
|
673
|
+
await enqueueProcess(page, enqueueLinks, browserContext);
|
732
674
|
return;
|
733
675
|
}
|
734
676
|
|
@@ -750,6 +692,22 @@ const crawlDomain = async ({
|
|
750
692
|
return;
|
751
693
|
}
|
752
694
|
|
695
|
+
const responseStatus = response?.status();
|
696
|
+
if (responseStatus && responseStatus >= 300) {
|
697
|
+
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
698
|
+
numScanned: urlsCrawled.scanned.length,
|
699
|
+
urlScanned: request.url,
|
700
|
+
});
|
701
|
+
urlsCrawled.userExcluded.push({
|
702
|
+
url: request.url,
|
703
|
+
pageTitle: request.url,
|
704
|
+
actualUrl,
|
705
|
+
metadata: STATUS_CODE_METADATA[responseStatus] || STATUS_CODE_METADATA[599],
|
706
|
+
httpStatusCode: responseStatus,
|
707
|
+
});
|
708
|
+
return;
|
709
|
+
}
|
710
|
+
|
753
711
|
const results = await runAxeScript({ includeScreenshots, page, randomToken, ruleset });
|
754
712
|
|
755
713
|
if (isRedirected) {
|
@@ -807,10 +765,12 @@ const crawlDomain = async ({
|
|
807
765
|
numScanned: urlsCrawled.scanned.length,
|
808
766
|
urlScanned: request.url,
|
809
767
|
});
|
810
|
-
urlsCrawled.
|
768
|
+
urlsCrawled.userExcluded.push({
|
811
769
|
url: request.url,
|
812
770
|
pageTitle: request.url,
|
813
|
-
actualUrl: actualUrl, //
|
771
|
+
actualUrl: actualUrl, // because about:blank is not useful
|
772
|
+
metadata: STATUS_CODE_METADATA[1],
|
773
|
+
httpStatusCode: 0,
|
814
774
|
});
|
815
775
|
|
816
776
|
}
|
@@ -850,18 +810,39 @@ const crawlDomain = async ({
|
|
850
810
|
// when max pages have been scanned, scan will abort and all relevant pages still opened will close instantly.
|
851
811
|
// a browser close error will then be flagged. Since this is an intended behaviour, this error will be excluded.
|
852
812
|
if (!isAbortingScanNow) {
|
853
|
-
|
813
|
+
guiInfoLog(guiInfoStatusTypes.ERROR, {
|
814
|
+
numScanned: urlsCrawled.scanned.length,
|
815
|
+
urlScanned: request.url,
|
816
|
+
});
|
817
|
+
|
818
|
+
urlsCrawled.error.push({
|
819
|
+
url: request.url,
|
820
|
+
pageTitle: request.url,
|
821
|
+
actualUrl: request.url,
|
822
|
+
metadata: STATUS_CODE_METADATA[2]
|
823
|
+
});
|
854
824
|
}
|
855
825
|
}
|
856
826
|
},
|
857
|
-
failedRequestHandler: async ({ request }) => {
|
827
|
+
failedRequestHandler: async ({ request, response }) => {
|
858
828
|
guiInfoLog(guiInfoStatusTypes.ERROR, {
|
859
829
|
numScanned: urlsCrawled.scanned.length,
|
860
830
|
urlScanned: request.url,
|
861
831
|
});
|
862
|
-
|
863
|
-
|
864
|
-
|
832
|
+
|
833
|
+
const status = response?.status();
|
834
|
+
const metadata = typeof status === 'number'
|
835
|
+
? (STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599])
|
836
|
+
: STATUS_CODE_METADATA[2];
|
837
|
+
|
838
|
+
urlsCrawled.error.push({
|
839
|
+
url: request.url,
|
840
|
+
pageTitle: request.url,
|
841
|
+
actualUrl: request.url,
|
842
|
+
metadata,
|
843
|
+
httpStatusCode: typeof status === 'number' ? status : 0,
|
844
|
+
});
|
845
|
+
|
865
846
|
},
|
866
847
|
maxRequestsPerCrawl: Infinity,
|
867
848
|
maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
|
@@ -1,27 +1,29 @@
|
|
1
1
|
import fs from 'fs';
|
2
|
-
import { chromium } from 'playwright';
|
2
|
+
import { chromium, Page } from 'playwright';
|
3
3
|
import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
|
4
4
|
import constants, { guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
|
5
5
|
import { silentLogger, guiInfoLog } from '../logs.js';
|
6
6
|
import crawlDomain from './crawlDomain.js';
|
7
7
|
import crawlSitemap from './crawlSitemap.js';
|
8
|
+
import { EnqueueStrategy } from 'crawlee';
|
9
|
+
import { ViewportSettingsClass } from '../combine.js';
|
8
10
|
|
9
11
|
const crawlIntelligentSitemap = async (
|
10
|
-
url,
|
11
|
-
randomToken,
|
12
|
-
host,
|
13
|
-
viewportSettings,
|
14
|
-
maxRequestsPerCrawl,
|
15
|
-
browser,
|
16
|
-
userDataDirectory,
|
17
|
-
strategy,
|
18
|
-
specifiedMaxConcurrency,
|
19
|
-
fileTypes,
|
20
|
-
blacklistedPatterns,
|
21
|
-
includeScreenshots,
|
22
|
-
followRobots,
|
23
|
-
extraHTTPHeaders,
|
24
|
-
safeMode,
|
12
|
+
url: string,
|
13
|
+
randomToken: string,
|
14
|
+
host: string,
|
15
|
+
viewportSettings: ViewportSettingsClass,
|
16
|
+
maxRequestsPerCrawl: number,
|
17
|
+
browser: string,
|
18
|
+
userDataDirectory: string,
|
19
|
+
strategy: EnqueueStrategy,
|
20
|
+
specifiedMaxConcurrency: number,
|
21
|
+
fileTypes: string,
|
22
|
+
blacklistedPatterns: string[],
|
23
|
+
includeScreenshots: boolean,
|
24
|
+
followRobots: boolean,
|
25
|
+
extraHTTPHeaders: Record<string, string>,
|
26
|
+
safeMode: boolean,
|
25
27
|
) => {
|
26
28
|
let urlsCrawledFinal;
|
27
29
|
let urlsCrawled;
|
@@ -37,7 +39,7 @@ const crawlIntelligentSitemap = async (
|
|
37
39
|
fs.mkdirSync(randomToken);
|
38
40
|
}
|
39
41
|
|
40
|
-
function getHomeUrl(parsedUrl) {
|
42
|
+
function getHomeUrl(parsedUrl: string) {
|
41
43
|
const urlObject = new URL(parsedUrl);
|
42
44
|
if (urlObject.username !== '' && urlObject.password !== '') {
|
43
45
|
return `${urlObject.protocol}//${urlObject.username}:${urlObject.password}@${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
|
@@ -46,7 +48,7 @@ const crawlIntelligentSitemap = async (
|
|
46
48
|
return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
|
47
49
|
}
|
48
50
|
|
49
|
-
async function findSitemap(link) {
|
51
|
+
async function findSitemap(link: string) {
|
50
52
|
const homeUrl = getHomeUrl(link);
|
51
53
|
let sitemapLinkFound = false;
|
52
54
|
let sitemapLink = '';
|
@@ -70,7 +72,7 @@ const crawlIntelligentSitemap = async (
|
|
70
72
|
return sitemapExist ? sitemapLink : '';
|
71
73
|
}
|
72
74
|
|
73
|
-
const checkUrlExists = async (page, parsedUrl) => {
|
75
|
+
const checkUrlExists = async (page: Page, parsedUrl: string) => {
|
74
76
|
try {
|
75
77
|
const response = await page.goto(parsedUrl);
|
76
78
|
if (response.ok()) {
|