@govtechsg/oobee 0.10.50 → 0.10.57
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/bump-package-version.yml +58 -0
- package/.github/workflows/image.yml +38 -17
- package/DETAILS.md +5 -2
- package/INTEGRATION.md +57 -53
- package/README.md +4 -1
- package/__tests__/test-sitemap-url-patterns.xml +105 -0
- package/exclusions.txt +1 -0
- package/package.json +7 -6
- package/src/cli.ts +35 -2
- package/src/combine.ts +10 -7
- package/src/constants/cliFunctions.ts +9 -0
- package/src/constants/common.ts +95 -105
- package/src/constants/constants.ts +47 -2
- package/src/crawlers/commonCrawlerFunc.ts +50 -5
- package/src/crawlers/crawlDomain.ts +112 -73
- package/src/crawlers/crawlIntelligentSitemap.ts +40 -36
- package/src/crawlers/crawlLocalFile.ts +77 -35
- package/src/crawlers/crawlSitemap.ts +156 -89
- package/src/index.ts +2 -0
- package/src/logs.ts +4 -2
- package/src/mergeAxeResults.ts +20 -9
- package/src/npmIndex.ts +1 -1
- package/src/screenshotFunc/htmlScreenshotFunc.ts +7 -5
- package/src/screenshotFunc/pdfScreenshotFunc.ts +2 -2
- package/src/static/ejs/partials/components/wcagCompliance.ejs +1 -1
- package/src/static/ejs/partials/scripts/ruleOffcanvas.ejs +1 -0
- package/src/static/ejs/partials/styles/styles.ejs +11 -0
- package/src/static/ejs/report.ejs +14 -1
- package/src/utils.ts +3 -3
@@ -7,7 +7,7 @@ import os from 'os';
|
|
7
7
|
import { spawnSync, execSync } from 'child_process';
|
8
8
|
import { chromium } from 'playwright';
|
9
9
|
import * as Sentry from '@sentry/node';
|
10
|
-
import { silentLogger } from '../logs.js';
|
10
|
+
import { consoleLogger, silentLogger } from '../logs.js';
|
11
11
|
import { PageInfo } from '../mergeAxeResults.js';
|
12
12
|
|
13
13
|
const filename = fileURLToPath(import.meta.url);
|
@@ -128,7 +128,7 @@ export const getDefaultChromiumDataDir = () => {
|
|
128
128
|
defaultChromiumDataDir = '/tmp';
|
129
129
|
}
|
130
130
|
|
131
|
-
|
131
|
+
consoleLogger.info(`Using Chromium support directory at ${defaultChromiumDataDir}`);
|
132
132
|
}
|
133
133
|
|
134
134
|
if (defaultChromiumDataDir && fs.existsSync(defaultChromiumDataDir)) {
|
@@ -179,6 +179,7 @@ export const basicAuthRegex = /^.*\/\/.*:.*@.*$/i;
|
|
179
179
|
// for crawlers
|
180
180
|
export const axeScript = path.join(dirname, '../../node_modules/axe-core/axe.min.js');
|
181
181
|
export class UrlsCrawled {
|
182
|
+
siteName: string;
|
182
183
|
toScan: string[] = [];
|
183
184
|
scanned: PageInfo[] = [];
|
184
185
|
invalid: PageInfo[] = [];
|
@@ -361,6 +362,7 @@ const wcagLinks = {
|
|
361
362
|
// 'WCAG 1.4.10': 'https://www.w3.org/TR/WCAG22/#reflow', - TODO: review for veraPDF
|
362
363
|
'WCAG 1.4.12': 'https://www.w3.org/TR/WCAG22/#text-spacing',
|
363
364
|
'WCAG 2.1.1': 'https://www.w3.org/TR/WCAG22/#keyboard',
|
365
|
+
'WCAG 2.1.3': 'https://www.w3.org/WAI/WCAG22/Understanding/keyboard-no-exception.html', // AAA
|
364
366
|
'WCAG 2.2.1': 'https://www.w3.org/TR/WCAG22/#timing-adjustable',
|
365
367
|
'WCAG 2.2.2': 'https://www.w3.org/TR/WCAG22/#pause-stop-hide',
|
366
368
|
'WCAG 2.2.4': 'https://www.w3.org/TR/WCAG22/#interruptions', // AAA
|
@@ -564,3 +566,46 @@ export const STATUS_CODE_METADATA: Record<number,string> = {
|
|
564
566
|
511: '511 - Network Authentication Required',
|
565
567
|
|
566
568
|
};
|
569
|
+
|
570
|
+
/**
 * URL prefixes that must never be clicked or enqueued by the crawlers:
 * in-page fragments (`#`) and non-web URL schemes (mail, telephony, chat
 * apps, Office deep links, browser-internal pages, ...).
 *
 * With reference from https://chromeenterprise.google/policies/url-patterns/
 *
 * Every entry is lowercase. Callers in this package lowercase the candidate
 * URL/href and then test it with `startsWith(prefix)`.
 */
export const disallowedListOfPatterns = [
  '#',
  'mailto:',
  'tel:',
  'sms:',
  'skype:',
  'zoommtg:',
  'msteams:',
  'whatsapp:',
  'slack:',
  'viber:',
  'tg:',
  'line:',
  'meet:',
  'facetime:',
  'imessage:',
  'discord:',
  'sgnl:',
  'webex:',
  'intent:',
  'ms-outlook:',
  'ms-onedrive:',
  'ms-word:',
  'ms-excel:',
  'ms-powerpoint:',
  'ms-office:',
  'onenote:',
  'vs:',
  'chrome-extension:',
  'chrome-search:',
  'chrome:',
  'chrome-untrusted:',
  'devtools:',
  'isolated-app:',
];

/**
 * Comma-separated CSS selector list with one `a[href^="<prefix>"]` entry per
 * disallowed prefix, intended to be embedded in an `a:not(...)` selector.
 *
 * Any whitespace is stripped from the joined string; the current prefixes
 * contain none, so today this is a no-op kept for safety.
 *
 * NOTE(review): CSS attribute selectors match `href` values case-sensitively,
 * so an anchor written as `MAILTO:...` would not be matched by this selector
 * even though the string-based checks lowercase first — confirm whether an
 * `i` modifier is wanted here.
 */
export const disallowedSelectorPatterns = disallowedListOfPatterns
  .map(pattern => `a[href^="${pattern}"]`)
  .join(',')
  .replace(/\s+/g, '');
|
@@ -1,13 +1,14 @@
|
|
1
1
|
import crawlee, { CrawlingContext, PlaywrightGotoOptions, Request } from 'crawlee';
|
2
2
|
import axe, { AxeResults, ImpactValue, NodeResult, Result, resultGroups, TagValue } from 'axe-core';
|
3
|
-
import { BrowserContext, Page } from 'playwright';
|
3
|
+
import { BrowserContext, ElementHandle, Page } from 'playwright';
|
4
4
|
import {
|
5
5
|
axeScript,
|
6
|
+
disallowedListOfPatterns,
|
6
7
|
guiInfoStatusTypes,
|
7
8
|
RuleFlags,
|
8
9
|
saflyIconSelector,
|
9
10
|
} from '../constants/constants.js';
|
10
|
-
import { guiInfoLog, silentLogger } from '../logs.js';
|
11
|
+
import { consoleLogger, guiInfoLog, silentLogger } from '../logs.js';
|
11
12
|
import { takeScreenshotForHTMLElements } from '../screenshotFunc/htmlScreenshotFunc.js';
|
12
13
|
import { isFilePath } from '../constants/common.js';
|
13
14
|
import { extractAndGradeText } from './custom/extractAndGradeText.js';
|
@@ -305,7 +306,7 @@ export const runAxeScript = async ({
|
|
305
306
|
});
|
306
307
|
});
|
307
308
|
} catch (e) {
|
308
|
-
|
309
|
+
// do nothing, just continue
|
309
310
|
}
|
310
311
|
|
311
312
|
// Omit logging of browser console errors to reduce unnecessary verbosity
|
@@ -459,9 +460,9 @@ export const runAxeScript = async ({
|
|
459
460
|
try {
|
460
461
|
pageTitle = await page.evaluate(() => document.title);
|
461
462
|
} catch (e) {
|
462
|
-
|
463
|
+
consoleLogger.info(`Error while getting page title: ${e}`);
|
463
464
|
if (page.isClosed()) {
|
464
|
-
|
465
|
+
consoleLogger.info(`Page was closed for ${requestUrl}, creating new page`);
|
465
466
|
page = await browserContext.newPage();
|
466
467
|
await page.goto(requestUrl, { waitUntil: 'domcontentloaded' });
|
467
468
|
pageTitle = await page.evaluate(() => document.title);
|
@@ -508,3 +509,47 @@ export const isUrlPdf = (url: string) => {
|
|
508
509
|
const parsedUrl = new URL(url);
|
509
510
|
return /\.pdf($|\?|#)/i.test(parsedUrl.pathname) || /\.pdf($|\?|#)/i.test(parsedUrl.href);
|
510
511
|
};
|
512
|
+
|
513
|
+
/**
 * Decides whether a click on `element` should be skipped because the element
 * is, contains, or is nested inside an `<a>` whose `href` starts with one of
 * the prefixes in `disallowedListOfPatterns`.
 *
 * @param page - Playwright page used to evaluate the check.
 * @param element - Handle of the element that is about to be clicked.
 * @returns `true` when a disallowed anchor is found among the element's
 *   descendants, the element itself, or its ancestors; otherwise `false`.
 */
export async function shouldSkipClickDueToDisallowedHref(
  page: Page,
  element: ElementHandle,
): Promise<boolean> {
  return page.evaluate(
    ({ el, disallowedPrefixes }) => {
      // This callback is handed to page.evaluate, so it must stay
      // self-contained: everything it needs arrives through its argument.
      const isDisallowedHref = (href: string | null): boolean => {
        if (!href) return false;
        // Trim before comparing: browsers ignore leading/trailing whitespace
        // in href values, so " mailto:x" still resolves as a mailto: link.
        const normalised = href.trim().toLowerCase();
        return disallowedPrefixes.some((prefix: string) => normalised.startsWith(prefix));
      };

      const castEl = el as HTMLElement;

      // Any descendant <a href> with a disallowed target?
      for (const anchor of castEl.querySelectorAll('a[href]')) {
        if (isDisallowedHref(anchor.getAttribute('href'))) {
          return true;
        }
      }

      // Walk from the element itself up through its ancestors.
      for (
        let current: HTMLElement | null = castEl;
        current;
        current = current.parentElement
      ) {
        if (current.tagName === 'A' && isDisallowedHref(current.getAttribute('href'))) {
          return true;
        }
      }

      return false;
    },
    {
      el: element,
      disallowedPrefixes: disallowedListOfPatterns,
    },
  );
}
|
@@ -11,6 +11,7 @@ import {
|
|
11
11
|
createCrawleeSubFolders,
|
12
12
|
runAxeScript,
|
13
13
|
isUrlPdf,
|
14
|
+
shouldSkipClickDueToDisallowedHref,
|
14
15
|
} from './commonCrawlerFunc.js';
|
15
16
|
import constants, {
|
16
17
|
UrlsCrawled,
|
@@ -19,6 +20,8 @@ import constants, {
|
|
19
20
|
cssQuerySelectors,
|
20
21
|
RuleFlags,
|
21
22
|
STATUS_CODE_METADATA,
|
23
|
+
disallowedListOfPatterns,
|
24
|
+
disallowedSelectorPatterns,
|
22
25
|
} from '../constants/constants.js';
|
23
26
|
import {
|
24
27
|
getPlaywrightLaunchOptions,
|
@@ -37,7 +40,7 @@ import {
|
|
37
40
|
mapPdfScanResults,
|
38
41
|
doPdfScreenshots,
|
39
42
|
} from './pdfScanFunc.js';
|
40
|
-
import {
|
43
|
+
import { consoleLogger, guiInfoLog, silentLogger } from '../logs.js';
|
41
44
|
import { ViewportSettingsClass } from '../combine.js';
|
42
45
|
|
43
46
|
const isBlacklisted = (url: string, blacklistedPatterns: string[]) => {
|
@@ -71,6 +74,7 @@ const crawlDomain = async ({
|
|
71
74
|
includeScreenshots,
|
72
75
|
followRobots,
|
73
76
|
extraHTTPHeaders,
|
77
|
+
scanDuration = 0,
|
74
78
|
safeMode = false,
|
75
79
|
fromCrawlIntelligentSitemap = false,
|
76
80
|
datasetFromIntelligent = null,
|
@@ -91,12 +95,14 @@ const crawlDomain = async ({
|
|
91
95
|
includeScreenshots: boolean;
|
92
96
|
followRobots: boolean;
|
93
97
|
extraHTTPHeaders: Record<string, string>;
|
98
|
+
scanDuration?: number;
|
94
99
|
safeMode?: boolean;
|
95
100
|
fromCrawlIntelligentSitemap?: boolean;
|
96
101
|
datasetFromIntelligent?: crawlee.Dataset;
|
97
102
|
urlsCrawledFromIntelligent?: UrlsCrawled;
|
98
103
|
ruleset?: RuleFlags[];
|
99
104
|
}) => {
|
105
|
+
const crawlStartTime = Date.now();
|
100
106
|
let dataset: crawlee.Dataset;
|
101
107
|
let urlsCrawled: UrlsCrawled;
|
102
108
|
let requestQueue: crawlee.RequestQueue;
|
@@ -165,7 +171,7 @@ const crawlDomain = async ({
|
|
165
171
|
const httpHeadCache = new Map<string, boolean>();
|
166
172
|
const isProcessibleUrl = async (url: string): Promise<boolean> => {
|
167
173
|
if (httpHeadCache.has(url)) {
|
168
|
-
|
174
|
+
consoleLogger.info(`Skipping request as URL has been processed before: ${url}}`);
|
169
175
|
return false; // return false to avoid processing the same url again
|
170
176
|
}
|
171
177
|
|
@@ -180,14 +186,14 @@ const crawlDomain = async ({
|
|
180
186
|
|
181
187
|
// Check if the response suggests it's a downloadable file based on Content-Disposition header
|
182
188
|
if (contentDisposition.includes('attachment')) {
|
183
|
-
|
189
|
+
consoleLogger.info(`Skipping URL due to attachment header: ${url}`);
|
184
190
|
httpHeadCache.set(url, false);
|
185
191
|
return false;
|
186
192
|
}
|
187
193
|
|
188
194
|
// Check if the MIME type suggests it's a downloadable file
|
189
195
|
if (contentType.startsWith('application/') || contentType.includes('octet-stream')) {
|
190
|
-
|
196
|
+
consoleLogger.info(`Skipping potential downloadable file: ${contentType} at URL ${url}`);
|
191
197
|
httpHeadCache.set(url, false);
|
192
198
|
return false;
|
193
199
|
}
|
@@ -195,14 +201,14 @@ const crawlDomain = async ({
|
|
195
201
|
// Use the mime-types library to ensure it's processible content (e.g., HTML or plain text)
|
196
202
|
const mimeType = mime.lookup(contentType);
|
197
203
|
if (mimeType && !mimeType.startsWith('text/html') && !mimeType.startsWith('text/')) {
|
198
|
-
|
204
|
+
consoleLogger.info(`Detected non-processible MIME type: ${mimeType} at URL ${url}`);
|
199
205
|
httpHeadCache.set(url, false);
|
200
206
|
return false;
|
201
207
|
}
|
202
208
|
|
203
209
|
// Additional check for zip files by their magic number (PK\x03\x04)
|
204
210
|
if (url.endsWith('.zip')) {
|
205
|
-
|
211
|
+
consoleLogger.info(`Checking for zip file magic number at URL ${url}`);
|
206
212
|
|
207
213
|
// Download the first few bytes of the file to check for the magic number
|
208
214
|
const byteResponse = await axios.get(url, {
|
@@ -213,11 +219,11 @@ const crawlDomain = async ({
|
|
213
219
|
|
214
220
|
const magicNumber = byteResponse.data.toString('hex');
|
215
221
|
if (magicNumber === '504b0304') {
|
216
|
-
|
222
|
+
consoleLogger.info(`Skipping zip file at URL ${url}`);
|
217
223
|
httpHeadCache.set(url, false);
|
218
224
|
return false;
|
219
225
|
}
|
220
|
-
|
226
|
+
consoleLogger.info(
|
221
227
|
`Not skipping ${url}, magic number does not match ZIP file: ${magicNumber}`,
|
222
228
|
);
|
223
229
|
}
|
@@ -235,12 +241,12 @@ const crawlDomain = async ({
|
|
235
241
|
!fileType.mime.startsWith('text/html') &&
|
236
242
|
!fileType.mime.startsWith('text/')
|
237
243
|
) {
|
238
|
-
|
244
|
+
consoleLogger.info(`Detected downloadable file of type ${fileType.mime} at URL ${url}`);
|
239
245
|
httpHeadCache.set(url, false);
|
240
246
|
return false;
|
241
247
|
}
|
242
248
|
} catch (e) {
|
243
|
-
//
|
249
|
+
// consoleLogger.error(`Error checking the MIME type of ${url}: ${e.message}`);
|
244
250
|
// If an error occurs (e.g., a network issue), assume the URL is processible
|
245
251
|
httpHeadCache.set(url, true);
|
246
252
|
return true;
|
@@ -259,14 +265,14 @@ const crawlDomain = async ({
|
|
259
265
|
try {
|
260
266
|
await enqueueLinks({
|
261
267
|
// select anchor elements, except those whose href starts with a disallowed prefix (e.g. # or mailto:)
|
262
|
-
selector:
|
268
|
+
selector: `a:not(${disallowedSelectorPatterns})`,
|
263
269
|
strategy,
|
264
270
|
requestQueue,
|
265
271
|
transformRequestFunction: (req: RequestOptions): RequestOptions | null => {
|
266
272
|
try {
|
267
273
|
req.url = req.url.replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
|
268
274
|
} catch (e) {
|
269
|
-
|
275
|
+
consoleLogger.error(e);
|
270
276
|
}
|
271
277
|
if (urlsCrawled.scanned.some(item => item.url === req.url)) {
|
272
278
|
req.skipNavigation = true;
|
@@ -288,7 +294,7 @@ const crawlDomain = async ({
|
|
288
294
|
try {
|
289
295
|
await customEnqueueLinksByClickingElements(page, browserContext);
|
290
296
|
} catch (e) {
|
291
|
-
|
297
|
+
// do nothing;
|
292
298
|
}
|
293
299
|
}
|
294
300
|
} catch {
|
@@ -307,7 +313,10 @@ const crawlDomain = async ({
|
|
307
313
|
const isAlreadyScanned: boolean = urlsCrawled.scanned.some(item => item.url === newPageUrl);
|
308
314
|
const isBlacklistedUrl: boolean = isBlacklisted(newPageUrl, blacklistedPatterns);
|
309
315
|
const isNotFollowStrategy: boolean = !isFollowStrategy(newPageUrl, initialPageUrl, strategy);
|
310
|
-
|
316
|
+
const isNotSupportedDocument: boolean = disallowedListOfPatterns.some(pattern =>
|
317
|
+
newPageUrl.toLowerCase().startsWith(pattern),
|
318
|
+
);
|
319
|
+
return isNotSupportedDocument || isAlreadyScanned || isBlacklistedUrl || isNotFollowStrategy;
|
311
320
|
};
|
312
321
|
const setPageListeners = (page: Page): void => {
|
313
322
|
// event listener to handle new page popups upon button click
|
@@ -431,6 +440,16 @@ const crawlDomain = async ({
|
|
431
440
|
});
|
432
441
|
} else if (!newUrlFoundInElement) {
|
433
442
|
try {
|
443
|
+
const shouldSkip = await shouldSkipClickDueToDisallowedHref(page, element);
|
444
|
+
if (shouldSkip) {
|
445
|
+
const elementHtml = await page.evaluate(el => el.outerHTML, element);
|
446
|
+
consoleLogger.info(
|
447
|
+
'Skipping a click due to disallowed href nearby. Element HTML:',
|
448
|
+
elementHtml,
|
449
|
+
);
|
450
|
+
continue;
|
451
|
+
}
|
452
|
+
|
434
453
|
// Find url in html elements by manually clicking them. New page navigation/popups will be handled by event listeners above
|
435
454
|
await element.click({ force: true });
|
436
455
|
await page.waitForTimeout(1000); // Add a delay of 1 second between each Element click
|
@@ -455,7 +474,7 @@ const crawlDomain = async ({
|
|
455
474
|
}
|
456
475
|
|
457
476
|
await initModifiedUserAgent(browser, playwrightDeviceDetailsObject);
|
458
|
-
|
477
|
+
|
459
478
|
const crawler = new crawlee.PlaywrightCrawler({
|
460
479
|
launchContext: {
|
461
480
|
launcher: constants.launcher,
|
@@ -486,36 +505,35 @@ const crawlDomain = async ({
|
|
486
505
|
return new Promise(resolve => {
|
487
506
|
let timeout;
|
488
507
|
let mutationCount = 0;
|
489
|
-
const MAX_MUTATIONS
|
490
|
-
const OBSERVER_TIMEOUT
|
491
|
-
|
508
|
+
const MAX_MUTATIONS = 250; // stop if things never quiet down
|
509
|
+
const OBSERVER_TIMEOUT = 5000; // hard cap on total wait
|
510
|
+
|
492
511
|
const observer = new MutationObserver(() => {
|
493
512
|
clearTimeout(timeout);
|
494
|
-
|
513
|
+
|
495
514
|
mutationCount++;
|
496
515
|
if (mutationCount > MAX_MUTATIONS) {
|
497
516
|
observer.disconnect();
|
498
517
|
resolve('Too many mutations, exiting.');
|
499
518
|
return;
|
500
519
|
}
|
501
|
-
|
520
|
+
|
502
521
|
// restart quiet‑period timer
|
503
522
|
timeout = setTimeout(() => {
|
504
523
|
observer.disconnect();
|
505
524
|
resolve('DOM stabilized.');
|
506
525
|
}, 1000);
|
507
526
|
});
|
508
|
-
|
527
|
+
|
509
528
|
// overall timeout in case the page never settles
|
510
529
|
timeout = setTimeout(() => {
|
511
530
|
observer.disconnect();
|
512
531
|
resolve('Observer timeout reached.');
|
513
532
|
}, OBSERVER_TIMEOUT);
|
514
|
-
|
533
|
+
|
515
534
|
const root = document.documentElement || document.body || document;
|
516
535
|
if (!root || typeof observer.observe !== 'function') {
|
517
536
|
resolve('No root node to observe.');
|
518
|
-
return;
|
519
537
|
}
|
520
538
|
});
|
521
539
|
});
|
@@ -539,31 +557,31 @@ const crawlDomain = async ({
|
|
539
557
|
],
|
540
558
|
preNavigationHooks: isBasicAuth
|
541
559
|
? [
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
560
|
+
async ({ page, request }) => {
|
561
|
+
await page.setExtraHTTPHeaders({
|
562
|
+
Authorization: authHeader,
|
563
|
+
...extraHTTPHeaders,
|
564
|
+
});
|
565
|
+
const processible = await isProcessibleUrl(request.url);
|
566
|
+
if (!processible) {
|
567
|
+
request.skipNavigation = true;
|
568
|
+
return null;
|
569
|
+
}
|
570
|
+
},
|
571
|
+
]
|
554
572
|
: [
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
573
|
+
async ({ page, request }) => {
|
574
|
+
await page.setExtraHTTPHeaders({
|
575
|
+
...extraHTTPHeaders,
|
576
|
+
});
|
559
577
|
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
578
|
+
const processible = await isProcessibleUrl(request.url);
|
579
|
+
if (!processible) {
|
580
|
+
request.skipNavigation = true;
|
581
|
+
return null;
|
582
|
+
}
|
583
|
+
},
|
584
|
+
],
|
567
585
|
requestHandlerTimeoutSecs: 90, // Allow each page to be processed by up from default 60 seconds
|
568
586
|
requestHandler: async ({ page, request, response, crawler, sendRequest, enqueueLinks }) => {
|
569
587
|
const browserContext: BrowserContext = page.context();
|
@@ -586,7 +604,10 @@ const crawlDomain = async ({
|
|
586
604
|
actualUrl = page.url();
|
587
605
|
}
|
588
606
|
|
589
|
-
if (
|
607
|
+
if (
|
608
|
+
!isFollowStrategy(url, actualUrl, strategy) &&
|
609
|
+
(isBlacklisted(actualUrl, blacklistedPatterns) || (isUrlPdf(actualUrl) && !isScanPdfs))
|
610
|
+
) {
|
590
611
|
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
591
612
|
numScanned: urlsCrawled.scanned.length,
|
592
613
|
urlScanned: actualUrl,
|
@@ -594,7 +615,13 @@ const crawlDomain = async ({
|
|
594
615
|
return;
|
595
616
|
}
|
596
617
|
|
597
|
-
|
618
|
+
const hasExceededDuration =
|
619
|
+
scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
|
620
|
+
|
621
|
+
if (urlsCrawled.scanned.length >= maxRequestsPerCrawl || hasExceededDuration) {
|
622
|
+
if (hasExceededDuration) {
|
623
|
+
console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting website crawl.`);
|
624
|
+
}
|
598
625
|
isAbortingScanNow = true;
|
599
626
|
crawler.autoscaledPool.abort();
|
600
627
|
return;
|
@@ -612,7 +639,7 @@ const crawlDomain = async ({
|
|
612
639
|
}
|
613
640
|
|
614
641
|
// handle pdfs
|
615
|
-
if (request.skipNavigation && actualUrl ===
|
642
|
+
if (request.skipNavigation && actualUrl === 'about:blank') {
|
616
643
|
if (!isScanPdfs) {
|
617
644
|
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
618
645
|
numScanned: urlsCrawled.scanned.length,
|
@@ -648,7 +675,7 @@ const crawlDomain = async ({
|
|
648
675
|
urlsCrawled.userExcluded.push({
|
649
676
|
url: request.url,
|
650
677
|
pageTitle: request.url,
|
651
|
-
actualUrl
|
678
|
+
actualUrl, // because about:blank is not useful
|
652
679
|
metadata: STATUS_CODE_METADATA[1],
|
653
680
|
httpStatusCode: 0,
|
654
681
|
});
|
@@ -656,15 +683,19 @@ const crawlDomain = async ({
|
|
656
683
|
return;
|
657
684
|
}
|
658
685
|
|
659
|
-
if (
|
686
|
+
if (
|
687
|
+
!isFollowStrategy(url, actualUrl, strategy) &&
|
688
|
+
blacklistedPatterns &&
|
689
|
+
isSkippedUrl(actualUrl, blacklistedPatterns)
|
690
|
+
) {
|
660
691
|
urlsCrawled.userExcluded.push({
|
661
692
|
url: request.url,
|
662
693
|
pageTitle: request.url,
|
663
|
-
actualUrl
|
694
|
+
actualUrl,
|
664
695
|
metadata: STATUS_CODE_METADATA[0],
|
665
696
|
httpStatusCode: 0,
|
666
697
|
});
|
667
|
-
|
698
|
+
|
668
699
|
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
669
700
|
numScanned: urlsCrawled.scanned.length,
|
670
701
|
urlScanned: request.url,
|
@@ -679,11 +710,7 @@ const crawlDomain = async ({
|
|
679
710
|
const isRedirected = !areLinksEqual(actualUrl, request.url);
|
680
711
|
|
681
712
|
// check if redirected link is following strategy (same-domain/same-hostname)
|
682
|
-
const isLoadedUrlFollowStrategy = isFollowStrategy(
|
683
|
-
actualUrl,
|
684
|
-
request.url,
|
685
|
-
strategy,
|
686
|
-
);
|
713
|
+
const isLoadedUrlFollowStrategy = isFollowStrategy(actualUrl, request.url, strategy);
|
687
714
|
if (isRedirected && !isLoadedUrlFollowStrategy) {
|
688
715
|
urlsCrawled.notScannedRedirects.push({
|
689
716
|
fromUrl: request.url,
|
@@ -693,7 +720,7 @@ const crawlDomain = async ({
|
|
693
720
|
}
|
694
721
|
|
695
722
|
const responseStatus = response?.status();
|
696
|
-
|
723
|
+
if (responseStatus && responseStatus >= 300) {
|
697
724
|
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
698
725
|
numScanned: urlsCrawled.scanned.length,
|
699
726
|
urlScanned: request.url,
|
@@ -706,7 +733,7 @@ const crawlDomain = async ({
|
|
706
733
|
httpStatusCode: responseStatus,
|
707
734
|
});
|
708
735
|
return;
|
709
|
-
|
736
|
+
}
|
710
737
|
|
711
738
|
const results = await runAxeScript({ includeScreenshots, page, randomToken, ruleset });
|
712
739
|
|
@@ -733,7 +760,7 @@ const crawlDomain = async ({
|
|
733
760
|
urlsCrawled.scanned.push({
|
734
761
|
url: urlWithoutAuth(request.url),
|
735
762
|
pageTitle: results.pageTitle,
|
736
|
-
actualUrl
|
763
|
+
actualUrl, // i.e. actualUrl
|
737
764
|
});
|
738
765
|
|
739
766
|
urlsCrawled.scannedRedirects.push({
|
@@ -768,11 +795,10 @@ const crawlDomain = async ({
|
|
768
795
|
urlsCrawled.userExcluded.push({
|
769
796
|
url: request.url,
|
770
797
|
pageTitle: request.url,
|
771
|
-
actualUrl
|
798
|
+
actualUrl, // because about:blank is not useful
|
772
799
|
metadata: STATUS_CODE_METADATA[1],
|
773
800
|
httpStatusCode: 0,
|
774
801
|
});
|
775
|
-
|
776
802
|
}
|
777
803
|
|
778
804
|
if (followRobots) await getUrlsFromRobotsTxt(request.url, browser);
|
@@ -780,7 +806,7 @@ const crawlDomain = async ({
|
|
780
806
|
} catch (e) {
|
781
807
|
try {
|
782
808
|
if (!e.message.includes('page.evaluate')) {
|
783
|
-
|
809
|
+
// report the error to the GUI unless it originated from page.evaluate
|
784
810
|
guiInfoLog(guiInfoStatusTypes.ERROR, {
|
785
811
|
numScanned: urlsCrawled.scanned.length,
|
786
812
|
urlScanned: request.url,
|
@@ -815,11 +841,11 @@ const crawlDomain = async ({
|
|
815
841
|
urlScanned: request.url,
|
816
842
|
});
|
817
843
|
|
818
|
-
urlsCrawled.error.push({
|
819
|
-
url: request.url,
|
820
|
-
pageTitle: request.url,
|
821
|
-
actualUrl: request.url,
|
822
|
-
metadata: STATUS_CODE_METADATA[2]
|
844
|
+
urlsCrawled.error.push({
|
845
|
+
url: request.url,
|
846
|
+
pageTitle: request.url,
|
847
|
+
actualUrl: request.url,
|
848
|
+
metadata: STATUS_CODE_METADATA[2],
|
823
849
|
});
|
824
850
|
}
|
825
851
|
}
|
@@ -831,9 +857,10 @@ const crawlDomain = async ({
|
|
831
857
|
});
|
832
858
|
|
833
859
|
const status = response?.status();
|
834
|
-
const metadata =
|
835
|
-
|
836
|
-
|
860
|
+
const metadata =
|
861
|
+
typeof status === 'number'
|
862
|
+
? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
|
863
|
+
: STATUS_CODE_METADATA[2];
|
837
864
|
|
838
865
|
urlsCrawled.error.push({
|
839
866
|
url: request.url,
|
@@ -842,10 +869,18 @@ const crawlDomain = async ({
|
|
842
869
|
metadata,
|
843
870
|
httpStatusCode: typeof status === 'number' ? status : 0,
|
844
871
|
});
|
845
|
-
|
846
872
|
},
|
847
873
|
maxRequestsPerCrawl: Infinity,
|
848
874
|
maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
|
875
|
+
...(process.env.OOBEE_FAST_CRAWLER && {
|
876
|
+
autoscaledPoolOptions: {
|
877
|
+
minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
|
878
|
+
maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
|
879
|
+
desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
|
880
|
+
scaleUpStepRatio: 0.99, // Scale up faster
|
881
|
+
scaleDownStepRatio: 0.1, // Scale down slower
|
882
|
+
},
|
883
|
+
}),
|
849
884
|
});
|
850
885
|
|
851
886
|
await crawler.run();
|
@@ -875,6 +910,10 @@ const crawlDomain = async ({
|
|
875
910
|
guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
|
876
911
|
}
|
877
912
|
|
913
|
+
if (scanDuration > 0) {
|
914
|
+
const elapsed = Math.round((Date.now() - crawlStartTime) / 1000);
|
915
|
+
console.log(`Crawl ended after ${elapsed}s. Limit: ${scanDuration}s.`);
|
916
|
+
}
|
878
917
|
return urlsCrawled;
|
879
918
|
};
|
880
919
|
|