@govtechsg/oobee 0.10.50 → 0.10.57

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,7 +7,7 @@ import os from 'os';
  import { spawnSync, execSync } from 'child_process';
  import { chromium } from 'playwright';
  import * as Sentry from '@sentry/node';
- import { silentLogger } from '../logs.js';
+ import { consoleLogger, silentLogger } from '../logs.js';
  import { PageInfo } from '../mergeAxeResults.js';

  const filename = fileURLToPath(import.meta.url);
@@ -128,7 +128,7 @@ export const getDefaultChromiumDataDir = () => {
  defaultChromiumDataDir = '/tmp';
  }

- silentLogger.warn(`Using Chromium support directory at ${defaultChromiumDataDir}`);
+ consoleLogger.info(`Using Chromium support directory at ${defaultChromiumDataDir}`);
  }

  if (defaultChromiumDataDir && fs.existsSync(defaultChromiumDataDir)) {
@@ -179,6 +179,7 @@ export const basicAuthRegex = /^.*\/\/.*:.*@.*$/i;
  // for crawlers
  export const axeScript = path.join(dirname, '../../node_modules/axe-core/axe.min.js');
  export class UrlsCrawled {
+ siteName: string;
  toScan: string[] = [];
  scanned: PageInfo[] = [];
  invalid: PageInfo[] = [];
@@ -361,6 +362,7 @@ const wcagLinks = {
  // 'WCAG 1.4.10': 'https://www.w3.org/TR/WCAG22/#reflow', - TODO: review for veraPDF
  'WCAG 1.4.12': 'https://www.w3.org/TR/WCAG22/#text-spacing',
  'WCAG 2.1.1': 'https://www.w3.org/TR/WCAG22/#keyboard',
+ 'WCAG 2.1.3': 'https://www.w3.org/WAI/WCAG22/Understanding/keyboard-no-exception.html', // AAA
  'WCAG 2.2.1': 'https://www.w3.org/TR/WCAG22/#timing-adjustable',
  'WCAG 2.2.2': 'https://www.w3.org/TR/WCAG22/#pause-stop-hide',
  'WCAG 2.2.4': 'https://www.w3.org/TR/WCAG22/#interruptions', // AAA
@@ -564,3 +566,46 @@ export const STATUS_CODE_METADATA: Record<number,string> = {
  511: '511 - Network Authentication Required',

  };
+
+ // Elements that should not be clicked or enqueued
+ // With reference from https://chromeenterprise.google/policies/url-patterns/
+ export const disallowedListOfPatterns = [
+ "#",
+ "mailto:",
+ "tel:",
+ "sms:",
+ "skype:",
+ "zoommtg:",
+ "msteams:",
+ "whatsapp:",
+ "slack:",
+ "viber:",
+ "tg:",
+ "line:",
+ "meet:",
+ "facetime:",
+ "imessage:",
+ "discord:",
+ "sgnl:",
+ "webex:",
+ "intent:",
+ "ms-outlook:",
+ "ms-onedrive:",
+ "ms-word:",
+ "ms-excel:",
+ "ms-powerpoint:",
+ "ms-office:",
+ "onenote:",
+ "vs:",
+ "chrome-extension:",
+ "chrome-search:",
+ "chrome:",
+ "chrome-untrusted:",
+ "devtools:",
+ "isolated-app:"
+ ];
+
+ export const disallowedSelectorPatterns = disallowedListOfPatterns
+ .map(pattern => `a[href^="${pattern}"]`)
+ .join(',')
+ .replace(/\s+/g, '');
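The new disallowedSelectorPatterns export collapses the prefix list into a single comma-separated CSS selector fragment, which the crawler later drops into an a:not(...) selector when enqueueing links (see the crawlDomain changes further below). A minimal sketch of the derivation, using a shortened, illustrative pattern list:

    // Sketch only: a trimmed-down pattern list to show the shape of the generated selector.
    const patterns = ['#', 'mailto:', 'tel:'];
    const selectorPatterns = patterns
      .map(pattern => `a[href^="${pattern}"]`)
      .join(',')
      .replace(/\s+/g, '');
    // selectorPatterns === 'a[href^="#"],a[href^="mailto:"],a[href^="tel:"]'
    // The crawler then excludes these anchors with a selector like `a:not(${selectorPatterns})`.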
@@ -1,13 +1,14 @@
  import crawlee, { CrawlingContext, PlaywrightGotoOptions, Request } from 'crawlee';
  import axe, { AxeResults, ImpactValue, NodeResult, Result, resultGroups, TagValue } from 'axe-core';
- import { BrowserContext, Page } from 'playwright';
+ import { BrowserContext, ElementHandle, Page } from 'playwright';
  import {
  axeScript,
+ disallowedListOfPatterns,
  guiInfoStatusTypes,
  RuleFlags,
  saflyIconSelector,
  } from '../constants/constants.js';
- import { guiInfoLog, silentLogger } from '../logs.js';
+ import { consoleLogger, guiInfoLog, silentLogger } from '../logs.js';
  import { takeScreenshotForHTMLElements } from '../screenshotFunc/htmlScreenshotFunc.js';
  import { isFilePath } from '../constants/common.js';
  import { extractAndGradeText } from './custom/extractAndGradeText.js';
@@ -305,7 +306,7 @@ export const runAxeScript = async ({
  });
  });
  } catch (e) {
- silentLogger.warn(`Error while checking for DOM mutations: ${e}`);
+ // do nothing, just continue
  }

  // Omit logging of browser console errors to reduce unnecessary verbosity
@@ -459,9 +460,9 @@ export const runAxeScript = async ({
  try {
  pageTitle = await page.evaluate(() => document.title);
  } catch (e) {
- silentLogger.warn(`Error while getting page title: ${e}`);
+ consoleLogger.info(`Error while getting page title: ${e}`);
  if (page.isClosed()) {
- silentLogger.info(`Page was closed for ${requestUrl}, creating new page`);
+ consoleLogger.info(`Page was closed for ${requestUrl}, creating new page`);
  page = await browserContext.newPage();
  await page.goto(requestUrl, { waitUntil: 'domcontentloaded' });
  pageTitle = await page.evaluate(() => document.title);
@@ -508,3 +509,47 @@ export const isUrlPdf = (url: string) => {
  const parsedUrl = new URL(url);
  return /\.pdf($|\?|#)/i.test(parsedUrl.pathname) || /\.pdf($|\?|#)/i.test(parsedUrl.href);
  };
+
+ export async function shouldSkipClickDueToDisallowedHref(
+ page: Page,
+ element: ElementHandle
+ ): Promise<boolean> {
+ return await page.evaluate(
+ ({ el, disallowedPrefixes }) => {
+ function isDisallowedHref(href: string | null): boolean {
+ if (!href) return false;
+ href = href.toLowerCase();
+ return disallowedPrefixes.some((prefix: string) => href.startsWith(prefix));
+ }
+
+ const castEl = el as HTMLElement;
+
+ // Check descendant <a href="">
+ const descendants = castEl.querySelectorAll('a[href]');
+ for (const a of descendants) {
+ const href = a.getAttribute('href');
+ if (isDisallowedHref(href)) {
+ return true;
+ }
+ }
+
+ // Check self and ancestors for disallowed <a>
+ let current: HTMLElement | null = castEl;
+ while (current) {
+ if (
+ current.tagName === 'A' &&
+ isDisallowedHref(current.getAttribute('href'))
+ ) {
+ return true;
+ }
+ current = current.parentElement;
+ }
+
+ return false;
+ },
+ {
+ el: element,
+ disallowedPrefixes: disallowedListOfPatterns,
+ }
+ );
+ }
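The helper added above runs inside the browser context and checks the clicked element itself, its descendants, and its ancestors for an anchor whose href starts with one of the disallowed prefixes. A hedged usage sketch (the selector below is illustrative, not taken from the package):

    // Illustrative only: guard a programmatic click with the new helper.
    const candidate = await page.$('button.load-more'); // hypothetical element
    if (candidate && !(await shouldSkipClickDueToDisallowedHref(page, candidate))) {
      await candidate.click({ force: true });
    }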
@@ -11,6 +11,7 @@ import {
  createCrawleeSubFolders,
  runAxeScript,
  isUrlPdf,
+ shouldSkipClickDueToDisallowedHref,
  } from './commonCrawlerFunc.js';
  import constants, {
  UrlsCrawled,
@@ -19,6 +20,8 @@ import constants, {
  cssQuerySelectors,
  RuleFlags,
  STATUS_CODE_METADATA,
+ disallowedListOfPatterns,
+ disallowedSelectorPatterns,
  } from '../constants/constants.js';
  import {
  getPlaywrightLaunchOptions,
@@ -37,7 +40,7 @@ import {
  mapPdfScanResults,
  doPdfScreenshots,
  } from './pdfScanFunc.js';
- import { silentLogger, guiInfoLog } from '../logs.js';
+ import { consoleLogger, guiInfoLog, silentLogger } from '../logs.js';
  import { ViewportSettingsClass } from '../combine.js';

  const isBlacklisted = (url: string, blacklistedPatterns: string[]) => {
@@ -71,6 +74,7 @@ const crawlDomain = async ({
  includeScreenshots,
  followRobots,
  extraHTTPHeaders,
+ scanDuration = 0,
  safeMode = false,
  fromCrawlIntelligentSitemap = false,
  datasetFromIntelligent = null,
@@ -91,12 +95,14 @@ const crawlDomain = async ({
  includeScreenshots: boolean;
  followRobots: boolean;
  extraHTTPHeaders: Record<string, string>;
+ scanDuration?: number;
  safeMode?: boolean;
  fromCrawlIntelligentSitemap?: boolean;
  datasetFromIntelligent?: crawlee.Dataset;
  urlsCrawledFromIntelligent?: UrlsCrawled;
  ruleset?: RuleFlags[];
  }) => {
+ const crawlStartTime = Date.now();
  let dataset: crawlee.Dataset;
  let urlsCrawled: UrlsCrawled;
  let requestQueue: crawlee.RequestQueue;
@@ -165,7 +171,7 @@ const crawlDomain = async ({
  const httpHeadCache = new Map<string, boolean>();
  const isProcessibleUrl = async (url: string): Promise<boolean> => {
  if (httpHeadCache.has(url)) {
- silentLogger.info(`Skipping request as URL has been processed before ${url}}`);
+ consoleLogger.info(`Skipping request as URL has been processed before: ${url}}`);
  return false; // return false to avoid processing the same url again
  }

@@ -180,14 +186,14 @@ const crawlDomain = async ({

  // Check if the response suggests it's a downloadable file based on Content-Disposition header
  if (contentDisposition.includes('attachment')) {
- silentLogger.info(`Skipping URL due to attachment header: ${url}`);
+ consoleLogger.info(`Skipping URL due to attachment header: ${url}`);
  httpHeadCache.set(url, false);
  return false;
  }

  // Check if the MIME type suggests it's a downloadable file
  if (contentType.startsWith('application/') || contentType.includes('octet-stream')) {
- silentLogger.info(`Skipping potential downloadable file: ${contentType} at URL ${url}`);
+ consoleLogger.info(`Skipping potential downloadable file: ${contentType} at URL ${url}`);
  httpHeadCache.set(url, false);
  return false;
  }
@@ -195,14 +201,14 @@ const crawlDomain = async ({
  // Use the mime-types library to ensure it's processible content (e.g., HTML or plain text)
  const mimeType = mime.lookup(contentType);
  if (mimeType && !mimeType.startsWith('text/html') && !mimeType.startsWith('text/')) {
- silentLogger.info(`Detected non-processible MIME type: ${mimeType} at URL ${url}`);
+ consoleLogger.info(`Detected non-processible MIME type: ${mimeType} at URL ${url}`);
  httpHeadCache.set(url, false);
  return false;
  }

  // Additional check for zip files by their magic number (PK\x03\x04)
  if (url.endsWith('.zip')) {
- silentLogger.info(`Checking for zip file magic number at URL ${url}`);
+ consoleLogger.info(`Checking for zip file magic number at URL ${url}`);

  // Download the first few bytes of the file to check for the magic number
  const byteResponse = await axios.get(url, {
@@ -213,11 +219,11 @@ const crawlDomain = async ({

  const magicNumber = byteResponse.data.toString('hex');
  if (magicNumber === '504b0304') {
- silentLogger.info(`Skipping zip file at URL ${url}`);
+ consoleLogger.info(`Skipping zip file at URL ${url}`);
  httpHeadCache.set(url, false);
  return false;
  }
- silentLogger.info(
+ consoleLogger.info(
  `Not skipping ${url}, magic number does not match ZIP file: ${magicNumber}`,
  );
  }
@@ -235,12 +241,12 @@ const crawlDomain = async ({
  !fileType.mime.startsWith('text/html') &&
  !fileType.mime.startsWith('text/')
  ) {
- silentLogger.info(`Detected downloadable file of type ${fileType.mime} at URL ${url}`);
+ consoleLogger.info(`Detected downloadable file of type ${fileType.mime} at URL ${url}`);
  httpHeadCache.set(url, false);
  return false;
  }
  } catch (e) {
- // silentLogger.error(`Error checking the MIME type of ${url}: ${e.message}`);
+ // consoleLogger.error(`Error checking the MIME type of ${url}: ${e.message}`);
  // If an error occurs (e.g., a network issue), assume the URL is processible
  httpHeadCache.set(url, true);
  return true;
@@ -259,14 +265,14 @@ const crawlDomain = async ({
  try {
  await enqueueLinks({
  // set selector matches anchor elements with href but not contains # or starting with mailto:
- selector: 'a:not(a[href*="#"],a[href^="mailto:"])',
+ selector: `a:not(${disallowedSelectorPatterns})`,
  strategy,
  requestQueue,
  transformRequestFunction: (req: RequestOptions): RequestOptions | null => {
  try {
  req.url = req.url.replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
  } catch (e) {
- silentLogger.error(e);
+ consoleLogger.error(e);
  }
  if (urlsCrawled.scanned.some(item => item.url === req.url)) {
  req.skipNavigation = true;
@@ -288,7 +294,7 @@ const crawlDomain = async ({
  try {
  await customEnqueueLinksByClickingElements(page, browserContext);
  } catch (e) {
- silentLogger.info(e);
+ // do nothing;
  }
  }
  } catch {
@@ -307,7 +313,10 @@ const crawlDomain = async ({
  const isAlreadyScanned: boolean = urlsCrawled.scanned.some(item => item.url === newPageUrl);
  const isBlacklistedUrl: boolean = isBlacklisted(newPageUrl, blacklistedPatterns);
  const isNotFollowStrategy: boolean = !isFollowStrategy(newPageUrl, initialPageUrl, strategy);
- return isAlreadyScanned || isBlacklistedUrl || isNotFollowStrategy;
+ const isNotSupportedDocument: boolean = disallowedListOfPatterns.some(pattern =>
+ newPageUrl.toLowerCase().startsWith(pattern),
+ );
+ return isNotSupportedDocument || isAlreadyScanned || isBlacklistedUrl || isNotFollowStrategy;
  };
  const setPageListeners = (page: Page): void => {
  // event listener to handle new page popups upon button click
@@ -431,6 +440,16 @@ const crawlDomain = async ({
  });
  } else if (!newUrlFoundInElement) {
  try {
+ const shouldSkip = await shouldSkipClickDueToDisallowedHref(page, element);
+ if (shouldSkip) {
+ const elementHtml = await page.evaluate(el => el.outerHTML, element);
+ consoleLogger.info(
+ 'Skipping a click due to disallowed href nearby. Element HTML:',
+ elementHtml,
+ );
+ continue;
+ }
+
  // Find url in html elements by manually clicking them. New page navigation/popups will be handled by event listeners above
  await element.click({ force: true });
  await page.waitForTimeout(1000); // Add a delay of 1 second between each Element click
@@ -455,7 +474,7 @@ const crawlDomain = async ({
  }

  await initModifiedUserAgent(browser, playwrightDeviceDetailsObject);
-
+
  const crawler = new crawlee.PlaywrightCrawler({
  launchContext: {
  launcher: constants.launcher,
@@ -486,36 +505,35 @@ const crawlDomain = async ({
  return new Promise(resolve => {
  let timeout;
  let mutationCount = 0;
- const MAX_MUTATIONS = 250; // stop if things never quiet down
- const OBSERVER_TIMEOUT = 5000; // hard cap on total wait
-
+ const MAX_MUTATIONS = 250; // stop if things never quiet down
+ const OBSERVER_TIMEOUT = 5000; // hard cap on total wait
+
  const observer = new MutationObserver(() => {
  clearTimeout(timeout);
-
+
  mutationCount++;
  if (mutationCount > MAX_MUTATIONS) {
  observer.disconnect();
  resolve('Too many mutations, exiting.');
  return;
  }
-
+
  // restart quiet‑period timer
  timeout = setTimeout(() => {
  observer.disconnect();
  resolve('DOM stabilized.');
  }, 1000);
  });
-
+
  // overall timeout in case the page never settles
  timeout = setTimeout(() => {
  observer.disconnect();
  resolve('Observer timeout reached.');
  }, OBSERVER_TIMEOUT);
-
+
  const root = document.documentElement || document.body || document;
  if (!root || typeof observer.observe !== 'function') {
  resolve('No root node to observe.');
- return;
  }
  });
  });
@@ -539,31 +557,31 @@ const crawlDomain = async ({
  ],
  preNavigationHooks: isBasicAuth
  ? [
- async ({ page, request }) => {
- await page.setExtraHTTPHeaders({
- Authorization: authHeader,
- ...extraHTTPHeaders,
- });
- const processible = await isProcessibleUrl(request.url);
- if (!processible) {
- request.skipNavigation = true;
- return null;
- }
- },
- ]
+ async ({ page, request }) => {
+ await page.setExtraHTTPHeaders({
+ Authorization: authHeader,
+ ...extraHTTPHeaders,
+ });
+ const processible = await isProcessibleUrl(request.url);
+ if (!processible) {
+ request.skipNavigation = true;
+ return null;
+ }
+ },
+ ]
  : [
- async ({ page, request }) => {
- await page.setExtraHTTPHeaders({
- ...extraHTTPHeaders,
- });
+ async ({ page, request }) => {
+ await page.setExtraHTTPHeaders({
+ ...extraHTTPHeaders,
+ });

- const processible = await isProcessibleUrl(request.url);
- if (!processible) {
- request.skipNavigation = true;
- return null;
- }
- },
- ],
+ const processible = await isProcessibleUrl(request.url);
+ if (!processible) {
+ request.skipNavigation = true;
+ return null;
+ }
+ },
+ ],
  requestHandlerTimeoutSecs: 90, // Allow each page to be processed by up from default 60 seconds
  requestHandler: async ({ page, request, response, crawler, sendRequest, enqueueLinks }) => {
  const browserContext: BrowserContext = page.context();
@@ -586,7 +604,10 @@ const crawlDomain = async ({
  actualUrl = page.url();
  }

- if (!isFollowStrategy(url, actualUrl, strategy) && (isBlacklisted(actualUrl, blacklistedPatterns) || (isUrlPdf(actualUrl) && !isScanPdfs))) {
+ if (
+ !isFollowStrategy(url, actualUrl, strategy) &&
+ (isBlacklisted(actualUrl, blacklistedPatterns) || (isUrlPdf(actualUrl) && !isScanPdfs))
+ ) {
  guiInfoLog(guiInfoStatusTypes.SKIPPED, {
  numScanned: urlsCrawled.scanned.length,
  urlScanned: actualUrl,
@@ -594,7 +615,13 @@ const crawlDomain = async ({
  return;
  }

- if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) {
+ const hasExceededDuration =
+ scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
+
+ if (urlsCrawled.scanned.length >= maxRequestsPerCrawl || hasExceededDuration) {
+ if (hasExceededDuration) {
+ console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting website crawl.`);
+ }
  isAbortingScanNow = true;
  crawler.autoscaledPool.abort();
  return;
@@ -612,7 +639,7 @@ const crawlDomain = async ({
  }

  // handle pdfs
- if (request.skipNavigation && actualUrl === "about:blank") {
+ if (request.skipNavigation && actualUrl === 'about:blank') {
  if (!isScanPdfs) {
  guiInfoLog(guiInfoStatusTypes.SKIPPED, {
  numScanned: urlsCrawled.scanned.length,
@@ -648,7 +675,7 @@ const crawlDomain = async ({
  urlsCrawled.userExcluded.push({
  url: request.url,
  pageTitle: request.url,
- actualUrl: actualUrl, // because about:blank is not useful
+ actualUrl, // because about:blank is not useful
  metadata: STATUS_CODE_METADATA[1],
  httpStatusCode: 0,
  });
@@ -656,15 +683,19 @@ const crawlDomain = async ({
  return;
  }

- if (!isFollowStrategy(url, actualUrl, strategy) && blacklistedPatterns && isSkippedUrl(actualUrl, blacklistedPatterns)) {
+ if (
+ !isFollowStrategy(url, actualUrl, strategy) &&
+ blacklistedPatterns &&
+ isSkippedUrl(actualUrl, blacklistedPatterns)
+ ) {
  urlsCrawled.userExcluded.push({
  url: request.url,
  pageTitle: request.url,
- actualUrl: actualUrl,
+ actualUrl,
  metadata: STATUS_CODE_METADATA[0],
  httpStatusCode: 0,
  });
-
+
  guiInfoLog(guiInfoStatusTypes.SKIPPED, {
  numScanned: urlsCrawled.scanned.length,
  urlScanned: request.url,
@@ -679,11 +710,7 @@ const crawlDomain = async ({
  const isRedirected = !areLinksEqual(actualUrl, request.url);

  // check if redirected link is following strategy (same-domain/same-hostname)
- const isLoadedUrlFollowStrategy = isFollowStrategy(
- actualUrl,
- request.url,
- strategy,
- );
+ const isLoadedUrlFollowStrategy = isFollowStrategy(actualUrl, request.url, strategy);
  if (isRedirected && !isLoadedUrlFollowStrategy) {
  urlsCrawled.notScannedRedirects.push({
  fromUrl: request.url,
@@ -693,7 +720,7 @@ const crawlDomain = async ({
  }

  const responseStatus = response?.status();
- if (responseStatus && responseStatus >= 300) {
+ if (responseStatus && responseStatus >= 300) {
  guiInfoLog(guiInfoStatusTypes.SKIPPED, {
  numScanned: urlsCrawled.scanned.length,
  urlScanned: request.url,
@@ -706,7 +733,7 @@ const crawlDomain = async ({
  httpStatusCode: responseStatus,
  });
  return;
- }
+ }

  const results = await runAxeScript({ includeScreenshots, page, randomToken, ruleset });

@@ -733,7 +760,7 @@ const crawlDomain = async ({
  urlsCrawled.scanned.push({
  url: urlWithoutAuth(request.url),
  pageTitle: results.pageTitle,
- actualUrl: actualUrl, // i.e. actualUrl
+ actualUrl, // i.e. actualUrl
  });

  urlsCrawled.scannedRedirects.push({
@@ -768,11 +795,10 @@ const crawlDomain = async ({
  urlsCrawled.userExcluded.push({
  url: request.url,
  pageTitle: request.url,
- actualUrl: actualUrl, // because about:blank is not useful
+ actualUrl, // because about:blank is not useful
  metadata: STATUS_CODE_METADATA[1],
  httpStatusCode: 0,
  });
-
  }

  if (followRobots) await getUrlsFromRobotsTxt(request.url, browser);
@@ -780,7 +806,7 @@ const crawlDomain = async ({
  } catch (e) {
  try {
  if (!e.message.includes('page.evaluate')) {
- silentLogger.info(e);
+ // do nothing;
  guiInfoLog(guiInfoStatusTypes.ERROR, {
  numScanned: urlsCrawled.scanned.length,
  urlScanned: request.url,
@@ -815,11 +841,11 @@ const crawlDomain = async ({
  urlScanned: request.url,
  });

- urlsCrawled.error.push({
- url: request.url,
- pageTitle: request.url,
- actualUrl: request.url,
- metadata: STATUS_CODE_METADATA[2]
+ urlsCrawled.error.push({
+ url: request.url,
+ pageTitle: request.url,
+ actualUrl: request.url,
+ metadata: STATUS_CODE_METADATA[2],
  });
  }
  }
@@ -831,9 +857,10 @@ const crawlDomain = async ({
  });

  const status = response?.status();
- const metadata = typeof status === 'number'
- ? (STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599])
- : STATUS_CODE_METADATA[2];
+ const metadata =
+ typeof status === 'number'
+ ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
+ : STATUS_CODE_METADATA[2];

  urlsCrawled.error.push({
  url: request.url,
@@ -842,10 +869,18 @@ const crawlDomain = async ({
  metadata,
  httpStatusCode: typeof status === 'number' ? status : 0,
  });
-
  },
  maxRequestsPerCrawl: Infinity,
  maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
+ ...(process.env.OOBEE_FAST_CRAWLER && {
+ autoscaledPoolOptions: {
+ minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
+ maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
+ desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
+ scaleUpStepRatio: 0.99, // Scale up faster
+ scaleDownStepRatio: 0.1, // Scale down slower
+ },
+ }),
  });

  await crawler.run();
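The OOBEE_FAST_CRAWLER block above uses a conditional object spread: the autoscaledPoolOptions entry is merged into the PlaywrightCrawler options only when the environment variable is set to a non-empty value. A minimal sketch of the same pattern, with illustrative values rather than the package's own configuration:

    // Spreading `false` adds nothing, so the options object only gains
    // autoscaledPoolOptions when the env flag is truthy.
    const fast = Boolean(process.env.OOBEE_FAST_CRAWLER);
    const crawlerOptions = {
      maxConcurrency: 8, // illustrative default
      ...(fast && {
        autoscaledPoolOptions: { minConcurrency: 10, desiredConcurrencyRatio: 0.98 },
      }),
    };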
@@ -875,6 +910,10 @@ const crawlDomain = async ({
  guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
  }

+ if (scanDuration > 0) {
+ const elapsed = Math.round((Date.now() - crawlStartTime) / 1000);
+ console.log(`Crawl ended after ${elapsed}s. Limit: ${scanDuration}s.`);
+ }
  return urlsCrawled;
  };