@govtechsg/oobee 0.10.51 → 0.10.58

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,15 +2,14 @@ import crawlee, { EnqueueStrategy } from 'crawlee';
  import fs from 'fs';
  import type { BrowserContext, ElementHandle, Frame, Page } from 'playwright';
  import type { EnqueueLinksOptions, RequestOptions } from 'crawlee';
- import axios from 'axios';
- import { fileTypeFromBuffer } from 'file-type';
- import mime from 'mime-types';
  import https from 'https';
  import type { BatchAddRequestsResult } from '@crawlee/types';
  import {
  createCrawleeSubFolders,
  runAxeScript,
  isUrlPdf,
+ shouldSkipClickDueToDisallowedHref,
+ shouldSkipDueToUnsupportedContent,
  } from './commonCrawlerFunc.js';
  import constants, {
  UrlsCrawled,
@@ -19,6 +18,8 @@ import constants, {
  cssQuerySelectors,
  RuleFlags,
  STATUS_CODE_METADATA,
+ disallowedListOfPatterns,
+ disallowedSelectorPatterns,
  } from '../constants/constants.js';
  import {
  getPlaywrightLaunchOptions,
@@ -37,7 +38,7 @@ import {
  mapPdfScanResults,
  doPdfScreenshots,
  } from './pdfScanFunc.js';
- import { silentLogger, guiInfoLog } from '../logs.js';
+ import { consoleLogger, guiInfoLog, silentLogger } from '../logs.js';
  import { ViewportSettingsClass } from '../combine.js';

  const isBlacklisted = (url: string, blacklistedPatterns: string[]) => {
@@ -71,6 +72,7 @@ const crawlDomain = async ({
  includeScreenshots,
  followRobots,
  extraHTTPHeaders,
+ scanDuration = 0,
  safeMode = false,
  fromCrawlIntelligentSitemap = false,
  datasetFromIntelligent = null,
@@ -91,12 +93,14 @@ const crawlDomain = async ({
  includeScreenshots: boolean;
  followRobots: boolean;
  extraHTTPHeaders: Record<string, string>;
+ scanDuration?: number;
  safeMode?: boolean;
  fromCrawlIntelligentSitemap?: boolean;
  datasetFromIntelligent?: crawlee.Dataset;
  urlsCrawledFromIntelligent?: UrlsCrawled;
  ruleset?: RuleFlags[];
  }) => {
+ const crawlStartTime = Date.now();
  let dataset: crawlee.Dataset;
  let urlsCrawled: UrlsCrawled;
  let requestQueue: crawlee.RequestQueue;
@@ -162,95 +166,6 @@ const crawlDomain = async ({
  });
  }

- const httpHeadCache = new Map<string, boolean>();
- const isProcessibleUrl = async (url: string): Promise<boolean> => {
- if (httpHeadCache.has(url)) {
- silentLogger.info(`Skipping request as URL has been processed before ${url}}`);
- return false; // return false to avoid processing the same url again
- }
-
- try {
- // Send a HEAD request to check headers without downloading the file
- const headResponse = await axios.head(url, {
- headers: { Authorization: authHeader },
- httpsAgent,
- });
- const contentType = headResponse.headers['content-type'] || '';
- const contentDisposition = headResponse.headers['content-disposition'] || '';
-
- // Check if the response suggests it's a downloadable file based on Content-Disposition header
- if (contentDisposition.includes('attachment')) {
- silentLogger.info(`Skipping URL due to attachment header: ${url}`);
- httpHeadCache.set(url, false);
- return false;
- }
-
- // Check if the MIME type suggests it's a downloadable file
- if (contentType.startsWith('application/') || contentType.includes('octet-stream')) {
- silentLogger.info(`Skipping potential downloadable file: ${contentType} at URL ${url}`);
- httpHeadCache.set(url, false);
- return false;
- }
-
- // Use the mime-types library to ensure it's processible content (e.g., HTML or plain text)
- const mimeType = mime.lookup(contentType);
- if (mimeType && !mimeType.startsWith('text/html') && !mimeType.startsWith('text/')) {
- silentLogger.info(`Detected non-processible MIME type: ${mimeType} at URL ${url}`);
- httpHeadCache.set(url, false);
- return false;
- }
-
- // Additional check for zip files by their magic number (PK\x03\x04)
- if (url.endsWith('.zip')) {
- silentLogger.info(`Checking for zip file magic number at URL ${url}`);
-
- // Download the first few bytes of the file to check for the magic number
- const byteResponse = await axios.get(url, {
- headers: { Range: 'bytes=0-3', Authorization: authHeader },
- responseType: 'arraybuffer',
- httpsAgent,
- });
-
- const magicNumber = byteResponse.data.toString('hex');
- if (magicNumber === '504b0304') {
- silentLogger.info(`Skipping zip file at URL ${url}`);
- httpHeadCache.set(url, false);
- return false;
- }
- silentLogger.info(
- `Not skipping ${url}, magic number does not match ZIP file: ${magicNumber}`,
- );
- }
-
- // If you want more robust checks, you can download a portion of the content and use the file-type package to detect file types by content
- const response = await axios.get(url, {
- headers: { Range: 'bytes=0-4100', Authorization: authHeader },
- responseType: 'arraybuffer',
- httpsAgent,
- });
-
- const fileType = await fileTypeFromBuffer(response.data);
- if (
- fileType &&
- !fileType.mime.startsWith('text/html') &&
- !fileType.mime.startsWith('text/')
- ) {
- silentLogger.info(`Detected downloadable file of type ${fileType.mime} at URL ${url}`);
- httpHeadCache.set(url, false);
- return false;
- }
- } catch (e) {
- // silentLogger.error(`Error checking the MIME type of ${url}: ${e.message}`);
- // If an error occurs (e.g., a network issue), assume the URL is processible
- httpHeadCache.set(url, true);
- return true;
- }
-
- // If none of the conditions to skip are met, allow processing of the URL
- httpHeadCache.set(url, true);
- return true;
- };
-
  const enqueueProcess = async (
  page: Page,
  enqueueLinks: (options: EnqueueLinksOptions) => Promise<BatchAddRequestsResult>,
@@ -259,14 +174,14 @@ const crawlDomain = async ({
  try {
  await enqueueLinks({
  // set selector matches anchor elements with href but not contains # or starting with mailto:
- selector: 'a:not(a[href*="#"],a[href^="mailto:"])',
+ selector: `a:not(${disallowedSelectorPatterns})`,
  strategy,
  requestQueue,
  transformRequestFunction: (req: RequestOptions): RequestOptions | null => {
  try {
  req.url = req.url.replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
  } catch (e) {
- silentLogger.error(e);
+ consoleLogger.error(e);
  }
  if (urlsCrawled.scanned.some(item => item.url === req.url)) {
  req.skipNavigation = true;
@@ -288,7 +203,7 @@
  try {
  await customEnqueueLinksByClickingElements(page, browserContext);
  } catch (e) {
- silentLogger.info(e);
+ // do nothing;
  }
  }
  } catch {
@@ -307,7 +222,10 @@
  const isAlreadyScanned: boolean = urlsCrawled.scanned.some(item => item.url === newPageUrl);
  const isBlacklistedUrl: boolean = isBlacklisted(newPageUrl, blacklistedPatterns);
  const isNotFollowStrategy: boolean = !isFollowStrategy(newPageUrl, initialPageUrl, strategy);
- return isAlreadyScanned || isBlacklistedUrl || isNotFollowStrategy;
+ const isNotSupportedDocument: boolean = disallowedListOfPatterns.some(pattern =>
+ newPageUrl.toLowerCase().startsWith(pattern),
+ );
+ return isNotSupportedDocument || isAlreadyScanned || isBlacklistedUrl || isNotFollowStrategy;
  };
  const setPageListeners = (page: Page): void => {
  // event listener to handle new page popups upon button click
@@ -431,6 +349,16 @@
  });
  } else if (!newUrlFoundInElement) {
  try {
+ const shouldSkip = await shouldSkipClickDueToDisallowedHref(page, element);
+ if (shouldSkip) {
+ const elementHtml = await page.evaluate(el => el.outerHTML, element);
+ consoleLogger.info(
+ 'Skipping a click due to disallowed href nearby. Element HTML:',
+ elementHtml,
+ );
+ continue;
+ }
+
  // Find url in html elements by manually clicking them. New page navigation/popups will be handled by event listeners above
  await element.click({ force: true });
  await page.waitForTimeout(1000); // Add a delay of 1 second between each Element click
@@ -455,7 +383,7 @@
  }

  await initModifiedUserAgent(browser, playwrightDeviceDetailsObject);
-
+
  const crawler = new crawlee.PlaywrightCrawler({
  launchContext: {
  launcher: constants.launcher,
@@ -486,36 +414,35 @@
  return new Promise(resolve => {
  let timeout;
  let mutationCount = 0;
- const MAX_MUTATIONS = 250; // stop if things never quiet down
- const OBSERVER_TIMEOUT = 5000; // hard cap on total wait
-
+ const MAX_MUTATIONS = 250; // stop if things never quiet down
+ const OBSERVER_TIMEOUT = 5000; // hard cap on total wait
+
  const observer = new MutationObserver(() => {
  clearTimeout(timeout);
-
+
  mutationCount++;
  if (mutationCount > MAX_MUTATIONS) {
  observer.disconnect();
  resolve('Too many mutations, exiting.');
  return;
  }
-
+
  // restart quiet‑period timer
  timeout = setTimeout(() => {
  observer.disconnect();
  resolve('DOM stabilized.');
  }, 1000);
  });
-
+
  // overall timeout in case the page never settles
  timeout = setTimeout(() => {
  observer.disconnect();
  resolve('Observer timeout reached.');
  }, OBSERVER_TIMEOUT);
-
+
  const root = document.documentElement || document.body || document;
  if (!root || typeof observer.observe !== 'function') {
  resolve('No root node to observe.');
- return;
  }
  });
  });
@@ -537,33 +464,18 @@
  }
  },
  ],
- preNavigationHooks: isBasicAuth
- ? [
- async ({ page, request }) => {
- await page.setExtraHTTPHeaders({
- Authorization: authHeader,
- ...extraHTTPHeaders,
- });
- const processible = await isProcessibleUrl(request.url);
- if (!processible) {
- request.skipNavigation = true;
- return null;
- }
- },
- ]
- : [
- async ({ page, request }) => {
- await page.setExtraHTTPHeaders({
- ...extraHTTPHeaders,
- });
-
- const processible = await isProcessibleUrl(request.url);
- if (!processible) {
- request.skipNavigation = true;
- return null;
- }
- },
- ],
+ preNavigationHooks: [ async({ page, request}) => {
+ if (isBasicAuth) {
+ await page.setExtraHTTPHeaders({
+ Authorization: authHeader,
+ ...extraHTTPHeaders,
+ });
+ } else {
+ await page.setExtraHTTPHeaders({
+ ...extraHTTPHeaders,
+ });
+ }
+ }],
  requestHandlerTimeoutSecs: 90, // Allow each page to be processed by up from default 60 seconds
  requestHandler: async ({ page, request, response, crawler, sendRequest, enqueueLinks }) => {
  const browserContext: BrowserContext = page.context();
@@ -586,7 +498,10 @@
  actualUrl = page.url();
  }

- if (!isFollowStrategy(url, actualUrl, strategy) && (isBlacklisted(actualUrl, blacklistedPatterns) || (isUrlPdf(actualUrl) && !isScanPdfs))) {
+ if (
+ !isFollowStrategy(url, actualUrl, strategy) &&
+ (isBlacklisted(actualUrl, blacklistedPatterns) || (isUrlPdf(actualUrl) && !isScanPdfs))
+ ) {
  guiInfoLog(guiInfoStatusTypes.SKIPPED, {
  numScanned: urlsCrawled.scanned.length,
  urlScanned: actualUrl,
@@ -594,7 +509,13 @@
  return;
  }

- if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) {
+ const hasExceededDuration =
+ scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
+
+ if (urlsCrawled.scanned.length >= maxRequestsPerCrawl || hasExceededDuration) {
+ if (hasExceededDuration) {
+ console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting website crawl.`);
+ }
  isAbortingScanNow = true;
  crawler.autoscaledPool.abort();
  return;
@@ -612,7 +533,7 @@
  }

  // handle pdfs
- if (request.skipNavigation && actualUrl === "about:blank") {
+ if (shouldSkipDueToUnsupportedContent(response, request.url) || (request.skipNavigation && actualUrl === 'about:blank')) {
  if (!isScanPdfs) {
  guiInfoLog(guiInfoStatusTypes.SKIPPED, {
  numScanned: urlsCrawled.scanned.length,
@@ -648,7 +569,7 @@
  urlsCrawled.userExcluded.push({
  url: request.url,
  pageTitle: request.url,
- actualUrl: actualUrl, // because about:blank is not useful
+ actualUrl, // because about:blank is not useful
  metadata: STATUS_CODE_METADATA[1],
  httpStatusCode: 0,
  });
@@ -656,15 +577,19 @@
  return;
  }

- if (!isFollowStrategy(url, actualUrl, strategy) && blacklistedPatterns && isSkippedUrl(actualUrl, blacklistedPatterns)) {
+ if (
+ !isFollowStrategy(url, actualUrl, strategy) &&
+ blacklistedPatterns &&
+ isSkippedUrl(actualUrl, blacklistedPatterns)
+ ) {
  urlsCrawled.userExcluded.push({
  url: request.url,
  pageTitle: request.url,
- actualUrl: actualUrl,
+ actualUrl,
  metadata: STATUS_CODE_METADATA[0],
  httpStatusCode: 0,
  });
-
+
  guiInfoLog(guiInfoStatusTypes.SKIPPED, {
  numScanned: urlsCrawled.scanned.length,
  urlScanned: request.url,
@@ -679,11 +604,7 @@
  const isRedirected = !areLinksEqual(actualUrl, request.url);

  // check if redirected link is following strategy (same-domain/same-hostname)
- const isLoadedUrlFollowStrategy = isFollowStrategy(
- actualUrl,
- request.url,
- strategy,
- );
+ const isLoadedUrlFollowStrategy = isFollowStrategy(actualUrl, request.url, strategy);
  if (isRedirected && !isLoadedUrlFollowStrategy) {
  urlsCrawled.notScannedRedirects.push({
  fromUrl: request.url,
@@ -693,7 +614,7 @@
  }

  const responseStatus = response?.status();
- if (responseStatus && responseStatus >= 300) {
+ if (responseStatus && responseStatus >= 300) {
  guiInfoLog(guiInfoStatusTypes.SKIPPED, {
  numScanned: urlsCrawled.scanned.length,
  urlScanned: request.url,
@@ -706,7 +627,7 @@
  httpStatusCode: responseStatus,
  });
  return;
- }
+ }

  const results = await runAxeScript({ includeScreenshots, page, randomToken, ruleset });

@@ -733,7 +654,7 @@
  urlsCrawled.scanned.push({
  url: urlWithoutAuth(request.url),
  pageTitle: results.pageTitle,
- actualUrl: actualUrl, // i.e. actualUrl
+ actualUrl, // i.e. actualUrl
  });

  urlsCrawled.scannedRedirects.push({
@@ -768,11 +689,10 @@
  urlsCrawled.userExcluded.push({
  url: request.url,
  pageTitle: request.url,
- actualUrl: actualUrl, // because about:blank is not useful
+ actualUrl, // because about:blank is not useful
  metadata: STATUS_CODE_METADATA[1],
  httpStatusCode: 0,
  });
-
  }

  if (followRobots) await getUrlsFromRobotsTxt(request.url, browser);
@@ -780,7 +700,7 @@
  } catch (e) {
  try {
  if (!e.message.includes('page.evaluate')) {
- silentLogger.info(e);
+ // do nothing;
  guiInfoLog(guiInfoStatusTypes.ERROR, {
  numScanned: urlsCrawled.scanned.length,
  urlScanned: request.url,
@@ -815,11 +735,11 @@
  urlScanned: request.url,
  });

- urlsCrawled.error.push({
- url: request.url,
- pageTitle: request.url,
- actualUrl: request.url,
- metadata: STATUS_CODE_METADATA[2]
+ urlsCrawled.error.push({
+ url: request.url,
+ pageTitle: request.url,
+ actualUrl: request.url,
+ metadata: STATUS_CODE_METADATA[2],
  });
  }
  }
@@ -831,9 +751,10 @@
  });

  const status = response?.status();
- const metadata = typeof status === 'number'
- ? (STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599])
- : STATUS_CODE_METADATA[2];
+ const metadata =
+ typeof status === 'number'
+ ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
+ : STATUS_CODE_METADATA[2];

  urlsCrawled.error.push({
  url: request.url,
@@ -842,10 +763,18 @@
  metadata,
  httpStatusCode: typeof status === 'number' ? status : 0,
  });
-
  },
  maxRequestsPerCrawl: Infinity,
  maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
+ ...(process.env.OOBEE_FAST_CRAWLER && {
+ autoscaledPoolOptions: {
+ minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
+ maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
+ desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
+ scaleUpStepRatio: 0.99, // Scale up faster
+ scaleDownStepRatio: 0.1, // Scale down slower
+ },
+ }),
  });

  await crawler.run();
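
Note: the hunk above gates a more aggressive Crawlee autoscaling profile behind the OOBEE_FAST_CRAWLER environment variable. A minimal sketch of that env-gated override pattern in isolation (the option names and values come from the diff; baseOptions is a hypothetical placeholder):

    // Sketch only: any non-empty OOBEE_FAST_CRAWLER value switches on the faster pool settings.
    const fastCrawlerOverrides = process.env.OOBEE_FAST_CRAWLER
      ? {
          autoscaledPoolOptions: {
            minConcurrency: 10,            // start with more browser instances
            desiredConcurrencyRatio: 0.98, // raise the threshold for scaling up
            scaleUpStepRatio: 0.99,        // scale up faster
            scaleDownStepRatio: 0.1,       // scale down slower
          },
        }
      : {};
    // const crawler = new crawlee.PlaywrightCrawler({ ...baseOptions, ...fastCrawlerOverrides });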
@@ -875,6 +804,10 @@
  guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
  }

+ if (scanDuration > 0) {
+ const elapsed = Math.round((Date.now() - crawlStartTime) / 1000);
+ console.log(`Crawl ended after ${elapsed}s. Limit: ${scanDuration}s.`);
+ }
  return urlsCrawled;
  };

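Note: the crawlDomain changes above drop the axios/file-type/mime-types based isProcessibleUrl pre-flight (a HEAD request plus ranged GETs per URL) in favour of shouldSkipDueToUnsupportedContent(response, request.url) from commonCrawlerFunc.js, together with disallowedListOfPatterns and disallowedSelectorPatterns filtering at enqueue time. The helper's body is not part of this diff; what follows is a hypothetical sketch of a response-header-based check in that spirit, not the package's implementation:

    import type { Response } from 'playwright';

    // Hypothetical sketch only; the real helper lives in commonCrawlerFunc.js.
    const looksLikeUnsupportedContent = (response: Response | null): boolean => {
      if (!response) return false;
      const headers = response.headers(); // header names are lower-cased by Playwright
      const contentType = headers['content-type'] ?? '';
      const contentDisposition = headers['content-disposition'] ?? '';
      // Skip forced downloads and non-text payloads, as the removed pre-flight did.
      if (contentDisposition.includes('attachment')) return true;
      if (contentType.includes('octet-stream')) return true;
      if (contentType.startsWith('application/') && !contentType.includes('xhtml')) return true;
      return false;
    };
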
@@ -2,7 +2,7 @@ import fs from 'fs';
  import { chromium, Page } from 'playwright';
  import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
  import constants, { guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
- import { silentLogger, guiInfoLog } from '../logs.js';
+ import { consoleLogger, guiInfoLog } from '../logs.js';
  import crawlDomain from './crawlDomain.js';
  import crawlSitemap from './crawlSitemap.js';
  import { EnqueueStrategy } from 'crawlee';
@@ -24,46 +24,42 @@ const crawlIntelligentSitemap = async (
  followRobots: boolean,
  extraHTTPHeaders: Record<string, string>,
  safeMode: boolean,
+ scanDuration: number
  ) => {
+ const startTime = Date.now(); // Track start time
+
  let urlsCrawledFinal;
- let urlsCrawled;
+ let urlsCrawled = { ...constants.urlsCrawledObj };
  let dataset;
  let sitemapExist = false;
  const fromCrawlIntelligentSitemap = true;
  let sitemapUrl;

- urlsCrawled = { ...constants.urlsCrawledObj };
  ({ dataset } = await createCrawleeSubFolders(randomToken));
-
  if (!fs.existsSync(randomToken)) {
  fs.mkdirSync(randomToken);
  }

  function getHomeUrl(parsedUrl: string) {
  const urlObject = new URL(parsedUrl);
- if (urlObject.username !== '' && urlObject.password !== '') {
+ if (urlObject.username && urlObject.password) {
  return `${urlObject.protocol}//${urlObject.username}:${urlObject.password}@${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
  }
-
  return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
  }

  async function findSitemap(link: string) {
  const homeUrl = getHomeUrl(link);
- let sitemapLinkFound = false;
  let sitemapLink = '';
- const chromiumBrowser = await chromium.launch(
- {
- headless: false,
- channel: 'chrome',
- args: ['--headless=new', '--no-sandbox']
- });
-
+ const chromiumBrowser = await chromium.launch({
+ headless: false,
+ channel: 'chrome',
+ args: ['--headless=new', '--no-sandbox'],
+ });
  const page = await chromiumBrowser.newPage();
  for (const path of sitemapPaths) {
  sitemapLink = homeUrl + path;
- sitemapLinkFound = await checkUrlExists(page, sitemapLink);
- if (sitemapLinkFound) {
+ if (await checkUrlExists(page, sitemapLink)) {
  sitemapExist = true;
  break;
  }
@@ -75,12 +71,9 @@ const crawlIntelligentSitemap = async (
  const checkUrlExists = async (page: Page, parsedUrl: string) => {
  try {
  const response = await page.goto(parsedUrl);
- if (response.ok()) {
- return true;
- }
- return false;
+ return response.ok();
  } catch (e) {
- silentLogger.error(e);
+ consoleLogger.error(e);
  return false;
  }
  };
@@ -88,13 +81,12 @@
  try {
  sitemapUrl = await findSitemap(url);
  } catch (error) {
- silentLogger.error(error);
+ consoleLogger.error(error);
  }

  if (!sitemapExist) {
  console.log('Unable to find sitemap. Commencing website crawl instead.');
- // run crawlDomain as per normal
- urlsCrawledFinal = await crawlDomain({
+ return await crawlDomain({
  url,
  randomToken,
  host,
@@ -109,12 +101,13 @@
  includeScreenshots,
  followRobots,
  extraHTTPHeaders,
+ safeMode,
+ scanDuration, // Use full duration since no sitemap
  });
- return urlsCrawledFinal;
  }
+
  console.log(`Sitemap found at ${sitemapUrl}`);
- // run crawlSitemap then crawDomain subsequently if urlsCrawled.scanned.length < maxRequestsPerCrawl
- urlsCrawledFinal = await crawlSitemap(
+ urlsCrawledFinal = await crawlSitemap({
  sitemapUrl,
  randomToken,
  host,
@@ -128,14 +121,21 @@
  includeScreenshots,
  extraHTTPHeaders,
  fromCrawlIntelligentSitemap,
- url,
- dataset, // for crawlSitemap to add on to
- urlsCrawled, // for crawlSitemap to add on to
- false,
- );
+ userUrlInputFromIntelligent: url,
+ datasetFromIntelligent: dataset,
+ urlsCrawledFromIntelligent: urlsCrawled,
+ crawledFromLocalFile: false,
+ scanDuration,
+ });
+
+ const elapsed = Date.now() - startTime;
+ const remainingScanDuration = Math.max(scanDuration - elapsed / 1000, 0); // in seconds

- if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
- // run crawl domain starting from root website, only on pages not scanned before
+ if (
+ urlsCrawledFinal.scanned.length < maxRequestsPerCrawl &&
+ remainingScanDuration > 0
+ ) {
+ console.log(`Continuing crawl from root website. Remaining scan time: ${remainingScanDuration.toFixed(1)}s`);
  urlsCrawledFinal = await crawlDomain({
  url,
  randomToken,
@@ -153,12 +153,16 @@
  extraHTTPHeaders,
  safeMode,
  fromCrawlIntelligentSitemap,
- datasetFromIntelligent: dataset, // for crawlDomain to add on to
- urlsCrawledFromIntelligent: urlsCrawledFinal, // urls for crawlDomain to exclude
+ datasetFromIntelligent: dataset,
+ urlsCrawledFromIntelligent: urlsCrawledFinal,
+ scanDuration: remainingScanDuration,
  });
+ } else if (remainingScanDuration <= 0) {
+ console.log(`Crawl duration exceeded before more pages could be found (limit: ${scanDuration}s).`);
  }

  guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
  return urlsCrawledFinal;
  };
+
  export default crawlIntelligentSitemap;
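
Note: the recurring theme across both files is the new scanDuration budget (in seconds, 0 meaning no limit) threaded through crawlDomain, crawlSitemap, and crawlIntelligentSitemap. A minimal sketch of the pattern as it appears in the diff, with a hypothetical five-minute budget:

    // scanDuration is in seconds; 0 (the default) disables the limit.
    const scanDuration = 300; // hypothetical budget
    const crawlStartTime = Date.now();

    const hasExceededDuration = (): boolean =>
      scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;

    // crawlIntelligentSitemap hands whatever is left of the budget to the follow-up crawlDomain run:
    const remainingScanDuration = Math.max(scanDuration - (Date.now() - crawlStartTime) / 1000, 0);

When the budget is exceeded mid-crawl, the request handler sets isAbortingScanNow and calls crawler.autoscaledPool.abort(), mirroring the existing maxRequestsPerCrawl cut-off.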