@govtechsg/oobee 0.10.51 → 0.10.58
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/bump-package-version.yml +58 -0
- package/.github/workflows/image.yml +38 -17
- package/DETAILS.md +5 -2
- package/INTEGRATION.md +57 -53
- package/README.md +4 -1
- package/__tests__/test-sitemap-url-patterns.xml +105 -0
- package/exclusions.txt +1 -0
- package/package.json +7 -6
- package/src/cli.ts +35 -2
- package/src/combine.ts +10 -7
- package/src/constants/cliFunctions.ts +9 -0
- package/src/constants/common.ts +95 -105
- package/src/constants/constants.ts +47 -2
- package/src/crawlers/commonCrawlerFunc.ts +84 -5
- package/src/crawlers/crawlDomain.ts +93 -160
- package/src/crawlers/crawlIntelligentSitemap.ts +40 -36
- package/src/crawlers/crawlLocalFile.ts +77 -35
- package/src/crawlers/crawlSitemap.ts +156 -89
- package/src/crawlers/pdfScanFunc.ts +2 -0
- package/src/index.ts +2 -0
- package/src/logs.ts +4 -2
- package/src/mergeAxeResults.ts +20 -9
- package/src/npmIndex.ts +1 -1
- package/src/screenshotFunc/htmlScreenshotFunc.ts +7 -5
- package/src/screenshotFunc/pdfScreenshotFunc.ts +2 -2
- package/src/static/ejs/partials/components/wcagCompliance.ejs +1 -1
- package/src/static/ejs/partials/scripts/ruleOffcanvas.ejs +1 -0
- package/src/static/ejs/partials/styles/styles.ejs +11 -0
- package/src/static/ejs/report.ejs +14 -1
- package/src/utils.ts +3 -3
package/src/crawlers/crawlDomain.ts

@@ -2,15 +2,14 @@ import crawlee, { EnqueueStrategy } from 'crawlee';
 import fs from 'fs';
 import type { BrowserContext, ElementHandle, Frame, Page } from 'playwright';
 import type { EnqueueLinksOptions, RequestOptions } from 'crawlee';
-import axios from 'axios';
-import { fileTypeFromBuffer } from 'file-type';
-import mime from 'mime-types';
 import https from 'https';
 import type { BatchAddRequestsResult } from '@crawlee/types';
 import {
   createCrawleeSubFolders,
   runAxeScript,
   isUrlPdf,
+  shouldSkipClickDueToDisallowedHref,
+  shouldSkipDueToUnsupportedContent,
 } from './commonCrawlerFunc.js';
 import constants, {
   UrlsCrawled,
@@ -19,6 +18,8 @@ import constants, {
   cssQuerySelectors,
   RuleFlags,
   STATUS_CODE_METADATA,
+  disallowedListOfPatterns,
+  disallowedSelectorPatterns,
 } from '../constants/constants.js';
 import {
   getPlaywrightLaunchOptions,
@@ -37,7 +38,7 @@ import {
   mapPdfScanResults,
   doPdfScreenshots,
 } from './pdfScanFunc.js';
-import {
+import { consoleLogger, guiInfoLog, silentLogger } from '../logs.js';
 import { ViewportSettingsClass } from '../combine.js';

 const isBlacklisted = (url: string, blacklistedPatterns: string[]) => {
@@ -71,6 +72,7 @@ const crawlDomain = async ({
   includeScreenshots,
   followRobots,
   extraHTTPHeaders,
+  scanDuration = 0,
   safeMode = false,
   fromCrawlIntelligentSitemap = false,
   datasetFromIntelligent = null,
@@ -91,12 +93,14 @@ const crawlDomain = async ({
   includeScreenshots: boolean;
   followRobots: boolean;
   extraHTTPHeaders: Record<string, string>;
+  scanDuration?: number;
   safeMode?: boolean;
   fromCrawlIntelligentSitemap?: boolean;
   datasetFromIntelligent?: crawlee.Dataset;
   urlsCrawledFromIntelligent?: UrlsCrawled;
   ruleset?: RuleFlags[];
 }) => {
+  const crawlStartTime = Date.now();
   let dataset: crawlee.Dataset;
   let urlsCrawled: UrlsCrawled;
   let requestQueue: crawlee.RequestQueue;
@@ -162,95 +166,6 @@ const crawlDomain = async ({
     });
   }

-  const httpHeadCache = new Map<string, boolean>();
-  const isProcessibleUrl = async (url: string): Promise<boolean> => {
-    if (httpHeadCache.has(url)) {
-      silentLogger.info(`Skipping request as URL has been processed before ${url}}`);
-      return false; // return false to avoid processing the same url again
-    }
-
-    try {
-      // Send a HEAD request to check headers without downloading the file
-      const headResponse = await axios.head(url, {
-        headers: { Authorization: authHeader },
-        httpsAgent,
-      });
-      const contentType = headResponse.headers['content-type'] || '';
-      const contentDisposition = headResponse.headers['content-disposition'] || '';
-
-      // Check if the response suggests it's a downloadable file based on Content-Disposition header
-      if (contentDisposition.includes('attachment')) {
-        silentLogger.info(`Skipping URL due to attachment header: ${url}`);
-        httpHeadCache.set(url, false);
-        return false;
-      }
-
-      // Check if the MIME type suggests it's a downloadable file
-      if (contentType.startsWith('application/') || contentType.includes('octet-stream')) {
-        silentLogger.info(`Skipping potential downloadable file: ${contentType} at URL ${url}`);
-        httpHeadCache.set(url, false);
-        return false;
-      }
-
-      // Use the mime-types library to ensure it's processible content (e.g., HTML or plain text)
-      const mimeType = mime.lookup(contentType);
-      if (mimeType && !mimeType.startsWith('text/html') && !mimeType.startsWith('text/')) {
-        silentLogger.info(`Detected non-processible MIME type: ${mimeType} at URL ${url}`);
-        httpHeadCache.set(url, false);
-        return false;
-      }
-
-      // Additional check for zip files by their magic number (PK\x03\x04)
-      if (url.endsWith('.zip')) {
-        silentLogger.info(`Checking for zip file magic number at URL ${url}`);
-
-        // Download the first few bytes of the file to check for the magic number
-        const byteResponse = await axios.get(url, {
-          headers: { Range: 'bytes=0-3', Authorization: authHeader },
-          responseType: 'arraybuffer',
-          httpsAgent,
-        });
-
-        const magicNumber = byteResponse.data.toString('hex');
-        if (magicNumber === '504b0304') {
-          silentLogger.info(`Skipping zip file at URL ${url}`);
-          httpHeadCache.set(url, false);
-          return false;
-        }
-        silentLogger.info(
-          `Not skipping ${url}, magic number does not match ZIP file: ${magicNumber}`,
-        );
-      }
-
-      // If you want more robust checks, you can download a portion of the content and use the file-type package to detect file types by content
-      const response = await axios.get(url, {
-        headers: { Range: 'bytes=0-4100', Authorization: authHeader },
-        responseType: 'arraybuffer',
-        httpsAgent,
-      });
-
-      const fileType = await fileTypeFromBuffer(response.data);
-      if (
-        fileType &&
-        !fileType.mime.startsWith('text/html') &&
-        !fileType.mime.startsWith('text/')
-      ) {
-        silentLogger.info(`Detected downloadable file of type ${fileType.mime} at URL ${url}`);
-        httpHeadCache.set(url, false);
-        return false;
-      }
-    } catch (e) {
-      // silentLogger.error(`Error checking the MIME type of ${url}: ${e.message}`);
-      // If an error occurs (e.g., a network issue), assume the URL is processible
-      httpHeadCache.set(url, true);
-      return true;
-    }
-
-    // If none of the conditions to skip are met, allow processing of the URL
-    httpHeadCache.set(url, true);
-    return true;
-  };
-
   const enqueueProcess = async (
     page: Page,
     enqueueLinks: (options: EnqueueLinksOptions) => Promise<BatchAddRequestsResult>,
@@ -259,14 +174,14 @@ const crawlDomain = async ({
     try {
       await enqueueLinks({
         // set selector matches anchor elements with href but not contains # or starting with mailto:
-        selector:
+        selector: `a:not(${disallowedSelectorPatterns})`,
         strategy,
         requestQueue,
         transformRequestFunction: (req: RequestOptions): RequestOptions | null => {
           try {
             req.url = req.url.replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
           } catch (e) {
-
+            consoleLogger.error(e);
           }
           if (urlsCrawled.scanned.some(item => item.url === req.url)) {
             req.skipNavigation = true;
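
The new selector delegates the old inline "no `#`-only or `mailto:` anchors" rule to a shared `disallowedSelectorPatterns` constant from constants.ts. A minimal sketch of how such a combined `:not()` selector behaves; the pattern list below is illustrative only, not the actual constant:

```ts
// Illustrative stand-in: the real disallowedSelectorPatterns lives in
// src/constants/constants.ts and may contain different patterns.
const disallowedSelectorPatterns = [
  '[href*="#"]',       // in-page fragment links
  '[href^="mailto:"]', // email links
  '[href^="tel:"]',    // phone links
].join(',');

// `a:not(list)` keeps only anchors that match none of the disallowed patterns.
const selector = `a:not(${disallowedSelectorPatterns})`;

console.log(selector);
// a:not([href*="#"],[href^="mailto:"],[href^="tel:"])
```
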
@@ -288,7 +203,7 @@ const crawlDomain = async ({
       try {
         await customEnqueueLinksByClickingElements(page, browserContext);
       } catch (e) {
-
+        // do nothing;
       }
     }
   } catch {
@@ -307,7 +222,10 @@ const crawlDomain = async ({
     const isAlreadyScanned: boolean = urlsCrawled.scanned.some(item => item.url === newPageUrl);
     const isBlacklistedUrl: boolean = isBlacklisted(newPageUrl, blacklistedPatterns);
     const isNotFollowStrategy: boolean = !isFollowStrategy(newPageUrl, initialPageUrl, strategy);
-
+    const isNotSupportedDocument: boolean = disallowedListOfPatterns.some(pattern =>
+      newPageUrl.toLowerCase().startsWith(pattern),
+    );
+    return isNotSupportedDocument || isAlreadyScanned || isBlacklistedUrl || isNotFollowStrategy;
   };
   const setPageListeners = (page: Page): void => {
     // event listener to handle new page popups upon button click
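
The reworked exclusion check now also rejects URLs whose prefix marks them as unsupported documents, using a case-insensitive prefix match against `disallowedListOfPatterns`. A self-contained sketch of that check; the pattern list here is a stand-in for the real constant:

```ts
// Illustrative only: the real disallowedListOfPatterns is defined in constants.ts.
const disallowedListOfPatterns = ['mailto:', 'tel:', 'javascript:'];

const isNotSupportedDocument = (url: string): boolean =>
  disallowedListOfPatterns.some(pattern => url.toLowerCase().startsWith(pattern));

console.log(isNotSupportedDocument('MAILTO:someone@example.com')); // true (case-insensitive)
console.log(isNotSupportedDocument('https://example.com/page'));   // false
```
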
@@ -431,6 +349,16 @@ const crawlDomain = async ({
           });
         } else if (!newUrlFoundInElement) {
           try {
+            const shouldSkip = await shouldSkipClickDueToDisallowedHref(page, element);
+            if (shouldSkip) {
+              const elementHtml = await page.evaluate(el => el.outerHTML, element);
+              consoleLogger.info(
+                'Skipping a click due to disallowed href nearby. Element HTML:',
+                elementHtml,
+              );
+              continue;
+            }
+
             // Find url in html elements by manually clicking them. New page navigation/popups will be handled by event listeners above
             await element.click({ force: true });
             await page.waitForTimeout(1000); // Add a delay of 1 second between each Element click
@@ -455,7 +383,7 @@ const crawlDomain = async ({
   }

   await initModifiedUserAgent(browser, playwrightDeviceDetailsObject);
-
+
   const crawler = new crawlee.PlaywrightCrawler({
     launchContext: {
       launcher: constants.launcher,
@@ -486,36 +414,35 @@ const crawlDomain = async ({
           return new Promise(resolve => {
             let timeout;
             let mutationCount = 0;
-            const MAX_MUTATIONS
-            const OBSERVER_TIMEOUT
-
+            const MAX_MUTATIONS = 250; // stop if things never quiet down
+            const OBSERVER_TIMEOUT = 5000; // hard cap on total wait
+
             const observer = new MutationObserver(() => {
               clearTimeout(timeout);
-
+
               mutationCount++;
               if (mutationCount > MAX_MUTATIONS) {
                 observer.disconnect();
                 resolve('Too many mutations, exiting.');
                 return;
               }
-
+
               // restart quiet‑period timer
               timeout = setTimeout(() => {
                 observer.disconnect();
                 resolve('DOM stabilized.');
               }, 1000);
             });
-
+
             // overall timeout in case the page never settles
             timeout = setTimeout(() => {
               observer.disconnect();
               resolve('Observer timeout reached.');
             }, OBSERVER_TIMEOUT);
-
+
             const root = document.documentElement || document.body || document;
             if (!root || typeof observer.observe !== 'function') {
               resolve('No root node to observe.');
-              return;
             }
           });
         });
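
The DOM-stability wait now caps the observed mutations at 250 and the total wait at 5 seconds, with a 1-second quiet period between mutations. A hedged sketch of how the same quiet-period logic can be driven from Playwright via `page.evaluate`; the `observer.observe(...)` options are an assumption here, since that call sits outside the changed lines:

```ts
import { chromium, Page } from 'playwright';

// Sketch: mirrors the thresholds added in this diff (250 mutations, 5 s cap, 1 s quiet period).
const waitForDomStability = (page: Page) =>
  page.evaluate(
    () =>
      new Promise<string>(resolve => {
        let timeout: ReturnType<typeof setTimeout>;
        let mutationCount = 0;
        const MAX_MUTATIONS = 250;
        const OBSERVER_TIMEOUT = 5000;

        const observer = new MutationObserver(() => {
          clearTimeout(timeout);
          mutationCount += 1;
          if (mutationCount > MAX_MUTATIONS) {
            observer.disconnect();
            resolve('Too many mutations, exiting.');
            return;
          }
          // restart the 1 s quiet-period timer on every mutation
          timeout = setTimeout(() => {
            observer.disconnect();
            resolve('DOM stabilized.');
          }, 1000);
        });

        // hard cap so the promise always settles
        timeout = setTimeout(() => {
          observer.disconnect();
          resolve('Observer timeout reached.');
        }, OBSERVER_TIMEOUT);

        const root = document.documentElement || document.body;
        if (!root) {
          resolve('No root node to observe.');
          return;
        }
        // Assumed observe options; the actual call is not part of the changed lines.
        observer.observe(root, { childList: true, subtree: true, attributes: true });
      }),
  );

const browser = await chromium.launch();
const page = await browser.newPage();
await page.goto('https://example.com');
console.log(await waitForDomStability(page));
await browser.close();
```
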
@@ -537,33 +464,18 @@ const crawlDomain = async ({
         }
       },
     ],
-    preNavigationHooks:
-
-
-
-
-
-
-
-
-
-
-
-          },
-        ]
-      : [
-          async ({ page, request }) => {
-            await page.setExtraHTTPHeaders({
-              ...extraHTTPHeaders,
-            });
-
-            const processible = await isProcessibleUrl(request.url);
-            if (!processible) {
-              request.skipNavigation = true;
-              return null;
-            }
-          },
-        ],
+    preNavigationHooks: [ async({ page, request}) => {
+      if (isBasicAuth) {
+        await page.setExtraHTTPHeaders({
+          Authorization: authHeader,
+          ...extraHTTPHeaders,
+        });
+      } else {
+        await page.setExtraHTTPHeaders({
+          ...extraHTTPHeaders,
+        });
+      }
+    }],
     requestHandlerTimeoutSecs: 90, // Allow each page to be processed by up from default 60 seconds
     requestHandler: async ({ page, request, response, crawler, sendRequest, enqueueLinks }) => {
       const browserContext: BrowserContext = page.context();
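
The conditional two-branch hook array (and its per-request `isProcessibleUrl` HEAD check) collapses into a single pre-navigation hook that always applies `extraHTTPHeaders` and adds `Authorization` only for basic-auth scans. A minimal standalone Crawlee sketch of that hook shape, with placeholder header values; the spread-based merge is just a compact equivalent of the if/else in the diff:

```ts
import { PlaywrightCrawler } from 'crawlee';

// Placeholder values; in oobee these come from the scan options.
const isBasicAuth = true;
const authHeader = 'Basic dXNlcjpwYXNz';
const extraHTTPHeaders = { 'X-Scanner': 'oobee' };

const crawler = new PlaywrightCrawler({
  preNavigationHooks: [
    async ({ page }) => {
      // Merge the auth header in only when the target needs basic auth.
      await page.setExtraHTTPHeaders({
        ...(isBasicAuth ? { Authorization: authHeader } : {}),
        ...extraHTTPHeaders,
      });
    },
  ],
  requestHandler: async ({ page, request }) => {
    console.log(`${request.url} -> ${await page.title()}`);
  },
});

await crawler.run(['https://example.com']);
```
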
@@ -586,7 +498,10 @@ const crawlDomain = async ({
         actualUrl = page.url();
       }

-      if (
+      if (
+        !isFollowStrategy(url, actualUrl, strategy) &&
+        (isBlacklisted(actualUrl, blacklistedPatterns) || (isUrlPdf(actualUrl) && !isScanPdfs))
+      ) {
         guiInfoLog(guiInfoStatusTypes.SKIPPED, {
           numScanned: urlsCrawled.scanned.length,
           urlScanned: actualUrl,
@@ -594,7 +509,13 @@ const crawlDomain = async ({
         return;
       }

-
+      const hasExceededDuration =
+        scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
+
+      if (urlsCrawled.scanned.length >= maxRequestsPerCrawl || hasExceededDuration) {
+        if (hasExceededDuration) {
+          console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting website crawl.`);
+        }
         isAbortingScanNow = true;
         crawler.autoscaledPool.abort();
         return;
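
`scanDuration` is enforced inside the request handler by comparing wall-clock elapsed time against the limit; once it trips, the autoscaled pool is aborted alongside the existing `maxRequestsPerCrawl` cap. The arithmetic in isolation:

```ts
// scanDuration is expressed in seconds; 0 means "no limit".
const hasExceededScanDuration = (crawlStartTime: number, scanDuration: number): boolean =>
  scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;

const startedAt = Date.now() - 95_000; // pretend the crawl began 95 s ago
console.log(hasExceededScanDuration(startedAt, 90)); // true  (95 s elapsed > 90 s limit)
console.log(hasExceededScanDuration(startedAt, 0));  // false (no limit configured)
```
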
@@ -612,7 +533,7 @@ const crawlDomain = async ({
       }

       // handle pdfs
-      if (request.skipNavigation && actualUrl ===
+      if (shouldSkipDueToUnsupportedContent(response, request.url) || (request.skipNavigation && actualUrl === 'about:blank')) {
         if (!isScanPdfs) {
           guiInfoLog(guiInfoStatusTypes.SKIPPED, {
             numScanned: urlsCrawled.scanned.length,
@@ -648,7 +569,7 @@ const crawlDomain = async ({
           urlsCrawled.userExcluded.push({
             url: request.url,
             pageTitle: request.url,
-            actualUrl
+            actualUrl, // because about:blank is not useful
             metadata: STATUS_CODE_METADATA[1],
             httpStatusCode: 0,
           });
@@ -656,15 +577,19 @@ const crawlDomain = async ({
         return;
       }

-      if (
+      if (
+        !isFollowStrategy(url, actualUrl, strategy) &&
+        blacklistedPatterns &&
+        isSkippedUrl(actualUrl, blacklistedPatterns)
+      ) {
         urlsCrawled.userExcluded.push({
           url: request.url,
           pageTitle: request.url,
-          actualUrl
+          actualUrl,
           metadata: STATUS_CODE_METADATA[0],
           httpStatusCode: 0,
         });
-
+
         guiInfoLog(guiInfoStatusTypes.SKIPPED, {
           numScanned: urlsCrawled.scanned.length,
           urlScanned: request.url,
@@ -679,11 +604,7 @@ const crawlDomain = async ({
       const isRedirected = !areLinksEqual(actualUrl, request.url);

       // check if redirected link is following strategy (same-domain/same-hostname)
-      const isLoadedUrlFollowStrategy = isFollowStrategy(
-        actualUrl,
-        request.url,
-        strategy,
-      );
+      const isLoadedUrlFollowStrategy = isFollowStrategy(actualUrl, request.url, strategy);
       if (isRedirected && !isLoadedUrlFollowStrategy) {
         urlsCrawled.notScannedRedirects.push({
           fromUrl: request.url,
@@ -693,7 +614,7 @@ const crawlDomain = async ({
       }

       const responseStatus = response?.status();
-
+      if (responseStatus && responseStatus >= 300) {
         guiInfoLog(guiInfoStatusTypes.SKIPPED, {
           numScanned: urlsCrawled.scanned.length,
           urlScanned: request.url,
@@ -706,7 +627,7 @@ const crawlDomain = async ({
           httpStatusCode: responseStatus,
         });
         return;
-
+      }

       const results = await runAxeScript({ includeScreenshots, page, randomToken, ruleset });

@@ -733,7 +654,7 @@ const crawlDomain = async ({
           urlsCrawled.scanned.push({
             url: urlWithoutAuth(request.url),
             pageTitle: results.pageTitle,
-            actualUrl
+            actualUrl, // i.e. actualUrl
           });

           urlsCrawled.scannedRedirects.push({
@@ -768,11 +689,10 @@ const crawlDomain = async ({
         urlsCrawled.userExcluded.push({
           url: request.url,
           pageTitle: request.url,
-          actualUrl
+          actualUrl, // because about:blank is not useful
           metadata: STATUS_CODE_METADATA[1],
           httpStatusCode: 0,
         });
-
       }

       if (followRobots) await getUrlsFromRobotsTxt(request.url, browser);
@@ -780,7 +700,7 @@ const crawlDomain = async ({
       } catch (e) {
         try {
           if (!e.message.includes('page.evaluate')) {
-
+            // do nothing;
             guiInfoLog(guiInfoStatusTypes.ERROR, {
               numScanned: urlsCrawled.scanned.length,
               urlScanned: request.url,
@@ -815,11 +735,11 @@ const crawlDomain = async ({
               urlScanned: request.url,
             });

-            urlsCrawled.error.push({
-              url: request.url,
-              pageTitle: request.url,
-              actualUrl: request.url,
-              metadata: STATUS_CODE_METADATA[2]
+            urlsCrawled.error.push({
+              url: request.url,
+              pageTitle: request.url,
+              actualUrl: request.url,
+              metadata: STATUS_CODE_METADATA[2],
             });
           }
         }
@@ -831,9 +751,10 @@ const crawlDomain = async ({
       });

       const status = response?.status();
-      const metadata =
-
-
+      const metadata =
+        typeof status === 'number'
+          ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
+          : STATUS_CODE_METADATA[2];

       urlsCrawled.error.push({
         url: request.url,
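
The failed-request handler now resolves metadata from the numeric status code, falling back to the generic 599 entry for unmapped codes and to entry 2 when there was no response at all. A sketch of that lookup; the metadata strings below are placeholders rather than the real `STATUS_CODE_METADATA` table:

```ts
// Placeholder table: the real STATUS_CODE_METADATA lives in constants.ts.
const STATUS_CODE_METADATA: Record<number, string> = {
  2: 'Scan error',
  403: 'Forbidden',
  404: 'Not Found',
  599: 'Unknown network error',
};

const metadataForStatus = (status: number | undefined): string =>
  typeof status === 'number'
    ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
    : STATUS_CODE_METADATA[2];

console.log(metadataForStatus(404));       // Not Found
console.log(metadataForStatus(418));       // Unknown network error (599 fallback)
console.log(metadataForStatus(undefined)); // Scan error (no response at all)
```
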
@@ -842,10 +763,18 @@ const crawlDomain = async ({
         metadata,
         httpStatusCode: typeof status === 'number' ? status : 0,
       });
-
     },
     maxRequestsPerCrawl: Infinity,
     maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
+    ...(process.env.OOBEE_FAST_CRAWLER && {
+      autoscaledPoolOptions: {
+        minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
+        maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
+        desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
+        scaleUpStepRatio: 0.99, // Scale up faster
+        scaleDownStepRatio: 0.1, // Scale down slower
+      },
+    }),
   });

   await crawler.run();
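
The `OOBEE_FAST_CRAWLER` environment variable gates a more aggressive autoscaling profile through a conditional object spread: when the variable is unset, `undefined && { ... }` evaluates to `undefined`, and spreading that into the options object adds nothing. A small sketch of the pattern with the same `autoscaledPoolOptions` knobs:

```ts
// Conditional spread: with the env var unset, the spread expression is undefined
// and contributes no properties; with it set, autoscaledPoolOptions is included.
const baseOptions = { maxConcurrency: 25 };

const crawlerOptions = {
  ...baseOptions,
  ...(process.env.OOBEE_FAST_CRAWLER && {
    autoscaledPoolOptions: {
      minConcurrency: 10,
      desiredConcurrencyRatio: 0.98, // scale up once 98% of desired concurrency is busy
      scaleUpStepRatio: 0.99,        // scale up in large steps
      scaleDownStepRatio: 0.1,       // scale down slowly
    },
  }),
};

console.log(Object.keys(crawlerOptions));
// ['maxConcurrency'], plus 'autoscaledPoolOptions' when OOBEE_FAST_CRAWLER is set
```
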
@@ -875,6 +804,10 @@ const crawlDomain = async ({
     guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
   }

+  if (scanDuration > 0) {
+    const elapsed = Math.round((Date.now() - crawlStartTime) / 1000);
+    console.log(`Crawl ended after ${elapsed}s. Limit: ${scanDuration}s.`);
+  }
   return urlsCrawled;
 };

package/src/crawlers/crawlIntelligentSitemap.ts

@@ -2,7 +2,7 @@ import fs from 'fs';
 import { chromium, Page } from 'playwright';
 import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
 import constants, { guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
-import {
+import { consoleLogger, guiInfoLog } from '../logs.js';
 import crawlDomain from './crawlDomain.js';
 import crawlSitemap from './crawlSitemap.js';
 import { EnqueueStrategy } from 'crawlee';
@@ -24,46 +24,42 @@ const crawlIntelligentSitemap = async (
   followRobots: boolean,
   extraHTTPHeaders: Record<string, string>,
   safeMode: boolean,
+  scanDuration: number
 ) => {
+  const startTime = Date.now(); // Track start time
+
   let urlsCrawledFinal;
-  let urlsCrawled;
+  let urlsCrawled = { ...constants.urlsCrawledObj };
   let dataset;
   let sitemapExist = false;
   const fromCrawlIntelligentSitemap = true;
   let sitemapUrl;

-  urlsCrawled = { ...constants.urlsCrawledObj };
   ({ dataset } = await createCrawleeSubFolders(randomToken));
-
   if (!fs.existsSync(randomToken)) {
     fs.mkdirSync(randomToken);
   }

   function getHomeUrl(parsedUrl: string) {
     const urlObject = new URL(parsedUrl);
-    if (urlObject.username
+    if (urlObject.username && urlObject.password) {
       return `${urlObject.protocol}//${urlObject.username}:${urlObject.password}@${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
     }
-
     return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
   }

   async function findSitemap(link: string) {
     const homeUrl = getHomeUrl(link);
-    let sitemapLinkFound = false;
     let sitemapLink = '';
-    const chromiumBrowser = await chromium.launch(
-
-
-
-
-    });
-
+    const chromiumBrowser = await chromium.launch({
+      headless: false,
+      channel: 'chrome',
+      args: ['--headless=new', '--no-sandbox'],
+    });
     const page = await chromiumBrowser.newPage();
     for (const path of sitemapPaths) {
       sitemapLink = homeUrl + path;
-
-      if (sitemapLinkFound) {
+      if (await checkUrlExists(page, sitemapLink)) {
         sitemapExist = true;
         break;
       }
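
`findSitemap` now launches the branded Chrome channel with Playwright's `headless: false` plus the `--headless=new` switch, i.e. Chrome's new headless mode rather than Playwright's default headless. A sketch of that launch pattern; the fallback to the bundled Chromium is an assumption added here for machines without Chrome installed:

```ts
import { chromium } from 'playwright';

// Launch Chrome's "new" headless mode: Playwright's own headless flag is turned off
// and headless is requested explicitly through a Chromium switch instead.
const launchNewHeadlessChrome = async () => {
  try {
    return await chromium.launch({
      headless: false,
      channel: 'chrome', // use the installed branded Chrome
      args: ['--headless=new', '--no-sandbox'],
    });
  } catch {
    // Assumption for this sketch: fall back to bundled Chromium if Chrome is missing.
    return chromium.launch({ headless: true });
  }
};

const browser = await launchNewHeadlessChrome();
const page = await browser.newPage();
await page.goto('https://example.com');
console.log(await page.title());
await browser.close();
```
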
@@ -75,12 +71,9 @@ const crawlIntelligentSitemap = async (
   const checkUrlExists = async (page: Page, parsedUrl: string) => {
     try {
       const response = await page.goto(parsedUrl);
-
-      return true;
-      }
-      return false;
+      return response.ok();
     } catch (e) {
-
+      consoleLogger.error(e);
       return false;
     }
   };
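
`checkUrlExists` now returns `response.ok()` rather than unconditionally returning true after `page.goto`, so a 404 or 500 at a candidate sitemap path no longer counts as a hit. A standalone sketch of probing candidate paths this way; the path list is illustrative, the real one is `sitemapPaths` from constants.ts:

```ts
import { chromium, Page } from 'playwright';

// Illustrative candidates; the real list is `sitemapPaths` in constants.ts.
const sitemapPaths = ['/sitemap.xml', '/sitemap_index.xml', '/sitemap/sitemap.xml'];

const checkUrlExists = async (page: Page, url: string): Promise<boolean> => {
  try {
    const response = await page.goto(url);
    // ok() is true only for 2xx responses, so an error page no longer counts as "found".
    return response !== null && response.ok();
  } catch (e) {
    console.error(e);
    return false;
  }
};

const browser = await chromium.launch();
const page = await browser.newPage();
for (const path of sitemapPaths) {
  const candidate = `https://example.com${path}`;
  if (await checkUrlExists(page, candidate)) {
    console.log(`Sitemap found at ${candidate}`);
    break;
  }
}
await browser.close();
```
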
@@ -88,13 +81,12 @@ const crawlIntelligentSitemap = async (
   try {
     sitemapUrl = await findSitemap(url);
   } catch (error) {
-
+    consoleLogger.error(error);
   }

   if (!sitemapExist) {
     console.log('Unable to find sitemap. Commencing website crawl instead.');
-
-    urlsCrawledFinal = await crawlDomain({
+    return await crawlDomain({
       url,
       randomToken,
       host,
@@ -109,12 +101,13 @@ const crawlIntelligentSitemap = async (
       includeScreenshots,
       followRobots,
       extraHTTPHeaders,
+      safeMode,
+      scanDuration, // Use full duration since no sitemap
     });
-    return urlsCrawledFinal;
   }
+
   console.log(`Sitemap found at ${sitemapUrl}`);
-
-  urlsCrawledFinal = await crawlSitemap(
+  urlsCrawledFinal = await crawlSitemap({
     sitemapUrl,
     randomToken,
     host,
@@ -128,14 +121,21 @@ const crawlIntelligentSitemap = async (
     includeScreenshots,
     extraHTTPHeaders,
     fromCrawlIntelligentSitemap,
-    url,
-    dataset,
-    urlsCrawled,
-    false,
-
+    userUrlInputFromIntelligent: url,
+    datasetFromIntelligent: dataset,
+    urlsCrawledFromIntelligent: urlsCrawled,
+    crawledFromLocalFile: false,
+    scanDuration,
+  });
+
+  const elapsed = Date.now() - startTime;
+  const remainingScanDuration = Math.max(scanDuration - elapsed / 1000, 0); // in seconds

-  if (
-
+  if (
+    urlsCrawledFinal.scanned.length < maxRequestsPerCrawl &&
+    remainingScanDuration > 0
+  ) {
+    console.log(`Continuing crawl from root website. Remaining scan time: ${remainingScanDuration.toFixed(1)}s`);
     urlsCrawledFinal = await crawlDomain({
       url,
       randomToken,
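
After the sitemap pass, the remaining scan budget is computed from the elapsed time, and the follow-up domain crawl only runs while both the page cap and the time budget still have headroom. The budgeting arithmetic in isolation:

```ts
// scanDuration and the return value are in seconds.
const remainingBudget = (startTime: number, scanDuration: number): number => {
  const elapsedSeconds = (Date.now() - startTime) / 1000;
  return Math.max(scanDuration - elapsedSeconds, 0);
};

const startTime = Date.now() - 110_000; // the sitemap crawl took about 110 s
const remaining = remainingBudget(startTime, 300);
console.log(remaining.toFixed(1)); // ~190.0 s left for the follow-up domain crawl

if (remaining > 0) {
  // continue with crawlDomain({ ..., scanDuration: remaining })
} else {
  console.log('Crawl duration exceeded before more pages could be found.');
}
```
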
@@ -153,12 +153,16 @@ const crawlIntelligentSitemap = async (
       extraHTTPHeaders,
       safeMode,
       fromCrawlIntelligentSitemap,
-      datasetFromIntelligent: dataset,
-      urlsCrawledFromIntelligent: urlsCrawledFinal,
+      datasetFromIntelligent: dataset,
+      urlsCrawledFromIntelligent: urlsCrawledFinal,
+      scanDuration: remainingScanDuration,
     });
+  } else if (remainingScanDuration <= 0) {
+    console.log(`Crawl duration exceeded before more pages could be found (limit: ${scanDuration}s).`);
   }

   guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
   return urlsCrawledFinal;
 };
+

 export default crawlIntelligentSitemap;