@govtechsg/oobee 0.10.91 → 0.10.93

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/AGENTS.md +303 -0
  2. package/README.md +22 -0
  3. package/dist/cli.js +3 -0
  4. package/dist/combine.js +15 -3
  5. package/dist/constants/cliFunctions.js +7 -0
  6. package/dist/constants/common.js +149 -80
  7. package/dist/constants/constants.js +1 -0
  8. package/dist/crawlers/commonCrawlerFunc.js +136 -15
  9. package/dist/crawlers/crawlDomain.js +55 -58
  10. package/dist/crawlers/crawlIntelligentSitemap.js +21 -11
  11. package/dist/crawlers/crawlRateController.js +47 -0
  12. package/dist/crawlers/crawlSitemap.js +51 -62
  13. package/dist/crawlers/runCustom.js +8 -2
  14. package/dist/generateOobeeClientScanner.js +32 -1
  15. package/dist/mergeAxeResults/itemsStore.js +32 -3
  16. package/dist/mergeAxeResults/sentryTelemetry.js +3 -0
  17. package/dist/mergeAxeResults.js +120 -92
  18. package/dist/npmIndex.js +1 -0
  19. package/dist/utils.js +23 -28
  20. package/oobee-client-scanner.js +35 -4
  21. package/package.json +3 -3
  22. package/src/cli.ts +4 -0
  23. package/src/combine.ts +16 -1
  24. package/src/constants/cliFunctions.ts +7 -0
  25. package/src/constants/common.ts +162 -90
  26. package/src/constants/constants.ts +1 -0
  27. package/src/crawlers/commonCrawlerFunc.ts +148 -14
  28. package/src/crawlers/crawlDomain.ts +64 -66
  29. package/src/crawlers/crawlIntelligentSitemap.ts +23 -11
  30. package/src/crawlers/crawlRateController.ts +63 -0
  31. package/src/crawlers/crawlSitemap.ts +57 -70
  32. package/src/crawlers/runCustom.ts +10 -1
  33. package/src/generateOobeeClientScanner.ts +32 -1
  34. package/src/index.ts +1 -0
  35. package/src/mergeAxeResults/itemsStore.ts +37 -3
  36. package/src/mergeAxeResults/sentryTelemetry.ts +3 -0
  37. package/src/mergeAxeResults.ts +139 -99
  38. package/src/npmIndex.ts +1 -0
  39. package/src/utils.ts +25 -33
  40. /package/{bf04540e-0894-4d00-98ec-c1be74c6f199.txt → 7339fae5-e8ed-4b50-af13-317847620dbf.txt} +0 -0
@@ -1,14 +1,15 @@
1
1
  import crawlee, { EnqueueStrategy } from 'crawlee';
2
+ import { CrawlRateController } from './crawlRateController.js';
2
3
  import type { BrowserContext, ElementHandle, Frame, Page } from 'playwright';
3
4
  import type { PlaywrightCrawlingContext, RequestOptions } from 'crawlee';
4
- import * as path from 'path';
5
- import fsp from 'fs/promises';
6
5
  import {
7
6
  createCrawleeSubFolders,
7
+ getPreLaunchHook,
8
8
  runAxeScript,
9
9
  isUrlPdf,
10
10
  shouldSkipClickDueToDisallowedHref,
11
11
  shouldSkipDueToUnsupportedContent,
12
+ splitAuthHeaders,
12
13
  } from './commonCrawlerFunc.js';
13
14
  import constants, {
14
15
  UrlsCrawled,
@@ -29,7 +30,7 @@ import {
29
30
  getUrlsFromRobotsTxt,
30
31
  waitForPageLoaded,
31
32
  } from '../constants/common.js';
32
- import { areLinksEqual, isFollowStrategy, normUrl, register } from '../utils.js';
33
+ import { areLinksEqual, isFollowStrategy, isSameHostname, normUrl, register } from '../utils.js';
33
34
  import {
34
35
  handlePdfDownload,
35
36
  runPdfScan,
@@ -364,9 +365,7 @@ const crawlDomain = async ({
364
365
  // same-domain strategy) still contribute their <a> links above, but
365
366
  // clicking every interactive element on them is too slow and starves
366
367
  // the crawler of time to discover pages on the primary hostname.
367
- const currentHostname = new URL(page.url()).hostname;
368
- const seedHostname = new URL(url).hostname;
369
- if (currentHostname === seedHostname) {
368
+ if (isSameHostname(new URL(page.url()).hostname, new URL(url).hostname)) {
370
369
  // Try catch is necessary as clicking links is best effort, it may result in new pages that cause browser load or navigation errors that PlaywrightCrawler does not handle
371
370
  try {
372
371
  await customEnqueueLinksByClickingElements(page, browserContext);
@@ -382,53 +381,46 @@ const crawlDomain = async ({
382
381
  };
383
382
 
384
383
  let isAbortingScanNow = false;
384
+ const rateController = new CrawlRateController(
385
+ maxRequestsPerCrawl,
386
+ specifiedMaxConcurrency || constants.maxConcurrency,
387
+ );
388
+
389
+ const { nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
385
390
 
386
391
  const crawler = register(
387
392
  new crawlee.PlaywrightCrawler({
388
393
  launchContext: {
389
394
  launcher: constants.launcher,
390
395
  launchOptions: getPlaywrightLaunchOptions(browser),
391
- // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
392
- ...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
393
396
  },
394
397
  retryOnBlocked: true,
395
398
  browserPoolOptions: {
396
399
  useFingerprints: false,
397
400
  preLaunchHooks: [
401
+ getPreLaunchHook(userDataDirectory),
398
402
  async (_pageId, launchContext) => {
399
- const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
400
-
401
- // Ensure base exists
402
- await fsp.mkdir(baseDir, { recursive: true });
403
-
404
- // Create a unique subdir per browser
405
- const subProfileDir = path.join(
406
- baseDir,
407
- `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
408
- );
409
- await fsp.mkdir(subProfileDir, { recursive: true });
410
-
411
- // Assign to Crawlee's launcher
412
- // Crawlee preLaunchHooks expects launchContext to be mutated in-place.
413
- // eslint-disable-next-line no-param-reassign
414
- launchContext.userDataDir = subProfileDir;
415
-
416
- // Safely extend launchOptions
417
403
  // eslint-disable-next-line no-param-reassign
418
404
  launchContext.launchOptions = {
419
405
  ...launchContext.launchOptions,
420
406
  ignoreHTTPSErrors: true,
421
407
  ...playwrightDeviceDetailsObject,
408
+ ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
422
409
  ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
423
- ...(extraHTTPHeaders && { extraHTTPHeaders }),
410
+ ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
411
+ ...(httpCredentials && { httpCredentials }),
424
412
  };
425
-
426
- // Optionally log for debugging
427
- // console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
428
413
  },
429
414
  ],
430
415
  },
431
416
  requestQueue,
417
+ preNavigationHooks: [
418
+ async (crawlingContext) => {
419
+ if (extraHTTPHeaders) {
420
+ crawlingContext.request.headers = extraHTTPHeaders;
421
+ }
422
+ },
423
+ ],
432
424
  postNavigationHooks: [
433
425
  async crawlingContext => {
434
426
  const { page, request } = crawlingContext;
@@ -527,11 +519,9 @@ const crawlDomain = async ({
527
519
  const hasExceededDuration =
528
520
  scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
529
521
 
530
- if (urlsCrawled.scanned.length >= maxRequestsPerCrawl || hasExceededDuration) {
531
- if (hasExceededDuration) {
532
- console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting website crawl.`);
533
- durationExceeded = true;
534
- }
522
+ if (hasExceededDuration) {
523
+ console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting website crawl.`);
524
+ durationExceeded = true;
535
525
  isAbortingScanNow = true;
536
526
  activeCrawler.autoscaledPool.abort();
537
527
  return;
@@ -691,8 +681,7 @@ const crawlDomain = async ({
691
681
  return;
692
682
  }
693
683
 
694
- // One more check if scanned pages have reached limit due to multi-instances of handler running
695
- if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
684
+ if (rateController.claimSlot()) {
696
685
  guiInfoLog(guiInfoStatusTypes.SCANNED, {
697
686
  numScanned: urlsCrawled.scanned.length,
698
687
  urlScanned: request.url,
@@ -703,6 +692,11 @@ const crawlDomain = async ({
703
692
  pageTitle: results.pageTitle,
704
693
  actualUrl, // i.e. actualUrl
705
694
  });
695
+ rateController.onSuccess(crawler.autoscaledPool);
696
+ if (rateController.isLimitReached()) {
697
+ isAbortingScanNow = true;
698
+ activeCrawler.autoscaledPool.abort();
699
+ }
706
700
  scannedUrlSet.add(normUrl(request.url));
707
701
  scannedResolvedUrlSet.add(normUrl(actualUrl));
708
702
 
@@ -715,8 +709,7 @@ const crawlDomain = async ({
715
709
  results.actualUrl = actualUrl;
716
710
  await dataset.pushData(results);
717
711
  }
718
- } else if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
719
- // One more check if scanned pages have reached limit due to multi-instances of handler running
712
+ } else if (rateController.claimSlot()) {
720
713
  guiInfoLog(guiInfoStatusTypes.SCANNED, {
721
714
  numScanned: urlsCrawled.scanned.length,
722
715
  urlScanned: request.url,
@@ -726,6 +719,11 @@ const crawlDomain = async ({
726
719
  actualUrl: request.url,
727
720
  pageTitle: results.pageTitle,
728
721
  });
722
+ rateController.onSuccess(crawler.autoscaledPool);
723
+ if (rateController.isLimitReached()) {
724
+ isAbortingScanNow = true;
725
+ activeCrawler.autoscaledPool.abort();
726
+ }
729
727
  scannedUrlSet.add(normUrl(request.url));
730
728
  scannedResolvedUrlSet.add(normUrl(request.url));
731
729
  await dataset.pushData(results);
@@ -777,33 +775,35 @@ const crawlDomain = async ({
777
775
  });
778
776
  }
779
777
  } catch {
780
- // Do nothing since the error will be pushed
778
+ // Recovery failed; Crawlee will retry the request automatically
781
779
  }
782
780
 
783
- // when max pages have been scanned, scan will abort and all relevant pages still opened will close instantly.
784
- // a browser close error will then be flagged. Since this is an intended behaviour, this error will be excluded.
785
- if (!isAbortingScanNow) {
786
- guiInfoLog(guiInfoStatusTypes.ERROR, {
787
- numScanned: urlsCrawled.scanned.length,
788
- urlScanned: request.url,
789
- });
790
-
791
- urlsCrawled.error.push({
792
- url: request.url,
793
- pageTitle: request.url,
794
- actualUrl: request.url,
795
- metadata: STATUS_CODE_METADATA[2],
796
- });
797
- }
781
+ // Do not push to urlsCrawled.error here — Crawlee will retry the request
782
+ // (up to maxRequestRetries, default 3). If all retries are exhausted,
783
+ // failedRequestHandler will record the error. Pushing here causes
784
+ // duplicates and false positives for URLs that succeed on retry.
798
785
  }
799
786
  },
800
787
  failedRequestHandler: async ({ request, response }) => {
788
+ if (isAbortingScanNow) {
789
+ return;
790
+ }
791
+
792
+ const status = response?.status();
793
+ if (rateController.onFailure(status, crawler.autoscaledPool)) {
794
+ consoleLogger.info(
795
+ `Aborting crawl: consecutive HTTP failures threshold reached (site may be rate-limiting). Successfully scanned ${urlsCrawled.scanned.length} pages.`,
796
+ );
797
+ isAbortingScanNow = true;
798
+ crawler.autoscaledPool?.abort();
799
+ return;
800
+ }
801
+
801
802
  guiInfoLog(guiInfoStatusTypes.ERROR, {
802
803
  numScanned: urlsCrawled.scanned.length,
803
804
  urlScanned: request.url,
804
805
  });
805
806
 
806
- const status = response?.status();
807
807
  const metadata =
808
808
  typeof status === 'number'
809
809
  ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
@@ -819,15 +819,13 @@ const crawlDomain = async ({
819
819
  },
820
820
  maxRequestsPerCrawl: Infinity,
821
821
  maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
822
- ...(process.env.OOBEE_FAST_CRAWLER && {
823
- autoscaledPoolOptions: {
824
- minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
825
- maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
826
- desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
827
- scaleUpStepRatio: 0.99, // Scale up faster
828
- scaleDownStepRatio: 0.1, // Scale down slower
829
- },
830
- }),
822
+ autoscaledPoolOptions: {
823
+ minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
824
+ maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
825
+ desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
826
+ scaleUpStepRatio: 0.99, // Scale up faster
827
+ scaleDownStepRatio: 0.1, // Scale down slower
828
+ },
831
829
  }),
832
830
  );
833
831
 
@@ -850,7 +848,7 @@ const crawlDomain = async ({
850
848
  .map(item => item.actualUrl || item.url)
851
849
  .filter(pageUrl => {
852
850
  try {
853
- return new URL(pageUrl).hostname === seedHostname && !clickPassVisited.has(pageUrl);
851
+ return isSameHostname(new URL(pageUrl).hostname, seedHostname) && !clickPassVisited.has(pageUrl);
854
852
  } catch {
855
853
  return false;
856
854
  }
@@ -1,13 +1,13 @@
1
1
  import fs from 'fs';
2
2
  import { chromium, Page } from 'playwright';
3
3
  import { EnqueueStrategy } from 'crawlee';
4
- import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
4
+ import { createCrawleeSubFolders, splitAuthHeaders, addAuthRouteHandler } from './commonCrawlerFunc.js';
5
5
  import constants, { FileTypes, guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
6
6
  import { consoleLogger, guiInfoLog } from '../logs.js';
7
7
  import crawlDomain from './crawlDomain.js';
8
8
  import crawlSitemap from './crawlSitemap.js';
9
9
  import { ViewportSettingsClass } from '../combine.js';
10
- import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt } from '../constants/common.js';
10
+ import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt, initModifiedUserAgent } from '../constants/common.js';
11
11
  import { register } from '../utils.js';
12
12
 
13
13
  const crawlIntelligentSitemap = async (
@@ -40,6 +40,10 @@ const crawlIntelligentSitemap = async (
40
40
 
41
41
  ({ dataset } = await createCrawleeSubFolders(randomToken));
42
42
 
43
+ // Initialise modified User-Agent early so sitemap discovery requests
44
+ // don't expose "HeadlessChrome" (which triggers bot-blocking on some sites).
45
+ await initModifiedUserAgent(browser);
46
+
43
47
  function getHomeUrl(parsedUrl: string) {
44
48
  const urlObject = new URL(parsedUrl);
45
49
  return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
@@ -54,6 +58,7 @@ const crawlIntelligentSitemap = async (
54
58
  let sitemapLink = '';
55
59
 
56
60
  const launchOptions = getPlaywrightLaunchOptions(browser);
61
+ const { authHeader, nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
57
62
  let context;
58
63
  let browserInstance;
59
64
 
@@ -61,18 +66,25 @@ const crawlIntelligentSitemap = async (
61
66
  const effectiveUserDataDirectory = userDataDirectory || '';
62
67
  context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
63
68
  ...launchOptions,
64
- ...(extraHTTPHeaders && { extraHTTPHeaders }),
69
+ ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
70
+ ...(httpCredentials && { httpCredentials }),
71
+ ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
65
72
  });
66
73
  register(context);
67
74
  } else {
68
- // In headful mode, avoid launchPersistentContext to prevent "Browser window not found"
69
75
  browserInstance = await constants.launcher.launch(launchOptions);
70
76
  register(browserInstance as unknown as { close: () => Promise<void> });
71
77
  context = await browserInstance.newContext({
72
- ...(extraHTTPHeaders && { extraHTTPHeaders }),
78
+ ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
79
+ ...(httpCredentials && { httpCredentials }),
80
+ ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
73
81
  });
74
82
  }
75
83
 
84
+ if (authHeader) {
85
+ await addAuthRouteHandler(context, link, authHeader);
86
+ }
87
+
76
88
  const page = await context.newPage();
77
89
 
78
90
  for (const path of sitemapPaths) {
@@ -93,7 +105,7 @@ const crawlIntelligentSitemap = async (
93
105
  const checkUrlExists = async (page: Page, parsedUrl: string) => {
94
106
  try {
95
107
  const response = await page.goto(parsedUrl);
96
- return response.ok();
108
+ return response?.ok() ?? false;
97
109
  } catch (e) {
98
110
  consoleLogger.error(e);
99
111
  return false;
@@ -105,7 +117,7 @@ const crawlIntelligentSitemap = async (
105
117
  try {
106
118
  sitemapUrls = await getSitemapsFromRobotsTxt(url, browser, userDataDirectory, extraHTTPHeaders);
107
119
  if (sitemapUrls.length > 0) {
108
- console.log(`Found ${sitemapUrls.length} sitemap(s) in robots.txt: ${sitemapUrls.join(', ')}`);
120
+ consoleLogger.info(`Found ${sitemapUrls.length} sitemap(s) in robots.txt: ${sitemapUrls.join(', ')}`);
109
121
  sitemapExist = true;
110
122
  }
111
123
  } catch (error) {
@@ -125,7 +137,7 @@ const crawlIntelligentSitemap = async (
125
137
  }
126
138
 
127
139
  if (!sitemapExist) {
128
- console.log('Unable to find sitemap. Commencing website crawl instead.');
140
+ consoleLogger.info('Unable to find sitemap. Commencing website crawl instead.');
129
141
  return await crawlDomain({
130
142
  url,
131
143
  randomToken,
@@ -157,7 +169,7 @@ const crawlIntelligentSitemap = async (
157
169
  break;
158
170
  }
159
171
 
160
- console.log(`Processing sitemap: ${currentSitemapUrl}`);
172
+ consoleLogger.info(`Processing sitemap: ${currentSitemapUrl}`);
161
173
  urlsCrawledFinal = await crawlSitemap({
162
174
  sitemapUrl: currentSitemapUrl,
163
175
  randomToken,
@@ -187,7 +199,7 @@ const crawlIntelligentSitemap = async (
187
199
  const hasDurationRemaining = scanDuration === 0 || remainingScanDuration > 0;
188
200
 
189
201
  if (urlsCrawled.scanned.length < maxRequestsPerCrawl && hasDurationRemaining) {
190
- console.log(
202
+ consoleLogger.info(
191
203
  `Continuing crawl from root website.${scanDuration > 0 ? ` Remaining scan time: ${remainingScanDuration.toFixed(1)}s` : ''}`,
192
204
  );
193
205
  urlsCrawledFinal = await crawlDomain({
@@ -212,7 +224,7 @@ const crawlIntelligentSitemap = async (
212
224
  scanDuration: remainingScanDuration,
213
225
  });
214
226
  } else if (!hasDurationRemaining) {
215
- console.log(
227
+ consoleLogger.info(
216
228
  `Crawl duration exceeded before more pages could be found (limit: ${scanDuration}s).`,
217
229
  );
218
230
  durationExceeded = true;
@@ -0,0 +1,63 @@
1
+ import { consoleLogger } from '../logs.js';
2
+
3
/**
 * Bookkeeping helper shared by the crawlers: it counts claimed page slots
 * against a page budget, tracks runs of consecutive HTTP failures/successes,
 * and adjusts the `maxConcurrency` of the pool object handed to it.
 *
 * The consecutive-failure abort threshold is read once, at construction time,
 * from `process.env.OOBEE_CONSECUTIVE_MAX_RETRIES` (default 100).
 */
export class CrawlRateController {
  /** Number of consecutive successes between two concurrency recovery steps. */
  private static readonly RECOVERY_INTERVAL = 10;

  /** Threshold used when the environment variable is unset or unusable. */
  private static readonly DEFAULT_MAX_CONSECUTIVE_FAILURES = 100;

  private scannedCount = 0;
  private consecutiveFailures = 0;
  private consecutiveSuccesses = 0;
  private readonly maxPages: number;
  private readonly maxConsecutiveFailures: number;
  private readonly originalMaxConcurrency: number;

  /**
   * @param maxRequestsPerCrawl Page budget; `claimSlot` refuses once this many slots were claimed.
   * @param maxConcurrency Ceiling that `onSuccess` never raises the pool's concurrency above.
   */
  constructor(maxRequestsPerCrawl: number, maxConcurrency: number) {
    this.maxPages = maxRequestsPerCrawl;
    this.maxConsecutiveFailures = CrawlRateController.resolveMaxConsecutiveFailures(
      process.env.OOBEE_CONSECUTIVE_MAX_RETRIES,
    );
    this.originalMaxConcurrency = maxConcurrency;
  }

  /**
   * Converts the raw environment value into the abort threshold.
   *
   * Unset, empty, non-numeric, zero and negative values all fall back to the
   * default. A negative threshold would otherwise satisfy
   * `consecutiveFailures >= threshold` on the very first failure and abort the
   * crawl immediately.
   */
  private static resolveMaxConsecutiveFailures(raw: string | undefined): number {
    const parsed = Number(raw);
    // `NaN > 0` is false, so unparsable input takes the default branch as well.
    return parsed > 0 ? parsed : CrawlRateController.DEFAULT_MAX_CONSECUTIVE_FAILURES;
  }

  /**
   * Claims one page slot if the budget is not yet used up.
   * The check and the increment happen in a single synchronous call.
   *
   * @returns `true` when a slot was claimed, `false` when the budget is exhausted.
   */
  claimSlot(): boolean {
    if (this.scannedCount >= this.maxPages) {
      return false;
    }
    this.scannedCount++;
    return true;
  }

  /**
   * Records a successful page: clears the failure streak and, on every
   * `RECOVERY_INTERVAL`-th consecutive success, raises `pool.maxConcurrency`
   * by 2 without exceeding the concurrency given to the constructor.
   *
   * @param pool Object whose `maxConcurrency` is adjusted; nothing is adjusted when omitted.
   */
  onSuccess(pool?: { maxConcurrency: number }): void {
    this.consecutiveFailures = 0;
    this.consecutiveSuccesses++;

    if (pool && this.consecutiveSuccesses % CrawlRateController.RECOVERY_INTERVAL === 0) {
      if (pool.maxConcurrency < this.originalMaxConcurrency) {
        pool.maxConcurrency = Math.min(pool.maxConcurrency + 2, this.originalMaxConcurrency);
        consoleLogger.info(`Recovering concurrency to ${pool.maxConcurrency}`);
      }
    }
  }

  /**
   * Records a failed request. Failures without a numeric status, or with a
   * status below 400, are ignored and leave all counters untouched.
   *
   * For statuses >= 400 the success streak is cleared, the failure streak
   * grows by one and `pool.maxConcurrency` is halved (never below 1).
   *
   * @param httpStatus HTTP status of the failed response, if there was one.
   * @param pool Object whose `maxConcurrency` is adjusted; nothing is adjusted when omitted.
   * @returns `true` once the failure streak reaches the configured threshold.
   */
  onFailure(httpStatus: number | undefined, pool?: { maxConcurrency: number }): boolean {
    if (typeof httpStatus !== 'number' || httpStatus < 400) {
      return false;
    }

    this.consecutiveSuccesses = 0;
    this.consecutiveFailures++;

    if (pool && pool.maxConcurrency > 1) {
      pool.maxConcurrency = Math.max(1, Math.floor(pool.maxConcurrency / 2));
      consoleLogger.info(
        `Rate limited (HTTP ${httpStatus}) — reducing concurrency to ${pool.maxConcurrency}`,
      );
    }

    return this.consecutiveFailures >= this.maxConsecutiveFailures;
  }

  /** @returns `true` when the number of claimed slots has reached the page budget. */
  isLimitReached(): boolean {
    return this.scannedCount >= this.maxPages;
  }
}
@@ -1,9 +1,9 @@
1
1
  import crawlee, { EnqueueStrategy, LaunchContext, Request, RequestList, Dataset } from 'crawlee';
2
+ import { CrawlRateController } from './crawlRateController.js';
2
3
  import fs from 'fs';
3
- import * as path from 'path';
4
- import fsp from 'fs/promises';
5
4
  import {
6
5
  createCrawleeSubFolders,
6
+ getPreLaunchHook,
7
7
  preNavigationHooks,
8
8
  runAxeScript,
9
9
  isUrlPdf,
@@ -30,7 +30,7 @@ import {
30
30
  mapPdfScanResults,
31
31
  doPdfScreenshots,
32
32
  } from './pdfScanFunc.js';
33
- import { guiInfoLog } from '../logs.js';
33
+ import { consoleLogger, guiInfoLog } from '../logs.js';
34
34
  import { ViewportSettingsClass } from '../combine.js';
35
35
 
36
36
  const crawlSitemap = async ({
@@ -81,6 +81,10 @@ const crawlSitemap = async ({
81
81
  let urlsCrawled: UrlsCrawled;
82
82
  let durationExceeded = false;
83
83
  let isAbortingScan = false;
84
+ const rateController = new CrawlRateController(
85
+ maxRequestsPerCrawl,
86
+ specifiedMaxConcurrency || constants.maxConcurrency,
87
+ );
84
88
 
85
89
  if (fromCrawlIntelligentSitemap) {
86
90
  dataset = datasetFromIntelligent;
@@ -125,39 +129,20 @@ const crawlSitemap = async ({
125
129
  launchContext: {
126
130
  launcher: constants.launcher,
127
131
  launchOptions: getPlaywrightLaunchOptions(browser),
128
- // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
129
- ...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
130
132
  },
131
133
  retryOnBlocked: true,
132
134
  browserPoolOptions: {
133
135
  useFingerprints: false,
134
136
  preLaunchHooks: [
137
+ getPreLaunchHook(userDataDirectory),
135
138
  async (_pageId, launchContext) => {
136
- const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
137
-
138
- // Ensure base exists
139
- await fsp.mkdir(baseDir, { recursive: true });
140
-
141
- // Create a unique subdir per browser
142
- const subProfileDir = path.join(
143
- baseDir,
144
- `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
145
- );
146
- await fsp.mkdir(subProfileDir, { recursive: true });
147
-
148
- // Assign to Crawlee's launcher
149
- launchContext.userDataDir = subProfileDir;
150
-
151
- // Safely extend launchOptions
152
139
  launchContext.launchOptions = {
153
140
  ...launchContext.launchOptions,
154
141
  ignoreHTTPSErrors: true,
155
142
  ...playwrightDeviceDetailsObject,
143
+ ...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
156
144
  ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
157
145
  };
158
-
159
- // Optionally log for debugging
160
- // console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
161
146
  },
162
147
  ],
163
148
  },
@@ -259,13 +244,11 @@ const crawlSitemap = async ({
259
244
  const hasExceededDuration =
260
245
  scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
261
246
 
262
- if (urlsCrawled.scanned.length >= maxRequestsPerCrawl || hasExceededDuration) {
247
+ if (hasExceededDuration) {
248
+ consoleLogger.info(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
249
+ durationExceeded = true;
263
250
  isAbortingScan = true;
264
- if (hasExceededDuration) {
265
- console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
266
- durationExceeded = true;
267
- }
268
- crawler.autoscaledPool.abort(); // stops new requests
251
+ crawler.autoscaledPool.abort();
269
252
  return;
270
253
  }
271
254
 
@@ -376,26 +359,33 @@ const crawlSitemap = async ({
376
359
  // Page/context was destroyed during navigation — handled by outer catch
377
360
  }
378
361
 
379
- guiInfoLog(guiInfoStatusTypes.SCANNED, {
380
- numScanned: urlsCrawled.scanned.length,
381
- urlScanned: request.url,
382
- });
362
+ if (rateController.claimSlot()) {
363
+ guiInfoLog(guiInfoStatusTypes.SCANNED, {
364
+ numScanned: urlsCrawled.scanned.length,
365
+ urlScanned: request.url,
366
+ });
383
367
 
384
- urlsCrawled.scanned.push({
385
- url: request.url,
386
- pageTitle: results.pageTitle,
387
- actualUrl, // i.e. actualUrl
388
- });
368
+ urlsCrawled.scanned.push({
369
+ url: request.url,
370
+ pageTitle: results.pageTitle,
371
+ actualUrl, // i.e. actualUrl
372
+ });
373
+ rateController.onSuccess(crawler.autoscaledPool);
374
+ if (rateController.isLimitReached()) {
375
+ isAbortingScan = true;
376
+ crawler.autoscaledPool.abort();
377
+ }
389
378
 
390
- urlsCrawled.scannedRedirects.push({
391
- fromUrl: request.url,
392
- toUrl: actualUrl,
393
- });
379
+ urlsCrawled.scannedRedirects.push({
380
+ fromUrl: request.url,
381
+ toUrl: actualUrl,
382
+ });
394
383
 
395
- results.url = request.url;
396
- results.actualUrl = actualUrl;
384
+ results.url = request.url;
385
+ results.actualUrl = actualUrl;
397
386
 
398
- await dataset.pushData(results);
387
+ await dataset.pushData(results);
388
+ }
399
389
  } else {
400
390
  guiInfoLog(guiInfoStatusTypes.SKIPPED, {
401
391
  numScanned: urlsCrawled.scanned.length,
@@ -420,20 +410,10 @@ const crawlSitemap = async ({
420
410
  }
421
411
  }
422
412
  } catch (e) {
423
- if (!isAbortingScan) {
424
- guiInfoLog(guiInfoStatusTypes.ERROR, {
425
- numScanned: urlsCrawled.scanned.length,
426
- urlScanned: request.url,
427
- });
428
-
429
- urlsCrawled.error.push({
430
- url: request.url,
431
- pageTitle: request.url,
432
- actualUrl: request.url,
433
- metadata: STATUS_CODE_METADATA[2],
434
- httpStatusCode: 0,
435
- });
436
- }
413
+ // Do not push to urlsCrawled.error here — Crawlee will retry the request
414
+ // (up to maxRequestRetries, default 3). If all retries are exhausted,
415
+ // failedRequestHandler will record the error. Pushing here causes
416
+ // duplicates and false positives for URLs that succeed on retry.
437
417
  }
438
418
  },
439
419
  failedRequestHandler: async ({ request, response, error }) => {
@@ -441,12 +421,21 @@ const crawlSitemap = async ({
441
421
  return;
442
422
  }
443
423
 
424
+ const status = response?.status();
425
+ if (rateController.onFailure(status, crawler.autoscaledPool)) {
426
+ consoleLogger.info(
427
+ `Aborting crawl: consecutive HTTP failures threshold reached (site may be rate-limiting). Successfully scanned ${urlsCrawled.scanned.length} pages.`,
428
+ );
429
+ isAbortingScan = true;
430
+ crawler.autoscaledPool?.abort();
431
+ return;
432
+ }
433
+
444
434
  guiInfoLog(guiInfoStatusTypes.ERROR, {
445
435
  numScanned: urlsCrawled.scanned.length,
446
436
  urlScanned: request.url,
447
437
  });
448
438
 
449
- const status = response?.status();
450
439
  const metadata =
451
440
  typeof status === 'number'
452
441
  ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
@@ -463,15 +452,13 @@ const crawlSitemap = async ({
463
452
  },
464
453
  maxRequestsPerCrawl: Infinity,
465
454
  maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
466
- ...(process.env.OOBEE_FAST_CRAWLER && {
467
- autoscaledPoolOptions: {
468
- minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
469
- maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
470
- desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
471
- scaleUpStepRatio: 0.99, // Scale up faster
472
- scaleDownStepRatio: 0.1, // Scale down slower
473
- },
474
- }),
455
+ autoscaledPoolOptions: {
456
+ minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
457
+ maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
458
+ desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
459
+ scaleUpStepRatio: 0.99, // Scale up faster
460
+ scaleDownStepRatio: 0.1, // Scale down slower
461
+ },
475
462
  }),
476
463
  );
477
464
 
@@ -1,5 +1,5 @@
1
1
  /* eslint-env browser */
2
- import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
2
+ import { createCrawleeSubFolders, splitAuthHeaders, addAuthRouteHandler } from './commonCrawlerFunc.js';
3
3
  import { cleanUpAndExit, register, registerSoftClose } from '../utils.js';
4
4
  import constants, {
5
5
  getIntermediateScreenshotsPath,
@@ -60,6 +60,7 @@ const runCustom = async (
60
60
  blacklistedPatterns: string[] | null,
61
61
  includeScreenshots: boolean,
62
62
  initialCustomFlowLabel?: string,
63
+ extraHTTPHeaders?: Record<string, string>,
63
64
  ) => {
64
65
  // checks and delete datasets path if it already exists
65
66
  process.env.CRAWLEE_STORAGE_DIR = randomToken;
@@ -109,6 +110,8 @@ const runCustom = async (
109
110
  ...customArgs,
110
111
  ];
111
112
 
113
+ const { authHeader, nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
114
+
112
115
  const context = await constants.launcher.launchPersistentContext(userDataDirectory, {
113
116
  ...baseLaunchOptions,
114
117
  args: mergedArgs,
@@ -118,8 +121,14 @@ const runCustom = async (
118
121
  viewport: null,
119
122
  ...(hasCustomViewport ? contextDeviceOptions : {}),
120
123
  userAgent: process.env.OOBEE_USER_AGENT || (deviceUserAgent as string | undefined),
124
+ ...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
125
+ ...(httpCredentials && { httpCredentials }),
121
126
  });
122
127
 
128
+ if (authHeader) {
129
+ await addAuthRouteHandler(context, url, authHeader);
130
+ }
131
+
123
132
  register(context);
124
133
 
125
134
  processPageParams.stopAll = async () => {