@govtechsg/oobee 0.10.91 → 0.10.93
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +303 -0
- package/README.md +22 -0
- package/dist/cli.js +3 -0
- package/dist/combine.js +15 -3
- package/dist/constants/cliFunctions.js +7 -0
- package/dist/constants/common.js +149 -80
- package/dist/constants/constants.js +1 -0
- package/dist/crawlers/commonCrawlerFunc.js +136 -15
- package/dist/crawlers/crawlDomain.js +55 -58
- package/dist/crawlers/crawlIntelligentSitemap.js +21 -11
- package/dist/crawlers/crawlRateController.js +47 -0
- package/dist/crawlers/crawlSitemap.js +51 -62
- package/dist/crawlers/runCustom.js +8 -2
- package/dist/generateOobeeClientScanner.js +32 -1
- package/dist/mergeAxeResults/itemsStore.js +32 -3
- package/dist/mergeAxeResults/sentryTelemetry.js +3 -0
- package/dist/mergeAxeResults.js +120 -92
- package/dist/npmIndex.js +1 -0
- package/dist/utils.js +23 -28
- package/oobee-client-scanner.js +35 -4
- package/package.json +3 -3
- package/src/cli.ts +4 -0
- package/src/combine.ts +16 -1
- package/src/constants/cliFunctions.ts +7 -0
- package/src/constants/common.ts +162 -90
- package/src/constants/constants.ts +1 -0
- package/src/crawlers/commonCrawlerFunc.ts +148 -14
- package/src/crawlers/crawlDomain.ts +64 -66
- package/src/crawlers/crawlIntelligentSitemap.ts +23 -11
- package/src/crawlers/crawlRateController.ts +63 -0
- package/src/crawlers/crawlSitemap.ts +57 -70
- package/src/crawlers/runCustom.ts +10 -1
- package/src/generateOobeeClientScanner.ts +32 -1
- package/src/index.ts +1 -0
- package/src/mergeAxeResults/itemsStore.ts +37 -3
- package/src/mergeAxeResults/sentryTelemetry.ts +3 -0
- package/src/mergeAxeResults.ts +139 -99
- package/src/npmIndex.ts +1 -0
- package/src/utils.ts +25 -33
- /package/{bf04540e-0894-4d00-98ec-c1be74c6f199.txt → 7339fae5-e8ed-4b50-af13-317847620dbf.txt} +0 -0
|
@@ -1,14 +1,15 @@
|
|
|
1
1
|
import crawlee, { EnqueueStrategy } from 'crawlee';
|
|
2
|
+
import { CrawlRateController } from './crawlRateController.js';
|
|
2
3
|
import type { BrowserContext, ElementHandle, Frame, Page } from 'playwright';
|
|
3
4
|
import type { PlaywrightCrawlingContext, RequestOptions } from 'crawlee';
|
|
4
|
-
import * as path from 'path';
|
|
5
|
-
import fsp from 'fs/promises';
|
|
6
5
|
import {
|
|
7
6
|
createCrawleeSubFolders,
|
|
7
|
+
getPreLaunchHook,
|
|
8
8
|
runAxeScript,
|
|
9
9
|
isUrlPdf,
|
|
10
10
|
shouldSkipClickDueToDisallowedHref,
|
|
11
11
|
shouldSkipDueToUnsupportedContent,
|
|
12
|
+
splitAuthHeaders,
|
|
12
13
|
} from './commonCrawlerFunc.js';
|
|
13
14
|
import constants, {
|
|
14
15
|
UrlsCrawled,
|
|
@@ -29,7 +30,7 @@ import {
|
|
|
29
30
|
getUrlsFromRobotsTxt,
|
|
30
31
|
waitForPageLoaded,
|
|
31
32
|
} from '../constants/common.js';
|
|
32
|
-
import { areLinksEqual, isFollowStrategy, normUrl, register } from '../utils.js';
|
|
33
|
+
import { areLinksEqual, isFollowStrategy, isSameHostname, normUrl, register } from '../utils.js';
|
|
33
34
|
import {
|
|
34
35
|
handlePdfDownload,
|
|
35
36
|
runPdfScan,
|
|
@@ -364,9 +365,7 @@ const crawlDomain = async ({
|
|
|
364
365
|
// same-domain strategy) still contribute their <a> links above, but
|
|
365
366
|
// clicking every interactive element on them is too slow and starves
|
|
366
367
|
// the crawler of time to discover pages on the primary hostname.
|
|
367
|
-
|
|
368
|
-
const seedHostname = new URL(url).hostname;
|
|
369
|
-
if (currentHostname === seedHostname) {
|
|
368
|
+
if (isSameHostname(new URL(page.url()).hostname, new URL(url).hostname)) {
|
|
370
369
|
// Try catch is necessary as clicking links is best effort, it may result in new pages that cause browser load or navigation errors that PlaywrightCrawler does not handle
|
|
371
370
|
try {
|
|
372
371
|
await customEnqueueLinksByClickingElements(page, browserContext);
|
|
@@ -382,53 +381,46 @@ const crawlDomain = async ({
|
|
|
382
381
|
};
|
|
383
382
|
|
|
384
383
|
let isAbortingScanNow = false;
|
|
384
|
+
const rateController = new CrawlRateController(
|
|
385
|
+
maxRequestsPerCrawl,
|
|
386
|
+
specifiedMaxConcurrency || constants.maxConcurrency,
|
|
387
|
+
);
|
|
388
|
+
|
|
389
|
+
const { nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
|
|
385
390
|
|
|
386
391
|
const crawler = register(
|
|
387
392
|
new crawlee.PlaywrightCrawler({
|
|
388
393
|
launchContext: {
|
|
389
394
|
launcher: constants.launcher,
|
|
390
395
|
launchOptions: getPlaywrightLaunchOptions(browser),
|
|
391
|
-
// Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
|
|
392
|
-
...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
|
|
393
396
|
},
|
|
394
397
|
retryOnBlocked: true,
|
|
395
398
|
browserPoolOptions: {
|
|
396
399
|
useFingerprints: false,
|
|
397
400
|
preLaunchHooks: [
|
|
401
|
+
getPreLaunchHook(userDataDirectory),
|
|
398
402
|
async (_pageId, launchContext) => {
|
|
399
|
-
const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
|
|
400
|
-
|
|
401
|
-
// Ensure base exists
|
|
402
|
-
await fsp.mkdir(baseDir, { recursive: true });
|
|
403
|
-
|
|
404
|
-
// Create a unique subdir per browser
|
|
405
|
-
const subProfileDir = path.join(
|
|
406
|
-
baseDir,
|
|
407
|
-
`profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
|
|
408
|
-
);
|
|
409
|
-
await fsp.mkdir(subProfileDir, { recursive: true });
|
|
410
|
-
|
|
411
|
-
// Assign to Crawlee's launcher
|
|
412
|
-
// Crawlee preLaunchHooks expects launchContext to be mutated in-place.
|
|
413
|
-
// eslint-disable-next-line no-param-reassign
|
|
414
|
-
launchContext.userDataDir = subProfileDir;
|
|
415
|
-
|
|
416
|
-
// Safely extend launchOptions
|
|
417
403
|
// eslint-disable-next-line no-param-reassign
|
|
418
404
|
launchContext.launchOptions = {
|
|
419
405
|
...launchContext.launchOptions,
|
|
420
406
|
ignoreHTTPSErrors: true,
|
|
421
407
|
...playwrightDeviceDetailsObject,
|
|
408
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
422
409
|
...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
|
|
423
|
-
...(
|
|
410
|
+
...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
|
|
411
|
+
...(httpCredentials && { httpCredentials }),
|
|
424
412
|
};
|
|
425
|
-
|
|
426
|
-
// Optionally log for debugging
|
|
427
|
-
// console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
|
|
428
413
|
},
|
|
429
414
|
],
|
|
430
415
|
},
|
|
431
416
|
requestQueue,
|
|
417
|
+
preNavigationHooks: [
|
|
418
|
+
async (crawlingContext) => {
|
|
419
|
+
if (extraHTTPHeaders) {
|
|
420
|
+
crawlingContext.request.headers = extraHTTPHeaders;
|
|
421
|
+
}
|
|
422
|
+
},
|
|
423
|
+
],
|
|
432
424
|
postNavigationHooks: [
|
|
433
425
|
async crawlingContext => {
|
|
434
426
|
const { page, request } = crawlingContext;
|
|
@@ -527,11 +519,9 @@ const crawlDomain = async ({
|
|
|
527
519
|
const hasExceededDuration =
|
|
528
520
|
scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
|
|
529
521
|
|
|
530
|
-
if (
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
durationExceeded = true;
|
|
534
|
-
}
|
|
522
|
+
if (hasExceededDuration) {
|
|
523
|
+
console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting website crawl.`);
|
|
524
|
+
durationExceeded = true;
|
|
535
525
|
isAbortingScanNow = true;
|
|
536
526
|
activeCrawler.autoscaledPool.abort();
|
|
537
527
|
return;
|
|
@@ -691,8 +681,7 @@ const crawlDomain = async ({
|
|
|
691
681
|
return;
|
|
692
682
|
}
|
|
693
683
|
|
|
694
|
-
|
|
695
|
-
if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
|
|
684
|
+
if (rateController.claimSlot()) {
|
|
696
685
|
guiInfoLog(guiInfoStatusTypes.SCANNED, {
|
|
697
686
|
numScanned: urlsCrawled.scanned.length,
|
|
698
687
|
urlScanned: request.url,
|
|
@@ -703,6 +692,11 @@ const crawlDomain = async ({
|
|
|
703
692
|
pageTitle: results.pageTitle,
|
|
704
693
|
actualUrl, // i.e. actualUrl
|
|
705
694
|
});
|
|
695
|
+
rateController.onSuccess(crawler.autoscaledPool);
|
|
696
|
+
if (rateController.isLimitReached()) {
|
|
697
|
+
isAbortingScanNow = true;
|
|
698
|
+
activeCrawler.autoscaledPool.abort();
|
|
699
|
+
}
|
|
706
700
|
scannedUrlSet.add(normUrl(request.url));
|
|
707
701
|
scannedResolvedUrlSet.add(normUrl(actualUrl));
|
|
708
702
|
|
|
@@ -715,8 +709,7 @@ const crawlDomain = async ({
|
|
|
715
709
|
results.actualUrl = actualUrl;
|
|
716
710
|
await dataset.pushData(results);
|
|
717
711
|
}
|
|
718
|
-
} else if (
|
|
719
|
-
// One more check if scanned pages have reached limit due to multi-instances of handler running
|
|
712
|
+
} else if (rateController.claimSlot()) {
|
|
720
713
|
guiInfoLog(guiInfoStatusTypes.SCANNED, {
|
|
721
714
|
numScanned: urlsCrawled.scanned.length,
|
|
722
715
|
urlScanned: request.url,
|
|
@@ -726,6 +719,11 @@ const crawlDomain = async ({
|
|
|
726
719
|
actualUrl: request.url,
|
|
727
720
|
pageTitle: results.pageTitle,
|
|
728
721
|
});
|
|
722
|
+
rateController.onSuccess(crawler.autoscaledPool);
|
|
723
|
+
if (rateController.isLimitReached()) {
|
|
724
|
+
isAbortingScanNow = true;
|
|
725
|
+
activeCrawler.autoscaledPool.abort();
|
|
726
|
+
}
|
|
729
727
|
scannedUrlSet.add(normUrl(request.url));
|
|
730
728
|
scannedResolvedUrlSet.add(normUrl(request.url));
|
|
731
729
|
await dataset.pushData(results);
|
|
@@ -777,33 +775,35 @@ const crawlDomain = async ({
|
|
|
777
775
|
});
|
|
778
776
|
}
|
|
779
777
|
} catch {
|
|
780
|
-
//
|
|
778
|
+
// Recovery failed; Crawlee will retry the request automatically
|
|
781
779
|
}
|
|
782
780
|
|
|
783
|
-
//
|
|
784
|
-
//
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
numScanned: urlsCrawled.scanned.length,
|
|
788
|
-
urlScanned: request.url,
|
|
789
|
-
});
|
|
790
|
-
|
|
791
|
-
urlsCrawled.error.push({
|
|
792
|
-
url: request.url,
|
|
793
|
-
pageTitle: request.url,
|
|
794
|
-
actualUrl: request.url,
|
|
795
|
-
metadata: STATUS_CODE_METADATA[2],
|
|
796
|
-
});
|
|
797
|
-
}
|
|
781
|
+
// Do not push to urlsCrawled.error here — Crawlee will retry the request
|
|
782
|
+
// (up to maxRequestRetries, default 3). If all retries are exhausted,
|
|
783
|
+
// failedRequestHandler will record the error. Pushing here causes
|
|
784
|
+
// duplicates and false positives for URLs that succeed on retry.
|
|
798
785
|
}
|
|
799
786
|
},
|
|
800
787
|
failedRequestHandler: async ({ request, response }) => {
|
|
788
|
+
if (isAbortingScanNow) {
|
|
789
|
+
return;
|
|
790
|
+
}
|
|
791
|
+
|
|
792
|
+
const status = response?.status();
|
|
793
|
+
if (rateController.onFailure(status, crawler.autoscaledPool)) {
|
|
794
|
+
consoleLogger.info(
|
|
795
|
+
`Aborting crawl: consecutive HTTP failures threshold reached (site may be rate-limiting). Successfully scanned ${urlsCrawled.scanned.length} pages.`,
|
|
796
|
+
);
|
|
797
|
+
isAbortingScanNow = true;
|
|
798
|
+
crawler.autoscaledPool?.abort();
|
|
799
|
+
return;
|
|
800
|
+
}
|
|
801
|
+
|
|
801
802
|
guiInfoLog(guiInfoStatusTypes.ERROR, {
|
|
802
803
|
numScanned: urlsCrawled.scanned.length,
|
|
803
804
|
urlScanned: request.url,
|
|
804
805
|
});
|
|
805
806
|
|
|
806
|
-
const status = response?.status();
|
|
807
807
|
const metadata =
|
|
808
808
|
typeof status === 'number'
|
|
809
809
|
? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
|
|
@@ -819,15 +819,13 @@ const crawlDomain = async ({
|
|
|
819
819
|
},
|
|
820
820
|
maxRequestsPerCrawl: Infinity,
|
|
821
821
|
maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
},
|
|
830
|
-
}),
|
|
822
|
+
autoscaledPoolOptions: {
|
|
823
|
+
minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
|
|
824
|
+
maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
|
|
825
|
+
desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
|
|
826
|
+
scaleUpStepRatio: 0.99, // Scale up faster
|
|
827
|
+
scaleDownStepRatio: 0.1, // Scale down slower
|
|
828
|
+
},
|
|
831
829
|
}),
|
|
832
830
|
);
|
|
833
831
|
|
|
@@ -850,7 +848,7 @@ const crawlDomain = async ({
|
|
|
850
848
|
.map(item => item.actualUrl || item.url)
|
|
851
849
|
.filter(pageUrl => {
|
|
852
850
|
try {
|
|
853
|
-
return new URL(pageUrl).hostname
|
|
851
|
+
return isSameHostname(new URL(pageUrl).hostname, seedHostname) && !clickPassVisited.has(pageUrl);
|
|
854
852
|
} catch {
|
|
855
853
|
return false;
|
|
856
854
|
}
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
import fs from 'fs';
|
|
2
2
|
import { chromium, Page } from 'playwright';
|
|
3
3
|
import { EnqueueStrategy } from 'crawlee';
|
|
4
|
-
import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
|
|
4
|
+
import { createCrawleeSubFolders, splitAuthHeaders, addAuthRouteHandler } from './commonCrawlerFunc.js';
|
|
5
5
|
import constants, { FileTypes, guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
|
|
6
6
|
import { consoleLogger, guiInfoLog } from '../logs.js';
|
|
7
7
|
import crawlDomain from './crawlDomain.js';
|
|
8
8
|
import crawlSitemap from './crawlSitemap.js';
|
|
9
9
|
import { ViewportSettingsClass } from '../combine.js';
|
|
10
|
-
import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt } from '../constants/common.js';
|
|
10
|
+
import { getPlaywrightLaunchOptions, getSitemapsFromRobotsTxt, initModifiedUserAgent } from '../constants/common.js';
|
|
11
11
|
import { register } from '../utils.js';
|
|
12
12
|
|
|
13
13
|
const crawlIntelligentSitemap = async (
|
|
@@ -40,6 +40,10 @@ const crawlIntelligentSitemap = async (
|
|
|
40
40
|
|
|
41
41
|
({ dataset } = await createCrawleeSubFolders(randomToken));
|
|
42
42
|
|
|
43
|
+
// Initialise modified User-Agent early so sitemap discovery requests
|
|
44
|
+
// don't expose "HeadlessChrome" (which triggers bot-blocking on some sites).
|
|
45
|
+
await initModifiedUserAgent(browser);
|
|
46
|
+
|
|
43
47
|
function getHomeUrl(parsedUrl: string) {
|
|
44
48
|
const urlObject = new URL(parsedUrl);
|
|
45
49
|
return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
|
|
@@ -54,6 +58,7 @@ const crawlIntelligentSitemap = async (
|
|
|
54
58
|
let sitemapLink = '';
|
|
55
59
|
|
|
56
60
|
const launchOptions = getPlaywrightLaunchOptions(browser);
|
|
61
|
+
const { authHeader, nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
|
|
57
62
|
let context;
|
|
58
63
|
let browserInstance;
|
|
59
64
|
|
|
@@ -61,18 +66,25 @@ const crawlIntelligentSitemap = async (
|
|
|
61
66
|
const effectiveUserDataDirectory = userDataDirectory || '';
|
|
62
67
|
context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
|
|
63
68
|
...launchOptions,
|
|
64
|
-
...(
|
|
69
|
+
...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
|
|
70
|
+
...(httpCredentials && { httpCredentials }),
|
|
71
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
65
72
|
});
|
|
66
73
|
register(context);
|
|
67
74
|
} else {
|
|
68
|
-
// In headful mode, avoid launchPersistentContext to prevent "Browser window not found"
|
|
69
75
|
browserInstance = await constants.launcher.launch(launchOptions);
|
|
70
76
|
register(browserInstance as unknown as { close: () => Promise<void> });
|
|
71
77
|
context = await browserInstance.newContext({
|
|
72
|
-
...(
|
|
78
|
+
...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
|
|
79
|
+
...(httpCredentials && { httpCredentials }),
|
|
80
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
73
81
|
});
|
|
74
82
|
}
|
|
75
83
|
|
|
84
|
+
if (authHeader) {
|
|
85
|
+
await addAuthRouteHandler(context, link, authHeader);
|
|
86
|
+
}
|
|
87
|
+
|
|
76
88
|
const page = await context.newPage();
|
|
77
89
|
|
|
78
90
|
for (const path of sitemapPaths) {
|
|
@@ -93,7 +105,7 @@ const crawlIntelligentSitemap = async (
|
|
|
93
105
|
const checkUrlExists = async (page: Page, parsedUrl: string) => {
|
|
94
106
|
try {
|
|
95
107
|
const response = await page.goto(parsedUrl);
|
|
96
|
-
return response
|
|
108
|
+
return response?.ok() ?? false;
|
|
97
109
|
} catch (e) {
|
|
98
110
|
consoleLogger.error(e);
|
|
99
111
|
return false;
|
|
@@ -105,7 +117,7 @@ const crawlIntelligentSitemap = async (
|
|
|
105
117
|
try {
|
|
106
118
|
sitemapUrls = await getSitemapsFromRobotsTxt(url, browser, userDataDirectory, extraHTTPHeaders);
|
|
107
119
|
if (sitemapUrls.length > 0) {
|
|
108
|
-
|
|
120
|
+
consoleLogger.info(`Found ${sitemapUrls.length} sitemap(s) in robots.txt: ${sitemapUrls.join(', ')}`);
|
|
109
121
|
sitemapExist = true;
|
|
110
122
|
}
|
|
111
123
|
} catch (error) {
|
|
@@ -125,7 +137,7 @@ const crawlIntelligentSitemap = async (
|
|
|
125
137
|
}
|
|
126
138
|
|
|
127
139
|
if (!sitemapExist) {
|
|
128
|
-
|
|
140
|
+
consoleLogger.info('Unable to find sitemap. Commencing website crawl instead.');
|
|
129
141
|
return await crawlDomain({
|
|
130
142
|
url,
|
|
131
143
|
randomToken,
|
|
@@ -157,7 +169,7 @@ const crawlIntelligentSitemap = async (
|
|
|
157
169
|
break;
|
|
158
170
|
}
|
|
159
171
|
|
|
160
|
-
|
|
172
|
+
consoleLogger.info(`Processing sitemap: ${currentSitemapUrl}`);
|
|
161
173
|
urlsCrawledFinal = await crawlSitemap({
|
|
162
174
|
sitemapUrl: currentSitemapUrl,
|
|
163
175
|
randomToken,
|
|
@@ -187,7 +199,7 @@ const crawlIntelligentSitemap = async (
|
|
|
187
199
|
const hasDurationRemaining = scanDuration === 0 || remainingScanDuration > 0;
|
|
188
200
|
|
|
189
201
|
if (urlsCrawled.scanned.length < maxRequestsPerCrawl && hasDurationRemaining) {
|
|
190
|
-
|
|
202
|
+
consoleLogger.info(
|
|
191
203
|
`Continuing crawl from root website.${scanDuration > 0 ? ` Remaining scan time: ${remainingScanDuration.toFixed(1)}s` : ''}`,
|
|
192
204
|
);
|
|
193
205
|
urlsCrawledFinal = await crawlDomain({
|
|
@@ -212,7 +224,7 @@ const crawlIntelligentSitemap = async (
|
|
|
212
224
|
scanDuration: remainingScanDuration,
|
|
213
225
|
});
|
|
214
226
|
} else if (!hasDurationRemaining) {
|
|
215
|
-
|
|
227
|
+
consoleLogger.info(
|
|
216
228
|
`Crawl duration exceeded before more pages could be found (limit: ${scanDuration}s).`,
|
|
217
229
|
);
|
|
218
230
|
durationExceeded = true;
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import { consoleLogger } from '../logs.js';
|
|
2
|
+
|
|
3
|
+
export class CrawlRateController {
|
|
4
|
+
private scannedCount = 0;
|
|
5
|
+
private readonly maxPages: number;
|
|
6
|
+
private consecutiveFailures = 0;
|
|
7
|
+
private consecutiveSuccesses = 0;
|
|
8
|
+
private readonly maxConsecutiveFailures: number;
|
|
9
|
+
private readonly originalMaxConcurrency: number;
|
|
10
|
+
private static readonly RECOVERY_INTERVAL = 10;
|
|
11
|
+
|
|
12
|
+
constructor(maxRequestsPerCrawl: number, maxConcurrency: number) {
|
|
13
|
+
this.maxPages = maxRequestsPerCrawl;
|
|
14
|
+
this.maxConsecutiveFailures = Number(process.env.OOBEE_CONSECUTIVE_MAX_RETRIES) || 100;
|
|
15
|
+
this.originalMaxConcurrency = maxConcurrency;
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
claimSlot(): boolean {
|
|
19
|
+
if (this.scannedCount >= this.maxPages) {
|
|
20
|
+
return false;
|
|
21
|
+
}
|
|
22
|
+
this.scannedCount++;
|
|
23
|
+
return true;
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
onSuccess(pool?: { maxConcurrency: number }): void {
|
|
27
|
+
this.consecutiveFailures = 0;
|
|
28
|
+
this.consecutiveSuccesses++;
|
|
29
|
+
|
|
30
|
+
if (pool && this.consecutiveSuccesses % CrawlRateController.RECOVERY_INTERVAL === 0) {
|
|
31
|
+
if (pool.maxConcurrency < this.originalMaxConcurrency) {
|
|
32
|
+
pool.maxConcurrency = Math.min(pool.maxConcurrency + 2, this.originalMaxConcurrency);
|
|
33
|
+
consoleLogger.info(`Recovering concurrency to ${pool.maxConcurrency}`);
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
onFailure(httpStatus: number | undefined, pool?: { maxConcurrency: number }): boolean {
|
|
39
|
+
if (typeof httpStatus !== 'number' || httpStatus < 400) {
|
|
40
|
+
return false;
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
this.consecutiveSuccesses = 0;
|
|
44
|
+
this.consecutiveFailures++;
|
|
45
|
+
|
|
46
|
+
if (pool && pool.maxConcurrency > 1) {
|
|
47
|
+
pool.maxConcurrency = Math.max(1, Math.floor(pool.maxConcurrency / 2));
|
|
48
|
+
consoleLogger.info(
|
|
49
|
+
`Rate limited (HTTP ${httpStatus}) — reducing concurrency to ${pool.maxConcurrency}`,
|
|
50
|
+
);
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
if (this.consecutiveFailures >= this.maxConsecutiveFailures) {
|
|
54
|
+
return true;
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
return false;
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
isLimitReached(): boolean {
|
|
61
|
+
return this.scannedCount >= this.maxPages;
|
|
62
|
+
}
|
|
63
|
+
}
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
import crawlee, { EnqueueStrategy, LaunchContext, Request, RequestList, Dataset } from 'crawlee';
|
|
2
|
+
import { CrawlRateController } from './crawlRateController.js';
|
|
2
3
|
import fs from 'fs';
|
|
3
|
-
import * as path from 'path';
|
|
4
|
-
import fsp from 'fs/promises';
|
|
5
4
|
import {
|
|
6
5
|
createCrawleeSubFolders,
|
|
6
|
+
getPreLaunchHook,
|
|
7
7
|
preNavigationHooks,
|
|
8
8
|
runAxeScript,
|
|
9
9
|
isUrlPdf,
|
|
@@ -30,7 +30,7 @@ import {
|
|
|
30
30
|
mapPdfScanResults,
|
|
31
31
|
doPdfScreenshots,
|
|
32
32
|
} from './pdfScanFunc.js';
|
|
33
|
-
import { guiInfoLog } from '../logs.js';
|
|
33
|
+
import { consoleLogger, guiInfoLog } from '../logs.js';
|
|
34
34
|
import { ViewportSettingsClass } from '../combine.js';
|
|
35
35
|
|
|
36
36
|
const crawlSitemap = async ({
|
|
@@ -81,6 +81,10 @@ const crawlSitemap = async ({
|
|
|
81
81
|
let urlsCrawled: UrlsCrawled;
|
|
82
82
|
let durationExceeded = false;
|
|
83
83
|
let isAbortingScan = false;
|
|
84
|
+
const rateController = new CrawlRateController(
|
|
85
|
+
maxRequestsPerCrawl,
|
|
86
|
+
specifiedMaxConcurrency || constants.maxConcurrency,
|
|
87
|
+
);
|
|
84
88
|
|
|
85
89
|
if (fromCrawlIntelligentSitemap) {
|
|
86
90
|
dataset = datasetFromIntelligent;
|
|
@@ -125,39 +129,20 @@ const crawlSitemap = async ({
|
|
|
125
129
|
launchContext: {
|
|
126
130
|
launcher: constants.launcher,
|
|
127
131
|
launchOptions: getPlaywrightLaunchOptions(browser),
|
|
128
|
-
// Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
|
|
129
|
-
...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
|
|
130
132
|
},
|
|
131
133
|
retryOnBlocked: true,
|
|
132
134
|
browserPoolOptions: {
|
|
133
135
|
useFingerprints: false,
|
|
134
136
|
preLaunchHooks: [
|
|
137
|
+
getPreLaunchHook(userDataDirectory),
|
|
135
138
|
async (_pageId, launchContext) => {
|
|
136
|
-
const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
|
|
137
|
-
|
|
138
|
-
// Ensure base exists
|
|
139
|
-
await fsp.mkdir(baseDir, { recursive: true });
|
|
140
|
-
|
|
141
|
-
// Create a unique subdir per browser
|
|
142
|
-
const subProfileDir = path.join(
|
|
143
|
-
baseDir,
|
|
144
|
-
`profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
|
|
145
|
-
);
|
|
146
|
-
await fsp.mkdir(subProfileDir, { recursive: true });
|
|
147
|
-
|
|
148
|
-
// Assign to Crawlee's launcher
|
|
149
|
-
launchContext.userDataDir = subProfileDir;
|
|
150
|
-
|
|
151
|
-
// Safely extend launchOptions
|
|
152
139
|
launchContext.launchOptions = {
|
|
153
140
|
...launchContext.launchOptions,
|
|
154
141
|
ignoreHTTPSErrors: true,
|
|
155
142
|
...playwrightDeviceDetailsObject,
|
|
143
|
+
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
|
|
156
144
|
...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
|
|
157
145
|
};
|
|
158
|
-
|
|
159
|
-
// Optionally log for debugging
|
|
160
|
-
// console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
|
|
161
146
|
},
|
|
162
147
|
],
|
|
163
148
|
},
|
|
@@ -259,13 +244,11 @@ const crawlSitemap = async ({
|
|
|
259
244
|
const hasExceededDuration =
|
|
260
245
|
scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
|
|
261
246
|
|
|
262
|
-
if (
|
|
247
|
+
if (hasExceededDuration) {
|
|
248
|
+
consoleLogger.info(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
|
|
249
|
+
durationExceeded = true;
|
|
263
250
|
isAbortingScan = true;
|
|
264
|
-
|
|
265
|
-
console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
|
|
266
|
-
durationExceeded = true;
|
|
267
|
-
}
|
|
268
|
-
crawler.autoscaledPool.abort(); // stops new requests
|
|
251
|
+
crawler.autoscaledPool.abort();
|
|
269
252
|
return;
|
|
270
253
|
}
|
|
271
254
|
|
|
@@ -376,26 +359,33 @@ const crawlSitemap = async ({
|
|
|
376
359
|
// Page/context was destroyed during navigation — handled by outer catch
|
|
377
360
|
}
|
|
378
361
|
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
362
|
+
if (rateController.claimSlot()) {
|
|
363
|
+
guiInfoLog(guiInfoStatusTypes.SCANNED, {
|
|
364
|
+
numScanned: urlsCrawled.scanned.length,
|
|
365
|
+
urlScanned: request.url,
|
|
366
|
+
});
|
|
383
367
|
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
368
|
+
urlsCrawled.scanned.push({
|
|
369
|
+
url: request.url,
|
|
370
|
+
pageTitle: results.pageTitle,
|
|
371
|
+
actualUrl, // i.e. actualUrl
|
|
372
|
+
});
|
|
373
|
+
rateController.onSuccess(crawler.autoscaledPool);
|
|
374
|
+
if (rateController.isLimitReached()) {
|
|
375
|
+
isAbortingScan = true;
|
|
376
|
+
crawler.autoscaledPool.abort();
|
|
377
|
+
}
|
|
389
378
|
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
379
|
+
urlsCrawled.scannedRedirects.push({
|
|
380
|
+
fromUrl: request.url,
|
|
381
|
+
toUrl: actualUrl,
|
|
382
|
+
});
|
|
394
383
|
|
|
395
|
-
|
|
396
|
-
|
|
384
|
+
results.url = request.url;
|
|
385
|
+
results.actualUrl = actualUrl;
|
|
397
386
|
|
|
398
|
-
|
|
387
|
+
await dataset.pushData(results);
|
|
388
|
+
}
|
|
399
389
|
} else {
|
|
400
390
|
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
|
401
391
|
numScanned: urlsCrawled.scanned.length,
|
|
@@ -420,20 +410,10 @@ const crawlSitemap = async ({
|
|
|
420
410
|
}
|
|
421
411
|
}
|
|
422
412
|
} catch (e) {
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
});
|
|
428
|
-
|
|
429
|
-
urlsCrawled.error.push({
|
|
430
|
-
url: request.url,
|
|
431
|
-
pageTitle: request.url,
|
|
432
|
-
actualUrl: request.url,
|
|
433
|
-
metadata: STATUS_CODE_METADATA[2],
|
|
434
|
-
httpStatusCode: 0,
|
|
435
|
-
});
|
|
436
|
-
}
|
|
413
|
+
// Do not push to urlsCrawled.error here — Crawlee will retry the request
|
|
414
|
+
// (up to maxRequestRetries, default 3). If all retries are exhausted,
|
|
415
|
+
// failedRequestHandler will record the error. Pushing here causes
|
|
416
|
+
// duplicates and false positives for URLs that succeed on retry.
|
|
437
417
|
}
|
|
438
418
|
},
|
|
439
419
|
failedRequestHandler: async ({ request, response, error }) => {
|
|
@@ -441,12 +421,21 @@ const crawlSitemap = async ({
|
|
|
441
421
|
return;
|
|
442
422
|
}
|
|
443
423
|
|
|
424
|
+
const status = response?.status();
|
|
425
|
+
if (rateController.onFailure(status, crawler.autoscaledPool)) {
|
|
426
|
+
consoleLogger.info(
|
|
427
|
+
`Aborting crawl: consecutive HTTP failures threshold reached (site may be rate-limiting). Successfully scanned ${urlsCrawled.scanned.length} pages.`,
|
|
428
|
+
);
|
|
429
|
+
isAbortingScan = true;
|
|
430
|
+
crawler.autoscaledPool?.abort();
|
|
431
|
+
return;
|
|
432
|
+
}
|
|
433
|
+
|
|
444
434
|
guiInfoLog(guiInfoStatusTypes.ERROR, {
|
|
445
435
|
numScanned: urlsCrawled.scanned.length,
|
|
446
436
|
urlScanned: request.url,
|
|
447
437
|
});
|
|
448
438
|
|
|
449
|
-
const status = response?.status();
|
|
450
439
|
const metadata =
|
|
451
440
|
typeof status === 'number'
|
|
452
441
|
? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
|
|
@@ -463,15 +452,13 @@ const crawlSitemap = async ({
|
|
|
463
452
|
},
|
|
464
453
|
maxRequestsPerCrawl: Infinity,
|
|
465
454
|
maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
},
|
|
474
|
-
}),
|
|
455
|
+
autoscaledPoolOptions: {
|
|
456
|
+
minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
|
|
457
|
+
maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
|
|
458
|
+
desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
|
|
459
|
+
scaleUpStepRatio: 0.99, // Scale up faster
|
|
460
|
+
scaleDownStepRatio: 0.1, // Scale down slower
|
|
461
|
+
},
|
|
475
462
|
}),
|
|
476
463
|
);
|
|
477
464
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
/* eslint-env browser */
|
|
2
|
-
import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
|
|
2
|
+
import { createCrawleeSubFolders, splitAuthHeaders, addAuthRouteHandler } from './commonCrawlerFunc.js';
|
|
3
3
|
import { cleanUpAndExit, register, registerSoftClose } from '../utils.js';
|
|
4
4
|
import constants, {
|
|
5
5
|
getIntermediateScreenshotsPath,
|
|
@@ -60,6 +60,7 @@ const runCustom = async (
|
|
|
60
60
|
blacklistedPatterns: string[] | null,
|
|
61
61
|
includeScreenshots: boolean,
|
|
62
62
|
initialCustomFlowLabel?: string,
|
|
63
|
+
extraHTTPHeaders?: Record<string, string>,
|
|
63
64
|
) => {
|
|
64
65
|
// checks and delete datasets path if it already exists
|
|
65
66
|
process.env.CRAWLEE_STORAGE_DIR = randomToken;
|
|
@@ -109,6 +110,8 @@ const runCustom = async (
|
|
|
109
110
|
...customArgs,
|
|
110
111
|
];
|
|
111
112
|
|
|
113
|
+
const { authHeader, nonAuthHeaders, httpCredentials } = splitAuthHeaders(extraHTTPHeaders);
|
|
114
|
+
|
|
112
115
|
const context = await constants.launcher.launchPersistentContext(userDataDirectory, {
|
|
113
116
|
...baseLaunchOptions,
|
|
114
117
|
args: mergedArgs,
|
|
@@ -118,8 +121,14 @@ const runCustom = async (
|
|
|
118
121
|
viewport: null,
|
|
119
122
|
...(hasCustomViewport ? contextDeviceOptions : {}),
|
|
120
123
|
userAgent: process.env.OOBEE_USER_AGENT || (deviceUserAgent as string | undefined),
|
|
124
|
+
...(nonAuthHeaders && { extraHTTPHeaders: nonAuthHeaders }),
|
|
125
|
+
...(httpCredentials && { httpCredentials }),
|
|
121
126
|
});
|
|
122
127
|
|
|
128
|
+
if (authHeader) {
|
|
129
|
+
await addAuthRouteHandler(context, url, authHeader);
|
|
130
|
+
}
|
|
131
|
+
|
|
123
132
|
register(context);
|
|
124
133
|
|
|
125
134
|
processPageParams.stopAll = async () => {
|