@govtechsg/oobee 0.10.51 → 0.10.58
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/bump-package-version.yml +58 -0
- package/.github/workflows/image.yml +38 -17
- package/DETAILS.md +5 -2
- package/INTEGRATION.md +57 -53
- package/README.md +4 -1
- package/__tests__/test-sitemap-url-patterns.xml +105 -0
- package/exclusions.txt +1 -0
- package/package.json +7 -6
- package/src/cli.ts +35 -2
- package/src/combine.ts +10 -7
- package/src/constants/cliFunctions.ts +9 -0
- package/src/constants/common.ts +95 -105
- package/src/constants/constants.ts +47 -2
- package/src/crawlers/commonCrawlerFunc.ts +84 -5
- package/src/crawlers/crawlDomain.ts +93 -160
- package/src/crawlers/crawlIntelligentSitemap.ts +40 -36
- package/src/crawlers/crawlLocalFile.ts +77 -35
- package/src/crawlers/crawlSitemap.ts +156 -89
- package/src/crawlers/pdfScanFunc.ts +2 -0
- package/src/index.ts +2 -0
- package/src/logs.ts +4 -2
- package/src/mergeAxeResults.ts +20 -9
- package/src/npmIndex.ts +1 -1
- package/src/screenshotFunc/htmlScreenshotFunc.ts +7 -5
- package/src/screenshotFunc/pdfScreenshotFunc.ts +2 -2
- package/src/static/ejs/partials/components/wcagCompliance.ejs +1 -1
- package/src/static/ejs/partials/scripts/ruleOffcanvas.ejs +1 -0
- package/src/static/ejs/partials/styles/styles.ejs +11 -0
- package/src/static/ejs/report.ejs +14 -1
- package/src/utils.ts +3 -3
package/src/crawlers/crawlDomain.ts

@@ -2,15 +2,14 @@ import crawlee, { EnqueueStrategy } from 'crawlee';
 import fs from 'fs';
 import type { BrowserContext, ElementHandle, Frame, Page } from 'playwright';
 import type { EnqueueLinksOptions, RequestOptions } from 'crawlee';
-import axios from 'axios';
-import { fileTypeFromBuffer } from 'file-type';
-import mime from 'mime-types';
 import https from 'https';
 import type { BatchAddRequestsResult } from '@crawlee/types';
 import {
   createCrawleeSubFolders,
   runAxeScript,
   isUrlPdf,
+  shouldSkipClickDueToDisallowedHref,
+  shouldSkipDueToUnsupportedContent,
 } from './commonCrawlerFunc.js';
 import constants, {
   UrlsCrawled,
@@ -19,6 +18,8 @@ import constants, {
   cssQuerySelectors,
   RuleFlags,
   STATUS_CODE_METADATA,
+  disallowedListOfPatterns,
+  disallowedSelectorPatterns,
 } from '../constants/constants.js';
 import {
   getPlaywrightLaunchOptions,
@@ -37,7 +38,7 @@ import {
   mapPdfScanResults,
   doPdfScreenshots,
 } from './pdfScanFunc.js';
-import {
+import { consoleLogger, guiInfoLog, silentLogger } from '../logs.js';
 import { ViewportSettingsClass } from '../combine.js';

 const isBlacklisted = (url: string, blacklistedPatterns: string[]) => {
@@ -71,6 +72,7 @@ const crawlDomain = async ({
   includeScreenshots,
   followRobots,
   extraHTTPHeaders,
+  scanDuration = 0,
   safeMode = false,
   fromCrawlIntelligentSitemap = false,
   datasetFromIntelligent = null,
@@ -91,12 +93,14 @@ const crawlDomain = async ({
   includeScreenshots: boolean;
   followRobots: boolean;
   extraHTTPHeaders: Record<string, string>;
+  scanDuration?: number;
   safeMode?: boolean;
   fromCrawlIntelligentSitemap?: boolean;
   datasetFromIntelligent?: crawlee.Dataset;
   urlsCrawledFromIntelligent?: UrlsCrawled;
   ruleset?: RuleFlags[];
 }) => {
+  const crawlStartTime = Date.now();
   let dataset: crawlee.Dataset;
   let urlsCrawled: UrlsCrawled;
   let requestQueue: crawlee.RequestQueue;
@@ -162,95 +166,6 @@ const crawlDomain = async ({
     });
   }

-  const httpHeadCache = new Map<string, boolean>();
-  const isProcessibleUrl = async (url: string): Promise<boolean> => {
-    if (httpHeadCache.has(url)) {
-      silentLogger.info(`Skipping request as URL has been processed before ${url}}`);
-      return false; // return false to avoid processing the same url again
-    }
-
-    try {
-      // Send a HEAD request to check headers without downloading the file
-      const headResponse = await axios.head(url, {
-        headers: { Authorization: authHeader },
-        httpsAgent,
-      });
-      const contentType = headResponse.headers['content-type'] || '';
-      const contentDisposition = headResponse.headers['content-disposition'] || '';
-
-      // Check if the response suggests it's a downloadable file based on Content-Disposition header
-      if (contentDisposition.includes('attachment')) {
-        silentLogger.info(`Skipping URL due to attachment header: ${url}`);
-        httpHeadCache.set(url, false);
-        return false;
-      }
-
-      // Check if the MIME type suggests it's a downloadable file
-      if (contentType.startsWith('application/') || contentType.includes('octet-stream')) {
-        silentLogger.info(`Skipping potential downloadable file: ${contentType} at URL ${url}`);
-        httpHeadCache.set(url, false);
-        return false;
-      }
-
-      // Use the mime-types library to ensure it's processible content (e.g., HTML or plain text)
-      const mimeType = mime.lookup(contentType);
-      if (mimeType && !mimeType.startsWith('text/html') && !mimeType.startsWith('text/')) {
-        silentLogger.info(`Detected non-processible MIME type: ${mimeType} at URL ${url}`);
-        httpHeadCache.set(url, false);
-        return false;
-      }
-
-      // Additional check for zip files by their magic number (PK\x03\x04)
-      if (url.endsWith('.zip')) {
-        silentLogger.info(`Checking for zip file magic number at URL ${url}`);
-
-        // Download the first few bytes of the file to check for the magic number
-        const byteResponse = await axios.get(url, {
-          headers: { Range: 'bytes=0-3', Authorization: authHeader },
-          responseType: 'arraybuffer',
-          httpsAgent,
-        });
-
-        const magicNumber = byteResponse.data.toString('hex');
-        if (magicNumber === '504b0304') {
-          silentLogger.info(`Skipping zip file at URL ${url}`);
-          httpHeadCache.set(url, false);
-          return false;
-        }
-        silentLogger.info(
-          `Not skipping ${url}, magic number does not match ZIP file: ${magicNumber}`,
-        );
-      }
-
-      // If you want more robust checks, you can download a portion of the content and use the file-type package to detect file types by content
-      const response = await axios.get(url, {
-        headers: { Range: 'bytes=0-4100', Authorization: authHeader },
-        responseType: 'arraybuffer',
-        httpsAgent,
-      });
-
-      const fileType = await fileTypeFromBuffer(response.data);
-      if (
-        fileType &&
-        !fileType.mime.startsWith('text/html') &&
-        !fileType.mime.startsWith('text/')
-      ) {
-        silentLogger.info(`Detected downloadable file of type ${fileType.mime} at URL ${url}`);
-        httpHeadCache.set(url, false);
-        return false;
-      }
-    } catch (e) {
-      // silentLogger.error(`Error checking the MIME type of ${url}: ${e.message}`);
-      // If an error occurs (e.g., a network issue), assume the URL is processible
-      httpHeadCache.set(url, true);
-      return true;
-    }
-
-    // If none of the conditions to skip are met, allow processing of the URL
-    httpHeadCache.set(url, true);
-    return true;
-  };
-
   const enqueueProcess = async (
     page: Page,
     enqueueLinks: (options: EnqueueLinksOptions) => Promise<BatchAddRequestsResult>,
@@ -259,14 +174,14 @@ const crawlDomain = async ({
     try {
       await enqueueLinks({
         // set selector matches anchor elements with href but not contains # or starting with mailto:
-        selector:
+        selector: `a:not(${disallowedSelectorPatterns})`,
         strategy,
         requestQueue,
         transformRequestFunction: (req: RequestOptions): RequestOptions | null => {
           try {
             req.url = req.url.replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
           } catch (e) {
-
+            consoleLogger.error(e);
           }
           if (urlsCrawled.scanned.some(item => item.url === req.url)) {
             req.skipNavigation = true;
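
The new selector delegates the old inline "no `#`-only or `mailto:` anchors" rule to a shared `disallowedSelectorPatterns` constant from constants.ts. A minimal sketch of how such a combined `:not()` selector behaves; the pattern list below is illustrative only, not the actual constant:

```ts
// Illustrative stand-in: the real disallowedSelectorPatterns lives in
// src/constants/constants.ts and may contain different patterns.
const disallowedSelectorPatterns = [
  '[href*="#"]',       // in-page fragment links
  '[href^="mailto:"]', // email links
  '[href^="tel:"]',    // phone links
].join(',');

// `a:not(list)` keeps only anchors that match none of the disallowed patterns.
const selector = `a:not(${disallowedSelectorPatterns})`;

console.log(selector);
// a:not([href*="#"],[href^="mailto:"],[href^="tel:"])
```
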
@@ -288,7 +203,7 @@ const crawlDomain = async ({
       try {
         await customEnqueueLinksByClickingElements(page, browserContext);
       } catch (e) {
-
+        // do nothing;
       }
     }
   } catch {
@@ -307,7 +222,10 @@ const crawlDomain = async ({
     const isAlreadyScanned: boolean = urlsCrawled.scanned.some(item => item.url === newPageUrl);
     const isBlacklistedUrl: boolean = isBlacklisted(newPageUrl, blacklistedPatterns);
     const isNotFollowStrategy: boolean = !isFollowStrategy(newPageUrl, initialPageUrl, strategy);
-
+    const isNotSupportedDocument: boolean = disallowedListOfPatterns.some(pattern =>
+      newPageUrl.toLowerCase().startsWith(pattern),
+    );
+    return isNotSupportedDocument || isAlreadyScanned || isBlacklistedUrl || isNotFollowStrategy;
   };
   const setPageListeners = (page: Page): void => {
     // event listener to handle new page popups upon button click
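
The reworked exclusion check now also rejects URLs whose prefix marks them as unsupported documents, using a case-insensitive prefix match against `disallowedListOfPatterns`. A self-contained sketch of that check; the pattern list here is a stand-in for the real constant:

```ts
// Illustrative only: the real disallowedListOfPatterns is defined in constants.ts.
const disallowedListOfPatterns = ['mailto:', 'tel:', 'javascript:'];

const isNotSupportedDocument = (url: string): boolean =>
  disallowedListOfPatterns.some(pattern => url.toLowerCase().startsWith(pattern));

console.log(isNotSupportedDocument('MAILTO:someone@example.com')); // true (case-insensitive)
console.log(isNotSupportedDocument('https://example.com/page'));   // false
```
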
@@ -431,6 +349,16 @@ const crawlDomain = async ({
           });
         } else if (!newUrlFoundInElement) {
           try {
+            const shouldSkip = await shouldSkipClickDueToDisallowedHref(page, element);
+            if (shouldSkip) {
+              const elementHtml = await page.evaluate(el => el.outerHTML, element);
+              consoleLogger.info(
+                'Skipping a click due to disallowed href nearby. Element HTML:',
+                elementHtml,
+              );
+              continue;
+            }
+
             // Find url in html elements by manually clicking them. New page navigation/popups will be handled by event listeners above
             await element.click({ force: true });
             await page.waitForTimeout(1000); // Add a delay of 1 second between each Element click
@@ -455,7 +383,7 @@ const crawlDomain = async ({
   }

   await initModifiedUserAgent(browser, playwrightDeviceDetailsObject);
-
+
   const crawler = new crawlee.PlaywrightCrawler({
     launchContext: {
       launcher: constants.launcher,
@@ -486,36 +414,35 @@ const crawlDomain = async ({
           return new Promise(resolve => {
             let timeout;
             let mutationCount = 0;
-            const MAX_MUTATIONS
-            const OBSERVER_TIMEOUT
-
+            const MAX_MUTATIONS = 250; // stop if things never quiet down
+            const OBSERVER_TIMEOUT = 5000; // hard cap on total wait
+
             const observer = new MutationObserver(() => {
               clearTimeout(timeout);
-
+
               mutationCount++;
               if (mutationCount > MAX_MUTATIONS) {
                 observer.disconnect();
                 resolve('Too many mutations, exiting.');
                 return;
               }
-
+
               // restart quiet‑period timer
               timeout = setTimeout(() => {
                 observer.disconnect();
                 resolve('DOM stabilized.');
               }, 1000);
             });
-
+
             // overall timeout in case the page never settles
             timeout = setTimeout(() => {
               observer.disconnect();
               resolve('Observer timeout reached.');
             }, OBSERVER_TIMEOUT);
-
+
             const root = document.documentElement || document.body || document;
             if (!root || typeof observer.observe !== 'function') {
               resolve('No root node to observe.');
-              return;
             }
           });
         });
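
The DOM-stability wait now caps the observed mutations at 250 and the total wait at 5 seconds, with a 1-second quiet period between mutations. A hedged sketch of how the same quiet-period logic can be driven from Playwright via `page.evaluate`; the `observer.observe(...)` options are an assumption here, since that call sits outside the changed lines:

```ts
import { chromium, Page } from 'playwright';

// Sketch: mirrors the thresholds added in this diff (250 mutations, 5 s cap, 1 s quiet period).
const waitForDomStability = (page: Page) =>
  page.evaluate(
    () =>
      new Promise<string>(resolve => {
        let timeout: ReturnType<typeof setTimeout>;
        let mutationCount = 0;
        const MAX_MUTATIONS = 250;
        const OBSERVER_TIMEOUT = 5000;

        const observer = new MutationObserver(() => {
          clearTimeout(timeout);
          mutationCount += 1;
          if (mutationCount > MAX_MUTATIONS) {
            observer.disconnect();
            resolve('Too many mutations, exiting.');
            return;
          }
          // restart the 1 s quiet-period timer on every mutation
          timeout = setTimeout(() => {
            observer.disconnect();
            resolve('DOM stabilized.');
          }, 1000);
        });

        // hard cap so the promise always settles
        timeout = setTimeout(() => {
          observer.disconnect();
          resolve('Observer timeout reached.');
        }, OBSERVER_TIMEOUT);

        const root = document.documentElement || document.body;
        if (!root) {
          resolve('No root node to observe.');
          return;
        }
        // Assumed observe options; the actual call is not part of the changed lines.
        observer.observe(root, { childList: true, subtree: true, attributes: true });
      }),
  );

const browser = await chromium.launch();
const page = await browser.newPage();
await page.goto('https://example.com');
console.log(await waitForDomStability(page));
await browser.close();
```
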
@@ -537,33 +464,18 @@ const crawlDomain = async ({
         }
       },
     ],
-    preNavigationHooks:
-
-
-
-
-
-
-
-
-
-
-
-          },
-        ]
-      : [
-          async ({ page, request }) => {
-            await page.setExtraHTTPHeaders({
-              ...extraHTTPHeaders,
-            });
-
-            const processible = await isProcessibleUrl(request.url);
-            if (!processible) {
-              request.skipNavigation = true;
-              return null;
-            }
-          },
-        ],
+    preNavigationHooks: [ async({ page, request}) => {
+      if (isBasicAuth) {
+        await page.setExtraHTTPHeaders({
+          Authorization: authHeader,
+          ...extraHTTPHeaders,
+        });
+      } else {
+        await page.setExtraHTTPHeaders({
+          ...extraHTTPHeaders,
+        });
+      }
+    }],
     requestHandlerTimeoutSecs: 90, // Allow each page to be processed by up from default 60 seconds
     requestHandler: async ({ page, request, response, crawler, sendRequest, enqueueLinks }) => {
       const browserContext: BrowserContext = page.context();
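
The conditional two-branch hook array (and its per-request `isProcessibleUrl` HEAD check) collapses into a single pre-navigation hook that always applies `extraHTTPHeaders` and adds `Authorization` only for basic-auth scans. A minimal standalone Crawlee sketch of that hook shape, with placeholder header values; the spread-based merge is just a compact equivalent of the if/else in the diff:

```ts
import { PlaywrightCrawler } from 'crawlee';

// Placeholder values; in oobee these come from the scan options.
const isBasicAuth = true;
const authHeader = 'Basic dXNlcjpwYXNz';
const extraHTTPHeaders = { 'X-Scanner': 'oobee' };

const crawler = new PlaywrightCrawler({
  preNavigationHooks: [
    async ({ page }) => {
      // Merge the auth header in only when the target needs basic auth.
      await page.setExtraHTTPHeaders({
        ...(isBasicAuth ? { Authorization: authHeader } : {}),
        ...extraHTTPHeaders,
      });
    },
  ],
  requestHandler: async ({ page, request }) => {
    console.log(`${request.url} -> ${await page.title()}`);
  },
});

await crawler.run(['https://example.com']);
```
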
@@ -586,7 +498,10 @@ const crawlDomain = async ({
         actualUrl = page.url();
       }

-      if (
+      if (
+        !isFollowStrategy(url, actualUrl, strategy) &&
+        (isBlacklisted(actualUrl, blacklistedPatterns) || (isUrlPdf(actualUrl) && !isScanPdfs))
+      ) {
         guiInfoLog(guiInfoStatusTypes.SKIPPED, {
           numScanned: urlsCrawled.scanned.length,
           urlScanned: actualUrl,
@@ -594,7 +509,13 @@ const crawlDomain = async ({
         return;
       }

-
+      const hasExceededDuration =
+        scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
+
+      if (urlsCrawled.scanned.length >= maxRequestsPerCrawl || hasExceededDuration) {
+        if (hasExceededDuration) {
+          console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting website crawl.`);
+        }
         isAbortingScanNow = true;
         crawler.autoscaledPool.abort();
         return;
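
`scanDuration` is enforced inside the request handler by comparing wall-clock elapsed time against the limit; once it trips, the autoscaled pool is aborted alongside the existing `maxRequestsPerCrawl` cap. The arithmetic in isolation:

```ts
// scanDuration is expressed in seconds; 0 means "no limit".
const hasExceededScanDuration = (crawlStartTime: number, scanDuration: number): boolean =>
  scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;

const startedAt = Date.now() - 95_000; // pretend the crawl began 95 s ago
console.log(hasExceededScanDuration(startedAt, 90)); // true  (95 s elapsed > 90 s limit)
console.log(hasExceededScanDuration(startedAt, 0));  // false (no limit configured)
```
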
@@ -612,7 +533,7 @@ const crawlDomain = async ({
       }

       // handle pdfs
-      if (request.skipNavigation && actualUrl ===
+      if (shouldSkipDueToUnsupportedContent(response, request.url) || (request.skipNavigation && actualUrl === 'about:blank')) {
         if (!isScanPdfs) {
           guiInfoLog(guiInfoStatusTypes.SKIPPED, {
             numScanned: urlsCrawled.scanned.length,
@@ -648,7 +569,7 @@ const crawlDomain = async ({
           urlsCrawled.userExcluded.push({
             url: request.url,
             pageTitle: request.url,
-            actualUrl
+            actualUrl, // because about:blank is not useful
             metadata: STATUS_CODE_METADATA[1],
             httpStatusCode: 0,
           });
@@ -656,15 +577,19 @@ const crawlDomain = async ({
         return;
       }

-      if (
+      if (
+        !isFollowStrategy(url, actualUrl, strategy) &&
+        blacklistedPatterns &&
+        isSkippedUrl(actualUrl, blacklistedPatterns)
+      ) {
         urlsCrawled.userExcluded.push({
           url: request.url,
           pageTitle: request.url,
-          actualUrl
+          actualUrl,
           metadata: STATUS_CODE_METADATA[0],
           httpStatusCode: 0,
         });
-
+
         guiInfoLog(guiInfoStatusTypes.SKIPPED, {
           numScanned: urlsCrawled.scanned.length,
           urlScanned: request.url,
@@ -679,11 +604,7 @@ const crawlDomain = async ({
       const isRedirected = !areLinksEqual(actualUrl, request.url);

       // check if redirected link is following strategy (same-domain/same-hostname)
-      const isLoadedUrlFollowStrategy = isFollowStrategy(
-        actualUrl,
-        request.url,
-        strategy,
-      );
+      const isLoadedUrlFollowStrategy = isFollowStrategy(actualUrl, request.url, strategy);
       if (isRedirected && !isLoadedUrlFollowStrategy) {
         urlsCrawled.notScannedRedirects.push({
           fromUrl: request.url,
@@ -693,7 +614,7 @@ const crawlDomain = async ({
       }

       const responseStatus = response?.status();
-
+      if (responseStatus && responseStatus >= 300) {
         guiInfoLog(guiInfoStatusTypes.SKIPPED, {
           numScanned: urlsCrawled.scanned.length,
           urlScanned: request.url,
@@ -706,7 +627,7 @@ const crawlDomain = async ({
           httpStatusCode: responseStatus,
         });
         return;
-
+      }

       const results = await runAxeScript({ includeScreenshots, page, randomToken, ruleset });

@@ -733,7 +654,7 @@ const crawlDomain = async ({
           urlsCrawled.scanned.push({
             url: urlWithoutAuth(request.url),
             pageTitle: results.pageTitle,
-            actualUrl
+            actualUrl, // i.e. actualUrl
           });

           urlsCrawled.scannedRedirects.push({
@@ -768,11 +689,10 @@ const crawlDomain = async ({
         urlsCrawled.userExcluded.push({
           url: request.url,
           pageTitle: request.url,
-          actualUrl
+          actualUrl, // because about:blank is not useful
           metadata: STATUS_CODE_METADATA[1],
           httpStatusCode: 0,
         });
-
       }

       if (followRobots) await getUrlsFromRobotsTxt(request.url, browser);
@@ -780,7 +700,7 @@ const crawlDomain = async ({
       } catch (e) {
         try {
           if (!e.message.includes('page.evaluate')) {
-
+            // do nothing;
             guiInfoLog(guiInfoStatusTypes.ERROR, {
               numScanned: urlsCrawled.scanned.length,
               urlScanned: request.url,
@@ -815,11 +735,11 @@ const crawlDomain = async ({
               urlScanned: request.url,
             });

-            urlsCrawled.error.push({
-              url: request.url,
-              pageTitle: request.url,
-              actualUrl: request.url,
-              metadata: STATUS_CODE_METADATA[2]
+            urlsCrawled.error.push({
+              url: request.url,
+              pageTitle: request.url,
+              actualUrl: request.url,
+              metadata: STATUS_CODE_METADATA[2],
             });
           }
         }
@@ -831,9 +751,10 @@ const crawlDomain = async ({
       });

       const status = response?.status();
-      const metadata =
-
-
+      const metadata =
+        typeof status === 'number'
+          ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
+          : STATUS_CODE_METADATA[2];

       urlsCrawled.error.push({
         url: request.url,
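
The failed-request handler now resolves metadata from the numeric status code, falling back to the generic 599 entry for unmapped codes and to entry 2 when there was no response at all. A sketch of that lookup; the metadata strings below are placeholders rather than the real `STATUS_CODE_METADATA` table:

```ts
// Placeholder table: the real STATUS_CODE_METADATA lives in constants.ts.
const STATUS_CODE_METADATA: Record<number, string> = {
  2: 'Scan error',
  403: 'Forbidden',
  404: 'Not Found',
  599: 'Unknown network error',
};

const metadataForStatus = (status: number | undefined): string =>
  typeof status === 'number'
    ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
    : STATUS_CODE_METADATA[2];

console.log(metadataForStatus(404));       // Not Found
console.log(metadataForStatus(418));       // Unknown network error (599 fallback)
console.log(metadataForStatus(undefined)); // Scan error (no response at all)
```
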
@@ -842,10 +763,18 @@ const crawlDomain = async ({
         metadata,
         httpStatusCode: typeof status === 'number' ? status : 0,
       });
-
     },
     maxRequestsPerCrawl: Infinity,
     maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
+    ...(process.env.OOBEE_FAST_CRAWLER && {
+      autoscaledPoolOptions: {
+        minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
+        maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
+        desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
+        scaleUpStepRatio: 0.99, // Scale up faster
+        scaleDownStepRatio: 0.1, // Scale down slower
+      },
+    }),
   });

   await crawler.run();
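
The `OOBEE_FAST_CRAWLER` environment variable gates a more aggressive autoscaling profile through a conditional object spread: when the variable is unset, `undefined && { ... }` evaluates to `undefined`, and spreading that into the options object adds nothing. A small sketch of the pattern with the same `autoscaledPoolOptions` knobs:

```ts
// Conditional spread: with the env var unset, the spread expression is undefined
// and contributes no properties; with it set, autoscaledPoolOptions is included.
const baseOptions = { maxConcurrency: 25 };

const crawlerOptions = {
  ...baseOptions,
  ...(process.env.OOBEE_FAST_CRAWLER && {
    autoscaledPoolOptions: {
      minConcurrency: 10,
      desiredConcurrencyRatio: 0.98, // scale up once 98% of desired concurrency is busy
      scaleUpStepRatio: 0.99,        // scale up in large steps
      scaleDownStepRatio: 0.1,       // scale down slowly
    },
  }),
};

console.log(Object.keys(crawlerOptions));
// ['maxConcurrency'], plus 'autoscaledPoolOptions' when OOBEE_FAST_CRAWLER is set
```
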
@@ -875,6 +804,10 @@ const crawlDomain = async ({
     guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
   }

+  if (scanDuration > 0) {
+    const elapsed = Math.round((Date.now() - crawlStartTime) / 1000);
+    console.log(`Crawl ended after ${elapsed}s. Limit: ${scanDuration}s.`);
+  }
   return urlsCrawled;
 };

package/src/crawlers/crawlIntelligentSitemap.ts

@@ -2,7 +2,7 @@ import fs from 'fs';
 import { chromium, Page } from 'playwright';
 import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
 import constants, { guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
-import {
+import { consoleLogger, guiInfoLog } from '../logs.js';
 import crawlDomain from './crawlDomain.js';
 import crawlSitemap from './crawlSitemap.js';
 import { EnqueueStrategy } from 'crawlee';
@@ -24,46 +24,42 @@ const crawlIntelligentSitemap = async (
   followRobots: boolean,
   extraHTTPHeaders: Record<string, string>,
   safeMode: boolean,
+  scanDuration: number
 ) => {
+  const startTime = Date.now(); // Track start time
+
   let urlsCrawledFinal;
-  let urlsCrawled;
+  let urlsCrawled = { ...constants.urlsCrawledObj };
   let dataset;
   let sitemapExist = false;
   const fromCrawlIntelligentSitemap = true;
   let sitemapUrl;

-  urlsCrawled = { ...constants.urlsCrawledObj };
   ({ dataset } = await createCrawleeSubFolders(randomToken));
-
   if (!fs.existsSync(randomToken)) {
     fs.mkdirSync(randomToken);
   }

   function getHomeUrl(parsedUrl: string) {
     const urlObject = new URL(parsedUrl);
-    if (urlObject.username
+    if (urlObject.username && urlObject.password) {
       return `${urlObject.protocol}//${urlObject.username}:${urlObject.password}@${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
     }
-
     return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
   }

   async function findSitemap(link: string) {
     const homeUrl = getHomeUrl(link);
-    let sitemapLinkFound = false;
     let sitemapLink = '';
-    const chromiumBrowser = await chromium.launch(
-
-
-
-
-    });
-
+    const chromiumBrowser = await chromium.launch({
+      headless: false,
+      channel: 'chrome',
+      args: ['--headless=new', '--no-sandbox'],
+    });
     const page = await chromiumBrowser.newPage();
     for (const path of sitemapPaths) {
       sitemapLink = homeUrl + path;
-
-      if (sitemapLinkFound) {
+      if (await checkUrlExists(page, sitemapLink)) {
         sitemapExist = true;
         break;
       }
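
`findSitemap` now launches the branded Chrome channel with Playwright's `headless: false` plus the `--headless=new` switch, i.e. Chrome's new headless mode rather than Playwright's default headless. A sketch of that launch pattern; the fallback to the bundled Chromium is an assumption added here for machines without Chrome installed:

```ts
import { chromium } from 'playwright';

// Launch Chrome's "new" headless mode: Playwright's own headless flag is turned off
// and headless is requested explicitly through a Chromium switch instead.
const launchNewHeadlessChrome = async () => {
  try {
    return await chromium.launch({
      headless: false,
      channel: 'chrome', // use the installed branded Chrome
      args: ['--headless=new', '--no-sandbox'],
    });
  } catch {
    // Assumption for this sketch: fall back to bundled Chromium if Chrome is missing.
    return chromium.launch({ headless: true });
  }
};

const browser = await launchNewHeadlessChrome();
const page = await browser.newPage();
await page.goto('https://example.com');
console.log(await page.title());
await browser.close();
```
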
@@ -75,12 +71,9 @@ const crawlIntelligentSitemap = async (
   const checkUrlExists = async (page: Page, parsedUrl: string) => {
     try {
       const response = await page.goto(parsedUrl);
-
-      return true;
-      }
-      return false;
+      return response.ok();
     } catch (e) {
-
+      consoleLogger.error(e);
       return false;
     }
   };
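
`checkUrlExists` now returns `response.ok()` rather than unconditionally returning true after `page.goto`, so a 404 or 500 at a candidate sitemap path no longer counts as a hit. A standalone sketch of probing candidate paths this way; the path list is illustrative, the real one is `sitemapPaths` from constants.ts:

```ts
import { chromium, Page } from 'playwright';

// Illustrative candidates; the real list is `sitemapPaths` in constants.ts.
const sitemapPaths = ['/sitemap.xml', '/sitemap_index.xml', '/sitemap/sitemap.xml'];

const checkUrlExists = async (page: Page, url: string): Promise<boolean> => {
  try {
    const response = await page.goto(url);
    // ok() is true only for 2xx responses, so an error page no longer counts as "found".
    return response !== null && response.ok();
  } catch (e) {
    console.error(e);
    return false;
  }
};

const browser = await chromium.launch();
const page = await browser.newPage();
for (const path of sitemapPaths) {
  const candidate = `https://example.com${path}`;
  if (await checkUrlExists(page, candidate)) {
    console.log(`Sitemap found at ${candidate}`);
    break;
  }
}
await browser.close();
```
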
@@ -88,13 +81,12 @@ const crawlIntelligentSitemap = async (
   try {
     sitemapUrl = await findSitemap(url);
   } catch (error) {
-
+    consoleLogger.error(error);
   }

   if (!sitemapExist) {
     console.log('Unable to find sitemap. Commencing website crawl instead.');
-
-    urlsCrawledFinal = await crawlDomain({
+    return await crawlDomain({
       url,
       randomToken,
       host,
@@ -109,12 +101,13 @@ const crawlIntelligentSitemap = async (
       includeScreenshots,
       followRobots,
       extraHTTPHeaders,
+      safeMode,
+      scanDuration, // Use full duration since no sitemap
     });
-    return urlsCrawledFinal;
   }
+
   console.log(`Sitemap found at ${sitemapUrl}`);
-
-  urlsCrawledFinal = await crawlSitemap(
+  urlsCrawledFinal = await crawlSitemap({
     sitemapUrl,
     randomToken,
     host,
@@ -128,14 +121,21 @@ const crawlIntelligentSitemap = async (
     includeScreenshots,
     extraHTTPHeaders,
     fromCrawlIntelligentSitemap,
-    url,
-    dataset,
-    urlsCrawled,
-    false,
-
+    userUrlInputFromIntelligent: url,
+    datasetFromIntelligent: dataset,
+    urlsCrawledFromIntelligent: urlsCrawled,
+    crawledFromLocalFile: false,
+    scanDuration,
+  });
+
+  const elapsed = Date.now() - startTime;
+  const remainingScanDuration = Math.max(scanDuration - elapsed / 1000, 0); // in seconds

-  if (
-
+  if (
+    urlsCrawledFinal.scanned.length < maxRequestsPerCrawl &&
+    remainingScanDuration > 0
+  ) {
+    console.log(`Continuing crawl from root website. Remaining scan time: ${remainingScanDuration.toFixed(1)}s`);
     urlsCrawledFinal = await crawlDomain({
       url,
       randomToken,
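
After the sitemap pass, the remaining scan budget is computed from the elapsed time, and the follow-up domain crawl only runs while both the page cap and the time budget still have headroom. The budgeting arithmetic in isolation:

```ts
// scanDuration and the return value are in seconds.
const remainingBudget = (startTime: number, scanDuration: number): number => {
  const elapsedSeconds = (Date.now() - startTime) / 1000;
  return Math.max(scanDuration - elapsedSeconds, 0);
};

const startTime = Date.now() - 110_000; // the sitemap crawl took about 110 s
const remaining = remainingBudget(startTime, 300);
console.log(remaining.toFixed(1)); // ~190.0 s left for the follow-up domain crawl

if (remaining > 0) {
  // continue with crawlDomain({ ..., scanDuration: remaining })
} else {
  console.log('Crawl duration exceeded before more pages could be found.');
}
```
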
@@ -153,12 +153,16 @@ const crawlIntelligentSitemap = async (
       extraHTTPHeaders,
       safeMode,
       fromCrawlIntelligentSitemap,
-      datasetFromIntelligent: dataset,
-      urlsCrawledFromIntelligent: urlsCrawledFinal,
+      datasetFromIntelligent: dataset,
+      urlsCrawledFromIntelligent: urlsCrawledFinal,
+      scanDuration: remainingScanDuration,
     });
+  } else if (remainingScanDuration <= 0) {
+    console.log(`Crawl duration exceeded before more pages could be found (limit: ${scanDuration}s).`);
   }

   guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
   return urlsCrawledFinal;
 };
+

 export default crawlIntelligentSitemap;