@govtechsg/oobee 0.10.70 → 0.10.74
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- package/DETAILS.md +1 -1
- package/README.md +12 -0
- package/S3_UPLOAD_README.md +172 -0
- package/dev/runGenerateJustHtmlReport.ts +25 -0
- package/package.json +7 -4
- package/src/combine.ts +72 -15
- package/src/constants/common.ts +91 -93
- package/src/constants/constants.ts +536 -59
- package/src/crawlers/crawlDomain.ts +313 -305
- package/src/crawlers/crawlIntelligentSitemap.ts +24 -18
- package/src/crawlers/crawlLocalFile.ts +29 -27
- package/src/crawlers/crawlSitemap.ts +265 -254
- package/src/crawlers/custom/utils.ts +809 -119
- package/src/crawlers/runCustom.ts +29 -4
- package/src/generateHtmlReport.ts +224 -0
- package/src/mergeAxeResults.ts +133 -46
- package/src/runGenerateJustHtmlReport.ts +20 -0
- package/src/services/s3Uploader.ts +184 -0
- package/src/static/ejs/partials/components/allIssues/AllIssues.ejs +9 -0
- package/src/static/ejs/partials/components/allIssues/CategoryBadges.ejs +82 -0
- package/src/static/ejs/partials/components/allIssues/FilterBar.ejs +33 -0
- package/src/static/ejs/partials/components/allIssues/IssuesTable.ejs +41 -0
- package/src/static/ejs/partials/components/header/SiteInfo.ejs +119 -0
- package/src/static/ejs/partials/components/header/aboutScanModal/AboutScanModal.ejs +15 -0
- package/src/static/ejs/partials/components/header/aboutScanModal/ScanConfiguration.ejs +44 -0
- package/src/static/ejs/partials/components/header/aboutScanModal/ScanDetails.ejs +142 -0
- package/src/static/ejs/partials/components/prioritiseIssues/IssueDetailCard.ejs +36 -0
- package/src/static/ejs/partials/components/prioritiseIssues/PrioritiseIssues.ejs +47 -0
- package/src/static/ejs/partials/components/ruleModal/ruleOffcanvas.ejs +196 -0
- package/src/static/ejs/partials/components/scannedPagesSegmentedTabs.ejs +48 -0
- package/src/static/ejs/partials/components/shared/InfoAlert.ejs +3 -0
- package/src/static/ejs/partials/components/{topFive.ejs → topTen.ejs} +2 -2
- package/src/static/ejs/partials/components/wcagCompliance/FailedCriteria.ejs +47 -0
- package/src/static/ejs/partials/components/wcagCompliance/WcagCompliance.ejs +16 -0
- package/src/static/ejs/partials/components/wcagCompliance/WcagGaugeBar.ejs +16 -0
- package/src/static/ejs/partials/components/wcagCoverageDetails.ejs +18 -0
- package/src/static/ejs/partials/footer.ejs +1 -1
- package/src/static/ejs/partials/header.ejs +7 -223
- package/src/static/ejs/partials/main.ejs +12 -23
- package/src/static/ejs/partials/scripts/allIssues/AllIssues.ejs +376 -0
- package/src/static/ejs/partials/scripts/categorySummary.ejs +1 -1
- package/src/static/ejs/partials/scripts/header/SiteInfo.ejs +44 -0
- package/src/static/ejs/partials/scripts/header/aboutScanModal/AboutScanModal.ejs +51 -0
- package/src/static/ejs/partials/scripts/header/aboutScanModal/ScanConfiguration.ejs +127 -0
- package/src/static/ejs/partials/scripts/header/aboutScanModal/ScanDetails.ejs +60 -0
- package/src/static/ejs/partials/scripts/prioritiseIssues/IssueDetailCard.ejs +137 -0
- package/src/static/ejs/partials/scripts/prioritiseIssues/PrioritiseIssues.ejs +214 -0
- package/src/static/ejs/partials/scripts/prioritiseIssues/wcagSvgMap.ejs +861 -0
- package/src/static/ejs/partials/scripts/ruleModal/constants.ejs +957 -0
- package/src/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +353 -0
- package/src/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +468 -0
- package/src/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +306 -0
- package/src/static/ejs/partials/scripts/ruleModal/utilities.ejs +483 -0
- package/src/static/ejs/partials/scripts/scannedPagesSegmentedTabs.ejs +35 -0
- package/src/static/ejs/partials/scripts/screenshotLightbox.ejs +61 -57
- package/src/static/ejs/partials/scripts/topTen.ejs +61 -0
- package/src/static/ejs/partials/scripts/utils.ejs +15 -0
- package/src/static/ejs/partials/scripts/wcagCompliance/FailedCriteria.ejs +103 -0
- package/src/static/ejs/partials/scripts/wcagCompliance/WcagGaugeBar.ejs +47 -0
- package/src/static/ejs/partials/scripts/wcagCompliance.ejs +15 -0
- package/src/static/ejs/partials/scripts/wcagCoverageDetails.ejs +75 -0
- package/src/static/ejs/partials/styles/allIssues/AllIssues.ejs +384 -0
- package/src/static/ejs/partials/styles/bootstrap.ejs +17 -1
- package/src/static/ejs/partials/styles/header/SiteInfo.ejs +121 -0
- package/src/static/ejs/partials/styles/header/aboutScanModal/AboutScanModal.ejs +82 -0
- package/src/static/ejs/partials/styles/header/aboutScanModal/ScanConfiguration.ejs +50 -0
- package/src/static/ejs/partials/styles/header/aboutScanModal/ScanDetails.ejs +149 -0
- package/src/static/ejs/partials/styles/header.ejs +7 -0
- package/src/static/ejs/partials/styles/prioritiseIssues/IssueDetailCard.ejs +141 -0
- package/src/static/ejs/partials/styles/prioritiseIssues/PrioritiseIssues.ejs +204 -0
- package/src/static/ejs/partials/styles/ruleModal/ruleOffcanvas.ejs +456 -0
- package/src/static/ejs/partials/styles/scannedPagesSegmentedTabs.ejs +46 -0
- package/src/static/ejs/partials/styles/shared/InfoAlert.ejs +12 -0
- package/src/static/ejs/partials/styles/styles.ejs +198 -470
- package/src/static/ejs/partials/styles/topTenCard.ejs +44 -0
- package/src/static/ejs/partials/styles/wcagCompliance/FailedCriteria.ejs +59 -0
- package/src/static/ejs/partials/styles/wcagCompliance/WcagGaugeBar.ejs +62 -0
- package/src/static/ejs/partials/styles/wcagCompliance.ejs +36 -0
- package/src/static/ejs/partials/styles/wcagCoverageDetails.ejs +33 -0
- package/src/static/ejs/report.ejs +42 -259
- package/src/static/ejs/summary.ejs +1 -1
- package/src/utils.ts +30 -0
- package/src/static/ejs/partials/components/categorySelector.ejs +0 -4
- package/src/static/ejs/partials/components/categorySelectorDropdown.ejs +0 -57
- package/src/static/ejs/partials/components/pagesScannedModal.ejs +0 -70
- package/src/static/ejs/partials/components/reportSearch.ejs +0 -47
- package/src/static/ejs/partials/components/ruleOffcanvas.ejs +0 -105
- package/src/static/ejs/partials/components/scanAbout.ejs +0 -328
- package/src/static/ejs/partials/components/wcagCompliance.ejs +0 -52
- package/src/static/ejs/partials/scripts/categorySelectorDropdownScript.ejs +0 -190
- package/src/static/ejs/partials/scripts/reportSearch.ejs +0 -287
- package/src/static/ejs/partials/scripts/ruleOffcanvas.ejs +0 -804
- package/src/static/ejs/partials/scripts/scanAboutScript.ejs +0 -38
package/src/constants/common.ts
CHANGED
|
@@ -57,7 +57,7 @@ export const validateDirPath = (dirPath: string): string => {
|
|
|
57
57
|
}
|
|
58
58
|
};
|
|
59
59
|
|
|
60
|
-
|
|
60
|
+
export class RES {
|
|
61
61
|
status: number;
|
|
62
62
|
httpStatus?: number;
|
|
63
63
|
url: string;
|
|
@@ -289,16 +289,15 @@ export const sanitizeUrlInput = (url: string): { isValid: boolean; url: string }
|
|
|
289
289
|
const isAllowedContentType = (ct: string): boolean => {
|
|
290
290
|
const c = (ct || '').toLowerCase();
|
|
291
291
|
return (
|
|
292
|
-
c.startsWith('text/html') ||
|
|
293
|
-
c.startsWith('application/xhtml+xml') ||
|
|
294
|
-
c.startsWith('text/plain') ||
|
|
295
|
-
c.startsWith('application/xml') ||
|
|
296
|
-
c.startsWith('text/xml')
|
|
297
|
-
c.startsWith('application/pdf')
|
|
292
|
+
c.startsWith('text/html') || // html
|
|
293
|
+
c.startsWith('application/xhtml+xml') || // xhtml
|
|
294
|
+
c.startsWith('text/plain') || // txt
|
|
295
|
+
c.startsWith('application/xml') || // xml
|
|
296
|
+
c.startsWith('text/xml') || // xml (alt)
|
|
297
|
+
c.startsWith('application/pdf') // pdf
|
|
298
298
|
);
|
|
299
299
|
};
|
|
300
300
|
|
|
301
|
-
|
|
302
301
|
const checkUrlConnectivityWithBrowser = async (
|
|
303
302
|
url: string,
|
|
304
303
|
browserToRun: string,
|
|
@@ -314,9 +313,8 @@ const checkUrlConnectivityWithBrowser = async (
|
|
|
314
313
|
return res;
|
|
315
314
|
}
|
|
316
315
|
|
|
317
|
-
|
|
316
|
+
// STEP 1: For local file scans
|
|
318
317
|
let contentType = '';
|
|
319
|
-
|
|
320
318
|
const protocol = new URL(url).protocol;
|
|
321
319
|
|
|
322
320
|
if (protocol !== 'http:' && protocol !== 'https:') {
|
|
@@ -353,7 +351,7 @@ const checkUrlConnectivityWithBrowser = async (
|
|
|
353
351
|
}
|
|
354
352
|
|
|
355
353
|
// Ensure Accept header for non-html content fallback
|
|
356
|
-
extraHTTPHeaders
|
|
354
|
+
extraHTTPHeaders.Accept ||= 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
|
|
357
355
|
|
|
358
356
|
await initModifiedUserAgent(browserToRun, playwrightDeviceDetailsObject, clonedDataDir);
|
|
359
357
|
|
|
@@ -386,45 +384,40 @@ const checkUrlConnectivityWithBrowser = async (
|
|
|
386
384
|
consoleLogger.info(`Unable to set download deny: ${(e as Error).message}`);
|
|
387
385
|
}
|
|
388
386
|
|
|
387
|
+
// OPTIMIZATION: Block heavy visual resources (Images/Fonts/CSS)
|
|
388
|
+
// This allows the "Connectivity Check" to pass as soon as HTML is ready
|
|
389
|
+
await page.route('**/*', (route) => {
|
|
390
|
+
const type = route.request().resourceType();
|
|
391
|
+
if (['image', 'media', 'font', 'stylesheet'].includes(type)) {
|
|
392
|
+
return route.abort();
|
|
393
|
+
}
|
|
394
|
+
return route.continue();
|
|
395
|
+
});
|
|
396
|
+
|
|
389
397
|
// STEP 2: Navigate (follows server-side redirects)
|
|
390
|
-
page.once('download', () => {
|
|
398
|
+
page.once('download', () => {
|
|
391
399
|
res.status = constants.urlCheckStatuses.notASupportedDocument.code;
|
|
392
400
|
return res;
|
|
393
401
|
});
|
|
394
402
|
|
|
403
|
+
// OPTIMIZATION: Wait for 'domcontentloaded' only
|
|
395
404
|
const response = await page.goto(url, {
|
|
396
405
|
timeout: 15000,
|
|
397
406
|
waitUntil: 'domcontentloaded', // enough to get status + allow potential client redirects to kick in
|
|
398
407
|
});
|
|
399
408
|
|
|
400
|
-
|
|
401
|
-
try {
|
|
402
|
-
await page.waitForLoadState('networkidle', { timeout: 8000 });
|
|
403
|
-
} catch {
|
|
404
|
-
consoleLogger.info('networkidle not reached; proceeding with verification GET');
|
|
405
|
-
}
|
|
409
|
+
if (!response) throw new Error('No response from navigation');
|
|
406
410
|
|
|
407
|
-
//
|
|
411
|
+
// We use the response headers from the navigation we just performed.
|
|
408
412
|
const finalUrl = page.url();
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
verifyResp = await page.request.fetch(finalUrl, {
|
|
412
|
-
method: 'GET',
|
|
413
|
-
headers: extraHTTPHeaders,
|
|
414
|
-
});
|
|
415
|
-
} catch (e) {
|
|
416
|
-
consoleLogger.info(`Verification GET failed, falling back to navigation response: ${e.message}`);
|
|
417
|
-
}
|
|
418
|
-
|
|
419
|
-
// Prefer verification GET; fall back to nav response
|
|
420
|
-
const finalStatus = verifyResp?.status?.() ?? response?.status?.() ?? 0;
|
|
421
|
-
const headers = (verifyResp?.headers?.() ?? response?.headers?.()) || {};
|
|
413
|
+
const finalStatus = response.status();
|
|
414
|
+
const headers = response.headers();
|
|
422
415
|
contentType = headers['content-type'] || '';
|
|
423
416
|
|
|
424
417
|
if (!isAllowedContentType(contentType)) {
|
|
425
418
|
res.status = constants.urlCheckStatuses.notASupportedDocument.code;
|
|
426
419
|
return res;
|
|
427
|
-
}
|
|
420
|
+
}
|
|
428
421
|
|
|
429
422
|
res.httpStatus = finalStatus;
|
|
430
423
|
res.url = finalUrl;
|
|
@@ -437,7 +430,9 @@ const checkUrlConnectivityWithBrowser = async (
|
|
|
437
430
|
// Some origins 405/501 but the browser-rendered page is still reachable after client redirects.
|
|
438
431
|
// As a last resort, consider DOM presence as success if we actually have a document.
|
|
439
432
|
const hasDOM = await page.evaluate(() => !!document && !!document.documentElement);
|
|
440
|
-
res.status = hasDOM
|
|
433
|
+
res.status = hasDOM
|
|
434
|
+
? constants.urlCheckStatuses.success.code
|
|
435
|
+
: constants.urlCheckStatuses.systemError.code;
|
|
441
436
|
} else {
|
|
442
437
|
res.status = constants.urlCheckStatuses.systemError.code;
|
|
443
438
|
}
|
|
@@ -448,11 +443,11 @@ const checkUrlConnectivityWithBrowser = async (
|
|
|
448
443
|
} else {
|
|
449
444
|
try {
|
|
450
445
|
// Try to get a stable DOM; don't fail the check if it times out
|
|
446
|
+
// Note: Since we used 'domcontentloaded' in goto, this is fast, but kept for safety/stability
|
|
451
447
|
await page.waitForLoadState('domcontentloaded', { timeout: 5000 });
|
|
452
448
|
} catch {}
|
|
453
449
|
res.content = await page.content();
|
|
454
450
|
}
|
|
455
|
-
|
|
456
451
|
} catch (error) {
|
|
457
452
|
if (error.message.includes('net::ERR_INVALID_AUTH_CREDENTIALS')) {
|
|
458
453
|
res.status = constants.urlCheckStatuses.unauthorised.code;
|
|
@@ -514,32 +509,30 @@ export const checkUrl = async (
|
|
|
514
509
|
clonedDataDir: string,
|
|
515
510
|
playwrightDeviceDetailsObject: DeviceDescriptor,
|
|
516
511
|
extraHTTPHeaders: Record<string, string>,
|
|
517
|
-
fileTypes: FileTypes
|
|
512
|
+
fileTypes: FileTypes,
|
|
518
513
|
) => {
|
|
519
|
-
|
|
520
514
|
const res = await checkUrlConnectivityWithBrowser(
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
515
|
+
url,
|
|
516
|
+
browser,
|
|
517
|
+
clonedDataDir,
|
|
518
|
+
playwrightDeviceDetailsObject,
|
|
519
|
+
extraHTTPHeaders,
|
|
526
520
|
);
|
|
527
521
|
|
|
528
522
|
// If response is 200 (meaning no other code was set earlier)
|
|
529
523
|
if (res.status === constants.urlCheckStatuses.success.code) {
|
|
530
|
-
|
|
531
524
|
// Check if document is pdf type
|
|
532
525
|
const isPdf = isPdfContent(res.content);
|
|
533
526
|
|
|
534
527
|
// Check if only HTML document is allowed to be scanned
|
|
535
528
|
if (fileTypes === FileTypes.HtmlOnly && isPdf) {
|
|
536
529
|
res.status = constants.urlCheckStatuses.notASupportedDocument.code;
|
|
537
|
-
|
|
538
|
-
|
|
530
|
+
|
|
531
|
+
// Check if only PDF document is allowed to be scanned
|
|
539
532
|
} else if (fileTypes === FileTypes.PdfOnly && !isPdf) {
|
|
540
533
|
res.status = constants.urlCheckStatuses.notAPdf.code;
|
|
541
534
|
|
|
542
|
-
|
|
535
|
+
// Check if sitemap is expected
|
|
543
536
|
} else if (scanner === ScannerTypes.SITEMAP) {
|
|
544
537
|
const isSitemap = isSitemapContent(res.content);
|
|
545
538
|
|
|
@@ -550,7 +543,7 @@ export const checkUrl = async (
|
|
|
550
543
|
|
|
551
544
|
// else proceed as normal
|
|
552
545
|
}
|
|
553
|
-
|
|
546
|
+
|
|
554
547
|
return res;
|
|
555
548
|
};
|
|
556
549
|
|
|
@@ -606,7 +599,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
|
606
599
|
zip,
|
|
607
600
|
ruleset,
|
|
608
601
|
generateJsonFiles,
|
|
609
|
-
scanDuration
|
|
602
|
+
scanDuration,
|
|
610
603
|
} = argv;
|
|
611
604
|
|
|
612
605
|
const extraHTTPHeaders = parseHeaders(header);
|
|
@@ -632,8 +625,8 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
|
632
625
|
password = temp.password;
|
|
633
626
|
|
|
634
627
|
if (username !== '' || password !== '') {
|
|
635
|
-
extraHTTPHeaders
|
|
636
|
-
}
|
|
628
|
+
extraHTTPHeaders.Authorization = `Basic ${Buffer.from(`${username}:${password}`).toString('base64')}`;
|
|
629
|
+
}
|
|
637
630
|
|
|
638
631
|
temp.username = '';
|
|
639
632
|
temp.password = '';
|
|
@@ -653,17 +646,22 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
|
653
646
|
if (exportDirectory) {
|
|
654
647
|
constants.exportDirectory = path.join(exportDirectory, resultFilename);
|
|
655
648
|
}
|
|
656
|
-
|
|
649
|
+
|
|
657
650
|
// Creating the playwrightDeviceDetailObject
|
|
658
|
-
deviceChosen =
|
|
659
|
-
|
|
651
|
+
deviceChosen =
|
|
652
|
+
customDevice === 'Desktop' || customDevice === 'Mobile' ? customDevice : deviceChosen;
|
|
653
|
+
|
|
660
654
|
const playwrightDeviceDetailsObject = getPlaywrightDeviceDetailsObject(
|
|
661
655
|
deviceChosen,
|
|
662
656
|
customDevice,
|
|
663
657
|
viewportWidth,
|
|
664
658
|
);
|
|
665
659
|
|
|
666
|
-
const { browserToRun: resolvedBrowser, clonedBrowserDataDir } = getBrowserToRun(
|
|
660
|
+
const { browserToRun: resolvedBrowser, clonedBrowserDataDir } = getBrowserToRun(
|
|
661
|
+
resultFilename,
|
|
662
|
+
browserToRun,
|
|
663
|
+
true,
|
|
664
|
+
);
|
|
667
665
|
browserToRun = resolvedBrowser;
|
|
668
666
|
|
|
669
667
|
const resolvedUserDataDirectory = getClonedProfilesWithRandomToken(browserToRun, resultFilename);
|
|
@@ -678,7 +676,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
|
678
676
|
|
|
679
677
|
return {
|
|
680
678
|
type: scanner,
|
|
681
|
-
url
|
|
679
|
+
url,
|
|
682
680
|
entryUrl: url,
|
|
683
681
|
isHeadless: headless,
|
|
684
682
|
deviceChosen,
|
|
@@ -699,7 +697,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
|
699
697
|
includeScreenshots: !(additional === 'none'),
|
|
700
698
|
metadata,
|
|
701
699
|
followRobots,
|
|
702
|
-
extraHTTPHeaders
|
|
700
|
+
extraHTTPHeaders,
|
|
703
701
|
safeMode,
|
|
704
702
|
userDataDirectory: resolvedUserDataDirectory,
|
|
705
703
|
zip,
|
|
@@ -709,7 +707,12 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
|
709
707
|
};
|
|
710
708
|
};
|
|
711
709
|
|
|
712
|
-
export const getUrlsFromRobotsTxt = async (
|
|
710
|
+
export const getUrlsFromRobotsTxt = async (
|
|
711
|
+
url: string,
|
|
712
|
+
browserToRun: string,
|
|
713
|
+
userDataDirectory: string,
|
|
714
|
+
extraHTTPHeaders: Record<string, string>,
|
|
715
|
+
): Promise<void> => {
|
|
713
716
|
if (!constants.robotsTxtUrls) return;
|
|
714
717
|
|
|
715
718
|
const domain = new URL(url).origin;
|
|
@@ -718,7 +721,12 @@ export const getUrlsFromRobotsTxt = async (url: string, browserToRun: string, us
|
|
|
718
721
|
|
|
719
722
|
let robotsTxt: string;
|
|
720
723
|
try {
|
|
721
|
-
robotsTxt = await getRobotsTxtViaPlaywright(
|
|
724
|
+
robotsTxt = await getRobotsTxtViaPlaywright(
|
|
725
|
+
robotsUrl,
|
|
726
|
+
browserToRun,
|
|
727
|
+
userDataDirectory,
|
|
728
|
+
extraHTTPHeaders,
|
|
729
|
+
);
|
|
722
730
|
consoleLogger.info(`Fetched robots.txt from ${robotsUrl}`);
|
|
723
731
|
} catch (e) {
|
|
724
732
|
// if robots.txt is not found, do nothing
|
|
@@ -729,7 +737,7 @@ export const getUrlsFromRobotsTxt = async (url: string, browserToRun: string, us
|
|
|
729
737
|
constants.robotsTxtUrls[domain] = {};
|
|
730
738
|
return;
|
|
731
739
|
}
|
|
732
|
-
|
|
740
|
+
|
|
733
741
|
const lines = robotsTxt.split(/\r?\n/);
|
|
734
742
|
let shouldCapture = false;
|
|
735
743
|
const disallowedUrls = [];
|
|
@@ -777,9 +785,13 @@ export const getUrlsFromRobotsTxt = async (url: string, browserToRun: string, us
|
|
|
777
785
|
constants.robotsTxtUrls[domain] = { disallowedUrls, allowedUrls };
|
|
778
786
|
};
|
|
779
787
|
|
|
780
|
-
const getRobotsTxtViaPlaywright = async (
|
|
781
|
-
|
|
782
|
-
|
|
788
|
+
const getRobotsTxtViaPlaywright = async (
|
|
789
|
+
robotsUrl: string,
|
|
790
|
+
browser: string,
|
|
791
|
+
userDataDirectory: string,
|
|
792
|
+
extraHTTPHeaders: Record<string, string>,
|
|
793
|
+
): Promise<string> => {
|
|
794
|
+
const robotsDataDir = '';
|
|
783
795
|
// Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
|
|
784
796
|
if (process.env.CRAWLEE_HEADLESS === '1') {
|
|
785
797
|
// Create robots own user data directory else SingletonLock: File exists (17) with crawlDomain or crawlSitemap's own browser
|
|
@@ -932,7 +944,7 @@ export const getLinksFromSitemap = async (
|
|
|
932
944
|
const fetchUrls = async (url: string, extraHTTPHeaders: Record<string, string>) => {
|
|
933
945
|
let data;
|
|
934
946
|
let sitemapType;
|
|
935
|
-
|
|
947
|
+
|
|
936
948
|
if (scannedSitemaps.has(url)) {
|
|
937
949
|
// Skip processing if the sitemap has already been scanned
|
|
938
950
|
return;
|
|
@@ -948,7 +960,6 @@ export const getLinksFromSitemap = async (
|
|
|
948
960
|
if (!fs.existsSync(url)) {
|
|
949
961
|
return;
|
|
950
962
|
}
|
|
951
|
-
|
|
952
963
|
} else if (isValidHttpUrl(url)) {
|
|
953
964
|
// Do nothing, url is valid
|
|
954
965
|
} else {
|
|
@@ -973,7 +984,7 @@ export const getLinksFromSitemap = async (
|
|
|
973
984
|
|
|
974
985
|
await page.goto(url, { waitUntil: 'networkidle', timeout: 60000 });
|
|
975
986
|
|
|
976
|
-
if (await page.locator('body').count() > 0) {
|
|
987
|
+
if ((await page.locator('body').count()) > 0) {
|
|
977
988
|
data = await page.locator('body').innerText();
|
|
978
989
|
} else {
|
|
979
990
|
const urlSet = page.locator('urlset');
|
|
@@ -1003,7 +1014,6 @@ export const getLinksFromSitemap = async (
|
|
|
1003
1014
|
}
|
|
1004
1015
|
|
|
1005
1016
|
await getDataUsingPlaywright();
|
|
1006
|
-
|
|
1007
1017
|
} else {
|
|
1008
1018
|
url = convertLocalFileToPath(url);
|
|
1009
1019
|
data = fs.readFileSync(url, 'utf8');
|
|
@@ -1122,7 +1132,6 @@ export const getBrowserToRun = (
|
|
|
1122
1132
|
preferredBrowser?: BrowserTypes,
|
|
1123
1133
|
isCli = false,
|
|
1124
1134
|
): { browserToRun: BrowserTypes; clonedBrowserDataDir: string } => {
|
|
1125
|
-
|
|
1126
1135
|
const platform = os.platform();
|
|
1127
1136
|
|
|
1128
1137
|
// Prioritise Chrome on Windows and Mac platforms if user does not specify a browser
|
|
@@ -1248,14 +1257,14 @@ const cloneChromeProfileCookieFiles = (options: GlobOptionsWithFileTypesFalse, d
|
|
|
1248
1257
|
if (os.platform() === 'win32') {
|
|
1249
1258
|
profileCookiesDir = globSync('**/Network/Cookies', {
|
|
1250
1259
|
...options,
|
|
1251
|
-
ignore: ['oobee
|
|
1260
|
+
ignore: ['oobee*/**'],
|
|
1252
1261
|
});
|
|
1253
1262
|
profileNamesRegex = /User Data\\(.*?)\\Network/;
|
|
1254
1263
|
} else if (os.platform() === 'darwin') {
|
|
1255
1264
|
// maxDepth 2 to avoid copying cookies from the oobee directory if it exists
|
|
1256
1265
|
profileCookiesDir = globSync('**/Cookies', {
|
|
1257
1266
|
...options,
|
|
1258
|
-
ignore: 'oobee
|
|
1267
|
+
ignore: 'oobee*/**',
|
|
1259
1268
|
});
|
|
1260
1269
|
profileNamesRegex = /Chrome\/(.*?)\/Cookies/;
|
|
1261
1270
|
}
|
|
@@ -1443,7 +1452,7 @@ export const cloneChromeProfiles = (randomToken: string): string => {
|
|
|
1443
1452
|
destDir = path.join(baseDir, `oobee-${randomToken}`);
|
|
1444
1453
|
|
|
1445
1454
|
if (fs.existsSync(destDir)) {
|
|
1446
|
-
// Don't delete since it will be handled at the end of the scan
|
|
1455
|
+
// Don't delete since it will be handled at the end of the scan
|
|
1447
1456
|
// deleteClonedChromeProfiles(randomToken);
|
|
1448
1457
|
// Assume it cloned and don't re-clone
|
|
1449
1458
|
} else {
|
|
@@ -1463,7 +1472,6 @@ export const cloneChromeProfiles = (randomToken: string): string => {
|
|
|
1463
1472
|
}
|
|
1464
1473
|
|
|
1465
1474
|
consoleLogger.error('Failed to clone Chrome profiles. You may be logged out of your accounts.');
|
|
1466
|
-
|
|
1467
1475
|
}
|
|
1468
1476
|
// For future reference, return a null instead to halt the scan
|
|
1469
1477
|
return destDir;
|
|
@@ -1481,8 +1489,7 @@ export const cloneChromiumProfiles = (randomToken: string): string => {
|
|
|
1481
1489
|
destDir = path.join(baseDir, `oobee-${randomToken}`);
|
|
1482
1490
|
|
|
1483
1491
|
if (fs.existsSync(destDir)) {
|
|
1484
|
-
|
|
1485
|
-
// Don't delete since it will be handled at the end of the scan
|
|
1492
|
+
// Don't delete since it will be handled at the end of the scan
|
|
1486
1493
|
// deleteClonedChromiumProfiles(randomToken);
|
|
1487
1494
|
// Assume it cloned and don't re-clone
|
|
1488
1495
|
} else {
|
|
@@ -1512,11 +1519,9 @@ export const cloneEdgeProfiles = (randomToken: string): string => {
|
|
|
1512
1519
|
destDir = path.join(baseDir, `oobee-${randomToken}`);
|
|
1513
1520
|
|
|
1514
1521
|
if (fs.existsSync(destDir)) {
|
|
1515
|
-
|
|
1516
|
-
// Don't delete since it will be handled at the end of the scan
|
|
1522
|
+
// Don't delete since it will be handled at the end of the scan
|
|
1517
1523
|
// deleteClonedEdgeProfiles(randomToken);
|
|
1518
1524
|
// Assume it cloned and don't re-clone
|
|
1519
|
-
|
|
1520
1525
|
} else {
|
|
1521
1526
|
if (!fs.existsSync(destDir)) {
|
|
1522
1527
|
fs.mkdirSync(destDir, { recursive: true });
|
|
@@ -1535,7 +1540,6 @@ export const cloneEdgeProfiles = (randomToken: string): string => {
|
|
|
1535
1540
|
}
|
|
1536
1541
|
|
|
1537
1542
|
consoleLogger.error('Failed to clone Edge profiles. You may be logged out of your accounts.');
|
|
1538
|
-
|
|
1539
1543
|
}
|
|
1540
1544
|
|
|
1541
1545
|
// For future reference, return a null instead to halt the scan
|
|
@@ -1597,7 +1601,6 @@ export const deleteClonedChromeProfiles = (randomToken?: string): void => {
|
|
|
1597
1601
|
* @returns null
|
|
1598
1602
|
*/
|
|
1599
1603
|
export const deleteClonedEdgeProfiles = (randomToken?: string): void => {
|
|
1600
|
-
|
|
1601
1604
|
const baseDir = getDefaultEdgeDataDir();
|
|
1602
1605
|
|
|
1603
1606
|
if (!baseDir) {
|
|
@@ -1714,12 +1717,9 @@ export const submitFormViaPlaywright = async (
|
|
|
1714
1717
|
userDataDirectory: string,
|
|
1715
1718
|
finalUrl: string,
|
|
1716
1719
|
) => {
|
|
1717
|
-
const browserContext = await constants.launcher.launchPersistentContext(
|
|
1718
|
-
|
|
1719
|
-
|
|
1720
|
-
...getPlaywrightLaunchOptions(browserToRun),
|
|
1721
|
-
},
|
|
1722
|
-
);
|
|
1720
|
+
const browserContext = await constants.launcher.launchPersistentContext(userDataDirectory, {
|
|
1721
|
+
...getPlaywrightLaunchOptions(browserToRun),
|
|
1722
|
+
});
|
|
1723
1723
|
|
|
1724
1724
|
register(browserContext);
|
|
1725
1725
|
|
|
@@ -1778,7 +1778,6 @@ export const submitForm = async (
|
|
|
1778
1778
|
finalUrl += `&${formDataFields.redirectUrlField}=${scannedUrl}`;
|
|
1779
1779
|
}
|
|
1780
1780
|
|
|
1781
|
-
|
|
1782
1781
|
try {
|
|
1783
1782
|
await axios.get(finalUrl, { timeout: 2000 });
|
|
1784
1783
|
} catch (error) {
|
|
@@ -1788,7 +1787,6 @@ export const submitForm = async (
|
|
|
1788
1787
|
}
|
|
1789
1788
|
}
|
|
1790
1789
|
}
|
|
1791
|
-
|
|
1792
1790
|
};
|
|
1793
1791
|
// Legacy code end - Google Sheets submission
|
|
1794
1792
|
|
|
@@ -1797,7 +1795,6 @@ export async function initModifiedUserAgent(
|
|
|
1797
1795
|
playwrightDeviceDetailsObject?: object,
|
|
1798
1796
|
userDataDirectory?: string,
|
|
1799
1797
|
) {
|
|
1800
|
-
|
|
1801
1798
|
const isHeadless = process.env.CRAWLEE_HEADLESS === '1';
|
|
1802
1799
|
|
|
1803
1800
|
// If headless mode is enabled, ensure the headless flag is set.
|
|
@@ -1814,13 +1811,14 @@ export async function initModifiedUserAgent(
|
|
|
1814
1811
|
};
|
|
1815
1812
|
|
|
1816
1813
|
// Launch a temporary persistent context with an empty userDataDir to mimic your production browser setup.
|
|
1817
|
-
const effectiveUserDataDirectory = process.env.CRAWLEE_HEADLESS === '1'
|
|
1818
|
-
? userDataDirectory
|
|
1819
|
-
: '';
|
|
1814
|
+
const effectiveUserDataDirectory = process.env.CRAWLEE_HEADLESS === '1' ? userDataDirectory : '';
|
|
1820
1815
|
|
|
1821
|
-
const browserContext = await constants.launcher.launchPersistentContext(
|
|
1816
|
+
const browserContext = await constants.launcher.launchPersistentContext(
|
|
1817
|
+
effectiveUserDataDirectory,
|
|
1818
|
+
launchOptions,
|
|
1819
|
+
);
|
|
1822
1820
|
register(browserContext);
|
|
1823
|
-
|
|
1821
|
+
|
|
1824
1822
|
const page = await browserContext.newPage();
|
|
1825
1823
|
|
|
1826
1824
|
// Retrieve the default user agent.
|
|
@@ -1962,7 +1960,7 @@ export const waitForPageLoaded = async (page: Page, timeout = 10000) => {
|
|
|
1962
1960
|
// Only observe if root is a Node
|
|
1963
1961
|
observer.observe(root, {
|
|
1964
1962
|
childList: true,
|
|
1965
|
-
subtree:
|
|
1963
|
+
subtree: true,
|
|
1966
1964
|
attributes: true,
|
|
1967
1965
|
});
|
|
1968
1966
|
});
|