@govtechsg/oobee 0.10.76 → 0.10.78-alpha1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. package/.github/workflows/publish.yml +8 -1
  2. package/INTEGRATION.md +50 -3
  3. package/dist/cli.js +252 -0
  4. package/dist/combine.js +221 -0
  5. package/dist/constants/cliFunctions.js +306 -0
  6. package/dist/constants/common.js +1669 -0
  7. package/dist/constants/constants.js +913 -0
  8. package/dist/constants/errorMeta.json +319 -0
  9. package/dist/constants/itemTypeDescription.js +7 -0
  10. package/dist/constants/oobeeAi.js +121 -0
  11. package/dist/constants/questions.js +151 -0
  12. package/dist/constants/sampleData.js +176 -0
  13. package/dist/crawlers/commonCrawlerFunc.js +428 -0
  14. package/dist/crawlers/crawlDomain.js +613 -0
  15. package/dist/crawlers/crawlIntelligentSitemap.js +135 -0
  16. package/dist/crawlers/crawlLocalFile.js +151 -0
  17. package/dist/crawlers/crawlSitemap.js +303 -0
  18. package/dist/crawlers/custom/escapeCssSelector.js +10 -0
  19. package/dist/crawlers/custom/evaluateAltText.js +11 -0
  20. package/dist/crawlers/custom/extractAndGradeText.js +44 -0
  21. package/dist/crawlers/custom/extractText.js +27 -0
  22. package/dist/crawlers/custom/findElementByCssSelector.js +36 -0
  23. package/dist/crawlers/custom/flagUnlabelledClickableElements.js +963 -0
  24. package/dist/crawlers/custom/framesCheck.js +37 -0
  25. package/dist/crawlers/custom/getAxeConfiguration.js +111 -0
  26. package/dist/crawlers/custom/gradeReadability.js +23 -0
  27. package/dist/crawlers/custom/utils.js +1024 -0
  28. package/dist/crawlers/custom/xPathToCss.js +147 -0
  29. package/dist/crawlers/guards/urlGuard.js +71 -0
  30. package/dist/crawlers/pdfScanFunc.js +276 -0
  31. package/dist/crawlers/runCustom.js +89 -0
  32. package/dist/exclusions.txt +7 -0
  33. package/dist/generateHtmlReport.js +144 -0
  34. package/dist/index.js +62 -0
  35. package/dist/logs.js +84 -0
  36. package/dist/mergeAxeResults.js +1588 -0
  37. package/dist/npmIndex.js +640 -0
  38. package/dist/proxyService.js +360 -0
  39. package/dist/runGenerateJustHtmlReport.js +16 -0
  40. package/dist/screenshotFunc/htmlScreenshotFunc.js +355 -0
  41. package/dist/screenshotFunc/pdfScreenshotFunc.js +645 -0
  42. package/dist/services/s3Uploader.js +127 -0
  43. package/dist/static/ejs/partials/components/allIssues/AllIssues.ejs +9 -0
  44. package/dist/static/ejs/partials/components/allIssues/CategoryBadges.ejs +82 -0
  45. package/dist/static/ejs/partials/components/allIssues/FilterBar.ejs +33 -0
  46. package/dist/static/ejs/partials/components/allIssues/IssuesTable.ejs +41 -0
  47. package/dist/static/ejs/partials/components/header/SiteInfo.ejs +119 -0
  48. package/dist/static/ejs/partials/components/header/aboutScanModal/AboutScanModal.ejs +15 -0
  49. package/dist/static/ejs/partials/components/header/aboutScanModal/ScanConfiguration.ejs +44 -0
  50. package/dist/static/ejs/partials/components/header/aboutScanModal/ScanDetails.ejs +142 -0
  51. package/dist/static/ejs/partials/components/prioritiseIssues/IssueDetailCard.ejs +36 -0
  52. package/dist/static/ejs/partials/components/prioritiseIssues/PrioritiseIssues.ejs +47 -0
  53. package/dist/static/ejs/partials/components/ruleModal/ruleOffcanvas.ejs +196 -0
  54. package/dist/static/ejs/partials/components/scannedPagesSegmentedTabs.ejs +48 -0
  55. package/dist/static/ejs/partials/components/screenshotLightbox.ejs +13 -0
  56. package/dist/static/ejs/partials/components/shared/InfoAlert.ejs +3 -0
  57. package/dist/static/ejs/partials/components/summaryScanAbout.ejs +141 -0
  58. package/dist/static/ejs/partials/components/summaryScanResults.ejs +16 -0
  59. package/dist/static/ejs/partials/components/summaryTable.ejs +20 -0
  60. package/dist/static/ejs/partials/components/summaryWcagCompliance.ejs +94 -0
  61. package/dist/static/ejs/partials/components/topTen.ejs +6 -0
  62. package/dist/static/ejs/partials/components/wcagCompliance/FailedCriteria.ejs +47 -0
  63. package/dist/static/ejs/partials/components/wcagCompliance/WcagCompliance.ejs +16 -0
  64. package/dist/static/ejs/partials/components/wcagCompliance/WcagGaugeBar.ejs +16 -0
  65. package/dist/static/ejs/partials/components/wcagCoverageDetails.ejs +18 -0
  66. package/dist/static/ejs/partials/footer.ejs +24 -0
  67. package/dist/static/ejs/partials/header.ejs +14 -0
  68. package/dist/static/ejs/partials/main.ejs +29 -0
  69. package/dist/static/ejs/partials/scripts/allIssues/AllIssues.ejs +376 -0
  70. package/dist/static/ejs/partials/scripts/bootstrap.ejs +8 -0
  71. package/dist/static/ejs/partials/scripts/categorySummary.ejs +141 -0
  72. package/dist/static/ejs/partials/scripts/decodeUnzipParse.ejs +3 -0
  73. package/dist/static/ejs/partials/scripts/header/SiteInfo.ejs +44 -0
  74. package/dist/static/ejs/partials/scripts/header/aboutScanModal/AboutScanModal.ejs +51 -0
  75. package/dist/static/ejs/partials/scripts/header/aboutScanModal/ScanConfiguration.ejs +127 -0
  76. package/dist/static/ejs/partials/scripts/header/aboutScanModal/ScanDetails.ejs +60 -0
  77. package/dist/static/ejs/partials/scripts/highlightjs.ejs +335 -0
  78. package/dist/static/ejs/partials/scripts/popper.ejs +7 -0
  79. package/dist/static/ejs/partials/scripts/prioritiseIssues/IssueDetailCard.ejs +137 -0
  80. package/dist/static/ejs/partials/scripts/prioritiseIssues/PrioritiseIssues.ejs +214 -0
  81. package/dist/static/ejs/partials/scripts/prioritiseIssues/wcagSvgMap.ejs +861 -0
  82. package/dist/static/ejs/partials/scripts/ruleModal/constants.ejs +957 -0
  83. package/dist/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +353 -0
  84. package/dist/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +468 -0
  85. package/dist/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +306 -0
  86. package/dist/static/ejs/partials/scripts/ruleModal/utilities.ejs +483 -0
  87. package/dist/static/ejs/partials/scripts/scannedPagesSegmentedTabs.ejs +35 -0
  88. package/dist/static/ejs/partials/scripts/screenshotLightbox.ejs +75 -0
  89. package/dist/static/ejs/partials/scripts/summaryScanResults.ejs +14 -0
  90. package/dist/static/ejs/partials/scripts/summaryTable.ejs +78 -0
  91. package/dist/static/ejs/partials/scripts/topTen.ejs +61 -0
  92. package/dist/static/ejs/partials/scripts/utils.ejs +453 -0
  93. package/dist/static/ejs/partials/scripts/wcagCompliance/FailedCriteria.ejs +103 -0
  94. package/dist/static/ejs/partials/scripts/wcagCompliance/WcagGaugeBar.ejs +47 -0
  95. package/dist/static/ejs/partials/scripts/wcagCompliance.ejs +15 -0
  96. package/dist/static/ejs/partials/scripts/wcagCoverageDetails.ejs +75 -0
  97. package/dist/static/ejs/partials/styles/allIssues/AllIssues.ejs +384 -0
  98. package/dist/static/ejs/partials/styles/bootstrap.ejs +12391 -0
  99. package/dist/static/ejs/partials/styles/header/SiteInfo.ejs +121 -0
  100. package/dist/static/ejs/partials/styles/header/aboutScanModal/AboutScanModal.ejs +82 -0
  101. package/dist/static/ejs/partials/styles/header/aboutScanModal/ScanConfiguration.ejs +50 -0
  102. package/dist/static/ejs/partials/styles/header/aboutScanModal/ScanDetails.ejs +149 -0
  103. package/dist/static/ejs/partials/styles/header.ejs +7 -0
  104. package/dist/static/ejs/partials/styles/highlightjs.ejs +54 -0
  105. package/dist/static/ejs/partials/styles/prioritiseIssues/IssueDetailCard.ejs +141 -0
  106. package/dist/static/ejs/partials/styles/prioritiseIssues/PrioritiseIssues.ejs +204 -0
  107. package/dist/static/ejs/partials/styles/ruleModal/ruleOffcanvas.ejs +456 -0
  108. package/dist/static/ejs/partials/styles/scannedPagesSegmentedTabs.ejs +46 -0
  109. package/dist/static/ejs/partials/styles/shared/InfoAlert.ejs +12 -0
  110. package/dist/static/ejs/partials/styles/styles.ejs +1607 -0
  111. package/dist/static/ejs/partials/styles/summaryBootstrap.ejs +12458 -0
  112. package/dist/static/ejs/partials/styles/topTenCard.ejs +44 -0
  113. package/dist/static/ejs/partials/styles/wcagCompliance/FailedCriteria.ejs +59 -0
  114. package/dist/static/ejs/partials/styles/wcagCompliance/WcagGaugeBar.ejs +62 -0
  115. package/dist/static/ejs/partials/styles/wcagCompliance.ejs +36 -0
  116. package/dist/static/ejs/partials/styles/wcagCoverageDetails.ejs +33 -0
  117. package/dist/static/ejs/partials/summaryHeader.ejs +70 -0
  118. package/dist/static/ejs/partials/summaryMain.ejs +49 -0
  119. package/dist/static/ejs/report.ejs +226 -0
  120. package/dist/static/ejs/summary.ejs +47 -0
  121. package/dist/types/types.js +1 -0
  122. package/dist/utils.js +1070 -0
  123. package/examples/oobee-cypress-integration-js/cypress/support/e2e.js +36 -6
  124. package/examples/oobee-cypress-integration-js/cypress.config.js +45 -1
  125. package/examples/oobee-cypress-integration-ts/cypress.config.ts +47 -1
  126. package/examples/oobee-cypress-integration-ts/src/cypress/support/e2e.ts +36 -6
  127. package/examples/oobee-playwright-integration-js/oobee-playwright-demo.js +2 -1
  128. package/examples/oobee-playwright-integration-ts/src/oobee-playwright-demo.ts +2 -1
  129. package/examples/oobee-scan-html-demo.js +51 -0
  130. package/examples/oobee-scan-page-demo.js +40 -0
  131. package/package.json +9 -3
  132. package/src/constants/common.ts +2 -2
  133. package/src/constants/constants.ts +3 -1
  134. package/src/crawlers/crawlDomain.ts +1 -0
  135. package/src/crawlers/runCustom.ts +0 -1
  136. package/src/mergeAxeResults.ts +43 -22
  137. package/src/npmIndex.ts +500 -131
@@ -0,0 +1,135 @@
1
+ import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
2
+ import constants, { guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
3
+ import { consoleLogger, guiInfoLog } from '../logs.js';
4
+ import crawlDomain from './crawlDomain.js';
5
+ import crawlSitemap from './crawlSitemap.js';
6
+ import { getPlaywrightLaunchOptions } from '../constants/common.js';
7
+ import { register } from '../utils.js';
8
/**
 * Crawls a site "intelligently": first probes well-known sitemap locations
 * (constants sitemapPaths) on the site root; if one exists, scans it via
 * crawlSitemap, then spends any remaining page budget / scan time crawling
 * the domain from the root. Falls back to a plain domain crawl when no
 * sitemap is found.
 *
 * @param {string} url - user-supplied start URL.
 * @param {number} scanDuration - total scan budget in seconds (0 = unlimited).
 * @returns {Promise<{urlsCrawled: object, durationExceeded: boolean}>}
 *          (the no-sitemap fallback returns crawlDomain's result unchanged).
 */
const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings, maxRequestsPerCrawl, browser, userDataDirectory, strategy, specifiedMaxConcurrency, fileTypes, blacklistedPatterns, includeScreenshots, followRobots, extraHTTPHeaders, safeMode, scanDuration) => {
  const startTime = Date.now(); // used to compute time left for the follow-up domain crawl
  let urlsCrawledFinal;
  const urlsCrawled = { ...constants.urlsCrawledObj };
  let dataset;
  let sitemapExist = false;
  const fromCrawlIntelligentSitemap = true;
  let sitemapUrl;
  let durationExceeded = false;
  ({ dataset } = await createCrawleeSubFolders(randomToken));

  // Strips path/query/fragment, keeping protocol://host[:port].
  function getHomeUrl(parsedUrl) {
    const urlObject = new URL(parsedUrl);
    return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
  }

  // Navigates to a URL and reports whether it responded with a 2xx status.
  const checkUrlExists = async (page, parsedUrl) => {
    try {
      const response = await page.goto(parsedUrl);
      return response.ok();
    } catch (e) {
      consoleLogger.error(e);
      return false;
    }
  };

  // Probes each well-known sitemap path on the site root and returns the
  // first one that responds OK, or '' if none do. Sets sitemapExist.
  async function findSitemap(link, userDataDirectory, extraHTTPHeaders) {
    const homeUrl = getHomeUrl(link);
    let sitemapLink = '';
    // In headed mode an empty string is passed so a throwaway profile is used.
    const effectiveUserDataDirectory = process.env.CRAWLEE_HEADLESS === '1' ? userDataDirectory : '';
    const context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
      headless: process.env.CRAWLEE_HEADLESS === '1',
      ...getPlaywrightLaunchOptions(browser),
      ...(extraHTTPHeaders && { extraHTTPHeaders }),
    });
    register(context);
    const page = await context.newPage();
    try {
      for (const path of sitemapPaths) {
        sitemapLink = homeUrl + path;
        if (await checkUrlExists(page, sitemapLink)) {
          sitemapExist = true;
          break;
        }
      }
    } finally {
      // Always release the browser context, even if a probe throws
      // (the original leaked it on an unexpected error).
      await page.close().catch(() => {});
      await context.close().catch(() => {});
    }
    return sitemapExist ? sitemapLink : '';
  }

  try {
    sitemapUrl = await findSitemap(url, userDataDirectory, extraHTTPHeaders);
  } catch (error) {
    consoleLogger.error(error);
  }

  if (!sitemapExist) {
    console.log('Unable to find sitemap. Commencing website crawl instead.');
    // NOTE(review): returned as-is; assumes crawlDomain's result matches this
    // function's { urlsCrawled, durationExceeded } contract — confirm in crawlDomain.js.
    return await crawlDomain({
      url,
      randomToken,
      host,
      viewportSettings,
      maxRequestsPerCrawl,
      browser,
      userDataDirectory,
      strategy,
      specifiedMaxConcurrency,
      fileTypes,
      blacklistedPatterns,
      includeScreenshots,
      followRobots,
      extraHTTPHeaders,
      safeMode,
      scanDuration, // Use full duration since no sitemap
    });
  }

  console.log(`Sitemap found at ${sitemapUrl}`);
  // FIX: crawlSitemap resolves to { urlsCrawled, durationExceeded } — the
  // original assigned the wrapper itself to urlsCrawledFinal, so the later
  // urlsCrawledFinal.scanned.length read was always undefined.length.
  const sitemapResult = await crawlSitemap({
    sitemapUrl,
    randomToken,
    host,
    viewportSettings,
    maxRequestsPerCrawl,
    browser,
    userDataDirectory,
    specifiedMaxConcurrency,
    fileTypes,
    blacklistedPatterns,
    includeScreenshots,
    extraHTTPHeaders,
    fromCrawlIntelligentSitemap,
    userUrlInputFromIntelligent: url,
    datasetFromIntelligent: dataset,
    urlsCrawledFromIntelligent: urlsCrawled,
    crawledFromLocalFile: false,
    scanDuration,
  });
  // crawlSitemap mutates urlsCrawledFromIntelligent in place, so fall back to
  // urlsCrawled if the result shape is ever missing.
  urlsCrawledFinal = sitemapResult?.urlsCrawled ?? urlsCrawled;
  durationExceeded = sitemapResult?.durationExceeded ?? false;

  const elapsed = Date.now() - startTime;
  const remainingScanDuration = Math.max(scanDuration - elapsed / 1000, 0); // in seconds
  if (urlsCrawledFinal.scanned.length < maxRequestsPerCrawl && remainingScanDuration > 0) {
    console.log(`Continuing crawl from root website. Remaining scan time: ${remainingScanDuration.toFixed(1)}s`);
    // NOTE(review): assumes crawlDomain resolves to the urlsCrawled object
    // itself (not a wrapper) — confirm against crawlDomain.js.
    urlsCrawledFinal = await crawlDomain({
      url,
      randomToken,
      host,
      viewportSettings,
      maxRequestsPerCrawl,
      browser,
      userDataDirectory,
      strategy,
      specifiedMaxConcurrency,
      fileTypes,
      blacklistedPatterns,
      includeScreenshots,
      followRobots,
      extraHTTPHeaders,
      safeMode,
      fromCrawlIntelligentSitemap,
      datasetFromIntelligent: dataset,
      urlsCrawledFromIntelligent: urlsCrawledFinal,
      scanDuration: remainingScanDuration,
    });
  } else if (remainingScanDuration <= 0) {
    console.log(`Crawl duration exceeded before more pages could be found (limit: ${scanDuration}s).`);
    durationExceeded = true;
  }
  guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
  return { urlsCrawled: urlsCrawledFinal, durationExceeded };
};
export default crawlIntelligentSitemap;
@@ -0,0 +1,151 @@
1
+ import { Request, RequestList } from 'crawlee';
2
+ import fs from 'fs';
3
+ import path from 'path';
4
+ import { createCrawleeSubFolders, runAxeScript, isUrlPdf } from './commonCrawlerFunc.js';
5
+ import constants, { guiInfoStatusTypes, } from '../constants/constants.js';
6
+ import { getPlaywrightLaunchOptions, isFilePath, convertLocalFileToPath, convertPathToLocalFile, } from '../constants/common.js';
7
+ import { runPdfScan, mapPdfScanResults, doPdfScreenshots } from './pdfScanFunc.js';
8
+ import { guiInfoLog } from '../logs.js';
9
+ import crawlSitemap from './crawlSitemap.js';
10
+ import { getPdfStoragePath, register } from '../utils.js';
11
/**
 * Scans a single local file: HTML-like files are opened in a Playwright
 * context and axe-scanned; PDFs are copied into the scan's pdf storage and
 * run through the pdf scan pipeline; XML/TXT sitemap files are delegated to
 * crawlSitemap.
 *
 * @param {string} url - local file path (or file:// URL).
 * @param {number} scanDuration - abort budget in seconds (0 = unlimited).
 * @returns {Promise<{urlsCrawled: object, durationExceeded: boolean}>}
 */
export const crawlLocalFile = async ({ url, randomToken, host, viewportSettings, maxRequestsPerCrawl, browser, userDataDirectory, specifiedMaxConcurrency, fileTypes, blacklistedPatterns, includeScreenshots, extraHTTPHeaders, scanDuration = 0, fromCrawlIntelligentSitemap = false, userUrlInputFromIntelligent = null, datasetFromIntelligent = null, urlsCrawledFromIntelligent = null, }) => {
  let dataset;
  let urlsCrawled;
  let linksFromSitemap = [];
  let sitemapUrl;
  let durationExceeded = false;
  let finalLinks = [];
  const { playwrightDeviceDetailsObject } = viewportSettings;
  // Reuse the intelligent-sitemap flow's dataset/results, or start fresh.
  if (fromCrawlIntelligentSitemap) {
    dataset = datasetFromIntelligent;
    urlsCrawled = urlsCrawledFromIntelligent;
  } else {
    ({ dataset } = await createCrawleeSubFolders(randomToken));
    urlsCrawled = { ...constants.urlsCrawledObj };
  }
  // Normalise file:// style input into a plain filesystem path.
  url = convertLocalFileToPath(url);
  // If the path doesn't exist as given, try resolving it to an absolute path.
  if (!fs.existsSync(url) && !isFilePath(url)) {
    const normalizedPath = path.normalize(path.resolve(url));
    if (!fs.existsSync(normalizedPath)) {
      // FIX: file genuinely missing — return an empty result instead of
      // undefined so callers can always destructure the return value.
      return { urlsCrawled, durationExceeded };
    }
    url = normalizedPath;
  }
  // XML/TXT sitemap files: delegate to crawlSitemap.
  if (url.match(/\.xml$/i) || url.match(/\.txt$/i)) {
    sitemapUrl = url;
    const sitemapResult = await crawlSitemap({
      sitemapUrl,
      randomToken,
      host,
      viewportSettings,
      maxRequestsPerCrawl,
      browser,
      userDataDirectory,
      specifiedMaxConcurrency,
      fileTypes,
      blacklistedPatterns,
      includeScreenshots,
      extraHTTPHeaders,
      scanDuration,
      fromCrawlIntelligentSitemap,
      userUrlInputFromIntelligent,
      datasetFromIntelligent,
      urlsCrawledFromIntelligent,
      crawledFromLocalFile: true,
    });
    // FIX: crawlSitemap resolves to { urlsCrawled, durationExceeded }; the
    // original spread the wrapper itself into urlsCrawled, corrupting the
    // result object with urlsCrawled/durationExceeded keys.
    if (sitemapResult) {
      urlsCrawled = sitemapResult.urlsCrawled ?? urlsCrawled;
      durationExceeded = sitemapResult.durationExceeded ?? false;
    }
    return { urlsCrawled, durationExceeded };
  }
  // Single non-sitemap file from here on.
  const uuidToPdfMapping = {}; // pdf filename -> original url
  linksFromSitemap = [new Request({ url })];
  finalLinks = [...finalLinks, ...linksFromSitemap];
  // NOTE(review): result is unused, but RequestList.open may persist state
  // to the crawlee storage dir — kept as in the original.
  await RequestList.open({
    sources: finalLinks,
  });
  const request = linksFromSitemap[0];
  let shouldAbort = false;
  if (!isUrlPdf(url)) {
    // In headed mode an empty string is passed so a throwaway profile is used.
    const effectiveUserDataDirectory = process.env.CRAWLEE_HEADLESS === '1' ? userDataDirectory : '';
    const browserContext = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
      headless: process.env.CRAWLEE_HEADLESS === '1',
      ...getPlaywrightLaunchOptions(browser),
      ...playwrightDeviceDetailsObject,
      ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
    });
    register(browserContext);
    // Abort timer: flips shouldAbort/durationExceeded if the scan budget runs out.
    const timeoutId = scanDuration > 0
      ? setTimeout(() => {
          console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting local file scan.`);
          durationExceeded = true;
          shouldAbort = true;
        }, scanDuration * 1000)
      : null;
    try {
      const page = await browserContext.newPage();
      url = convertPathToLocalFile(url);
      await page.goto(url);
      if (shouldAbort) {
        console.warn('Scan aborted due to timeout before page scan.');
        await dataset.pushData({ scanned: [], scannedRedirects: [] });
        // FIX: return the same { urlsCrawled, durationExceeded } shape as the
        // normal exit path (the original returned the bare urlsCrawled here).
        return { urlsCrawled, durationExceeded };
      }
      const results = await runAxeScript({ includeScreenshots, page, randomToken });
      const actualUrl = page.url() || request.loadedUrl || url;
      guiInfoLog(guiInfoStatusTypes.SCANNED, {
        numScanned: urlsCrawled.scanned.length,
        urlScanned: url,
      });
      urlsCrawled.scanned.push({
        url,
        pageTitle: results.pageTitle,
        actualUrl,
      });
      urlsCrawled.scannedRedirects.push({
        fromUrl: url,
        toUrl: actualUrl,
      });
      results.url = url;
      results.actualUrl = actualUrl;
      await dataset.pushData(results);
    } finally {
      // FIX: cancel the abort timer — the original never cleared it, keeping
      // the Node event loop alive for the full scanDuration after a fast scan.
      if (timeoutId) {
        clearTimeout(timeoutId);
      }
      // Ensure the browser context is released on every path.
      await browserContext.close().catch(() => {});
    }
  } else {
    // PDF: copy into the scan's pdf storage and run the pdf scan pipeline.
    const pdfFileName = path.basename(url);
    const destinationFilePath = path.join(getPdfStoragePath(randomToken), pdfFileName);
    fs.copyFileSync(url, destinationFilePath);
    uuidToPdfMapping[pdfFileName] = url;
    urlsCrawled.scanned.push({
      url,
      pageTitle: pdfFileName,
      actualUrl: url,
    });
    await runPdfScan(randomToken);
    // Transform results into the report format.
    const pdfResults = await mapPdfScanResults(randomToken, uuidToPdfMapping);
    // Capture screenshots from the pdf documents if requested.
    if (includeScreenshots) {
      await Promise.all(pdfResults.map(result => doPdfScreenshots(randomToken, result)));
    }
    // Push each pdf document's results to the key-value store.
    await Promise.all(pdfResults.map(result => dataset.pushData(result)));
  }
  return { urlsCrawled, durationExceeded };
};
export default crawlLocalFile;
@@ -0,0 +1,303 @@
1
+ import crawlee, { RequestList } from 'crawlee';
2
+ import * as path from 'path';
3
+ import fsp from 'fs/promises';
4
+ import { createCrawleeSubFolders, preNavigationHooks, runAxeScript, } from './commonCrawlerFunc.js';
5
+ import constants, { STATUS_CODE_METADATA, guiInfoStatusTypes, disallowedListOfPatterns, FileTypes, } from '../constants/constants.js';
6
+ import { getLinksFromSitemap, getPlaywrightLaunchOptions, isSkippedUrl, waitForPageLoaded, isFilePath, } from '../constants/common.js';
7
+ import { areLinksEqual, isWhitelistedContentType, register } from '../utils.js';
8
+ import { handlePdfDownload, runPdfScan, mapPdfScanResults, doPdfScreenshots, } from './pdfScanFunc.js';
9
+ import { guiInfoLog } from '../logs.js';
10
/**
 * Crawls every URL listed in a sitemap (or sitemap-like txt file) with a
 * crawlee PlaywrightCrawler: HTML pages are axe-scanned and pushed to the
 * dataset; PDF links are downloaded and handed to the pdf scan pipeline.
 *
 * @param {string} sitemapUrl - sitemap URL (or local path when crawledFromLocalFile).
 * @param {number} scanDuration - scan budget in seconds (0 = unlimited).
 * @returns {Promise<{urlsCrawled: object, durationExceeded: boolean}|undefined>}
 *          undefined only when a local file path is passed without
 *          crawledFromLocalFile — callers must handle both shapes.
 */
const crawlSitemap = async ({ sitemapUrl, randomToken, host, viewportSettings, maxRequestsPerCrawl, browser, userDataDirectory, specifiedMaxConcurrency, fileTypes, blacklistedPatterns, includeScreenshots, extraHTTPHeaders, scanDuration = 0, fromCrawlIntelligentSitemap = false, userUrlInputFromIntelligent = null, datasetFromIntelligent = null, urlsCrawledFromIntelligent = null, crawledFromLocalFile = false, }) => {
  const crawlStartTime = Date.now();
  let dataset;
  let urlsCrawled;
  let durationExceeded = false;
  // Reuse the intelligent-sitemap flow's dataset/results, or start fresh.
  if (fromCrawlIntelligentSitemap) {
    dataset = datasetFromIntelligent;
    urlsCrawled = urlsCrawledFromIntelligent;
  }
  else {
    ({ dataset } = await createCrawleeSubFolders(randomToken));
    urlsCrawled = { ...constants.urlsCrawledObj };
  }
  // Local file paths are only accepted via crawlLocalFile.
  if (!crawledFromLocalFile && isFilePath(sitemapUrl)) {
    console.log('Local file crawling not supported for sitemap. Please provide a valid URL.');
    return;
  }
  const linksFromSitemap = await getLinksFromSitemap(sitemapUrl, maxRequestsPerCrawl, browser, userDataDirectory, userUrlInputFromIntelligent, fromCrawlIntelligentSitemap, extraHTTPHeaders);
  sitemapUrl = encodeURI(sitemapUrl);
  const pdfDownloads = []; // in-flight pdf download promises
  const uuidToPdfMapping = {}; // pdf filename -> original url
  const isScanHtml = [FileTypes.All, FileTypes.HtmlOnly].includes(fileTypes);
  const isScanPdfs = [FileTypes.All, FileTypes.PdfOnly].includes(fileTypes);
  const { playwrightDeviceDetailsObject } = viewportSettings;
  const { maxConcurrency } = constants;
  const requestList = await RequestList.open({
    sources: linksFromSitemap,
  });
  const crawler = register(new crawlee.PlaywrightCrawler({
    launchContext: {
      launcher: constants.launcher,
      launchOptions: getPlaywrightLaunchOptions(browser),
      // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
      ...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
    },
    retryOnBlocked: true,
    browserPoolOptions: {
      useFingerprints: false,
      preLaunchHooks: [
        // Give each launched browser its own unique profile subdirectory so
        // concurrent browsers never share a userDataDir.
        async (_pageId, launchContext) => {
          const baseDir = userDataDirectory;
          // Ensure the base profile directory exists.
          await fsp.mkdir(baseDir, { recursive: true });
          // Create a unique subdir per browser.
          const subProfileDir = path.join(baseDir, `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`);
          await fsp.mkdir(subProfileDir, { recursive: true });
          // Assign to Crawlee's launcher.
          launchContext.userDataDir = subProfileDir;
          // Safely extend launchOptions without clobbering existing ones.
          launchContext.launchOptions = {
            ...launchContext.launchOptions,
            ignoreHTTPSErrors: true,
            ...playwrightDeviceDetailsObject,
            ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
          };
        },
      ],
    },
    requestList,
    postNavigationHooks: [
      // Wait for the DOM to go quiet (no mutations for 1s) before scanning,
      // with safeguards: a mutation-count cap and a hard overall timeout.
      async ({ page }) => {
        try {
          await page.evaluate(() => {
            return new Promise(resolve => {
              let timeout;
              let mutationCount = 0;
              const MAX_MUTATIONS = 500; // stop if things never quiet down
              const OBSERVER_TIMEOUT = 5000; // hard cap on total wait
              const observer = new MutationObserver(() => {
                clearTimeout(timeout);
                mutationCount++;
                if (mutationCount > MAX_MUTATIONS) {
                  observer.disconnect();
                  resolve('Too many mutations, exiting.');
                  return;
                }
                // restart quiet-period timer
                timeout = setTimeout(() => {
                  observer.disconnect();
                  resolve('DOM stabilized.');
                }, 1000);
              });
              // overall timeout in case the page never settles
              timeout = setTimeout(() => {
                observer.disconnect();
                resolve('Observer timeout reached.');
              }, OBSERVER_TIMEOUT);
              const root = document.documentElement || document.body || document;
              if (!root || typeof observer.observe !== 'function') {
                resolve('No root node to observe.');
                return;
              }
              // FIX: the observer was created but never attached, so the
              // quiet-period logic could never fire and every page waited
              // the full OBSERVER_TIMEOUT.
              observer.observe(root, {
                childList: true,
                subtree: true,
                attributes: true,
                characterData: true,
              });
            });
          });
        }
        catch (err) {
          // Handle page navigation errors gracefully
          if (err.message.includes('was destroyed')) {
            return; // Page navigated or closed, no need to handle
          }
          throw err; // Rethrow unknown errors
        }
      },
    ],
    preNavigationHooks: [
      async ({ request, page }, gotoOptions) => {
        const url = request.url.toLowerCase();
        // Skip navigation for document types we cannot render (flagged for the
        // request handler via userData).
        const isNotSupportedDocument = disallowedListOfPatterns.some(pattern => url.startsWith(pattern));
        if (isNotSupportedDocument) {
          request.skipNavigation = true;
          request.userData.isNotSupportedDocument = true;
          return;
        }
        // NOTE(review): return value of preNavigationHooks(...) is discarded
        // here — confirm in commonCrawlerFunc.js whether the hooks it builds
        // are meant to be invoked/applied.
        preNavigationHooks(extraHTTPHeaders);
      },
    ],
    requestHandlerTimeoutSecs: 90,
    requestHandler: async ({ page, request, response, sendRequest }) => {
      // Log documents that are not supported and record them as excluded.
      if (request.userData?.isNotSupportedDocument) {
        guiInfoLog(guiInfoStatusTypes.SKIPPED, {
          numScanned: urlsCrawled.scanned.length,
          urlScanned: request.url,
        });
        urlsCrawled.userExcluded.push({
          url: request.url,
          pageTitle: request.url,
          actualUrl: request.url, // because about:blank is not useful
          metadata: STATUS_CODE_METADATA[1],
          httpStatusCode: 1,
        });
        return;
      }
      await waitForPageLoaded(page, 10000);
      const actualUrl = page.url() || request.loadedUrl || request.url;
      // Stop the pool once the page budget or time budget is spent.
      const hasExceededDuration = scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
      if (urlsCrawled.scanned.length >= maxRequestsPerCrawl || hasExceededDuration) {
        if (hasExceededDuration) {
          console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
          durationExceeded = true;
        }
        crawler.autoscaledPool.abort(); // stops new requests
        return;
      }
      // skipNavigation requests never left about:blank: either queue the pdf
      // download or record the url as excluded.
      if (request.skipNavigation && actualUrl === 'about:blank') {
        if (isScanPdfs) {
          // pushes download promise into pdfDownloads
          const { pdfFileName, url } = handlePdfDownload(randomToken, pdfDownloads, request, sendRequest, urlsCrawled);
          uuidToPdfMapping[pdfFileName] = url;
          return;
        }
        guiInfoLog(guiInfoStatusTypes.SKIPPED, {
          numScanned: urlsCrawled.scanned.length,
          urlScanned: request.url,
        });
        urlsCrawled.userExcluded.push({
          url: request.url,
          pageTitle: request.url,
          actualUrl: request.url, // because about:blank is not useful
          metadata: STATUS_CODE_METADATA[1],
          httpStatusCode: 1,
        });
        return;
      }
      const contentType = response?.headers?.()['content-type'] || '';
      const status = response ? response.status() : 0;
      if (isScanHtml && status < 300 && isWhitelistedContentType(contentType)) {
        const isRedirected = !areLinksEqual(page.url(), request.url);
        const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(item => (item.actualUrl || item.url) === page.url());
        // Redirect landed on an already-scanned page: record and skip.
        if (isRedirected && isLoadedUrlInCrawledUrls) {
          urlsCrawled.notScannedRedirects.push({
            fromUrl: request.url,
            toUrl: actualUrl, // i.e. actualUrl
          });
          return;
        }
        // This logic is different from crawlDomain, as it also checks if the
        // page is redirected before checking if it is excluded using exclusions.txt
        if (isRedirected && blacklistedPatterns && isSkippedUrl(actualUrl, blacklistedPatterns)) {
          urlsCrawled.userExcluded.push({
            url: request.url,
            pageTitle: request.url,
            actualUrl,
            metadata: STATUS_CODE_METADATA[0],
            httpStatusCode: 0,
          });
          guiInfoLog(guiInfoStatusTypes.SKIPPED, {
            numScanned: urlsCrawled.scanned.length,
            urlScanned: request.url,
          });
          return;
        }
        const results = await runAxeScript({ includeScreenshots, page, randomToken });
        guiInfoLog(guiInfoStatusTypes.SCANNED, {
          numScanned: urlsCrawled.scanned.length,
          urlScanned: request.url,
        });
        urlsCrawled.scanned.push({
          url: request.url,
          pageTitle: results.pageTitle,
          actualUrl, // i.e. actualUrl
        });
        urlsCrawled.scannedRedirects.push({
          fromUrl: request.url,
          toUrl: actualUrl,
        });
        results.url = request.url;
        results.actualUrl = actualUrl;
        await dataset.pushData(results);
      }
      else {
        guiInfoLog(guiInfoStatusTypes.SKIPPED, {
          numScanned: urlsCrawled.scanned.length,
          urlScanned: request.url,
        });
        if (isScanHtml) {
          // carry through the HTTP status metadata
          // (NOTE: this `status` deliberately shadows the outer one with the
          // optional-chained form, which may be undefined when there is no response)
          const status = response?.status();
          const metadata = typeof status === 'number'
            ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
            : STATUS_CODE_METADATA[2];
          urlsCrawled.invalid.push({
            actualUrl,
            url: request.url,
            pageTitle: request.url,
            metadata,
            httpStatusCode: typeof status === 'number' ? status : 0,
          });
        }
      }
    },
    failedRequestHandler: async ({ request, response, error }) => {
      // check if scanned pages have reached limit due to multi-instances of handler running
      if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) {
        return;
      }
      guiInfoLog(guiInfoStatusTypes.ERROR, {
        numScanned: urlsCrawled.scanned.length,
        urlScanned: request.url,
      });
      const status = response?.status();
      const metadata = typeof status === 'number'
        ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
        : STATUS_CODE_METADATA[2];
      urlsCrawled.error.push({
        url: request.url,
        pageTitle: request.url,
        actualUrl: request.url,
        metadata,
        httpStatusCode: typeof status === 'number' ? status : 0,
      });
      crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
    },
    // Page budget is enforced manually in requestHandler, not by crawlee.
    maxRequestsPerCrawl: Infinity,
    maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
    ...(process.env.OOBEE_FAST_CRAWLER && {
      autoscaledPoolOptions: {
        minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
        maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
        desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
        scaleUpStepRatio: 0.99, // Scale up faster
        scaleDownStepRatio: 0.1, // Scale down slower
      },
    }),
  }));
  await crawler.run();
  await requestList.isFinished();
  if (pdfDownloads.length > 0) {
    // wait for pdf downloads to complete
    await Promise.all(pdfDownloads);
    // scan and process pdf documents
    await runPdfScan(randomToken);
    // transform result format
    const pdfResults = await mapPdfScanResults(randomToken, uuidToPdfMapping);
    // get screenshots from pdf docs
    if (includeScreenshots) {
      await Promise.all(pdfResults.map(async (result) => await doPdfScreenshots(randomToken, result)));
    }
    // push results for each pdf document to key value store
    await Promise.all(pdfResults.map(result => dataset.pushData(result)));
  }
  // The intelligent-sitemap flow emits COMPLETED itself after its follow-up crawl.
  if (!fromCrawlIntelligentSitemap) {
    guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
  }
  if (scanDuration > 0) {
    const elapsed = Math.round((Date.now() - crawlStartTime) / 1000);
    console.log(`Crawl ended after ${elapsed}s (limit: ${scanDuration}s).`);
  }
  return { urlsCrawled, durationExceeded };
};
export default crawlSitemap;
@@ -0,0 +1,10 @@
1
// CSS identifiers must not begin with a digit; when an id or class selector
// does (e.g. '#123'), replace that first digit with its hex escape followed
// by a space ('#\31 23') so the selector parses correctly.
export function escapeCssSelector(selector) {
  try {
    // '0'-'9' occupy code points 0x30-0x39, so the escape is '\3' + the digit.
    const escapeLeadingDigit = (_whole, marker, digit) =>
      `${marker}\\${digit.charCodeAt(0).toString(16)} `;
    return selector.replace(/([#.])(\d)/g, escapeLeadingDigit);
  } catch (e) {
    // Best-effort: log and fall back to the raw selector rather than aborting.
    console.error(`error escaping css selector: ${selector}`, e);
    return selector;
  }
}
@@ -0,0 +1,11 @@
1
// Flags alt text that adds no information for screen-reader users.
// Returns false when the element's alt attribute is just a generic word
// like "image"; returns true for missing, empty, or descriptive alt text.
export function evaluateAltText(node) {
  const genericTerms = new Set(['img', 'image', 'picture', 'photo', 'graphic']);
  const altText = node.getAttribute('alt');
  if (!altText) {
    return true;
  }
  return !genericTerms.has(altText.trim().toLowerCase());
}
@@ -0,0 +1,44 @@
1
+ import textReadability from 'text-readability';
2
// Pulls every complete sentence out of the page's <p> elements and, when
// there is enough text (20+ words), computes a Flesch Reading Ease score.
// Returns the score as a string only when it flags hard-to-read text
// (0 < score <= 50); otherwise returns '' (also on error or no sentences).
export async function extractAndGradeText(page) {
  try {
    // Runs inside the browser context: collect each punctuation-terminated
    // sentence from all paragraph elements on the page.
    const sentences = await page.evaluate(() => {
      const collected = [];
      const paragraphs = document.querySelectorAll('p');
      for (const paragraph of paragraphs) {
        const text = paragraph.innerText.trim();
        // A sentence is any run of characters ending in '.', '!' or '?'.
        const matches = text.match(/[^.!?]*[.!?]+/g);
        if (!matches) {
          continue;
        }
        for (const sentence of matches) {
          const trimmed = sentence.trim();
          if (trimmed.length > 0) {
            collected.push(trimmed);
          }
        }
      }
      return collected;
    });

    if (sentences.length === 0) {
      return '';
    }

    const combinedText = sentences.join(' ').trim();
    const wordCount = combinedText.split(/\s+/).length;

    // Scores computed on fewer than 20 words are too noisy to be meaningful,
    // so leave the score at 0 ("not graded") in that case.
    const score = wordCount >= 20 ? textReadability.fleschReadingEase(combinedText) : 0;

    // Only surface scores that indicate genuinely difficult text: 0 means
    // "not graded" and anything above 50 is considered readable enough.
    return score === 0 || score > 50 ? '' : score.toString();
  } catch (error) {
    console.error('Error extracting and grading text:', error);
    return ''; // Best-effort: an extraction failure should not abort the scan.
  }
}