@govtechsg/oobee 0.10.76 → 0.10.78-alpha1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. package/.github/workflows/publish.yml +8 -1
  2. package/INTEGRATION.md +50 -3
  3. package/dist/cli.js +252 -0
  4. package/dist/combine.js +221 -0
  5. package/dist/constants/cliFunctions.js +306 -0
  6. package/dist/constants/common.js +1669 -0
  7. package/dist/constants/constants.js +913 -0
  8. package/dist/constants/errorMeta.json +319 -0
  9. package/dist/constants/itemTypeDescription.js +7 -0
  10. package/dist/constants/oobeeAi.js +121 -0
  11. package/dist/constants/questions.js +151 -0
  12. package/dist/constants/sampleData.js +176 -0
  13. package/dist/crawlers/commonCrawlerFunc.js +428 -0
  14. package/dist/crawlers/crawlDomain.js +613 -0
  15. package/dist/crawlers/crawlIntelligentSitemap.js +135 -0
  16. package/dist/crawlers/crawlLocalFile.js +151 -0
  17. package/dist/crawlers/crawlSitemap.js +303 -0
  18. package/dist/crawlers/custom/escapeCssSelector.js +10 -0
  19. package/dist/crawlers/custom/evaluateAltText.js +11 -0
  20. package/dist/crawlers/custom/extractAndGradeText.js +44 -0
  21. package/dist/crawlers/custom/extractText.js +27 -0
  22. package/dist/crawlers/custom/findElementByCssSelector.js +36 -0
  23. package/dist/crawlers/custom/flagUnlabelledClickableElements.js +963 -0
  24. package/dist/crawlers/custom/framesCheck.js +37 -0
  25. package/dist/crawlers/custom/getAxeConfiguration.js +111 -0
  26. package/dist/crawlers/custom/gradeReadability.js +23 -0
  27. package/dist/crawlers/custom/utils.js +1024 -0
  28. package/dist/crawlers/custom/xPathToCss.js +147 -0
  29. package/dist/crawlers/guards/urlGuard.js +71 -0
  30. package/dist/crawlers/pdfScanFunc.js +276 -0
  31. package/dist/crawlers/runCustom.js +89 -0
  32. package/dist/exclusions.txt +7 -0
  33. package/dist/generateHtmlReport.js +144 -0
  34. package/dist/index.js +62 -0
  35. package/dist/logs.js +84 -0
  36. package/dist/mergeAxeResults.js +1588 -0
  37. package/dist/npmIndex.js +640 -0
  38. package/dist/proxyService.js +360 -0
  39. package/dist/runGenerateJustHtmlReport.js +16 -0
  40. package/dist/screenshotFunc/htmlScreenshotFunc.js +355 -0
  41. package/dist/screenshotFunc/pdfScreenshotFunc.js +645 -0
  42. package/dist/services/s3Uploader.js +127 -0
  43. package/dist/static/ejs/partials/components/allIssues/AllIssues.ejs +9 -0
  44. package/dist/static/ejs/partials/components/allIssues/CategoryBadges.ejs +82 -0
  45. package/dist/static/ejs/partials/components/allIssues/FilterBar.ejs +33 -0
  46. package/dist/static/ejs/partials/components/allIssues/IssuesTable.ejs +41 -0
  47. package/dist/static/ejs/partials/components/header/SiteInfo.ejs +119 -0
  48. package/dist/static/ejs/partials/components/header/aboutScanModal/AboutScanModal.ejs +15 -0
  49. package/dist/static/ejs/partials/components/header/aboutScanModal/ScanConfiguration.ejs +44 -0
  50. package/dist/static/ejs/partials/components/header/aboutScanModal/ScanDetails.ejs +142 -0
  51. package/dist/static/ejs/partials/components/prioritiseIssues/IssueDetailCard.ejs +36 -0
  52. package/dist/static/ejs/partials/components/prioritiseIssues/PrioritiseIssues.ejs +47 -0
  53. package/dist/static/ejs/partials/components/ruleModal/ruleOffcanvas.ejs +196 -0
  54. package/dist/static/ejs/partials/components/scannedPagesSegmentedTabs.ejs +48 -0
  55. package/dist/static/ejs/partials/components/screenshotLightbox.ejs +13 -0
  56. package/dist/static/ejs/partials/components/shared/InfoAlert.ejs +3 -0
  57. package/dist/static/ejs/partials/components/summaryScanAbout.ejs +141 -0
  58. package/dist/static/ejs/partials/components/summaryScanResults.ejs +16 -0
  59. package/dist/static/ejs/partials/components/summaryTable.ejs +20 -0
  60. package/dist/static/ejs/partials/components/summaryWcagCompliance.ejs +94 -0
  61. package/dist/static/ejs/partials/components/topTen.ejs +6 -0
  62. package/dist/static/ejs/partials/components/wcagCompliance/FailedCriteria.ejs +47 -0
  63. package/dist/static/ejs/partials/components/wcagCompliance/WcagCompliance.ejs +16 -0
  64. package/dist/static/ejs/partials/components/wcagCompliance/WcagGaugeBar.ejs +16 -0
  65. package/dist/static/ejs/partials/components/wcagCoverageDetails.ejs +18 -0
  66. package/dist/static/ejs/partials/footer.ejs +24 -0
  67. package/dist/static/ejs/partials/header.ejs +14 -0
  68. package/dist/static/ejs/partials/main.ejs +29 -0
  69. package/dist/static/ejs/partials/scripts/allIssues/AllIssues.ejs +376 -0
  70. package/dist/static/ejs/partials/scripts/bootstrap.ejs +8 -0
  71. package/dist/static/ejs/partials/scripts/categorySummary.ejs +141 -0
  72. package/dist/static/ejs/partials/scripts/decodeUnzipParse.ejs +3 -0
  73. package/dist/static/ejs/partials/scripts/header/SiteInfo.ejs +44 -0
  74. package/dist/static/ejs/partials/scripts/header/aboutScanModal/AboutScanModal.ejs +51 -0
  75. package/dist/static/ejs/partials/scripts/header/aboutScanModal/ScanConfiguration.ejs +127 -0
  76. package/dist/static/ejs/partials/scripts/header/aboutScanModal/ScanDetails.ejs +60 -0
  77. package/dist/static/ejs/partials/scripts/highlightjs.ejs +335 -0
  78. package/dist/static/ejs/partials/scripts/popper.ejs +7 -0
  79. package/dist/static/ejs/partials/scripts/prioritiseIssues/IssueDetailCard.ejs +137 -0
  80. package/dist/static/ejs/partials/scripts/prioritiseIssues/PrioritiseIssues.ejs +214 -0
  81. package/dist/static/ejs/partials/scripts/prioritiseIssues/wcagSvgMap.ejs +861 -0
  82. package/dist/static/ejs/partials/scripts/ruleModal/constants.ejs +957 -0
  83. package/dist/static/ejs/partials/scripts/ruleModal/itemCardRenderer.ejs +353 -0
  84. package/dist/static/ejs/partials/scripts/ruleModal/pageAccordionBuilder.ejs +468 -0
  85. package/dist/static/ejs/partials/scripts/ruleModal/ruleOffcanvas.ejs +306 -0
  86. package/dist/static/ejs/partials/scripts/ruleModal/utilities.ejs +483 -0
  87. package/dist/static/ejs/partials/scripts/scannedPagesSegmentedTabs.ejs +35 -0
  88. package/dist/static/ejs/partials/scripts/screenshotLightbox.ejs +75 -0
  89. package/dist/static/ejs/partials/scripts/summaryScanResults.ejs +14 -0
  90. package/dist/static/ejs/partials/scripts/summaryTable.ejs +78 -0
  91. package/dist/static/ejs/partials/scripts/topTen.ejs +61 -0
  92. package/dist/static/ejs/partials/scripts/utils.ejs +453 -0
  93. package/dist/static/ejs/partials/scripts/wcagCompliance/FailedCriteria.ejs +103 -0
  94. package/dist/static/ejs/partials/scripts/wcagCompliance/WcagGaugeBar.ejs +47 -0
  95. package/dist/static/ejs/partials/scripts/wcagCompliance.ejs +15 -0
  96. package/dist/static/ejs/partials/scripts/wcagCoverageDetails.ejs +75 -0
  97. package/dist/static/ejs/partials/styles/allIssues/AllIssues.ejs +384 -0
  98. package/dist/static/ejs/partials/styles/bootstrap.ejs +12391 -0
  99. package/dist/static/ejs/partials/styles/header/SiteInfo.ejs +121 -0
  100. package/dist/static/ejs/partials/styles/header/aboutScanModal/AboutScanModal.ejs +82 -0
  101. package/dist/static/ejs/partials/styles/header/aboutScanModal/ScanConfiguration.ejs +50 -0
  102. package/dist/static/ejs/partials/styles/header/aboutScanModal/ScanDetails.ejs +149 -0
  103. package/dist/static/ejs/partials/styles/header.ejs +7 -0
  104. package/dist/static/ejs/partials/styles/highlightjs.ejs +54 -0
  105. package/dist/static/ejs/partials/styles/prioritiseIssues/IssueDetailCard.ejs +141 -0
  106. package/dist/static/ejs/partials/styles/prioritiseIssues/PrioritiseIssues.ejs +204 -0
  107. package/dist/static/ejs/partials/styles/ruleModal/ruleOffcanvas.ejs +456 -0
  108. package/dist/static/ejs/partials/styles/scannedPagesSegmentedTabs.ejs +46 -0
  109. package/dist/static/ejs/partials/styles/shared/InfoAlert.ejs +12 -0
  110. package/dist/static/ejs/partials/styles/styles.ejs +1607 -0
  111. package/dist/static/ejs/partials/styles/summaryBootstrap.ejs +12458 -0
  112. package/dist/static/ejs/partials/styles/topTenCard.ejs +44 -0
  113. package/dist/static/ejs/partials/styles/wcagCompliance/FailedCriteria.ejs +59 -0
  114. package/dist/static/ejs/partials/styles/wcagCompliance/WcagGaugeBar.ejs +62 -0
  115. package/dist/static/ejs/partials/styles/wcagCompliance.ejs +36 -0
  116. package/dist/static/ejs/partials/styles/wcagCoverageDetails.ejs +33 -0
  117. package/dist/static/ejs/partials/summaryHeader.ejs +70 -0
  118. package/dist/static/ejs/partials/summaryMain.ejs +49 -0
  119. package/dist/static/ejs/report.ejs +226 -0
  120. package/dist/static/ejs/summary.ejs +47 -0
  121. package/dist/types/types.js +1 -0
  122. package/dist/utils.js +1070 -0
  123. package/examples/oobee-cypress-integration-js/cypress/support/e2e.js +36 -6
  124. package/examples/oobee-cypress-integration-js/cypress.config.js +45 -1
  125. package/examples/oobee-cypress-integration-ts/cypress.config.ts +47 -1
  126. package/examples/oobee-cypress-integration-ts/src/cypress/support/e2e.ts +36 -6
  127. package/examples/oobee-playwright-integration-js/oobee-playwright-demo.js +2 -1
  128. package/examples/oobee-playwright-integration-ts/src/oobee-playwright-demo.ts +2 -1
  129. package/examples/oobee-scan-html-demo.js +51 -0
  130. package/examples/oobee-scan-page-demo.js +40 -0
  131. package/package.json +9 -3
  132. package/src/constants/common.ts +2 -2
  133. package/src/constants/constants.ts +3 -1
  134. package/src/crawlers/crawlDomain.ts +1 -0
  135. package/src/crawlers/runCustom.ts +0 -1
  136. package/src/mergeAxeResults.ts +43 -22
  137. package/src/npmIndex.ts +500 -131
@@ -0,0 +1,613 @@
1
+ import crawlee from 'crawlee';
2
+ import * as path from 'path';
3
+ import fsp from 'fs/promises';
4
+ import { createCrawleeSubFolders, runAxeScript, isUrlPdf, shouldSkipClickDueToDisallowedHref, shouldSkipDueToUnsupportedContent, } from './commonCrawlerFunc.js';
5
+ import constants, { blackListedFileExtensions, guiInfoStatusTypes, cssQuerySelectors, STATUS_CODE_METADATA, disallowedListOfPatterns, disallowedSelectorPatterns, FileTypes, } from '../constants/constants.js';
6
+ import { getPlaywrightLaunchOptions, isBlacklistedFileExtensions, isSkippedUrl, isDisallowedInRobotsTxt, getUrlsFromRobotsTxt, waitForPageLoaded, } from '../constants/common.js';
7
+ import { areLinksEqual, isFollowStrategy, register } from '../utils.js';
8
+ import { handlePdfDownload, runPdfScan, mapPdfScanResults, doPdfScreenshots, } from './pdfScanFunc.js';
9
+ import { consoleLogger, guiInfoLog } from '../logs.js';
10
/**
 * Checks whether a URL matches any of the configured blacklist patterns.
 *
 * Each pattern is treated as a regular-expression source and is tested against
 * both the URL's hostname and the full URL string.
 *
 * @param {string} url - The URL to check.
 * @param {string[] | null | undefined} blacklistedPatterns - Regex pattern strings; falsy disables blacklisting.
 * @returns {boolean} true if any pattern matches the hostname or the full URL; false otherwise,
 *   including when the URL cannot be parsed (logged as best-effort).
 */
const isBlacklisted = (url, blacklistedPatterns) => {
    if (!blacklistedPatterns) {
        return false;
    }
    try {
        const parsedUrl = new URL(url);
        return blacklistedPatterns.some(pattern => {
            // Compile each pattern once per check instead of twice (hostname + full URL).
            const regex = new RegExp(pattern);
            return regex.test(parsedUrl.hostname) || regex.test(url);
        });
    }
    catch (error) {
        // Unparseable URLs (or an invalid regex pattern) are treated as not blacklisted;
        // the crawl is best-effort so we log and continue.
        console.error(`Error parsing URL: ${url}`, error);
        return false;
    }
};
23
/**
 * Crawls a website domain starting from `url`, scanning each discovered page for
 * accessibility issues (via axe) and optionally scanning downloaded PDFs.
 *
 * Discovery is best-effort: links are enqueued from anchors, and (unless `safeMode`
 * is set) by clicking candidate elements and watching for popups/navigations.
 * Many failures are deliberately swallowed because page/browser objects can be
 * closed mid-flight when the crawl aborts.
 *
 * @param {object} opts - Destructured crawl options.
 * @param {string} opts.url - Entry URL of the crawl.
 * @param {string} opts.randomToken - Per-scan token used to namespace crawlee folders and PDF artifacts.
 * @param {object} opts.viewportSettings - Provides `playwrightDeviceDetailsObject` merged into launch options.
 * @param {number} opts.maxRequestsPerCrawl - Soft cap on number of scanned pages.
 * @param {string} opts.strategy - Follow strategy (e.g. same-domain/same-hostname) passed to `isFollowStrategy`.
 * @param {string} opts.fileTypes - One of the `FileTypes` enum values; controls HTML vs PDF scanning.
 * @param {string[]} opts.blacklistedPatterns - Regex pattern strings for URLs to exclude.
 * @param {number} [opts.scanDuration=0] - Max crawl duration in seconds; 0 means unlimited.
 * @param {boolean} [opts.safeMode=false] - If true, skips click-based link discovery.
 * @param {boolean} [opts.fromCrawlIntelligentSitemap=false] - If true, reuses the caller-supplied
 *   dataset and urlsCrawled accumulator instead of creating fresh ones.
 * @returns {Promise<{urlsCrawled: object, durationExceeded: boolean}>} Accumulated crawl results
 *   and whether the scan-duration limit was hit.
 */
const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, maxRequestsPerCrawl, browser, userDataDirectory, strategy, specifiedMaxConcurrency, fileTypes, blacklistedPatterns, includeScreenshots, followRobots, extraHTTPHeaders, scanDuration = 0, safeMode = false, fromCrawlIntelligentSitemap = false, datasetFromIntelligent = null, urlsCrawledFromIntelligent = null, ruleset = [], }) => {
    const crawlStartTime = Date.now();
    let dataset;
    let urlsCrawled;
    let requestQueue;
    let durationExceeded = false;
    // When invoked from the intelligent-sitemap crawl, results accumulate into the
    // caller's dataset/urlsCrawled; otherwise fresh ones are created for this scan.
    if (fromCrawlIntelligentSitemap) {
        dataset = datasetFromIntelligent;
        urlsCrawled = urlsCrawledFromIntelligent;
    }
    else {
        ({ dataset } = await createCrawleeSubFolders(randomToken));
        urlsCrawled = { ...constants.urlsCrawledObj };
    }
    ({ requestQueue } = await createCrawleeSubFolders(randomToken));
    const pdfDownloads = [];
    // Maps generated pdf file names to their source URLs for result mapping after the crawl.
    const uuidToPdfMapping = {};
    const isScanHtml = [FileTypes.All, FileTypes.HtmlOnly].includes(fileTypes);
    const isScanPdfs = [FileTypes.All, FileTypes.PdfOnly].includes(fileTypes);
    const { maxConcurrency } = constants;
    const { playwrightDeviceDetailsObject } = viewportSettings;
    // Seed the queue with the entry URL; PDFs cannot be navigated to in headless mode.
    await requestQueue.addRequest({
        url,
        skipNavigation: isUrlPdf(url),
        label: url,
    });
    /**
     * Enqueues links found on the current page: anchors first, then (unless safeMode)
     * click-based discovery. Best-effort; errors are intentionally swallowed.
     */
    const enqueueProcess = async (page, enqueueLinks, browserContext) => {
        try {
            await enqueueLinks({
                // set selector matches anchor elements with href but not contains # or starting with mailto:
                selector: `a:not(${disallowedSelectorPatterns})`,
                strategy,
                requestQueue,
                transformRequestFunction: (req) => {
                    try {
                        // Strip utm_* tracking params so equivalent URLs dedupe.
                        req.url = req.url.replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
                    }
                    catch (e) {
                        consoleLogger.error(e);
                    }
                    if (urlsCrawled.scanned.some(item => item.url === req.url)) {
                        req.skipNavigation = true;
                    }
                    if (isDisallowedInRobotsTxt(req.url))
                        return null;
                    if (isBlacklisted(req.url, blacklistedPatterns))
                        return null;
                    if (isUrlPdf(req.url)) {
                        // playwright headless mode does not support navigation to pdf document
                        req.skipNavigation = true;
                    }
                    req.label = req.url;
                    return req;
                },
            });
            // If safeMode flag is enabled, skip enqueueLinksByClickingElements
            if (!safeMode) {
                // Try catch is necessary as clicking links is best effort, it may result in new pages that cause browser load or navigation errors that PlaywrightCrawler does not handle
                try {
                    await customEnqueueLinksByClickingElements(page, browserContext);
                }
                catch (e) {
                    // do nothing;
                }
            }
        }
        catch {
            // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
            // Handles browser page object been closed.
        }
    };
    /**
     * Discovers URLs reachable only via element clicks: reads href/data-path
     * attributes where possible, otherwise force-clicks visible elements and
     * captures resulting popups/navigations via page listeners.
     */
    const customEnqueueLinksByClickingElements = async (page, browserContext) => {
        const initialPageUrl = page.url().toString();
        // A candidate URL is skipped if unsupported, already scanned, blacklisted,
        // or outside the follow strategy relative to the page being crawled.
        const isExcluded = (newPageUrl) => {
            const isAlreadyScanned = urlsCrawled.scanned.some(item => item.url === newPageUrl);
            const isBlacklistedUrl = isBlacklisted(newPageUrl, blacklistedPatterns);
            const isNotFollowStrategy = !isFollowStrategy(newPageUrl, initialPageUrl, strategy);
            const isNotSupportedDocument = disallowedListOfPatterns.some(pattern => newPageUrl.toLowerCase().startsWith(pattern));
            return isNotSupportedDocument || isAlreadyScanned || isBlacklistedUrl || isNotFollowStrategy;
        };
        const setPageListeners = (page) => {
            // event listener to handle new page popups upon button click
            page.on('popup', async (newPage) => {
                try {
                    if (newPage.url() != initialPageUrl && !isExcluded(newPage.url())) {
                        const newPageUrl = newPage.url().replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
                        await requestQueue.addRequest({
                            url: newPageUrl,
                            skipNavigation: isUrlPdf(newPage.url()),
                            label: newPageUrl,
                        });
                    }
                    else {
                        try {
                            await newPage.close();
                        }
                        catch {
                            // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
                            // Handles browser page object been closed.
                        }
                    }
                }
                catch {
                    // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
                    // Handles browser page object been closed.
                }
            });
            // event listener to handle navigation to new url within same page upon element click
            page.on('framenavigated', async (newFrame) => {
                try {
                    if (newFrame.url() !== initialPageUrl &&
                        !isExcluded(newFrame.url()) &&
                        !(newFrame.url() == 'about:blank')) {
                        const newFrameUrl = newFrame.url().replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
                        await requestQueue.addRequest({
                            url: newFrameUrl,
                            skipNavigation: isUrlPdf(newFrame.url()),
                            label: newFrameUrl,
                        });
                    }
                }
                catch {
                    // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
                    // Handles browser page object been closed.
                }
            });
        };
        setPageListeners(page);
        let currentElementIndex = 0;
        let isAllElementsHandled = false;
        // Walk candidate elements one at a time; the element list is re-queried on
        // every iteration because clicks can mutate the DOM or navigate away.
        while (!isAllElementsHandled) {
            try {
                // navigate back to initial page if clicking on a element previously caused it to navigate to a new url
                if (page.url() != initialPageUrl) {
                    try {
                        await page.close();
                    }
                    catch {
                        // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
                        // Handles browser page object been closed.
                    }
                    page = await browserContext.newPage();
                    await page.goto(initialPageUrl, {
                        waitUntil: 'domcontentloaded',
                    });
                    setPageListeners(page);
                }
                const selectedElementsString = cssQuerySelectors.join(', ');
                const selectedElements = await page.$$(selectedElementsString);
                // edge case where there might be elements on page that appears intermittently
                if (currentElementIndex + 1 > selectedElements.length || !selectedElements) {
                    break;
                }
                // handle the last element in selectedElements
                if (currentElementIndex + 1 === selectedElements.length) {
                    isAllElementsHandled = true;
                }
                const element = selectedElements[currentElementIndex];
                currentElementIndex += 1;
                let newUrlFoundInElement = null;
                if (await element.isVisible()) {
                    // Find url in html elements without clicking them
                    await page
                        .evaluate(element => {
                        // find href attribute
                        const hrefUrl = element.getAttribute('href');
                        // find url in datapath
                        const dataPathUrl = element.getAttribute('data-path');
                        return hrefUrl || dataPathUrl;
                    }, element)
                        .then(result => {
                        if (result) {
                            newUrlFoundInElement = result;
                            const pageUrl = new URL(page.url());
                            const baseUrl = `${pageUrl.protocol}//${pageUrl.host}`;
                            let absoluteUrl;
                            // Construct absolute URL using base URL
                            try {
                                // Check if newUrlFoundInElement is a valid absolute URL
                                absoluteUrl = new URL(newUrlFoundInElement);
                            }
                            catch (e) {
                                // If it's not a valid URL, treat it as a relative URL
                                absoluteUrl = new URL(newUrlFoundInElement, baseUrl);
                            }
                            newUrlFoundInElement = absoluteUrl.href;
                        }
                    });
                    if (newUrlFoundInElement && !isExcluded(newUrlFoundInElement)) {
                        const newUrlFoundInElementUrl = newUrlFoundInElement.replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
                        await requestQueue.addRequest({
                            url: newUrlFoundInElementUrl,
                            skipNavigation: isUrlPdf(newUrlFoundInElement),
                            label: newUrlFoundInElementUrl,
                        });
                    }
                    else if (!newUrlFoundInElement) {
                        try {
                            const shouldSkip = await shouldSkipClickDueToDisallowedHref(page, element);
                            if (shouldSkip) {
                                const elementHtml = await page.evaluate(el => el.outerHTML, element);
                                consoleLogger.info('Skipping a click due to disallowed href nearby. Element HTML:', elementHtml);
                                continue;
                            }
                            // Find url in html elements by manually clicking them. New page navigation/popups will be handled by event listeners above
                            await element.click({ force: true });
                            await page.waitForTimeout(1000); // Add a delay of 1 second between each Element click
                        }
                        catch {
                            // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
                            // Handles browser page object been closed.
                        }
                    }
                }
            }
            catch {
                // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
                // Handles browser page object been closed.
            }
        }
    };
    // Set when the crawl is aborted (page limit / duration); used to suppress the
    // spurious browser-close errors that the abort itself causes.
    let isAbortingScanNow = false;
    const crawler = register(new crawlee.PlaywrightCrawler({
        launchContext: {
            launcher: constants.launcher,
            launchOptions: getPlaywrightLaunchOptions(browser),
            // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
            ...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
        },
        retryOnBlocked: true,
        browserPoolOptions: {
            useFingerprints: false,
            preLaunchHooks: [
                // Gives each launched browser its own unique profile subdirectory so
                // concurrent browsers do not contend over one user-data dir.
                async (_pageId, launchContext) => {
                    const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...
                    // Ensure base exists
                    await fsp.mkdir(baseDir, { recursive: true });
                    // Create a unique subdir per browser
                    const subProfileDir = path.join(baseDir, `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`);
                    await fsp.mkdir(subProfileDir, { recursive: true });
                    // Assign to Crawlee's launcher
                    launchContext.userDataDir = subProfileDir;
                    // Safely extend launchOptions
                    launchContext.launchOptions = {
                        ...launchContext.launchOptions,
                        ignoreHTTPSErrors: true,
                        ...playwrightDeviceDetailsObject,
                        ...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
                        ...(extraHTTPHeaders && { extraHTTPHeaders }),
                    };
                    // Optionally log for debugging
                    // console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
                },
            ],
        },
        requestQueue,
        postNavigationHooks: [
            // Waits for the DOM to go quiet (no mutations for 1s, hard cap 5s), then
            // re-enqueues the final URL if the navigation was redirected off-label.
            async (crawlingContext) => {
                const { page, request } = crawlingContext;
                await page.evaluate(() => {
                    return new Promise(resolve => {
                        let timeout;
                        let mutationCount = 0;
                        const MAX_MUTATIONS = 500; // stop if things never quiet down
                        const OBSERVER_TIMEOUT = 5000; // hard cap on total wait
                        const observer = new MutationObserver(() => {
                            clearTimeout(timeout);
                            mutationCount++;
                            if (mutationCount > MAX_MUTATIONS) {
                                observer.disconnect();
                                resolve('Too many mutations, exiting.');
                                return;
                            }
                            // restart quiet‑period timer
                            timeout = setTimeout(() => {
                                observer.disconnect();
                                resolve('DOM stabilized.');
                            }, 1000);
                        });
                        // overall timeout in case the page never settles
                        timeout = setTimeout(() => {
                            observer.disconnect();
                            resolve('Observer timeout reached.');
                        }, OBSERVER_TIMEOUT);
                        const root = document.documentElement || document.body || document;
                        // NOTE(review): observer.observe(root, ...) is never called here, so this
                        // promise only resolves via the OBSERVER_TIMEOUT path — confirm intent.
                        if (!root || typeof observer.observe !== 'function') {
                            resolve('No root node to observe.');
                        }
                    });
                });
                let finalUrl = page.url();
                const requestLabelUrl = request.label;
                // to handle scenario where the redirected link is not within the scanning website
                const isLoadedUrlFollowStrategy = isFollowStrategy(finalUrl, requestLabelUrl, strategy);
                if (!isLoadedUrlFollowStrategy) {
                    finalUrl = requestLabelUrl;
                }
                const isRedirected = !areLinksEqual(finalUrl, requestLabelUrl);
                if (isRedirected) {
                    await requestQueue.addRequest({ url: finalUrl, label: finalUrl });
                }
                else {
                    request.skipNavigation = false;
                }
            },
        ],
        requestHandlerTimeoutSecs: 90, // Allow each page to be processed by up from default 60 seconds
        // Main per-page handler: filters out skipped/blacklisted/redirected URLs,
        // runs the axe scan on HTML pages, hands PDFs to the download pipeline,
        // then enqueues further links.
        requestHandler: async ({ page, request, response, crawler, sendRequest, enqueueLinks }) => {
            const browserContext = page.context();
            try {
                await waitForPageLoaded(page, 10000);
                let actualUrl = page.url() || request.loadedUrl || request.url;
                if (page.url() !== 'about:blank') {
                    actualUrl = page.url();
                }
                if (!isFollowStrategy(url, actualUrl, strategy) &&
                    (isBlacklisted(actualUrl, blacklistedPatterns) || (isUrlPdf(actualUrl) && !isScanPdfs))) {
                    guiInfoLog(guiInfoStatusTypes.SKIPPED, {
                        numScanned: urlsCrawled.scanned.length,
                        urlScanned: actualUrl,
                    });
                    return;
                }
                // Abort the whole crawl once the page cap or time budget is exhausted.
                const hasExceededDuration = scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
                if (urlsCrawled.scanned.length >= maxRequestsPerCrawl || hasExceededDuration) {
                    if (hasExceededDuration) {
                        console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting website crawl.`);
                        durationExceeded = true;
                    }
                    isAbortingScanNow = true;
                    crawler.autoscaledPool.abort();
                    return;
                }
                // if URL has already been scanned
                if (urlsCrawled.scanned.some(item => item.url === request.url)) {
                    // await enqueueProcess(page, enqueueLinks, browserContext);
                    return;
                }
                if (isDisallowedInRobotsTxt(request.url)) {
                    await enqueueProcess(page, enqueueLinks, browserContext);
                    return;
                }
                // handle pdfs
                if (shouldSkipDueToUnsupportedContent(response, request.url) ||
                    (request.skipNavigation && actualUrl === 'about:blank')) {
                    if (!isScanPdfs) {
                        // Don't inform the user it is skipped since web crawler is best-effort.
                        /*
                        guiInfoLog(guiInfoStatusTypes.SKIPPED, {
                          numScanned: urlsCrawled.scanned.length,
                          urlScanned: request.url,
                        });
                        urlsCrawled.userExcluded.push({
                          url: request.url,
                          pageTitle: request.url,
                          actualUrl: request.url, // because about:blank is not useful
                          metadata: STATUS_CODE_METADATA[1],
                          httpStatusCode: 0,
                        });
                        */
                        return;
                    }
                    // NOTE(review): the destructured `url` here shadows the outer `url` parameter
                    // for the remainder of this branch.
                    const { pdfFileName, url } = handlePdfDownload(randomToken, pdfDownloads, request, sendRequest, urlsCrawled);
                    uuidToPdfMapping[pdfFileName] = url;
                    return;
                }
                if (isBlacklistedFileExtensions(actualUrl, blackListedFileExtensions)) {
                    // Don't inform the user it is skipped since web crawler is best-effort.
                    /*
                    guiInfoLog(guiInfoStatusTypes.SKIPPED, {
                      numScanned: urlsCrawled.scanned.length,
                      urlScanned: request.url,
                    });
                    urlsCrawled.userExcluded.push({
                      url: request.url,
                      pageTitle: request.url,
                      actualUrl, // because about:blank is not useful
                      metadata: STATUS_CODE_METADATA[1],
                      httpStatusCode: 0,
                    });
                    */
                    return;
                }
                if (!isFollowStrategy(url, actualUrl, strategy) &&
                    blacklistedPatterns &&
                    isSkippedUrl(actualUrl, blacklistedPatterns)) {
                    urlsCrawled.userExcluded.push({
                        url: request.url,
                        pageTitle: request.url,
                        actualUrl,
                        metadata: STATUS_CODE_METADATA[0],
                        httpStatusCode: 0,
                    });
                    guiInfoLog(guiInfoStatusTypes.SKIPPED, {
                        numScanned: urlsCrawled.scanned.length,
                        urlScanned: request.url,
                    });
                    await enqueueProcess(page, enqueueLinks, browserContext);
                    return;
                }
                if (isScanHtml) {
                    // For deduplication, if the URL is redirected, we want to store the original URL and the redirected URL (actualUrl)
                    const isRedirected = !areLinksEqual(actualUrl, request.url);
                    // check if redirected link is following strategy (same-domain/same-hostname)
                    const isLoadedUrlFollowStrategy = isFollowStrategy(actualUrl, request.url, strategy);
                    if (isRedirected && !isLoadedUrlFollowStrategy) {
                        urlsCrawled.notScannedRedirects.push({
                            fromUrl: request.url,
                            toUrl: actualUrl, // i.e. actualUrl
                        });
                        return;
                    }
                    const responseStatus = response?.status();
                    // Non-2xx responses are recorded as excluded with their status metadata.
                    if (responseStatus && responseStatus >= 300) {
                        guiInfoLog(guiInfoStatusTypes.SKIPPED, {
                            numScanned: urlsCrawled.scanned.length,
                            urlScanned: request.url,
                        });
                        urlsCrawled.userExcluded.push({
                            url: request.url,
                            pageTitle: request.url,
                            actualUrl,
                            metadata: STATUS_CODE_METADATA[responseStatus] || STATUS_CODE_METADATA[599],
                            httpStatusCode: responseStatus,
                        });
                        return;
                    }
                    const results = await runAxeScript({ includeScreenshots, page, randomToken, ruleset });
                    if (isRedirected) {
                        const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(item => (item.actualUrl || item.url) === actualUrl);
                        if (isLoadedUrlInCrawledUrls) {
                            urlsCrawled.notScannedRedirects.push({
                                fromUrl: request.url,
                                toUrl: actualUrl, // i.e. actualUrl
                            });
                            return;
                        }
                        // One more check if scanned pages have reached limit due to multi-instances of handler running
                        if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
                            guiInfoLog(guiInfoStatusTypes.SCANNED, {
                                numScanned: urlsCrawled.scanned.length,
                                urlScanned: request.url,
                            });
                            urlsCrawled.scanned.push({
                                url: request.url,
                                pageTitle: results.pageTitle,
                                actualUrl, // i.e. actualUrl
                            });
                            urlsCrawled.scannedRedirects.push({
                                fromUrl: request.url,
                                toUrl: actualUrl, // i.e. actualUrl
                            });
                            results.url = request.url;
                            results.actualUrl = actualUrl;
                            await dataset.pushData(results);
                        }
                    }
                    else {
                        // One more check if scanned pages have reached limit due to multi-instances of handler running
                        if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
                            guiInfoLog(guiInfoStatusTypes.SCANNED, {
                                numScanned: urlsCrawled.scanned.length,
                                urlScanned: request.url,
                            });
                            urlsCrawled.scanned.push({
                                url: request.url,
                                actualUrl: request.url,
                                pageTitle: results.pageTitle,
                            });
                            await dataset.pushData(results);
                        }
                    }
                }
                else {
                    // Don't inform the user it is skipped since web crawler is best-effort.
                    /*
                    guiInfoLog(guiInfoStatusTypes.SKIPPED, {
                      numScanned: urlsCrawled.scanned.length,
                      urlScanned: request.url,
                    });
                    urlsCrawled.userExcluded.push({
                      url: request.url,
                      pageTitle: request.url,
                      actualUrl, // because about:blank is not useful
                      metadata: STATUS_CODE_METADATA[1],
                      httpStatusCode: 0,
                    });
                    */
                }
                if (followRobots)
                    await getUrlsFromRobotsTxt(request.url, browser, userDataDirectory, extraHTTPHeaders);
                await enqueueProcess(page, enqueueLinks, browserContext);
            }
            catch (e) {
                try {
                    // For non-page.evaluate errors, retry on a fresh page and intercept the
                    // document request so its (cleaned) URL is re-enqueued instead.
                    if (!e.message.includes('page.evaluate')) {
                        // do nothing;
                        guiInfoLog(guiInfoStatusTypes.ERROR, {
                            numScanned: urlsCrawled.scanned.length,
                            urlScanned: request.url,
                        });
                        page = await browserContext.newPage();
                        await page.goto(request.url);
                        await page.route('**/*', async (route) => {
                            const interceptedRequest = route.request();
                            if (interceptedRequest.resourceType() === 'document') {
                                const interceptedRequestUrl = interceptedRequest
                                    .url()
                                    .replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
                                await requestQueue.addRequest({
                                    url: interceptedRequestUrl,
                                    skipNavigation: isUrlPdf(interceptedRequest.url()),
                                    label: interceptedRequestUrl,
                                });
                            }
                        });
                    }
                }
                catch {
                    // Do nothing since the error will be pushed
                }
                // when max pages have been scanned, scan will abort and all relevant pages still opened will close instantly.
                // a browser close error will then be flagged. Since this is an intended behaviour, this error will be excluded.
                if (!isAbortingScanNow) {
                    guiInfoLog(guiInfoStatusTypes.ERROR, {
                        numScanned: urlsCrawled.scanned.length,
                        urlScanned: request.url,
                    });
                    urlsCrawled.error.push({
                        url: request.url,
                        pageTitle: request.url,
                        actualUrl: request.url,
                        metadata: STATUS_CODE_METADATA[2],
                    });
                }
            }
        },
        // Records requests that failed all retries, mapping the HTTP status (if any)
        // to the closest known status metadata.
        failedRequestHandler: async ({ request, response }) => {
            guiInfoLog(guiInfoStatusTypes.ERROR, {
                numScanned: urlsCrawled.scanned.length,
                urlScanned: request.url,
            });
            const status = response?.status();
            const metadata = typeof status === 'number'
                ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
                : STATUS_CODE_METADATA[2];
            urlsCrawled.error.push({
                url: request.url,
                pageTitle: request.url,
                actualUrl: request.url,
                metadata,
                httpStatusCode: typeof status === 'number' ? status : 0,
            });
        },
        // The page cap is enforced manually in requestHandler (see above), so the
        // crawler itself is unbounded.
        maxRequestsPerCrawl: Infinity,
        maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
        ...(process.env.OOBEE_FAST_CRAWLER && {
            autoscaledPoolOptions: {
                minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
                maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
                desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
                scaleUpStepRatio: 0.99, // Scale up faster
                scaleDownStepRatio: 0.1, // Scale down slower
            },
        }),
    }));
    await crawler.run();
    if (pdfDownloads.length > 0) {
        // wait for pdf downloads to complete
        await Promise.all(pdfDownloads);
        // scan and process pdf documents
        await runPdfScan(randomToken);
        // transform result format
        const pdfResults = await mapPdfScanResults(randomToken, uuidToPdfMapping);
        // get screenshots from pdf docs
        if (includeScreenshots) {
            await Promise.all(pdfResults.map(async (result) => await doPdfScreenshots(randomToken, result)));
        }
        // push results for each pdf document to key value store
        await Promise.all(pdfResults.map(result => dataset.pushData(result)));
    }
    if (!fromCrawlIntelligentSitemap) {
        guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
    }
    if (scanDuration > 0) {
        const elapsed = Math.round((Date.now() - crawlStartTime) / 1000);
        console.log(`Crawl ended after ${elapsed}s. Limit: ${scanDuration}s.`);
    }
    return { urlsCrawled, durationExceeded };
};
export default crawlDomain;