@govtechsg/oobee 0.10.83 → 0.10.84
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -1
- package/dist/constants/common.js +13 -1
- package/dist/crawlers/crawlDomain.js +220 -120
- package/dist/crawlers/crawlIntelligentSitemap.js +22 -7
- package/dist/crawlers/runCustom.js +8 -2
- package/dist/mergeAxeResults/itemReferences.js +55 -0
- package/dist/mergeAxeResults/jsonArtifacts.js +335 -0
- package/dist/mergeAxeResults/scanPages.js +159 -0
- package/dist/mergeAxeResults/sentryTelemetry.js +152 -0
- package/dist/mergeAxeResults/types.js +1 -0
- package/dist/mergeAxeResults/writeCsv.js +125 -0
- package/dist/mergeAxeResults/writeScanDetailsCsv.js +35 -0
- package/dist/mergeAxeResults/writeSitemap.js +10 -0
- package/dist/mergeAxeResults.js +24 -929
- package/dist/proxyService.js +90 -5
- package/dist/utils.js +20 -7
- package/package.json +6 -6
- package/src/constants/common.ts +13 -1
- package/src/crawlers/crawlDomain.ts +248 -137
- package/src/crawlers/crawlIntelligentSitemap.ts +22 -8
- package/src/crawlers/runCustom.ts +10 -2
- package/src/mergeAxeResults/itemReferences.ts +62 -0
- package/src/mergeAxeResults/jsonArtifacts.ts +451 -0
- package/src/mergeAxeResults/scanPages.ts +207 -0
- package/src/mergeAxeResults/sentryTelemetry.ts +183 -0
- package/src/mergeAxeResults/types.ts +99 -0
- package/src/mergeAxeResults/writeCsv.ts +145 -0
- package/src/mergeAxeResults/writeScanDetailsCsv.ts +51 -0
- package/src/mergeAxeResults/writeSitemap.ts +13 -0
- package/src/mergeAxeResults.ts +82 -1318
- package/src/proxyService.ts +96 -4
- package/src/utils.ts +19 -7
package/README.md
CHANGED
@@ -90,6 +90,11 @@ verapdf --version
 | WARN_LEVEL | Only used in tests. | |
 | OOBEE_DISABLE_BROWSER_DOWNLOAD | Experimental flag to disable file downloads on Chrome/Chromium/Edge. Does not affect Local File scan | |
 | OOBEE_SLOWMO | Experimental flag to slow down web browser behaviour by specified duration (in milliseconds) | |
+| HTTP_PROXY | URL of the proxy server to be used for HTTP requests (e.g. `http://proxy.example.com:8080`). | |
+| HTTPS_PROXY | URL of the proxy server to be used for HTTPS requests (e.g. `https://proxy.example.com:8080`). | |
+| ALL_PROXY | URL of the proxy server to be used for all requests, typically used for SOCKS5 proxies (e.g. `socks5://proxy.example.com:1080`). Note: IPv6 direct connections may still continue even though a SOCKS5 proxy is specified, due to a known issue with Chrome/Chromium. (Recommended workaround is to turn off IPv6 at host level.) | |
+| NO_PROXY | Comma-separated list of domains that should bypass the proxy (e.g. `localhost,127.0.0.1,.example.com`). | |
+| INCLUDE_PROXY | Comma-separated list of domains that should specifically be routed through the proxy. | |
 
 #### Environment variables used internally (Do not set)
 
 Do not set these environment variables or behaviour might change unexpectedly.
@@ -677,4 +682,4 @@ It uses the existing report *.json files for the embedded HTML dataset.
 
 ```
 npx tsx dev/runGenerateJustHtmlReport.ts results/<report directory>
-```
+```
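The new proxy variables follow the common HTTP_PROXY/NO_PROXY convention. As a rough illustration of how a NO_PROXY list is typically matched against a request host (a minimal sketch of the convention only; this diff excerpt does not show oobee's actual proxyService matching logic):

```ts
// Hypothetical helper illustrating the usual NO_PROXY convention:
// an entry matches exactly, or, with a leading dot, matches any subdomain.
function shouldBypassProxy(host: string, noProxy: string | undefined): boolean {
  if (!noProxy) return false;
  const target = host.toLowerCase();
  return noProxy
    .split(',')
    .map(entry => entry.trim().toLowerCase())
    .filter(entry => entry.length > 0)
    .some(entry =>
      entry.startsWith('.') ? target.endsWith(entry) : target === entry,
    );
}

shouldBypassProxy('api.example.com', 'localhost,127.0.0.1,.example.com'); // true
shouldBypassProxy('example.org', 'localhost,127.0.0.1,.example.com');    // false
```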
package/dist/constants/common.js
CHANGED
@@ -364,7 +364,19 @@ const checkUrlConnectivityWithBrowser = async (url, browserToRun, clonedDataDir,
     });
     if (!response)
         throw new Error('No response from navigation');
-    //
+    // Wait briefly for JS/meta-refresh redirects to settle before reading the final URL.
+    // Server-side redirects are already reflected after goto(), but client-side redirects
+    // (e.g. domain.tld -> www.domain.tld via JS or meta-refresh) need extra time.
+    try {
+        await Promise.race([
+            page.waitForURL(currentUrl => currentUrl !== url, { timeout: 5000 }),
+            new Promise(resolve => setTimeout(resolve, 1000)), // minimum settle time
+        ]);
+    }
+    catch {
+        // No redirect happened within the window — that's fine, continue with current URL
+    }
+    // Re-read page.url() AFTER potential client-side redirects have resolved
     const finalUrl = page.url();
     const finalStatus = response.status();
     const headers = response.headers();
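The `Promise.race` above settles as soon as either the URL changes or the one-second timer fires, so the connectivity check waits at most about a second for a client-side redirect; the 5-second timeout only bounds `waitForURL` itself. A standalone sketch of the same pattern (the function name is illustrative; note that Playwright passes a `URL` object to the `waitForURL` predicate, so this sketch compares its string form):

```ts
import type { Page } from 'playwright';

// Minimal sketch of the redirect-settle pattern, assuming a Playwright Page.
// The race resolves with whichever finishes first: waitForURL (URL changed)
// or the 1 s timer (no change observed in time).
async function settleClientSideRedirects(page: Page, originalUrl: string): Promise<string> {
  try {
    await Promise.race([
      page.waitForURL(currentUrl => currentUrl.toString() !== originalUrl, { timeout: 5000 }),
      new Promise(resolve => setTimeout(resolve, 1000)), // upper bound on the wait
    ]);
  } catch {
    // Defensive: waitForURL can reject (e.g. the page closed); continue regardless.
  }
  return page.url(); // final URL after any client-side redirect
}
```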
package/dist/crawlers/crawlDomain.js
CHANGED

@@ -24,7 +24,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
     const crawlStartTime = Date.now();
     let dataset;
     let urlsCrawled;
-
+    const { requestQueue } = await createCrawleeSubFolders(randomToken);
     let durationExceeded = false;
     if (fromCrawlIntelligentSitemap) {
         dataset = datasetFromIntelligent;
@@ -34,65 +34,41 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
         ({ dataset } = await createCrawleeSubFolders(randomToken));
         urlsCrawled = { ...constants.urlsCrawledObj };
     }
-    ({ requestQueue } = await createCrawleeSubFolders(randomToken));
     const pdfDownloads = [];
     const uuidToPdfMapping = {};
+    const queuedUrlSet = new Set();
+    const scannedUrlSet = new Set(urlsCrawled.scanned.map(item => item.url));
+    const scannedResolvedUrlSet = new Set(urlsCrawled.scanned.map(item => item.actualUrl || item.url));
     const isScanHtml = [FileTypes.All, FileTypes.HtmlOnly].includes(fileTypes);
     const isScanPdfs = [FileTypes.All, FileTypes.PdfOnly].includes(fileTypes);
     const { maxConcurrency } = constants;
     const { playwrightDeviceDetailsObject } = viewportSettings;
-
-    url
-
-
-
-    const enqueueProcess = async (page, enqueueLinks, browserContext) => {
+    const enqueueUniqueRequest = async ({ url, skipNavigation, label, }) => {
+        if (queuedUrlSet.has(url)) {
+            return;
+        }
+        queuedUrlSet.add(url);
         try {
-            await
-
-
-
-                requestQueue,
-                transformRequestFunction: (req) => {
-                    try {
-                        req.url = req.url.replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
-                    }
-                    catch (e) {
-                        consoleLogger.error(e);
-                    }
-                    if (urlsCrawled.scanned.some(item => item.url === req.url)) {
-                        req.skipNavigation = true;
-                    }
-                    if (isDisallowedInRobotsTxt(req.url))
-                        return null;
-                    if (isBlacklisted(req.url, blacklistedPatterns))
-                        return null;
-                    if (isUrlPdf(req.url)) {
-                        // playwright headless mode does not support navigation to pdf document
-                        req.skipNavigation = true;
-                    }
-                    req.label = req.url;
-                    return req;
-                },
+            await requestQueue.addRequest({
+                url,
+                skipNavigation,
+                label,
             });
-            // If safeMode flag is enabled, skip enqueueLinksByClickingElements
-            if (!safeMode) {
-                // Try catch is necessary as clicking links is best effort, it may result in new pages that cause browser load or navigation errors that PlaywrightCrawler does not handle
-                try {
-                    await customEnqueueLinksByClickingElements(page, browserContext);
-                }
-                catch (e) {
-                    // do nothing;
-                }
-            }
         }
-        catch {
-
-
+        catch (error) {
+            queuedUrlSet.delete(url);
+            throw error;
         }
     };
-
-
+    await enqueueUniqueRequest({
+        url,
+        skipNavigation: isUrlPdf(url),
+        label: url,
+    });
+    const customEnqueueLinksByClickingElements = async (currentPage, browserContext) => {
+        let workingPage = currentPage;
+        const initialPageUrl = workingPage.url().toString();
+        const selectedElementsString = cssQuerySelectors.join(', ');
     const isExcluded = (newPageUrl) => {
         const isAlreadyScanned = urlsCrawled.scanned.some(item => item.url === newPageUrl);
         const isBlacklistedUrl = isBlacklisted(newPageUrl, blacklistedPatterns);
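The new `enqueueUniqueRequest` wraps `requestQueue.addRequest` with a local `Set` so each URL is queued at most once, and rolls the reservation back if the add fails. The same pattern in isolation (a minimal sketch; `addRequest` here stands in for any async enqueue sink):

```ts
// Dedupe-on-enqueue: the Set entry is added before the await so concurrent
// callers see the URL as taken, and removed on failure so a transient error
// does not permanently block the URL.
const queued = new Set<string>();

async function enqueueOnce(
  url: string,
  addRequest: (url: string) => Promise<void>,
): Promise<void> {
  if (queued.has(url)) return; // already queued (or currently being queued)
  queued.add(url);             // reserve before awaiting to avoid races
  try {
    await addRequest(url);
  } catch (error) {
    queued.delete(url);        // roll back so a retry can re-enqueue
    throw error;
  }
}
```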
@@ -100,13 +76,13 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
         const isNotSupportedDocument = disallowedListOfPatterns.some(pattern => newPageUrl.toLowerCase().startsWith(pattern));
         return isNotSupportedDocument || isAlreadyScanned || isBlacklistedUrl || isNotFollowStrategy;
     };
-    const setPageListeners = (
+    const setPageListeners = (pageListener) => {
         // event listener to handle new page popups upon button click
-
+        pageListener.on('popup', async (newPage) => {
             try {
-                if (newPage.url()
+                if (newPage.url() !== initialPageUrl && !isExcluded(newPage.url())) {
                     const newPageUrl = newPage.url().replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
-                    await
+                    await enqueueUniqueRequest({
                         url: newPageUrl,
                         skipNavigation: isUrlPdf(newPage.url()),
                         label: newPageUrl,
@@ -128,13 +104,13 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
             }
         });
         // event listener to handle navigation to new url within same page upon element click
-
+        pageListener.on('framenavigated', async (newFrame) => {
             try {
                 if (newFrame.url() !== initialPageUrl &&
                     !isExcluded(newFrame.url()) &&
-                    !(newFrame.url()
+                    !(newFrame.url() === 'about:blank')) {
                     const newFrameUrl = newFrame.url().replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
-                    await
+                    await enqueueUniqueRequest({
                         url: newFrameUrl,
                         skipNavigation: isUrlPdf(newFrame.url()),
                         label: newFrameUrl,
@@ -147,28 +123,32 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
             }
         });
     };
-    setPageListeners(
+    setPageListeners(workingPage);
     let currentElementIndex = 0;
     let isAllElementsHandled = false;
+    // This loop is intentionally sequential because each step depends on the latest page state
+    // (navigation, popup/frame events, and potential page recreation).
+    // Running iterations in parallel (for example with Promise.all) would race on shared `page`
+    // state, causing stale element handles and nondeterministic enqueue/navigation behavior.
+    /* eslint-disable no-await-in-loop */
     while (!isAllElementsHandled) {
         try {
             // navigate back to initial page if clicking on a element previously caused it to navigate to a new url
-            if (
+            if (workingPage.url() !== initialPageUrl) {
                 try {
-                    await
+                    await workingPage.close();
                 }
                 catch {
                     // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
                     // Handles browser page object been closed.
                 }
-
-                await
+                workingPage = await browserContext.newPage();
+                await workingPage.goto(initialPageUrl, {
                     waitUntil: 'domcontentloaded',
                 });
-                setPageListeners(
+                setPageListeners(workingPage);
             }
-            const
-            const selectedElements = await page.$$(selectedElementsString);
+            const selectedElements = await workingPage.$$(selectedElementsString);
             // edge case where there might be elements on page that appears intermittently
             if (currentElementIndex + 1 > selectedElements.length || !selectedElements) {
                 break;
@@ -181,36 +161,34 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
             currentElementIndex += 1;
             let newUrlFoundInElement = null;
             if (await element.isVisible()) {
+                const currentPageUrl = workingPage.url();
                 // Find url in html elements without clicking them
-                await
-                    .evaluate(element => {
+                const result = await workingPage.evaluate(pageElement => {
                     // find href attribute
-                    const hrefUrl =
+                    const hrefUrl = pageElement.getAttribute('href');
                     // find url in datapath
-                    const dataPathUrl =
+                    const dataPathUrl = pageElement.getAttribute('data-path');
                     return hrefUrl || dataPathUrl;
-                }, element)
-
-
-
-
-
-
-
-
-
-                    absoluteUrl = new URL(newUrlFoundInElement);
-                }
-                catch (e) {
-                    // If it's not a valid URL, treat it as a relative URL
-                    absoluteUrl = new URL(newUrlFoundInElement, baseUrl);
-                }
-                newUrlFoundInElement = absoluteUrl.href;
+                }, element);
+                if (result) {
+                    newUrlFoundInElement = result;
+                    const pageUrl = new URL(currentPageUrl);
+                    const baseUrl = `${pageUrl.protocol}//${pageUrl.host}`;
+                    let absoluteUrl;
+                    // Construct absolute URL using base URL
+                    try {
+                        // Check if newUrlFoundInElement is a valid absolute URL
+                        absoluteUrl = new URL(newUrlFoundInElement);
                     }
-
+                    catch {
+                        // If it's not a valid URL, treat it as a relative URL
+                        absoluteUrl = new URL(newUrlFoundInElement, baseUrl);
+                    }
+                    newUrlFoundInElement = absoluteUrl.href;
+                }
                 if (newUrlFoundInElement && !isExcluded(newUrlFoundInElement)) {
                     const newUrlFoundInElementUrl = newUrlFoundInElement.replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
-                    await
+                    await enqueueUniqueRequest({
                         url: newUrlFoundInElementUrl,
                         skipNavigation: isUrlPdf(newUrlFoundInElement),
                         label: newUrlFoundInElementUrl,
@@ -218,15 +196,16 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
                 }
                 else if (!newUrlFoundInElement) {
                     try {
-                        const shouldSkip = await shouldSkipClickDueToDisallowedHref(
+                        const shouldSkip = await shouldSkipClickDueToDisallowedHref(workingPage, element);
                         if (shouldSkip) {
-                            const elementHtml = await
+                            const elementHtml = await workingPage.evaluate(el => el.outerHTML, element);
                             consoleLogger.info('Skipping a click due to disallowed href nearby. Element HTML:', elementHtml);
-                            continue;
                         }
-
-
-
+                        else {
+                            // Find url in html elements by manually clicking them. New page navigation/popups will be handled by event listeners above
+                            await element.click({ force: true });
+                            await workingPage.waitForTimeout(1000); // Add a delay of 1 second between each Element click
+                        }
                     }
                     catch {
                         // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
@@ -240,6 +219,61 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
                         // Handles browser page object been closed.
                     }
                 }
+        /* eslint-enable no-await-in-loop */
+    };
+    const enqueueProcess = async (page, enqueueLinks, browserContext) => {
+        try {
+            await enqueueLinks({
+                // set selector matches anchor elements with href but not contains # or starting with mailto:
+                selector: `a:not(${disallowedSelectorPatterns})`,
+                strategy,
+                requestQueue,
+                transformRequestFunction: (req) => {
+                    try {
+                        req.url = req.url.replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
+                    }
+                    catch (e) {
+                        consoleLogger.error(e);
+                    }
+                    if (scannedUrlSet.has(req.url)) {
+                        req.skipNavigation = true;
+                    }
+                    if (isDisallowedInRobotsTxt(req.url))
+                        return null;
+                    if (isBlacklisted(req.url, blacklistedPatterns))
+                        return null;
+                    if (isUrlPdf(req.url)) {
+                        // playwright headless mode does not support navigation to pdf document
+                        req.skipNavigation = true;
+                    }
+                    req.label = req.url;
+                    return req;
+                },
+            });
+            // If safeMode flag is enabled, skip enqueueLinksByClickingElements
+            if (!safeMode) {
+                // Only run the expensive element-clicking discovery on pages sharing the
+                // same hostname as the seed URL. Cross-subdomain pages (reachable via
+                // same-domain strategy) still contribute their <a> links above, but
+                // clicking every interactive element on them is too slow and starves
+                // the crawler of time to discover pages on the primary hostname.
+                const currentHostname = new URL(page.url()).hostname;
+                const seedHostname = new URL(url).hostname;
+                if (currentHostname === seedHostname) {
+                    // Try catch is necessary as clicking links is best effort, it may result in new pages that cause browser load or navigation errors that PlaywrightCrawler does not handle
+                    try {
+                        await customEnqueueLinksByClickingElements(page, browserContext);
+                    }
+                    catch {
+                        // do nothing;
+                    }
+                }
+            }
+        }
+        catch {
+            // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
+            // Handles browser page object been closed.
+        }
     };
     let isAbortingScanNow = false;
     const crawler = register(new crawlee.PlaywrightCrawler({
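Inside `enqueueProcess`, `transformRequestFunction` strips UTM tracking parameters from every discovered link before queueing. The regex it uses, shown standalone with its effect on two sample URLs:

```ts
// The lookbehind keeps the '&' or '?' delimiter in place while removing the
// utm_* parameter and, lazily, everything up to the next '&' or end of string.
const stripUtm = (u: string): string => u.replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');

stripUtm('https://example.com/?utm_source=x&page=2'); // 'https://example.com/?page=2'
stripUtm('https://example.com/?page=2&utm_medium=y'); // 'https://example.com/?page=2&' (delimiter before utm_ is kept)
```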
@@ -261,8 +295,11 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
             const subProfileDir = path.join(baseDir, `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`);
             await fsp.mkdir(subProfileDir, { recursive: true });
             // Assign to Crawlee's launcher
+            // Crawlee preLaunchHooks expects launchContext to be mutated in-place.
+            // eslint-disable-next-line no-param-reassign
             launchContext.userDataDir = subProfileDir;
             // Safely extend launchOptions
+            // eslint-disable-next-line no-param-reassign
             launchContext.launchOptions = {
                 ...launchContext.launchOptions,
                 ignoreHTTPSErrors: true,
@@ -287,7 +324,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
                     const OBSERVER_TIMEOUT = 5000; // hard cap on total wait
                     const observer = new MutationObserver(() => {
                         clearTimeout(timeout);
-                        mutationCount
+                        mutationCount += 1;
                         if (mutationCount > MAX_MUTATIONS) {
                             observer.disconnect();
                             resolve('Too many mutations, exiting.');
@@ -308,6 +345,9 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
                     if (!root || typeof observer.observe !== 'function') {
                         resolve('No root node to observe.');
                     }
+                    else {
+                        observer.observe(root, { childList: true, subtree: true });
+                    }
                 });
             });
             let finalUrl = page.url();
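The `MutationObserver` here implements a debounced quiet-period wait with two escape hatches: a cap on total mutations and a hard timeout. A self-contained sketch of the pattern, meant to run inside the page (e.g. via `page.evaluate`); only the 5000 ms hard cap and the "Too many mutations" message come from the hunk, the other defaults and names are illustrative:

```ts
// Resolve once the DOM stays quiet for `quietMs`, bail out after too many
// mutations, and never wait longer than `hardCapMs` overall.
function waitForDomQuiet(
  root: Node,
  quietMs = 500,      // illustrative
  maxMutations = 250, // illustrative
  hardCapMs = 5000,   // OBSERVER_TIMEOUT in the hunk
): Promise<string> {
  return new Promise(resolve => {
    let mutationCount = 0;
    const done = (msg: string) => {
      observer.disconnect();
      clearTimeout(quietTimer);
      clearTimeout(hardCapTimer);
      resolve(msg);
    };
    const hardCapTimer = setTimeout(() => done('Observer timed out.'), hardCapMs);
    let quietTimer = setTimeout(() => done('DOM is quiet.'), quietMs);
    const observer = new MutationObserver(() => {
      clearTimeout(quietTimer);
      mutationCount += 1;
      if (mutationCount > maxMutations) {
        done('Too many mutations, exiting.');
        return;
      }
      quietTimer = setTimeout(() => done('DOM is quiet.'), quietMs); // restart quiet window
    });
    observer.observe(root, { childList: true, subtree: true });
  });
}
```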
@@ -319,7 +359,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
             }
             const isRedirected = !areLinksEqual(finalUrl, requestLabelUrl);
             if (isRedirected) {
-                await
+                await enqueueUniqueRequest({ url: finalUrl, label: finalUrl });
             }
             else {
                 request.skipNavigation = false;
@@ -327,7 +367,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
         },
     ],
     requestHandlerTimeoutSecs: 90, // Allow each page to be processed by up from default 60 seconds
-    requestHandler: async ({ page, request, response, crawler, sendRequest, enqueueLinks }) => {
+    requestHandler: async ({ page, request, response, crawler: activeCrawler, sendRequest, enqueueLinks, }) => {
         const browserContext = page.context();
         try {
             await waitForPageLoaded(page, 10000);
@@ -335,6 +375,11 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
             if (page.url() !== 'about:blank') {
                 actualUrl = page.url();
             }
+            // Second-pass requests: only do click-discovery, skip scanning
+            if (request.label?.startsWith('__clickpass__')) {
+                await enqueueProcess(page, enqueueLinks, browserContext);
+                return;
+            }
             if (!isFollowStrategy(url, actualUrl, strategy) &&
                 (isBlacklisted(actualUrl, blacklistedPatterns) || (isUrlPdf(actualUrl) && !isScanPdfs))) {
                 guiInfoLog(guiInfoStatusTypes.SKIPPED, {
@@ -350,12 +395,12 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
                     durationExceeded = true;
                 }
                 isAbortingScanNow = true;
-
+                activeCrawler.autoscaledPool.abort();
                 return;
             }
             // if URL has already been scanned
-            if (
-
+            if (scannedUrlSet.has(request.url)) {
+                await enqueueProcess(page, enqueueLinks, browserContext);
                 return;
             }
             if (isDisallowedInRobotsTxt(request.url)) {
@@ -382,8 +427,8 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
                  */
                 return;
             }
-            const { pdfFileName, url } = handlePdfDownload(randomToken, pdfDownloads, request, sendRequest, urlsCrawled);
-            uuidToPdfMapping[pdfFileName] =
+            const { pdfFileName, url: downloadedPdfUrl } = handlePdfDownload(randomToken, pdfDownloads, request, sendRequest, urlsCrawled);
+            uuidToPdfMapping[pdfFileName] = downloadedPdfUrl;
             return;
         }
         if (isBlacklistedFileExtensions(actualUrl, blackListedFileExtensions)) {
@@ -449,7 +494,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
             }
             const results = await runAxeScript({ includeScreenshots, page, randomToken, ruleset });
             if (isRedirected) {
-                const isLoadedUrlInCrawledUrls =
+                const isLoadedUrlInCrawledUrls = scannedResolvedUrlSet.has(actualUrl);
                 if (isLoadedUrlInCrawledUrls) {
                     urlsCrawled.notScannedRedirects.push({
                         fromUrl: request.url,
@@ -468,6 +513,8 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
                         pageTitle: results.pageTitle,
                         actualUrl, // i.e. actualUrl
                     });
+                    scannedUrlSet.add(request.url);
+                    scannedResolvedUrlSet.add(actualUrl);
                     urlsCrawled.scannedRedirects.push({
                         fromUrl: request.url,
                         toUrl: actualUrl, // i.e. actualUrl
@@ -477,20 +524,20 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
                     await dataset.pushData(results);
                 }
             }
-            else {
+            else if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
                 // One more check if scanned pages have reached limit due to multi-instances of handler running
-
-
-
-
-
-
-
-
-
-
-
-
+                guiInfoLog(guiInfoStatusTypes.SCANNED, {
+                    numScanned: urlsCrawled.scanned.length,
+                    urlScanned: request.url,
+                });
+                urlsCrawled.scanned.push({
+                    url: request.url,
+                    actualUrl: request.url,
+                    pageTitle: results.pageTitle,
+                });
+                scannedUrlSet.add(request.url);
+                scannedResolvedUrlSet.add(request.url);
+                await dataset.pushData(results);
             }
         }
         else {
@@ -521,15 +568,15 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
                 numScanned: urlsCrawled.scanned.length,
                 urlScanned: request.url,
             });
-
-            await
-            await
+            const recoveryPage = await browserContext.newPage();
+            await recoveryPage.goto(request.url);
+            await recoveryPage.route('**/*', async (route) => {
                 const interceptedRequest = route.request();
                 if (interceptedRequest.resourceType() === 'document') {
                     const interceptedRequestUrl = interceptedRequest
                         .url()
                         .replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
-                    await
+                    await enqueueUniqueRequest({
                         url: interceptedRequestUrl,
                         skipNavigation: isUrlPdf(interceptedRequest.url()),
                         label: interceptedRequestUrl,
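For reference, the usual shape of Playwright's document-interception pattern used on the recovery page above (a minimal sketch; `handleDocument` stands in for the enqueue call, and since this excerpt does not show how the route is resolved afterwards, the `route.continue()` here is an assumption):

```ts
import type { Page } from 'playwright';

// Intercept all requests on a page and act only on top-level document
// navigations, letting everything else proceed untouched.
async function interceptDocuments(page: Page, handleDocument: (url: string) => Promise<void>) {
  await page.route('**/*', async route => {
    const req = route.request();
    if (req.resourceType() === 'document') {
      const cleanUrl = req.url().replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
      await handleDocument(cleanUrl);
    }
    await route.continue(); // assumption: let the request proceed either way
  });
}
```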
@@ -587,6 +634,59 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
         }),
     }));
     await crawler.run();
+    // Additional passes: keep re-visiting scanned seed-hostname pages for
+    // click-discovery until no new pages are found or limits are reached.
+    if (!safeMode && !isAbortingScanNow && !durationExceeded) {
+        const seedHostname = new URL(url).hostname;
+        const clickPassVisited = new Set();
+        let prevScannedCount;
+        do {
+            prevScannedCount = urlsCrawled.scanned.length;
+            if (prevScannedCount >= maxRequestsPerCrawl)
+                break;
+            if (scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000)
+                break;
+            const seedHostnamePages = urlsCrawled.scanned
+                .map(item => item.actualUrl || item.url)
+                .filter(pageUrl => {
+                    try {
+                        return new URL(pageUrl).hostname === seedHostname && !clickPassVisited.has(pageUrl);
+                    }
+                    catch {
+                        return false;
+                    }
+                });
+            if (seedHostnamePages.length === 0)
+                break;
+            let enqueued = 0;
+            for (const pageUrl of seedHostnamePages) {
+                if (urlsCrawled.scanned.length >= maxRequestsPerCrawl)
+                    break;
+                if (scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000)
+                    break;
+                clickPassVisited.add(pageUrl);
+                try {
+                    const clickPassLabel = `__clickpass__${pageUrl}`;
+                    if (!queuedUrlSet.has(clickPassLabel)) {
+                        queuedUrlSet.add(clickPassLabel);
+                        await requestQueue.addRequest({
+                            url: pageUrl,
+                            label: clickPassLabel,
+                            skipNavigation: false,
+                        });
+                        enqueued += 1;
+                    }
+                }
+                catch {
+                    // ignore enqueue errors
+                }
+            }
+            if (enqueued === 0)
+                break;
+            await crawler.run();
+            // Stop looping if no new pages were discovered in this pass
+        } while (urlsCrawled.scanned.length > prevScannedCount);
+    }
     if (pdfDownloads.length > 0) {
         // wait for pdf downloads to complete
         await Promise.all(pdfDownloads);
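The block above is a fixed-point iteration: each pass re-enqueues scanned seed-hostname pages under a `__clickpass__` label, reruns the crawler, and stops once a pass discovers nothing new. Its control-flow skeleton, reduced to the essentials (names are illustrative; the real code also bounds each pass by `maxRequestsPerCrawl` and `scanDuration`, for which a simple pass cap stands in here):

```ts
// Run passes until no progress is made, with a safety cap on pass count.
async function runUntilFixedPoint(
  getDiscoveredCount: () => number,
  runPass: () => Promise<void>,
  maxPasses = 10, // illustrative safety cap
): Promise<void> {
  for (let pass = 0; pass < maxPasses; pass += 1) {
    const before = getDiscoveredCount();
    await runPass();
    if (getDiscoveredCount() <= before) break; // nothing new; fixed point reached
  }
}
```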
@@ -596,7 +696,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
         const pdfResults = await mapPdfScanResults(randomToken, uuidToPdfMapping);
         // get screenshots from pdf docs
         if (includeScreenshots) {
-            await Promise.all(pdfResults.map(
+            await Promise.all(pdfResults.map(result => doPdfScreenshots(randomToken, result)));
         }
         // push results for each pdf document to key value store
         await Promise.all(pdfResults.map(result => dataset.pushData(result)));
package/dist/crawlers/crawlIntelligentSitemap.js
CHANGED

@@ -22,13 +22,25 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
     async function findSitemap(link, userDataDirectory, extraHTTPHeaders) {
         const homeUrl = getHomeUrl(link);
         let sitemapLink = '';
-        const
-
-
-
-
-
-
+        const launchOptions = getPlaywrightLaunchOptions(browser);
+        let context;
+        let browserInstance;
+        if (process.env.CRAWLEE_HEADLESS === '1') {
+            const effectiveUserDataDirectory = userDataDirectory || '';
+            context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
+                ...launchOptions,
+                ...(extraHTTPHeaders && { extraHTTPHeaders }),
+            });
+            register(context);
+        }
+        else {
+            // In headful mode, avoid launchPersistentContext to prevent "Browser window not found"
+            browserInstance = await constants.launcher.launch(launchOptions);
+            register(browserInstance);
+            context = await browserInstance.newContext({
+                ...(extraHTTPHeaders && { extraHTTPHeaders }),
+            });
+        }
         const page = await context.newPage();
         for (const path of sitemapPaths) {
             sitemapLink = homeUrl + path;
@@ -39,6 +51,9 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
         }
         await page.close();
         await context.close().catch(() => { });
+        if (browserInstance) {
+            await browserInstance.close().catch(() => { });
+        }
         return sitemapExist ? sitemapLink : '';
     }
     const checkUrlExists = async (page, parsedUrl) => {
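A condensed sketch of the branch above, using plain Playwright `chromium` in place of oobee's `constants.launcher` (an assumption for self-containment): a persistent context keeps the user data directory in headless mode, while headful mode uses a regular browser plus context to avoid the "Browser window not found" issue the comment mentions. The second hunk above closes `browserInstance` only when one was created, mirroring this split.

```ts
import { chromium, Browser, BrowserContext } from 'playwright';

// Headless: persistent context bound to a user data directory.
// Headful: regular launch + newContext; browserInstance must be closed later.
async function makeContext(userDataDir: string | undefined): Promise<{
  context: BrowserContext;
  browserInstance?: Browser;
}> {
  if (process.env.CRAWLEE_HEADLESS === '1') {
    const context = await chromium.launchPersistentContext(userDataDir || '', { headless: true });
    return { context };
  }
  const browserInstance = await chromium.launch({ headless: false });
  const context = await browserInstance.newContext();
  return { context, browserInstance };
}
```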
package/dist/crawlers/runCustom.js
CHANGED

@@ -6,6 +6,7 @@ import constants, { getIntermediateScreenshotsPath, guiInfoStatusTypes, } from '
 import { initNewPage, log } from './custom/utils.js';
 import { guiInfoLog } from '../logs.js';
 import { addUrlGuardScript } from './guards/urlGuard.js';
+import { getPlaywrightLaunchOptions } from '../constants/common.js';
 // Export of classes
 export class ProcessPageParams {
     constructor(scannedIdx, blacklistedPatterns, includeScreenshots, dataset, intermediateScreenshotsPath, urlsCrawled, randomToken) {
@@ -34,11 +35,16 @@ const runCustom = async (url, randomToken, viewportSettings, blacklistedPatterns
     try {
         const deviceConfig = viewportSettings.playwrightDeviceDetailsObject;
         const hasCustomViewport = !!deviceConfig;
+        const baseLaunchOptions = getPlaywrightLaunchOptions('chrome');
+        // Merge base args with custom flow specific args
+        const baseArgs = baseLaunchOptions.args || [];
+        const customArgs = hasCustomViewport ? ['--window-size=1920,1040'] : ['--start-maximized'];
+        const mergedArgs = [...baseArgs.filter(a => !a.startsWith('--window-size') && a !== '--start-maximized'), ...customArgs];
         const browser = await chromium.launch({
-
+            ...baseLaunchOptions,
+            args: mergedArgs,
             headless: false,
             channel: 'chrome',
-            // bypassCSP: true,
         });
         const context = await browser.newContext({
             ignoreHTTPSErrors: true,
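The merge above keeps every base launch arg except window-size flags, then appends the flow-specific flag, so the custom flow's sizing always wins. Worked through with an illustrative base (the `baseArgs` values below are made up; in the hunk they come from `getPlaywrightLaunchOptions('chrome')`):

```ts
// Filter out any conflicting sizing flags from the base, then append ours.
const baseArgs: string[] = ['--disable-gpu', '--window-size=800,600']; // illustrative base
const hasCustomViewport = true;

const customArgs = hasCustomViewport ? ['--window-size=1920,1040'] : ['--start-maximized'];
const mergedArgs = [
  ...baseArgs.filter(a => !a.startsWith('--window-size') && a !== '--start-maximized'),
  ...customArgs,
];
// mergedArgs: ['--disable-gpu', '--window-size=1920,1040']
```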