@govtechsg/oobee 0.10.83 → 0.10.84
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -1
- package/dist/constants/common.js +13 -1
- package/dist/crawlers/crawlDomain.js +220 -120
- package/dist/crawlers/crawlIntelligentSitemap.js +22 -7
- package/dist/crawlers/runCustom.js +8 -2
- package/dist/mergeAxeResults/itemReferences.js +55 -0
- package/dist/mergeAxeResults/jsonArtifacts.js +335 -0
- package/dist/mergeAxeResults/scanPages.js +159 -0
- package/dist/mergeAxeResults/sentryTelemetry.js +152 -0
- package/dist/mergeAxeResults/types.js +1 -0
- package/dist/mergeAxeResults/writeCsv.js +125 -0
- package/dist/mergeAxeResults/writeScanDetailsCsv.js +35 -0
- package/dist/mergeAxeResults/writeSitemap.js +10 -0
- package/dist/mergeAxeResults.js +24 -929
- package/dist/proxyService.js +90 -5
- package/dist/utils.js +20 -7
- package/package.json +6 -6
- package/src/constants/common.ts +13 -1
- package/src/crawlers/crawlDomain.ts +248 -137
- package/src/crawlers/crawlIntelligentSitemap.ts +22 -8
- package/src/crawlers/runCustom.ts +10 -2
- package/src/mergeAxeResults/itemReferences.ts +62 -0
- package/src/mergeAxeResults/jsonArtifacts.ts +451 -0
- package/src/mergeAxeResults/scanPages.ts +207 -0
- package/src/mergeAxeResults/sentryTelemetry.ts +183 -0
- package/src/mergeAxeResults/types.ts +99 -0
- package/src/mergeAxeResults/writeCsv.ts +145 -0
- package/src/mergeAxeResults/writeScanDetailsCsv.ts +51 -0
- package/src/mergeAxeResults/writeSitemap.ts +13 -0
- package/src/mergeAxeResults.ts +82 -1318
- package/src/proxyService.ts +96 -4
- package/src/utils.ts +19 -7
|
@@ -1,9 +1,6 @@
|
|
|
1
1
|
import crawlee, { EnqueueStrategy } from 'crawlee';
|
|
2
|
-
import fs from 'fs';
|
|
3
2
|
import type { BrowserContext, ElementHandle, Frame, Page } from 'playwright';
|
|
4
|
-
import type {
|
|
5
|
-
import https from 'https';
|
|
6
|
-
import type { BatchAddRequestsResult } from '@crawlee/types';
|
|
3
|
+
import type { PlaywrightCrawlingContext, RequestOptions } from 'crawlee';
|
|
7
4
|
import * as path from 'path';
|
|
8
5
|
import fsp from 'fs/promises';
|
|
9
6
|
import {
|
|
@@ -39,7 +36,7 @@ import {
|
|
|
39
36
|
mapPdfScanResults,
|
|
40
37
|
doPdfScreenshots,
|
|
41
38
|
} from './pdfScanFunc.js';
|
|
42
|
-
import { consoleLogger, guiInfoLog
|
|
39
|
+
import { consoleLogger, guiInfoLog } from '../logs.js';
|
|
43
40
|
import { ViewportSettingsClass } from '../combine.js';
|
|
44
41
|
|
|
45
42
|
const isBlacklisted = (url: string, blacklistedPatterns: string[]) => {
|
|
@@ -104,7 +101,8 @@ const crawlDomain = async ({
|
|
|
104
101
|
const crawlStartTime = Date.now();
|
|
105
102
|
let dataset: crawlee.Dataset;
|
|
106
103
|
let urlsCrawled: UrlsCrawled;
|
|
107
|
-
|
|
104
|
+
const { requestQueue }: { requestQueue: crawlee.RequestQueue } =
|
|
105
|
+
await createCrawleeSubFolders(randomToken);
|
|
108
106
|
let durationExceeded = false;
|
|
109
107
|
|
|
110
108
|
if (fromCrawlIntelligentSitemap) {
|
|
@@ -115,73 +113,57 @@ const crawlDomain = async ({
|
|
|
115
113
|
urlsCrawled = { ...constants.urlsCrawledObj };
|
|
116
114
|
}
|
|
117
115
|
|
|
118
|
-
({ requestQueue } = await createCrawleeSubFolders(randomToken));
|
|
119
|
-
|
|
120
116
|
const pdfDownloads: Promise<void>[] = [];
|
|
121
117
|
const uuidToPdfMapping: Record<string, string> = {};
|
|
118
|
+
const queuedUrlSet = new Set<string>();
|
|
119
|
+
const scannedUrlSet = new Set<string>(urlsCrawled.scanned.map(item => item.url));
|
|
120
|
+
const scannedResolvedUrlSet = new Set<string>(
|
|
121
|
+
urlsCrawled.scanned.map(item => item.actualUrl || item.url),
|
|
122
|
+
);
|
|
122
123
|
const isScanHtml = [FileTypes.All, FileTypes.HtmlOnly].includes(fileTypes as FileTypes);
|
|
123
124
|
const isScanPdfs = [FileTypes.All, FileTypes.PdfOnly].includes(fileTypes as FileTypes);
|
|
124
125
|
const { maxConcurrency } = constants;
|
|
125
126
|
const { playwrightDeviceDetailsObject } = viewportSettings;
|
|
126
127
|
|
|
127
|
-
|
|
128
|
+
const enqueueUniqueRequest = async ({
|
|
128
129
|
url,
|
|
129
|
-
skipNavigation
|
|
130
|
-
label
|
|
131
|
-
}
|
|
130
|
+
skipNavigation,
|
|
131
|
+
label,
|
|
132
|
+
}: {
|
|
133
|
+
url: string;
|
|
134
|
+
skipNavigation?: boolean;
|
|
135
|
+
label?: string;
|
|
136
|
+
}) => {
|
|
137
|
+
if (queuedUrlSet.has(url)) {
|
|
138
|
+
return;
|
|
139
|
+
}
|
|
140
|
+
queuedUrlSet.add(url);
|
|
132
141
|
|
|
133
|
-
const enqueueProcess = async (
|
|
134
|
-
page: Page,
|
|
135
|
-
enqueueLinks: (options: EnqueueLinksOptions) => Promise<BatchAddRequestsResult>,
|
|
136
|
-
browserContext: BrowserContext,
|
|
137
|
-
) => {
|
|
138
142
|
try {
|
|
139
|
-
await
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
requestQueue,
|
|
144
|
-
transformRequestFunction: (req: RequestOptions): RequestOptions | null => {
|
|
145
|
-
try {
|
|
146
|
-
req.url = req.url.replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
|
|
147
|
-
} catch (e) {
|
|
148
|
-
consoleLogger.error(e);
|
|
149
|
-
}
|
|
150
|
-
if (urlsCrawled.scanned.some(item => item.url === req.url)) {
|
|
151
|
-
req.skipNavigation = true;
|
|
152
|
-
}
|
|
153
|
-
if (isDisallowedInRobotsTxt(req.url)) return null;
|
|
154
|
-
if (isBlacklisted(req.url, blacklistedPatterns)) return null;
|
|
155
|
-
if (isUrlPdf(req.url)) {
|
|
156
|
-
// playwright headless mode does not support navigation to pdf document
|
|
157
|
-
req.skipNavigation = true;
|
|
158
|
-
}
|
|
159
|
-
req.label = req.url;
|
|
160
|
-
|
|
161
|
-
return req;
|
|
162
|
-
},
|
|
143
|
+
await requestQueue.addRequest({
|
|
144
|
+
url,
|
|
145
|
+
skipNavigation,
|
|
146
|
+
label,
|
|
163
147
|
});
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
// Try catch is necessary as clicking links is best effort, it may result in new pages that cause browser load or navigation errors that PlaywrightCrawler does not handle
|
|
168
|
-
try {
|
|
169
|
-
await customEnqueueLinksByClickingElements(page, browserContext);
|
|
170
|
-
} catch (e) {
|
|
171
|
-
// do nothing;
|
|
172
|
-
}
|
|
173
|
-
}
|
|
174
|
-
} catch {
|
|
175
|
-
// No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
|
|
176
|
-
// Handles browser page object been closed.
|
|
148
|
+
} catch (error) {
|
|
149
|
+
queuedUrlSet.delete(url);
|
|
150
|
+
throw error;
|
|
177
151
|
}
|
|
178
152
|
};
|
|
179
153
|
|
|
154
|
+
await enqueueUniqueRequest({
|
|
155
|
+
url,
|
|
156
|
+
skipNavigation: isUrlPdf(url),
|
|
157
|
+
label: url,
|
|
158
|
+
});
|
|
159
|
+
|
|
180
160
|
const customEnqueueLinksByClickingElements = async (
|
|
181
|
-
|
|
161
|
+
currentPage: Page,
|
|
182
162
|
browserContext: BrowserContext,
|
|
183
163
|
): Promise<void> => {
|
|
184
|
-
|
|
164
|
+
let workingPage = currentPage;
|
|
165
|
+
const initialPageUrl: string = workingPage.url().toString();
|
|
166
|
+
const selectedElementsString = cssQuerySelectors.join(', ');
|
|
185
167
|
|
|
186
168
|
const isExcluded = (newPageUrl: string): boolean => {
|
|
187
169
|
const isAlreadyScanned: boolean = urlsCrawled.scanned.some(item => item.url === newPageUrl);
|
|
@@ -192,13 +174,13 @@ const crawlDomain = async ({
|
|
|
192
174
|
);
|
|
193
175
|
return isNotSupportedDocument || isAlreadyScanned || isBlacklistedUrl || isNotFollowStrategy;
|
|
194
176
|
};
|
|
195
|
-
const setPageListeners = (
|
|
177
|
+
const setPageListeners = (pageListener: Page): void => {
|
|
196
178
|
// event listener to handle new page popups upon button click
|
|
197
|
-
|
|
179
|
+
pageListener.on('popup', async (newPage: Page) => {
|
|
198
180
|
try {
|
|
199
|
-
if (newPage.url()
|
|
181
|
+
if (newPage.url() !== initialPageUrl && !isExcluded(newPage.url())) {
|
|
200
182
|
const newPageUrl: string = newPage.url().replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
|
|
201
|
-
await
|
|
183
|
+
await enqueueUniqueRequest({
|
|
202
184
|
url: newPageUrl,
|
|
203
185
|
skipNavigation: isUrlPdf(newPage.url()),
|
|
204
186
|
label: newPageUrl,
|
|
@@ -218,15 +200,15 @@ const crawlDomain = async ({
|
|
|
218
200
|
});
|
|
219
201
|
|
|
220
202
|
// event listener to handle navigation to new url within same page upon element click
|
|
221
|
-
|
|
203
|
+
pageListener.on('framenavigated', async (newFrame: Frame) => {
|
|
222
204
|
try {
|
|
223
205
|
if (
|
|
224
206
|
newFrame.url() !== initialPageUrl &&
|
|
225
207
|
!isExcluded(newFrame.url()) &&
|
|
226
|
-
!(newFrame.url()
|
|
208
|
+
!(newFrame.url() === 'about:blank')
|
|
227
209
|
) {
|
|
228
210
|
const newFrameUrl: string = newFrame.url().replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
|
|
229
|
-
await
|
|
211
|
+
await enqueueUniqueRequest({
|
|
230
212
|
url: newFrameUrl,
|
|
231
213
|
skipNavigation: isUrlPdf(newFrame.url()),
|
|
232
214
|
label: newFrameUrl,
|
|
@@ -238,28 +220,32 @@ const crawlDomain = async ({
|
|
|
238
220
|
}
|
|
239
221
|
});
|
|
240
222
|
};
|
|
241
|
-
setPageListeners(
|
|
223
|
+
setPageListeners(workingPage);
|
|
242
224
|
let currentElementIndex: number = 0;
|
|
243
225
|
let isAllElementsHandled: boolean = false;
|
|
226
|
+
// This loop is intentionally sequential because each step depends on the latest page state
|
|
227
|
+
// (navigation, popup/frame events, and potential page recreation).
|
|
228
|
+
// Running iterations in parallel (for example with Promise.all) would race on shared `page`
|
|
229
|
+
// state, causing stale element handles and nondeterministic enqueue/navigation behavior.
|
|
230
|
+
/* eslint-disable no-await-in-loop */
|
|
244
231
|
while (!isAllElementsHandled) {
|
|
245
232
|
try {
|
|
246
233
|
// navigate back to initial page if clicking on a element previously caused it to navigate to a new url
|
|
247
|
-
if (
|
|
234
|
+
if (workingPage.url() !== initialPageUrl) {
|
|
248
235
|
try {
|
|
249
|
-
await
|
|
236
|
+
await workingPage.close();
|
|
250
237
|
} catch {
|
|
251
238
|
// No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
|
|
252
239
|
// Handles browser page object been closed.
|
|
253
240
|
}
|
|
254
|
-
|
|
255
|
-
await
|
|
241
|
+
workingPage = await browserContext.newPage();
|
|
242
|
+
await workingPage.goto(initialPageUrl, {
|
|
256
243
|
waitUntil: 'domcontentloaded',
|
|
257
244
|
});
|
|
258
|
-
setPageListeners(
|
|
245
|
+
setPageListeners(workingPage);
|
|
259
246
|
}
|
|
260
|
-
const selectedElementsString = cssQuerySelectors.join(', ');
|
|
261
247
|
const selectedElements: ElementHandle<SVGElement | HTMLElement>[] =
|
|
262
|
-
await
|
|
248
|
+
await workingPage.$$(selectedElementsString);
|
|
263
249
|
// edge case where there might be elements on page that appears intermittently
|
|
264
250
|
if (currentElementIndex + 1 > selectedElements.length || !selectedElements) {
|
|
265
251
|
break;
|
|
@@ -273,60 +259,57 @@ const crawlDomain = async ({
|
|
|
273
259
|
currentElementIndex += 1;
|
|
274
260
|
let newUrlFoundInElement: string = null;
|
|
275
261
|
if (await element.isVisible()) {
|
|
262
|
+
const currentPageUrl = workingPage.url();
|
|
276
263
|
// Find url in html elements without clicking them
|
|
277
|
-
await
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
newUrlFoundInElement = absoluteUrl.href;
|
|
302
|
-
}
|
|
303
|
-
});
|
|
264
|
+
const result = await workingPage.evaluate(pageElement => {
|
|
265
|
+
// find href attribute
|
|
266
|
+
const hrefUrl: string = pageElement.getAttribute('href');
|
|
267
|
+
|
|
268
|
+
// find url in datapath
|
|
269
|
+
const dataPathUrl: string = pageElement.getAttribute('data-path');
|
|
270
|
+
|
|
271
|
+
return hrefUrl || dataPathUrl;
|
|
272
|
+
}, element);
|
|
273
|
+
if (result) {
|
|
274
|
+
newUrlFoundInElement = result;
|
|
275
|
+
const pageUrl: URL = new URL(currentPageUrl);
|
|
276
|
+
const baseUrl: string = `${pageUrl.protocol}//${pageUrl.host}`;
|
|
277
|
+
let absoluteUrl: URL;
|
|
278
|
+
// Construct absolute URL using base URL
|
|
279
|
+
try {
|
|
280
|
+
// Check if newUrlFoundInElement is a valid absolute URL
|
|
281
|
+
absoluteUrl = new URL(newUrlFoundInElement);
|
|
282
|
+
} catch {
|
|
283
|
+
// If it's not a valid URL, treat it as a relative URL
|
|
284
|
+
absoluteUrl = new URL(newUrlFoundInElement, baseUrl);
|
|
285
|
+
}
|
|
286
|
+
newUrlFoundInElement = absoluteUrl.href;
|
|
287
|
+
}
|
|
304
288
|
if (newUrlFoundInElement && !isExcluded(newUrlFoundInElement)) {
|
|
305
289
|
const newUrlFoundInElementUrl: string = newUrlFoundInElement.replace(
|
|
306
290
|
/(?<=&|\?)utm_.*?(&|$)/gim,
|
|
307
291
|
'',
|
|
308
292
|
);
|
|
309
293
|
|
|
310
|
-
await
|
|
294
|
+
await enqueueUniqueRequest({
|
|
311
295
|
url: newUrlFoundInElementUrl,
|
|
312
296
|
skipNavigation: isUrlPdf(newUrlFoundInElement),
|
|
313
297
|
label: newUrlFoundInElementUrl,
|
|
314
298
|
});
|
|
315
299
|
} else if (!newUrlFoundInElement) {
|
|
316
300
|
try {
|
|
317
|
-
const shouldSkip = await shouldSkipClickDueToDisallowedHref(
|
|
301
|
+
const shouldSkip = await shouldSkipClickDueToDisallowedHref(workingPage, element);
|
|
318
302
|
if (shouldSkip) {
|
|
319
|
-
const elementHtml = await
|
|
303
|
+
const elementHtml = await workingPage.evaluate(el => el.outerHTML, element);
|
|
320
304
|
consoleLogger.info(
|
|
321
305
|
'Skipping a click due to disallowed href nearby. Element HTML:',
|
|
322
306
|
elementHtml,
|
|
323
307
|
);
|
|
324
|
-
|
|
308
|
+
} else {
|
|
309
|
+
// Find url in html elements by manually clicking them. New page navigation/popups will be handled by event listeners above
|
|
310
|
+
await element.click({ force: true });
|
|
311
|
+
await workingPage.waitForTimeout(1000); // Add a delay of 1 second between each Element click
|
|
325
312
|
}
|
|
326
|
-
|
|
327
|
-
// Find url in html elements by manually clicking them. New page navigation/popups will be handled by event listeners above
|
|
328
|
-
await element.click({ force: true });
|
|
329
|
-
await page.waitForTimeout(1000); // Add a delay of 1 second between each Element click
|
|
330
313
|
} catch {
|
|
331
314
|
// No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
|
|
332
315
|
// Handles browser page object been closed.
|
|
@@ -338,6 +321,63 @@ const crawlDomain = async ({
|
|
|
338
321
|
// Handles browser page object been closed.
|
|
339
322
|
}
|
|
340
323
|
}
|
|
324
|
+
/* eslint-enable no-await-in-loop */
|
|
325
|
+
};
|
|
326
|
+
|
|
327
|
+
const enqueueProcess = async (
|
|
328
|
+
page: Page,
|
|
329
|
+
enqueueLinks: PlaywrightCrawlingContext['enqueueLinks'],
|
|
330
|
+
browserContext: BrowserContext,
|
|
331
|
+
) => {
|
|
332
|
+
try {
|
|
333
|
+
await enqueueLinks({
|
|
334
|
+
// set selector matches anchor elements with href but not contains # or starting with mailto:
|
|
335
|
+
selector: `a:not(${disallowedSelectorPatterns})`,
|
|
336
|
+
strategy,
|
|
337
|
+
requestQueue,
|
|
338
|
+
transformRequestFunction: (req: RequestOptions): RequestOptions | null => {
|
|
339
|
+
try {
|
|
340
|
+
req.url = req.url.replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
|
|
341
|
+
} catch (e) {
|
|
342
|
+
consoleLogger.error(e);
|
|
343
|
+
}
|
|
344
|
+
if (scannedUrlSet.has(req.url)) {
|
|
345
|
+
req.skipNavigation = true;
|
|
346
|
+
}
|
|
347
|
+
if (isDisallowedInRobotsTxt(req.url)) return null;
|
|
348
|
+
if (isBlacklisted(req.url, blacklistedPatterns)) return null;
|
|
349
|
+
if (isUrlPdf(req.url)) {
|
|
350
|
+
// playwright headless mode does not support navigation to pdf document
|
|
351
|
+
req.skipNavigation = true;
|
|
352
|
+
}
|
|
353
|
+
req.label = req.url;
|
|
354
|
+
|
|
355
|
+
return req;
|
|
356
|
+
},
|
|
357
|
+
});
|
|
358
|
+
|
|
359
|
+
// If safeMode flag is enabled, skip enqueueLinksByClickingElements
|
|
360
|
+
if (!safeMode) {
|
|
361
|
+
// Only run the expensive element-clicking discovery on pages sharing the
|
|
362
|
+
// same hostname as the seed URL. Cross-subdomain pages (reachable via
|
|
363
|
+
// same-domain strategy) still contribute their <a> links above, but
|
|
364
|
+
// clicking every interactive element on them is too slow and starves
|
|
365
|
+
// the crawler of time to discover pages on the primary hostname.
|
|
366
|
+
const currentHostname = new URL(page.url()).hostname;
|
|
367
|
+
const seedHostname = new URL(url).hostname;
|
|
368
|
+
if (currentHostname === seedHostname) {
|
|
369
|
+
// Try catch is necessary as clicking links is best effort, it may result in new pages that cause browser load or navigation errors that PlaywrightCrawler does not handle
|
|
370
|
+
try {
|
|
371
|
+
await customEnqueueLinksByClickingElements(page, browserContext);
|
|
372
|
+
} catch {
|
|
373
|
+
// do nothing;
|
|
374
|
+
}
|
|
375
|
+
}
|
|
376
|
+
}
|
|
377
|
+
} catch {
|
|
378
|
+
// No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
|
|
379
|
+
// Handles browser page object been closed.
|
|
380
|
+
}
|
|
341
381
|
};
|
|
342
382
|
|
|
343
383
|
let isAbortingScanNow = false;
|
|
@@ -368,9 +408,12 @@ const crawlDomain = async ({
|
|
|
368
408
|
await fsp.mkdir(subProfileDir, { recursive: true });
|
|
369
409
|
|
|
370
410
|
// Assign to Crawlee's launcher
|
|
411
|
+
// Crawlee preLaunchHooks expects launchContext to be mutated in-place.
|
|
412
|
+
// eslint-disable-next-line no-param-reassign
|
|
371
413
|
launchContext.userDataDir = subProfileDir;
|
|
372
414
|
|
|
373
415
|
// Safely extend launchOptions
|
|
416
|
+
// eslint-disable-next-line no-param-reassign
|
|
374
417
|
launchContext.launchOptions = {
|
|
375
418
|
...launchContext.launchOptions,
|
|
376
419
|
ignoreHTTPSErrors: true,
|
|
@@ -399,7 +442,7 @@ const crawlDomain = async ({
|
|
|
399
442
|
const observer = new MutationObserver(() => {
|
|
400
443
|
clearTimeout(timeout);
|
|
401
444
|
|
|
402
|
-
mutationCount
|
|
445
|
+
mutationCount += 1;
|
|
403
446
|
if (mutationCount > MAX_MUTATIONS) {
|
|
404
447
|
observer.disconnect();
|
|
405
448
|
resolve('Too many mutations, exiting.');
|
|
@@ -422,6 +465,8 @@ const crawlDomain = async ({
|
|
|
422
465
|
const root = document.documentElement || document.body || document;
|
|
423
466
|
if (!root || typeof observer.observe !== 'function') {
|
|
424
467
|
resolve('No root node to observe.');
|
|
468
|
+
} else {
|
|
469
|
+
observer.observe(root, { childList: true, subtree: true });
|
|
425
470
|
}
|
|
426
471
|
});
|
|
427
472
|
});
|
|
@@ -437,14 +482,21 @@ const crawlDomain = async ({
|
|
|
437
482
|
|
|
438
483
|
const isRedirected = !areLinksEqual(finalUrl, requestLabelUrl);
|
|
439
484
|
if (isRedirected) {
|
|
440
|
-
await
|
|
485
|
+
await enqueueUniqueRequest({ url: finalUrl, label: finalUrl });
|
|
441
486
|
} else {
|
|
442
487
|
request.skipNavigation = false;
|
|
443
488
|
}
|
|
444
489
|
},
|
|
445
490
|
],
|
|
446
491
|
requestHandlerTimeoutSecs: 90, // Allow each page to be processed by up from default 60 seconds
|
|
447
|
-
requestHandler: async ({
|
|
492
|
+
requestHandler: async ({
|
|
493
|
+
page,
|
|
494
|
+
request,
|
|
495
|
+
response,
|
|
496
|
+
crawler: activeCrawler,
|
|
497
|
+
sendRequest,
|
|
498
|
+
enqueueLinks,
|
|
499
|
+
}) => {
|
|
448
500
|
const browserContext: BrowserContext = page.context();
|
|
449
501
|
try {
|
|
450
502
|
await waitForPageLoaded(page, 10000);
|
|
@@ -454,6 +506,12 @@ const crawlDomain = async ({
|
|
|
454
506
|
actualUrl = page.url();
|
|
455
507
|
}
|
|
456
508
|
|
|
509
|
+
// Second-pass requests: only do click-discovery, skip scanning
|
|
510
|
+
if (request.label?.startsWith('__clickpass__')) {
|
|
511
|
+
await enqueueProcess(page, enqueueLinks, browserContext);
|
|
512
|
+
return;
|
|
513
|
+
}
|
|
514
|
+
|
|
457
515
|
if (
|
|
458
516
|
!isFollowStrategy(url, actualUrl, strategy) &&
|
|
459
517
|
(isBlacklisted(actualUrl, blacklistedPatterns) || (isUrlPdf(actualUrl) && !isScanPdfs))
|
|
@@ -474,13 +532,13 @@ const crawlDomain = async ({
|
|
|
474
532
|
durationExceeded = true;
|
|
475
533
|
}
|
|
476
534
|
isAbortingScanNow = true;
|
|
477
|
-
|
|
535
|
+
activeCrawler.autoscaledPool.abort();
|
|
478
536
|
return;
|
|
479
537
|
}
|
|
480
538
|
|
|
481
539
|
// if URL has already been scanned
|
|
482
|
-
if (
|
|
483
|
-
|
|
540
|
+
if (scannedUrlSet.has(request.url)) {
|
|
541
|
+
await enqueueProcess(page, enqueueLinks, browserContext);
|
|
484
542
|
return;
|
|
485
543
|
}
|
|
486
544
|
|
|
@@ -512,7 +570,7 @@ const crawlDomain = async ({
|
|
|
512
570
|
|
|
513
571
|
return;
|
|
514
572
|
}
|
|
515
|
-
const { pdfFileName, url } = handlePdfDownload(
|
|
573
|
+
const { pdfFileName, url: downloadedPdfUrl } = handlePdfDownload(
|
|
516
574
|
randomToken,
|
|
517
575
|
pdfDownloads,
|
|
518
576
|
request,
|
|
@@ -520,7 +578,7 @@ const crawlDomain = async ({
|
|
|
520
578
|
urlsCrawled,
|
|
521
579
|
);
|
|
522
580
|
|
|
523
|
-
uuidToPdfMapping[pdfFileName] =
|
|
581
|
+
uuidToPdfMapping[pdfFileName] = downloadedPdfUrl;
|
|
524
582
|
return;
|
|
525
583
|
}
|
|
526
584
|
|
|
@@ -597,9 +655,7 @@ const crawlDomain = async ({
|
|
|
597
655
|
const results = await runAxeScript({ includeScreenshots, page, randomToken, ruleset });
|
|
598
656
|
|
|
599
657
|
if (isRedirected) {
|
|
600
|
-
const isLoadedUrlInCrawledUrls =
|
|
601
|
-
item => (item.actualUrl || item.url) === actualUrl,
|
|
602
|
-
);
|
|
658
|
+
const isLoadedUrlInCrawledUrls = scannedResolvedUrlSet.has(actualUrl);
|
|
603
659
|
|
|
604
660
|
if (isLoadedUrlInCrawledUrls) {
|
|
605
661
|
urlsCrawled.notScannedRedirects.push({
|
|
@@ -621,6 +677,8 @@ const crawlDomain = async ({
|
|
|
621
677
|
pageTitle: results.pageTitle,
|
|
622
678
|
actualUrl, // i.e. actualUrl
|
|
623
679
|
});
|
|
680
|
+
scannedUrlSet.add(request.url);
|
|
681
|
+
scannedResolvedUrlSet.add(actualUrl);
|
|
624
682
|
|
|
625
683
|
urlsCrawled.scannedRedirects.push({
|
|
626
684
|
fromUrl: request.url,
|
|
@@ -631,20 +689,20 @@ const crawlDomain = async ({
|
|
|
631
689
|
results.actualUrl = actualUrl;
|
|
632
690
|
await dataset.pushData(results);
|
|
633
691
|
}
|
|
634
|
-
} else {
|
|
692
|
+
} else if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
|
|
635
693
|
// One more check if scanned pages have reached limit due to multi-instances of handler running
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
694
|
+
guiInfoLog(guiInfoStatusTypes.SCANNED, {
|
|
695
|
+
numScanned: urlsCrawled.scanned.length,
|
|
696
|
+
urlScanned: request.url,
|
|
697
|
+
});
|
|
698
|
+
urlsCrawled.scanned.push({
|
|
699
|
+
url: request.url,
|
|
700
|
+
actualUrl: request.url,
|
|
701
|
+
pageTitle: results.pageTitle,
|
|
702
|
+
});
|
|
703
|
+
scannedUrlSet.add(request.url);
|
|
704
|
+
scannedResolvedUrlSet.add(request.url);
|
|
705
|
+
await dataset.pushData(results);
|
|
648
706
|
}
|
|
649
707
|
} else {
|
|
650
708
|
// Don't inform the user it is skipped since web crawler is best-effort.
|
|
@@ -675,16 +733,16 @@ const crawlDomain = async ({
|
|
|
675
733
|
urlScanned: request.url,
|
|
676
734
|
});
|
|
677
735
|
|
|
678
|
-
|
|
679
|
-
await
|
|
736
|
+
const recoveryPage = await browserContext.newPage();
|
|
737
|
+
await recoveryPage.goto(request.url);
|
|
680
738
|
|
|
681
|
-
await
|
|
739
|
+
await recoveryPage.route('**/*', async route => {
|
|
682
740
|
const interceptedRequest = route.request();
|
|
683
741
|
if (interceptedRequest.resourceType() === 'document') {
|
|
684
742
|
const interceptedRequestUrl = interceptedRequest
|
|
685
743
|
.url()
|
|
686
744
|
.replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
|
|
687
|
-
await
|
|
745
|
+
await enqueueUniqueRequest({
|
|
688
746
|
url: interceptedRequestUrl,
|
|
689
747
|
skipNavigation: isUrlPdf(interceptedRequest.url()),
|
|
690
748
|
label: interceptedRequestUrl,
|
|
@@ -749,6 +807,61 @@ const crawlDomain = async ({
|
|
|
749
807
|
|
|
750
808
|
await crawler.run();
|
|
751
809
|
|
|
810
|
+
// Additional passes: keep re-visiting scanned seed-hostname pages for
|
|
811
|
+
// click-discovery until no new pages are found or limits are reached.
|
|
812
|
+
if (!safeMode && !isAbortingScanNow && !durationExceeded) {
|
|
813
|
+
const seedHostname = new URL(url).hostname;
|
|
814
|
+
const clickPassVisited = new Set<string>();
|
|
815
|
+
let prevScannedCount: number;
|
|
816
|
+
|
|
817
|
+
do {
|
|
818
|
+
prevScannedCount = urlsCrawled.scanned.length;
|
|
819
|
+
|
|
820
|
+
if (prevScannedCount >= maxRequestsPerCrawl) break;
|
|
821
|
+
if (scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000) break;
|
|
822
|
+
|
|
823
|
+
const seedHostnamePages = urlsCrawled.scanned
|
|
824
|
+
.map(item => item.actualUrl || item.url)
|
|
825
|
+
.filter(pageUrl => {
|
|
826
|
+
try {
|
|
827
|
+
return new URL(pageUrl).hostname === seedHostname && !clickPassVisited.has(pageUrl);
|
|
828
|
+
} catch {
|
|
829
|
+
return false;
|
|
830
|
+
}
|
|
831
|
+
});
|
|
832
|
+
|
|
833
|
+
if (seedHostnamePages.length === 0) break;
|
|
834
|
+
|
|
835
|
+
let enqueued = 0;
|
|
836
|
+
for (const pageUrl of seedHostnamePages) {
|
|
837
|
+
if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) break;
|
|
838
|
+
if (scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000) break;
|
|
839
|
+
|
|
840
|
+
clickPassVisited.add(pageUrl);
|
|
841
|
+
try {
|
|
842
|
+
const clickPassLabel = `__clickpass__${pageUrl}`;
|
|
843
|
+
if (!queuedUrlSet.has(clickPassLabel)) {
|
|
844
|
+
queuedUrlSet.add(clickPassLabel);
|
|
845
|
+
await requestQueue.addRequest({
|
|
846
|
+
url: pageUrl,
|
|
847
|
+
label: clickPassLabel,
|
|
848
|
+
skipNavigation: false,
|
|
849
|
+
});
|
|
850
|
+
enqueued += 1;
|
|
851
|
+
}
|
|
852
|
+
} catch {
|
|
853
|
+
// ignore enqueue errors
|
|
854
|
+
}
|
|
855
|
+
}
|
|
856
|
+
|
|
857
|
+
if (enqueued === 0) break;
|
|
858
|
+
|
|
859
|
+
await crawler.run();
|
|
860
|
+
|
|
861
|
+
// Stop looping if no new pages were discovered in this pass
|
|
862
|
+
} while (urlsCrawled.scanned.length > prevScannedCount);
|
|
863
|
+
}
|
|
864
|
+
|
|
752
865
|
if (pdfDownloads.length > 0) {
|
|
753
866
|
// wait for pdf downloads to complete
|
|
754
867
|
await Promise.all(pdfDownloads);
|
|
@@ -761,9 +874,7 @@ const crawlDomain = async ({
|
|
|
761
874
|
|
|
762
875
|
// get screenshots from pdf docs
|
|
763
876
|
if (includeScreenshots) {
|
|
764
|
-
await Promise.all(
|
|
765
|
-
pdfResults.map(async result => await doPdfScreenshots(randomToken, result)),
|
|
766
|
-
);
|
|
877
|
+
await Promise.all(pdfResults.map(result => doPdfScreenshots(randomToken, result)));
|
|
767
878
|
}
|
|
768
879
|
|
|
769
880
|
// push results for each pdf document to key value store
|
|
@@ -53,14 +53,25 @@ const crawlIntelligentSitemap = async (
|
|
|
53
53
|
const homeUrl = getHomeUrl(link);
|
|
54
54
|
let sitemapLink = '';
|
|
55
55
|
|
|
56
|
-
const
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
56
|
+
const launchOptions = getPlaywrightLaunchOptions(browser);
|
|
57
|
+
let context;
|
|
58
|
+
let browserInstance;
|
|
59
|
+
|
|
60
|
+
if (process.env.CRAWLEE_HEADLESS === '1') {
|
|
61
|
+
const effectiveUserDataDirectory = userDataDirectory || '';
|
|
62
|
+
context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
|
|
63
|
+
...launchOptions,
|
|
64
|
+
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
|
65
|
+
});
|
|
66
|
+
register(context);
|
|
67
|
+
} else {
|
|
68
|
+
// In headful mode, avoid launchPersistentContext to prevent "Browser window not found"
|
|
69
|
+
browserInstance = await constants.launcher.launch(launchOptions);
|
|
70
|
+
register(browserInstance as unknown as { close: () => Promise<void> });
|
|
71
|
+
context = await browserInstance.newContext({
|
|
72
|
+
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
|
73
|
+
});
|
|
74
|
+
}
|
|
64
75
|
|
|
65
76
|
const page = await context.newPage();
|
|
66
77
|
|
|
@@ -73,6 +84,9 @@ const crawlIntelligentSitemap = async (
|
|
|
73
84
|
}
|
|
74
85
|
await page.close();
|
|
75
86
|
await context.close().catch(() => {});
|
|
87
|
+
if (browserInstance) {
|
|
88
|
+
await browserInstance.close().catch(() => {});
|
|
89
|
+
}
|
|
76
90
|
return sitemapExist ? sitemapLink : '';
|
|
77
91
|
}
|
|
78
92
|
|