@govtechsg/oobee 0.10.83 → 0.10.84

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -90,6 +90,11 @@ verapdf --version
  | WARN_LEVEL | Only used in tests. | |
  | OOBEE_DISABLE_BROWSER_DOWNLOAD | Experimental flag to disable file downloads on Chrome/Chromium/Edge. Does not affect Local File scan. | |
  | OOBEE_SLOWMO | Experimental flag to slow down web browser behaviour by a specified duration (in milliseconds) | |
+ | HTTP_PROXY | URL of the proxy server to be used for HTTP requests (e.g. `http://proxy.example.com:8080`). | |
+ | HTTPS_PROXY | URL of the proxy server to be used for HTTPS requests (e.g. `https://proxy.example.com:8080`). | |
+ | ALL_PROXY | URL of the proxy server to be used for all requests, typically used for SOCKS5 proxies (e.g. `socks5://proxy.example.com:1080`). Note: IPv6 direct connections may still occur even when a SOCKS5 proxy is specified, due to a known issue with Chrome/Chromium. The recommended workaround is to turn off IPv6 at the host level. | |
+ | NO_PROXY | Comma-separated list of domains that should bypass the proxy (e.g. `localhost,127.0.0.1,.example.com`). | |
+ | INCLUDE_PROXY | Comma-separated list of domains that should specifically be routed through the proxy. | |
 
  #### Environment variables used internally (Do not set)
  Do not set these environment variables or behaviour might change unexpectedly.
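These variables follow the de-facto `HTTP_PROXY`/`HTTPS_PROXY`/`NO_PROXY` convention, so they can be exported in the shell or injected per invocation. A minimal sketch of the latter, assuming a Node wrapper script; the package name is real, but the scan arguments are placeholders, not the package's documented CLI:

```ts
// Hypothetical wrapper (not from this package): inject the documented proxy
// variables into a child oobee process; the scan arguments are placeholders.
import { spawn } from 'node:child_process';

const scan = spawn('npx', ['@govtechsg/oobee' /* ...scan arguments... */], {
  env: {
    ...process.env,
    HTTPS_PROXY: 'https://proxy.example.com:8080', // proxy for HTTPS requests
    NO_PROXY: 'localhost,127.0.0.1',               // hosts that bypass the proxy
  },
  stdio: 'inherit', // stream scanner output to this terminal
});

scan.on('exit', code => process.exit(code ?? 1));
```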
@@ -677,4 +682,4 @@ It uses the existing report *.json files for the embedded HTML dataset.
 
  ```
  npx tsx dev/runGenerateJustHtmlReport.ts results/<report directory>
- ```
+ ```
@@ -364,7 +364,19 @@ const checkUrlConnectivityWithBrowser = async (url, browserToRun, clonedDataDir,
  });
  if (!response)
  throw new Error('No response from navigation');
- // We use the response headers from the navigation we just performed.
+ // Wait briefly for JS/meta-refresh redirects to settle before reading the final URL.
+ // Server-side redirects are already reflected after goto(), but client-side redirects
+ // (e.g. domain.tld -> www.domain.tld via JS or meta-refresh) need extra time.
+ try {
+ await Promise.race([
+ page.waitForURL(currentUrl => currentUrl !== url, { timeout: 5000 }),
+ new Promise(resolve => setTimeout(resolve, 1000)), // minimum settle time
+ ]);
+ }
+ catch {
+ // No redirect happened within the window — that's fine, continue with current URL
+ }
+ // Re-read page.url() AFTER potential client-side redirects have resolved
  const finalUrl = page.url();
  const finalStatus = response.status();
  const headers = response.headers();
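A standalone sketch of the settle-then-read pattern this hunk introduces, assuming a bare Playwright page. One detail worth noting: `waitForURL`'s predicate receives a `URL` object, so the sketch compares its string form:

```ts
// Minimal sketch, assuming Playwright: give client-side redirects up to 5s
// to fire, but always wait at least ~1s before trusting page.url().
import { chromium } from 'playwright';

const startUrl = 'https://example.com/';
const browser = await chromium.launch();
const page = await browser.newPage();
await page.goto(startUrl, { waitUntil: 'domcontentloaded' });

await Promise.race([
  // waitForURL's predicate receives a URL object; compare its string form
  page.waitForURL(u => u.toString() !== startUrl, { timeout: 5000 }).catch(() => {}),
  new Promise(resolve => setTimeout(resolve, 1000)), // minimum settle time
]);

console.log('final URL after client-side redirects:', page.url());
await browser.close();
```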
@@ -24,7 +24,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
  const crawlStartTime = Date.now();
  let dataset;
  let urlsCrawled;
- let requestQueue;
+ const { requestQueue } = await createCrawleeSubFolders(randomToken);
  let durationExceeded = false;
  if (fromCrawlIntelligentSitemap) {
  dataset = datasetFromIntelligent;
@@ -34,65 +34,41 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
  ({ dataset } = await createCrawleeSubFolders(randomToken));
  urlsCrawled = { ...constants.urlsCrawledObj };
  }
- ({ requestQueue } = await createCrawleeSubFolders(randomToken));
  const pdfDownloads = [];
  const uuidToPdfMapping = {};
+ const queuedUrlSet = new Set();
+ const scannedUrlSet = new Set(urlsCrawled.scanned.map(item => item.url));
+ const scannedResolvedUrlSet = new Set(urlsCrawled.scanned.map(item => item.actualUrl || item.url));
  const isScanHtml = [FileTypes.All, FileTypes.HtmlOnly].includes(fileTypes);
  const isScanPdfs = [FileTypes.All, FileTypes.PdfOnly].includes(fileTypes);
  const { maxConcurrency } = constants;
  const { playwrightDeviceDetailsObject } = viewportSettings;
- await requestQueue.addRequest({
- url,
- skipNavigation: isUrlPdf(url),
- label: url,
- });
- const enqueueProcess = async (page, enqueueLinks, browserContext) => {
+ const enqueueUniqueRequest = async ({ url, skipNavigation, label, }) => {
+ if (queuedUrlSet.has(url)) {
+ return;
+ }
+ queuedUrlSet.add(url);
  try {
- await enqueueLinks({
- // set selector matches anchor elements with href but not contains # or starting with mailto:
- selector: `a:not(${disallowedSelectorPatterns})`,
- strategy,
- requestQueue,
- transformRequestFunction: (req) => {
- try {
- req.url = req.url.replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
- }
- catch (e) {
- consoleLogger.error(e);
- }
- if (urlsCrawled.scanned.some(item => item.url === req.url)) {
- req.skipNavigation = true;
- }
- if (isDisallowedInRobotsTxt(req.url))
- return null;
- if (isBlacklisted(req.url, blacklistedPatterns))
- return null;
- if (isUrlPdf(req.url)) {
- // playwright headless mode does not support navigation to pdf document
- req.skipNavigation = true;
- }
- req.label = req.url;
- return req;
- },
+ await requestQueue.addRequest({
+ url,
+ skipNavigation,
+ label,
  });
- // If safeMode flag is enabled, skip enqueueLinksByClickingElements
- if (!safeMode) {
- // Try catch is necessary as clicking links is best effort, it may result in new pages that cause browser load or navigation errors that PlaywrightCrawler does not handle
- try {
- await customEnqueueLinksByClickingElements(page, browserContext);
- }
- catch (e) {
- // do nothing;
- }
- }
  }
- catch {
- // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
- // Handles browser page object been closed.
+ catch (error) {
+ queuedUrlSet.delete(url);
+ throw error;
  }
  };
- const customEnqueueLinksByClickingElements = async (page, browserContext) => {
- const initialPageUrl = page.url().toString();
+ await enqueueUniqueRequest({
+ url,
+ skipNavigation: isUrlPdf(url),
+ label: url,
+ });
+ const customEnqueueLinksByClickingElements = async (currentPage, browserContext) => {
+ let workingPage = currentPage;
+ const initialPageUrl = workingPage.url().toString();
+ const selectedElementsString = cssQuerySelectors.join(', ');
  const isExcluded = (newPageUrl) => {
  const isAlreadyScanned = urlsCrawled.scanned.some(item => item.url === newPageUrl);
  const isBlacklistedUrl = isBlacklisted(newPageUrl, blacklistedPatterns);
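The new `enqueueUniqueRequest` helper centralises request deduplication that the old code repeated at each call site. A reduced sketch of the pattern, assuming Crawlee's `RequestQueue`; the rollback on failure is what keeps a failed enqueue retryable:

```ts
// Reduced sketch, assuming Crawlee's RequestQueue.
import type { RequestQueue } from 'crawlee';

const queuedUrlSet = new Set<string>();

async function enqueueOnce(requestQueue: RequestQueue, url: string): Promise<void> {
  if (queuedUrlSet.has(url)) return; // already enqueued during this run
  queuedUrlSet.add(url);             // mark first so concurrent callers bail early
  try {
    await requestQueue.addRequest({ url, label: url });
  } catch (error) {
    queuedUrlSet.delete(url);        // roll back so the URL stays retryable
    throw error;
  }
}
```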
@@ -100,13 +76,13 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
  const isNotSupportedDocument = disallowedListOfPatterns.some(pattern => newPageUrl.toLowerCase().startsWith(pattern));
  return isNotSupportedDocument || isAlreadyScanned || isBlacklistedUrl || isNotFollowStrategy;
  };
- const setPageListeners = (page) => {
+ const setPageListeners = (pageListener) => {
  // event listener to handle new page popups upon button click
- page.on('popup', async (newPage) => {
+ pageListener.on('popup', async (newPage) => {
  try {
- if (newPage.url() != initialPageUrl && !isExcluded(newPage.url())) {
+ if (newPage.url() !== initialPageUrl && !isExcluded(newPage.url())) {
  const newPageUrl = newPage.url().replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
- await requestQueue.addRequest({
+ await enqueueUniqueRequest({
  url: newPageUrl,
  skipNavigation: isUrlPdf(newPage.url()),
  label: newPageUrl,
@@ -128,13 +104,13 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
  }
  });
  // event listener to handle navigation to new url within same page upon element click
- page.on('framenavigated', async (newFrame) => {
+ pageListener.on('framenavigated', async (newFrame) => {
  try {
  if (newFrame.url() !== initialPageUrl &&
  !isExcluded(newFrame.url()) &&
- !(newFrame.url() == 'about:blank')) {
+ !(newFrame.url() === 'about:blank')) {
  const newFrameUrl = newFrame.url().replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
- await requestQueue.addRequest({
+ await enqueueUniqueRequest({
  url: newFrameUrl,
  skipNavigation: isUrlPdf(newFrame.url()),
  label: newFrameUrl,
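For reference, a compact sketch of the same two Playwright events this section relies on for click discovery; the wiring and callback are illustrative, not the package's API:

```ts
// Compact sketch, assuming a Playwright Page; onUrl is an illustrative callback.
import type { Page } from 'playwright';

function watchForClickDiscoveredUrls(page: Page, onUrl: (url: string) => void): void {
  // New-tab popups opened by a click
  page.on('popup', newPage => {
    if (newPage.url() !== 'about:blank') onUrl(newPage.url());
  });
  // Same-tab navigations triggered by a click
  page.on('framenavigated', frame => {
    if (frame === page.mainFrame() && frame.url() !== 'about:blank') onUrl(frame.url());
  });
}
```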
@@ -147,28 +123,32 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
  }
  });
  };
- setPageListeners(page);
+ setPageListeners(workingPage);
  let currentElementIndex = 0;
  let isAllElementsHandled = false;
+ // This loop is intentionally sequential because each step depends on the latest page state
+ // (navigation, popup/frame events, and potential page recreation).
+ // Running iterations in parallel (for example with Promise.all) would race on shared `page`
+ // state, causing stale element handles and nondeterministic enqueue/navigation behavior.
+ /* eslint-disable no-await-in-loop */
  while (!isAllElementsHandled) {
  try {
  // navigate back to the initial page if clicking an element previously caused navigation to a new url
- if (page.url() != initialPageUrl) {
+ if (workingPage.url() !== initialPageUrl) {
  try {
- await page.close();
+ await workingPage.close();
  }
  catch {
  // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
  // Handles the browser page object being closed.
  }
- page = await browserContext.newPage();
- await page.goto(initialPageUrl, {
+ workingPage = await browserContext.newPage();
+ await workingPage.goto(initialPageUrl, {
  waitUntil: 'domcontentloaded',
  });
- setPageListeners(page);
+ setPageListeners(workingPage);
  }
- const selectedElementsString = cssQuerySelectors.join(', ');
- const selectedElements = await page.$$(selectedElementsString);
+ const selectedElements = await workingPage.$$(selectedElementsString);
  // edge case where there might be elements on the page that appear intermittently
  if (currentElementIndex + 1 > selectedElements.length || !selectedElements) {
  break;
@@ -181,36 +161,34 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
  currentElementIndex += 1;
  let newUrlFoundInElement = null;
  if (await element.isVisible()) {
+ const currentPageUrl = workingPage.url();
  // Find url in html elements without clicking them
- await page
- .evaluate(element => {
+ const result = await workingPage.evaluate(pageElement => {
  // find href attribute
- const hrefUrl = element.getAttribute('href');
+ const hrefUrl = pageElement.getAttribute('href');
  // find url in datapath
- const dataPathUrl = element.getAttribute('data-path');
+ const dataPathUrl = pageElement.getAttribute('data-path');
  return hrefUrl || dataPathUrl;
- }, element)
- .then(result => {
- if (result) {
- newUrlFoundInElement = result;
- const pageUrl = new URL(page.url());
- const baseUrl = `${pageUrl.protocol}//${pageUrl.host}`;
- let absoluteUrl;
- // Construct absolute URL using base URL
- try {
- // Check if newUrlFoundInElement is a valid absolute URL
- absoluteUrl = new URL(newUrlFoundInElement);
- }
- catch (e) {
- // If it's not a valid URL, treat it as a relative URL
- absoluteUrl = new URL(newUrlFoundInElement, baseUrl);
- }
- newUrlFoundInElement = absoluteUrl.href;
+ }, element);
+ if (result) {
+ newUrlFoundInElement = result;
+ const pageUrl = new URL(currentPageUrl);
+ const baseUrl = `${pageUrl.protocol}//${pageUrl.host}`;
+ let absoluteUrl;
+ // Construct absolute URL using base URL
+ try {
+ // Check if newUrlFoundInElement is a valid absolute URL
+ absoluteUrl = new URL(newUrlFoundInElement);
  }
- });
+ catch {
+ // If it's not a valid URL, treat it as a relative URL
+ absoluteUrl = new URL(newUrlFoundInElement, baseUrl);
+ }
+ newUrlFoundInElement = absoluteUrl.href;
+ }
  if (newUrlFoundInElement && !isExcluded(newUrlFoundInElement)) {
  const newUrlFoundInElementUrl = newUrlFoundInElement.replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
- await requestQueue.addRequest({
+ await enqueueUniqueRequest({
  url: newUrlFoundInElementUrl,
  skipNavigation: isUrlPdf(newUrlFoundInElement),
  label: newUrlFoundInElementUrl,
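The absolute-URL logic above, now unwound from the old `.then()` chain, reduces to a small helper. A condensed sketch using the same WHATWG `URL` semantics; the helper name is illustrative:

```ts
// Condensed sketch of the resolution logic; toAbsoluteUrl is an illustrative name.
function toAbsoluteUrl(found: string, currentPageUrl: string): string {
  const pageUrl = new URL(currentPageUrl);
  const baseUrl = `${pageUrl.protocol}//${pageUrl.host}`;
  try {
    return new URL(found).href;          // already a valid absolute URL
  } catch {
    return new URL(found, baseUrl).href; // relative: resolve against the page's base
  }
}

console.log(toAbsoluteUrl('/about', 'https://example.com/home')); // https://example.com/about
```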
@@ -218,15 +196,16 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
  }
  else if (!newUrlFoundInElement) {
  try {
- const shouldSkip = await shouldSkipClickDueToDisallowedHref(page, element);
+ const shouldSkip = await shouldSkipClickDueToDisallowedHref(workingPage, element);
  if (shouldSkip) {
- const elementHtml = await page.evaluate(el => el.outerHTML, element);
+ const elementHtml = await workingPage.evaluate(el => el.outerHTML, element);
  consoleLogger.info('Skipping a click due to disallowed href nearby. Element HTML:', elementHtml);
- continue;
  }
- // Find url in html elements by manually clicking them. New page navigation/popups will be handled by event listeners above
- await element.click({ force: true });
- await page.waitForTimeout(1000); // Add a delay of 1 second between each Element click
+ else {
+ // Find url in html elements by manually clicking them. New page navigation/popups will be handled by event listeners above
+ await element.click({ force: true });
+ await workingPage.waitForTimeout(1000); // Add a delay of 1 second between each Element click
+ }
  }
  catch {
  // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
@@ -240,6 +219,61 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
  // Handles the browser page object being closed.
  }
  }
+ /* eslint-enable no-await-in-loop */
+ };
+ const enqueueProcess = async (page, enqueueLinks, browserContext) => {
+ try {
+ await enqueueLinks({
+ // selector matches anchor elements with an href that does not contain # and does not start with mailto:
+ selector: `a:not(${disallowedSelectorPatterns})`,
+ strategy,
+ requestQueue,
+ transformRequestFunction: (req) => {
+ try {
+ req.url = req.url.replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
+ }
+ catch (e) {
+ consoleLogger.error(e);
+ }
+ if (scannedUrlSet.has(req.url)) {
+ req.skipNavigation = true;
+ }
+ if (isDisallowedInRobotsTxt(req.url))
+ return null;
+ if (isBlacklisted(req.url, blacklistedPatterns))
+ return null;
+ if (isUrlPdf(req.url)) {
+ // playwright headless mode does not support navigation to a pdf document
+ req.skipNavigation = true;
+ }
+ req.label = req.url;
+ return req;
+ },
+ });
+ // If the safeMode flag is enabled, skip enqueueLinksByClickingElements
+ if (!safeMode) {
+ // Only run the expensive element-clicking discovery on pages sharing the
+ // same hostname as the seed URL. Cross-subdomain pages (reachable via
+ // same-domain strategy) still contribute their <a> links above, but
+ // clicking every interactive element on them is too slow and starves
+ // the crawler of time to discover pages on the primary hostname.
+ const currentHostname = new URL(page.url()).hostname;
+ const seedHostname = new URL(url).hostname;
+ if (currentHostname === seedHostname) {
+ // Try/catch is necessary as clicking links is best effort; it may open new pages that cause browser load or navigation errors that PlaywrightCrawler does not handle
+ try {
+ await customEnqueueLinksByClickingElements(page, browserContext);
+ }
+ catch {
+ // do nothing
+ }
+ }
+ }
+ }
+ catch {
+ // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
+ // Handles the browser page object being closed.
+ }
  };
  let isAbortingScanNow = false;
  const crawler = register(new crawlee.PlaywrightCrawler({
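The hostname gate in `enqueueProcess` is small enough to check in isolation. A sketch under the same `URL` semantics; the names and URLs are illustrative:

```ts
// Illustrative names; mirrors the gate above under WHATWG URL semantics.
const seedHostname = new URL('https://www.example.com/start').hostname;

function shouldClickDiscover(pageUrl: string): boolean {
  try {
    return new URL(pageUrl).hostname === seedHostname;
  } catch {
    return false; // malformed URLs never qualify
  }
}

console.log(shouldClickDiscover('https://www.example.com/a')); // true
console.log(shouldClickDiscover('https://blog.example.com/b')); // false: other subdomain
```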
@@ -261,8 +295,11 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
  const subProfileDir = path.join(baseDir, `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`);
  await fsp.mkdir(subProfileDir, { recursive: true });
  // Assign to Crawlee's launcher
+ // Crawlee preLaunchHooks expects launchContext to be mutated in-place.
+ // eslint-disable-next-line no-param-reassign
  launchContext.userDataDir = subProfileDir;
  // Safely extend launchOptions
+ // eslint-disable-next-line no-param-reassign
  launchContext.launchOptions = {
  ...launchContext.launchOptions,
  ignoreHTTPSErrors: true,
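For context on the `no-param-reassign` suppressions: each launch gets its own throwaway profile directory, created just before the assignment. A sketch of that isolation step using only Node built-ins, with naming that mirrors the hunk:

```ts
// Node built-ins only; directory naming mirrors the hunk above.
import { mkdir } from 'node:fs/promises';
import path from 'node:path';

async function makeProfileDir(baseDir: string): Promise<string> {
  // unique per launch: timestamp plus a short random suffix
  const dir = path.join(baseDir, `profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`);
  await mkdir(dir, { recursive: true });
  return dir;
}
```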
@@ -287,7 +324,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
  const OBSERVER_TIMEOUT = 5000; // hard cap on total wait
  const observer = new MutationObserver(() => {
  clearTimeout(timeout);
- mutationCount++;
+ mutationCount += 1;
  if (mutationCount > MAX_MUTATIONS) {
  observer.disconnect();
  resolve('Too many mutations, exiting.');
@@ -308,6 +345,9 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
  if (!root || typeof observer.observe !== 'function') {
  resolve('No root node to observe.');
  }
+ else {
+ observer.observe(root, { childList: true, subtree: true });
+ }
  });
  });
  let finalUrl = page.url();
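The two hunks above complete a wait-for-quiet-DOM helper: previously `observer.observe(...)` was never invoked, so the observer could never fire. A browser-context sketch of the full pattern as read from the visible fragments; the surrounding timeout wiring is assumed, and the thresholds are illustrative:

```ts
// Browser-context sketch; quietMs/capMs/maxMutations are illustrative values.
function waitForDomQuiet(quietMs = 1000, capMs = 5000, maxMutations = 250): Promise<string> {
  return new Promise<string>(resolve => {
    let mutationCount = 0;
    const done = (msg: string) => {
      observer.disconnect();
      clearTimeout(timer);
      clearTimeout(cap);
      resolve(msg);
    };
    let timer = setTimeout(() => done('DOM is quiet'), quietMs);
    const cap = setTimeout(() => done('Hard cap reached'), capMs); // hard cap on total wait
    const observer = new MutationObserver(() => {
      clearTimeout(timer);
      mutationCount += 1;
      if (mutationCount > maxMutations) {
        done('Too many mutations, exiting.');
        return;
      }
      // re-arm the quiet timer after every burst of mutations
      timer = setTimeout(() => done('DOM is quiet'), quietMs);
    });
    observer.observe(document.documentElement, { childList: true, subtree: true });
  });
}
```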
@@ -319,7 +359,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
  }
  const isRedirected = !areLinksEqual(finalUrl, requestLabelUrl);
  if (isRedirected) {
- await requestQueue.addRequest({ url: finalUrl, label: finalUrl });
+ await enqueueUniqueRequest({ url: finalUrl, label: finalUrl });
  }
  else {
  request.skipNavigation = false;
@@ -327,7 +367,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
  },
  ],
  requestHandlerTimeoutSecs: 90, // Allow each page up to 90 seconds to be processed, up from the default 60
- requestHandler: async ({ page, request, response, crawler, sendRequest, enqueueLinks }) => {
+ requestHandler: async ({ page, request, response, crawler: activeCrawler, sendRequest, enqueueLinks, }) => {
  const browserContext = page.context();
  try {
  await waitForPageLoaded(page, 10000);
@@ -335,6 +375,11 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
  if (page.url() !== 'about:blank') {
  actualUrl = page.url();
  }
+ // Second-pass requests: only do click-discovery, skip scanning
+ if (request.label?.startsWith('__clickpass__')) {
+ await enqueueProcess(page, enqueueLinks, browserContext);
+ return;
+ }
  if (!isFollowStrategy(url, actualUrl, strategy) &&
  (isBlacklisted(actualUrl, blacklistedPatterns) || (isUrlPdf(actualUrl) && !isScanPdfs))) {
  guiInfoLog(guiInfoStatusTypes.SKIPPED, {
@@ -350,12 +395,12 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
  durationExceeded = true;
  }
  isAbortingScanNow = true;
- crawler.autoscaledPool.abort();
+ activeCrawler.autoscaledPool.abort();
  return;
  }
  // if URL has already been scanned
- if (urlsCrawled.scanned.some(item => item.url === request.url)) {
- // await enqueueProcess(page, enqueueLinks, browserContext);
+ if (scannedUrlSet.has(request.url)) {
+ await enqueueProcess(page, enqueueLinks, browserContext);
  return;
  }
  if (isDisallowedInRobotsTxt(request.url)) {
@@ -382,8 +427,8 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
  */
  return;
  }
- const { pdfFileName, url } = handlePdfDownload(randomToken, pdfDownloads, request, sendRequest, urlsCrawled);
- uuidToPdfMapping[pdfFileName] = url;
+ const { pdfFileName, url: downloadedPdfUrl } = handlePdfDownload(randomToken, pdfDownloads, request, sendRequest, urlsCrawled);
+ uuidToPdfMapping[pdfFileName] = downloadedPdfUrl;
  return;
  }
  if (isBlacklistedFileExtensions(actualUrl, blackListedFileExtensions)) {
@@ -449,7 +494,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
  }
  const results = await runAxeScript({ includeScreenshots, page, randomToken, ruleset });
  if (isRedirected) {
- const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(item => (item.actualUrl || item.url) === actualUrl);
+ const isLoadedUrlInCrawledUrls = scannedResolvedUrlSet.has(actualUrl);
  if (isLoadedUrlInCrawledUrls) {
  urlsCrawled.notScannedRedirects.push({
  fromUrl: request.url,
@@ -468,6 +513,8 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
  pageTitle: results.pageTitle,
  actualUrl, // i.e. actualUrl
  });
+ scannedUrlSet.add(request.url);
+ scannedResolvedUrlSet.add(actualUrl);
  urlsCrawled.scannedRedirects.push({
  fromUrl: request.url,
  toUrl: actualUrl, // i.e. actualUrl
@@ -477,20 +524,20 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
  await dataset.pushData(results);
  }
  }
- else {
+ else if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
  // One more check whether scanned pages have reached the limit, since multiple instances of the handler may be running
- if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
- guiInfoLog(guiInfoStatusTypes.SCANNED, {
- numScanned: urlsCrawled.scanned.length,
- urlScanned: request.url,
- });
- urlsCrawled.scanned.push({
- url: request.url,
- actualUrl: request.url,
- pageTitle: results.pageTitle,
- });
- await dataset.pushData(results);
- }
+ guiInfoLog(guiInfoStatusTypes.SCANNED, {
+ numScanned: urlsCrawled.scanned.length,
+ urlScanned: request.url,
+ });
+ urlsCrawled.scanned.push({
+ url: request.url,
+ actualUrl: request.url,
+ pageTitle: results.pageTitle,
+ });
+ scannedUrlSet.add(request.url);
+ scannedResolvedUrlSet.add(request.url);
+ await dataset.pushData(results);
  }
  }
  else {
@@ -521,15 +568,15 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
  numScanned: urlsCrawled.scanned.length,
  urlScanned: request.url,
  });
- page = await browserContext.newPage();
- await page.goto(request.url);
- await page.route('**/*', async (route) => {
+ const recoveryPage = await browserContext.newPage();
+ await recoveryPage.goto(request.url);
+ await recoveryPage.route('**/*', async (route) => {
  const interceptedRequest = route.request();
  if (interceptedRequest.resourceType() === 'document') {
  const interceptedRequestUrl = interceptedRequest
  .url()
  .replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
- await requestQueue.addRequest({
+ await enqueueUniqueRequest({
  url: interceptedRequestUrl,
  skipNavigation: isUrlPdf(interceptedRequest.url()),
  label: interceptedRequestUrl,
@@ -587,6 +634,59 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
  }),
  }));
  await crawler.run();
+ // Additional passes: keep re-visiting scanned seed-hostname pages for
+ // click-discovery until no new pages are found or limits are reached.
+ if (!safeMode && !isAbortingScanNow && !durationExceeded) {
+ const seedHostname = new URL(url).hostname;
+ const clickPassVisited = new Set();
+ let prevScannedCount;
+ do {
+ prevScannedCount = urlsCrawled.scanned.length;
+ if (prevScannedCount >= maxRequestsPerCrawl)
+ break;
+ if (scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000)
+ break;
+ const seedHostnamePages = urlsCrawled.scanned
+ .map(item => item.actualUrl || item.url)
+ .filter(pageUrl => {
+ try {
+ return new URL(pageUrl).hostname === seedHostname && !clickPassVisited.has(pageUrl);
+ }
+ catch {
+ return false;
+ }
+ });
+ if (seedHostnamePages.length === 0)
+ break;
+ let enqueued = 0;
+ for (const pageUrl of seedHostnamePages) {
+ if (urlsCrawled.scanned.length >= maxRequestsPerCrawl)
+ break;
+ if (scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000)
+ break;
+ clickPassVisited.add(pageUrl);
+ try {
+ const clickPassLabel = `__clickpass__${pageUrl}`;
+ if (!queuedUrlSet.has(clickPassLabel)) {
+ queuedUrlSet.add(clickPassLabel);
+ await requestQueue.addRequest({
+ url: pageUrl,
+ label: clickPassLabel,
+ skipNavigation: false,
+ });
+ enqueued += 1;
+ }
+ }
+ catch {
+ // ignore enqueue errors
+ }
+ }
+ if (enqueued === 0)
+ break;
+ await crawler.run();
+ // Stop looping if no new pages were discovered in this pass
+ } while (urlsCrawled.scanned.length > prevScannedCount);
+ }
  if (pdfDownloads.length > 0) {
  // wait for pdf downloads to complete
  await Promise.all(pdfDownloads);
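Stripped of the enqueue bookkeeping, the multi-pass block is a fixed-point loop: rerun the crawler until a pass discovers nothing new or a budget is exhausted. A schematic sketch with assumed callbacks:

```ts
// Schematic sketch; runPass and scannedCount stand in for crawler.run()
// and urlsCrawled.scanned.length respectively.
async function runUntilFixedPoint(
  runPass: () => Promise<void>,
  scannedCount: () => number,
  maxPages: number,
): Promise<void> {
  let before: number;
  do {
    before = scannedCount();
    if (before >= maxPages) break;   // page budget exhausted
    await runPass();                 // may scan and enqueue more pages
  } while (scannedCount() > before); // stop once a pass finds nothing new
}
```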
@@ -596,7 +696,7 @@ const crawlDomain = async ({ url, randomToken, host: _host, viewportSettings, ma
  const pdfResults = await mapPdfScanResults(randomToken, uuidToPdfMapping);
  // get screenshots from pdf docs
  if (includeScreenshots) {
- await Promise.all(pdfResults.map(async (result) => await doPdfScreenshots(randomToken, result)));
+ await Promise.all(pdfResults.map(result => doPdfScreenshots(randomToken, result)));
  }
  // push results for each pdf document to key value store
  await Promise.all(pdfResults.map(result => dataset.pushData(result)));
@@ -22,13 +22,25 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
  async function findSitemap(link, userDataDirectory, extraHTTPHeaders) {
  const homeUrl = getHomeUrl(link);
  let sitemapLink = '';
- const effectiveUserDataDirectory = process.env.CRAWLEE_HEADLESS === '1' ? userDataDirectory : '';
- const context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
- headless: process.env.CRAWLEE_HEADLESS === '1',
- ...getPlaywrightLaunchOptions(browser),
- ...(extraHTTPHeaders && { extraHTTPHeaders }),
- });
- register(context);
+ const launchOptions = getPlaywrightLaunchOptions(browser);
+ let context;
+ let browserInstance;
+ if (process.env.CRAWLEE_HEADLESS === '1') {
+ const effectiveUserDataDirectory = userDataDirectory || '';
+ context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
+ ...launchOptions,
+ ...(extraHTTPHeaders && { extraHTTPHeaders }),
+ });
+ register(context);
+ }
+ else {
+ // In headful mode, avoid launchPersistentContext to prevent "Browser window not found"
+ browserInstance = await constants.launcher.launch(launchOptions);
+ register(browserInstance);
+ context = await browserInstance.newContext({
+ ...(extraHTTPHeaders && { extraHTTPHeaders }),
+ });
+ }
  const page = await context.newPage();
  for (const path of sitemapPaths) {
  sitemapLink = homeUrl + path;
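The launch split reduces to: persistent context only when headless, plain `launch()` plus `newContext()` when headful. A hedged sketch calling Playwright's `chromium` directly, whereas the package goes through `constants.launcher`; the headful branch must also close the `Browser` it creates, as the next hunk does:

```ts
// Hedged sketch, assuming Playwright's chromium launcher directly.
import { chromium, type BrowserContext } from 'playwright';

async function launchSitemapContext(userDataDir: string): Promise<BrowserContext> {
  if (process.env.CRAWLEE_HEADLESS === '1') {
    // Headless: persistent context keeps the user profile on disk
    return chromium.launchPersistentContext(userDataDir || '', { headless: true });
  }
  // Headful: plain launch avoids the "Browser window not found" failure
  const browser = await chromium.launch({ headless: false });
  return browser.newContext(); // caller must close the Browser separately
}
```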
@@ -39,6 +51,9 @@ const crawlIntelligentSitemap = async (url, randomToken, host, viewportSettings,
  }
  await page.close();
  await context.close().catch(() => { });
+ if (browserInstance) {
+ await browserInstance.close().catch(() => { });
+ }
  return sitemapExist ? sitemapLink : '';
  }
  const checkUrlExists = async (page, parsedUrl) => {
@@ -6,6 +6,7 @@ import constants, { getIntermediateScreenshotsPath, guiInfoStatusTypes, } from '
  import { initNewPage, log } from './custom/utils.js';
  import { guiInfoLog } from '../logs.js';
  import { addUrlGuardScript } from './guards/urlGuard.js';
+ import { getPlaywrightLaunchOptions } from '../constants/common.js';
  // Export of classes
  export class ProcessPageParams {
  constructor(scannedIdx, blacklistedPatterns, includeScreenshots, dataset, intermediateScreenshotsPath, urlsCrawled, randomToken) {
@@ -34,11 +35,16 @@ const runCustom = async (url, randomToken, viewportSettings, blacklistedPatterns
  try {
  const deviceConfig = viewportSettings.playwrightDeviceDetailsObject;
  const hasCustomViewport = !!deviceConfig;
+ const baseLaunchOptions = getPlaywrightLaunchOptions('chrome');
+ // Merge base args with custom flow specific args
+ const baseArgs = baseLaunchOptions.args || [];
+ const customArgs = hasCustomViewport ? ['--window-size=1920,1040'] : ['--start-maximized'];
+ const mergedArgs = [...baseArgs.filter(a => !a.startsWith('--window-size') && a !== '--start-maximized'), ...customArgs];
  const browser = await chromium.launch({
- args: hasCustomViewport ? ['--window-size=1920,1040'] : ['--start-maximized'],
+ ...baseLaunchOptions,
+ args: mergedArgs,
  headless: false,
  channel: 'chrome',
- // bypassCSP: true,
  });
  const context = await browser.newContext({
  ignoreHTTPSErrors: true,
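The args merge keeps every base launch flag except the window-sizing ones, which the custom flow owns. A tiny self-contained check with illustrative values:

```ts
// Illustrative values: base args win except window-sizing flags,
// which the custom flow overrides.
const baseArgs: string[] = ['--disable-gpu', '--window-size=800,600'];
const hasCustomViewport = true;

const customArgs = hasCustomViewport ? ['--window-size=1920,1040'] : ['--start-maximized'];
const mergedArgs = [
  ...baseArgs.filter(a => !a.startsWith('--window-size') && a !== '--start-maximized'),
  ...customArgs,
];
console.log(mergedArgs); // ['--disable-gpu', '--window-size=1920,1040']
```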