@govtechsg/oobee 0.10.83 → 0.10.84

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,6 @@
  import crawlee, { EnqueueStrategy } from 'crawlee';
- import fs from 'fs';
  import type { BrowserContext, ElementHandle, Frame, Page } from 'playwright';
- import type { EnqueueLinksOptions, RequestOptions } from 'crawlee';
- import https from 'https';
- import type { BatchAddRequestsResult } from '@crawlee/types';
+ import type { PlaywrightCrawlingContext, RequestOptions } from 'crawlee';
  import * as path from 'path';
  import fsp from 'fs/promises';
  import {
@@ -39,7 +36,7 @@ import {
    mapPdfScanResults,
    doPdfScreenshots,
  } from './pdfScanFunc.js';
- import { consoleLogger, guiInfoLog, silentLogger } from '../logs.js';
+ import { consoleLogger, guiInfoLog } from '../logs.js';
  import { ViewportSettingsClass } from '../combine.js';

  const isBlacklisted = (url: string, blacklistedPatterns: string[]) => {
@@ -104,7 +101,8 @@ const crawlDomain = async ({
    const crawlStartTime = Date.now();
    let dataset: crawlee.Dataset;
    let urlsCrawled: UrlsCrawled;
-   let requestQueue: crawlee.RequestQueue;
+   const { requestQueue }: { requestQueue: crawlee.RequestQueue } =
+     await createCrawleeSubFolders(randomToken);
    let durationExceeded = false;

    if (fromCrawlIntelligentSitemap) {
@@ -115,73 +113,57 @@ const crawlDomain = async ({
      urlsCrawled = { ...constants.urlsCrawledObj };
    }

-   ({ requestQueue } = await createCrawleeSubFolders(randomToken));
-
    const pdfDownloads: Promise<void>[] = [];
    const uuidToPdfMapping: Record<string, string> = {};
+   const queuedUrlSet = new Set<string>();
+   const scannedUrlSet = new Set<string>(urlsCrawled.scanned.map(item => item.url));
+   const scannedResolvedUrlSet = new Set<string>(
+     urlsCrawled.scanned.map(item => item.actualUrl || item.url),
+   );
    const isScanHtml = [FileTypes.All, FileTypes.HtmlOnly].includes(fileTypes as FileTypes);
    const isScanPdfs = [FileTypes.All, FileTypes.PdfOnly].includes(fileTypes as FileTypes);
    const { maxConcurrency } = constants;
    const { playwrightDeviceDetailsObject } = viewportSettings;

-   await requestQueue.addRequest({
+   const enqueueUniqueRequest = async ({
      url,
-     skipNavigation: isUrlPdf(url),
-     label: url,
-   });
+     skipNavigation,
+     label,
+   }: {
+     url: string;
+     skipNavigation?: boolean;
+     label?: string;
+   }) => {
+     if (queuedUrlSet.has(url)) {
+       return;
+     }
+     queuedUrlSet.add(url);

-   const enqueueProcess = async (
-     page: Page,
-     enqueueLinks: (options: EnqueueLinksOptions) => Promise<BatchAddRequestsResult>,
-     browserContext: BrowserContext,
-   ) => {
      try {
-       await enqueueLinks({
-         // set selector matches anchor elements with href but not contains # or starting with mailto:
-         selector: `a:not(${disallowedSelectorPatterns})`,
-         strategy,
-         requestQueue,
-         transformRequestFunction: (req: RequestOptions): RequestOptions | null => {
-           try {
-             req.url = req.url.replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
-           } catch (e) {
-             consoleLogger.error(e);
-           }
-           if (urlsCrawled.scanned.some(item => item.url === req.url)) {
-             req.skipNavigation = true;
-           }
-           if (isDisallowedInRobotsTxt(req.url)) return null;
-           if (isBlacklisted(req.url, blacklistedPatterns)) return null;
-           if (isUrlPdf(req.url)) {
-             // playwright headless mode does not support navigation to pdf document
-             req.skipNavigation = true;
-           }
-           req.label = req.url;
-
-           return req;
-         },
+       await requestQueue.addRequest({
+         url,
+         skipNavigation,
+         label,
        });
-
-       // If safeMode flag is enabled, skip enqueueLinksByClickingElements
-       if (!safeMode) {
-         // Try catch is necessary as clicking links is best effort, it may result in new pages that cause browser load or navigation errors that PlaywrightCrawler does not handle
-         try {
-           await customEnqueueLinksByClickingElements(page, browserContext);
-         } catch (e) {
-           // do nothing;
-         }
-       }
-     } catch {
-       // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
-       // Handles browser page object been closed.
+     } catch (error) {
+       queuedUrlSet.delete(url);
+       throw error;
      }
    };

+   await enqueueUniqueRequest({
+     url,
+     skipNavigation: isUrlPdf(url),
+     label: url,
+   });
+
    const customEnqueueLinksByClickingElements = async (
-     page: Page,
+     currentPage: Page,
      browserContext: BrowserContext,
    ): Promise<void> => {
-     const initialPageUrl: string = page.url().toString();
+     let workingPage = currentPage;
+     const initialPageUrl: string = workingPage.url().toString();
+     const selectedElementsString = cssQuerySelectors.join(', ');

      const isExcluded = (newPageUrl: string): boolean => {
        const isAlreadyScanned: boolean = urlsCrawled.scanned.some(item => item.url === newPageUrl);
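
The new `enqueueUniqueRequest` helper replaces direct `requestQueue.addRequest` calls throughout this file: an in-memory `Set` drops duplicate URLs before they reach queue storage, and the reservation is rolled back if the enqueue rejects so a later attempt can retry. A minimal sketch of that pattern in isolation, with a generic `addRequest` callback standing in for `crawlee.RequestQueue#addRequest`:

```ts
// Sketch of the Set-guarded enqueue pattern introduced above (not the
// package's actual wiring; addRequest is a stand-in callback).
const queuedUrls = new Set<string>();

async function enqueueOnce(
  url: string,
  addRequest: (req: { url: string }) => Promise<void>,
): Promise<void> {
  if (queuedUrls.has(url)) return; // duplicate: drop silently
  queuedUrls.add(url); // reserve before awaiting so concurrent callers also dedupe
  try {
    await addRequest({ url });
  } catch (error) {
    queuedUrls.delete(url); // roll back the reservation so the URL can be retried
    throw error;
  }
}
```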
@@ -192,13 +174,13 @@ const crawlDomain = async ({
        );
        return isNotSupportedDocument || isAlreadyScanned || isBlacklistedUrl || isNotFollowStrategy;
      };
-     const setPageListeners = (page: Page): void => {
+     const setPageListeners = (pageListener: Page): void => {
        // event listener to handle new page popups upon button click
-       page.on('popup', async (newPage: Page) => {
+       pageListener.on('popup', async (newPage: Page) => {
          try {
-           if (newPage.url() != initialPageUrl && !isExcluded(newPage.url())) {
+           if (newPage.url() !== initialPageUrl && !isExcluded(newPage.url())) {
              const newPageUrl: string = newPage.url().replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
-             await requestQueue.addRequest({
+             await enqueueUniqueRequest({
                url: newPageUrl,
                skipNavigation: isUrlPdf(newPage.url()),
                label: newPageUrl,
@@ -218,15 +200,15 @@ const crawlDomain = async ({
        });

        // event listener to handle navigation to new url within same page upon element click
-       page.on('framenavigated', async (newFrame: Frame) => {
+       pageListener.on('framenavigated', async (newFrame: Frame) => {
          try {
            if (
              newFrame.url() !== initialPageUrl &&
              !isExcluded(newFrame.url()) &&
-             !(newFrame.url() == 'about:blank')
+             !(newFrame.url() === 'about:blank')
            ) {
              const newFrameUrl: string = newFrame.url().replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
-             await requestQueue.addRequest({
+             await enqueueUniqueRequest({
                url: newFrameUrl,
                skipNavigation: isUrlPdf(newFrame.url()),
                label: newFrameUrl,
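
Both listeners normalize URLs with the same lookbehind regex before enqueueing, so tracking-parameter variants of a page collapse to one queue entry. A standalone illustration (the `stripUtm` name is ours):

```ts
// The UTM-stripping replacement used above, shown in isolation.
const stripUtm = (u: string): string => u.replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');

// The lookbehind leaves the leading delimiter in place, so the query stays well-formed:
// stripUtm('https://example.com/?utm_source=x&page=2') === 'https://example.com/?page=2'
```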
@@ -238,28 +220,32 @@ const crawlDomain = async ({
          }
        });
      };
-     setPageListeners(page);
+     setPageListeners(workingPage);
      let currentElementIndex: number = 0;
      let isAllElementsHandled: boolean = false;
+     // This loop is intentionally sequential because each step depends on the latest page state
+     // (navigation, popup/frame events, and potential page recreation).
+     // Running iterations in parallel (for example with Promise.all) would race on shared `page`
+     // state, causing stale element handles and nondeterministic enqueue/navigation behavior.
+     /* eslint-disable no-await-in-loop */
      while (!isAllElementsHandled) {
        try {
          // navigate back to initial page if clicking on a element previously caused it to navigate to a new url
-         if (page.url() != initialPageUrl) {
+         if (workingPage.url() !== initialPageUrl) {
            try {
-             await page.close();
+             await workingPage.close();
            } catch {
              // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
              // Handles browser page object been closed.
            }
-           page = await browserContext.newPage();
-           await page.goto(initialPageUrl, {
+           workingPage = await browserContext.newPage();
+           await workingPage.goto(initialPageUrl, {
              waitUntil: 'domcontentloaded',
            });
-           setPageListeners(page);
+           setPageListeners(workingPage);
          }
-         const selectedElementsString = cssQuerySelectors.join(', ');
          const selectedElements: ElementHandle<SVGElement | HTMLElement>[] =
-           await page.$$(selectedElementsString);
+           await workingPage.$$(selectedElementsString);
          // edge case where there might be elements on page that appears intermittently
          if (currentElementIndex + 1 > selectedElements.length || !selectedElements) {
            break;
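
When a prior click navigated away, the loop discards the page and starts fresh rather than calling `goBack()`, and because Playwright listeners are per-page, the popup/framenavigated handlers must be reattached. A condensed sketch of that recovery step, assuming Playwright types:

```ts
import type { BrowserContext, Page } from 'playwright';

// Sketch of the page-recycling step in the click-discovery loop: a fresh page
// guarantees clean state after a click navigated away.
async function resetToInitialPage(
  browserContext: BrowserContext,
  initialPageUrl: string,
  attachListeners: (page: Page) => void,
): Promise<Page> {
  const page = await browserContext.newPage();
  await page.goto(initialPageUrl, { waitUntil: 'domcontentloaded' });
  attachListeners(page); // popup/framenavigated handlers do not carry over to a new page
  return page;
}
```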
@@ -273,60 +259,57 @@ const crawlDomain = async ({
          currentElementIndex += 1;
          let newUrlFoundInElement: string = null;
          if (await element.isVisible()) {
+           const currentPageUrl = workingPage.url();
            // Find url in html elements without clicking them
-           await page
-             .evaluate(element => {
-               // find href attribute
-               const hrefUrl: string = element.getAttribute('href');
-
-               // find url in datapath
-               const dataPathUrl: string = element.getAttribute('data-path');
-
-               return hrefUrl || dataPathUrl;
-             }, element)
-             .then(result => {
-               if (result) {
-                 newUrlFoundInElement = result;
-                 const pageUrl: URL = new URL(page.url());
-                 const baseUrl: string = `${pageUrl.protocol}//${pageUrl.host}`;
-                 let absoluteUrl: URL;
-                 // Construct absolute URL using base URL
-                 try {
-                   // Check if newUrlFoundInElement is a valid absolute URL
-                   absoluteUrl = new URL(newUrlFoundInElement);
-                 } catch (e) {
-                   // If it's not a valid URL, treat it as a relative URL
-                   absoluteUrl = new URL(newUrlFoundInElement, baseUrl);
-                 }
-                 newUrlFoundInElement = absoluteUrl.href;
-               }
-             });
+           const result = await workingPage.evaluate(pageElement => {
+             // find href attribute
+             const hrefUrl: string = pageElement.getAttribute('href');
+
+             // find url in datapath
+             const dataPathUrl: string = pageElement.getAttribute('data-path');
+
+             return hrefUrl || dataPathUrl;
+           }, element);
+           if (result) {
+             newUrlFoundInElement = result;
+             const pageUrl: URL = new URL(currentPageUrl);
+             const baseUrl: string = `${pageUrl.protocol}//${pageUrl.host}`;
+             let absoluteUrl: URL;
+             // Construct absolute URL using base URL
+             try {
+               // Check if newUrlFoundInElement is a valid absolute URL
+               absoluteUrl = new URL(newUrlFoundInElement);
+             } catch {
+               // If it's not a valid URL, treat it as a relative URL
+               absoluteUrl = new URL(newUrlFoundInElement, baseUrl);
+             }
+             newUrlFoundInElement = absoluteUrl.href;
+           }
            if (newUrlFoundInElement && !isExcluded(newUrlFoundInElement)) {
              const newUrlFoundInElementUrl: string = newUrlFoundInElement.replace(
                /(?<=&|\?)utm_.*?(&|$)/gim,
                '',
              );

-             await requestQueue.addRequest({
+             await enqueueUniqueRequest({
                url: newUrlFoundInElementUrl,
                skipNavigation: isUrlPdf(newUrlFoundInElement),
                label: newUrlFoundInElementUrl,
              });
            } else if (!newUrlFoundInElement) {
              try {
-               const shouldSkip = await shouldSkipClickDueToDisallowedHref(page, element);
+               const shouldSkip = await shouldSkipClickDueToDisallowedHref(workingPage, element);
                if (shouldSkip) {
-                 const elementHtml = await page.evaluate(el => el.outerHTML, element);
+                 const elementHtml = await workingPage.evaluate(el => el.outerHTML, element);
                  consoleLogger.info(
                    'Skipping a click due to disallowed href nearby. Element HTML:',
                    elementHtml,
                  );
-                 continue;
+               } else {
+                 // Find url in html elements by manually clicking them. New page navigation/popups will be handled by event listeners above
+                 await element.click({ force: true });
+                 await workingPage.waitForTimeout(1000); // Add a delay of 1 second between each Element click
                }
-
-               // Find url in html elements by manually clicking them. New page navigation/popups will be handled by event listeners above
-               await element.click({ force: true });
-               await page.waitForTimeout(1000); // Add a delay of 1 second between each Element click
              } catch {
                // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
                // Handles browser page object been closed.
@@ -338,6 +321,63 @@ const crawlDomain = async ({
          // Handles browser page object been closed.
        }
      }
+     /* eslint-enable no-await-in-loop */
+   };
+
+   const enqueueProcess = async (
+     page: Page,
+     enqueueLinks: PlaywrightCrawlingContext['enqueueLinks'],
+     browserContext: BrowserContext,
+   ) => {
+     try {
+       await enqueueLinks({
+         // set selector matches anchor elements with href but not contains # or starting with mailto:
+         selector: `a:not(${disallowedSelectorPatterns})`,
+         strategy,
+         requestQueue,
+         transformRequestFunction: (req: RequestOptions): RequestOptions | null => {
+           try {
+             req.url = req.url.replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
+           } catch (e) {
+             consoleLogger.error(e);
+           }
+           if (scannedUrlSet.has(req.url)) {
+             req.skipNavigation = true;
+           }
+           if (isDisallowedInRobotsTxt(req.url)) return null;
+           if (isBlacklisted(req.url, blacklistedPatterns)) return null;
+           if (isUrlPdf(req.url)) {
+             // playwright headless mode does not support navigation to pdf document
+             req.skipNavigation = true;
+           }
+           req.label = req.url;
+
+           return req;
+         },
+       });
+
+       // If safeMode flag is enabled, skip enqueueLinksByClickingElements
+       if (!safeMode) {
+         // Only run the expensive element-clicking discovery on pages sharing the
+         // same hostname as the seed URL. Cross-subdomain pages (reachable via
+         // same-domain strategy) still contribute their <a> links above, but
+         // clicking every interactive element on them is too slow and starves
+         // the crawler of time to discover pages on the primary hostname.
+         const currentHostname = new URL(page.url()).hostname;
+         const seedHostname = new URL(url).hostname;
+         if (currentHostname === seedHostname) {
+           // Try catch is necessary as clicking links is best effort, it may result in new pages that cause browser load or navigation errors that PlaywrightCrawler does not handle
+           try {
+             await customEnqueueLinksByClickingElements(page, browserContext);
+           } catch {
+             // do nothing;
+           }
+         }
+       }
+     } catch {
+       // No logging for this case as it is best effort to handle dynamic client-side JavaScript redirects and clicks.
+       // Handles browser page object been closed.
+     }
    };

    let isAbortingScanNow = false;
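
The comment block above spells out the trade-off: cross-subdomain pages still contribute their anchor links, but only pages on the seed hostname pay for element clicking. The gate reduces to a hostname comparison; a sketch, with a try/catch guard added for malformed URLs:

```ts
// Sketch of the hostname gate applied before customEnqueueLinksByClickingElements.
const sameHostAsSeed = (pageUrl: string, seedUrl: string): boolean => {
  try {
    return new URL(pageUrl).hostname === new URL(seedUrl).hostname;
  } catch {
    return false; // unparseable URL: treat as non-matching
  }
};

// sameHostAsSeed('https://blog.example.com/post', 'https://www.example.com/') → false,
// so a blog.example.com page is still scanned but skips click-discovery.
```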
@@ -368,9 +408,12 @@ const crawlDomain = async ({
            await fsp.mkdir(subProfileDir, { recursive: true });

            // Assign to Crawlee's launcher
+           // Crawlee preLaunchHooks expects launchContext to be mutated in-place.
+           // eslint-disable-next-line no-param-reassign
            launchContext.userDataDir = subProfileDir;

            // Safely extend launchOptions
+           // eslint-disable-next-line no-param-reassign
            launchContext.launchOptions = {
              ...launchContext.launchOptions,
              ignoreHTTPSErrors: true,
@@ -399,7 +442,7 @@ const crawlDomain = async ({
        const observer = new MutationObserver(() => {
          clearTimeout(timeout);

-         mutationCount++;
+         mutationCount += 1;
          if (mutationCount > MAX_MUTATIONS) {
            observer.disconnect();
            resolve('Too many mutations, exiting.');
@@ -422,6 +465,8 @@ const crawlDomain = async ({
        const root = document.documentElement || document.body || document;
        if (!root || typeof observer.observe !== 'function') {
          resolve('No root node to observe.');
+       } else {
+         observer.observe(root, { childList: true, subtree: true });
        }
      });
    });
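
Before this release the observer was constructed but `observe()` was never called, so the idle check could only resolve via its initial timer; the new `else` branch actually starts observation. A condensed sketch of the repaired quiet-period detection, with the timer re-arm and constants assumed from context since the diff shows only fragments:

```ts
// Sketch only: resolve once the DOM stays quiet for quietMs, or bail out after
// too many mutations. The package runs equivalent logic inside page.evaluate;
// maxMutations, quietMs, and the resolve messages here are illustrative.
function waitForDomQuiet(maxMutations = 250, quietMs = 1000): Promise<string> {
  return new Promise(resolve => {
    let mutationCount = 0;
    let timeout = setTimeout(() => resolve('DOM quiet.'), quietMs);
    const observer = new MutationObserver(() => {
      clearTimeout(timeout);
      mutationCount += 1;
      if (mutationCount > maxMutations) {
        observer.disconnect();
        resolve('Too many mutations, exiting.');
        return;
      }
      timeout = setTimeout(() => {
        observer.disconnect();
        resolve('DOM quiet.');
      }, quietMs);
    });
    const root = document.documentElement || document.body;
    if (!root) {
      resolve('No root node to observe.');
    } else {
      observer.observe(root, { childList: true, subtree: true });
    }
  });
}
```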
@@ -437,14 +482,21 @@ const crawlDomain = async ({

          const isRedirected = !areLinksEqual(finalUrl, requestLabelUrl);
          if (isRedirected) {
-           await requestQueue.addRequest({ url: finalUrl, label: finalUrl });
+           await enqueueUniqueRequest({ url: finalUrl, label: finalUrl });
          } else {
            request.skipNavigation = false;
          }
        },
      ],
      requestHandlerTimeoutSecs: 90, // Allow each page to be processed by up from default 60 seconds
-     requestHandler: async ({ page, request, response, crawler, sendRequest, enqueueLinks }) => {
+     requestHandler: async ({
+       page,
+       request,
+       response,
+       crawler: activeCrawler,
+       sendRequest,
+       enqueueLinks,
+     }) => {
        const browserContext: BrowserContext = page.context();
        try {
          await waitForPageLoaded(page, 10000);
@@ -454,6 +506,12 @@ const crawlDomain = async ({
            actualUrl = page.url();
          }

+         // Second-pass requests: only do click-discovery, skip scanning
+         if (request.label?.startsWith('__clickpass__')) {
+           await enqueueProcess(page, enqueueLinks, browserContext);
+           return;
+         }
+
          if (
            !isFollowStrategy(url, actualUrl, strategy) &&
            (isBlacklisted(actualUrl, blacklistedPatterns) || (isUrlPdf(actualUrl) && !isScanPdfs))
@@ -474,13 +532,13 @@ const crawlDomain = async ({
              durationExceeded = true;
            }
            isAbortingScanNow = true;
-           crawler.autoscaledPool.abort();
+           activeCrawler.autoscaledPool.abort();
            return;
          }

          // if URL has already been scanned
-         if (urlsCrawled.scanned.some(item => item.url === request.url)) {
-           // await enqueueProcess(page, enqueueLinks, browserContext);
+         if (scannedUrlSet.has(request.url)) {
+           await enqueueProcess(page, enqueueLinks, browserContext);
            return;
          }

@@ -512,7 +570,7 @@ const crawlDomain = async ({

              return;
            }
-           const { pdfFileName, url } = handlePdfDownload(
+           const { pdfFileName, url: downloadedPdfUrl } = handlePdfDownload(
              randomToken,
              pdfDownloads,
              request,
@@ -520,7 +578,7 @@ const crawlDomain = async ({
              urlsCrawled,
            );

-           uuidToPdfMapping[pdfFileName] = url;
+           uuidToPdfMapping[pdfFileName] = downloadedPdfUrl;
            return;
          }

@@ -597,9 +655,7 @@ const crawlDomain = async ({
          const results = await runAxeScript({ includeScreenshots, page, randomToken, ruleset });

          if (isRedirected) {
-           const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(
-             item => (item.actualUrl || item.url) === actualUrl,
-           );
+           const isLoadedUrlInCrawledUrls = scannedResolvedUrlSet.has(actualUrl);

            if (isLoadedUrlInCrawledUrls) {
              urlsCrawled.notScannedRedirects.push({
@@ -621,6 +677,8 @@ const crawlDomain = async ({
                pageTitle: results.pageTitle,
                actualUrl, // i.e. actualUrl
              });
+             scannedUrlSet.add(request.url);
+             scannedResolvedUrlSet.add(actualUrl);

              urlsCrawled.scannedRedirects.push({
                fromUrl: request.url,
@@ -631,20 +689,20 @@ const crawlDomain = async ({
              results.actualUrl = actualUrl;
              await dataset.pushData(results);
            }
-         } else {
+         } else if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
            // One more check if scanned pages have reached limit due to multi-instances of handler running
-           if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
-             guiInfoLog(guiInfoStatusTypes.SCANNED, {
-               numScanned: urlsCrawled.scanned.length,
-               urlScanned: request.url,
-             });
-             urlsCrawled.scanned.push({
-               url: request.url,
-               actualUrl: request.url,
-               pageTitle: results.pageTitle,
-             });
-             await dataset.pushData(results);
-           }
+           guiInfoLog(guiInfoStatusTypes.SCANNED, {
+             numScanned: urlsCrawled.scanned.length,
+             urlScanned: request.url,
+           });
+           urlsCrawled.scanned.push({
+             url: request.url,
+             actualUrl: request.url,
+             pageTitle: results.pageTitle,
+           });
+           scannedUrlSet.add(request.url);
+           scannedResolvedUrlSet.add(request.url);
+           await dataset.pushData(results);
          }
        } else {
          // Don't inform the user it is skipped since web crawler is best-effort.
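
Every push into `urlsCrawled.scanned` is now mirrored into `scannedUrlSet` and `scannedResolvedUrlSet`, turning the former O(n) `Array.some` membership checks into O(1) lookups. The invariant the three statements maintain can be expressed as one hypothetical helper (the package inlines the statements instead):

```ts
interface ScannedItem {
  url: string;
  actualUrl?: string;
  pageTitle: string;
}

// Hypothetical helper: the result array and both lookup Sets stay in sync.
function recordScanned(
  scanned: ScannedItem[],
  scannedUrlSet: Set<string>,
  scannedResolvedUrlSet: Set<string>,
  item: ScannedItem,
): void {
  scanned.push(item);
  scannedUrlSet.add(item.url); // the URL as it was requested
  scannedResolvedUrlSet.add(item.actualUrl || item.url); // the URL after redirects
}
```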
@@ -675,16 +733,16 @@ const crawlDomain = async ({
            urlScanned: request.url,
          });

-         page = await browserContext.newPage();
-         await page.goto(request.url);
+         const recoveryPage = await browserContext.newPage();
+         await recoveryPage.goto(request.url);

-         await page.route('**/*', async route => {
+         await recoveryPage.route('**/*', async route => {
            const interceptedRequest = route.request();
            if (interceptedRequest.resourceType() === 'document') {
              const interceptedRequestUrl = interceptedRequest
                .url()
                .replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
-             await requestQueue.addRequest({
+             await enqueueUniqueRequest({
                url: interceptedRequestUrl,
                skipNavigation: isUrlPdf(interceptedRequest.url()),
                label: interceptedRequestUrl,
@@ -749,6 +807,61 @@ const crawlDomain = async ({

    await crawler.run();

+   // Additional passes: keep re-visiting scanned seed-hostname pages for
+   // click-discovery until no new pages are found or limits are reached.
+   if (!safeMode && !isAbortingScanNow && !durationExceeded) {
+     const seedHostname = new URL(url).hostname;
+     const clickPassVisited = new Set<string>();
+     let prevScannedCount: number;
+
+     do {
+       prevScannedCount = urlsCrawled.scanned.length;
+
+       if (prevScannedCount >= maxRequestsPerCrawl) break;
+       if (scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000) break;
+
+       const seedHostnamePages = urlsCrawled.scanned
+         .map(item => item.actualUrl || item.url)
+         .filter(pageUrl => {
+           try {
+             return new URL(pageUrl).hostname === seedHostname && !clickPassVisited.has(pageUrl);
+           } catch {
+             return false;
+           }
+         });
+
+       if (seedHostnamePages.length === 0) break;
+
+       let enqueued = 0;
+       for (const pageUrl of seedHostnamePages) {
+         if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) break;
+         if (scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000) break;
+
+         clickPassVisited.add(pageUrl);
+         try {
+           const clickPassLabel = `__clickpass__${pageUrl}`;
+           if (!queuedUrlSet.has(clickPassLabel)) {
+             queuedUrlSet.add(clickPassLabel);
+             await requestQueue.addRequest({
+               url: pageUrl,
+               label: clickPassLabel,
+               skipNavigation: false,
+             });
+             enqueued += 1;
+           }
+         } catch {
+           // ignore enqueue errors
+         }
+       }
+
+       if (enqueued === 0) break;
+
+       await crawler.run();
+
+       // Stop looping if no new pages were discovered in this pass
+     } while (urlsCrawled.scanned.length > prevScannedCount);
+   }
+
    if (pdfDownloads.length > 0) {
      // wait for pdf downloads to complete
      await Promise.all(pdfDownloads);
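
The second-pass loop re-enqueues already-scanned seed-hostname pages under a `__clickpass__`-prefixed label, which the `requestHandler` checks early so those visits run click-discovery without re-scanning. A sketch of the label convention (the constant and helper names are ours):

```ts
// Sketch of the label convention distinguishing click-discovery revisits.
const CLICK_PASS_PREFIX = '__clickpass__';

const toClickPassLabel = (pageUrl: string): string => `${CLICK_PASS_PREFIX}${pageUrl}`;

const isClickPassRequest = (label?: string): boolean =>
  label?.startsWith(CLICK_PASS_PREFIX) ?? false;

// In the handler: if (isClickPassRequest(request.label)) { /* discovery only, no scan */ }
```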
@@ -761,9 +874,7 @@ const crawlDomain = async ({

    // get screenshots from pdf docs
    if (includeScreenshots) {
-     await Promise.all(
-       pdfResults.map(async result => await doPdfScreenshots(randomToken, result)),
-     );
+     await Promise.all(pdfResults.map(result => doPdfScreenshots(randomToken, result)));
    }

    // push results for each pdf document to key value store
@@ -53,14 +53,25 @@ const crawlIntelligentSitemap = async (
      const homeUrl = getHomeUrl(link);
      let sitemapLink = '';

-     const effectiveUserDataDirectory =
-       process.env.CRAWLEE_HEADLESS === '1' ? userDataDirectory : '';
-     const context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
-       headless: process.env.CRAWLEE_HEADLESS === '1',
-       ...getPlaywrightLaunchOptions(browser),
-       ...(extraHTTPHeaders && { extraHTTPHeaders }),
-     });
-     register(context);
+     const launchOptions = getPlaywrightLaunchOptions(browser);
+     let context;
+     let browserInstance;
+
+     if (process.env.CRAWLEE_HEADLESS === '1') {
+       const effectiveUserDataDirectory = userDataDirectory || '';
+       context = await constants.launcher.launchPersistentContext(effectiveUserDataDirectory, {
+         ...launchOptions,
+         ...(extraHTTPHeaders && { extraHTTPHeaders }),
+       });
+       register(context);
+     } else {
+       // In headful mode, avoid launchPersistentContext to prevent "Browser window not found"
+       browserInstance = await constants.launcher.launch(launchOptions);
+       register(browserInstance as unknown as { close: () => Promise<void> });
+       context = await browserInstance.newContext({
+         ...(extraHTTPHeaders && { extraHTTPHeaders }),
+       });
+     }

      const page = await context.newPage();

@@ -73,6 +84,9 @@ const crawlIntelligentSitemap = async (
      }
      await page.close();
      await context.close().catch(() => {});
+     if (browserInstance) {
+       await browserInstance.close().catch(() => {});
+     }
      return sitemapExist ? sitemapLink : '';
    }
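
The sitemap probe now launches differently per mode: headless runs keep the persistent profile, while headful runs use a plain `launch()` plus a fresh context to avoid the "Browser window not found" failure the diff comment mentions, and the extra browser instance is closed alongside the context. A condensed sketch, with Playwright's `chromium` standing in for `constants.launcher` and illustrative options:

```ts
import { chromium, type Browser, type BrowserContext } from 'playwright';

// Sketch of the headless/headful launch split described above.
async function launchForSitemapProbe(
  userDataDirectory?: string,
): Promise<{ context: BrowserContext; browserInstance?: Browser }> {
  if (process.env.CRAWLEE_HEADLESS === '1') {
    // Headless: persistent context keeps the user profile.
    const context = await chromium.launchPersistentContext(userDataDirectory || '', {
      headless: true,
    });
    return { context };
  }
  // Headful: plain browser + fresh context; the caller must close browserInstance too.
  const browserInstance = await chromium.launch({ headless: false });
  const context = await browserInstance.newContext();
  return { context, browserInstance };
}
```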