@govtechsg/oobee 0.10.21 → 0.10.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/.github/workflows/docker-test.yml +1 -1
  2. package/DETAILS.md +40 -25
  3. package/Dockerfile +41 -47
  4. package/INSTALLATION.md +1 -1
  5. package/LICENSE-3RD-PARTY-REPORT.txt +448 -0
  6. package/LICENSE-3RD-PARTY.txt +19913 -0
  7. package/README.md +10 -2
  8. package/__mocks__/mock-report.html +1503 -1360
  9. package/package.json +8 -4
  10. package/scripts/decodeUnzipParse.js +29 -0
  11. package/scripts/install_oobee_dependencies.command +2 -2
  12. package/scripts/install_oobee_dependencies.ps1 +3 -3
  13. package/src/cli.ts +3 -2
  14. package/src/combine.ts +1 -0
  15. package/src/constants/cliFunctions.ts +17 -3
  16. package/src/constants/common.ts +29 -5
  17. package/src/constants/constants.ts +28 -26
  18. package/src/constants/questions.ts +4 -1
  19. package/src/crawlers/commonCrawlerFunc.ts +159 -187
  20. package/src/crawlers/crawlDomain.ts +29 -30
  21. package/src/crawlers/crawlIntelligentSitemap.ts +7 -1
  22. package/src/crawlers/crawlLocalFile.ts +1 -1
  23. package/src/crawlers/crawlSitemap.ts +1 -1
  24. package/src/crawlers/custom/flagUnlabelledClickableElements.ts +546 -472
  25. package/src/crawlers/customAxeFunctions.ts +2 -2
  26. package/src/index.ts +0 -2
  27. package/src/mergeAxeResults.ts +608 -220
  28. package/src/screenshotFunc/pdfScreenshotFunc.ts +3 -3
  29. package/src/static/ejs/partials/components/wcagCompliance.ejs +10 -29
  30. package/src/static/ejs/partials/footer.ejs +10 -13
  31. package/src/static/ejs/partials/scripts/categorySummary.ejs +2 -2
  32. package/src/static/ejs/partials/scripts/decodeUnzipParse.ejs +3 -0
  33. package/src/static/ejs/partials/scripts/reportSearch.ejs +1 -0
  34. package/src/static/ejs/partials/scripts/ruleOffcanvas.ejs +54 -52
  35. package/src/static/ejs/partials/styles/styles.ejs +4 -0
  36. package/src/static/ejs/partials/summaryMain.ejs +15 -42
  37. package/src/static/ejs/report.ejs +21 -12
  38. package/src/utils.ts +10 -2
  39. package/src/xPathToCss.ts +186 -0
  40. package/a11y-scan-results.zip +0 -0
  41. package/src/types/xpath-to-css.d.ts +0 -3
@@ -1,14 +1,14 @@
1
1
  import crawlee, { CrawlingContext, PlaywrightGotoOptions } from 'crawlee';
2
2
  import axe, { AxeResults, ImpactValue, NodeResult, Result, resultGroups, TagValue } from 'axe-core';
3
- import xPathToCss from 'xpath-to-css';
4
- import { Page } from 'playwright';
3
+ import { BrowserContext, Page } from 'playwright';
4
+ import { xPathToCss } from '../xPathToCss.js';
5
5
  import {
6
6
  axeScript,
7
7
  guiInfoStatusTypes,
8
8
  RuleFlags,
9
9
  saflyIconSelector,
10
10
  } from '../constants/constants.js';
11
- import { consoleLogger, guiInfoLog, silentLogger } from '../logs.js';
11
+ import { guiInfoLog, silentLogger } from '../logs.js';
12
12
  import { takeScreenshotForHTMLElements } from '../screenshotFunc/htmlScreenshotFunc.js';
13
13
  import { isFilePath } from '../constants/common.js';
14
14
  import { customAxeConfig } from './customAxeFunctions.js';
@@ -208,63 +208,70 @@ export const runAxeScript = async ({
208
208
  selectors?: string[];
209
209
  ruleset?: RuleFlags[];
210
210
  }) => {
211
- // Checking for DOM mutations before proceeding to scan
212
- await page.evaluate(() => {
213
- return new Promise(resolve => {
214
- let timeout: NodeJS.Timeout;
215
- let mutationCount = 0;
216
- const MAX_MUTATIONS = 100;
217
- const MAX_SAME_MUTATION_LIMIT = 10;
218
- const mutationHash = {};
219
-
220
- const observer = new MutationObserver(mutationsList => {
221
- clearTimeout(timeout);
211
+ const browserContext: BrowserContext = page.context();
212
+ const requestUrl = page.url();
213
+
214
+ try {
215
+ // Checking for DOM mutations before proceeding to scan
216
+ await page.evaluate(() => {
217
+ return new Promise(resolve => {
218
+ let timeout: NodeJS.Timeout;
219
+ let mutationCount = 0;
220
+ const MAX_MUTATIONS = 250;
221
+ const MAX_SAME_MUTATION_LIMIT = 10;
222
+ const mutationHash = {};
223
+
224
+ const observer = new MutationObserver(mutationsList => {
225
+ clearTimeout(timeout);
226
+
227
+ mutationCount += 1;
228
+
229
+ if (mutationCount > MAX_MUTATIONS) {
230
+ observer.disconnect();
231
+ resolve('Too many mutations detected');
232
+ }
222
233
 
223
- mutationCount += 1;
234
+ // To handle scenario where DOM elements are constantly changing and unable to exit
235
+ mutationsList.forEach(mutation => {
236
+ let mutationKey: string;
224
237
 
225
- if (mutationCount > MAX_MUTATIONS) {
226
- observer.disconnect();
227
- resolve('Too many mutations detected');
228
- }
238
+ if (mutation.target instanceof Element) {
239
+ Array.from(mutation.target.attributes).forEach(attr => {
240
+ mutationKey = `${mutation.target.nodeName}-${attr.name}`;
229
241
 
230
- // To handle scenario where DOM elements are constantly changing and unable to exit
231
- mutationsList.forEach(mutation => {
232
- let mutationKey: string;
233
-
234
- if (mutation.target instanceof Element) {
235
- Array.from(mutation.target.attributes).forEach(attr => {
236
- mutationKey = `${mutation.target.nodeName}-${attr.name}`;
242
+ if (mutationKey) {
243
+ if (!mutationHash[mutationKey]) {
244
+ mutationHash[mutationKey] = 1;
245
+ } else {
246
+ mutationHash[mutationKey] += 1;
247
+ }
237
248
 
238
- if (mutationKey) {
239
- if (!mutationHash[mutationKey]) {
240
- mutationHash[mutationKey] = 1;
241
- } else {
242
- mutationHash[mutationKey] += 1;
249
+ if (mutationHash[mutationKey] >= MAX_SAME_MUTATION_LIMIT) {
250
+ observer.disconnect();
251
+ resolve(`Repeated mutation detected for ${mutationKey}`);
252
+ }
243
253
  }
254
+ });
255
+ }
256
+ });
244
257
 
245
- if (mutationHash[mutationKey] >= MAX_SAME_MUTATION_LIMIT) {
246
- observer.disconnect();
247
- resolve(`Repeated mutation detected for ${mutationKey}`);
248
- }
249
- }
250
- });
251
- }
258
+ timeout = setTimeout(() => {
259
+ observer.disconnect();
260
+ resolve('DOM stabilized after mutations.');
261
+ }, 1000);
252
262
  });
253
263
 
254
264
  timeout = setTimeout(() => {
255
265
  observer.disconnect();
256
- resolve('DOM stabilized after mutations.');
266
+ resolve('No mutations detected, exit from idle state');
257
267
  }, 1000);
258
- });
259
-
260
- timeout = setTimeout(() => {
261
- observer.disconnect();
262
- resolve('No mutations detected, exit from idle state');
263
- }, 1000);
264
268
 
265
- observer.observe(document, { childList: true, subtree: true, attributes: true });
269
+ observer.observe(document, { childList: true, subtree: true, attributes: true });
270
+ });
266
271
  });
267
- });
272
+ } catch (e) {
273
+ silentLogger.warn(`Error while checking for DOM mutations: ${e}`);
274
+ }
268
275
 
269
276
  page.on('console', msg => {
270
277
  const type = msg.type();
@@ -350,24 +357,28 @@ export const runAxeScript = async ({
350
357
  return !node.dataset.flagged; // fail any element with a data-flagged attribute set to true
351
358
  },
352
359
  },
353
- {
354
- ...customAxeConfig.checks[2],
355
- evaluate: (_node: HTMLElement) => {
356
- if (gradingReadabilityFlag === '') {
357
- return true; // Pass if no readability issues
358
- }
359
- // Dynamically update the grading messages
360
- const gradingCheck = customAxeConfig.checks.find(
361
- check => check.id === 'oobee-grading-text-contents',
362
- );
363
- if (gradingCheck) {
364
- gradingCheck.metadata.messages.incomplete = `The text content is potentially difficult to read, with a Flesch-Kincaid Reading Ease score of ${gradingReadabilityFlag
365
- }.\nThe target passing score is above 50, indicating content readable by university students and lower grade levels.\nA higher score reflects better readability.`;
366
- }
367
-
368
- // Fail if readability issues are detected
369
- },
370
- },
360
+ ...(enableWcagAaa
361
+ ? [
362
+ {
363
+ ...customAxeConfig.checks[2],
364
+ evaluate: (_node: HTMLElement) => {
365
+ if (gradingReadabilityFlag === '') {
366
+ return true; // Pass if no readability issues
367
+ }
368
+ // Dynamically update the grading messages
369
+ const gradingCheck = customAxeConfig.checks.find(
370
+ check => check.id === 'oobee-grading-text-contents',
371
+ );
372
+ if (gradingCheck) {
373
+ gradingCheck.metadata.messages.incomplete = `The text content is potentially difficult to read, with a Flesch-Kincaid Reading Ease score of ${gradingReadabilityFlag
374
+ }.\nThe target passing score is above 50, indicating content readable by university students and lower grade levels.\nA higher score reflects better readability.`;
375
+ }
376
+
377
+ // Fail if readability issues are detected
378
+ },
379
+ },
380
+ ]
381
+ : []),
371
382
  ],
372
383
  rules: customAxeConfig.rules
373
384
  .filter(rule => (disableOobee ? !rule.id.startsWith('oobee') : true))
@@ -409,123 +420,66 @@ export const runAxeScript = async ({
409
420
  const escapedCssSelectors =
410
421
  oobeeAccessibleLabelFlaggedCssSelectors.map(escapeCSSSelector);
411
422
 
412
- function frameCheck(cssSelector: string): { doc: Document; remainingSelector: string } {
423
+ function framesCheck(cssSelector: string): {
424
+ doc: Document;
425
+ remainingSelector: string;
426
+ } {
413
427
  let doc = document; // Start with the main document
414
- let frameSelector = ""; // To store the frame part of the selector
428
+ let remainingSelector = ''; // To store the last part of the selector
429
+ let targetIframe = null;
415
430
 
416
- // Extract the 'frame' part of the selector
417
- let frameMatch = cssSelector.match(/(frame[^>]*>)/i);
418
- if (frameMatch) {
419
- frameSelector = frameMatch[1].replace(">", "").trim(); // Clean up the frame part
420
- cssSelector = cssSelector.split(frameMatch[1])[1].trim(); // Remove the frame portion
421
- }
431
+ // Split the selector into parts at "> html"
432
+ const diffParts = cssSelector.split(/\s*>\s*html\s*/);
422
433
 
423
- let targetFrame = null; // Target frame element
424
-
425
- // Locate the frame based on the extracted frameSelector
426
- if (frameSelector.includes("first-of-type")) {
427
- // Select the first frame
428
- targetFrame = document.querySelector("frame:first-of-type");
429
- } else if (frameSelector.includes("nth-of-type")) {
430
- // Select the nth frame
431
- let nthIndex = frameSelector.match(/nth-of-type\((\d+)\)/);
432
- if (nthIndex) {
433
- let index = parseInt(nthIndex[1]) - 1; // Zero-based index
434
- targetFrame = document.querySelectorAll("frame")[index];
435
- }
436
- } else if (frameSelector.includes("#")) {
437
- // Frame with a specific ID
438
- let idMatch = frameSelector.match(/#([\w-]+)/);
439
- if (idMatch) {
440
- targetFrame = document.getElementById(idMatch[1]);
441
- }
442
- } else if (frameSelector.includes('[name="')) {
443
- // Frame with a specific name attribute
444
- let nameMatch = frameSelector.match(/name="([\w-]+)"/);
445
- if (nameMatch) {
446
- targetFrame = document.querySelector(`frame[name="${nameMatch[1]}"]`);
447
- }
448
- } else {
449
- // Default to the first frame
450
- targetFrame = document.querySelector("frame");
451
- }
434
+ for (let i = 0; i < diffParts.length - 1; i++) {
435
+ let iframeSelector = `${diffParts[i].trim()}`;
452
436
 
453
- // Update the document if the frame was found
454
- if (targetFrame && targetFrame.contentDocument) {
455
- doc = targetFrame.contentDocument;
456
- } else {
457
- console.warn("Frame not found or contentDocument inaccessible.");
458
- }
459
-
460
- return { doc, remainingSelector: cssSelector };
461
- }
462
-
463
- function iframeCheck(cssSelector: string): { doc: Document; remainingSelector: string } {
464
- let doc = document; // Start with the main document
465
- let iframeSelector = ""; // To store the iframe part of the selector
466
-
467
- // Extract the 'iframe' part of the selector
468
- let iframeMatch = cssSelector.match(/(iframe[^>]*>)/i);
469
- if (iframeMatch) {
470
- iframeSelector = iframeMatch[1].replace(">", "").trim(); // Clean up the iframe part
471
- cssSelector = cssSelector.split(iframeMatch[1])[1].trim(); // Remove the iframe portion
472
- }
437
+ // Add back '> html' to the current part
438
+ if (i > 0) {
439
+ iframeSelector = `html > ${iframeSelector}`;
440
+ }
473
441
 
474
- let targetIframe = null; // Target iframe element
475
-
476
- // Locate the iframe based on the extracted iframeSelector
477
- if (iframeSelector.includes("first-of-type")) {
478
- // Select the first iframe
479
- targetIframe = document.querySelector("iframe:first-of-type");
480
- } else if (iframeSelector.includes("nth-of-type")) {
481
- // Select the nth iframe
482
- let nthIndex = iframeSelector.match(/nth-of-type\((\d+)\)/);
483
- if (nthIndex) {
484
- let index = parseInt(nthIndex[1]) - 1; // Zero-based index
485
- targetIframe = document.querySelectorAll("iframe")[index];
442
+ let frameset = null;
443
+ // Find the iframe using the current document context
444
+ if (doc.querySelector('frameset')) {
445
+ frameset = doc.querySelector('frameset');
486
446
  }
487
- } else if (iframeSelector.includes("#")) {
488
- // Iframe with a specific ID
489
- let idMatch = iframeSelector.match(/#([\w-]+)/);
490
- if (idMatch) {
491
- targetIframe = document.getElementById(idMatch[1]);
447
+
448
+ if (frameset) {
449
+ doc = frameset;
450
+ iframeSelector = iframeSelector.split('body >')[1].trim();
492
451
  }
493
- } else if (iframeSelector.includes('[name="')) {
494
- // Iframe with a specific name attribute
495
- let nameMatch = iframeSelector.match(/name="([\w-]+)"/);
496
- if (nameMatch) {
497
- targetIframe = document.querySelector(`iframe[name="${nameMatch[1]}"]`);
452
+ targetIframe = doc.querySelector(iframeSelector);
453
+
454
+ if (targetIframe && targetIframe.contentDocument) {
455
+ // Update the document to the iframe's contentDocument
456
+ doc = targetIframe.contentDocument;
457
+ } else {
458
+ console.warn(
459
+ `Iframe not found or contentDocument inaccessible for selector: ${iframeSelector}`,
460
+ );
461
+ return { doc, remainingSelector: cssSelector }; // Return original selector if iframe not found
498
462
  }
499
- } else {
500
- // Default to the first iframe
501
- targetIframe = document.querySelector("iframe");
502
463
  }
503
464
 
504
- // Update the document if the iframe was found
505
- if (targetIframe && targetIframe.contentDocument) {
506
- doc = targetIframe.contentDocument;
507
- } else {
508
- console.warn("Iframe not found or contentDocument inaccessible.");
509
- }
465
+ // The last part is the remaining CSS selector
466
+ remainingSelector = diffParts[diffParts.length - 1].trim();
510
467
 
511
- return { doc, remainingSelector: cssSelector };
468
+ // Remove any leading '>' combinators from remainingSelector
469
+ remainingSelector = `html${remainingSelector}`;
470
+
471
+ return { doc, remainingSelector };
512
472
  }
513
473
 
514
474
  function findElementByCssSelector(cssSelector: string): string | null {
515
475
  let doc = document;
516
476
 
517
- // Check if the selector includes 'frame' and update doc and selector
518
- if (cssSelector.includes("frame")) {
519
- const result = frameCheck(cssSelector);
520
- doc = result.doc;
521
- cssSelector = result.remainingSelector;
522
- }
477
+ // Check if the selector includes 'frame' or 'iframe' and update doc and selector
523
478
 
524
- // Check for iframe
525
- if (cssSelector.includes("iframe")) {
526
- const result = iframeCheck(cssSelector);
527
- doc = result.doc;
528
- cssSelector = result.remainingSelector;
479
+ if (/\s*>\s*html\s*/.test(cssSelector)) {
480
+ const inFrames = framesCheck(cssSelector);
481
+ doc = inFrames.doc;
482
+ cssSelector = inFrames.remainingSelector;
529
483
  }
530
484
 
531
485
  // Query the element in the document (including inside frames)
@@ -553,35 +507,42 @@ export const runAxeScript = async ({
553
507
  }
554
508
  }
555
509
 
556
- return element ? element.outerHTML : null;
510
+ if (element) {
511
+ return element.outerHTML;
512
+ }
513
+
514
+ console.warn(`Unable to find element for css selector: ${cssSelector}`);
515
+ return null;
557
516
  }
558
517
 
559
518
  // Add oobee violations to Axe's report
560
519
  const oobeeAccessibleLabelViolations = {
561
520
  id: 'oobee-accessible-label',
562
521
  impact: 'serious' as ImpactValue,
563
- tags: ['wcag2a', 'wcag211', 'wcag243', 'wcag412'],
522
+ tags: ['wcag2a', 'wcag211', 'wcag412'],
564
523
  description: 'Ensures clickable elements have an accessible label.',
565
524
  help: 'Clickable elements (i.e. elements with mouse-click interaction) must have accessible labels.',
566
525
  helpUrl: 'https://www.deque.com/blog/accessible-aria-buttons',
567
- nodes: escapedCssSelectors.map(cssSelector => ({
568
- html: findElementByCssSelector(cssSelector),
569
- target: [cssSelector],
570
- impact: 'serious' as ImpactValue,
571
- failureSummary:
572
- 'Fix any of the following:\n The clickable element does not have an accessible label.',
573
- any: [
574
- {
575
- id: 'oobee-accessible-label',
576
- data: null,
577
- relatedNodes: [],
578
- impact: 'serious',
579
- message: 'The clickable element does not have an accessible label.',
580
- },
581
- ],
582
- all: [],
583
- none: [],
584
- })),
526
+ nodes: escapedCssSelectors
527
+ .map(cssSelector => ({
528
+ html: findElementByCssSelector(cssSelector),
529
+ target: [cssSelector],
530
+ impact: 'serious' as ImpactValue,
531
+ failureSummary:
532
+ 'Fix any of the following:\n The clickable element does not have an accessible label.',
533
+ any: [
534
+ {
535
+ id: 'oobee-accessible-label',
536
+ data: null,
537
+ relatedNodes: [],
538
+ impact: 'serious',
539
+ message: 'The clickable element does not have an accessible label.',
540
+ },
541
+ ],
542
+ all: [],
543
+ none: [],
544
+ }))
545
+ .filter(item => item.html),
585
546
  };
586
547
 
587
548
  results.violations = [...results.violations, oobeeAccessibleLabelViolations];
@@ -612,7 +573,18 @@ export const runAxeScript = async ({
612
573
  results.incomplete = await takeScreenshotForHTMLElements(results.incomplete, page, randomToken);
613
574
  }
614
575
 
615
- const pageTitle = await page.evaluate(() => document.title);
576
+ let pageTitle = null;
577
+ try {
578
+ pageTitle = await page.evaluate(() => document.title);
579
+ } catch (e) {
580
+ silentLogger.warn(`Error while getting page title: ${e}`);
581
+ if (page.isClosed()) {
582
+ silentLogger.info(`Page was closed for ${requestUrl}, creating new page`);
583
+ page = await browserContext.newPage();
584
+ await page.goto(requestUrl, { waitUntil: 'domcontentloaded' });
585
+ pageTitle = await page.evaluate(() => document.title);
586
+ }
587
+ }
616
588
 
617
589
  return filterAxeResults(results, pageTitle, customFlowDetails);
618
590
  };
@@ -653,4 +625,4 @@ export const isUrlPdf = (url: string) => {
653
625
  }
654
626
  const parsedUrl = new URL(url);
655
627
  return /\.pdf($|\?|#)/i.test(parsedUrl.pathname) || /\.pdf($|\?|#)/i.test(parsedUrl.href);
656
- };
628
+ };
@@ -40,8 +40,7 @@ import {
40
40
  import { silentLogger, guiInfoLog } from '../logs.js';
41
41
  import { ViewportSettingsClass } from '../combine.js';
42
42
 
43
- const isBlacklisted = (url: string) => {
44
- const blacklistedPatterns = getBlackListedPatterns(null);
43
+ const isBlacklisted = (url: string, blacklistedPatterns: string[]) => {
45
44
  if (!blacklistedPatterns) {
46
45
  return false;
47
46
  }
@@ -122,7 +121,7 @@ const crawlDomain = async ({
122
121
  const isScanPdfs = ['all', 'pdf-only'].includes(fileTypes);
123
122
  const { maxConcurrency } = constants;
124
123
  const { playwrightDeviceDetailsObject } = viewportSettings;
125
- const isBlacklistedUrl = isBlacklisted(url);
124
+ const isBlacklistedUrl = isBlacklisted(url, blacklistedPatterns);
126
125
 
127
126
  const httpsAgent = new https.Agent({ rejectUnauthorized: false });
128
127
 
@@ -315,7 +314,7 @@ const crawlDomain = async ({
315
314
 
316
315
  const isExcluded = (newPageUrl: string): boolean => {
317
316
  const isAlreadyScanned: boolean = urlsCrawled.scanned.some(item => item.url === newPageUrl);
318
- const isBlacklistedUrl: boolean = isBlacklisted(newPageUrl);
317
+ const isBlacklistedUrl: boolean = isBlacklisted(newPageUrl, blacklistedPatterns);
319
318
  const isNotFollowStrategy: boolean = !isFollowStrategy(newPageUrl, initialPageUrl, strategy);
320
319
  return isAlreadyScanned || isBlacklistedUrl || isNotFollowStrategy;
321
320
  };
@@ -469,7 +468,7 @@ const crawlDomain = async ({
469
468
  launcher: constants.launcher,
470
469
  launchOptions: getPlaywrightLaunchOptions(browser),
471
470
  // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
472
- userDataDir,
471
+ ...(process.env.CRAWLEE_HEADLESS === '0' && { userDataDir }),
473
472
  },
474
473
  retryOnBlocked: true,
475
474
  browserPoolOptions: {
@@ -496,7 +495,7 @@ const crawlDomain = async ({
496
495
  return new Promise(resolve => {
497
496
  let timeout;
498
497
  let mutationCount = 0;
499
- const MAX_MUTATIONS = 100;
498
+ const MAX_MUTATIONS = 250;
500
499
  const MAX_SAME_MUTATION_LIMIT = 10;
501
500
  const mutationHash = {};
502
501
 
@@ -568,31 +567,31 @@ const crawlDomain = async ({
568
567
  ],
569
568
  preNavigationHooks: isBasicAuth
570
569
  ? [
571
- async ({ page, request }) => {
572
- await page.setExtraHTTPHeaders({
573
- Authorization: authHeader,
574
- ...extraHTTPHeaders,
575
- });
576
- const processible = await isProcessibleUrl(request.url);
577
- if (!processible) {
578
- request.skipNavigation = true;
579
- return null;
580
- }
581
- },
582
- ]
570
+ async ({ page, request }) => {
571
+ await page.setExtraHTTPHeaders({
572
+ Authorization: authHeader,
573
+ ...extraHTTPHeaders,
574
+ });
575
+ const processible = await isProcessibleUrl(request.url);
576
+ if (!processible) {
577
+ request.skipNavigation = true;
578
+ return null;
579
+ }
580
+ },
581
+ ]
583
582
  : [
584
- async ({ page, request }) => {
585
- await page.setExtraHTTPHeaders({
586
- ...extraHTTPHeaders,
587
- });
583
+ async ({ page, request }) => {
584
+ await page.setExtraHTTPHeaders({
585
+ ...extraHTTPHeaders,
586
+ });
588
587
 
589
- const processible = await isProcessibleUrl(request.url);
590
- if (!processible) {
591
- request.skipNavigation = true;
592
- return null;
593
- }
594
- },
595
- ],
588
+ const processible = await isProcessibleUrl(request.url);
589
+ if (!processible) {
590
+ request.skipNavigation = true;
591
+ return null;
592
+ }
593
+ },
594
+ ],
596
595
  requestHandlerTimeoutSecs: 90, // Allow each page to be processed by up from default 60 seconds
597
596
  requestHandler: async ({ page, request, response, crawler, sendRequest, enqueueLinks }) => {
598
597
  const browserContext: BrowserContext = page.context();
@@ -615,7 +614,7 @@ const crawlDomain = async ({
615
614
  actualUrl = page.url();
616
615
  }
617
616
 
618
- if (isBlacklisted(actualUrl) || (isUrlPdf(actualUrl) && !isScanPdfs)) {
617
+ if (isBlacklisted(actualUrl, blacklistedPatterns) || (isUrlPdf(actualUrl) && !isScanPdfs)) {
619
618
  guiInfoLog(guiInfoStatusTypes.SKIPPED, {
620
619
  numScanned: urlsCrawled.scanned.length,
621
620
  urlScanned: actualUrl,
@@ -50,7 +50,13 @@ const crawlIntelligentSitemap = async (
50
50
  const homeUrl = getHomeUrl(link);
51
51
  let sitemapLinkFound = false;
52
52
  let sitemapLink = '';
53
- const chromiumBrowser = await chromium.launch({ headless: true, channel: 'chrome' });
53
+ const chromiumBrowser = await chromium.launch(
54
+ {
55
+ headless: false,
56
+ channel: 'chrome',
57
+ args: ['--headless=new', '--no-sandbox']
58
+ });
59
+
54
60
  const page = await chromiumBrowser.newPage();
55
61
  for (const path of sitemapPaths) {
56
62
  sitemapLink = homeUrl + path;
@@ -143,7 +143,7 @@ const crawlLocalFile = async (
143
143
 
144
144
  if (!isUrlPdf(request.url)) {
145
145
  const browserContext = await constants.launcher.launchPersistentContext('', {
146
- headless: process.env.CRAWLEE_HEADLESS === '1',
146
+ headless: false,
147
147
  ...getPlaywrightLaunchOptions(browser),
148
148
  ...playwrightDeviceDetailsObject,
149
149
  });
@@ -144,7 +144,7 @@ const crawlSitemap = async (
144
144
  launcher: constants.launcher,
145
145
  launchOptions: getPlaywrightLaunchOptions(browser),
146
146
  // Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
147
- userDataDir,
147
+ ...(process.env.CRAWLEE_HEADLESS === '0' && { userDataDir }),
148
148
  },
149
149
  retryOnBlocked: true,
150
150
  browserPoolOptions: {