@govtechsg/oobee 0.10.21 → 0.10.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/docker-test.yml +1 -1
- package/DETAILS.md +40 -25
- package/Dockerfile +41 -47
- package/INSTALLATION.md +1 -1
- package/LICENSE-3RD-PARTY-REPORT.txt +448 -0
- package/LICENSE-3RD-PARTY.txt +19913 -0
- package/README.md +10 -2
- package/__mocks__/mock-report.html +1503 -1360
- package/package.json +8 -4
- package/scripts/decodeUnzipParse.js +29 -0
- package/scripts/install_oobee_dependencies.command +2 -2
- package/scripts/install_oobee_dependencies.ps1 +3 -3
- package/src/cli.ts +3 -2
- package/src/combine.ts +1 -0
- package/src/constants/cliFunctions.ts +17 -3
- package/src/constants/common.ts +29 -5
- package/src/constants/constants.ts +28 -26
- package/src/constants/questions.ts +4 -1
- package/src/crawlers/commonCrawlerFunc.ts +159 -187
- package/src/crawlers/crawlDomain.ts +29 -30
- package/src/crawlers/crawlIntelligentSitemap.ts +7 -1
- package/src/crawlers/crawlLocalFile.ts +1 -1
- package/src/crawlers/crawlSitemap.ts +1 -1
- package/src/crawlers/custom/flagUnlabelledClickableElements.ts +546 -472
- package/src/crawlers/customAxeFunctions.ts +2 -2
- package/src/index.ts +0 -2
- package/src/mergeAxeResults.ts +608 -220
- package/src/screenshotFunc/pdfScreenshotFunc.ts +3 -3
- package/src/static/ejs/partials/components/wcagCompliance.ejs +10 -29
- package/src/static/ejs/partials/footer.ejs +10 -13
- package/src/static/ejs/partials/scripts/categorySummary.ejs +2 -2
- package/src/static/ejs/partials/scripts/decodeUnzipParse.ejs +3 -0
- package/src/static/ejs/partials/scripts/reportSearch.ejs +1 -0
- package/src/static/ejs/partials/scripts/ruleOffcanvas.ejs +54 -52
- package/src/static/ejs/partials/styles/styles.ejs +4 -0
- package/src/static/ejs/partials/summaryMain.ejs +15 -42
- package/src/static/ejs/report.ejs +21 -12
- package/src/utils.ts +10 -2
- package/src/xPathToCss.ts +186 -0
- package/a11y-scan-results.zip +0 -0
- package/src/types/xpath-to-css.d.ts +0 -3
@@ -1,14 +1,14 @@
|
|
1
1
|
import crawlee, { CrawlingContext, PlaywrightGotoOptions } from 'crawlee';
|
2
2
|
import axe, { AxeResults, ImpactValue, NodeResult, Result, resultGroups, TagValue } from 'axe-core';
|
3
|
-
import
|
4
|
-
import {
|
3
|
+
import { BrowserContext, Page } from 'playwright';
|
4
|
+
import { xPathToCss } from '../xPathToCss.js';
|
5
5
|
import {
|
6
6
|
axeScript,
|
7
7
|
guiInfoStatusTypes,
|
8
8
|
RuleFlags,
|
9
9
|
saflyIconSelector,
|
10
10
|
} from '../constants/constants.js';
|
11
|
-
import {
|
11
|
+
import { guiInfoLog, silentLogger } from '../logs.js';
|
12
12
|
import { takeScreenshotForHTMLElements } from '../screenshotFunc/htmlScreenshotFunc.js';
|
13
13
|
import { isFilePath } from '../constants/common.js';
|
14
14
|
import { customAxeConfig } from './customAxeFunctions.js';
|
@@ -208,63 +208,70 @@ export const runAxeScript = async ({
|
|
208
208
|
selectors?: string[];
|
209
209
|
ruleset?: RuleFlags[];
|
210
210
|
}) => {
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
211
|
+
const browserContext: BrowserContext = page.context();
|
212
|
+
const requestUrl = page.url();
|
213
|
+
|
214
|
+
try {
|
215
|
+
// Checking for DOM mutations before proceeding to scan
|
216
|
+
await page.evaluate(() => {
|
217
|
+
return new Promise(resolve => {
|
218
|
+
let timeout: NodeJS.Timeout;
|
219
|
+
let mutationCount = 0;
|
220
|
+
const MAX_MUTATIONS = 250;
|
221
|
+
const MAX_SAME_MUTATION_LIMIT = 10;
|
222
|
+
const mutationHash = {};
|
223
|
+
|
224
|
+
const observer = new MutationObserver(mutationsList => {
|
225
|
+
clearTimeout(timeout);
|
226
|
+
|
227
|
+
mutationCount += 1;
|
228
|
+
|
229
|
+
if (mutationCount > MAX_MUTATIONS) {
|
230
|
+
observer.disconnect();
|
231
|
+
resolve('Too many mutations detected');
|
232
|
+
}
|
222
233
|
|
223
|
-
|
234
|
+
// To handle scenario where DOM elements are constantly changing and unable to exit
|
235
|
+
mutationsList.forEach(mutation => {
|
236
|
+
let mutationKey: string;
|
224
237
|
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
}
|
238
|
+
if (mutation.target instanceof Element) {
|
239
|
+
Array.from(mutation.target.attributes).forEach(attr => {
|
240
|
+
mutationKey = `${mutation.target.nodeName}-${attr.name}`;
|
229
241
|
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
mutationKey = `${mutation.target.nodeName}-${attr.name}`;
|
242
|
+
if (mutationKey) {
|
243
|
+
if (!mutationHash[mutationKey]) {
|
244
|
+
mutationHash[mutationKey] = 1;
|
245
|
+
} else {
|
246
|
+
mutationHash[mutationKey] += 1;
|
247
|
+
}
|
237
248
|
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
mutationHash[mutationKey] += 1;
|
249
|
+
if (mutationHash[mutationKey] >= MAX_SAME_MUTATION_LIMIT) {
|
250
|
+
observer.disconnect();
|
251
|
+
resolve(`Repeated mutation detected for ${mutationKey}`);
|
252
|
+
}
|
243
253
|
}
|
254
|
+
});
|
255
|
+
}
|
256
|
+
});
|
244
257
|
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
}
|
250
|
-
});
|
251
|
-
}
|
258
|
+
timeout = setTimeout(() => {
|
259
|
+
observer.disconnect();
|
260
|
+
resolve('DOM stabilized after mutations.');
|
261
|
+
}, 1000);
|
252
262
|
});
|
253
263
|
|
254
264
|
timeout = setTimeout(() => {
|
255
265
|
observer.disconnect();
|
256
|
-
resolve('
|
266
|
+
resolve('No mutations detected, exit from idle state');
|
257
267
|
}, 1000);
|
258
|
-
});
|
259
|
-
|
260
|
-
timeout = setTimeout(() => {
|
261
|
-
observer.disconnect();
|
262
|
-
resolve('No mutations detected, exit from idle state');
|
263
|
-
}, 1000);
|
264
268
|
|
265
|
-
|
269
|
+
observer.observe(document, { childList: true, subtree: true, attributes: true });
|
270
|
+
});
|
266
271
|
});
|
267
|
-
})
|
272
|
+
} catch (e) {
|
273
|
+
silentLogger.warn(`Error while checking for DOM mutations: ${e}`);
|
274
|
+
}
|
268
275
|
|
269
276
|
page.on('console', msg => {
|
270
277
|
const type = msg.type();
|
@@ -350,24 +357,28 @@ export const runAxeScript = async ({
|
|
350
357
|
return !node.dataset.flagged; // fail any element with a data-flagged attribute set to true
|
351
358
|
},
|
352
359
|
},
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
360
|
+
...(enableWcagAaa
|
361
|
+
? [
|
362
|
+
{
|
363
|
+
...customAxeConfig.checks[2],
|
364
|
+
evaluate: (_node: HTMLElement) => {
|
365
|
+
if (gradingReadabilityFlag === '') {
|
366
|
+
return true; // Pass if no readability issues
|
367
|
+
}
|
368
|
+
// Dynamically update the grading messages
|
369
|
+
const gradingCheck = customAxeConfig.checks.find(
|
370
|
+
check => check.id === 'oobee-grading-text-contents',
|
371
|
+
);
|
372
|
+
if (gradingCheck) {
|
373
|
+
gradingCheck.metadata.messages.incomplete = `The text content is potentially difficult to read, with a Flesch-Kincaid Reading Ease score of ${gradingReadabilityFlag
|
374
|
+
}.\nThe target passing score is above 50, indicating content readable by university students and lower grade levels.\nA higher score reflects better readability.`;
|
375
|
+
}
|
376
|
+
|
377
|
+
// Fail if readability issues are detected
|
378
|
+
},
|
379
|
+
},
|
380
|
+
]
|
381
|
+
: []),
|
371
382
|
],
|
372
383
|
rules: customAxeConfig.rules
|
373
384
|
.filter(rule => (disableOobee ? !rule.id.startsWith('oobee') : true))
|
@@ -409,123 +420,66 @@ export const runAxeScript = async ({
|
|
409
420
|
const escapedCssSelectors =
|
410
421
|
oobeeAccessibleLabelFlaggedCssSelectors.map(escapeCSSSelector);
|
411
422
|
|
412
|
-
function
|
423
|
+
function framesCheck(cssSelector: string): {
|
424
|
+
doc: Document;
|
425
|
+
remainingSelector: string;
|
426
|
+
} {
|
413
427
|
let doc = document; // Start with the main document
|
414
|
-
let
|
428
|
+
let remainingSelector = ''; // To store the last part of the selector
|
429
|
+
let targetIframe = null;
|
415
430
|
|
416
|
-
//
|
417
|
-
|
418
|
-
if (frameMatch) {
|
419
|
-
frameSelector = frameMatch[1].replace(">", "").trim(); // Clean up the frame part
|
420
|
-
cssSelector = cssSelector.split(frameMatch[1])[1].trim(); // Remove the frame portion
|
421
|
-
}
|
431
|
+
// Split the selector into parts at "> html"
|
432
|
+
const diffParts = cssSelector.split(/\s*>\s*html\s*/);
|
422
433
|
|
423
|
-
let
|
424
|
-
|
425
|
-
// Locate the frame based on the extracted frameSelector
|
426
|
-
if (frameSelector.includes("first-of-type")) {
|
427
|
-
// Select the first frame
|
428
|
-
targetFrame = document.querySelector("frame:first-of-type");
|
429
|
-
} else if (frameSelector.includes("nth-of-type")) {
|
430
|
-
// Select the nth frame
|
431
|
-
let nthIndex = frameSelector.match(/nth-of-type\((\d+)\)/);
|
432
|
-
if (nthIndex) {
|
433
|
-
let index = parseInt(nthIndex[1]) - 1; // Zero-based index
|
434
|
-
targetFrame = document.querySelectorAll("frame")[index];
|
435
|
-
}
|
436
|
-
} else if (frameSelector.includes("#")) {
|
437
|
-
// Frame with a specific ID
|
438
|
-
let idMatch = frameSelector.match(/#([\w-]+)/);
|
439
|
-
if (idMatch) {
|
440
|
-
targetFrame = document.getElementById(idMatch[1]);
|
441
|
-
}
|
442
|
-
} else if (frameSelector.includes('[name="')) {
|
443
|
-
// Frame with a specific name attribute
|
444
|
-
let nameMatch = frameSelector.match(/name="([\w-]+)"/);
|
445
|
-
if (nameMatch) {
|
446
|
-
targetFrame = document.querySelector(`frame[name="${nameMatch[1]}"]`);
|
447
|
-
}
|
448
|
-
} else {
|
449
|
-
// Default to the first frame
|
450
|
-
targetFrame = document.querySelector("frame");
|
451
|
-
}
|
434
|
+
for (let i = 0; i < diffParts.length - 1; i++) {
|
435
|
+
let iframeSelector = `${diffParts[i].trim()}`;
|
452
436
|
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
console.warn("Frame not found or contentDocument inaccessible.");
|
458
|
-
}
|
459
|
-
|
460
|
-
return { doc, remainingSelector: cssSelector };
|
461
|
-
}
|
462
|
-
|
463
|
-
function iframeCheck(cssSelector: string): { doc: Document; remainingSelector: string } {
|
464
|
-
let doc = document; // Start with the main document
|
465
|
-
let iframeSelector = ""; // To store the iframe part of the selector
|
466
|
-
|
467
|
-
// Extract the 'iframe' part of the selector
|
468
|
-
let iframeMatch = cssSelector.match(/(iframe[^>]*>)/i);
|
469
|
-
if (iframeMatch) {
|
470
|
-
iframeSelector = iframeMatch[1].replace(">", "").trim(); // Clean up the iframe part
|
471
|
-
cssSelector = cssSelector.split(iframeMatch[1])[1].trim(); // Remove the iframe portion
|
472
|
-
}
|
437
|
+
// Add back '> html' to the current part
|
438
|
+
if (i > 0) {
|
439
|
+
iframeSelector = `html > ${iframeSelector}`;
|
440
|
+
}
|
473
441
|
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
// Select the first iframe
|
479
|
-
targetIframe = document.querySelector("iframe:first-of-type");
|
480
|
-
} else if (iframeSelector.includes("nth-of-type")) {
|
481
|
-
// Select the nth iframe
|
482
|
-
let nthIndex = iframeSelector.match(/nth-of-type\((\d+)\)/);
|
483
|
-
if (nthIndex) {
|
484
|
-
let index = parseInt(nthIndex[1]) - 1; // Zero-based index
|
485
|
-
targetIframe = document.querySelectorAll("iframe")[index];
|
442
|
+
let frameset = null;
|
443
|
+
// Find the iframe using the current document context
|
444
|
+
if (doc.querySelector('frameset')) {
|
445
|
+
frameset = doc.querySelector('frameset');
|
486
446
|
}
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
targetIframe = document.getElementById(idMatch[1]);
|
447
|
+
|
448
|
+
if (frameset) {
|
449
|
+
doc = frameset;
|
450
|
+
iframeSelector = iframeSelector.split('body >')[1].trim();
|
492
451
|
}
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
452
|
+
targetIframe = doc.querySelector(iframeSelector);
|
453
|
+
|
454
|
+
if (targetIframe && targetIframe.contentDocument) {
|
455
|
+
// Update the document to the iframe's contentDocument
|
456
|
+
doc = targetIframe.contentDocument;
|
457
|
+
} else {
|
458
|
+
console.warn(
|
459
|
+
`Iframe not found or contentDocument inaccessible for selector: ${iframeSelector}`,
|
460
|
+
);
|
461
|
+
return { doc, remainingSelector: cssSelector }; // Return original selector if iframe not found
|
498
462
|
}
|
499
|
-
} else {
|
500
|
-
// Default to the first iframe
|
501
|
-
targetIframe = document.querySelector("iframe");
|
502
463
|
}
|
503
464
|
|
504
|
-
//
|
505
|
-
|
506
|
-
doc = targetIframe.contentDocument;
|
507
|
-
} else {
|
508
|
-
console.warn("Iframe not found or contentDocument inaccessible.");
|
509
|
-
}
|
465
|
+
// The last part is the remaining CSS selector
|
466
|
+
remainingSelector = diffParts[diffParts.length - 1].trim();
|
510
467
|
|
511
|
-
|
468
|
+
// Remove any leading '>' combinators from remainingSelector
|
469
|
+
remainingSelector = `html${remainingSelector}`;
|
470
|
+
|
471
|
+
return { doc, remainingSelector };
|
512
472
|
}
|
513
473
|
|
514
474
|
function findElementByCssSelector(cssSelector: string): string | null {
|
515
475
|
let doc = document;
|
516
476
|
|
517
|
-
// Check if the selector includes 'frame' and update doc and selector
|
518
|
-
if (cssSelector.includes("frame")) {
|
519
|
-
const result = frameCheck(cssSelector);
|
520
|
-
doc = result.doc;
|
521
|
-
cssSelector = result.remainingSelector;
|
522
|
-
}
|
477
|
+
// Check if the selector includes 'frame' or 'iframe' and update doc and selector
|
523
478
|
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
cssSelector = result.remainingSelector;
|
479
|
+
if (/\s*>\s*html\s*/.test(cssSelector)) {
|
480
|
+
const inFrames = framesCheck(cssSelector);
|
481
|
+
doc = inFrames.doc;
|
482
|
+
cssSelector = inFrames.remainingSelector;
|
529
483
|
}
|
530
484
|
|
531
485
|
// Query the element in the document (including inside frames)
|
@@ -553,35 +507,42 @@ export const runAxeScript = async ({
|
|
553
507
|
}
|
554
508
|
}
|
555
509
|
|
556
|
-
|
510
|
+
if (element) {
|
511
|
+
return element.outerHTML;
|
512
|
+
}
|
513
|
+
|
514
|
+
console.warn(`Unable to find element for css selector: ${cssSelector}`);
|
515
|
+
return null;
|
557
516
|
}
|
558
517
|
|
559
518
|
// Add oobee violations to Axe's report
|
560
519
|
const oobeeAccessibleLabelViolations = {
|
561
520
|
id: 'oobee-accessible-label',
|
562
521
|
impact: 'serious' as ImpactValue,
|
563
|
-
tags: ['wcag2a', 'wcag211', '
|
522
|
+
tags: ['wcag2a', 'wcag211', 'wcag412'],
|
564
523
|
description: 'Ensures clickable elements have an accessible label.',
|
565
524
|
help: 'Clickable elements (i.e. elements with mouse-click interaction) must have accessible labels.',
|
566
525
|
helpUrl: 'https://www.deque.com/blog/accessible-aria-buttons',
|
567
|
-
nodes: escapedCssSelectors
|
568
|
-
|
569
|
-
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
|
574
|
-
|
575
|
-
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
526
|
+
nodes: escapedCssSelectors
|
527
|
+
.map(cssSelector => ({
|
528
|
+
html: findElementByCssSelector(cssSelector),
|
529
|
+
target: [cssSelector],
|
530
|
+
impact: 'serious' as ImpactValue,
|
531
|
+
failureSummary:
|
532
|
+
'Fix any of the following:\n The clickable element does not have an accessible label.',
|
533
|
+
any: [
|
534
|
+
{
|
535
|
+
id: 'oobee-accessible-label',
|
536
|
+
data: null,
|
537
|
+
relatedNodes: [],
|
538
|
+
impact: 'serious',
|
539
|
+
message: 'The clickable element does not have an accessible label.',
|
540
|
+
},
|
541
|
+
],
|
542
|
+
all: [],
|
543
|
+
none: [],
|
544
|
+
}))
|
545
|
+
.filter(item => item.html),
|
585
546
|
};
|
586
547
|
|
587
548
|
results.violations = [...results.violations, oobeeAccessibleLabelViolations];
|
@@ -612,7 +573,18 @@ export const runAxeScript = async ({
|
|
612
573
|
results.incomplete = await takeScreenshotForHTMLElements(results.incomplete, page, randomToken);
|
613
574
|
}
|
614
575
|
|
615
|
-
|
576
|
+
let pageTitle = null;
|
577
|
+
try {
|
578
|
+
pageTitle = await page.evaluate(() => document.title);
|
579
|
+
} catch (e) {
|
580
|
+
silentLogger.warn(`Error while getting page title: ${e}`);
|
581
|
+
if (page.isClosed()) {
|
582
|
+
silentLogger.info(`Page was closed for ${requestUrl}, creating new page`);
|
583
|
+
page = await browserContext.newPage();
|
584
|
+
await page.goto(requestUrl, { waitUntil: 'domcontentloaded' });
|
585
|
+
pageTitle = await page.evaluate(() => document.title);
|
586
|
+
}
|
587
|
+
}
|
616
588
|
|
617
589
|
return filterAxeResults(results, pageTitle, customFlowDetails);
|
618
590
|
};
|
@@ -653,4 +625,4 @@ export const isUrlPdf = (url: string) => {
|
|
653
625
|
}
|
654
626
|
const parsedUrl = new URL(url);
|
655
627
|
return /\.pdf($|\?|#)/i.test(parsedUrl.pathname) || /\.pdf($|\?|#)/i.test(parsedUrl.href);
|
656
|
-
};
|
628
|
+
};
|
@@ -40,8 +40,7 @@ import {
|
|
40
40
|
import { silentLogger, guiInfoLog } from '../logs.js';
|
41
41
|
import { ViewportSettingsClass } from '../combine.js';
|
42
42
|
|
43
|
-
const isBlacklisted = (url: string) => {
|
44
|
-
const blacklistedPatterns = getBlackListedPatterns(null);
|
43
|
+
const isBlacklisted = (url: string, blacklistedPatterns: string[]) => {
|
45
44
|
if (!blacklistedPatterns) {
|
46
45
|
return false;
|
47
46
|
}
|
@@ -122,7 +121,7 @@ const crawlDomain = async ({
|
|
122
121
|
const isScanPdfs = ['all', 'pdf-only'].includes(fileTypes);
|
123
122
|
const { maxConcurrency } = constants;
|
124
123
|
const { playwrightDeviceDetailsObject } = viewportSettings;
|
125
|
-
const isBlacklistedUrl = isBlacklisted(url);
|
124
|
+
const isBlacklistedUrl = isBlacklisted(url, blacklistedPatterns);
|
126
125
|
|
127
126
|
const httpsAgent = new https.Agent({ rejectUnauthorized: false });
|
128
127
|
|
@@ -315,7 +314,7 @@ const crawlDomain = async ({
|
|
315
314
|
|
316
315
|
const isExcluded = (newPageUrl: string): boolean => {
|
317
316
|
const isAlreadyScanned: boolean = urlsCrawled.scanned.some(item => item.url === newPageUrl);
|
318
|
-
const isBlacklistedUrl: boolean = isBlacklisted(newPageUrl);
|
317
|
+
const isBlacklistedUrl: boolean = isBlacklisted(newPageUrl, blacklistedPatterns);
|
319
318
|
const isNotFollowStrategy: boolean = !isFollowStrategy(newPageUrl, initialPageUrl, strategy);
|
320
319
|
return isAlreadyScanned || isBlacklistedUrl || isNotFollowStrategy;
|
321
320
|
};
|
@@ -469,7 +468,7 @@ const crawlDomain = async ({
|
|
469
468
|
launcher: constants.launcher,
|
470
469
|
launchOptions: getPlaywrightLaunchOptions(browser),
|
471
470
|
// Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
|
472
|
-
userDataDir,
|
471
|
+
...(process.env.CRAWLEE_HEADLESS === '0' && { userDataDir }),
|
473
472
|
},
|
474
473
|
retryOnBlocked: true,
|
475
474
|
browserPoolOptions: {
|
@@ -496,7 +495,7 @@ const crawlDomain = async ({
|
|
496
495
|
return new Promise(resolve => {
|
497
496
|
let timeout;
|
498
497
|
let mutationCount = 0;
|
499
|
-
const MAX_MUTATIONS =
|
498
|
+
const MAX_MUTATIONS = 250;
|
500
499
|
const MAX_SAME_MUTATION_LIMIT = 10;
|
501
500
|
const mutationHash = {};
|
502
501
|
|
@@ -568,31 +567,31 @@ const crawlDomain = async ({
|
|
568
567
|
],
|
569
568
|
preNavigationHooks: isBasicAuth
|
570
569
|
? [
|
571
|
-
|
572
|
-
|
573
|
-
|
574
|
-
|
575
|
-
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
|
582
|
-
|
570
|
+
async ({ page, request }) => {
|
571
|
+
await page.setExtraHTTPHeaders({
|
572
|
+
Authorization: authHeader,
|
573
|
+
...extraHTTPHeaders,
|
574
|
+
});
|
575
|
+
const processible = await isProcessibleUrl(request.url);
|
576
|
+
if (!processible) {
|
577
|
+
request.skipNavigation = true;
|
578
|
+
return null;
|
579
|
+
}
|
580
|
+
},
|
581
|
+
]
|
583
582
|
: [
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
-
|
583
|
+
async ({ page, request }) => {
|
584
|
+
await page.setExtraHTTPHeaders({
|
585
|
+
...extraHTTPHeaders,
|
586
|
+
});
|
588
587
|
|
589
|
-
|
590
|
-
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
588
|
+
const processible = await isProcessibleUrl(request.url);
|
589
|
+
if (!processible) {
|
590
|
+
request.skipNavigation = true;
|
591
|
+
return null;
|
592
|
+
}
|
593
|
+
},
|
594
|
+
],
|
596
595
|
requestHandlerTimeoutSecs: 90, // Allow each page to be processed by up from default 60 seconds
|
597
596
|
requestHandler: async ({ page, request, response, crawler, sendRequest, enqueueLinks }) => {
|
598
597
|
const browserContext: BrowserContext = page.context();
|
@@ -615,7 +614,7 @@ const crawlDomain = async ({
|
|
615
614
|
actualUrl = page.url();
|
616
615
|
}
|
617
616
|
|
618
|
-
if (isBlacklisted(actualUrl) || (isUrlPdf(actualUrl) && !isScanPdfs)) {
|
617
|
+
if (isBlacklisted(actualUrl, blacklistedPatterns) || (isUrlPdf(actualUrl) && !isScanPdfs)) {
|
619
618
|
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
620
619
|
numScanned: urlsCrawled.scanned.length,
|
621
620
|
urlScanned: actualUrl,
|
@@ -50,7 +50,13 @@ const crawlIntelligentSitemap = async (
|
|
50
50
|
const homeUrl = getHomeUrl(link);
|
51
51
|
let sitemapLinkFound = false;
|
52
52
|
let sitemapLink = '';
|
53
|
-
const chromiumBrowser = await chromium.launch(
|
53
|
+
const chromiumBrowser = await chromium.launch(
|
54
|
+
{
|
55
|
+
headless: false,
|
56
|
+
channel: 'chrome',
|
57
|
+
args: ['--headless=new', '--no-sandbox']
|
58
|
+
});
|
59
|
+
|
54
60
|
const page = await chromiumBrowser.newPage();
|
55
61
|
for (const path of sitemapPaths) {
|
56
62
|
sitemapLink = homeUrl + path;
|
@@ -143,7 +143,7 @@ const crawlLocalFile = async (
|
|
143
143
|
|
144
144
|
if (!isUrlPdf(request.url)) {
|
145
145
|
const browserContext = await constants.launcher.launchPersistentContext('', {
|
146
|
-
headless:
|
146
|
+
headless: false,
|
147
147
|
...getPlaywrightLaunchOptions(browser),
|
148
148
|
...playwrightDeviceDetailsObject,
|
149
149
|
});
|
@@ -144,7 +144,7 @@ const crawlSitemap = async (
|
|
144
144
|
launcher: constants.launcher,
|
145
145
|
launchOptions: getPlaywrightLaunchOptions(browser),
|
146
146
|
// Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
|
147
|
-
userDataDir,
|
147
|
+
...(process.env.CRAWLEE_HEADLESS === '0' && { userDataDir }),
|
148
148
|
},
|
149
149
|
retryOnBlocked: true,
|
150
150
|
browserPoolOptions: {
|