@govtechsg/oobee 0.10.36 → 0.10.42
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/docker-test.yml +1 -1
- package/DETAILS.md +3 -3
- package/INTEGRATION.md +142 -53
- package/README.md +17 -0
- package/REPORTS.md +362 -0
- package/exclusions.txt +4 -1
- package/package.json +2 -2
- package/src/constants/cliFunctions.ts +0 -7
- package/src/constants/common.ts +39 -1
- package/src/constants/constants.ts +9 -8
- package/src/crawlers/commonCrawlerFunc.ts +95 -220
- package/src/crawlers/crawlDomain.ts +10 -23
- package/src/crawlers/crawlLocalFile.ts +2 -0
- package/src/crawlers/crawlSitemap.ts +6 -4
- package/src/crawlers/custom/escapeCssSelector.ts +10 -0
- package/src/crawlers/custom/evaluateAltText.ts +13 -0
- package/src/crawlers/custom/extractAndGradeText.ts +0 -2
- package/src/crawlers/custom/extractText.ts +28 -0
- package/src/crawlers/custom/findElementByCssSelector.ts +46 -0
- package/src/crawlers/custom/flagUnlabelledClickableElements.ts +982 -842
- package/src/crawlers/custom/framesCheck.ts +51 -0
- package/src/crawlers/custom/getAxeConfiguration.ts +126 -0
- package/src/crawlers/custom/gradeReadability.ts +30 -0
- package/src/crawlers/custom/xPathToCss.ts +178 -0
- package/src/crawlers/pdfScanFunc.ts +67 -26
- package/src/mergeAxeResults.ts +535 -132
- package/src/npmIndex.ts +130 -62
- package/src/screenshotFunc/htmlScreenshotFunc.ts +1 -1
- package/src/screenshotFunc/pdfScreenshotFunc.ts +34 -1
- package/src/static/ejs/partials/components/ruleOffcanvas.ejs +1 -1
- package/src/static/ejs/partials/components/scanAbout.ejs +1 -1
- package/src/static/ejs/partials/footer.ejs +3 -3
- package/src/static/ejs/partials/scripts/reportSearch.ejs +112 -74
- package/src/static/ejs/partials/scripts/ruleOffcanvas.ejs +2 -2
- package/src/static/ejs/partials/summaryMain.ejs +3 -3
- package/src/static/ejs/report.ejs +3 -3
- package/src/utils.ts +289 -13
- package/src/xPathToCssCypress.ts +178 -0
- package/src/crawlers/customAxeFunctions.ts +0 -82
@@ -1,7 +1,6 @@
|
|
1
1
|
import crawlee, { CrawlingContext, PlaywrightGotoOptions } from 'crawlee';
|
2
2
|
import axe, { AxeResults, ImpactValue, NodeResult, Result, resultGroups, TagValue } from 'axe-core';
|
3
3
|
import { BrowserContext, Page } from 'playwright';
|
4
|
-
import { xPathToCss } from '../xPathToCss.js';
|
5
4
|
import {
|
6
5
|
axeScript,
|
7
6
|
guiInfoStatusTypes,
|
@@ -11,10 +10,15 @@ import {
|
|
11
10
|
import { guiInfoLog, silentLogger } from '../logs.js';
|
12
11
|
import { takeScreenshotForHTMLElements } from '../screenshotFunc/htmlScreenshotFunc.js';
|
13
12
|
import { isFilePath } from '../constants/common.js';
|
14
|
-
import { customAxeConfig } from './customAxeFunctions.js';
|
15
|
-
import { flagUnlabelledClickableElements } from './custom/flagUnlabelledClickableElements.js';
|
16
13
|
import { extractAndGradeText } from './custom/extractAndGradeText.js';
|
17
14
|
import { ItemsInfo } from '../mergeAxeResults.js';
|
15
|
+
import { evaluateAltText } from './custom/evaluateAltText.js';
|
16
|
+
import { escapeCssSelector } from './custom/escapeCssSelector.js';
|
17
|
+
import { framesCheck } from './custom/framesCheck.js';
|
18
|
+
import { findElementByCssSelector } from './custom/findElementByCssSelector.js';
|
19
|
+
import { getAxeConfiguration } from './custom/getAxeConfiguration.js';
|
20
|
+
import { flagUnlabelledClickableElements } from './custom/flagUnlabelledClickableElements.js';
|
21
|
+
import { xPathToCss } from './custom/xPathToCss.js';
|
18
22
|
|
19
23
|
// types
|
20
24
|
interface AxeResultsWithScreenshot extends AxeResults {
|
@@ -65,6 +69,30 @@ type FilteredResults = {
|
|
65
69
|
actualUrl?: string;
|
66
70
|
};
|
67
71
|
|
72
|
+
/**
 * Shortens an HTML snippet so that its UTF-8 encoding, suffix included, fits in `maxBytes`.
 *
 * @param html - The markup to shorten.
 * @param maxBytes - Upper bound, in UTF-8 bytes, for the returned string (default 1024).
 * @param suffix - Marker appended whenever the input had to be cut (default an ellipsis).
 * @returns `html` unchanged when it already fits; otherwise the longest prefix of `html`
 *          that, followed by `suffix`, fits in `maxBytes`. Returns an empty string when
 *          not even `suffix` on its own fits.
 */
const truncateHtml = (html: string, maxBytes = 1024, suffix = '…'): string => {
  const encoder = new TextEncoder();
  const utf8Length = (text: string): number => encoder.encode(text).length;

  if (utf8Length(html) <= maxBytes) return html;

  // Binary search over UTF-16 code-unit indices for the longest prefix that still fits.
  let left = 0;
  let right = html.length;
  let cut = -1; // -1 means "nothing fits, not even the bare suffix"

  while (left <= right) {
    const mid = Math.floor((left + right) / 2);

    if (utf8Length(html.slice(0, mid) + suffix) <= maxBytes) {
      cut = mid;
      left = mid + 1;
    } else {
      right = mid - 1;
    }
  }

  if (cut < 0) return '';

  // The search works on code units, so it may stop between the two halves of a surrogate
  // pair (TextEncoder counts the dangling half as a 3-byte U+FFFD, which can still "fit").
  // Step back one unit so the output never ends in a lone high surrogate.
  if (cut > 0 && cut < html.length) {
    const before = html.charCodeAt(cut - 1);
    const after = html.charCodeAt(cut);
    const splitsPair =
      before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
    if (splitsPair) cut -= 1;
  }

  return html.slice(0, cut) + suffix;
};
|
95
|
+
|
68
96
|
export const filterAxeResults = (
|
69
97
|
results: AxeResultsWithScreenshot,
|
70
98
|
pageTitle: string,
|
@@ -86,17 +114,17 @@ export const filterAxeResults = (
|
|
86
114
|
const conformance = tags.filter(tag => tag.startsWith('wcag') || tag === 'best-practice');
|
87
115
|
|
88
116
|
// handle rare cases where conformance level is not the first element
|
89
|
-
const
|
90
|
-
|
117
|
+
const wcagRegex = /^wcag\d+a+$/;
|
118
|
+
|
119
|
+
if (conformance[0] !== 'best-practice' && !wcagRegex.test(conformance[0])) {
|
91
120
|
conformance.sort((a, b) => {
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
return 0;
|
121
|
+
if (wcagRegex.test(a) && !wcagRegex.test(b)) {
|
122
|
+
return -1;
|
123
|
+
}
|
124
|
+
if (!wcagRegex.test(a) && wcagRegex.test(b)) {
|
125
|
+
return 1;
|
126
|
+
}
|
127
|
+
return 0;
|
100
128
|
});
|
101
129
|
}
|
102
130
|
|
@@ -120,6 +148,7 @@ export const filterAxeResults = (
|
|
120
148
|
if (html.includes('</script>')) {
|
121
149
|
finalHtml = html.replaceAll('</script>', '</script>');
|
122
150
|
}
|
151
|
+
finalHtml = truncateHtml(finalHtml);
|
123
152
|
|
124
153
|
const xpath = target.length === 1 && typeof target[0] === 'string' ? target[0] : null;
|
125
154
|
|
@@ -138,15 +167,13 @@ export const filterAxeResults = (
|
|
138
167
|
|
139
168
|
nodes.forEach(node => {
|
140
169
|
const { impact } = node;
|
141
|
-
const
|
142
|
-
const
|
143
|
-
const
|
170
|
+
const hasWcagA = conformance.some(tag => /^wcag\d*a$/.test(tag));
|
171
|
+
const hasWcagAA = conformance.some(tag => /^wcag\d*aa$/.test(tag));
|
172
|
+
// const hasWcagAAA = conformance.some(tag => /^wcag\d*aaa$/.test(tag));
|
144
173
|
|
145
174
|
if (displayNeedsReview) {
|
146
175
|
addTo(needsReview, node);
|
147
|
-
} else if (
|
148
|
-
addTo(goodToFix, node);
|
149
|
-
} else if (hasWcag2a || hasWcag2aa) {
|
176
|
+
} else if (hasWcagA || hasWcagAA) {
|
150
177
|
addTo(mustFix, node);
|
151
178
|
} else {
|
152
179
|
addTo(goodToFix, node);
|
@@ -176,7 +203,10 @@ export const filterAxeResults = (
|
|
176
203
|
items: [],
|
177
204
|
};
|
178
205
|
}
|
179
|
-
|
206
|
+
|
207
|
+
const finalHtml = truncateHtml(html);
|
208
|
+
passed.rules[rule].items.push({ html: finalHtml, screenshotPath: '', message: '', xpath: '' });
|
209
|
+
|
180
210
|
passed.totalItems += 1;
|
181
211
|
passed.rules[rule].totalItems += 1;
|
182
212
|
totalItems += 1;
|
@@ -289,21 +319,6 @@ export const runAxeScript = async ({
|
|
289
319
|
});
|
290
320
|
|
291
321
|
const disableOobee = ruleset.includes(RuleFlags.DISABLE_OOBEE);
|
292
|
-
const oobeeAccessibleLabelFlaggedXpaths = disableOobee
|
293
|
-
? []
|
294
|
-
: (await flagUnlabelledClickableElements(page)).map(item => item.xpath);
|
295
|
-
const oobeeAccessibleLabelFlaggedCssSelectors = oobeeAccessibleLabelFlaggedXpaths
|
296
|
-
.map(xpath => {
|
297
|
-
try {
|
298
|
-
const cssSelector = xPathToCss(xpath);
|
299
|
-
return cssSelector;
|
300
|
-
} catch (e) {
|
301
|
-
console.error('Error converting XPath to CSS: ', xpath, e);
|
302
|
-
return '';
|
303
|
-
}
|
304
|
-
})
|
305
|
-
.filter(item => item !== '');
|
306
|
-
|
307
322
|
const enableWcagAaa = ruleset.includes(RuleFlags.ENABLE_WCAG_AAA);
|
308
323
|
|
309
324
|
const gradingReadabilityFlag = await extractAndGradeText(page); // Ensure flag is obtained before proceeding
|
@@ -314,103 +329,52 @@ export const runAxeScript = async ({
|
|
314
329
|
async ({
|
315
330
|
selectors,
|
316
331
|
saflyIconSelector,
|
317
|
-
customAxeConfig,
|
318
332
|
disableOobee,
|
319
333
|
enableWcagAaa,
|
320
|
-
oobeeAccessibleLabelFlaggedCssSelectors,
|
321
334
|
gradingReadabilityFlag,
|
335
|
+
evaluateAltTextFunctionString,
|
336
|
+
escapeCssSelectorFunctionString,
|
337
|
+
framesCheckFunctionString,
|
338
|
+
findElementByCssSelectorFunctionString,
|
339
|
+
getAxeConfigurationFunctionString,
|
340
|
+
flagUnlabelledClickableElementsFunctionString,
|
341
|
+
xPathToCssFunctionString,
|
322
342
|
}) => {
|
323
343
|
try {
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
}
|
333
|
-
}
|
334
|
-
return true;
|
335
|
-
};
|
336
|
-
|
337
|
-
// for css id selectors starting with a digit, escape it with the unicode character e.g. #123 -> #\31 23
|
338
|
-
const escapeCSSSelector = (selector: string) => {
|
339
|
-
try {
|
340
|
-
return selector.replace(
|
341
|
-
/([#\.])(\d)/g,
|
342
|
-
(_match, prefix, digit) => `${prefix}\\3${digit} `,
|
343
|
-
);
|
344
|
-
} catch (e) {
|
345
|
-
console.error(`error escaping css selector: ${selector}`, e);
|
346
|
-
return selector;
|
347
|
-
}
|
348
|
-
};
|
349
|
-
|
344
|
+
// Load functions into the browser context
|
345
|
+
eval(evaluateAltTextFunctionString);
|
346
|
+
eval(escapeCssSelectorFunctionString);
|
347
|
+
eval(framesCheckFunctionString);
|
348
|
+
eval(findElementByCssSelectorFunctionString);
|
349
|
+
eval(flagUnlabelledClickableElementsFunctionString);
|
350
|
+
eval(xPathToCssFunctionString);
|
351
|
+
eval(getAxeConfigurationFunctionString);
|
350
352
|
// remove so that axe does not scan
|
351
353
|
document.querySelector(saflyIconSelector)?.remove();
|
352
354
|
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
return true; // Pass if no readability issues
|
373
|
-
}
|
374
|
-
// Dynamically update the grading messages
|
375
|
-
const gradingCheck = customAxeConfig.checks.find(
|
376
|
-
check => check.id === 'oobee-grading-text-contents',
|
377
|
-
);
|
378
|
-
if (gradingCheck) {
|
379
|
-
gradingCheck.metadata.messages.incomplete = `The text content is potentially difficult to read, with a Flesch-Kincaid Reading Ease score of ${gradingReadabilityFlag
|
380
|
-
}.\nThe target passing score is above 50, indicating content readable by university students and lower grade levels.\nA higher score reflects better readability.`;
|
381
|
-
}
|
382
|
-
|
383
|
-
// Fail if readability issues are detected
|
384
|
-
},
|
385
|
-
},
|
386
|
-
]
|
387
|
-
: []),
|
388
|
-
],
|
389
|
-
rules: customAxeConfig.rules
|
390
|
-
.filter(rule => (disableOobee ? !rule.id.startsWith('oobee') : true))
|
391
|
-
.concat(
|
392
|
-
enableWcagAaa
|
393
|
-
? [
|
394
|
-
{
|
395
|
-
id: 'color-contrast-enhanced',
|
396
|
-
enabled: true,
|
397
|
-
tags: ['wcag2aaa', 'wcag146'],
|
398
|
-
},
|
399
|
-
{
|
400
|
-
id: 'identical-links-same-purpose',
|
401
|
-
enabled: true,
|
402
|
-
tags: ['wcag2aaa', 'wcag249'],
|
403
|
-
},
|
404
|
-
{
|
405
|
-
id: 'meta-refresh-no-exceptions',
|
406
|
-
enabled: true,
|
407
|
-
tags: ['wcag2aaa', 'wcag224', 'wcag325'],
|
408
|
-
},
|
409
|
-
]
|
410
|
-
: [],
|
411
|
-
),
|
355
|
+
const oobeeAccessibleLabelFlaggedXpaths = disableOobee
|
356
|
+
? []
|
357
|
+
: (await flagUnlabelledClickableElements()).map(item => item.xpath);
|
358
|
+
const oobeeAccessibleLabelFlaggedCssSelectors = oobeeAccessibleLabelFlaggedXpaths
|
359
|
+
.map(xpath => {
|
360
|
+
try {
|
361
|
+
const cssSelector = xPathToCss(xpath);
|
362
|
+
return cssSelector;
|
363
|
+
} catch (e) {
|
364
|
+
console.error('Error converting XPath to CSS: ', xpath, e);
|
365
|
+
return '';
|
366
|
+
}
|
367
|
+
})
|
368
|
+
.filter(item => item !== '');
|
369
|
+
|
370
|
+
const axeConfig = getAxeConfiguration({
|
371
|
+
enableWcagAaa,
|
372
|
+
gradingReadabilityFlag,
|
373
|
+
disableOobee,
|
412
374
|
});
|
413
375
|
|
376
|
+
axe.configure(axeConfig);
|
377
|
+
|
414
378
|
// removed needsReview condition
|
415
379
|
const defaultResultTypes: resultGroups[] = ['violations', 'passes', 'incomplete'];
|
416
380
|
|
@@ -424,102 +388,7 @@ export const runAxeScript = async ({
|
|
424
388
|
}
|
425
389
|
// handle css id selectors that start with a digit
|
426
390
|
const escapedCssSelectors =
|
427
|
-
oobeeAccessibleLabelFlaggedCssSelectors.map(
|
428
|
-
|
429
|
-
function framesCheck(cssSelector: string): {
|
430
|
-
doc: Document;
|
431
|
-
remainingSelector: string;
|
432
|
-
} {
|
433
|
-
let doc = document; // Start with the main document
|
434
|
-
let remainingSelector = ''; // To store the last part of the selector
|
435
|
-
let targetIframe = null;
|
436
|
-
|
437
|
-
// Split the selector into parts at "> html"
|
438
|
-
const diffParts = cssSelector.split(/\s*>\s*html\s*/);
|
439
|
-
|
440
|
-
for (let i = 0; i < diffParts.length - 1; i++) {
|
441
|
-
let iframeSelector = `${diffParts[i].trim()}`;
|
442
|
-
|
443
|
-
// Add back '> html' to the current part
|
444
|
-
if (i > 0) {
|
445
|
-
iframeSelector = `html > ${iframeSelector}`;
|
446
|
-
}
|
447
|
-
|
448
|
-
let frameset = null;
|
449
|
-
// Find the iframe using the current document context
|
450
|
-
if (doc.querySelector('frameset')) {
|
451
|
-
frameset = doc.querySelector('frameset');
|
452
|
-
}
|
453
|
-
|
454
|
-
if (frameset) {
|
455
|
-
doc = frameset;
|
456
|
-
iframeSelector = iframeSelector.split('body >')[1].trim();
|
457
|
-
}
|
458
|
-
targetIframe = doc.querySelector(iframeSelector);
|
459
|
-
|
460
|
-
if (targetIframe && targetIframe.contentDocument) {
|
461
|
-
// Update the document to the iframe's contentDocument
|
462
|
-
doc = targetIframe.contentDocument;
|
463
|
-
} else {
|
464
|
-
console.warn(
|
465
|
-
`Iframe not found or contentDocument inaccessible for selector: ${iframeSelector}`,
|
466
|
-
);
|
467
|
-
return { doc, remainingSelector: cssSelector }; // Return original selector if iframe not found
|
468
|
-
}
|
469
|
-
}
|
470
|
-
|
471
|
-
// The last part is the remaining CSS selector
|
472
|
-
remainingSelector = diffParts[diffParts.length - 1].trim();
|
473
|
-
|
474
|
-
// Remove any leading '>' combinators from remainingSelector
|
475
|
-
remainingSelector = `html${remainingSelector}`;
|
476
|
-
|
477
|
-
return { doc, remainingSelector };
|
478
|
-
}
|
479
|
-
|
480
|
-
function findElementByCssSelector(cssSelector: string): string | null {
|
481
|
-
let doc = document;
|
482
|
-
|
483
|
-
// Check if the selector includes 'frame' or 'iframe' and update doc and selector
|
484
|
-
|
485
|
-
if (/\s*>\s*html\s*/.test(cssSelector)) {
|
486
|
-
const inFrames = framesCheck(cssSelector);
|
487
|
-
doc = inFrames.doc;
|
488
|
-
cssSelector = inFrames.remainingSelector;
|
489
|
-
}
|
490
|
-
|
491
|
-
// Query the element in the document (including inside frames)
|
492
|
-
let element = doc.querySelector(cssSelector);
|
493
|
-
|
494
|
-
// Handle Shadow DOM if the element is not found
|
495
|
-
if (!element) {
|
496
|
-
const shadowRoots = [];
|
497
|
-
const allElements = document.querySelectorAll('*');
|
498
|
-
|
499
|
-
// Look for elements with shadow roots
|
500
|
-
allElements.forEach(el => {
|
501
|
-
if (el.shadowRoot) {
|
502
|
-
shadowRoots.push(el.shadowRoot);
|
503
|
-
}
|
504
|
-
});
|
505
|
-
|
506
|
-
// Search inside each shadow root for the element
|
507
|
-
for (const shadowRoot of shadowRoots) {
|
508
|
-
const shadowElement = shadowRoot.querySelector(cssSelector);
|
509
|
-
if (shadowElement) {
|
510
|
-
element = shadowElement; // Found the element inside shadow DOM
|
511
|
-
break;
|
512
|
-
}
|
513
|
-
}
|
514
|
-
}
|
515
|
-
|
516
|
-
if (element) {
|
517
|
-
return element.outerHTML;
|
518
|
-
}
|
519
|
-
|
520
|
-
console.warn(`Unable to find element for css selector: ${cssSelector}`);
|
521
|
-
return null;
|
522
|
-
}
|
391
|
+
oobeeAccessibleLabelFlaggedCssSelectors.map(escapeCssSelector);
|
523
392
|
|
524
393
|
// Add oobee violations to Axe's report
|
525
394
|
const oobeeAccessibleLabelViolations = {
|
@@ -566,11 +435,17 @@ export const runAxeScript = async ({
|
|
566
435
|
{
|
567
436
|
selectors,
|
568
437
|
saflyIconSelector,
|
569
|
-
customAxeConfig,
|
570
438
|
disableOobee,
|
571
439
|
enableWcagAaa,
|
572
|
-
oobeeAccessibleLabelFlaggedCssSelectors,
|
573
440
|
gradingReadabilityFlag,
|
441
|
+
evaluateAltTextFunctionString: evaluateAltText.toString(),
|
442
|
+
escapeCssSelectorFunctionString: escapeCssSelector.toString(),
|
443
|
+
framesCheckFunctionString: framesCheck.toString(),
|
444
|
+
findElementByCssSelectorFunctionString: findElementByCssSelector.toString(),
|
445
|
+
getAxeConfigurationFunctionString: getAxeConfiguration.toString(),
|
446
|
+
flagUnlabelledClickableElementsFunctionString:
|
447
|
+
flagUnlabelledClickableElements.toString(),
|
448
|
+
xPathToCssFunctionString: xPathToCss.toString(),
|
574
449
|
},
|
575
450
|
);
|
576
451
|
|
@@ -29,6 +29,7 @@ import {
|
|
29
29
|
getBlackListedPatterns,
|
30
30
|
urlWithoutAuth,
|
31
31
|
waitForPageLoaded,
|
32
|
+
initModifiedUserAgent,
|
32
33
|
} from '../constants/common.js';
|
33
34
|
import { areLinksEqual, isFollowStrategy } from '../utils.js';
|
34
35
|
import {
|
@@ -455,6 +456,8 @@ const crawlDomain = async ({
|
|
455
456
|
userDataDir = process.env.CRAWLEE_HEADLESS !== '0' ? userDataDirectory : '';
|
456
457
|
}
|
457
458
|
|
459
|
+
await initModifiedUserAgent(browser, playwrightDeviceDetailsObject);
|
460
|
+
|
458
461
|
const crawler = new crawlee.PlaywrightCrawler({
|
459
462
|
launchContext: {
|
460
463
|
launcher: constants.launcher,
|
@@ -632,7 +635,7 @@ const crawlDomain = async ({
|
|
632
635
|
}
|
633
636
|
|
634
637
|
// handle pdfs
|
635
|
-
if (request.skipNavigation &&
|
638
|
+
if (request.skipNavigation && actualUrl === "about:blank") {
|
636
639
|
if (!isScanPdfs) {
|
637
640
|
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
638
641
|
numScanned: urlsCrawled.scanned.length,
|
@@ -658,24 +661,6 @@ const crawlDomain = async ({
|
|
658
661
|
return;
|
659
662
|
}
|
660
663
|
|
661
|
-
const resHeaders = response ? response.headers() : {}; // Safely access response headers
|
662
|
-
const contentType = resHeaders['content-type'] || ''; // Ensure contentType is defined
|
663
|
-
|
664
|
-
// Skip non-HTML and non-PDF URLs
|
665
|
-
if (!contentType.includes('text/html') && !contentType.includes('application/pdf')) {
|
666
|
-
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
667
|
-
numScanned: urlsCrawled.scanned.length,
|
668
|
-
urlScanned: request.url,
|
669
|
-
});
|
670
|
-
urlsCrawled.blacklisted.push({
|
671
|
-
url: request.url,
|
672
|
-
pageTitle: request.url,
|
673
|
-
actualUrl: actualUrl, // i.e. actualUrl
|
674
|
-
});
|
675
|
-
|
676
|
-
return;
|
677
|
-
}
|
678
|
-
|
679
664
|
if (isBlacklistedFileExtensions(actualUrl, blackListedFileExtensions)) {
|
680
665
|
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
681
666
|
numScanned: urlsCrawled.scanned.length,
|
@@ -701,7 +686,7 @@ const crawlDomain = async ({
|
|
701
686
|
return;
|
702
687
|
}
|
703
688
|
|
704
|
-
if (response.status() === 403) {
|
689
|
+
if (response && response.status() === 403) {
|
705
690
|
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
706
691
|
numScanned: urlsCrawled.scanned.length,
|
707
692
|
urlScanned: request.url,
|
@@ -715,7 +700,8 @@ const crawlDomain = async ({
|
|
715
700
|
return;
|
716
701
|
}
|
717
702
|
|
718
|
-
if (response.status() !== 200) {
|
703
|
+
if (response && response.status() !== 200) {
|
704
|
+
|
719
705
|
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
720
706
|
numScanned: urlsCrawled.scanned.length,
|
721
707
|
urlScanned: request.url,
|
@@ -847,7 +833,7 @@ const crawlDomain = async ({
|
|
847
833
|
// when max pages have been scanned, scan will abort and all relevant pages still opened will close instantly.
|
848
834
|
// a browser close error will then be flagged. Since this is an intended behaviour, this error will be excluded.
|
849
835
|
if (!isAbortingScanNow) {
|
850
|
-
|
836
|
+
urlsCrawled.error.push({ url: request.url, pageTitle: request.url, actualUrl: request.url });
|
851
837
|
}
|
852
838
|
}
|
853
839
|
},
|
@@ -856,7 +842,8 @@ const crawlDomain = async ({
|
|
856
842
|
numScanned: urlsCrawled.scanned.length,
|
857
843
|
urlScanned: request.url,
|
858
844
|
});
|
859
|
-
urlsCrawled.error.push({ url: request.url });
|
845
|
+
urlsCrawled.error.push({ url: request.url, pageTitle: request.url, actualUrl: request.url });
|
846
|
+
|
860
847
|
crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
|
861
848
|
},
|
862
849
|
maxRequestsPerCrawl: Infinity,
|
@@ -10,6 +10,7 @@ import {
|
|
10
10
|
isFilePath,
|
11
11
|
convertLocalFileToPath,
|
12
12
|
convertPathToLocalFile,
|
13
|
+
initModifiedUserAgent,
|
13
14
|
} from '../constants/common.js';
|
14
15
|
import { runPdfScan, mapPdfScanResults, doPdfScreenshots } from './pdfScanFunc.js';
|
15
16
|
import { guiInfoLog } from '../logs.js';
|
@@ -142,6 +143,7 @@ const crawlLocalFile = async (
|
|
142
143
|
uuidToPdfMapping[pdfFileName] = trimmedUrl;
|
143
144
|
|
144
145
|
if (!isUrlPdf(request.url)) {
|
146
|
+
await initModifiedUserAgent(browser);
|
145
147
|
const browserContext = await constants.launcher.launchPersistentContext('', {
|
146
148
|
headless: false,
|
147
149
|
...getPlaywrightLaunchOptions(browser),
|
@@ -17,6 +17,7 @@ import {
|
|
17
17
|
urlWithoutAuth,
|
18
18
|
waitForPageLoaded,
|
19
19
|
isFilePath,
|
20
|
+
initModifiedUserAgent,
|
20
21
|
} from '../constants/common.js';
|
21
22
|
import { areLinksEqual, isWhitelistedContentType, isFollowStrategy } from '../utils.js';
|
22
23
|
import { handlePdfDownload, runPdfScan, mapPdfScanResults } from './pdfScanFunc.js';
|
@@ -139,6 +140,7 @@ const crawlSitemap = async (
|
|
139
140
|
userDataDir = process.env.CRAWLEE_HEADLESS !== '0' ? userDataDirectory : '';
|
140
141
|
}
|
141
142
|
|
143
|
+
await initModifiedUserAgent(browser, playwrightDeviceDetailsObject);
|
142
144
|
const crawler = new crawlee.PlaywrightCrawler({
|
143
145
|
launchContext: {
|
144
146
|
launcher: constants.launcher,
|
@@ -244,7 +246,7 @@ const crawlSitemap = async (
|
|
244
246
|
return;
|
245
247
|
}
|
246
248
|
|
247
|
-
if (
|
249
|
+
if (request.skipNavigation && actualUrl === "about:blank") {
|
248
250
|
if (!isScanPdfs) {
|
249
251
|
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
250
252
|
numScanned: urlsCrawled.scanned.length,
|
@@ -271,8 +273,8 @@ const crawlSitemap = async (
|
|
271
273
|
return;
|
272
274
|
}
|
273
275
|
|
274
|
-
const contentType = response
|
275
|
-
const status = response.status();
|
276
|
+
const contentType = response?.headers?.()['content-type'] || '';
|
277
|
+
const status = response ? response.status() : 0;
|
276
278
|
|
277
279
|
if (blacklistedPatterns && !isFollowStrategy(actualUrl, request.url, "same-hostname") && isSkippedUrl(actualUrl, blacklistedPatterns)) {
|
278
280
|
urlsCrawled.userExcluded.push({
|
@@ -379,7 +381,7 @@ const crawlSitemap = async (
|
|
379
381
|
numScanned: urlsCrawled.scanned.length,
|
380
382
|
urlScanned: request.url,
|
381
383
|
});
|
382
|
-
urlsCrawled.error.push(
|
384
|
+
urlsCrawled.error.push(request.url);
|
383
385
|
crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
|
384
386
|
},
|
385
387
|
maxRequestsPerCrawl: Infinity,
|
@@ -0,0 +1,10 @@
|
|
1
|
+
/**
 * Escapes digits that directly follow `#` or `.` in a CSS selector, because an id or class
 * token may not begin with a bare digit. Each such digit is rewritten as its code-point
 * escape followed by a space, e.g. `#123` becomes `#\31 23`.
 *
 * If the replacement throws, the error is logged and the selector is returned unchanged.
 */
export function escapeCssSelector(selector: string) {
  // `#` or `.` immediately followed by a digit
  const digitAfterMarker = /([#\.])(\d)/g;

  try {
    const escaped = selector.replace(
      digitAfterMarker,
      (_whole, marker, firstDigit) => `${marker}\\3${firstDigit} `,
    );
    return escaped;
  } catch (err) {
    console.error(`error escaping css selector: ${selector}`, err);
    return selector;
  }
}
|
10
|
+
|
@@ -0,0 +1,13 @@
|
|
1
|
+
/**
 * Checks whether an element's `alt` attribute is something other than a generic placeholder.
 *
 * @param node - Element whose `alt` attribute is inspected.
 * @returns `false` when the trimmed, lower-cased alt text is exactly one of
 *          `img`, `image`, `picture`, `photo` or `graphic`; `true` otherwise,
 *          including when the attribute is missing or empty.
 */
export function evaluateAltText(node: Element) {
  const placeholderWords = ['img', 'image', 'picture', 'photo', 'graphic'];
  const rawAlt = node.getAttribute('alt');

  // Missing or empty alt text is not judged here.
  if (!rawAlt) {
    return true;
  }

  const normalisedAlt = rawAlt.trim().toLowerCase();
  return !placeholderWords.includes(normalisedAlt);
}
|
13
|
+
|
@@ -47,8 +47,6 @@ export async function extractAndGradeText(page: Page): Promise<string> {
|
|
47
47
|
const result =
|
48
48
|
readabilityScore === 0 || readabilityScore > 50 ? '' : readabilityScore.toString(); // Convert readabilityScore to string
|
49
49
|
|
50
|
-
const pageUrl = await page.url(); // Get the page URL
|
51
|
-
|
52
50
|
return result;
|
53
51
|
} catch (error) {
|
54
52
|
console.error('Error extracting and grading text:', error);
|
@@ -0,0 +1,28 @@
|
|
1
|
+
/**
 * Collects the sentences found in every `<p>` element of the current document.
 *
 * A sentence is a run of text terminated by one or more of `.`, `!` or `?`;
 * trailing text with no terminating punctuation is not included.
 *
 * @returns The trimmed sentences in document order, or an empty array if anything throws.
 */
export function extractText(): string[] {
  try {
    const sentences: string[] = [];
    const paragraphs = Array.from(document.querySelectorAll('p')); // Adjust selector as needed

    for (const paragraph of paragraphs) {
      // Match sentences ending with ., !, or ?
      const found = paragraph.innerText.trim().match(/[^.!?]*[.!?]+/g);
      if (!found) continue;

      for (const candidate of found) {
        const sentence = candidate.trim();
        if (sentence.length > 0) {
          sentences.push(sentence);
        }
      }
    }

    return sentences;
  } catch (error) {
    console.error('Error extracting text:', error);
    return []; // Fall back to an empty array on error
  }
}
|
@@ -0,0 +1,46 @@
|
|
1
|
+
import { framesCheck } from "./framesCheck.js";
|
2
|
+
|
3
|
+
/**
 * Resolves a CSS selector to the `outerHTML` of the first matching element.
 *
 * When the selector contains a `> html` segment it is handed to `framesCheck`, and the
 * document and remaining selector it returns are used for the lookup. If nothing matches
 * there, the shadow roots of elements in the top-level document are searched.
 *
 * @param cssSelector - Selector to resolve.
 * @returns The matched element's `outerHTML`, or `null` (after a console warning) when
 *          no element is found.
 */
export function findElementByCssSelector(cssSelector: string): string | null {
  let searchRoot = document;
  let selector = cssSelector;

  // A "> html" segment means the target sits inside a frame; switch document and selector.
  if (/\s*>\s*html\s*/.test(selector)) {
    const frameContext = framesCheck(selector);
    searchRoot = frameContext.doc;
    selector = frameContext.remainingSelector;
  }

  let match = searchRoot.querySelector(selector);

  if (!match) {
    // Fall back to shadow DOM: try the selector inside each shadow root in turn.
    // NOTE(review): this scans the top-level `document`, not `searchRoot`, so shadow roots
    // inside a frame are not searched; confirm whether that is intended.
    for (const host of Array.from(document.querySelectorAll('*'))) {
      if (!host.shadowRoot) continue;

      const insideShadow = host.shadowRoot.querySelector(selector);
      if (insideShadow) {
        match = insideShadow;
        break;
      }
    }
  }

  if (!match) {
    console.warn(`Unable to find element for css selector: ${selector}`);
    return null;
  }

  return match.outerHTML;
}
|
46
|
+
|