@govtechsg/oobee 0.10.34 → 0.10.39
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.vscode/settings.json +1 -1
- package/DETAILS.md +58 -42
- package/INTEGRATION.md +142 -53
- package/README.md +15 -0
- package/__mocks__/mock-report.html +1 -1
- package/exclusions.txt +4 -1
- package/package.json +2 -2
- package/src/constants/cliFunctions.ts +0 -7
- package/src/constants/common.ts +39 -1
- package/src/constants/constants.ts +9 -8
- package/src/constants/itemTypeDescription.ts +3 -3
- package/src/crawlers/commonCrawlerFunc.ts +67 -214
- package/src/crawlers/crawlDomain.ts +6 -2
- package/src/crawlers/crawlLocalFile.ts +2 -0
- package/src/crawlers/crawlSitemap.ts +5 -3
- package/src/crawlers/custom/escapeCssSelector.ts +10 -0
- package/src/crawlers/custom/evaluateAltText.ts +13 -0
- package/src/crawlers/custom/extractAndGradeText.ts +0 -2
- package/src/crawlers/custom/extractText.ts +28 -0
- package/src/crawlers/custom/findElementByCssSelector.ts +46 -0
- package/src/crawlers/custom/flagUnlabelledClickableElements.ts +1006 -901
- package/src/crawlers/custom/framesCheck.ts +51 -0
- package/src/crawlers/custom/getAxeConfiguration.ts +126 -0
- package/src/crawlers/custom/gradeReadability.ts +30 -0
- package/src/crawlers/custom/xPathToCss.ts +178 -0
- package/src/mergeAxeResults.ts +503 -132
- package/src/npmIndex.ts +130 -62
- package/src/static/ejs/partials/components/ruleOffcanvas.ejs +1 -1
- package/src/static/ejs/partials/components/scanAbout.ejs +1 -1
- package/src/static/ejs/partials/components/summaryScanResults.ejs +1 -1
- package/src/static/ejs/partials/components/wcagCompliance.ejs +3 -2
- package/src/static/ejs/partials/footer.ejs +13 -7
- package/src/static/ejs/partials/scripts/reportSearch.ejs +112 -74
- package/src/static/ejs/partials/scripts/ruleOffcanvas.ejs +2 -2
- package/src/static/ejs/partials/scripts/utils.ejs +1 -1
- package/src/static/ejs/partials/summaryMain.ejs +6 -6
- package/src/static/ejs/report.ejs +5 -5
- package/src/utils.ts +29 -10
- package/src/xPathToCssCypress.ts +178 -0
- package/src/crawlers/customAxeFunctions.ts +0 -82
@@ -7,6 +7,7 @@ import os from 'os';
|
|
7
7
|
import { spawnSync, execSync } from 'child_process';
|
8
8
|
import { chromium } from 'playwright';
|
9
9
|
import { silentLogger } from '../logs.js';
|
10
|
+
import { PageInfo } from '../mergeAxeResults.js';
|
10
11
|
|
11
12
|
const filename = fileURLToPath(import.meta.url);
|
12
13
|
const dirname = path.dirname(filename);
|
@@ -177,16 +178,16 @@ export const basicAuthRegex = /^.*\/\/.*:.*@.*$/i;
|
|
177
178
|
export const axeScript = path.join(dirname, '../../node_modules/axe-core/axe.min.js');
|
178
179
|
export class UrlsCrawled {
|
179
180
|
toScan: string[] = [];
|
180
|
-
scanned:
|
181
|
-
invalid:
|
181
|
+
scanned: PageInfo[] = [];
|
182
|
+
invalid: PageInfo[] = [];
|
182
183
|
scannedRedirects: { fromUrl: string; toUrl: string }[] = [];
|
183
184
|
notScannedRedirects: { fromUrl: string; toUrl: string }[] = [];
|
184
|
-
outOfDomain:
|
185
|
-
blacklisted:
|
186
|
-
error:
|
187
|
-
exceededRequests:
|
188
|
-
forbidden:
|
189
|
-
userExcluded:
|
185
|
+
outOfDomain: PageInfo[] = [];
|
186
|
+
blacklisted: PageInfo[] = [];
|
187
|
+
error: PageInfo[] = [];
|
188
|
+
exceededRequests: PageInfo[] = [];
|
189
|
+
forbidden: PageInfo[] = [];
|
190
|
+
userExcluded: PageInfo[] = [];
|
190
191
|
everything: string[] = [];
|
191
192
|
|
192
193
|
constructor(urlsCrawled?: Partial<UrlsCrawled>) {
|
@@ -1,10 +1,10 @@
|
|
1
1
|
const itemTypeDescription = {
|
2
2
|
mustFix:
|
3
|
-
'
|
3
|
+
'Must Fix issues includes WCAG A & AA success criteria (excluding those requiring review).',
|
4
4
|
goodToFix:
|
5
|
-
'
|
5
|
+
'Good to Fix issues includes WCAG Level AAA success criteria issues and all best practice rules that do not necessarily conform to WCAG success criterion but are industry accepted practices that improve the user experience.',
|
6
6
|
needsReview:
|
7
|
-
'
|
7
|
+
'Manual Review Required occurrences could potentially be false positive, requiring human validation for accuracy.',
|
8
8
|
passed: 'Occurrences that passed the automated checks.',
|
9
9
|
};
|
10
10
|
|
@@ -1,7 +1,6 @@
|
|
1
1
|
import crawlee, { CrawlingContext, PlaywrightGotoOptions } from 'crawlee';
|
2
2
|
import axe, { AxeResults, ImpactValue, NodeResult, Result, resultGroups, TagValue } from 'axe-core';
|
3
3
|
import { BrowserContext, Page } from 'playwright';
|
4
|
-
import { xPathToCss } from '../xPathToCss.js';
|
5
4
|
import {
|
6
5
|
axeScript,
|
7
6
|
guiInfoStatusTypes,
|
@@ -11,10 +10,15 @@ import {
|
|
11
10
|
import { guiInfoLog, silentLogger } from '../logs.js';
|
12
11
|
import { takeScreenshotForHTMLElements } from '../screenshotFunc/htmlScreenshotFunc.js';
|
13
12
|
import { isFilePath } from '../constants/common.js';
|
14
|
-
import { customAxeConfig } from './customAxeFunctions.js';
|
15
|
-
import { flagUnlabelledClickableElements } from './custom/flagUnlabelledClickableElements.js';
|
16
13
|
import { extractAndGradeText } from './custom/extractAndGradeText.js';
|
17
14
|
import { ItemsInfo } from '../mergeAxeResults.js';
|
15
|
+
import { evaluateAltText } from './custom/evaluateAltText.js';
|
16
|
+
import { escapeCssSelector } from './custom/escapeCssSelector.js';
|
17
|
+
import { framesCheck } from './custom/framesCheck.js';
|
18
|
+
import { findElementByCssSelector } from './custom/findElementByCssSelector.js';
|
19
|
+
import { getAxeConfiguration } from './custom/getAxeConfiguration.js';
|
20
|
+
import { flagUnlabelledClickableElements } from './custom/flagUnlabelledClickableElements.js';
|
21
|
+
import { xPathToCss } from './custom/xPathToCss.js';
|
18
22
|
|
19
23
|
// types
|
20
24
|
interface AxeResultsWithScreenshot extends AxeResults {
|
@@ -86,17 +90,17 @@ export const filterAxeResults = (
|
|
86
90
|
const conformance = tags.filter(tag => tag.startsWith('wcag') || tag === 'best-practice');
|
87
91
|
|
88
92
|
// handle rare cases where conformance level is not the first element
|
89
|
-
const
|
90
|
-
|
93
|
+
const wcagRegex = /^wcag\d+a+$/;
|
94
|
+
|
95
|
+
if (conformance[0] !== 'best-practice' && !wcagRegex.test(conformance[0])) {
|
91
96
|
conformance.sort((a, b) => {
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
return 0;
|
97
|
+
if (wcagRegex.test(a) && !wcagRegex.test(b)) {
|
98
|
+
return -1;
|
99
|
+
}
|
100
|
+
if (!wcagRegex.test(a) && wcagRegex.test(b)) {
|
101
|
+
return 1;
|
102
|
+
}
|
103
|
+
return 0;
|
100
104
|
});
|
101
105
|
}
|
102
106
|
|
@@ -138,9 +142,13 @@ export const filterAxeResults = (
|
|
138
142
|
|
139
143
|
nodes.forEach(node => {
|
140
144
|
const { impact } = node;
|
145
|
+
const hasWcagA = conformance.some(tag => /^wcag\d*a$/.test(tag));
|
146
|
+
const hasWcagAA = conformance.some(tag => /^wcag\d*aa$/.test(tag));
|
147
|
+
// const hasWcagAAA = conformance.some(tag => /^wcag\d*aaa$/.test(tag));
|
148
|
+
|
141
149
|
if (displayNeedsReview) {
|
142
150
|
addTo(needsReview, node);
|
143
|
-
} else if (
|
151
|
+
} else if (hasWcagA || hasWcagAA) {
|
144
152
|
addTo(mustFix, node);
|
145
153
|
} else {
|
146
154
|
addTo(goodToFix, node);
|
@@ -283,21 +291,6 @@ export const runAxeScript = async ({
|
|
283
291
|
});
|
284
292
|
|
285
293
|
const disableOobee = ruleset.includes(RuleFlags.DISABLE_OOBEE);
|
286
|
-
const oobeeAccessibleLabelFlaggedXpaths = disableOobee
|
287
|
-
? []
|
288
|
-
: (await flagUnlabelledClickableElements(page)).map(item => item.xpath);
|
289
|
-
const oobeeAccessibleLabelFlaggedCssSelectors = oobeeAccessibleLabelFlaggedXpaths
|
290
|
-
.map(xpath => {
|
291
|
-
try {
|
292
|
-
const cssSelector = xPathToCss(xpath);
|
293
|
-
return cssSelector;
|
294
|
-
} catch (e) {
|
295
|
-
console.error('Error converting XPath to CSS: ', xpath, e);
|
296
|
-
return '';
|
297
|
-
}
|
298
|
-
})
|
299
|
-
.filter(item => item !== '');
|
300
|
-
|
301
294
|
const enableWcagAaa = ruleset.includes(RuleFlags.ENABLE_WCAG_AAA);
|
302
295
|
|
303
296
|
const gradingReadabilityFlag = await extractAndGradeText(page); // Ensure flag is obtained before proceeding
|
@@ -308,103 +301,52 @@ export const runAxeScript = async ({
|
|
308
301
|
async ({
|
309
302
|
selectors,
|
310
303
|
saflyIconSelector,
|
311
|
-
customAxeConfig,
|
312
304
|
disableOobee,
|
313
305
|
enableWcagAaa,
|
314
|
-
oobeeAccessibleLabelFlaggedCssSelectors,
|
315
306
|
gradingReadabilityFlag,
|
307
|
+
evaluateAltTextFunctionString,
|
308
|
+
escapeCssSelectorFunctionString,
|
309
|
+
framesCheckFunctionString,
|
310
|
+
findElementByCssSelectorFunctionString,
|
311
|
+
getAxeConfigurationFunctionString,
|
312
|
+
flagUnlabelledClickableElementsFunctionString,
|
313
|
+
xPathToCssFunctionString,
|
316
314
|
}) => {
|
317
315
|
try {
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
}
|
327
|
-
}
|
328
|
-
return true;
|
329
|
-
};
|
330
|
-
|
331
|
-
// for css id selectors starting with a digit, escape it with the unicode character e.g. #123 -> #\31 23
|
332
|
-
const escapeCSSSelector = (selector: string) => {
|
333
|
-
try {
|
334
|
-
return selector.replace(
|
335
|
-
/([#\.])(\d)/g,
|
336
|
-
(_match, prefix, digit) => `${prefix}\\3${digit} `,
|
337
|
-
);
|
338
|
-
} catch (e) {
|
339
|
-
console.error(`error escaping css selector: ${selector}`, e);
|
340
|
-
return selector;
|
341
|
-
}
|
342
|
-
};
|
343
|
-
|
316
|
+
// Load functions into the browser context
|
317
|
+
eval(evaluateAltTextFunctionString);
|
318
|
+
eval(escapeCssSelectorFunctionString);
|
319
|
+
eval(framesCheckFunctionString);
|
320
|
+
eval(findElementByCssSelectorFunctionString);
|
321
|
+
eval(flagUnlabelledClickableElementsFunctionString);
|
322
|
+
eval(xPathToCssFunctionString);
|
323
|
+
eval(getAxeConfigurationFunctionString);
|
344
324
|
// remove so that axe does not scan
|
345
325
|
document.querySelector(saflyIconSelector)?.remove();
|
346
326
|
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
return true; // Pass if no readability issues
|
367
|
-
}
|
368
|
-
// Dynamically update the grading messages
|
369
|
-
const gradingCheck = customAxeConfig.checks.find(
|
370
|
-
check => check.id === 'oobee-grading-text-contents',
|
371
|
-
);
|
372
|
-
if (gradingCheck) {
|
373
|
-
gradingCheck.metadata.messages.incomplete = `The text content is potentially difficult to read, with a Flesch-Kincaid Reading Ease score of ${gradingReadabilityFlag
|
374
|
-
}.\nThe target passing score is above 50, indicating content readable by university students and lower grade levels.\nA higher score reflects better readability.`;
|
375
|
-
}
|
376
|
-
|
377
|
-
// Fail if readability issues are detected
|
378
|
-
},
|
379
|
-
},
|
380
|
-
]
|
381
|
-
: []),
|
382
|
-
],
|
383
|
-
rules: customAxeConfig.rules
|
384
|
-
.filter(rule => (disableOobee ? !rule.id.startsWith('oobee') : true))
|
385
|
-
.concat(
|
386
|
-
enableWcagAaa
|
387
|
-
? [
|
388
|
-
{
|
389
|
-
id: 'color-contrast-enhanced',
|
390
|
-
enabled: true,
|
391
|
-
tags: ['wcag2aaa', 'wcag146'],
|
392
|
-
},
|
393
|
-
{
|
394
|
-
id: 'identical-links-same-purpose',
|
395
|
-
enabled: true,
|
396
|
-
tags: ['wcag2aaa', 'wcag249'],
|
397
|
-
},
|
398
|
-
{
|
399
|
-
id: 'meta-refresh-no-exceptions',
|
400
|
-
enabled: true,
|
401
|
-
tags: ['wcag2aaa', 'wcag224', 'wcag325'],
|
402
|
-
},
|
403
|
-
]
|
404
|
-
: [],
|
405
|
-
),
|
327
|
+
const oobeeAccessibleLabelFlaggedXpaths = disableOobee
|
328
|
+
? []
|
329
|
+
: (await flagUnlabelledClickableElements()).map(item => item.xpath);
|
330
|
+
const oobeeAccessibleLabelFlaggedCssSelectors = oobeeAccessibleLabelFlaggedXpaths
|
331
|
+
.map(xpath => {
|
332
|
+
try {
|
333
|
+
const cssSelector = xPathToCss(xpath);
|
334
|
+
return cssSelector;
|
335
|
+
} catch (e) {
|
336
|
+
console.error('Error converting XPath to CSS: ', xpath, e);
|
337
|
+
return '';
|
338
|
+
}
|
339
|
+
})
|
340
|
+
.filter(item => item !== '');
|
341
|
+
|
342
|
+
const axeConfig = getAxeConfiguration({
|
343
|
+
enableWcagAaa,
|
344
|
+
gradingReadabilityFlag,
|
345
|
+
disableOobee,
|
406
346
|
});
|
407
347
|
|
348
|
+
axe.configure(axeConfig);
|
349
|
+
|
408
350
|
// removed needsReview condition
|
409
351
|
const defaultResultTypes: resultGroups[] = ['violations', 'passes', 'incomplete'];
|
410
352
|
|
@@ -418,102 +360,7 @@ export const runAxeScript = async ({
|
|
418
360
|
}
|
419
361
|
// handle css id selectors that start with a digit
|
420
362
|
const escapedCssSelectors =
|
421
|
-
oobeeAccessibleLabelFlaggedCssSelectors.map(
|
422
|
-
|
423
|
-
function framesCheck(cssSelector: string): {
|
424
|
-
doc: Document;
|
425
|
-
remainingSelector: string;
|
426
|
-
} {
|
427
|
-
let doc = document; // Start with the main document
|
428
|
-
let remainingSelector = ''; // To store the last part of the selector
|
429
|
-
let targetIframe = null;
|
430
|
-
|
431
|
-
// Split the selector into parts at "> html"
|
432
|
-
const diffParts = cssSelector.split(/\s*>\s*html\s*/);
|
433
|
-
|
434
|
-
for (let i = 0; i < diffParts.length - 1; i++) {
|
435
|
-
let iframeSelector = `${diffParts[i].trim()}`;
|
436
|
-
|
437
|
-
// Add back '> html' to the current part
|
438
|
-
if (i > 0) {
|
439
|
-
iframeSelector = `html > ${iframeSelector}`;
|
440
|
-
}
|
441
|
-
|
442
|
-
let frameset = null;
|
443
|
-
// Find the iframe using the current document context
|
444
|
-
if (doc.querySelector('frameset')) {
|
445
|
-
frameset = doc.querySelector('frameset');
|
446
|
-
}
|
447
|
-
|
448
|
-
if (frameset) {
|
449
|
-
doc = frameset;
|
450
|
-
iframeSelector = iframeSelector.split('body >')[1].trim();
|
451
|
-
}
|
452
|
-
targetIframe = doc.querySelector(iframeSelector);
|
453
|
-
|
454
|
-
if (targetIframe && targetIframe.contentDocument) {
|
455
|
-
// Update the document to the iframe's contentDocument
|
456
|
-
doc = targetIframe.contentDocument;
|
457
|
-
} else {
|
458
|
-
console.warn(
|
459
|
-
`Iframe not found or contentDocument inaccessible for selector: ${iframeSelector}`,
|
460
|
-
);
|
461
|
-
return { doc, remainingSelector: cssSelector }; // Return original selector if iframe not found
|
462
|
-
}
|
463
|
-
}
|
464
|
-
|
465
|
-
// The last part is the remaining CSS selector
|
466
|
-
remainingSelector = diffParts[diffParts.length - 1].trim();
|
467
|
-
|
468
|
-
// Remove any leading '>' combinators from remainingSelector
|
469
|
-
remainingSelector = `html${remainingSelector}`;
|
470
|
-
|
471
|
-
return { doc, remainingSelector };
|
472
|
-
}
|
473
|
-
|
474
|
-
function findElementByCssSelector(cssSelector: string): string | null {
|
475
|
-
let doc = document;
|
476
|
-
|
477
|
-
// Check if the selector includes 'frame' or 'iframe' and update doc and selector
|
478
|
-
|
479
|
-
if (/\s*>\s*html\s*/.test(cssSelector)) {
|
480
|
-
const inFrames = framesCheck(cssSelector);
|
481
|
-
doc = inFrames.doc;
|
482
|
-
cssSelector = inFrames.remainingSelector;
|
483
|
-
}
|
484
|
-
|
485
|
-
// Query the element in the document (including inside frames)
|
486
|
-
let element = doc.querySelector(cssSelector);
|
487
|
-
|
488
|
-
// Handle Shadow DOM if the element is not found
|
489
|
-
if (!element) {
|
490
|
-
const shadowRoots = [];
|
491
|
-
const allElements = document.querySelectorAll('*');
|
492
|
-
|
493
|
-
// Look for elements with shadow roots
|
494
|
-
allElements.forEach(el => {
|
495
|
-
if (el.shadowRoot) {
|
496
|
-
shadowRoots.push(el.shadowRoot);
|
497
|
-
}
|
498
|
-
});
|
499
|
-
|
500
|
-
// Search inside each shadow root for the element
|
501
|
-
for (const shadowRoot of shadowRoots) {
|
502
|
-
const shadowElement = shadowRoot.querySelector(cssSelector);
|
503
|
-
if (shadowElement) {
|
504
|
-
element = shadowElement; // Found the element inside shadow DOM
|
505
|
-
break;
|
506
|
-
}
|
507
|
-
}
|
508
|
-
}
|
509
|
-
|
510
|
-
if (element) {
|
511
|
-
return element.outerHTML;
|
512
|
-
}
|
513
|
-
|
514
|
-
console.warn(`Unable to find element for css selector: ${cssSelector}`);
|
515
|
-
return null;
|
516
|
-
}
|
363
|
+
oobeeAccessibleLabelFlaggedCssSelectors.map(escapeCssSelector);
|
517
364
|
|
518
365
|
// Add oobee violations to Axe's report
|
519
366
|
const oobeeAccessibleLabelViolations = {
|
@@ -560,11 +407,17 @@ export const runAxeScript = async ({
|
|
560
407
|
{
|
561
408
|
selectors,
|
562
409
|
saflyIconSelector,
|
563
|
-
customAxeConfig,
|
564
410
|
disableOobee,
|
565
411
|
enableWcagAaa,
|
566
|
-
oobeeAccessibleLabelFlaggedCssSelectors,
|
567
412
|
gradingReadabilityFlag,
|
413
|
+
evaluateAltTextFunctionString: evaluateAltText.toString(),
|
414
|
+
escapeCssSelectorFunctionString: escapeCssSelector.toString(),
|
415
|
+
framesCheckFunctionString: framesCheck.toString(),
|
416
|
+
findElementByCssSelectorFunctionString: findElementByCssSelector.toString(),
|
417
|
+
getAxeConfigurationFunctionString: getAxeConfiguration.toString(),
|
418
|
+
flagUnlabelledClickableElementsFunctionString:
|
419
|
+
flagUnlabelledClickableElements.toString(),
|
420
|
+
xPathToCssFunctionString: xPathToCss.toString(),
|
568
421
|
},
|
569
422
|
);
|
570
423
|
|
@@ -29,6 +29,7 @@ import {
|
|
29
29
|
getBlackListedPatterns,
|
30
30
|
urlWithoutAuth,
|
31
31
|
waitForPageLoaded,
|
32
|
+
initModifiedUserAgent,
|
32
33
|
} from '../constants/common.js';
|
33
34
|
import { areLinksEqual, isFollowStrategy } from '../utils.js';
|
34
35
|
import {
|
@@ -455,6 +456,8 @@ const crawlDomain = async ({
|
|
455
456
|
userDataDir = process.env.CRAWLEE_HEADLESS !== '0' ? userDataDirectory : '';
|
456
457
|
}
|
457
458
|
|
459
|
+
await initModifiedUserAgent(browser, playwrightDeviceDetailsObject);
|
460
|
+
|
458
461
|
const crawler = new crawlee.PlaywrightCrawler({
|
459
462
|
launchContext: {
|
460
463
|
launcher: constants.launcher,
|
@@ -847,7 +850,7 @@ const crawlDomain = async ({
|
|
847
850
|
// when max pages have been scanned, scan will abort and all relevant pages still opened will close instantly.
|
848
851
|
// a browser close error will then be flagged. Since this is an intended behaviour, this error will be excluded.
|
849
852
|
if (!isAbortingScanNow) {
|
850
|
-
|
853
|
+
urlsCrawled.error.push({ url: request.url, pageTitle: request.url, actualUrl: request.url });
|
851
854
|
}
|
852
855
|
}
|
853
856
|
},
|
@@ -856,7 +859,8 @@ const crawlDomain = async ({
|
|
856
859
|
numScanned: urlsCrawled.scanned.length,
|
857
860
|
urlScanned: request.url,
|
858
861
|
});
|
859
|
-
urlsCrawled.error.push({ url: request.url });
|
862
|
+
urlsCrawled.error.push({ url: request.url, pageTitle: request.url, actualUrl: request.url });
|
863
|
+
|
860
864
|
crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
|
861
865
|
},
|
862
866
|
maxRequestsPerCrawl: Infinity,
|
@@ -10,6 +10,7 @@ import {
|
|
10
10
|
isFilePath,
|
11
11
|
convertLocalFileToPath,
|
12
12
|
convertPathToLocalFile,
|
13
|
+
initModifiedUserAgent,
|
13
14
|
} from '../constants/common.js';
|
14
15
|
import { runPdfScan, mapPdfScanResults, doPdfScreenshots } from './pdfScanFunc.js';
|
15
16
|
import { guiInfoLog } from '../logs.js';
|
@@ -142,6 +143,7 @@ const crawlLocalFile = async (
|
|
142
143
|
uuidToPdfMapping[pdfFileName] = trimmedUrl;
|
143
144
|
|
144
145
|
if (!isUrlPdf(request.url)) {
|
146
|
+
await initModifiedUserAgent(browser);
|
145
147
|
const browserContext = await constants.launcher.launchPersistentContext('', {
|
146
148
|
headless: false,
|
147
149
|
...getPlaywrightLaunchOptions(browser),
|
@@ -17,6 +17,7 @@ import {
|
|
17
17
|
urlWithoutAuth,
|
18
18
|
waitForPageLoaded,
|
19
19
|
isFilePath,
|
20
|
+
initModifiedUserAgent,
|
20
21
|
} from '../constants/common.js';
|
21
22
|
import { areLinksEqual, isWhitelistedContentType, isFollowStrategy } from '../utils.js';
|
22
23
|
import { handlePdfDownload, runPdfScan, mapPdfScanResults } from './pdfScanFunc.js';
|
@@ -139,6 +140,7 @@ const crawlSitemap = async (
|
|
139
140
|
userDataDir = process.env.CRAWLEE_HEADLESS !== '0' ? userDataDirectory : '';
|
140
141
|
}
|
141
142
|
|
143
|
+
await initModifiedUserAgent(browser, playwrightDeviceDetailsObject);
|
142
144
|
const crawler = new crawlee.PlaywrightCrawler({
|
143
145
|
launchContext: {
|
144
146
|
launcher: constants.launcher,
|
@@ -271,8 +273,8 @@ const crawlSitemap = async (
|
|
271
273
|
return;
|
272
274
|
}
|
273
275
|
|
274
|
-
const contentType = response
|
275
|
-
const status = response.status();
|
276
|
+
const contentType = response?.headers?.()['content-type'] || '';
|
277
|
+
const status = response ? response.status() : 0;
|
276
278
|
|
277
279
|
if (blacklistedPatterns && !isFollowStrategy(actualUrl, request.url, "same-hostname") && isSkippedUrl(actualUrl, blacklistedPatterns)) {
|
278
280
|
urlsCrawled.userExcluded.push({
|
@@ -379,7 +381,7 @@ const crawlSitemap = async (
|
|
379
381
|
numScanned: urlsCrawled.scanned.length,
|
380
382
|
urlScanned: request.url,
|
381
383
|
});
|
382
|
-
urlsCrawled.error.push(
|
384
|
+
urlsCrawled.error.push(request.url);
|
383
385
|
crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
|
384
386
|
},
|
385
387
|
maxRequestsPerCrawl: Infinity,
|
@@ -0,0 +1,10 @@
|
|
1
|
+
// for css id selectors starting with a digit, escape it with the unicode character e.g. #123 -> #\31 23
|
2
|
+
export function escapeCssSelector(selector: string) {
|
3
|
+
try {
|
4
|
+
return selector.replace(/([#\.])(\d)/g, (_match, prefix, digit) => `${prefix}\\3${digit} `);
|
5
|
+
} catch (e) {
|
6
|
+
console.error(`error escaping css selector: ${selector}`, e);
|
7
|
+
return selector;
|
8
|
+
}
|
9
|
+
}
|
10
|
+
|
@@ -0,0 +1,13 @@
|
|
1
|
+
export function evaluateAltText(node: Element) {
|
2
|
+
const altText = node.getAttribute('alt');
|
3
|
+
const confusingTexts = ['img', 'image', 'picture', 'photo', 'graphic'];
|
4
|
+
|
5
|
+
if (altText) {
|
6
|
+
const trimmedAltText = altText.trim().toLowerCase();
|
7
|
+
if (confusingTexts.includes(trimmedAltText)) {
|
8
|
+
return false;
|
9
|
+
}
|
10
|
+
}
|
11
|
+
return true;
|
12
|
+
}
|
13
|
+
|
@@ -47,8 +47,6 @@ export async function extractAndGradeText(page: Page): Promise<string> {
|
|
47
47
|
const result =
|
48
48
|
readabilityScore === 0 || readabilityScore > 50 ? '' : readabilityScore.toString(); // Convert readabilityScore to string
|
49
49
|
|
50
|
-
const pageUrl = await page.url(); // Get the page URL
|
51
|
-
|
52
50
|
return result;
|
53
51
|
} catch (error) {
|
54
52
|
console.error('Error extracting and grading text:', error);
|
@@ -0,0 +1,28 @@
|
|
1
|
+
export function extractText(): string[] {
|
2
|
+
try {
|
3
|
+
// Extract text content from all specified elements (e.g., paragraphs)
|
4
|
+
const elements = document.querySelectorAll('p'); // Adjust selector as needed
|
5
|
+
const extractedSentences: string[] = [];
|
6
|
+
|
7
|
+
elements.forEach(element => {
|
8
|
+
const text = element.innerText.trim();
|
9
|
+
// Split the text into individual sentences
|
10
|
+
const sentencePattern = /[^.!?]*[.!?]+/g; // Match sentences ending with ., !, or ?
|
11
|
+
const matches = text.match(sentencePattern);
|
12
|
+
if (matches) {
|
13
|
+
// Add only sentences that end with punctuation
|
14
|
+
matches.forEach(sentence => {
|
15
|
+
const trimmedSentence = sentence.trim(); // Trim whitespace from each sentence
|
16
|
+
if (trimmedSentence.length > 0) {
|
17
|
+
extractedSentences.push(trimmedSentence);
|
18
|
+
}
|
19
|
+
});
|
20
|
+
}
|
21
|
+
});
|
22
|
+
|
23
|
+
return extractedSentences;
|
24
|
+
} catch (error) {
|
25
|
+
console.error('Error extracting text:', error);
|
26
|
+
return []; // Return an empty string in case of an error
|
27
|
+
}
|
28
|
+
}
|
@@ -0,0 +1,46 @@
|
|
1
|
+
import { framesCheck } from "./framesCheck.js";
|
2
|
+
|
3
|
+
export function findElementByCssSelector(cssSelector: string): string | null {
|
4
|
+
let doc = document;
|
5
|
+
|
6
|
+
// Check if the selector includes 'frame' or 'iframe' and update doc and selector
|
7
|
+
|
8
|
+
if (/\s*>\s*html\s*/.test(cssSelector)) {
|
9
|
+
const inFrames = framesCheck(cssSelector);
|
10
|
+
doc = inFrames.doc;
|
11
|
+
cssSelector = inFrames.remainingSelector;
|
12
|
+
}
|
13
|
+
|
14
|
+
// Query the element in the document (including inside frames)
|
15
|
+
let element = doc.querySelector(cssSelector);
|
16
|
+
|
17
|
+
// Handle Shadow DOM if the element is not found
|
18
|
+
if (!element) {
|
19
|
+
const shadowRoots = [];
|
20
|
+
const allElements = document.querySelectorAll('*');
|
21
|
+
|
22
|
+
// Look for elements with shadow roots
|
23
|
+
allElements.forEach(el => {
|
24
|
+
if (el.shadowRoot) {
|
25
|
+
shadowRoots.push(el.shadowRoot);
|
26
|
+
}
|
27
|
+
});
|
28
|
+
|
29
|
+
// Search inside each shadow root for the element
|
30
|
+
for (const shadowRoot of shadowRoots) {
|
31
|
+
const shadowElement = shadowRoot.querySelector(cssSelector);
|
32
|
+
if (shadowElement) {
|
33
|
+
element = shadowElement; // Found the element inside shadow DOM
|
34
|
+
break;
|
35
|
+
}
|
36
|
+
}
|
37
|
+
}
|
38
|
+
|
39
|
+
if (element) {
|
40
|
+
return element.outerHTML;
|
41
|
+
}
|
42
|
+
|
43
|
+
console.warn(`Unable to find element for css selector: ${cssSelector}`);
|
44
|
+
return null;
|
45
|
+
}
|
46
|
+
|