@govtechsg/oobee 0.10.34 → 0.10.39

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/.vscode/settings.json +1 -1
  2. package/DETAILS.md +58 -42
  3. package/INTEGRATION.md +142 -53
  4. package/README.md +15 -0
  5. package/__mocks__/mock-report.html +1 -1
  6. package/exclusions.txt +4 -1
  7. package/package.json +2 -2
  8. package/src/constants/cliFunctions.ts +0 -7
  9. package/src/constants/common.ts +39 -1
  10. package/src/constants/constants.ts +9 -8
  11. package/src/constants/itemTypeDescription.ts +3 -3
  12. package/src/crawlers/commonCrawlerFunc.ts +67 -214
  13. package/src/crawlers/crawlDomain.ts +6 -2
  14. package/src/crawlers/crawlLocalFile.ts +2 -0
  15. package/src/crawlers/crawlSitemap.ts +5 -3
  16. package/src/crawlers/custom/escapeCssSelector.ts +10 -0
  17. package/src/crawlers/custom/evaluateAltText.ts +13 -0
  18. package/src/crawlers/custom/extractAndGradeText.ts +0 -2
  19. package/src/crawlers/custom/extractText.ts +28 -0
  20. package/src/crawlers/custom/findElementByCssSelector.ts +46 -0
  21. package/src/crawlers/custom/flagUnlabelledClickableElements.ts +1006 -901
  22. package/src/crawlers/custom/framesCheck.ts +51 -0
  23. package/src/crawlers/custom/getAxeConfiguration.ts +126 -0
  24. package/src/crawlers/custom/gradeReadability.ts +30 -0
  25. package/src/crawlers/custom/xPathToCss.ts +178 -0
  26. package/src/mergeAxeResults.ts +503 -132
  27. package/src/npmIndex.ts +130 -62
  28. package/src/static/ejs/partials/components/ruleOffcanvas.ejs +1 -1
  29. package/src/static/ejs/partials/components/scanAbout.ejs +1 -1
  30. package/src/static/ejs/partials/components/summaryScanResults.ejs +1 -1
  31. package/src/static/ejs/partials/components/wcagCompliance.ejs +3 -2
  32. package/src/static/ejs/partials/footer.ejs +13 -7
  33. package/src/static/ejs/partials/scripts/reportSearch.ejs +112 -74
  34. package/src/static/ejs/partials/scripts/ruleOffcanvas.ejs +2 -2
  35. package/src/static/ejs/partials/scripts/utils.ejs +1 -1
  36. package/src/static/ejs/partials/summaryMain.ejs +6 -6
  37. package/src/static/ejs/report.ejs +5 -5
  38. package/src/utils.ts +29 -10
  39. package/src/xPathToCssCypress.ts +178 -0
  40. package/src/crawlers/customAxeFunctions.ts +0 -82
@@ -7,6 +7,7 @@ import os from 'os';
7
7
  import { spawnSync, execSync } from 'child_process';
8
8
  import { chromium } from 'playwright';
9
9
  import { silentLogger } from '../logs.js';
10
+ import { PageInfo } from '../mergeAxeResults.js';
10
11
 
11
12
  const filename = fileURLToPath(import.meta.url);
12
13
  const dirname = path.dirname(filename);
@@ -177,16 +178,16 @@ export const basicAuthRegex = /^.*\/\/.*:.*@.*$/i;
177
178
  export const axeScript = path.join(dirname, '../../node_modules/axe-core/axe.min.js');
178
179
  export class UrlsCrawled {
179
180
  toScan: string[] = [];
180
- scanned: { url: string; actualUrl: string; pageTitle: string }[] = [];
181
- invalid: { url: string; actualUrl: string; pageTitle: string }[] = [];
181
+ scanned: PageInfo[] = [];
182
+ invalid: PageInfo[] = [];
182
183
  scannedRedirects: { fromUrl: string; toUrl: string }[] = [];
183
184
  notScannedRedirects: { fromUrl: string; toUrl: string }[] = [];
184
- outOfDomain: string[] = [];
185
- blacklisted: { url: string; actualUrl: string; pageTitle: string }[] = [];
186
- error: { url: string }[] = [];
187
- exceededRequests: string[] = [];
188
- forbidden: { url: string; actualUrl: string; pageTitle: string }[] = [];
189
- userExcluded: { url: string; actualUrl: string; pageTitle: string }[] = [];
185
+ outOfDomain: PageInfo[] = [];
186
+ blacklisted: PageInfo[] = [];
187
+ error: PageInfo[] = [];
188
+ exceededRequests: PageInfo[] = [];
189
+ forbidden: PageInfo[] = [];
190
+ userExcluded: PageInfo[] = [];
190
191
  everything: string[] = [];
191
192
 
192
193
  constructor(urlsCrawled?: Partial<UrlsCrawled>) {
@@ -1,10 +1,10 @@
1
1
  const itemTypeDescription = {
2
2
  mustFix:
3
- 'Issues that need to be addressed promptly, as they create significant barriers for persons with disabilities and can prevent them from accessing essential content or features.',
3
+ 'Must Fix issues includes WCAG A & AA success criteria (excluding those requiring review).',
4
4
  goodToFix:
5
- 'Issues that could pose certain challenges for persons with disabilities (PWDs), but are unlikely to completely hinder their access to essential content or features.',
5
+ 'Good to Fix issues includes WCAG Level AAA success criteria issues and all best practice rules that do not necessarily conform to WCAG success criterion but are industry accepted practices that improve the user experience.',
6
6
  needsReview:
7
- 'Occurrences could potentially be false positives, requiring human validation for accuracy.',
7
+ 'Manual Review Required occurrences could potentially be false positive, requiring human validation for accuracy.',
8
8
  passed: 'Occurrences that passed the automated checks.',
9
9
  };
10
10
 
@@ -1,7 +1,6 @@
1
1
  import crawlee, { CrawlingContext, PlaywrightGotoOptions } from 'crawlee';
2
2
  import axe, { AxeResults, ImpactValue, NodeResult, Result, resultGroups, TagValue } from 'axe-core';
3
3
  import { BrowserContext, Page } from 'playwright';
4
- import { xPathToCss } from '../xPathToCss.js';
5
4
  import {
6
5
  axeScript,
7
6
  guiInfoStatusTypes,
@@ -11,10 +10,15 @@ import {
11
10
  import { guiInfoLog, silentLogger } from '../logs.js';
12
11
  import { takeScreenshotForHTMLElements } from '../screenshotFunc/htmlScreenshotFunc.js';
13
12
  import { isFilePath } from '../constants/common.js';
14
- import { customAxeConfig } from './customAxeFunctions.js';
15
- import { flagUnlabelledClickableElements } from './custom/flagUnlabelledClickableElements.js';
16
13
  import { extractAndGradeText } from './custom/extractAndGradeText.js';
17
14
  import { ItemsInfo } from '../mergeAxeResults.js';
15
+ import { evaluateAltText } from './custom/evaluateAltText.js';
16
+ import { escapeCssSelector } from './custom/escapeCssSelector.js';
17
+ import { framesCheck } from './custom/framesCheck.js';
18
+ import { findElementByCssSelector } from './custom/findElementByCssSelector.js';
19
+ import { getAxeConfiguration } from './custom/getAxeConfiguration.js';
20
+ import { flagUnlabelledClickableElements } from './custom/flagUnlabelledClickableElements.js';
21
+ import { xPathToCss } from './custom/xPathToCss.js';
18
22
 
19
23
  // types
20
24
  interface AxeResultsWithScreenshot extends AxeResults {
@@ -86,17 +90,17 @@ export const filterAxeResults = (
86
90
  const conformance = tags.filter(tag => tag.startsWith('wcag') || tag === 'best-practice');
87
91
 
88
92
  // handle rare cases where conformance level is not the first element
89
- const levels = ['wcag2a', 'wcag2aa', 'wcag2aaa'];
90
- if (conformance[0] !== 'best-practice' && !levels.includes(conformance[0])) {
93
+ const wcagRegex = /^wcag\d+a+$/;
94
+
95
+ if (conformance[0] !== 'best-practice' && !wcagRegex.test(conformance[0])) {
91
96
  conformance.sort((a, b) => {
92
- if (levels.includes(a)) {
93
- return -1;
94
- }
95
- if (levels.includes(b)) {
96
- return 1;
97
- }
98
-
99
- return 0;
97
+ if (wcagRegex.test(a) && !wcagRegex.test(b)) {
98
+ return -1;
99
+ }
100
+ if (!wcagRegex.test(a) && wcagRegex.test(b)) {
101
+ return 1;
102
+ }
103
+ return 0;
100
104
  });
101
105
  }
102
106
 
@@ -138,9 +142,13 @@ export const filterAxeResults = (
138
142
 
139
143
  nodes.forEach(node => {
140
144
  const { impact } = node;
145
+ const hasWcagA = conformance.some(tag => /^wcag\d*a$/.test(tag));
146
+ const hasWcagAA = conformance.some(tag => /^wcag\d*aa$/.test(tag));
147
+ // const hasWcagAAA = conformance.some(tag => /^wcag\d*aaa$/.test(tag));
148
+
141
149
  if (displayNeedsReview) {
142
150
  addTo(needsReview, node);
143
- } else if (impact === 'critical' || impact === 'serious') {
151
+ } else if (hasWcagA || hasWcagAA) {
144
152
  addTo(mustFix, node);
145
153
  } else {
146
154
  addTo(goodToFix, node);
@@ -283,21 +291,6 @@ export const runAxeScript = async ({
283
291
  });
284
292
 
285
293
  const disableOobee = ruleset.includes(RuleFlags.DISABLE_OOBEE);
286
- const oobeeAccessibleLabelFlaggedXpaths = disableOobee
287
- ? []
288
- : (await flagUnlabelledClickableElements(page)).map(item => item.xpath);
289
- const oobeeAccessibleLabelFlaggedCssSelectors = oobeeAccessibleLabelFlaggedXpaths
290
- .map(xpath => {
291
- try {
292
- const cssSelector = xPathToCss(xpath);
293
- return cssSelector;
294
- } catch (e) {
295
- console.error('Error converting XPath to CSS: ', xpath, e);
296
- return '';
297
- }
298
- })
299
- .filter(item => item !== '');
300
-
301
294
  const enableWcagAaa = ruleset.includes(RuleFlags.ENABLE_WCAG_AAA);
302
295
 
303
296
  const gradingReadabilityFlag = await extractAndGradeText(page); // Ensure flag is obtained before proceeding
@@ -308,103 +301,52 @@ export const runAxeScript = async ({
308
301
  async ({
309
302
  selectors,
310
303
  saflyIconSelector,
311
- customAxeConfig,
312
304
  disableOobee,
313
305
  enableWcagAaa,
314
- oobeeAccessibleLabelFlaggedCssSelectors,
315
306
  gradingReadabilityFlag,
307
+ evaluateAltTextFunctionString,
308
+ escapeCssSelectorFunctionString,
309
+ framesCheckFunctionString,
310
+ findElementByCssSelectorFunctionString,
311
+ getAxeConfigurationFunctionString,
312
+ flagUnlabelledClickableElementsFunctionString,
313
+ xPathToCssFunctionString,
316
314
  }) => {
317
315
  try {
318
- const evaluateAltText = (node: Element) => {
319
- const altText = node.getAttribute('alt');
320
- const confusingTexts = ['img', 'image', 'picture', 'photo', 'graphic'];
321
-
322
- if (altText) {
323
- const trimmedAltText = altText.trim().toLowerCase();
324
- if (confusingTexts.includes(trimmedAltText)) {
325
- return false;
326
- }
327
- }
328
- return true;
329
- };
330
-
331
- // for css id selectors starting with a digit, escape it with the unicode character e.g. #123 -> #\31 23
332
- const escapeCSSSelector = (selector: string) => {
333
- try {
334
- return selector.replace(
335
- /([#\.])(\d)/g,
336
- (_match, prefix, digit) => `${prefix}\\3${digit} `,
337
- );
338
- } catch (e) {
339
- console.error(`error escaping css selector: ${selector}`, e);
340
- return selector;
341
- }
342
- };
343
-
316
+ // Load functions into the browser context
317
+ eval(evaluateAltTextFunctionString);
318
+ eval(escapeCssSelectorFunctionString);
319
+ eval(framesCheckFunctionString);
320
+ eval(findElementByCssSelectorFunctionString);
321
+ eval(flagUnlabelledClickableElementsFunctionString);
322
+ eval(xPathToCssFunctionString);
323
+ eval(getAxeConfigurationFunctionString);
344
324
  // remove so that axe does not scan
345
325
  document.querySelector(saflyIconSelector)?.remove();
346
326
 
347
- axe.configure({
348
- branding: customAxeConfig.branding,
349
- checks: [
350
- {
351
- ...customAxeConfig.checks[0],
352
- evaluate: evaluateAltText,
353
- },
354
- {
355
- ...customAxeConfig.checks[1],
356
- evaluate: (node: HTMLElement) => {
357
- return !node.dataset.flagged; // fail any element with a data-flagged attribute set to true
358
- },
359
- },
360
- ...(enableWcagAaa
361
- ? [
362
- {
363
- ...customAxeConfig.checks[2],
364
- evaluate: (_node: HTMLElement) => {
365
- if (gradingReadabilityFlag === '') {
366
- return true; // Pass if no readability issues
367
- }
368
- // Dynamically update the grading messages
369
- const gradingCheck = customAxeConfig.checks.find(
370
- check => check.id === 'oobee-grading-text-contents',
371
- );
372
- if (gradingCheck) {
373
- gradingCheck.metadata.messages.incomplete = `The text content is potentially difficult to read, with a Flesch-Kincaid Reading Ease score of ${gradingReadabilityFlag
374
- }.\nThe target passing score is above 50, indicating content readable by university students and lower grade levels.\nA higher score reflects better readability.`;
375
- }
376
-
377
- // Fail if readability issues are detected
378
- },
379
- },
380
- ]
381
- : []),
382
- ],
383
- rules: customAxeConfig.rules
384
- .filter(rule => (disableOobee ? !rule.id.startsWith('oobee') : true))
385
- .concat(
386
- enableWcagAaa
387
- ? [
388
- {
389
- id: 'color-contrast-enhanced',
390
- enabled: true,
391
- tags: ['wcag2aaa', 'wcag146'],
392
- },
393
- {
394
- id: 'identical-links-same-purpose',
395
- enabled: true,
396
- tags: ['wcag2aaa', 'wcag249'],
397
- },
398
- {
399
- id: 'meta-refresh-no-exceptions',
400
- enabled: true,
401
- tags: ['wcag2aaa', 'wcag224', 'wcag325'],
402
- },
403
- ]
404
- : [],
405
- ),
327
+ const oobeeAccessibleLabelFlaggedXpaths = disableOobee
328
+ ? []
329
+ : (await flagUnlabelledClickableElements()).map(item => item.xpath);
330
+ const oobeeAccessibleLabelFlaggedCssSelectors = oobeeAccessibleLabelFlaggedXpaths
331
+ .map(xpath => {
332
+ try {
333
+ const cssSelector = xPathToCss(xpath);
334
+ return cssSelector;
335
+ } catch (e) {
336
+ console.error('Error converting XPath to CSS: ', xpath, e);
337
+ return '';
338
+ }
339
+ })
340
+ .filter(item => item !== '');
341
+
342
+ const axeConfig = getAxeConfiguration({
343
+ enableWcagAaa,
344
+ gradingReadabilityFlag,
345
+ disableOobee,
406
346
  });
407
347
 
348
+ axe.configure(axeConfig);
349
+
408
350
  // removed needsReview condition
409
351
  const defaultResultTypes: resultGroups[] = ['violations', 'passes', 'incomplete'];
410
352
 
@@ -418,102 +360,7 @@ export const runAxeScript = async ({
418
360
  }
419
361
  // handle css id selectors that start with a digit
420
362
  const escapedCssSelectors =
421
- oobeeAccessibleLabelFlaggedCssSelectors.map(escapeCSSSelector);
422
-
423
- function framesCheck(cssSelector: string): {
424
- doc: Document;
425
- remainingSelector: string;
426
- } {
427
- let doc = document; // Start with the main document
428
- let remainingSelector = ''; // To store the last part of the selector
429
- let targetIframe = null;
430
-
431
- // Split the selector into parts at "> html"
432
- const diffParts = cssSelector.split(/\s*>\s*html\s*/);
433
-
434
- for (let i = 0; i < diffParts.length - 1; i++) {
435
- let iframeSelector = `${diffParts[i].trim()}`;
436
-
437
- // Add back '> html' to the current part
438
- if (i > 0) {
439
- iframeSelector = `html > ${iframeSelector}`;
440
- }
441
-
442
- let frameset = null;
443
- // Find the iframe using the current document context
444
- if (doc.querySelector('frameset')) {
445
- frameset = doc.querySelector('frameset');
446
- }
447
-
448
- if (frameset) {
449
- doc = frameset;
450
- iframeSelector = iframeSelector.split('body >')[1].trim();
451
- }
452
- targetIframe = doc.querySelector(iframeSelector);
453
-
454
- if (targetIframe && targetIframe.contentDocument) {
455
- // Update the document to the iframe's contentDocument
456
- doc = targetIframe.contentDocument;
457
- } else {
458
- console.warn(
459
- `Iframe not found or contentDocument inaccessible for selector: ${iframeSelector}`,
460
- );
461
- return { doc, remainingSelector: cssSelector }; // Return original selector if iframe not found
462
- }
463
- }
464
-
465
- // The last part is the remaining CSS selector
466
- remainingSelector = diffParts[diffParts.length - 1].trim();
467
-
468
- // Remove any leading '>' combinators from remainingSelector
469
- remainingSelector = `html${remainingSelector}`;
470
-
471
- return { doc, remainingSelector };
472
- }
473
-
474
- function findElementByCssSelector(cssSelector: string): string | null {
475
- let doc = document;
476
-
477
- // Check if the selector includes 'frame' or 'iframe' and update doc and selector
478
-
479
- if (/\s*>\s*html\s*/.test(cssSelector)) {
480
- const inFrames = framesCheck(cssSelector);
481
- doc = inFrames.doc;
482
- cssSelector = inFrames.remainingSelector;
483
- }
484
-
485
- // Query the element in the document (including inside frames)
486
- let element = doc.querySelector(cssSelector);
487
-
488
- // Handle Shadow DOM if the element is not found
489
- if (!element) {
490
- const shadowRoots = [];
491
- const allElements = document.querySelectorAll('*');
492
-
493
- // Look for elements with shadow roots
494
- allElements.forEach(el => {
495
- if (el.shadowRoot) {
496
- shadowRoots.push(el.shadowRoot);
497
- }
498
- });
499
-
500
- // Search inside each shadow root for the element
501
- for (const shadowRoot of shadowRoots) {
502
- const shadowElement = shadowRoot.querySelector(cssSelector);
503
- if (shadowElement) {
504
- element = shadowElement; // Found the element inside shadow DOM
505
- break;
506
- }
507
- }
508
- }
509
-
510
- if (element) {
511
- return element.outerHTML;
512
- }
513
-
514
- console.warn(`Unable to find element for css selector: ${cssSelector}`);
515
- return null;
516
- }
363
+ oobeeAccessibleLabelFlaggedCssSelectors.map(escapeCssSelector);
517
364
 
518
365
  // Add oobee violations to Axe's report
519
366
  const oobeeAccessibleLabelViolations = {
@@ -560,11 +407,17 @@ export const runAxeScript = async ({
560
407
  {
561
408
  selectors,
562
409
  saflyIconSelector,
563
- customAxeConfig,
564
410
  disableOobee,
565
411
  enableWcagAaa,
566
- oobeeAccessibleLabelFlaggedCssSelectors,
567
412
  gradingReadabilityFlag,
413
+ evaluateAltTextFunctionString: evaluateAltText.toString(),
414
+ escapeCssSelectorFunctionString: escapeCssSelector.toString(),
415
+ framesCheckFunctionString: framesCheck.toString(),
416
+ findElementByCssSelectorFunctionString: findElementByCssSelector.toString(),
417
+ getAxeConfigurationFunctionString: getAxeConfiguration.toString(),
418
+ flagUnlabelledClickableElementsFunctionString:
419
+ flagUnlabelledClickableElements.toString(),
420
+ xPathToCssFunctionString: xPathToCss.toString(),
568
421
  },
569
422
  );
570
423
 
@@ -29,6 +29,7 @@ import {
29
29
  getBlackListedPatterns,
30
30
  urlWithoutAuth,
31
31
  waitForPageLoaded,
32
+ initModifiedUserAgent,
32
33
  } from '../constants/common.js';
33
34
  import { areLinksEqual, isFollowStrategy } from '../utils.js';
34
35
  import {
@@ -455,6 +456,8 @@ const crawlDomain = async ({
455
456
  userDataDir = process.env.CRAWLEE_HEADLESS !== '0' ? userDataDirectory : '';
456
457
  }
457
458
 
459
+ await initModifiedUserAgent(browser, playwrightDeviceDetailsObject);
460
+
458
461
  const crawler = new crawlee.PlaywrightCrawler({
459
462
  launchContext: {
460
463
  launcher: constants.launcher,
@@ -847,7 +850,7 @@ const crawlDomain = async ({
847
850
  // when max pages have been scanned, scan will abort and all relevant pages still opened will close instantly.
848
851
  // a browser close error will then be flagged. Since this is an intended behaviour, this error will be excluded.
849
852
  if (!isAbortingScanNow) {
850
- urlsCrawled.error.push({ url: request.url });
853
+ urlsCrawled.error.push({ url: request.url, pageTitle: request.url, actualUrl: request.url });
851
854
  }
852
855
  }
853
856
  },
@@ -856,7 +859,8 @@ const crawlDomain = async ({
856
859
  numScanned: urlsCrawled.scanned.length,
857
860
  urlScanned: request.url,
858
861
  });
859
- urlsCrawled.error.push({ url: request.url });
862
+ urlsCrawled.error.push({ url: request.url, pageTitle: request.url, actualUrl: request.url });
863
+
860
864
  crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
861
865
  },
862
866
  maxRequestsPerCrawl: Infinity,
@@ -10,6 +10,7 @@ import {
10
10
  isFilePath,
11
11
  convertLocalFileToPath,
12
12
  convertPathToLocalFile,
13
+ initModifiedUserAgent,
13
14
  } from '../constants/common.js';
14
15
  import { runPdfScan, mapPdfScanResults, doPdfScreenshots } from './pdfScanFunc.js';
15
16
  import { guiInfoLog } from '../logs.js';
@@ -142,6 +143,7 @@ const crawlLocalFile = async (
142
143
  uuidToPdfMapping[pdfFileName] = trimmedUrl;
143
144
 
144
145
  if (!isUrlPdf(request.url)) {
146
+ await initModifiedUserAgent(browser);
145
147
  const browserContext = await constants.launcher.launchPersistentContext('', {
146
148
  headless: false,
147
149
  ...getPlaywrightLaunchOptions(browser),
@@ -17,6 +17,7 @@ import {
17
17
  urlWithoutAuth,
18
18
  waitForPageLoaded,
19
19
  isFilePath,
20
+ initModifiedUserAgent,
20
21
  } from '../constants/common.js';
21
22
  import { areLinksEqual, isWhitelistedContentType, isFollowStrategy } from '../utils.js';
22
23
  import { handlePdfDownload, runPdfScan, mapPdfScanResults } from './pdfScanFunc.js';
@@ -139,6 +140,7 @@ const crawlSitemap = async (
139
140
  userDataDir = process.env.CRAWLEE_HEADLESS !== '0' ? userDataDirectory : '';
140
141
  }
141
142
 
143
+ await initModifiedUserAgent(browser, playwrightDeviceDetailsObject);
142
144
  const crawler = new crawlee.PlaywrightCrawler({
143
145
  launchContext: {
144
146
  launcher: constants.launcher,
@@ -271,8 +273,8 @@ const crawlSitemap = async (
271
273
  return;
272
274
  }
273
275
 
274
- const contentType = response.headers()['content-type'];
275
- const status = response.status();
276
+ const contentType = response?.headers?.()['content-type'] || '';
277
+ const status = response ? response.status() : 0;
276
278
 
277
279
  if (blacklistedPatterns && !isFollowStrategy(actualUrl, request.url, "same-hostname") && isSkippedUrl(actualUrl, blacklistedPatterns)) {
278
280
  urlsCrawled.userExcluded.push({
@@ -379,7 +381,7 @@ const crawlSitemap = async (
379
381
  numScanned: urlsCrawled.scanned.length,
380
382
  urlScanned: request.url,
381
383
  });
382
- urlsCrawled.error.push({ url: request.url });
384
+ urlsCrawled.error.push(request.url);
383
385
  crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
384
386
  },
385
387
  maxRequestsPerCrawl: Infinity,
@@ -0,0 +1,10 @@
1
/**
 * Makes id/class selectors that begin with a digit valid CSS by replacing the
 * leading digit with its code-point escape, e.g. `#123` -> `#\31 23`.
 *
 * Every `#` or `.` that is immediately followed by a digit is rewritten; the
 * trailing space terminates the escape so following characters are not
 * swallowed into it.
 *
 * If the replacement throws (for instance because a non-string slipped
 * through), the error is logged and the input is returned unchanged.
 *
 * Keep this a self-contained function declaration: the crawler serialises it
 * with `toString()` and evaluates it inside the page.
 *
 * @param selector CSS selector to sanitise.
 * @returns The selector with digit-leading id/class names escaped.
 */
export function escapeCssSelector(selector: string) {
  let escaped = selector;

  try {
    const digitAfterSigil = /([#\.])(\d)/g;
    escaped = selector.replace(digitAfterSigil, (_whole, sigil, firstDigit) => {
      return sigil + '\\3' + firstDigit + ' ';
    });
  } catch (err) {
    console.error(`error escaping css selector: ${selector}`, err);
  }

  return escaped;
}
10
+
@@ -0,0 +1,13 @@
1
/**
 * Decides whether an element's `alt` attribute is acceptable.
 *
 * The attribute fails only when, after trimming and lower-casing, it is
 * exactly one of the placeholder words `img`, `image`, `picture`, `photo`
 * or `graphic`. A missing or empty `alt` passes here.
 *
 * Keep this a self-contained function declaration: the crawler serialises it
 * with `toString()` and evaluates it inside the page.
 *
 * @param node Element whose `alt` attribute is inspected.
 * @returns `false` for a placeholder-only alt text, `true` otherwise.
 */
export function evaluateAltText(node: Element) {
  const alt = node.getAttribute('alt');
  if (!alt) {
    return true;
  }

  const normalisedAlt = alt.trim().toLowerCase();
  const isPlaceholder = ['img', 'image', 'picture', 'photo', 'graphic'].includes(normalisedAlt);
  return !isPlaceholder;
}
13
+
@@ -47,8 +47,6 @@ export async function extractAndGradeText(page: Page): Promise<string> {
47
47
  const result =
48
48
  readabilityScore === 0 || readabilityScore > 50 ? '' : readabilityScore.toString(); // Convert readabilityScore to string
49
49
 
50
- const pageUrl = await page.url(); // Get the page URL
51
-
52
50
  return result;
53
51
  } catch (error) {
54
52
  console.error('Error extracting and grading text:', error);
@@ -0,0 +1,28 @@
1
/**
 * Collects the sentences found in every `<p>` element of the current document.
 *
 * Each paragraph's `innerText` is trimmed and matched against a pattern that
 * only accepts runs terminated by `.`, `!` or `?`, so trailing text without
 * closing punctuation is ignored.
 *
 * The function reads the global `document` and keeps everything it needs
 * inside its own body.
 *
 * @returns The trimmed sentences in the order they were matched, or an empty
 * array if anything throws while reading the DOM.
 */
export function extractText(): string[] {
  try {
    // Built once per call instead of once per paragraph. `String.prototype.match`
    // with a global regex starts from index 0 each time, so sharing it is safe.
    const sentencePattern = /[^.!?]*[.!?]+/g;
    const extractedSentences: string[] = [];

    document.querySelectorAll('p').forEach(paragraph => {
      const sentences = paragraph.innerText.trim().match(sentencePattern);
      if (!sentences) {
        return; // no punctuation-terminated sentence in this paragraph
      }
      for (const sentence of sentences) {
        // Every match ends in punctuation, so trimming can never leave ''.
        extractedSentences.push(sentence.trim());
      }
    });

    return extractedSentences;
  } catch (error) {
    console.error('Error extracting text:', error);
    return []; // empty array, so callers always receive string[]
  }
}
@@ -0,0 +1,46 @@
1
+ import { framesCheck } from "./framesCheck.js";
2
+
3
/**
 * Resolves a CSS selector to the `outerHTML` of the first matching element.
 *
 * Lookup order:
 *  1. If the selector contains a `> html` hop, `framesCheck` supplies the
 *     document to search and the selector remaining inside it.
 *  2. That document is queried directly.
 *  3. If nothing matched, the shadow root of each element in the top-level
 *     `document` is queried with the same selector, stopping at the first hit.
 *
 * Keep this a self-contained function declaration (its only outside reference
 * is `framesCheck`): the crawler serialises it with `toString()` and evaluates
 * it inside the page.
 *
 * @param cssSelector Selector to resolve; may span frames via `> html`.
 * @returns The element's `outerHTML`, or `null` when no element matches or the
 * selector is rejected by the DOM as invalid.
 */
export function findElementByCssSelector(cssSelector: string): string | null {
  let doc = document;
  // Work on a local copy instead of reassigning the parameter.
  let selector = cssSelector;

  // A "> html" hop means the target lives inside a frame/iframe document.
  if (/\s*>\s*html\s*/.test(selector)) {
    const inFrames = framesCheck(selector);
    doc = inFrames.doc;
    selector = inFrames.remainingSelector;
  }

  let element: Element | null = null;

  try {
    // Query the resolved document first (this covers content inside frames).
    element = doc.querySelector(selector);

    // Fall back to shadow roots hosted in the top-level document.
    if (!element) {
      const hosts = document.querySelectorAll('*');
      for (let i = 0; i < hosts.length && !element; i++) {
        const shadowRoot = hosts[i].shadowRoot;
        if (shadowRoot) {
          element = shadowRoot.querySelector(selector);
        }
      }
    }
  } catch (e) {
    // querySelector throws on a syntactically invalid selector; honour the
    // `string | null` contract instead of letting the exception escape.
    console.warn(`Invalid css selector: ${selector}`, e);
    return null;
  }

  if (element) {
    return element.outerHTML;
  }

  console.warn(`Unable to find element for css selector: ${selector}`);
  return null;
}
46
+