@govtechsg/oobee 0.10.36 → 0.10.42

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. package/.github/workflows/docker-test.yml +1 -1
  2. package/DETAILS.md +3 -3
  3. package/INTEGRATION.md +142 -53
  4. package/README.md +17 -0
  5. package/REPORTS.md +362 -0
  6. package/exclusions.txt +4 -1
  7. package/package.json +2 -2
  8. package/src/constants/cliFunctions.ts +0 -7
  9. package/src/constants/common.ts +39 -1
  10. package/src/constants/constants.ts +9 -8
  11. package/src/crawlers/commonCrawlerFunc.ts +95 -220
  12. package/src/crawlers/crawlDomain.ts +10 -23
  13. package/src/crawlers/crawlLocalFile.ts +2 -0
  14. package/src/crawlers/crawlSitemap.ts +6 -4
  15. package/src/crawlers/custom/escapeCssSelector.ts +10 -0
  16. package/src/crawlers/custom/evaluateAltText.ts +13 -0
  17. package/src/crawlers/custom/extractAndGradeText.ts +0 -2
  18. package/src/crawlers/custom/extractText.ts +28 -0
  19. package/src/crawlers/custom/findElementByCssSelector.ts +46 -0
  20. package/src/crawlers/custom/flagUnlabelledClickableElements.ts +982 -842
  21. package/src/crawlers/custom/framesCheck.ts +51 -0
  22. package/src/crawlers/custom/getAxeConfiguration.ts +126 -0
  23. package/src/crawlers/custom/gradeReadability.ts +30 -0
  24. package/src/crawlers/custom/xPathToCss.ts +178 -0
  25. package/src/crawlers/pdfScanFunc.ts +67 -26
  26. package/src/mergeAxeResults.ts +535 -132
  27. package/src/npmIndex.ts +130 -62
  28. package/src/screenshotFunc/htmlScreenshotFunc.ts +1 -1
  29. package/src/screenshotFunc/pdfScreenshotFunc.ts +34 -1
  30. package/src/static/ejs/partials/components/ruleOffcanvas.ejs +1 -1
  31. package/src/static/ejs/partials/components/scanAbout.ejs +1 -1
  32. package/src/static/ejs/partials/footer.ejs +3 -3
  33. package/src/static/ejs/partials/scripts/reportSearch.ejs +112 -74
  34. package/src/static/ejs/partials/scripts/ruleOffcanvas.ejs +2 -2
  35. package/src/static/ejs/partials/summaryMain.ejs +3 -3
  36. package/src/static/ejs/report.ejs +3 -3
  37. package/src/utils.ts +289 -13
  38. package/src/xPathToCssCypress.ts +178 -0
  39. package/src/crawlers/customAxeFunctions.ts +0 -82
@@ -1,7 +1,6 @@
1
1
  import crawlee, { CrawlingContext, PlaywrightGotoOptions } from 'crawlee';
2
2
  import axe, { AxeResults, ImpactValue, NodeResult, Result, resultGroups, TagValue } from 'axe-core';
3
3
  import { BrowserContext, Page } from 'playwright';
4
- import { xPathToCss } from '../xPathToCss.js';
5
4
  import {
6
5
  axeScript,
7
6
  guiInfoStatusTypes,
@@ -11,10 +10,15 @@ import {
11
10
  import { guiInfoLog, silentLogger } from '../logs.js';
12
11
  import { takeScreenshotForHTMLElements } from '../screenshotFunc/htmlScreenshotFunc.js';
13
12
  import { isFilePath } from '../constants/common.js';
14
- import { customAxeConfig } from './customAxeFunctions.js';
15
- import { flagUnlabelledClickableElements } from './custom/flagUnlabelledClickableElements.js';
16
13
  import { extractAndGradeText } from './custom/extractAndGradeText.js';
17
14
  import { ItemsInfo } from '../mergeAxeResults.js';
15
+ import { evaluateAltText } from './custom/evaluateAltText.js';
16
+ import { escapeCssSelector } from './custom/escapeCssSelector.js';
17
+ import { framesCheck } from './custom/framesCheck.js';
18
+ import { findElementByCssSelector } from './custom/findElementByCssSelector.js';
19
+ import { getAxeConfiguration } from './custom/getAxeConfiguration.js';
20
+ import { flagUnlabelledClickableElements } from './custom/flagUnlabelledClickableElements.js';
21
+ import { xPathToCss } from './custom/xPathToCss.js';
18
22
 
19
23
  // types
20
24
  interface AxeResultsWithScreenshot extends AxeResults {
@@ -65,6 +69,30 @@ type FilteredResults = {
65
69
  actualUrl?: string;
66
70
  };
67
71
 
72
+ const truncateHtml = (html: string, maxBytes = 1024, suffix = '…'): string => {
73
+ const encoder = new TextEncoder();
74
+ if (encoder.encode(html).length <= maxBytes) return html;
75
+
76
+ let left = 0;
77
+ let right = html.length;
78
+ let result = '';
79
+
80
+ while (left <= right) {
81
+ const mid = Math.floor((left + right) / 2);
82
+ const truncated = html.slice(0, mid) + suffix;
83
+ const bytes = encoder.encode(truncated).length;
84
+
85
+ if (bytes <= maxBytes) {
86
+ result = truncated;
87
+ left = mid + 1;
88
+ } else {
89
+ right = mid - 1;
90
+ }
91
+ }
92
+
93
+ return result;
94
+ };
95
+
68
96
  export const filterAxeResults = (
69
97
  results: AxeResultsWithScreenshot,
70
98
  pageTitle: string,
@@ -86,17 +114,17 @@ export const filterAxeResults = (
86
114
  const conformance = tags.filter(tag => tag.startsWith('wcag') || tag === 'best-practice');
87
115
 
88
116
  // handle rare cases where conformance level is not the first element
89
- const levels = ['wcag2a', 'wcag2aa', 'wcag2aaa'];
90
- if (conformance[0] !== 'best-practice' && !levels.includes(conformance[0])) {
117
+ const wcagRegex = /^wcag\d+a+$/;
118
+
119
+ if (conformance[0] !== 'best-practice' && !wcagRegex.test(conformance[0])) {
91
120
  conformance.sort((a, b) => {
92
- if (levels.includes(a)) {
93
- return -1;
94
- }
95
- if (levels.includes(b)) {
96
- return 1;
97
- }
98
-
99
- return 0;
121
+ if (wcagRegex.test(a) && !wcagRegex.test(b)) {
122
+ return -1;
123
+ }
124
+ if (!wcagRegex.test(a) && wcagRegex.test(b)) {
125
+ return 1;
126
+ }
127
+ return 0;
100
128
  });
101
129
  }
102
130
 
@@ -120,6 +148,7 @@ export const filterAxeResults = (
120
148
  if (html.includes('</script>')) {
121
149
  finalHtml = html.replaceAll('</script>', '&lt;/script>');
122
150
  }
151
+ finalHtml = truncateHtml(finalHtml);
123
152
 
124
153
  const xpath = target.length === 1 && typeof target[0] === 'string' ? target[0] : null;
125
154
 
@@ -138,15 +167,13 @@ export const filterAxeResults = (
138
167
 
139
168
  nodes.forEach(node => {
140
169
  const { impact } = node;
141
- const hasWcag2a = conformance.includes('wcag2a');
142
- const hasWcag2aa = conformance.includes('wcag2aa');
143
- const hasWcag2aaa = conformance.includes('wcag2aaa');
170
+ const hasWcagA = conformance.some(tag => /^wcag\d*a$/.test(tag));
171
+ const hasWcagAA = conformance.some(tag => /^wcag\d*aa$/.test(tag));
172
+ // const hasWcagAAA = conformance.some(tag => /^wcag\d*aaa$/.test(tag));
144
173
 
145
174
  if (displayNeedsReview) {
146
175
  addTo(needsReview, node);
147
- } else if (hasWcag2aaa) {
148
- addTo(goodToFix, node);
149
- } else if (hasWcag2a || hasWcag2aa) {
176
+ } else if (hasWcagA || hasWcagAA) {
150
177
  addTo(mustFix, node);
151
178
  } else {
152
179
  addTo(goodToFix, node);
@@ -176,7 +203,10 @@ export const filterAxeResults = (
176
203
  items: [],
177
204
  };
178
205
  }
179
- passed.rules[rule].items.push({ html, screenshotPath: '', message: '', xpath: '' });
206
+
207
+ const finalHtml = truncateHtml(html);
208
+ passed.rules[rule].items.push({ html: finalHtml, screenshotPath: '', message: '', xpath: '' });
209
+
180
210
  passed.totalItems += 1;
181
211
  passed.rules[rule].totalItems += 1;
182
212
  totalItems += 1;
@@ -289,21 +319,6 @@ export const runAxeScript = async ({
289
319
  });
290
320
 
291
321
  const disableOobee = ruleset.includes(RuleFlags.DISABLE_OOBEE);
292
- const oobeeAccessibleLabelFlaggedXpaths = disableOobee
293
- ? []
294
- : (await flagUnlabelledClickableElements(page)).map(item => item.xpath);
295
- const oobeeAccessibleLabelFlaggedCssSelectors = oobeeAccessibleLabelFlaggedXpaths
296
- .map(xpath => {
297
- try {
298
- const cssSelector = xPathToCss(xpath);
299
- return cssSelector;
300
- } catch (e) {
301
- console.error('Error converting XPath to CSS: ', xpath, e);
302
- return '';
303
- }
304
- })
305
- .filter(item => item !== '');
306
-
307
322
  const enableWcagAaa = ruleset.includes(RuleFlags.ENABLE_WCAG_AAA);
308
323
 
309
324
  const gradingReadabilityFlag = await extractAndGradeText(page); // Ensure flag is obtained before proceeding
@@ -314,103 +329,52 @@ export const runAxeScript = async ({
314
329
  async ({
315
330
  selectors,
316
331
  saflyIconSelector,
317
- customAxeConfig,
318
332
  disableOobee,
319
333
  enableWcagAaa,
320
- oobeeAccessibleLabelFlaggedCssSelectors,
321
334
  gradingReadabilityFlag,
335
+ evaluateAltTextFunctionString,
336
+ escapeCssSelectorFunctionString,
337
+ framesCheckFunctionString,
338
+ findElementByCssSelectorFunctionString,
339
+ getAxeConfigurationFunctionString,
340
+ flagUnlabelledClickableElementsFunctionString,
341
+ xPathToCssFunctionString,
322
342
  }) => {
323
343
  try {
324
- const evaluateAltText = (node: Element) => {
325
- const altText = node.getAttribute('alt');
326
- const confusingTexts = ['img', 'image', 'picture', 'photo', 'graphic'];
327
-
328
- if (altText) {
329
- const trimmedAltText = altText.trim().toLowerCase();
330
- if (confusingTexts.includes(trimmedAltText)) {
331
- return false;
332
- }
333
- }
334
- return true;
335
- };
336
-
337
- // for css id selectors starting with a digit, escape it with the unicode character e.g. #123 -> #\31 23
338
- const escapeCSSSelector = (selector: string) => {
339
- try {
340
- return selector.replace(
341
- /([#\.])(\d)/g,
342
- (_match, prefix, digit) => `${prefix}\\3${digit} `,
343
- );
344
- } catch (e) {
345
- console.error(`error escaping css selector: ${selector}`, e);
346
- return selector;
347
- }
348
- };
349
-
344
+ // Load functions into the browser context
345
+ eval(evaluateAltTextFunctionString);
346
+ eval(escapeCssSelectorFunctionString);
347
+ eval(framesCheckFunctionString);
348
+ eval(findElementByCssSelectorFunctionString);
349
+ eval(flagUnlabelledClickableElementsFunctionString);
350
+ eval(xPathToCssFunctionString);
351
+ eval(getAxeConfigurationFunctionString);
350
352
  // remove so that axe does not scan
351
353
  document.querySelector(saflyIconSelector)?.remove();
352
354
 
353
- axe.configure({
354
- branding: customAxeConfig.branding,
355
- checks: [
356
- {
357
- ...customAxeConfig.checks[0],
358
- evaluate: evaluateAltText,
359
- },
360
- {
361
- ...customAxeConfig.checks[1],
362
- evaluate: (node: HTMLElement) => {
363
- return !node.dataset.flagged; // fail any element with a data-flagged attribute set to true
364
- },
365
- },
366
- ...(enableWcagAaa
367
- ? [
368
- {
369
- ...customAxeConfig.checks[2],
370
- evaluate: (_node: HTMLElement) => {
371
- if (gradingReadabilityFlag === '') {
372
- return true; // Pass if no readability issues
373
- }
374
- // Dynamically update the grading messages
375
- const gradingCheck = customAxeConfig.checks.find(
376
- check => check.id === 'oobee-grading-text-contents',
377
- );
378
- if (gradingCheck) {
379
- gradingCheck.metadata.messages.incomplete = `The text content is potentially difficult to read, with a Flesch-Kincaid Reading Ease score of ${gradingReadabilityFlag
380
- }.\nThe target passing score is above 50, indicating content readable by university students and lower grade levels.\nA higher score reflects better readability.`;
381
- }
382
-
383
- // Fail if readability issues are detected
384
- },
385
- },
386
- ]
387
- : []),
388
- ],
389
- rules: customAxeConfig.rules
390
- .filter(rule => (disableOobee ? !rule.id.startsWith('oobee') : true))
391
- .concat(
392
- enableWcagAaa
393
- ? [
394
- {
395
- id: 'color-contrast-enhanced',
396
- enabled: true,
397
- tags: ['wcag2aaa', 'wcag146'],
398
- },
399
- {
400
- id: 'identical-links-same-purpose',
401
- enabled: true,
402
- tags: ['wcag2aaa', 'wcag249'],
403
- },
404
- {
405
- id: 'meta-refresh-no-exceptions',
406
- enabled: true,
407
- tags: ['wcag2aaa', 'wcag224', 'wcag325'],
408
- },
409
- ]
410
- : [],
411
- ),
355
+ const oobeeAccessibleLabelFlaggedXpaths = disableOobee
356
+ ? []
357
+ : (await flagUnlabelledClickableElements()).map(item => item.xpath);
358
+ const oobeeAccessibleLabelFlaggedCssSelectors = oobeeAccessibleLabelFlaggedXpaths
359
+ .map(xpath => {
360
+ try {
361
+ const cssSelector = xPathToCss(xpath);
362
+ return cssSelector;
363
+ } catch (e) {
364
+ console.error('Error converting XPath to CSS: ', xpath, e);
365
+ return '';
366
+ }
367
+ })
368
+ .filter(item => item !== '');
369
+
370
+ const axeConfig = getAxeConfiguration({
371
+ enableWcagAaa,
372
+ gradingReadabilityFlag,
373
+ disableOobee,
412
374
  });
413
375
 
376
+ axe.configure(axeConfig);
377
+
414
378
  // removed needsReview condition
415
379
  const defaultResultTypes: resultGroups[] = ['violations', 'passes', 'incomplete'];
416
380
 
@@ -424,102 +388,7 @@ export const runAxeScript = async ({
424
388
  }
425
389
  // handle css id selectors that start with a digit
426
390
  const escapedCssSelectors =
427
- oobeeAccessibleLabelFlaggedCssSelectors.map(escapeCSSSelector);
428
-
429
- function framesCheck(cssSelector: string): {
430
- doc: Document;
431
- remainingSelector: string;
432
- } {
433
- let doc = document; // Start with the main document
434
- let remainingSelector = ''; // To store the last part of the selector
435
- let targetIframe = null;
436
-
437
- // Split the selector into parts at "> html"
438
- const diffParts = cssSelector.split(/\s*>\s*html\s*/);
439
-
440
- for (let i = 0; i < diffParts.length - 1; i++) {
441
- let iframeSelector = `${diffParts[i].trim()}`;
442
-
443
- // Add back '> html' to the current part
444
- if (i > 0) {
445
- iframeSelector = `html > ${iframeSelector}`;
446
- }
447
-
448
- let frameset = null;
449
- // Find the iframe using the current document context
450
- if (doc.querySelector('frameset')) {
451
- frameset = doc.querySelector('frameset');
452
- }
453
-
454
- if (frameset) {
455
- doc = frameset;
456
- iframeSelector = iframeSelector.split('body >')[1].trim();
457
- }
458
- targetIframe = doc.querySelector(iframeSelector);
459
-
460
- if (targetIframe && targetIframe.contentDocument) {
461
- // Update the document to the iframe's contentDocument
462
- doc = targetIframe.contentDocument;
463
- } else {
464
- console.warn(
465
- `Iframe not found or contentDocument inaccessible for selector: ${iframeSelector}`,
466
- );
467
- return { doc, remainingSelector: cssSelector }; // Return original selector if iframe not found
468
- }
469
- }
470
-
471
- // The last part is the remaining CSS selector
472
- remainingSelector = diffParts[diffParts.length - 1].trim();
473
-
474
- // Remove any leading '>' combinators from remainingSelector
475
- remainingSelector = `html${remainingSelector}`;
476
-
477
- return { doc, remainingSelector };
478
- }
479
-
480
- function findElementByCssSelector(cssSelector: string): string | null {
481
- let doc = document;
482
-
483
- // Check if the selector includes 'frame' or 'iframe' and update doc and selector
484
-
485
- if (/\s*>\s*html\s*/.test(cssSelector)) {
486
- const inFrames = framesCheck(cssSelector);
487
- doc = inFrames.doc;
488
- cssSelector = inFrames.remainingSelector;
489
- }
490
-
491
- // Query the element in the document (including inside frames)
492
- let element = doc.querySelector(cssSelector);
493
-
494
- // Handle Shadow DOM if the element is not found
495
- if (!element) {
496
- const shadowRoots = [];
497
- const allElements = document.querySelectorAll('*');
498
-
499
- // Look for elements with shadow roots
500
- allElements.forEach(el => {
501
- if (el.shadowRoot) {
502
- shadowRoots.push(el.shadowRoot);
503
- }
504
- });
505
-
506
- // Search inside each shadow root for the element
507
- for (const shadowRoot of shadowRoots) {
508
- const shadowElement = shadowRoot.querySelector(cssSelector);
509
- if (shadowElement) {
510
- element = shadowElement; // Found the element inside shadow DOM
511
- break;
512
- }
513
- }
514
- }
515
-
516
- if (element) {
517
- return element.outerHTML;
518
- }
519
-
520
- console.warn(`Unable to find element for css selector: ${cssSelector}`);
521
- return null;
522
- }
391
+ oobeeAccessibleLabelFlaggedCssSelectors.map(escapeCssSelector);
523
392
 
524
393
  // Add oobee violations to Axe's report
525
394
  const oobeeAccessibleLabelViolations = {
@@ -566,11 +435,17 @@ export const runAxeScript = async ({
566
435
  {
567
436
  selectors,
568
437
  saflyIconSelector,
569
- customAxeConfig,
570
438
  disableOobee,
571
439
  enableWcagAaa,
572
- oobeeAccessibleLabelFlaggedCssSelectors,
573
440
  gradingReadabilityFlag,
441
+ evaluateAltTextFunctionString: evaluateAltText.toString(),
442
+ escapeCssSelectorFunctionString: escapeCssSelector.toString(),
443
+ framesCheckFunctionString: framesCheck.toString(),
444
+ findElementByCssSelectorFunctionString: findElementByCssSelector.toString(),
445
+ getAxeConfigurationFunctionString: getAxeConfiguration.toString(),
446
+ flagUnlabelledClickableElementsFunctionString:
447
+ flagUnlabelledClickableElements.toString(),
448
+ xPathToCssFunctionString: xPathToCss.toString(),
574
449
  },
575
450
  );
576
451
 
@@ -29,6 +29,7 @@ import {
29
29
  getBlackListedPatterns,
30
30
  urlWithoutAuth,
31
31
  waitForPageLoaded,
32
+ initModifiedUserAgent,
32
33
  } from '../constants/common.js';
33
34
  import { areLinksEqual, isFollowStrategy } from '../utils.js';
34
35
  import {
@@ -455,6 +456,8 @@ const crawlDomain = async ({
455
456
  userDataDir = process.env.CRAWLEE_HEADLESS !== '0' ? userDataDirectory : '';
456
457
  }
457
458
 
459
+ await initModifiedUserAgent(browser, playwrightDeviceDetailsObject);
460
+
458
461
  const crawler = new crawlee.PlaywrightCrawler({
459
462
  launchContext: {
460
463
  launcher: constants.launcher,
@@ -632,7 +635,7 @@ const crawlDomain = async ({
632
635
  }
633
636
 
634
637
  // handle pdfs
635
- if (request.skipNavigation && isUrlPdf(actualUrl)) {
638
+ if (request.skipNavigation && actualUrl === "about:blank") {
636
639
  if (!isScanPdfs) {
637
640
  guiInfoLog(guiInfoStatusTypes.SKIPPED, {
638
641
  numScanned: urlsCrawled.scanned.length,
@@ -658,24 +661,6 @@ const crawlDomain = async ({
658
661
  return;
659
662
  }
660
663
 
661
- const resHeaders = response ? response.headers() : {}; // Safely access response headers
662
- const contentType = resHeaders['content-type'] || ''; // Ensure contentType is defined
663
-
664
- // Skip non-HTML and non-PDF URLs
665
- if (!contentType.includes('text/html') && !contentType.includes('application/pdf')) {
666
- guiInfoLog(guiInfoStatusTypes.SKIPPED, {
667
- numScanned: urlsCrawled.scanned.length,
668
- urlScanned: request.url,
669
- });
670
- urlsCrawled.blacklisted.push({
671
- url: request.url,
672
- pageTitle: request.url,
673
- actualUrl: actualUrl, // i.e. actualUrl
674
- });
675
-
676
- return;
677
- }
678
-
679
664
  if (isBlacklistedFileExtensions(actualUrl, blackListedFileExtensions)) {
680
665
  guiInfoLog(guiInfoStatusTypes.SKIPPED, {
681
666
  numScanned: urlsCrawled.scanned.length,
@@ -701,7 +686,7 @@ const crawlDomain = async ({
701
686
  return;
702
687
  }
703
688
 
704
- if (response.status() === 403) {
689
+ if (response && response.status() === 403) {
705
690
  guiInfoLog(guiInfoStatusTypes.SKIPPED, {
706
691
  numScanned: urlsCrawled.scanned.length,
707
692
  urlScanned: request.url,
@@ -715,7 +700,8 @@ const crawlDomain = async ({
715
700
  return;
716
701
  }
717
702
 
718
- if (response.status() !== 200) {
703
+ if (response && response.status() !== 200) {
704
+
719
705
  guiInfoLog(guiInfoStatusTypes.SKIPPED, {
720
706
  numScanned: urlsCrawled.scanned.length,
721
707
  urlScanned: request.url,
@@ -847,7 +833,7 @@ const crawlDomain = async ({
847
833
  // when max pages have been scanned, scan will abort and all relevant pages still opened will close instantly.
848
834
  // a browser close error will then be flagged. Since this is an intended behaviour, this error will be excluded.
849
835
  if (!isAbortingScanNow) {
850
- urlsCrawled.error.push({ url: request.url });
836
+ urlsCrawled.error.push({ url: request.url, pageTitle: request.url, actualUrl: request.url });
851
837
  }
852
838
  }
853
839
  },
@@ -856,7 +842,8 @@ const crawlDomain = async ({
856
842
  numScanned: urlsCrawled.scanned.length,
857
843
  urlScanned: request.url,
858
844
  });
859
- urlsCrawled.error.push({ url: request.url });
845
+ urlsCrawled.error.push({ url: request.url, pageTitle: request.url, actualUrl: request.url });
846
+
860
847
  crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
861
848
  },
862
849
  maxRequestsPerCrawl: Infinity,
@@ -10,6 +10,7 @@ import {
10
10
  isFilePath,
11
11
  convertLocalFileToPath,
12
12
  convertPathToLocalFile,
13
+ initModifiedUserAgent,
13
14
  } from '../constants/common.js';
14
15
  import { runPdfScan, mapPdfScanResults, doPdfScreenshots } from './pdfScanFunc.js';
15
16
  import { guiInfoLog } from '../logs.js';
@@ -142,6 +143,7 @@ const crawlLocalFile = async (
142
143
  uuidToPdfMapping[pdfFileName] = trimmedUrl;
143
144
 
144
145
  if (!isUrlPdf(request.url)) {
146
+ await initModifiedUserAgent(browser);
145
147
  const browserContext = await constants.launcher.launchPersistentContext('', {
146
148
  headless: false,
147
149
  ...getPlaywrightLaunchOptions(browser),
@@ -17,6 +17,7 @@ import {
17
17
  urlWithoutAuth,
18
18
  waitForPageLoaded,
19
19
  isFilePath,
20
+ initModifiedUserAgent,
20
21
  } from '../constants/common.js';
21
22
  import { areLinksEqual, isWhitelistedContentType, isFollowStrategy } from '../utils.js';
22
23
  import { handlePdfDownload, runPdfScan, mapPdfScanResults } from './pdfScanFunc.js';
@@ -139,6 +140,7 @@ const crawlSitemap = async (
139
140
  userDataDir = process.env.CRAWLEE_HEADLESS !== '0' ? userDataDirectory : '';
140
141
  }
141
142
 
143
+ await initModifiedUserAgent(browser, playwrightDeviceDetailsObject);
142
144
  const crawler = new crawlee.PlaywrightCrawler({
143
145
  launchContext: {
144
146
  launcher: constants.launcher,
@@ -244,7 +246,7 @@ const crawlSitemap = async (
244
246
  return;
245
247
  }
246
248
 
247
- if (isUrlPdf(actualUrl)) {
249
+ if (request.skipNavigation && actualUrl === "about:blank") {
248
250
  if (!isScanPdfs) {
249
251
  guiInfoLog(guiInfoStatusTypes.SKIPPED, {
250
252
  numScanned: urlsCrawled.scanned.length,
@@ -271,8 +273,8 @@ const crawlSitemap = async (
271
273
  return;
272
274
  }
273
275
 
274
- const contentType = response.headers()['content-type'];
275
- const status = response.status();
276
+ const contentType = response?.headers?.()['content-type'] || '';
277
+ const status = response ? response.status() : 0;
276
278
 
277
279
  if (blacklistedPatterns && !isFollowStrategy(actualUrl, request.url, "same-hostname") && isSkippedUrl(actualUrl, blacklistedPatterns)) {
278
280
  urlsCrawled.userExcluded.push({
@@ -379,7 +381,7 @@ const crawlSitemap = async (
379
381
  numScanned: urlsCrawled.scanned.length,
380
382
  urlScanned: request.url,
381
383
  });
382
- urlsCrawled.error.push({ url: request.url });
384
+ urlsCrawled.error.push(request.url);
383
385
  crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
384
386
  },
385
387
  maxRequestsPerCrawl: Infinity,
@@ -0,0 +1,10 @@
1
+ // for css id selectors starting with a digit, escape it with the unicode character e.g. #123 -> #\31 23
2
+ export function escapeCssSelector(selector: string) {
3
+ try {
4
+ return selector.replace(/([#\.])(\d)/g, (_match, prefix, digit) => `${prefix}\\3${digit} `);
5
+ } catch (e) {
6
+ console.error(`error escaping css selector: ${selector}`, e);
7
+ return selector;
8
+ }
9
+ }
10
+
@@ -0,0 +1,13 @@
1
+ export function evaluateAltText(node: Element) {
2
+ const altText = node.getAttribute('alt');
3
+ const confusingTexts = ['img', 'image', 'picture', 'photo', 'graphic'];
4
+
5
+ if (altText) {
6
+ const trimmedAltText = altText.trim().toLowerCase();
7
+ if (confusingTexts.includes(trimmedAltText)) {
8
+ return false;
9
+ }
10
+ }
11
+ return true;
12
+ }
13
+
@@ -47,8 +47,6 @@ export async function extractAndGradeText(page: Page): Promise<string> {
47
47
  const result =
48
48
  readabilityScore === 0 || readabilityScore > 50 ? '' : readabilityScore.toString(); // Convert readabilityScore to string
49
49
 
50
- const pageUrl = await page.url(); // Get the page URL
51
-
52
50
  return result;
53
51
  } catch (error) {
54
52
  console.error('Error extracting and grading text:', error);
@@ -0,0 +1,28 @@
1
+ export function extractText(): string[] {
2
+ try {
3
+ // Extract text content from all specified elements (e.g., paragraphs)
4
+ const elements = document.querySelectorAll('p'); // Adjust selector as needed
5
+ const extractedSentences: string[] = [];
6
+
7
+ elements.forEach(element => {
8
+ const text = element.innerText.trim();
9
+ // Split the text into individual sentences
10
+ const sentencePattern = /[^.!?]*[.!?]+/g; // Match sentences ending with ., !, or ?
11
+ const matches = text.match(sentencePattern);
12
+ if (matches) {
13
+ // Add only sentences that end with punctuation
14
+ matches.forEach(sentence => {
15
+ const trimmedSentence = sentence.trim(); // Trim whitespace from each sentence
16
+ if (trimmedSentence.length > 0) {
17
+ extractedSentences.push(trimmedSentence);
18
+ }
19
+ });
20
+ }
21
+ });
22
+
23
+ return extractedSentences;
24
+ } catch (error) {
25
+ console.error('Error extracting text:', error);
26
+ return []; // Return an empty string in case of an error
27
+ }
28
+ }
@@ -0,0 +1,46 @@
1
+ import { framesCheck } from "./framesCheck.js";
2
+
3
+ export function findElementByCssSelector(cssSelector: string): string | null {
4
+ let doc = document;
5
+
6
+ // Check if the selector includes 'frame' or 'iframe' and update doc and selector
7
+
8
+ if (/\s*>\s*html\s*/.test(cssSelector)) {
9
+ const inFrames = framesCheck(cssSelector);
10
+ doc = inFrames.doc;
11
+ cssSelector = inFrames.remainingSelector;
12
+ }
13
+
14
+ // Query the element in the document (including inside frames)
15
+ let element = doc.querySelector(cssSelector);
16
+
17
+ // Handle Shadow DOM if the element is not found
18
+ if (!element) {
19
+ const shadowRoots = [];
20
+ const allElements = document.querySelectorAll('*');
21
+
22
+ // Look for elements with shadow roots
23
+ allElements.forEach(el => {
24
+ if (el.shadowRoot) {
25
+ shadowRoots.push(el.shadowRoot);
26
+ }
27
+ });
28
+
29
+ // Search inside each shadow root for the element
30
+ for (const shadowRoot of shadowRoots) {
31
+ const shadowElement = shadowRoot.querySelector(cssSelector);
32
+ if (shadowElement) {
33
+ element = shadowElement; // Found the element inside shadow DOM
34
+ break;
35
+ }
36
+ }
37
+ }
38
+
39
+ if (element) {
40
+ return element.outerHTML;
41
+ }
42
+
43
+ console.warn(`Unable to find element for css selector: ${cssSelector}`);
44
+ return null;
45
+ }
46
+