@govtechsg/oobee 0.10.42 → 0.10.43

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
-import crawlee, { CrawlingContext, PlaywrightGotoOptions } from 'crawlee';
+import crawlee, { CrawlingContext, PlaywrightGotoOptions, Request } from 'crawlee';
 import axe, { AxeResults, ImpactValue, NodeResult, Result, resultGroups, TagValue } from 'axe-core';
 import { BrowserContext, Page } from 'playwright';
 import {
@@ -18,7 +18,7 @@ import { framesCheck } from './custom/framesCheck.js';
 import { findElementByCssSelector } from './custom/findElementByCssSelector.js';
 import { getAxeConfiguration } from './custom/getAxeConfiguration.js';
 import { flagUnlabelledClickableElements } from './custom/flagUnlabelledClickableElements.js';
-import { xPathToCss } from './custom/xPathToCss.js';
+import xPathToCss from './custom/xPathToCss.js';
 
 // types
 interface AxeResultsWithScreenshot extends AxeResults {
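The xPathToCss change above switches from a named import to a default import, which only resolves if the module itself now uses `export default`. A minimal sketch of the assumed module shape (the conversion body is illustrative, not the package's actual implementation):

```ts
// Assumed shape of ./custom/xPathToCss.js after this release: the converter is
// the module's default export, so the old `import { xPathToCss }` form would
// now bind undefined and fail at call time.
export default function xPathToCss(xpath: string): string {
  // Illustrative conversion only: turn steps like div[2] into div:nth-of-type(2).
  return xpath
    .split('/')
    .filter(Boolean)
    .map(step => step.replace(/^(\w+)\[(\d+)\]$/, '$1:nth-of-type($2)'))
    .join(' > ');
}
```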
@@ -118,13 +118,13 @@ export const filterAxeResults = (
 
     if (conformance[0] !== 'best-practice' && !wcagRegex.test(conformance[0])) {
       conformance.sort((a, b) => {
-        if (wcagRegex.test(a) && !wcagRegex.test(b)) {
-          return -1;
-        }
-        if (!wcagRegex.test(a) && wcagRegex.test(b)) {
-          return 1;
-        }
-        return 0;
+        if (wcagRegex.test(a) && !wcagRegex.test(b)) {
+          return -1;
+        }
+        if (!wcagRegex.test(a) && wcagRegex.test(b)) {
+          return 1;
+        }
+        return 0;
       });
     }
 
@@ -166,7 +166,6 @@ export const filterAxeResults = (
   };
 
   nodes.forEach(node => {
-    const { impact } = node;
     const hasWcagA = conformance.some(tag => /^wcag\d*a$/.test(tag));
     const hasWcagAA = conformance.some(tag => /^wcag\d*aa$/.test(tag));
     // const hasWcagAAA = conformance.some(tag => /^wcag\d*aaa$/.test(tag));
@@ -255,7 +254,7 @@ export const runAxeScript = async ({
     let mutationCount = 0;
     const MAX_MUTATIONS = 250;
     const MAX_SAME_MUTATION_LIMIT = 10;
-    const mutationHash = {};
+    const mutationHash: Record<string, number> = {};
 
     const observer = new MutationObserver(mutationsList => {
       clearTimeout(timeout);
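Annotating `mutationHash` as `Record<string, number>` is what lets the repeated-mutation counter index the object with computed string keys; under strict TypeScript a bare `{}` literal has no index signature and rejects that access. A short illustration of the difference:

```ts
const untyped = {};
// untyped['DIV-class'] = 1; // TS7053: no index signature on type '{}'

const mutationHash: Record<string, number> = {};
mutationHash['DIV-class'] = (mutationHash['DIV-class'] ?? 0) + 1; // OK
```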
@@ -309,6 +308,8 @@ export const runAxeScript = async ({
     silentLogger.warn(`Error while checking for DOM mutations: ${e}`);
   }
 
+  // Omit logging of browser console errors to reduce unnecessary verbosity
+  /*
   page.on('console', msg => {
     const type = msg.type();
     if (type === 'error') {
@@ -317,6 +318,7 @@ export const runAxeScript = async ({
       silentLogger.log({ level: 'info', message: msg.text() });
     }
   });
+  */
 
   const disableOobee = ruleset.includes(RuleFlags.DISABLE_OOBEE);
   const enableWcagAaa = ruleset.includes(RuleFlags.ENABLE_WCAG_AAA);
@@ -399,7 +401,7 @@ export const runAxeScript = async ({
         help: 'Clickable elements (i.e. elements with mouse-click interaction) must have accessible labels.',
         helpUrl: 'https://www.deque.com/blog/accessible-aria-buttons',
         nodes: escapedCssSelectors
-          .map(cssSelector => ({
+          .map((cssSelector: string): NodeResult => ({
            html: findElementByCssSelector(cssSelector),
            target: [cssSelector],
            impact: 'serious' as ImpactValue,
@@ -443,8 +445,7 @@ export const runAxeScript = async ({
       framesCheckFunctionString: framesCheck.toString(),
       findElementByCssSelectorFunctionString: findElementByCssSelector.toString(),
       getAxeConfigurationFunctionString: getAxeConfiguration.toString(),
-      flagUnlabelledClickableElementsFunctionString:
-        flagUnlabelledClickableElements.toString(),
+      flagUnlabelledClickableElementsFunctionString: flagUnlabelledClickableElements.toString(),
       xPathToCssFunctionString: xPathToCss.toString(),
     },
   );
@@ -495,7 +496,7 @@ export const postNavigationHooks = [
   },
 ];
 
-export const failedRequestHandler = async ({ request }) => {
+export const failedRequestHandler = async ({ request }: { request: Request }) => {
   guiInfoLog(guiInfoStatusTypes.ERROR, { numScanned: 0, urlScanned: request.url });
   crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
 };
@@ -9,7 +9,6 @@ import https from 'https';
 import type { BatchAddRequestsResult } from '@crawlee/types';
 import {
   createCrawleeSubFolders,
-  preNavigationHooks,
   runAxeScript,
   isUrlPdf,
 } from './commonCrawlerFunc.js';
@@ -19,6 +18,7 @@ import constants, {
   guiInfoStatusTypes,
   cssQuerySelectors,
   RuleFlags,
+  STATUS_CODE_METADATA,
 } from '../constants/constants.js';
 import {
   getPlaywrightLaunchOptions,
@@ -26,7 +26,6 @@ import {
   isSkippedUrl,
   isDisallowedInRobotsTxt,
   getUrlsFromRobotsTxt,
-  getBlackListedPatterns,
   urlWithoutAuth,
   waitForPageLoaded,
   initModifiedUserAgent,
@@ -116,13 +115,12 @@ const crawlDomain = async ({
     fs.mkdirSync(randomToken);
   }
 
-  const pdfDownloads = [];
-  const uuidToPdfMapping = {};
+  const pdfDownloads: Promise<void>[] = [];
+  const uuidToPdfMapping: Record<string, string> = {};
   const isScanHtml = ['all', 'html-only'].includes(fileTypes);
   const isScanPdfs = ['all', 'pdf-only'].includes(fileTypes);
   const { maxConcurrency } = constants;
   const { playwrightDeviceDetailsObject } = viewportSettings;
-  const isBlacklistedUrl = isBlacklisted(url, blacklistedPatterns);
 
   const httpsAgent = new https.Agent({ rejectUnauthorized: false });
 
@@ -167,8 +165,8 @@ const crawlDomain = async ({
   const httpHeadCache = new Map<string, boolean>();
   const isProcessibleUrl = async (url: string): Promise<boolean> => {
     if (httpHeadCache.has(url)) {
-      silentLogger.info('cache hit', url, httpHeadCache.get(url));
-      return false; // return false to avoid processing the url again
+      silentLogger.info(`Skipping request as URL has been processed before ${url}}`);
+      return false; // return false to avoid processing the same url again
     }
 
     try {
@@ -490,56 +488,35 @@ const crawlDomain = async ({
       return new Promise(resolve => {
        let timeout;
        let mutationCount = 0;
-       const MAX_MUTATIONS = 250;
-       const MAX_SAME_MUTATION_LIMIT = 10;
-       const mutationHash = {};
-
-       const observer = new MutationObserver(mutationsList => {
+       const MAX_MUTATIONS = 250; // stop if things never quiet down
+       const OBSERVER_TIMEOUT = 5000; // hard cap on total wait
+
+       const observer = new MutationObserver(() => {
          clearTimeout(timeout);
-
-         mutationCount += 1;
-
+
+         mutationCount++;
          if (mutationCount > MAX_MUTATIONS) {
            observer.disconnect();
-           resolve('Too many mutations detected');
+           resolve('Too many mutations, exiting.');
+           return;
          }
-
-         // To handle scenario where DOM elements are constantly changing and unable to exit
-         mutationsList.forEach(mutation => {
-           let mutationKey;
-
-           if (mutation.target instanceof Element) {
-             Array.from(mutation.target.attributes).forEach(attr => {
-               mutationKey = `${mutation.target.nodeName}-${attr.name}`;
-
-               if (mutationKey) {
-                 if (!mutationHash[mutationKey]) {
-                   mutationHash[mutationKey] = 1;
-                 } else {
-                   mutationHash[mutationKey]++;
-                 }
-
-                 if (mutationHash[mutationKey] >= MAX_SAME_MUTATION_LIMIT) {
-                   observer.disconnect();
-                   resolve(`Repeated mutation detected for ${mutationKey}`);
-                 }
-               }
-             });
-           }
-         });
-
+
+         // restart quiet-period timer
          timeout = setTimeout(() => {
            observer.disconnect();
-           resolve('DOM stabilized after mutations.');
+           resolve('DOM stabilized.');
          }, 1000);
        });
-
+
+       // overall timeout in case the page never settles
        timeout = setTimeout(() => {
          observer.disconnect();
-         resolve('No mutations detected, exit from idle state');
-       }, 1000);
-
-       observer.observe(document, { childList: true, subtree: true, attributes: true });
+         resolve('Observer timeout reached.');
+       }, OBSERVER_TIMEOUT);
+
+       // **HERE**: select the real DOM node inside evaluate
+       const root = document.documentElement;
+       observer.observe(root, { childList: true, subtree: true });
      });
    });
 
@@ -641,10 +618,12 @@ const crawlDomain = async ({
          numScanned: urlsCrawled.scanned.length,
          urlScanned: request.url,
        });
-       urlsCrawled.blacklisted.push({
+       urlsCrawled.userExcluded.push({
          url: request.url,
          pageTitle: request.url,
-         actualUrl: actualUrl, // i.e. actualUrl
+         actualUrl: request.url, // because about:blank is not useful
+         metadata: STATUS_CODE_METADATA[1],
+         httpStatusCode: 0,
        });
 
        return;
@@ -666,10 +645,12 @@ const crawlDomain = async ({
          numScanned: urlsCrawled.scanned.length,
          urlScanned: request.url,
        });
-       urlsCrawled.blacklisted.push({
+       urlsCrawled.userExcluded.push({
          url: request.url,
          pageTitle: request.url,
-         actualUrl: actualUrl, // i.e. actualUrl
+         actualUrl: actualUrl, // because about:blank is not useful
+         metadata: STATUS_CODE_METADATA[1],
+         httpStatusCode: 0,
        });
 
        return;
@@ -680,38 +661,16 @@ const crawlDomain = async ({
          url: request.url,
          pageTitle: request.url,
          actualUrl: actualUrl,
+         metadata: STATUS_CODE_METADATA[0],
+         httpStatusCode: 0,
        });
-
-       await enqueueProcess(page, enqueueLinks, browserContext);
-       return;
-     }
-
-     if (response && response.status() === 403) {
-       guiInfoLog(guiInfoStatusTypes.SKIPPED, {
-         numScanned: urlsCrawled.scanned.length,
-         urlScanned: request.url,
-       });
-       urlsCrawled.forbidden.push({
-         url: request.url,
-         pageTitle: request.url,
-         actualUrl: actualUrl, // i.e. actualUrl
-       });
-
-       return;
-     }
-
-     if (response && response.status() !== 200) {
-
+
        guiInfoLog(guiInfoStatusTypes.SKIPPED, {
          numScanned: urlsCrawled.scanned.length,
          urlScanned: request.url,
        });
-       urlsCrawled.invalid.push({
-         url: request.url,
-         pageTitle: request.url,
-         actualUrl: actualUrl, // i.e. actualUrl
-       });
 
+       await enqueueProcess(page, enqueueLinks, browserContext);
        return;
      }
 
@@ -733,6 +692,22 @@ const crawlDomain = async ({
        return;
      }
 
+     const responseStatus = response?.status();
+     if (responseStatus && responseStatus >= 300) {
+       guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+         numScanned: urlsCrawled.scanned.length,
+         urlScanned: request.url,
+       });
+       urlsCrawled.userExcluded.push({
+         url: request.url,
+         pageTitle: request.url,
+         actualUrl,
+         metadata: STATUS_CODE_METADATA[responseStatus] || STATUS_CODE_METADATA[599],
+         httpStatusCode: responseStatus,
+       });
+       return;
+     }
+
      const results = await runAxeScript({ includeScreenshots, page, randomToken, ruleset });
 
      if (isRedirected) {
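The block above replaces the earlier dedicated 403 and non-200 branches: any response status of 300 or above now lands in `urlsCrawled.userExcluded`, with a human-readable metadata string looked up by status code and entry 599 serving as the catch-all. A sketch of the lookup-with-fallback pattern (table contents here are placeholders; the real strings live in src/constants/constants.js):

```ts
// Placeholder table; the real STATUS_CODE_METADATA (including the internal
// sentinel entries 0-2 used elsewhere in this diff) is defined in constants.
const STATUS_CODE_METADATA: Record<number, string> = {
  2: 'crawler errored',  // placeholder wording
  403: '403 Forbidden',  // placeholder wording
  599: 'unknown status', // fallback for any unmapped status
};

function metadataForStatus(status?: number): string {
  if (typeof status !== 'number') return STATUS_CODE_METADATA[2]; // no response at all
  return STATUS_CODE_METADATA[status] ?? STATUS_CODE_METADATA[599];
}

console.log(metadataForStatus(403)); // '403 Forbidden'
console.log(metadataForStatus(502)); // 'unknown status' (falls back to 599)
console.log(metadataForStatus());    // 'crawler errored'
```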
@@ -790,10 +765,12 @@ const crawlDomain = async ({
          numScanned: urlsCrawled.scanned.length,
          urlScanned: request.url,
        });
-       urlsCrawled.blacklisted.push({
+       urlsCrawled.userExcluded.push({
          url: request.url,
          pageTitle: request.url,
-         actualUrl: actualUrl, // i.e. actualUrl
+         actualUrl: actualUrl, // because about:blank is not useful
+         metadata: STATUS_CODE_METADATA[1],
+         httpStatusCode: 0,
        });
 
      }
@@ -833,18 +810,39 @@ const crawlDomain = async ({
      // when max pages have been scanned, scan will abort and all relevant pages still opened will close instantly.
      // a browser close error will then be flagged. Since this is an intended behaviour, this error will be excluded.
      if (!isAbortingScanNow) {
-       urlsCrawled.error.push({ url: request.url, pageTitle: request.url, actualUrl: request.url });
+       guiInfoLog(guiInfoStatusTypes.ERROR, {
+         numScanned: urlsCrawled.scanned.length,
+         urlScanned: request.url,
+       });
+
+       urlsCrawled.error.push({
+         url: request.url,
+         pageTitle: request.url,
+         actualUrl: request.url,
+         metadata: STATUS_CODE_METADATA[2]
+       });
      }
    }
  },
- failedRequestHandler: async ({ request }) => {
+ failedRequestHandler: async ({ request, response }) => {
    guiInfoLog(guiInfoStatusTypes.ERROR, {
      numScanned: urlsCrawled.scanned.length,
      urlScanned: request.url,
    });
-   urlsCrawled.error.push({ url: request.url, pageTitle: request.url, actualUrl: request.url });
-
-   crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
+
+   const status = response?.status();
+   const metadata = typeof status === 'number'
+     ? (STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599])
+     : STATUS_CODE_METADATA[2];
+
+   urlsCrawled.error.push({
+     url: request.url,
+     pageTitle: request.url,
+     actualUrl: request.url,
+     metadata,
+     httpStatusCode: typeof status === 'number' ? status : 0,
+   });
+
  },
  maxRequestsPerCrawl: Infinity,
  maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
@@ -1,27 +1,29 @@
 import fs from 'fs';
-import { chromium } from 'playwright';
+import { chromium, Page } from 'playwright';
 import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
 import constants, { guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
 import { silentLogger, guiInfoLog } from '../logs.js';
 import crawlDomain from './crawlDomain.js';
 import crawlSitemap from './crawlSitemap.js';
+import { EnqueueStrategy } from 'crawlee';
+import { ViewportSettingsClass } from '../combine.js';
 
 const crawlIntelligentSitemap = async (
- url,
- randomToken,
- host,
- viewportSettings,
- maxRequestsPerCrawl,
- browser,
- userDataDirectory,
- strategy,
- specifiedMaxConcurrency,
- fileTypes,
- blacklistedPatterns,
- includeScreenshots,
- followRobots,
- extraHTTPHeaders,
- safeMode,
+ url: string,
+ randomToken: string,
+ host: string,
+ viewportSettings: ViewportSettingsClass,
+ maxRequestsPerCrawl: number,
+ browser: string,
+ userDataDirectory: string,
+ strategy: EnqueueStrategy,
+ specifiedMaxConcurrency: number,
+ fileTypes: string,
+ blacklistedPatterns: string[],
+ includeScreenshots: boolean,
+ followRobots: boolean,
+ extraHTTPHeaders: Record<string, string>,
+ safeMode: boolean,
 ) => {
  let urlsCrawledFinal;
  let urlsCrawled;
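The fifteen positional parameters are now individually typed, which catches wrong arity and obviously wrong types, though adjacent arguments of the same type can still be swapped silently. For comparison, a purely illustrative options-object shape (not the package's API) that would make call sites self-describing:

```ts
// Illustrative alternative only, not what oobee does: the same typed inputs
// gathered into one interface, reusing the package's own imports from the diff.
import { EnqueueStrategy } from 'crawlee';
import { ViewportSettingsClass } from '../combine.js';

interface CrawlIntelligentSitemapOptions {
  url: string;
  randomToken: string;
  host: string;
  viewportSettings: ViewportSettingsClass;
  maxRequestsPerCrawl: number;
  browser: string;
  userDataDirectory: string;
  strategy: EnqueueStrategy;
  specifiedMaxConcurrency: number;
  fileTypes: string;
  blacklistedPatterns: string[];
  includeScreenshots: boolean;
  followRobots: boolean;
  extraHTTPHeaders: Record<string, string>;
  safeMode: boolean;
}

const crawlIntelligentSitemapAlt = async (opts: CrawlIntelligentSitemapOptions) => {
  /* same body, reading opts.url, opts.host, ... */
};
```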
@@ -37,7 +39,7 @@ const crawlIntelligentSitemap = async (
    fs.mkdirSync(randomToken);
  }
 
- function getHomeUrl(parsedUrl) {
+ function getHomeUrl(parsedUrl: string) {
    const urlObject = new URL(parsedUrl);
    if (urlObject.username !== '' && urlObject.password !== '') {
      return `${urlObject.protocol}//${urlObject.username}:${urlObject.password}@${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
@@ -46,7 +48,7 @@ const crawlIntelligentSitemap = async (
    return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
  }
 
- async function findSitemap(link) {
+ async function findSitemap(link: string) {
    const homeUrl = getHomeUrl(link);
    let sitemapLinkFound = false;
    let sitemapLink = '';
@@ -70,7 +72,7 @@ const crawlIntelligentSitemap = async (
    return sitemapExist ? sitemapLink : '';
  }
 
- const checkUrlExists = async (page, parsedUrl) => {
+ const checkUrlExists = async (page: Page, parsedUrl: string) => {
    try {
      const response = await page.goto(parsedUrl);
      if (response.ok()) {