@govtechsg/oobee 0.10.39 → 0.10.43

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
@@ -9,7 +9,6 @@ import https from 'https';
 import type { BatchAddRequestsResult } from '@crawlee/types';
 import {
   createCrawleeSubFolders,
-  preNavigationHooks,
   runAxeScript,
   isUrlPdf,
 } from './commonCrawlerFunc.js';
@@ -19,6 +18,7 @@ import constants, {
   guiInfoStatusTypes,
   cssQuerySelectors,
   RuleFlags,
+  STATUS_CODE_METADATA,
 } from '../constants/constants.js';
 import {
   getPlaywrightLaunchOptions,
@@ -26,7 +26,6 @@ import {
   isSkippedUrl,
   isDisallowedInRobotsTxt,
   getUrlsFromRobotsTxt,
-  getBlackListedPatterns,
   urlWithoutAuth,
   waitForPageLoaded,
   initModifiedUserAgent,
@@ -116,13 +115,12 @@ const crawlDomain = async ({
     fs.mkdirSync(randomToken);
   }
 
-  const pdfDownloads = [];
-  const uuidToPdfMapping = {};
+  const pdfDownloads: Promise<void>[] = [];
+  const uuidToPdfMapping: Record<string, string> = {};
   const isScanHtml = ['all', 'html-only'].includes(fileTypes);
   const isScanPdfs = ['all', 'pdf-only'].includes(fileTypes);
   const { maxConcurrency } = constants;
   const { playwrightDeviceDetailsObject } = viewportSettings;
-  const isBlacklistedUrl = isBlacklisted(url, blacklistedPatterns);
 
   const httpsAgent = new https.Agent({ rejectUnauthorized: false });
 
@@ -167,8 +165,8 @@ const crawlDomain = async ({
   const httpHeadCache = new Map<string, boolean>();
   const isProcessibleUrl = async (url: string): Promise<boolean> => {
     if (httpHeadCache.has(url)) {
-      silentLogger.info('cache hit', url, httpHeadCache.get(url));
-      return false; // return false to avoid processing the url again
+      silentLogger.info(`Skipping request as URL has been processed before ${url}}`);
+      return false; // return false to avoid processing the same url again
     }
 
     try {
@@ -490,56 +488,35 @@ const crawlDomain = async ({
     return new Promise(resolve => {
       let timeout;
       let mutationCount = 0;
-      const MAX_MUTATIONS = 250;
-      const MAX_SAME_MUTATION_LIMIT = 10;
-      const mutationHash = {};
-
-      const observer = new MutationObserver(mutationsList => {
+      const MAX_MUTATIONS = 250; // stop if things never quiet down
+      const OBSERVER_TIMEOUT = 5000; // hard cap on total wait
+
+      const observer = new MutationObserver(() => {
         clearTimeout(timeout);
-
-        mutationCount += 1;
-
+
+        mutationCount++;
         if (mutationCount > MAX_MUTATIONS) {
           observer.disconnect();
-          resolve('Too many mutations detected');
+          resolve('Too many mutations, exiting.');
+          return;
         }
-
-        // To handle scenario where DOM elements are constantly changing and unable to exit
-        mutationsList.forEach(mutation => {
-          let mutationKey;
-
-          if (mutation.target instanceof Element) {
-            Array.from(mutation.target.attributes).forEach(attr => {
-              mutationKey = `${mutation.target.nodeName}-${attr.name}`;
-
-              if (mutationKey) {
-                if (!mutationHash[mutationKey]) {
-                  mutationHash[mutationKey] = 1;
-                } else {
-                  mutationHash[mutationKey]++;
-                }
-
-                if (mutationHash[mutationKey] >= MAX_SAME_MUTATION_LIMIT) {
-                  observer.disconnect();
-                  resolve(`Repeated mutation detected for ${mutationKey}`);
-                }
-              }
-            });
-          }
-        });
-
+
+        // restart quiet-period timer
         timeout = setTimeout(() => {
           observer.disconnect();
-          resolve('DOM stabilized after mutations.');
+          resolve('DOM stabilized.');
         }, 1000);
       });
-
+
+      // overall timeout in case the page never settles
      timeout = setTimeout(() => {
        observer.disconnect();
-        resolve('No mutations detected, exit from idle state');
-      }, 1000);
-
-      observer.observe(document, { childList: true, subtree: true, attributes: true });
+        resolve('Observer timeout reached.');
+      }, OBSERVER_TIMEOUT);
+
+      // **HERE**: select the real DOM node inside evaluate
+      const root = document.documentElement;
+      observer.observe(root, { childList: true, subtree: true });
     });
   });
 
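Pieced together, the new wait logic is a quiet-period MutationObserver: every mutation resets a one-second timer, a mutation budget aborts pages that never quiet down, and a hard five-second cap guarantees the promise always settles. A minimal self-contained sketch of how such a helper could be driven from Playwright (the waitForDomStability name and the page.evaluate wiring are assumptions; the hunk shows only the promise body):

import { Page } from 'playwright';

// Hypothetical wrapper; in the package this logic lives inside an existing helper.
const waitForDomStability = (page: Page): Promise<string> =>
  page.evaluate(
    () =>
      new Promise<string>(resolve => {
        let timeout: ReturnType<typeof setTimeout>;
        let mutationCount = 0;
        const MAX_MUTATIONS = 250; // stop if things never quiet down
        const OBSERVER_TIMEOUT = 5000; // hard cap on total wait

        const observer = new MutationObserver(() => {
          clearTimeout(timeout);
          mutationCount += 1;
          if (mutationCount > MAX_MUTATIONS) {
            observer.disconnect();
            resolve('Too many mutations, exiting.');
            return;
          }
          // each mutation restarts the 1s quiet-period timer
          timeout = setTimeout(() => {
            observer.disconnect();
            resolve('DOM stabilized.');
          }, 1000);
        });

        // overall timeout in case the page never settles
        timeout = setTimeout(() => {
          observer.disconnect();
          resolve('Observer timeout reached.');
        }, OBSERVER_TIMEOUT);

        observer.observe(document.documentElement, { childList: true, subtree: true });
      }),
  );

Compared with the removed per-attribute mutationHash bookkeeping, this trades fine-grained loop detection for predictability: the promise now resolves within OBSERVER_TIMEOUT no matter how noisy the page is.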
@@ -635,16 +612,18 @@ const crawlDomain = async ({
         }
 
         // handle pdfs
-        if (request.skipNavigation && isUrlPdf(actualUrl)) {
+        if (request.skipNavigation && actualUrl === "about:blank") {
           if (!isScanPdfs) {
             guiInfoLog(guiInfoStatusTypes.SKIPPED, {
               numScanned: urlsCrawled.scanned.length,
               urlScanned: request.url,
             });
-            urlsCrawled.blacklisted.push({
+            urlsCrawled.userExcluded.push({
               url: request.url,
               pageTitle: request.url,
-              actualUrl: actualUrl, // i.e. actualUrl
+              actualUrl: request.url, // because about:blank is not useful
+              metadata: STATUS_CODE_METADATA[1],
+              httpStatusCode: 0,
             });
 
             return;
@@ -661,33 +640,17 @@ const crawlDomain = async ({
           return;
         }
 
-        const resHeaders = response ? response.headers() : {}; // Safely access response headers
-        const contentType = resHeaders['content-type'] || ''; // Ensure contentType is defined
-
-        // Skip non-HTML and non-PDF URLs
-        if (!contentType.includes('text/html') && !contentType.includes('application/pdf')) {
-          guiInfoLog(guiInfoStatusTypes.SKIPPED, {
-            numScanned: urlsCrawled.scanned.length,
-            urlScanned: request.url,
-          });
-          urlsCrawled.blacklisted.push({
-            url: request.url,
-            pageTitle: request.url,
-            actualUrl: actualUrl, // i.e. actualUrl
-          });
-
-          return;
-        }
-
         if (isBlacklistedFileExtensions(actualUrl, blackListedFileExtensions)) {
           guiInfoLog(guiInfoStatusTypes.SKIPPED, {
             numScanned: urlsCrawled.scanned.length,
             urlScanned: request.url,
           });
-          urlsCrawled.blacklisted.push({
+          urlsCrawled.userExcluded.push({
            url: request.url,
            pageTitle: request.url,
-            actualUrl: actualUrl, // i.e. actualUrl
+            actualUrl: actualUrl, // because about:blank is not useful
+            metadata: STATUS_CODE_METADATA[1],
+            httpStatusCode: 0,
          });
 
          return;
@@ -698,37 +661,16 @@ const crawlDomain = async ({
            url: request.url,
            pageTitle: request.url,
            actualUrl: actualUrl,
+            metadata: STATUS_CODE_METADATA[0],
+            httpStatusCode: 0,
          });
-
-          await enqueueProcess(page, enqueueLinks, browserContext);
-          return;
-        }
-
-        if (response.status() === 403) {
+
          guiInfoLog(guiInfoStatusTypes.SKIPPED, {
            numScanned: urlsCrawled.scanned.length,
            urlScanned: request.url,
          });
-          urlsCrawled.forbidden.push({
-            url: request.url,
-            pageTitle: request.url,
-            actualUrl: actualUrl, // i.e. actualUrl
-          });
-
-          return;
-        }
-
-        if (response.status() !== 200) {
-          guiInfoLog(guiInfoStatusTypes.SKIPPED, {
-            numScanned: urlsCrawled.scanned.length,
-            urlScanned: request.url,
-          });
-          urlsCrawled.invalid.push({
-            url: request.url,
-            pageTitle: request.url,
-            actualUrl: actualUrl, // i.e. actualUrl
-          });
 
+          await enqueueProcess(page, enqueueLinks, browserContext);
          return;
        }
 
@@ -750,6 +692,22 @@ const crawlDomain = async ({
          return;
        }
 
+        const responseStatus = response?.status();
+        if (responseStatus && responseStatus >= 300) {
+          guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+            numScanned: urlsCrawled.scanned.length,
+            urlScanned: request.url,
+          });
+          urlsCrawled.userExcluded.push({
+            url: request.url,
+            pageTitle: request.url,
+            actualUrl,
+            metadata: STATUS_CODE_METADATA[responseStatus] || STATUS_CODE_METADATA[599],
+            httpStatusCode: responseStatus,
+          });
+          return;
+        }
+
        const results = await runAxeScript({ includeScreenshots, page, randomToken, ruleset });
 
        if (isRedirected) {
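The STATUS_CODE_METADATA[responseStatus] || STATUS_CODE_METADATA[599] lookup implies a numeric-keyed map with a catch-all entry at 599, plus low sentinel keys (0, 1, 2) for pages that never produced a usable HTTP response. A hypothetical sketch of that shape, for illustration only; the real entries live in the package's constants file and are not shown in this diff:

// Hypothetical shape, inferred purely from the lookups in this diff.
const STATUS_CODE_METADATA: Record<number, string> = {
  0: 'Excluded from scan',          // assumed sentinel: skipped by scan settings
  1: 'Not a supported document',    // assumed sentinel: skipped file type
  2: 'Crawler error (no response)', // assumed sentinel: request failed outright
  403: '403 Forbidden',
  404: '404 Not Found',
  599: 'Unrecognised response status', // assumed catch-all fallback
};

// Lookup pattern used above: prefer the exact status, fall back to 599.
const metadataFor = (status: number): string =>
  STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599];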
@@ -807,10 +765,12 @@ const crawlDomain = async ({
            numScanned: urlsCrawled.scanned.length,
            urlScanned: request.url,
          });
-          urlsCrawled.blacklisted.push({
+          urlsCrawled.userExcluded.push({
            url: request.url,
            pageTitle: request.url,
-            actualUrl: actualUrl, // i.e. actualUrl
+            actualUrl: actualUrl, // because about:blank is not useful
+            metadata: STATUS_CODE_METADATA[1],
+            httpStatusCode: 0,
          });
 
        }
@@ -850,18 +810,39 @@ const crawlDomain = async ({
      // when max pages have been scanned, scan will abort and all relevant pages still opened will close instantly.
      // a browser close error will then be flagged. Since this is an intended behaviour, this error will be excluded.
      if (!isAbortingScanNow) {
-        urlsCrawled.error.push({ url: request.url, pageTitle: request.url, actualUrl: request.url });
+        guiInfoLog(guiInfoStatusTypes.ERROR, {
+          numScanned: urlsCrawled.scanned.length,
+          urlScanned: request.url,
+        });
+
+        urlsCrawled.error.push({
+          url: request.url,
+          pageTitle: request.url,
+          actualUrl: request.url,
+          metadata: STATUS_CODE_METADATA[2]
+        });
      }
    }
  },
-  failedRequestHandler: async ({ request }) => {
+  failedRequestHandler: async ({ request, response }) => {
    guiInfoLog(guiInfoStatusTypes.ERROR, {
      numScanned: urlsCrawled.scanned.length,
      urlScanned: request.url,
    });
-    urlsCrawled.error.push({ url: request.url, pageTitle: request.url, actualUrl: request.url });
-
-    crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
+
+    const status = response?.status();
+    const metadata = typeof status === 'number'
+      ? (STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599])
+      : STATUS_CODE_METADATA[2];
+
+    urlsCrawled.error.push({
+      url: request.url,
+      pageTitle: request.url,
+      actualUrl: request.url,
+      metadata,
+      httpStatusCode: typeof status === 'number' ? status : 0,
+    });
+
  },
  maxRequestsPerCrawl: Infinity,
  maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
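Taken together, every record these hunks push into urlsCrawled.userExcluded and urlsCrawled.error now carries the same two extra fields. A hypothetical interface capturing that shape (the package may declare its own type elsewhere):

interface CrawledUrlEntry {
  url: string;             // the URL as requested
  pageTitle: string;       // the diff falls back to the URL when no title exists
  actualUrl: string;       // final URL after redirects, or the request URL
  metadata?: string;       // human-readable text from STATUS_CODE_METADATA
  httpStatusCode?: number; // 0 when no HTTP response was received
}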
The remaining hunks are from a second file in the package, the one that defines crawlIntelligentSitemap:

@@ -1,27 +1,29 @@
 import fs from 'fs';
-import { chromium } from 'playwright';
+import { chromium, Page } from 'playwright';
 import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
 import constants, { guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
 import { silentLogger, guiInfoLog } from '../logs.js';
 import crawlDomain from './crawlDomain.js';
 import crawlSitemap from './crawlSitemap.js';
+import { EnqueueStrategy } from 'crawlee';
+import { ViewportSettingsClass } from '../combine.js';
 
 const crawlIntelligentSitemap = async (
-  url,
-  randomToken,
-  host,
-  viewportSettings,
-  maxRequestsPerCrawl,
-  browser,
-  userDataDirectory,
-  strategy,
-  specifiedMaxConcurrency,
-  fileTypes,
-  blacklistedPatterns,
-  includeScreenshots,
-  followRobots,
-  extraHTTPHeaders,
-  safeMode,
+  url: string,
+  randomToken: string,
+  host: string,
+  viewportSettings: ViewportSettingsClass,
+  maxRequestsPerCrawl: number,
+  browser: string,
+  userDataDirectory: string,
+  strategy: EnqueueStrategy,
+  specifiedMaxConcurrency: number,
+  fileTypes: string,
+  blacklistedPatterns: string[],
+  includeScreenshots: boolean,
+  followRobots: boolean,
+  extraHTTPHeaders: Record<string, string>,
+  safeMode: boolean,
 ) => {
   let urlsCrawledFinal;
   let urlsCrawled;
@@ -37,7 +39,7 @@ const crawlIntelligentSitemap = async (
     fs.mkdirSync(randomToken);
   }
 
-  function getHomeUrl(parsedUrl) {
+  function getHomeUrl(parsedUrl: string) {
     const urlObject = new URL(parsedUrl);
     if (urlObject.username !== '' && urlObject.password !== '') {
       return `${urlObject.protocol}//${urlObject.username}:${urlObject.password}@${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
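A quick worked example of getHomeUrl as typed above, on hypothetical inputs; it preserves embedded basic-auth credentials while stripping the path:

// Behaviour of getHomeUrl as shown above (a local function in the package):
getHomeUrl('https://user:pass@example.com:8443/docs/page.html');
// => 'https://user:pass@example.com:8443'
getHomeUrl('https://example.com/docs/page.html');
// => 'https://example.com'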
@@ -46,7 +48,7 @@ const crawlIntelligentSitemap = async (
     return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
   }
 
-  async function findSitemap(link) {
+  async function findSitemap(link: string) {
     const homeUrl = getHomeUrl(link);
     let sitemapLinkFound = false;
     let sitemapLink = '';
@@ -70,7 +72,7 @@ const crawlIntelligentSitemap = async (
     return sitemapExist ? sitemapLink : '';
   }
 
-  const checkUrlExists = async (page, parsedUrl) => {
+  const checkUrlExists = async (page: Page, parsedUrl: string) => {
    try {
      const response = await page.goto(parsedUrl);
      if (response.ok()) {