@govtechsg/oobee 0.10.51 → 0.10.57

This diff shows the changes between publicly available package versions as published to a supported public registry. It is provided for informational purposes only.
crawlIntelligentSitemap.ts

@@ -2,7 +2,7 @@ import fs from 'fs';
 import { chromium, Page } from 'playwright';
 import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
 import constants, { guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
-import { silentLogger, guiInfoLog } from '../logs.js';
+import { consoleLogger, guiInfoLog } from '../logs.js';
 import crawlDomain from './crawlDomain.js';
 import crawlSitemap from './crawlSitemap.js';
 import { EnqueueStrategy } from 'crawlee';
@@ -24,46 +24,42 @@ const crawlIntelligentSitemap = async (
   followRobots: boolean,
   extraHTTPHeaders: Record<string, string>,
   safeMode: boolean,
+  scanDuration: number
 ) => {
+  const startTime = Date.now(); // Track start time
+
   let urlsCrawledFinal;
-  let urlsCrawled;
+  let urlsCrawled = { ...constants.urlsCrawledObj };
   let dataset;
   let sitemapExist = false;
   const fromCrawlIntelligentSitemap = true;
   let sitemapUrl;
 
-  urlsCrawled = { ...constants.urlsCrawledObj };
   ({ dataset } = await createCrawleeSubFolders(randomToken));
-
   if (!fs.existsSync(randomToken)) {
     fs.mkdirSync(randomToken);
   }
 
   function getHomeUrl(parsedUrl: string) {
     const urlObject = new URL(parsedUrl);
-    if (urlObject.username !== '' && urlObject.password !== '') {
+    if (urlObject.username && urlObject.password) {
       return `${urlObject.protocol}//${urlObject.username}:${urlObject.password}@${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
     }
-
     return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
   }
 
   async function findSitemap(link: string) {
     const homeUrl = getHomeUrl(link);
-    let sitemapLinkFound = false;
     let sitemapLink = '';
-    const chromiumBrowser = await chromium.launch(
-      {
-        headless: false,
-        channel: 'chrome',
-        args: ['--headless=new', '--no-sandbox']
-      });
-
+    const chromiumBrowser = await chromium.launch({
+      headless: false,
+      channel: 'chrome',
+      args: ['--headless=new', '--no-sandbox'],
+    });
     const page = await chromiumBrowser.newPage();
     for (const path of sitemapPaths) {
       sitemapLink = homeUrl + path;
-      sitemapLinkFound = await checkUrlExists(page, sitemapLink);
-      if (sitemapLinkFound) {
+      if (await checkUrlExists(page, sitemapLink)) {
         sitemapExist = true;
         break;
       }
@@ -75,12 +71,9 @@ const crawlIntelligentSitemap = async (
   const checkUrlExists = async (page: Page, parsedUrl: string) => {
     try {
       const response = await page.goto(parsedUrl);
-      if (response.ok()) {
-        return true;
-      }
-      return false;
+      return response.ok();
     } catch (e) {
-      silentLogger.error(e);
+      consoleLogger.error(e);
       return false;
     }
   };
@@ -88,13 +81,12 @@ const crawlIntelligentSitemap = async (
   try {
     sitemapUrl = await findSitemap(url);
   } catch (error) {
-    silentLogger.error(error);
+    consoleLogger.error(error);
   }
 
   if (!sitemapExist) {
     console.log('Unable to find sitemap. Commencing website crawl instead.');
-    // run crawlDomain as per normal
-    urlsCrawledFinal = await crawlDomain({
+    return await crawlDomain({
       url,
       randomToken,
       host,
@@ -109,12 +101,13 @@ const crawlIntelligentSitemap = async (
       includeScreenshots,
       followRobots,
       extraHTTPHeaders,
+      safeMode,
+      scanDuration, // Use full duration since no sitemap
     });
-    return urlsCrawledFinal;
   }
+
   console.log(`Sitemap found at ${sitemapUrl}`);
-  // run crawlSitemap then crawDomain subsequently if urlsCrawled.scanned.length < maxRequestsPerCrawl
-  urlsCrawledFinal = await crawlSitemap(
+  urlsCrawledFinal = await crawlSitemap({
     sitemapUrl,
     randomToken,
     host,
@@ -128,14 +121,21 @@ const crawlIntelligentSitemap = async (
     includeScreenshots,
     extraHTTPHeaders,
     fromCrawlIntelligentSitemap,
-    url,
-    dataset, // for crawlSitemap to add on to
-    urlsCrawled, // for crawlSitemap to add on to
-    false,
-  );
+    userUrlInputFromIntelligent: url,
+    datasetFromIntelligent: dataset,
+    urlsCrawledFromIntelligent: urlsCrawled,
+    crawledFromLocalFile: false,
+    scanDuration,
+  });
+
+  const elapsed = Date.now() - startTime;
+  const remainingScanDuration = Math.max(scanDuration - elapsed / 1000, 0); // in seconds
 
-  if (urlsCrawled.scanned.length < maxRequestsPerCrawl) {
-    // run crawl domain starting from root website, only on pages not scanned before
+  if (
+    urlsCrawledFinal.scanned.length < maxRequestsPerCrawl &&
+    remainingScanDuration > 0
+  ) {
+    console.log(`Continuing crawl from root website. Remaining scan time: ${remainingScanDuration.toFixed(1)}s`);
     urlsCrawledFinal = await crawlDomain({
       url,
       randomToken,
@@ -153,12 +153,16 @@ const crawlIntelligentSitemap = async (
       extraHTTPHeaders,
       safeMode,
       fromCrawlIntelligentSitemap,
-      datasetFromIntelligent: dataset, // for crawlDomain to add on to
-      urlsCrawledFromIntelligent: urlsCrawledFinal, // urls for crawlDomain to exclude
+      datasetFromIntelligent: dataset,
+      urlsCrawledFromIntelligent: urlsCrawledFinal,
+      scanDuration: remainingScanDuration,
     });
+  } else if (remainingScanDuration <= 0) {
+    console.log(`Crawl duration exceeded before more pages could be found (limit: ${scanDuration}s).`);
   }
 
   guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
   return urlsCrawledFinal;
 };
+
 export default crawlIntelligentSitemap;
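The updated crawlIntelligentSitemap threads a scanDuration budget (in seconds) through the crawl: the sitemap pass runs first, the elapsed time is subtracted, and only the remainder is handed to the follow-up domain crawl. A minimal sketch of that bookkeeping, assuming the same seconds-based convention; remainingBudgetSeconds is a hypothetical helper, not part of oobee:

```ts
// Hypothetical helper mirroring the elapsed-time arithmetic in the diff above.
function remainingBudgetSeconds(scanDuration: number, startTime: number): number {
  const elapsedSeconds = (Date.now() - startTime) / 1000;
  return Math.max(scanDuration - elapsedSeconds, 0); // never negative
}

// Usage sketch: give the sitemap pass the full budget, then pass on what is left.
const startTime = Date.now();
const scanDuration = 300; // e.g. a 300s overall budget
// ... await crawlSitemap({ ..., scanDuration }) ...
const remaining = remainingBudgetSeconds(scanDuration, startTime);
if (remaining > 0) {
  // ... await crawlDomain({ ..., scanDuration: remaining }) ...
}
```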
crawlLocalFile.ts

@@ -1,12 +1,15 @@
-import { Request, RequestList } from 'crawlee';
-import printMessage from 'print-message';
+import { Request, RequestList, Dataset } from 'crawlee';
 import fs from 'fs';
 import path from 'path';
 import { createCrawleeSubFolders, runAxeScript, isUrlPdf } from './commonCrawlerFunc.js';
-import constants, { guiInfoStatusTypes, basicAuthRegex } from '../constants/constants.js';
+import constants, {
+  guiInfoStatusTypes,
+  basicAuthRegex,
+  UrlsCrawled,
+} from '../constants/constants.js';
+import { ViewportSettingsClass } from '../combine.js';
 import {
   getPlaywrightLaunchOptions,
-  messageOptions,
   isFilePath,
   convertLocalFileToPath,
   convertPathToLocalFile,
@@ -16,27 +19,47 @@ import { runPdfScan, mapPdfScanResults, doPdfScreenshots } from './pdfScanFunc.j
 import { guiInfoLog } from '../logs.js';
 import crawlSitemap from './crawlSitemap.js';
 
-const crawlLocalFile = async (
-  sitemapUrl: string,
-  randomToken: string,
-  host: string,
-  viewportSettings: any,
-  maxRequestsPerCrawl: number,
-  browser: string,
-  userDataDirectory: string,
-  specifiedMaxConcurrency: number,
-  fileTypes: string,
-  blacklistedPatterns: string[],
-  includeScreenshots: boolean,
-  extraHTTPHeaders: any,
-  fromCrawlIntelligentSitemap: boolean = false, // optional
-  userUrlInputFromIntelligent: any = null, // optional
-  datasetFromIntelligent: any = null, // optional
-  urlsCrawledFromIntelligent: any = null, // optional
-) => {
+export const crawlLocalFile = async ({
+  url,
+  randomToken,
+  host,
+  viewportSettings,
+  maxRequestsPerCrawl,
+  browser,
+  userDataDirectory,
+  specifiedMaxConcurrency,
+  fileTypes,
+  blacklistedPatterns,
+  includeScreenshots,
+  extraHTTPHeaders,
+  scanDuration = 0,
+  fromCrawlIntelligentSitemap = false,
+  userUrlInputFromIntelligent = null,
+  datasetFromIntelligent = null,
+  urlsCrawledFromIntelligent = null,
+}: {
+  url: string;
+  randomToken: string;
+  host: string;
+  viewportSettings: ViewportSettingsClass;
+  maxRequestsPerCrawl: number;
+  browser: string;
+  userDataDirectory: string;
+  specifiedMaxConcurrency: number;
+  fileTypes: string;
+  blacklistedPatterns: string[];
+  includeScreenshots: boolean;
+  extraHTTPHeaders: Record<string, string>;
+  scanDuration?: number;
+  fromCrawlIntelligentSitemap?: boolean;
+  userUrlInputFromIntelligent?: string | null;
+  datasetFromIntelligent?: Dataset | null;
+  urlsCrawledFromIntelligent?: UrlsCrawled | null;
+}) => {
   let dataset: any;
-  let urlsCrawled: any;
+  let urlsCrawled: UrlsCrawled;
   let linksFromSitemap = [];
+  let sitemapUrl = url;
 
   // Boolean to omit axe scan for basic auth URL
   let isBasicAuth: boolean;
@@ -82,7 +105,7 @@ const crawlLocalFile = async (
     // Non XML file
   } else {
     // Put it to crawlSitemap function to handle xml files
-    const updatedUrlsCrawled = await crawlSitemap(
+    const updatedUrlsCrawled = await crawlSitemap({
      sitemapUrl,
      randomToken,
      host,
@@ -95,12 +118,13 @@ const crawlLocalFile = async (
      blacklistedPatterns,
      includeScreenshots,
      extraHTTPHeaders,
-     (fromCrawlIntelligentSitemap = false), // optional
-     (userUrlInputFromIntelligent = null), // optional
-     (datasetFromIntelligent = null), // optional
-     (urlsCrawledFromIntelligent = null), // optional
-     true,
-    );
+     scanDuration,
+     fromCrawlIntelligentSitemap,
+     userUrlInputFromIntelligent,
+     datasetFromIntelligent,
+     urlsCrawledFromIntelligent,
+     crawledFromLocalFile: true,
+    });
 
     urlsCrawled = { ...urlsCrawled, ...updatedUrlsCrawled };
     return urlsCrawled;
@@ -124,16 +148,12 @@ const crawlLocalFile = async (
 
   const uuidToPdfMapping: Record<string, string> = {}; // key and value of string type
 
-  printMessage(['Fetching URLs. This might take some time...'], { border: false });
-
   finalLinks = [...finalLinks, ...linksFromSitemap];
 
   await RequestList.open({
     sources: finalLinks,
   });
 
-  printMessage(['Fetch URLs completed. Beginning scan'], messageOptions);
-
   const request = linksFromSitemap[0];
   const pdfFileName = path.basename(request.url);
   const trimmedUrl: string = request.url;
@@ -142,6 +162,8 @@ const crawlLocalFile = async (
   fs.writeFileSync(destinationFilePath, data);
   uuidToPdfMapping[pdfFileName] = trimmedUrl;
 
+  let shouldAbort = false;
+
   if (!isUrlPdf(request.url)) {
     await initModifiedUserAgent(browser);
     const browserContext = await constants.launcher.launchPersistentContext('', {
@@ -150,9 +172,24 @@ const crawlLocalFile = async (
       ...playwrightDeviceDetailsObject,
     });
 
+    const timeoutId = scanDuration > 0
+      ? setTimeout(() => {
+          console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting local file scan.`);
+          shouldAbort = true;
+        }, scanDuration * 1000)
+      : null;
+
     const page = await browserContext.newPage();
     request.url = convertPathToLocalFile(request.url);
     await page.goto(request.url);
+
+    if (shouldAbort) {
+      console.warn('Scan aborted due to timeout before page scan.');
+      await dataset.pushData({ scanned: [], scannedRedirects: [] });
+      await browserContext.close().catch(() => {});
+      return urlsCrawled;
+    }
+
     const results = await runAxeScript({ includeScreenshots, page, randomToken });
 
     const actualUrl = page.url() || request.loadedUrl || request.url;
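The hunk above enforces scanDuration in the local-file path with a timer that flips a shouldAbort flag; the flag is only consulted after the page.goto await, so the abort is cooperative rather than an immediate cancellation. A standalone sketch of that pattern, assuming seconds for scanDuration; navigate and scan are hypothetical placeholders for page.goto and the axe scan, and the clearTimeout tidy-up is an addition of the sketch, not something shown in the diff:

```ts
// Sketch of the timer-plus-flag cooperative abort used in the hunk above.
// `navigate` and `scan` are hypothetical stand-ins for page.goto / runAxeScript.
async function scanWithBudget(
  scanDuration: number,
  navigate: () => Promise<void>,
  scan: () => Promise<void>,
): Promise<void> {
  let shouldAbort = false;
  const timeoutId =
    scanDuration > 0 ? setTimeout(() => { shouldAbort = true; }, scanDuration * 1000) : null;

  try {
    await navigate(); // long-running step; the timer may fire while this awaits
    if (shouldAbort) {
      console.warn('Scan aborted due to timeout before page scan.');
      return; // cooperative abort: checked between steps, never mid-step
    }
    await scan();
  } finally {
    if (timeoutId) clearTimeout(timeoutId); // clear the pending timer either way
  }
}
```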
@@ -178,7 +215,11 @@ const crawlLocalFile = async (
 
     await dataset.pushData(results);
   } else {
-    urlsCrawled.scanned.push({ url: trimmedUrl, pageTitle: pdfFileName });
+    urlsCrawled.scanned.push({
+      url: trimmedUrl,
+      pageTitle: pdfFileName,
+      actualUrl: trimmedUrl,
+    });
 
     await runPdfScan(randomToken);
     // transform result format
@@ -192,6 +233,7 @@ const crawlLocalFile = async (
     // push results for each pdf document to key value store
     await Promise.all(pdfResults.map(result => dataset.pushData(result)));
   }
+
   return urlsCrawled;
 };
 export default crawlLocalFile;
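crawlLocalFile (and the crawlSitemap call sites shown above) now take a single destructured options object instead of a long positional argument list, so callers name every field and optional values can simply be omitted. A sketch of the new call shape, with placeholder values throughout; only the property names come from the diff, and the import paths assume the caller sits next to crawlLocalFile.ts:

```ts
// Placeholder call site; property names follow the destructured signature in the diff.
import { crawlLocalFile } from './crawlLocalFile.js';
import { ViewportSettingsClass } from '../combine.js';

// Assume a viewport configuration has been built elsewhere in the scan setup.
declare const viewportSettings: ViewportSettingsClass;

const urlsCrawled = await crawlLocalFile({
  url: 'file:///path/to/page.html',
  randomToken: 'scan-123',
  host: 'localhost',
  viewportSettings,
  maxRequestsPerCrawl: 100,
  browser: 'chromium',
  userDataDirectory: '',
  specifiedMaxConcurrency: 1,
  fileTypes: 'html-only',
  blacklistedPatterns: [],
  includeScreenshots: false,
  extraHTTPHeaders: {},
  scanDuration: 300, // optional; defaults to 0 (no time limit)
});
```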