@govtechsg/oobee 0.10.29 → 0.10.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/exclusions.txt CHANGED
@@ -1,2 +1,3 @@
  \.*login.singpass.gov.sg\.*
- \.*auth.singpass.gov.sg\.*
+ \.*auth.singpass.gov.sg\.*
+ \.*form.gov.sg\.*
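Note: the new \.*form.gov.sg\.* entry excludes FormSG pages, alongside the existing Singpass login and auth hosts. A minimal sketch of how patterns like these could be matched against URLs, assuming each line of exclusions.txt is compiled as a regular expression (the actual matching is done by oobee's isBlacklisted/isSkippedUrl helpers, which appear in the crawler diffs below):

// Sketch only: assumes one regular expression per exclusions.txt line.
const patterns = [
  String.raw`\.*login.singpass.gov.sg\.*`,
  String.raw`\.*auth.singpass.gov.sg\.*`,
  String.raw`\.*form.gov.sg\.*`, // newly excluded in 0.10.33
];

const isExcluded = (url: string): boolean =>
  patterns.some(pattern => new RegExp(pattern).test(url));

isExcluded('https://form.gov.sg/abc123'); // true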
package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
    "name": "@govtechsg/oobee",
    "main": "dist/npmIndex.js",
-   "version": "0.10.29",
+   "version": "0.10.33",
    "type": "module",
    "author": "Government Technology Agency <info@tech.gov.sg>",
    "dependencies": {
package/src/combine.ts CHANGED
@@ -210,6 +210,7 @@ const combineRun = async (details: Data, deviceToScan: string) => {
      ...urlsCrawledObj.error,
      ...urlsCrawledObj.invalid,
      ...urlsCrawledObj.forbidden,
+     ...urlsCrawledObj.userExcluded,
    ];
    const basicFormHTMLSnippet = await generateArtifacts(
      randomToken,
@@ -240,6 +241,8 @@ const combineRun = async (details: Data, deviceToScan: string) => {
          pagesNotScanned.length,
          metadata,
        );
+     } else {
+       printMessage([`No pages were scanned.`], alertMessageOptions);
      }
    } else {
      printMessage([`No pages were scanned.`], alertMessageOptions);
@@ -1819,13 +1819,72 @@ export const urlWithoutAuth = (url: string): string => {
  };
  
  export const waitForPageLoaded = async (page, timeout = 10000) => {
+   const OBSERVER_TIMEOUT = timeout; // Ensure observer timeout does not exceed the main timeout
+
    return Promise.race([
-     page.waitForLoadState('load'),
-     page.waitForLoadState('networkidle'),
-     new Promise(resolve => setTimeout(resolve, timeout)),
+     page.waitForLoadState('load'), // Ensure page load completes
+     page.waitForLoadState('networkidle'), // Wait for network requests to settle
+     new Promise(resolve => setTimeout(resolve, timeout)), // Hard timeout as a fallback
+     page.evaluate((OBSERVER_TIMEOUT) => {
+       return new Promise((resolve) => {
+         // Skip mutation check for PDFs
+         if (document.contentType === 'application/pdf') {
+           resolve('Skipping DOM mutation check for PDF.');
+           return;
+         }
+
+         let timeout;
+         let mutationCount = 0;
+         const MAX_MUTATIONS = 250; // Limit max mutations
+         const mutationHash = {};
+
+         const observer = new MutationObserver(mutationsList => {
+           clearTimeout(timeout);
+
+           mutationCount++;
+           if (mutationCount > MAX_MUTATIONS) {
+             observer.disconnect();
+             resolve('Too many mutations detected, exiting.');
+             return;
+           }
+
+           mutationsList.forEach(mutation => {
+             if (mutation.target instanceof Element) {
+               Array.from(mutation.target.attributes).forEach(attr => {
+                 const mutationKey = `${mutation.target.nodeName}-${attr.name}`;
+
+                 if (mutationKey) {
+                   mutationHash[mutationKey] = (mutationHash[mutationKey] || 0) + 1;
+
+                   if (mutationHash[mutationKey] >= 10) {
+                     observer.disconnect();
+                     resolve(`Repeated mutation detected for ${mutationKey}, exiting.`);
+                   }
+                 }
+               });
+             }
+           });
+
+           // If no mutations occur for 1 second, resolve
+           timeout = setTimeout(() => {
+             observer.disconnect();
+             resolve('DOM stabilized after mutations.');
+           }, 1000);
+         });
+
+         // Final timeout to avoid infinite waiting
+         timeout = setTimeout(() => {
+           observer.disconnect();
+           resolve('Observer timeout reached, exiting.');
+         }, OBSERVER_TIMEOUT);
+
+         observer.observe(document.documentElement, { childList: true, subtree: true, attributes: true });
+       });
+     }, OBSERVER_TIMEOUT), // Pass OBSERVER_TIMEOUT dynamically to the browser context
    ]);
  };
  
+
  function isValidHttpUrl(urlString) {
    const pattern = /^(http|https):\/\/[^ "]+$/;
    return pattern.test(urlString);
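Note: waitForPageLoaded now races four conditions: the load event, network idle, a DOM quiet-period check run inside the page, and a hard timeout. A minimal usage sketch, assuming a Playwright Page and the import path used by the sitemap crawler below:

import { chromium } from 'playwright';
import { waitForPageLoaded } from '../constants/common.js';

const browser = await chromium.launch();
const page = await browser.newPage();
await page.goto('https://example.gov.sg'); // hypothetical URL
// Resolves on whichever settles first: load, network idle,
// a DOM with no notable mutations for 1s, or the 10s hard timeout.
await waitForPageLoaded(page, 10000);
await browser.close();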
@@ -186,7 +186,7 @@ export class UrlsCrawled {
    error: { url: string }[] = [];
    exceededRequests: string[] = [];
    forbidden: string[] = [];
-   userExcluded: string[] = [];
+   userExcluded: { url: string; actualUrl: string; pageTitle: string }[] = [];
    everything: string[] = [];
  
    constructor(urlsCrawled?: Partial<UrlsCrawled>) {
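Note: userExcluded entries now carry the same shape as scanned entries, which is what lets combine.ts fold them into pagesNotScanned above. A hypothetical example, following how the crawler diffs below populate the fields:

urlsCrawled.userExcluded.push({
  url: 'https://example.gov.sg/login',       // URL as originally requested (hypothetical)
  pageTitle: 'https://example.gov.sg/login', // the crawlers reuse the URL as the title
  actualUrl: 'https://auth.example.gov.sg/', // URL after any redirects
});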
@@ -125,14 +125,6 @@ const crawlDomain = async ({
  
    const httpsAgent = new https.Agent({ rejectUnauthorized: false });
  
-   if (isBlacklistedUrl) {
-     guiInfoLog(guiInfoStatusTypes.SKIPPED, {
-       numScanned: urlsCrawled.scanned.length,
-       urlScanned: url,
-     });
-     return;
-   }
-
    // Boolean to omit axe scan for basic auth URL
    let isBasicAuth = false;
    let authHeader = '';
@@ -608,13 +600,13 @@ const crawlDomain = async ({
        }
  
        await waitForPageLoaded(page, 10000);
-       let actualUrl = request.url;
+       let actualUrl = page.url() || request.loadedUrl || request.url;
  
        if (page.url() !== 'about:blank') {
          actualUrl = page.url();
        }
  
-       if (isBlacklisted(actualUrl, blacklistedPatterns) || (isUrlPdf(actualUrl) && !isScanPdfs)) {
+       if (!isFollowStrategy(url, actualUrl, strategy) && (isBlacklisted(actualUrl, blacklistedPatterns) || (isUrlPdf(actualUrl) && !isScanPdfs))) {
          guiInfoLog(guiInfoStatusTypes.SKIPPED, {
            numScanned: urlsCrawled.scanned.length,
            urlScanned: actualUrl,
@@ -683,8 +675,13 @@ const crawlDomain = async ({
          return;
        }
  
-       if (blacklistedPatterns && isSkippedUrl(actualUrl, blacklistedPatterns)) {
-         urlsCrawled.userExcluded.push(request.url);
+       if (!isFollowStrategy(url, actualUrl, strategy) && blacklistedPatterns && isSkippedUrl(actualUrl, blacklistedPatterns)) {
+         urlsCrawled.userExcluded.push({
+           url: request.url,
+           pageTitle: request.url,
+           actualUrl: actualUrl,
+         });
+
          await enqueueProcess(page, enqueueLinks, browserContext);
          return;
        }
@@ -709,18 +706,18 @@ const crawlDomain = async ({
  
        if (isScanHtml) {
          // For deduplication, if the URL is redirected, we want to store the original URL and the redirected URL (actualUrl)
-         const isRedirected = !areLinksEqual(request.loadedUrl, request.url);
+         const isRedirected = !areLinksEqual(actualUrl, request.url);
  
          // check if redirected link is following strategy (same-domain/same-hostname)
          const isLoadedUrlFollowStrategy = isFollowStrategy(
-           request.loadedUrl,
+           actualUrl,
            request.url,
            strategy,
          );
          if (isRedirected && !isLoadedUrlFollowStrategy) {
            urlsCrawled.notScannedRedirects.push({
              fromUrl: request.url,
-             toUrl: request.loadedUrl, // i.e. actualUrl
+             toUrl: actualUrl, // i.e. actualUrl
            });
            return;
          }
@@ -729,13 +726,13 @@ const crawlDomain = async ({
  
          if (isRedirected) {
            const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(
-             item => (item.actualUrl || item.url) === request.loadedUrl,
+             item => (item.actualUrl || item.url) === actualUrl,
            );
  
            if (isLoadedUrlInCrawledUrls) {
              urlsCrawled.notScannedRedirects.push({
                fromUrl: request.url,
-               toUrl: request.loadedUrl, // i.e. actualUrl
+               toUrl: actualUrl, // i.e. actualUrl
              });
              return;
            }
@@ -750,16 +747,16 @@ const crawlDomain = async ({
            urlsCrawled.scanned.push({
              url: urlWithoutAuth(request.url),
              pageTitle: results.pageTitle,
-             actualUrl: request.loadedUrl, // i.e. actualUrl
+             actualUrl: actualUrl, // i.e. actualUrl
            });
  
            urlsCrawled.scannedRedirects.push({
              fromUrl: urlWithoutAuth(request.url),
-             toUrl: request.loadedUrl, // i.e. actualUrl
+             toUrl: actualUrl, // i.e. actualUrl
            });
  
            results.url = request.url;
-           results.actualUrl = request.loadedUrl;
+           results.actualUrl = actualUrl;
            await dataset.pushData(results);
          }
        } else {
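Note: the fallback chain page.url() || request.loadedUrl || request.url, which this release applies across crawlDomain, crawlLocalFile, and crawlSitemap, reads: prefer the live page URL, then the loaded URL recorded by Crawlee, then the originally requested URL. A condensed sketch (resolveActualUrl is a hypothetical helper; the diff inlines the expression):

import type { Page } from 'playwright';
import type { Request } from 'crawlee';

// Prefer the live page URL, then Crawlee's recorded loadedUrl,
// then fall back to the originally requested URL.
const resolveActualUrl = (page: Page, request: Request): string =>
  page.url() || request.loadedUrl || request.url;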
@@ -153,6 +153,8 @@ const crawlLocalFile = async (
    await page.goto(request.url);
    const results = await runAxeScript({ includeScreenshots, page, randomToken });
  
+   const actualUrl = page.url() || request.loadedUrl || request.url;
+
    guiInfoLog(guiInfoStatusTypes.SCANNED, {
      numScanned: urlsCrawled.scanned.length,
      urlScanned: request.url,
@@ -161,16 +163,16 @@ const crawlLocalFile = async (
    urlsCrawled.scanned.push({
      url: request.url,
      pageTitle: results.pageTitle,
-     actualUrl: request.loadedUrl, // i.e. actualUrl
+     actualUrl: actualUrl, // i.e. actualUrl
    });
  
    urlsCrawled.scannedRedirects.push({
      fromUrl: request.url,
-     toUrl: request.loadedUrl, // i.e. actualUrl
+     toUrl: actualUrl, // i.e. actualUrl
    });
  
    results.url = request.url;
-   // results.actualUrl = request.loadedUrl;
+   results.actualUrl = actualUrl;
  
    await dataset.pushData(results);
  } else {
@@ -18,7 +18,7 @@ import {
    waitForPageLoaded,
    isFilePath,
  } from '../constants/common.js';
- import { areLinksEqual, isWhitelistedContentType } from '../utils.js';
+ import { areLinksEqual, isWhitelistedContentType, isFollowStrategy } from '../utils.js';
  import { handlePdfDownload, runPdfScan, mapPdfScanResults } from './pdfScanFunc.js';
  import { guiInfoLog } from '../logs.js';
  
@@ -161,21 +161,67 @@ const crawlSitemap = async (
        ],
      },
      requestList,
+     postNavigationHooks: [
+       async ({ page, request }) => {
+         try {
+           // Wait for a quiet period in the DOM, but with safeguards
+           await page.evaluate(() => {
+             return new Promise((resolve) => {
+               let timeout;
+               let mutationCount = 0;
+               const MAX_MUTATIONS = 250; // Prevent infinite mutations
+               const OBSERVER_TIMEOUT = 5000; // Hard timeout to exit
+
+               const observer = new MutationObserver(() => {
+                 clearTimeout(timeout);
+
+                 mutationCount++;
+                 if (mutationCount > MAX_MUTATIONS) {
+                   observer.disconnect();
+                   resolve('Too many mutations detected, exiting.');
+                   return;
+                 }
+
+                 timeout = setTimeout(() => {
+                   observer.disconnect();
+                   resolve('DOM stabilized after mutations.');
+                 }, 1000);
+               });
+
+               timeout = setTimeout(() => {
+                 observer.disconnect();
+                 resolve('Observer timeout reached, exiting.');
+               }, OBSERVER_TIMEOUT); // Ensure the observer stops after X seconds
+
+               observer.observe(document.documentElement, { childList: true, subtree: true });
+
+             });
+           });
+         } catch (err) {
+           // Handle page navigation errors gracefully
+           if (err.message.includes('was destroyed')) {
+             return; // Page navigated or closed, no need to handle
+           }
+           throw err; // Rethrow unknown errors
+         }
+       },
+     ],
+
      preNavigationHooks: isBasicAuth
        ? [
-           async ({ page }) => {
-             await page.setExtraHTTPHeaders({
-               Authorization: authHeader,
-               ...extraHTTPHeaders,
-             });
-           },
-         ]
+         async ({ page }) => {
+           await page.setExtraHTTPHeaders({
+             Authorization: authHeader,
+             ...extraHTTPHeaders,
+           });
+         },
+       ]
        : [
-           async () => {
-             preNavigationHooks(extraHTTPHeaders);
-             // insert other code here
-           },
-         ],
+         async () => {
+           preNavigationHooks(extraHTTPHeaders);
+           // insert other code here
+         },
+       ],
      requestHandlerTimeoutSecs: 90,
      requestHandler: async ({ page, request, response, sendRequest }) => {
        await waitForPageLoaded(page, 10000);
@@ -191,7 +237,7 @@ const crawlSitemap = async (
          request.url = currentUrl.href;
        }
  
-       const actualUrl = request.loadedUrl || request.url;
+       const actualUrl = page.url() || request.loadedUrl || request.url;
  
        if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) {
          crawler.autoscaledPool.abort();
@@ -223,8 +269,17 @@ const crawlSitemap = async (
        const contentType = response.headers()['content-type'];
        const status = response.status();
  
-       if (blacklistedPatterns && isSkippedUrl(actualUrl, blacklistedPatterns)) {
-         urlsCrawled.userExcluded.push(request.url);
+       if (blacklistedPatterns && !isFollowStrategy(actualUrl, request.url, "same-hostname") && isSkippedUrl(actualUrl, blacklistedPatterns)) {
+         urlsCrawled.userExcluded.push({
+           url: request.url,
+           pageTitle: request.url,
+           actualUrl: actualUrl,
+         });
+
+         guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+           numScanned: urlsCrawled.scanned.length,
+           urlScanned: request.url,
+         });
          return;
        }
  
@@ -255,16 +310,16 @@ const crawlSitemap = async (
          urlScanned: request.url,
        });
  
-       const isRedirected = !areLinksEqual(request.loadedUrl, request.url);
+       const isRedirected = !areLinksEqual(page.url(), request.url);
        if (isRedirected) {
          const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(
-           item => (item.actualUrl || item.url.href) === request.loadedUrl,
+           item => (item.actualUrl || item.url.href) === page,
          );
  
          if (isLoadedUrlInCrawledUrls) {
            urlsCrawled.notScannedRedirects.push({
              fromUrl: request.url,
-             toUrl: request.loadedUrl, // i.e. actualUrl
+             toUrl: actualUrl, // i.e. actualUrl
            });
            return;
          }
@@ -272,16 +327,16 @@ const crawlSitemap = async (
        urlsCrawled.scanned.push({
          url: urlWithoutAuth(request.url),
          pageTitle: results.pageTitle,
-         actualUrl: request.loadedUrl, // i.e. actualUrl
+         actualUrl: actualUrl, // i.e. actualUrl
        });
  
        urlsCrawled.scannedRedirects.push({
          fromUrl: urlWithoutAuth(request.url),
-         toUrl: request.loadedUrl, // i.e. actualUrl
+         toUrl: actualUrl,
        });
  
        results.url = request.url;
-       results.actualUrl = request.loadedUrl;
+       results.actualUrl = actualUrl;
      } else {
        urlsCrawled.scanned.push({
          url: urlWithoutAuth(request.url),
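Note: the postNavigationHooks added above duplicate the quiet-period logic inside waitForPageLoaded. Distilled to its core, the technique is: resolve once the DOM has seen no mutations for a quiet interval, with a hard cap on the total wait. A sketch for illustration only (waitForDomQuietPeriod is a hypothetical helper; the in-repo versions also cap the total mutation count):

import type { Page } from 'playwright';

const waitForDomQuietPeriod = (page: Page, quietMs = 1000, maxMs = 5000) =>
  page.evaluate(
    ({ quietMs, maxMs }) =>
      new Promise<string>(resolve => {
        let quietTimer: number | undefined;
        const observer = new MutationObserver(() => {
          // Each mutation restarts the quiet-period countdown.
          window.clearTimeout(quietTimer);
          quietTimer = window.setTimeout(() => {
            observer.disconnect();
            resolve('DOM stabilized.');
          }, quietMs);
        });
        // Hard timeout so a constantly mutating page cannot stall the crawl.
        window.setTimeout(() => {
          observer.disconnect();
          resolve('Hard timeout reached.');
        }, maxMs);
        observer.observe(document.documentElement, { childList: true, subtree: true });
      }),
    { quietMs, maxMs },
  );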
@@ -152,7 +152,12 @@ export const processPage = async (page, processPageParams) => {
        window.confirm('Page has been excluded, would you still like to proceed with the scan?'),
      );
      if (!continueScan) {
-       urlsCrawled.userExcluded.push(pageUrl);
+       urlsCrawled.userExcluded.push({
+         url: pageUrl,
+         pageTitle: pageUrl,
+         actualUrl: pageUrl,
+       });
+
        return;
      }
    }
@@ -396,7 +401,7 @@ export const initNewPage = async (page, pageClosePromises, processPageParams, pa
    // eslint-disable-next-line no-underscore-dangle
    const pageId = page._guid;
  
-   page.on('dialog', () => {});
+   page.on('dialog', () => { });
  
    const pageClosePromise = new Promise(resolve => {
      page.on('close', () => {