@govtechsg/oobee 0.10.29 → 0.10.33
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/exclusions.txt +2 -1
- package/package.json +1 -1
- package/src/combine.ts +3 -0
- package/src/constants/common.ts +62 -3
- package/src/constants/constants.ts +1 -1
- package/src/crawlers/crawlDomain.ts +17 -20
- package/src/crawlers/crawlLocalFile.ts +5 -3
- package/src/crawlers/crawlSitemap.ts +77 -22
- package/src/crawlers/custom/utils.ts +7 -2
package/exclusions.txt
CHANGED
package/package.json
CHANGED
package/src/combine.ts
CHANGED
@@ -210,6 +210,7 @@ const combineRun = async (details: Data, deviceToScan: string) => {
|
|
210
210
|
...urlsCrawledObj.error,
|
211
211
|
...urlsCrawledObj.invalid,
|
212
212
|
...urlsCrawledObj.forbidden,
|
213
|
+
...urlsCrawledObj.userExcluded,
|
213
214
|
];
|
214
215
|
const basicFormHTMLSnippet = await generateArtifacts(
|
215
216
|
randomToken,
|
@@ -240,6 +241,8 @@ const combineRun = async (details: Data, deviceToScan: string) => {
|
|
240
241
|
pagesNotScanned.length,
|
241
242
|
metadata,
|
242
243
|
);
|
244
|
+
} else {
|
245
|
+
printMessage([`No pages were scanned.`], alertMessageOptions);
|
243
246
|
}
|
244
247
|
} else {
|
245
248
|
printMessage([`No pages were scanned.`], alertMessageOptions);
|
package/src/constants/common.ts
CHANGED
@@ -1819,13 +1819,72 @@ export const urlWithoutAuth = (url: string): string => {
|
|
1819
1819
|
};
|
1820
1820
|
|
1821
1821
|
export const waitForPageLoaded = async (page, timeout = 10000) => {
|
1822
|
+
const OBSERVER_TIMEOUT = timeout; // Ensure observer timeout does not exceed the main timeout
|
1823
|
+
|
1822
1824
|
return Promise.race([
|
1823
|
-
page.waitForLoadState('load'),
|
1824
|
-
page.waitForLoadState('networkidle'),
|
1825
|
-
new Promise(resolve => setTimeout(resolve, timeout)),
|
1825
|
+
page.waitForLoadState('load'), // Ensure page load completes
|
1826
|
+
page.waitForLoadState('networkidle'), // Wait for network requests to settle
|
1827
|
+
new Promise(resolve => setTimeout(resolve, timeout)), // Hard timeout as a fallback
|
1828
|
+
page.evaluate((OBSERVER_TIMEOUT) => {
|
1829
|
+
return new Promise((resolve) => {
|
1830
|
+
// Skip mutation check for PDFs
|
1831
|
+
if (document.contentType === 'application/pdf') {
|
1832
|
+
resolve('Skipping DOM mutation check for PDF.');
|
1833
|
+
return;
|
1834
|
+
}
|
1835
|
+
|
1836
|
+
let timeout;
|
1837
|
+
let mutationCount = 0;
|
1838
|
+
const MAX_MUTATIONS = 250; // Limit max mutations
|
1839
|
+
const mutationHash = {};
|
1840
|
+
|
1841
|
+
const observer = new MutationObserver(mutationsList => {
|
1842
|
+
clearTimeout(timeout);
|
1843
|
+
|
1844
|
+
mutationCount++;
|
1845
|
+
if (mutationCount > MAX_MUTATIONS) {
|
1846
|
+
observer.disconnect();
|
1847
|
+
resolve('Too many mutations detected, exiting.');
|
1848
|
+
return;
|
1849
|
+
}
|
1850
|
+
|
1851
|
+
mutationsList.forEach(mutation => {
|
1852
|
+
if (mutation.target instanceof Element) {
|
1853
|
+
Array.from(mutation.target.attributes).forEach(attr => {
|
1854
|
+
const mutationKey = `${mutation.target.nodeName}-${attr.name}`;
|
1855
|
+
|
1856
|
+
if (mutationKey) {
|
1857
|
+
mutationHash[mutationKey] = (mutationHash[mutationKey] || 0) + 1;
|
1858
|
+
|
1859
|
+
if (mutationHash[mutationKey] >= 10) {
|
1860
|
+
observer.disconnect();
|
1861
|
+
resolve(`Repeated mutation detected for ${mutationKey}, exiting.`);
|
1862
|
+
}
|
1863
|
+
}
|
1864
|
+
});
|
1865
|
+
}
|
1866
|
+
});
|
1867
|
+
|
1868
|
+
// If no mutations occur for 1 second, resolve
|
1869
|
+
timeout = setTimeout(() => {
|
1870
|
+
observer.disconnect();
|
1871
|
+
resolve('DOM stabilized after mutations.');
|
1872
|
+
}, 1000);
|
1873
|
+
});
|
1874
|
+
|
1875
|
+
// Final timeout to avoid infinite waiting
|
1876
|
+
timeout = setTimeout(() => {
|
1877
|
+
observer.disconnect();
|
1878
|
+
resolve('Observer timeout reached, exiting.');
|
1879
|
+
}, OBSERVER_TIMEOUT);
|
1880
|
+
|
1881
|
+
observer.observe(document.documentElement, { childList: true, subtree: true, attributes: true });
|
1882
|
+
});
|
1883
|
+
}, OBSERVER_TIMEOUT), // Pass OBSERVER_TIMEOUT dynamically to the browser context
|
1826
1884
|
]);
|
1827
1885
|
};
|
1828
1886
|
|
1887
|
+
|
1829
1888
|
function isValidHttpUrl(urlString) {
|
1830
1889
|
const pattern = /^(http|https):\/\/[^ "]+$/;
|
1831
1890
|
return pattern.test(urlString);
|
@@ -186,7 +186,7 @@ export class UrlsCrawled {
|
|
186
186
|
error: { url: string }[] = [];
|
187
187
|
exceededRequests: string[] = [];
|
188
188
|
forbidden: string[] = [];
|
189
|
-
userExcluded: string[] = [];
|
189
|
+
userExcluded: { url: string; actualUrl: string; pageTitle: string }[] = [];
|
190
190
|
everything: string[] = [];
|
191
191
|
|
192
192
|
constructor(urlsCrawled?: Partial<UrlsCrawled>) {
|
@@ -125,14 +125,6 @@ const crawlDomain = async ({
|
|
125
125
|
|
126
126
|
const httpsAgent = new https.Agent({ rejectUnauthorized: false });
|
127
127
|
|
128
|
-
if (isBlacklistedUrl) {
|
129
|
-
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
130
|
-
numScanned: urlsCrawled.scanned.length,
|
131
|
-
urlScanned: url,
|
132
|
-
});
|
133
|
-
return;
|
134
|
-
}
|
135
|
-
|
136
128
|
// Boolean to omit axe scan for basic auth URL
|
137
129
|
let isBasicAuth = false;
|
138
130
|
let authHeader = '';
|
@@ -608,13 +600,13 @@ const crawlDomain = async ({
|
|
608
600
|
}
|
609
601
|
|
610
602
|
await waitForPageLoaded(page, 10000);
|
611
|
-
let actualUrl = request.url;
|
603
|
+
let actualUrl = page.url() || request.loadedUrl || request.url;
|
612
604
|
|
613
605
|
if (page.url() !== 'about:blank') {
|
614
606
|
actualUrl = page.url();
|
615
607
|
}
|
616
608
|
|
617
|
-
if (isBlacklisted(actualUrl, blacklistedPatterns) || (isUrlPdf(actualUrl) && !isScanPdfs)) {
|
609
|
+
if (!isFollowStrategy(url, actualUrl, strategy) && (isBlacklisted(actualUrl, blacklistedPatterns) || (isUrlPdf(actualUrl) && !isScanPdfs))) {
|
618
610
|
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
619
611
|
numScanned: urlsCrawled.scanned.length,
|
620
612
|
urlScanned: actualUrl,
|
@@ -683,8 +675,13 @@ const crawlDomain = async ({
|
|
683
675
|
return;
|
684
676
|
}
|
685
677
|
|
686
|
-
if (blacklistedPatterns && isSkippedUrl(actualUrl, blacklistedPatterns)) {
|
687
|
-
urlsCrawled.userExcluded.push(
|
678
|
+
if (!isFollowStrategy(url, actualUrl, strategy) && blacklistedPatterns && isSkippedUrl(actualUrl, blacklistedPatterns)) {
|
679
|
+
urlsCrawled.userExcluded.push({
|
680
|
+
url: request.url,
|
681
|
+
pageTitle: request.url,
|
682
|
+
actualUrl: actualUrl,
|
683
|
+
});
|
684
|
+
|
688
685
|
await enqueueProcess(page, enqueueLinks, browserContext);
|
689
686
|
return;
|
690
687
|
}
|
@@ -709,18 +706,18 @@ const crawlDomain = async ({
|
|
709
706
|
|
710
707
|
if (isScanHtml) {
|
711
708
|
// For deduplication, if the URL is redirected, we want to store the original URL and the redirected URL (actualUrl)
|
712
|
-
const isRedirected = !areLinksEqual(
|
709
|
+
const isRedirected = !areLinksEqual(actualUrl, request.url);
|
713
710
|
|
714
711
|
// check if redirected link is following strategy (same-domain/same-hostname)
|
715
712
|
const isLoadedUrlFollowStrategy = isFollowStrategy(
|
716
|
-
|
713
|
+
actualUrl,
|
717
714
|
request.url,
|
718
715
|
strategy,
|
719
716
|
);
|
720
717
|
if (isRedirected && !isLoadedUrlFollowStrategy) {
|
721
718
|
urlsCrawled.notScannedRedirects.push({
|
722
719
|
fromUrl: request.url,
|
723
|
-
toUrl:
|
720
|
+
toUrl: actualUrl, // i.e. actualUrl
|
724
721
|
});
|
725
722
|
return;
|
726
723
|
}
|
@@ -729,13 +726,13 @@ const crawlDomain = async ({
|
|
729
726
|
|
730
727
|
if (isRedirected) {
|
731
728
|
const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(
|
732
|
-
item => (item.actualUrl || item.url) ===
|
729
|
+
item => (item.actualUrl || item.url) === actualUrl,
|
733
730
|
);
|
734
731
|
|
735
732
|
if (isLoadedUrlInCrawledUrls) {
|
736
733
|
urlsCrawled.notScannedRedirects.push({
|
737
734
|
fromUrl: request.url,
|
738
|
-
toUrl:
|
735
|
+
toUrl: actualUrl, // i.e. actualUrl
|
739
736
|
});
|
740
737
|
return;
|
741
738
|
}
|
@@ -750,16 +747,16 @@ const crawlDomain = async ({
|
|
750
747
|
urlsCrawled.scanned.push({
|
751
748
|
url: urlWithoutAuth(request.url),
|
752
749
|
pageTitle: results.pageTitle,
|
753
|
-
actualUrl:
|
750
|
+
actualUrl: actualUrl, // i.e. actualUrl
|
754
751
|
});
|
755
752
|
|
756
753
|
urlsCrawled.scannedRedirects.push({
|
757
754
|
fromUrl: urlWithoutAuth(request.url),
|
758
|
-
toUrl:
|
755
|
+
toUrl: actualUrl, // i.e. actualUrl
|
759
756
|
});
|
760
757
|
|
761
758
|
results.url = request.url;
|
762
|
-
results.actualUrl =
|
759
|
+
results.actualUrl = actualUrl;
|
763
760
|
await dataset.pushData(results);
|
764
761
|
}
|
765
762
|
} else {
|
@@ -153,6 +153,8 @@ const crawlLocalFile = async (
|
|
153
153
|
await page.goto(request.url);
|
154
154
|
const results = await runAxeScript({ includeScreenshots, page, randomToken });
|
155
155
|
|
156
|
+
const actualUrl = page.url() || request.loadedUrl || request.url;
|
157
|
+
|
156
158
|
guiInfoLog(guiInfoStatusTypes.SCANNED, {
|
157
159
|
numScanned: urlsCrawled.scanned.length,
|
158
160
|
urlScanned: request.url,
|
@@ -161,16 +163,16 @@ const crawlLocalFile = async (
|
|
161
163
|
urlsCrawled.scanned.push({
|
162
164
|
url: request.url,
|
163
165
|
pageTitle: results.pageTitle,
|
164
|
-
actualUrl:
|
166
|
+
actualUrl: actualUrl, // i.e. actualUrl
|
165
167
|
});
|
166
168
|
|
167
169
|
urlsCrawled.scannedRedirects.push({
|
168
170
|
fromUrl: request.url,
|
169
|
-
toUrl:
|
171
|
+
toUrl: actualUrl, // i.e. actualUrl
|
170
172
|
});
|
171
173
|
|
172
174
|
results.url = request.url;
|
173
|
-
|
175
|
+
results.actualUrl = actualUrl;
|
174
176
|
|
175
177
|
await dataset.pushData(results);
|
176
178
|
} else {
|
@@ -18,7 +18,7 @@ import {
|
|
18
18
|
waitForPageLoaded,
|
19
19
|
isFilePath,
|
20
20
|
} from '../constants/common.js';
|
21
|
-
import { areLinksEqual, isWhitelistedContentType } from '../utils.js';
|
21
|
+
import { areLinksEqual, isWhitelistedContentType, isFollowStrategy } from '../utils.js';
|
22
22
|
import { handlePdfDownload, runPdfScan, mapPdfScanResults } from './pdfScanFunc.js';
|
23
23
|
import { guiInfoLog } from '../logs.js';
|
24
24
|
|
@@ -161,21 +161,67 @@ const crawlSitemap = async (
|
|
161
161
|
],
|
162
162
|
},
|
163
163
|
requestList,
|
164
|
+
postNavigationHooks: [
|
165
|
+
async ({ page, request }) => {
|
166
|
+
try {
|
167
|
+
// Wait for a quiet period in the DOM, but with safeguards
|
168
|
+
await page.evaluate(() => {
|
169
|
+
return new Promise((resolve) => {
|
170
|
+
let timeout;
|
171
|
+
let mutationCount = 0;
|
172
|
+
const MAX_MUTATIONS = 250; // Prevent infinite mutations
|
173
|
+
const OBSERVER_TIMEOUT = 5000; // Hard timeout to exit
|
174
|
+
|
175
|
+
const observer = new MutationObserver(() => {
|
176
|
+
clearTimeout(timeout);
|
177
|
+
|
178
|
+
mutationCount++;
|
179
|
+
if (mutationCount > MAX_MUTATIONS) {
|
180
|
+
observer.disconnect();
|
181
|
+
resolve('Too many mutations detected, exiting.');
|
182
|
+
return;
|
183
|
+
}
|
184
|
+
|
185
|
+
timeout = setTimeout(() => {
|
186
|
+
observer.disconnect();
|
187
|
+
resolve('DOM stabilized after mutations.');
|
188
|
+
}, 1000);
|
189
|
+
});
|
190
|
+
|
191
|
+
timeout = setTimeout(() => {
|
192
|
+
observer.disconnect();
|
193
|
+
resolve('Observer timeout reached, exiting.');
|
194
|
+
}, OBSERVER_TIMEOUT); // Ensure the observer stops after X seconds
|
195
|
+
|
196
|
+
observer.observe(document.documentElement, { childList: true, subtree: true });
|
197
|
+
|
198
|
+
});
|
199
|
+
});
|
200
|
+
} catch (err) {
|
201
|
+
// Handle page navigation errors gracefully
|
202
|
+
if (err.message.includes('was destroyed')) {
|
203
|
+
return; // Page navigated or closed, no need to handle
|
204
|
+
}
|
205
|
+
throw err; // Rethrow unknown errors
|
206
|
+
}
|
207
|
+
},
|
208
|
+
],
|
209
|
+
|
164
210
|
preNavigationHooks: isBasicAuth
|
165
211
|
? [
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
212
|
+
async ({ page }) => {
|
213
|
+
await page.setExtraHTTPHeaders({
|
214
|
+
Authorization: authHeader,
|
215
|
+
...extraHTTPHeaders,
|
216
|
+
});
|
217
|
+
},
|
218
|
+
]
|
173
219
|
: [
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
220
|
+
async () => {
|
221
|
+
preNavigationHooks(extraHTTPHeaders);
|
222
|
+
// insert other code here
|
223
|
+
},
|
224
|
+
],
|
179
225
|
requestHandlerTimeoutSecs: 90,
|
180
226
|
requestHandler: async ({ page, request, response, sendRequest }) => {
|
181
227
|
await waitForPageLoaded(page, 10000);
|
@@ -191,7 +237,7 @@ const crawlSitemap = async (
|
|
191
237
|
request.url = currentUrl.href;
|
192
238
|
}
|
193
239
|
|
194
|
-
const actualUrl = request.loadedUrl || request.url;
|
240
|
+
const actualUrl = page.url() || request.loadedUrl || request.url;
|
195
241
|
|
196
242
|
if (urlsCrawled.scanned.length >= maxRequestsPerCrawl) {
|
197
243
|
crawler.autoscaledPool.abort();
|
@@ -223,8 +269,17 @@ const crawlSitemap = async (
|
|
223
269
|
const contentType = response.headers()['content-type'];
|
224
270
|
const status = response.status();
|
225
271
|
|
226
|
-
if (blacklistedPatterns && isSkippedUrl(actualUrl, blacklistedPatterns)) {
|
227
|
-
urlsCrawled.userExcluded.push(
|
272
|
+
if (blacklistedPatterns && !isFollowStrategy(actualUrl, request.url, "same-hostname") && isSkippedUrl(actualUrl, blacklistedPatterns)) {
|
273
|
+
urlsCrawled.userExcluded.push({
|
274
|
+
url: request.url,
|
275
|
+
pageTitle: request.url,
|
276
|
+
actualUrl: actualUrl,
|
277
|
+
});
|
278
|
+
|
279
|
+
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
280
|
+
numScanned: urlsCrawled.scanned.length,
|
281
|
+
urlScanned: request.url,
|
282
|
+
});
|
228
283
|
return;
|
229
284
|
}
|
230
285
|
|
@@ -255,16 +310,16 @@ const crawlSitemap = async (
|
|
255
310
|
urlScanned: request.url,
|
256
311
|
});
|
257
312
|
|
258
|
-
const isRedirected = !areLinksEqual(
|
313
|
+
const isRedirected = !areLinksEqual(page.url(), request.url);
|
259
314
|
if (isRedirected) {
|
260
315
|
const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(
|
261
|
-
item => (item.actualUrl || item.url.href) ===
|
316
|
+
item => (item.actualUrl || item.url.href) === page,
|
262
317
|
);
|
263
318
|
|
264
319
|
if (isLoadedUrlInCrawledUrls) {
|
265
320
|
urlsCrawled.notScannedRedirects.push({
|
266
321
|
fromUrl: request.url,
|
267
|
-
toUrl:
|
322
|
+
toUrl: actualUrl, // i.e. actualUrl
|
268
323
|
});
|
269
324
|
return;
|
270
325
|
}
|
@@ -272,16 +327,16 @@ const crawlSitemap = async (
|
|
272
327
|
urlsCrawled.scanned.push({
|
273
328
|
url: urlWithoutAuth(request.url),
|
274
329
|
pageTitle: results.pageTitle,
|
275
|
-
actualUrl:
|
330
|
+
actualUrl: actualUrl, // i.e. actualUrl
|
276
331
|
});
|
277
332
|
|
278
333
|
urlsCrawled.scannedRedirects.push({
|
279
334
|
fromUrl: urlWithoutAuth(request.url),
|
280
|
-
toUrl:
|
335
|
+
toUrl: actualUrl,
|
281
336
|
});
|
282
337
|
|
283
338
|
results.url = request.url;
|
284
|
-
results.actualUrl =
|
339
|
+
results.actualUrl = actualUrl;
|
285
340
|
} else {
|
286
341
|
urlsCrawled.scanned.push({
|
287
342
|
url: urlWithoutAuth(request.url),
|
@@ -152,7 +152,12 @@ export const processPage = async (page, processPageParams) => {
|
|
152
152
|
window.confirm('Page has been excluded, would you still like to proceed with the scan?'),
|
153
153
|
);
|
154
154
|
if (!continueScan) {
|
155
|
-
urlsCrawled.userExcluded.push(
|
155
|
+
urlsCrawled.userExcluded.push({
|
156
|
+
url: pageUrl,
|
157
|
+
pageTitle: pageUrl,
|
158
|
+
actualUrl: pageUrl,
|
159
|
+
});
|
160
|
+
|
156
161
|
return;
|
157
162
|
}
|
158
163
|
}
|
@@ -396,7 +401,7 @@ export const initNewPage = async (page, pageClosePromises, processPageParams, pa
|
|
396
401
|
// eslint-disable-next-line no-underscore-dangle
|
397
402
|
const pageId = page._guid;
|
398
403
|
|
399
|
-
page.on('dialog', () => {});
|
404
|
+
page.on('dialog', () => { });
|
400
405
|
|
401
406
|
const pageClosePromise = new Promise(resolve => {
|
402
407
|
page.on('close', () => {
|