@govtechsg/oobee 0.10.39 → 0.10.43
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/docker-test.yml +1 -1
- package/README.md +2 -0
- package/REPORTS.md +431 -0
- package/package.json +3 -2
- package/src/cli.ts +2 -11
- package/src/constants/common.ts +68 -52
- package/src/constants/constants.ts +81 -1
- package/src/constants/oobeeAi.ts +6 -6
- package/src/constants/questions.ts +3 -2
- package/src/crawlers/commonCrawlerFunc.ts +45 -16
- package/src/crawlers/crawlDomain.ts +83 -102
- package/src/crawlers/crawlIntelligentSitemap.ts +21 -19
- package/src/crawlers/crawlSitemap.ts +121 -110
- package/src/crawlers/custom/findElementByCssSelector.ts +1 -1
- package/src/crawlers/custom/flagUnlabelledClickableElements.ts +593 -558
- package/src/crawlers/custom/xPathToCss.ts +10 -10
- package/src/crawlers/pdfScanFunc.ts +67 -26
- package/src/crawlers/runCustom.ts +1 -1
- package/src/index.ts +3 -4
- package/src/logs.ts +1 -1
- package/src/mergeAxeResults.ts +305 -242
- package/src/npmIndex.ts +12 -8
- package/src/screenshotFunc/htmlScreenshotFunc.ts +8 -20
- package/src/screenshotFunc/pdfScreenshotFunc.ts +34 -1
- package/src/types/text-readability.d.ts +3 -0
- package/src/types/types.ts +1 -1
- package/src/utils.ts +340 -50
- package/src/xPathToCss.ts +0 -186
- package/src/xPathToCssCypress.ts +0 -178
package/src/crawlers/crawlSitemap.ts

```diff
@@ -1,4 +1,4 @@
-import crawlee, { Request, RequestList } from 'crawlee';
+import crawlee, { LaunchContext, Request, RequestList } from 'crawlee';
 import printMessage from 'print-message';
 import fs from 'fs';
 import {
@@ -8,7 +8,7 @@ import {
   isUrlPdf,
 } from './commonCrawlerFunc.js';
 
-import constants, { guiInfoStatusTypes } from '../constants/constants.js';
+import constants, { STATUS_CODE_METADATA, guiInfoStatusTypes, UrlsCrawled } from '../constants/constants.js';
 import {
   getLinksFromSitemap,
   getPlaywrightLaunchOptions,
```
```diff
@@ -22,31 +22,32 @@ import {
 import { areLinksEqual, isWhitelistedContentType, isFollowStrategy } from '../utils.js';
 import { handlePdfDownload, runPdfScan, mapPdfScanResults } from './pdfScanFunc.js';
 import { guiInfoLog } from '../logs.js';
+import { ViewportSettingsClass } from '../combine.js';
 
 const crawlSitemap = async (
-  sitemapUrl,
-  randomToken,
-  host,
-  viewportSettings,
-  maxRequestsPerCrawl,
-  browser,
-  userDataDirectory,
-  specifiedMaxConcurrency,
-  fileTypes,
-  blacklistedPatterns,
-  includeScreenshots,
-  extraHTTPHeaders,
+  sitemapUrl: string,
+  randomToken: string,
+  _host: string,
+  viewportSettings: ViewportSettingsClass,
+  maxRequestsPerCrawl: number,
+  browser: string,
+  userDataDirectory: string,
+  specifiedMaxConcurrency: number,
+  fileTypes: string,
+  blacklistedPatterns: string[],
+  includeScreenshots: boolean,
+  extraHTTPHeaders: Record<string, string>,
   fromCrawlIntelligentSitemap = false, // optional
-  userUrlInputFromIntelligent = null, // optional
-  datasetFromIntelligent = null, // optional
-  urlsCrawledFromIntelligent = null, // optional
+  userUrlInputFromIntelligent: string = null, // optional
+  datasetFromIntelligent: crawlee.Dataset = null, // optional
+  urlsCrawledFromIntelligent: UrlsCrawled = null, // optional
   crawledFromLocalFile = false, // optional
 ) => {
-  let dataset;
-  let urlsCrawled;
+  let dataset: crawlee.Dataset;
+  let urlsCrawled: UrlsCrawled;
 
   // Boolean to omit axe scan for basic auth URL
-  let isBasicAuth;
+  let isBasicAuth: boolean;
   let basicAuthPage = 0;
   let finalLinks = [];
   let authHeader = '';
```
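The signature change is the heart of this file's diff: every parameter gains an explicit type, the unused `host` becomes `_host`, and the crawl bookkeeping moves to the shared `UrlsCrawled` type. The definition of `UrlsCrawled` is not part of this section; judging from the push calls in the hunks below, its shape is roughly the following sketch (field types inferred here, not the package's actual declaration):

```ts
// Rough sketch of UrlsCrawled, inferred from how this diff uses it; the real
// definition lives in package/src/constants/constants.ts.
interface PageEntry {
  url: string;
  pageTitle?: string;
  actualUrl?: string;
  metadata?: string; // descriptive text looked up from STATUS_CODE_METADATA
  httpStatusCode?: number; // 0 when no real HTTP response was received
}

interface UrlsCrawled {
  scanned: PageEntry[];
  scannedRedirects: { fromUrl: string; toUrl: string }[];
  notScannedRedirects: { fromUrl: string; toUrl: string }[];
  userExcluded: PageEntry[];
  invalid: PageEntry[];
  error: PageEntry[];
}
```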
```diff
@@ -119,8 +120,8 @@ const crawlSitemap = async (
     basicAuthPage = -2;
   }
 
-  const pdfDownloads = [];
-  const uuidToPdfMapping = {};
+  const pdfDownloads: Promise<void>[] = [];
+  const uuidToPdfMapping: Record<string, string> = {};
   const isScanHtml = ['all', 'html-only'].includes(fileTypes);
   const isScanPdfs = ['all', 'pdf-only'].includes(fileTypes);
   const { playwrightDeviceDetailsObject } = viewportSettings;
```
```diff
@@ -152,7 +153,7 @@
     browserPoolOptions: {
       useFingerprints: false,
       preLaunchHooks: [
-        async (pageId, launchContext) => {
+        async (_pageId: string, launchContext: LaunchContext) => {
           launchContext.launchOptions = {
             ...launchContext.launchOptions,
             bypassCSP: true,
```
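Typing the hook makes the previously inferred parameters explicit: crawlee invokes pre-launch hooks with a page id and a `LaunchContext`, and the underscore on `_pageId` marks it as intentionally unused. A minimal standalone sketch of the same pattern (option values illustrative):

```ts
import { LaunchContext } from 'crawlee';

// A typed pre-launch hook: mutate launchContext.launchOptions before the
// browser starts. Mirrors the bypassCSP override set in the diff above.
const preLaunchHook = async (_pageId: string, launchContext: LaunchContext): Promise<void> => {
  launchContext.launchOptions = {
    ...launchContext.launchOptions,
    bypassCSP: true,
  };
};
```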
```diff
@@ -164,39 +165,43 @@
     },
     requestList,
     postNavigationHooks: [
-
+
+      async ({ page }) => {
         try {
           // Wait for a quiet period in the DOM, but with safeguards
           await page.evaluate(() => {
-            return new Promise(
+            return new Promise(resolve => {
               let timeout;
               let mutationCount = 0;
-              const MAX_MUTATIONS
-              const OBSERVER_TIMEOUT
-
+              const MAX_MUTATIONS = 250; // stop if things never quiet down
+              const OBSERVER_TIMEOUT = 5000; // hard cap on total wait
+
               const observer = new MutationObserver(() => {
                 clearTimeout(timeout);
-
+
                 mutationCount++;
                 if (mutationCount > MAX_MUTATIONS) {
                   observer.disconnect();
-                  resolve('Too many mutations
+                  resolve('Too many mutations, exiting.');
                   return;
                 }
-
+
+                // restart quiet-period timer
                 timeout = setTimeout(() => {
                   observer.disconnect();
-                  resolve('DOM stabilized
+                  resolve('DOM stabilized.');
                 }, 1000);
               });
-
+
+              // overall timeout in case the page never settles
               timeout = setTimeout(() => {
                 observer.disconnect();
-                resolve('Observer timeout reached
-              }, OBSERVER_TIMEOUT);
-
-
-
+                resolve('Observer timeout reached.');
+              }, OBSERVER_TIMEOUT);
+
+              // **HERE**: select the real DOM node inside evaluate
+              const root = document.documentElement;
+              observer.observe(root, { childList: true, subtree: true });
             });
           });
         } catch (err) {
```
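The rewritten hook completes the half-written quiet-period logic: it resolves after a 1-second lull in DOM mutations, bails out after 250 mutations, and gives up entirely after 5 seconds. Extracted from the crawler, the same technique looks roughly like this (a sketch meant to run in the browser context via Playwright's `page.evaluate`; the function name is invented, the constants mirror the diff):

```ts
// Sketch of the quiet-period wait used in the postNavigationHooks above.
// Usage from Playwright: await page.evaluate(waitForDomToSettle);
function waitForDomToSettle(): Promise<string> {
  return new Promise(resolve => {
    const MAX_MUTATIONS = 250; // bail out if the page never quiets down
    const OBSERVER_TIMEOUT = 5000; // hard cap on the total wait
    const QUIET_PERIOD = 1000; // resolve after 1s with no mutations

    let mutationCount = 0;
    let timeout: ReturnType<typeof setTimeout>;

    const observer = new MutationObserver(() => {
      clearTimeout(timeout); // a mutation arrived: restart the quiet-period timer
      mutationCount += 1;
      if (mutationCount > MAX_MUTATIONS) {
        observer.disconnect();
        resolve('Too many mutations, exiting.');
        return;
      }
      timeout = setTimeout(() => {
        observer.disconnect();
        resolve('DOM stabilized.');
      }, QUIET_PERIOD);
    });

    // overall timeout in case the page never settles
    timeout = setTimeout(() => {
      observer.disconnect();
      resolve('Observer timeout reached.');
    }, OBSERVER_TIMEOUT);

    observer.observe(document.documentElement, { childList: true, subtree: true });
  });
}
```

Note that, as in the diff, the hard 5-second cap shares the `timeout` handle with the rolling quiet-period timer, so the first mutation replaces the overall cap with the 1-second lull timer; from then on the mutation budget is what bounds a page that never settles.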
```diff
@@ -207,6 +212,7 @@ const crawlSitemap = async (
           throw err; // Rethrow unknown errors
         }
       },
+
     ],
 
     preNavigationHooks: isBasicAuth
@@ -246,16 +252,18 @@ const crawlSitemap = async (
         return;
       }
 
-      if (
+      if (request.skipNavigation && actualUrl === "about:blank") {
         if (!isScanPdfs) {
           guiInfoLog(guiInfoStatusTypes.SKIPPED, {
             numScanned: urlsCrawled.scanned.length,
             urlScanned: request.url,
           });
-          urlsCrawled.
+          urlsCrawled.userExcluded.push({
             url: request.url,
             pageTitle: request.url,
-            actualUrl:
+            actualUrl: request.url, // because about:blank is not useful
+            metadata: STATUS_CODE_METADATA[1],
+            httpStatusCode: 0,
           });
 
           return;
```
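`STATUS_CODE_METADATA`, new in `constants.ts` this release (the +81 lines in the file list above), is keyed by HTTP status code, and this hunk shows the convention for pages with no meaningful response: a small sentinel key (here `1`) paired with `httpStatusCode: 0`. The actual table is not in this section; a hypothetical sketch of its shape (all strings invented for illustration):

```ts
// Hypothetical sketch only; the real table is added to
// package/src/constants/constants.ts in this release. Keys 0-2 appear in the
// diff as sentinels for non-HTTP outcomes, and 599 as a catch-all.
const STATUS_CODE_METADATA: Record<number, string> = {
  0: 'Excluded by scan settings', // invented label
  1: 'Not a scannable page',      // invented label
  2: 'No response received',      // invented label
  404: '404 - Not Found',         // invented label
  599: 'Unknown error',           // invented label
};
```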
```diff
@@ -276,85 +284,64 @@ const crawlSitemap = async (
       const contentType = response?.headers?.()['content-type'] || '';
       const status = response ? response.status() : 0;
 
-      if (
-
-
-
-
-
-        guiInfoLog(guiInfoStatusTypes.SKIPPED, {
-          numScanned: urlsCrawled.scanned.length,
-          urlScanned: request.url,
-        });
-        return;
-      }
+      if (basicAuthPage < 0) {
+        basicAuthPage += 1;
+      } else if (isScanHtml && status < 300 && isWhitelistedContentType(contentType)) {
+        const isRedirected = !areLinksEqual(page.url(), request.url);
+        const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(
+          item => (item.actualUrl || item.url) === page.url(),
+        );
 
-
-
-
-
-
-
-
-      }
+        if (isRedirected && isLoadedUrlInCrawledUrls) {
+          urlsCrawled.notScannedRedirects.push({
+            fromUrl: request.url,
+            toUrl: actualUrl, // i.e. actualUrl
+          });
+          return;
+        }
 
-
-
-
-
-
-
-
-
-
-
+        // This logic is different from crawlDomain, as it also checks if the pae is redirected before checking if it is excluded using exclusions.txt
+        if (
+          isRedirected &&
+          blacklistedPatterns &&
+          isSkippedUrl(actualUrl, blacklistedPatterns)
+        ) {
+          urlsCrawled.userExcluded.push({
+            url: request.url,
+            pageTitle: request.url,
+            actualUrl: actualUrl,
+            metadata: STATUS_CODE_METADATA[0],
+            httpStatusCode: 0,
+          });
 
-
-
+          guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+            numScanned: urlsCrawled.scanned.length,
+            urlScanned: request.url,
+          });
+          return;
+        }
 
-      if (basicAuthPage < 0) {
-        basicAuthPage += 1;
-      } else if (isScanHtml && status === 200 && isWhitelistedContentType(contentType)) {
         const results = await runAxeScript({ includeScreenshots, page, randomToken });
+
         guiInfoLog(guiInfoStatusTypes.SCANNED, {
           numScanned: urlsCrawled.scanned.length,
           urlScanned: request.url,
         });
 
-
-
-
-
-
-
-        if (isLoadedUrlInCrawledUrls) {
-          urlsCrawled.notScannedRedirects.push({
-            fromUrl: request.url,
-            toUrl: actualUrl, // i.e. actualUrl
-          });
-          return;
-        }
+        urlsCrawled.scanned.push({
+          url: urlWithoutAuth(request.url),
+          pageTitle: results.pageTitle,
+          actualUrl: actualUrl, // i.e. actualUrl
+        });
 
-
-
-
-
-        });
+        urlsCrawled.scannedRedirects.push({
+          fromUrl: urlWithoutAuth(request.url),
+          toUrl: actualUrl,
+        });
 
-
-
-          toUrl: actualUrl,
-        });
+        results.url = request.url;
+        results.actualUrl = actualUrl;
 
-        results.url = request.url;
-        results.actualUrl = actualUrl;
-      } else {
-        urlsCrawled.scanned.push({
-          url: urlWithoutAuth(request.url),
-          pageTitle: results.pageTitle,
-        });
-      }
         await dataset.pushData(results);
       } else {
         guiInfoLog(guiInfoStatusTypes.SKIPPED, {
```
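The scanned branch now also accepts any non-redirect success (`status < 300` rather than `status === 200`) and splits redirect bookkeeping three ways: already-seen redirect targets go to `notScannedRedirects`, while fresh pages are recorded in both `scanned` and `scannedRedirects`. The detection hangs on `areLinksEqual` from `utils.js`, which this section does not show; one plausible reading (an assumption, not the package's code) is URL normalisation plus comparison:

```ts
// Plausible sketch of areLinksEqual; the real helper lives in
// package/src/utils.ts and may normalise differently.
const areLinksEqual = (link1: string, link2: string): boolean => {
  try {
    const a = new URL(link1);
    const b = new URL(link2);
    const stripSlash = (p: string) => p.replace(/\/+$/, '') || '/';
    return (
      a.origin === b.origin &&
      stripSlash(a.pathname) === stripSlash(b.pathname) &&
      a.search === b.search
    );
  } catch {
    return link1 === link2; // fall back to plain string comparison
  }
};
```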
```diff
@@ -363,11 +350,23 @@ const crawlSitemap = async (
         });
 
         if (isScanHtml) {
-
+          // carry through the HTTP status metadata
+          const status = response?.status();
+          const metadata = typeof status === 'number'
+            ? (STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599])
+            : STATUS_CODE_METADATA[2];
+
+          urlsCrawled.invalid.push({
+            actualUrl,
+            url: request.url,
+            pageTitle: request.url,
+            metadata,
+            httpStatusCode: typeof status === 'number' ? status : 0
+          });
         }
       }
     },
-    failedRequestHandler: async ({ request }) => {
+    failedRequestHandler: async ({ request, response, error }) => {
      if (isBasicAuth && request.url) {
        request.url = `${request.url.split('://')[0]}://${request.url.split('@')[1]}`;
      }
```
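The same three-way fallback appears both here and in the `failedRequestHandler` hunk below: a recognised numeric status maps to its table entry, an unrecognised one falls back to `STATUS_CODE_METADATA[599]`, and a missing response falls back to the sentinel at index 2. Factored into a helper it would read (a sketch; the package inlines the expression instead):

```ts
declare const STATUS_CODE_METADATA: Record<number, string>; // from constants.ts

// Sketch of the shared fallback used by both handlers in this diff.
const resolveStatusMetadata = (status: number | undefined): string =>
  typeof status === 'number'
    ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599] // unknown code -> catch-all
    : STATUS_CODE_METADATA[2]; // no response at all
```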
```diff
@@ -381,7 +380,19 @@ const crawlSitemap = async (
         numScanned: urlsCrawled.scanned.length,
         urlScanned: request.url,
       });
-
+
+      const status = response?.status();
+      const metadata = typeof status === 'number'
+        ? (STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599])
+        : STATUS_CODE_METADATA[2];
+
+      urlsCrawled.error.push({
+        url: request.url,
+        pageTitle: request.url,
+        actualUrl: request.url,
+        metadata,
+        httpStatusCode: typeof status === 'number' ? status : 0
+      });
       crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
     },
     maxRequestsPerCrawl: Infinity,
```
package/src/crawlers/custom/findElementByCssSelector.ts

```diff
@@ -16,7 +16,7 @@ export function findElementByCssSelector(cssSelector: string): string | null {
 
   // Handle Shadow DOM if the element is not found
   if (!element) {
-    const shadowRoots = [];
+    const shadowRoots: ShadowRoot[] = [];
     const allElements = document.querySelectorAll('*');
 
     // Look for elements with shadow roots
```
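The typed accumulator backs the fallback that hunts for the element inside shadow DOM when the plain `querySelector` misses. Done standalone, the collection step looks roughly like this (a sketch; the package's traversal may differ in detail):

```ts
// Sketch: collect open shadow roots so a CSS selector can be retried inside
// them. Closed shadow roots are invisible to this walk by design.
function collectShadowRoots(root: Document | ShadowRoot = document): ShadowRoot[] {
  const shadowRoots: ShadowRoot[] = [];
  for (const el of Array.from(root.querySelectorAll('*'))) {
    if (el.shadowRoot) {
      shadowRoots.push(el.shadowRoot);
      // recurse so roots nested inside other shadow trees are found too
      shadowRoots.push(...collectShadowRoots(el.shadowRoot));
    }
  }
  return shadowRoots;
}
```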
|