@govtechsg/oobee 0.10.42 → 0.10.43
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/REPORTS.md +71 -2
- package/package.json +3 -2
- package/src/cli.ts +2 -11
- package/src/constants/common.ts +68 -52
- package/src/constants/constants.ts +81 -1
- package/src/constants/oobeeAi.ts +6 -6
- package/src/constants/questions.ts +3 -2
- package/src/crawlers/commonCrawlerFunc.ts +16 -15
- package/src/crawlers/crawlDomain.ts +82 -84
- package/src/crawlers/crawlIntelligentSitemap.ts +21 -19
- package/src/crawlers/crawlSitemap.ts +120 -109
- package/src/crawlers/custom/findElementByCssSelector.ts +1 -1
- package/src/crawlers/custom/flagUnlabelledClickableElements.ts +8 -8
- package/src/crawlers/custom/xPathToCss.ts +10 -10
- package/src/crawlers/runCustom.ts +1 -1
- package/src/index.ts +3 -4
- package/src/logs.ts +1 -1
- package/src/mergeAxeResults.ts +3 -5
- package/src/npmIndex.ts +12 -8
- package/src/screenshotFunc/htmlScreenshotFunc.ts +7 -19
- package/src/types/text-readability.d.ts +3 -0
- package/src/types/types.ts +1 -1
- package/src/utils.ts +128 -114
- package/src/xPathToCss.ts +0 -186
- package/src/xPathToCssCypress.ts +0 -178
@@ -1,4 +1,4 @@
|
|
1
|
-
import crawlee, { Request, RequestList } from 'crawlee';
|
1
|
+
import crawlee, { LaunchContext, Request, RequestList } from 'crawlee';
|
2
2
|
import printMessage from 'print-message';
|
3
3
|
import fs from 'fs';
|
4
4
|
import {
|
@@ -8,7 +8,7 @@ import {
|
|
8
8
|
isUrlPdf,
|
9
9
|
} from './commonCrawlerFunc.js';
|
10
10
|
|
11
|
-
import constants, { guiInfoStatusTypes } from '../constants/constants.js';
|
11
|
+
import constants, { STATUS_CODE_METADATA, guiInfoStatusTypes, UrlsCrawled } from '../constants/constants.js';
|
12
12
|
import {
|
13
13
|
getLinksFromSitemap,
|
14
14
|
getPlaywrightLaunchOptions,
|
@@ -22,31 +22,32 @@ import {
|
|
22
22
|
import { areLinksEqual, isWhitelistedContentType, isFollowStrategy } from '../utils.js';
|
23
23
|
import { handlePdfDownload, runPdfScan, mapPdfScanResults } from './pdfScanFunc.js';
|
24
24
|
import { guiInfoLog } from '../logs.js';
|
25
|
+
import { ViewportSettingsClass } from '../combine.js';
|
25
26
|
|
26
27
|
const crawlSitemap = async (
|
27
|
-
sitemapUrl,
|
28
|
-
randomToken,
|
29
|
-
|
30
|
-
viewportSettings,
|
31
|
-
maxRequestsPerCrawl,
|
32
|
-
browser,
|
33
|
-
userDataDirectory,
|
34
|
-
specifiedMaxConcurrency,
|
35
|
-
fileTypes,
|
36
|
-
blacklistedPatterns,
|
37
|
-
includeScreenshots,
|
38
|
-
extraHTTPHeaders,
|
28
|
+
sitemapUrl: string,
|
29
|
+
randomToken: string,
|
30
|
+
_host: string,
|
31
|
+
viewportSettings: ViewportSettingsClass,
|
32
|
+
maxRequestsPerCrawl: number,
|
33
|
+
browser: string,
|
34
|
+
userDataDirectory: string,
|
35
|
+
specifiedMaxConcurrency: number,
|
36
|
+
fileTypes: string,
|
37
|
+
blacklistedPatterns: string[],
|
38
|
+
includeScreenshots: boolean,
|
39
|
+
extraHTTPHeaders: Record<string, string>,
|
39
40
|
fromCrawlIntelligentSitemap = false, // optional
|
40
|
-
userUrlInputFromIntelligent = null, // optional
|
41
|
-
datasetFromIntelligent = null, // optional
|
42
|
-
urlsCrawledFromIntelligent = null, // optional
|
41
|
+
userUrlInputFromIntelligent: string = null, // optional
|
42
|
+
datasetFromIntelligent: crawlee.Dataset = null, // optional
|
43
|
+
urlsCrawledFromIntelligent: UrlsCrawled = null, // optional
|
43
44
|
crawledFromLocalFile = false, // optional
|
44
45
|
) => {
|
45
|
-
let dataset;
|
46
|
-
let urlsCrawled;
|
46
|
+
let dataset: crawlee.Dataset;
|
47
|
+
let urlsCrawled: UrlsCrawled;
|
47
48
|
|
48
49
|
// Boolean to omit axe scan for basic auth URL
|
49
|
-
let isBasicAuth;
|
50
|
+
let isBasicAuth: boolean;
|
50
51
|
let basicAuthPage = 0;
|
51
52
|
let finalLinks = [];
|
52
53
|
let authHeader = '';
|
@@ -119,8 +120,8 @@ const crawlSitemap = async (
|
|
119
120
|
basicAuthPage = -2;
|
120
121
|
}
|
121
122
|
|
122
|
-
const pdfDownloads = [];
|
123
|
-
const uuidToPdfMapping = {};
|
123
|
+
const pdfDownloads: Promise<void>[] = [];
|
124
|
+
const uuidToPdfMapping: Record<string, string> = {};
|
124
125
|
const isScanHtml = ['all', 'html-only'].includes(fileTypes);
|
125
126
|
const isScanPdfs = ['all', 'pdf-only'].includes(fileTypes);
|
126
127
|
const { playwrightDeviceDetailsObject } = viewportSettings;
|
@@ -152,7 +153,7 @@ const crawlSitemap = async (
|
|
152
153
|
browserPoolOptions: {
|
153
154
|
useFingerprints: false,
|
154
155
|
preLaunchHooks: [
|
155
|
-
async (
|
156
|
+
async (_pageId: string, launchContext: LaunchContext) => {
|
156
157
|
launchContext.launchOptions = {
|
157
158
|
...launchContext.launchOptions,
|
158
159
|
bypassCSP: true,
|
@@ -164,39 +165,43 @@ const crawlSitemap = async (
|
|
164
165
|
},
|
165
166
|
requestList,
|
166
167
|
postNavigationHooks: [
|
167
|
-
|
168
|
+
|
169
|
+
async ({ page }) => {
|
168
170
|
try {
|
169
171
|
// Wait for a quiet period in the DOM, but with safeguards
|
170
172
|
await page.evaluate(() => {
|
171
|
-
return new Promise(
|
173
|
+
return new Promise(resolve => {
|
172
174
|
let timeout;
|
173
175
|
let mutationCount = 0;
|
174
|
-
const MAX_MUTATIONS
|
175
|
-
const OBSERVER_TIMEOUT
|
176
|
-
|
176
|
+
const MAX_MUTATIONS = 250; // stop if things never quiet down
|
177
|
+
const OBSERVER_TIMEOUT = 5000; // hard cap on total wait
|
178
|
+
|
177
179
|
const observer = new MutationObserver(() => {
|
178
180
|
clearTimeout(timeout);
|
179
|
-
|
181
|
+
|
180
182
|
mutationCount++;
|
181
183
|
if (mutationCount > MAX_MUTATIONS) {
|
182
184
|
observer.disconnect();
|
183
|
-
resolve('Too many mutations
|
185
|
+
resolve('Too many mutations, exiting.');
|
184
186
|
return;
|
185
187
|
}
|
186
|
-
|
188
|
+
|
189
|
+
// restart quiet‑period timer
|
187
190
|
timeout = setTimeout(() => {
|
188
191
|
observer.disconnect();
|
189
|
-
resolve('DOM stabilized
|
192
|
+
resolve('DOM stabilized.');
|
190
193
|
}, 1000);
|
191
194
|
});
|
192
|
-
|
195
|
+
|
196
|
+
// overall timeout in case the page never settles
|
193
197
|
timeout = setTimeout(() => {
|
194
198
|
observer.disconnect();
|
195
|
-
resolve('Observer timeout reached
|
196
|
-
}, OBSERVER_TIMEOUT);
|
197
|
-
|
198
|
-
|
199
|
-
|
199
|
+
resolve('Observer timeout reached.');
|
200
|
+
}, OBSERVER_TIMEOUT);
|
201
|
+
|
202
|
+
// **HERE**: select the real DOM node inside evaluate
|
203
|
+
const root = document.documentElement;
|
204
|
+
observer.observe(root, { childList: true, subtree: true });
|
200
205
|
});
|
201
206
|
});
|
202
207
|
} catch (err) {
|
@@ -207,6 +212,7 @@ const crawlSitemap = async (
|
|
207
212
|
throw err; // Rethrow unknown errors
|
208
213
|
}
|
209
214
|
},
|
215
|
+
|
210
216
|
],
|
211
217
|
|
212
218
|
preNavigationHooks: isBasicAuth
|
@@ -252,10 +258,12 @@ const crawlSitemap = async (
|
|
252
258
|
numScanned: urlsCrawled.scanned.length,
|
253
259
|
urlScanned: request.url,
|
254
260
|
});
|
255
|
-
urlsCrawled.
|
261
|
+
urlsCrawled.userExcluded.push({
|
256
262
|
url: request.url,
|
257
263
|
pageTitle: request.url,
|
258
|
-
actualUrl:
|
264
|
+
actualUrl: request.url, // because about:blank is not useful
|
265
|
+
metadata: STATUS_CODE_METADATA[1],
|
266
|
+
httpStatusCode: 0,
|
259
267
|
});
|
260
268
|
|
261
269
|
return;
|
@@ -276,85 +284,64 @@ const crawlSitemap = async (
|
|
276
284
|
const contentType = response?.headers?.()['content-type'] || '';
|
277
285
|
const status = response ? response.status() : 0;
|
278
286
|
|
279
|
-
if (
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
287
|
-
numScanned: urlsCrawled.scanned.length,
|
288
|
-
urlScanned: request.url,
|
289
|
-
});
|
290
|
-
return;
|
291
|
-
}
|
287
|
+
if (basicAuthPage < 0) {
|
288
|
+
basicAuthPage += 1;
|
289
|
+
} else if (isScanHtml && status < 300 && isWhitelistedContentType(contentType)) {
|
290
|
+
const isRedirected = !areLinksEqual(page.url(), request.url);
|
291
|
+
const isLoadedUrlInCrawledUrls = urlsCrawled.scanned.some(
|
292
|
+
item => (item.actualUrl || item.url) === page.url(),
|
293
|
+
);
|
292
294
|
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
}
|
295
|
+
if (isRedirected && isLoadedUrlInCrawledUrls) {
|
296
|
+
urlsCrawled.notScannedRedirects.push({
|
297
|
+
fromUrl: request.url,
|
298
|
+
toUrl: actualUrl, // i.e. actualUrl
|
299
|
+
});
|
300
|
+
return;
|
301
|
+
}
|
301
302
|
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
303
|
+
// This logic is different from crawlDomain, as it also checks if the page is redirected before checking if it is excluded using exclusions.txt
|
304
|
+
if (
|
305
|
+
isRedirected &&
|
306
|
+
blacklistedPatterns &&
|
307
|
+
isSkippedUrl(actualUrl, blacklistedPatterns)
|
308
|
+
) {
|
309
|
+
urlsCrawled.userExcluded.push({
|
310
|
+
url: request.url,
|
311
|
+
pageTitle: request.url,
|
312
|
+
actualUrl: actualUrl,
|
313
|
+
metadata: STATUS_CODE_METADATA[0],
|
314
|
+
httpStatusCode: 0,
|
315
|
+
});
|
312
316
|
|
313
|
-
|
314
|
-
|
317
|
+
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
318
|
+
numScanned: urlsCrawled.scanned.length,
|
319
|
+
urlScanned: request.url,
|
320
|
+
});
|
321
|
+
return;
|
322
|
+
}
|
315
323
|
|
316
|
-
if (basicAuthPage < 0) {
|
317
|
-
basicAuthPage += 1;
|
318
|
-
} else if (isScanHtml && status === 200 && isWhitelistedContentType(contentType)) {
|
319
324
|
const results = await runAxeScript({ includeScreenshots, page, randomToken });
|
325
|
+
|
320
326
|
guiInfoLog(guiInfoStatusTypes.SCANNED, {
|
321
327
|
numScanned: urlsCrawled.scanned.length,
|
322
328
|
urlScanned: request.url,
|
323
329
|
});
|
324
330
|
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
if (isLoadedUrlInCrawledUrls) {
|
332
|
-
urlsCrawled.notScannedRedirects.push({
|
333
|
-
fromUrl: request.url,
|
334
|
-
toUrl: actualUrl, // i.e. actualUrl
|
335
|
-
});
|
336
|
-
return;
|
337
|
-
}
|
331
|
+
urlsCrawled.scanned.push({
|
332
|
+
url: urlWithoutAuth(request.url),
|
333
|
+
pageTitle: results.pageTitle,
|
334
|
+
actualUrl: actualUrl, // i.e. actualUrl
|
335
|
+
});
|
338
336
|
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
});
|
337
|
+
urlsCrawled.scannedRedirects.push({
|
338
|
+
fromUrl: urlWithoutAuth(request.url),
|
339
|
+
toUrl: actualUrl,
|
340
|
+
});
|
344
341
|
|
345
|
-
|
346
|
-
|
347
|
-
toUrl: actualUrl,
|
348
|
-
});
|
342
|
+
results.url = request.url;
|
343
|
+
results.actualUrl = actualUrl;
|
349
344
|
|
350
|
-
results.url = request.url;
|
351
|
-
results.actualUrl = actualUrl;
|
352
|
-
} else {
|
353
|
-
urlsCrawled.scanned.push({
|
354
|
-
url: urlWithoutAuth(request.url),
|
355
|
-
pageTitle: results.pageTitle,
|
356
|
-
});
|
357
|
-
}
|
358
345
|
await dataset.pushData(results);
|
359
346
|
} else {
|
360
347
|
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
@@ -363,11 +350,23 @@ const crawlSitemap = async (
|
|
363
350
|
});
|
364
351
|
|
365
352
|
if (isScanHtml) {
|
366
|
-
|
353
|
+
// carry through the HTTP status metadata
|
354
|
+
const status = response?.status();
|
355
|
+
const metadata = typeof status === 'number'
|
356
|
+
? (STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599])
|
357
|
+
: STATUS_CODE_METADATA[2];
|
358
|
+
|
359
|
+
urlsCrawled.invalid.push({
|
360
|
+
actualUrl,
|
361
|
+
url: request.url,
|
362
|
+
pageTitle: request.url,
|
363
|
+
metadata,
|
364
|
+
httpStatusCode: typeof status === 'number' ? status : 0
|
365
|
+
});
|
367
366
|
}
|
368
367
|
}
|
369
368
|
},
|
370
|
-
failedRequestHandler: async ({ request }) => {
|
369
|
+
failedRequestHandler: async ({ request, response, error }) => {
|
371
370
|
if (isBasicAuth && request.url) {
|
372
371
|
request.url = `${request.url.split('://')[0]}://${request.url.split('@')[1]}`;
|
373
372
|
}
|
@@ -381,7 +380,19 @@ const crawlSitemap = async (
|
|
381
380
|
numScanned: urlsCrawled.scanned.length,
|
382
381
|
urlScanned: request.url,
|
383
382
|
});
|
384
|
-
|
383
|
+
|
384
|
+
const status = response?.status();
|
385
|
+
const metadata = typeof status === 'number'
|
386
|
+
? (STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599])
|
387
|
+
: STATUS_CODE_METADATA[2];
|
388
|
+
|
389
|
+
urlsCrawled.error.push({
|
390
|
+
url: request.url,
|
391
|
+
pageTitle: request.url,
|
392
|
+
actualUrl: request.url,
|
393
|
+
metadata,
|
394
|
+
httpStatusCode: typeof status === 'number' ? status : 0
|
395
|
+
});
|
385
396
|
crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
|
386
397
|
},
|
387
398
|
maxRequestsPerCrawl: Infinity,
|
@@ -16,7 +16,7 @@ export function findElementByCssSelector(cssSelector: string): string | null {
|
|
16
16
|
|
17
17
|
// Handle Shadow DOM if the element is not found
|
18
18
|
if (!element) {
|
19
|
-
const shadowRoots = [];
|
19
|
+
const shadowRoots: ShadowRoot[] = [];
|
20
20
|
const allElements = document.querySelectorAll('*');
|
21
21
|
|
22
22
|
// Look for elements with shadow roots
|
@@ -27,9 +27,9 @@ export async function flagUnlabelledClickableElements() {
|
|
27
27
|
const loggingEnabled = false; // Set to true to enable console warnings
|
28
28
|
|
29
29
|
let previousFlaggedXPathsByDocument = {}; // Object to hold previous flagged XPaths
|
30
|
-
const previousAllFlaggedElementsXPaths = []; // Array to store all flagged XPaths
|
30
|
+
const previousAllFlaggedElementsXPaths : {xpath: string, code: string }[] = []; // Array to store all flagged XPaths
|
31
31
|
|
32
|
-
function getXPath(element: Node) {
|
32
|
+
function getXPath(element: Node): string {
|
33
33
|
if (!element) return null;
|
34
34
|
if (element instanceof HTMLElement && element.id) {
|
35
35
|
return `//*[@id="${element.id}"]`;
|
@@ -297,7 +297,7 @@ function hasPointerCursor(node: Node): boolean {
|
|
297
297
|
return hasAccessibleChildElement || hasDirectAccessibleText;
|
298
298
|
}
|
299
299
|
|
300
|
-
function hasAllChildrenAccessible(element: Element) {
|
300
|
+
function hasAllChildrenAccessible(element: Element): boolean {
|
301
301
|
// If the element is aria-hidden, consider it accessible
|
302
302
|
if (element.getAttribute('aria-hidden') === 'true') {
|
303
303
|
return true;
|
@@ -331,7 +331,7 @@ function hasPointerCursor(node: Node): boolean {
|
|
331
331
|
function hasChildNotANewInteractWithAccessibleText(element: Element) {
|
332
332
|
|
333
333
|
// Helper function to check if the element is a link or button
|
334
|
-
const isBuildInInteractable = (child) => {
|
334
|
+
const isBuildInInteractable = (child: Element) => {
|
335
335
|
return child.nodeName.toLowerCase() === "a" || child.nodeName.toLowerCase() === "button" || child.nodeName.toLowerCase() === "input" ||
|
336
336
|
child.getAttribute('role') === 'link' || child.getAttribute('role') === 'button';
|
337
337
|
};
|
@@ -376,7 +376,7 @@ function hasPointerCursor(node: Node): boolean {
|
|
376
376
|
}
|
377
377
|
|
378
378
|
// Recursively check for text content inside child nodes of elements that are not links or buttons
|
379
|
-
if (node.nodeType === Node.ELEMENT_NODE && !isBuildInInteractable(node)) {
|
379
|
+
if (node.nodeType === Node.ELEMENT_NODE && !isBuildInInteractable(node as Element)) {
|
380
380
|
return Array.from(node.childNodes).some(innerNode => {
|
381
381
|
if (innerNode.nodeType === Node.TEXT_NODE) {
|
382
382
|
const innerTextContent = getTextContent(innerNode).trim();
|
@@ -440,7 +440,7 @@ function hasPointerCursor(node: Node): boolean {
|
|
440
440
|
const beforeContent = window.getComputedStyle(element, '::before').getPropertyValue('content');
|
441
441
|
const afterContent = window.getComputedStyle(element, '::after').getPropertyValue('content');
|
442
442
|
|
443
|
-
function isAccessibleContent(value) {
|
443
|
+
function isAccessibleContent(value: string) {
|
444
444
|
if (!value || value === 'none' || value === 'normal') {
|
445
445
|
return false;
|
446
446
|
}
|
@@ -1126,11 +1126,11 @@ function hasPointerCursor(node: Node): boolean {
|
|
1126
1126
|
});
|
1127
1127
|
|
1128
1128
|
// Collect XPaths and outerHTMLs of flagged elements per document
|
1129
|
-
const flaggedXPathsByDocument = {};
|
1129
|
+
const flaggedXPathsByDocument: { [key: string]: { xpath: string; code: string }[] } = {};
|
1130
1130
|
|
1131
1131
|
for (const docKey in currentFlaggedElementsByDocument) {
|
1132
1132
|
const elements = currentFlaggedElementsByDocument[docKey];
|
1133
|
-
const flaggedInfo = []; // Array to hold flagged element info
|
1133
|
+
const flaggedInfo: { xpath: string; code: string }[] = []; // Array to hold flagged element info
|
1134
1134
|
elements.forEach(flaggedElement => {
|
1135
1135
|
const parentFlagged = flaggedElement.closest('[data-flagged="true"]');
|
1136
1136
|
if (!parentFlagged || parentFlagged === flaggedElement) {
|
@@ -1,12 +1,12 @@
|
|
1
|
-
export function xPathToCss(expr: string) {
|
2
|
-
const isValidXPath = expr =>
|
1
|
+
export default function xPathToCss(expr: string) {
|
2
|
+
const isValidXPath = (expr: string) =>
|
3
3
|
typeof expr !== 'undefined' &&
|
4
4
|
expr.replace(/[\s-_=]/g, '') !== '' &&
|
5
5
|
expr.length ===
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
6
|
+
expr.replace(
|
7
|
+
/[-_\w:.]+\(\)\s*=|=\s*[-_\w:.]+\(\)|\sor\s|\sand\s|\[(?:[^\/\]]+[\/\[]\/?.+)+\]|starts-with\(|\[.*last\(\)\s*[-\+<>=].+\]|number\(\)|not\(|count\(|text\(|first\(|normalize-space|[^\/]following-sibling|concat\(|descendant::|parent::|self::|child::|/gi,
|
8
|
+
'',
|
9
|
+
).length;
|
10
10
|
|
11
11
|
const getValidationRegex = () => {
|
12
12
|
let regex =
|
@@ -30,7 +30,7 @@ export function xPathToCss(expr: string) {
|
|
30
30
|
value: '\\s*[\\w/:][-/\\w\\s,:;.]*',
|
31
31
|
};
|
32
32
|
|
33
|
-
Object.keys(subRegexes).forEach(key => {
|
33
|
+
Object.keys(subRegexes).forEach((key: keyof typeof subRegexes) => {
|
34
34
|
regex = regex.replace(new RegExp(`%\\(${key}\\)s`, 'gi'), subRegexes[key]);
|
35
35
|
});
|
36
36
|
|
@@ -42,14 +42,14 @@ export function xPathToCss(expr: string) {
|
|
42
42
|
return new RegExp(regex, 'gi');
|
43
43
|
};
|
44
44
|
|
45
|
-
const preParseXpath = expr =>
|
45
|
+
const preParseXpath = (expr: string) =>
|
46
46
|
expr.replace(
|
47
47
|
/contains\s*\(\s*concat\(["']\s+["']\s*,\s*@class\s*,\s*["']\s+["']\)\s*,\s*["']\s+([a-zA-Z0-9-_]+)\s+["']\)/gi,
|
48
48
|
'@class="$1"',
|
49
49
|
);
|
50
50
|
|
51
|
-
function escapeCssIdSelectors(cssSelector) {
|
52
|
-
return cssSelector.replace(/#([^ >]+)/g, (
|
51
|
+
function escapeCssIdSelectors(cssSelector: string) {
|
52
|
+
return cssSelector.replace(/#([^ >]+)/g, (_match, id) => {
|
53
53
|
// Escape special characters in the id part
|
54
54
|
return `#${id.replace(/[!"#$%&'()*+,./:;<=>?@[\\\]^`{|}~]/g, '\\$&')}`;
|
55
55
|
});
|
@@ -48,7 +48,7 @@ const runCustom = async (
|
|
48
48
|
includeScreenshots: boolean,
|
49
49
|
) => {
|
50
50
|
// checks and delete datasets path if it already exists
|
51
|
-
|
51
|
+
cleanUp(randomToken);
|
52
52
|
process.env.CRAWLEE_STORAGE_DIR = randomToken;
|
53
53
|
|
54
54
|
const urlsCrawled: UrlsCrawled = { ...constants.urlsCrawledObj };
|
package/src/index.ts
CHANGED
@@ -1,6 +1,4 @@
|
|
1
1
|
#!/usr/bin/env node
|
2
|
-
/* eslint-disable func-names */
|
3
|
-
/* eslint-disable no-param-reassign */
|
4
2
|
import printMessage from 'print-message';
|
5
3
|
import inquirer from 'inquirer';
|
6
4
|
import { EnqueueStrategy } from 'crawlee';
|
@@ -22,6 +20,7 @@ import {
|
|
22
20
|
import questions from './constants/questions.js';
|
23
21
|
import combineRun from './combine.js';
|
24
22
|
import { BrowserTypes, RuleFlags, ScannerTypes } from './constants/constants.js';
|
23
|
+
import { DeviceDescriptor } from './types/types.js';
|
25
24
|
|
26
25
|
export type Answers = {
|
27
26
|
headless: boolean;
|
@@ -32,7 +31,7 @@ export type Answers = {
|
|
32
31
|
scanner: ScannerTypes;
|
33
32
|
url: string;
|
34
33
|
clonedBrowserDataDir: string;
|
35
|
-
playwrightDeviceDetailsObject:
|
34
|
+
playwrightDeviceDetailsObject: DeviceDescriptor;
|
36
35
|
nameEmail: string;
|
37
36
|
fileTypes: string;
|
38
37
|
metadata: string;
|
@@ -61,7 +60,7 @@ export type Data = {
|
|
61
60
|
deviceChosen: string;
|
62
61
|
customDevice: string;
|
63
62
|
viewportWidth: number;
|
64
|
-
playwrightDeviceDetailsObject:
|
63
|
+
playwrightDeviceDetailsObject: DeviceDescriptor;
|
65
64
|
maxRequestsPerCrawl: number;
|
66
65
|
strategy: EnqueueStrategy;
|
67
66
|
isLocalFileScan: boolean;
|
package/src/logs.ts
CHANGED
@@ -40,7 +40,7 @@ const silentLogger = createLogger({
|
|
40
40
|
});
|
41
41
|
|
42
42
|
// guiInfoLogger feeds the gui information via console log and is mainly used for scanning process
|
43
|
-
export const guiInfoLog = (status, data) => {
|
43
|
+
export const guiInfoLog = (status: string, data: { numScanned?: number; urlScanned?: string }) => {
|
44
44
|
if (process.env.RUNNING_FROM_PH_GUI || process.env.OOBEE_VERBOSE) {
|
45
45
|
switch (status) {
|
46
46
|
case guiInfoStatusTypes.COMPLETED:
|
package/src/mergeAxeResults.ts
CHANGED
@@ -45,6 +45,7 @@ export type PageInfo = {
|
|
45
45
|
pageImagePath?: string;
|
46
46
|
pageIndex?: number;
|
47
47
|
metadata?: string;
|
48
|
+
httpStatusCode?: number;
|
48
49
|
};
|
49
50
|
|
50
51
|
export type RuleInfo = {
|
@@ -248,7 +249,7 @@ const writeCsv = async (allIssues, storagePath) => {
|
|
248
249
|
scanCompletedAt: allIssues.endTime ? allIssues.endTime.toISOString() : '',
|
249
250
|
severity: 'error',
|
250
251
|
issueId: 'error-pages-skipped',
|
251
|
-
issueDescription: '
|
252
|
+
issueDescription: page.metadata ? page.metadata : 'An unknown error caused the page to be skipped',
|
252
253
|
wcagConformance: '',
|
253
254
|
url: page.url || page || '',
|
254
255
|
pageTitle: 'Error',
|
@@ -791,25 +792,21 @@ const writeJsonAndBase64Files = async (
|
|
791
792
|
items.mustFix.rules.forEach(rule => {
|
792
793
|
rule.pagesAffected.forEach(page => {
|
793
794
|
page.itemsCount = page.items.length;
|
794
|
-
page.items = [];
|
795
795
|
});
|
796
796
|
});
|
797
797
|
items.goodToFix.rules.forEach(rule => {
|
798
798
|
rule.pagesAffected.forEach(page => {
|
799
799
|
page.itemsCount = page.items.length;
|
800
|
-
page.items = [];
|
801
800
|
});
|
802
801
|
});
|
803
802
|
items.needsReview.rules.forEach(rule => {
|
804
803
|
rule.pagesAffected.forEach(page => {
|
805
804
|
page.itemsCount = page.items.length;
|
806
|
-
page.items = [];
|
807
805
|
});
|
808
806
|
});
|
809
807
|
items.passed.rules.forEach(rule => {
|
810
808
|
rule.pagesAffected.forEach(page => {
|
811
809
|
page.itemsCount = page.items.length;
|
812
|
-
page.items = [];
|
813
810
|
});
|
814
811
|
});
|
815
812
|
|
@@ -1205,6 +1202,7 @@ const createRuleIdJson = allIssues => {
|
|
1205
1202
|
});
|
1206
1203
|
});
|
1207
1204
|
snippets = [...snippetsSet];
|
1205
|
+
rule.pagesAffected.forEach(p => { delete p.items; });
|
1208
1206
|
}
|
1209
1207
|
compiledRuleJson[ruleId] = {
|
1210
1208
|
snippets,
|
package/src/npmIndex.ts
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
import fs from 'fs';
|
2
2
|
import path from 'path';
|
3
3
|
import printMessage from 'print-message';
|
4
|
-
import axe, { ImpactValue } from 'axe-core';
|
4
|
+
import axe, { AxeResults, ImpactValue } from 'axe-core';
|
5
5
|
import { fileURLToPath } from 'url';
|
6
6
|
import { EnqueueStrategy } from 'crawlee';
|
7
7
|
import constants, { BrowserTypes, RuleFlags, ScannerTypes } from './constants/constants.js';
|
@@ -16,7 +16,7 @@ import { createCrawleeSubFolders, filterAxeResults } from './crawlers/commonCraw
|
|
16
16
|
import { createAndUpdateResultsFolders, createDetailsAndLogs } from './utils.js';
|
17
17
|
import generateArtifacts from './mergeAxeResults.js';
|
18
18
|
import { takeScreenshotForHTMLElements } from './screenshotFunc/htmlScreenshotFunc.js';
|
19
|
-
import { silentLogger } from './logs.js';
|
19
|
+
import { consoleLogger, silentLogger } from './logs.js';
|
20
20
|
import { alertMessageOptions } from './constants/cliFunctions.js';
|
21
21
|
import { evaluateAltText } from './crawlers/custom/evaluateAltText.js';
|
22
22
|
import { escapeCssSelector } from './crawlers/custom/escapeCssSelector.js';
|
@@ -24,7 +24,7 @@ import { framesCheck } from './crawlers/custom/framesCheck.js';
|
|
24
24
|
import { findElementByCssSelector } from './crawlers/custom/findElementByCssSelector.js';
|
25
25
|
import { getAxeConfiguration } from './crawlers/custom/getAxeConfiguration.js';
|
26
26
|
import { flagUnlabelledClickableElements } from './crawlers/custom/flagUnlabelledClickableElements.js';
|
27
|
-
import
|
27
|
+
import xPathToCss from './crawlers/custom/xPathToCss.js';
|
28
28
|
import { extractText } from './crawlers/custom/extractText.js';
|
29
29
|
import { gradeReadability } from './crawlers/custom/gradeReadability.js';
|
30
30
|
|
@@ -65,7 +65,7 @@ export const init = async ({
|
|
65
65
|
specifiedMaxConcurrency?: number;
|
66
66
|
followRobots?: boolean;
|
67
67
|
}) => {
|
68
|
-
|
68
|
+
consoleLogger.info('Starting Oobee');
|
69
69
|
|
70
70
|
const [date, time] = new Date().toLocaleString('sv').replaceAll(/-|:/g, '').split(' ');
|
71
71
|
const domain = new URL(entryUrl).hostname;
|
@@ -126,7 +126,7 @@ export const init = async ({
|
|
126
126
|
const cssSelector = xPathToCss(xpath);
|
127
127
|
return cssSelector;
|
128
128
|
} catch (e) {
|
129
|
-
|
129
|
+
consoleLogger.error(`Error converting XPath to CSS: ${xpath} - ${e}`);
|
130
130
|
return '';
|
131
131
|
}
|
132
132
|
})
|
@@ -197,7 +197,11 @@ export const init = async ({
|
|
197
197
|
`;
|
198
198
|
};
|
199
199
|
|
200
|
-
const pushScanResults = async (
|
200
|
+
const pushScanResults = async (
|
201
|
+
res: { pageUrl: string; pageTitle: string; axeScanResults: AxeResults },
|
202
|
+
metadata: string,
|
203
|
+
elementsToClick: string[],
|
204
|
+
) => {
|
201
205
|
throwErrorIfTerminated();
|
202
206
|
if (includeScreenshots) {
|
203
207
|
// use chrome by default
|
@@ -211,7 +215,7 @@ export const init = async ({
|
|
211
215
|
await page.waitForLoadState('networkidle');
|
212
216
|
|
213
217
|
// click on elements to reveal hidden elements so screenshots can be taken
|
214
|
-
elementsToClick?.forEach(async elem => {
|
218
|
+
elementsToClick?.forEach(async (elem: string) => {
|
215
219
|
try {
|
216
220
|
await page.locator(elem).click();
|
217
221
|
} catch (e) {
|
@@ -259,7 +263,7 @@ export const init = async ({
|
|
259
263
|
|
260
264
|
const terminate = async () => {
|
261
265
|
throwErrorIfTerminated();
|
262
|
-
|
266
|
+
consoleLogger.info('Stopping Oobee');
|
263
267
|
isInstanceTerminated = true;
|
264
268
|
scanDetails.endTime = new Date();
|
265
269
|
scanDetails.urlsCrawled = urlsCrawled;
|