@govtechsg/oobee 0.10.51 → 0.10.58
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/bump-package-version.yml +58 -0
- package/.github/workflows/image.yml +38 -17
- package/DETAILS.md +5 -2
- package/INTEGRATION.md +57 -53
- package/README.md +4 -1
- package/__tests__/test-sitemap-url-patterns.xml +105 -0
- package/exclusions.txt +1 -0
- package/package.json +7 -6
- package/src/cli.ts +35 -2
- package/src/combine.ts +10 -7
- package/src/constants/cliFunctions.ts +9 -0
- package/src/constants/common.ts +95 -105
- package/src/constants/constants.ts +47 -2
- package/src/crawlers/commonCrawlerFunc.ts +84 -5
- package/src/crawlers/crawlDomain.ts +93 -160
- package/src/crawlers/crawlIntelligentSitemap.ts +40 -36
- package/src/crawlers/crawlLocalFile.ts +77 -35
- package/src/crawlers/crawlSitemap.ts +156 -89
- package/src/crawlers/pdfScanFunc.ts +2 -0
- package/src/index.ts +2 -0
- package/src/logs.ts +4 -2
- package/src/mergeAxeResults.ts +20 -9
- package/src/npmIndex.ts +1 -1
- package/src/screenshotFunc/htmlScreenshotFunc.ts +7 -5
- package/src/screenshotFunc/pdfScreenshotFunc.ts +2 -2
- package/src/static/ejs/partials/components/wcagCompliance.ejs +1 -1
- package/src/static/ejs/partials/scripts/ruleOffcanvas.ejs +1 -0
- package/src/static/ejs/partials/styles/styles.ejs +11 -0
- package/src/static/ejs/report.ejs +14 -1
- package/src/utils.ts +3 -3
package/src/crawlers/crawlLocalFile.ts
CHANGED
@@ -1,12 +1,15 @@
-import { Request, RequestList } from 'crawlee';
-import printMessage from 'print-message';
+import { Request, RequestList, Dataset } from 'crawlee';
 import fs from 'fs';
 import path from 'path';
 import { createCrawleeSubFolders, runAxeScript, isUrlPdf } from './commonCrawlerFunc.js';
-import constants, {
+import constants, {
+  guiInfoStatusTypes,
+  basicAuthRegex,
+  UrlsCrawled,
+} from '../constants/constants.js';
+import { ViewportSettingsClass } from '../combine.js';
 import {
   getPlaywrightLaunchOptions,
-  messageOptions,
   isFilePath,
   convertLocalFileToPath,
   convertPathToLocalFile,
@@ -16,27 +19,47 @@ import { runPdfScan, mapPdfScanResults, doPdfScreenshots } from './pdfScanFunc.js';
 import { guiInfoLog } from '../logs.js';
 import crawlSitemap from './crawlSitemap.js';
 
-const crawlLocalFile = async (
-
-  randomToken
-  host
-  viewportSettings
-  maxRequestsPerCrawl
-  browser
-  userDataDirectory
-  specifiedMaxConcurrency
-  fileTypes
-  blacklistedPatterns
-  includeScreenshots
-  extraHTTPHeaders
-
-
-
-
-
+export const crawlLocalFile = async ({
+  url,
+  randomToken,
+  host,
+  viewportSettings,
+  maxRequestsPerCrawl,
+  browser,
+  userDataDirectory,
+  specifiedMaxConcurrency,
+  fileTypes,
+  blacklistedPatterns,
+  includeScreenshots,
+  extraHTTPHeaders,
+  scanDuration = 0,
+  fromCrawlIntelligentSitemap = false,
+  userUrlInputFromIntelligent = null,
+  datasetFromIntelligent = null,
+  urlsCrawledFromIntelligent = null,
+}: {
+  url: string;
+  randomToken: string;
+  host: string;
+  viewportSettings: ViewportSettingsClass;
+  maxRequestsPerCrawl: number;
+  browser: string;
+  userDataDirectory: string;
+  specifiedMaxConcurrency: number;
+  fileTypes: string;
+  blacklistedPatterns: string[];
+  includeScreenshots: boolean;
+  extraHTTPHeaders: Record<string, string>;
+  scanDuration?: number;
+  fromCrawlIntelligentSitemap?: boolean;
+  userUrlInputFromIntelligent?: string | null;
+  datasetFromIntelligent?: Dataset | null;
+  urlsCrawledFromIntelligent?: UrlsCrawled | null;
+}) => {
   let dataset: any;
-  let urlsCrawled:
+  let urlsCrawled: UrlsCrawled;
   let linksFromSitemap = [];
+  let sitemapUrl = url;
 
   // Boolean to omit axe scan for basic auth URL
   let isBasicAuth: boolean;
@@ -82,7 +105,7 @@ const crawlLocalFile = async (
     // Non XML file
   } else {
     // Put it to crawlSitemap function to handle xml files
-    const updatedUrlsCrawled = await crawlSitemap(
+    const updatedUrlsCrawled = await crawlSitemap({
       sitemapUrl,
       randomToken,
       host,
@@ -95,12 +118,13 @@ const crawlLocalFile = async (
       blacklistedPatterns,
       includeScreenshots,
       extraHTTPHeaders,
-
-
-
-
-
-
+      scanDuration,
+      fromCrawlIntelligentSitemap,
+      userUrlInputFromIntelligent,
+      datasetFromIntelligent,
+      urlsCrawledFromIntelligent,
+      crawledFromLocalFile: true,
+    });
 
     urlsCrawled = { ...urlsCrawled, ...updatedUrlsCrawled };
     return urlsCrawled;
@@ -124,16 +148,12 @@ const crawlLocalFile = async (
 
   const uuidToPdfMapping: Record<string, string> = {}; // key and value of string type
 
-  printMessage(['Fetching URLs. This might take some time...'], { border: false });
-
   finalLinks = [...finalLinks, ...linksFromSitemap];
 
   await RequestList.open({
     sources: finalLinks,
   });
 
-  printMessage(['Fetch URLs completed. Beginning scan'], messageOptions);
-
   const request = linksFromSitemap[0];
   const pdfFileName = path.basename(request.url);
   const trimmedUrl: string = request.url;
@@ -142,6 +162,8 @@ const crawlLocalFile = async (
   fs.writeFileSync(destinationFilePath, data);
   uuidToPdfMapping[pdfFileName] = trimmedUrl;
 
+  let shouldAbort = false;
+
   if (!isUrlPdf(request.url)) {
     await initModifiedUserAgent(browser);
     const browserContext = await constants.launcher.launchPersistentContext('', {
@@ -150,9 +172,24 @@ const crawlLocalFile = async (
       ...playwrightDeviceDetailsObject,
     });
 
+    const timeoutId = scanDuration > 0
+      ? setTimeout(() => {
+          console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting local file scan.`);
+          shouldAbort = true;
+        }, scanDuration * 1000)
+      : null;
+
     const page = await browserContext.newPage();
     request.url = convertPathToLocalFile(request.url);
     await page.goto(request.url);
+
+    if (shouldAbort) {
+      console.warn('Scan aborted due to timeout before page scan.');
+      await dataset.pushData({ scanned: [], scannedRedirects: [] });
+      await browserContext.close().catch(() => {});
+      return urlsCrawled;
+    }
+
     const results = await runAxeScript({ includeScreenshots, page, randomToken });
 
     const actualUrl = page.url() || request.loadedUrl || request.url;
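The `scanDuration` handling above is a soft time budget rather than a hard kill: a `setTimeout` flips `shouldAbort`, and the scan honours the flag only at a safe checkpoint after navigation, so the browser context is never torn down mid-step. A standalone sketch of the same pattern, with a hypothetical `scanPage` step standing in for the Playwright work:

```ts
// Soft time budget: a timer sets a flag; work checks it between steps.
// scanPage() is a hypothetical stand-in for the real per-page scan.
declare function scanPage(url: string): Promise<string>;

async function runWithBudget(urls: string[], scanDurationSecs: number): Promise<string[]> {
  let shouldAbort = false;
  const timer =
    scanDurationSecs > 0
      ? setTimeout(() => {
          shouldAbort = true;
        }, scanDurationSecs * 1000)
      : null;

  const results: string[] = [];
  try {
    for (const url of urls) {
      if (shouldAbort) break; // checked between pages, never mid-page
      results.push(await scanPage(url));
    }
  } finally {
    if (timer) clearTimeout(timer); // do not keep the event loop alive
  }
  return results;
}
```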
@@ -178,7 +215,11 @@ const crawlLocalFile = async (
 
     await dataset.pushData(results);
   } else {
-    urlsCrawled.scanned.push({
+    urlsCrawled.scanned.push({
+      url: trimmedUrl,
+      pageTitle: pdfFileName,
+      actualUrl: trimmedUrl,
+    });
 
     await runPdfScan(randomToken);
     // transform result format
@@ -192,6 +233,7 @@ const crawlLocalFile = async (
     // push results for each pdf document to key value store
     await Promise.all(pdfResults.map(result => dataset.pushData(result)));
   }
+
   return urlsCrawled;
 };
 export default crawlLocalFile;
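Taken together, the crawlLocalFile changes replace a long positional parameter list with a single typed options object, which is what lets optional knobs such as `scanDuration` and the intelligent-sitemap passthroughs default sensibly without touching every call site. A hedged sketch of a call site after the refactor; every value below is illustrative, not taken from oobee:

```ts
import { crawlLocalFile } from './crawlers/crawlLocalFile.js';
import { ViewportSettingsClass } from './combine.js';

declare const viewportSettings: ViewportSettingsClass; // supplied by the caller

async function example() {
  return crawlLocalFile({
    url: 'file:///tmp/site/index.html', // illustrative local target
    randomToken: 'scan-1234',
    host: 'localhost',
    viewportSettings,
    maxRequestsPerCrawl: 100,
    browser: 'chromium',
    userDataDirectory: '',
    specifiedMaxConcurrency: 5,
    fileTypes: 'html-only',
    blacklistedPatterns: [],
    includeScreenshots: false,
    extraHTTPHeaders: {},
    scanDuration: 300, // optional soft limit, in seconds
  });
}
```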
package/src/crawlers/crawlSitemap.ts
CHANGED
@@ -1,5 +1,4 @@
-import crawlee, { LaunchContext, Request, RequestList } from 'crawlee';
-import printMessage from 'print-message';
+import crawlee, { LaunchContext, Request, RequestList, Dataset } from 'crawlee';
 import fs from 'fs';
 import {
   createCrawleeSubFolders,
@@ -8,11 +7,15 @@ import {
   isUrlPdf,
 } from './commonCrawlerFunc.js';
 
-import constants, {
+import constants, {
+  STATUS_CODE_METADATA,
+  guiInfoStatusTypes,
+  UrlsCrawled,
+  disallowedListOfPatterns,
+} from '../constants/constants.js';
 import {
   getLinksFromSitemap,
   getPlaywrightLaunchOptions,
-  messageOptions,
   isSkippedUrl,
   urlWithoutAuth,
   waitForPageLoaded,
@@ -24,25 +27,46 @@ import { handlePdfDownload, runPdfScan, mapPdfScanResults } from './pdfScanFunc.js';
 import { guiInfoLog } from '../logs.js';
 import { ViewportSettingsClass } from '../combine.js';
 
-const crawlSitemap = async (
-  sitemapUrl
-  randomToken
-
-  viewportSettings
-  maxRequestsPerCrawl
-  browser
-  userDataDirectory
-  specifiedMaxConcurrency
-  fileTypes
-  blacklistedPatterns
-  includeScreenshots
-  extraHTTPHeaders
-
-
-
-
-
-
+const crawlSitemap = async ({
+  sitemapUrl,
+  randomToken,
+  host,
+  viewportSettings,
+  maxRequestsPerCrawl,
+  browser,
+  userDataDirectory,
+  specifiedMaxConcurrency,
+  fileTypes,
+  blacklistedPatterns,
+  includeScreenshots,
+  extraHTTPHeaders,
+  scanDuration = 0,
+  fromCrawlIntelligentSitemap = false,
+  userUrlInputFromIntelligent = null,
+  datasetFromIntelligent = null,
+  urlsCrawledFromIntelligent = null,
+  crawledFromLocalFile = false,
+}: {
+  sitemapUrl: string;
+  randomToken: string;
+  host: string;
+  viewportSettings: ViewportSettingsClass;
+  maxRequestsPerCrawl: number;
+  browser: string;
+  userDataDirectory: string;
+  specifiedMaxConcurrency: number;
+  fileTypes: string;
+  blacklistedPatterns: string[];
+  includeScreenshots: boolean;
+  extraHTTPHeaders: Record<string, string>;
+  scanDuration?: number;
+  fromCrawlIntelligentSitemap?: boolean;
+  userUrlInputFromIntelligent?: string;
+  datasetFromIntelligent?: Dataset;
+  urlsCrawledFromIntelligent?: UrlsCrawled;
+  crawledFromLocalFile?: boolean;
+}) => {
+  const crawlStartTime = Date.now();
   let dataset: crawlee.Dataset;
   let urlsCrawled: UrlsCrawled;
 
@@ -127,14 +151,11 @@ const crawlSitemap = async (
   const { playwrightDeviceDetailsObject } = viewportSettings;
   const { maxConcurrency } = constants;
 
-  printMessage(['Fetching URLs. This might take some time...'], { border: false });
-
   finalLinks = [...finalLinks, ...linksFromSitemap];
 
   const requestList = await RequestList.open({
     sources: finalLinks,
   });
-  printMessage(['Fetch URLs completed. Beginning scan'], messageOptions);
 
   let userDataDir = '';
   if (userDataDirectory) {
@@ -165,7 +186,6 @@ const crawlSitemap = async (
     },
     requestList,
     postNavigationHooks: [
-
      async ({ page }) => {
        try {
          // Wait for a quiet period in the DOM, but with safeguards
@@ -173,36 +193,35 @@ const crawlSitemap = async (
          return new Promise(resolve => {
            let timeout;
            let mutationCount = 0;
-            const MAX_MUTATIONS
-            const OBSERVER_TIMEOUT
-
+            const MAX_MUTATIONS = 250; // stop if things never quiet down
+            const OBSERVER_TIMEOUT = 5000; // hard cap on total wait
+
            const observer = new MutationObserver(() => {
              clearTimeout(timeout);
-
+
              mutationCount++;
              if (mutationCount > MAX_MUTATIONS) {
                observer.disconnect();
                resolve('Too many mutations, exiting.');
                return;
              }
-
+
              // restart quiet‑period timer
              timeout = setTimeout(() => {
                observer.disconnect();
                resolve('DOM stabilized.');
              }, 1000);
            });
-
+
            // overall timeout in case the page never settles
            timeout = setTimeout(() => {
              observer.disconnect();
              resolve('Observer timeout reached.');
            }, OBSERVER_TIMEOUT);
-
+
            const root = document.documentElement || document.body || document;
            if (!root || typeof observer.observe !== 'function') {
              resolve('No root node to observe.');
-              return;
            }
          });
        });
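The quiet-period wait in this hook runs inside the browser, so the surrounding code has to ship it through Playwright's `page.evaluate`. A simplified sketch of that wiring, reusing the hunk's 250-mutation and 5000 ms caps (the explicit `observer.observe(...)` call and its options are an assumption; the hunk does not show that line):

```ts
import { Page } from 'playwright';

// Resolve once the DOM has been quiet for 1s, with two safety valves:
// a mutation-count cap and a hard overall timeout.
async function waitForDomToSettle(page: Page): Promise<string> {
  return page.evaluate(
    () =>
      new Promise<string>(resolve => {
        let timeout: ReturnType<typeof setTimeout>;
        let mutationCount = 0;

        const observer = new MutationObserver(() => {
          clearTimeout(timeout);
          if (++mutationCount > 250) {
            observer.disconnect();
            resolve('Too many mutations, exiting.');
            return;
          }
          // restart the quiet-period timer on every mutation
          timeout = setTimeout(() => {
            observer.disconnect();
            resolve('DOM stabilized.');
          }, 1000);
        });

        // hard cap so a chatty page cannot stall the crawl
        timeout = setTimeout(() => {
          observer.disconnect();
          resolve('Observer timeout reached.');
        }, 5000);

        observer.observe(document.documentElement, { childList: true, subtree: true });
      }),
  );
}
```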
@@ -214,27 +233,54 @@ const crawlSitemap = async (
            throw err; // Rethrow unknown errors
          }
        },
-
      ],
+      preNavigationHooks: [
+        async ({ request, page }, gotoOptions) => {
+          const url = request.url.toLowerCase();
 
-
-
-
+          const isNotSupportedDocument = disallowedListOfPatterns.some(pattern =>
+            url.startsWith(pattern),
+          );
+
+          if (isNotSupportedDocument) {
+            request.skipNavigation = true;
+            request.userData.isNotSupportedDocument = true;
+
+            // Log for verification (optional, but not required for correctness)
+            // console.log(`[SKIP] Not supported: ${request.url}`);
+
+            return;
+          }
+
+          // Set headers if basic auth
+          if (isBasicAuth) {
            await page.setExtraHTTPHeaders({
              Authorization: authHeader,
              ...extraHTTPHeaders,
            });
-          }
-        ]
-        : [
-          async () => {
+          } else {
            preNavigationHooks(extraHTTPHeaders);
-
-
-
+          }
+        },
+      ],
      requestHandlerTimeoutSecs: 90,
      requestHandler: async ({ page, request, response, sendRequest }) => {
-
+        // Log documents that are not supported
+        if (request.userData?.isNotSupportedDocument) {
+          guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+            numScanned: urlsCrawled.scanned.length,
+            urlScanned: request.url,
+          });
+          urlsCrawled.userExcluded.push({
+            url: request.url,
+            pageTitle: request.url,
+            actualUrl: request.url, // because about:blank is not useful
+            metadata: STATUS_CODE_METADATA[1],
+            httpStatusCode: 0,
+          });
+
+          return;
+        }
 
        // Set basic auth header if needed
        if (isBasicAuth) {
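The new hooks implement a standard Crawlee handshake: the pre-navigation hook marks unsupported URLs with `request.skipNavigation` plus a `userData` flag, and the request handler sees the flag and records the URL as excluded instead of scanning a blank page. A condensed sketch of the handshake, with `disallowedPrefixes` standing in for oobee's `disallowedListOfPatterns`:

```ts
import { PlaywrightCrawler } from 'crawlee';

// Stand-in for oobee's disallowedListOfPatterns.
const disallowedPrefixes = ['mailto:', 'tel:', 'ws:', 'wss:'];

const crawler = new PlaywrightCrawler({
  preNavigationHooks: [
    async ({ request }) => {
      if (disallowedPrefixes.some(p => request.url.toLowerCase().startsWith(p))) {
        request.skipNavigation = true; // the browser never navigates
        request.userData.isNotSupportedDocument = true; // but the handler still runs
      }
    },
  ],
  requestHandler: async ({ request }) => {
    if (request.userData?.isNotSupportedDocument) {
      console.log(`skipped unsupported URL: ${request.url}`);
      return; // record as excluded instead of scanning about:blank
    }
    // ...normal page scan goes here...
  },
});

// await crawler.run(['https://example.com']);
```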
@@ -247,39 +293,48 @@ const crawlSitemap = async (
          request.url = currentUrl.href;
        }
 
+        await waitForPageLoaded(page, 10000);
+
        const actualUrl = page.url() || request.loadedUrl || request.url;
 
-
-
+        const hasExceededDuration =
+          scanDuration > 0 && Date.now() - crawlStartTime > scanDuration * 1000;
+
+        if (urlsCrawled.scanned.length >= maxRequestsPerCrawl || hasExceededDuration) {
+          if (hasExceededDuration) {
+            console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting sitemap crawl.`);
+          }
+          crawler.autoscaledPool.abort(); // stops new requests
          return;
        }
 
-        if (request.skipNavigation && actualUrl ===
-        if (
-
-
-
-
-
-
-
-
-
-        });
-
+        if (request.skipNavigation && actualUrl === 'about:blank') {
+          if (isScanPdfs) {
+            // pushes download promise into pdfDownloads
+            const { pdfFileName, url } = handlePdfDownload(
+              randomToken,
+              pdfDownloads,
+              request,
+              sendRequest,
+              urlsCrawled,
+            );
+
+            uuidToPdfMapping[pdfFileName] = url;
            return;
          }
-          // pushes download promise into pdfDownloads
-          const { pdfFileName, url } = handlePdfDownload(
-            randomToken,
-            pdfDownloads,
-            request,
-            sendRequest,
-            urlsCrawled,
-          );
 
-
+          guiInfoLog(guiInfoStatusTypes.SKIPPED, {
+            numScanned: urlsCrawled.scanned.length,
+            urlScanned: request.url,
+          });
+          urlsCrawled.userExcluded.push({
+            url: request.url,
+            pageTitle: request.url,
+            actualUrl: request.url, // because about:blank is not useful
+            metadata: STATUS_CODE_METADATA[1],
+            httpStatusCode: 0,
+          });
+
          return;
        }
 
@@ -303,15 +358,11 @@ const crawlSitemap = async (
        }
 
        // This logic is different from crawlDomain, as it also checks if the pae is redirected before checking if it is excluded using exclusions.txt
-        if (
-          isRedirected &&
-          blacklistedPatterns &&
-          isSkippedUrl(actualUrl, blacklistedPatterns)
-        ) {
+        if (isRedirected && blacklistedPatterns && isSkippedUrl(actualUrl, blacklistedPatterns)) {
          urlsCrawled.userExcluded.push({
            url: request.url,
            pageTitle: request.url,
-            actualUrl
+            actualUrl,
            metadata: STATUS_CODE_METADATA[0],
            httpStatusCode: 0,
          });
@@ -324,7 +375,7 @@ const crawlSitemap = async (
        }
 
        const results = await runAxeScript({ includeScreenshots, page, randomToken });
-
+
        guiInfoLog(guiInfoStatusTypes.SCANNED, {
          numScanned: urlsCrawled.scanned.length,
          urlScanned: request.url,
@@ -333,7 +384,7 @@ const crawlSitemap = async (
        urlsCrawled.scanned.push({
          url: urlWithoutAuth(request.url),
          pageTitle: results.pageTitle,
-          actualUrl
+          actualUrl, // i.e. actualUrl
        });
 
        urlsCrawled.scannedRedirects.push({
@@ -354,16 +405,17 @@ const crawlSitemap = async (
        if (isScanHtml) {
          // carry through the HTTP status metadata
          const status = response?.status();
-          const metadata =
-
-
+          const metadata =
+            typeof status === 'number'
+              ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
+              : STATUS_CODE_METADATA[2];
 
-
+          urlsCrawled.invalid.push({
            actualUrl,
            url: request.url,
            pageTitle: request.url,
            metadata,
-            httpStatusCode: typeof status === 'number' ? status : 0
+            httpStatusCode: typeof status === 'number' ? status : 0,
          });
        }
      }
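Both failure paths now normalise the HTTP response the same way: a numeric status looks up `STATUS_CODE_METADATA`, unknown codes fall back to the 599 entry, and a missing response maps to a synthetic entry. A small sketch of that lookup shape; the table strings below are invented placeholders, not oobee's actual metadata:

```ts
// Placeholder table; the real STATUS_CODE_METADATA lives in src/constants/constants.ts.
const STATUS_CODE_METADATA: Record<number, string> = {
  2: 'No response received', // synthetic, non-HTTP code (placeholder wording)
  404: 'Not Found',
  599: 'Unknown error', // fallback bucket (placeholder wording)
};

function metadataFor(status: number | undefined): string {
  return typeof status === 'number'
    ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
    : STATUS_CODE_METADATA[2];
}

console.log(metadataFor(404)); // "Not Found"
console.log(metadataFor(418)); // falls back to the 599 entry
console.log(metadataFor(undefined)); // synthetic "no response" entry
```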
@@ -384,21 +436,31 @@ const crawlSitemap = async (
        });
 
        const status = response?.status();
-        const metadata =
-
-
+        const metadata =
+          typeof status === 'number'
+            ? STATUS_CODE_METADATA[status] || STATUS_CODE_METADATA[599]
+            : STATUS_CODE_METADATA[2];
 
        urlsCrawled.error.push({
          url: request.url,
          pageTitle: request.url,
          actualUrl: request.url,
          metadata,
-          httpStatusCode: typeof status === 'number' ? status : 0
+          httpStatusCode: typeof status === 'number' ? status : 0,
        });
        crawlee.log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
      },
      maxRequestsPerCrawl: Infinity,
      maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
+      ...(process.env.OOBEE_FAST_CRAWLER && {
+        autoscaledPoolOptions: {
+          minConcurrency: specifiedMaxConcurrency ? Math.min(specifiedMaxConcurrency, 10) : 10,
+          maxConcurrency: specifiedMaxConcurrency || maxConcurrency,
+          desiredConcurrencyRatio: 0.98, // Increase threshold for scaling up
+          scaleUpStepRatio: 0.99, // Scale up faster
+          scaleDownStepRatio: 0.1, // Scale down slower
+        },
+      }),
    });
 
    await crawler.run();
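The `OOBEE_FAST_CRAWLER` block is an environment-gated option spread: `...(cond && { ... })` contributes the `autoscaledPoolOptions` object only when the variable is set, because spreading `false` into an object literal adds nothing. A minimal sketch of the idiom on its own; the option values mirror the hunk, while the `poolConfig` shape is illustrative:

```ts
// `...(cond && { ... })` adds the keys only when cond is truthy.
const fastMode = Boolean(process.env.OOBEE_FAST_CRAWLER);

const poolConfig = {
  maxConcurrency: 25,
  ...(fastMode && {
    minConcurrency: 10, // start hot instead of ramping up from 1
    desiredConcurrencyRatio: 0.98, // scale up only when nearly saturated
    scaleUpStepRatio: 0.99, // aggressive scale-up steps
    scaleDownStepRatio: 0.1, // conservative scale-down steps
  }),
};

console.log(poolConfig); // without the env var set: { maxConcurrency: 25 }
```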
@@ -430,6 +492,11 @@ const crawlSitemap = async (
    guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
  }
 
+  if (scanDuration > 0) {
+    const elapsed = Math.round((Date.now() - crawlStartTime) / 1000);
+    console.log(`Crawl ended after ${elapsed}s (limit: ${scanDuration}s).`);
+  }
+
  return urlsCrawled;
 };
 
package/src/crawlers/pdfScanFunc.ts
CHANGED
@@ -12,6 +12,7 @@ import { consoleLogger, guiInfoLog, silentLogger } from '../logs.js';
 import constants, {
   getExecutablePath,
   guiInfoStatusTypes,
+  STATUS_CODE_METADATA,
   UrlsCrawled,
 } from '../constants/constants.js';
 
@@ -296,6 +297,7 @@ export const handlePdfDownload = (
      url: request.url,
      pageTitle: url,
      actualUrl: url,
+      metadata: STATUS_CODE_METADATA[1],
    });
  }
 
package/src/index.ts
CHANGED
@@ -50,6 +50,7 @@ export type Answers = {
   zip: string;
   ruleset: RuleFlags[];
   generateJsonFiles: boolean;
+  scanDuration?: number;
 };
 
 export type Data = {
@@ -80,6 +81,7 @@ export type Data = {
   zip?: string;
   ruleset: RuleFlags[];
   generateJsonFiles: boolean;
+  scanDuration: number;
 };
 
 const userData = getUserDataTxt();
package/src/logs.ts
CHANGED
@@ -23,8 +23,10 @@ const logFormat = printf(({ timestamp, level, message }) => {
 // All logs in combined.txt, error in errors.txt
 
 const consoleLogger = createLogger({
+  silent: !(process.env.RUNNING_FROM_PH_GUI || process.env.OOBEE_VERBOSE),
   format: combine(timestamp({ format: 'YYYY-MM-DD HH:mm:ss' }), logFormat),
-  transports:
+  transports:
+    process.env.RUNNING_FROM_PH_GUI || process.env.OOBEE_VERBOSE ? [new transports.Console()] : [],
 });
 
 // No display in consoles, this will mostly be used within the interactive script to avoid disrupting the flow
@@ -33,7 +35,7 @@ const consoleLogger = createLogger({
 const silentLogger = createLogger({
   format: combine(timestamp({ format: 'YYYY-MM-DD HH:mm:ss' }), logFormat),
   transports: [
-    process.env.OOBEE_VERBOSE
+    process.env.OOBEE_VERBOSE || process.env.RUNNING_FROM_PH_GUI
       ? new transports.Console({ handleExceptions: true })
      : new transports.File({ filename: 'errors.txt', level: 'warn', handleExceptions: true }),
   ].filter(Boolean),
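Both loggers now key off the same pair of environment variables: `RUNNING_FROM_PH_GUI` or `OOBEE_VERBOSE` routes output to the console; otherwise `consoleLogger` is silenced outright while `silentLogger` keeps writing warnings to errors.txt. A compact sketch of the gating with the winston API, simplified from the two hunks above:

```ts
import { createLogger, format, transports } from 'winston';

const verbose = Boolean(process.env.RUNNING_FROM_PH_GUI || process.env.OOBEE_VERBOSE);

// Chatty logger: console output only in GUI/verbose mode, otherwise fully silent.
const consoleLogger = createLogger({
  silent: !verbose,
  format: format.combine(format.timestamp(), format.simple()),
  transports: verbose ? [new transports.Console()] : [],
});

// Quiet logger: console in GUI/verbose mode, errors.txt on disk otherwise.
const silentLogger = createLogger({
  format: format.combine(format.timestamp(), format.simple()),
  transports: [
    verbose
      ? new transports.Console({ handleExceptions: true })
      : new transports.File({ filename: 'errors.txt', level: 'warn', handleExceptions: true }),
  ],
});

consoleLogger.info('visible only when RUNNING_FROM_PH_GUI or OOBEE_VERBOSE is set');
silentLogger.warn('lands in errors.txt unless one of the flags is set');
```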