@govtechsg/oobee 0.10.50 → 0.10.57
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/bump-package-version.yml +58 -0
- package/.github/workflows/image.yml +38 -17
- package/DETAILS.md +5 -2
- package/INTEGRATION.md +57 -53
- package/README.md +4 -1
- package/__tests__/test-sitemap-url-patterns.xml +105 -0
- package/exclusions.txt +1 -0
- package/package.json +7 -6
- package/src/cli.ts +35 -2
- package/src/combine.ts +10 -7
- package/src/constants/cliFunctions.ts +9 -0
- package/src/constants/common.ts +95 -105
- package/src/constants/constants.ts +47 -2
- package/src/crawlers/commonCrawlerFunc.ts +50 -5
- package/src/crawlers/crawlDomain.ts +112 -73
- package/src/crawlers/crawlIntelligentSitemap.ts +40 -36
- package/src/crawlers/crawlLocalFile.ts +77 -35
- package/src/crawlers/crawlSitemap.ts +156 -89
- package/src/index.ts +2 -0
- package/src/logs.ts +4 -2
- package/src/mergeAxeResults.ts +20 -9
- package/src/npmIndex.ts +1 -1
- package/src/screenshotFunc/htmlScreenshotFunc.ts +7 -5
- package/src/screenshotFunc/pdfScreenshotFunc.ts +2 -2
- package/src/static/ejs/partials/components/wcagCompliance.ejs +1 -1
- package/src/static/ejs/partials/scripts/ruleOffcanvas.ejs +1 -0
- package/src/static/ejs/partials/styles/styles.ejs +11 -0
- package/src/static/ejs/report.ejs +14 -1
- package/src/utils.ts +3 -3
package/package.json
CHANGED
@@ -1,18 +1,18 @@
|
|
1
1
|
{
|
2
2
|
"name": "@govtechsg/oobee",
|
3
3
|
"main": "dist/npmIndex.js",
|
4
|
-
"version": "0.10.50",
|
4
|
+
"version": "0.10.57",
|
5
5
|
"type": "module",
|
6
6
|
"author": "Government Technology Agency <info@tech.gov.sg>",
|
7
7
|
"dependencies": {
|
8
8
|
"@json2csv/node": "^7.0.3",
|
9
9
|
"@napi-rs/canvas": "^0.1.53",
|
10
10
|
"@sentry/node": "^9.13.0",
|
11
|
-
"axe-core": "^4.10.
|
11
|
+
"axe-core": "^4.10.3",
|
12
12
|
"axios": "^1.8.2",
|
13
13
|
"base64-stream": "^1.0.0",
|
14
14
|
"cheerio": "^1.0.0-rc.12",
|
15
|
-
"crawlee": "^3.
|
15
|
+
"crawlee": "^3.13.10",
|
16
16
|
"ejs": "^3.1.9",
|
17
17
|
"file-type": "^19.5.0",
|
18
18
|
"fs-extra": "^11.2.0",
|
@@ -59,16 +59,17 @@
|
|
59
59
|
"eslint-plugin-import": "^2.27.4",
|
60
60
|
"eslint-plugin-prettier": "^5.0.0",
|
61
61
|
"globals": "^15.2.0",
|
62
|
-
"jest": "^
|
62
|
+
"jest": "^30.0.4",
|
63
63
|
"readable-stream": "^4.7.0",
|
64
|
-
"typescript-eslint": "^8.
|
64
|
+
"typescript-eslint": "^8.36.0"
|
65
65
|
},
|
66
66
|
"overrides": {
|
67
67
|
"node-fetch": "^2.3.0",
|
68
68
|
"json5": "^2.2.3",
|
69
69
|
"ansi-regex": "^5.0.1",
|
70
70
|
"tough-cookie": "^5.0.0-rc.2",
|
71
|
-
"micromatch": "github:micromatch/micromatch.git#4.0.
|
71
|
+
"micromatch": "github:micromatch/micromatch.git#4.0.8",
|
72
|
+
"brace-expansion": "^1.1.12"
|
72
73
|
},
|
73
74
|
"optionalDependencies": {
|
74
75
|
"@napi-rs/canvas-darwin-arm64": "^0.1.53",
|
package/src/cli.ts
CHANGED
@@ -26,6 +26,7 @@ import constants, { ScannerTypes } from './constants/constants.js';
|
|
26
26
|
import { cliOptions, messageOptions } from './constants/cliFunctions.js';
|
27
27
|
import combineRun from './combine.js';
|
28
28
|
import { Answers } from './index.js';
|
29
|
+
import { consoleLogger } from './logs.js';
|
29
30
|
|
30
31
|
const appVersion = getVersion();
|
31
32
|
const yargs = _yargs(hideBin(process.argv));
|
@@ -194,6 +195,23 @@ Usage: npm run cli -- -c <crawler> -d <device> -w <viewport> -u <url> OPTIONS`,
|
|
194
195
|
}
|
195
196
|
return true;
|
196
197
|
})
|
198
|
+
.coerce('l', (option) => {
|
199
|
+
const duration = Number(option);
|
200
|
+
if (isNaN(duration) || duration < 0) {
|
201
|
+
printMessage(
|
202
|
+
['Invalid scan duration. Please provide a positive number of seconds.'],
|
203
|
+
messageOptions,
|
204
|
+
);
|
205
|
+
process.exit(1);
|
206
|
+
}
|
207
|
+
return duration;
|
208
|
+
})
|
209
|
+
.check(argvs => {
|
210
|
+
if (argvs.scanner === ScannerTypes.CUSTOM && argvs.scanDuration > 0) {
|
211
|
+
throw new Error('-l or --scanDuration is not allowed for custom flow scans.');
|
212
|
+
}
|
213
|
+
return true;
|
214
|
+
})
|
197
215
|
.conflicts('d', 'w')
|
198
216
|
.parse() as unknown as Answers;
|
199
217
|
|
@@ -240,25 +258,32 @@ const scanInit = async (argvs: Answers): Promise<string> => {
|
|
240
258
|
isCustomFlow,
|
241
259
|
parseHeaders(updatedArgvs.header),
|
242
260
|
);
|
261
|
+
|
262
|
+
if (res.httpStatus) consoleLogger.info(`Connectivity Check HTTP Response Code: ${res.httpStatus}`);
|
263
|
+
|
243
264
|
switch (res.status) {
|
244
265
|
case statuses.success.code: {
|
245
266
|
updatedArgvs.finalUrl = res.url;
|
246
|
-
if (process.env.
|
267
|
+
if (process.env.OOBEE_VALIDATE_URL) {
|
247
268
|
console.log('Url is valid');
|
248
269
|
process.exit(0);
|
249
270
|
}
|
271
|
+
|
250
272
|
break;
|
251
273
|
}
|
252
274
|
case statuses.unauthorised.code: {
|
253
275
|
printMessage([statuses.unauthorised.message], messageOptions);
|
276
|
+
consoleLogger.info(statuses.unauthorised.message);
|
254
277
|
process.exit(res.status);
|
255
278
|
}
|
256
279
|
case statuses.cannotBeResolved.code: {
|
257
280
|
printMessage([statuses.cannotBeResolved.message], messageOptions);
|
281
|
+
consoleLogger.info(statuses.cannotBeResolved.message);
|
258
282
|
process.exit(res.status);
|
259
283
|
}
|
260
284
|
case statuses.systemError.code: {
|
261
285
|
printMessage([statuses.systemError.message], messageOptions);
|
286
|
+
consoleLogger.info(statuses.systemError.message);
|
262
287
|
process.exit(res.status);
|
263
288
|
}
|
264
289
|
case statuses.invalidUrl.code: {
|
@@ -267,6 +292,7 @@ const scanInit = async (argvs: Answers): Promise<string> => {
|
|
267
292
|
updatedArgvs.scanner !== ScannerTypes.LOCALFILE
|
268
293
|
) {
|
269
294
|
printMessage([statuses.invalidUrl.message], messageOptions);
|
295
|
+
consoleLogger.info(statuses.invalidUrl.message);
|
270
296
|
process.exit(res.status);
|
271
297
|
}
|
272
298
|
|
@@ -274,29 +300,35 @@ const scanInit = async (argvs: Answers): Promise<string> => {
|
|
274
300
|
if (finalFilePath) {
|
275
301
|
updatedArgvs.isLocalFileScan = true;
|
276
302
|
updatedArgvs.finalUrl = finalFilePath;
|
277
|
-
|
303
|
+
|
304
|
+
if (process.env.OOBEE_VALIDATE_URL) {
|
278
305
|
console.log('Url is valid');
|
279
306
|
process.exit(0);
|
280
307
|
}
|
281
308
|
} else if (updatedArgvs.scanner === ScannerTypes.LOCALFILE) {
|
282
309
|
printMessage([statuses.notALocalFile.message], messageOptions);
|
310
|
+
consoleLogger.info(statuses.notALocalFile.message);
|
283
311
|
process.exit(statuses.notALocalFile.code);
|
284
312
|
} else if (updatedArgvs.scanner !== ScannerTypes.SITEMAP) {
|
285
313
|
printMessage([statuses.notASitemap.message], messageOptions);
|
314
|
+
consoleLogger.info(statuses.notASitemap.message);
|
286
315
|
process.exit(statuses.notASitemap.code);
|
287
316
|
}
|
288
317
|
break;
|
289
318
|
}
|
290
319
|
case statuses.notASitemap.code: {
|
291
320
|
printMessage([statuses.notASitemap.message], messageOptions);
|
321
|
+
consoleLogger.info(statuses.notASitemap.message);
|
292
322
|
process.exit(res.status);
|
293
323
|
}
|
294
324
|
case statuses.notALocalFile.code: {
|
295
325
|
printMessage([statuses.notALocalFile.message], messageOptions);
|
326
|
+
consoleLogger.info(statuses.notALocalFile.message);
|
296
327
|
process.exit(res.status);
|
297
328
|
}
|
298
329
|
case statuses.browserError.code: {
|
299
330
|
printMessage([statuses.browserError.message], messageOptions);
|
331
|
+
consoleLogger.info(statuses.browserError.message);
|
300
332
|
process.exit(res.status);
|
301
333
|
}
|
302
334
|
default:
|
@@ -386,6 +418,7 @@ const optionsAnswer: Answers = {
|
|
386
418
|
playwrightDeviceDetailsObject: options.playwrightDeviceDetailsObject,
|
387
419
|
ruleset: options.ruleset,
|
388
420
|
generateJsonFiles: options.generateJsonFiles,
|
421
|
+
scanDuration: options.scanDuration,
|
389
422
|
};
|
390
423
|
|
391
424
|
await scanInit(optionsAnswer);
|
package/src/combine.ts
CHANGED
@@ -61,6 +61,7 @@ const combineRun = async (details: Data, deviceToScan: string) => {
|
|
61
61
|
zip,
|
62
62
|
ruleset, // Enable custom checks, Enable WCAG AAA: if checked, = 'enable-wcag-aaa')
|
63
63
|
generateJsonFiles,
|
64
|
+
scanDuration
|
64
65
|
} = envDetails;
|
65
66
|
|
66
67
|
process.env.CRAWLEE_LOG_LEVEL = 'ERROR';
|
@@ -73,7 +74,6 @@ const combineRun = async (details: Data, deviceToScan: string) => {
|
|
73
74
|
blacklistedPatterns = getBlackListedPatterns(blacklistedPatternsFilename);
|
74
75
|
} catch (error) {
|
75
76
|
consoleLogger.error(error);
|
76
|
-
silentLogger.error(error);
|
77
77
|
process.exit(1);
|
78
78
|
}
|
79
79
|
|
@@ -127,8 +127,8 @@ const combineRun = async (details: Data, deviceToScan: string) => {
|
|
127
127
|
break;
|
128
128
|
|
129
129
|
case ScannerTypes.SITEMAP:
|
130
|
-
|
131
|
-
url,
|
130
|
+
urlsCrawledObj = await crawlSitemap({
|
131
|
+
sitemapUrl: url,
|
132
132
|
randomToken,
|
133
133
|
host,
|
134
134
|
viewportSettings,
|
@@ -140,11 +140,12 @@ const combineRun = async (details: Data, deviceToScan: string) => {
|
|
140
140
|
blacklistedPatterns,
|
141
141
|
includeScreenshots,
|
142
142
|
extraHTTPHeaders,
|
143
|
-
|
143
|
+
scanDuration,
|
144
|
+
});
|
144
145
|
break;
|
145
146
|
|
146
147
|
case ScannerTypes.LOCALFILE:
|
147
|
-
urlsCrawledObj = await crawlLocalFile(
|
148
|
+
urlsCrawledObj = await crawlLocalFile({
|
148
149
|
url,
|
149
150
|
randomToken,
|
150
151
|
host,
|
@@ -157,7 +158,8 @@ const combineRun = async (details: Data, deviceToScan: string) => {
|
|
157
158
|
blacklistedPatterns,
|
158
159
|
includeScreenshots,
|
159
160
|
extraHTTPHeaders,
|
160
|
-
|
161
|
+
scanDuration,
|
162
|
+
});
|
161
163
|
break;
|
162
164
|
|
163
165
|
case ScannerTypes.INTELLIGENT:
|
@@ -177,6 +179,7 @@ const combineRun = async (details: Data, deviceToScan: string) => {
|
|
177
179
|
followRobots,
|
178
180
|
extraHTTPHeaders,
|
179
181
|
safeMode,
|
182
|
+
scanDuration
|
180
183
|
);
|
181
184
|
break;
|
182
185
|
|
@@ -196,6 +199,7 @@ const combineRun = async (details: Data, deviceToScan: string) => {
|
|
196
199
|
includeScreenshots,
|
197
200
|
followRobots,
|
198
201
|
extraHTTPHeaders,
|
202
|
+
scanDuration,
|
199
203
|
safeMode,
|
200
204
|
ruleset,
|
201
205
|
});
|
@@ -203,7 +207,6 @@ const combineRun = async (details: Data, deviceToScan: string) => {
|
|
203
207
|
|
204
208
|
default:
|
205
209
|
consoleLogger.error(`type: ${type} not defined`);
|
206
|
-
silentLogger.error(`type: ${type} not defined`);
|
207
210
|
process.exit(1);
|
208
211
|
}
|
209
212
|
|
package/src/constants/cliFunctions.ts
CHANGED
@@ -331,5 +331,14 @@ To obtain the JSON files, you need to base64-decode the file followed by gunzip.
|
|
331
331
|
throw new Error(`Invalid value "${value}" for --generate. Use "yes", "y", "no", or "n".`);
|
332
332
|
},
|
333
333
|
},
|
334
|
+
l: {
|
335
|
+
alias: 'scanDuration',
|
336
|
+
describe: 'Maximum scan duration in seconds (0 means unlimited)',
|
337
|
+
type: 'number',
|
338
|
+
requiresArg: true,
|
339
|
+
default: 0,
|
340
|
+
demandOption: false,
|
341
|
+
coerce: val => Number(val),
|
342
|
+
},
|
334
343
|
};
|
335
344
|
|
package/src/constants/common.ts
CHANGED
@@ -29,7 +29,7 @@ import constants, {
|
|
29
29
|
ScannerTypes,
|
30
30
|
BrowserTypes,
|
31
31
|
} from './constants.js';
|
32
|
-
import { silentLogger } from '../logs.js';
|
32
|
+
import { consoleLogger, silentLogger } from '../logs.js';
|
33
33
|
import { isUrlPdf } from '../crawlers/commonCrawlerFunc.js';
|
34
34
|
import { randomThreeDigitNumberString } from '../utils.js';
|
35
35
|
import { Answers, Data } from '../index.js';
|
@@ -54,8 +54,9 @@ export const validateDirPath = (dirPath: string): string => {
|
|
54
54
|
}
|
55
55
|
};
|
56
56
|
|
57
|
-
export class RES {
|
57
|
+
export class RES {
|
58
58
|
status: number;
|
59
|
+
httpStatus?: number;
|
59
60
|
url: string;
|
60
61
|
content: string;
|
61
62
|
constructor(res?: Partial<RES>) {
|
@@ -370,7 +371,6 @@ const requestToUrl = async (
|
|
370
371
|
} else {
|
371
372
|
res.status = constants.urlCheckStatuses.systemError.code;
|
372
373
|
}
|
373
|
-
silentLogger.error(error);
|
374
374
|
});
|
375
375
|
return res;
|
376
376
|
};
|
@@ -385,105 +385,89 @@ const checkUrlConnectivityWithBrowser = async (
|
|
385
385
|
) => {
|
386
386
|
const res = new RES();
|
387
387
|
|
388
|
+
const data = sanitizeUrlInput(url);
|
389
|
+
if (!data.isValid) {
|
390
|
+
res.status = constants.urlCheckStatuses.invalidUrl.code;
|
391
|
+
return res;
|
392
|
+
}
|
393
|
+
|
388
394
|
let viewport = null;
|
389
395
|
let userAgent = null;
|
396
|
+
if ('viewport' in playwrightDeviceDetailsObject) viewport = playwrightDeviceDetailsObject.viewport;
|
397
|
+
if ('userAgent' in playwrightDeviceDetailsObject) userAgent = playwrightDeviceDetailsObject.userAgent;
|
390
398
|
|
391
|
-
|
392
|
-
|
393
|
-
viewport = playwrightDeviceDetailsObject.viewport;
|
394
|
-
}
|
399
|
+
// Ensure Accept header for non-html content fallback
|
400
|
+
extraHTTPHeaders['Accept'] ||= 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
|
395
401
|
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
402
|
+
const launchOptions = getPlaywrightLaunchOptions(browserToRun);
|
403
|
+
const browserContextLaunchOptions = {
|
404
|
+
...launchOptions,
|
405
|
+
args: [...launchOptions.args, '--headless=new'],
|
406
|
+
};
|
400
407
|
|
401
|
-
|
402
|
-
|
408
|
+
let browserContext;
|
409
|
+
try {
|
410
|
+
browserContext = await constants.launcher.launchPersistentContext(clonedDataDir, {
|
411
|
+
...browserContextLaunchOptions,
|
412
|
+
...(viewport && { viewport }),
|
413
|
+
...(userAgent && { userAgent }),
|
414
|
+
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
415
|
+
});
|
416
|
+
} catch (err) {
|
417
|
+
printMessage([`Unable to launch browser\n${err}`], messageOptions);
|
418
|
+
res.status = constants.urlCheckStatuses.browserError.code;
|
419
|
+
return res;
|
420
|
+
}
|
403
421
|
|
404
|
-
|
405
|
-
|
422
|
+
try {
|
423
|
+
const page = await browserContext.newPage();
|
406
424
|
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
const launchOptions = getPlaywrightLaunchOptions(browserToRun);
|
411
|
-
const browserContextLaunchOptions = {
|
412
|
-
...launchOptions,
|
413
|
-
args: [...launchOptions.args, '--headless=new'],
|
414
|
-
};
|
415
|
-
|
416
|
-
browserContext = await constants.launcher.launchPersistentContext(clonedDataDir, {
|
417
|
-
...browserContextLaunchOptions,
|
418
|
-
...(viewport && { viewport }),
|
419
|
-
...(userAgent && { userAgent }),
|
420
|
-
...(extraHTTPHeaders && { extraHTTPHeaders }),
|
421
|
-
});
|
422
|
-
} catch (err) {
|
423
|
-
printMessage([`Unable to launch browser\n${err}`], messageOptions);
|
424
|
-
res.status = constants.urlCheckStatuses.browserError.code;
|
425
|
-
return res;
|
425
|
+
// Skip Playwright for PDF (use raw request instead)
|
426
|
+
if (isUrlPdf(url)) {
|
427
|
+
return await requestToUrl(url, false, extraHTTPHeaders);
|
426
428
|
}
|
427
429
|
|
428
|
-
|
429
|
-
|
430
|
+
const response = await page.goto(url, {
|
431
|
+
timeout: 30000,
|
432
|
+
...(proxy && { waitUntil: 'commit' }),
|
433
|
+
});
|
430
434
|
|
431
|
-
// method will not throw an error when any valid HTTP status code is returned by the remote server, including 404 "Not Found" and 500 "Internal Server Error".
|
432
|
-
// navigation to about:blank or navigation to the same URL with a different hash, which would succeed and return null.
|
433
435
|
try {
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
}
|
439
|
-
|
440
|
-
const response = await page.goto(url, {
|
441
|
-
timeout: 30000,
|
442
|
-
...(proxy && { waitUntil: 'commit' }),
|
443
|
-
});
|
436
|
+
await page.waitForLoadState('networkidle', { timeout: 10000 });
|
437
|
+
} catch {
|
438
|
+
consoleLogger.info('Unable to detect networkidle');
|
439
|
+
}
|
444
440
|
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
}
|
441
|
+
const status = response.status();
|
442
|
+
res.status = status === 401
|
443
|
+
? constants.urlCheckStatuses.unauthorised.code
|
444
|
+
: constants.urlCheckStatuses.success.code;
|
450
445
|
|
451
|
-
|
452
|
-
|
453
|
-
res.status = constants.urlCheckStatuses.unauthorised.code;
|
454
|
-
} else {
|
455
|
-
res.status = constants.urlCheckStatuses.success.code;
|
456
|
-
}
|
446
|
+
// Store the status code
|
447
|
+
res.httpStatus = response?.status?.() ?? 0;
|
457
448
|
|
458
|
-
|
459
|
-
|
460
|
-
res.url = url;
|
461
|
-
} else {
|
462
|
-
res.url = page.url();
|
463
|
-
}
|
449
|
+
// Store final navigated URL
|
450
|
+
res.url = isCustomFlow ? url : page.url();
|
464
451
|
|
465
|
-
|
452
|
+
// Check content type to determine how to extract content
|
453
|
+
const contentType = response.headers()['content-type'] || '';
|
466
454
|
|
467
|
-
|
468
|
-
|
469
|
-
|
455
|
+
if (contentType.includes('xml') || res.url.endsWith('.xml')) {
|
456
|
+
// Fetch raw content to avoid Playwright's HTML-wrapped <pre> behavior
|
457
|
+
const rawResponse = await requestToUrl(res.url, true, extraHTTPHeaders);
|
458
|
+
res.content = rawResponse.content;
|
459
|
+
} else {
|
460
|
+
res.content = await page.content(); // rendered DOM
|
461
|
+
}
|
470
462
|
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
res.status = constants.urlCheckStatuses.unauthorised.code;
|
477
|
-
} else {
|
478
|
-
// enters here if input is not a URL or not using http/https protocols
|
479
|
-
res.status = constants.urlCheckStatuses.systemError.code;
|
480
|
-
}
|
481
|
-
} finally {
|
482
|
-
await browserContext.close();
|
463
|
+
} catch (error) {
|
464
|
+
if (error.message.includes('net::ERR_INVALID_AUTH_CREDENTIALS')) {
|
465
|
+
res.status = constants.urlCheckStatuses.unauthorised.code;
|
466
|
+
} else {
|
467
|
+
res.status = constants.urlCheckStatuses.systemError.code;
|
483
468
|
}
|
484
|
-
}
|
485
|
-
|
486
|
-
res.status = constants.urlCheckStatuses.invalidUrl.code;
|
469
|
+
} finally {
|
470
|
+
await browserContext.close();
|
487
471
|
}
|
488
472
|
|
489
473
|
return res;
|
@@ -597,6 +581,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
597
581
|
zip,
|
598
582
|
ruleset,
|
599
583
|
generateJsonFiles,
|
584
|
+
scanDuration
|
600
585
|
} = argv;
|
601
586
|
|
602
587
|
// construct filename for scan results
|
@@ -644,6 +629,7 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
|
|
644
629
|
zip,
|
645
630
|
ruleset,
|
646
631
|
generateJsonFiles,
|
632
|
+
scanDuration,
|
647
633
|
};
|
648
634
|
};
|
649
635
|
|
@@ -662,7 +648,7 @@ export const getUrlsFromRobotsTxt = async (url: string, browserToRun: string): P
|
|
662
648
|
robotsTxt = await getRobotsTxtViaAxios(robotsUrl);
|
663
649
|
}
|
664
650
|
} catch (e) {
|
665
|
-
|
651
|
+
// if robots.txt is not found, do nothing
|
666
652
|
}
|
667
653
|
console.log('robotsTxt', robotsTxt);
|
668
654
|
if (!robotsTxt) {
|
@@ -1019,7 +1005,7 @@ export const getLinksFromSitemap = async (
|
|
1019
1005
|
|
1020
1006
|
switch (sitemapType) {
|
1021
1007
|
case constants.xmlSitemapTypes.xmlIndex:
|
1022
|
-
|
1008
|
+
consoleLogger.info(`This is a XML format sitemap index.`);
|
1023
1009
|
for (const childSitemapUrl of $('loc')) {
|
1024
1010
|
const childSitemapUrlText = $(childSitemapUrl).text();
|
1025
1011
|
if (isLimitReached()) {
|
@@ -1033,19 +1019,19 @@ export const getLinksFromSitemap = async (
|
|
1033
1019
|
}
|
1034
1020
|
break;
|
1035
1021
|
case constants.xmlSitemapTypes.xml:
|
1036
|
-
|
1022
|
+
consoleLogger.info(`This is a XML format sitemap.`);
|
1037
1023
|
await processXmlSitemap($, sitemapType, 'loc', 'lastmod', 'url');
|
1038
1024
|
break;
|
1039
1025
|
case constants.xmlSitemapTypes.rss:
|
1040
|
-
|
1026
|
+
consoleLogger.info(`This is a RSS format sitemap.`);
|
1041
1027
|
await processXmlSitemap($, sitemapType, 'link', 'pubDate', 'item');
|
1042
1028
|
break;
|
1043
1029
|
case constants.xmlSitemapTypes.atom:
|
1044
|
-
|
1030
|
+
consoleLogger.info(`This is a Atom format sitemap.`);
|
1045
1031
|
await processXmlSitemap($, sitemapType, 'link', 'published', 'entry');
|
1046
1032
|
break;
|
1047
1033
|
default:
|
1048
|
-
|
1034
|
+
consoleLogger.info(`This is an unrecognised XML sitemap format.`);
|
1049
1035
|
processNonStandardSitemap(data);
|
1050
1036
|
}
|
1051
1037
|
};
|
@@ -1053,7 +1039,7 @@ export const getLinksFromSitemap = async (
|
|
1053
1039
|
try {
|
1054
1040
|
await fetchUrls(sitemapUrl);
|
1055
1041
|
} catch (e) {
|
1056
|
-
|
1042
|
+
consoleLogger.error(e);
|
1057
1043
|
}
|
1058
1044
|
|
1059
1045
|
const requestList = Object.values(urls);
|
@@ -1262,7 +1248,7 @@ const cloneChromeProfileCookieFiles = (options: GlobOptionsWithFileTypesFalse, d
|
|
1262
1248
|
try {
|
1263
1249
|
fs.copyFileSync(dir, path.join(destProfileDir, 'Cookies'));
|
1264
1250
|
} catch (err) {
|
1265
|
-
|
1251
|
+
consoleLogger.error(err);
|
1266
1252
|
if (err.code === 'EBUSY') {
|
1267
1253
|
console.log(
|
1268
1254
|
`Unable to copy the file for ${profileName} because it is currently in use.`,
|
@@ -1284,7 +1270,7 @@ const cloneChromeProfileCookieFiles = (options: GlobOptionsWithFileTypesFalse, d
|
|
1284
1270
|
return success;
|
1285
1271
|
}
|
1286
1272
|
|
1287
|
-
|
1273
|
+
consoleLogger.warn('Unable to find Chrome profile cookies file in the system.');
|
1288
1274
|
printMessage(['Unable to find Chrome profile cookies file in the system.'], messageOptions);
|
1289
1275
|
return false;
|
1290
1276
|
};
|
@@ -1338,7 +1324,7 @@ const cloneEdgeProfileCookieFiles = (options: GlobOptionsWithFileTypesFalse, des
|
|
1338
1324
|
try {
|
1339
1325
|
fs.copyFileSync(dir, path.join(destProfileDir, 'Cookies'));
|
1340
1326
|
} catch (err) {
|
1341
|
-
|
1327
|
+
consoleLogger.error(err);
|
1342
1328
|
if (err.code === 'EBUSY') {
|
1343
1329
|
console.log(
|
1344
1330
|
`Unable to copy the file for ${profileName} because it is currently in use.`,
|
@@ -1357,7 +1343,7 @@ const cloneEdgeProfileCookieFiles = (options: GlobOptionsWithFileTypesFalse, des
|
|
1357
1343
|
});
|
1358
1344
|
return success;
|
1359
1345
|
}
|
1360
|
-
|
1346
|
+
consoleLogger.warn('Unable to find Edge profile cookies file in the system.');
|
1361
1347
|
printMessage(['Unable to find Edge profile cookies file in the system.'], messageOptions);
|
1362
1348
|
return false;
|
1363
1349
|
};
|
@@ -1383,7 +1369,7 @@ const cloneLocalStateFile = (options: GlobOptionsWithFileTypesFalse, destDir: st
|
|
1383
1369
|
try {
|
1384
1370
|
fs.copyFileSync(dir, path.join(destDir, 'Local State'));
|
1385
1371
|
} catch (err) {
|
1386
|
-
|
1372
|
+
consoleLogger.error(err);
|
1387
1373
|
if (err.code === 'EBUSY') {
|
1388
1374
|
console.log(`Unable to copy the file because it is currently in use.`);
|
1389
1375
|
console.log('Please close any applications that might be using this file and try again.');
|
@@ -1398,7 +1384,7 @@ const cloneLocalStateFile = (options: GlobOptionsWithFileTypesFalse, destDir: st
|
|
1398
1384
|
});
|
1399
1385
|
return success;
|
1400
1386
|
}
|
1401
|
-
|
1387
|
+
consoleLogger.warn('Unable to find local state file in the system.');
|
1402
1388
|
printMessage(['Unable to find local state file in the system.'], messageOptions);
|
1403
1389
|
return false;
|
1404
1390
|
};
|
@@ -1561,7 +1547,7 @@ export const deleteClonedChromeProfiles = (randomToken?: string): void => {
|
|
1561
1547
|
try {
|
1562
1548
|
fs.rmSync(dir, { recursive: true });
|
1563
1549
|
} catch (err) {
|
1564
|
-
|
1550
|
+
consoleLogger.error(
|
1565
1551
|
`CHROME Unable to delete ${dir} folder in the Chrome data directory. ${err}`,
|
1566
1552
|
);
|
1567
1553
|
}
|
@@ -1570,7 +1556,7 @@ export const deleteClonedChromeProfiles = (randomToken?: string): void => {
|
|
1570
1556
|
return;
|
1571
1557
|
}
|
1572
1558
|
|
1573
|
-
|
1559
|
+
consoleLogger.warn('Unable to find oobee directory in the Chrome data directory.');
|
1574
1560
|
console.warn('Unable to find oobee directory in the Chrome data directory.');
|
1575
1561
|
};
|
1576
1562
|
|
@@ -1605,7 +1591,7 @@ export const deleteClonedEdgeProfiles = (randomToken?: string): void => {
|
|
1605
1591
|
try {
|
1606
1592
|
fs.rmSync(dir, { recursive: true });
|
1607
1593
|
} catch (err) {
|
1608
|
-
|
1594
|
+
consoleLogger.error(
|
1609
1595
|
`EDGE Unable to delete ${dir} folder in the Chrome data directory. ${err}`,
|
1610
1596
|
);
|
1611
1597
|
}
|
@@ -1637,7 +1623,7 @@ export const deleteClonedChromiumProfiles = (randomToken?: string): void => {
|
|
1637
1623
|
try {
|
1638
1624
|
fs.rmSync(dir, { recursive: true });
|
1639
1625
|
} catch (err) {
|
1640
|
-
|
1626
|
+
consoleLogger.error(
|
1641
1627
|
`CHROMIUM Unable to delete ${dir} folder in the Chromium data directory. ${err}`,
|
1642
1628
|
);
|
1643
1629
|
}
|
@@ -1646,7 +1632,7 @@ export const deleteClonedChromiumProfiles = (randomToken?: string): void => {
|
|
1646
1632
|
return;
|
1647
1633
|
}
|
1648
1634
|
|
1649
|
-
|
1635
|
+
consoleLogger.warn('Unable to find oobee directory in Chromium support directory');
|
1650
1636
|
console.warn('Unable to find oobee directory in Chromium support directory');
|
1651
1637
|
};
|
1652
1638
|
|
@@ -1723,10 +1709,10 @@ export const submitFormViaPlaywright = async (
|
|
1723
1709
|
try {
|
1724
1710
|
await page.waitForLoadState('networkidle', { timeout: 10000 });
|
1725
1711
|
} catch {
|
1726
|
-
|
1712
|
+
consoleLogger.info('Unable to detect networkidle');
|
1727
1713
|
}
|
1728
1714
|
} catch (error) {
|
1729
|
-
|
1715
|
+
consoleLogger.error(error);
|
1730
1716
|
} finally {
|
1731
1717
|
await browserContext.close();
|
1732
1718
|
if (proxy && browserToRun === BrowserTypes.EDGE) {
|
@@ -1842,7 +1828,11 @@ export const getPlaywrightLaunchOptions = (browser?: string): LaunchOptions => {
|
|
1842
1828
|
}
|
1843
1829
|
|
1844
1830
|
// Set new headless mode as Chrome 132 does not support headless=old
|
1845
|
-
|
1831
|
+
// Also mute audio
|
1832
|
+
if (process.env.CRAWLEE_HEADLESS === '1') {
|
1833
|
+
constants.launchOptionsArgs.push('--headless=new');
|
1834
|
+
constants.launchOptionsArgs.push('--mute-audio');
|
1835
|
+
}
|
1846
1836
|
|
1847
1837
|
const options: LaunchOptions = {
|
1848
1838
|
// Drop the --use-mock-keychain flag to allow MacOS devices
|