@govtechsg/oobee 0.10.50 → 0.10.57
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/bump-package-version.yml +58 -0
- package/.github/workflows/image.yml +38 -17
- package/DETAILS.md +5 -2
- package/INTEGRATION.md +57 -53
- package/README.md +4 -1
- package/__tests__/test-sitemap-url-patterns.xml +105 -0
- package/exclusions.txt +1 -0
- package/package.json +7 -6
- package/src/cli.ts +35 -2
- package/src/combine.ts +10 -7
- package/src/constants/cliFunctions.ts +9 -0
- package/src/constants/common.ts +95 -105
- package/src/constants/constants.ts +47 -2
- package/src/crawlers/commonCrawlerFunc.ts +50 -5
- package/src/crawlers/crawlDomain.ts +112 -73
- package/src/crawlers/crawlIntelligentSitemap.ts +40 -36
- package/src/crawlers/crawlLocalFile.ts +77 -35
- package/src/crawlers/crawlSitemap.ts +156 -89
- package/src/index.ts +2 -0
- package/src/logs.ts +4 -2
- package/src/mergeAxeResults.ts +20 -9
- package/src/npmIndex.ts +1 -1
- package/src/screenshotFunc/htmlScreenshotFunc.ts +7 -5
- package/src/screenshotFunc/pdfScreenshotFunc.ts +2 -2
- package/src/static/ejs/partials/components/wcagCompliance.ejs +1 -1
- package/src/static/ejs/partials/scripts/ruleOffcanvas.ejs +1 -0
- package/src/static/ejs/partials/styles/styles.ejs +11 -0
- package/src/static/ejs/report.ejs +14 -1
- package/src/utils.ts +3 -3
package/src/crawlers/crawlIntelligentSitemap.ts

@@ -2,7 +2,7 @@ import fs from 'fs';
 import { chromium, Page } from 'playwright';
 import { createCrawleeSubFolders } from './commonCrawlerFunc.js';
 import constants, { guiInfoStatusTypes, sitemapPaths } from '../constants/constants.js';
-import {
+import { consoleLogger, guiInfoLog } from '../logs.js';
 import crawlDomain from './crawlDomain.js';
 import crawlSitemap from './crawlSitemap.js';
 import { EnqueueStrategy } from 'crawlee';
@@ -24,46 +24,42 @@ const crawlIntelligentSitemap = async (
   followRobots: boolean,
   extraHTTPHeaders: Record<string, string>,
   safeMode: boolean,
+  scanDuration: number
 ) => {
+  const startTime = Date.now(); // Track start time
+
   let urlsCrawledFinal;
-  let urlsCrawled;
+  let urlsCrawled = { ...constants.urlsCrawledObj };
   let dataset;
   let sitemapExist = false;
   const fromCrawlIntelligentSitemap = true;
   let sitemapUrl;

-  urlsCrawled = { ...constants.urlsCrawledObj };
   ({ dataset } = await createCrawleeSubFolders(randomToken));
-
   if (!fs.existsSync(randomToken)) {
     fs.mkdirSync(randomToken);
   }

   function getHomeUrl(parsedUrl: string) {
     const urlObject = new URL(parsedUrl);
-    if (urlObject.username
+    if (urlObject.username && urlObject.password) {
       return `${urlObject.protocol}//${urlObject.username}:${urlObject.password}@${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
     }
-
     return `${urlObject.protocol}//${urlObject.hostname}${urlObject.port ? `:${urlObject.port}` : ''}`;
   }

   async function findSitemap(link: string) {
     const homeUrl = getHomeUrl(link);
-    let sitemapLinkFound = false;
     let sitemapLink = '';
-    const chromiumBrowser = await chromium.launch(
-
-
-
-
-    });
-
+    const chromiumBrowser = await chromium.launch({
+      headless: false,
+      channel: 'chrome',
+      args: ['--headless=new', '--no-sandbox'],
+    });
     const page = await chromiumBrowser.newPage();
     for (const path of sitemapPaths) {
       sitemapLink = homeUrl + path;
-
-      if (sitemapLinkFound) {
+      if (await checkUrlExists(page, sitemapLink)) {
         sitemapExist = true;
         break;
       }
@@ -75,12 +71,9 @@ const crawlIntelligentSitemap = async (
   const checkUrlExists = async (page: Page, parsedUrl: string) => {
     try {
       const response = await page.goto(parsedUrl);
-
-        return true;
-      }
-      return false;
+      return response.ok();
     } catch (e) {
-
+      consoleLogger.error(e);
       return false;
     }
   };
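The reworked checkUrlExists above treats a navigation error or a non-2xx response as "not found". Below is a standalone sketch of the same probe using Playwright's page.goto() and Response.ok(); the candidate path list is illustrative and not the package's sitemapPaths constant.

```ts
import { chromium } from 'playwright';

// Minimal standalone version of the probe above: navigate to each candidate
// sitemap path and treat a thrown error or non-2xx status as "does not exist".
const probeSitemap = async (homeUrl: string): Promise<string | null> => {
  const browser = await chromium.launch();
  const page = await browser.newPage();
  try {
    for (const path of ['/sitemap.xml', '/sitemap_index.xml']) {
      const candidate = homeUrl + path;
      try {
        const response = await page.goto(candidate);
        if (response && response.ok()) return candidate; // 2xx status
      } catch {
        // Navigation failures (DNS, TLS, timeouts) just mean "try the next path".
      }
    }
    return null;
  } finally {
    await browser.close();
  }
};

probeSitemap('https://example.com').then(found => console.log(found ?? 'no sitemap found'));
```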
@@ -88,13 +81,12 @@ const crawlIntelligentSitemap = async (
   try {
     sitemapUrl = await findSitemap(url);
   } catch (error) {
-
+    consoleLogger.error(error);
   }

   if (!sitemapExist) {
     console.log('Unable to find sitemap. Commencing website crawl instead.');
-
-    urlsCrawledFinal = await crawlDomain({
+    return await crawlDomain({
       url,
       randomToken,
       host,
@@ -109,12 +101,13 @@ const crawlIntelligentSitemap = async (
       includeScreenshots,
       followRobots,
       extraHTTPHeaders,
+      safeMode,
+      scanDuration, // Use full duration since no sitemap
     });
-    return urlsCrawledFinal;
   }
+
   console.log(`Sitemap found at ${sitemapUrl}`);
-
-  urlsCrawledFinal = await crawlSitemap(
+  urlsCrawledFinal = await crawlSitemap({
     sitemapUrl,
     randomToken,
     host,
@@ -128,14 +121,21 @@ const crawlIntelligentSitemap = async (
     includeScreenshots,
     extraHTTPHeaders,
     fromCrawlIntelligentSitemap,
-    url,
-    dataset,
-    urlsCrawled,
-    false,
-
+    userUrlInputFromIntelligent: url,
+    datasetFromIntelligent: dataset,
+    urlsCrawledFromIntelligent: urlsCrawled,
+    crawledFromLocalFile: false,
+    scanDuration,
+  });
+
+  const elapsed = Date.now() - startTime;
+  const remainingScanDuration = Math.max(scanDuration - elapsed / 1000, 0); // in seconds

-  if (
-
+  if (
+    urlsCrawledFinal.scanned.length < maxRequestsPerCrawl &&
+    remainingScanDuration > 0
+  ) {
+    console.log(`Continuing crawl from root website. Remaining scan time: ${remainingScanDuration.toFixed(1)}s`);
     urlsCrawledFinal = await crawlDomain({
       url,
       randomToken,
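The duration budgeting added above is plain milliseconds-to-seconds arithmetic: the sitemap pass consumes part of the budget, and only the clamped remainder is handed to the follow-up domain crawl. Here is a minimal standalone sketch of that arithmetic; the function and variable names are illustrative, not taken from the package.

```ts
// Minimal sketch of the duration-budgeting arithmetic used above.
// `scanDurationSeconds` and `startTimeMs` are illustrative inputs.
function remainingBudgetSeconds(scanDurationSeconds: number, startTimeMs: number): number {
  const elapsedMs = Date.now() - startTimeMs;
  // Convert elapsed milliseconds to seconds and clamp at zero,
  // mirroring Math.max(scanDuration - elapsed / 1000, 0).
  return Math.max(scanDurationSeconds - elapsedMs / 1000, 0);
}

// Example: a 300 s budget where the sitemap pass took 200 s leaves ~100 s for
// the follow-up domain crawl; 400 s elapsed would leave 0 s and skip it.
const startTimeMs = Date.now() - 200_000; // pretend the sitemap pass took 200 s
console.log(remainingBudgetSeconds(300, startTimeMs).toFixed(1)); // ≈ 100.0
```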
@@ -153,12 +153,16 @@ const crawlIntelligentSitemap = async (
       extraHTTPHeaders,
       safeMode,
       fromCrawlIntelligentSitemap,
-      datasetFromIntelligent: dataset,
-      urlsCrawledFromIntelligent: urlsCrawledFinal,
+      datasetFromIntelligent: dataset,
+      urlsCrawledFromIntelligent: urlsCrawledFinal,
+      scanDuration: remainingScanDuration,
     });
+  } else if (remainingScanDuration <= 0) {
+    console.log(`Crawl duration exceeded before more pages could be found (limit: ${scanDuration}s).`);
   }

   guiInfoLog(guiInfoStatusTypes.COMPLETED, {});
   return urlsCrawledFinal;
 };
+
 export default crawlIntelligentSitemap;
package/src/crawlers/crawlLocalFile.ts

@@ -1,12 +1,15 @@
-import { Request, RequestList } from 'crawlee';
-import printMessage from 'print-message';
+import { Request, RequestList, Dataset } from 'crawlee';
 import fs from 'fs';
 import path from 'path';
 import { createCrawleeSubFolders, runAxeScript, isUrlPdf } from './commonCrawlerFunc.js';
-import constants, {
+import constants, {
+  guiInfoStatusTypes,
+  basicAuthRegex,
+  UrlsCrawled,
+} from '../constants/constants.js';
+import { ViewportSettingsClass } from '../combine.js';
 import {
   getPlaywrightLaunchOptions,
-  messageOptions,
   isFilePath,
   convertLocalFileToPath,
   convertPathToLocalFile,
@@ -16,27 +19,47 @@ import { runPdfScan, mapPdfScanResults, doPdfScreenshots } from './pdfScanFunc.j
 import { guiInfoLog } from '../logs.js';
 import crawlSitemap from './crawlSitemap.js';

-const crawlLocalFile = async (
-
-  randomToken
-  host
-  viewportSettings
-  maxRequestsPerCrawl
-  browser
-  userDataDirectory
-  specifiedMaxConcurrency
-  fileTypes
-  blacklistedPatterns
-  includeScreenshots
-  extraHTTPHeaders
-
-
-
-
-
+export const crawlLocalFile = async ({
+  url,
+  randomToken,
+  host,
+  viewportSettings,
+  maxRequestsPerCrawl,
+  browser,
+  userDataDirectory,
+  specifiedMaxConcurrency,
+  fileTypes,
+  blacklistedPatterns,
+  includeScreenshots,
+  extraHTTPHeaders,
+  scanDuration = 0,
+  fromCrawlIntelligentSitemap = false,
+  userUrlInputFromIntelligent = null,
+  datasetFromIntelligent = null,
+  urlsCrawledFromIntelligent = null,
+}: {
+  url: string;
+  randomToken: string;
+  host: string;
+  viewportSettings: ViewportSettingsClass;
+  maxRequestsPerCrawl: number;
+  browser: string;
+  userDataDirectory: string;
+  specifiedMaxConcurrency: number;
+  fileTypes: string;
+  blacklistedPatterns: string[];
+  includeScreenshots: boolean;
+  extraHTTPHeaders: Record<string, string>;
+  scanDuration?: number;
+  fromCrawlIntelligentSitemap?: boolean;
+  userUrlInputFromIntelligent?: string | null;
+  datasetFromIntelligent?: Dataset | null;
+  urlsCrawledFromIntelligent?: UrlsCrawled | null;
+}) => {
   let dataset: any;
-  let urlsCrawled:
+  let urlsCrawled: UrlsCrawled;
   let linksFromSitemap = [];
+  let sitemapUrl = url;

   // Boolean to omit axe scan for basic auth URL
   let isBasicAuth: boolean;
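The signature change above follows the usual positional-arguments-to-options-object refactor: callers name each field, optional fields carry defaults, and new options such as scanDuration can be added without disturbing existing call sites. A small self-contained sketch of the pattern follows, with a hypothetical interface and values rather than the package's real types.

```ts
// Sketch of the positional-to-options-object refactor pattern used above.
// `LocalScanOptions` and `scanLocalFile` are illustrative names.
interface LocalScanOptions {
  url: string;
  randomToken: string;
  scanDuration?: number;               // seconds; 0 means "no limit"
  fromCrawlIntelligentSitemap?: boolean;
}

// Destructuring with defaults keeps optional flags out of every call site.
const scanLocalFile = async ({
  url,
  randomToken,
  scanDuration = 0,
  fromCrawlIntelligentSitemap = false,
}: LocalScanOptions) => {
  console.log(url, randomToken, scanDuration, fromCrawlIntelligentSitemap);
};

// Callers name only what they need; further options can be added later
// without breaking an established positional argument order.
scanLocalFile({ url: 'file:///tmp/page.html', randomToken: 'abc123' }).catch(console.error);
```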
@@ -82,7 +105,7 @@ const crawlLocalFile = async (
     // Non XML file
   } else {
     // Put it to crawlSitemap function to handle xml files
-    const updatedUrlsCrawled = await crawlSitemap(
+    const updatedUrlsCrawled = await crawlSitemap({
      sitemapUrl,
      randomToken,
      host,
@@ -95,12 +118,13 @@ const crawlLocalFile = async (
      blacklistedPatterns,
      includeScreenshots,
      extraHTTPHeaders,
-
-
-
-
-
-
+      scanDuration,
+      fromCrawlIntelligentSitemap,
+      userUrlInputFromIntelligent,
+      datasetFromIntelligent,
+      urlsCrawledFromIntelligent,
+      crawledFromLocalFile: true,
+    });

     urlsCrawled = { ...urlsCrawled, ...updatedUrlsCrawled };
     return urlsCrawled;
@@ -124,16 +148,12 @@ const crawlLocalFile = async (

   const uuidToPdfMapping: Record<string, string> = {}; // key and value of string type

-  printMessage(['Fetching URLs. This might take some time...'], { border: false });
-
   finalLinks = [...finalLinks, ...linksFromSitemap];

   await RequestList.open({
     sources: finalLinks,
   });

-  printMessage(['Fetch URLs completed. Beginning scan'], messageOptions);
-
   const request = linksFromSitemap[0];
   const pdfFileName = path.basename(request.url);
   const trimmedUrl: string = request.url;
@@ -142,6 +162,8 @@ const crawlLocalFile = async (
   fs.writeFileSync(destinationFilePath, data);
   uuidToPdfMapping[pdfFileName] = trimmedUrl;

+  let shouldAbort = false;
+
   if (!isUrlPdf(request.url)) {
     await initModifiedUserAgent(browser);
     const browserContext = await constants.launcher.launchPersistentContext('', {
@@ -150,9 +172,24 @@ const crawlLocalFile = async (
       ...playwrightDeviceDetailsObject,
     });

+    const timeoutId = scanDuration > 0
+      ? setTimeout(() => {
+          console.log(`Crawl duration of ${scanDuration}s exceeded. Aborting local file scan.`);
+          shouldAbort = true;
+        }, scanDuration * 1000)
+      : null;
+
     const page = await browserContext.newPage();
     request.url = convertPathToLocalFile(request.url);
     await page.goto(request.url);
+
+    if (shouldAbort) {
+      console.warn('Scan aborted due to timeout before page scan.');
+      await dataset.pushData({ scanned: [], scannedRedirects: [] });
+      await browserContext.close().catch(() => {});
+      return urlsCrawled;
+    }
+
     const results = await runAxeScript({ includeScreenshots, page, randomToken });

     const actualUrl = page.url() || request.loadedUrl || request.url;
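The abort handling added above relies on a simple timer-and-flag pattern: a setTimeout flips a boolean once the budget expires, and later steps check the flag before doing expensive work. Below is a self-contained sketch under that assumption; the simulated work and the clearTimeout cleanup are illustrative, not taken from the package.

```ts
// Standalone sketch of the timeout-flag guard shown above: a timer sets a
// boolean after the budget expires, and later steps bail out if it is set.
const sleep = (ms: number) => new Promise<void>(resolve => setTimeout(resolve, ms));

async function scanWithBudget(scanDurationSeconds: number): Promise<string> {
  let shouldAbort = false;
  const timeoutId = scanDurationSeconds > 0
    ? setTimeout(() => {
        shouldAbort = true;
      }, scanDurationSeconds * 1000)
    : null;

  await sleep(1500); // stand-in for page.goto() on the local file

  if (shouldAbort) {
    // Bail out before the expensive accessibility scan, as the hunk above does.
    return 'aborted';
  }

  await sleep(500); // stand-in for runAxeScript()
  if (timeoutId) clearTimeout(timeoutId); // cleanup in this sketch
  return 'completed';
}

scanWithBudget(1).then(result => console.log(result)); // logs "aborted"
```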
@@ -178,7 +215,11 @@ const crawlLocalFile = async (

     await dataset.pushData(results);
   } else {
-    urlsCrawled.scanned.push({
+    urlsCrawled.scanned.push({
+      url: trimmedUrl,
+      pageTitle: pdfFileName,
+      actualUrl: trimmedUrl,
+    });

     await runPdfScan(randomToken);
     // transform result format
@@ -192,6 +233,7 @@ const crawlLocalFile = async (
     // push results for each pdf document to key value store
     await Promise.all(pdfResults.map(result => dataset.pushData(result)));
   }
+
   return urlsCrawled;
 };
 export default crawlLocalFile;