@govtechsg/oobee 0.10.57 → 0.10.58
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/package.json +1 -1
- package/src/crawlers/commonCrawlerFunc.ts +35 -1
- package/src/crawlers/crawlDomain.ts +14 -120
- package/src/crawlers/pdfScanFunc.ts +2 -0
package/README.md
CHANGED
@@ -84,7 +84,7 @@ verapdf --version
|
|
84
84
|
| Variable Name | Description | Default |
|
85
85
|
| ------------- | ----------- | ------- |
|
86
86
|
| OOBEE_VERBOSE | When set to `true`, log output goes to console | `false` |
|
87
|
-
| OOBEE_FAST_CRAWLER| When set to `true`, increases scan concurrency at a rapid rate. Experimental, may cause system stability issues. | `false`|
|
87
|
+
| OOBEE_FAST_CRAWLER| When set to `true`, increases scan concurrency at a rapid rate. Experimental, may cause system stability issues on low-powered devices. | `false`|
|
88
88
|
| OOBEE_VALIDATE_URL| When set to `true`, validates if URLs are valid and exits. | `false` |
|
89
89
|
| WARN_LEVEL | Only used in tests. | |
|
90
90
|
|
package/package.json
CHANGED
@@ -20,6 +20,7 @@ import { findElementByCssSelector } from './custom/findElementByCssSelector.js';
|
|
20
20
|
import { getAxeConfiguration } from './custom/getAxeConfiguration.js';
|
21
21
|
import { flagUnlabelledClickableElements } from './custom/flagUnlabelledClickableElements.js';
|
22
22
|
import xPathToCss from './custom/xPathToCss.js';
|
23
|
+
import type { Response as PlaywrightResponse } from 'playwright';
|
23
24
|
|
24
25
|
// types
|
25
26
|
interface AxeResultsWithScreenshot extends AxeResults {
|
@@ -552,4 +553,37 @@ export async function shouldSkipClickDueToDisallowedHref(
|
|
552
553
|
disallowedPrefixes: disallowedListOfPatterns,
|
553
554
|
}
|
554
555
|
);
|
555
|
-
}
|
556
|
+
}
|
557
|
+
|
558
|
+
/**
 * Decide from the response headers whether a response is content that should
 * not be scanned as a page (attachments/downloads, binaries, non-text media).
 *
 * Only the leading token of each header is inspected (the disposition type of
 * `content-disposition`, the media type of `content-type`); parameters such as
 * `filename=` or `charset=` are ignored. Comparisons are case-insensitive.
 *
 * @param response - Playwright Response object; `null`/`undefined` is tolerated
 *   and treated as "do not skip"
 * @param requestUrl - Optional: request URL for logging
 * @returns true if the content should be skipped
 */
export const shouldSkipDueToUnsupportedContent = (
  response: PlaywrightResponse | null | undefined,
  requestUrl: string = '',
): boolean => {
  if (!response) return false;

  // Extract the part before the first ';', trimmed and lower-cased, so that
  // "Text/HTML; charset=UTF-8" and "text/html" are treated the same.
  const leadingToken = (value: string | undefined): string =>
    ((value ?? '').split(';', 1)[0] ?? '').trim().toLowerCase();

  const headers = response.headers();
  const dispositionType = leadingToken(headers['content-disposition']);
  const mediaType = leadingToken(headers['content-type']);

  // Match the disposition type itself rather than any substring, so that
  // `inline; filename="attachment.html"` is not mistaken for a download.
  if (dispositionType === 'attachment') {
    // consoleLogger.info(`Skipping attachment (content-disposition) at ${requestUrl}`);
    return true;
  }

  // No declared content type: nothing in the headers marks the response as
  // unsupported, so do not skip it on that basis.
  if (mediaType === '') {
    return false;
  }

  // NOTE(review): the `application/` rule also skips `application/xhtml+xml`,
  // even though it contains "html" — confirm that this is intended.
  if (
    mediaType.startsWith('application/') ||
    mediaType.includes('octet-stream') ||
    (!mediaType.startsWith('text/') && !mediaType.includes('html'))
  ) {
    // consoleLogger.info(`Skipping non-processible content-type "${mediaType}" at ${requestUrl}`);
    return true;
  }

  return false;
};
|
@@ -2,9 +2,6 @@ import crawlee, { EnqueueStrategy } from 'crawlee';
|
|
2
2
|
import fs from 'fs';
|
3
3
|
import type { BrowserContext, ElementHandle, Frame, Page } from 'playwright';
|
4
4
|
import type { EnqueueLinksOptions, RequestOptions } from 'crawlee';
|
5
|
-
import axios from 'axios';
|
6
|
-
import { fileTypeFromBuffer } from 'file-type';
|
7
|
-
import mime from 'mime-types';
|
8
5
|
import https from 'https';
|
9
6
|
import type { BatchAddRequestsResult } from '@crawlee/types';
|
10
7
|
import {
|
@@ -12,6 +9,7 @@ import {
|
|
12
9
|
runAxeScript,
|
13
10
|
isUrlPdf,
|
14
11
|
shouldSkipClickDueToDisallowedHref,
|
12
|
+
shouldSkipDueToUnsupportedContent,
|
15
13
|
} from './commonCrawlerFunc.js';
|
16
14
|
import constants, {
|
17
15
|
UrlsCrawled,
|
@@ -168,95 +166,6 @@ const crawlDomain = async ({
|
|
168
166
|
});
|
169
167
|
}
|
170
168
|
|
171
|
-
const httpHeadCache = new Map<string, boolean>();
|
172
|
-
const isProcessibleUrl = async (url: string): Promise<boolean> => {
|
173
|
-
if (httpHeadCache.has(url)) {
|
174
|
-
consoleLogger.info(`Skipping request as URL has been processed before: ${url}}`);
|
175
|
-
return false; // return false to avoid processing the same url again
|
176
|
-
}
|
177
|
-
|
178
|
-
try {
|
179
|
-
// Send a HEAD request to check headers without downloading the file
|
180
|
-
const headResponse = await axios.head(url, {
|
181
|
-
headers: { Authorization: authHeader },
|
182
|
-
httpsAgent,
|
183
|
-
});
|
184
|
-
const contentType = headResponse.headers['content-type'] || '';
|
185
|
-
const contentDisposition = headResponse.headers['content-disposition'] || '';
|
186
|
-
|
187
|
-
// Check if the response suggests it's a downloadable file based on Content-Disposition header
|
188
|
-
if (contentDisposition.includes('attachment')) {
|
189
|
-
consoleLogger.info(`Skipping URL due to attachment header: ${url}`);
|
190
|
-
httpHeadCache.set(url, false);
|
191
|
-
return false;
|
192
|
-
}
|
193
|
-
|
194
|
-
// Check if the MIME type suggests it's a downloadable file
|
195
|
-
if (contentType.startsWith('application/') || contentType.includes('octet-stream')) {
|
196
|
-
consoleLogger.info(`Skipping potential downloadable file: ${contentType} at URL ${url}`);
|
197
|
-
httpHeadCache.set(url, false);
|
198
|
-
return false;
|
199
|
-
}
|
200
|
-
|
201
|
-
// Use the mime-types library to ensure it's processible content (e.g., HTML or plain text)
|
202
|
-
const mimeType = mime.lookup(contentType);
|
203
|
-
if (mimeType && !mimeType.startsWith('text/html') && !mimeType.startsWith('text/')) {
|
204
|
-
consoleLogger.info(`Detected non-processible MIME type: ${mimeType} at URL ${url}`);
|
205
|
-
httpHeadCache.set(url, false);
|
206
|
-
return false;
|
207
|
-
}
|
208
|
-
|
209
|
-
// Additional check for zip files by their magic number (PK\x03\x04)
|
210
|
-
if (url.endsWith('.zip')) {
|
211
|
-
consoleLogger.info(`Checking for zip file magic number at URL ${url}`);
|
212
|
-
|
213
|
-
// Download the first few bytes of the file to check for the magic number
|
214
|
-
const byteResponse = await axios.get(url, {
|
215
|
-
headers: { Range: 'bytes=0-3', Authorization: authHeader },
|
216
|
-
responseType: 'arraybuffer',
|
217
|
-
httpsAgent,
|
218
|
-
});
|
219
|
-
|
220
|
-
const magicNumber = byteResponse.data.toString('hex');
|
221
|
-
if (magicNumber === '504b0304') {
|
222
|
-
consoleLogger.info(`Skipping zip file at URL ${url}`);
|
223
|
-
httpHeadCache.set(url, false);
|
224
|
-
return false;
|
225
|
-
}
|
226
|
-
consoleLogger.info(
|
227
|
-
`Not skipping ${url}, magic number does not match ZIP file: ${magicNumber}`,
|
228
|
-
);
|
229
|
-
}
|
230
|
-
|
231
|
-
// If you want more robust checks, you can download a portion of the content and use the file-type package to detect file types by content
|
232
|
-
const response = await axios.get(url, {
|
233
|
-
headers: { Range: 'bytes=0-4100', Authorization: authHeader },
|
234
|
-
responseType: 'arraybuffer',
|
235
|
-
httpsAgent,
|
236
|
-
});
|
237
|
-
|
238
|
-
const fileType = await fileTypeFromBuffer(response.data);
|
239
|
-
if (
|
240
|
-
fileType &&
|
241
|
-
!fileType.mime.startsWith('text/html') &&
|
242
|
-
!fileType.mime.startsWith('text/')
|
243
|
-
) {
|
244
|
-
consoleLogger.info(`Detected downloadable file of type ${fileType.mime} at URL ${url}`);
|
245
|
-
httpHeadCache.set(url, false);
|
246
|
-
return false;
|
247
|
-
}
|
248
|
-
} catch (e) {
|
249
|
-
// consoleLogger.error(`Error checking the MIME type of ${url}: ${e.message}`);
|
250
|
-
// If an error occurs (e.g., a network issue), assume the URL is processible
|
251
|
-
httpHeadCache.set(url, true);
|
252
|
-
return true;
|
253
|
-
}
|
254
|
-
|
255
|
-
// If none of the conditions to skip are met, allow processing of the URL
|
256
|
-
httpHeadCache.set(url, true);
|
257
|
-
return true;
|
258
|
-
};
|
259
|
-
|
260
169
|
const enqueueProcess = async (
|
261
170
|
page: Page,
|
262
171
|
enqueueLinks: (options: EnqueueLinksOptions) => Promise<BatchAddRequestsResult>,
|
@@ -555,33 +464,18 @@ const crawlDomain = async ({
|
|
555
464
|
}
|
556
465
|
},
|
557
466
|
],
|
558
|
-
preNavigationHooks:
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
|
569
|
-
|
570
|
-
},
|
571
|
-
]
|
572
|
-
: [
|
573
|
-
async ({ page, request }) => {
|
574
|
-
await page.setExtraHTTPHeaders({
|
575
|
-
...extraHTTPHeaders,
|
576
|
-
});
|
577
|
-
|
578
|
-
const processible = await isProcessibleUrl(request.url);
|
579
|
-
if (!processible) {
|
580
|
-
request.skipNavigation = true;
|
581
|
-
return null;
|
582
|
-
}
|
583
|
-
},
|
584
|
-
],
|
467
|
+
preNavigationHooks: [ async({ page, request}) => {
|
468
|
+
if (isBasicAuth) {
|
469
|
+
await page.setExtraHTTPHeaders({
|
470
|
+
Authorization: authHeader,
|
471
|
+
...extraHTTPHeaders,
|
472
|
+
});
|
473
|
+
} else {
|
474
|
+
await page.setExtraHTTPHeaders({
|
475
|
+
...extraHTTPHeaders,
|
476
|
+
});
|
477
|
+
}
|
478
|
+
}],
|
585
479
|
requestHandlerTimeoutSecs: 90, // Allow each page to be processed by up from default 60 seconds
|
586
480
|
requestHandler: async ({ page, request, response, crawler, sendRequest, enqueueLinks }) => {
|
587
481
|
const browserContext: BrowserContext = page.context();
|
@@ -639,7 +533,7 @@ const crawlDomain = async ({
|
|
639
533
|
}
|
640
534
|
|
641
535
|
// handle pdfs
|
642
|
-
if (request.skipNavigation && actualUrl === 'about:blank') {
|
536
|
+
if (shouldSkipDueToUnsupportedContent(response, request.url) || (request.skipNavigation && actualUrl === 'about:blank')) {
|
643
537
|
if (!isScanPdfs) {
|
644
538
|
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
|
645
539
|
numScanned: urlsCrawled.scanned.length,
|
@@ -12,6 +12,7 @@ import { consoleLogger, guiInfoLog, silentLogger } from '../logs.js';
|
|
12
12
|
import constants, {
|
13
13
|
getExecutablePath,
|
14
14
|
guiInfoStatusTypes,
|
15
|
+
STATUS_CODE_METADATA,
|
15
16
|
UrlsCrawled,
|
16
17
|
} from '../constants/constants.js';
|
17
18
|
|
@@ -296,6 +297,7 @@ export const handlePdfDownload = (
|
|
296
297
|
url: request.url,
|
297
298
|
pageTitle: url,
|
298
299
|
actualUrl: url,
|
300
|
+
metadata: STATUS_CODE_METADATA[1],
|
299
301
|
});
|
300
302
|
}
|
301
303
|
|