@govtechsg/oobee 0.10.57 → 0.10.58

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -84,7 +84,7 @@ verapdf --version
84
84
  | Variable Name | Description | Default |
85
85
  | ------------- | ----------- | ------- |
86
86
  | OOBEE_VERBOSE | When set to `true`, log output goes to console | `false` |
87
- | OOBEE_FAST_CRAWLER| When set to `true`, increases scan concurrency at a rapid rate. Experimental, may cause system stability issues. | `false`|
87
+ | OOBEE_FAST_CRAWLER | When set to `true`, rapidly increases scan concurrency. Experimental; may cause system stability issues on low-powered devices. | `false` |
88
88
  | OOBEE_VALIDATE_URL | When set to `true`, checks whether the URLs are valid and then exits. | `false` |
89
89
  | WARN_LEVEL | Only used in tests. | |
90
90
 
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@govtechsg/oobee",
3
3
  "main": "dist/npmIndex.js",
4
- "version": "0.10.57",
4
+ "version": "0.10.58",
5
5
  "type": "module",
6
6
  "author": "Government Technology Agency <info@tech.gov.sg>",
7
7
  "dependencies": {
@@ -20,6 +20,7 @@ import { findElementByCssSelector } from './custom/findElementByCssSelector.js';
20
20
  import { getAxeConfiguration } from './custom/getAxeConfiguration.js';
21
21
  import { flagUnlabelledClickableElements } from './custom/flagUnlabelledClickableElements.js';
22
22
  import xPathToCss from './custom/xPathToCss.js';
23
+ import type { Response as PlaywrightResponse } from 'playwright';
23
24
 
24
25
  // types
25
26
  interface AxeResultsWithScreenshot extends AxeResults {
@@ -552,4 +553,37 @@ export async function shouldSkipClickDueToDisallowedHref(
552
553
  disallowedPrefixes: disallowedListOfPatterns,
553
554
  }
554
555
  );
555
- }
556
+ }
557
+
558
/**
 * Check, from the response headers alone, whether a response is something other
 * than a renderable HTML/text page and should therefore be skipped.
 *
 * Rules, applied in order:
 *  1. No response object: do not skip (there is nothing to inspect).
 *  2. `Content-Disposition` mentions `attachment`: skip.
 *  3. No `Content-Type` header: do not skip, because the headers give no
 *     evidence that the content is unsupported.
 *  4. `application/xhtml+xml`: do not skip, it is an HTML document type.
 *  5. Any other `application/*` type, anything mentioning `octet-stream`, or
 *     anything that is neither `text/*` nor mentions `html`: skip.
 *
 * Header values are compared case-insensitively.
 *
 * @param response - Playwright Response object (may be null/undefined at runtime)
 * @param requestUrl - Optional: request URL for logging (logging is currently disabled)
 * @returns true if the content should be skipped
 */
export const shouldSkipDueToUnsupportedContent = (
  response: PlaywrightResponse,
  requestUrl: string = ''
): boolean => {
  if (!response) return false;

  const headers = response.headers();
  // Only header *names* are normalised by the caller's API; media types and
  // disposition types are case-insensitive, so lower-case the values here.
  const contentDisposition = (headers['content-disposition'] ?? '').toLowerCase();
  const contentType = (headers['content-type'] ?? '').trim().toLowerCase();

  if (contentDisposition.includes('attachment')) {
    // consoleLogger.info(`Skipping attachment (content-disposition) at ${requestUrl}`);
    return true;
  }

  // A missing Content-Type says nothing about the payload; do not treat the
  // absence of a header as proof of unsupported content.
  if (contentType === '') {
    return false;
  }

  // XHTML documents are pages even though their type lives under application/*.
  if (contentType.startsWith('application/xhtml+xml')) {
    return false;
  }

  if (
    contentType.startsWith('application/') ||
    contentType.includes('octet-stream') ||
    (!contentType.startsWith('text/') && !contentType.includes('html'))
  ) {
    // consoleLogger.info(`Skipping non-processible content-type "${contentType}" at ${requestUrl}`);
    return true;
  }

  return false;
};
@@ -2,9 +2,6 @@ import crawlee, { EnqueueStrategy } from 'crawlee';
2
2
  import fs from 'fs';
3
3
  import type { BrowserContext, ElementHandle, Frame, Page } from 'playwright';
4
4
  import type { EnqueueLinksOptions, RequestOptions } from 'crawlee';
5
- import axios from 'axios';
6
- import { fileTypeFromBuffer } from 'file-type';
7
- import mime from 'mime-types';
8
5
  import https from 'https';
9
6
  import type { BatchAddRequestsResult } from '@crawlee/types';
10
7
  import {
@@ -12,6 +9,7 @@ import {
12
9
  runAxeScript,
13
10
  isUrlPdf,
14
11
  shouldSkipClickDueToDisallowedHref,
12
+ shouldSkipDueToUnsupportedContent,
15
13
  } from './commonCrawlerFunc.js';
16
14
  import constants, {
17
15
  UrlsCrawled,
@@ -168,95 +166,6 @@ const crawlDomain = async ({
168
166
  });
169
167
  }
170
168
 
171
- const httpHeadCache = new Map<string, boolean>();
172
- const isProcessibleUrl = async (url: string): Promise<boolean> => {
173
- if (httpHeadCache.has(url)) {
174
- consoleLogger.info(`Skipping request as URL has been processed before: ${url}}`);
175
- return false; // return false to avoid processing the same url again
176
- }
177
-
178
- try {
179
- // Send a HEAD request to check headers without downloading the file
180
- const headResponse = await axios.head(url, {
181
- headers: { Authorization: authHeader },
182
- httpsAgent,
183
- });
184
- const contentType = headResponse.headers['content-type'] || '';
185
- const contentDisposition = headResponse.headers['content-disposition'] || '';
186
-
187
- // Check if the response suggests it's a downloadable file based on Content-Disposition header
188
- if (contentDisposition.includes('attachment')) {
189
- consoleLogger.info(`Skipping URL due to attachment header: ${url}`);
190
- httpHeadCache.set(url, false);
191
- return false;
192
- }
193
-
194
- // Check if the MIME type suggests it's a downloadable file
195
- if (contentType.startsWith('application/') || contentType.includes('octet-stream')) {
196
- consoleLogger.info(`Skipping potential downloadable file: ${contentType} at URL ${url}`);
197
- httpHeadCache.set(url, false);
198
- return false;
199
- }
200
-
201
- // Use the mime-types library to ensure it's processible content (e.g., HTML or plain text)
202
- const mimeType = mime.lookup(contentType);
203
- if (mimeType && !mimeType.startsWith('text/html') && !mimeType.startsWith('text/')) {
204
- consoleLogger.info(`Detected non-processible MIME type: ${mimeType} at URL ${url}`);
205
- httpHeadCache.set(url, false);
206
- return false;
207
- }
208
-
209
- // Additional check for zip files by their magic number (PK\x03\x04)
210
- if (url.endsWith('.zip')) {
211
- consoleLogger.info(`Checking for zip file magic number at URL ${url}`);
212
-
213
- // Download the first few bytes of the file to check for the magic number
214
- const byteResponse = await axios.get(url, {
215
- headers: { Range: 'bytes=0-3', Authorization: authHeader },
216
- responseType: 'arraybuffer',
217
- httpsAgent,
218
- });
219
-
220
- const magicNumber = byteResponse.data.toString('hex');
221
- if (magicNumber === '504b0304') {
222
- consoleLogger.info(`Skipping zip file at URL ${url}`);
223
- httpHeadCache.set(url, false);
224
- return false;
225
- }
226
- consoleLogger.info(
227
- `Not skipping ${url}, magic number does not match ZIP file: ${magicNumber}`,
228
- );
229
- }
230
-
231
- // If you want more robust checks, you can download a portion of the content and use the file-type package to detect file types by content
232
- const response = await axios.get(url, {
233
- headers: { Range: 'bytes=0-4100', Authorization: authHeader },
234
- responseType: 'arraybuffer',
235
- httpsAgent,
236
- });
237
-
238
- const fileType = await fileTypeFromBuffer(response.data);
239
- if (
240
- fileType &&
241
- !fileType.mime.startsWith('text/html') &&
242
- !fileType.mime.startsWith('text/')
243
- ) {
244
- consoleLogger.info(`Detected downloadable file of type ${fileType.mime} at URL ${url}`);
245
- httpHeadCache.set(url, false);
246
- return false;
247
- }
248
- } catch (e) {
249
- // consoleLogger.error(`Error checking the MIME type of ${url}: ${e.message}`);
250
- // If an error occurs (e.g., a network issue), assume the URL is processible
251
- httpHeadCache.set(url, true);
252
- return true;
253
- }
254
-
255
- // If none of the conditions to skip are met, allow processing of the URL
256
- httpHeadCache.set(url, true);
257
- return true;
258
- };
259
-
260
169
  const enqueueProcess = async (
261
170
  page: Page,
262
171
  enqueueLinks: (options: EnqueueLinksOptions) => Promise<BatchAddRequestsResult>,
@@ -555,33 +464,18 @@ const crawlDomain = async ({
555
464
  }
556
465
  },
557
466
  ],
558
- preNavigationHooks: isBasicAuth
559
- ? [
560
- async ({ page, request }) => {
561
- await page.setExtraHTTPHeaders({
562
- Authorization: authHeader,
563
- ...extraHTTPHeaders,
564
- });
565
- const processible = await isProcessibleUrl(request.url);
566
- if (!processible) {
567
- request.skipNavigation = true;
568
- return null;
569
- }
570
- },
571
- ]
572
- : [
573
- async ({ page, request }) => {
574
- await page.setExtraHTTPHeaders({
575
- ...extraHTTPHeaders,
576
- });
577
-
578
- const processible = await isProcessibleUrl(request.url);
579
- if (!processible) {
580
- request.skipNavigation = true;
581
- return null;
582
- }
583
- },
584
- ],
467
+ preNavigationHooks: [ async({ page, request}) => {
468
+ if (isBasicAuth) {
469
+ await page.setExtraHTTPHeaders({
470
+ Authorization: authHeader,
471
+ ...extraHTTPHeaders,
472
+ });
473
+ } else {
474
+ await page.setExtraHTTPHeaders({
475
+ ...extraHTTPHeaders,
476
+ });
477
+ }
478
+ }],
585
479
  requestHandlerTimeoutSecs: 90, // Allow each page up to 90 seconds of processing, raised from the default of 60 seconds
586
480
  requestHandler: async ({ page, request, response, crawler, sendRequest, enqueueLinks }) => {
587
481
  const browserContext: BrowserContext = page.context();
@@ -639,7 +533,7 @@ const crawlDomain = async ({
639
533
  }
640
534
 
641
535
  // handle pdfs and other responses whose headers mark them as unsupported (non-HTML) content
642
- if (request.skipNavigation && actualUrl === 'about:blank') {
536
+ if (shouldSkipDueToUnsupportedContent(response, request.url) || (request.skipNavigation && actualUrl === 'about:blank')) {
643
537
  if (!isScanPdfs) {
644
538
  guiInfoLog(guiInfoStatusTypes.SKIPPED, {
645
539
  numScanned: urlsCrawled.scanned.length,
@@ -12,6 +12,7 @@ import { consoleLogger, guiInfoLog, silentLogger } from '../logs.js';
12
12
  import constants, {
13
13
  getExecutablePath,
14
14
  guiInfoStatusTypes,
15
+ STATUS_CODE_METADATA,
15
16
  UrlsCrawled,
16
17
  } from '../constants/constants.js';
17
18
 
@@ -296,6 +297,7 @@ export const handlePdfDownload = (
296
297
  url: request.url,
297
298
  pageTitle: url,
298
299
  actualUrl: url,
300
+ metadata: STATUS_CODE_METADATA[1],
299
301
  });
300
302
  }
301
303