npm - @govtechsg/oobee - Versions diffs - 0.10.57 → 0.10.58 - Mend

@govtechsg/oobee 0.10.57 → 0.10.58

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (5)

package/README.md +1 -1
package/package.json +1 -1
package/src/crawlers/commonCrawlerFunc.ts +35 -1
package/src/crawlers/crawlDomain.ts +14 -120
package/src/crawlers/pdfScanFunc.ts +2 -0

package/README.md CHANGED Viewed

@@ -84,7 +84,7 @@ verapdf --version
 | Variable Name | Description | Default |
 | ------------- | ----------- | ------- |
 | OOBEE_VERBOSE | When set to `true`, log output goes to console | `false` |
-| OOBEE_FAST_CRAWLER| When set to `true`, increases scan concurrency at a rapid rate.  Experimental, may cause system stability issues. | `false`|
+| OOBEE_FAST_CRAWLER| When set to `true`, increases scan concurrency at a rapid rate.  Experimental, may cause system stability issues on low-powered devices. | `false`|
 | OOBEE_VALIDATE_URL| When set to `true`, validates if URLs are valid and exits. | `false` |
 | WARN_LEVEL | Only used in tests. |  |

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "@govtechsg/oobee",
   "main": "dist/npmIndex.js",
-  "version": "0.10.57",
+  "version": "0.10.58",
   "type": "module",
   "author": "Government Technology Agency <info@tech.gov.sg>",
   "dependencies": {

package/src/crawlers/commonCrawlerFunc.ts CHANGED Viewed

@@ -20,6 +20,7 @@ import { findElementByCssSelector } from './custom/findElementByCssSelector.js';
 import { getAxeConfiguration } from './custom/getAxeConfiguration.js';
 import { flagUnlabelledClickableElements } from './custom/flagUnlabelledClickableElements.js';
 import xPathToCss from './custom/xPathToCss.js';
+import type { Response as PlaywrightResponse } from 'playwright';
 // types
 interface AxeResultsWithScreenshot extends AxeResults {
@@ -552,4 +553,37 @@ export async function shouldSkipClickDueToDisallowedHref(
       disallowedPrefixes: disallowedListOfPatterns,
     }
   );
-}
+}
+/**
+ * Check if response should be skipped based on content headers.
+ * @param response - Playwright Response object
+ * @param requestUrl - Optional: request URL for logging
+ * @returns true if the content should be skipped
+ */
+export const shouldSkipDueToUnsupportedContent = (
+  response: PlaywrightResponse,
+  requestUrl: string = ''
+): boolean => {
+  if (!response) return false;
+  const headers = response.headers();
+  const contentDisposition = headers['content-disposition'] || '';
+  const contentType = headers['content-type'] || '';
+  if (contentDisposition.includes('attachment')) {
+    // consoleLogger.info(`Skipping attachment (content-disposition) at ${requestUrl}`);
+    return true;
+  }
+  if (
+    contentType.startsWith('application/') ||
+    contentType.includes('octet-stream') ||
+    (!contentType.startsWith('text/') && !contentType.includes('html'))
+  ) {
+    // consoleLogger.info(`Skipping non-processible content-type "${contentType}" at ${requestUrl}`);
+    return true;
+  }
+  return false;
+};

package/src/crawlers/crawlDomain.ts CHANGED Viewed

@@ -2,9 +2,6 @@ import crawlee, { EnqueueStrategy } from 'crawlee';
 import fs from 'fs';
 import type { BrowserContext, ElementHandle, Frame, Page } from 'playwright';
 import type { EnqueueLinksOptions, RequestOptions } from 'crawlee';
-import axios from 'axios';
-import { fileTypeFromBuffer } from 'file-type';
-import mime from 'mime-types';
 import https from 'https';
 import type { BatchAddRequestsResult } from '@crawlee/types';
 import {
@@ -12,6 +9,7 @@ import {
   runAxeScript,
   isUrlPdf,
   shouldSkipClickDueToDisallowedHref,
+  shouldSkipDueToUnsupportedContent,
 } from './commonCrawlerFunc.js';
 import constants, {
   UrlsCrawled,
@@ -168,95 +166,6 @@ const crawlDomain = async ({
     });
   }
-  const httpHeadCache = new Map<string, boolean>();
-  const isProcessibleUrl = async (url: string): Promise<boolean> => {
-    if (httpHeadCache.has(url)) {
-      consoleLogger.info(`Skipping request as URL has been processed before: ${url}}`);
-      return false; // return false to avoid processing the same url again
-    }
-    try {
-      // Send a HEAD request to check headers without downloading the file
-      const headResponse = await axios.head(url, {
-        headers: { Authorization: authHeader },
-        httpsAgent,
-      });
-      const contentType = headResponse.headers['content-type'] || '';
-      const contentDisposition = headResponse.headers['content-disposition'] || '';
-      // Check if the response suggests it's a downloadable file based on Content-Disposition header
-      if (contentDisposition.includes('attachment')) {
-        consoleLogger.info(`Skipping URL due to attachment header: ${url}`);
-        httpHeadCache.set(url, false);
-        return false;
-      }
-      // Check if the MIME type suggests it's a downloadable file
-      if (contentType.startsWith('application/') || contentType.includes('octet-stream')) {
-        consoleLogger.info(`Skipping potential downloadable file: ${contentType} at URL ${url}`);
-        httpHeadCache.set(url, false);
-        return false;
-      }
-      // Use the mime-types library to ensure it's processible content (e.g., HTML or plain text)
-      const mimeType = mime.lookup(contentType);
-      if (mimeType && !mimeType.startsWith('text/html') && !mimeType.startsWith('text/')) {
-        consoleLogger.info(`Detected non-processible MIME type: ${mimeType} at URL ${url}`);
-        httpHeadCache.set(url, false);
-        return false;
-      }
-      // Additional check for zip files by their magic number (PK\x03\x04)
-      if (url.endsWith('.zip')) {
-        consoleLogger.info(`Checking for zip file magic number at URL ${url}`);
-        // Download the first few bytes of the file to check for the magic number
-        const byteResponse = await axios.get(url, {
-          headers: { Range: 'bytes=0-3', Authorization: authHeader },
-          responseType: 'arraybuffer',
-          httpsAgent,
-        });
-        const magicNumber = byteResponse.data.toString('hex');
-        if (magicNumber === '504b0304') {
-          consoleLogger.info(`Skipping zip file at URL ${url}`);
-          httpHeadCache.set(url, false);
-          return false;
-        }
-        consoleLogger.info(
-          `Not skipping ${url}, magic number does not match ZIP file: ${magicNumber}`,
-        );
-      }
-      // If you want more robust checks, you can download a portion of the content and use the file-type package to detect file types by content
-      const response = await axios.get(url, {
-        headers: { Range: 'bytes=0-4100', Authorization: authHeader },
-        responseType: 'arraybuffer',
-        httpsAgent,
-      });
-      const fileType = await fileTypeFromBuffer(response.data);
-      if (
-        fileType &&
-        !fileType.mime.startsWith('text/html') &&
-        !fileType.mime.startsWith('text/')
-      ) {
-        consoleLogger.info(`Detected downloadable file of type ${fileType.mime} at URL ${url}`);
-        httpHeadCache.set(url, false);
-        return false;
-      }
-    } catch (e) {
-      // consoleLogger.error(`Error checking the MIME type of ${url}: ${e.message}`);
-      // If an error occurs (e.g., a network issue), assume the URL is processible
-      httpHeadCache.set(url, true);
-      return true;
-    }
-    // If none of the conditions to skip are met, allow processing of the URL
-    httpHeadCache.set(url, true);
-    return true;
-  };
   const enqueueProcess = async (
     page: Page,
     enqueueLinks: (options: EnqueueLinksOptions) => Promise<BatchAddRequestsResult>,
@@ -555,33 +464,18 @@ const crawlDomain = async ({
         }
       },
     ],
-    preNavigationHooks: isBasicAuth
-      ? [
-          async ({ page, request }) => {
-            await page.setExtraHTTPHeaders({
-              Authorization: authHeader,
-              ...extraHTTPHeaders,
-            });
-            const processible = await isProcessibleUrl(request.url);
-            if (!processible) {
-              request.skipNavigation = true;
-              return null;
-            }
-          },
-        ]
-      : [
-          async ({ page, request }) => {
-            await page.setExtraHTTPHeaders({
-              ...extraHTTPHeaders,
-            });
-            const processible = await isProcessibleUrl(request.url);
-            if (!processible) {
-              request.skipNavigation = true;
-              return null;
-            }
-          },
-        ],
+    preNavigationHooks: [ async({ page, request}) => {
+      if (isBasicAuth) {
+        await page.setExtraHTTPHeaders({
+          Authorization: authHeader,
+          ...extraHTTPHeaders,
+        });
+      } else {
+        await page.setExtraHTTPHeaders({
+          ...extraHTTPHeaders,
+        });
+      }
+    }],
     requestHandlerTimeoutSecs: 90, // Allow each page to be processed by up from default 60 seconds
     requestHandler: async ({ page, request, response, crawler, sendRequest, enqueueLinks }) => {
       const browserContext: BrowserContext = page.context();
@@ -639,7 +533,7 @@ const crawlDomain = async ({
         }
         // handle pdfs
-        if (request.skipNavigation && actualUrl === 'about:blank') {
+        if (shouldSkipDueToUnsupportedContent(response, request.url) || (request.skipNavigation && actualUrl === 'about:blank')) {
           if (!isScanPdfs) {
             guiInfoLog(guiInfoStatusTypes.SKIPPED, {
               numScanned: urlsCrawled.scanned.length,

package/src/crawlers/pdfScanFunc.ts CHANGED Viewed

@@ -12,6 +12,7 @@ import { consoleLogger, guiInfoLog, silentLogger } from '../logs.js';
 import constants, {
   getExecutablePath,
   guiInfoStatusTypes,
+  STATUS_CODE_METADATA,
   UrlsCrawled,
 } from '../constants/constants.js';
@@ -296,6 +297,7 @@ export const handlePdfDownload = (
               url: request.url,
               pageTitle: url,
               actualUrl: url,
+              metadata: STATUS_CODE_METADATA[1],
             });
           }