npm - scraply - Versions diffs - 1.0.19 → 1.0.21 - Mend

scraply 1.0.19 → 1.0.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/package.json +5 -3
package/readme.md +9 -8
package/src/defaultConfig.js +17 -8
package/src/scraply.js +9 -0
package/src/utils/crawl/browser/helper.js +142 -0
package/src/utils/crawl/url/fetch.js +40 -16
package/src/utils/crawl/url/handlers.js +5 -11
package/src/utils/crawl/url/normalize.js +8 -2

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "scraply",
   "description": "A simple, configurable and functional content scraper",
-  "version": "1.0.19",
+  "version": "1.0.21",
   "main": "src/scraply.js",
   "type": "module",
   "scripts": {
@@ -13,9 +13,11 @@
   ],
   "author": "Pau Serrat Gutiérrez",
   "dependencies": {
-    "axios": "^1.7.7",
+    "axios": "^1.7.9",
     "cheerio": "^1.0.0",
-    "he": "^1.2.0"
+    "he": "^1.2.0",
+    "puppeteer": "^24.2.0",
+    "puppeteer-cluster": "^0.24.0"
   },
   "publishConfig": {
     "registry": "https://registry.npmjs.org/",

package/readme.md CHANGED Viewed

@@ -41,7 +41,7 @@ CRAWLER: {
     'https://crawler-test.com/'
   ],
   INCLUDE_URLS: [
-    'https://crawler-test.com/.*'
+    'https://crawler-test.com/'
   ],
   ALLOWED_CONTENT_TYPES: [
     'text/html'
@@ -70,15 +70,16 @@ CRAWLER: {
     'aside',
     'button'
   ],
-  RETRY_STATUS_CODES: [408, 429, 500, 502, 503, 504],
-  REQUEST_TIMEOUT: 4000,
-  MAX_REDIRECTS: 3,
-  MAX_RETRIES: 2,
+  RETRY_STATUS_CODES: [408, 500, 502, 503, 504],
+  REQUEST_TIMEOUT: 3000,
+  MAX_REDIRECTS: 2,
+  MAX_CONTENT_LENGTH: 20 * 1024 * 1024, // 20MB
+  MAX_RETRIES: 1,
   CRAWL_DELAY_MS: 200,
-  CRAWL_ERROR_RETRY_DELAY_MS: 800,
+  CRAWL_ERROR_RETRY_DELAY_MS: 1000,
   CRAWL_RATE_LIMIT_FALLBACK_DELAY_MS: 60000,
-  EXIT_CODE_RATE_LIMIT: 10,
-  EXIT_ON_RATE_LIMIT: true
+  EXIT_ON_RATE_LIMIT: true, // If true, forces exit instantly. If false, only exits after retries (if still 429)
+  EXIT_CODE_RATE_LIMIT: 10
 },
 DATA_FORMATTER: {

package/src/defaultConfig.js CHANGED Viewed

@@ -6,7 +6,7 @@ export const DEFAULT_CONFIG = {
       'https://crawler-test.com/'
     ],
     INCLUDE_URLS: [
-      'https://crawler-test.com/.*'
+      'https://crawler-test.com/'
     ],
     ALLOWED_CONTENT_TYPES: [
       'text/html'
@@ -33,16 +33,25 @@ export const DEFAULT_CONFIG = {
       'header',
       'footer',
       'aside',
-      'button'
+      'button',
+      '[aria-modal]',
+      '[role="dialog"]',
+      '[role="alert"]',
+      '[role="banner"]',
+      '[role="form"]',
+      '[role="navigation"]',
+      '[role="search"]'
     ],
-    RETRY_STATUS_CODES: [408, 429, 500, 502, 503, 504],
-    REQUEST_TIMEOUT: 4000,
-    MAX_REDIRECTS: 3,
-    MAX_RETRIES: 2,
+    DYNAMIC_CRAWLING: false,
+    RETRY_STATUS_CODES: [408, 500, 502, 503, 504],
+    REQUEST_TIMEOUT: 3000,
+    MAX_REDIRECTS: 2,
+    MAX_CONTENT_LENGTH: 20 * 1024 * 1024, // 20MB
+    MAX_RETRIES: 1,
     CRAWL_DELAY_MS: 200,
-    CRAWL_ERROR_RETRY_DELAY_MS: 800,
+    CRAWL_ERROR_RETRY_DELAY_MS: 1000,
     CRAWL_RATE_LIMIT_FALLBACK_DELAY_MS: 60000,
-    EXIT_ON_RATE_LIMIT: true,
+    EXIT_ON_RATE_LIMIT: true, // If true, forces exit instantly. If false, only exits after retries (if still 429)
     EXIT_CODE_RATE_LIMIT: 10
   },

package/src/scraply.js CHANGED Viewed

@@ -3,6 +3,7 @@ import { normalizeURL } from './utils/crawl/url/normalize.js';
 import { loadJSON, saveQueue, deleteDataFiles, deleteUntrackedFiles } from './utils/crawl/fileOperations.js';
 import { processURL } from './utils/crawl/url/processor.js';
 import { formatData, saveSortedFormattedJSON } from './utils/format/formatData.js';
+import { initializeCluster, closeCluster } from './utils/crawl/browser/helper.js';
 let urlData = [];
 let CONFIG = {};
@@ -52,6 +53,10 @@ const start = async () => {
   - Crawl Error Retry Delay: ${CONFIG.CRAWLER.CRAWL_ERROR_RETRY_DELAY_MS}ms
   `);
+  if (CONFIG.CRAWLER.DYNAMIC_CRAWLING) {
+    await initializeCluster();
+  }
   let fileNumber = urlData.filter(entry => entry.file).length + 1;
   for await (const entry of urlData) {
     if (!entry.file) {
@@ -104,6 +109,10 @@ const start = async () => {
   console.log(`\nCLEANING UP UNTRACKED FILES...`);
   deleteUntrackedFiles(CONFIG.DATA_FORMATTER.FORMATTED_PATH, generatedFiles); // Delete files not generated during this crawl
   generatedFiles.clear(); // Clear the set to prepare for future crawls
+  if (CONFIG.CRAWLER.DYNAMIC_CRAWLING) {
+    await closeCluster();
+  }
 };
 // Main function to be exported and used

package/src/utils/crawl/browser/helper.js ADDED Viewed

@@ -0,0 +1,142 @@
+import { Cluster } from 'puppeteer-cluster';
+import { delay } from '../delay.js';
+// Shared puppeteer-cluster instance; unset until initializeCluster() succeeds, reset to null by closeCluster().
+let cluster;
+// True while a Cluster.launch() call is in flight, so concurrent callers wait instead of launching a second cluster.
+let initializing = false;
+/**
+ * Lazily launches the shared Puppeteer cluster and registers its page task.
+ *
+ * - First caller: launches the cluster and registers the task that loads a URL
+ *   and returns `{ content, headers, statusCode }`.
+ * - Callers arriving while a launch is in flight: poll every 100ms until it finishes.
+ * - Callers arriving after a successful launch: return the existing cluster immediately.
+ *
+ * The `initializing` flag is always cleared in `finally`, so a failed launch no
+ * longer leaves waiting callers polling forever.
+ *
+ * @returns {Promise<object|undefined|null>} The shared cluster. Callers that were
+ *   waiting on a launch that failed receive whatever `cluster` currently holds
+ *   (unset), so they should be prepared for a falsy value.
+ * @throws Whatever `Cluster.launch` rejects with (propagated to the launching caller).
+ */
+export const initializeCluster = async () => {
+  if (!cluster && !initializing) {
+    initializing = true;
+    try {
+      console.log('Initializing Puppeteer cluster...');
+      cluster = await Cluster.launch({
+        concurrency: Cluster.CONCURRENCY_CONTEXT,
+        maxConcurrency: 1, // Lower concurrency for stability
+        puppeteerOptions: {
+          headless: true,
+          args: [
+            // '--no-sandbox',
+            '--disable-setuid-sandbox',
+            '--disable-dev-shm-usage',
+            '--disable-accelerated-2d-canvas',
+            '--disable-gpu',
+            // '--single-process',
+            '--disable-background-networking',
+            '--disable-background-timer-throttling',
+            '--disable-breakpad',
+            '--disable-client-side-phishing-detection',
+            '--disable-default-apps',
+            '--disable-extensions',
+            '--disable-hang-monitor',
+            '--disable-popup-blocking',
+            '--disable-prompt-on-repost',
+            '--disable-sync',
+            '--disable-translate',
+            '--metrics-recording-only',
+            '--no-first-run',
+            '--safebrowsing-disable-auto-update',
+            '--enable-automation',
+            '--password-store=basic',
+            '--use-mock-keychain',
+            '--disable-software-rasterizer',
+            '--no-zygote',
+            '--disable-infobars',
+            '--disable-blink-features=AutomationControlled',
+            '--disable-component-extensions-with-background-pages',
+            '--mute-audio',
+            '--window-size=1280,800', // Moderate window size
+            '--window-position=0,0',
+            // NOTE(review): the two flags below appear to disable TLS certificate validation for crawled pages — confirm this is intended.
+            '--ignore-certificate-errors',
+            '--ignore-certificate-errors-skip-list',
+            '--hide-scrollbars',
+            '--disable-notifications',
+            '--disable-backgrounding-occluded-windows',
+            '--disable-features=TranslateUI,BlinkGenPropertyTrees',
+            '--disable-ipc-flooding-protection',
+            '--disable-renderer-backgrounding',
+            '--enable-features=NetworkService,NetworkServiceInProcess',
+            '--force-color-profile=srgb'
+          ],
+          timeout: 15000
+        }
+      });
+      // Task executed for every job queued through cluster.execute({ url, ... }).
+      cluster.task(async ({ page, data: { url } }) => {
+        let response;
+        try {
+          // Remember the response whose URL exactly matches the requested one,
+          // so its headers and status can be reported alongside the HTML.
+          page.on('response', res => {
+            if (res.url() === url) response = res;
+          });
+          // Skip downloading resources
+          await page.setRequestInterception(true);
+          page.on('request', (req) => {
+            const resourceType = req.resourceType();
+            if (['image', 'stylesheet', 'font'].includes(resourceType)) {
+              req.abort();
+            } else {
+              req.continue();
+            }
+          });
+          await page.goto(url, { timeout: 15000, waitUntil: 'domcontentloaded' }); // Possible values: load, domcontentloaded, networkidle0, networkidle2
+          const content = await page.content();
+          // Fall back to empty headers / status 0 when no response matched `url`.
+          const headers = response ? response.headers() : {};
+          const statusCode = response ? response.status() : 0;
+          return { content, headers, statusCode };
+        } catch (error) {
+          console.error(`Error in cluster task for URL ${url}:`, error);
+          throw error;
+        } finally {
+          if (page) {
+            try {
+              await page.close();
+            } catch (closeError) {
+              console.error('Error closing page:', closeError);
+            }
+          }
+        }
+      });
+      console.log('Puppeteer cluster initialized!');
+    } finally {
+      // Always release the flag — on failure too — so waiters below stop polling.
+      initializing = false;
+    }
+  } else if (initializing) {
+    console.log('Cluster is already being initialized, waiting...');
+    while (initializing) {
+      await delay(100); // Wait for initialization to complete
+    }
+  }
+  return cluster;
+};
+/**
+ * Loads `url` through the shared Puppeteer cluster and returns an axios-like result.
+ *
+ * @param {string} url - Page to load.
+ * @param {string} selector - Forwarded in the job data; the task registered in
+ *   this file only reads `url`, so it currently has no effect.
+ * @returns {Promise<{data: string, headers: object, status: number}>} Page HTML,
+ *   response headers and status code as reported by the cluster task.
+ * @throws {Error} An Error carrying `response: { status, statusText }` (the same
+ *   shape the previous plain-object throw exposed) plus the original error as `cause`.
+ */
+export const fetchPageContent = async (url, selector) => {
+  if (!cluster) await initializeCluster();
+  try {
+    const result = await cluster.execute({ url, selector }); // Returns the page content
+    return { data: result.content, headers: result.headers, status: result.statusCode };
+  } catch (error) {
+    console.error(`Error fetching page content for URL ${url}:`, error);
+    // Throw a real Error (stack + message + cause) instead of a bare object,
+    // while keeping the `.response` shape that callers inspect.
+    const wrapped = new Error(`Failed to fetch page content for URL ${url}: ${error?.message}`, { cause: error });
+    wrapped.response = { status: error?.statusCode, statusText: error?.message };
+    throw wrapped;
+  }
+};
+/**
+ * Shuts down the shared Puppeteer cluster, if one exists.
+ *
+ * Waits for the cluster to go idle, closes it, and clears the module-level
+ * reference. Does nothing when no cluster is currently set.
+ *
+ * @returns {Promise<void>}
+ */
+export const closeCluster = async () => {
+  // Nothing to tear down when no cluster is set.
+  if (!cluster) return;
+  console.log('Closing Puppeteer cluster...');
+  await cluster.idle();
+  await cluster.close();
+  cluster = null;
+  console.log('Puppeteer cluster closed!');
+};
+// Handle app termination
+/**
+ * Signal handler: closes the Puppeteer cluster, then exits the process.
+ *
+ * Exits with code 0 after a clean shutdown. If closing the cluster fails, the
+ * error is logged and the process still exits (code 1) instead of being left
+ * running with an unhandled rejection.
+ */
+const handleAppTermination = async () => {
+  let exitCode = 0;
+  try {
+    await closeCluster();
+  } catch (error) {
+    console.error('Error closing Puppeteer cluster during shutdown:', error);
+    exitCode = 1;
+  }
+  process.exit(exitCode);
+};
+process.on('SIGINT', handleAppTermination);
+process.on('SIGTERM', handleAppTermination);

package/src/utils/crawl/url/fetch.js CHANGED Viewed

@@ -1,38 +1,62 @@
 import axios from 'axios';
 import { delay } from '../delay.js';
 import { shouldRetry } from './handlers.js';
+import { fetchPageContent } from '../browser/helper.js';
+// import { normalizeURL } from './normalize.js';
 export async function fetchURL(url, retries = 2) {
   try {
-    const response = await axios.get(url, {
-      timeout: CONFIG.CRAWLER.REQUEST_TIMEOUT,
-      maxRedirects: CONFIG.CRAWLER.MAX_REDIRECTS
-    });
+    let response;
+    if (CONFIG.CRAWLER.DYNAMIC_CRAWLING) { // JavaScript Dynamic Content
+      response = await fetchPageContent(url, 'body'); // Returns a custom object with content, headers and status, similar to axios
-    const { 'content-type': contentType } = response.headers;
+      // Manually handle redirects (Puppeteer doesn't follow them, axios does automatically)
+      if (response.status >= 300 && response.status < 400) { // 3xx Redirect
+        const redirectUrl = response.headers.location;
+        if (redirectUrl) {
+          if (CONFIG.CRAWLER.MAX_REDIRECTS <= 0) {
+            const error = new Error(`Max redirects reached`);
+            error.response = { status: response.status, headers: response.headers, data: response.data };
+            throw error;
+          }
+          // Normalize URL ?
+          const newUrl = new URL(redirectUrl, url).href;
+          return fetchURL(newUrl, retries - 1);
+        }
+      }
-    // Validate content type
-    if (!CONFIG.CRAWLER.ALLOWED_CONTENT_TYPES.some(type => contentType.includes(type))) {
-      return {
-        error: `Content-Type ${contentType} is not allowed.`,
-        status: response.status
-      };
-    };
+      // Validate status code (Puppeteer doesn't throw, axios does automatically)
+      if (response.status < 200 || response.status >= 300) {
+        const error = new Error(`Invalid status code: ${response.status}`);
+        error.response = { status: response.status, headers: response.headers, data: response.data };
+        throw error;
+      }
+    } else { // Static Content
+      response = await axios.get(url, {
+        timeout: CONFIG.CRAWLER.REQUEST_TIMEOUT,
+        maxRedirects: CONFIG.CRAWLER.MAX_REDIRECTS,
+        maxContentLength: CONFIG.CRAWLER.MAX_CONTENT_LENGTH
+      });
+    }
+    // Validate content type header
+    const { 'content-type': contentType } = response.headers;
+    if (!contentType) return { error: `Missing Content-Type header`, status: response.status };
+    if (!CONFIG.CRAWLER.ALLOWED_CONTENT_TYPES.some(type => contentType.includes(type))) return { error: `Content-Type ${contentType} is not allowed.`, status: response.status };
     return { data: response.data, status: response.status };
   } catch (error) {
     if (retries > 0 && (await shouldRetry(error))) {
       const retryCount = CONFIG.CRAWLER.MAX_RETRIES - retries + 1;
       console.log(`Retrying (${retryCount}/${CONFIG.CRAWLER.MAX_RETRIES}) -> ${url}`);
-      if (CONFIG.CRAWLER.CRAWL_ERROR_RETRY_DELAY_MS > 0) await delay(CONFIG.CRAWLER.CRAWL_ERROR_RETRY_DELAY_MS);
+      if (CONFIG.CRAWLER.CRAWL_ERROR_RETRY_DELAY_MS > 0) await delay(CONFIG.CRAWLER.CRAWL_ERROR_RETRY_DELAY_MS);
       return fetchURL(url, retries - 1);
     }
     // If still 429 after retries, exit with configured code
     if (error.response?.status === 429) {
-      console.log(`Force exiting with code ${CONFIG.CRAWLER.EXIT_CODE_RATE_LIMIT} after retries...`);
+      console.log(`Force exiting with code ${CONFIG.CRAWLER.EXIT_CODE_RATE_LIMIT} (after retries)...`);
       process.exit(CONFIG.CRAWLER.EXIT_CODE_RATE_LIMIT);
     }

package/src/utils/crawl/url/handlers.js CHANGED Viewed

@@ -2,7 +2,6 @@ import { URL } from 'node:url';
 import { normalizeURL } from './normalize.js';
 import { delay } from '../delay.js';
-// Handle HTML Status Codes HERE!
 export const shouldRetry = async (error) => {
   if (!error.response) return true;
@@ -42,21 +41,16 @@ export const shouldRetry = async (error) => {
 const shouldIncludeURL = (url) => {
   try {
     const { INITIAL_URLS, INCLUDE_URLS, EXCLUDE_PATTERNS } = CONFIG.CRAWLER;
     if (INITIAL_URLS.includes(url)) return true;
     // Pre-compile string patterns into regular expressions for both include and exclude patterns
-    const compiledExcludePatterns = EXCLUDE_PATTERNS.map(pattern =>
-      typeof pattern === 'string' ? new RegExp(pattern) : pattern
-    );
-    const compiledIncludePatterns = INCLUDE_URLS.map(pattern =>
-      typeof pattern === 'string' ? new RegExp(pattern) : pattern
-    );
+    const compiledExcludePatterns = EXCLUDE_PATTERNS.map(p => typeof p === 'string' ? new RegExp(p) : p);
+    if (compiledExcludePatterns.some(p => p.test(url))) return false;
-    if (compiledExcludePatterns.some(pattern => pattern.test(url))) return false;
-    if (compiledIncludePatterns.some(pattern => pattern.test(url))) return true;
+    const compiledIncludePatterns = INCLUDE_URLS.map(p => typeof p === 'string' ? new RegExp(p) : p);
+    if (compiledIncludePatterns.some(p => p.test(url))) return true;
-    return false; // If the URL doesn't match any include patterns, exclude it.
+    return false; // URL doesn't match any include patterns, it is excluded.
   } catch (error) {
     console.error(`Error processing URL: ${url}`, error);
     return false;

package/src/utils/crawl/url/normalize.js CHANGED Viewed

@@ -2,7 +2,13 @@ export const normalizeURL = (url) => {
   const urlObj = new URL(url);
   urlObj.hash = ''; // Remove the fragment part
   urlObj.search = ''; // Remove the query part
-  urlObj.pathname = urlObj.pathname.endsWith('/') ? urlObj.pathname.slice(0, -1) : urlObj.pathname; // Remove trailing slashes
-  urlObj.pathname = urlObj.pathname === '' ? '/' : urlObj.pathname; // Handle the root URL separately
+  urlObj.protocol = 'https:'; // Force HTTPS
+  urlObj.pathname = urlObj.pathname.endsWith('/')
+    ? urlObj.pathname.slice(0, -1)
+    : urlObj.pathname; // Remove trailing slashes
+  urlObj.pathname = urlObj.pathname === ''
+    ? '/'
+    : urlObj.pathname; // Handle the root URL separately
+  urlObj.hostname = urlObj.hostname.replace(/^www\./, ''); // Remove 'www.' prefix
   return urlObj.toString();
 };