clean-web-scraper 3.9.1 → 3.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -3
- package/main.js +27 -27
- package/package.json +1 -1
package/README.md
CHANGED
@@ -4,13 +4,13 @@ A powerful Node.js web scraper that extracts clean, readable content from websit
 
 ## ✨ Features
 
-- 🌐 Smart
+- 🌐 Smart web crawling of internal links
+- 🔄 Smart retry mechanism with proxy fallback
 - 📖 Clean content extraction using Mozilla's Readability
 - 🧹 Smart content processing and cleaning
 - 🗂️ Maintains original URL structure in saved files
 - 🚫 Excludes unwanted paths from scraping
--
-- 🎯 No duplicate page visits
+- 🚦 Configurable rate limiting and delays
 - 🤖 AI-friendly output formats (JSONL, CSV, clean text)
 - 📊 Rich metadata extraction
 - 🔗 Combine results from multiple scrapers into a unified dataset
@@ -53,6 +53,23 @@ const scraper = new WebScraper({
   jsonlOutputPath: './example.com/train.jsonl', // Optional: Custom JSONL output path
   textOutputPath: "./example.com/texts", // Optional: Custom text output path
   csvOutputPath: "./example.com/train.csv", // Optional: Custom CSV output path
+  strictBaseURL: true, // Optional: Only scrape URLs from same domain
+  maxDepth: Infinity, // Optional: Maximum crawling depth
+  maxArticles: Infinity, // Optional: Maximum articles to scrape
+  concurrencyLimit: 2, // Optional: Limit concurrent requests
+  crawlingDelay: 1000, // Optional: Delay between requests (ms)
+
+  // Network options
+  axiosHeaders: {}, // Optional: Custom HTTP headers
+  axiosProxy: "http://proxy:port", // Optional: HTTP/HTTPS proxy
+  axiosMaxRetries: 5, // Optional: Max retry attempts
+  axiosRetryDelay: 40000, // Optional: Delay between retries (ms)
+  useProxyAsFallback: false, // Optional: Fallback to proxy on failure
+
+  // Puppeteer options for handling dynamic content
+  usePuppeteer: false, // Optional: Enable Puppeteer browser
+  puppeteerProxy: "http://127.0.0.1:2080", // Optional: Puppeteer proxy
+  puppeteerExecutablePath: "/path/to/chrome", // Optional: Custom browser path
 });
 await scraper.start();
 ```
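The new network options in the README describe a retry loop with an optional proxy fallback. A rough sketch of how `axiosMaxRetries`, `axiosRetryDelay`, `axiosProxy`, and `useProxyAsFallback` could fit together (illustrative only, not the package's actual implementation; note that axios's `proxy` option takes an object such as `{ host, port }`, so the README's string form would need parsing first):

```js
const axios = require( "axios" );

// Hypothetical helper; names mirror the documented options.
async function fetchWithRetry ( url, config )
{
	const { axiosMaxRetries = 5, axiosRetryDelay = 40000, axiosProxy, useProxyAsFallback = false } = config;
	let options = {};
	for ( let attempt = 1; attempt <= axiosMaxRetries; attempt++ )
	{
		try
		{
			return await axios.get( url, options );
		}
		catch ( error )
		{
			if ( attempt === axiosMaxRetries ) throw error; // retries exhausted
			if ( useProxyAsFallback && axiosProxy )
			{
				// After a failure, route subsequent attempts through the proxy,
				// e.g. axiosProxy = { host: "127.0.0.1", port: 2080 }
				options = { proxy: axiosProxy };
			}
			await new Promise( resolve => setTimeout( resolve, axiosRetryDelay ) );
		}
	}
}
```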
package/main.js
CHANGED
@@ -10,41 +10,41 @@ class WebScraper
 	constructor ( config = {})
 	{
 		// Base configuration
-		this.baseURL = baseURL;
-		this.startURL = startURL || baseURL;
-		this.strictBaseURL = strictBaseURL || true;
-		this.maxDepth = maxDepth || Infinity;
-		this.maxArticles = maxArticles || Infinity;
-		this.concurrencyLimit = concurrencyLimit || 2;
-		this.crawlingDelay = crawlingDelay || 1000;
+		this.baseURL = config.baseURL;
+		this.startURL = config.startURL || config.baseURL;
+		this.strictBaseURL = config.strictBaseURL || true;
+		this.maxDepth = config.maxDepth || Infinity;
+		this.maxArticles = config.maxArticles || Infinity;
+		this.concurrencyLimit = config.concurrencyLimit || 2;
+		this.crawlingDelay = config.crawlingDelay || 1000;
 
 		// Output paths setup
-		this.scrapResultPath = scrapResultPath || "./dataset";
-		this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
+		this.scrapResultPath = config.scrapResultPath || "./dataset";
+		this.textOutputPath = config.textOutputPath || path.join( this.scrapResultPath, "texts" );
 		this.textOutputPathWithMeta = `${this.textOutputPath }_with_metadata`;
-		this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
+		this.jsonlOutputPath = config.jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
 		this.jsonlOutputPathWithMeta = this.jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
-		this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
+		this.csvOutputPath = config.csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
 		this.csvOutputPathWithMeta = this.csvOutputPath.replace( ".csv", "_with_metadata.csv" );
 
 		// Metadata configuration
-		this.includeMetadata = includeMetadata || false;
-		this.metadataFields = new Set( metadataFields || [] );
+		this.includeMetadata = config.includeMetadata || false;
+		this.metadataFields = new Set( config.metadataFields || [] );
 
 		// URL filtering setup
 		this.visited = new Set();
-		this.excludeList = this.normalizeExcludeList( excludeList );
-		this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
-		this.filterFileTypes = filterFileTypes
-		this.excludedFileTypes = excludedFileTypes || [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"];
-		this.removeURLFragment = removeURLFragment
+		this.excludeList = this.normalizeExcludeList( config.excludeList );
+		this.exactExcludeList = this.normalizeExcludeList( config.exactExcludeList );
+		this.filterFileTypes = config.filterFileTypes ?? true;
+		this.excludedFileTypes = config.excludedFileTypes || [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"];
+		this.removeURLFragment = config.removeURLFragment ?? true;
 
 		// Network configuration
-		this.axiosHeaders = axiosHeaders;
-		this.axiosProxy = axiosProxy;
-		this.axiosMaxRetries = axiosMaxRetries || 5;
-		this.axiosRetryDelay = axiosRetryDelay || 40000;
-		this.useProxyAsFallback = useProxyAsFallback || false;
+		this.axiosHeaders = config.axiosHeaders;
+		this.axiosProxy = config.axiosProxy;
+		this.axiosMaxRetries = config.axiosMaxRetries || 5;
+		this.axiosRetryDelay = config.axiosRetryDelay || 40000;
+		this.useProxyAsFallback = config.useProxyAsFallback || false;
 		this.axiosOptions = {};
 		if ( this.axiosHeaders )
 		{
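A detail worth flagging in this hunk: `filterFileTypes` and `removeURLFragment` now default via nullish coalescing (`?? true`), which honors an explicit `false`, while `strictBaseURL` still uses `config.strictBaseURL || true` and therefore evaluates to `true` even when the caller passes `false`. A standalone illustration of the difference:

```js
// `||` falls back on any falsy value, so an explicit `false` is discarded:
console.log( false || true ); // true  (the caller's `false` is lost)
// `??` falls back only on null/undefined, so `false` survives:
console.log( false ?? true ); // false (the caller's choice is kept)
console.log( undefined ?? true ); // true (default applies only when unset)
```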
@@ -59,10 +59,10 @@ class WebScraper
 		this.allProcessedContent = [];
 
 		// Puppeteer configuration
-		this.usePuppeteer = usePuppeteer || false;
-		this.puppeteerProxy = puppeteerProxy; // http://127.0.0.1:2080
-		this.puppeteerExecutablePath = puppeteerExecutablePath;
-		this.puppeteerRealProxy = puppeteerRealProxy;
+		this.usePuppeteer = config.usePuppeteer || false;
+		this.puppeteerProxy = config.puppeteerProxy; // http://127.0.0.1:2080
+		this.puppeteerExecutablePath = config.puppeteerExecutablePath;
+		this.puppeteerRealProxy = config.puppeteerRealProxy;
 		this.configurePuppeteer();
 	}
 
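Taken together, these constructor changes fix a crash rather than a style issue: in 3.9.1 the body read bare identifiers (`baseURL`, `startURL`, and so on) that were never destructured from `config`, and reading an undeclared identifier throws a `ReferenceError` the moment the constructor runs. A minimal reproduction of the failure mode (sketch, not the package's code):

```js
class Broken
{
	constructor ( config = {})
	{
		this.baseURL = baseURL; // ReferenceError: baseURL is not defined
	}
}

class Fixed
{
	constructor ( config = {})
	{
		this.baseURL = config.baseURL; // reads the option off the config object
	}
}

try { new Broken({ baseURL: "https://example.com" }); }
catch ( error ) { console.error( error.message ); } // "baseURL is not defined"

console.log( new Fixed({ baseURL: "https://example.com" }).baseURL ); // https://example.com
```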