npm - clean-web-scraper - Versions diffs - 4.3.0 → 4.3.2 - Mend

clean-web-scraper 4.3.0 → 4.3.2

This diff compares the contents of two publicly available versions of this package as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (3) hide show

package/README.md CHANGED Viewed

@@ -17,7 +17,7 @@ A powerful Node.js web scraper that extracts clean, readable content from websit
 ## 🛠️ Prerequisites
-- Node.js (v18 or higher)
+- Node.js (v20 or higher)
 - npm
 ## 📦 Dependencies
@@ -39,7 +39,7 @@ sudo pacman -S extra/xorg-server-xvfb chromium
 npm install
 # Skip chromium download during npm installation
-# npm i --ignore-scripts
+# npm install --ignore-scripts
 ```
 ## 💻 Usage
@@ -62,6 +62,7 @@ const scraper = new WebScraper({
   maxArticles: Infinity,                        // Optional: Maximum articles to scrape
   crawlingDelay: 1000,                          // Optional: Delay between requests (ms)
   batchSize: 5,                                 // Optional: Number of URLs to process concurrently
+  minContentLength: 400,                        // Optional: Minimum content length to consider valid
   // Network options
   axiosHeaders: {},                             // Optional: Custom HTTP headers
@@ -91,7 +92,7 @@ const docsScraper = new WebScraper({
   scrapResultPath: './datasets/docs',
   maxDepth: 3,                               // Optional: Maximum depth for recursive crawling
   includeMetadata: true,                     // Optional: Include metadata in output files
-  metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate"],
+  metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
    // Optional: Specify metadata fields to include
 });
@@ -114,8 +115,7 @@ await WebScraper.combineResults('./combined', [docsScraper, blogScraper]);
 ```
 ```bash
-# 8 GB RAM
-node --max-old-space-size=8192 example-usage.js
+node example-usage.js
 ```
 ## 📤 Output
@@ -132,9 +132,11 @@ example.com/
 ├── website/
 │   ├── page1.txt         # Clean text content
 │   ├── page1.json        # Full metadata
+│   ├── page1.html                # Original HTML content
 │   └── blog/
 │       ├── post1.txt
 │       └── post1.json
+│       └── post1.html
 ├── texts/                # Numbered text files
 │   ├── 1.txt
 │   └── 2.txt
@@ -174,9 +176,10 @@ The actual article content starts here. This is the clean, processed text of the
 ```text
 articleTitle: Palestine history
 description: This is a great article about Palestine history
-author: John Doe
+author: Rawan
 language: en
 dateScraped: 2024-01-20T10:30:00Z
+url: https://palianswers.com
 ---
@@ -201,10 +204,18 @@ The actual article content starts here. This is the clean, processed text of the
 ```json
 {
-  "url": "<https://example.com/page>",
-  "title": "Page Title",
+  "url": "https://example.com/page",
+  "pageTitle": "Page Title",
   "description": "Page description",
-  "dateScraped": "2024-01-20T10:30:00Z"
+  "language": "en",
+  "canonicalUrl": "https://example.com/canonical",
+  "ogTitle": "Open Graph Title",
+  "ogDescription": "Open Graph Description",
+  "ogImage": "https://example.com/image.jpg",
+  "ogType": "article",
+  "dataScrapedDate": "2024-01-20T10:30:00Z",
+  "originalHtml": "<html>...</html>",
+  "articleTitle": "Article Title",
 }
 ```

package/example-usage.js CHANGED Viewed

@@ -134,9 +134,11 @@ async function electronicintifada ( enable )
 			"https://electronicintifada.net/review",
 			"https://electronicintifada.net/artmusicculture",
 			"https://electronicintifada.net/blog/editors",
-			"https://electronicintifada.net/blog"
 		],
 		exactExcludeList: [
+			"https://electronicintifada.net/blog",
+			/^https:\/\/electronicintifada\.net\/blog\/.*/,
+			/^https:\/\/electronicintifada\.net\/blog\?page=\d+$/,
 			"https://electronicintifada.net",
 			"https://electronicintifada.net/blogs",
 			"https://electronicintifada.net/review",
@@ -149,10 +151,10 @@ async function electronicintifada ( enable )
 		metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
 		maxArticles: 2000,
 		maxDepth: 16,
-		batchSize: 30,
+		batchSize: 40,
 		axiosHeaders: headers,
-		axiosMaxRetries: 3,
-		axiosRetryDelay: 10000,
+		axiosMaxRetries: 2,
+		axiosRetryDelay: 8000,
 		axiosProxy: {
 			host: "localhost",
 			port: 2080,
@@ -230,7 +232,7 @@ async function mondoweiss ( enable )
 		maxDepth: 15,
 		batchSize: 20,
 		axiosHeaders: headers,
-		axiosMaxRetries: 3,
+		axiosMaxRetries: 2,
 		axiosRetryDelay: 10000,
 		axiosProxy: {
 			host: "localhost",
@@ -280,7 +282,16 @@ async function bdsmovement ( enable )
 		metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
 		maxArticles: 2000,
 		maxDepth: 16,
-		batchSize: 20
+		batchSize: 40,
+		axiosHeaders: headers,
+		axiosMaxRetries: 2,
+		axiosRetryDelay: 8000,
+		axiosProxy: {
+			host: "localhost",
+			port: 2080,
+			protocol: "http"
+		},
+		useProxyAsFallback: true
 	};
 	return await runScraper( config, enable );
 }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "clean-web-scraper",
-  "version": "4.3.0",
+  "version": "4.3.2",
   "main": "main.js",
   "scripts": {
     "start": "node main.js",
@@ -30,4 +30,4 @@
     "puppeteer": "^24.1.1",
     "puppeteer-real-browser": "^1.3.22"
   }
-}
+}