clean-web-scraper 4.3.0 → 4.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -9
- package/example-usage.js +17 -6
- package/package.json +2 -2
package/README.md
CHANGED
@@ -17,7 +17,7 @@ A powerful Node.js web scraper that extracts clean, readable content from websit
|
|
17
17
|
|
18
18
|
## 🛠️ Prerequisites
|
19
19
|
|
20
|
-
- Node.js (
|
20
|
+
- Node.js (v20 or higher)
|
21
21
|
- npm
|
22
22
|
|
23
23
|
## 📦 Dependencies
|
@@ -39,7 +39,7 @@ sudo pacman -S extra/xorg-server-xvfb chromium
|
|
39
39
|
npm install
|
40
40
|
|
41
41
|
# Skip chromium download during npm installation
|
42
|
-
# npm
|
42
|
+
# npm install --ignore-scripts
|
43
43
|
```
|
44
44
|
|
45
45
|
## 💻 Usage
|
@@ -62,6 +62,7 @@ const scraper = new WebScraper({
|
|
62
62
|
maxArticles: Infinity, // Optional: Maximum articles to scrape
|
63
63
|
crawlingDelay: 1000, // Optional: Delay between requests (ms)
|
64
64
|
batchSize: 5, // Optional: Number of URLs to process concurrently
|
65
|
+
minContentLength: 400, // Optional: Minimum content length to consider valid
|
65
66
|
|
66
67
|
// Network options
|
67
68
|
axiosHeaders: {}, // Optional: Custom HTTP headers
|
@@ -91,7 +92,7 @@ const docsScraper = new WebScraper({
|
|
91
92
|
scrapResultPath: './datasets/docs',
|
92
93
|
maxDepth: 3, // Optional: Maximum depth for recursive crawling
|
93
94
|
includeMetadata: true, // Optional: Include metadata in output files
|
94
|
-
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate"],
|
95
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
|
95
96
|
// Optional: Specify metadata fields to include
|
96
97
|
});
|
97
98
|
|
@@ -114,8 +115,7 @@ await WebScraper.combineResults('./combined', [docsScraper, blogScraper]);
|
|
114
115
|
```
|
115
116
|
|
116
117
|
```bash
|
117
|
-
|
118
|
-
node --max-old-space-size=8192 example-usage.js
|
118
|
+
node example-usage.js
|
119
119
|
```
|
120
120
|
|
121
121
|
## 📤 Output
|
@@ -132,9 +132,11 @@ example.com/
|
|
132
132
|
├── website/
|
133
133
|
│ ├── page1.txt # Clean text content
|
134
134
|
│ ├── page1.json # Full metadata
|
135
|
+
│ ├── page1.html # Original HTML content
|
135
136
|
│ └── blog/
|
136
137
|
│ ├── post1.txt
|
137
138
|
│ └── post1.json
|
139
|
+
│ └── post1.html
|
138
140
|
├── texts/ # Numbered text files
|
139
141
|
│ ├── 1.txt
|
140
142
|
│ └── 2.txt
|
@@ -174,9 +176,10 @@ The actual article content starts here. This is the clean, processed text of the
|
|
174
176
|
```text
|
175
177
|
articleTitle: Palestine history
|
176
178
|
description: This is a great article about Palestine history
|
177
|
-
author:
|
179
|
+
author: Rawan
|
178
180
|
language: en
|
179
181
|
dataScrapedDate: 2024-01-20T10:30:00Z
|
182
|
+
url: https://palianswers.com
|
180
183
|
|
181
184
|
---
|
182
185
|
|
@@ -201,10 +204,18 @@ The actual article content starts here. This is the clean, processed text of the
|
|
201
204
|
|
202
205
|
```json
|
203
206
|
{
|
204
|
-
"url": "
|
205
|
-
"
|
207
|
+
"url": "https://example.com/page",
|
208
|
+
"pageTitle": "Page Title",
|
206
209
|
"description": "Page description",
|
207
|
-
"
|
210
|
+
"language": "en",
|
211
|
+
"canonicalUrl": "https://example.com/canonical",
|
212
|
+
"ogTitle": "Open Graph Title",
|
213
|
+
"ogDescription": "Open Graph Description",
|
214
|
+
"ogImage": "https://example.com/image.jpg",
|
215
|
+
"ogType": "article",
|
216
|
+
"dataScrapedDate": "2024-01-20T10:30:00Z",
|
217
|
+
"originalHtml": "<html>...</html>",
|
218
|
+
"articleTitle": "Article Title",
|
208
219
|
}
|
209
220
|
```
|
210
221
|
|
package/example-usage.js
CHANGED
@@ -134,9 +134,11 @@ async function electronicintifada ( enable )
|
|
134
134
|
"https://electronicintifada.net/review",
|
135
135
|
"https://electronicintifada.net/artmusicculture",
|
136
136
|
"https://electronicintifada.net/blog/editors",
|
137
|
-
"https://electronicintifada.net/blog"
|
138
137
|
],
|
139
138
|
exactExcludeList: [
|
139
|
+
"https://electronicintifada.net/blog",
|
140
|
+
/^https:\/\/electronicintifada\.net\/blog\/.*/,
|
141
|
+
/^https:\/\/electronicintifada\.net\/blog\?page=\d+$/,
|
140
142
|
"https://electronicintifada.net",
|
141
143
|
"https://electronicintifada.net/blogs",
|
142
144
|
"https://electronicintifada.net/review",
|
@@ -149,10 +151,10 @@ async function electronicintifada ( enable )
|
|
149
151
|
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
|
150
152
|
maxArticles: 2000,
|
151
153
|
maxDepth: 16,
|
152
|
-
batchSize:
|
154
|
+
batchSize: 40,
|
153
155
|
axiosHeaders: headers,
|
154
|
-
axiosMaxRetries:
|
155
|
-
axiosRetryDelay:
|
156
|
+
axiosMaxRetries: 2,
|
157
|
+
axiosRetryDelay: 8000,
|
156
158
|
axiosProxy: {
|
157
159
|
host: "localhost",
|
158
160
|
port: 2080,
|
@@ -230,7 +232,7 @@ async function mondoweiss ( enable )
|
|
230
232
|
maxDepth: 15,
|
231
233
|
batchSize: 20,
|
232
234
|
axiosHeaders: headers,
|
233
|
-
axiosMaxRetries:
|
235
|
+
axiosMaxRetries: 2,
|
234
236
|
axiosRetryDelay: 10000,
|
235
237
|
axiosProxy: {
|
236
238
|
host: "localhost",
|
@@ -280,7 +282,16 @@ async function bdsmovement ( enable )
|
|
280
282
|
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
|
281
283
|
maxArticles: 2000,
|
282
284
|
maxDepth: 16,
|
283
|
-
batchSize:
|
285
|
+
batchSize: 40,
|
286
|
+
axiosHeaders: headers,
|
287
|
+
axiosMaxRetries: 2,
|
288
|
+
axiosRetryDelay: 8000,
|
289
|
+
axiosProxy: {
|
290
|
+
host: "localhost",
|
291
|
+
port: 2080,
|
292
|
+
protocol: "http"
|
293
|
+
},
|
294
|
+
useProxyAsFallback: true
|
284
295
|
};
|
285
296
|
return await runScraper( config, enable );
|
286
297
|
}
|
package/package.json
CHANGED