clean-web-scraper 4.3.1 → 4.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -9
- package/example-usage.js +11 -1
- package/package.json +1 -1
package/README.md
CHANGED
@@ -17,7 +17,7 @@ A powerful Node.js web scraper that extracts clean, readable content from websit
|
|
17
17
|
|
18
18
|
## 🛠️ Prerequisites
|
19
19
|
|
20
|
-
- Node.js (
|
20
|
+
- Node.js (v20 or higher)
|
21
21
|
- npm
|
22
22
|
|
23
23
|
## 📦 Dependencies
|
@@ -39,7 +39,7 @@ sudo pacman -S extra/xorg-server-xvfb chromium
|
|
39
39
|
npm install
|
40
40
|
|
41
41
|
# Skip chromium download during npm installation
|
42
|
-
# npm
|
42
|
+
# npm install --ignore-scripts
|
43
43
|
```
|
44
44
|
|
45
45
|
## 💻 Usage
|
@@ -62,6 +62,7 @@ const scraper = new WebScraper({
|
|
62
62
|
maxArticles: Infinity, // Optional: Maximum articles to scrape
|
63
63
|
crawlingDelay: 1000, // Optional: Delay between requests (ms)
|
64
64
|
batchSize: 5, // Optional: Number of URLs to process concurrently
|
65
|
+
minContentLength: 400, // Optional: Minimum content length to consider valid
|
65
66
|
|
66
67
|
// Network options
|
67
68
|
axiosHeaders: {}, // Optional: Custom HTTP headers
|
@@ -91,7 +92,7 @@ const docsScraper = new WebScraper({
|
|
91
92
|
scrapResultPath: './datasets/docs',
|
92
93
|
maxDepth: 3, // Optional: Maximum depth for recursive crawling
|
93
94
|
includeMetadata: true, // Optional: Include metadata in output files
|
94
|
-
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate"],
|
95
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
|
95
96
|
// Optional: Specify metadata fields to include
|
96
97
|
});
|
97
98
|
|
@@ -114,8 +115,7 @@ await WebScraper.combineResults('./combined', [docsScraper, blogScraper]);
|
|
114
115
|
```
|
115
116
|
|
116
117
|
```bash
|
117
|
-
|
118
|
-
node --max-old-space-size=8192 example-usage.js
|
118
|
+
node example-usage.js
|
119
119
|
```
|
120
120
|
|
121
121
|
## 📤 Output
|
@@ -132,9 +132,11 @@ example.com/
|
|
132
132
|
├── website/
|
133
133
|
│ ├── page1.txt # Clean text content
|
134
134
|
│ ├── page1.json # Full metadata
|
135
|
+
│ ├── page1.html # Original HTML content
|
135
136
|
│ └── blog/
|
136
137
|
│ ├── post1.txt
|
137
138
|
│ └── post1.json
|
139
|
+
│ └── post1.html
|
138
140
|
├── texts/ # Numbered text files
|
139
141
|
│ ├── 1.txt
|
140
142
|
│ └── 2.txt
|
@@ -174,9 +176,10 @@ The actual article content starts here. This is the clean, processed text of the
|
|
174
176
|
```text
|
175
177
|
articleTitle: Palestine history
|
176
178
|
description: This is a great article about Palestine history
|
177
|
-
author:
|
179
|
+
author: Rawan
|
178
180
|
language: en
|
179
181
|
dataScrapedDate: 2024-01-20T10:30:00Z
|
182
|
+
url: https://palianswers.com
|
180
183
|
|
181
184
|
---
|
182
185
|
|
@@ -201,10 +204,18 @@ The actual article content starts here. This is the clean, processed text of the
|
|
201
204
|
|
202
205
|
```json
|
203
206
|
{
|
204
|
-
"url": "
|
205
|
-
"
|
207
|
+
"url": "https://example.com/page",
|
208
|
+
"pageTitle": "Page Title",
|
206
209
|
"description": "Page description",
|
207
|
-
"
|
210
|
+
"language": "en",
|
211
|
+
"canonicalUrl": "https://example.com/canonical",
|
212
|
+
"ogTitle": "Open Graph Title",
|
213
|
+
"ogDescription": "Open Graph Description",
|
214
|
+
"ogImage": "https://example.com/image.jpg",
|
215
|
+
"ogType": "article",
|
216
|
+
"dataScrapedDate": "2024-01-20T10:30:00Z",
|
217
|
+
"originalHtml": "<html>...</html>",
|
218
|
+
"articleTitle": "Article Title",
|
208
219
|
}
|
209
220
|
```
|
210
221
|
|
package/example-usage.js
CHANGED
@@ -273,6 +273,7 @@ async function bdsmovement ( enable )
|
|
273
273
|
/^https:\/\/bdsmovement\.net\/resources\?type=\d+$/,
|
274
274
|
/^https:\/\/bdsmovement\.net\/news\?type=\d+$/,
|
275
275
|
/^https:\/\/bdsmovement\.net\/news\?campaign=\d+$/,
|
276
|
+
/^https:\/\/bdsmovement\.net\/news\?location=\d+$/,
|
276
277
|
],
|
277
278
|
scrapResultPath: "./dataset/bdsmovement/website",
|
278
279
|
jsonlOutputPath: "./dataset/bdsmovement/train.jsonl",
|
@@ -282,7 +283,16 @@ async function bdsmovement ( enable )
|
|
282
283
|
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
|
283
284
|
maxArticles: 2000,
|
284
285
|
maxDepth: 16,
|
285
|
-
batchSize:
|
286
|
+
batchSize: 40,
|
287
|
+
axiosHeaders: headers,
|
288
|
+
axiosMaxRetries: 2,
|
289
|
+
axiosRetryDelay: 8000,
|
290
|
+
axiosProxy: {
|
291
|
+
host: "localhost",
|
292
|
+
port: 2080,
|
293
|
+
protocol: "http"
|
294
|
+
},
|
295
|
+
useProxyAsFallback: true
|
286
296
|
};
|
287
297
|
return await runScraper( config, enable );
|
288
298
|
}
|