clean-web-scraper 4.3.1 → 4.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -17,7 +17,7 @@ A powerful Node.js web scraper that extracts clean, readable content from websites
 
 ## 🛠️ Prerequisites
 
-- Node.js (v18 or higher)
+- Node.js (v20 or higher)
 - npm
 
 ## 📦 Dependencies
@@ -39,7 +39,7 @@ sudo pacman -S extra/xorg-server-xvfb chromium
 npm install
 
 # Skip chromium download during npm installation
-# npm i --ignore-scripts
+# npm install --ignore-scripts
 ```
 
 ## 💻 Usage
@@ -62,6 +62,7 @@ const scraper = new WebScraper({
   maxArticles: Infinity, // Optional: Maximum articles to scrape
   crawlingDelay: 1000, // Optional: Delay between requests (ms)
   batchSize: 5, // Optional: Number of URLs to process concurrently
+  minContentLength: 400, // Optional: Minimum content length to consider valid
 
   // Network options
   axiosHeaders: {}, // Optional: Custom HTTP headers
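
The new `minContentLength` option sits alongside the existing crawl controls. A minimal sketch of a configuration using it, based on the option names shown in this hunk (the require path, the `baseURL` name, and the target site are assumptions, not confirmed by this diff):

```javascript
const WebScraper = require("clean-web-scraper"); // assumed require path

const scraper = new WebScraper({
  baseURL: "https://example.com",   // hypothetical target site (assumed option name)
  scrapResultPath: "./datasets/example",
  batchSize: 5,                     // process 5 URLs concurrently
  crawlingDelay: 1000,              // wait 1 s between requests
  minContentLength: 400             // skip pages with under 400 chars of clean text
});
// The run method is not shown in this diff; see the package README for the entry point.
```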
@@ -91,7 +92,7 @@ const docsScraper = new WebScraper({
   scrapResultPath: './datasets/docs',
   maxDepth: 3, // Optional: Maximum depth for recursive crawling
   includeMetadata: true, // Optional: Include metadata in output files
-  metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate"],
+  metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
   // Optional: Specify metadata fields to include
 });
 
@@ -114,8 +115,7 @@ await WebScraper.combineResults('./combined', [docsScraper, blogScraper]);
 ```
 
 ```bash
-# 8 GB RAM
-node --max-old-space-size=8192 example-usage.js
+node example-usage.js
 ```
 
 ## 📤 Output
@@ -132,9 +132,11 @@ example.com/
 ├── website/
 │   ├── page1.txt    # Clean text content
 │   ├── page1.json   # Full metadata
+│   ├── page1.html   # Original HTML content
 │   └── blog/
 │       ├── post1.txt
 │       └── post1.json
+│       └── post1.html
 ├── texts/           # Numbered text files
 │   ├── 1.txt
 │   └── 2.txt
@@ -174,9 +176,10 @@ The actual article content starts here. This is the clean, processed text of the
 ```text
 articleTitle: Palestine history
 description: This is a great article about Palestine history
-author: John Doe
+author: Rawan
 language: en
 dateScraped: 2024-01-20T10:30:00Z
+url: https://palianswers.com
 
 ---
 
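Since the header block now carries a `url` line, here is a hedged sketch of splitting a saved `.txt` file back into metadata and body. The `key: value` lines and `---` separator follow the sample above; the file path and the helper itself are hypothetical, not part of the package:

```javascript
const fs = require("fs");

// Split a scraped .txt file into its "key: value" header and the article body,
// using the "---" separator shown in the sample above.
function parseScrapedFile(path) {
  const [header, ...rest] = fs.readFileSync(path, "utf8").split("\n---\n");
  const metadata = {};
  for (const line of header.trim().split("\n")) {
    const sep = line.indexOf(": ");
    if (sep !== -1) metadata[line.slice(0, sep)] = line.slice(sep + 2);
  }
  return { metadata, content: rest.join("\n---\n").trim() };
}

const { metadata } = parseScrapedFile("./example.com/website/page1.txt"); // hypothetical path
console.log(metadata.url, metadata.author);
```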
@@ -201,10 +204,18 @@ The actual article content starts here. This is the clean, processed text of the
 
 ```json
 {
-  "url": "<https://example.com/page>",
-  "title": "Page Title",
+  "url": "https://example.com/page",
+  "pageTitle": "Page Title",
   "description": "Page description",
-  "dateScraped": "2024-01-20T10:30:00Z"
+  "language": "en",
+  "canonicalUrl": "https://example.com/canonical",
+  "ogTitle": "Open Graph Title",
+  "ogDescription": "Open Graph Description",
+  "ogImage": "https://example.com/image.jpg",
+  "ogType": "article",
+  "dataScrapedDate": "2024-01-20T10:30:00Z",
+  "originalHtml": "<html>...</html>",
+  "articleTitle": "Article Title"
 }
 ```
 
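The JSON sample now documents several renamed and added fields (`pageTitle`, `dataScrapedDate`, the `og*` group, `canonicalUrl`, `originalHtml`). A quick sketch of consuming them; field names come from the sample above, while the file path is hypothetical:

```javascript
const fs = require("fs");

// Read a page's full-metadata JSON and pick out fields documented in 4.3.3.
const meta = JSON.parse(fs.readFileSync("./example.com/website/page1.json", "utf8"));

console.log(meta.pageTitle);             // renamed from "title"
console.log(meta.dataScrapedDate);       // renamed from "dateScraped"
console.log(meta.canonicalUrl);          // newly documented
console.log(meta.ogTitle, meta.ogImage); // Open Graph fields
```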
package/example-usage.js CHANGED
@@ -273,6 +273,7 @@ async function bdsmovement ( enable )
 		/^https:\/\/bdsmovement\.net\/resources\?type=\d+$/,
 		/^https:\/\/bdsmovement\.net\/news\?type=\d+$/,
 		/^https:\/\/bdsmovement\.net\/news\?campaign=\d+$/,
+		/^https:\/\/bdsmovement\.net\/news\?location=\d+$/,
 	],
 	scrapResultPath: "./dataset/bdsmovement/website",
 	jsonlOutputPath: "./dataset/bdsmovement/train.jsonl",
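
The added pattern whitelists news pages filtered by a numeric location id, and the `$` anchor keeps URLs with extra query parameters out. A quick check (the sample URLs are hypothetical):

```javascript
const pattern = /^https:\/\/bdsmovement\.net\/news\?location=\d+$/;

console.log(pattern.test("https://bdsmovement.net/news?location=12"));        // true
console.log(pattern.test("https://bdsmovement.net/news?location=12&page=2")); // false: extra params rejected
```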
@@ -282,7 +283,16 @@ async function bdsmovement ( enable )
 	metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
 	maxArticles: 2000,
 	maxDepth: 16,
-	batchSize: 20
+	batchSize: 40,
+	axiosHeaders: headers,
+	axiosMaxRetries: 2,
+	axiosRetryDelay: 8000,
+	axiosProxy: {
+		host: "localhost",
+		port: 2080,
+		protocol: "http"
+	},
+	useProxyAsFallback: true
 };
 return await runScraper( config, enable );
 }
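
`useProxyAsFallback: true` pairs the direct connection with the configured `axiosProxy`. Below is a sketch of the likely semantics in plain axios, assuming "fallback" means retrying through the proxy only after a direct request fails; this is an assumption about behavior, not the package's actual implementation:

```javascript
const axios = require("axios");

const headers = {}; // stand-in for the headers object defined earlier in example-usage.js

// Assumed fallback behavior: try the request directly first, and only
// route through the local proxy if the direct attempt throws.
async function fetchWithFallback(url) {
  try {
    return await axios.get(url, { headers });
  } catch {
    return await axios.get(url, {
      headers,
      proxy: { host: "localhost", port: 2080, protocol: "http" }
    });
  }
}
```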
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "clean-web-scraper",
-  "version": "4.3.1",
+  "version": "4.3.3",
   "main": "main.js",
   "scripts": {
     "start": "node main.js",