clean-web-scraper 3.2.3 → 3.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +19 -15
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -14,7 +14,6 @@ A powerful Node.js web scraper that extracts clean, readable content from websit
|
|
|
14
14
|
- 🤖 AI-friendly output formats (JSONL, CSV, clean text)
|
|
15
15
|
- 📊 Rich metadata extraction
|
|
16
16
|
- 📁 Combine results from multiple scrapers into a unified dataset
|
|
17
|
-
- 🎯 Turn any website into an AI training dataset
|
|
18
17
|
|
|
19
18
|
## 🛠️ Prerequisites
|
|
20
19
|
|
|
@@ -53,17 +52,10 @@ const scraper = new WebScraper({
|
|
|
53
52
|
jsonlOutputPath: './example.com/train.jsonl', // Optional: Custom JSONL output path
|
|
54
53
|
textOutputPath: "./example.com/texts", // Optional: Custom text output path
|
|
55
54
|
csvOutputPath: "./example.com/train.csv", // Optional: Custom CSV output path
|
|
56
|
-
maxDepth: 3, // Optional: Maximum depth for recursive crawling
|
|
57
|
-
includeMetadata: false, // Optional: Include metadata in output files
|
|
58
|
-
metadataFields: ['title', 'description'] // Optional: Specify metadata fields to include
|
|
59
55
|
});
|
|
60
56
|
await scraper.start();
|
|
61
57
|
```
|
|
62
58
|
|
|
63
|
-
```bash
|
|
64
|
-
node example-usage.js
|
|
65
|
-
```
|
|
66
|
-
|
|
67
59
|
## 💻 Advanced Usage: Multi-Site Scraping
|
|
68
60
|
|
|
69
61
|
```js
|
|
@@ -72,13 +64,19 @@ const WebScraper = require('clean-web-scraper');
|
|
|
72
64
|
// Scrape documentation website
|
|
73
65
|
const docsScraper = new WebScraper({
|
|
74
66
|
baseURL: 'https://docs.example.com',
|
|
75
|
-
scrapResultPath: './datasets/docs'
|
|
67
|
+
scrapResultPath: './datasets/docs',
|
|
68
|
+
maxDepth: 3, // Optional: Maximum depth for recursive crawling
|
|
69
|
+
includeMetadata: true, // Optional: Include metadata in output files
|
|
70
|
+
metadataFields: ['title', 'description'] // Optional: Specify metadata fields to include
|
|
76
71
|
});
|
|
77
72
|
|
|
78
73
|
// Scrape blog website
|
|
79
74
|
const blogScraper = new WebScraper({
|
|
80
75
|
baseURL: 'https://blog.example.com',
|
|
81
|
-
scrapResultPath: './datasets/blog'
|
|
76
|
+
scrapResultPath: './datasets/blog',
|
|
77
|
+
maxDepth: 3, // Optional: Maximum depth for recursive crawling
|
|
78
|
+
includeMetadata: true, // Optional: Include metadata in output files
|
|
79
|
+
metadataFields: ['title', 'description'] // Optional: Specify metadata fields to include
|
|
82
80
|
});
|
|
83
81
|
|
|
84
82
|
// Start scraping both sites
|
|
@@ -89,16 +87,18 @@ await blogScraper.start();
|
|
|
89
87
|
await WebScraper.combineResults('./combined', [docsScraper, blogScraper]);
|
|
90
88
|
```
|
|
91
89
|
|
|
90
|
+
```bash
|
|
91
|
+
node example-usage.js
|
|
92
|
+
```
|
|
93
|
+
|
|
92
94
|
## 📤 Output
|
|
93
95
|
|
|
94
96
|
Your AI-ready content is saved in a clean, structured format:
|
|
95
97
|
|
|
96
|
-
- 📁 Base folder:
|
|
98
|
+
- 📁 Base folder: `./folderPath/example.com/`
|
|
97
99
|
- 📑 Files preserve original URL paths
|
|
98
|
-
-
|
|
99
|
-
-
|
|
100
|
-
- 📊 JSONL output for ML training
|
|
101
|
-
- 📈 CSV output with clean text content
|
|
100
|
+
- 🤖 No HTML, no noise - just clean, structured text (`.txt` files)
|
|
101
|
+
- 📊 `JSONL` and `CSV` outputs, ready for AI consumption and model training
|
|
102
102
|
|
|
103
103
|
```bash
|
|
104
104
|
example.com/
|
|
@@ -138,10 +138,13 @@ combined/
|
|
|
138
138
|
|
|
139
139
|
### 📝 Text Files (*.txt)
|
|
140
140
|
|
|
141
|
+
```text
|
|
141
142
|
The actual article content starts here. This is the clean, processed text of the article that was extracted from the webpage.
|
|
143
|
+
```
|
|
142
144
|
|
|
143
145
|
### 📑 Text Files with Metadata (texts_with_metadata/*.txt)
|
|
144
146
|
|
|
147
|
+
```text
|
|
145
148
|
title: My Awesome Page
|
|
146
149
|
description: This is a great article about coding
|
|
147
150
|
author: John Doe
|
|
@@ -151,6 +154,7 @@ dateScraped: 2024-01-20T10:30:00Z
|
|
|
151
154
|
\-\-\-
|
|
152
155
|
|
|
153
156
|
The actual article content starts here. This is the clean, processed text of the article that was extracted from the webpage.
|
|
157
|
+
```
|
|
154
158
|
|
|
155
159
|
### 📊 JSONL Files (train.jsonl)
|
|
156
160
|
|