clean-web-scraper 2.1.0 → 2.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +31 -5
- package/example-usage.js +2 -1
- package/package.json +1 -1
- package/src/WebScraper.js +21 -1
package/README.md
CHANGED
|
@@ -11,8 +11,8 @@ A powerful Node.js web scraper that extracts clean, readable content from websit
|
|
|
11
11
|
- 🚫 Excludes unwanted paths from scraping
|
|
12
12
|
- 🔄 Handles relative and absolute URLs like a pro
|
|
13
13
|
- 🎯 No duplicate page visits
|
|
14
|
-
- 📊 Generates JSONL
|
|
15
|
-
- 📊 AI-friendly clean text output (perfect for LLM fine-tuning!)
|
|
14
|
+
- 📊 Generates JSONL output file for ML training
|
|
15
|
+
- 📊 AI-friendly clean text and CSV output (perfect for LLM fine-tuning!)
|
|
16
16
|
|
|
17
17
|
## 🛠️ Prerequisites
|
|
18
18
|
|
|
@@ -44,11 +44,12 @@ const WebScraper = require('clean-web-scraper');
|
|
|
44
44
|
|
|
45
45
|
const scraper = new WebScraper({
|
|
46
46
|
baseURL: 'https://example.com', // Required: The website to scrape
|
|
47
|
-
scrapResultPath: './output', // Required: Where to save the content
|
|
48
47
|
excludeList: ['/admin', '/private'], // Optional: Paths to exclude
|
|
49
48
|
exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
|
|
50
|
-
|
|
51
|
-
|
|
49
|
+
scrapResultPath: './dataset', // Required: Where to save the content
|
|
50
|
+
jsonlPath: './dataset/train.jsonl', // Optional: Custom JSONL output path
|
|
51
|
+
textOutputPath: "./dataset/texts", // Optional: Custom text output path
|
|
52
|
+
csvPath: "./dataset/train.csv" // Optional: Custom CSV output path
|
|
52
53
|
});
|
|
53
54
|
|
|
54
55
|
scraper.start();
|
|
@@ -67,6 +68,31 @@ Your AI-ready content is saved in a clean, structured format:
|
|
|
67
68
|
- 📝 Pure text format, perfect for LLM training and fine-tuning
|
|
68
69
|
- 🤖 No HTML, no mess - just clean, structured text ready for AI consumption
|
|
69
70
|
- 📊 JSONL output for ML training
|
|
71
|
+
- 📈 CSV output with clean text content
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
dataset/
|
|
75
|
+
├── decolonizepalestine.com
|
|
76
|
+
│ ├── faq.json
|
|
77
|
+
│ ├── faq.txt
|
|
78
|
+
│ ├── intro
|
|
79
|
+
│ │ ├── bds-101.json
|
|
80
|
+
│ │ ├── bds-101.txt
|
|
81
|
+
│ ├── myth
|
|
82
|
+
│ │ ├── a-land-without-a-people-for-a-people-without-a-land.json
|
|
83
|
+
│ │ ├── a-land-without-a-people-for-a-people-without-a-land.txt
|
|
84
|
+
│ └── rainbow-washing
|
|
85
|
+
│ ├── bluewashing.json
|
|
86
|
+
│ ├── bluewashing.txt
|
|
87
|
+
├── texts
|
|
88
|
+
│ ├── 1.txt
|
|
89
|
+
│ ├── 2.txt
|
|
90
|
+
│ ├── 3.txt
|
|
91
|
+
│ ├── 4.txt
|
|
92
|
+
│ └── 5.txt
|
|
93
|
+
├── train.csv
|
|
94
|
+
└── train.jsonl
|
|
95
|
+
```
|
|
70
96
|
|
|
71
97
|
## 🤖 AI/LLM Training Ready
|
|
72
98
|
|
package/example-usage.js
CHANGED
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
|
@@ -13,13 +13,15 @@ class WebScraper
|
|
|
13
13
|
exactExcludeList,
|
|
14
14
|
scrapResultPath = "./dataset",
|
|
15
15
|
jsonlPath,
|
|
16
|
-
textOutputPath
|
|
16
|
+
textOutputPath,
|
|
17
|
+
csvPath
|
|
17
18
|
})
|
|
18
19
|
{
|
|
19
20
|
this.baseURL = baseURL;
|
|
20
21
|
this.scrapResultPath = path.join( scrapResultPath, baseURL.replace( /^(https?:\/\/)?(www\.)?/, "" ).replace( /\/$/, "" ) );
|
|
21
22
|
this.jsonlPath = jsonlPath || path.join( this.scrapResultPath, "train.jsonl" );
|
|
22
23
|
this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
|
|
24
|
+
this.csvPath = csvPath || path.join( this.scrapResultPath, "train.csv" );
|
|
23
25
|
this.visited = new Set();
|
|
24
26
|
this.excludeList = new Set( excludeList );
|
|
25
27
|
this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
|
|
@@ -33,6 +35,7 @@ class WebScraper
|
|
|
33
35
|
await this.fetchPage( this.baseURL );
|
|
34
36
|
this.createJSONLFile();
|
|
35
37
|
this.saveNumberedTextFiles();
|
|
38
|
+
this.createCSVFile();
|
|
36
39
|
console.log( "Scraping completed." );
|
|
37
40
|
}
|
|
38
41
|
|
|
@@ -152,6 +155,23 @@ class WebScraper
|
|
|
152
155
|
console.log( `Created JSONL file at: ${this.jsonlPath}` );
|
|
153
156
|
}
|
|
154
157
|
|
|
158
|
+
createCSVFile ()
{
	// Write every scraped page's clean text into a single-column CSV file
	// (header "text"), suitable for ML/LLM training pipelines.
	//
	// Writes to this.csvPath as-is — same convention as this.jsonlPath in
	// createJSONLFile — so the file lands where the constructor resolved it
	// (default: <scrapResultPath>/train.csv) and matches the path logged below.
	// The previous path.join( __dirname, this.csvPath ) wrote into the
	// package's own src/ directory instead of the user's dataset folder.
	const writeStream = fs.createWriteStream( this.csvPath );

	// CSV header row: one "text" column.
	writeStream.write( "text\n" );

	for ( const content of this.processedContent )
	{
		// RFC 4180 escaping: double embedded quotes, then wrap the whole
		// field in quotes so commas and newlines inside the text survive.
		const escapedText = content.text.replace( /"/g, "\"\"" );
		const csvLine = `"${escapedText}"\n`;
		writeStream.write( csvLine );
	}

	writeStream.end();
	console.log( `Created CSV file at: ${this.csvPath}` );
}
|
|
174
|
+
|
|
155
175
|
saveNumberedTextFiles ()
|
|
156
176
|
{
|
|
157
177
|
this.processedContent.forEach( ( content, index ) =>
|