clean-web-scraper 2.0.5 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -6
- package/example-usage.js +7 -4
- package/package.json +1 -1
- package/src/WebScraper.js +49 -10
package/README.md
CHANGED
|
@@ -12,7 +12,7 @@ A powerful Node.js web scraper that extracts clean, readable content from websit
|
|
|
12
12
|
- 🔄 Handles relative and absolute URLs like a pro
|
|
13
13
|
- 🎯 No duplicate page visits
|
|
14
14
|
- 📊 Generates JSONL output file for ML training
|
|
15
|
-
- 📊 AI-friendly clean text output (perfect for LLM fine-tuning!)
|
|
15
|
+
- 📊 AI-friendly clean text and csv output (perfect for LLM fine-tuning!)
|
|
16
16
|
|
|
17
17
|
## 🛠️ Prerequisites
|
|
18
18
|
|
|
@@ -43,11 +43,13 @@ npm install
|
|
|
43
43
|
const WebScraper = require('clean-web-scraper');
|
|
44
44
|
|
|
45
45
|
const scraper = new WebScraper({
|
|
46
|
-
baseURL: 'https://example.com',
|
|
47
|
-
|
|
48
|
-
excludeList: ['/admin', '/private'],
|
|
49
|
-
exactExcludeList: ['/specific-page']
|
|
50
|
-
jsonlPath: 'output.jsonl' // Optional: Custom JSONL output path
|
|
46
|
+
baseURL: 'https://example.com', // Required: The website to scrape
|
|
47
|
+
scrapResultPath: './output', // Required: Where to save the content
|
|
48
|
+
excludeList: ['/admin', '/private'], // Optional: Paths to exclude
|
|
49
|
+
exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
|
|
50
|
+
jsonlPath: 'output.jsonl', // Optional: Custom JSONL output path
|
|
51
|
+
textOutputPath: "./dataset/texts", // Optional: Custom text output path
|
|
52
|
+
csvPath: "./dataset/train.csv" // Optional: Custom CSV output path
|
|
51
53
|
});
|
|
52
54
|
|
|
53
55
|
scraper.start();
|
|
@@ -66,6 +68,7 @@ Your AI-ready content is saved in a clean, structured format:
|
|
|
66
68
|
- 📝 Pure text format, perfect for LLM training and fine-tuning
|
|
67
69
|
- 🤖 No HTML, no mess - just clean, structured text ready for AI consumption
|
|
68
70
|
- 📊 JSONL output for ML training
|
|
71
|
+
- 📈 CSV output with clean text content
|
|
69
72
|
|
|
70
73
|
## 🤖 AI/LLM Training Ready
|
|
71
74
|
|
package/example-usage.js
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
const WebScraper = require( "./src/WebScraper" );
|
|
2
2
|
|
|
3
|
+
// Configuration
|
|
3
4
|
const baseURL = "https://decolonizepalestine.com";
|
|
4
|
-
const
|
|
5
|
+
const scrapResultPath = "./dataset";
|
|
5
6
|
const excludeList = [
|
|
6
7
|
"https://decolonizepalestine.com/cdn-cgi",
|
|
7
8
|
"https://decolonizepalestine.com/introduction-to-palestine",
|
|
@@ -14,12 +15,14 @@ const exactExcludeList = [
|
|
|
14
15
|
"https://decolonizepalestine.com/"
|
|
15
16
|
]
|
|
16
17
|
|
|
17
|
-
|
|
18
|
+
// Initialize scraper with all available options
|
|
18
19
|
const scraper = new WebScraper({
|
|
19
20
|
baseURL,
|
|
20
|
-
|
|
21
|
+
scrapResultPath,
|
|
21
22
|
excludeList,
|
|
22
23
|
exactExcludeList,
|
|
23
|
-
jsonlPath: "./dataset/
|
|
24
|
+
jsonlPath: "./dataset/train.jsonl",
|
|
25
|
+
textOutputPath: "./dataset/texts",
|
|
26
|
+
csvPath: "./dataset/train.csv"
|
|
24
27
|
});
|
|
25
28
|
scraper.start();
|
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
|
@@ -7,11 +7,21 @@ const path = require( "path" );
|
|
|
7
7
|
|
|
8
8
|
class WebScraper
|
|
9
9
|
{
|
|
10
|
-
constructor ({
|
|
10
|
+
constructor ({
|
|
11
|
+
baseURL,
|
|
12
|
+
excludeList,
|
|
13
|
+
exactExcludeList,
|
|
14
|
+
scrapResultPath = "./dataset",
|
|
15
|
+
jsonlPath,
|
|
16
|
+
textOutputPath,
|
|
17
|
+
csvPath
|
|
18
|
+
})
|
|
11
19
|
{
|
|
12
20
|
this.baseURL = baseURL;
|
|
13
|
-
this.
|
|
14
|
-
this.
|
|
21
|
+
this.scrapResultPath = path.join( scrapResultPath, baseURL.replace( /^(https?:\/\/)?(www\.)?/, "" ).replace( /\/$/, "" ) );
|
|
22
|
+
this.jsonlPath = jsonlPath || path.join( this.scrapResultPath, "train.jsonl" );
|
|
23
|
+
this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
|
|
24
|
+
this.csvPath = csvPath || path.join( this.scrapResultPath, "train.csv" );
|
|
15
25
|
this.visited = new Set();
|
|
16
26
|
this.excludeList = new Set( excludeList );
|
|
17
27
|
this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
|
|
@@ -24,6 +34,9 @@ class WebScraper
|
|
|
24
34
|
this.visited.add( this.baseURL );
|
|
25
35
|
await this.fetchPage( this.baseURL );
|
|
26
36
|
this.createJSONLFile();
|
|
37
|
+
this.saveNumberedTextFiles();
|
|
38
|
+
this.createCSVFile();
|
|
39
|
+
console.log( "Scraping completed." );
|
|
27
40
|
}
|
|
28
41
|
|
|
29
42
|
async fetchPage ( url )
|
|
@@ -104,7 +117,7 @@ class WebScraper
|
|
|
104
117
|
{
|
|
105
118
|
urlPath = "/index";
|
|
106
119
|
}
|
|
107
|
-
const filePath = path.join( __dirname, this.folderPath, urlPath );
|
|
120
|
+
const filePath = path.join( __dirname, this.scrapResultPath, urlPath );
|
|
108
121
|
const dir = path.dirname( filePath );
|
|
109
122
|
|
|
110
123
|
// Create metadata object
|
|
@@ -142,6 +155,34 @@ class WebScraper
|
|
|
142
155
|
console.log( `Created JSONL file at: ${this.jsonlPath}` );
|
|
143
156
|
}
|
|
144
157
|
|
|
158
|
+
createCSVFile ()
|
|
159
|
+
{
|
|
160
|
+
const writeStream = fs.createWriteStream( path.join( __dirname, this.csvPath ) );
|
|
161
|
+
|
|
162
|
+
writeStream.write( "text\n" );
|
|
163
|
+
|
|
164
|
+
for ( const content of this.processedContent )
|
|
165
|
+
{
|
|
166
|
+
const escapedText = content.text.replace( /"/g, "\"\"" );
|
|
167
|
+
const csvLine = `"${escapedText}"\n`;
|
|
168
|
+
writeStream.write( csvLine );
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
writeStream.end();
|
|
172
|
+
console.log( `Created CSV file at: ${this.csvPath}` );
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
saveNumberedTextFiles ()
|
|
176
|
+
{
|
|
177
|
+
this.processedContent.forEach( ( content, index ) =>
|
|
178
|
+
{
|
|
179
|
+
const fileName = `${index + 1}.txt`;
|
|
180
|
+
const filePath = path.join( __dirname, this.textOutputPath, fileName );
|
|
181
|
+
fs.writeFileSync( filePath, content.text, "utf-8" );
|
|
182
|
+
console.log( `Created numbered text file: ${fileName}` );
|
|
183
|
+
});
|
|
184
|
+
}
|
|
185
|
+
|
|
145
186
|
processContent ( content )
|
|
146
187
|
{
|
|
147
188
|
let processed = content;
|
|
@@ -194,14 +235,12 @@ class WebScraper
|
|
|
194
235
|
|
|
195
236
|
createOutputDirectory ()
|
|
196
237
|
{
|
|
197
|
-
if ( fs.existsSync( path.join( __dirname, this.folderPath ) ) )
|
|
198
|
-
{
|
|
199
|
-
fs.rmSync( path.join( __dirname, this.folderPath ), { recursive: true, force: true });
|
|
200
|
-
}
|
|
201
|
-
if ( !fs.existsSync( path.join( __dirname, this.folderPath ) ) )
|
|
238
|
+
if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
|
|
202
239
|
{
|
|
203
|
-
fs.
|
|
240
|
+
fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
|
|
204
241
|
}
|
|
242
|
+
fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
|
|
243
|
+
fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
|
|
205
244
|
}
|
|
206
245
|
}
|
|
207
246
|
|