clean-web-scraper 2.3.1 → 2.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -11
- package/example-usage.js +17 -10
- package/package.json +1 -1
- package/src/WebScraper.js +82 -23
package/README.md
CHANGED

````diff
@@ -13,7 +13,8 @@ A powerful Node.js web scraper that extracts clean, readable content from websites
 - 🎯 No duplicate page visits
 - 📊 Generates JSONL output file for ML training
 - 📊 AI-friendly clean text and csv output (perfect for LLM fine-tuning!)
-- 📊 Rich metadata extraction
+- 📊 Rich metadata extraction
+- 📁 Combine results from multiple scrapers into a unified dataset
 
 ## 🛠️ Prerequisites
 
@@ -44,17 +45,21 @@ npm install
 const WebScraper = require('clean-web-scraper');
 
 const scraper = new WebScraper({
-  baseURL: 'https://example.com',
-  startURL: 'https://example.com/blog',
-  excludeList: ['/admin', '/private'],
-  exactExcludeList: ['/specific-page'],
-  scrapResultPath: './example.com/website',
-
-  textOutputPath: "./example.com/texts",
-
+  baseURL: 'https://example.com/news', // Required: The website base url to scrape
+  startURL: 'https://example.com/blog', // Optional: Custom starting URL
+  excludeList: ['/admin', '/private'], // Optional: Paths to exclude
+  exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
+  scrapResultPath: './example.com/website', // Required: Where to save the content
+  jsonlOutputPath: './example.com/train.jsonl', // Optional: Custom JSONL output path
+  textOutputPath: "./example.com/texts", // Optional: Custom text output path
+  csvOutputPath: "./example.com/train.csv" // Optional: Custom CSV output path
+  maxDepth: 3, // Optional: Maximum depth for recursive crawling
+  includeTitles: true, // Optional: Include page titles in outputs
 });
-
 scraper.start();
+
+// Combine results from multiple scrapers
+WebScraper.combineResults('./combined-dataset', [scraper1, scraper2]);
 ```
 
 ```bash
@@ -92,7 +97,7 @@ example.com/
 The output is specifically formatted for AI training purposes:
 
 - Clean, processed text without HTML markup
--
+- Multiple formats (JSONL, CSV, text files)
 - Structured content perfect for fine-tuning LLMs
 - Ready to use in your ML pipelines
 
````
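Taken together, the README changes document four new constructor options (`jsonlOutputPath`, `csvOutputPath`, `maxDepth`, `includeTitles`) and the new static `WebScraper.combineResults` helper. As published, the updated usage snippet appears to be missing a comma after the `csvOutputPath` entry, so it would not parse as-is; a minimal runnable sketch of the same configuration (values copied from the README, comments shortened) looks like this:

```js
const WebScraper = require('clean-web-scraper');

const scraper = new WebScraper({
  baseURL: 'https://example.com/news',          // Required: base URL to scrape
  startURL: 'https://example.com/blog',         // Optional: custom starting URL
  excludeList: ['/admin', '/private'],          // Optional: path prefixes to skip
  exactExcludeList: ['/specific-page'],         // Optional: exact URLs to skip
  scrapResultPath: './example.com/website',     // Required: where raw pages are saved
  jsonlOutputPath: './example.com/train.jsonl', // Optional: JSONL output path
  textOutputPath: './example.com/texts',        // Optional: numbered text files
  csvOutputPath: './example.com/train.csv',     // Optional: CSV output path
  maxDepth: 3,                                  // Optional: crawl depth limit
  includeTitles: true                           // Optional: prefix outputs with page titles
});

scraper.start();
```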
package/example-usage.js
CHANGED

```diff
@@ -9,19 +9,22 @@ async function khameneiIrFreePalestineTag ()
 	const scraper = new WebScraper({
 		baseURL: "https://english.khamenei.ir/news",
 		startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
+		maxDepth: 1,
 		excludeList: [
 		],
 		exactExcludeList: [
+			"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#"
 		],
 		scrapResultPath: "./dataset/khamenei-ir-free-palestine-tag/website",
-
+		jsonlOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.jsonl",
 		textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
-
+		csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
+		includeTitles: true
 	});
-	await scraper.start();
+	// await scraper.start();
+	return scraper;
 }
 
-// decolonizepalestine
 async function decolonizepalestine ()
 {
 	// 2
@@ -40,18 +43,22 @@ async function decolonizepalestine ()
 			"https://decolonizepalestine.com/"
 		],
 		scrapResultPath: "./dataset/decolonizepalestine/website",
-
+		jsonlOutputPath: "./dataset/decolonizepalestine/train.jsonl",
 		textOutputPath: "./dataset/decolonizepalestine/texts",
-
+		csvOutputPath: "./dataset/decolonizepalestine/train.csv"
 	});
-	await scraper.start();
+	// await scraper.start();
+	return scraper;
 }
 
 void async function main ()
 {
-
-	await decolonizepalestine();
-
+	const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
+	const decolonizepalestineScraper = await decolonizepalestine();
+	WebScraper.combineResults( "./dataset/combined", [
+		khameneiIrFreePalestineTagScraper,
+		decolonizepalestineScraper
+	] );
 
 	// 3
 	// https://bdsmovement.net
```
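Both helper functions now return their configured scraper with `scraper.start()` commented out, and `main()` hands the un-started scrapers straight to `WebScraper.combineResults`. Because `combineResults` (see the WebScraper.js hunks below) reads each scraper's JSONL, CSV, and text outputs from disk, a fresh checkout would presumably need to run the scrapes first; a hedged variant of `main()` along those lines:

```js
void async function main ()
{
	const khamenei = await khameneiIrFreePalestineTag();
	const decolonize = await decolonizepalestine();

	// Assumption: the per-site outputs are not already on disk;
	// skip these two calls if the dataset/ folders are cached.
	await khamenei.start();
	await decolonize.start();

	WebScraper.combineResults( "./dataset/combined", [ khamenei, decolonize ] );
}();
```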
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED

```diff
@@ -10,38 +10,46 @@ class WebScraper
 	constructor ({
 		baseURL,
 		startURL,
+		maxDepth = Infinity,
 		excludeList,
 		exactExcludeList,
 		scrapResultPath = "./dataset",
-
+		jsonlOutputPath,
 		textOutputPath,
-
+		csvOutputPath,
+		includeTitles = false
 	})
 	{
 		this.baseURL = baseURL;
 		this.startURL = startURL || baseURL;
+		this.maxDepth = maxDepth;
 		this.scrapResultPath = scrapResultPath;
-		this.
+		this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
 		this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
-		this.
+		this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
+		this.includeTitles = includeTitles;
 		this.visited = new Set();
 		this.excludeList = new Set( excludeList );
 		this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
-		this.allProcessedContent = [];
+		this.allProcessedContent = [];
 		this.createOutputDirectory();
 	}
 
 	async start ()
 	{
-		await this.fetchPage( this.startURL );
+		await this.fetchPage( this.startURL, 0 );
 		this.createJSONLFile();
 		this.saveNumberedTextFiles();
 		this.createCSVFile();
 		console.log( "Scraping completed." );
 	}
 
-	async fetchPage ( url )
+	async fetchPage ( url, depth )
 	{
+		if ( depth > this.maxDepth )
+		{
+			return;
+		}
 		this.visited.add( url );
 		try
 		{
@@ -57,6 +65,7 @@ class WebScraper
 			if ( article )
 			{
 				const metadata = this.metadataextractor( url, document, headers );
+				metadata.depth = depth;
 				this.saveArticle( url, article.textContent, metadata );
 			}
 			else
@@ -70,7 +79,7 @@ class WebScraper
 			{
 				if ( !this.visited.has( link ) )
 				{
-					await this.fetchPage( link );
+					await this.fetchPage( link, depth + 1 );
 				}
 			}
 		}
```
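The crawl now carries an explicit depth: `start()` seeds `fetchPage( this.startURL, 0 )`, each followed link is fetched at `depth + 1`, and a page is skipped only once `depth > this.maxDepth`, so `maxDepth: 1` covers the start page plus the pages it links to, with every saved article recording its own depth in the metadata. A small sketch of that behaviour (the example.com values are placeholders):

```js
const WebScraper = require("clean-web-scraper");

const scraper = new WebScraper({
	baseURL: "https://example.com",
	scrapResultPath: "./example.com/website",
	maxDepth: 1 // depth 0 = start page, depth 1 = pages it links to, depth 2+ = skipped
});

// start() internally calls fetchPage( this.startURL, 0 );
// each followed link is fetched with depth + 1, and each saved
// <page>.json metadata file now carries a "depth" field.
scraper.start();
```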
```diff
@@ -122,23 +131,16 @@ class WebScraper
 		const filePath = path.join( __dirname, this.scrapResultPath, urlPath );
 		const dir = path.dirname( filePath );
 
-		// Create directory if it doesn't exist
 		fs.mkdirSync( dir, { recursive: true });
-
-		// Save the text content
 		fs.writeFileSync( `${filePath}.txt`, processedContent, "utf-8" );
-
-		// Save the JSON metadata
 		fs.writeFileSync( `${filePath}.json`, JSON.stringify( metadata, null, 2 ), "utf-8" );
-
 		console.log( `Saved: ${filePath}.txt` );
 		console.log( `Saved: ${filePath}.json` );
 	}
 
 	createJSONLFile ()
 	{
-		const writeStream = fs.createWriteStream( path.join( __dirname, this.
-
+		const writeStream = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
 		for ( const content of this.allProcessedContent )
 		{
 			const jsonLine = `${JSON.stringify( content )}\n`;
@@ -146,24 +148,27 @@ class WebScraper
 		}
 
 		writeStream.end();
-		console.log( `Created JSONL file at: ${this.
+		console.log( `Created JSONL file at: ${this.jsonlOutputPath}` );
 	}
 
 	createCSVFile ()
 	{
-		const writeStream = fs.createWriteStream( path.join( __dirname, this.
-
+		const writeStream = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
 		writeStream.write( "text\n" );
-
 		for ( const content of this.allProcessedContent )
 		{
-
+			let fullText = content.text;
+			if ( this.includeTitles && content.metadata.title )
+			{
+				fullText = `Title: ${content.metadata.title}\n\n${content.text}`;
+			}
+			const escapedText = fullText.replace( /"/g, "\"\"" );
 			const csvLine = `"${escapedText}"\n`;
 			writeStream.write( csvLine );
 		}
 
 		writeStream.end();
-		console.log( `Created CSV file at: ${this.
+		console.log( `Created CSV file at: ${this.csvOutputPath}` );
 	}
 
 	saveNumberedTextFiles ()
@@ -172,7 +177,12 @@ class WebScraper
 		{
 			const fileName = `${index + 1}.txt`;
 			const filePath = path.join( __dirname, this.textOutputPath, fileName );
-
+			let titlePrefix = "";
+			if ( this.includeTitles && content.metadata.title )
+			{
+				titlePrefix = `Title: ${content.metadata.title}\n\n`;
+			}
+			fs.writeFileSync( filePath, titlePrefix + content.text, "utf-8" );
 			console.log( `Created numbered text file: ${fileName}` );
 		});
 	}
```
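With `includeTitles` enabled, both the CSV rows and the numbered text files are prefixed with `Title: <page title>` and a blank line, and double quotes are escaped by doubling them before the row is wrapped in quotes. A quick illustration of the resulting CSV row (the entry below is made up, mirroring the fields the code reads):

```js
// Hypothetical processed entry with the same shape as this.allProcessedContent items
const content = {
	text: 'He said "free" and left.',
	metadata: { title: "Example Page" }
};

let fullText = `Title: ${content.metadata.title}\n\n${content.text}`;
const escapedText = fullText.replace( /"/g, "\"\"" );
console.log( `"${escapedText}"` );
// -> "Title: Example Page
//
//    He said ""free"" and left."
// (the embedded newlines stay inside the quoted CSV field)
```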
```diff
@@ -259,6 +269,55 @@ class WebScraper
 		fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
 		fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
 	}
+
+	static combineResults ( outputPath, websites )
+	{
+		const fullOutputPath = path.join( __dirname, outputPath );
+
+		// Create output directories
+		fs.mkdirSync( fullOutputPath, { recursive: true });
+		fs.mkdirSync( path.join( fullOutputPath, "texts" ), { recursive: true });
+
+		// Combine JSONL files
+		const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) );
+		for ( const website of websites )
+		{
+			const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
+			jsonlOutput.write( jsonlContent );
+		}
+		jsonlOutput.end();
+
+		// Combine CSV files
+		const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
+		csvOutput.write( "text\n" );
+		for ( const website of websites )
+		{
+			const csvContent = fs.readFileSync( path.join( __dirname, website.csvOutputPath ), "utf-8" )
+				.split( "\n" )
+				.slice( 1 ) // Skip header
+				.filter( line => { return line.trim() });
+			csvOutput.write( `${csvContent.join( "\n" ) }\n` );
+		}
+		csvOutput.end();
+
+		// Combine text files
+		let textFileCounter = 1;
+		for ( const website of websites )
+		{
+			const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
+			for ( const file of textFiles )
+			{
+				const content = fs.readFileSync( path.join( __dirname, website.textOutputPath, file ), "utf-8" );
+				fs.writeFileSync(
+					path.join( fullOutputPath, "texts", `${textFileCounter}.txt` ),
+					content,
+					"utf-8"
+				);
+				textFileCounter++;
+			}
+		}
+	}
+
 }
 
 module.exports = WebScraper;
```
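One detail the new `combineResults` shares with the existing writers is that every relative path is joined onto `__dirname`, i.e. the `src` directory of `WebScraper.js` inside the installed package, rather than the caller's working directory. A hedged sketch of where `"./dataset/combined"` would land under a typical npm install (the absolute prefix is an assumption):

```js
const path = require("path");

// Assumption: the package is installed under node_modules, so __dirname inside
// WebScraper.js resolves to <project>/node_modules/clean-web-scraper/src
const webScraperDirname = "/my-project/node_modules/clean-web-scraper/src";

// combineResults( "./dataset/combined", ... ) would therefore write to:
console.log( path.join( webScraperDirname, "./dataset/combined" ) );
// -> /my-project/node_modules/clean-web-scraper/src/dataset/combined
```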