clean-web-scraper 2.3.2 → 2.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -2
- package/example-usage.js +10 -5
- package/package.json +1 -1
- package/src/WebScraper.js +49 -0
package/README.md
CHANGED
@@ -13,7 +13,8 @@ A powerful Node.js web scraper that extracts clean, readable content from websites
 - 🎯 No duplicate page visits
 - 📊 Generates JSONL output file for ML training
 - 📊 AI-friendly clean text and csv output (perfect for LLM fine-tuning!)
-- 📊 Rich metadata extraction
+- 📊 Rich metadata extraction
+- 📁 Combine results from multiple scrapers into a unified dataset
 
 ## 🛠️ Prerequisites
 
@@ -55,8 +56,10 @@ const scraper = new WebScraper({
   maxDepth: 3, // Optional: Maximum depth for recursive crawling
   includeTitles: true, // Optional: Include page titles in outputs
 });
-
 scraper.start();
+
+// Combine results from multiple scrapers
+WebScraper.combineResults('./combined-dataset', [scraper1, scraper2]);
 ```
 
 ```bash
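
The README's new combineResults example assumes both scrapers have already finished writing their outputs. Below is a minimal end-to-end sketch of that flow, assuming the package's main export is the WebScraper class; scraper1/scraper2 and their option placeholders are hypothetical stand-ins for real configurations like the README snippet above:

const WebScraper = require("clean-web-scraper"); // assumed main export

async function main ()
{
	// Hypothetical scrapers; configure with real options (maxDepth, includeTitles,
	// output paths, ...) as in the README snippet above
	const scraper1 = new WebScraper({ /* ... */ });
	const scraper2 = new WebScraper({ /* ... */ });

	await scraper1.start();
	await scraper2.start();

	// Writes combined.jsonl, combined.csv and a texts/ folder under ./combined-dataset
	WebScraper.combineResults("./combined-dataset", [scraper1, scraper2]);
}

main();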
package/example-usage.js
CHANGED
@@ -21,7 +21,8 @@ async function khameneiIrFreePalestineTag ()
 		csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
 		includeTitles: true
 	});
-	await scraper.start();
+	// await scraper.start();
+	return scraper;
 }
 
 async function decolonizepalestine ()
@@ -46,14 +47,18 @@ async function decolonizepalestine ()
 		textOutputPath: "./dataset/decolonizepalestine/texts",
 		csvOutputPath: "./dataset/decolonizepalestine/train.csv"
 	});
-	await scraper.start();
+	// await scraper.start();
+	return scraper;
 }
 
 void async function main ()
 {
-	await khameneiIrFreePalestineTag();
-	await decolonizepalestine();
-
+	const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
+	const decolonizepalestineScraper = await decolonizepalestine();
+	WebScraper.combineResults( "./dataset/combined", [
+		khameneiIrFreePalestineTagScraper,
+		decolonizepalestineScraper
+	] );
 
 	// 3
 	// https://bdsmovement.net
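
In the updated example the start() calls are commented out, so combineResults reads output files produced by an earlier run. Since the new combineResults (see WebScraper.js below) only reads each entry's jsonlOutputPath, csvOutputPath and textOutputPath, and resolves every path relative to WebScraper.js's own directory, previously generated outputs can also be combined with plain objects; a sketch, where the .jsonl paths and the first texts path are assumptions not shown in this diff:

const WebScraper = require("./src/WebScraper"); // assumed import path

WebScraper.combineResults("./dataset/combined", [
	{
		jsonlOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.jsonl", // assumed
		csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
		textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts" // assumed
	},
	{
		jsonlOutputPath: "./dataset/decolonizepalestine/train.jsonl", // assumed
		csvOutputPath: "./dataset/decolonizepalestine/train.csv",
		textOutputPath: "./dataset/decolonizepalestine/texts"
	}
]);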
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
@@ -269,6 +269,55 @@ class WebScraper
 		fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
 		fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
 	}
+
+	static combineResults ( outputPath, websites )
+	{
+		const fullOutputPath = path.join( __dirname, outputPath );
+
+		// Create output directories
+		fs.mkdirSync( fullOutputPath, { recursive: true });
+		fs.mkdirSync( path.join( fullOutputPath, "texts" ), { recursive: true });
+
+		// Combine JSONL files
+		const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) );
+		for ( const website of websites )
+		{
+			const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
+			jsonlOutput.write( jsonlContent );
+		}
+		jsonlOutput.end();
+
+		// Combine CSV files
+		const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
+		csvOutput.write( "text\n" );
+		for ( const website of websites )
+		{
+			const csvContent = fs.readFileSync( path.join( __dirname, website.csvOutputPath ), "utf-8" )
+				.split( "\n" )
+				.slice( 1 ) // Skip header
+				.filter( line => { return line.trim() });
+			csvOutput.write( `${csvContent.join( "\n" ) }\n` );
+		}
+		csvOutput.end();
+
+		// Combine text files
+		let textFileCounter = 1;
+		for ( const website of websites )
+		{
+			const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
+			for ( const file of textFiles )
+			{
+				const content = fs.readFileSync( path.join( __dirname, website.textOutputPath, file ), "utf-8" );
+				fs.writeFileSync(
+					path.join( fullOutputPath, "texts", `${textFileCounter}.txt` ),
+					content,
+					"utf-8"
+				);
+				textFileCounter++;
+			}
+		}
+	}
+
 }
 
 module.exports = WebScraper;
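
Each per-site JSONL file is concatenated verbatim into combined.jsonl, so the combined file stays one JSON record per line (the CSV branch, by contrast, splits on "\n", which would break quoted multi-line fields). A sketch of consuming the combined JSONL, assuming the combined dataset sits at ./dataset/combined relative to the script:

const fs = require("fs");
const path = require("path");

// Assumed location: adjust to wherever combineResults wrote the files
// (it resolves its outputPath relative to WebScraper.js's directory via __dirname).
const combined = path.join(__dirname, "dataset/combined/combined.jsonl");

// Parse one JSON record per non-empty line
const records = fs.readFileSync(combined, "utf-8")
	.split("\n")
	.filter(line => line.trim())
	.map(line => JSON.parse(line));

console.log(`${records.length} combined records`);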