clean-web-scraper 2.3.2 → 2.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -13,7 +13,8 @@ A powerful Node.js web scraper that extracts clean, readable content from websit
 - 🎯 No duplicate page visits
 - 📊 Generates JSONL output file for ML training
 - 📊 AI-friendly clean text and csv output (perfect for LLM fine-tuning!)
-- 📊 Rich metadata extraction including:
+- 📊 Rich metadata extraction
+- 📁 Combine results from multiple scrapers into a unified dataset

 ## 🛠️ Prerequisites

@@ -55,8 +56,10 @@ const scraper = new WebScraper({
 	maxDepth: 3, // Optional: Maximum depth for recursive crawling
 	includeTitles: true, // Optional: Include page titles in outputs
 });
-
 scraper.start();
+
+// Combine results from multiple scrapers
+WebScraper.combineResults('./combined-dataset', [scraper1, scraper2]);
 ```

 ```bash
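Taken together, the README changes advertise the new static `WebScraper.combineResults( outputPath, scrapers )` API. A minimal runnable sketch of that workflow, assuming the package is installed; `baseURL` is an assumed option name for the site to crawl (it does not appear in this diff), while the other options are taken from the diff:

```js
const WebScraper = require( "clean-web-scraper" );

void async function main ()
{
	// Two independently configured scrapers; output paths follow the
	// shapes used in example-usage.js. baseURL is assumed, not confirmed.
	const scraperA = new WebScraper({
		baseURL: "https://site-a.example",
		textOutputPath: "./dataset/site-a/texts",
		csvOutputPath: "./dataset/site-a/train.csv",
		includeTitles: true
	});
	const scraperB = new WebScraper({
		baseURL: "https://site-b.example",
		textOutputPath: "./dataset/site-b/texts",
		csvOutputPath: "./dataset/site-b/train.csv"
	});

	await scraperA.start();
	await scraperB.start();

	// New in 2.3.3: merge both runs into a single dataset directory
	WebScraper.combineResults( "./combined-dataset", [ scraperA, scraperB ] );
}();
```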
package/example-usage.js CHANGED
@@ -21,7 +21,8 @@ async function khameneiIrFreePalestineTag ()
 		csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
 		includeTitles: true
 	});
-	await scraper.start();
+	// await scraper.start();
+	return scraper;
 }

 async function decolonizepalestine ()
@@ -46,14 +47,18 @@ async function decolonizepalestine ()
 		textOutputPath: "./dataset/decolonizepalestine/texts",
 		csvOutputPath: "./dataset/decolonizepalestine/train.csv"
 	});
-	await scraper.start();
+	// await scraper.start();
+	return scraper;
 }

 void async function main ()
 {
-	await khameneiIrFreePalestineTag();
-	await decolonizepalestine();
-
+	const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
+	const decolonizepalestineScraper = await decolonizepalestine();
+	WebScraper.combineResults( "./dataset/combined", [
+		khameneiIrFreePalestineTagScraper,
+		decolonizepalestineScraper
+	] );

 	// 3
 	// https://bdsmovement.net
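Judging from the `combineResults` implementation in package/src/WebScraper.js below, the combined run in `main()` should produce three things under `./dataset/combined`: a `combined.jsonl` that is the per-site JSONL files concatenated, a `combined.csv` with a single `text` header (each site's own header row stripped), and a `texts/` directory of `1.txt`, `2.txt`, … with the text files renumbered sequentially across all sites.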
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
 	"name": "clean-web-scraper",
-	"version": "2.3.2",
+	"version": "2.3.3",
 	"main": "main.js",
 	"scripts": {
 		"start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -269,6 +269,55 @@ class WebScraper
 		fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
 		fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
 	}
+
+	static combineResults ( outputPath, websites )
+	{
+		const fullOutputPath = path.join( __dirname, outputPath );
+
+		// Create output directories
+		fs.mkdirSync( fullOutputPath, { recursive: true });
+		fs.mkdirSync( path.join( fullOutputPath, "texts" ), { recursive: true });
+
+		// Combine JSONL files
+		const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) );
+		for ( const website of websites )
+		{
+			const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
+			jsonlOutput.write( jsonlContent );
+		}
+		jsonlOutput.end();
+
+		// Combine CSV files
+		const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
+		csvOutput.write( "text\n" );
+		for ( const website of websites )
+		{
+			const csvContent = fs.readFileSync( path.join( __dirname, website.csvOutputPath ), "utf-8" )
+				.split( "\n" )
+				.slice( 1 ) // Skip header
+				.filter( line => { return line.trim() });
+			csvOutput.write( `${csvContent.join( "\n" ) }\n` );
+		}
+		csvOutput.end();
+
+		// Combine text files
+		let textFileCounter = 1;
+		for ( const website of websites )
+		{
+			const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
+			for ( const file of textFiles )
+			{
+				const content = fs.readFileSync( path.join( __dirname, website.textOutputPath, file ), "utf-8" );
+				fs.writeFileSync(
+					path.join( fullOutputPath, "texts", `${textFileCounter}.txt` ),
+					content,
+					"utf-8"
+				);
+				textFileCounter++;
+			}
+		}
+	}
+
 }

 module.exports = WebScraper;
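One detail worth noting in the implementation: every path, including the caller-supplied `outputPath`, is resolved against the package's own `__dirname` (the directory containing src/WebScraper.js), not the caller's working directory. A quick sanity-check sketch for a combined dataset, not part of the package; the directory path below is illustrative and must point at wherever the combined output actually landed:

```js
const fs = require( "fs" );
const path = require( "path" );

// Illustrative location; combineResults resolves outputPath against the
// package's src directory, so adjust this to the real output location.
const combinedDir = "./dataset/combined";

// combined.jsonl is a plain concatenation: one JSON record per line
const records = fs.readFileSync( path.join( combinedDir, "combined.jsonl" ), "utf-8" )
	.split( "\n" )
	.filter( line => line.trim() )
	.map( line => JSON.parse( line ) );

console.log( `${records.length} combined records` );
```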