clean-web-scraper 2.3.2 → 2.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -2
- package/example-usage.js +10 -5
- package/package.json +1 -1
- package/src/WebScraper.js +49 -0
package/README.md
CHANGED
@@ -13,7 +13,8 @@ A powerful Node.js web scraper that extracts clean, readable content from websites
 - 🎯 No duplicate page visits
 - 📊 Generates JSONL output file for ML training
 - 📊 AI-friendly clean text and csv output (perfect for LLM fine-tuning!)
-- 📊 Rich metadata extraction
+- 📊 Rich metadata extraction
+- 📁 Combine results from multiple scrapers into a unified dataset
 
 ## 🛠️ Prerequisites
 
@@ -55,8 +56,10 @@ const scraper = new WebScraper({
   maxDepth: 3, // Optional: Maximum depth for recursive crawling
   includeTitles: true, // Optional: Include page titles in outputs
 });
-
 scraper.start();
+
+// Combine results from multiple scrapers
+WebScraper.combineResults('./combined-dataset', [scraper1, scraper2]);
 ```
 
 ```bash
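
The README's new combineResults example assumes both scrapers have already finished writing their outputs. Below is a minimal end-to-end sketch of that flow, assuming the package's main export is the WebScraper class; scraper1/scraper2 and their option placeholders are hypothetical stand-ins for real configurations like the README snippet above:

const WebScraper = require("clean-web-scraper"); // assumed main export

async function main ()
{
	// Hypothetical scrapers; configure with real options (maxDepth, includeTitles,
	// output paths, ...) as in the README snippet above
	const scraper1 = new WebScraper({ /* ... */ });
	const scraper2 = new WebScraper({ /* ... */ });

	await scraper1.start();
	await scraper2.start();

	// Writes combined.jsonl, combined.csv and a texts/ folder under ./combined-dataset
	WebScraper.combineResults("./combined-dataset", [scraper1, scraper2]);
}

main();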
package/example-usage.js
CHANGED
@@ -21,7 +21,8 @@ async function khameneiIrFreePalestineTag ()
 		csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
 		includeTitles: true
 	});
-	await scraper.start();
+	// await scraper.start();
+	return scraper;
 }
 
 async function decolonizepalestine ()
@@ -46,14 +47,18 @@ async function decolonizepalestine ()
 		textOutputPath: "./dataset/decolonizepalestine/texts",
 		csvOutputPath: "./dataset/decolonizepalestine/train.csv"
 	});
-	await scraper.start();
+	// await scraper.start();
+	return scraper;
 }
 
 void async function main ()
 {
-	await khameneiIrFreePalestineTag();
-	await decolonizepalestine();
-
+	const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
+	const decolonizepalestineScraper = await decolonizepalestine();
+	WebScraper.combineResults( "./dataset/combined", [
+		khameneiIrFreePalestineTagScraper,
+		decolonizepalestineScraper
+	] );
 
 	// 3
 	// https://bdsmovement.net
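
In the updated example the start() calls are commented out, so combineResults reads output files produced by an earlier run. Since the new combineResults (see WebScraper.js below) only reads each entry's jsonlOutputPath, csvOutputPath and textOutputPath, and resolves every path relative to WebScraper.js's own directory, previously generated outputs can also be combined with plain objects; a sketch, where the .jsonl paths and the first texts path are assumptions not shown in this diff:

const WebScraper = require("./src/WebScraper"); // assumed import path

WebScraper.combineResults("./dataset/combined", [
	{
		jsonlOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.jsonl", // assumed
		csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
		textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts" // assumed
	},
	{
		jsonlOutputPath: "./dataset/decolonizepalestine/train.jsonl", // assumed
		csvOutputPath: "./dataset/decolonizepalestine/train.csv",
		textOutputPath: "./dataset/decolonizepalestine/texts"
	}
]);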
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
@@ -269,6 +269,55 @@ class WebScraper
 		fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
 		fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
 	}
+
+	static combineResults ( outputPath, websites )
+	{
+		const fullOutputPath = path.join( __dirname, outputPath );
+
+		// Create output directories
+		fs.mkdirSync( fullOutputPath, { recursive: true });
+		fs.mkdirSync( path.join( fullOutputPath, "texts" ), { recursive: true });
+
+		// Combine JSONL files
+		const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) );
+		for ( const website of websites )
+		{
+			const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
+			jsonlOutput.write( jsonlContent );
+		}
+		jsonlOutput.end();
+
+		// Combine CSV files
+		const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
+		csvOutput.write( "text\n" );
+		for ( const website of websites )
+		{
+			const csvContent = fs.readFileSync( path.join( __dirname, website.csvOutputPath ), "utf-8" )
+				.split( "\n" )
+				.slice( 1 ) // Skip header
+				.filter( line => { return line.trim() });
+			csvOutput.write( `${csvContent.join( "\n" ) }\n` );
+		}
+		csvOutput.end();
+
+		// Combine text files
+		let textFileCounter = 1;
+		for ( const website of websites )
+		{
+			const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
+			for ( const file of textFiles )
+			{
+				const content = fs.readFileSync( path.join( __dirname, website.textOutputPath, file ), "utf-8" );
+				fs.writeFileSync(
+					path.join( fullOutputPath, "texts", `${textFileCounter}.txt` ),
+					content,
+					"utf-8"
+				);
+				textFileCounter++;
+			}
+		}
+	}
+
 }
 
 module.exports = WebScraper;
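
Each per-site JSONL file is concatenated verbatim into combined.jsonl, so the combined file stays one JSON record per line (the CSV branch, by contrast, splits on "\n", which would break quoted multi-line fields). A sketch of consuming the combined JSONL, assuming the combined dataset sits at ./dataset/combined relative to the script:

const fs = require("fs");
const path = require("path");

// Assumed location: adjust to wherever combineResults wrote the files
// (it resolves its outputPath relative to WebScraper.js's directory via __dirname).
const combined = path.join(__dirname, "dataset/combined/combined.jsonl");

// Parse one JSON record per non-empty line
const records = fs.readFileSync(combined, "utf-8")
	.split("\n")
	.filter(line => line.trim())
	.map(line => JSON.parse(line));

console.log(`${records.length} combined records`);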