clean-web-scraper 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -11,8 +11,8 @@ A powerful Node.js web scraper that extracts clean, readable content from websit
  - 🚫 Excludes unwanted paths from scraping
  - 🔄 Handles relative and absolute URLs like a pro
  - 🎯 No duplicate page visits
- - 📊 Generates JSONL and raw text output file for ML training
- - 📊 AI-friendly clean text output (perfect for LLM fine-tuning!)
+ - 📊 Generates JSONL output file for ML training
+ - 📊 AI-friendly clean text and csv output (perfect for LLM fine-tuning!)

  ## 🛠️ Prerequisites

@@ -48,7 +48,8 @@ const scraper = new WebScraper({
  excludeList: ['/admin', '/private'], // Optional: Paths to exclude
  exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
  jsonlPath: 'output.jsonl', // Optional: Custom JSONL output path
- textOutputPath: "./dataset/texts" // Optional: Custom text output path
+ textOutputPath: "./dataset/texts", // Optional: Custom text output path
+ csvPath: "./dataset/train.csv" // Optional: Custom CSV output path
  });

  scraper.start();
@@ -67,6 +68,7 @@ Your AI-ready content is saved in a clean, structured format:
  - 📝 Pure text format, perfect for LLM training and fine-tuning
  - 🤖 No HTML, no mess - just clean, structured text ready for AI consumption
  - 📊 JSONL output for ML training
+ - 📈 CSV output with clean text content

  ## 🤖 AI/LLM Training Ready

package/example-usage.js CHANGED
@@ -22,6 +22,7 @@ const scraper = new WebScraper({
  excludeList,
  exactExcludeList,
  jsonlPath: "./dataset/train.jsonl",
- textOutputPath: "./dataset/texts"
+ textOutputPath: "./dataset/texts",
+ csvPath: "./dataset/train.csv"
  });
  scraper.start();
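Since start() awaits fetchPage before invoking the JSONL, text, and CSV writers (see the package/src/WebScraper.js hunks below), a caller that wants to run code only after scraping finishes can await it. A minimal sketch, assuming the class is required from ./src/WebScraper (the require line is outside this hunk) and an illustrative baseURL:

const WebScraper = require( "./src/WebScraper" ); // path assumed from the package layout shown in this diff

const scraper = new WebScraper({
	baseURL: "https://example.com", // illustrative target site
	jsonlPath: "./dataset/train.jsonl",
	textOutputPath: "./dataset/texts",
	csvPath: "./dataset/train.csv" // new in 2.2.0
});

( async () =>
{
	await scraper.start(); // awaits fetchPage, then runs createJSONLFile, saveNumberedTextFiles and createCSVFile
	console.log( "Scrape finished; CSV target:", scraper.csvPath );
} )();
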
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "clean-web-scraper",
- "version": "2.1.0",
+ "version": "2.2.0",
  "main": "main.js",
  "scripts": {
  "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -13,13 +13,15 @@ class WebScraper
  exactExcludeList,
  scrapResultPath = "./dataset",
  jsonlPath,
- textOutputPath
+ textOutputPath,
+ csvPath
  })
  {
  this.baseURL = baseURL;
  this.scrapResultPath = path.join( scrapResultPath, baseURL.replace( /^(https?:\/\/)?(www\.)?/, "" ).replace( /\/$/, "" ) );
  this.jsonlPath = jsonlPath || path.join( this.scrapResultPath, "train.jsonl" );
  this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
+ this.csvPath = csvPath || path.join( this.scrapResultPath, "train.csv" );
  this.visited = new Set();
  this.excludeList = new Set( excludeList );
  this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
@@ -33,6 +35,7 @@ class WebScraper
  await this.fetchPage( this.baseURL );
  this.createJSONLFile();
  this.saveNumberedTextFiles();
+ this.createCSVFile();
  console.log( "Scraping completed." );
  }

@@ -152,6 +155,23 @@ class WebScraper
  console.log( `Created JSONL file at: ${this.jsonlPath}` );
  }

+ createCSVFile ()
+ {
+ const writeStream = fs.createWriteStream( path.join( __dirname, this.csvPath ) );
+
+ writeStream.write( "text\n" );
+
+ for ( const content of this.processedContent )
+ {
+ const escapedText = content.text.replace( /"/g, "\"\"" );
+ const csvLine = `"${escapedText}"\n`;
+ writeStream.write( csvLine );
+ }
+
+ writeStream.end();
+ console.log( `Created CSV file at: ${this.csvPath}` );
+ }
+
  saveNumberedTextFiles ()
  {
  this.processedContent.forEach( ( content, index ) =>
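
Two notes on the createCSVFile hunk above. First, the write path is path.join( __dirname, this.csvPath ), so a relative csvPath resolves against the package's src/ directory rather than the process working directory. Second, each text field is wrapped in double quotes with embedded quotes doubled (RFC 4180-style quoting), so a reader must honor that quoting; readTextColumn below is a hypothetical, dependency-free sketch for exactly this one-column shape (a general-purpose CSV parser such as csv-parse is the safer choice for anything else):

const fs = require( "fs" );

// Hypothetical helper: read back the single "text" column that createCSVFile writes.
// It only understands this exact shape: one double-quoted field per record, quotes doubled.
function readTextColumn ( csvFilePath )
{
	const raw = fs.readFileSync( csvFilePath, "utf8" );
	const rows = [];
	let field = "";
	let inQuotes = false;

	for ( let i = 0; i < raw.length; i++ )
	{
		const ch = raw[i];
		if ( inQuotes )
		{
			if ( ch === "\"" )
			{
				if ( raw[i + 1] === "\"" ) { field += "\""; i++; } // doubled quote -> literal quote
				else { inQuotes = false; } // closing quote of the field
			}
			else { field += ch; } // keeps any newlines embedded in the quoted text
		}
		else if ( ch === "\"" ) { inQuotes = true; }
		else if ( ch === "\n" ) { rows.push( field ); field = ""; } // end of record
		// other unquoted characters (the "text" header) are ignored
	}

	return rows.slice( 1 ); // drop the record produced by the header line
}

// Illustrative path; see the __dirname note above for where a relative csvPath actually lands.
console.log( `Loaded ${readTextColumn( "./dataset/train.csv" ).length} text rows` );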