clean-web-scraper 2.0.5 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -6
- package/example-usage.js +6 -4
- package/package.json +1 -1
- package/src/WebScraper.js +29 -10
package/README.md
CHANGED
```diff
@@ -11,7 +11,7 @@ A powerful Node.js web scraper that extracts clean, readable content from websites
 - 🚫 Excludes unwanted paths from scraping
 - 🔄 Handles relative and absolute URLs like a pro
 - 🎯 No duplicate page visits
-- 📊 Generates JSONL output file for ML training
+- 📊 Generates JSONL and raw text output file for ML training
 - 📊 AI-friendly clean text output (perfect for LLM fine-tuning!)
 
 ## 🛠️ Prerequisites
@@ -43,11 +43,12 @@ npm install
 const WebScraper = require('clean-web-scraper');
 
 const scraper = new WebScraper({
-  baseURL: 'https://example.com',
-
-  excludeList: ['/admin', '/private'],
-  exactExcludeList: ['/specific-page']
-  jsonlPath: 'output.jsonl' // Optional: Custom JSONL output path
+  baseURL: 'https://example.com', // Required: The website to scrape
+  scrapResultPath: './output', // Required: Where to save the content
+  excludeList: ['/admin', '/private'], // Optional: Paths to exclude
+  exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
+  jsonlPath: 'output.jsonl', // Optional: Custom JSONL output path
+  textOutputPath: "./dataset/texts" // Optional: Custom text output path
 });
 
 scraper.start();
```
package/example-usage.js
CHANGED
```diff
@@ -1,7 +1,8 @@
 const WebScraper = require( "./src/WebScraper" );
 
+// Configuration
 const baseURL = "https://decolonizepalestine.com";
-const
+const scrapResultPath = "./dataset";
 const excludeList = [
 	"https://decolonizepalestine.com/cdn-cgi",
 	"https://decolonizepalestine.com/introduction-to-palestine",
@@ -14,12 +15,13 @@ const exactExcludeList = [
 	"https://decolonizepalestine.com/"
 ]
 
-
+// Initialize scraper with all available options
 const scraper = new WebScraper({
 	baseURL,
-
+	scrapResultPath,
 	excludeList,
 	exactExcludeList,
-	jsonlPath: "./dataset/
+	jsonlPath: "./dataset/train.jsonl",
+	textOutputPath: "./dataset/texts"
 });
 scraper.start();
```
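With those options filled in, the example should run directly with `node example-usage.js` from the package root (assuming dependencies are installed), writing both `./dataset/train.jsonl` and the numbered text files under `./dataset/texts`.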
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
```diff
@@ -7,11 +7,19 @@ const path = require( "path" );
 
 class WebScraper
 {
-	constructor ({
+	constructor ({
+		baseURL,
+		excludeList,
+		exactExcludeList,
+		scrapResultPath = "./dataset",
+		jsonlPath,
+		textOutputPath
+	})
 	{
 		this.baseURL = baseURL;
-		this.
-		this.
+		this.scrapResultPath = path.join( scrapResultPath, baseURL.replace( /^(https?:\/\/)?(www\.)?/, "" ).replace( /\/$/, "" ) );
+		this.jsonlPath = jsonlPath || path.join( this.scrapResultPath, "train.jsonl" );
+		this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
 		this.visited = new Set();
 		this.excludeList = new Set( excludeList );
 		this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
```
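One behavioral change worth noting in this hunk: the output directory is now derived from `baseURL`, with the scheme, a leading `www.`, and any trailing slash stripped before joining onto `scrapResultPath`. A minimal sketch of what that replace chain produces (the `deriveDir` helper is hypothetical, but the two `.replace` calls are copied verbatim from the new constructor line):

```js
const path = require( "path" );

// Hypothetical helper; the replace chain matches the new constructor logic
function deriveDir ( scrapResultPath, baseURL )
{
	return path.join( scrapResultPath, baseURL.replace( /^(https?:\/\/)?(www\.)?/, "" ).replace( /\/$/, "" ) );
}

console.log( deriveDir( "./dataset", "https://www.example.com/" ) );        // dataset/example.com
console.log( deriveDir( "./dataset", "https://decolonizepalestine.com" ) ); // dataset/decolonizepalestine.com
```

With the defaults, `train.jsonl` and the `texts` folder then land inside that derived directory unless `jsonlPath` or `textOutputPath` are passed explicitly.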
```diff
@@ -24,6 +32,8 @@ class WebScraper
 		this.visited.add( this.baseURL );
 		await this.fetchPage( this.baseURL );
 		this.createJSONLFile();
+		this.saveNumberedTextFiles();
+		console.log( "Scraping completed." );
 	}
 
 	async fetchPage ( url )
@@ -104,7 +114,7 @@ class WebScraper
 		{
 			urlPath = "/index";
 		}
-		const filePath = path.join( __dirname, this.folderPath, urlPath );
+		const filePath = path.join( __dirname, this.scrapResultPath, urlPath );
 		const dir = path.dirname( filePath );
 
 		// Create metadata object
@@ -142,6 +152,17 @@ class WebScraper
 		console.log( `Created JSONL file at: ${this.jsonlPath}` );
 	}
 
+	saveNumberedTextFiles ()
+	{
+		this.processedContent.forEach( ( content, index ) =>
+		{
+			const fileName = `${index + 1}.txt`;
+			const filePath = path.join( __dirname, this.textOutputPath, fileName );
+			fs.writeFileSync( filePath, content.text, "utf-8" );
+			console.log( `Created numbered text file: ${fileName}` );
+		});
+	}
+
 	processContent ( content )
 	{
 		let processed = content;
```
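The new `saveNumberedTextFiles` method writes one page per file as `1.txt`, `2.txt`, … in scrape order. A small sketch of reading them back for downstream use (the directory path is an assumption based on the default `scrapResultPath`/`textOutputPath` logic above; note the numeric sort, since a lexicographic sort would put `10.txt` before `2.txt`):

```js
const fs = require( "fs" );
const path = require( "path" );

// Assumed location, following the constructor defaults shown above
const textsDir = "./dataset/example.com/texts";

const pages = fs.readdirSync( textsDir )
	.filter( name => name.endsWith( ".txt" ) )
	.sort( ( a, b ) => parseInt( a, 10 ) - parseInt( b, 10 ) )
	.map( name => fs.readFileSync( path.join( textsDir, name ), "utf-8" ) );

console.log( `Loaded ${pages.length} scraped pages` );
```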
```diff
@@ -194,14 +215,12 @@ class WebScraper
 
 	createOutputDirectory ()
 	{
-		if ( fs.existsSync( path.join( __dirname, this.folderPath ) ) )
-		{
-			fs.rmSync( path.join( __dirname, this.folderPath ), { recursive: true, force: true });
-		}
-		if ( !fs.existsSync( path.join( __dirname, this.folderPath ) ) )
+		if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
 		{
-			fs.mkdirSync( path.join( __dirname, this.folderPath ), { recursive: true });
+			fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
 		}
+		fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
+		fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
 	}
 }
 
```
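Note that `createOutputDirectory` still clears previous results on every run: any existing `scrapResultPath` directory is removed with `fs.rmSync` before both the result and text directories are recreated, so rerunning the scraper replaces earlier output rather than appending to it.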