clean-web-scraper 2.3.1 → 2.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -11
- package/example-usage.js +17 -10
- package/package.json +1 -1
- package/src/WebScraper.js +82 -23
package/README.md
CHANGED

````diff
@@ -13,7 +13,8 @@ A powerful Node.js web scraper that extracts clean, readable content from websites
 - 🎯 No duplicate page visits
 - 📊 Generates JSONL output file for ML training
 - 📊 AI-friendly clean text and csv output (perfect for LLM fine-tuning!)
-- 📊 Rich metadata extraction
+- 📊 Rich metadata extraction
+- 📁 Combine results from multiple scrapers into a unified dataset
 
 ## 🛠️ Prerequisites
 
@@ -44,17 +45,21 @@ npm install
 const WebScraper = require('clean-web-scraper');
 
 const scraper = new WebScraper({
-  baseURL: 'https://example.com',
-  startURL: 'https://example.com/blog',
-  excludeList: ['/admin', '/private'],
-  exactExcludeList: ['/specific-page'],
-  scrapResultPath: './example.com/website',
-
-  textOutputPath: "./example.com/texts",
-
+  baseURL: 'https://example.com/news', // Required: The website base url to scrape
+  startURL: 'https://example.com/blog', // Optional: Custom starting URL
+  excludeList: ['/admin', '/private'], // Optional: Paths to exclude
+  exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
+  scrapResultPath: './example.com/website', // Required: Where to save the content
+  jsonlOutputPath: './example.com/train.jsonl', // Optional: Custom JSONL output path
+  textOutputPath: "./example.com/texts", // Optional: Custom text output path
+  csvOutputPath: "./example.com/train.csv" // Optional: Custom CSV output path
+  maxDepth: 3, // Optional: Maximum depth for recursive crawling
+  includeTitles: true, // Optional: Include page titles in outputs
 });
-
 scraper.start();
+
+// Combine results from multiple scrapers
+WebScraper.combineResults('./combined-dataset', [scraper1, scraper2]);
 ```
 
 ```bash
@@ -92,7 +97,7 @@ example.com/
 The output is specifically formatted for AI training purposes:
 
 - Clean, processed text without HTML markup
--
+- Multiple formats (JSONL, CSV, text files)
 - Structured content perfect for fine-tuning LLMs
 - Ready to use in your ML pipelines
 
````
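Taken together, the README changes document four new constructor options (`jsonlOutputPath`, `csvOutputPath`, `maxDepth`, `includeTitles`) and the new static `WebScraper.combineResults` helper. As published, the updated usage snippet appears to be missing a comma after the `csvOutputPath` entry, so it would not parse as-is; a minimal runnable sketch of the same configuration (values copied from the README, comments shortened) looks like this:

```js
const WebScraper = require('clean-web-scraper');

const scraper = new WebScraper({
  baseURL: 'https://example.com/news',          // Required: base URL to scrape
  startURL: 'https://example.com/blog',         // Optional: custom starting URL
  excludeList: ['/admin', '/private'],          // Optional: path prefixes to skip
  exactExcludeList: ['/specific-page'],         // Optional: exact URLs to skip
  scrapResultPath: './example.com/website',     // Required: where raw pages are saved
  jsonlOutputPath: './example.com/train.jsonl', // Optional: JSONL output path
  textOutputPath: './example.com/texts',        // Optional: numbered text files
  csvOutputPath: './example.com/train.csv',     // Optional: CSV output path
  maxDepth: 3,                                  // Optional: crawl depth limit
  includeTitles: true                           // Optional: prefix outputs with page titles
});

scraper.start();
```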
package/example-usage.js
CHANGED

```diff
@@ -9,19 +9,22 @@ async function khameneiIrFreePalestineTag ()
 	const scraper = new WebScraper({
 		baseURL: "https://english.khamenei.ir/news",
 		startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
+		maxDepth: 1,
 		excludeList: [
 		],
 		exactExcludeList: [
+			"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#"
 		],
 		scrapResultPath: "./dataset/khamenei-ir-free-palestine-tag/website",
-
+		jsonlOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.jsonl",
 		textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
-
+		csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
+		includeTitles: true
 	});
-	await scraper.start();
+	// await scraper.start();
+	return scraper;
 }
 
-// decolonizepalestine
 async function decolonizepalestine ()
 {
 	// 2
@@ -40,18 +43,22 @@ async function decolonizepalestine ()
 			"https://decolonizepalestine.com/"
 		],
 		scrapResultPath: "./dataset/decolonizepalestine/website",
-
+		jsonlOutputPath: "./dataset/decolonizepalestine/train.jsonl",
 		textOutputPath: "./dataset/decolonizepalestine/texts",
-
+		csvOutputPath: "./dataset/decolonizepalestine/train.csv"
 	});
-	await scraper.start();
+	// await scraper.start();
+	return scraper;
 }
 
 void async function main ()
 {
-
-	await decolonizepalestine();
-
+	const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
+	const decolonizepalestineScraper = await decolonizepalestine();
+	WebScraper.combineResults( "./dataset/combined", [
+		khameneiIrFreePalestineTagScraper,
+		decolonizepalestineScraper
+	] );
 
 	// 3
 	// https://bdsmovement.net
```
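Both helper functions now return their configured scraper with `scraper.start()` commented out, and `main()` hands the un-started scrapers straight to `WebScraper.combineResults`. Because `combineResults` (see the WebScraper.js hunks below) reads each scraper's JSONL, CSV, and text outputs from disk, a fresh checkout would presumably need to run the scrapes first; a hedged variant of `main()` along those lines:

```js
void async function main ()
{
	const khamenei = await khameneiIrFreePalestineTag();
	const decolonize = await decolonizepalestine();

	// Assumption: the per-site outputs are not already on disk;
	// skip these two calls if the dataset/ folders are cached.
	await khamenei.start();
	await decolonize.start();

	WebScraper.combineResults( "./dataset/combined", [ khamenei, decolonize ] );
}();
```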
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED

```diff
@@ -10,38 +10,46 @@ class WebScraper
 	constructor ({
 		baseURL,
 		startURL,
+		maxDepth = Infinity,
 		excludeList,
 		exactExcludeList,
 		scrapResultPath = "./dataset",
-
+		jsonlOutputPath,
 		textOutputPath,
-
+		csvOutputPath,
+		includeTitles = false
 	})
 	{
 		this.baseURL = baseURL;
 		this.startURL = startURL || baseURL;
+		this.maxDepth = maxDepth;
 		this.scrapResultPath = scrapResultPath;
-		this.
+		this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
 		this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
-		this.
+		this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
+		this.includeTitles = includeTitles;
 		this.visited = new Set();
 		this.excludeList = new Set( excludeList );
 		this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
-		this.allProcessedContent = [];
+		this.allProcessedContent = [];
 		this.createOutputDirectory();
 	}
 
 	async start ()
 	{
-		await this.fetchPage( this.startURL );
+		await this.fetchPage( this.startURL, 0 );
 		this.createJSONLFile();
 		this.saveNumberedTextFiles();
 		this.createCSVFile();
 		console.log( "Scraping completed." );
 	}
 
-	async fetchPage ( url )
+	async fetchPage ( url, depth )
 	{
+		if ( depth > this.maxDepth )
+		{
+			return;
+		}
 		this.visited.add( url );
 		try
 		{
@@ -57,6 +65,7 @@ class WebScraper
 			if ( article )
 			{
 				const metadata = this.metadataextractor( url, document, headers );
+				metadata.depth = depth;
 				this.saveArticle( url, article.textContent, metadata );
 			}
 			else
@@ -70,7 +79,7 @@ class WebScraper
 			{
 				if ( !this.visited.has( link ) )
 				{
-					await this.fetchPage( link );
+					await this.fetchPage( link, depth + 1 );
 				}
 			}
 		}
```
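The crawl now carries an explicit depth: `start()` seeds `fetchPage( this.startURL, 0 )`, each followed link is fetched at `depth + 1`, and a page is skipped only once `depth > this.maxDepth`, so `maxDepth: 1` covers the start page plus the pages it links to, with every saved article recording its own depth in the metadata. A small sketch of that behaviour (the example.com values are placeholders):

```js
const WebScraper = require("clean-web-scraper");

const scraper = new WebScraper({
	baseURL: "https://example.com",
	scrapResultPath: "./example.com/website",
	maxDepth: 1 // depth 0 = start page, depth 1 = pages it links to, depth 2+ = skipped
});

// start() internally calls fetchPage( this.startURL, 0 );
// each followed link is fetched with depth + 1, and each saved
// <page>.json metadata file now carries a "depth" field.
scraper.start();
```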
```diff
@@ -122,23 +131,16 @@ class WebScraper
 		const filePath = path.join( __dirname, this.scrapResultPath, urlPath );
 		const dir = path.dirname( filePath );
 
-		// Create directory if it doesn't exist
 		fs.mkdirSync( dir, { recursive: true });
-
-		// Save the text content
 		fs.writeFileSync( `${filePath}.txt`, processedContent, "utf-8" );
-
-		// Save the JSON metadata
 		fs.writeFileSync( `${filePath}.json`, JSON.stringify( metadata, null, 2 ), "utf-8" );
-
 		console.log( `Saved: ${filePath}.txt` );
 		console.log( `Saved: ${filePath}.json` );
 	}
 
 	createJSONLFile ()
 	{
-		const writeStream = fs.createWriteStream( path.join( __dirname, this.
-
+		const writeStream = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
 		for ( const content of this.allProcessedContent )
 		{
 			const jsonLine = `${JSON.stringify( content )}\n`;
@@ -146,24 +148,27 @@ class WebScraper
 		}
 
 		writeStream.end();
-		console.log( `Created JSONL file at: ${this.
+		console.log( `Created JSONL file at: ${this.jsonlOutputPath}` );
 	}
 
 	createCSVFile ()
 	{
-		const writeStream = fs.createWriteStream( path.join( __dirname, this.
-
+		const writeStream = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
 		writeStream.write( "text\n" );
-
 		for ( const content of this.allProcessedContent )
 		{
-
+			let fullText = content.text;
+			if ( this.includeTitles && content.metadata.title )
+			{
+				fullText = `Title: ${content.metadata.title}\n\n${content.text}`;
+			}
+			const escapedText = fullText.replace( /"/g, "\"\"" );
 			const csvLine = `"${escapedText}"\n`;
 			writeStream.write( csvLine );
 		}
 
 		writeStream.end();
-		console.log( `Created CSV file at: ${this.
+		console.log( `Created CSV file at: ${this.csvOutputPath}` );
 	}
 
 	saveNumberedTextFiles ()
@@ -172,7 +177,12 @@ class WebScraper
 		{
 			const fileName = `${index + 1}.txt`;
 			const filePath = path.join( __dirname, this.textOutputPath, fileName );
-
+			let titlePrefix = "";
+			if ( this.includeTitles && content.metadata.title )
+			{
+				titlePrefix = `Title: ${content.metadata.title}\n\n`;
+			}
+			fs.writeFileSync( filePath, titlePrefix + content.text, "utf-8" );
 			console.log( `Created numbered text file: ${fileName}` );
 		});
 	}
```
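With `includeTitles` enabled, both the CSV rows and the numbered text files are prefixed with `Title: <page title>` and a blank line, and double quotes are escaped by doubling them before the row is wrapped in quotes. A quick illustration of the resulting CSV row (the entry below is made up, mirroring the fields the code reads):

```js
// Hypothetical processed entry with the same shape as this.allProcessedContent items
const content = {
	text: 'He said "free" and left.',
	metadata: { title: "Example Page" }
};

let fullText = `Title: ${content.metadata.title}\n\n${content.text}`;
const escapedText = fullText.replace( /"/g, "\"\"" );
console.log( `"${escapedText}"` );
// -> "Title: Example Page
//
//    He said ""free"" and left."
// (the embedded newlines stay inside the quoted CSV field)
```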
```diff
@@ -259,6 +269,55 @@ class WebScraper
 		fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
 		fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
 	}
+
+	static combineResults ( outputPath, websites )
+	{
+		const fullOutputPath = path.join( __dirname, outputPath );
+
+		// Create output directories
+		fs.mkdirSync( fullOutputPath, { recursive: true });
+		fs.mkdirSync( path.join( fullOutputPath, "texts" ), { recursive: true });
+
+		// Combine JSONL files
+		const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) );
+		for ( const website of websites )
+		{
+			const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
+			jsonlOutput.write( jsonlContent );
+		}
+		jsonlOutput.end();
+
+		// Combine CSV files
+		const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
+		csvOutput.write( "text\n" );
+		for ( const website of websites )
+		{
+			const csvContent = fs.readFileSync( path.join( __dirname, website.csvOutputPath ), "utf-8" )
+				.split( "\n" )
+				.slice( 1 ) // Skip header
+				.filter( line => { return line.trim() });
+			csvOutput.write( `${csvContent.join( "\n" ) }\n` );
+		}
+		csvOutput.end();
+
+		// Combine text files
+		let textFileCounter = 1;
+		for ( const website of websites )
+		{
+			const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
+			for ( const file of textFiles )
+			{
+				const content = fs.readFileSync( path.join( __dirname, website.textOutputPath, file ), "utf-8" );
+				fs.writeFileSync(
+					path.join( fullOutputPath, "texts", `${textFileCounter}.txt` ),
+					content,
+					"utf-8"
+				);
+				textFileCounter++;
+			}
+		}
+	}
+
 }
 
 module.exports = WebScraper;
```
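One detail the new `combineResults` shares with the existing writers is that every relative path is joined onto `__dirname`, i.e. the `src` directory of `WebScraper.js` inside the installed package, rather than the caller's working directory. A hedged sketch of where `"./dataset/combined"` would land under a typical npm install (the absolute prefix is an assumption):

```js
const path = require("path");

// Assumption: the package is installed under node_modules, so __dirname inside
// WebScraper.js resolves to <project>/node_modules/clean-web-scraper/src
const webScraperDirname = "/my-project/node_modules/clean-web-scraper/src";

// combineResults( "./dataset/combined", ... ) would therefore write to:
console.log( path.join( webScraperDirname, "./dataset/combined" ) );
// -> /my-project/node_modules/clean-web-scraper/src/dataset/combined
```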