clean-web-scraper 2.3.0 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +21 -19
- package/example-usage.js +10 -8
- package/package.json +1 -1
- package/src/WebScraper.js +34 -24
package/README.md (CHANGED)
````diff
@@ -44,14 +44,16 @@ npm install
 const WebScraper = require('clean-web-scraper');
 
 const scraper = new WebScraper({
-  baseURL: 'https://example.com',
-  startURL: 'https://example.com/blog',
-  excludeList: ['/admin', '/private'],
-  exactExcludeList: ['/specific-page'],
-  scrapResultPath: './
-
-  textOutputPath: "./
-
+  baseURL: 'https://example.com/news', // Required: The website base url to scrape
+  startURL: 'https://example.com/blog', // Optional: Custom starting URL
+  excludeList: ['/admin', '/private'], // Optional: Paths to exclude
+  exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
+  scrapResultPath: './example.com/website', // Required: Where to save the content
+  jsonlOutputPath: './example.com/train.jsonl', // Optional: Custom JSONL output path
+  textOutputPath: "./example.com/texts", // Optional: Custom text output path
+  csvOutputPath: "./example.com/train.csv" // Optional: Custom CSV output path
+  maxDepth: 3, // Optional: Maximum depth for recursive crawling
+  includeTitles: true, // Optional: Include page titles in outputs
 });
 
 scraper.start();
@@ -73,18 +75,18 @@ Your AI-ready content is saved in a clean, structured format:
 - 📈 CSV output with clean text content
 
 ```bash
-
-├──
+example.com/
+├── website/
 │   ├── page1.txt    # Clean text content
 │   ├── page1.json   # Full metadata
-│
-│
-│
-
-│
-│
-
-
+│   └── blog/
+│       ├── post1.txt
+│       └── post1.json
+│── texts/           # Numbered text files
+│   ├── 1.txt
+│   ├── 2.txt
+│── train.jsonl      # Combined content
+└── train.csv        # Clean text in CSV format
 ```
 
 ## 🤖 AI/LLM Training Ready
@@ -92,7 +94,7 @@ dataset/
 The output is specifically formatted for AI training purposes:
 
 - Clean, processed text without HTML markup
--
+- Multiple formats (JSONL, CSV, text files)
 - Structured content perfect for fine-tuning LLMs
 - Ready to use in your ML pipelines
 
````
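
The README's new JSONL output is the piece most likely to feed straight into a training pipeline. As a minimal sketch of consuming it in Node, assuming each line is a `{ text, metadata }` object as the `WebScraper.js` changes below suggest:

```js
const fs = require("fs");
const readline = require("readline");

// Read train.jsonl line by line; each non-empty line is one scraped page.
async function readDataset (jsonlPath)
{
	const rl = readline.createInterface({ input: fs.createReadStream(jsonlPath) });
	const entries = [];
	for await (const line of rl)
	{
		if (line.trim()) entries.push(JSON.parse(line));
	}
	return entries;
}

// Example: count pages and peek at the first title (path from the README example).
readDataset("./example.com/train.jsonl")
	.then((entries) => console.log(entries.length, entries[0]?.metadata?.title));
```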
package/example-usage.js (CHANGED)
```diff
@@ -9,19 +9,21 @@ async function khameneiIrFreePalestineTag ()
 	const scraper = new WebScraper({
 		baseURL: "https://english.khamenei.ir/news",
 		startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
+		maxDepth: 1,
 		excludeList: [
 		],
 		exactExcludeList: [
+			"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#"
 		],
-		scrapResultPath: "./dataset/khamenei-ir-free-palestine-tag",
-
+		scrapResultPath: "./dataset/khamenei-ir-free-palestine-tag/website",
+		jsonlOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.jsonl",
 		textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
-
+		csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
+		includeTitles: true
 	});
 	await scraper.start();
 }
 
-// decolonizepalestine
 async function decolonizepalestine ()
 {
 	// 2
@@ -39,10 +41,10 @@ async function decolonizepalestine ()
 			"https://decolonizepalestine.com/rainbow-washing",
 			"https://decolonizepalestine.com/"
 		],
-		scrapResultPath: "./dataset/decolonizepalestine",
-
+		scrapResultPath: "./dataset/decolonizepalestine/website",
+		jsonlOutputPath: "./dataset/decolonizepalestine/train.jsonl",
 		textOutputPath: "./dataset/decolonizepalestine/texts",
-
+		csvOutputPath: "./dataset/decolonizepalestine/train.csv"
 	});
 	await scraper.start();
 }
@@ -50,7 +52,7 @@ async function decolonizepalestine ()
 void async function main ()
 {
 	await khameneiIrFreePalestineTag();
-
+	await decolonizepalestine();
 
 
 	// 3
```
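
Both call sites now follow the same convention: `scrapResultPath` points at a `website/` subfolder, so the JSONL, CSV, and numbered text outputs sit beside it under one dataset directory. For the first example above, the resulting layout should look roughly like:

```bash
dataset/khamenei-ir-free-palestine-tag/
├── website/        # per-page .txt and .json files
├── texts/          # numbered text files (1.txt, 2.txt, ...)
├── train.jsonl
└── train.csv
```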
package/package.json (CHANGED)
package/src/WebScraper.js (CHANGED)
```diff
@@ -10,38 +10,46 @@ class WebScraper
 	constructor ({
 		baseURL,
 		startURL,
+		maxDepth = Infinity,
 		excludeList,
 		exactExcludeList,
 		scrapResultPath = "./dataset",
-
+		jsonlOutputPath,
 		textOutputPath,
-
+		csvOutputPath,
+		includeTitles = false
 	})
 	{
 		this.baseURL = baseURL;
 		this.startURL = startURL || baseURL;
-		this.
-		this.
+		this.maxDepth = maxDepth;
+		this.scrapResultPath = scrapResultPath;
+		this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
 		this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
-		this.
+		this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
+		this.includeTitles = includeTitles;
 		this.visited = new Set();
 		this.excludeList = new Set( excludeList );
 		this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
-		this.allProcessedContent = [];
+		this.allProcessedContent = [];
 		this.createOutputDirectory();
 	}
 
 	async start ()
 	{
-		await this.fetchPage( this.startURL );
+		await this.fetchPage( this.startURL, 0 );
 		this.createJSONLFile();
 		this.saveNumberedTextFiles();
 		this.createCSVFile();
 		console.log( "Scraping completed." );
 	}
 
-	async fetchPage ( url )
+	async fetchPage ( url, depth )
 	{
+		if ( depth > this.maxDepth )
+		{
+			return;
+		}
 		this.visited.add( url );
 		try
 		{
```
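
The constructor change is mostly about defaults: any of the new output paths may be omitted and is derived from `scrapResultPath`. A standalone sketch of that fallback rule (not the full class):

```js
const path = require("path");

// Mirrors the constructor above: omitted paths land next to scrapResultPath.
function resolveOutputs ({ scrapResultPath = "./dataset", jsonlOutputPath, textOutputPath, csvOutputPath })
{
	return {
		jsonl: jsonlOutputPath || path.join( scrapResultPath, "train.jsonl" ),
		texts: textOutputPath || path.join( scrapResultPath, "texts" ),
		csv: csvOutputPath || path.join( scrapResultPath, "train.csv" )
	};
}

console.log( resolveOutputs({ scrapResultPath: "./dataset/site" }) );
// -> { jsonl: 'dataset/site/train.jsonl', texts: 'dataset/site/texts', csv: 'dataset/site/train.csv' }
```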
```diff
@@ -57,6 +65,7 @@ class WebScraper
 			if ( article )
 			{
 				const metadata = this.metadataextractor( url, document, headers );
+				metadata.depth = depth;
 				this.saveArticle( url, article.textContent, metadata );
 			}
 			else
```
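
Recording the crawl depth in each page's metadata makes it cheap to filter a dataset after the fact, for instance keeping only pages close to the start URL. A sketch, assuming the `{ text, metadata }` entry shape used throughout this diff:

```js
// Keep entries found at most maxDepth hops from the start URL.
function filterByDepth (entries, maxDepth)
{
	return entries.filter( (entry) => entry.metadata.depth <= maxDepth );
}
```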
```diff
@@ -70,7 +79,7 @@ class WebScraper
 		{
 			if ( !this.visited.has( link ) )
 			{
-				await this.fetchPage( link );
+				await this.fetchPage( link, depth + 1 );
 			}
 		}
 	}
```
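
Taken together, the `fetchPage` changes implement a depth-limited crawl: the counter starts at 0, increments by one per hop, and the guard stops recursion past `maxDepth`. A minimal self-contained sketch of the same control flow, where `extractLinks` is a hypothetical stand-in for the class's real link extraction:

```js
// Hypothetical stand-in; the real class pulls links out of the fetched document.
async function extractLinks (url)
{
	return [];
}

async function crawl (url, depth, { maxDepth, visited })
{
	if ( depth > maxDepth ) return; // same guard as fetchPage
	visited.add( url );
	for ( const link of await extractLinks( url ) )
	{
		if ( !visited.has( link ) ) await crawl( link, depth + 1, { maxDepth, visited });
	}
}

// With maxDepth: 1 this visits the start URL and its direct links only:
// crawl( "https://example.com", 0, { maxDepth: 1, visited: new Set() });
```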
```diff
@@ -122,23 +131,16 @@ class WebScraper
 		const filePath = path.join( __dirname, this.scrapResultPath, urlPath );
 		const dir = path.dirname( filePath );
 
-		// Create directory if it doesn't exist
 		fs.mkdirSync( dir, { recursive: true });
-
-		// Save the text content
 		fs.writeFileSync( `${filePath}.txt`, processedContent, "utf-8" );
-
-		// Save the JSON metadata
 		fs.writeFileSync( `${filePath}.json`, JSON.stringify( metadata, null, 2 ), "utf-8" );
-
 		console.log( `Saved: ${filePath}.txt` );
 		console.log( `Saved: ${filePath}.json` );
 	}
 
 	createJSONLFile ()
 	{
-		const writeStream = fs.createWriteStream( path.join( __dirname, this.
-
+		const writeStream = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
 		for ( const content of this.allProcessedContent )
 		{
 			const jsonLine = `${JSON.stringify( content )}\n`;
```
```diff
@@ -146,24 +148,27 @@ class WebScraper
 		}
 
 		writeStream.end();
-		console.log( `Created JSONL file at: ${this.
+		console.log( `Created JSONL file at: ${this.jsonlOutputPath}` );
 	}
 
 	createCSVFile ()
 	{
-		const writeStream = fs.createWriteStream( path.join( __dirname, this.
-
+		const writeStream = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
 		writeStream.write( "text\n" );
-
 		for ( const content of this.allProcessedContent )
 		{
-
+			let fullText = content.text;
+			if ( this.includeTitles && content.metadata.title )
+			{
+				fullText = `Title: ${content.metadata.title}\n\n${content.text}`;
+			}
+			const escapedText = fullText.replace( /"/g, "\"\"" );
 			const csvLine = `"${escapedText}"\n`;
 			writeStream.write( csvLine );
 		}
 
 		writeStream.end();
-		console.log( `Created CSV file at: ${this.
+		console.log( `Created CSV file at: ${this.csvOutputPath}` );
 	}
 
 	saveNumberedTextFiles ()
```
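
The new CSV escaping doubles embedded double quotes and wraps each record in quotes, which is the standard (RFC 4180 style) way to keep quotes and newlines inside a single field. In isolation:

```js
// Double embedded quotes, then wrap the whole record; newlines stay inside the field.
function toCsvRecord (text)
{
	return `"${text.replace( /"/g, "\"\"" )}"\n`;
}

console.log( toCsvRecord( "He said \"hi\".\nSecond line." ) );
// "He said ""hi"".
// Second line."
```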
```diff
@@ -172,7 +177,12 @@ class WebScraper
 	{
 		const fileName = `${index + 1}.txt`;
 		const filePath = path.join( __dirname, this.textOutputPath, fileName );
-
+		let titlePrefix = "";
+		if ( this.includeTitles && content.metadata.title )
+		{
+			titlePrefix = `Title: ${content.metadata.title}\n\n`;
+		}
+		fs.writeFileSync( filePath, titlePrefix + content.text, "utf-8" );
 		console.log( `Created numbered text file: ${fileName}` );
 	});
 }
```
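
`saveNumberedTextFiles` and `createCSVFile` now apply the same `includeTitles` rule, so a title-prefixed record reads identically across outputs. The shared logic, extracted as a sketch:

```js
// Prefix a page's text with its title when includeTitles is on and a title exists.
function withTitle (text, title, includeTitles)
{
	return ( includeTitles && title ) ? `Title: ${title}\n\n${text}` : text;
}

console.log( withTitle( "Body text.", "Example post", true ) );
// Title: Example post
//
// Body text.
```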