clean-web-scraper 2.2.0 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +19 -2
- package/example-usage.js +60 -26
- package/package.json +1 -1
- package/src/WebScraper.js +44 -27
package/README.md
CHANGED
@@ -13,6 +13,7 @@ A powerful Node.js web scraper that extracts clean, readable content from websites
 - 🎯 No duplicate page visits
 - 📊 Generates JSONL output file for ML training
 - 📊 AI-friendly clean text and csv output (perfect for LLM fine-tuning!)
+- 📊 Rich metadata extraction including:
 
 ## 🛠️ Prerequisites
 
@@ -44,10 +45,11 @@ const WebScraper = require('clean-web-scraper');
 
 const scraper = new WebScraper({
   baseURL: 'https://example.com', // Required: The website to scrape
-
+  startURL: 'https://example.com/blog', // Optional: Custom starting URL
   excludeList: ['/admin', '/private'], // Optional: Paths to exclude
   exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
-
+  scrapResultPath: './dataset', // Required: Where to save the content
+  jsonlPath: './dataset/train.jsonl', // Optional: Custom JSONL output path
   textOutputPath: "./dataset/texts", // Optional: Custom text output path
   csvPath: "./dataset/train.csv" // Optional: Custom CSV output path
 });
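For quick reference, a minimal sketch using only a crawl scope and an output root; it assumes the constructor defaults shown in the `src/WebScraper.js` diff below (`startURL` falls back to `baseURL`, and `jsonlPath`/`textOutputPath` fall back to paths under `scrapResultPath`):

```js
const WebScraper = require( "clean-web-scraper" );

// Minimal sketch — omitted options fall back to defaults derived from scrapResultPath.
const scraper = new WebScraper({
    baseURL: "https://example.com",  // crawl scope and default start page
    scrapResultPath: "./dataset"     // output root; the site host is appended internally
});

scraper.start().then( () => console.log( "Scraping finished" ) );
```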
@@ -70,6 +72,21 @@ Your AI-ready content is saved in a clean, structured format:
 - 📊 JSONL output for ML training
 - 📈 CSV output with clean text content
 
+```bash
+dataset/
+├── example.com/
+│   ├── page1.txt       # Clean text content
+│   ├── page1.json      # Full metadata
+│   ├── blog/
+│   │   ├── post1.txt
+│   │   ├── post1.json
+│   ├── texts/          # Numbered text files
+│   │   ├── 1.txt
+│   │   ├── 2.txt
+│   ├── train.jsonl     # Combined content
+│   └── train.csv       # Clean text in CSV format
+```
+
 ## 🤖 AI/LLM Training Ready
 
 The output is specifically formatted for AI training purposes:
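As a rough illustration of that claim, each line of the generated `train.jsonl` is a standalone JSON object with a `text` field and, from this release on, a `metadata` object, so it can be streamed line by line. A small consumption sketch (the dataset path is illustrative):

```js
const fs = require( "fs" );

// Sketch: read the scraper's JSONL output line by line (path is illustrative).
const lines = fs.readFileSync( "./dataset/example.com/train.jsonl", "utf8" )
    .split( "\n" )
    .filter( Boolean );

for ( const line of lines )
{
    const { text, metadata } = JSON.parse( line ); // one record per line
    console.log( metadata.url, text.length );
}
```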
package/example-usage.js
CHANGED
@@ -1,28 +1,62 @@
 const WebScraper = require( "./src/WebScraper" );
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+async function khameneiIrFreePalestineTag ()
+{
+    // 1
+    // https://english.khamenei.ir/Opinions/FreePalestine
+    // https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#
+    const scraper = new WebScraper({
+        baseURL: "https://english.khamenei.ir/news",
+        startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
+        excludeList: [
+        ],
+        exactExcludeList: [
+        ],
+        scrapResultPath: "./dataset/khamenei-ir-free-palestine-tag",
+        jsonlPath: "./dataset/khamenei-ir-free-palestine-tag/train.jsonl",
+        textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
+        csvPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv"
+    });
+    await scraper.start();
+}
+
+// decolonizepalestine
+async function decolonizepalestine ()
+{
+    // 2
+    // https://decolonizepalestine.com
+    const scraper = new WebScraper({
+        baseURL: "https://decolonizepalestine.com",
+        excludeList: [
+            "https://decolonizepalestine.com/cdn-cgi",
+            "https://decolonizepalestine.com/introduction-to-palestine",
+            "https://decolonizepalestine.com/myths",
+            "https://decolonizepalestine.com/reading-list",
+            "https://decolonizepalestine.com/support-us"
+        ],
+        exactExcludeList: [
+            "https://decolonizepalestine.com/rainbow-washing",
+            "https://decolonizepalestine.com/"
+        ],
+        scrapResultPath: "./dataset/decolonizepalestine",
+        jsonlPath: "./dataset/decolonizepalestine/train.jsonl",
+        textOutputPath: "./dataset/decolonizepalestine/texts",
+        csvPath: "./dataset/decolonizepalestine/train.csv"
+    });
+    await scraper.start();
+}
+
+void async function main ()
+{
+    await khameneiIrFreePalestineTag();
+    // await decolonizepalestine();
+
+
+    // 3
+    // https://bdsmovement.net
+
+    // 4
+    // https://electronicintifada.net/
+}()
+
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
@@ -9,6 +9,7 @@ class WebScraper
 {
     constructor ({
         baseURL,
+        startURL,
         excludeList,
         exactExcludeList,
         scrapResultPath = "./dataset",
@@ -18,6 +19,7 @@ class WebScraper
     })
     {
         this.baseURL = baseURL;
+        this.startURL = startURL || baseURL;
         this.scrapResultPath = path.join( scrapResultPath, baseURL.replace( /^(https?:\/\/)?(www\.)?/, "" ).replace( /\/$/, "" ) );
         this.jsonlPath = jsonlPath || path.join( this.scrapResultPath, "train.jsonl" );
        this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
@@ -25,14 +27,13 @@ class WebScraper
         this.visited = new Set();
         this.excludeList = new Set( excludeList );
         this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
-        this.
+        this.allProcessedContent = []; // Add this line
         this.createOutputDirectory();
     }
 
     async start ()
     {
-        this.
-        await this.fetchPage( this.baseURL );
+        await this.fetchPage( this.startURL );
         this.createJSONLFile();
         this.saveNumberedTextFiles();
         this.createCSVFile();
@@ -41,20 +42,22 @@ class WebScraper
 
     async fetchPage ( url )
     {
+        this.visited.add( url );
         try
         {
-            const { data } = await axios.get( url );
+            const { data, headers } = await axios.get( url );
             const dom = new JSDOM( data, { url });
+            const { document } = dom.window;
 
-            // Only save if the URL is not excluded
             if ( !this.isExcluded( url ) )
             {
-                const reader = new Readability(
+                const reader = new Readability( document, { charThreshold: 500, nbTopCandidates: 20 });
                 const article = reader.parse();
 
                 if ( article )
                 {
-                    this.
+                    const metadata = this.metadataextractor( url, document, headers );
+                    this.saveArticle( url, article.textContent, metadata );
                 }
                 else
                 {
@@ -67,7 +70,6 @@ class WebScraper
         {
             if ( !this.visited.has( link ) )
             {
-                this.visited.add( link );
                 await this.fetchPage( link );
             }
         }
@@ -87,6 +89,10 @@ class WebScraper
         while ( ( match = regex.exec( data ) ) !== null )
         {
             let href = match[2];
+            if ( href.startsWith( "/" ) )
+            {
+                href = new URL( href, this.baseURL ).href
+            }
             if ( href.endsWith( "/" ) )
             {
                 href = href.slice( 0, -1 );
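With this change, root-relative hrefs are resolved against `baseURL` as soon as they are matched, before the trailing-slash normalization that follows. A small sketch of the resulting normalization, with illustrative values:

```js
// Sketch of the href normalization performed in extractLinks (values are illustrative).
const baseURL = "https://example.com";
let href = "/blog/";

if ( href.startsWith( "/" ) )
{
    href = new URL( href, baseURL ).href; // "https://example.com/blog/"
}
if ( href.endsWith( "/" ) )
{
    href = href.slice( 0, -1 );           // "https://example.com/blog"
}
```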
@@ -95,21 +101,17 @@ class WebScraper
             {
                 links.add( href );
             }
-            else if ( href.startsWith( "/" ) )
-            {
-                links.add( new URL( href, this.baseURL ).href );
-            }
         }
-
         return links;
     }
 
-    saveArticle ( url, content )
+    saveArticle ( url, content, metadata )
     {
         const processedContent = this.processContent( content );
 
-        this.
-            text: processedContent.trim()
+        this.allProcessedContent.push({
+            text: processedContent.trim(),
+            metadata
         });
 
         let urlPath = new URL( url ).pathname;
@@ -120,14 +122,6 @@ class WebScraper
         const filePath = path.join( __dirname, this.scrapResultPath, urlPath );
         const dir = path.dirname( filePath );
 
-        // Create metadata object
-        const metadata = {
-            url,
-            dateScraped: new Date().toISOString(),
-            contentLength: processedContent.length,
-            fileName: `${path.basename( filePath )}.txt`
-        };
-
         // Create directory if it doesn't exist
         fs.mkdirSync( dir, { recursive: true });
 
@@ -145,7 +139,7 @@ class WebScraper
     {
         const writeStream = fs.createWriteStream( path.join( __dirname, this.jsonlPath ) );
 
-        for ( const content of this.
+        for ( const content of this.allProcessedContent )
         {
             const jsonLine = `${JSON.stringify( content )}\n`;
             writeStream.write( jsonLine );
@@ -161,7 +155,7 @@ class WebScraper
 
         writeStream.write( "text\n" );
 
-        for ( const content of this.
+        for ( const content of this.allProcessedContent )
         {
             const escapedText = content.text.replace( /"/g, "\"\"" );
             const csvLine = `"${escapedText}"\n`;
@@ -174,7 +168,7 @@ class WebScraper
 
     saveNumberedTextFiles ()
     {
-        this.
+        this.allProcessedContent.forEach( ( content, index ) =>
         {
             const fileName = `${index + 1}.txt`;
             const filePath = path.join( __dirname, this.textOutputPath, fileName );
@@ -205,6 +199,29 @@ class WebScraper
         return processed;
     }
 
+    metadataextractor ( url, document, headers )
+    {
+        return {
+            url,
+            title: document.title,
+            description: document.querySelector( "meta[name=\"description\"]" )?.content,
+            keywords: document.querySelector( "meta[name=\"keywords\"]" )?.content,
+            author: document.querySelector( "meta[name=\"author\"]" )?.content,
+            lastModified: headers["last-modified"],
+            contentType: headers["content-type"],
+            contentLength: headers["content-length"],
+            language: document.documentElement.lang || document.querySelector( "html" )?.getAttribute( "lang" ),
+            canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
+            ogTags: {
+                title: document.querySelector( "meta[property=\"og:title\"]" )?.content,
+                description: document.querySelector( "meta[property=\"og:description\"]" )?.content,
+                image: document.querySelector( "meta[property=\"og:image\"]" )?.content,
+                type: document.querySelector( "meta[property=\"og:type\"]" )?.content
+            },
+            dateScraped: new Date().toISOString()
+        };
+    }
+
     normalizeExcludeList ( list )
     {
         const normalizedSet = new Set();
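Together with the `saveArticle` change above, each JSONL record now pairs the cleaned text with this metadata object. A sketch of the shape of one record, with made-up values and only a subset of the fields (fields missing on a page are `undefined` and are dropped by `JSON.stringify`):

```js
// Illustrative shape of one { text, metadata } record written to train.jsonl.
const record = {
    text: "Clean, readable article text…",
    metadata: {
        url: "https://example.com/blog/post1",
        title: "Post 1",
        description: "Example description",
        language: "en",
        contentType: "text/html; charset=utf-8",
        canonicalUrl: "https://example.com/blog/post1",
        ogTags: { title: "Post 1", type: "article" },
        dateScraped: "2024-10-01T00:00:00.000Z"
    }
};
```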