clean-web-scraper 2.3.2 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -13,7 +13,8 @@ A powerful Node.js web scraper that extracts clean, readable content from websites
  - 🎯 No duplicate page visits
  - 📊 Generates JSONL output file for ML training
  - 📊 AI-friendly clean text and csv output (perfect for LLM fine-tuning!)
- - 📊 Rich metadata extraction including:
+ - 📊 Rich metadata extraction
+ - 📁 Combine results from multiple scrapers into a unified dataset

  ## 🛠️ Prerequisites

@@ -51,12 +52,15 @@ const scraper = new WebScraper({
    scrapResultPath: './example.com/website', // Required: Where to save the content
    jsonlOutputPath: './example.com/train.jsonl', // Optional: Custom JSONL output path
    textOutputPath: "./example.com/texts", // Optional: Custom text output path
-   csvOutputPath: "./example.com/train.csv" // Optional: Custom CSV output path
+   csvOutputPath: "./example.com/train.csv", // Optional: Custom CSV output path
    maxDepth: 3, // Optional: Maximum depth for recursive crawling
-   includeTitles: true, // Optional: Include page titles in outputs
+   includeMetadata: false, // Optional: Include metadata in output files
+   metadataFields: ['title', 'description'] // Optional: Specify metadata fields to include
  });
-
  scraper.start();
+
+ // Combine results from multiple scrapers
+ WebScraper.combineResults('./combined-dataset', [scraper1, scraper2]);
  ```

  ```bash
@@ -82,11 +86,16 @@ example.com/
  │   └── blog/
  │       ├── post1.txt
  │       └── post1.json
- │── texts/               # Numbered text files
- ├── 1.txt
- ├── 2.txt
- │── train.jsonl          # Combined content
- └── train.csv            # Clean text in CSV format
+ ├── texts/                     # Numbered text files
+ │   ├── 1.txt
+ │   └── 2.txt
+ ├── texts_with_metadata/       # When includeMetadata is true
+ │   ├── 1.txt
+ │   └── 2.txt
+ ├── train.jsonl                # Combined content
+ ├── train_with_metadata.jsonl  # When includeMetadata is true
+ ├── train.csv                  # Clean text in CSV format
+ └── train_with_metadata.csv    # When includeMetadata is true
  ```

  ## 🤖 AI/LLM Training Ready
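For context, each line of the generated train.jsonl is a standalone JSON object: `{ text }` in the simple file, `{ text, metadata }` in the `_with_metadata` variant. A minimal sketch of reading it back, assuming the default paths from the example above:

```js
const fs = require( "fs" );

// Illustrative path; match it to your jsonlOutputPath setting.
const lines = fs.readFileSync( "./example.com/train.jsonl", "utf-8" )
    .split( "\n" )
    .filter( line => { return line.trim() });

for ( const line of lines )
{
    const record = JSON.parse( line ); // { text: "..." }
    console.log( record.text.slice( 0, 80 ) );
}
```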
package/example-usage.js CHANGED
@@ -19,9 +19,11 @@ async function khameneiIrFreePalestineTag ()
          jsonlOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.jsonl",
          textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
          csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
-         includeTitles: true
+         includeMetadata: true,
+         metadataFields: ["title", "description", "author", "lastModified", "language", "ogTags"]
      });
      await scraper.start();
+     return scraper;
  }

  async function decolonizepalestine ()
@@ -44,16 +46,22 @@ async function decolonizepalestine ()
          scrapResultPath: "./dataset/decolonizepalestine/website",
          jsonlOutputPath: "./dataset/decolonizepalestine/train.jsonl",
          textOutputPath: "./dataset/decolonizepalestine/texts",
-         csvOutputPath: "./dataset/decolonizepalestine/train.csv"
+         csvOutputPath: "./dataset/decolonizepalestine/train.csv",
+         includeMetadata: true,
+         metadataFields: ["title", "description", "author", "lastModified", "language"]
      });
      await scraper.start();
+     return scraper;
  }

  void async function main ()
  {
-     await khameneiIrFreePalestineTag();
-     await decolonizepalestine();
-
+     const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
+     const decolonizepalestineScraper = await decolonizepalestine();
+     WebScraper.combineResults( "./dataset/combined", [
+         khameneiIrFreePalestineTagScraper,
+         decolonizepalestineScraper
+     ] );

      // 3
      // https://bdsmovement.net
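With includeMetadata enabled as in both functions above, each file under texts_with_metadata/ leads with the whitelisted fields as `field: value` lines, separated from the body by a `---` marker (see saveNumberedTextFiles in the WebScraper.js diff below). An illustrative file, with placeholder values:

```
title: Example Article
description: A short example description
language: en

---

The cleaned article text follows here...
```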
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
      "name": "clean-web-scraper",
-     "version": "2.3.2",
+     "version": "3.0.0",
      "main": "main.js",
      "scripts": {
          "start": "node main.js",
@@ -16,7 +16,8 @@
          "ai-ready-web-scraper",
          "ai",
          "fine-tune",
-         "data-processing"
+         "data-processing",
+         "dataset"
      ],
      "author": "",
      "license": "ISC",
package/src/WebScraper.js CHANGED
@@ -17,7 +17,8 @@ class WebScraper
          jsonlOutputPath,
          textOutputPath,
          csvOutputPath,
-         includeTitles = false
+         includeMetadata = false,
+         metadataFields = [] // ['title', 'description', 'author', 'lastModified', etc.]
      })
      {
          this.baseURL = baseURL;
@@ -27,7 +28,10 @@ class WebScraper
          this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
          this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
          this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
-         this.includeTitles = includeTitles;
+         this.jsonlOutputPathWithMeta = jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
+         this.csvOutputPathWithMeta = csvOutputPath.replace( ".csv", "_with_metadata.csv" );
+         this.includeMetadata = includeMetadata;
+         this.metadataFields = new Set( metadataFields );
          this.visited = new Set();
          this.excludeList = new Set( excludeList );
          this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
@@ -118,9 +122,18 @@ class WebScraper
      {
          const processedContent = this.processContent( content );

-         this.allProcessedContent.push({
+         const simpleContent = {
+             text: processedContent.trim()
+         };
+
+         const contentWithMetadata = {
              text: processedContent.trim(),
-             metadata
+             metadata: this.filterMetadata( metadata )
+         };
+
+         this.allProcessedContent.push({
+             simple: simpleContent,
+             withMetadata: contentWithMetadata
          });

          let urlPath = new URL( url ).pathname;
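After this hunk, each allProcessedContent entry carries both variants side by side. An illustrative sketch of one entry, with placeholder values:

```js
// Placeholder data, not real scraper output.
const entry = {
    simple: { text: "Clean page text." },
    withMetadata: {
        text: "Clean page text.",
        // filterMetadata (added below) keeps only whitelisted string-valued fields
        metadata: { title: "Example title", description: "Example description" }
    }
};
```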
@@ -140,50 +153,118 @@

      createJSONLFile ()
      {
-         const writeStream = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
+         const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
+         let writeStreamMeta;
+         if ( this.includeMetadata )
+         {
+             writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPathWithMeta ) );
+         }
          for ( const content of this.allProcessedContent )
          {
-             const jsonLine = `${JSON.stringify( content )}\n`;
-             writeStream.write( jsonLine );
+             writeStreamSimple.write( `${JSON.stringify( content.simple )}\n` );
+             if ( this.includeMetadata )
+             {
+                 writeStreamMeta.write( `${JSON.stringify( content.withMetadata )}\n` );
+             }
+         }
+         writeStreamSimple.end();
+         if ( this.includeMetadata )
+         {
+             writeStreamMeta.end();
          }
-
-         writeStream.end();
          console.log( `Created JSONL file at: ${this.jsonlOutputPath}` );
      }

      createCSVFile ()
      {
-         const writeStream = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
-         writeStream.write( "text\n" );
+         // Create simple version
+         const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
+         writeStreamSimple.write( "text\n" );
+
+         // Create metadata version if requested
+         let writeStreamMeta;
+         if ( this.includeMetadata )
+         {
+             writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.csvOutputPathWithMeta ) );
+         }
+
+         if ( this.includeMetadata )
+         {
+             const headers = ["text", ...Array.from( this.metadataFields )].join( "," );
+             writeStreamMeta.write( `${headers}\n` );
+         }
+
          for ( const content of this.allProcessedContent )
          {
-             let fullText = content.text;
-             if ( this.includeTitles && content.metadata.title )
+             // Write simple version
+             const escapedText = content.simple.text.replace( /"/g, "\"\"" );
+             writeStreamSimple.write( `"${escapedText}"\n` );
+
+             // Write metadata version if requested
+             if ( this.includeMetadata )
              {
-                 fullText = `Title: ${content.metadata.title}\n\n${content.text}`;
+                 const { metadata } = content.withMetadata;
+                 const metadataValues = Array.from( this.metadataFields )
+                     .map( field => { return metadata[field] ? `"${metadata[field].replace( /"/g, "\"\"" )}"` : "\"\"" });
+                 writeStreamMeta.write( `"${escapedText}",${metadataValues.join( "," )}\n` );
              }
-             const escapedText = fullText.replace( /"/g, "\"\"" );
-             const csvLine = `"${escapedText}"\n`;
-             writeStream.write( csvLine );
          }

-         writeStream.end();
-         console.log( `Created CSV file at: ${this.csvOutputPath}` );
+         writeStreamSimple.end();
+         if ( writeStreamMeta )
+         {
+             writeStreamMeta.end();
+         }
+         console.log( `Created simple CSV file at: ${this.csvOutputPath}` );
+         if ( this.includeMetadata )
+         {
+             console.log( `Created metadata CSV file at: ${this.csvOutputPathWithMeta}` );
+         }
      }

      saveNumberedTextFiles ()
      {
+         // Create base text folder for simple content
+         const baseTextPath = path.join( __dirname, this.textOutputPath );
+
+         // Create metadata text folder if needed
+         let metaTextPath = null;
+         if ( this.includeMetadata )
+         {
+             metaTextPath = path.join( __dirname, `${this.textOutputPath}_with_metadata` );
+             fs.mkdirSync( metaTextPath, { recursive: true });
+         }
+
          this.allProcessedContent.forEach( ( content, index ) =>
          {
              const fileName = `${index + 1}.txt`;
-             const filePath = path.join( __dirname, this.textOutputPath, fileName );
-             let titlePrefix = "";
-             if ( this.includeTitles && content.metadata.title )
+
+             // Always save simple version
+             const simpleFilePath = path.join( baseTextPath, fileName );
+             fs.writeFileSync( simpleFilePath, content.simple.text, "utf-8" );
+             console.log( `Created simple text file: ${fileName}` );
+
+             // Save metadata version if enabled
+             if ( this.includeMetadata )
              {
-                 titlePrefix = `Title: ${content.metadata.title}\n\n`;
+                 const metaFilePath = path.join( metaTextPath, fileName );
+                 let fileContent = "";
+
+                 const { metadata } = content.withMetadata;
+                 // Add metadata fields as headers
+                 for ( const field of this.metadataFields )
+                 {
+                     if ( metadata[field] )
+                     {
+                         fileContent += `${field}: ${metadata[field]}\n`;
+                     }
+                 }
+                 fileContent += "\n---\n\n";
+                 fileContent += content.withMetadata.text;
+
+                 fs.writeFileSync( metaFilePath, fileContent, "utf-8" );
+                 console.log( `Created metadata text file: ${fileName}` );
              }
-             fs.writeFileSync( filePath, titlePrefix + content.text, "utf-8" );
-             console.log( `Created numbered text file: ${fileName}` );
          });
      }

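createCSVFile quotes every field and doubles any embedded quotes, for both the simple and metadata streams: standard RFC 4180 escaping. A quick illustration with a made-up value:

```js
const raw = "She said \"hello\" and left";
const escaped = raw.replace( /"/g, "\"\"" ); // She said ""hello"" and left
const csvField = `"${escaped}"`;             // safe to join with commas, newlines stay inside the quotes
```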
@@ -209,6 +290,21 @@ class WebScraper
          return processed;
      }

+     filterMetadata ( metadata )
+     {
+         if ( !this.includeMetadata ) return {};
+
+         const filteredMetadata = {};
+         for ( const field of this.metadataFields )
+         {
+             if ( metadata[field] && typeof metadata[field] === "string" )
+             {
+                 filteredMetadata[field] = metadata[field];
+             }
+         }
+         return filteredMetadata;
+     }
+
      metadataextractor ( url, document, headers )
      {
          return {
@@ -222,12 +318,10 @@ class WebScraper
              contentLength: headers["content-length"],
              language: document.documentElement.lang || document.querySelector( "html" )?.getAttribute( "lang" ),
              canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
-             ogTags: {
-                 title: document.querySelector( "meta[property=\"og:title\"]" )?.content,
-                 description: document.querySelector( "meta[property=\"og:description\"]" )?.content,
-                 image: document.querySelector( "meta[property=\"og:image\"]" )?.content,
-                 type: document.querySelector( "meta[property=\"og:type\"]" )?.content
-             },
+             ogTitle: document.querySelector( "meta[property=\"og:title\"]" )?.content,
+             ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
+             ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
+             ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
              dateScraped: new Date().toISOString()
          };
      }
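Flattening ogTags into top-level fields matters for filterMetadata above, which keeps only string values: the old nested ogTags object would have been silently dropped. The flat keys can now be whitelisted directly; a sketch, with a placeholder URL:

```js
const scraper = new WebScraper({
    baseURL: "https://example.com", // placeholder
    scrapResultPath: "./example.com/website",
    includeMetadata: true,
    metadataFields: ["title", "ogTitle", "ogDescription"] // flat og keys pass the string check
});
```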
@@ -269,6 +363,54 @@ class WebScraper
          fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
          fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
      }
+
+     static combineResults ( outputPath, websites )
+     {
+         const fullOutputPath = path.join( __dirname, outputPath );
+
+         // Create output directories
+         fs.mkdirSync( fullOutputPath, { recursive: true });
+         fs.mkdirSync( path.join( fullOutputPath, "texts" ), { recursive: true });
+
+         // Combine JSONL files
+         const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) );
+         for ( const website of websites )
+         {
+             const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
+             jsonlOutput.write( jsonlContent );
+         }
+         jsonlOutput.end();
+
+         // Combine CSV files
+         const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
+         csvOutput.write( "text\n" );
+         for ( const website of websites )
+         {
+             const csvContent = fs.readFileSync( path.join( __dirname, website.csvOutputPath ), "utf-8" )
+                 .split( "\n" )
+                 .slice( 1 ) // Skip header
+                 .filter( line => { return line.trim() });
+             csvOutput.write( `${csvContent.join( "\n" )}\n` );
+         }
+         csvOutput.end();
+
+         // Combine text files
+         let textFileCounter = 1;
+         for ( const website of websites )
+         {
+             const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
+             for ( const file of textFiles )
+             {
+                 const content = fs.readFileSync( path.join( __dirname, website.textOutputPath, file ), "utf-8" );
+                 fs.writeFileSync(
+                     path.join( fullOutputPath, "texts", `${textFileCounter}.txt` ),
+                     content,
+                     "utf-8"
+                 );
+                 textFileCounter++;
+             }
+         }
+     }
  }

  module.exports = WebScraper;
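Putting the 3.0.0 pieces together: combineResults reads each scraper's train.jsonl, train.csv, and texts/ from disk, so every start() must finish before it runs. A minimal end-to-end sketch (site URLs and dataset paths are placeholders, and the require assumes the package's main entry re-exports the WebScraper class):

```js
const WebScraper = require( "clean-web-scraper" ); // assumption: main.js re-exports the class

void async function main ()
{
    const scraperA = new WebScraper({
        baseURL: "https://site-a.example", // placeholder
        scrapResultPath: "./dataset/site-a/website",
        includeMetadata: true,
        metadataFields: ["title", "description"]
    });
    const scraperB = new WebScraper({
        baseURL: "https://site-b.example", // placeholder
        scrapResultPath: "./dataset/site-b/website"
    });

    await scraperA.start();
    await scraperB.start();

    // Produces combined.jsonl, combined.csv, and renumbered files under texts/
    WebScraper.combineResults( "./dataset/combined", [scraperA, scraperB] );
}();
```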