npm - clean-web-scraper - Versions diffs - 2.3.3 → 3.1.0 - Mend

clean-web-scraper 2.3.3 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/README.md CHANGED

@@ -52,9 +52,10 @@ const scraper = new WebScraper({
   scrapResultPath: './example.com/website',     // Required: Where to save the content
   jsonlOutputPath: './example.com/train.jsonl', // Optional: Custom JSONL output path
   textOutputPath: "./example.com/texts",        // Optional: Custom text output path
-  csvOutputPath: "./example.com/train.csv"      // Optional: Custom CSV output path
+  csvOutputPath: "./example.com/train.csv",     // Optional: Custom CSV output path
   maxDepth: 3,                                  // Optional: Maximum depth for recursive crawling
-  includeTitles: true,                          // Optional: Include page titles in outputs
+  includeMetadata: false,                       // Optional: Include metadata in output files
+  metadataFields: ['title', 'description']      // Optional: Specify metadata fields to include
 });
 scraper.start();
@@ -85,11 +86,16 @@ example.com/
 │   └── blog/
 │       ├── post1.txt
 │       └── post1.json
-│── texts/           # Numbered text files
-│       ├── 1.txt
-│       ├── 2.txt
-│── train.jsonl      # Combined content
-└── train.csv        # Clean text in CSV format
+├── texts/                # Numbered text files
+│   ├── 1.txt
+│   └── 2.txt
+├── texts_with_metadata/  # When includeMetadata is true
+│   ├── 1.txt
+│   └── 2.txt
+├── train.jsonl           # Combined content
+├── train_with_metadata.jsonl  # When includeMetadata is true
+├── train.csv             # Clean text in CSV format
+└── train_with_metadata.csv    # When includeMetadata is true
 ```
 ## 🤖 AI/LLM Training Ready

package/example-usage.js CHANGED

@@ -19,9 +19,10 @@ async function khameneiIrFreePalestineTag ()
 		jsonlOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.jsonl",
 		textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
 		csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
-		includeTitles: true
+		includeMetadata: true,
+		metadataFields: ["title", "description", "author", "lastModified", "language"]
 	});
-	// await scraper.start();
+	await scraper.start();
 	return scraper;
 }
@@ -45,9 +46,11 @@ async function decolonizepalestine ()
 		scrapResultPath: "./dataset/decolonizepalestine/website",
 		jsonlOutputPath: "./dataset/decolonizepalestine/train.jsonl",
 		textOutputPath: "./dataset/decolonizepalestine/texts",
-		csvOutputPath: "./dataset/decolonizepalestine/train.csv"
+		csvOutputPath: "./dataset/decolonizepalestine/train.csv",
+		includeMetadata: true,
+		metadataFields: ["title", "description", "author", "lastModified", "language"]
 	});
-	// await scraper.start();
+	await scraper.start();
 	return scraper;
 }

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "clean-web-scraper",
-  "version": "2.3.3",
+  "version": "3.1.0",
   "main": "main.js",
   "scripts": {
     "start": "node main.js",
@@ -16,7 +16,8 @@
     "ai-ready-web-scraper",
     "ai",
     "fine-tune",
-    "data-processing"
+    "data-processing",
+    "dataset"
   ],
   "author": "",
   "license": "ISC",

package/src/WebScraper.js CHANGED

@@ -17,7 +17,8 @@ class WebScraper
 		jsonlOutputPath,
 		textOutputPath,
 		csvOutputPath,
-		includeTitles = false
+		includeMetadata = false,
+		metadataFields = [] // ['title', 'description', 'author', 'lastModified', etc.]
 	})
 	{
 		this.baseURL = baseURL;
@@ -27,7 +28,10 @@ class WebScraper
 		this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
 		this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
 		this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
-		this.includeTitles = includeTitles;
+		this.jsonlOutputPathWithMeta = jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
+		this.csvOutputPathWithMeta = csvOutputPath.replace( ".csv", "_with_metadata.csv" );
+		this.includeMetadata = includeMetadata;
+	   this.metadataFields = new Set( metadataFields );
 		this.visited = new Set();
 		this.excludeList = new Set( excludeList );
 		this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
@@ -118,9 +122,18 @@ class WebScraper
 	{
 		const processedContent = this.processContent( content );
-		this.allProcessedContent.push({
+		const simpleContent = {
+			text: processedContent.trim()
+		};
+		const contentWithMetadata = {
 			text: processedContent.trim(),
-			metadata
+			metadata: this.filterMetadata( metadata )
+		};
+		this.allProcessedContent.push({
+			simple: simpleContent,
+			withMetadata: contentWithMetadata
 		});
 		let urlPath = new URL( url ).pathname;
@@ -140,50 +153,118 @@ class WebScraper
 	createJSONLFile ()
 	{
-		const writeStream = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
+		const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
+		let writeStreamMeta
+		if ( this.includeMetadata )
+		{
+			writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPathWithMeta ) );
+		}
 		for ( const content of this.allProcessedContent )
 		{
-			const jsonLine = `${JSON.stringify( content )}\n`;
-			writeStream.write( jsonLine );
+			writeStreamSimple.write( `${JSON.stringify( content.simple )}\n` );
+			if ( this.includeMetadata )
+			{
+				writeStreamMeta.write( `${JSON.stringify( content.withMetadata )}\n` );
+			}
+		}
+		writeStreamSimple.end();
+		if ( this.includeMetadata )
+		{
+			writeStreamMeta.end();
 		}
-		writeStream.end();
 		console.log( `Created JSONL file at: ${this.jsonlOutputPath}` );
 	}
 	createCSVFile ()
 	{
-		const writeStream = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
-		writeStream.write( "text\n" );
+		// Create simple version
+		const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
+		writeStreamSimple.write( "text\n" );
+		// Create metadata version if requested
+		let writeStreamMeta;
+		if ( this.includeMetadata )
+		{
+			writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.csvOutputPathWithMeta ) );
+		}
+		if ( this.includeMetadata )
+		{
+			const headers = ["text", ...Array.from( this.metadataFields )].join( "," );
+			writeStreamMeta.write( `${headers}\n` );
+		}
 		for ( const content of this.allProcessedContent )
 		{
-			let fullText = content.text;
-			if ( this.includeTitles && content.metadata.title )
+			// Write simple version
+			const escapedText = content.simple.text.replace( /"/g, "\"\"" );
+			writeStreamSimple.write( `"${escapedText}"\n` );
+			// Write metadata version if requested
+			if ( this.includeMetadata )
 			{
-				fullText = `Title: ${content.metadata.title}\n\n${content.text}`;
+				const { metadata } = content.withMetadata;
+				const metadataValues = Array.from( this.metadataFields )
+				.map( field => { return metadata[field] ? `"${metadata[field].replace( /"/g, "\"\"" )}"` : "\"\"" });
+				writeStreamMeta.write( `"${escapedText}",${metadataValues.join( "," )}\n` );
 			}
-			const escapedText = fullText.replace( /"/g, "\"\"" );
-			const csvLine = `"${escapedText}"\n`;
-			writeStream.write( csvLine );
 		}
-		writeStream.end();
-		console.log( `Created CSV file at: ${this.csvOutputPath}` );
+		writeStreamSimple.end();
+		if ( writeStreamMeta )
+		{
+			writeStreamMeta.end();
+		}
+		console.log( `Created simple CSV file at: ${this.csvOutputPath}` );
+		if ( this.includeMetadata )
+		{
+			console.log( `Created metadata CSV file at: ${this.csvOutputPathWithMeta}` );
+		}
 	}
 	saveNumberedTextFiles ()
 	{
+		// Create base text folder for simple content
+		const baseTextPath = path.join( __dirname, this.textOutputPath );
+		// Create metadata text folder if needed
+		let metaTextPath = null;
+		if ( this.includeMetadata )
+		{
+			metaTextPath = path.join( __dirname, `${this.textOutputPath }_with_metadata` );
+			fs.mkdirSync( metaTextPath, { recursive: true });
+		}
 		this.allProcessedContent.forEach( ( content, index ) =>
 		{
 			const fileName = `${index + 1}.txt`;
-			const filePath = path.join( __dirname, this.textOutputPath, fileName );
-			let titlePrefix = "";
-			if ( this.includeTitles && content.metadata.title )
+			// Always save simple version
+			const simpleFilePath = path.join( baseTextPath, fileName );
+			fs.writeFileSync( simpleFilePath, content.simple.text, "utf-8" );
+			console.log( `Created simple text file: ${fileName}` );
+			// Save metadata version if enabled
+			if ( this.includeMetadata )
 			{
-				titlePrefix = `Title: ${content.metadata.title}\n\n`;
+				const metaFilePath = path.join( metaTextPath, fileName );
+				let fileContent = "";
+				const { metadata } = content.withMetadata;
+				// Add metadata fields as headers
+				for ( const field of this.metadataFields )
+				{
+					if ( metadata[field] )
+					{
+						fileContent += `${field}: ${metadata[field]}\n`;
+					}
+				}
+				fileContent += "\n---\n\n";
+				fileContent += content.withMetadata.text;
+				fs.writeFileSync( metaFilePath, fileContent, "utf-8" );
+				console.log( `Created metadata text file: ${fileName}` );
 			}
-			fs.writeFileSync( filePath, titlePrefix + content.text, "utf-8" );
-			console.log( `Created numbered text file: ${fileName}` );
 		});
 	}
@@ -209,6 +290,21 @@ class WebScraper
 		return processed;
 	}
+	filterMetadata ( metadata )
+	{
+		if ( !this.includeMetadata ) return {};
+		const filteredMetadata = {};
+		for ( const field of this.metadataFields )
+		{
+			if ( metadata[field] && typeof metadata[field] === "string" )
+			{
+				filteredMetadata[field] = metadata[field];
+			}
+		}
+		return filteredMetadata;
+	}
 	metadataextractor ( url, document, headers )
 	{
 		return {
@@ -222,12 +318,10 @@ class WebScraper
 			contentLength: headers["content-length"],
 			language: document.documentElement.lang || document.querySelector( "html" )?.getAttribute( "lang" ),
 			canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
-			ogTags: {
-				title: document.querySelector( "meta[property=\"og:title\"]" )?.content,
-				description: document.querySelector( "meta[property=\"og:description\"]" )?.content,
-				image: document.querySelector( "meta[property=\"og:image\"]" )?.content,
-				type: document.querySelector( "meta[property=\"og:type\"]" )?.content
-			},
+			ogTitle: document.querySelector( "meta[property=\"og:title\"]" )?.content,
+			ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
+			ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
+			ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
 			dateScraped: new Date().toISOString()
 		};
 	}
@@ -265,6 +359,7 @@ class WebScraper
 		if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
 		{
 			fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
+			fs.rmSync( path.join( __dirname, this.textOutputPath ), { recursive: true, force: true });
 		}
 		fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
 		fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
@@ -277,33 +372,56 @@ class WebScraper
 		// Create output directories
 		fs.mkdirSync( fullOutputPath, { recursive: true });
 		fs.mkdirSync( path.join( fullOutputPath, "texts" ), { recursive: true });
+		fs.mkdirSync( path.join( fullOutputPath, "texts_with_metadata" ), { recursive: true });
-		// Combine JSONL files
+		// Combine regular JSONL files
 		const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) );
+		const jsonlMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.jsonl" ) );
+		const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
+		const csvMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.csv" ) );
+		csvOutput.write( "text\n" );
+		const metadataFields = websites.find( w => { return w.includeMetadata })?.metadataFields || new Set();
+		if ( metadataFields.size > 0 )
+		{
+			csvMetaOutput.write( `text,${Array.from( metadataFields ).join( "," )}\n` );
+		}
 		for ( const website of websites )
 		{
 			const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
 			jsonlOutput.write( jsonlContent );
-		}
-		jsonlOutput.end();
-		// Combine CSV files
-		const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
-		csvOutput.write( "text\n" );
-		for ( const website of websites )
-		{
 			const csvContent = fs.readFileSync( path.join( __dirname, website.csvOutputPath ), "utf-8" )
 			.split( "\n" )
-			.slice( 1 ) // Skip header
+			.slice( 1 )
 			.filter( line => { return line.trim() });
-			csvOutput.write( `${csvContent.join( "\n" ) }\n` );
+			csvOutput.write( `${csvContent.join( "\n" )}\n` );
+			// Combine metadata files if they exist
+			if ( website.includeMetadata )
+			{
+				const jsonlMetaContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPathWithMeta ), "utf-8" );
+				jsonlMetaOutput.write( jsonlMetaContent );
+				const csvMetaContent = fs.readFileSync( path.join( __dirname, website.csvOutputPathWithMeta ), "utf-8" )
+				.split( "\n" )
+				.slice( 1 )
+				.filter( line => { return line.trim() });
+				csvMetaOutput.write( `${csvMetaContent.join( "\n" )}\n` );
+			}
 		}
+		// Close all streams
+		jsonlOutput.end();
+		jsonlMetaOutput.end();
 		csvOutput.end();
+		csvMetaOutput.end();
-		// Combine text files
+		// Combine text files (both regular and metadata versions)
 		let textFileCounter = 1;
 		for ( const website of websites )
 		{
+			// Regular text files
 			const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
 			for ( const file of textFiles )
 			{
@@ -313,6 +431,20 @@ class WebScraper
 					content,
 					"utf-8"
 				);
+				// Metadata text files if they exist
+				if ( website.includeMetadata )
+				{
+					const metaContent = fs.readFileSync(
+						path.join( __dirname, `${website.textOutputPath}_with_metadata`, file ),
+						"utf-8"
+					);
+					fs.writeFileSync(
+						path.join( fullOutputPath, "texts_with_metadata", `${textFileCounter}.txt` ),
+						metaContent,
+						"utf-8"
+					);
+				}
 				textFileCounter++;
 			}
 		}