npm - clean-web-scraper - Versions diffs - 3.0.0 → 3.2.0 - Mend

clean-web-scraper 3.0.0 → 3.2.0

This diff shows the changes between publicly available versions of the package as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (3) hide show

package/example-usage.js CHANGED Viewed

@@ -20,9 +20,9 @@ async function khameneiIrFreePalestineTag ()
 		textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
 		csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
 		includeMetadata: true,
-		metadataFields: ["title", "description", "author", "lastModified", "language", "ogTags"]
+		metadataFields: ["title", "description", "author", "lastModified", "language"]
 	});
-	await scraper.start();
+	// await scraper.start();
 	return scraper;
 }
@@ -50,7 +50,7 @@ async function decolonizepalestine ()
 		includeMetadata: true,
 		metadataFields: ["title", "description", "author", "lastModified", "language"]
 	});
-	await scraper.start();
+	// await scraper.start();
 	return scraper;
 }
@@ -58,6 +58,7 @@ void async function main ()
 {
 	const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
 	const decolonizepalestineScraper = await decolonizepalestine();
+	await WebScraper.sleep( 1000 ); // Sleeps for 1 second
 	WebScraper.combineResults( "./dataset/combined", [
 		khameneiIrFreePalestineTagScraper,
 		decolonizepalestineScraper

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "clean-web-scraper",
-  "version": "3.0.0",
+  "version": "3.2.0",
   "main": "main.js",
   "scripts": {
     "start": "node main.js",

package/src/WebScraper.js CHANGED Viewed

@@ -155,9 +155,14 @@ class WebScraper
 	{
 		const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
 		let writeStreamMeta
+		// Add error handlers
+		writeStreamSimple.on( "error", ( err ) => { return console.error( "Error writing JSONL:", err ) });
 		if ( this.includeMetadata )
 		{
 			writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPathWithMeta ) );
+			writeStreamMeta.on( "error", ( err ) => { return console.error( "Error writing metadata JSONL:", err ) });
 		}
 		for ( const content of this.allProcessedContent )
 		{
@@ -171,6 +176,7 @@ class WebScraper
 		if ( this.includeMetadata )
 		{
 			writeStreamMeta.end();
+			console.log( `Created JSONL file at: ${this.jsonlOutputPathWithMeta}` );
 		}
 		console.log( `Created JSONL file at: ${this.jsonlOutputPath}` );
 	}
@@ -179,6 +185,7 @@ class WebScraper
 	{
 		// Create simple version
 		const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
+		writeStreamSimple.on( "error", ( err ) => { return console.error( "Error writing CSV:", err ) });
 		writeStreamSimple.write( "text\n" );
 		// Create metadata version if requested
@@ -186,6 +193,7 @@ class WebScraper
 		if ( this.includeMetadata )
 		{
 			writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.csvOutputPathWithMeta ) );
+			writeStreamMeta.on( "error", ( err ) => { return console.error( "Error writing metadata CSV:", err ) });
 		}
 		if ( this.includeMetadata )
@@ -359,11 +367,17 @@ class WebScraper
 		if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
 		{
 			fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
+			fs.rmSync( path.join( __dirname, this.textOutputPath ), { recursive: true, force: true });
 		}
 		fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
 		fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
 	}
+	static sleep ( ms )
+	{
+		return new Promise( resolve => { return setTimeout( resolve, ms ) });
+	}
 	static combineResults ( outputPath, websites )
 	{
 		const fullOutputPath = path.join( __dirname, outputPath );
@@ -371,33 +385,58 @@ class WebScraper
 		// Create output directories
 		fs.mkdirSync( fullOutputPath, { recursive: true });
 		fs.mkdirSync( path.join( fullOutputPath, "texts" ), { recursive: true });
+		fs.mkdirSync( path.join( fullOutputPath, "texts_with_metadata" ), { recursive: true });
+		// Combine regular JSONL files
+		const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) )
+		    .on( "error", ( err ) => { return console.error( "Error combining JSONL:", err ) });
+		const jsonlMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.jsonl" ) )
+		    .on( "error", ( err ) => { return console.error( "Error combining metadata JSONL:", err ) });
+		const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
+		const csvMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.csv" ) );
-		// Combine JSONL files
-		const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) );
+		csvOutput.write( "text\n" );
+		const metadataFields = websites.find( w => { return w.includeMetadata })?.metadataFields || new Set();
+		if ( metadataFields.size > 0 )
+		{
+			csvMetaOutput.write( `text,${Array.from( metadataFields ).join( "," )}\n` );
+		}
 		for ( const website of websites )
 		{
 			const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
 			jsonlOutput.write( jsonlContent );
-		}
-		jsonlOutput.end();
-		// Combine CSV files
-		const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
-		csvOutput.write( "text\n" );
-		for ( const website of websites )
-		{
 			const csvContent = fs.readFileSync( path.join( __dirname, website.csvOutputPath ), "utf-8" )
 			.split( "\n" )
-			.slice( 1 ) // Skip header
+			.slice( 1 )
 			.filter( line => { return line.trim() });
-			csvOutput.write( `${csvContent.join( "\n" ) }\n` );
+			csvOutput.write( `${csvContent.join( "\n" )}\n` );
+			// Combine metadata files if they exist
+			if ( website.includeMetadata )
+			{
+				const jsonlMetaContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPathWithMeta ), "utf-8" );
+				jsonlMetaOutput.write( jsonlMetaContent );
+				const csvMetaContent = fs.readFileSync( path.join( __dirname, website.csvOutputPathWithMeta ), "utf-8" )
+				.split( "\n" )
+				.slice( 1 )
+				.filter( line => { return line.trim() });
+				csvMetaOutput.write( `${csvMetaContent.join( "\n" )}\n` );
+			}
 		}
+		// Close all streams
+		jsonlOutput.end();
+		jsonlMetaOutput.end();
 		csvOutput.end();
+		csvMetaOutput.end();
-		// Combine text files
+		// Combine text files (both regular and metadata versions)
 		let textFileCounter = 1;
 		for ( const website of websites )
 		{
+			// Regular text files
 			const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
 			for ( const file of textFiles )
 			{
@@ -407,6 +446,20 @@ class WebScraper
 					content,
 					"utf-8"
 				);
+				// Metadata text files if they exist
+				if ( website.includeMetadata )
+				{
+					const metaContent = fs.readFileSync(
+						path.join( __dirname, `${website.textOutputPath}_with_metadata`, file ),
+						"utf-8"
+					);
+					fs.writeFileSync(
+						path.join( fullOutputPath, "texts_with_metadata", `${textFileCounter}.txt` ),
+						metaContent,
+						"utf-8"
+					);
+				}
 				textFileCounter++;
 			}
 		}