clean-web-scraper 4.1.7 → 4.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/example-usage.js +13 -10
  2. package/main.js +32 -43
  3. package/package.json +1 -1
package/example-usage.js CHANGED
@@ -50,6 +50,7 @@ async function khameneiIrFreePalestineTag ( enable )
 		baseURL: "https://english.khamenei.ir/news",
 		startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
 		maxDepth: 1,
+		maxArticles: 2,
 		exactExcludeList: [
 			"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
 			"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100"
@@ -72,6 +73,7 @@ async function khameneiIrPalestineSpecialPage ( enable )
 		baseURL: "https://english.khamenei.ir/news",
 		startURL: "https://english.khamenei.ir/palestine-special-page",
 		maxDepth: 2,
+		maxArticles: 2,
 		exactExcludeList: [
 			"https://english.khamenei.ir/palestine-special-page/"
 		],
@@ -101,6 +103,7 @@ async function decolonizepalestine ( enable )
 			"https://decolonizepalestine.com/rainbow-washing",
 			"https://decolonizepalestine.com/"
 		],
+		maxArticles: 2,
 		scrapResultPath: "./dataset/decolonizepalestine/website",
 		jsonlOutputPath: "./dataset/decolonizepalestine/train.jsonl",
 		textOutputPath: "./dataset/decolonizepalestine/texts",
@@ -307,7 +310,7 @@ async function palestineremembered ( enable )
 void async function main ()
 {
 	// const palianswersScraper = await palianswers( true );
-	// const decolonizepalestineScraper = await decolonizepalestine( true );
+	const decolonizepalestineScraper = await decolonizepalestine( true );
 	const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( true );
 	// const khameneiIrPalestineSpecialPageScraper = await khameneiIrPalestineSpecialPage( true );
 	// const electronicintifadaScraper = await electronicintifada( true );
@@ -316,13 +319,13 @@ void async function main ()
 	// const bdsmovementScraper = await bdsmovement( false );
 	// const palestinerememberedScraper = await palestineremembered( false );
 
-	// await WebScraper.combineResults( "./dataset/combined", [
-	// 	palianswersScraper,
-	// 	decolonizepalestineScraper,
-	// 	khameneiIrFreePalestineTagScraper,
-	// 	khameneiIrPalestineSpecialPageScraper,
-	// 	electronicintifadaScraper,
-	// 	standWithPalestineScraper,
-	// 	mondoweisScraper
-	// ] );
+	await WebScraper.combineResults( "./dataset/combined", [
+		// palianswersScraper,
+		decolonizepalestineScraper,
+		khameneiIrFreePalestineTagScraper,
+		// khameneiIrPalestineSpecialPageScraper,
+		// electronicintifadaScraper,
+		// standWithPalestineScraper,
+		// mondoweisScraper
+	] );
 }();
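
Note on the new maxArticles option: judging from the configs above, it caps how many articles a scraper collects from a site (set to 2 here, presumably for quick test runs). A minimal usage sketch, assuming main.js exports the WebScraper class and that a start()-style run method exists (neither is shown in this diff):

const WebScraper = require( "clean-web-scraper" );

const scraper = new WebScraper({
	baseURL: "https://example.com/news",  // hypothetical site
	startURL: "https://example.com/news", // hypothetical entry page
	maxDepth: 1,
	maxArticles: 2, // stop after two articles, as in the configs above
	scrapResultPath: "./dataset/example/website",
	jsonlOutputPath: "./dataset/example/train.jsonl",
	textOutputPath: "./dataset/example/texts"
});

void async function ()
{
	await scraper.start(); // assumed entry point; not visible in this diff
}();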
package/main.js CHANGED
@@ -1,8 +1,9 @@
+const process = require( "node:process" );
+const fs = require( "fs" );
+const path = require( "path" );
 const axios = require( "axios" );
 const { JSDOM } = require( "jsdom" );
 const { Readability } = require( "@mozilla/readability" );
-const fs = require( "fs" );
-const path = require( "path" );
 const { connect } = require( "puppeteer-real-browser" );
 
 class WebScraper
@@ -334,7 +335,7 @@ class WebScraper
 		{
 			urlPath = urlPath.slice( 0, -1 );
 		}
-		const filePath = path.join( __dirname, this.scrapResultPath, urlPath );
+		const filePath = path.join( this.scrapResultPath, urlPath );
 		const dir = path.dirname( filePath );
 
 		fs.mkdirSync( dir, { recursive: true });
@@ -347,14 +348,14 @@ class WebScraper
 
 	createJSONLFile ()
 	{
-		const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
+		const writeStreamSimple = fs.createWriteStream( this.jsonlOutputPath );
 		writeStreamSimple.on( "error", err =>
 		{ return console.error( "Error writing JSONL:", err ) });
 
 		let writeStreamMeta;
 		if ( this.includeMetadata )
 		{
-			writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPathWithMeta ) );
+			writeStreamMeta = fs.createWriteStream( path.join( process.cwd(), this.jsonlOutputPathWithMeta ) );
 			writeStreamMeta.on( "error", ( err ) => { return console.error( "Error writing metadata JSONL:", err ) });
 		}
 		for ( const content of this.allProcessedContent )
@@ -377,7 +378,7 @@ class WebScraper
 	createCSVFile ()
 	{
 		// Create simple version
-		const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
+		const writeStreamSimple = fs.createWriteStream( path.join( process.cwd(), this.csvOutputPath ) );
 		writeStreamSimple.on( "error", ( err ) => { return console.error( "Error writing CSV:", err ) });
 		writeStreamSimple.write( "text\n" );
 
@@ -385,7 +386,7 @@ class WebScraper
 		let writeStreamMeta;
 		if ( this.includeMetadata )
 		{
-			writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.csvOutputPathWithMeta ) );
+			writeStreamMeta = fs.createWriteStream( path.join( process.cwd(), this.csvOutputPathWithMeta ) );
 			writeStreamMeta.on( "error", ( err ) => { return console.error( "Error writing metadata CSV:", err ) });
 		}
 
@@ -427,12 +428,12 @@ class WebScraper
 
 	saveNumberedTextFiles ()
 	{
-		const baseTextPath = path.join( __dirname, this.textOutputPath );
+		const baseTextPath = this.textOutputPath;
 
 		let metaTextPath = null;
 		if ( this.includeMetadata )
 		{
-			metaTextPath = path.join( __dirname, this.textOutputPathWithMeta );
+			metaTextPath = path.join( process.cwd(), this.textOutputPathWithMeta );
 			fs.mkdirSync( metaTextPath, { recursive: true });
 		}
 
@@ -672,13 +673,13 @@ class WebScraper
 	createOutputDirectory ()
 	{
 		const paths = [
-			path.join( __dirname, this.scrapResultPath ),
-			path.join( __dirname, this.textOutputPath ),
-			path.join( __dirname, this.textOutputPathWithMeta ),
-			path.join( __dirname, this.csvOutputPath ),
-			path.join( __dirname, this.csvOutputPathWithMeta ),
-			path.join( __dirname, this.jsonlOutputPath ),
-			path.join( __dirname, this.jsonlOutputPathWithMeta )
+			this.scrapResultPath,
+			this.textOutputPath,
+			this.textOutputPathWithMeta,
+			this.csvOutputPath,
+			this.csvOutputPathWithMeta,
+			this.jsonlOutputPath,
+			this.jsonlOutputPathWithMeta
 		];
 		for ( const p of paths )
 		{
@@ -688,9 +689,9 @@ class WebScraper
 			}
 		}
 		// Recreate directories needed for output
-		this.ensureDirectory( path.join( __dirname, this.scrapResultPath ) );
-		this.ensureDirectory( path.join( __dirname, this.textOutputPath ) );
-		this.ensureDirectory( path.join( __dirname, this.textOutputPathWithMeta ) );
+		this.ensureDirectory( path.join( process.cwd(), this.scrapResultPath ) );
+		this.ensureDirectory( path.join( process.cwd(), this.textOutputPath ) );
+		this.ensureDirectory( path.join( process.cwd(), this.textOutputPathWithMeta ) );
 	}
 
 	ensureDirectory ( dirPath )
@@ -709,7 +710,7 @@ class WebScraper
 	static async combineResults ( outputPath, websites )
 	{
 		await WebScraper.sleep( 1000 );
-		const fullOutputPath = path.join( __dirname, outputPath );
+		const fullOutputPath = outputPath;
 		WebScraper.createCombinedDirectories( fullOutputPath );
 		WebScraper.combineJSONLFiles( fullOutputPath, websites );
 		WebScraper.combineCSVFiles( fullOutputPath, websites );
@@ -742,20 +743,16 @@ class WebScraper
 
 		for ( const website of websites )
 		{
-			const jsonlContent = fs.readFileSync(
-				path.join( __dirname, website.jsonlOutputPath ),
-				"utf-8"
-			);
+			const jsonlContent = fs.readFileSync( website.jsonlOutputPath, "utf-8" );
+
 			if ( jsonlContent )
 			{
 				jsonlOutput.write( jsonlContent );
 			}
 			if ( website.includeMetadata )
 			{
-				const jsonlMetaContent = fs.readFileSync(
-					path.join( __dirname, website.jsonlOutputPathWithMeta ),
-					"utf-8"
-				);
+				const jsonlMetaContent = fs.readFileSync( website.jsonlOutputPathWithMeta, "utf-8" );
+
 				if ( jsonlMetaContent )
 				{
 					jsonlMetaOutput.write( jsonlMetaContent );
@@ -782,7 +779,7 @@ class WebScraper
 
 		for ( const website of websites )
 		{
-			const csvContent = fs.readFileSync( path.join( __dirname, website.csvOutputPath ), "utf-8" )
+			const csvContent = fs.readFileSync( website.csvOutputPath, "utf-8" )
 				.split( "\n" )
 				.slice( 1 )
 				.filter( line => { return line.trim() });
@@ -792,11 +789,8 @@ class WebScraper
 			}
 			if ( website.includeMetadata )
 			{
-				const csvMetaContent = fs
-					.readFileSync(
-						path.join( __dirname, website.csvOutputPathWithMeta ),
-						"utf-8"
-					)
+				const csvMetaContent = fs.readFileSync( website.csvOutputPathWithMeta, "utf-8" )
+
 					.split( "\n" )
 					.slice( 1 )
 					.filter( line => { return line.trim() });
@@ -815,13 +809,11 @@ class WebScraper
 		let textFileCounter = 1;
 		for ( const website of websites )
 		{
-			const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
+			const textFiles = fs.readdirSync( path.join( process.cwd(), website.textOutputPath ) );
 			for ( const file of textFiles )
 			{
-				const content = fs.readFileSync(
-					path.join( __dirname, website.textOutputPath, file ),
-					"utf-8"
-				);
+				const content = fs.readFileSync( path.join( website.textOutputPath, file ), "utf-8" );
+
 				fs.writeFileSync(
 					path.join( fullOutputPath, "texts", `${textFileCounter}.txt` ),
 					content,
@@ -829,10 +821,7 @@ class WebScraper
 				);
 				if ( website.includeMetadata )
 				{
-					const metaContent = fs.readFileSync(
-						path.join( __dirname, website.textOutputPathWithMeta, file ),
-						"utf-8"
-					);
+					const metaContent = fs.readFileSync( path.join( website.textOutputPathWithMeta, file ), "utf-8" );
 					fs.writeFileSync(
 						path.join(
 							fullOutputPath,
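
Note on the recurring __dirname → process.cwd() change in this file: in 4.1.x every output path was joined onto __dirname, the directory of the installed main.js (inside node_modules when the package is used as a dependency), so results were written into the package's own folder. In 4.2.x paths are resolved against process.cwd(), or left relative so that Node's fs resolves them against the working directory, putting output where the consuming project runs. A short sketch of this standard Node.js resolution behavior, not code from the package itself:

const path = require( "node:path" );
const process = require( "node:process" );

// Suppose the package is installed at /app/node_modules/clean-web-scraper
// and the consumer runs `node index.js` from /app.
const out = "./dataset/train.jsonl";

// 4.1.x style: anchored to the installed package directory
path.join( __dirname, out );     // -> /app/node_modules/clean-web-scraper/dataset/train.jsonl

// 4.2.x style: anchored to the caller's working directory
path.join( process.cwd(), out ); // -> /app/dataset/train.jsonl

// Bare relative paths, e.g. fs.createWriteStream( out ), are likewise
// resolved against process.cwd() at the time of the call.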
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
 	"name": "clean-web-scraper",
-	"version": "4.1.7",
+	"version": "4.2.1",
 	"main": "main.js",
 	"scripts": {
 		"start": "node main.js",