npm - clean-web-scraper - Versions diffs - 4.2.0 → 4.2.2 - Mend

clean-web-scraper 4.2.0 → 4.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/README.md CHANGED Viewed

@@ -172,13 +172,13 @@ The actual article content starts here. This is the clean, processed text of the
 ### 📑 Text Files with Metadata (texts_with_metadata/*.txt)
 ```text
-articleTitle: My Awesome Page
-description: This is a great article about coding
+articleTitle: Palestine history
+description: This is a great article about Palestine history
 author: John Doe
 language: en
 dateScraped: 2024-01-20T10:30:00Z
-\-\-\-
+---
 The actual article content starts here. This is the clean, processed text of the article that was extracted from the webpage.
 ```

package/example-usage.js CHANGED Viewed

@@ -50,6 +50,7 @@ async function khameneiIrFreePalestineTag ( enable )
 		baseURL: "https://english.khamenei.ir/news",
 		startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
 		maxDepth: 1,
+		maxArticles: 2,
 		exactExcludeList: [
 			"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
 			"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100"
@@ -72,6 +73,7 @@ async function khameneiIrPalestineSpecialPage ( enable )
 		baseURL: "https://english.khamenei.ir/news",
 		startURL: "https://english.khamenei.ir/palestine-special-page",
 		maxDepth: 2,
+		maxArticles: 2,
 		exactExcludeList: [
 			"https://english.khamenei.ir/palestine-special-page/"
 		],
@@ -101,6 +103,7 @@ async function decolonizepalestine ( enable )
 			"https://decolonizepalestine.com/rainbow-washing",
 			"https://decolonizepalestine.com/"
 		],
+		maxArticles: 2,
 		scrapResultPath: "./dataset/decolonizepalestine/website",
 		jsonlOutputPath: "./dataset/decolonizepalestine/train.jsonl",
 		textOutputPath: "./dataset/decolonizepalestine/texts",
@@ -307,7 +310,7 @@ async function palestineremembered ( enable )
 void async function main ()
 {
 	// const palianswersScraper = await palianswers( true );
-	// const decolonizepalestineScraper = await decolonizepalestine( true );
+	const decolonizepalestineScraper = await decolonizepalestine( true );
 	const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( true );
 	// const khameneiIrPalestineSpecialPageScraper = await khameneiIrPalestineSpecialPage( true );
 	// const electronicintifadaScraper = await electronicintifada( true );
@@ -316,13 +319,13 @@ void async function main ()
 	// const bdsmovementScraper = await bdsmovement( false );
 	// const palestinerememberedScraper = await palestineremembered( false );
-	// await WebScraper.combineResults( "./dataset/combined", [
-	// 	palianswersScraper,
-	// 	decolonizepalestineScraper,
-	// 	khameneiIrFreePalestineTagScraper,
-	// 	khameneiIrPalestineSpecialPageScraper,
-	// 	electronicintifadaScraper,
-	// 	standWithPalestineScraper,
-	// 	mondoweisScraper
-	// ] );
+	await WebScraper.combineResults( "./dataset/combined", [
+		// palianswersScraper,
+		decolonizepalestineScraper,
+		khameneiIrFreePalestineTagScraper,
+		// khameneiIrPalestineSpecialPageScraper,
+		// electronicintifadaScraper,
+		// standWithPalestineScraper,
+		// mondoweisScraper
+	] );
 }();

package/main.js CHANGED Viewed

@@ -335,7 +335,7 @@ class WebScraper
 		{
 			urlPath = urlPath.slice( 0, -1 );
 		}
-		const filePath = path.join( process.cwd(), this.scrapResultPath, urlPath );
+		const filePath = path.join( this.scrapResultPath, urlPath );
 		const dir = path.dirname( filePath );
 		fs.mkdirSync( dir, { recursive: true });
@@ -348,7 +348,7 @@ class WebScraper
 	createJSONLFile ()
 	{
-		const writeStreamSimple = fs.createWriteStream( path.join( process.cwd(), this.jsonlOutputPath ) );
+		const writeStreamSimple = fs.createWriteStream( this.jsonlOutputPath );
 		writeStreamSimple.on( "error", err =>
 		{ return console.error( "Error writing JSONL:", err ) });
@@ -428,7 +428,7 @@ class WebScraper
 	saveNumberedTextFiles ()
 	{
-		const baseTextPath = path.join( process.cwd(), this.textOutputPath );
+		const baseTextPath = this.textOutputPath;
 		let metaTextPath = null;
 		if ( this.includeMetadata )
@@ -673,13 +673,13 @@ class WebScraper
 	createOutputDirectory ()
 	{
 		const paths = [
-			path.join( process.cwd(), this.scrapResultPath ),
-			path.join( process.cwd(), this.textOutputPath ),
-			path.join( process.cwd(), this.textOutputPathWithMeta ),
-			path.join( process.cwd(), this.csvOutputPath ),
-			path.join( process.cwd(), this.csvOutputPathWithMeta ),
-			path.join( process.cwd(), this.jsonlOutputPath ),
-			path.join( process.cwd(), this.jsonlOutputPathWithMeta )
+			this.scrapResultPath,
+			this.textOutputPath,
+			this.textOutputPathWithMeta,
+			this.csvOutputPath,
+			this.csvOutputPathWithMeta,
+			this.jsonlOutputPath,
+			this.jsonlOutputPathWithMeta
 		];
 		for ( const p of paths )
 		{
@@ -710,7 +710,7 @@ class WebScraper
 	static async combineResults ( outputPath, websites )
 	{
 		await WebScraper.sleep( 1000 );
-		const fullOutputPath = path.join( process.cwd(), outputPath );
+		const fullOutputPath = outputPath;
 		WebScraper.createCombinedDirectories( fullOutputPath );
 		WebScraper.combineJSONLFiles( fullOutputPath, websites );
 		WebScraper.combineCSVFiles( fullOutputPath, websites );
@@ -743,20 +743,16 @@ class WebScraper
 		for ( const website of websites )
 		{
-			const jsonlContent = fs.readFileSync(
-				path.join( process.cwd(), website.jsonlOutputPath ),
-				"utf-8"
-			);
+			const jsonlContent = fs.readFileSync( website.jsonlOutputPath, "utf-8" );
 			if ( jsonlContent )
 			{
 				jsonlOutput.write( jsonlContent );
 			}
 			if ( website.includeMetadata )
 			{
-				const jsonlMetaContent = fs.readFileSync(
-					path.join( process.cwd(), website.jsonlOutputPathWithMeta ),
-					"utf-8"
-				);
+				const jsonlMetaContent = fs.readFileSync( website.jsonlOutputPathWithMeta, "utf-8" );
 				if ( jsonlMetaContent )
 				{
 					jsonlMetaOutput.write( jsonlMetaContent );
@@ -783,7 +779,7 @@ class WebScraper
 		for ( const website of websites )
 		{
-			const csvContent = fs.readFileSync( path.join( process.cwd(), website.csvOutputPath ), "utf-8" )
+			const csvContent = fs.readFileSync( website.csvOutputPath, "utf-8" )
 			.split( "\n" )
 			.slice( 1 )
 			.filter( line => { return line.trim() });
@@ -793,11 +789,8 @@ class WebScraper
 			}
 			if ( website.includeMetadata )
 			{
-				const csvMetaContent = fs
-				.readFileSync(
-					path.join( process.cwd(), website.csvOutputPathWithMeta ),
-					"utf-8"
-				)
+				const csvMetaContent = fs.readFileSync( website.csvOutputPathWithMeta, "utf-8" )
 				.split( "\n" )
 				.slice( 1 )
 				.filter( line => { return line.trim() });
@@ -819,10 +812,8 @@ class WebScraper
 			const textFiles = fs.readdirSync( path.join( process.cwd(), website.textOutputPath ) );
 			for ( const file of textFiles )
 			{
-				const content = fs.readFileSync(
-					path.join( process.cwd(), website.textOutputPath, file ),
-					"utf-8"
-				);
+				const content = fs.readFileSync( path.join( website.textOutputPath, file ), "utf-8" );
 				fs.writeFileSync(
 					path.join( fullOutputPath, "texts", `${textFileCounter}.txt` ),
 					content,
@@ -830,10 +821,7 @@ class WebScraper
 				);
 				if ( website.includeMetadata )
 				{
-					const metaContent = fs.readFileSync(
-						path.join( process.cwd(), website.textOutputPathWithMeta, file ),
-						"utf-8"
-					);
+					const metaContent = fs.readFileSync( path.join( website.textOutputPathWithMeta, file ), "utf-8" );
 					fs.writeFileSync(
 						path.join(
 							fullOutputPath,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "clean-web-scraper",
-  "version": "4.2.0",
+  "version": "4.2.2",
   "main": "main.js",
   "scripts": {
     "start": "node main.js",