clean-web-scraper 4.0.3 → 4.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/example-usage.js +3 -2
- package/main.js +61 -55
- package/package.json +1 -1
package/README.md
CHANGED
```diff
@@ -57,7 +57,8 @@ const scraper = new WebScraper({
   maxDepth: Infinity, // Optional: Maximum crawling depth
   maxArticles: Infinity, // Optional: Maximum articles to scrape
   crawlingDelay: 1000, // Optional: Delay between requests (ms)
-
+  batchSize: 5, // Optional: Number of URLs to process concurrently
+
   // Network options
   axiosHeaders: {}, // Optional: Custom HTTP headers
   axiosProxy: { // Optional: HTTP/HTTPS proxy
```
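The new `batchSize` option slots into the constructor call shown in the hunk header (`const scraper = new WebScraper({ ... })`). Below is a minimal sketch of how a caller might combine it with the neighbouring options; the starting URL and the use of `crawl()` as the entry point are assumptions drawn from the `main.js` diff further down, not from the README itself:

```js
const WebScraper = require( "clean-web-scraper" ); // assumed export shape

async function main ()
{
    const scraper = new WebScraper({
        maxDepth: 2,         // stop two link hops from the start page
        maxArticles: 100,    // cap how many articles get saved
        crawlingDelay: 1000, // each worker still waits 1000 ms before fetching
        batchSize: 5         // new in 4.0.4: process up to 5 URLs concurrently
    });
    // crawl() is the method reworked in this release (see main.js below);
    // treating it as the public entry point is an assumption.
    await scraper.crawl( "https://example.com" );
}

main();
```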
package/example-usage.js
CHANGED
```diff
@@ -109,7 +109,8 @@ async function electronicintifada ( enable )
         "https://electronicintifada.net/news",
         "https://electronicintifada.net/opinion",
         "https://electronicintifada.net/about-ei",
-        "https://electronicintifada.net/review"
+        "https://electronicintifada.net/review",
+        "https://electronicintifada.net/artmusicculture"
     ],
     exactExcludeList: [
         "https://electronicintifada.net",
@@ -133,7 +134,7 @@ async function electronicintifada ( enable )
            protocol: "http"
        },
        useProxyAsFallback: true,
-       crawlingDelay:
+       crawlingDelay: 1
    };
    return await runScraper( config, enable );
 }
```
package/main.js
CHANGED
```diff
@@ -16,6 +16,7 @@ class WebScraper
        this.maxDepth = config.maxDepth || Infinity;
        this.maxArticles = config.maxArticles || Infinity;
        this.crawlingDelay = config.crawlingDelay ?? 1000;
+       this.batchSize = config.batchSize || 5;
 
        // Output paths setup
        this.scrapResultPath = config.scrapResultPath || "./dataset";
```
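Note the defaulting style: `batchSize` is defaulted with `||`, while `crawlingDelay` on the line above uses `??`, so the two options treat an explicit `0` differently:

```js
// `||` swaps out every falsy value, so batchSize can never be set to 0:
const batchSize = 0 || 5;        // → 5, falls back to the default

// `??` only swaps out null/undefined, so crawlingDelay: 0 disables the delay:
const crawlingDelay = 0 ?? 1000; // → 0, the explicit value survives
```

That is arguably deliberate here: with a batch size of 0, the `queue.splice( 0, 0 )` call in the reworked `crawl()` below would always return an empty batch and the `while` loop would never drain the queue.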
```diff
@@ -97,83 +98,88 @@ class WebScraper
    async crawl ( initialUrl, initialDepth = 0 )
    {
        const queue = [{ url: initialUrl, depth: initialDepth }];
-       while ( queue.length > 0 )
-       {
-           const { url, depth } = queue.shift();
-           console.log( `Processing URL: ${url}` );
-           if ( this.hasReachedMax( depth ) )
-           {
-               continue;
-           }
-           if ( this.removeURLFragment )
-           {
-               url = url.split( "#" )[0];
-           }
-           if ( this.visited.has( url ) )
-           {
-               console.log( `Already visited: ${url}` );
-               continue;
-           }
-           this.visited.add( url );
-
-           if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
-           {
-               continue;
-           }
-
-           try
-           {
-               if ( this.crawlingDelay )
-               {
-                   await WebScraper.sleep( this.crawlingDelay );
-               }
-               const data = await this.fetchContent( url );
-               if ( !data ) continue;
-
-               const dom = new JSDOM( data, { url });
-               const { document } = dom.window;
-
-               if ( !this.isExcluded( url ) )
-               {
-                   const reader = new Readability( document );
-                   const article = reader.parse();
-                   if ( article )
-                   {
-                       if ( this.hasValidPageContent( article.textContent ) )
-                       {
-                           const metadata = this.extractMetadata( url, document );
-                           metadata.articleTitle = article.title || "";
-                           this.saveArticle( url, article.textContent, metadata );
-                       }
-                       else
-                       {
-                           console.error( `Invalid content found at ${url}` );
-                       }
-                   }
-                   else
-                   {
-                       console.error( `No readable content found at ${url}` );
-                   }
-               }
-
-               const links = this.extractLinks( data );
-               const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
-               for ( const link of unvisitedLinks )
-               {
-                   if ( !this.hasReachedMax( depth ) )
-                   {
-                       queue.push({ url: link, depth: depth + 1 });
-                   }
-               }
-           }
-           catch ( error )
-           {
-               console.error( `Error fetching ${url}:`, error.message );
-           }
-       }
+       while ( queue.length > 0 )
+       {
+           const currentBatch = queue.splice( 0, this.batchSize );
+           await Promise.all( currentBatch.map( async ({ url, depth }) =>
+           {
+               await this.processUrl( url, depth, queue );
+           }) );
+       }
+   }
+
+   async processUrl ( url, depth, queue )
+   {
+       console.log( `Processing URL: ${url}` );
+       if ( this.hasReachedMax( depth ) )
+       {
+           return;
+       }
+       if ( this.removeURLFragment )
+       {
+           url = url.split( "#" )[0];
+       }
+       if ( this.visited.has( url ) )
+       {
+           console.log( `Already visited: ${url}` );
+           return;
+       }
+       this.visited.add( url );
+       if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
+       {
+           return;
+       }
+       try
+       {
+           if ( this.crawlingDelay )
+           {
+               await WebScraper.sleep( this.crawlingDelay );
+           }
+           const data = await this.fetchContent( url );
+           if ( !data )
+           {
+               return;
+           }
+           const dom = new JSDOM( data, { url });
+           const { document } = dom.window;
+           if ( !this.isExcluded( url ) )
+           {
+               const reader = new Readability( document );
+               const article = reader.parse();
+               if ( article )
+               {
+                   if ( this.hasValidPageContent( article.textContent ) )
+                   {
+                       const metadata = this.extractMetadata( url, document );
+                       metadata.articleTitle = article.title || "";
+                       this.saveArticle( url, article.textContent, metadata );
+                   }
+                   else
+                   {
+                       console.error( `Invalid content found at ${url}` );
+                   }
+               }
+               else
+               {
+                   console.error( `No readable content found at ${url}` );
+               }
+           }
+           const links = this.extractLinks( data );
+           const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
+           for ( const link of unvisitedLinks )
+           {
+               if ( !this.hasReachedMax( depth ) )
+               {
+                   queue.push({ url: link, depth: depth + 1 });
+               }
+           }
+       }
+       catch ( error )
+       {
+           console.error( `Error fetching ${url}:`, error.message, error.code );
+       }
    }
 
-
    async fetchContent ( url )
    {
        try
```
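The substance of this hunk is the move from a one-at-a-time `queue.shift()` loop to batched concurrency: `crawl()` now splices up to `batchSize` entries off the queue and runs them through the new `processUrl()` under `Promise.all`, with each worker pushing newly discovered links back onto the shared queue. A standalone sketch of that drain pattern (`drainInBatches` and `processItem` are illustrative names, not part of the package):

```js
// Batch-drain pattern behind the new crawl(): pull up to `batchSize` items
// off the front of the queue, process them concurrently, and only then
// slice off the next batch. Workers may enqueue follow-up work.
async function drainInBatches ( firstItem, processItem, batchSize = 5 )
{
    const queue = [firstItem];
    while ( queue.length > 0 )
    {
        const batch = queue.splice( 0, batchSize ); // removes the items in place
        await Promise.all( batch.map( async ( item ) =>
        {
            await processItem( item, queue ); // may push new items onto queue
        }) );
    }
}
```

Two consequences of this shape: each batch only finishes as fast as its slowest member, and duplicate URLs that land in the same batch are still filtered out, because `processUrl()` checks and updates `this.visited` before it hits its first `await`.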