clean-web-scraper 4.0.2 → 4.0.4
This diff shows the changes between publicly released versions of this package as they appear in their public registry. It is provided for informational purposes only.
- package/README.md +10 -7
- package/example-usage.js +3 -2
- package/main.js +61 -55
- package/package.json +1 -1
package/README.md
CHANGED
@@ -57,7 +57,8 @@ const scraper = new WebScraper({
   maxDepth: Infinity, // Optional: Maximum crawling depth
   maxArticles: Infinity, // Optional: Maximum articles to scrape
   crawlingDelay: 1000, // Optional: Delay between requests (ms)
-
+  batchSize: 5, // Optional: Number of URLs to process concurrently
+
   // Network options
   axiosHeaders: {}, // Optional: Custom HTTP headers
   axiosProxy: { // Optional: HTTP/HTTPS proxy
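The hunk above documents the new `batchSize` option, which caps how many queued URLs the crawler fetches concurrently per round. For orientation, here is a minimal constructor sketch combining it with the existing throttling options; the `require()` shape, the `crawl()` entry point, and the concrete values are assumptions inferred from the main.js changes further down, not statements from the full README:

```js
// Sketch only: assumes the package exports the WebScraper class from main.js.
const WebScraper = require( "clean-web-scraper" );

const scraper = new WebScraper({
  scrapResultPath: "./datasets/example", // hypothetical output folder
  maxDepth: 2, // stop queueing links discovered beyond depth 2
  crawlingDelay: 1000, // each worker awaits 1000 ms before fetching
  batchSize: 5 // up to 5 queued URLs are processed concurrently
});

// crawl() drains its queue in batches of `batchSize` (see main.js below).
scraper.crawl( "https://example.com" )
  .then( () => { return console.log( "crawl finished" ) });
```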
@@ -86,7 +87,8 @@ const docsScraper = new WebScraper({
   scrapResultPath: './datasets/docs',
   maxDepth: 3, // Optional: Maximum depth for recursive crawling
   includeMetadata: true, // Optional: Include metadata in output files
-  metadataFields: [
+  metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
+  // Optional: Specify metadata fields to include
 });

 // Scrape blog website
@@ -95,7 +97,8 @@ const blogScraper = new WebScraper({
   scrapResultPath: './datasets/blog',
   maxDepth: 3, // Optional: Maximum depth for recursive crawling
   includeMetadata: true, // Optional: Include metadata in output files
-  metadataFields: [
+  metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
+  // Optional: Specify metadata fields to include
 });

 // Start scraping both sites
@@ -165,7 +168,7 @@ The actual article content starts here. This is the clean, processed text of the
 ### 📑 Text Files with Metadata (texts_with_metadata/*.txt)

 ```text
-
+articleTitle: My Awesome Page
 description: This is a great article about coding
 author: John Doe
 language: en
@@ -186,8 +189,8 @@ The actual article content starts here. This is the clean, processed text of the
 ### 📈 JSONL with Metadata (train_with_metadata.jsonl)

 ```json
-{"text": "Article content", "metadata": {"
-{"text": "Another article", "metadata": {"
+{"text": "Article content", "metadata": {"articleTitle": "Page Title", "author": "John Doe"}}
+{"text": "Another article", "metadata": {"articleTitle": "Second Page", "author": "Jane Smith"}}
 ```

 ### 🗃️ JSON Files In Website Output (*.json)
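Because every line of `train_with_metadata.jsonl` is a self-contained JSON object, the file can be consumed as a stream. A small Node sketch of that; the file path is illustrative, assembled from the `scrapResultPath` and file name shown above, and the parsing itself is plain `fs`/`readline`:

```js
const fs = require( "fs" );
const readline = require( "readline" );

// Read a JSONL file into an array of { text, metadata } records.
async function readJsonl ( path )
{
  const rl = readline.createInterface({ input: fs.createReadStream( path ) });
  const records = [];
  for await ( const line of rl )
  {
    if ( line.trim() ) records.push( JSON.parse( line ) );
  }
  return records;
}

readJsonl( "./datasets/docs/train_with_metadata.jsonl" )
  .then( records => { return console.log( records[0].metadata.articleTitle ) });
```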
@@ -212,7 +215,7 @@ text
 ### 📊 CSV with Metadata (train_with_metadata.csv)

 ```csv
-text,
+text,articleTitle,author,description
 "Article content","Page Title","John Doe","Page description"
 "Another article","Second Page","Jane Smith","Another description"
 ```
package/example-usage.js
CHANGED
@@ -109,7 +109,8 @@ async function electronicintifada ( enable )
         "https://electronicintifada.net/news",
         "https://electronicintifada.net/opinion",
         "https://electronicintifada.net/about-ei",
-        "https://electronicintifada.net/review"
+        "https://electronicintifada.net/review",
+        "https://electronicintifada.net/artmusicculture"
     ],
     exactExcludeList: [
         "https://electronicintifada.net",
@@ -133,7 +134,7 @@ async function electronicintifada ( enable )
             protocol: "http"
         },
         useProxyAsFallback: true,
-        crawlingDelay:
+        crawlingDelay: 1
     };
     return await runScraper( config, enable );
 }
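Setting `crawlingDelay: 1` rather than `0` matters because of how main.js applies the delay: the constructor keeps explicit zeros (`config.crawlingDelay ?? 1000`), and the sleep call is guarded by a truthiness check. A tiny sketch of the resulting behavior; `sleep` here is a stand-in for the class's static `WebScraper.sleep` helper:

```js
// Stand-in for WebScraper.sleep from main.js.
const sleep = ms => { return new Promise( resolve => { return setTimeout( resolve, ms ) }) };

async function throttle ( crawlingDelay )
{
  if ( crawlingDelay ) // 0 is kept by `??` but skips the sleep entirely
  {
    await sleep( crawlingDelay ); // 1 still awaits ~1 ms, yielding between requests
  }
}
```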
package/main.js
CHANGED
@@ -16,6 +16,7 @@ class WebScraper
         this.maxDepth = config.maxDepth || Infinity;
         this.maxArticles = config.maxArticles || Infinity;
         this.crawlingDelay = config.crawlingDelay ?? 1000;
+        this.batchSize = config.batchSize || 5;

         // Output paths setup
         this.scrapResultPath = config.scrapResultPath || "./dataset";
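One detail in this hunk: the new `batchSize` default uses `||` while the neighboring `crawlingDelay` uses `??`, so the two options treat an explicit falsy value differently:

```js
// `||` falls back on any falsy value; `??` only on null/undefined.
const batchSize = 0 || 5; // 5 — a configured batchSize of 0 is silently replaced
const crawlingDelay = 0 ?? 1000; // 0 — a configured crawlingDelay of 0 is preserved
```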
@@ -97,83 +98,88 @@ class WebScraper
     async crawl ( initialUrl, initialDepth = 0 )
     {
         const queue = [{ url: initialUrl, depth: initialDepth }];
-
+        while ( queue.length > 0 )
         {
-
-
-            if ( this.hasReachedMax( depth ) )
+            const currentBatch = queue.splice( 0, this.batchSize );
+            await Promise.all( currentBatch.map( async ({ url, depth }) =>
             {
-
-            }
-
-
-
-
-
+                await this.processUrl( url, depth, queue );
+            }) );
+        }
+    }
+
+    async processUrl ( url, depth, queue )
+    {
+        console.log( `Processing URL: ${url}` );
+        if ( this.hasReachedMax( depth ) )
+        {
+            return;
+        }
+        if ( this.removeURLFragment )
+        {
+            url = url.split( "#" )[0];
+        }
+        if ( this.visited.has( url ) )
+        {
+            console.log( `Already visited: ${url}` );
+            return;
+        }
+        this.visited.add( url );
+        if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
+        {
+            return;
+        }
+        try
+        {
+            if ( this.crawlingDelay )
             {
-
-                continue;
+                await WebScraper.sleep( this.crawlingDelay );
             }
-            this.
-
-            if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
+            const data = await this.fetchContent( url );
+            if ( !data )
             {
-
+                return;
             }
-
-
+            const dom = new JSDOM( data, { url });
+            const { document } = dom.window;
+            if ( !this.isExcluded( url ) )
             {
-
-
-
-            }
-            const data = await this.fetchContent( url );
-            if ( !data ) continue;
-
-            const dom = new JSDOM( data, { url });
-            const { document } = dom.window;
-
-            if ( !this.isExcluded( url ) )
+                const reader = new Readability( document );
+                const article = reader.parse();
+                if ( article )
                 {
-
-                const article = reader.parse();
-                if ( article )
+                    if ( this.hasValidPageContent( article.textContent ) )
                     {
-
-
-
-                        metadata.articleTitle = article.title || "";
-                        this.saveArticle( url, article.textContent, metadata );
-                    }
-                    else
-                    {
-                        console.error( `Invalid content found at ${url}` );
-                    }
+                        const metadata = this.extractMetadata( url, document );
+                        metadata.articleTitle = article.title || "";
+                        this.saveArticle( url, article.textContent, metadata );
                     }
                     else
                     {
-                        console.error( `
+                        console.error( `Invalid content found at ${url}` );
                     }
                 }
-
-            const links = this.extractLinks( data );
-            const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
-            for ( const link of unvisitedLinks )
+                else
                 {
-
-                {
-                    queue.push({ url: link, depth: depth + 1 });
-                }
+                    console.error( `No readable content found at ${url}` );
                 }
            }
-
+            const links = this.extractLinks( data );
+            const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
+            for ( const link of unvisitedLinks )
             {
-
+                if ( !this.hasReachedMax( depth ) )
+                {
+                    queue.push({ url: link, depth: depth + 1 });
+                }
             }
         }
+        catch ( error )
+        {
+            console.error( `Error fetching ${url}:`, error.message, error.code );
+        }
     }

-
     async fetchContent ( url )
     {
         try