npm - clean-web-scraper - Versions diffs - 3.3.7 → 3.4.0 - Mend

clean-web-scraper 3.3.7 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/example-usage.js CHANGED

@@ -100,7 +100,8 @@ async function electronicintifada ()
 			"https://electronicintifada.net/tags/",
 			"https://electronicintifada.net/blog",
 			"https://electronicintifada.net/people",
-			"https://electronicintifada.net/location"
+			"https://electronicintifada.net/location",
+			"https://electronicintifada.net/file"
 		],
 		exactExcludeList: [
 			"https://electronicintifada.net",
@@ -133,9 +134,6 @@ void async function main ()
 		electronicintifadaScraper
 	] );
-	// 4
-	// https://electronicintifada.net/
 	// 5
 	// https://www.palestineremembered.com/ZionistFAQ.html

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "clean-web-scraper",
-  "version": "3.3.7",
+  "version": "3.4.0",
   "main": "main.js",
   "scripts": {
     "start": "node main.js",

package/src/WebScraper.js CHANGED

@@ -20,7 +20,7 @@ class WebScraper
 		textOutputPath,
 		csvOutputPath,
 		includeMetadata = false,
-		metadataFields = [], // ['title', 'description', 'author', 'lastModified', etc.]
+		metadataFields = [], // ['title', 'description', 'author', etc.]
 		headers,
 		usePuppeteer,
 		puppeteerProxy, // e.g. http://127.0.0.1:2080
@@ -159,7 +159,7 @@ class WebScraper
 		}
 		catch ( error )
 		{
-			console.error( `Error fetching ${url}:`, error.message );
+			console.error( `Error fetching ${url}:`, error.message, error.code );
 		}
 	}
@@ -167,13 +167,44 @@ class WebScraper
 	{
 		try
 		{
-			let axiosOptinos = {}
+			let axiosOptions = {};
 			if ( this.headers )
 			{
-				axiosOptinos.headers = this.headers
+				axiosOptions.headers = this.headers;
 			}
-			const result = await axios.get( url, axiosOptinos );
-			return result.data
+			// Step 1: Make a GET request with a small timeout and limited data download
+			const response = await axios.get( url, {
+				...axiosOptions,
+				responseType: "stream",
+				maxRedirects: 5,
+				timeout: 70000
+			});
+			// Step 2: Check the Content-Type header from the response
+			const contentType = response.headers["content-type"] || "";
+			if ( !contentType.startsWith( "text" ) )
+			{
+				console.log( `Skipping non-HTML content for ${url}: Content-Type is ${contentType}` );
+				response.data.destroy(); // Destroy the stream to stop downloading further data
+				return null; // Skip further processing for non-HTML content
+			}
+			// Step 3: If Content-Type is HTML, read the full response
+			let htmlContent = "";
+			response.data.on( "data", ( chunk ) =>
+			{
+				htmlContent += chunk.toString();
+			});
+			// Wait for the stream to finish
+			await new Promise( ( resolve, reject ) =>
+			{
+				response.data.on( "end", resolve );
+				response.data.on( "error", reject );
+			});
+			return htmlContent;
 		}
 		catch ( error )
 		{