npm - clean-web-scraper - Versions diffs - 3.5.2 → 3.5.4 - Mend

clean-web-scraper 3.5.2 → 3.5.4

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (2)

package/package.json +1 -1
package/src/WebScraper.js +123 -92

package/package.json (changed)

@@ -1,6 +1,6 @@
 {
   "name": "clean-web-scraper",
-  "version": "3.5.2",
+  "version": "3.5.4",
   "main": "main.js",
   "scripts": {
     "start": "node main.js",

package/src/WebScraper.js (changed)

@@ -10,80 +10,88 @@ const { connect } = require( "puppeteer-real-browser" )
 class WebScraper
 {
 	constructor ({
+		// Base configuration
 		baseURL,
 		startURL,
 		strictBaseURL = true,
 		maxDepth = Infinity,
 		maxArticles = Infinity,
-		excludeList,
-		exactExcludeList,
+		// URL filtering
+		excludeList = [],
+		exactExcludeList = [],
+		filterFileTypes = true,
+		excludedFileTypes = [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"],
+		// Output paths
 		scrapResultPath = "./dataset",
 		jsonlOutputPath,
 		textOutputPath,
 		csvOutputPath,
+		// Metadata options
 		includeMetadata = false,
-		metadataFields = [], // ['title', 'description', 'author', etc.]
+		metadataFields = [],
+		// Network options
 		axiosHeaders,
 		axiosProxy,
+		// Puppeteer options
 		usePuppeteer,
 		puppeteerProxy, // e.g. http://127.0.0.1:2080
 		puppeteerExecutablePath,
-		puppeteerRealProxy,
-		filterFileTypes = true,
-		excludedFileTypes = [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"]
+		puppeteerRealProxy
 	})
 	{
+		// Base configuration
 		this.baseURL = baseURL;
 		this.startURL = startURL || baseURL;
 		this.strictBaseURL = strictBaseURL;
 		this.maxDepth = maxDepth;
 		this.maxArticles = maxArticles;
+		// Output paths setup
 		this.scrapResultPath = scrapResultPath;
 		this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
 		this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
 		this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
-		this.jsonlOutputPathWithMeta = jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
-		this.csvOutputPathWithMeta = csvOutputPath.replace( ".csv", "_with_metadata.csv" );
-		this.axiosHeaders = axiosHeaders;
+		this.jsonlOutputPathWithMeta = this.jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
+		this.csvOutputPathWithMeta = this.csvOutputPath.replace( ".csv", "_with_metadata.csv" );
+		// Metadata configuration
 		this.includeMetadata = includeMetadata;
-	   this.metadataFields = new Set( metadataFields );
+		this.metadataFields = new Set( metadataFields );
+		// URL filtering setup
 		this.visited = new Set();
 		this.excludeList = this.normalizeExcludeList( excludeList );
 		this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
-		this.allProcessedContent = [];
 		this.filterFileTypes = filterFileTypes;
 		this.excludedFileTypes = excludedFileTypes;
+		// Network configuration
+		this.axiosHeaders = axiosHeaders;
 		this.axiosProxy = axiosProxy;
-		this.usePuppeteer = usePuppeteer || false;
-		this.puppeteerOptions = {
-			headless: false,
-			userDataDir: "./tmp/browser",
-			defaultViewport: null,
-			args: [
-				"--start-maximized"
-			],
-			"ignoreDefaultArgs": true,
-		}
-		if ( puppeteerProxy )
+		this.axiosOptions = {};
+		if ( this.axiosHeaders )
 		{
-			this.puppeteerOptions.args.push( `--proxy-server=${puppeteerProxy}` );
+			axiosOptions.headers = this.axiosHeaders;
 		}
-		if ( puppeteerExecutablePath )
+		if ( this.axiosProxy )
 		{
-			this.puppeteerOptions.executablePath = puppeteerExecutablePath;
-		}
-		this.puppeteerRealOptions = {
-			headless: false,
-			args: [],
-			customConfig: {},
-			turnstile: true,
-			connectOption: {},
-			disableXvfb: false,
-			ignoreAllFlags: false,
-			proxy: puppeteerRealProxy
+			axiosOptions.proxy = this.axiosProxy;
 		}
-		this.puppeteerBrowser = null;
-		this.puppeteerPage = null;
+		// Content storage
+		this.allProcessedContent = [];
+		// Puppeteer configuration
+		this.usePuppeteer = usePuppeteer || false;
+		this.puppeteerProxy = puppeteerProxy;
+		this.puppeteerExecutablePath = puppeteerExecutablePath;
+		this.puppeteerRealProxy = puppeteerRealProxy;
+		this.configurePuppeteer( );
 	}
 	async start ()
@@ -139,7 +147,7 @@ class WebScraper
 		}
 		try
 		{
-			const data = await this.caller( url );
+			const data = await this.fetchContent( url );
 			if ( !data ) return;
 			const dom = new JSDOM( data, { url });
 			const { document } = dom.window;
@@ -151,9 +159,9 @@ class WebScraper
 				if ( article )
 				{
-					if ( this.isValidContent( article.textContent ) )
+					if ( this.hasValidPageContent( article.textContent ) )
 					{
-						const metadata = this.metadataextractor( url, document );
+						const metadata = this.extractMetadata( url, document );
 						metadata.depth = depth;
 						this.saveArticle( url, article.textContent, metadata );
 					}
@@ -183,35 +191,23 @@ class WebScraper
 		}
 	}
-	async caller ( url )
+	async fetchContent ( url )
 	{
 		try
 		{
-			let axiosOptions = {};
-			if ( this.axiosHeaders )
-			{
-				axiosOptions.headers = this.axiosHeaders;
-			}
-			if ( this.axiosProxy )
-			{
-				axiosOptions.proxy = this.axiosProxy;
-			}
-			// Step 1: Make a GET request with a small timeout and limited data download
 			const response = await axios.get( url, {
-				...axiosOptions,
 				responseType: "stream",
 				maxRedirects: 5,
-				timeout: 70000
+				timeout: 70000,
+				...axiosOptions,
 			});
-			// Step 2: Check the Content-Type header from the response
 			const contentType = response.headers["content-type"] || "";
 			if ( !contentType.startsWith( "text" ) )
 			{
 				console.log( `Skipping non-HTML content for ${url}: Content-Type is ${contentType}` );
-				response.data.destroy(); // Destroy the stream to stop downloading further data
-				return null; // Skip further processing for non-HTML content
+				response.data.destroy();
+				return null;
 			}
 			// Step 3: If Content-Type is HTML, read the full response
@@ -243,8 +239,8 @@ class WebScraper
 					for ( let index = 0; index < 10; index++ )
 					{
 						console.log( `Please solve the CAPTCHA on the opened browser window for ${url}` );
-						result = await this.goToUrl( url ) ;
-						if ( this.isValidContent( result.htmlContent ) )
+						result = await this.navigateToPage( url ) ;
+						if ( this.hasValidPageContent( result.htmlContent ) )
 						{
 							break
 						}
@@ -262,7 +258,7 @@ class WebScraper
 		}
 	}
-	async goToUrl ( url )
+	async navigateToPage ( url )
 	{
 		let pages = await this.puppeteerBrowser.pages();
 		let page = pages[0];
@@ -531,7 +527,7 @@ class WebScraper
 		return filteredMetadata;
 	}
-	metadataextractor ( url, document )
+	extractMetadata ( url, document )
 	{
 		return {
 			url,
@@ -549,6 +545,41 @@ class WebScraper
 		};
 	}
+	configurePuppeteer ( )
+	{
+		this.puppeteerOptions = {
+			headless: false,
+			userDataDir: "./tmp/browser",
+			defaultViewport: null,
+			args: ["--start-maximized"],
+			ignoreDefaultArgs: true
+		};
+		if ( this.puppeteerProxy )
+		{
+			this.puppeteerOptions.args.push( `--proxy-server=${this.puppeteerProxy}` );
+		}
+		if ( this.puppeteerExecutablePath )
+		{
+			this.puppeteerOptions.executablePath = this.puppeteerExecutablePath;
+		}
+		this.puppeteerRealOptions = {
+			headless: false,
+			args: [],
+			customConfig: {},
+			turnstile: true,
+			connectOption: {},
+			disableXvfb: false,
+			ignoreAllFlags: false,
+			proxy: this.puppeteerRealProxy
+		};
+		this.puppeteerBrowser = null;
+		this.puppeteerPage = null;
+	}
 	normalizeExcludeList ( list = [] )
 	{
 		const normalizedSet = new Set();
@@ -577,36 +608,6 @@ class WebScraper
 		return Array.from( this.excludeList ).some( excluded => { return url.startsWith( excluded ) });
 	}
-	createOutputDirectory ()
-	{
-		if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
-		{
-			fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
-		}
-		if ( fs.existsSync( path.join( __dirname, this.textOutputPath ) ) )
-		{
-			fs.rmSync( path.join( __dirname, this.textOutputPath ), { recursive: true, force: true });
-		}
-		if ( fs.existsSync( path.join( __dirname, this.csvOutputPath ) ) )
-		{
-			fs.rmSync( path.join( __dirname, this.csvOutputPath ), { recursive: true, force: true });
-		}
-		if ( fs.existsSync( path.join( __dirname, this.csvOutputPathWithMeta ) ) )
-		{
-			fs.rmSync( path.join( __dirname, this.csvOutputPathWithMeta ), { recursive: true, force: true });
-		}
-		if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPath ) ) )
-		{
-			fs.rmSync( path.join( __dirname, this.jsonlOutputPath ), { recursive: true, force: true });
-		}
-		if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPathWithMeta ) ) )
-		{
-			fs.rmSync( path.join( __dirname, this.jsonlOutputPathWithMeta ), { recursive: true, force: true });
-		}
-		fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
-		fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
-	}
 	isValidFileType ( url )
 	{
 		if ( !this.filterFileTypes ) return true;
@@ -630,7 +631,7 @@ class WebScraper
 		}
 	}
-	isValidContent ( content )
+	hasValidPageContent ( content )
 	{
 		// Remove whitespace and newlines for checking
 		const cleanContent = content.replace( /\s+/g, " " ).trim().toLowerCase();
@@ -657,6 +658,36 @@ class WebScraper
 		return true;
 	}
+	createOutputDirectory ()
+	{
+		if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
+		{
+			fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
+		}
+		if ( fs.existsSync( path.join( __dirname, this.textOutputPath ) ) )
+		{
+			fs.rmSync( path.join( __dirname, this.textOutputPath ), { recursive: true, force: true });
+		}
+		if ( fs.existsSync( path.join( __dirname, this.csvOutputPath ) ) )
+		{
+			fs.rmSync( path.join( __dirname, this.csvOutputPath ), { recursive: true, force: true });
+		}
+		if ( fs.existsSync( path.join( __dirname, this.csvOutputPathWithMeta ) ) )
+		{
+			fs.rmSync( path.join( __dirname, this.csvOutputPathWithMeta ), { recursive: true, force: true });
+		}
+		if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPath ) ) )
+		{
+			fs.rmSync( path.join( __dirname, this.jsonlOutputPath ), { recursive: true, force: true });
+		}
+		if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPathWithMeta ) ) )
+		{
+			fs.rmSync( path.join( __dirname, this.jsonlOutputPathWithMeta ), { recursive: true, force: true });
+		}
+		fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
+		fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
+	}
 	static sleep ( ms )
 	{
 		return new Promise( resolve => { return setTimeout( resolve, ms ) });