npm - clean-web-scraper - Versions diffs - 3.10.0 → 4.0.1 - Mend

clean-web-scraper 3.10.0 → 4.0.1

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (4) hide show

package/README.md CHANGED Viewed

@@ -56,7 +56,6 @@ const scraper = new WebScraper({
   strictBaseURL: true,                          // Optional: Only scrape URLs from same domain
   maxDepth: Infinity,                           // Optional: Maximum crawling depth
   maxArticles: Infinity,                        // Optional: Maximum articles to scrape
-  concurrencyLimit: 2,                          // Optional: Limit concurrent requests
   crawlingDelay: 1000,                          // Optional: Delay between requests (ms)
   // Network options
@@ -72,8 +71,6 @@ const scraper = new WebScraper({
   // Puppeteer options for handling dynamic content
   usePuppeteer: false,                          // Optional: Enable Puppeteer browser
-  puppeteerProxy: "http://127.0.0.1:2080",      // Optional: Puppeteer proxy
-  puppeteerExecutablePath: "/path/to/chrome",   // Optional: Custom browser path
 });
 await scraper.start();
 ```

package/example-usage.js CHANGED Viewed

@@ -1,17 +1,25 @@
 const WebScraper = require( "./main" );
-// const cookies = "cf_clearance=ENHJkpw.ycd1tZ_A.d0O27QdslTN0EHaNurhCznfimg-1738241402-1.2.1.1-BlO.WitkGwE3U3vSamX35xP.AgN1HyvHWL03Jhe.twbn4QWojiw1T4.0M4lE_TcIeZrQ6ErwV9kQBMBKmfU0S6lQth1BJx7UpWn4T6wtFm83LmF.cB13PQYSQgGFGsH7qOkGIjbBhMbceQNp.y2XZgLq_hdntGKSBMe0iCUotx_xsqlzkolQIqnUYID3BLEQXZqNvqJOwkzLZ7.kzrwP42VdEuWEvT4jt7F3TkTaU9rumAp8FSNO1.hnr76Tv23OITm17rPD3__Ghdu1D0E.4v693nEiVYO_KQYNf_8gk0vXP.KAvUKA2zQyBmDXkfW3M1MkoLjFNZCanx9FPRVO7g";
 const headers = {
 	"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
 	"Cache-Control": "private",
 	"Accept": "application/xml,application/xhtml+xml,text/html;q=0.9, text/plain;q=0.8,image/png,*/*;q=0.5",
 	// "Cookie": cookies
+};
+async function runScraper ( config, enable )
+{
+	const scraper = new WebScraper( config );
+	if ( enable )
+	{
+		await scraper.start();
+	}
+	return scraper;
 }
 async function palianswers ( enable )
 {
-	// https://palianswers.com
-	const scraper = new WebScraper({
+	const config = {
 		baseURL: "https://palianswers.com",
 		excludeList: [
 			"https://palianswers.com/chat/",
@@ -22,30 +30,23 @@ async function palianswers ( enable )
 			"https://palianswers.com/themes/"
 		],
 		exactExcludeList: [
-			"https://palianswers.com/",
+			"https://palianswers.com/"
 		],
 		scrapResultPath: "./dataset/palianswers/website",
 		jsonlOutputPath: "./dataset/palianswers/train.jsonl",
 		textOutputPath: "./dataset/palianswers/texts",
 		csvOutputPath: "./dataset/palianswers/train.csv",
 		includeMetadata: true,
-		metadataFields: ["author", "title", "description", "dateScrapedDate"],
+		metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
 		axiosRetryDelay: 10000,
-		concurrencyLimit: 4,
 		crawlingDelay: 0
-	});
-	if ( enable )
-	{
-		await scraper.start();
-	}
-	return scraper;
+	};
+	return await runScraper( config, enable );
 }
 async function khameneiIrFreePalestineTag ( enable )
 {
-	// https://english.khamenei.ir/Opinions/FreePalestine
-	// https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#
-	const scraper = new WebScraper({
+	const config = {
 		baseURL: "https://english.khamenei.ir/news",
 		startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
 		maxDepth: 1,
@@ -57,21 +58,15 @@ async function khameneiIrFreePalestineTag ( enable )
 		textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
 		csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
 		includeMetadata: true,
-		metadataFields: ["author", "title", "description", "dateScrapedDate"],
+		metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
 		axiosRetryDelay: 10000,
-		concurrencyLimit: 4,
-	});
-	if ( enable )
-	{
-		await scraper.start();
-	}
-	return scraper;
+	};
+	return await runScraper( config, enable );
 }
 async function decolonizepalestine ( enable )
 {
-	// https://decolonizepalestine.com
-	const scraper = new WebScraper({
+	const config = {
 		baseURL: "https://decolonizepalestine.com",
 		excludeList: [
 			"https://decolonizepalestine.com/cdn-cgi",
@@ -89,21 +84,15 @@ async function decolonizepalestine ( enable )
 		textOutputPath: "./dataset/decolonizepalestine/texts",
 		csvOutputPath: "./dataset/decolonizepalestine/train.csv",
 		includeMetadata: true,
-		metadataFields: ["author", "title", "description", "dateScrapedDate"],
+		metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
 		axiosRetryDelay: 10000,
-		concurrencyLimit: 4,
-	});
-	if ( enable )
-	{
-		await scraper.start();
-	}
-	return scraper;
+	};
+	return await runScraper( config, enable );
 }
 async function electronicintifada ( enable )
 {
-	// https://electronicintifada.net
-	const scraper = new WebScraper({
+	const config = {
 		baseURL: "https://electronicintifada.net",
 		excludeList: [
 			"https://electronicintifada.net/updates",
@@ -129,51 +118,43 @@ async function electronicintifada ( enable )
 		textOutputPath: "./dataset/electronicintifada/texts",
 		csvOutputPath: "./dataset/electronicintifada/train.csv",
 		includeMetadata: true,
-		metadataFields: ["author", "title", "description", "dateScrapedDate"],
+		metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
 		maxDepth: 16,
 		maxArticles: 2000,
-		concurrencyLimit: 2,
 		axiosHeaders: headers,
+		axiosMaxRetries: 2,
 		axiosRetryDelay: 10000,
 		axiosProxy: {
 			host: "localhost",
 			port: 2080,
 			protocol: "http"
 		},
-		useProxyAsFallback: true
-	});
-	if ( enable )
-	{
-		await scraper.start();
-	}
-	return scraper;
+		useProxyAsFallback: true,
+		crawlingDelay: 0
+	};
+	return await runScraper( config, enable );
 }
 async function standWithPalestine ( enable )
 {
-	const scraper = new WebScraper({
+	const config = {
 		baseURL: "https://stand-with-palestine.org/blogs",
 		startURL: "https://stand-with-palestine.org/blogs",
+		exactExcludeList: ["https://stand-with-palestine.org/blogs"],
 		scrapResultPath: "./dataset/stand-with-palestine/website",
 		jsonlOutputPath: "./dataset/stand-with-palestine/train.jsonl",
 		textOutputPath: "./dataset/stand-with-palestine/texts",
 		csvOutputPath: "./dataset/stand-with-palestine/train.csv",
-		exactExcludeList: ["https://stand-with-palestine.org/blogs"],
 		axiosHeaders: headers,
 		includeMetadata: true,
-		metadataFields: ["author", "title", "description", "dateScrapedDate"]
-	});
-	if ( enable )
-	{
-		await scraper.start();
-	}
-	return scraper;
+		metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"]
+	};
+	return await runScraper( config, enable );
 }
 async function mondoweiss ( enable )
 {
-	// https://mondoweiss.net
-	const scraper = new WebScraper({
+	const config = {
 		baseURL: "https://mondoweiss.net",
 		excludeList: [
 			"https://mondoweiss.net/donate",
@@ -199,31 +180,25 @@ async function mondoweiss ( enable )
 		textOutputPath: "./dataset/mondoweiss/texts",
 		csvOutputPath: "./dataset/mondoweiss/train.csv",
 		maxArticles: 2500,
-		axiosMaxRetries: 3,
-		concurrencyLimit: 3,
+		maxDepth: 15,
 		axiosHeaders: headers,
+		axiosMaxRetries: 3,
+		axiosRetryDelay: 10000,
 		axiosProxy: {
 			host: "localhost",
 			port: 2080,
 			protocol: "http"
 		},
-		maxDepth: 15,
-		axiosRetryDelay: 10000,
+		useProxyAsFallback: true,
 		includeMetadata: true,
-		metadataFields: ["author", "title", "description", "dateScrapedDate"],
-		useProxyAsFallback: true
-	});
-	if ( enable )
-	{
-		await scraper.start();
-	}
-	return scraper;
+		metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
+	};
+	return await runScraper( config, enable );
 }
 async function bdsmovement ( enable )
 {
-	// https://bdsmovement.net
-	const scraper = new WebScraper({
+	const config = {
 		baseURL: "https://bdsmovement.net",
 		excludeList: [
 			"https://bdsmovement.net/press-area",
@@ -238,25 +213,18 @@ async function bdsmovement ( enable )
 		textOutputPath: "./dataset/bdsmovement/texts",
 		csvOutputPath: "./dataset/bdsmovement/train.csv",
 		includeMetadata: true,
-		metadataFields: ["author", "title", "description", "dateScrapedDate"],
-		puppeteerProxy: "socks5://127.0.0.1:2080",
-		puppeteerExecutablePath: "/usr/bin/chromium",
+		metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
 		puppeteerRealProxy: {
 			host: "socks5://127.0.0.1",
 			port: "2080",
 		},
-	});
-	if ( enable )
-	{
-		await scraper.start();
-	}
-	return scraper;
+	};
+	return await runScraper( config, enable );
 }
 async function palestineremembered ( enable )
 {
-	// https://www.palestineremembered.com
-	const scraper = new WebScraper({
+	const config = {
 		baseURL: "https://www.palestineremembered.com",
 		startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
 		excludeList: [
@@ -283,18 +251,14 @@ async function palestineremembered ( enable )
 		textOutputPath: "./dataset/palestineremembered/texts",
 		csvOutputPath: "./dataset/palestineremembered/train.csv",
 		includeMetadata: true,
-		metadataFields: ["author", "title", "description", "dateScrapedDate"],
+		metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
 		axiosProxy: {
 			host: "localhost",
 			port: 2080,
 			protocol: "http"
 		}
-	});
-	if ( enable )
-	{
-		await scraper.start();
-	}
-	return scraper;
+	};
+	return await runScraper( config, enable );
 }
 void async function main ()
@@ -316,4 +280,4 @@ void async function main ()
 		standWithPalestineScraper,
 		mondoweisScraper
 	] );
-}()
+}();

package/main.js CHANGED Viewed

@@ -15,13 +15,12 @@ class WebScraper
 		this.strictBaseURL = config.strictBaseURL || true;
 		this.maxDepth = config.maxDepth || Infinity;
 		this.maxArticles = config.maxArticles || Infinity;
-		this.concurrencyLimit = config.concurrencyLimit || 2;
 		this.crawlingDelay = config.crawlingDelay ?? 1000;
 		// Output paths setup
 		this.scrapResultPath = config.scrapResultPath || "./dataset";
 		this.textOutputPath = config.textOutputPath || path.join( this.scrapResultPath, "texts" );
-		this.textOutputPathWithMeta = `${this.textOutputPath }_with_metadata`;
+		this.textOutputPathWithMeta = `${this.textOutputPath}_with_metadata`;
 		this.jsonlOutputPath = config.jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
 		this.jsonlOutputPathWithMeta = this.jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
 		this.csvOutputPath = config.csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
@@ -60,8 +59,6 @@ class WebScraper
 		// Puppeteer configuration
 		this.usePuppeteer = config.usePuppeteer || false;
-		this.puppeteerProxy = config.puppeteerProxy; // http://127.0.0.1:2080
-		this.puppeteerExecutablePath = config.puppeteerExecutablePath;
 		this.puppeteerRealProxy = config.puppeteerRealProxy;
 		this.configurePuppeteer();
 	}
@@ -138,16 +135,14 @@ class WebScraper
 				if ( !this.isExcluded( url ) )
 				{
-					const reader = new Readability( document, {
-						charThreshold: 500,
-						nbTopCandidates: 20
-					});
+					const reader = new Readability( document );
 					const article = reader.parse();
 					if ( article )
 					{
 						if ( this.hasValidPageContent( article.textContent ) )
 						{
 							const metadata = this.extractMetadata( url, document );
+ 							metadata.articleTitle = article.title || "";
 							this.saveArticle( url, article.textContent, metadata );
 						}
 						else
@@ -506,17 +501,14 @@ class WebScraper
 	{
 		return {
 			url,
-			title: document.title,
+			pageTitle: document.title,
 			description: document.querySelector( "meta[name=\"description\"]" )?.content,
 			keywords: document.querySelector( "meta[name=\"keywords\"]" )?.content,
 			author: document.querySelector( "meta[name=\"author\"]" )?.content,
-			language:
-        document.documentElement.lang ||
-        document.querySelector( "html" )?.getAttribute( "lang" ),
+			language: document.documentElement.lang || document.querySelector( "html" )?.getAttribute( "lang" ),
 			canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
 			ogTitle: document.querySelector( "meta[property=\"og:title\"]" )?.content,
-			ogDescription: document.querySelector( "meta[property=\"og:description\"]" )
-			?.content,
+			ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
 			ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
 			ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
 			dateScrapedDate: new Date().toISOString()
@@ -562,23 +554,6 @@ class WebScraper
 	configurePuppeteer ( )
 	{
-		this.puppeteerOptions = {
-			headless: false,
-			userDataDir: "./tmp/browser",
-			defaultViewport: null,
-			args: ["--start-maximized"],
-			ignoreDefaultArgs: true
-		};
-		if ( this.puppeteerProxy )
-		{
-			this.puppeteerOptions.args.push( `--proxy-server=${this.puppeteerProxy}` );
-		}
-		if ( this.puppeteerExecutablePath )
-		{
-			this.puppeteerOptions.executablePath = this.puppeteerExecutablePath;
-		}
 		this.puppeteerRealOptions = {
 			headless: false,
 			args: [],

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "clean-web-scraper",
-  "version": "3.10.0",
+  "version": "4.0.1",
   "main": "main.js",
   "scripts": {
     "start": "node main.js",