npm - clean-web-scraper - Versions diffs - 3.7.6 → 3.8.1 - Mend

clean-web-scraper 3.7.6 → 3.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/example-usage.js CHANGED

@@ -10,6 +10,7 @@ const headers = {
 async function palianswers ( enable )
 {
+	// https://palianswers.com
 	const scraper = new WebScraper({
 		baseURL: "https://palianswers.com",
 		excludeList: [
@@ -28,7 +29,7 @@ async function palianswers ( enable )
 		textOutputPath: "./dataset/palianswers/texts",
 		csvOutputPath: "./dataset/palianswers/train.csv",
 		includeMetadata: true,
-		metadataFields: ["author", "title", "description"]
+		metadataFields: ["author", "title", "description", "dateScrapedDate"]
 	});
 	if ( enable )
 	{
@@ -44,7 +45,7 @@ async function khameneiIrFreePalestineTag ( enable )
 	const scraper = new WebScraper({
 		baseURL: "https://english.khamenei.ir/news",
 		startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
-		maxDepth: 3,
+		maxDepth: 1,
 		exactExcludeList: [
 			"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#"
 		],
@@ -53,7 +54,7 @@ async function khameneiIrFreePalestineTag ( enable )
 		textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
 		csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
 		includeMetadata: true,
-		metadataFields: ["author", "title", "description"]
+		metadataFields: ["author", "title", "description", "dateScrapedDate"]
 	});
 	if ( enable )
 	{
@@ -64,6 +65,7 @@ async function khameneiIrFreePalestineTag ( enable )
 async function decolonizepalestine ( enable )
 {
+	// https://decolonizepalestine.com
 	const scraper = new WebScraper({
 		baseURL: "https://decolonizepalestine.com",
 		excludeList: [
@@ -82,7 +84,7 @@ async function decolonizepalestine ( enable )
 		textOutputPath: "./dataset/decolonizepalestine/texts",
 		csvOutputPath: "./dataset/decolonizepalestine/train.csv",
 		includeMetadata: true,
-		metadataFields: ["author", "title", "description"]
+		metadataFields: ["author", "title", "description", "dateScrapedDate"]
 	});
 	if ( enable )
 	{
@@ -93,6 +95,7 @@ async function decolonizepalestine ( enable )
 async function bdsmovement ( enable )
 {
+	// https://bdsmovement.net
 	const scraper = new WebScraper({
 		baseURL: "https://bdsmovement.net",
 		excludeList: [
@@ -108,7 +111,7 @@ async function bdsmovement ( enable )
 		textOutputPath: "./dataset/bdsmovement/texts",
 		csvOutputPath: "./dataset/bdsmovement/train.csv",
 		includeMetadata: true,
-		metadataFields: ["author", "title", "description"],
+		metadataFields: ["author", "title", "description", "dateScrapedDate"],
 		puppeteerProxy: "socks5://127.0.0.1:2080",
 		puppeteerExecutablePath: "/usr/bin/chromium",
 		puppeteerRealProxy: {
@@ -125,6 +128,7 @@ async function bdsmovement ( enable )
 async function electronicintifada ( enable )
 {
+	// https://electronicintifada.net
 	const scraper = new WebScraper({
 		baseURL: "https://electronicintifada.net",
 		excludeList: [
@@ -153,7 +157,7 @@ async function electronicintifada ( enable )
 		includeMetadata: true,
 		maxArticles: 2000,
 		axiosHeaders: headers,
-		metadataFields: ["author", "title", "description"]
+		metadataFields: ["author", "title", "description", "dateScrapedDate"]
 	});
 	if ( enable )
 	{
@@ -192,7 +196,7 @@ async function palestineremembered ( enable )
 		textOutputPath: "./dataset/palestineremembered/texts",
 		csvOutputPath: "./dataset/palestineremembered/train.csv",
 		includeMetadata: true,
-		metadataFields: ["author", "title", "description"],
+		metadataFields: ["author", "title", "description", "dateScrapedDate"],
 		axiosProxy: {
 			host: "localhost",
 			port: 2080,
@@ -206,13 +210,82 @@ async function palestineremembered ( enable )
 	return scraper;
 }
+async function standWithPalestine ( enable )
+{
+	const scraper = new WebScraper({
+		baseURL: "https://stand-with-palestine.org/blogs",
+		startURL: "https://stand-with-palestine.org/blogs",
+		scrapResultPath: "./dataset/stand-with-palestine/website",
+		jsonlOutputPath: "./dataset/stand-with-palestine/train.jsonl",
+		textOutputPath: "./dataset/stand-with-palestine/texts",
+		csvOutputPath: "./dataset/stand-with-palestine/train.csv",
+		exactExcludeList: ["https://stand-with-palestine.org/blogs"],
+		axiosHeaders: headers,
+		includeMetadata: true,
+		metadataFields: ["author", "title", "description", "dateScrapedDate"]
+	});
+	if ( enable )
+	{
+		await scraper.start();
+	}
+	return scraper;
+}
+async function mondoweiss ( enable )
+{
+	// https://mondoweiss.net
+	const scraper = new WebScraper({
+		baseURL: "https://mondoweiss.net",
+		excludeList: [
+			"https://mondoweiss.net/donate",
+			"https://mondoweiss.net/advertise/",
+			"https://mondoweiss.net/contact/",
+			"https://mondoweiss.net/recent-comments/",
+			"https://mondoweiss.net/email-newsletters",
+			"https://mondoweiss.net/author",
+			"https://mondoweiss.net/tag/"
+		],
+		exactExcludeList: [
+			"https://mondoweiss.net",
+			"https://mondoweiss.net/news/",
+			"https://mondoweiss.net/opinion/",
+			"https://mondoweiss.net/ways-to-give/",
+			"https://mondoweiss.net/media-analysis/",
+			"https://mondoweiss.net/culture/",
+			"https://mondoweiss.net/activism/",
+			"https://mondoweiss.net/news-letters/"
+		],
+		scrapResultPath: "./dataset/mondoweiss/website",
+		jsonlOutputPath: "./dataset/mondoweiss/train.jsonl",
+		textOutputPath: "./dataset/mondoweiss/texts",
+		csvOutputPath: "./dataset/mondoweiss/train.csv",
+		includeMetadata: true,
+		maxArticles: 2500,
+		maxRetries: 2,
+		axiosHeaders: headers,
+		axiosProxy: {
+			host: "localhost",
+			port: 2080,
+			protocol: "http"
+		},
+		metadataFields: ["author", "title", "description", "dateScrapedDate"]
+	});
+	if ( enable )
+	{
+		await scraper.start();
+	}
+	return scraper;
+}
 void async function main ()
 {
 	const palianswersScraper = await palianswers( false );
 	const decolonizepalestineScraper = await decolonizepalestine( false );
 	const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( false );
-	const electronicintifadaScraper = await electronicintifada( true );
+	const electronicintifadaScraper = await electronicintifada( false );
+	const standWithPalestineScraper = await standWithPalestine( false );
+	const mondoweisScraper = await mondoweiss( true );
 	const bdsmovementScraper = await bdsmovement( false );
 	const palestinerememberedScraper = await palestineremembered( false );
@@ -221,10 +294,7 @@ void async function main ()
 		decolonizepalestineScraper,
 		khameneiIrFreePalestineTagScraper,
 		electronicintifadaScraper,
-		// bdsmovementScraper,
-		// palestinerememberedScraper,
+		standWithPalestineScraper,
+		mondoweisScraper
 	] );
-	// 7 https://stand-with-palestine.org/blogs
-	// https://mondoweiss.net
-}()
+}()

package/main.js CHANGED

@@ -22,6 +22,7 @@ class WebScraper
 		exactExcludeList = [],
 		filterFileTypes,
 		excludedFileTypes,
+		removeURLFragment,
 		// Output paths
 		scrapResultPath = "./dataset",
@@ -72,6 +73,7 @@ class WebScraper
 		this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
 		this.filterFileTypes = filterFileTypes || true;
 		this.excludedFileTypes = excludedFileTypes || [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"];
+		this.removeURLFragment = removeURLFragment || true;
 		// Network configuration
 		this.axiosHeaders = axiosHeaders;
@@ -130,6 +132,10 @@ class WebScraper
 	async fetchPage ( url, depth )
 	{
+		if ( this.removeURLFragment )
+		{
+			url = url.split( "#" )[0];
+		}
 		if ( this.hasReachedMax( depth ) )
 		{
 			return;
@@ -161,7 +167,6 @@ class WebScraper
 					if ( this.hasValidPageContent( article.textContent ) )
 					{
 						const metadata = this.extractMetadata( url, document );
-						metadata.depth = depth;
 						this.saveArticle( url, article.textContent, metadata );
 					}
 					else
@@ -234,7 +239,7 @@ class WebScraper
 		}
 		catch ( error )
 		{
-			console.error( `Error fetching ${url}:`, error.message );
+			console.error( `Error fetching content ${url}:`, error.message );
 			if ( error.status = 403 && this.usePuppeteer )
 			{
 				try
@@ -262,7 +267,7 @@ class WebScraper
 		}
 	}
-	hasReachedMax ( depth )
+	hasReachedMax ( depth = 0 )
 	{
 		if ( this.allProcessedContent.length >= this.maxArticles || depth > this.maxDepth )
 		{
@@ -555,7 +560,7 @@ class WebScraper
 			ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
 			ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
 			ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
-			dateScraped: new Date().toISOString()
+			dateScrapedDate: new Date().toISOString()
 		};
 	}
@@ -572,7 +577,7 @@ class WebScraper
 		{
 			try
 			{
-				if ( this.hasReachedMax( depth ) )
+				if ( this.hasReachedMax( ) )
 				{
 					throw new Error( "Max reached" );
 				}

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "clean-web-scraper",
-  "version": "3.7.6",
+  "version": "3.8.1",
   "main": "main.js",
   "scripts": {
     "start": "node main.js",