npm - clean-web-scraper - Versions diffs - 3.8.0 → 3.8.2 - Mend

clean-web-scraper 3.8.0 → 3.8.2

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/example-usage.js CHANGED Viewed

@@ -29,7 +29,8 @@ async function palianswers ( enable )
 		textOutputPath: "./dataset/palianswers/texts",
 		csvOutputPath: "./dataset/palianswers/train.csv",
 		includeMetadata: true,
-		metadataFields: ["author", "title", "description", "dateScrapedDate"]
+		metadataFields: ["author", "title", "description", "dateScrapedDate"],
+		retryDelay: 10000
 	});
 	if ( enable )
 	{
@@ -54,7 +55,8 @@ async function khameneiIrFreePalestineTag ( enable )
 		textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
 		csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
 		includeMetadata: true,
-		metadataFields: ["author", "title", "description", "dateScrapedDate"]
+		metadataFields: ["author", "title", "description", "dateScrapedDate"],
+		retryDelay: 10000
 	});
 	if ( enable )
 	{
@@ -84,40 +86,8 @@ async function decolonizepalestine ( enable )
 		textOutputPath: "./dataset/decolonizepalestine/texts",
 		csvOutputPath: "./dataset/decolonizepalestine/train.csv",
 		includeMetadata: true,
-		metadataFields: ["author", "title", "description", "dateScrapedDate"]
-	});
-	if ( enable )
-	{
-		await scraper.start();
-	}
-	return scraper;
-}
-async function bdsmovement ( enable )
-{
-	// https://bdsmovement.net
-	const scraper = new WebScraper({
-		baseURL: "https://bdsmovement.net",
-		excludeList: [
-			"https://bdsmovement.net/press-area",
-			"https://bdsmovement.net/privacy-policy",
-			"https://bdsmovement.net/get-involved/join-a-bds-campaign",
-			"https://bdsmovement.net/donate_",
-			"https://bdsmovement.net/user",
-			"https://bdsmovement.net/admin"
-		],
-		scrapResultPath: "./dataset/bdsmovement/website",
-		jsonlOutputPath: "./dataset/bdsmovement/train.jsonl",
-		textOutputPath: "./dataset/bdsmovement/texts",
-		csvOutputPath: "./dataset/bdsmovement/train.csv",
-		includeMetadata: true,
 		metadataFields: ["author", "title", "description", "dateScrapedDate"],
-		puppeteerProxy: "socks5://127.0.0.1:2080",
-		puppeteerExecutablePath: "/usr/bin/chromium",
-		puppeteerRealProxy: {
-			host: "socks5://127.0.0.1",
-			port: "2080",
-		},
+		retryDelay: 10000
 	});
 	if ( enable )
 	{
@@ -155,53 +125,11 @@ async function electronicintifada ( enable )
 		textOutputPath: "./dataset/electronicintifada/texts",
 		csvOutputPath: "./dataset/electronicintifada/train.csv",
 		includeMetadata: true,
+		metadataFields: ["author", "title", "description", "dateScrapedDate"],
+		maxDepth: 10,
 		maxArticles: 2000,
 		axiosHeaders: headers,
-		metadataFields: ["author", "title", "description", "dateScrapedDate"]
-	});
-	if ( enable )
-	{
-		await scraper.start();
-	}
-	return scraper;
-}
-async function palestineremembered ( enable )
-{
-	// https://www.palestineremembered.com
-	const scraper = new WebScraper({
-		baseURL: "https://www.palestineremembered.com",
-		startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
-		excludeList: [
-			"https://www.palestineremembered.com/GeoPoints",
-			"https://www.palestineremembered.com/Donate",
-			"https://www.palestineremembered.com/ContactUs.html",
-			"https://www.palestineremembered.com/tags/Looting-Palestinian-properties.html",
-			"https://www.palestineremembered.com/ar/",
-			"https://www.palestineremembered.com/OldNewPictures.html",
-			"https://www.palestineremembered.com/Maps/index.html",
-			"https://www.palestineremembered.com/OralHistory/Interviews-Listing/",
-			"https://www.palestineremembered.com/Acre/Famous-Zionist-Quotes/Story637.html",
-			"https://www.palestineremembered.com/Articles/General/Story2045.html",
-			"https://www.palestineremembered.com/AllTownsListing.html",
-			"https://www.palestineremembered.com/Articles/General/ar/",
-			"https://www.palestineremembered.com/SiteVideos.html"
-		],
-		exactExcludeList: [
-			"https://www.palestineremembered.com/index.html",
-			"https://www.palestineremembered.com/ZionistFAQ.html"
-		],
-		scrapResultPath: "./dataset/palestineremembered/website",
-		jsonlOutputPath: "./dataset/palestineremembered/train.jsonl",
-		textOutputPath: "./dataset/palestineremembered/texts",
-		csvOutputPath: "./dataset/palestineremembered/train.csv",
-		includeMetadata: true,
-		metadataFields: ["author", "title", "description", "dateScrapedDate"],
-		axiosProxy: {
-			host: "localhost",
-			port: 2080,
-			protocol: "http"
-		}
+		retryDelay: 10000
 	});
 	if ( enable )
 	{
@@ -240,15 +168,36 @@ async function mondoweiss ( enable )
 			"https://mondoweiss.net/donate",
 			"https://mondoweiss.net/advertise/",
 			"https://mondoweiss.net/contact/",
-			"https://mondoweiss.net/recent-comments/"
+			"https://mondoweiss.net/recent-comments/",
+			"https://mondoweiss.net/email-newsletters",
+			"https://mondoweiss.net/author",
+			"https://mondoweiss.net/tag/"
+		],
+		exactExcludeList: [
+			"https://mondoweiss.net",
+			"https://mondoweiss.net/news/",
+			"https://mondoweiss.net/opinion/",
+			"https://mondoweiss.net/ways-to-give/",
+			"https://mondoweiss.net/media-analysis/",
+			"https://mondoweiss.net/culture/",
+			"https://mondoweiss.net/activism/",
+			"https://mondoweiss.net/news-letters/"
 		],
 		scrapResultPath: "./dataset/mondoweiss/website",
 		jsonlOutputPath: "./dataset/mondoweiss/train.jsonl",
 		textOutputPath: "./dataset/mondoweiss/texts",
 		csvOutputPath: "./dataset/mondoweiss/train.csv",
-		includeMetadata: true,
 		maxArticles: 2500,
+		maxRetries: 2,
 		axiosHeaders: headers,
+		axiosProxy: {
+			host: "localhost",
+			port: 2080,
+			protocol: "http"
+		},
+		maxDepth: 10,
+		retryDelay: 10000,
+		includeMetadata: true,
 		metadataFields: ["author", "title", "description", "dateScrapedDate"]
 	});
 	if ( enable )
@@ -258,14 +207,90 @@ async function mondoweiss ( enable )
 	return scraper;
 }
+async function bdsmovement ( enable )
+{
+	// https://bdsmovement.net
+	const scraper = new WebScraper({
+		baseURL: "https://bdsmovement.net",
+		excludeList: [
+			"https://bdsmovement.net/press-area",
+			"https://bdsmovement.net/privacy-policy",
+			"https://bdsmovement.net/get-involved/join-a-bds-campaign",
+			"https://bdsmovement.net/donate_",
+			"https://bdsmovement.net/user",
+			"https://bdsmovement.net/admin"
+		],
+		scrapResultPath: "./dataset/bdsmovement/website",
+		jsonlOutputPath: "./dataset/bdsmovement/train.jsonl",
+		textOutputPath: "./dataset/bdsmovement/texts",
+		csvOutputPath: "./dataset/bdsmovement/train.csv",
+		includeMetadata: true,
+		metadataFields: ["author", "title", "description", "dateScrapedDate"],
+		puppeteerProxy: "socks5://127.0.0.1:2080",
+		puppeteerExecutablePath: "/usr/bin/chromium",
+		puppeteerRealProxy: {
+			host: "socks5://127.0.0.1",
+			port: "2080",
+		},
+	});
+	if ( enable )
+	{
+		await scraper.start();
+	}
+	return scraper;
+}
+async function palestineremembered ( enable )
+{
+	// https://www.palestineremembered.com
+	const scraper = new WebScraper({
+		baseURL: "https://www.palestineremembered.com",
+		startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
+		excludeList: [
+			"https://www.palestineremembered.com/GeoPoints",
+			"https://www.palestineremembered.com/Donate",
+			"https://www.palestineremembered.com/ContactUs.html",
+			"https://www.palestineremembered.com/tags/Looting-Palestinian-properties.html",
+			"https://www.palestineremembered.com/ar/",
+			"https://www.palestineremembered.com/OldNewPictures.html",
+			"https://www.palestineremembered.com/Maps/index.html",
+			"https://www.palestineremembered.com/OralHistory/Interviews-Listing/",
+			"https://www.palestineremembered.com/Acre/Famous-Zionist-Quotes/Story637.html",
+			"https://www.palestineremembered.com/Articles/General/Story2045.html",
+			"https://www.palestineremembered.com/AllTownsListing.html",
+			"https://www.palestineremembered.com/Articles/General/ar/",
+			"https://www.palestineremembered.com/SiteVideos.html"
+		],
+		exactExcludeList: [
+			"https://www.palestineremembered.com/index.html",
+			"https://www.palestineremembered.com/ZionistFAQ.html"
+		],
+		scrapResultPath: "./dataset/palestineremembered/website",
+		jsonlOutputPath: "./dataset/palestineremembered/train.jsonl",
+		textOutputPath: "./dataset/palestineremembered/texts",
+		csvOutputPath: "./dataset/palestineremembered/train.csv",
+		includeMetadata: true,
+		metadataFields: ["author", "title", "description", "dateScrapedDate"],
+		axiosProxy: {
+			host: "localhost",
+			port: 2080,
+			protocol: "http"
+		}
+	});
+	if ( enable )
+	{
+		await scraper.start();
+	}
+	return scraper;
+}
 void async function main ()
 {
-	const palianswersScraper = await palianswers( false );
-	const decolonizepalestineScraper = await decolonizepalestine( false );
-	const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( false );
-	const electronicintifadaScraper = await electronicintifada( false );
-	const standWithPalestineScraper = await standWithPalestine( false );
+	const palianswersScraper = await palianswers( true );
+	const decolonizepalestineScraper = await decolonizepalestine( true );
+	const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( true );
+	const electronicintifadaScraper = await electronicintifada( true );
+	const standWithPalestineScraper = await standWithPalestine( true );
 	const mondoweisScraper = await mondoweiss( true );
 	const bdsmovementScraper = await bdsmovement( false );
 	const palestinerememberedScraper = await palestineremembered( false );
@@ -278,7 +303,4 @@ void async function main ()
 		standWithPalestineScraper,
 		mondoweisScraper
 	] );
-}()
-// https://mondoweiss.net
+}()

package/main.js CHANGED Viewed

@@ -16,12 +16,14 @@ class WebScraper
 		maxArticles,
 		concurrencyLimit,
 		maxRetries,
+		retryDelay,
 		// URL filtering
 		excludeList = [],
 		exactExcludeList = [],
 		filterFileTypes,
 		excludedFileTypes,
+		removeURLFragment,
 		// Output paths
 		scrapResultPath = "./dataset",
@@ -52,6 +54,7 @@ class WebScraper
 		this.maxArticles = maxArticles || Infinity;
 		this.concurrencyLimit = concurrencyLimit || 2;
 		this.maxRetries = maxRetries || 5;
+		this.retryDelay = retryDelay || 40000;
 		// Output paths setup
 		this.scrapResultPath = scrapResultPath;
@@ -72,6 +75,7 @@ class WebScraper
 		this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
 		this.filterFileTypes = filterFileTypes || true;
 		this.excludedFileTypes = excludedFileTypes || [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"];
+		this.removeURLFragment = removeURLFragment || true;
 		// Network configuration
 		this.axiosHeaders = axiosHeaders;
@@ -130,6 +134,10 @@ class WebScraper
 	async fetchPage ( url, depth )
 	{
+		if ( this.removeURLFragment )
+		{
+			url = url.split( "#" )[0];
+		}
 		if ( this.hasReachedMax( depth ) )
 		{
 			return;
@@ -233,7 +241,7 @@ class WebScraper
 		}
 		catch ( error )
 		{
-			console.error( `Error fetching ${url}:`, error.message );
+			console.error( `Error fetching content ${url}:`, error.message );
 			if ( error.status = 403 && this.usePuppeteer )
 			{
 				try
@@ -351,6 +359,10 @@ class WebScraper
 		{
 			urlPath = "/index";
 		}
+		else if ( urlPath.endsWith( "/" ) )
+		{
+			urlPath = urlPath.slice( 0, -1 );
+		}
 		const filePath = path.join( __dirname, this.scrapResultPath, urlPath );
 		const dir = path.dirname( filePath );
@@ -580,7 +592,7 @@ class WebScraper
 			catch ( error )
 			{
 				if ( attempt >= this.maxRetries ) throw error;
-				await WebScraper.sleep( 40000 * attempt );
+				await WebScraper.sleep( this.retryDelay * attempt );
 				console.error( `Retrying request to ${url} (Attempt ${attempt + 1}/${this.maxRetries})`, error.message, error.code );
 			}
 		}

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "clean-web-scraper",
-  "version": "3.8.0",
+  "version": "3.8.2",
   "main": "main.js",
   "scripts": {
     "start": "node main.js",