npm - clean-web-scraper - Versions diffs - 4.1.0 → 4.1.2 - Mend

clean-web-scraper 4.1.0 → 4.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/README.md CHANGED Viewed

@@ -48,7 +48,8 @@ const scraper = new WebScraper({
   baseURL: 'https://example.com/news',          // Required: The website base url to scrape
   startURL: 'https://example.com/blog',         // Optional: Custom starting URL
   excludeList: ['/admin', '/private'],          // Optional: Paths to exclude
-  exactExcludeList: ['/specific-page'],         // Optional: Exact URLs to exclude
+  exactExcludeList: ['/specific-page',          // Optional: Exact URLs to exclude
+  /^https:\/\/host\.com\/\d{4}\/$/],            // Optional: Regex patterns to exclude; this one excludes URLs like https://host.com/2023/
   scrapResultPath: './example.com/website',     // Required: Where to save the content
   jsonlOutputPath: './example.com/train.jsonl', // Optional: Custom JSONL output path
   textOutputPath: "./example.com/texts",        // Optional: Custom text output path

package/example-usage.js CHANGED Viewed

@@ -65,6 +65,26 @@ async function khameneiIrFreePalestineTag ( enable )
 	return await runScraper( config, enable );
 }
+async function khameneiIrPalestineSpecialPage ( enable )
+{
+	// https://english.khamenei.ir/palestine-special-page/
+	const config = {
+		baseURL: "https://english.khamenei.ir/palestine-special-page/",
+		maxDepth: 2,
+		exactExcludeList: [
+			"https://english.khamenei.ir/palestine-special-page/"
+		],
+		scrapResultPath: "./dataset/khamenei-ir-palestine-special-page/website",
+		jsonlOutputPath: "./dataset/khamenei-ir-palestine-special-page/train.jsonl",
+		textOutputPath: "./dataset/khamenei-ir-palestine-special-page/texts",
+		csvOutputPath: "./dataset/khamenei-ir-palestine-special-page/train.csv",
+		includeMetadata: true,
+		metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
+		axiosRetryDelay: 10000
+	};
+	return await runScraper( config, enable );
+}
 async function decolonizepalestine ( enable )
 {
 	const config = {
@@ -187,26 +207,8 @@ async function mondoweiss ( enable )
 			"https://mondoweiss.net/activism/",
 			"https://mondoweiss.net/news-letters/",
 			"https://mondoweiss.net/newsletters",
-			"https://mondoweiss.net/2006/",
-			"https://mondoweiss.net/2007/",
-			"https://mondoweiss.net/2008/",
-			"https://mondoweiss.net/2009/",
-			"https://mondoweiss.net/2010/",
-			"https://mondoweiss.net/2011/",
-			"https://mondoweiss.net/2012/",
-			"https://mondoweiss.net/2013/",
-			"https://mondoweiss.net/2014/",
-			"https://mondoweiss.net/2015/",
-			"https://mondoweiss.net/2016/",
-			"https://mondoweiss.net/2017/",
-			"https://mondoweiss.net/2018/",
-			"https://mondoweiss.net/2019/",
-			"https://mondoweiss.net/2020/",
-			"https://mondoweiss.net/2021/",
-			"https://mondoweiss.net/2022/",
-			"https://mondoweiss.net/2023/",
-			"https://mondoweiss.net/2024/",
-			"https://mondoweiss.net/2025/",
+			/^https:\/\/mondoweiss\.net\/\d{4}\/\d{2}$/,
+			/^https:\/\/mondoweiss\.net\/\d{4}\/$/,
 			"https://mondoweiss.net/daily-headlines",
 			"https://mondoweiss.net/palestineletter",
 			"https://mondoweiss.net/podcasts/",
@@ -306,6 +308,7 @@ void async function main ()
 	const palianswersScraper = await palianswers( false );
 	const decolonizepalestineScraper = await decolonizepalestine( false );
 	const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( false );
+	const khameneiIrPalestineSpecialPageScraper = await khameneiIrPalestineSpecialPage( false );
 	const electronicintifadaScraper = await electronicintifada( false );
 	const standWithPalestineScraper = await standWithPalestine( false );
 	const mondoweisScraper = await mondoweiss( true );
@@ -316,8 +319,10 @@ void async function main ()
 		palianswersScraper,
 		decolonizepalestineScraper,
 		khameneiIrFreePalestineTagScraper,
+		khameneiIrPalestineSpecialPageScraper,
 		electronicintifadaScraper,
 		standWithPalestineScraper,
 		mondoweisScraper
 	] );
+	// QLoRA = LoRA with 4-bit quantization
 }();

package/main.js CHANGED Viewed

@@ -115,6 +115,7 @@ class WebScraper
 		{
 			return;
 		}
+		let originalUrl = url;
 		if ( this.removeURLFragment )
 		{
 			url = url.split( "#" )[0];
@@ -125,6 +126,7 @@ class WebScraper
 			return;
 		}
 		this.visited.add( url );
+		this.visited.add( originalUrl );
 		if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
 		{
 			return;
@@ -150,7 +152,7 @@ class WebScraper
 				{
 					if ( this.hasValidPageContent( article.textContent ) )
 					{
-						const metadata = this.extractMetadata( url, document );
+						const metadata = this.extractMetadata( url, document, data );
  						metadata.articleTitle = article.title || "";
 						this.saveArticle( url, article.textContent, metadata );
 					}
@@ -164,6 +166,10 @@ class WebScraper
 					console.error( `No readable content found at ${url}` );
 				}
 			}
+			else
+			{
+				console.log( `Skipping excluded URL: ${url}` );
+			}
 			const links = this.extractLinks( data );
 			const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
 			for ( const link of unvisitedLinks )
@@ -334,6 +340,7 @@ class WebScraper
 		fs.mkdirSync( dir, { recursive: true });
 		fs.writeFileSync( `${filePath}.txt`, processedContent, "utf-8" );
 		fs.writeFileSync( `${filePath}.json`, JSON.stringify( metadata, null, 2 ), "utf-8" );
+		fs.writeFileSync( `${filePath}.html`, metadata.originalHtml, "utf-8" );
 		console.log( `Saved: ${filePath}.txt` );
 		console.log( `Saved: ${filePath}.json` );
 	}
@@ -503,7 +510,7 @@ class WebScraper
 		return filteredMetadata;
 	}
-	extractMetadata ( url, document )
+	extractMetadata ( url, document, html )
 	{
 		return {
 			url,
@@ -517,7 +524,8 @@ class WebScraper
 			ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
 			ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
 			ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
-			dateScrapedDate: new Date().toISOString()
+			dateScrapedDate: new Date().toISOString(),
+			originalHtml: html,
 		};
 	}
@@ -578,25 +586,36 @@ class WebScraper
 	normalizeExcludeList ( list = [] )
 	{
 		const normalizedSet = new Set();
-		for ( let i = 0; i < list.length; i++ )
+		for ( const item of list )
 		{
-			const item = list[i];
-			if ( item.endsWith( "/" ) )
-			{
-				normalizedSet.add( item.slice( 0, -1 ) );
-			}
-			else
+			if ( item instanceof RegExp )
 			{
 				normalizedSet.add( item );
+				continue;
 			}
-			normalizedSet.add( `${item.endsWith( "/" ) ? item : `${item }/`}` );
+			const withSlash = item.endsWith( "/" ) ? item : `${item }/`;
+			const withoutSlash = item.endsWith( "/" ) ? item.slice( 0, -1 ) : item;
+			normalizedSet.add( withSlash );
+			normalizedSet.add( withoutSlash );
 		}
 		return normalizedSet;
 	}
 	isExcluded ( url )
 	{
-		if ( this.exactExcludeList.has( url ) )
+		if ( Array.from( this.exactExcludeList ).some( excluded =>
+		{
+			if ( excluded instanceof RegExp )
+			{
+				return excluded.test( url );
+			}
+			return url === excluded;
+		}) )
 		{
 			return true;
 		}

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "clean-web-scraper",
-  "version": "4.1.0",
+  "version": "4.1.2",
   "main": "main.js",
   "scripts": {
     "start": "node main.js",