npm - clean-web-scraper - Versions diffs - 2.3.0 → 2.3.1 - Mend

clean-web-scraper 2.3.0 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/README.md CHANGED Viewed

@@ -44,14 +44,14 @@ npm install
 const WebScraper = require('clean-web-scraper');
 const scraper = new WebScraper({
-  baseURL: 'https://example.com',       // Required: The website to scrape
-  startURL: 'https://example.com/blog', // Optional: Custom starting URL
-  excludeList: ['/admin', '/private'],  // Optional: Paths to exclude
-  exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
-  scrapResultPath: './dataset',         // Required: Where to save the content
-  jsonlPath: './dataset/train.jsonl',   // Optional: Custom JSONL output path
-  textOutputPath: "./dataset/texts",    // Optional: Custom text output path
-  csvPath: "./dataset/train.csv"        // Optional: Custom CSV output path
+  baseURL: 'https://example.com',           // Required: The website to scrape
+  startURL: 'https://example.com/blog',     // Optional: Custom starting URL
+  excludeList: ['/admin', '/private'],      // Optional: Paths to exclude
+  exactExcludeList: ['/specific-page'],     // Optional: Exact URLs to exclude
+  scrapResultPath: './example.com/website', // Required: Where to save the content
+  jsonlPath: './example.com/train.jsonl',   // Optional: Custom JSONL output path
+  textOutputPath: "./example.com/texts",    // Optional: Custom text output path
+  csvPath: "./example.com/train.csv"        // Optional: Custom CSV output path
 });
 scraper.start();
@@ -73,18 +73,18 @@ Your AI-ready content is saved in a clean, structured format:
 - 📈 CSV output with clean text content
 ```bash
-dataset/
-├── example.com/
+example.com/
+├── website/
 │   ├── page1.txt         # Clean text content
 │   ├── page1.json        # Full metadata
-│   ├── blog/
-│   │   ├── post1.txt
-│   │   ├── post1.json
-│   ├── texts/           # Numbered text files
-│   │   ├── 1.txt
-│   │   ├── 2.txt
-│   ├── train.jsonl      # Combined content
-│   └── train.csv        # Clean text in CSV format
+│   └── blog/
+│       ├── post1.txt
+│       └── post1.json
+│── texts/           # Numbered text files
+│       ├── 1.txt
+│       ├── 2.txt
+│── train.jsonl      # Combined content
+└── train.csv        # Clean text in CSV format
 ```
 ## 🤖 AI/LLM Training Ready

package/example-usage.js CHANGED Viewed

@@ -13,7 +13,7 @@ async function khameneiIrFreePalestineTag ()
 		],
 		exactExcludeList: [
 		],
-		scrapResultPath: "./dataset/khamenei-ir-free-palestine-tag",
+		scrapResultPath: "./dataset/khamenei-ir-free-palestine-tag/website",
 		jsonlPath: "./dataset/khamenei-ir-free-palestine-tag/train.jsonl",
 		textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
 		csvPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv"
@@ -39,7 +39,7 @@ async function decolonizepalestine ()
 			"https://decolonizepalestine.com/rainbow-washing",
 			"https://decolonizepalestine.com/"
 		],
-		scrapResultPath: "./dataset/decolonizepalestine",
+		scrapResultPath: "./dataset/decolonizepalestine/website",
 		jsonlPath: "./dataset/decolonizepalestine/train.jsonl",
 		textOutputPath: "./dataset/decolonizepalestine/texts",
 		csvPath: "./dataset/decolonizepalestine/train.csv"
@@ -49,8 +49,8 @@ async function decolonizepalestine ()
 void async function main ()
 {
-	await khameneiIrFreePalestineTag();
-	// await decolonizepalestine();
+	// await khameneiIrFreePalestineTag();
+	await decolonizepalestine();
 	// 3

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "clean-web-scraper",
-  "version": "2.3.0",
+  "version": "2.3.1",
   "main": "main.js",
   "scripts": {
     "start": "node main.js",

package/src/WebScraper.js CHANGED Viewed

@@ -20,7 +20,7 @@ class WebScraper
 	{
 		this.baseURL = baseURL;
 		this.startURL = startURL || baseURL;
-		this.scrapResultPath = path.join( scrapResultPath, baseURL.replace( /^(https?:\/\/)?(www\.)?/, "" ).replace( /\/$/, "" ) );
+		this.scrapResultPath = scrapResultPath;
 		this.jsonlPath = jsonlPath || path.join( this.scrapResultPath, "train.jsonl" );
 		this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
 		this.csvPath = csvPath || path.join( this.scrapResultPath, "train.csv" );