clean-web-scraper 2.3.0 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -44,14 +44,14 @@ npm install
44
44
  const WebScraper = require('clean-web-scraper');
45
45
 
46
46
  const scraper = new WebScraper({
47
- baseURL: 'https://example.com', // Required: The website to scrape
48
- startURL: 'https://example.com/blog', // Optional: Custom starting URL
49
- excludeList: ['/admin', '/private'], // Optional: Paths to exclude
50
- exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
51
- scrapResultPath: './dataset', // Required: Where to save the content
52
- jsonlPath: './dataset/train.jsonl', // Optional: Custom JSONL output path
53
- textOutputPath: "./dataset/texts", // Optional: Custom text output path
54
- csvPath: "./dataset/train.csv" // Optional: Custom CSV output path
47
+ baseURL: 'https://example.com', // Required: The website to scrape
48
+ startURL: 'https://example.com/blog', // Optional: Custom starting URL
49
+ excludeList: ['/admin', '/private'], // Optional: Paths to exclude
50
+ exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
51
+ scrapResultPath: './example.com/website', // Required: Where to save the content
52
+ jsonlPath: './example.com/train.jsonl', // Optional: Custom JSONL output path
53
+ textOutputPath: "./example.com/texts", // Optional: Custom text output path
54
+ csvPath: "./example.com/train.csv" // Optional: Custom CSV output path
55
55
  });
56
56
 
57
57
  scraper.start();
@@ -73,18 +73,18 @@ Your AI-ready content is saved in a clean, structured format:
73
73
  - 📈 CSV output with clean text content
74
74
 
75
75
  ```bash
76
- dataset/
77
- ├── example.com/
76
+ example.com/
77
+ ├── website/
78
78
  │ ├── page1.txt # Clean text content
79
79
  │ ├── page1.json # Full metadata
80
- ├── blog/
81
- ├── post1.txt
82
- │ ├── post1.json
83
- │ ├── texts/ # Numbered text files
84
- ├── 1.txt
85
- ├── 2.txt
86
- │ ├── train.jsonl # Combined content
87
- └── train.csv # Clean text in CSV format
80
+ └── blog/
81
+ ├── post1.txt
82
+ └── post1.json
83
+ ├── texts/ # Numbered text files
84
+ ├── 1.txt
85
+ ├── 2.txt
86
+ ├── train.jsonl # Combined content
87
+ └── train.csv # Clean text in CSV format
88
88
  ```
89
89
 
90
90
  ## 🤖 AI/LLM Training Ready
package/example-usage.js CHANGED
@@ -13,7 +13,7 @@ async function khameneiIrFreePalestineTag ()
13
13
  ],
14
14
  exactExcludeList: [
15
15
  ],
16
- scrapResultPath: "./dataset/khamenei-ir-free-palestine-tag",
16
+ scrapResultPath: "./dataset/khamenei-ir-free-palestine-tag/website",
17
17
  jsonlPath: "./dataset/khamenei-ir-free-palestine-tag/train.jsonl",
18
18
  textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
19
19
  csvPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv"
@@ -39,7 +39,7 @@ async function decolonizepalestine ()
39
39
  "https://decolonizepalestine.com/rainbow-washing",
40
40
  "https://decolonizepalestine.com/"
41
41
  ],
42
- scrapResultPath: "./dataset/decolonizepalestine",
42
+ scrapResultPath: "./dataset/decolonizepalestine/website",
43
43
  jsonlPath: "./dataset/decolonizepalestine/train.jsonl",
44
44
  textOutputPath: "./dataset/decolonizepalestine/texts",
45
45
  csvPath: "./dataset/decolonizepalestine/train.csv"
@@ -49,8 +49,8 @@ async function decolonizepalestine ()
49
49
 
50
50
  void async function main ()
51
51
  {
52
- await khameneiIrFreePalestineTag();
53
- // await decolonizepalestine();
52
+ // await khameneiIrFreePalestineTag();
53
+ await decolonizepalestine();
54
54
 
55
55
 
56
56
  // 3
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "2.3.0",
3
+ "version": "2.3.1",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -20,7 +20,7 @@ class WebScraper
20
20
  {
21
21
  this.baseURL = baseURL;
22
22
  this.startURL = startURL || baseURL;
23
- this.scrapResultPath = path.join( scrapResultPath, baseURL.replace( /^(https?:\/\/)?(www\.)?/, "" ).replace( /\/$/, "" ) );
23
+ this.scrapResultPath = scrapResultPath;
24
24
  this.jsonlPath = jsonlPath || path.join( this.scrapResultPath, "train.jsonl" );
25
25
  this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
26
26
  this.csvPath = csvPath || path.join( this.scrapResultPath, "train.csv" );