clean-web-scraper 2.1.0 → 2.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -11,8 +11,8 @@ A powerful Node.js web scraper that extracts clean, readable content from websites
  - 🚫 Excludes unwanted paths from scraping
  - 🔄 Handles relative and absolute URLs like a pro
  - 🎯 No duplicate page visits
- - 📊 Generates JSONL and raw text output file for ML training
- - 📊 AI-friendly clean text output (perfect for LLM fine-tuning!)
+ - 📊 Generates JSONL output file for ML training
+ - 📊 AI-friendly clean text and csv output (perfect for LLM fine-tuning!)
  
  ## 🛠️ Prerequisites
  
@@ -44,11 +44,12 @@ const WebScraper = require('clean-web-scraper');
  
  const scraper = new WebScraper({
    baseURL: 'https://example.com', // Required: The website to scrape
-   scrapResultPath: './output', // Required: Where to save the content
    excludeList: ['/admin', '/private'], // Optional: Paths to exclude
    exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
-   jsonlPath: 'output.jsonl', // Optional: Custom JSONL output path
-   textOutputPath: "./dataset/texts" // Optional: Custom text output path
+   scrapResultPath: './dataset', // Required: Where to save the content
+   jsonlPath: './dataset/train.jsonl', // Optional: Custom JSONL output path
+   textOutputPath: "./dataset/texts", // Optional: Custom text output path
+   csvPath: "./dataset/train.csv" // Optional: Custom CSV output path
  });
  
  scraper.start();
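For quick reference, here is the updated option set from this hunk gathered into one runnable snippet. It is only a sketch: the URL and file paths are the placeholders used in the README example, and `csvPath` is the option introduced in this release.

```js
// Sketch based on the README hunk above; the URL and paths are placeholders.
const WebScraper = require( "clean-web-scraper" );

const scraper = new WebScraper({
  baseURL: "https://example.com",         // required: site to crawl
  scrapResultPath: "./dataset",           // required: per-page .json/.txt files go here
  excludeList: [ "/admin", "/private" ],  // optional: path prefixes to skip
  exactExcludeList: [ "/specific-page" ], // optional: exact URLs to skip
  jsonlPath: "./dataset/train.jsonl",     // optional: combined JSONL output
  textOutputPath: "./dataset/texts",      // optional: numbered .txt files
  csvPath: "./dataset/train.csv"          // optional: CSV output, new in 2.2.x
});

scraper.start();
```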
@@ -67,6 +68,31 @@ Your AI-ready content is saved in a clean, structured format:
  - 📝 Pure text format, perfect for LLM training and fine-tuning
  - 🤖 No HTML, no mess - just clean, structured text ready for AI consumption
  - 📊 JSONL output for ML training
+ - 📈 CSV output with clean text content
+ 
+ ```bash
+ dataset/
+ ├── decolonizepalestine.com
+ │   ├── faq.json
+ │   ├── faq.txt
+ │   ├── intro
+ │   │   ├── bds-101.json
+ │   │   ├── bds-101.txt
+ │   ├── myth
+ │   │   ├── a-land-without-a-people-for-a-people-without-a-land.json
+ │   │   ├── a-land-without-a-people-for-a-people-without-a-land.txt
+ │   └── rainbow-washing
+ │       ├── bluewashing.json
+ │       ├── bluewashing.txt
+ ├── texts
+ │   ├── 1.txt
+ │   ├── 2.txt
+ │   ├── 3.txt
+ │   ├── 4.txt
+ │   └── 5.txt
+ ├── train.csv
+ └── train.jsonl
+ ```
  
  ## 🤖 AI/LLM Training Ready
  
package/example-usage.js CHANGED
@@ -22,6 +22,7 @@ const scraper = new WebScraper({
    excludeList,
    exactExcludeList,
    jsonlPath: "./dataset/train.jsonl",
-   textOutputPath: "./dataset/texts"
+   textOutputPath: "./dataset/texts",
+   csvPath: "./dataset/train.csv"
  });
  scraper.start();
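Once a run like example-usage.js finishes, the `train.jsonl` it configures can be streamed back line by line. A minimal sketch, assuming each JSONL line is a standalone JSON object with a `text` field (the CSV header in this diff suggests that shape, but `createJSONLFile` itself is not shown here):

```js
// Sketch: stream ./dataset/train.jsonl and parse one JSON record per line.
// Assumption: each record carries a `text` field, matching the CSV column in this diff.
const fs = require( "fs" );
const readline = require( "readline" );

async function readJsonl ( jsonlPath )
{
  const rl = readline.createInterface({ input: fs.createReadStream( jsonlPath ) });
  const records = [];
  for await ( const line of rl )
  {
    if ( line.trim() ) records.push( JSON.parse( line ) );
  }
  return records;
}

readJsonl( "./dataset/train.jsonl" )
  .then( ( records ) => console.log( `Loaded ${records.length} records` ) );
```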
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "clean-web-scraper",
-   "version": "2.1.0",
+   "version": "2.2.1",
    "main": "main.js",
    "scripts": {
      "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -13,13 +13,15 @@ class WebScraper
      exactExcludeList,
      scrapResultPath = "./dataset",
      jsonlPath,
-     textOutputPath
+     textOutputPath,
+     csvPath
    })
    {
      this.baseURL = baseURL;
      this.scrapResultPath = path.join( scrapResultPath, baseURL.replace( /^(https?:\/\/)?(www\.)?/, "" ).replace( /\/$/, "" ) );
      this.jsonlPath = jsonlPath || path.join( this.scrapResultPath, "train.jsonl" );
      this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
+     this.csvPath = csvPath || path.join( this.scrapResultPath, "train.csv" );
      this.visited = new Set();
      this.excludeList = new Set( excludeList );
      this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
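When `csvPath` is omitted, the default above is derived from `scrapResultPath` plus the host part of `baseURL`, with the protocol and any leading `www.` stripped by the regex in this hunk. A small illustration of where `train.csv` lands in that case (the URL is a placeholder):

```js
// Mirrors the default-path logic from the constructor hunk above (placeholder URL).
const path = require( "path" );

const baseURL = "https://www.example.com/";
const scrapResultPath = "./dataset";

const host = baseURL.replace( /^(https?:\/\/)?(www\.)?/, "" ).replace( /\/$/, "" );
const scrapResult = path.join( scrapResultPath, host );    // dataset/example.com
const defaultCsv = path.join( scrapResult, "train.csv" );  // dataset/example.com/train.csv

console.log( defaultCsv );
```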
@@ -33,6 +35,7 @@ class WebScraper
      await this.fetchPage( this.baseURL );
      this.createJSONLFile();
      this.saveNumberedTextFiles();
+     this.createCSVFile();
      console.log( "Scraping completed." );
    }
  
@@ -152,6 +155,23 @@ class WebScraper
      console.log( `Created JSONL file at: ${this.jsonlPath}` );
    }
  
+   createCSVFile ()
+   {
+     const writeStream = fs.createWriteStream( path.join( __dirname, this.csvPath ) );
+ 
+     writeStream.write( "text\n" );
+ 
+     for ( const content of this.processedContent )
+     {
+       const escapedText = content.text.replace( /"/g, "\"\"" );
+       const csvLine = `"${escapedText}"\n`;
+       writeStream.write( csvLine );
+     }
+ 
+     writeStream.end();
+     console.log( `Created CSV file at: ${this.csvPath}` );
+   }
+ 
    saveNumberedTextFiles ()
    {
      this.processedContent.forEach( ( content, index ) =>
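`createCSVFile` wraps each text field in double quotes and doubles any embedded quotes, the standard CSV way to keep commas, quotes, and newlines inside a single cell; note that the write path is joined onto `__dirname`, so a relative `csvPath` resolves against the package's `src` directory rather than the working directory. A standalone sketch of the same escaping rule (the helper name is hypothetical, not part of the package):

```js
// Hypothetical helper mirroring the quoting used in createCSVFile above.
function toCsvField ( text )
{
  // Double embedded quotes, then wrap the whole field in quotes so that
  // commas and newlines stay inside one CSV cell.
  return `"${text.replace( /"/g, "\"\"" )}"`;
}

console.log( toCsvField( 'She said "hi", then left.' ) );
// Prints: "She said ""hi"", then left."
```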