clean-web-scraper 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -11,8 +11,8 @@ A powerful Node.js web scraper that extracts clean, readable content from websit
  - 🚫 Excludes unwanted paths from scraping
  - 🔄 Handles relative and absolute URLs like a pro
  - 🎯 No duplicate page visits
- - 📊 Generates JSONL and raw text output file for ML training
- - 📊 AI-friendly clean text output (perfect for LLM fine-tuning!)
+ - 📊 Generates JSONL output file for ML training
+ - 📊 AI-friendly clean text and csv output (perfect for LLM fine-tuning!)

  ## 🛠️ Prerequisites

@@ -48,7 +48,8 @@ const scraper = new WebScraper({
  excludeList: ['/admin', '/private'], // Optional: Paths to exclude
  exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
  jsonlPath: 'output.jsonl', // Optional: Custom JSONL output path
- textOutputPath: "./dataset/texts" // Optional: Custom text output path
+ textOutputPath: "./dataset/texts", // Optional: Custom text output path
+ csvPath: "./dataset/train.csv" // Optional: Custom CSV output path
  });

  scraper.start();
@@ -67,6 +68,7 @@ Your AI-ready content is saved in a clean, structured format:
  - 📝 Pure text format, perfect for LLM training and fine-tuning
  - 🤖 No HTML, no mess - just clean, structured text ready for AI consumption
  - 📊 JSONL output for ML training
+ - 📈 CSV output with clean text content

  ## 🤖 AI/LLM Training Ready

package/example-usage.js CHANGED
@@ -22,6 +22,7 @@ const scraper = new WebScraper({
  excludeList,
  exactExcludeList,
  jsonlPath: "./dataset/train.jsonl",
- textOutputPath: "./dataset/texts"
+ textOutputPath: "./dataset/texts",
+ csvPath: "./dataset/train.csv"
  });
  scraper.start();
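Since start() awaits fetchPage before invoking the JSONL, text, and CSV writers (see the package/src/WebScraper.js hunks below), a caller that wants to run code only after scraping finishes can await it. A minimal sketch, assuming the class is required from ./src/WebScraper (the require line is outside this hunk) and an illustrative baseURL:

const WebScraper = require( "./src/WebScraper" ); // path assumed from the package layout shown in this diff

const scraper = new WebScraper({
	baseURL: "https://example.com", // illustrative target site
	jsonlPath: "./dataset/train.jsonl",
	textOutputPath: "./dataset/texts",
	csvPath: "./dataset/train.csv" // new in 2.2.0
});

( async () =>
{
	await scraper.start(); // awaits fetchPage, then runs createJSONLFile, saveNumberedTextFiles and createCSVFile
	console.log( "Scrape finished; CSV target:", scraper.csvPath );
} )();
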
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "clean-web-scraper",
- "version": "2.1.0",
+ "version": "2.2.0",
  "main": "main.js",
  "scripts": {
  "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -13,13 +13,15 @@ class WebScraper
  exactExcludeList,
  scrapResultPath = "./dataset",
  jsonlPath,
- textOutputPath
+ textOutputPath,
+ csvPath
  })
  {
  this.baseURL = baseURL;
  this.scrapResultPath = path.join( scrapResultPath, baseURL.replace( /^(https?:\/\/)?(www\.)?/, "" ).replace( /\/$/, "" ) );
  this.jsonlPath = jsonlPath || path.join( this.scrapResultPath, "train.jsonl" );
  this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
+ this.csvPath = csvPath || path.join( this.scrapResultPath, "train.csv" );
  this.visited = new Set();
  this.excludeList = new Set( excludeList );
  this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
@@ -33,6 +35,7 @@ class WebScraper
  await this.fetchPage( this.baseURL );
  this.createJSONLFile();
  this.saveNumberedTextFiles();
+ this.createCSVFile();
  console.log( "Scraping completed." );
  }

@@ -152,6 +155,23 @@ class WebScraper
  console.log( `Created JSONL file at: ${this.jsonlPath}` );
  }

+ createCSVFile ()
+ {
+ const writeStream = fs.createWriteStream( path.join( __dirname, this.csvPath ) );
+
+ writeStream.write( "text\n" );
+
+ for ( const content of this.processedContent )
+ {
+ const escapedText = content.text.replace( /"/g, "\"\"" );
+ const csvLine = `"${escapedText}"\n`;
+ writeStream.write( csvLine );
+ }
+
+ writeStream.end();
+ console.log( `Created CSV file at: ${this.csvPath}` );
+ }
+
  saveNumberedTextFiles ()
  {
  this.processedContent.forEach( ( content, index ) =>
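
Two notes on the createCSVFile hunk above. First, the write path is path.join( __dirname, this.csvPath ), so a relative csvPath resolves against the package's src/ directory rather than the process working directory. Second, each text field is wrapped in double quotes with embedded quotes doubled (RFC 4180-style quoting), so a reader must honor that quoting; readTextColumn below is a hypothetical, dependency-free sketch for exactly this one-column shape (a general-purpose CSV parser such as csv-parse is the safer choice for anything else):

const fs = require( "fs" );

// Hypothetical helper: read back the single "text" column that createCSVFile writes.
// It only understands this exact shape: one double-quoted field per record, quotes doubled.
function readTextColumn ( csvFilePath )
{
	const raw = fs.readFileSync( csvFilePath, "utf8" );
	const rows = [];
	let field = "";
	let inQuotes = false;

	for ( let i = 0; i < raw.length; i++ )
	{
		const ch = raw[i];
		if ( inQuotes )
		{
			if ( ch === "\"" )
			{
				if ( raw[i + 1] === "\"" ) { field += "\""; i++; } // doubled quote -> literal quote
				else { inQuotes = false; } // closing quote of the field
			}
			else { field += ch; } // keeps any newlines embedded in the quoted text
		}
		else if ( ch === "\"" ) { inQuotes = true; }
		else if ( ch === "\n" ) { rows.push( field ); field = ""; } // end of record
		// other unquoted characters (the "text" header) are ignored
	}

	return rows.slice( 1 ); // drop the record produced by the header line
}

// Illustrative path; see the __dirname note above for where a relative csvPath actually lands.
console.log( `Loaded ${readTextColumn( "./dataset/train.csv" ).length} text rows` );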