clean-web-scraper 2.0.5 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -12,7 +12,7 @@ A powerful Node.js web scraper that extracts clean, readable content from websit
12
12
  - 🔄 Handles relative and absolute URLs like a pro
13
13
  - 🎯 No duplicate page visits
14
14
  - 📊 Generates JSONL output file for ML training
15
- - 📊 AI-friendly clean text output (perfect for LLM fine-tuning!)
15
+ - 📊 AI-friendly clean text and CSV output (perfect for LLM fine-tuning!)
16
16
 
17
17
  ## 🛠️ Prerequisites
18
18
 
@@ -43,11 +43,13 @@ npm install
43
43
  const WebScraper = require('clean-web-scraper');
44
44
 
45
45
  const scraper = new WebScraper({
46
- baseURL: 'https://example.com', // Required: The website to scrape
47
- folderPath: './output', // Required: Where to save the content
48
- excludeList: ['/admin', '/private'], // Optional: Paths to exclude
49
- exactExcludeList: ['/specific-page'],// Optional: Exact URLs to exclude
50
- jsonlPath: 'output.jsonl' // Optional: Custom JSONL output path
46
+ baseURL: 'https://example.com', // Required: The website to scrape
47
+ scrapResultPath: './output', // Required: Where to save the content
48
+ excludeList: ['/admin', '/private'], // Optional: Paths to exclude
49
+ exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
50
+ jsonlPath: 'output.jsonl', // Optional: Custom JSONL output path
51
+ textOutputPath: "./dataset/texts", // Optional: Custom text output path
52
+ csvPath: "./dataset/train.csv" // Optional: Custom CSV output path
51
53
  });
52
54
 
53
55
  scraper.start();
@@ -66,6 +68,7 @@ Your AI-ready content is saved in a clean, structured format:
66
68
  - 📝 Pure text format, perfect for LLM training and fine-tuning
67
69
  - 🤖 No HTML, no mess - just clean, structured text ready for AI consumption
68
70
  - 📊 JSONL output for ML training
71
+ - 📈 CSV output with clean text content
69
72
 
70
73
  ## 🤖 AI/LLM Training Ready
71
74
 
package/example-usage.js CHANGED
@@ -1,7 +1,8 @@
1
1
  const WebScraper = require( "./src/WebScraper" );
2
2
 
3
+ // Configuration
3
4
  const baseURL = "https://decolonizepalestine.com";
4
- const folderPath = "./dataset";
5
+ const scrapResultPath = "./dataset";
5
6
  const excludeList = [
6
7
  "https://decolonizepalestine.com/cdn-cgi",
7
8
  "https://decolonizepalestine.com/introduction-to-palestine",
@@ -14,12 +15,14 @@ const exactExcludeList = [
14
15
  "https://decolonizepalestine.com/"
15
16
  ]
16
17
 
17
-
18
+ // Initialize scraper with all available options
18
19
  const scraper = new WebScraper({
19
20
  baseURL,
20
- folderPath,
21
+ scrapResultPath,
21
22
  excludeList,
22
23
  exactExcludeList,
23
- jsonlPath: "./dataset/final.jsonl"
24
+ jsonlPath: "./dataset/train.jsonl",
25
+ textOutputPath: "./dataset/texts",
26
+ csvPath: "./dataset/train.csv"
24
27
  });
25
28
  scraper.start();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "2.0.5",
3
+ "version": "2.2.0",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -7,11 +7,21 @@ const path = require( "path" );
7
7
 
8
8
  class WebScraper
9
9
  {
10
- constructor ({ baseURL, folderPath, excludeList, exactExcludeList, jsonlPath })
10
+ constructor ({
11
+ baseURL,
12
+ excludeList,
13
+ exactExcludeList,
14
+ scrapResultPath = "./dataset",
15
+ jsonlPath,
16
+ textOutputPath,
17
+ csvPath
18
+ })
11
19
  {
12
20
  this.baseURL = baseURL;
13
- this.jsonlPath = jsonlPath || "output.jsonl";
14
- this.folderPath = path.join( folderPath, baseURL.replace( /^(https?:\/\/)?(www\.)?/, "" ).replace( /\/$/, "" ) );
21
+ this.scrapResultPath = path.join( scrapResultPath, baseURL.replace( /^(https?:\/\/)?(www\.)?/, "" ).replace( /\/$/, "" ) );
22
+ this.jsonlPath = jsonlPath || path.join( this.scrapResultPath, "train.jsonl" );
23
+ this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
24
+ this.csvPath = csvPath || path.join( this.scrapResultPath, "train.csv" );
15
25
  this.visited = new Set();
16
26
  this.excludeList = new Set( excludeList );
17
27
  this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
@@ -24,6 +34,9 @@ class WebScraper
24
34
  this.visited.add( this.baseURL );
25
35
  await this.fetchPage( this.baseURL );
26
36
  this.createJSONLFile();
37
+ this.saveNumberedTextFiles();
38
+ this.createCSVFile();
39
+ console.log( "Scraping completed." );
27
40
  }
28
41
 
29
42
  async fetchPage ( url )
@@ -104,7 +117,7 @@ class WebScraper
104
117
  {
105
118
  urlPath = "/index";
106
119
  }
107
- const filePath = path.join( __dirname, this.folderPath, urlPath );
120
+ const filePath = path.join( __dirname, this.scrapResultPath, urlPath );
108
121
  const dir = path.dirname( filePath );
109
122
 
110
123
  // Create metadata object
@@ -142,6 +155,34 @@ class WebScraper
142
155
  console.log( `Created JSONL file at: ${this.jsonlPath}` );
143
156
  }
144
157
 
158
/**
 * Writes the text of every processed page into a single-column CSV file.
 *
 * The file starts with a `text` header row. Each following row is the
 * `text` of one entry of `this.processedContent`, wrapped in double quotes
 * with embedded double quotes doubled. The target is `this.csvPath`
 * resolved against `__dirname`.
 *
 * The write is synchronous so the file is completely on disk before the
 * confirmation is logged, and so I/O failures are thrown to the caller
 * instead of surfacing as an unhandled stream `error` event.
 *
 * @throws {Error} If the directory or the file cannot be created or written.
 */
createCSVFile ()
{
    const csvFilePath = path.join( __dirname, this.csvPath );

    // A custom csvPath may point at a directory nothing else has created yet.
    fs.mkdirSync( path.dirname( csvFilePath ), { recursive: true });

    const rows = [ "text\n" ];
    for ( const content of this.processedContent )
    {
        // CSV escapes a double quote inside a quoted field by doubling it.
        const escapedText = content.text.replace( /"/g, "\"\"" );
        rows.push( `"${escapedText}"\n` );
    }

    fs.writeFileSync( csvFilePath, rows.join( "" ), "utf-8" );
    console.log( `Created CSV file at: ${this.csvPath}` );
}
174
+
175
/**
 * Writes the text of each entry of `this.processedContent` to its own file.
 *
 * Files are named `1.txt`, `2.txt`, ... following the entry's position, and
 * are placed in `this.textOutputPath` resolved against `__dirname`. The
 * directory is created here if it is missing, so the method does not depend
 * on another method having prepared it first.
 *
 * @throws {Error} If the directory or any file cannot be created or written.
 */
saveNumberedTextFiles ()
{
    // Resolve the target directory once instead of on every iteration.
    const textDir = path.join( __dirname, this.textOutputPath );
    fs.mkdirSync( textDir, { recursive: true });

    this.processedContent.forEach( ( content, index ) =>
    {
        const fileName = `${index + 1}.txt`;
        fs.writeFileSync( path.join( textDir, fileName ), content.text, "utf-8" );
        console.log( `Created numbered text file: ${fileName}` );
    });
}
185
+
145
186
  processContent ( content )
146
187
  {
147
188
  let processed = content;
@@ -194,14 +235,12 @@ class WebScraper
194
235
 
195
236
  createOutputDirectory ()
196
237
  {
197
- if ( fs.existsSync( path.join( __dirname, this.folderPath ) ) )
198
- {
199
- fs.rmSync( path.join( __dirname, this.folderPath ), { recursive: true, force: true });
200
- }
201
- if ( !fs.existsSync( path.join( __dirname, this.folderPath ) ) )
238
+ if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
202
239
  {
203
- fs.mkdirSync( path.join( __dirname, this.folderPath ), { recursive: true });
240
+ fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
204
241
  }
242
+ fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
243
+ fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
205
244
  }
206
245
  }
207
246