clean-web-scraper 2.0.5 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -11,7 +11,7 @@ A powerful Node.js web scraper that extracts clean, readable content from websit
11
11
  - 🚫 Excludes unwanted paths from scraping
12
12
  - 🔄 Handles relative and absolute URLs like a pro
13
13
  - 🎯 No duplicate page visits
14
- - 📊 Generates JSONL output file for ML training
14
+ - 📊 Generates JSONL and raw text output files for ML training
15
15
  - 📊 AI-friendly clean text output (perfect for LLM fine-tuning!)
16
16
 
17
17
  ## 🛠️ Prerequisites
@@ -43,11 +43,12 @@ npm install
43
43
  const WebScraper = require('clean-web-scraper');
44
44
 
45
45
  const scraper = new WebScraper({
46
- baseURL: 'https://example.com', // Required: The website to scrape
47
- folderPath: './output', // Required: Where to save the content
48
- excludeList: ['/admin', '/private'], // Optional: Paths to exclude
49
- exactExcludeList: ['/specific-page'],// Optional: Exact URLs to exclude
50
- jsonlPath: 'output.jsonl' // Optional: Custom JSONL output path
46
+ baseURL: 'https://example.com', // Required: The website to scrape
47
+ scrapResultPath: './output', // Required: Where to save the content
48
+ excludeList: ['/admin', '/private'], // Optional: Paths to exclude
49
+ exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
50
+ jsonlPath: 'output.jsonl', // Optional: Custom JSONL output path
51
+ textOutputPath: './dataset/texts' // Optional: Custom text output path
51
52
  });
52
53
 
53
54
  scraper.start();
package/example-usage.js CHANGED
@@ -1,7 +1,8 @@
1
1
  const WebScraper = require( "./src/WebScraper" );
2
2
 
3
+ // Configuration
3
4
  const baseURL = "https://decolonizepalestine.com";
4
- const folderPath = "./dataset";
5
+ const scrapResultPath = "./dataset";
5
6
  const excludeList = [
6
7
  "https://decolonizepalestine.com/cdn-cgi",
7
8
  "https://decolonizepalestine.com/introduction-to-palestine",
@@ -14,12 +15,13 @@ const exactExcludeList = [
14
15
  "https://decolonizepalestine.com/"
15
16
  ]
16
17
 
17
-
18
+ // Initialize scraper with all available options
18
19
  const scraper = new WebScraper({
19
20
  baseURL,
20
- folderPath,
21
+ scrapResultPath,
21
22
  excludeList,
22
23
  exactExcludeList,
23
- jsonlPath: "./dataset/final.jsonl"
24
+ jsonlPath: "./dataset/train.jsonl",
25
+ textOutputPath: "./dataset/texts"
24
26
  });
25
27
  scraper.start();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "2.0.5",
3
+ "version": "2.1.0",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -7,11 +7,19 @@ const path = require( "path" );
7
7
 
8
8
  class WebScraper
9
9
  {
10
- constructor ({ baseURL, folderPath, excludeList, exactExcludeList, jsonlPath })
10
+ constructor ({
11
+ baseURL,
12
+ excludeList,
13
+ exactExcludeList,
14
+ scrapResultPath = "./dataset",
15
+ jsonlPath,
16
+ textOutputPath
17
+ })
11
18
  {
12
19
  this.baseURL = baseURL;
13
- this.jsonlPath = jsonlPath || "output.jsonl";
14
- this.folderPath = path.join( folderPath, baseURL.replace( /^(https?:\/\/)?(www\.)?/, "" ).replace( /\/$/, "" ) );
20
+ this.scrapResultPath = path.join( scrapResultPath, baseURL.replace( /^(https?:\/\/)?(www\.)?/, "" ).replace( /\/$/, "" ) );
21
+ this.jsonlPath = jsonlPath || path.join( this.scrapResultPath, "train.jsonl" );
22
+ this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
15
23
  this.visited = new Set();
16
24
  this.excludeList = new Set( excludeList );
17
25
  this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
@@ -24,6 +32,8 @@ class WebScraper
24
32
  this.visited.add( this.baseURL );
25
33
  await this.fetchPage( this.baseURL );
26
34
  this.createJSONLFile();
35
+ this.saveNumberedTextFiles();
36
+ console.log( "Scraping completed." );
27
37
  }
28
38
 
29
39
  async fetchPage ( url )
@@ -104,7 +114,7 @@ class WebScraper
104
114
  {
105
115
  urlPath = "/index";
106
116
  }
107
- const filePath = path.join( __dirname, this.folderPath, urlPath );
117
+ const filePath = path.join( __dirname, this.scrapResultPath, urlPath );
108
118
  const dir = path.dirname( filePath );
109
119
 
110
120
  // Create metadata object
@@ -142,6 +152,17 @@ class WebScraper
142
152
  console.log( `Created JSONL file at: ${this.jsonlPath}` );
143
153
  }
144
154
 
155
+ saveNumberedTextFiles ()
156
+ {
157
+ this.processedContent.forEach( ( content, index ) =>
158
+ {
159
+ const fileName = `${index + 1}.txt`;
160
+ const filePath = path.join( __dirname, this.textOutputPath, fileName );
161
+ fs.writeFileSync( filePath, content.text, "utf-8" );
162
+ console.log( `Created numbered text file: ${fileName}` );
163
+ });
164
+ }
165
+
145
166
  processContent ( content )
146
167
  {
147
168
  let processed = content;
@@ -194,14 +215,12 @@ class WebScraper
194
215
 
195
216
  createOutputDirectory ()
196
217
  {
197
- if ( fs.existsSync( path.join( __dirname, this.folderPath ) ) )
198
- {
199
- fs.rmSync( path.join( __dirname, this.folderPath ), { recursive: true, force: true });
200
- }
201
- if ( !fs.existsSync( path.join( __dirname, this.folderPath ) ) )
218
+ if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
202
219
  {
203
- fs.mkdirSync( path.join( __dirname, this.folderPath ), { recursive: true });
220
+ fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
204
221
  }
222
+ fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
223
+ fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
205
224
  }
206
225
  }
207
226