clean-web-scraper 2.3.1 → 2.3.3

This diff shows the contents of publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.
package/README.md CHANGED
@@ -13,7 +13,8 @@ A powerful Node.js web scraper that extracts clean, readable content from websit
  - 🎯 No duplicate page visits
  - 📊 Generates JSONL output file for ML training
  - 📊 AI-friendly clean text and csv output (perfect for LLM fine-tuning!)
- - 📊 Rich metadata extraction including:
+ - 📊 Rich metadata extraction
+ - 📁 Combine results from multiple scrapers into a unified dataset

  ## 🛠️ Prerequisites

@@ -44,17 +45,21 @@ npm install
  const WebScraper = require('clean-web-scraper');

  const scraper = new WebScraper({
-   baseURL: 'https://example.com', // Required: The website to scrape
-   startURL: 'https://example.com/blog', // Optional: Custom starting URL
-   excludeList: ['/admin', '/private'], // Optional: Paths to exclude
-   exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
-   scrapResultPath: './example.com/website', // Required: Where to save the content
-   jsonlPath: './example.com/train.jsonl', // Optional: Custom JSONL output path
-   textOutputPath: "./example.com/texts", // Optional: Custom text output path
-   csvPath: "./example.com/train.csv" // Optional: Custom CSV output path
+   baseURL: 'https://example.com/news', // Required: The website base URL to scrape
+   startURL: 'https://example.com/blog', // Optional: Custom starting URL
+   excludeList: ['/admin', '/private'], // Optional: Paths to exclude
+   exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
+   scrapResultPath: './example.com/website', // Required: Where to save the content
+   jsonlOutputPath: './example.com/train.jsonl', // Optional: Custom JSONL output path
+   textOutputPath: "./example.com/texts", // Optional: Custom text output path
+   csvOutputPath: "./example.com/train.csv", // Optional: Custom CSV output path
+   maxDepth: 3, // Optional: Maximum depth for recursive crawling
+   includeTitles: true // Optional: Include page titles in outputs
  });
-
  scraper.start();
+
+ // Combine results from multiple scrapers
+ WebScraper.combineResults('./combined-dataset', [scraper1, scraper2]);
  ```

  ```bash
@@ -92,7 +97,7 @@ example.com/
  The output is specifically formatted for AI training purposes:

  - Clean, processed text without HTML markup
- - Consistent formatting across all documents
+ - Multiple formats (JSONL, CSV, text files)
  - Structured content perfect for fine-tuning LLMs
  - Ready to use in your ML pipelines

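The JSONL output mentioned above is written one JSON object per line (see `createJSONLFile` in `src/WebScraper.js` below). A minimal sketch of consuming it follows; the `text` and `metadata` field names are an inference from how `createCSVFile` reads `this.allProcessedContent` in this diff, not a documented schema:

```js
// Sketch: read the generated train.jsonl line by line.
// Field names ("text", "metadata") are assumptions inferred from the
// WebScraper source shown in this diff, not a documented contract.
const fs = require("fs");

const lines = fs.readFileSync("./example.com/train.jsonl", "utf-8")
  .split("\n")
  .filter((line) => line.trim());

for (const line of lines) {
  const { text, metadata } = JSON.parse(line);
  console.log(`${metadata?.title ?? "(untitled)"}: ${text.slice(0, 60)}...`);
}
```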
package/example-usage.js CHANGED
@@ -9,19 +9,22 @@ async function khameneiIrFreePalestineTag ()
  const scraper = new WebScraper({
    baseURL: "https://english.khamenei.ir/news",
    startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
+   maxDepth: 1,
    excludeList: [
    ],
    exactExcludeList: [
+     "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#"
    ],
    scrapResultPath: "./dataset/khamenei-ir-free-palestine-tag/website",
-   jsonlPath: "./dataset/khamenei-ir-free-palestine-tag/train.jsonl",
+   jsonlOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.jsonl",
    textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
-   csvPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv"
+   csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
+   includeTitles: true
  });
- await scraper.start();
+ // await scraper.start();
+ return scraper;
  }

- // decolonizepalestine
  async function decolonizepalestine ()
  {
    // 2
@@ -40,18 +43,22 @@ async function decolonizepalestine ()
      "https://decolonizepalestine.com/"
    ],
    scrapResultPath: "./dataset/decolonizepalestine/website",
-   jsonlPath: "./dataset/decolonizepalestine/train.jsonl",
+   jsonlOutputPath: "./dataset/decolonizepalestine/train.jsonl",
    textOutputPath: "./dataset/decolonizepalestine/texts",
-   csvPath: "./dataset/decolonizepalestine/train.csv"
+   csvOutputPath: "./dataset/decolonizepalestine/train.csv"
  });
- await scraper.start();
+ // await scraper.start();
+ return scraper;
  }

  void async function main ()
  {
-   // await khameneiIrFreePalestineTag();
-   await decolonizepalestine();
-
+   const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
+   const decolonizepalestineScraper = await decolonizepalestine();
+   WebScraper.combineResults( "./dataset/combined", [
+     khameneiIrFreePalestineTagScraper,
+     decolonizepalestineScraper
+   ] );

    // 3
    // https://bdsmovement.net
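A note on the revised example: both `scraper.start()` calls are commented out, and `combineResults` (see `src/WebScraper.js` below) merges output files already on disk, so the combination step assumes the per-site datasets were generated by an earlier run. A sketch of the flow with the scrapers actually executed first:

```js
// Sketch: generate both datasets, then merge them. Assumes the two
// factory functions from example-usage.js above, each returning an
// unstarted WebScraper instance.
void async function main () {
  const scraperA = await khameneiIrFreePalestineTag();
  const scraperB = await decolonizepalestine();
  await scraperA.start(); // writes website/, train.jsonl, texts/, train.csv
  await scraperB.start();
  WebScraper.combineResults("./dataset/combined", [scraperA, scraperB]);
}();
```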
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "clean-web-scraper",
-   "version": "2.3.1",
+   "version": "2.3.3",
    "main": "main.js",
    "scripts": {
      "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -10,38 +10,46 @@ class WebScraper
    constructor ({
      baseURL,
      startURL,
+     maxDepth = Infinity,
      excludeList,
      exactExcludeList,
      scrapResultPath = "./dataset",
-     jsonlPath,
+     jsonlOutputPath,
      textOutputPath,
-     csvPath
+     csvOutputPath,
+     includeTitles = false
    })
    {
      this.baseURL = baseURL;
      this.startURL = startURL || baseURL;
+     this.maxDepth = maxDepth;
      this.scrapResultPath = scrapResultPath;
-     this.jsonlPath = jsonlPath || path.join( this.scrapResultPath, "train.jsonl" );
+     this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
      this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
-     this.csvPath = csvPath || path.join( this.scrapResultPath, "train.csv" );
+     this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
+     this.includeTitles = includeTitles;
      this.visited = new Set();
      this.excludeList = new Set( excludeList );
      this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
-     this.allProcessedContent = []; // Add this line
+     this.allProcessedContent = [];
      this.createOutputDirectory();
    }

    async start ()
    {
-     await this.fetchPage( this.startURL );
+     await this.fetchPage( this.startURL, 0 );
      this.createJSONLFile();
      this.saveNumberedTextFiles();
      this.createCSVFile();
      console.log( "Scraping completed." );
    }

-   async fetchPage ( url )
+   async fetchPage ( url, depth )
    {
+     if ( depth > this.maxDepth )
+     {
+       return;
+     }
      this.visited.add( url );
      try
      {
@@ -57,6 +65,7 @@ class WebScraper
        if ( article )
        {
          const metadata = this.metadataextractor( url, document, headers );
+         metadata.depth = depth;
          this.saveArticle( url, article.textContent, metadata );
        }
        else
@@ -70,7 +79,7 @@ class WebScraper
        {
          if ( !this.visited.has( link ) )
          {
-           await this.fetchPage( link );
+           await this.fetchPage( link, depth + 1 );
          }
        }
      }
@@ -122,23 +131,16 @@ class WebScraper
      const filePath = path.join( __dirname, this.scrapResultPath, urlPath );
      const dir = path.dirname( filePath );

-     // Create directory if it doesn't exist
      fs.mkdirSync( dir, { recursive: true });
-
-     // Save the text content
      fs.writeFileSync( `${filePath}.txt`, processedContent, "utf-8" );
-
-     // Save the JSON metadata
      fs.writeFileSync( `${filePath}.json`, JSON.stringify( metadata, null, 2 ), "utf-8" );
-
      console.log( `Saved: ${filePath}.txt` );
      console.log( `Saved: ${filePath}.json` );
    }

    createJSONLFile ()
    {
-     const writeStream = fs.createWriteStream( path.join( __dirname, this.jsonlPath ) );
-
+     const writeStream = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
      for ( const content of this.allProcessedContent )
      {
        const jsonLine = `${JSON.stringify( content )}\n`;
@@ -146,24 +148,27 @@ class WebScraper
      }

      writeStream.end();
-     console.log( `Created JSONL file at: ${this.jsonlPath}` );
+     console.log( `Created JSONL file at: ${this.jsonlOutputPath}` );
    }

    createCSVFile ()
    {
-     const writeStream = fs.createWriteStream( path.join( __dirname, this.csvPath ) );
-
+     const writeStream = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
      writeStream.write( "text\n" );
-
      for ( const content of this.allProcessedContent )
      {
-       const escapedText = content.text.replace( /"/g, "\"\"" );
+       let fullText = content.text;
+       if ( this.includeTitles && content.metadata.title )
+       {
+         fullText = `Title: ${content.metadata.title}\n\n${content.text}`;
+       }
+       const escapedText = fullText.replace( /"/g, "\"\"" );
        const csvLine = `"${escapedText}"\n`;
        writeStream.write( csvLine );
      }

      writeStream.end();
-     console.log( `Created CSV file at: ${this.csvPath}` );
+     console.log( `Created CSV file at: ${this.csvOutputPath}` );
    }

    saveNumberedTextFiles ()
@@ -172,7 +177,12 @@ class WebScraper
      {
        const fileName = `${index + 1}.txt`;
        const filePath = path.join( __dirname, this.textOutputPath, fileName );
-       fs.writeFileSync( filePath, content.text, "utf-8" );
+       let titlePrefix = "";
+       if ( this.includeTitles && content.metadata.title )
+       {
+         titlePrefix = `Title: ${content.metadata.title}\n\n`;
+       }
+       fs.writeFileSync( filePath, titlePrefix + content.text, "utf-8" );
        console.log( `Created numbered text file: ${fileName}` );
      });
    }
@@ -259,6 +269,55 @@ class WebScraper
      fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
      fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
    }
+
+   static combineResults ( outputPath, websites )
+   {
+     const fullOutputPath = path.join( __dirname, outputPath );
+
+     // Create output directories
+     fs.mkdirSync( fullOutputPath, { recursive: true });
+     fs.mkdirSync( path.join( fullOutputPath, "texts" ), { recursive: true });
+
+     // Combine JSONL files
+     const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) );
+     for ( const website of websites )
+     {
+       const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
+       jsonlOutput.write( jsonlContent );
+     }
+     jsonlOutput.end();
+
+     // Combine CSV files
+     const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
+     csvOutput.write( "text\n" );
+     for ( const website of websites )
+     {
+       const csvContent = fs.readFileSync( path.join( __dirname, website.csvOutputPath ), "utf-8" )
+         .split( "\n" )
+         .slice( 1 ) // Skip header
+         .filter( line => { return line.trim() });
+       csvOutput.write( `${csvContent.join( "\n" ) }\n` );
+     }
+     csvOutput.end();
+
+     // Combine text files
+     let textFileCounter = 1;
+     for ( const website of websites )
+     {
+       const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
+       for ( const file of textFiles )
+       {
+         const content = fs.readFileSync( path.join( __dirname, website.textOutputPath, file ), "utf-8" );
+         fs.writeFileSync(
+           path.join( fullOutputPath, "texts", `${textFileCounter}.txt` ),
+           content,
+           "utf-8"
+         );
+         textFileCounter++;
+       }
+     }
+   }
+
  }

  module.exports = WebScraper;
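One caveat in the CSV merge above: `createCSVFile` writes each article as a single double-quoted field, and a quoted CSV field may legally contain newlines, yet `combineResults` re-splits each file on `\n` and drops blank lines, which can garble records whose text contains blank lines. A more conservative merge, sketched below with the same variables as `combineResults`, copies everything after the header line verbatim:

```js
// Sketch: header-aware CSV concatenation that leaves quoted,
// multi-line records intact (variables as in combineResults above).
for ( const website of websites )
{
  const csvContent = fs.readFileSync( path.join( __dirname, website.csvOutputPath ), "utf-8" );
  // Copy the body after the first "text\n" header without re-splitting rows
  csvOutput.write( csvContent.slice( csvContent.indexOf( "\n" ) + 1 ) );
}
```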