clean-web-scraper 3.2.1 → 3.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -11,10 +11,10 @@ A powerful Node.js web scraper that extracts clean, readable content from websit
  - 🚫 Excludes unwanted paths from scraping
  - 🔄 Handles relative and absolute URLs like a pro
  - 🎯 No duplicate page visits
- - 📊 Generates JSONL output file for ML training
- - 📊 AI-friendly clean text and csv output (perfect for LLM fine-tuning!)
+ - 🤖 AI-friendly output formats (JSONL, CSV, clean text)
  - 📊 Rich metadata extraction
  - 📁 Combine results from multiple scrapers into a unified dataset
+ - 🎯 Turn any website into an AI training dataset

  ## 🛠️ Prerequisites

@@ -58,15 +58,37 @@ const scraper = new WebScraper({
    metadataFields: ['title', 'description'] // Optional: Specify metadata fields to include
  });
  await scraper.start();
-
- // Combine results from multiple scrapers
- await WebScraper.combineResults('./combined-dataset', [scraper1, scraper2]);
  ```

  ```bash
  node example-usage.js
  ```

+ ## 💻 Advanced Usage: Multi-Site Scraping
+
+ ```js
+ const WebScraper = require('clean-web-scraper');
+
+ // Scrape documentation website
+ const docsScraper = new WebScraper({
+   baseURL: 'https://docs.example.com',
+   scrapResultPath: './datasets/docs'
+ });
+
+ // Scrape blog website
+ const blogScraper = new WebScraper({
+   baseURL: 'https://blog.example.com',
+   scrapResultPath: './datasets/blog'
+ });
+
+ // Start scraping both sites
+ await docsScraper.start();
+ await blogScraper.start();
+
+ // Combine all scraped content into a single dataset
+ await WebScraper.combineResults('./combined', [docsScraper, blogScraper]);
+ ```
+
  ## 📤 Output

  Your AI-ready content is saved in a clean, structured format:
@@ -96,11 +118,84 @@ example.com/
  ├── train_with_metadata.jsonl # When includeMetadata is true
  ├── train.csv # Clean text in CSV format
  └── train_with_metadata.csv # When includeMetadata is true
+
+ combined/
+ ├── texts/ # Combined numbered text files
+ │   ├── 1.txt
+ │   ├── 2.txt
+ │   └── n.txt
+ ├── texts_with_metadata/ # Combined metadata text files
+ │   ├── 1.txt
+ │   ├── 2.txt
+ │   └── n.txt
+ ├── combined.jsonl # Combined JSONL content
+ ├── combined_with_metadata.jsonl
+ ├── combined.csv # Combined CSV content
+ └── combined_with_metadata.csv
+ ```
+
+ ## 📄 Output File Formats
+
+ ### 📝 Text Files (*.txt)
+
+ The actual article content starts here. This is the clean, processed text of the article that was extracted from the webpage.
+
+ ### 📑 Text Files with Metadata (texts_with_metadata/*.txt)
+
+ title: My Awesome Page
+ description: This is a great article about coding
+ author: John Doe
+ language: en
+ dateScraped: 2024-01-20T10:30:00Z
+
+ \-\-\-
+
+ The actual article content starts here. This is the clean, processed text of the article that was extracted from the webpage.
+
+ ### 📊 JSONL Files (train.jsonl)
+
+ ```json
+ {"text": "Clean article content here"}
+ {"text": "Another article content here"}
+ ```
+
+ ### 📈 JSONL with Metadata (train_with_metadata.jsonl)
+
+ ```json
+ {"text": "Article content", "metadata": {"title": "Page Title", "author": "John Doe"}}
+ {"text": "Another article", "metadata": {"title": "Second Page", "author": "Jane Smith"}}
+ ```
+
+ ### 🗃️ JSON Files In Website Output (*.json)
+
+ ```json
+ {
+   "url": "https://example.com/page",
+   "title": "Page Title",
+   "description": "Page description",
+   "dateScraped": "2024-01-20T10:30:00Z"
+ }
+ ```
+
+ ### 📋 CSV Files (train.csv)
+
+ ```csv
+ text
+ "Clean article content here"
+ "Another article content here"
+ ```
+
+ ### 📊 CSV with Metadata (train_with_metadata.csv)
+
+ ```csv
+ text,title,author,description
+ "Article content","Page Title","John Doe","Page description"
+ "Another article","Second Page","Jane Smith","Another description"
  ```

  ## 🤖 AI/LLM Training Ready

- The output is specifically formatted for AI training purposes:
+ The output is specifically formatted for AI training and fine-tuning purposes:

  - Clean, processed text without HTML markup
  - Multiple formats (JSONL, CSV, text files)
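
The JSONL files documented above are newline-delimited JSON, so each line parses independently. As a quick illustration (not part of the package; the file path is hypothetical), loading `train.jsonl` for a fine-tuning pipeline looks like:

```js
const fs = require('fs');

// Each line of train.jsonl is a standalone JSON object: {"text": "..."}.
// The path below is hypothetical; point it at your scraper's output folder.
const records = fs.readFileSync('./example.com/train.jsonl', 'utf-8')
	.split('\n')
	.filter(line => line.trim())
	.map(line => JSON.parse(line));

console.log(`${records.length} samples; first text:`, records[0].text);
```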
package/example-usage.js CHANGED
@@ -68,5 +68,12 @@ void async function main ()

  // 4
  // https://electronicintifada.net/
+
+ // 5
+ // https://www.palestineremembered.com/ZionistFAQ.html
+
+ // 6 https://the-palestinian-side.vercel.app/
+
+ // 7 https://stand-with-palestine.org/blogs
  }()

package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "clean-web-scraper",
-   "version": "3.2.1",
+   "version": "3.2.3",
    "main": "main.js",
    "scripts": {
      "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -36,11 +36,11 @@ class WebScraper
      this.excludeList = new Set( excludeList );
      this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
      this.allProcessedContent = [];
-     this.createOutputDirectory();
    }

    async start ()
    {
+     this.createOutputDirectory();
      await this.fetchPage( this.startURL, 0 );
      this.createJSONLFile();
      this.saveNumberedTextFiles();
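
This hunk moves `createOutputDirectory()` out of the constructor and into `start()`: constructing a `WebScraper` no longer touches the filesystem, and output directories appear only once scraping begins. A minimal sketch of the resulting behavior (site and path are hypothetical), following the package's own example style:

```js
const WebScraper = require('clean-web-scraper');

void async function main ()
{
	// As of 3.2.3, constructing a scraper is side-effect free; in 3.2.1 this
	// line already created the output directory tree.
	const scraper = new WebScraper({
		baseURL: 'https://example.com',       // hypothetical site
		scrapResultPath: './datasets/example' // hypothetical output path
	});

	// The output directories are created here, when scraping actually starts.
	await scraper.start();
}()
```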
@@ -384,41 +384,71 @@ class WebScraper
      const fullOutputPath = path.join( __dirname, outputPath );

      // Create output directories
+     WebScraper.createCombinedDirectories( fullOutputPath );
+
+     // Combine files by type
+     WebScraper.combineJSONLFiles( fullOutputPath, websites );
+     WebScraper.combineCSVFiles( fullOutputPath, websites );
+     WebScraper.combineTextFiles( fullOutputPath, websites );
+   }
+
+   static createCombinedDirectories ( fullOutputPath )
+   {
+     if ( fs.existsSync( fullOutputPath ) )
+     {
+       fs.rmSync( fullOutputPath, { recursive: true, force: true });
+     }
      fs.mkdirSync( fullOutputPath, { recursive: true });
      fs.mkdirSync( path.join( fullOutputPath, "texts" ), { recursive: true });
      fs.mkdirSync( path.join( fullOutputPath, "texts_with_metadata" ), { recursive: true });
+   }

-     // Combine regular JSONL files
+   static combineJSONLFiles ( fullOutputPath, websites )
+   {
      const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) )
-       .on( "error", ( err ) => { return console.error( "Error combining JSONL:", err ) });
+       .on( "error", ( err ) => { return console.error( "Error combining JSONL:", err ) });
      const jsonlMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.jsonl" ) )
-       .on( "error", ( err ) => { return console.error( "Error combining metadata JSONL:", err ) });
+       .on( "error", ( err ) => { return console.error( "Error combining metadata JSONL:", err ) });
+
+     for ( const website of websites )
+     {
+       const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
+       jsonlOutput.write( jsonlContent );
+
+       if ( website.includeMetadata )
+       {
+         const jsonlMetaContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPathWithMeta ), "utf-8" );
+         jsonlMetaOutput.write( jsonlMetaContent );
+       }
+     }
+
+     jsonlOutput.end();
+     jsonlMetaOutput.end();
+   }
+
+   static combineCSVFiles ( fullOutputPath, websites )
+   {
      const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
      const csvMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.csv" ) );

      csvOutput.write( "text\n" );
      const metadataFields = websites.find( w => { return w.includeMetadata })?.metadataFields || new Set();
+
      if ( metadataFields.size > 0 )
      {
        csvMetaOutput.write( `text,${Array.from( metadataFields ).join( "," )}\n` );
      }
+
      for ( const website of websites )
      {
-       const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
-       jsonlOutput.write( jsonlContent );
-
        const csvContent = fs.readFileSync( path.join( __dirname, website.csvOutputPath ), "utf-8" )
          .split( "\n" )
          .slice( 1 )
          .filter( line => { return line.trim() });
        csvOutput.write( `${csvContent.join( "\n" )}\n` );

-       // Combine metadata files if they exist
        if ( website.includeMetadata )
        {
-         const jsonlMetaContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPathWithMeta ), "utf-8" );
-         jsonlMetaOutput.write( jsonlMetaContent );
-
          const csvMetaContent = fs.readFileSync( path.join( __dirname, website.csvOutputPathWithMeta ), "utf-8" )
            .split( "\n" )
            .slice( 1 )
@@ -427,18 +457,18 @@ class WebScraper
        }
      }

-     // Close all streams
-     jsonlOutput.end();
-     jsonlMetaOutput.end();
      csvOutput.end();
      csvMetaOutput.end();
+   }

-     // Combine text files (both regular and metadata versions)
+   static combineTextFiles ( fullOutputPath, websites )
+   {
      let textFileCounter = 1;
+
      for ( const website of websites )
      {
-       // Regular text files
        const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
+
        for ( const file of textFiles )
        {
          const content = fs.readFileSync( path.join( __dirname, website.textOutputPath, file ), "utf-8" );
@@ -448,7 +478,6 @@ class WebScraper
            "utf-8"
          );

-         // Metadata text files if they exist
          if ( website.includeMetadata )
          {
            const metaContent = fs.readFileSync(
@@ -465,6 +494,7 @@ class WebScraper
        }
      }
    }
+
  }

  module.exports = WebScraper;
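
A behavioral note on the `WebScraper.js` refactor above: `createCombinedDirectories()` now deletes any pre-existing combined output directory (via `fs.rmSync` with `{ recursive: true, force: true }`) before recreating it, so `combineResults()` rebuilds its output from scratch rather than mixing in stale files. A minimal sketch of the consequence (scrapers and paths are hypothetical):

```js
const WebScraper = require('clean-web-scraper');

void async function main ()
{
	// Hypothetical scrapers; any configured WebScraper instances work here.
	const docsScraper = new WebScraper({ baseURL: 'https://docs.example.com', scrapResultPath: './datasets/docs' });
	const blogScraper = new WebScraper({ baseURL: 'https://blog.example.com', scrapResultPath: './datasets/blog' });

	await docsScraper.start();
	await blogScraper.start();

	// As of 3.2.3 the combined output directory is wiped and rebuilt on every
	// call, so re-running combineResults regenerates the dataset instead of
	// appending to leftovers from a previous run. Keep nothing else in it.
	await WebScraper.combineResults('./combined', [docsScraper, blogScraper]);
	await WebScraper.combineResults('./combined', [docsScraper, blogScraper]); // safe to re-run
}()
```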