clean-web-scraper 3.2.2 → 3.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -53,9 +53,6 @@ const scraper = new WebScraper({
   jsonlOutputPath: './example.com/train.jsonl', // Optional: Custom JSONL output path
   textOutputPath: "./example.com/texts", // Optional: Custom text output path
   csvOutputPath: "./example.com/train.csv", // Optional: Custom CSV output path
-  maxDepth: 3, // Optional: Maximum depth for recursive crawling
-  includeMetadata: false, // Optional: Include metadata in output files
-  metadataFields: ['title', 'description'] // Optional: Specify metadata fields to include
 });
 await scraper.start();
 ```
@@ -72,13 +69,19 @@ const WebScraper = require('clean-web-scraper');
 // Scrape documentation website
 const docsScraper = new WebScraper({
   baseURL: 'https://docs.example.com',
-  scrapResultPath: './datasets/docs'
+  scrapResultPath: './datasets/docs',
+  maxDepth: 3, // Optional: Maximum depth for recursive crawling
+  includeMetadata: true, // Optional: Include metadata in output files
+  metadataFields: ['title', 'description'] // Optional: Specify metadata fields to include
 });
 
 // Scrape blog website
 const blogScraper = new WebScraper({
   baseURL: 'https://blog.example.com',
-  scrapResultPath: './datasets/blog'
+  scrapResultPath: './datasets/blog',
+  maxDepth: 3, // Optional: Maximum depth for recursive crawling
+  includeMetadata: true, // Optional: Include metadata in output files
+  metadataFields: ['title', 'description'] // Optional: Specify metadata fields to include
 });
 
 // Start scraping both sites
@@ -86,7 +89,7 @@ await docsScraper.start();
 await blogScraper.start();
 
 // Combine all scraped content into a single dataset
-await WebScraper.combineResults('./combined-dataset', [docsScraper, blogScraper]);
+await WebScraper.combineResults('./combined', [docsScraper, blogScraper]);
 ```
 
 ## 📤 Output
@@ -118,6 +121,79 @@ example.com/
 ├── train_with_metadata.jsonl    # When includeMetadata is true
 ├── train.csv                    # Clean text in CSV format
 └── train_with_metadata.csv      # When includeMetadata is true
+
+combined/
+├── texts/                       # Combined numbered text files
+│   ├── 1.txt
+│   ├── 2.txt
+│   └── n.txt
+├── texts_with_metadata/         # Combined metadata text files
+│   ├── 1.txt
+│   ├── 2.txt
+│   └── n.txt
+├── combined.jsonl               # Combined JSONL content
+├── combined_with_metadata.jsonl
+├── combined.csv                 # Combined CSV content
+└── combined_with_metadata.csv
+```
+
+## 📄 Output File Formats
+
+### 📝 Text Files (*.txt)
+
+The actual article content starts here. This is the clean, processed text of the article that was extracted from the webpage.
+
+### 📑 Text Files with Metadata (texts_with_metadata/*.txt)
+
+title: My Awesome Page
+description: This is a great article about coding
+author: John Doe
+language: en
+dateScraped: 2024-01-20T10:30:00Z
+
+\-\-\-
+
+The actual article content starts here. This is the clean, processed text of the article that was extracted from the webpage.
+
+### 📊 JSONL Files (train.jsonl)
+
+```json
+{"text": "Clean article content here"}
+{"text": "Another article content here"}
+```
+
+### 📈 JSONL with Metadata (train_with_metadata.jsonl)
+
+```json
+{"text": "Article content", "metadata": {"title": "Page Title", "author": "John Doe"}}
+{"text": "Another article", "metadata": {"title": "Second Page", "author": "Jane Smith"}}
+```
+
+### 🗃️ JSON Files In Website Output (*.json)
+
+```json
+{
+  "url": "<https://example.com/page>",
+  "title": "Page Title",
+  "description": "Page description",
+  "dateScraped": "2024-01-20T10:30:00Z"
+}
+```
+
+### 📋 CSV Files (train.csv)
+
+```csv
+text
+"Clean article content here"
+"Another article content here"
+```
+
+### 📊 CSV with Metadata (train_with_metadata.csv)
+
+```csv
+text,title,author,description
+"Article content","Page Title","John Doe","Page description"
+"Another article","Second Page","Jane Smith","Another description"
 ```
 
 ## 🤖 AI/LLM Training Ready
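
The new "Output File Formats" section documents the JSONL outputs as one JSON object per line. For consumers of the combined dataset, here is a minimal sketch (not part of the package) that loads `combined/combined.jsonl` with Node built-ins only, assuming the `./combined` path from the README example above:

```js
const fs = require('fs');

// Read the combined JSONL produced by WebScraper.combineResults('./combined', ...).
// Each non-empty line is a standalone JSON object of the form { "text": "..." }.
const records = fs.readFileSync('./combined/combined.jsonl', 'utf-8')
	.split('\n')
	.filter(line => line.trim())
	.map(line => JSON.parse(line));

console.log(`${records.length} documents loaded`);
console.log(records[0]?.text);
```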
package/example-usage.js CHANGED
@@ -68,5 +68,12 @@ void async function main ()
 
 	// 4
 	// https://electronicintifada.net/
+
+	// 5
+	// https://www.palestineremembered.com/ZionistFAQ.html
+
+	// 6 https://the-palestinian-side.vercel.app/
+
+	// 7 https://stand-with-palestine.org/blogs
 }()
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "clean-web-scraper",
-  "version": "3.2.2",
+  "version": "3.2.4",
   "main": "main.js",
   "scripts": {
     "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -36,11 +36,11 @@ class WebScraper
 		this.excludeList = new Set( excludeList );
 		this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
 		this.allProcessedContent = [];
-		this.createOutputDirectory();
 	}
 
 	async start ()
 	{
+		this.createOutputDirectory();
 		await this.fetchPage( this.startURL, 0 );
 		this.createJSONLFile();
 		this.saveNumberedTextFiles();
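
This hunk moves `createOutputDirectory()` out of the constructor and into `start()`, so constructing a scraper no longer touches the filesystem. A minimal sketch of the observable difference (illustrative only; it reuses the constructor options from the README example):

```js
const WebScraper = require('clean-web-scraper');

void async function main ()
{
	const scraper = new WebScraper({
		baseURL: 'https://docs.example.com',
		scrapResultPath: './datasets/docs'
	});
	// 3.2.2: './datasets/docs' was already created at this point (constructor side effect).
	// 3.2.4: no output directories have been created yet.

	await scraper.start(); // output directories are created here, when scraping begins
}()
```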
@@ -384,41 +384,71 @@ class WebScraper
 		const fullOutputPath = path.join( __dirname, outputPath );
 
 		// Create output directories
+		WebScraper.createCombinedDirectories( fullOutputPath );
+
+		// Combine files by type
+		WebScraper.combineJSONLFiles( fullOutputPath, websites );
+		WebScraper.combineCSVFiles( fullOutputPath, websites );
+		WebScraper.combineTextFiles( fullOutputPath, websites );
+	}
+
+	static createCombinedDirectories ( fullOutputPath )
+	{
+		if ( fs.existsSync( fullOutputPath ) )
+		{
+			fs.rmSync( fullOutputPath, { recursive: true, force: true });
+		}
 		fs.mkdirSync( fullOutputPath, { recursive: true });
 		fs.mkdirSync( path.join( fullOutputPath, "texts" ), { recursive: true });
 		fs.mkdirSync( path.join( fullOutputPath, "texts_with_metadata" ), { recursive: true });
+	}
 
-		// Combine regular JSONL files
+	static combineJSONLFiles ( fullOutputPath, websites )
+	{
 		const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) )
-			.on( "error", ( err ) => { return console.error( "Error combining JSONL:", err ) });
+		.on( "error", ( err ) => { return console.error( "Error combining JSONL:", err ) });
 		const jsonlMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.jsonl" ) )
-			.on( "error", ( err ) => { return console.error( "Error combining metadata JSONL:", err ) });
+		.on( "error", ( err ) => { return console.error( "Error combining metadata JSONL:", err ) });
+
+		for ( const website of websites )
+		{
+			const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
+			jsonlOutput.write( jsonlContent );
+
+			if ( website.includeMetadata )
+			{
+				const jsonlMetaContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPathWithMeta ), "utf-8" );
+				jsonlMetaOutput.write( jsonlMetaContent );
+			}
+		}
+
+		jsonlOutput.end();
+		jsonlMetaOutput.end();
+	}
+
+	static combineCSVFiles ( fullOutputPath, websites )
+	{
 		const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
 		const csvMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.csv" ) );
 
 		csvOutput.write( "text\n" );
 		const metadataFields = websites.find( w => { return w.includeMetadata })?.metadataFields || new Set();
+
 		if ( metadataFields.size > 0 )
 		{
 			csvMetaOutput.write( `text,${Array.from( metadataFields ).join( "," )}\n` );
 		}
+
 		for ( const website of websites )
 		{
-			const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
-			jsonlOutput.write( jsonlContent );
-
 			const csvContent = fs.readFileSync( path.join( __dirname, website.csvOutputPath ), "utf-8" )
 				.split( "\n" )
 				.slice( 1 )
 				.filter( line => { return line.trim() });
 			csvOutput.write( `${csvContent.join( "\n" )}\n` );
 
-			// Combine metadata files if they exist
 			if ( website.includeMetadata )
 			{
-				const jsonlMetaContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPathWithMeta ), "utf-8" );
-				jsonlMetaOutput.write( jsonlMetaContent );
-
 				const csvMetaContent = fs.readFileSync( path.join( __dirname, website.csvOutputPathWithMeta ), "utf-8" )
 					.split( "\n" )
 					.slice( 1 )
@@ -427,18 +457,18 @@ class WebScraper
 			}
 		}
 
-		// Close all streams
-		jsonlOutput.end();
-		jsonlMetaOutput.end();
 		csvOutput.end();
 		csvMetaOutput.end();
+	}
 
-		// Combine text files (both regular and metadata versions)
+	static combineTextFiles ( fullOutputPath, websites )
+	{
 		let textFileCounter = 1;
+
 		for ( const website of websites )
 		{
-			// Regular text files
 			const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
+
 			for ( const file of textFiles )
 			{
 				const content = fs.readFileSync( path.join( __dirname, website.textOutputPath, file ), "utf-8" );
@@ -448,7 +478,6 @@ class WebScraper
 					"utf-8"
 				);
 
-				// Metadata text files if they exist
 				if ( website.includeMetadata )
 				{
 					const metaContent = fs.readFileSync(
@@ -465,6 +494,7 @@ class WebScraper
 			}
 		}
 	}
+
 }
 
 module.exports = WebScraper;
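
Taken together, the WebScraper.js hunks split the monolithic `combineResults` into four static helpers (`createCombinedDirectories`, `combineJSONLFiles`, `combineCSVFiles`, `combineTextFiles`), and `createCombinedDirectories` now removes an existing output directory with `fs.rmSync(..., { recursive: true, force: true })` before recreating it, so a rerun starts from a clean directory instead of mixing with stale files from a previous run. A minimal end-to-end sketch against 3.2.4 (option names taken from the README examples; the URLs are placeholders):

```js
const WebScraper = require('clean-web-scraper');

void async function main ()
{
	const docs = new WebScraper({ baseURL: 'https://docs.example.com', scrapResultPath: './datasets/docs' });
	const blog = new WebScraper({ baseURL: 'https://blog.example.com', scrapResultPath: './datasets/blog' });

	await docs.start();
	await blog.start();

	// Safe to rerun: in 3.2.4, ./combined is deleted and rebuilt on every call.
	await WebScraper.combineResults('./combined', [docs, blog]);
}()
```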