clean-web-scraper 3.9.0 → 3.9.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/main.js +97 -108
  2. package/package.json +1 -1
package/main.js CHANGED
@@ -3,11 +3,11 @@ const { JSDOM } = require( "jsdom" );
  const { Readability } = require( "@mozilla/readability" );
  const fs = require( "fs" );
  const path = require( "path" );
- const { connect } = require( "puppeteer-real-browser" )
+ const { connect } = require( "puppeteer-real-browser" );

  class WebScraper
  {
- constructor ( config )
+ constructor ( config = {})
  {
  // Base configuration
  this.baseURL = baseURL;
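Note on `constructor ( config = {})`: defaulting the parameter to an empty object lets the class be instantiated with no arguments instead of throwing when the constructor reads properties off `undefined`. A minimal standalone sketch of the failure mode (the class bodies here are illustrative, not the package's full constructor):

    class Before { constructor ( config ) { this.baseURL = config.baseURL; } }
    class After { constructor ( config = {}) { this.baseURL = config.baseURL; } }

    new After(); // ok: this.baseURL is simply undefined
    new Before(); // TypeError: Cannot read properties of undefined (reading 'baseURL')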
@@ -63,7 +63,7 @@ class WebScraper
  this.puppeteerProxy = puppeteerProxy; // http://127.0.0.1:2080
  this.puppeteerExecutablePath = puppeteerExecutablePath;
  this.puppeteerRealProxy = puppeteerRealProxy;
- this.configurePuppeteer( );
+ this.configurePuppeteer();
  }

  async start ()
@@ -72,7 +72,7 @@ class WebScraper
  {
  if ( this.usePuppeteer )
  {
- let { browser, page } = await connect( this.puppeteerRealOptions )
+ const { browser, page } = await connect( this.puppeteerRealOptions );
  this.puppeteerBrowser = browser;
  this.puppeteerPage = page;
  }
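For context, `connect()` from puppeteer-real-browser resolves to a `{ browser, page }` pair, which is why the destructuring above now uses `const`: neither binding is reassigned. A minimal usage sketch, assuming `puppeteerRealOptions` is an options object passed straight through to `connect` (the option shown is illustrative, not taken from this diff):

    const { connect } = require( "puppeteer-real-browser" );

    async function demo ()
    {
        const { browser, page } = await connect({ headless: false });
        await page.goto( "https://example.com" );
        console.log( await page.title() );
        await browser.close();
    }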
@@ -92,7 +92,7 @@ class WebScraper
  {
  if ( this.puppeteerBrowser )
  {
- await this.puppeteerBrowser.close(); // Close the browser after scraping
+ await this.puppeteerBrowser.close();
  }
  }
  }
@@ -102,8 +102,8 @@ class WebScraper
  const queue = [{ url: initialUrl, depth: initialDepth }];
  for ( let i = 0; i < queue.length; i++ )
  {
- console.log( `Processing URL: ${queue[i].url}` );
  let { url, depth } = queue[i];
+ console.log( `Processing URL: ${queue[i].url}` );
  if ( this.hasReachedMax( depth ) )
  {
  continue;
@@ -140,7 +140,6 @@ class WebScraper
  nbTopCandidates: 20
  });
  const article = reader.parse();
-
  if ( article )
  {
  if ( this.hasValidPageContent( article.textContent ) )
@@ -161,7 +160,6 @@ class WebScraper

  const links = this.extractLinks( data );
  const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
-
  for ( const link of unvisitedLinks )
  {
  if ( !this.hasReachedMax( depth ) )
@@ -182,7 +180,7 @@ class WebScraper
  {
  try
  {
- const response = await this.retryAxiosRequest( url )
+ const response = await this.retryAxiosRequest( url );
  const contentType = response?.headers["content-type"] || "";
  if ( !contentType?.startsWith( "text" ) )
  {
@@ -191,34 +189,30 @@ class WebScraper
  return null;
  }

- // Step 3: If Content-Type is HTML, read the full response
  let htmlContent = "";
- response.data.on( "data", ( chunk ) =>
+ response.data.on( "data", chunk =>
  {
  htmlContent += chunk.toString();
  });
-
- // Wait for the stream to finish
  await new Promise( ( resolve, reject ) =>
  {
  response.data.on( "end", resolve );
  response.data.on( "error", reject );
  });
-
  return htmlContent;
  }
  catch ( error )
  {
  console.error( `Error fetching content ${url}:`, error.message );
- if ( error.status = 403 && this.usePuppeteer )
+ if ( error.status === 403 && this.usePuppeteer )
  {
  try
  {
  let result;
- for ( let index = 0; index < 10; index++ )
+ for ( let i = 0; i < 10; i++ )
  {
  console.log( `Please solve the CAPTCHA on the opened browser window for ${url}` );
- result = await this.navigateToPage( url ) ;
+ result = await this.navigateToPage( url );
  if ( this.hasValidPageContent( result.htmlContent ) )
  {
  break
@@ -231,7 +225,6 @@ class WebScraper
  console.error( `Error solving CAPTCHA for ${url}:`, error.message, error );
  throw error;
  }
-
  }
  throw error;
  }
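The `error.status = 403` correction above is the one behavioral fix in this release: `=` assigns rather than compares, so the old condition parsed as `error.status = ( 403 && this.usePuppeteer )` and evaluated to `this.usePuppeteer`, sending every failed request into the CAPTCHA fallback whenever Puppeteer was enabled. A standalone sketch of the difference:

    const error = { status: 500 };
    if ( error.status = 403 ) { /* always entered: the assignment yields 403, which is truthy */ }

    error.status = 500;
    if ( error.status === 403 ) { /* entered only on a real HTTP 403 */ }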
@@ -344,11 +337,10 @@ class WebScraper
  createJSONLFile ()
  {
  const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
- let writeStreamMeta
-
- // Add error handlers
- writeStreamSimple.on( "error", ( err ) => { return console.error( "Error writing JSONL:", err ) });
+ writeStreamSimple.on( "error", err =>
+ { return console.error( "Error writing JSONL:", err ) });

+ let writeStreamMeta;
  if ( this.includeMetadata )
  {
  writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPathWithMeta ) );
@@ -394,16 +386,18 @@ class WebScraper

  for ( const content of this.allProcessedContent )
  {
- // Write simple version
  const escapedText = content.simple.text.replace( /"/g, "\"\"" );
  writeStreamSimple.write( `"${escapedText}"\n` );

- // Write metadata version if requested
  if ( this.includeMetadata )
  {
  const { metadata } = content.withMetadata;
- const metadataValues = Array.from( this.metadataFields )
- .map( field => { return metadata[field] ? `"${metadata[field].replace( /"/g, "\"\"" )}"` : "\"\"" });
+ const metadataValues = Array.from( this.metadataFields ).map( field =>
+ {
+ return metadata[field]
+ ? `"${metadata[field].replace( /"/g, "\"\"" )}"`
+ : "\"\""
+ });
  writeStreamMeta.write( `"${escapedText}",${metadataValues.join( "," )}\n` );
  }
  }
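The writer above escapes embedded double quotes by doubling them before wrapping each field in quotes, the standard CSV escaping rule. A minimal sketch of the rule applied in that loop:

    const escapeField = text => `"${text.replace( /"/g, "\"\"" )}"`;

    console.log( escapeField( 'She said "hi"' ) ); // prints: "She said ""hi"""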
@@ -422,10 +416,8 @@ class WebScraper

  saveNumberedTextFiles ()
  {
- // Create base text folder for simple content
  const baseTextPath = path.join( __dirname, this.textOutputPath );

- // Create metadata text folder if needed
  let metaTextPath = null;
  if ( this.includeMetadata )
  {
@@ -436,20 +428,15 @@ class WebScraper
  this.allProcessedContent.forEach( ( content, index ) =>
  {
  const fileName = `${index + 1}.txt`;
-
- // Always save simple version
  const simpleFilePath = path.join( baseTextPath, fileName );
  fs.writeFileSync( simpleFilePath, content.simple.text, "utf-8" );
  console.log( `Created simple text file: ${fileName}` );

- // Save metadata version if enabled
  if ( this.includeMetadata )
  {
  const metaFilePath = path.join( metaTextPath, fileName );
  let fileContent = "";
-
  const { metadata } = content.withMetadata;
- // Add metadata fields as headers
  for ( const field of this.metadataFields )
  {
  if ( metadata[field] )
@@ -459,7 +446,6 @@ class WebScraper
  }
  fileContent += "\n---\n\n";
  fileContent += content.withMetadata.text;
-
  fs.writeFileSync( metaFilePath, fileContent, "utf-8" );
  console.log( `Created metadata text file: ${fileName}` );
  }
@@ -469,33 +455,27 @@ class WebScraper
  processContent ( content )
  {
  let processed = content;
-
- // Remove "[You can read more about this here]" and similar patterns
+ // Remove unwanted fixed text
  processed = processed.replace( /\[You can read more about this here\]/g, "" ).trim();
-
- // Trim each line
- processed = processed.split( "\n" )
+ // Trim each line and remove extra newlines
+ processed = processed
+ .split( "\n" )
  .map( line => { return line.trim() })
- .join( "\n" );
-
- // Replace 3 or more newlines with a single newline
- processed = processed.replace( /\n{3,}/g, "\n\n" );
-
- // Add more processing rules as needed:
- // processed = processed.replace(/\[.*?\]/g, ''); // Removes all content within square brackets
- // processed = processed.replace(/\(.*?\)/g, ''); // Removes all content within parentheses
+ .join( "\n" )
+ .replace( /\n{3,}/g, "\n\n" );

- // Remove specified words from the end of content, handling multiple occurrences
+ // Remove specified words at the end (repeatedly)
  const wordsToTrim = ["Facebook", "Twitter", "Donate Now", "Instagram"];
  let changed = true;
-
  while ( changed )
  {
  changed = false;
- for ( let i = 0; i < wordsToTrim.length; i++ )
+ for ( const word of wordsToTrim )
  {
  const oldProcessed = processed;
- processed = processed.replace( new RegExp( `\\s*${wordsToTrim[i]}\\s*$`, "g" ), "" ).trim();
+ processed = processed
+ .replace( new RegExp( `\\s*${word}\\s*$`, "g" ), "" )
+ .trim();
  if ( oldProcessed !== processed )
  {
  changed = true;
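The rewritten loop keeps stripping any of the listed words off the end of the text until a full pass over `wordsToTrim` changes nothing, so stacked social-footer residue is peeled away layer by layer. A standalone sketch of the same fixed-point idea:

    const wordsToTrim = ["Facebook", "Twitter", "Donate Now", "Instagram"];
    let text = "Article body Donate Now Twitter Facebook";
    let changed = true;
    while ( changed )
    {
        changed = false;
        for ( const word of wordsToTrim )
        {
            const before = text;
            text = text.replace( new RegExp( `\\s*${word}\\s*$` ), "" ).trim();
            if ( before !== text ) changed = true;
        }
    }
    console.log( text ); // prints: Article body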
@@ -508,7 +488,6 @@ class WebScraper
  filterMetadata ( metadata )
  {
  if ( !this.includeMetadata ) return {};
-
  const filteredMetadata = {};
  for ( const field of this.metadataFields )
  {
@@ -528,10 +507,13 @@ class WebScraper
  description: document.querySelector( "meta[name=\"description\"]" )?.content,
  keywords: document.querySelector( "meta[name=\"keywords\"]" )?.content,
  author: document.querySelector( "meta[name=\"author\"]" )?.content,
- language: document.documentElement.lang || document.querySelector( "html" )?.getAttribute( "lang" ),
+ language:
+ document.documentElement.lang ||
+ document.querySelector( "html" )?.getAttribute( "lang" ),
  canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
  ogTitle: document.querySelector( "meta[property=\"og:title\"]" )?.content,
- ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
+ ogDescription: document.querySelector( "meta[property=\"og:description\"]" )
+ ?.content,
  ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
  ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
  dateScrapedDate: new Date().toISOString()
@@ -589,7 +571,6 @@ class WebScraper
  {
  this.puppeteerOptions.args.push( `--proxy-server=${this.puppeteerProxy}` );
  }
-
  if ( this.puppeteerExecutablePath )
  {
  this.puppeteerOptions.executablePath = this.puppeteerExecutablePath;
@@ -663,10 +644,7 @@ class WebScraper

  hasValidPageContent ( content )
  {
- // Remove whitespace and newlines for checking
  const cleanContent = content.replace( /\s+/g, " " ).trim().toLowerCase();
-
- // List of phrases that indicate invalid content
  const invalidPhrases = [
  "verifying that you are not a robot",
  "verifying you are human. this may take a few seconds.",
@@ -690,37 +668,35 @@ class WebScraper

  createOutputDirectory ()
  {
- if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
- {
- fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
- }
- if ( fs.existsSync( path.join( __dirname, this.textOutputPath ) ) )
- {
- fs.rmSync( path.join( __dirname, this.textOutputPath ), { recursive: true, force: true });
- }
- if ( fs.existsSync( path.join( __dirname, this.textOutputPathWithMeta ) ) )
- {
- fs.rmSync( path.join( __dirname, this.textOutputPathWithMeta ), { recursive: true, force: true });
- }
- if ( fs.existsSync( path.join( __dirname, this.csvOutputPath ) ) )
- {
- fs.rmSync( path.join( __dirname, this.csvOutputPath ), { recursive: true, force: true });
- }
- if ( fs.existsSync( path.join( __dirname, this.csvOutputPathWithMeta ) ) )
- {
- fs.rmSync( path.join( __dirname, this.csvOutputPathWithMeta ), { recursive: true, force: true });
- }
- if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPath ) ) )
+ const paths = [
+ path.join( __dirname, this.scrapResultPath ),
+ path.join( __dirname, this.textOutputPath ),
+ path.join( __dirname, this.textOutputPathWithMeta ),
+ path.join( __dirname, this.csvOutputPath ),
+ path.join( __dirname, this.csvOutputPathWithMeta ),
+ path.join( __dirname, this.jsonlOutputPath ),
+ path.join( __dirname, this.jsonlOutputPathWithMeta )
+ ];
+ for ( const p of paths )
  {
- fs.rmSync( path.join( __dirname, this.jsonlOutputPath ), { recursive: true, force: true });
+ if ( fs.existsSync( p ) )
+ {
+ fs.rmSync( p, { recursive: true, force: true });
+ }
  }
- if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPathWithMeta ) ) )
+ // Recreate directories needed for output
+ this.ensureDirectory( path.join( __dirname, this.scrapResultPath ) );
+ this.ensureDirectory( path.join( __dirname, this.textOutputPath ) );
+ this.ensureDirectory( path.join( __dirname, this.textOutputPathWithMeta ) );
+ }
+
+ // Helper method to ensure a directory exists
+ ensureDirectory ( dirPath )
+ {
+ if ( !fs.existsSync( dirPath ) )
  {
- fs.rmSync( path.join( __dirname, this.jsonlOutputPathWithMeta ), { recursive: true, force: true });
+ fs.mkdirSync( dirPath, { recursive: true });
  }
- fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
- fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
- fs.mkdirSync( path.join( __dirname, this.textOutputPathWithMeta ), { recursive: true });
  }

  static async sleep ( ms )
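The refactor above replaces seven near-identical existsSync/rmSync blocks with a loop over a path list plus an `ensureDirectory` helper; behavior is unchanged. One observation: on Node 14.14+, the `existsSync` guard before `fs.rmSync` is technically redundant, since `force: true` already makes a missing path a no-op. A minimal sketch of the same wipe-and-recreate pattern (paths are illustrative):

    const fs = require( "fs" );
    const path = require( "path" );

    for ( const p of [path.join( __dirname, "out" ), path.join( __dirname, "out_meta" )] )
    {
        fs.rmSync( p, { recursive: true, force: true }); // no-op if p does not exist
        fs.mkdirSync( p, { recursive: true });
    }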
@@ -732,11 +708,7 @@ class WebScraper
  {
  await WebScraper.sleep( 1000 );
  const fullOutputPath = path.join( __dirname, outputPath );
-
- // Create output directories
  WebScraper.createCombinedDirectories( fullOutputPath );
-
- // Combine files by type
  WebScraper.combineJSONLFiles( fullOutputPath, websites );
  WebScraper.combineCSVFiles( fullOutputPath, websites );
  WebScraper.combineTextFiles( fullOutputPath, websites );
@@ -750,34 +722,44 @@ class WebScraper
  }
  fs.mkdirSync( fullOutputPath, { recursive: true });
  fs.mkdirSync( path.join( fullOutputPath, "texts" ), { recursive: true });
- fs.mkdirSync( path.join( fullOutputPath, "texts_with_metadata" ), { recursive: true });
+ fs.mkdirSync( path.join( fullOutputPath, "texts_with_metadata" ), {
+ recursive: true
+ });
  }

  static combineJSONLFiles ( fullOutputPath, websites )
  {
- const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) )
- .on( "error", ( err ) => { return console.error( "Error combining JSONL:", err ) });
- const jsonlMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.jsonl" ) )
- .on( "error", ( err ) => { return console.error( "Error combining metadata JSONL:", err ) });
+ const jsonlOutput = fs
+ .createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) )
+ .on( "error", err =>
+ { return console.error( "Error combining JSONL:", err ) });
+ const jsonlMetaOutput = fs
+ .createWriteStream( path.join( fullOutputPath, "combined_with_metadata.jsonl" ) )
+ .on( "error", err =>
+ { return console.error( "Error combining metadata JSONL:", err ) });

  for ( const website of websites )
  {
- const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
+ const jsonlContent = fs.readFileSync(
+ path.join( __dirname, website.jsonlOutputPath ),
+ "utf-8"
+ );
  if ( jsonlContent )
  {
  jsonlOutput.write( jsonlContent );
  }
-
  if ( website.includeMetadata )
  {
- const jsonlMetaContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPathWithMeta ), "utf-8" );
+ const jsonlMetaContent = fs.readFileSync(
+ path.join( __dirname, website.jsonlOutputPathWithMeta ),
+ "utf-8"
+ );
  if ( jsonlMetaContent )
  {
  jsonlMetaOutput.write( jsonlMetaContent );
  }
  }
  }
-
  jsonlOutput.end();
  jsonlMetaOutput.end();
  }
@@ -788,7 +770,8 @@ class WebScraper
  const csvMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.csv" ) );

  csvOutput.write( "text\n" );
- const metadataFields = websites.find( w => { return w.includeMetadata })?.metadataFields || new Set();
+ const metadataFields =
+ websites.find( w => { return w.includeMetadata })?.metadataFields || new Set();

  if ( metadataFields.size > 0 )
  {
@@ -805,10 +788,13 @@ class WebScraper
  {
  csvOutput.write( `${csvContent.join( "\n" )}\n` );
  }
-
  if ( website.includeMetadata )
  {
- const csvMetaContent = fs.readFileSync( path.join( __dirname, website.csvOutputPathWithMeta ), "utf-8" )
+ const csvMetaContent = fs
+ .readFileSync(
+ path.join( __dirname, website.csvOutputPathWithMeta ),
+ "utf-8"
+ )
  .split( "\n" )
  .slice( 1 )
  .filter( line => { return line.trim() });
@@ -818,7 +804,6 @@ class WebScraper
  }
  }
  }
-
  csvOutput.end();
  csvMetaOutput.end();
  }
@@ -826,20 +811,20 @@ class WebScraper
  static combineTextFiles ( fullOutputPath, websites )
  {
  let textFileCounter = 1;
-
  for ( const website of websites )
  {
  const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
-
  for ( const file of textFiles )
  {
- const content = fs.readFileSync( path.join( __dirname, website.textOutputPath, file ), "utf-8" );
+ const content = fs.readFileSync(
+ path.join( __dirname, website.textOutputPath, file ),
+ "utf-8"
+ );
  fs.writeFileSync(
  path.join( fullOutputPath, "texts", `${textFileCounter}.txt` ),
  content,
  "utf-8"
  );
-
  if ( website.includeMetadata )
  {
  const metaContent = fs.readFileSync(
@@ -847,7 +832,11 @@ class WebScraper
  "utf-8"
  );
  fs.writeFileSync(
- path.join( fullOutputPath, "texts_with_metadata", `${textFileCounter}.txt` ),
+ path.join(
+ fullOutputPath,
+ "texts_with_metadata",
+ `${textFileCounter}.txt`
+ ),
  metaContent,
  "utf-8"
  );
@@ -858,4 +847,4 @@ class WebScraper
  }
  }

- module.exports = WebScraper;
+ module.exports = WebScraper;
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "clean-web-scraper",
- "version": "3.9.0",
+ "version": "3.9.1",
  "main": "main.js",
  "scripts": {
  "start": "node main.js",