clean-web-scraper 3.3.2 → 3.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/example-usage.js CHANGED
@@ -76,14 +76,14 @@ async function bdsmovement ()
76
76
 
77
77
  void async function main ()
78
78
  {
79
- // const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
80
- // const decolonizepalestineScraper = await decolonizepalestine();
81
- const bdsmovementScraper = await bdsmovement();
82
- // await WebScraper.combineResults( "./dataset/combined", [
83
- // khameneiIrFreePalestineTagScraper,
84
- // decolonizepalestineScraper,
85
- // bdsmovementScraper
86
- // ] );
79
+ const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
80
+ const decolonizepalestineScraper = await decolonizepalestine();
81
+ // const bdsmovementScraper = await bdsmovement();
82
+ await WebScraper.combineResults( "./dataset/combined", [
83
+ khameneiIrFreePalestineTagScraper,
84
+ decolonizepalestineScraper,
85
+ // bdsmovementScraper
86
+ ] );
87
87
 
88
88
  // 4
89
89
  // https://electronicintifada.net/
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.3.2",
3
+ "version": "3.3.3",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -408,6 +408,30 @@ class WebScraper
408
408
  fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
409
409
  }
410
410
 
411
+ isValidContent ( content )
412
+ {
413
+ // Remove whitespace and newlines for checking
414
+ const cleanContent = content.replace( /\s+/g, " " ).trim().toLowerCase();
415
+
416
+ // List of phrases that indicate invalid content
417
+ const invalidPhrases = [
418
+ "verifying that you are not a robot",
419
+ "checking if the site connection is secure",
420
+ "please wait while we verify",
421
+ "please enable javascript",
422
+ "access denied",
423
+ "captcha verification"
424
+ ];
425
+
426
+ const hasInvalidPhrases = invalidPhrases.some( phrase => { return cleanContent.includes( phrase ) });
427
+ // Check content length
428
+ if ( cleanContent.length < 100 && hasInvalidPhrases )
429
+ {
430
+ return false;
431
+ }
432
+ return true;
433
+ }
434
+
411
435
  static sleep ( ms )
412
436
  {
413
437
  return new Promise( resolve => { return setTimeout( resolve, ms ) });
@@ -448,12 +472,18 @@ class WebScraper
448
472
  for ( const website of websites )
449
473
  {
450
474
  const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
451
- jsonlOutput.write( jsonlContent );
475
+ if ( jsonlContent )
476
+ {
477
+ jsonlOutput.write( jsonlContent );
478
+ }
452
479
 
453
480
  if ( website.includeMetadata )
454
481
  {
455
482
  const jsonlMetaContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPathWithMeta ), "utf-8" );
456
- jsonlMetaOutput.write( jsonlMetaContent );
483
+ if ( jsonlMetaContent )
484
+ {
485
+ jsonlMetaOutput.write( jsonlMetaContent );
486
+ }
457
487
  }
458
488
  }
459
489
 
@@ -480,7 +510,10 @@ class WebScraper
480
510
  .split( "\n" )
481
511
  .slice( 1 )
482
512
  .filter( line => { return line.trim() });
483
- csvOutput.write( `${csvContent.join( "\n" )}\n` );
513
+ if ( csvContent.length > 0 )
514
+ {
515
+ csvOutput.write( `${csvContent.join( "\n" )}\n` );
516
+ }
484
517
 
485
518
  if ( website.includeMetadata )
486
519
  {
@@ -488,7 +521,10 @@ class WebScraper
488
521
  .split( "\n" )
489
522
  .slice( 1 )
490
523
  .filter( line => { return line.trim() });
491
- csvMetaOutput.write( `${csvMetaContent.join( "\n" )}\n` );
524
+ if ( csvMetaContent.length > 0 )
525
+ {
526
+ csvMetaOutput.write( `${csvMetaContent.join( "\n" )}\n` );
527
+ }
492
528
  }
493
529
  }
494
530
 
@@ -496,30 +532,6 @@ class WebScraper
496
532
  csvMetaOutput.end();
497
533
  }
498
534
 
499
- isValidContent ( content )
500
- {
501
- // Remove whitespace and newlines for checking
502
- const cleanContent = content.replace( /\s+/g, " " ).trim().toLowerCase();
503
-
504
- // List of phrases that indicate invalid content
505
- const invalidPhrases = [
506
- "verifying that you are not a robot",
507
- "checking if the site connection is secure",
508
- "please wait while we verify",
509
- "please enable javascript",
510
- "access denied",
511
- "captcha verification"
512
- ];
513
-
514
- const hasInvalidPhrases = invalidPhrases.some( phrase => { return cleanContent.includes( phrase ) });
515
- // Check content length
516
- if ( cleanContent.length < 100 && hasInvalidPhrases )
517
- {
518
- return false;
519
- }
520
- return true;
521
- }
522
-
523
535
  static combineTextFiles ( fullOutputPath, websites )
524
536
  {
525
537
  let textFileCounter = 1;
@@ -553,7 +565,6 @@ class WebScraper
553
565
  }
554
566
  }
555
567
  }
556
-
557
568
  }
558
569
 
559
570
  module.exports = WebScraper;