clean-web-scraper 3.3.2 → 3.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/example-usage.js +8 -8
- package/package.json +1 -1
- package/src/WebScraper.js +40 -29
package/example-usage.js
CHANGED
|
@@ -76,14 +76,14 @@ async function bdsmovement ()
|
|
|
76
76
|
|
|
77
77
|
void async function main ()
|
|
78
78
|
{
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
const bdsmovementScraper = await bdsmovement();
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
79
|
+
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
|
|
80
|
+
const decolonizepalestineScraper = await decolonizepalestine();
|
|
81
|
+
// const bdsmovementScraper = await bdsmovement();
|
|
82
|
+
await WebScraper.combineResults( "./dataset/combined", [
|
|
83
|
+
khameneiIrFreePalestineTagScraper,
|
|
84
|
+
decolonizepalestineScraper,
|
|
85
|
+
// bdsmovementScraper
|
|
86
|
+
] );
|
|
87
87
|
|
|
88
88
|
// 4
|
|
89
89
|
// https://electronicintifada.net/
|
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
|
@@ -408,6 +408,30 @@ class WebScraper
|
|
|
408
408
|
fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
|
|
409
409
|
}
|
|
410
410
|
|
|
411
|
+
isValidContent ( content )
|
|
412
|
+
{
|
|
413
|
+
// Remove whitespace and newlines for checking
|
|
414
|
+
const cleanContent = content.replace( /\s+/g, " " ).trim().toLowerCase();
|
|
415
|
+
|
|
416
|
+
// List of phrases that indicate invalid content
|
|
417
|
+
const invalidPhrases = [
|
|
418
|
+
"verifying that you are not a robot",
|
|
419
|
+
"checking if the site connection is secure",
|
|
420
|
+
"please wait while we verify",
|
|
421
|
+
"please enable javascript",
|
|
422
|
+
"access denied",
|
|
423
|
+
"captcha verification"
|
|
424
|
+
];
|
|
425
|
+
|
|
426
|
+
const hasInvalidPhrases = invalidPhrases.some( phrase => { return cleanContent.includes( phrase ) });
|
|
427
|
+
// Check content length
|
|
428
|
+
if ( cleanContent.length < 100 && hasInvalidPhrases )
|
|
429
|
+
{
|
|
430
|
+
return false;
|
|
431
|
+
}
|
|
432
|
+
return true;
|
|
433
|
+
}
|
|
434
|
+
|
|
411
435
|
static sleep ( ms )
|
|
412
436
|
{
|
|
413
437
|
return new Promise( resolve => { return setTimeout( resolve, ms ) });
|
|
@@ -448,12 +472,18 @@ class WebScraper
|
|
|
448
472
|
for ( const website of websites )
|
|
449
473
|
{
|
|
450
474
|
const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
|
|
451
|
-
|
|
475
|
+
if ( jsonlContent )
|
|
476
|
+
{
|
|
477
|
+
jsonlOutput.write( jsonlContent );
|
|
478
|
+
}
|
|
452
479
|
|
|
453
480
|
if ( website.includeMetadata )
|
|
454
481
|
{
|
|
455
482
|
const jsonlMetaContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPathWithMeta ), "utf-8" );
|
|
456
|
-
|
|
483
|
+
if ( jsonlMetaContent )
|
|
484
|
+
{
|
|
485
|
+
jsonlMetaOutput.write( jsonlMetaContent );
|
|
486
|
+
}
|
|
457
487
|
}
|
|
458
488
|
}
|
|
459
489
|
|
|
@@ -480,7 +510,10 @@ class WebScraper
|
|
|
480
510
|
.split( "\n" )
|
|
481
511
|
.slice( 1 )
|
|
482
512
|
.filter( line => { return line.trim() });
|
|
483
|
-
|
|
513
|
+
if ( csvContent.length > 0 )
|
|
514
|
+
{
|
|
515
|
+
csvOutput.write( `${csvContent.join( "\n" )}\n` );
|
|
516
|
+
}
|
|
484
517
|
|
|
485
518
|
if ( website.includeMetadata )
|
|
486
519
|
{
|
|
@@ -488,7 +521,10 @@ class WebScraper
|
|
|
488
521
|
.split( "\n" )
|
|
489
522
|
.slice( 1 )
|
|
490
523
|
.filter( line => { return line.trim() });
|
|
491
|
-
|
|
524
|
+
if ( csvMetaContent.length > 0 )
|
|
525
|
+
{
|
|
526
|
+
csvMetaOutput.write( `${csvMetaContent.join( "\n" )}\n` );
|
|
527
|
+
}
|
|
492
528
|
}
|
|
493
529
|
}
|
|
494
530
|
|
|
@@ -496,30 +532,6 @@ class WebScraper
|
|
|
496
532
|
csvMetaOutput.end();
|
|
497
533
|
}
|
|
498
534
|
|
|
499
|
-
isValidContent ( content )
|
|
500
|
-
{
|
|
501
|
-
// Remove whitespace and newlines for checking
|
|
502
|
-
const cleanContent = content.replace( /\s+/g, " " ).trim().toLowerCase();
|
|
503
|
-
|
|
504
|
-
// List of phrases that indicate invalid content
|
|
505
|
-
const invalidPhrases = [
|
|
506
|
-
"verifying that you are not a robot",
|
|
507
|
-
"checking if the site connection is secure",
|
|
508
|
-
"please wait while we verify",
|
|
509
|
-
"please enable javascript",
|
|
510
|
-
"access denied",
|
|
511
|
-
"captcha verification"
|
|
512
|
-
];
|
|
513
|
-
|
|
514
|
-
const hasInvalidPhrases = invalidPhrases.some( phrase => { return cleanContent.includes( phrase ) });
|
|
515
|
-
// Check content length
|
|
516
|
-
if ( cleanContent.length < 100 && hasInvalidPhrases )
|
|
517
|
-
{
|
|
518
|
-
return false;
|
|
519
|
-
}
|
|
520
|
-
return true;
|
|
521
|
-
}
|
|
522
|
-
|
|
523
535
|
static combineTextFiles ( fullOutputPath, websites )
|
|
524
536
|
{
|
|
525
537
|
let textFileCounter = 1;
|
|
@@ -553,7 +565,6 @@ class WebScraper
|
|
|
553
565
|
}
|
|
554
566
|
}
|
|
555
567
|
}
|
|
556
|
-
|
|
557
568
|
}
|
|
558
569
|
|
|
559
570
|
module.exports = WebScraper;
|