clean-web-scraper 3.3.1 → 3.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/WebScraper.js +61 -8
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
|
@@ -19,7 +19,11 @@ class WebScraper
|
|
|
19
19
|
csvOutputPath,
|
|
20
20
|
includeMetadata = false,
|
|
21
21
|
metadataFields = [], // ['title', 'description', 'author', 'lastModified', etc.]
|
|
22
|
-
|
|
22
|
+
headers = {
|
|
23
|
+
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
|
|
24
|
+
"Cache-Control": "private",
|
|
25
|
+
"Accept": "application/xml,application/xhtml+xml,text/html;q=0.9, text/plain;q=0.8,image/png,*/*;q=0.5"
|
|
26
|
+
}
|
|
23
27
|
})
|
|
24
28
|
{
|
|
25
29
|
this.baseURL = baseURL;
|
|
@@ -31,7 +35,7 @@ class WebScraper
|
|
|
31
35
|
this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
|
|
32
36
|
this.jsonlOutputPathWithMeta = jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
|
|
33
37
|
this.csvOutputPathWithMeta = csvOutputPath.replace( ".csv", "_with_metadata.csv" );
|
|
34
|
-
this.
|
|
38
|
+
this.headers = headers;
|
|
35
39
|
this.includeMetadata = includeMetadata;
|
|
36
40
|
this.metadataFields = new Set( metadataFields );
|
|
37
41
|
this.visited = new Set();
|
|
@@ -60,9 +64,7 @@ class WebScraper
|
|
|
60
64
|
try
|
|
61
65
|
{
|
|
62
66
|
const { data, headers } = await axios.get( url, {
|
|
63
|
-
headers: {
|
|
64
|
-
"user-agent": this.userAgent
|
|
65
|
-
}
|
|
67
|
+
headers: this.headers,
|
|
66
68
|
});
|
|
67
69
|
const dom = new JSDOM( data, { url });
|
|
68
70
|
const { document } = dom.window;
|
|
@@ -74,9 +76,17 @@ class WebScraper
|
|
|
74
76
|
|
|
75
77
|
if ( article )
|
|
76
78
|
{
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
79
|
+
if ( this.isValidContent( article.textContent ) )
|
|
80
|
+
{
|
|
81
|
+
|
|
82
|
+
const metadata = this.metadataextractor( url, document, headers );
|
|
83
|
+
metadata.depth = depth;
|
|
84
|
+
this.saveArticle( url, article.textContent, metadata );
|
|
85
|
+
}
|
|
86
|
+
else
|
|
87
|
+
{
|
|
88
|
+
console.error( `Invalid content found at ${url}` );
|
|
89
|
+
}
|
|
80
90
|
}
|
|
81
91
|
else
|
|
82
92
|
{
|
|
@@ -373,8 +383,27 @@ class WebScraper
|
|
|
373
383
|
if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
|
|
374
384
|
{
|
|
375
385
|
fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
|
|
386
|
+
}
|
|
387
|
+
if ( fs.existsSync( path.join( __dirname, this.textOutputPath ) ) )
|
|
388
|
+
{
|
|
376
389
|
fs.rmSync( path.join( __dirname, this.textOutputPath ), { recursive: true, force: true });
|
|
377
390
|
}
|
|
391
|
+
if ( fs.existsSync( path.join( __dirname, this.csvOutputPath ) ) )
|
|
392
|
+
{
|
|
393
|
+
fs.rmSync( path.join( __dirname, this.csvOutputPath ), { recursive: true, force: true });
|
|
394
|
+
}
|
|
395
|
+
if ( fs.existsSync( path.join( __dirname, this.csvOutputPathWithMeta ) ) )
|
|
396
|
+
{
|
|
397
|
+
fs.rmSync( path.join( __dirname, this.csvOutputPathWithMeta ), { recursive: true, force: true });
|
|
398
|
+
}
|
|
399
|
+
if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPath ) ) )
|
|
400
|
+
{
|
|
401
|
+
fs.rmSync( path.join( __dirname, this.jsonlOutputPath ), { recursive: true, force: true });
|
|
402
|
+
}
|
|
403
|
+
if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPathWithMeta ) ) )
|
|
404
|
+
{
|
|
405
|
+
fs.rmSync( path.join( __dirname, this.jsonlOutputPathWithMeta ), { recursive: true, force: true });
|
|
406
|
+
}
|
|
378
407
|
fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
|
|
379
408
|
fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
|
|
380
409
|
}
|
|
@@ -467,6 +496,30 @@ class WebScraper
|
|
|
467
496
|
csvMetaOutput.end();
|
|
468
497
|
}
|
|
469
498
|
|
|
499
|
+
isValidContent ( content )
|
|
500
|
+
{
|
|
501
|
+
// Remove whitespace and newlines for checking
|
|
502
|
+
const cleanContent = content.replace( /\s+/g, " " ).trim().toLowerCase();
|
|
503
|
+
|
|
504
|
+
// List of phrases that indicate invalid content
|
|
505
|
+
const invalidPhrases = [
|
|
506
|
+
"verifying that you are not a robot",
|
|
507
|
+
"checking if the site connection is secure",
|
|
508
|
+
"please wait while we verify",
|
|
509
|
+
"please enable javascript",
|
|
510
|
+
"access denied",
|
|
511
|
+
"captcha verification"
|
|
512
|
+
];
|
|
513
|
+
|
|
514
|
+
const hasInvalidPhrases = invalidPhrases.some( phrase => { return cleanContent.includes( phrase ) });
|
|
515
|
+
// Check content length
|
|
516
|
+
if ( cleanContent.length < 100 && hasInvalidPhrases )
|
|
517
|
+
{
|
|
518
|
+
return false;
|
|
519
|
+
}
|
|
520
|
+
return true;
|
|
521
|
+
}
|
|
522
|
+
|
|
470
523
|
static combineTextFiles ( fullOutputPath, websites )
|
|
471
524
|
{
|
|
472
525
|
let textFileCounter = 1;
|