clean-web-scraper 3.3.6 → 3.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/WebScraper.js +13 -2
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
|
@@ -25,7 +25,9 @@ class WebScraper
|
|
|
25
25
|
usePuppeteer,
|
|
26
26
|
puppeteerProxy, // e.g. http://127.0.0.1:2080
|
|
27
27
|
puppeteerExecutablePath,
|
|
28
|
-
puppeteerRealProxy
|
|
28
|
+
puppeteerRealProxy,
|
|
29
|
+
filterFileTypes = true,
|
|
30
|
+
excludedFileTypes = [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"]
|
|
29
31
|
})
|
|
30
32
|
{
|
|
31
33
|
this.baseURL = baseURL;
|
|
@@ -44,6 +46,8 @@ class WebScraper
|
|
|
44
46
|
this.excludeList = this.normalizeExcludeList( excludeList );
|
|
45
47
|
this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
|
|
46
48
|
this.allProcessedContent = [];
|
|
49
|
+
this.filterFileTypes = filterFileTypes;
|
|
50
|
+
this.excludedFileTypes = excludedFileTypes;
|
|
47
51
|
this.usePuppeteer = usePuppeteer || false;
|
|
48
52
|
this.puppeteerOptions = {
|
|
49
53
|
headless: false,
|
|
@@ -120,7 +124,7 @@ class WebScraper
|
|
|
120
124
|
const dom = new JSDOM( data, { url });
|
|
121
125
|
const { document } = dom.window;
|
|
122
126
|
|
|
123
|
-
if ( !this.isExcluded( url ) )
|
|
127
|
+
if ( !this.isExcluded( url ) && this.isValidFileType( url ) )
|
|
124
128
|
{
|
|
125
129
|
const reader = new Readability( document, { charThreshold: 500, nbTopCandidates: 20 });
|
|
126
130
|
const article = reader.parse();
|
|
@@ -548,6 +552,13 @@ class WebScraper
|
|
|
548
552
|
fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
|
|
549
553
|
}
|
|
550
554
|
|
|
555
|
+
isValidFileType ( url )
|
|
556
|
+
{
|
|
557
|
+
if ( !this.filterFileTypes ) return true;
|
|
558
|
+
const urlPath = new URL( url ).pathname.toLowerCase();
|
|
559
|
+
return !this.excludedFileTypes.some( ext => { return urlPath.endsWith( ext ) });
|
|
560
|
+
}
|
|
561
|
+
|
|
551
562
|
isValidContent ( content )
|
|
552
563
|
{
|
|
553
564
|
// Remove whitespace and newlines for checking
|