clean-web-scraper 3.3.6 → 3.3.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2):
  1. package/package.json +1 -1
  2. package/src/WebScraper.js +13 -2
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.3.6",
3
+ "version": "3.3.7",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -25,7 +25,9 @@ class WebScraper
25
25
  usePuppeteer,
26
26
  puppeteerProxy, // e.g. http://127.0.0.1:2080
27
27
  puppeteerExecutablePath,
28
- puppeteerRealProxy
28
+ puppeteerRealProxy,
29
+ filterFileTypes = true,
30
+ excludedFileTypes = [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"]
29
31
  })
30
32
  {
31
33
  this.baseURL = baseURL;
@@ -44,6 +46,8 @@ class WebScraper
44
46
  this.excludeList = this.normalizeExcludeList( excludeList );
45
47
  this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
46
48
  this.allProcessedContent = [];
49
+ this.filterFileTypes = filterFileTypes;
50
+ this.excludedFileTypes = excludedFileTypes;
47
51
  this.usePuppeteer = usePuppeteer || false;
48
52
  this.puppeteerOptions = {
49
53
  headless: false,
@@ -120,7 +124,7 @@ class WebScraper
120
124
  const dom = new JSDOM( data, { url });
121
125
  const { document } = dom.window;
122
126
 
123
- if ( !this.isExcluded( url ) )
127
+ if ( !this.isExcluded( url ) && this.isValidFileType( url ) )
124
128
  {
125
129
  const reader = new Readability( document, { charThreshold: 500, nbTopCandidates: 20 });
126
130
  const article = reader.parse();
@@ -548,6 +552,13 @@ class WebScraper
548
552
  fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
549
553
  }
550
554
 
555
+ isValidFileType ( url )
556
+ {
557
+ if ( !this.filterFileTypes ) return true;
558
+ const urlPath = new URL( url ).pathname.toLowerCase();
559
+ return !this.excludedFileTypes.some( ext => { return urlPath.endsWith( ext ) });
560
+ }
561
+
551
562
  isValidContent ( content )
552
563
  {
553
564
  // Remove whitespace and newlines for checking