clean-web-scraper 3.3.6 → 3.3.8

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
package/example-usage.js CHANGED
@@ -100,7 +100,8 @@ async function electronicintifada ()
100
100
  "https://electronicintifada.net/tags/",
101
101
  "https://electronicintifada.net/blog",
102
102
  "https://electronicintifada.net/people",
103
- "https://electronicintifada.net/location"
103
+ "https://electronicintifada.net/location",
104
+ "https://electronicintifada.net/file"
104
105
  ],
105
106
  exactExcludeList: [
106
107
  "https://electronicintifada.net",
@@ -133,9 +134,6 @@ void async function main ()
133
134
  electronicintifadaScraper
134
135
  ] );
135
136
 
136
- // 4
137
- // https://electronicintifada.net/
138
-
139
137
  // 5
140
138
  // https://www.palestineremembered.com/ZionistFAQ.html
141
139
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.3.6",
3
+ "version": "3.3.8",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -20,12 +20,14 @@ class WebScraper
20
20
  textOutputPath,
21
21
  csvOutputPath,
22
22
  includeMetadata = false,
23
- metadataFields = [], // ['title', 'description', 'author', 'lastModified', etc.]
23
+ metadataFields = [], // ['title', 'description', 'author', etc.]
24
24
  headers,
25
25
  usePuppeteer,
26
26
  puppeteerProxy, // e.g. http://127.0.0.1:2080
27
27
  puppeteerExecutablePath,
28
- puppeteerRealProxy
28
+ puppeteerRealProxy,
29
+ filterFileTypes = true,
30
+ excludedFileTypes = [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"]
29
31
  })
30
32
  {
31
33
  this.baseURL = baseURL;
@@ -44,6 +46,8 @@ class WebScraper
44
46
  this.excludeList = this.normalizeExcludeList( excludeList );
45
47
  this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
46
48
  this.allProcessedContent = [];
49
+ this.filterFileTypes = filterFileTypes;
50
+ this.excludedFileTypes = excludedFileTypes;
47
51
  this.usePuppeteer = usePuppeteer || false;
48
52
  this.puppeteerOptions = {
49
53
  headless: false,
@@ -120,7 +124,7 @@ class WebScraper
120
124
  const dom = new JSDOM( data, { url });
121
125
  const { document } = dom.window;
122
126
 
123
- if ( !this.isExcluded( url ) )
127
+ if ( !this.isExcluded( url ) && this.isValidFileType( url ) )
124
128
  {
125
129
  const reader = new Readability( document, { charThreshold: 500, nbTopCandidates: 20 });
126
130
  const article = reader.parse();
@@ -548,6 +552,13 @@ class WebScraper
548
552
  fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
549
553
  }
550
554
 
555
+ isValidFileType ( url )
556
+ {
557
+ if ( !this.filterFileTypes ) return true;
558
+ const urlPath = new URL( url ).pathname.toLowerCase();
559
+ return !this.excludedFileTypes.some( ext => { return urlPath.endsWith( ext ) });
560
+ }
561
+
551
562
  isValidContent ( content )
552
563
  {
553
564
  // Remove whitespace and newlines for checking