clean-web-scraper 3.3.6 → 3.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/example-usage.js +2 -4
- package/package.json +1 -1
- package/src/WebScraper.js +14 -3
package/example-usage.js
CHANGED
|
@@ -100,7 +100,8 @@ async function electronicintifada ()
|
|
|
100
100
|
"https://electronicintifada.net/tags/",
|
|
101
101
|
"https://electronicintifada.net/blog",
|
|
102
102
|
"https://electronicintifada.net/people",
|
|
103
|
-
"https://electronicintifada.net/location"
|
|
103
|
+
"https://electronicintifada.net/location",
|
|
104
|
+
"https://electronicintifada.net/file"
|
|
104
105
|
],
|
|
105
106
|
exactExcludeList: [
|
|
106
107
|
"https://electronicintifada.net",
|
|
@@ -133,9 +134,6 @@ void async function main ()
|
|
|
133
134
|
electronicintifadaScraper
|
|
134
135
|
] );
|
|
135
136
|
|
|
136
|
-
// 4
|
|
137
|
-
// https://electronicintifada.net/
|
|
138
|
-
|
|
139
137
|
// 5
|
|
140
138
|
// https://www.palestineremembered.com/ZionistFAQ.html
|
|
141
139
|
|
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
|
@@ -20,12 +20,14 @@ class WebScraper
|
|
|
20
20
|
textOutputPath,
|
|
21
21
|
csvOutputPath,
|
|
22
22
|
includeMetadata = false,
|
|
23
|
-
metadataFields = [], // ['title', 'description', 'author',
|
|
23
|
+
metadataFields = [], // ['title', 'description', 'author', etc.]
|
|
24
24
|
headers,
|
|
25
25
|
usePuppeteer,
|
|
26
26
|
puppeteerProxy, // e.g. http://127.0.0.1:2080
|
|
27
27
|
puppeteerExecutablePath,
|
|
28
|
-
puppeteerRealProxy
|
|
28
|
+
puppeteerRealProxy,
|
|
29
|
+
filterFileTypes = true,
|
|
30
|
+
excludedFileTypes = [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"]
|
|
29
31
|
})
|
|
30
32
|
{
|
|
31
33
|
this.baseURL = baseURL;
|
|
@@ -44,6 +46,8 @@ class WebScraper
|
|
|
44
46
|
this.excludeList = this.normalizeExcludeList( excludeList );
|
|
45
47
|
this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
|
|
46
48
|
this.allProcessedContent = [];
|
|
49
|
+
this.filterFileTypes = filterFileTypes;
|
|
50
|
+
this.excludedFileTypes = excludedFileTypes;
|
|
47
51
|
this.usePuppeteer = usePuppeteer || false;
|
|
48
52
|
this.puppeteerOptions = {
|
|
49
53
|
headless: false,
|
|
@@ -120,7 +124,7 @@ class WebScraper
|
|
|
120
124
|
const dom = new JSDOM( data, { url });
|
|
121
125
|
const { document } = dom.window;
|
|
122
126
|
|
|
123
|
-
if ( !this.isExcluded( url ) )
|
|
127
|
+
if ( !this.isExcluded( url ) && this.isValidFileType( url ) )
|
|
124
128
|
{
|
|
125
129
|
const reader = new Readability( document, { charThreshold: 500, nbTopCandidates: 20 });
|
|
126
130
|
const article = reader.parse();
|
|
@@ -548,6 +552,13 @@ class WebScraper
|
|
|
548
552
|
fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
|
|
549
553
|
}
|
|
550
554
|
|
|
555
|
+
isValidFileType ( url )
|
|
556
|
+
{
|
|
557
|
+
if ( !this.filterFileTypes ) return true;
|
|
558
|
+
const urlPath = new URL( url ).pathname.toLowerCase();
|
|
559
|
+
return !this.excludedFileTypes.some( ext => { return urlPath.endsWith( ext ) });
|
|
560
|
+
}
|
|
561
|
+
|
|
551
562
|
isValidContent ( content )
|
|
552
563
|
{
|
|
553
564
|
// Remove whitespace and newlines for checking
|