clean-web-scraper 3.3.5 → 3.3.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/example-usage.js +17 -8
- package/package.json +1 -1
- package/src/WebScraper.js +30 -2
package/example-usage.js
CHANGED
|
@@ -83,7 +83,7 @@ async function bdsmovement ()
|
|
|
83
83
|
host: "socks5://127.0.0.1",
|
|
84
84
|
port: "2080",
|
|
85
85
|
},
|
|
86
|
-
usePuppeteer: true
|
|
86
|
+
// usePuppeteer: true
|
|
87
87
|
});
|
|
88
88
|
await scraper.start();
|
|
89
89
|
return scraper;
|
|
@@ -95,12 +95,19 @@ async function electronicintifada ()
|
|
|
95
95
|
const scraper = new WebScraper({
|
|
96
96
|
baseURL: "https://electronicintifada.net",
|
|
97
97
|
excludeList: [
|
|
98
|
-
"https://electronicintifada.net/
|
|
99
|
-
"https://electronicintifada.net/
|
|
100
|
-
"https://electronicintifada.net/
|
|
101
|
-
"https://electronicintifada.net/
|
|
102
|
-
"https://electronicintifada.net/
|
|
103
|
-
"https://electronicintifada.net/
|
|
98
|
+
"https://electronicintifada.net/updates",
|
|
99
|
+
"https://electronicintifada.net/taxonomy/term/",
|
|
100
|
+
"https://electronicintifada.net/tags/",
|
|
101
|
+
"https://electronicintifada.net/blog",
|
|
102
|
+
"https://electronicintifada.net/people",
|
|
103
|
+
"https://electronicintifada.net/location"
|
|
104
|
+
],
|
|
105
|
+
exactExcludeList: [
|
|
106
|
+
"https://electronicintifada.net",
|
|
107
|
+
"https://electronicintifada.net/blog",
|
|
108
|
+
"https://electronicintifada.net/news",
|
|
109
|
+
"https://electronicintifada.net/opinion",
|
|
110
|
+
"https://electronicintifada.net/review",
|
|
104
111
|
],
|
|
105
112
|
scrapResultPath: "./dataset/electronicintifada/website",
|
|
106
113
|
jsonlOutputPath: "./dataset/electronicintifada/train.jsonl",
|
|
@@ -118,10 +125,12 @@ void async function main ()
|
|
|
118
125
|
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
|
|
119
126
|
const decolonizepalestineScraper = await decolonizepalestine();
|
|
120
127
|
const bdsmovementScraper = await bdsmovement();
|
|
128
|
+
const electronicintifadaScraper = await electronicintifada();
|
|
121
129
|
await WebScraper.combineResults( "./dataset/combined", [
|
|
122
130
|
khameneiIrFreePalestineTagScraper,
|
|
123
131
|
decolonizepalestineScraper,
|
|
124
|
-
bdsmovementScraper
|
|
132
|
+
bdsmovementScraper,
|
|
133
|
+
electronicintifadaScraper
|
|
125
134
|
] );
|
|
126
135
|
|
|
127
136
|
// 4
|
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
|
@@ -25,7 +25,9 @@ class WebScraper
|
|
|
25
25
|
usePuppeteer,
|
|
26
26
|
puppeteerProxy, // e.g. http://127.0.0.1:2080
|
|
27
27
|
puppeteerExecutablePath,
|
|
28
|
-
puppeteerRealProxy
|
|
28
|
+
puppeteerRealProxy,
|
|
29
|
+
filterFileTypes = true,
|
|
30
|
+
excludedFileTypes = [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"]
|
|
29
31
|
})
|
|
30
32
|
{
|
|
31
33
|
this.baseURL = baseURL;
|
|
@@ -44,6 +46,8 @@ class WebScraper
|
|
|
44
46
|
this.excludeList = this.normalizeExcludeList( excludeList );
|
|
45
47
|
this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
|
|
46
48
|
this.allProcessedContent = [];
|
|
49
|
+
this.filterFileTypes = filterFileTypes;
|
|
50
|
+
this.excludedFileTypes = excludedFileTypes;
|
|
47
51
|
this.usePuppeteer = usePuppeteer || false;
|
|
48
52
|
this.puppeteerOptions = {
|
|
49
53
|
headless: false,
|
|
@@ -120,7 +124,7 @@ class WebScraper
|
|
|
120
124
|
const dom = new JSDOM( data, { url });
|
|
121
125
|
const { document } = dom.window;
|
|
122
126
|
|
|
123
|
-
if ( !this.isExcluded( url ) )
|
|
127
|
+
if ( !this.isExcluded( url ) && this.isValidFileType( url ) )
|
|
124
128
|
{
|
|
125
129
|
const reader = new Readability( document, { charThreshold: 500, nbTopCandidates: 20 });
|
|
126
130
|
const article = reader.parse();
|
|
@@ -437,6 +441,23 @@ class WebScraper
|
|
|
437
441
|
// processed = processed.replace(/\[.*?\]/g, ''); // Removes all content within square brackets
|
|
438
442
|
// processed = processed.replace(/\(.*?\)/g, ''); // Removes all content within parentheses
|
|
439
443
|
|
|
444
|
+
// Remove specified words from the end of content, handling multiple occurrences
|
|
445
|
+
const wordsToTrim = ["Facebook", "Twitter", "Donate Now", "Instagram"];
|
|
446
|
+
let changed = true;
|
|
447
|
+
|
|
448
|
+
while ( changed )
|
|
449
|
+
{
|
|
450
|
+
changed = false;
|
|
451
|
+
for ( let i = 0; i < wordsToTrim.length; i++ )
|
|
452
|
+
{
|
|
453
|
+
const oldProcessed = processed;
|
|
454
|
+
processed = processed.replace( new RegExp( `\\s*${wordsToTrim[i]}\\s*$`, "g" ), "" ).trim();
|
|
455
|
+
if ( oldProcessed !== processed )
|
|
456
|
+
{
|
|
457
|
+
changed = true;
|
|
458
|
+
}
|
|
459
|
+
}
|
|
460
|
+
}
|
|
440
461
|
return processed;
|
|
441
462
|
}
|
|
442
463
|
|
|
@@ -531,6 +552,13 @@ class WebScraper
|
|
|
531
552
|
fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
|
|
532
553
|
}
|
|
533
554
|
|
|
555
|
+
isValidFileType ( url )
|
|
556
|
+
{
|
|
557
|
+
if ( !this.filterFileTypes ) return true;
|
|
558
|
+
const urlPath = new URL( url ).pathname.toLowerCase();
|
|
559
|
+
return !this.excludedFileTypes.some( ext => { return urlPath.endsWith( ext ) });
|
|
560
|
+
}
|
|
561
|
+
|
|
534
562
|
isValidContent ( content )
|
|
535
563
|
{
|
|
536
564
|
// Remove whitespace and newlines for checking
|