clean-web-scraper 3.3.5 → 3.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/example-usage.js +17 -8
- package/package.json +1 -1
- package/src/WebScraper.js +17 -0
package/example-usage.js
CHANGED
|
@@ -83,7 +83,7 @@ async function bdsmovement ()
|
|
|
83
83
|
host: "socks5://127.0.0.1",
|
|
84
84
|
port: "2080",
|
|
85
85
|
},
|
|
86
|
-
usePuppeteer: true
|
|
86
|
+
// usePuppeteer: true
|
|
87
87
|
});
|
|
88
88
|
await scraper.start();
|
|
89
89
|
return scraper;
|
|
@@ -95,12 +95,19 @@ async function electronicintifada ()
|
|
|
95
95
|
const scraper = new WebScraper({
|
|
96
96
|
baseURL: "https://electronicintifada.net",
|
|
97
97
|
excludeList: [
|
|
98
|
-
"https://electronicintifada.net/
|
|
99
|
-
"https://electronicintifada.net/
|
|
100
|
-
"https://electronicintifada.net/
|
|
101
|
-
"https://electronicintifada.net/
|
|
102
|
-
"https://electronicintifada.net/
|
|
103
|
-
"https://electronicintifada.net/
|
|
98
|
+
"https://electronicintifada.net/updates",
|
|
99
|
+
"https://electronicintifada.net/taxonomy/term/",
|
|
100
|
+
"https://electronicintifada.net/tags/",
|
|
101
|
+
"https://electronicintifada.net/blog",
|
|
102
|
+
"https://electronicintifada.net/people",
|
|
103
|
+
"https://electronicintifada.net/location"
|
|
104
|
+
],
|
|
105
|
+
exactExcludeList: [
|
|
106
|
+
"https://electronicintifada.net",
|
|
107
|
+
"https://electronicintifada.net/blog",
|
|
108
|
+
"https://electronicintifada.net/news",
|
|
109
|
+
"https://electronicintifada.net/opinion",
|
|
110
|
+
"https://electronicintifada.net/review",
|
|
104
111
|
],
|
|
105
112
|
scrapResultPath: "./dataset/electronicintifada/website",
|
|
106
113
|
jsonlOutputPath: "./dataset/electronicintifada/train.jsonl",
|
|
@@ -118,10 +125,12 @@ void async function main ()
|
|
|
118
125
|
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
|
|
119
126
|
const decolonizepalestineScraper = await decolonizepalestine();
|
|
120
127
|
const bdsmovementScraper = await bdsmovement();
|
|
128
|
+
const electronicintifadaScraper = await electronicintifada();
|
|
121
129
|
await WebScraper.combineResults( "./dataset/combined", [
|
|
122
130
|
khameneiIrFreePalestineTagScraper,
|
|
123
131
|
decolonizepalestineScraper,
|
|
124
|
-
bdsmovementScraper
|
|
132
|
+
bdsmovementScraper,
|
|
133
|
+
electronicintifadaScraper
|
|
125
134
|
] );
|
|
126
135
|
|
|
127
136
|
// 4
|
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
|
@@ -437,6 +437,23 @@ class WebScraper
|
|
|
437
437
|
// processed = processed.replace(/\[.*?\]/g, ''); // Removes all content within square brackets
|
|
438
438
|
// processed = processed.replace(/\(.*?\)/g, ''); // Removes all content within parentheses
|
|
439
439
|
|
|
440
|
+
// Remove specified words from the end of content, handling multiple occurrences
|
|
441
|
+
const wordsToTrim = ["Facebook", "Twitter", "Donate Now", "Instagram"];
|
|
442
|
+
let changed = true;
|
|
443
|
+
|
|
444
|
+
while ( changed )
|
|
445
|
+
{
|
|
446
|
+
changed = false;
|
|
447
|
+
for ( let i = 0; i < wordsToTrim.length; i++ )
|
|
448
|
+
{
|
|
449
|
+
const oldProcessed = processed;
|
|
450
|
+
processed = processed.replace( new RegExp( `\\s*${wordsToTrim[i]}\\s*$`, "g" ), "" ).trim();
|
|
451
|
+
if ( oldProcessed !== processed )
|
|
452
|
+
{
|
|
453
|
+
changed = true;
|
|
454
|
+
}
|
|
455
|
+
}
|
|
456
|
+
}
|
|
440
457
|
return processed;
|
|
441
458
|
}
|
|
442
459
|
|