clean-web-scraper 3.5.1 → 3.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/example-usage.js +40 -14
- package/package.json +1 -1
- package/src/WebScraper.js +24 -2
package/example-usage.js
CHANGED
|
@@ -8,8 +8,36 @@ const WebScraper = require( "./src/WebScraper" );
|
|
|
8
8
|
// "Cookie": cookies
|
|
9
9
|
// }
|
|
10
10
|
|
|
11
|
+
async function palianswers ( enable )
|
|
12
|
+
{
|
|
13
|
+
const scraper = new WebScraper({
|
|
14
|
+
baseURL: "https://palianswers.com",
|
|
15
|
+
excludeList: [
|
|
16
|
+
"https://palianswers.com/chat/",
|
|
17
|
+
"https://palianswers.com/become-a-volunteer/",
|
|
18
|
+
"https://palianswers.com/other-resources/",
|
|
19
|
+
"https://palianswers.com/request-a-rebuttal/",
|
|
20
|
+
"https://palianswers.com/submit-a-rebuttal/",
|
|
21
|
+
"https://palianswers.com/themes/"
|
|
22
|
+
],
|
|
23
|
+
exactExcludeList: [
|
|
24
|
+
"https://palianswers.com/",
|
|
25
|
+
],
|
|
26
|
+
scrapResultPath: "./dataset/palianswers/website",
|
|
27
|
+
jsonlOutputPath: "./dataset/palianswers/train.jsonl",
|
|
28
|
+
textOutputPath: "./dataset/palianswers/texts",
|
|
29
|
+
csvOutputPath: "./dataset/palianswers/train.csv",
|
|
30
|
+
includeMetadata: true,
|
|
31
|
+
metadataFields: ["title", "description", "author"]
|
|
32
|
+
});
|
|
33
|
+
if ( enable )
|
|
34
|
+
{
|
|
35
|
+
await scraper.start();
|
|
36
|
+
}
|
|
37
|
+
return scraper;
|
|
38
|
+
}
|
|
11
39
|
|
|
12
|
-
async function khameneiIrFreePalestineTag ()
|
|
40
|
+
async function khameneiIrFreePalestineTag ( enable )
|
|
13
41
|
{
|
|
14
42
|
// https://english.khamenei.ir/Opinions/FreePalestine
|
|
15
43
|
// https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#
|
|
@@ -135,6 +163,7 @@ async function electronicintifada ( enable )
|
|
|
135
163
|
|
|
136
164
|
async function palestineremembered ( enable )
|
|
137
165
|
{
|
|
166
|
+
// https://www.palestineremembered.com
|
|
138
167
|
const scraper = new WebScraper({
|
|
139
168
|
baseURL: "https://www.palestineremembered.com",
|
|
140
169
|
startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
|
|
@@ -179,24 +208,21 @@ async function palestineremembered ( enable )
|
|
|
179
208
|
|
|
180
209
|
void async function main ()
|
|
181
210
|
{
|
|
182
|
-
const
|
|
183
|
-
const decolonizepalestineScraper = await decolonizepalestine(
|
|
211
|
+
const palianswersScraper = await palianswers( true );
|
|
212
|
+
const decolonizepalestineScraper = await decolonizepalestine( true );
|
|
213
|
+
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( true );
|
|
184
214
|
const bdsmovementScraper = await bdsmovement( false );
|
|
185
|
-
const electronicintifadaScraper = await electronicintifada(
|
|
186
|
-
|
|
215
|
+
const electronicintifadaScraper = await electronicintifada( true );
|
|
216
|
+
const palestinerememberedScraper = await palestineremembered( false );
|
|
217
|
+
|
|
187
218
|
await WebScraper.combineResults( "./dataset/combined", [
|
|
188
|
-
|
|
219
|
+
palianswersScraper,
|
|
189
220
|
decolonizepalestineScraper,
|
|
190
|
-
|
|
221
|
+
khameneiIrFreePalestineTagScraper,
|
|
191
222
|
electronicintifadaScraper,
|
|
192
|
-
|
|
223
|
+
// bdsmovementScraper,
|
|
224
|
+
// palestinerememberedScraper,
|
|
193
225
|
] );
|
|
194
226
|
|
|
195
|
-
// 5
|
|
196
|
-
// https://www.palestineremembered.com/ZionistFAQ.html
|
|
197
|
-
|
|
198
|
-
// 6 https://the-palestinian-side.vercel.app/
|
|
199
|
-
|
|
200
227
|
// 7 https://stand-with-palestine.org/blogs
|
|
201
228
|
}()
|
|
202
|
-
|
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
|
@@ -12,8 +12,9 @@ class WebScraper
|
|
|
12
12
|
constructor ({
|
|
13
13
|
baseURL,
|
|
14
14
|
startURL,
|
|
15
|
+
strictBaseURL = true,
|
|
15
16
|
maxDepth = Infinity,
|
|
16
|
-
maxArticles = Infinity,
|
|
17
|
+
maxArticles = Infinity,
|
|
17
18
|
excludeList,
|
|
18
19
|
exactExcludeList,
|
|
19
20
|
scrapResultPath = "./dataset",
|
|
@@ -34,8 +35,9 @@ class WebScraper
|
|
|
34
35
|
{
|
|
35
36
|
this.baseURL = baseURL;
|
|
36
37
|
this.startURL = startURL || baseURL;
|
|
38
|
+
this.strictBaseURL = strictBaseURL;
|
|
37
39
|
this.maxDepth = maxDepth;
|
|
38
|
-
this.maxArticles = maxArticles;
|
|
40
|
+
this.maxArticles = maxArticles;
|
|
39
41
|
this.scrapResultPath = scrapResultPath;
|
|
40
42
|
this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
|
|
41
43
|
this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
|
|
@@ -131,6 +133,10 @@ class WebScraper
|
|
|
131
133
|
{
|
|
132
134
|
return;
|
|
133
135
|
}
|
|
136
|
+
if ( !this.isValidDomain( url ) )
|
|
137
|
+
{
|
|
138
|
+
return;
|
|
139
|
+
}
|
|
134
140
|
try
|
|
135
141
|
{
|
|
136
142
|
const data = await this.caller( url );
|
|
@@ -608,6 +614,22 @@ class WebScraper
|
|
|
608
614
|
return !this.excludedFileTypes.some( ext => { return urlPath.endsWith( ext ) });
|
|
609
615
|
}
|
|
610
616
|
|
|
617
|
+
isValidDomain ( url )
|
|
618
|
+
{
|
|
619
|
+
if ( !this.strictBaseURL ) return true;
|
|
620
|
+
try
|
|
621
|
+
{
|
|
622
|
+
const urlObj = new URL( url );
|
|
623
|
+
const baseURLObj = new URL( this.baseURL );
|
|
624
|
+
return urlObj.hostname === baseURLObj.hostname;
|
|
625
|
+
}
|
|
626
|
+
catch ( e )
|
|
627
|
+
{
|
|
628
|
+
console.log( `Invalid URL: ${url}` );
|
|
629
|
+
return false;
|
|
630
|
+
}
|
|
631
|
+
}
|
|
632
|
+
|
|
611
633
|
isValidContent ( content )
|
|
612
634
|
{
|
|
613
635
|
// Remove whitespace and newlines for checking
|