clean-web-scraper 3.5.0 → 3.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/example-usage.js +108 -27
- package/package.json +1 -1
- package/src/WebScraper.js +34 -6
package/example-usage.js
CHANGED
|
@@ -8,8 +8,36 @@ const WebScraper = require( "./src/WebScraper" );
|
|
|
8
8
|
// "Cookie": cookies
|
|
9
9
|
// }
|
|
10
10
|
|
|
11
|
+
async function palianswers ( enable )
|
|
12
|
+
{
|
|
13
|
+
const scraper = new WebScraper({
|
|
14
|
+
baseURL: "https://palianswers.com",
|
|
15
|
+
excludeList: [
|
|
16
|
+
"https://palianswers.com/chat/",
|
|
17
|
+
"https://palianswers.com/become-a-volunteer/",
|
|
18
|
+
"https://palianswers.com/other-resources/",
|
|
19
|
+
"https://palianswers.com/request-a-rebuttal/",
|
|
20
|
+
"https://palianswers.com/submit-a-rebuttal/",
|
|
21
|
+
"https://palianswers.com/themes/"
|
|
22
|
+
],
|
|
23
|
+
exactExcludeList: [
|
|
24
|
+
"https://palianswers.com/",
|
|
25
|
+
],
|
|
26
|
+
scrapResultPath: "./dataset/palianswers/website",
|
|
27
|
+
jsonlOutputPath: "./dataset/palianswers/train.jsonl",
|
|
28
|
+
textOutputPath: "./dataset/palianswers/texts",
|
|
29
|
+
csvOutputPath: "./dataset/palianswers/train.csv",
|
|
30
|
+
includeMetadata: true,
|
|
31
|
+
metadataFields: ["title", "description", "author"]
|
|
32
|
+
});
|
|
33
|
+
if ( enable )
|
|
34
|
+
{
|
|
35
|
+
await scraper.start();
|
|
36
|
+
}
|
|
37
|
+
return scraper;
|
|
38
|
+
}
|
|
11
39
|
|
|
12
|
-
async function khameneiIrFreePalestineTag ()
|
|
40
|
+
async function khameneiIrFreePalestineTag ( enable )
|
|
13
41
|
{
|
|
14
42
|
// https://english.khamenei.ir/Opinions/FreePalestine
|
|
15
43
|
// https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#
|
|
@@ -27,13 +55,15 @@ async function khameneiIrFreePalestineTag ()
|
|
|
27
55
|
includeMetadata: true,
|
|
28
56
|
metadataFields: ["title", "description", "author"]
|
|
29
57
|
});
|
|
30
|
-
|
|
58
|
+
if ( enable )
|
|
59
|
+
{
|
|
60
|
+
await scraper.start();
|
|
61
|
+
}
|
|
31
62
|
return scraper;
|
|
32
63
|
}
|
|
33
64
|
|
|
34
|
-
async function decolonizepalestine ()
|
|
65
|
+
async function decolonizepalestine ( enable )
|
|
35
66
|
{
|
|
36
|
-
// https://decolonizepalestine.com
|
|
37
67
|
const scraper = new WebScraper({
|
|
38
68
|
baseURL: "https://decolonizepalestine.com",
|
|
39
69
|
excludeList: [
|
|
@@ -54,13 +84,15 @@ async function decolonizepalestine ()
|
|
|
54
84
|
includeMetadata: true,
|
|
55
85
|
metadataFields: ["title", "description", "author"]
|
|
56
86
|
});
|
|
57
|
-
|
|
87
|
+
if ( enable )
|
|
88
|
+
{
|
|
89
|
+
await scraper.start();
|
|
90
|
+
}
|
|
58
91
|
return scraper;
|
|
59
92
|
}
|
|
60
93
|
|
|
61
|
-
async function bdsmovement ()
|
|
94
|
+
async function bdsmovement ( enable )
|
|
62
95
|
{
|
|
63
|
-
// https://bdsmovement.net
|
|
64
96
|
const scraper = new WebScraper({
|
|
65
97
|
baseURL: "https://bdsmovement.net",
|
|
66
98
|
excludeList: [
|
|
@@ -83,15 +115,16 @@ async function bdsmovement ()
|
|
|
83
115
|
host: "socks5://127.0.0.1",
|
|
84
116
|
port: "2080",
|
|
85
117
|
},
|
|
86
|
-
// usePuppeteer: true
|
|
87
118
|
});
|
|
88
|
-
|
|
119
|
+
if ( enable )
|
|
120
|
+
{
|
|
121
|
+
await scraper.start();
|
|
122
|
+
}
|
|
89
123
|
return scraper;
|
|
90
124
|
}
|
|
91
125
|
|
|
92
|
-
async function electronicintifada ()
|
|
126
|
+
async function electronicintifada ( enable )
|
|
93
127
|
{
|
|
94
|
-
// https://electronicintifada.net
|
|
95
128
|
const scraper = new WebScraper({
|
|
96
129
|
baseURL: "https://electronicintifada.net",
|
|
97
130
|
excludeList: [
|
|
@@ -103,7 +136,8 @@ async function electronicintifada ()
|
|
|
103
136
|
"https://electronicintifada.net/location",
|
|
104
137
|
"https://electronicintifada.net/file",
|
|
105
138
|
"https://electronicintifada.net/bytopic/people",
|
|
106
|
-
"https://electronicintifada.net/comment/"
|
|
139
|
+
"https://electronicintifada.net/comment/",
|
|
140
|
+
"https://electronicintifada.net/search/site/"
|
|
107
141
|
],
|
|
108
142
|
exactExcludeList: [
|
|
109
143
|
"https://electronicintifada.net",
|
|
@@ -117,31 +151,78 @@ async function electronicintifada ()
|
|
|
117
151
|
textOutputPath: "./dataset/electronicintifada/texts",
|
|
118
152
|
csvOutputPath: "./dataset/electronicintifada/train.csv",
|
|
119
153
|
includeMetadata: true,
|
|
120
|
-
maxArticles:
|
|
154
|
+
maxArticles: 2000,
|
|
121
155
|
metadataFields: ["title", "description", "author"]
|
|
122
156
|
});
|
|
123
|
-
|
|
157
|
+
if ( enable )
|
|
158
|
+
{
|
|
159
|
+
await scraper.start();
|
|
160
|
+
}
|
|
161
|
+
return scraper;
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
async function palestineremembered ( enable )
|
|
165
|
+
{
|
|
166
|
+
// https://www.palestineremembered.com
|
|
167
|
+
const scraper = new WebScraper({
|
|
168
|
+
baseURL: "https://www.palestineremembered.com",
|
|
169
|
+
startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
|
|
170
|
+
excludeList: [
|
|
171
|
+
"https://www.palestineremembered.com/GeoPoints",
|
|
172
|
+
"https://www.palestineremembered.com/Donate",
|
|
173
|
+
"https://www.palestineremembered.com/ContactUs.html",
|
|
174
|
+
"https://www.palestineremembered.com/tags/Looting-Palestinian-properties.html",
|
|
175
|
+
"https://www.palestineremembered.com/ar/",
|
|
176
|
+
"https://www.palestineremembered.com/OldNewPictures.html",
|
|
177
|
+
"https://www.palestineremembered.com/Maps/index.html",
|
|
178
|
+
"https://www.palestineremembered.com/OralHistory/Interviews-Listing/",
|
|
179
|
+
"https://www.palestineremembered.com/Acre/Famous-Zionist-Quotes/Story637.html",
|
|
180
|
+
"https://www.palestineremembered.com/Articles/General/Story2045.html",
|
|
181
|
+
"https://www.palestineremembered.com/AllTownsListing.html",
|
|
182
|
+
"https://www.palestineremembered.com/Articles/General/ar/",
|
|
183
|
+
"https://www.palestineremembered.com/SiteVideos.html"
|
|
184
|
+
],
|
|
185
|
+
exactExcludeList: [
|
|
186
|
+
"https://www.palestineremembered.com/index.html",
|
|
187
|
+
"https://www.palestineremembered.com/ZionistFAQ.html"
|
|
188
|
+
],
|
|
189
|
+
scrapResultPath: "./dataset/palestineremembered/website",
|
|
190
|
+
jsonlOutputPath: "./dataset/palestineremembered/train.jsonl",
|
|
191
|
+
textOutputPath: "./dataset/palestineremembered/texts",
|
|
192
|
+
csvOutputPath: "./dataset/palestineremembered/train.csv",
|
|
193
|
+
includeMetadata: true,
|
|
194
|
+
metadataFields: ["title", "description", "author"],
|
|
195
|
+
axiosProxy: {
|
|
196
|
+
host: "localhost",
|
|
197
|
+
port: 2080,
|
|
198
|
+
protocol: "http"
|
|
199
|
+
}
|
|
200
|
+
});
|
|
201
|
+
if ( enable )
|
|
202
|
+
{
|
|
203
|
+
await scraper.start();
|
|
204
|
+
}
|
|
124
205
|
return scraper;
|
|
125
206
|
}
|
|
126
207
|
|
|
208
|
+
|
|
127
209
|
void async function main ()
|
|
128
210
|
{
|
|
129
|
-
const
|
|
130
|
-
const decolonizepalestineScraper = await decolonizepalestine();
|
|
131
|
-
const
|
|
132
|
-
const
|
|
211
|
+
const palianswersScraper = await palianswers( true );
|
|
212
|
+
const decolonizepalestineScraper = await decolonizepalestine( true );
|
|
213
|
+
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( true );
|
|
214
|
+
const bdsmovementScraper = await bdsmovement( false );
|
|
215
|
+
const electronicintifadaScraper = await electronicintifada( true );
|
|
216
|
+
const palestinerememberedScraper = await palestineremembered( false );
|
|
217
|
+
|
|
133
218
|
await WebScraper.combineResults( "./dataset/combined", [
|
|
134
|
-
|
|
219
|
+
palianswersScraper,
|
|
135
220
|
decolonizepalestineScraper,
|
|
136
|
-
|
|
137
|
-
electronicintifadaScraper
|
|
221
|
+
khameneiIrFreePalestineTagScraper,
|
|
222
|
+
electronicintifadaScraper,
|
|
223
|
+
// bdsmovementScraper,
|
|
224
|
+
// palestinerememberedScraper,
|
|
138
225
|
] );
|
|
139
226
|
|
|
140
|
-
// 5
|
|
141
|
-
// https://www.palestineremembered.com/ZionistFAQ.html
|
|
142
|
-
|
|
143
|
-
// 6 https://the-palestinian-side.vercel.app/
|
|
144
|
-
|
|
145
227
|
// 7 https://stand-with-palestine.org/blogs
|
|
146
228
|
}()
|
|
147
|
-
|
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
|
@@ -12,8 +12,9 @@ class WebScraper
|
|
|
12
12
|
constructor ({
|
|
13
13
|
baseURL,
|
|
14
14
|
startURL,
|
|
15
|
+
strictBaseURL = true,
|
|
15
16
|
maxDepth = Infinity,
|
|
16
|
-
maxArticles = Infinity,
|
|
17
|
+
maxArticles = Infinity,
|
|
17
18
|
excludeList,
|
|
18
19
|
exactExcludeList,
|
|
19
20
|
scrapResultPath = "./dataset",
|
|
@@ -22,7 +23,8 @@ class WebScraper
|
|
|
22
23
|
csvOutputPath,
|
|
23
24
|
includeMetadata = false,
|
|
24
25
|
metadataFields = [], // ['title', 'description', 'author', etc.]
|
|
25
|
-
|
|
26
|
+
axiosHeaders,
|
|
27
|
+
axiosProxy,
|
|
26
28
|
usePuppeteer,
|
|
27
29
|
puppeteerProxy, // e.g. http://127.0.0.1:2080
|
|
28
30
|
puppeteerExecutablePath,
|
|
@@ -33,15 +35,16 @@ class WebScraper
|
|
|
33
35
|
{
|
|
34
36
|
this.baseURL = baseURL;
|
|
35
37
|
this.startURL = startURL || baseURL;
|
|
38
|
+
this.strictBaseURL = strictBaseURL;
|
|
36
39
|
this.maxDepth = maxDepth;
|
|
37
|
-
this.maxArticles = maxArticles;
|
|
40
|
+
this.maxArticles = maxArticles;
|
|
38
41
|
this.scrapResultPath = scrapResultPath;
|
|
39
42
|
this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
|
|
40
43
|
this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
|
|
41
44
|
this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
|
|
42
45
|
this.jsonlOutputPathWithMeta = jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
|
|
43
46
|
this.csvOutputPathWithMeta = csvOutputPath.replace( ".csv", "_with_metadata.csv" );
|
|
44
|
-
this.
|
|
47
|
+
this.axiosHeaders = axiosHeaders;
|
|
45
48
|
this.includeMetadata = includeMetadata;
|
|
46
49
|
this.metadataFields = new Set( metadataFields );
|
|
47
50
|
this.visited = new Set();
|
|
@@ -50,6 +53,7 @@ class WebScraper
|
|
|
50
53
|
this.allProcessedContent = [];
|
|
51
54
|
this.filterFileTypes = filterFileTypes;
|
|
52
55
|
this.excludedFileTypes = excludedFileTypes;
|
|
56
|
+
this.axiosProxy = axiosProxy;
|
|
53
57
|
this.usePuppeteer = usePuppeteer || false;
|
|
54
58
|
this.puppeteerOptions = {
|
|
55
59
|
headless: false,
|
|
@@ -129,6 +133,10 @@ class WebScraper
|
|
|
129
133
|
{
|
|
130
134
|
return;
|
|
131
135
|
}
|
|
136
|
+
if ( !this.isValidDomain( url ) )
|
|
137
|
+
{
|
|
138
|
+
return;
|
|
139
|
+
}
|
|
132
140
|
try
|
|
133
141
|
{
|
|
134
142
|
const data = await this.caller( url );
|
|
@@ -180,9 +188,13 @@ class WebScraper
|
|
|
180
188
|
try
|
|
181
189
|
{
|
|
182
190
|
let axiosOptions = {};
|
|
183
|
-
if ( this.
|
|
191
|
+
if ( this.axiosHeaders )
|
|
184
192
|
{
|
|
185
|
-
axiosOptions.headers = this.
|
|
193
|
+
axiosOptions.headers = this.axiosHeaders;
|
|
194
|
+
}
|
|
195
|
+
if ( this.axiosProxy )
|
|
196
|
+
{
|
|
197
|
+
axiosOptions.proxy = this.axiosProxy;
|
|
186
198
|
}
|
|
187
199
|
|
|
188
200
|
// Step 1: Make a GET request with a small timeout and limited data download
|
|
@@ -602,6 +614,22 @@ class WebScraper
|
|
|
602
614
|
return !this.excludedFileTypes.some( ext => { return urlPath.endsWith( ext ) });
|
|
603
615
|
}
|
|
604
616
|
|
|
617
|
+
isValidDomain ( url )
|
|
618
|
+
{
|
|
619
|
+
if ( !this.strictBaseURL ) return true;
|
|
620
|
+
try
|
|
621
|
+
{
|
|
622
|
+
const urlObj = new URL( url );
|
|
623
|
+
const baseURLObj = new URL( this.baseURL );
|
|
624
|
+
return urlObj.hostname === baseURLObj.hostname;
|
|
625
|
+
}
|
|
626
|
+
catch ( e )
|
|
627
|
+
{
|
|
628
|
+
console.log( `Invalid URL: ${url}` );
|
|
629
|
+
return false;
|
|
630
|
+
}
|
|
631
|
+
}
|
|
632
|
+
|
|
605
633
|
isValidContent ( content )
|
|
606
634
|
{
|
|
607
635
|
// Remove whitespace and newlines for checking
|