clean-web-scraper 3.8.1 → 3.8.3
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
- package/example-usage.js +94 -86
- package/main.js +8 -2
- package/package.json +1 -1
package/example-usage.js
CHANGED
@@ -29,7 +29,8 @@ async function palianswers ( enable )
|
|
29
29
|
textOutputPath: "./dataset/palianswers/texts",
|
30
30
|
csvOutputPath: "./dataset/palianswers/train.csv",
|
31
31
|
includeMetadata: true,
|
32
|
-
metadataFields: ["author", "title", "description", "dateScrapedDate"]
|
32
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
33
|
+
retryDelay: 10000
|
33
34
|
});
|
34
35
|
if ( enable )
|
35
36
|
{
|
@@ -54,7 +55,8 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
54
55
|
textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
|
55
56
|
csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
|
56
57
|
includeMetadata: true,
|
57
|
-
metadataFields: ["author", "title", "description", "dateScrapedDate"]
|
58
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
59
|
+
retryDelay: 10000
|
58
60
|
});
|
59
61
|
if ( enable )
|
60
62
|
{
|
@@ -84,40 +86,8 @@ async function decolonizepalestine ( enable )
|
|
84
86
|
textOutputPath: "./dataset/decolonizepalestine/texts",
|
85
87
|
csvOutputPath: "./dataset/decolonizepalestine/train.csv",
|
86
88
|
includeMetadata: true,
|
87
|
-
metadataFields: ["author", "title", "description", "dateScrapedDate"]
|
88
|
-
});
|
89
|
-
if ( enable )
|
90
|
-
{
|
91
|
-
await scraper.start();
|
92
|
-
}
|
93
|
-
return scraper;
|
94
|
-
}
|
95
|
-
|
96
|
-
async function bdsmovement ( enable )
|
97
|
-
{
|
98
|
-
// https://bdsmovement.net
|
99
|
-
const scraper = new WebScraper({
|
100
|
-
baseURL: "https://bdsmovement.net",
|
101
|
-
excludeList: [
|
102
|
-
"https://bdsmovement.net/press-area",
|
103
|
-
"https://bdsmovement.net/privacy-policy",
|
104
|
-
"https://bdsmovement.net/get-involved/join-a-bds-campaign",
|
105
|
-
"https://bdsmovement.net/donate_",
|
106
|
-
"https://bdsmovement.net/user",
|
107
|
-
"https://bdsmovement.net/admin"
|
108
|
-
],
|
109
|
-
scrapResultPath: "./dataset/bdsmovement/website",
|
110
|
-
jsonlOutputPath: "./dataset/bdsmovement/train.jsonl",
|
111
|
-
textOutputPath: "./dataset/bdsmovement/texts",
|
112
|
-
csvOutputPath: "./dataset/bdsmovement/train.csv",
|
113
|
-
includeMetadata: true,
|
114
89
|
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
115
|
-
|
116
|
-
puppeteerExecutablePath: "/usr/bin/chromium",
|
117
|
-
puppeteerRealProxy: {
|
118
|
-
host: "socks5://127.0.0.1",
|
119
|
-
port: "2080",
|
120
|
-
},
|
90
|
+
retryDelay: 10000
|
121
91
|
});
|
122
92
|
if ( enable )
|
123
93
|
{
|
@@ -155,53 +125,12 @@ async function electronicintifada ( enable )
|
|
155
125
|
textOutputPath: "./dataset/electronicintifada/texts",
|
156
126
|
csvOutputPath: "./dataset/electronicintifada/train.csv",
|
157
127
|
includeMetadata: true,
|
128
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
129
|
+
maxDepth: 10,
|
158
130
|
maxArticles: 2000,
|
131
|
+
concurrencyLimit: 4,
|
159
132
|
axiosHeaders: headers,
|
160
|
-
|
161
|
-
});
|
162
|
-
if ( enable )
|
163
|
-
{
|
164
|
-
await scraper.start();
|
165
|
-
}
|
166
|
-
return scraper;
|
167
|
-
}
|
168
|
-
|
169
|
-
async function palestineremembered ( enable )
|
170
|
-
{
|
171
|
-
// https://www.palestineremembered.com
|
172
|
-
const scraper = new WebScraper({
|
173
|
-
baseURL: "https://www.palestineremembered.com",
|
174
|
-
startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
|
175
|
-
excludeList: [
|
176
|
-
"https://www.palestineremembered.com/GeoPoints",
|
177
|
-
"https://www.palestineremembered.com/Donate",
|
178
|
-
"https://www.palestineremembered.com/ContactUs.html",
|
179
|
-
"https://www.palestineremembered.com/tags/Looting-Palestinian-properties.html",
|
180
|
-
"https://www.palestineremembered.com/ar/",
|
181
|
-
"https://www.palestineremembered.com/OldNewPictures.html",
|
182
|
-
"https://www.palestineremembered.com/Maps/index.html",
|
183
|
-
"https://www.palestineremembered.com/OralHistory/Interviews-Listing/",
|
184
|
-
"https://www.palestineremembered.com/Acre/Famous-Zionist-Quotes/Story637.html",
|
185
|
-
"https://www.palestineremembered.com/Articles/General/Story2045.html",
|
186
|
-
"https://www.palestineremembered.com/AllTownsListing.html",
|
187
|
-
"https://www.palestineremembered.com/Articles/General/ar/",
|
188
|
-
"https://www.palestineremembered.com/SiteVideos.html"
|
189
|
-
],
|
190
|
-
exactExcludeList: [
|
191
|
-
"https://www.palestineremembered.com/index.html",
|
192
|
-
"https://www.palestineremembered.com/ZionistFAQ.html"
|
193
|
-
],
|
194
|
-
scrapResultPath: "./dataset/palestineremembered/website",
|
195
|
-
jsonlOutputPath: "./dataset/palestineremembered/train.jsonl",
|
196
|
-
textOutputPath: "./dataset/palestineremembered/texts",
|
197
|
-
csvOutputPath: "./dataset/palestineremembered/train.csv",
|
198
|
-
includeMetadata: true,
|
199
|
-
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
200
|
-
axiosProxy: {
|
201
|
-
host: "localhost",
|
202
|
-
port: 2080,
|
203
|
-
protocol: "http"
|
204
|
-
}
|
133
|
+
retryDelay: 10000
|
205
134
|
});
|
206
135
|
if ( enable )
|
207
136
|
{
|
@@ -259,15 +188,18 @@ async function mondoweiss ( enable )
|
|
259
188
|
jsonlOutputPath: "./dataset/mondoweiss/train.jsonl",
|
260
189
|
textOutputPath: "./dataset/mondoweiss/texts",
|
261
190
|
csvOutputPath: "./dataset/mondoweiss/train.csv",
|
262
|
-
includeMetadata: true,
|
263
191
|
maxArticles: 2500,
|
264
192
|
maxRetries: 2,
|
193
|
+
concurrencyLimit: 4,
|
265
194
|
axiosHeaders: headers,
|
266
195
|
axiosProxy: {
|
267
196
|
host: "localhost",
|
268
197
|
port: 2080,
|
269
198
|
protocol: "http"
|
270
199
|
},
|
200
|
+
maxDepth: 10,
|
201
|
+
retryDelay: 10000,
|
202
|
+
includeMetadata: true,
|
271
203
|
metadataFields: ["author", "title", "description", "dateScrapedDate"]
|
272
204
|
});
|
273
205
|
if ( enable )
|
@@ -277,14 +209,90 @@ async function mondoweiss ( enable )
|
|
277
209
|
return scraper;
|
278
210
|
}
|
279
211
|
|
212
|
+
async function bdsmovement ( enable )
|
213
|
+
{
|
214
|
+
// https://bdsmovement.net
|
215
|
+
const scraper = new WebScraper({
|
216
|
+
baseURL: "https://bdsmovement.net",
|
217
|
+
excludeList: [
|
218
|
+
"https://bdsmovement.net/press-area",
|
219
|
+
"https://bdsmovement.net/privacy-policy",
|
220
|
+
"https://bdsmovement.net/get-involved/join-a-bds-campaign",
|
221
|
+
"https://bdsmovement.net/donate_",
|
222
|
+
"https://bdsmovement.net/user",
|
223
|
+
"https://bdsmovement.net/admin"
|
224
|
+
],
|
225
|
+
scrapResultPath: "./dataset/bdsmovement/website",
|
226
|
+
jsonlOutputPath: "./dataset/bdsmovement/train.jsonl",
|
227
|
+
textOutputPath: "./dataset/bdsmovement/texts",
|
228
|
+
csvOutputPath: "./dataset/bdsmovement/train.csv",
|
229
|
+
includeMetadata: true,
|
230
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
231
|
+
puppeteerProxy: "socks5://127.0.0.1:2080",
|
232
|
+
puppeteerExecutablePath: "/usr/bin/chromium",
|
233
|
+
puppeteerRealProxy: {
|
234
|
+
host: "socks5://127.0.0.1",
|
235
|
+
port: "2080",
|
236
|
+
},
|
237
|
+
});
|
238
|
+
if ( enable )
|
239
|
+
{
|
240
|
+
await scraper.start();
|
241
|
+
}
|
242
|
+
return scraper;
|
243
|
+
}
|
244
|
+
|
245
|
+
async function palestineremembered ( enable )
|
246
|
+
{
|
247
|
+
// https://www.palestineremembered.com
|
248
|
+
const scraper = new WebScraper({
|
249
|
+
baseURL: "https://www.palestineremembered.com",
|
250
|
+
startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
|
251
|
+
excludeList: [
|
252
|
+
"https://www.palestineremembered.com/GeoPoints",
|
253
|
+
"https://www.palestineremembered.com/Donate",
|
254
|
+
"https://www.palestineremembered.com/ContactUs.html",
|
255
|
+
"https://www.palestineremembered.com/tags/Looting-Palestinian-properties.html",
|
256
|
+
"https://www.palestineremembered.com/ar/",
|
257
|
+
"https://www.palestineremembered.com/OldNewPictures.html",
|
258
|
+
"https://www.palestineremembered.com/Maps/index.html",
|
259
|
+
"https://www.palestineremembered.com/OralHistory/Interviews-Listing/",
|
260
|
+
"https://www.palestineremembered.com/Acre/Famous-Zionist-Quotes/Story637.html",
|
261
|
+
"https://www.palestineremembered.com/Articles/General/Story2045.html",
|
262
|
+
"https://www.palestineremembered.com/AllTownsListing.html",
|
263
|
+
"https://www.palestineremembered.com/Articles/General/ar/",
|
264
|
+
"https://www.palestineremembered.com/SiteVideos.html"
|
265
|
+
],
|
266
|
+
exactExcludeList: [
|
267
|
+
"https://www.palestineremembered.com/index.html",
|
268
|
+
"https://www.palestineremembered.com/ZionistFAQ.html"
|
269
|
+
],
|
270
|
+
scrapResultPath: "./dataset/palestineremembered/website",
|
271
|
+
jsonlOutputPath: "./dataset/palestineremembered/train.jsonl",
|
272
|
+
textOutputPath: "./dataset/palestineremembered/texts",
|
273
|
+
csvOutputPath: "./dataset/palestineremembered/train.csv",
|
274
|
+
includeMetadata: true,
|
275
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
276
|
+
axiosProxy: {
|
277
|
+
host: "localhost",
|
278
|
+
port: 2080,
|
279
|
+
protocol: "http"
|
280
|
+
}
|
281
|
+
});
|
282
|
+
if ( enable )
|
283
|
+
{
|
284
|
+
await scraper.start();
|
285
|
+
}
|
286
|
+
return scraper;
|
287
|
+
}
|
280
288
|
|
281
289
|
void async function main ()
|
282
290
|
{
|
283
|
-
const palianswersScraper = await palianswers(
|
284
|
-
const decolonizepalestineScraper = await decolonizepalestine(
|
285
|
-
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag(
|
286
|
-
const electronicintifadaScraper = await electronicintifada(
|
287
|
-
const standWithPalestineScraper = await standWithPalestine(
|
291
|
+
const palianswersScraper = await palianswers( true );
|
292
|
+
const decolonizepalestineScraper = await decolonizepalestine( true );
|
293
|
+
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( true );
|
294
|
+
const electronicintifadaScraper = await electronicintifada( true );
|
295
|
+
const standWithPalestineScraper = await standWithPalestine( true );
|
288
296
|
const mondoweisScraper = await mondoweiss( true );
|
289
297
|
const bdsmovementScraper = await bdsmovement( false );
|
290
298
|
const palestinerememberedScraper = await palestineremembered( false );
|
package/main.js
CHANGED
@@ -16,6 +16,7 @@ class WebScraper
|
|
16
16
|
maxArticles,
|
17
17
|
concurrencyLimit,
|
18
18
|
maxRetries,
|
19
|
+
retryDelay,
|
19
20
|
|
20
21
|
// URL filtering
|
21
22
|
excludeList = [],
|
@@ -53,6 +54,7 @@ class WebScraper
|
|
53
54
|
this.maxArticles = maxArticles || Infinity;
|
54
55
|
this.concurrencyLimit = concurrencyLimit || 2;
|
55
56
|
this.maxRetries = maxRetries || 5;
|
57
|
+
this.retryDelay = retryDelay || 40000;
|
56
58
|
|
57
59
|
// Output paths setup
|
58
60
|
this.scrapResultPath = scrapResultPath;
|
@@ -357,6 +359,10 @@ class WebScraper
|
|
357
359
|
{
|
358
360
|
urlPath = "/index";
|
359
361
|
}
|
362
|
+
else if ( urlPath.endsWith( "/" ) )
|
363
|
+
{
|
364
|
+
urlPath = urlPath.slice( 0, -1 );
|
365
|
+
}
|
360
366
|
const filePath = path.join( __dirname, this.scrapResultPath, urlPath );
|
361
367
|
const dir = path.dirname( filePath );
|
362
368
|
|
@@ -579,14 +585,14 @@ class WebScraper
|
|
579
585
|
{
|
580
586
|
if ( this.hasReachedMax( ) )
|
581
587
|
{
|
582
|
-
|
588
|
+
break;
|
583
589
|
}
|
584
590
|
return await axios.get( url, options );
|
585
591
|
}
|
586
592
|
catch ( error )
|
587
593
|
{
|
588
594
|
if ( attempt >= this.maxRetries ) throw error;
|
589
|
-
await WebScraper.sleep(
|
595
|
+
await WebScraper.sleep( this.retryDelay * attempt );
|
590
596
|
console.error( `Retrying request to ${url} (Attempt ${attempt + 1}/${this.maxRetries})`, error.message, error.code );
|
591
597
|
}
|
592
598
|
}
|