clean-web-scraper 3.8.1 → 3.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/example-usage.js +92 -86
- package/main.js +7 -1
- package/package.json +1 -1
package/example-usage.js
CHANGED
@@ -29,7 +29,8 @@ async function palianswers ( enable )
|
|
29
29
|
textOutputPath: "./dataset/palianswers/texts",
|
30
30
|
csvOutputPath: "./dataset/palianswers/train.csv",
|
31
31
|
includeMetadata: true,
|
32
|
-
metadataFields: ["author", "title", "description", "dateScrapedDate"]
|
32
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
33
|
+
retryDelay: 10000
|
33
34
|
});
|
34
35
|
if ( enable )
|
35
36
|
{
|
@@ -54,7 +55,8 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
54
55
|
textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
|
55
56
|
csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
|
56
57
|
includeMetadata: true,
|
57
|
-
metadataFields: ["author", "title", "description", "dateScrapedDate"]
|
58
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
59
|
+
retryDelay: 10000
|
58
60
|
});
|
59
61
|
if ( enable )
|
60
62
|
{
|
@@ -84,40 +86,8 @@ async function decolonizepalestine ( enable )
|
|
84
86
|
textOutputPath: "./dataset/decolonizepalestine/texts",
|
85
87
|
csvOutputPath: "./dataset/decolonizepalestine/train.csv",
|
86
88
|
includeMetadata: true,
|
87
|
-
metadataFields: ["author", "title", "description", "dateScrapedDate"]
|
88
|
-
});
|
89
|
-
if ( enable )
|
90
|
-
{
|
91
|
-
await scraper.start();
|
92
|
-
}
|
93
|
-
return scraper;
|
94
|
-
}
|
95
|
-
|
96
|
-
async function bdsmovement ( enable )
|
97
|
-
{
|
98
|
-
// https://bdsmovement.net
|
99
|
-
const scraper = new WebScraper({
|
100
|
-
baseURL: "https://bdsmovement.net",
|
101
|
-
excludeList: [
|
102
|
-
"https://bdsmovement.net/press-area",
|
103
|
-
"https://bdsmovement.net/privacy-policy",
|
104
|
-
"https://bdsmovement.net/get-involved/join-a-bds-campaign",
|
105
|
-
"https://bdsmovement.net/donate_",
|
106
|
-
"https://bdsmovement.net/user",
|
107
|
-
"https://bdsmovement.net/admin"
|
108
|
-
],
|
109
|
-
scrapResultPath: "./dataset/bdsmovement/website",
|
110
|
-
jsonlOutputPath: "./dataset/bdsmovement/train.jsonl",
|
111
|
-
textOutputPath: "./dataset/bdsmovement/texts",
|
112
|
-
csvOutputPath: "./dataset/bdsmovement/train.csv",
|
113
|
-
includeMetadata: true,
|
114
89
|
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
115
|
-
|
116
|
-
puppeteerExecutablePath: "/usr/bin/chromium",
|
117
|
-
puppeteerRealProxy: {
|
118
|
-
host: "socks5://127.0.0.1",
|
119
|
-
port: "2080",
|
120
|
-
},
|
90
|
+
retryDelay: 10000
|
121
91
|
});
|
122
92
|
if ( enable )
|
123
93
|
{
|
@@ -155,53 +125,11 @@ async function electronicintifada ( enable )
|
|
155
125
|
textOutputPath: "./dataset/electronicintifada/texts",
|
156
126
|
csvOutputPath: "./dataset/electronicintifada/train.csv",
|
157
127
|
includeMetadata: true,
|
128
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
129
|
+
maxDepth: 10,
|
158
130
|
maxArticles: 2000,
|
159
131
|
axiosHeaders: headers,
|
160
|
-
|
161
|
-
});
|
162
|
-
if ( enable )
|
163
|
-
{
|
164
|
-
await scraper.start();
|
165
|
-
}
|
166
|
-
return scraper;
|
167
|
-
}
|
168
|
-
|
169
|
-
async function palestineremembered ( enable )
|
170
|
-
{
|
171
|
-
// https://www.palestineremembered.com
|
172
|
-
const scraper = new WebScraper({
|
173
|
-
baseURL: "https://www.palestineremembered.com",
|
174
|
-
startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
|
175
|
-
excludeList: [
|
176
|
-
"https://www.palestineremembered.com/GeoPoints",
|
177
|
-
"https://www.palestineremembered.com/Donate",
|
178
|
-
"https://www.palestineremembered.com/ContactUs.html",
|
179
|
-
"https://www.palestineremembered.com/tags/Looting-Palestinian-properties.html",
|
180
|
-
"https://www.palestineremembered.com/ar/",
|
181
|
-
"https://www.palestineremembered.com/OldNewPictures.html",
|
182
|
-
"https://www.palestineremembered.com/Maps/index.html",
|
183
|
-
"https://www.palestineremembered.com/OralHistory/Interviews-Listing/",
|
184
|
-
"https://www.palestineremembered.com/Acre/Famous-Zionist-Quotes/Story637.html",
|
185
|
-
"https://www.palestineremembered.com/Articles/General/Story2045.html",
|
186
|
-
"https://www.palestineremembered.com/AllTownsListing.html",
|
187
|
-
"https://www.palestineremembered.com/Articles/General/ar/",
|
188
|
-
"https://www.palestineremembered.com/SiteVideos.html"
|
189
|
-
],
|
190
|
-
exactExcludeList: [
|
191
|
-
"https://www.palestineremembered.com/index.html",
|
192
|
-
"https://www.palestineremembered.com/ZionistFAQ.html"
|
193
|
-
],
|
194
|
-
scrapResultPath: "./dataset/palestineremembered/website",
|
195
|
-
jsonlOutputPath: "./dataset/palestineremembered/train.jsonl",
|
196
|
-
textOutputPath: "./dataset/palestineremembered/texts",
|
197
|
-
csvOutputPath: "./dataset/palestineremembered/train.csv",
|
198
|
-
includeMetadata: true,
|
199
|
-
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
200
|
-
axiosProxy: {
|
201
|
-
host: "localhost",
|
202
|
-
port: 2080,
|
203
|
-
protocol: "http"
|
204
|
-
}
|
132
|
+
retryDelay: 10000
|
205
133
|
});
|
206
134
|
if ( enable )
|
207
135
|
{
|
@@ -259,7 +187,6 @@ async function mondoweiss ( enable )
|
|
259
187
|
jsonlOutputPath: "./dataset/mondoweiss/train.jsonl",
|
260
188
|
textOutputPath: "./dataset/mondoweiss/texts",
|
261
189
|
csvOutputPath: "./dataset/mondoweiss/train.csv",
|
262
|
-
includeMetadata: true,
|
263
190
|
maxArticles: 2500,
|
264
191
|
maxRetries: 2,
|
265
192
|
axiosHeaders: headers,
|
@@ -268,6 +195,9 @@ async function mondoweiss ( enable )
|
|
268
195
|
port: 2080,
|
269
196
|
protocol: "http"
|
270
197
|
},
|
198
|
+
maxDepth: 10,
|
199
|
+
retryDelay: 10000,
|
200
|
+
includeMetadata: true,
|
271
201
|
metadataFields: ["author", "title", "description", "dateScrapedDate"]
|
272
202
|
});
|
273
203
|
if ( enable )
|
@@ -277,14 +207,90 @@ async function mondoweiss ( enable )
|
|
277
207
|
return scraper;
|
278
208
|
}
|
279
209
|
|
210
|
+
async function bdsmovement ( enable )
|
211
|
+
{
|
212
|
+
// https://bdsmovement.net
|
213
|
+
const scraper = new WebScraper({
|
214
|
+
baseURL: "https://bdsmovement.net",
|
215
|
+
excludeList: [
|
216
|
+
"https://bdsmovement.net/press-area",
|
217
|
+
"https://bdsmovement.net/privacy-policy",
|
218
|
+
"https://bdsmovement.net/get-involved/join-a-bds-campaign",
|
219
|
+
"https://bdsmovement.net/donate_",
|
220
|
+
"https://bdsmovement.net/user",
|
221
|
+
"https://bdsmovement.net/admin"
|
222
|
+
],
|
223
|
+
scrapResultPath: "./dataset/bdsmovement/website",
|
224
|
+
jsonlOutputPath: "./dataset/bdsmovement/train.jsonl",
|
225
|
+
textOutputPath: "./dataset/bdsmovement/texts",
|
226
|
+
csvOutputPath: "./dataset/bdsmovement/train.csv",
|
227
|
+
includeMetadata: true,
|
228
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
229
|
+
puppeteerProxy: "socks5://127.0.0.1:2080",
|
230
|
+
puppeteerExecutablePath: "/usr/bin/chromium",
|
231
|
+
puppeteerRealProxy: {
|
232
|
+
host: "socks5://127.0.0.1",
|
233
|
+
port: "2080",
|
234
|
+
},
|
235
|
+
});
|
236
|
+
if ( enable )
|
237
|
+
{
|
238
|
+
await scraper.start();
|
239
|
+
}
|
240
|
+
return scraper;
|
241
|
+
}
|
242
|
+
|
243
|
+
async function palestineremembered ( enable )
|
244
|
+
{
|
245
|
+
// https://www.palestineremembered.com
|
246
|
+
const scraper = new WebScraper({
|
247
|
+
baseURL: "https://www.palestineremembered.com",
|
248
|
+
startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
|
249
|
+
excludeList: [
|
250
|
+
"https://www.palestineremembered.com/GeoPoints",
|
251
|
+
"https://www.palestineremembered.com/Donate",
|
252
|
+
"https://www.palestineremembered.com/ContactUs.html",
|
253
|
+
"https://www.palestineremembered.com/tags/Looting-Palestinian-properties.html",
|
254
|
+
"https://www.palestineremembered.com/ar/",
|
255
|
+
"https://www.palestineremembered.com/OldNewPictures.html",
|
256
|
+
"https://www.palestineremembered.com/Maps/index.html",
|
257
|
+
"https://www.palestineremembered.com/OralHistory/Interviews-Listing/",
|
258
|
+
"https://www.palestineremembered.com/Acre/Famous-Zionist-Quotes/Story637.html",
|
259
|
+
"https://www.palestineremembered.com/Articles/General/Story2045.html",
|
260
|
+
"https://www.palestineremembered.com/AllTownsListing.html",
|
261
|
+
"https://www.palestineremembered.com/Articles/General/ar/",
|
262
|
+
"https://www.palestineremembered.com/SiteVideos.html"
|
263
|
+
],
|
264
|
+
exactExcludeList: [
|
265
|
+
"https://www.palestineremembered.com/index.html",
|
266
|
+
"https://www.palestineremembered.com/ZionistFAQ.html"
|
267
|
+
],
|
268
|
+
scrapResultPath: "./dataset/palestineremembered/website",
|
269
|
+
jsonlOutputPath: "./dataset/palestineremembered/train.jsonl",
|
270
|
+
textOutputPath: "./dataset/palestineremembered/texts",
|
271
|
+
csvOutputPath: "./dataset/palestineremembered/train.csv",
|
272
|
+
includeMetadata: true,
|
273
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
274
|
+
axiosProxy: {
|
275
|
+
host: "localhost",
|
276
|
+
port: 2080,
|
277
|
+
protocol: "http"
|
278
|
+
}
|
279
|
+
});
|
280
|
+
if ( enable )
|
281
|
+
{
|
282
|
+
await scraper.start();
|
283
|
+
}
|
284
|
+
return scraper;
|
285
|
+
}
|
280
286
|
|
281
287
|
void async function main ()
|
282
288
|
{
|
283
|
-
const palianswersScraper = await palianswers(
|
284
|
-
const decolonizepalestineScraper = await decolonizepalestine(
|
285
|
-
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag(
|
286
|
-
const electronicintifadaScraper = await electronicintifada(
|
287
|
-
const standWithPalestineScraper = await standWithPalestine(
|
289
|
+
const palianswersScraper = await palianswers( true );
|
290
|
+
const decolonizepalestineScraper = await decolonizepalestine( true );
|
291
|
+
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( true );
|
292
|
+
const electronicintifadaScraper = await electronicintifada( true );
|
293
|
+
const standWithPalestineScraper = await standWithPalestine( true );
|
288
294
|
const mondoweisScraper = await mondoweiss( true );
|
289
295
|
const bdsmovementScraper = await bdsmovement( false );
|
290
296
|
const palestinerememberedScraper = await palestineremembered( false );
|
package/main.js
CHANGED
@@ -16,6 +16,7 @@ class WebScraper
|
|
16
16
|
maxArticles,
|
17
17
|
concurrencyLimit,
|
18
18
|
maxRetries,
|
19
|
+
retryDelay,
|
19
20
|
|
20
21
|
// URL filtering
|
21
22
|
excludeList = [],
|
@@ -53,6 +54,7 @@ class WebScraper
|
|
53
54
|
this.maxArticles = maxArticles || Infinity;
|
54
55
|
this.concurrencyLimit = concurrencyLimit || 2;
|
55
56
|
this.maxRetries = maxRetries || 5;
|
57
|
+
this.retryDelay = retryDelay || 40000;
|
56
58
|
|
57
59
|
// Output paths setup
|
58
60
|
this.scrapResultPath = scrapResultPath;
|
@@ -357,6 +359,10 @@ class WebScraper
|
|
357
359
|
{
|
358
360
|
urlPath = "/index";
|
359
361
|
}
|
362
|
+
else if ( urlPath.endsWith( "/" ) )
|
363
|
+
{
|
364
|
+
urlPath = urlPath.slice( 0, -1 );
|
365
|
+
}
|
360
366
|
const filePath = path.join( __dirname, this.scrapResultPath, urlPath );
|
361
367
|
const dir = path.dirname( filePath );
|
362
368
|
|
@@ -586,7 +592,7 @@ class WebScraper
|
|
586
592
|
catch ( error )
|
587
593
|
{
|
588
594
|
if ( attempt >= this.maxRetries ) throw error;
|
589
|
-
await WebScraper.sleep(
|
595
|
+
await WebScraper.sleep( this.retryDelay * attempt );
|
590
596
|
console.error( `Retrying request to ${url} (Attempt ${attempt + 1}/${this.maxRetries})`, error.message, error.code );
|
591
597
|
}
|
592
598
|
}
|