clean-web-scraper 3.8.1 → 3.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/example-usage.js CHANGED
@@ -29,7 +29,8 @@ async function palianswers ( enable )
29
29
  textOutputPath: "./dataset/palianswers/texts",
30
30
  csvOutputPath: "./dataset/palianswers/train.csv",
31
31
  includeMetadata: true,
32
- metadataFields: ["author", "title", "description", "dateScrapedDate"]
32
+ metadataFields: ["author", "title", "description", "dateScrapedDate"],
33
+ retryDelay: 10000
33
34
  });
34
35
  if ( enable )
35
36
  {
@@ -54,7 +55,8 @@ async function khameneiIrFreePalestineTag ( enable )
54
55
  textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
55
56
  csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
56
57
  includeMetadata: true,
57
- metadataFields: ["author", "title", "description", "dateScrapedDate"]
58
+ metadataFields: ["author", "title", "description", "dateScrapedDate"],
59
+ retryDelay: 10000
58
60
  });
59
61
  if ( enable )
60
62
  {
@@ -84,40 +86,8 @@ async function decolonizepalestine ( enable )
84
86
  textOutputPath: "./dataset/decolonizepalestine/texts",
85
87
  csvOutputPath: "./dataset/decolonizepalestine/train.csv",
86
88
  includeMetadata: true,
87
- metadataFields: ["author", "title", "description", "dateScrapedDate"]
88
- });
89
- if ( enable )
90
- {
91
- await scraper.start();
92
- }
93
- return scraper;
94
- }
95
-
96
- async function bdsmovement ( enable )
97
- {
98
- // https://bdsmovement.net
99
- const scraper = new WebScraper({
100
- baseURL: "https://bdsmovement.net",
101
- excludeList: [
102
- "https://bdsmovement.net/press-area",
103
- "https://bdsmovement.net/privacy-policy",
104
- "https://bdsmovement.net/get-involved/join-a-bds-campaign",
105
- "https://bdsmovement.net/donate_",
106
- "https://bdsmovement.net/user",
107
- "https://bdsmovement.net/admin"
108
- ],
109
- scrapResultPath: "./dataset/bdsmovement/website",
110
- jsonlOutputPath: "./dataset/bdsmovement/train.jsonl",
111
- textOutputPath: "./dataset/bdsmovement/texts",
112
- csvOutputPath: "./dataset/bdsmovement/train.csv",
113
- includeMetadata: true,
114
89
  metadataFields: ["author", "title", "description", "dateScrapedDate"],
115
- puppeteerProxy: "socks5://127.0.0.1:2080",
116
- puppeteerExecutablePath: "/usr/bin/chromium",
117
- puppeteerRealProxy: {
118
- host: "socks5://127.0.0.1",
119
- port: "2080",
120
- },
90
+ retryDelay: 10000
121
91
  });
122
92
  if ( enable )
123
93
  {
@@ -155,53 +125,11 @@ async function electronicintifada ( enable )
155
125
  textOutputPath: "./dataset/electronicintifada/texts",
156
126
  csvOutputPath: "./dataset/electronicintifada/train.csv",
157
127
  includeMetadata: true,
128
+ metadataFields: ["author", "title", "description", "dateScrapedDate"],
129
+ maxDepth: 10,
158
130
  maxArticles: 2000,
159
131
  axiosHeaders: headers,
160
- metadataFields: ["author", "title", "description", "dateScrapedDate"]
161
- });
162
- if ( enable )
163
- {
164
- await scraper.start();
165
- }
166
- return scraper;
167
- }
168
-
169
- async function palestineremembered ( enable )
170
- {
171
- // https://www.palestineremembered.com
172
- const scraper = new WebScraper({
173
- baseURL: "https://www.palestineremembered.com",
174
- startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
175
- excludeList: [
176
- "https://www.palestineremembered.com/GeoPoints",
177
- "https://www.palestineremembered.com/Donate",
178
- "https://www.palestineremembered.com/ContactUs.html",
179
- "https://www.palestineremembered.com/tags/Looting-Palestinian-properties.html",
180
- "https://www.palestineremembered.com/ar/",
181
- "https://www.palestineremembered.com/OldNewPictures.html",
182
- "https://www.palestineremembered.com/Maps/index.html",
183
- "https://www.palestineremembered.com/OralHistory/Interviews-Listing/",
184
- "https://www.palestineremembered.com/Acre/Famous-Zionist-Quotes/Story637.html",
185
- "https://www.palestineremembered.com/Articles/General/Story2045.html",
186
- "https://www.palestineremembered.com/AllTownsListing.html",
187
- "https://www.palestineremembered.com/Articles/General/ar/",
188
- "https://www.palestineremembered.com/SiteVideos.html"
189
- ],
190
- exactExcludeList: [
191
- "https://www.palestineremembered.com/index.html",
192
- "https://www.palestineremembered.com/ZionistFAQ.html"
193
- ],
194
- scrapResultPath: "./dataset/palestineremembered/website",
195
- jsonlOutputPath: "./dataset/palestineremembered/train.jsonl",
196
- textOutputPath: "./dataset/palestineremembered/texts",
197
- csvOutputPath: "./dataset/palestineremembered/train.csv",
198
- includeMetadata: true,
199
- metadataFields: ["author", "title", "description", "dateScrapedDate"],
200
- axiosProxy: {
201
- host: "localhost",
202
- port: 2080,
203
- protocol: "http"
204
- }
132
+ retryDelay: 10000
205
133
  });
206
134
  if ( enable )
207
135
  {
@@ -259,7 +187,6 @@ async function mondoweiss ( enable )
259
187
  jsonlOutputPath: "./dataset/mondoweiss/train.jsonl",
260
188
  textOutputPath: "./dataset/mondoweiss/texts",
261
189
  csvOutputPath: "./dataset/mondoweiss/train.csv",
262
- includeMetadata: true,
263
190
  maxArticles: 2500,
264
191
  maxRetries: 2,
265
192
  axiosHeaders: headers,
@@ -268,6 +195,9 @@ async function mondoweiss ( enable )
268
195
  port: 2080,
269
196
  protocol: "http"
270
197
  },
198
+ maxDepth: 10,
199
+ retryDelay: 10000,
200
+ includeMetadata: true,
271
201
  metadataFields: ["author", "title", "description", "dateScrapedDate"]
272
202
  });
273
203
  if ( enable )
@@ -277,14 +207,90 @@ async function mondoweiss ( enable )
277
207
  return scraper;
278
208
  }
279
209
 
210
+ async function bdsmovement ( enable )
211
+ {
212
+ // https://bdsmovement.net
213
+ const scraper = new WebScraper({
214
+ baseURL: "https://bdsmovement.net",
215
+ excludeList: [
216
+ "https://bdsmovement.net/press-area",
217
+ "https://bdsmovement.net/privacy-policy",
218
+ "https://bdsmovement.net/get-involved/join-a-bds-campaign",
219
+ "https://bdsmovement.net/donate_",
220
+ "https://bdsmovement.net/user",
221
+ "https://bdsmovement.net/admin"
222
+ ],
223
+ scrapResultPath: "./dataset/bdsmovement/website",
224
+ jsonlOutputPath: "./dataset/bdsmovement/train.jsonl",
225
+ textOutputPath: "./dataset/bdsmovement/texts",
226
+ csvOutputPath: "./dataset/bdsmovement/train.csv",
227
+ includeMetadata: true,
228
+ metadataFields: ["author", "title", "description", "dateScrapedDate"],
229
+ puppeteerProxy: "socks5://127.0.0.1:2080",
230
+ puppeteerExecutablePath: "/usr/bin/chromium",
231
+ puppeteerRealProxy: {
232
+ host: "socks5://127.0.0.1",
233
+ port: "2080",
234
+ },
235
+ });
236
+ if ( enable )
237
+ {
238
+ await scraper.start();
239
+ }
240
+ return scraper;
241
+ }
242
+
243
+ async function palestineremembered ( enable )
244
+ {
245
+ // https://www.palestineremembered.com
246
+ const scraper = new WebScraper({
247
+ baseURL: "https://www.palestineremembered.com",
248
+ startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
249
+ excludeList: [
250
+ "https://www.palestineremembered.com/GeoPoints",
251
+ "https://www.palestineremembered.com/Donate",
252
+ "https://www.palestineremembered.com/ContactUs.html",
253
+ "https://www.palestineremembered.com/tags/Looting-Palestinian-properties.html",
254
+ "https://www.palestineremembered.com/ar/",
255
+ "https://www.palestineremembered.com/OldNewPictures.html",
256
+ "https://www.palestineremembered.com/Maps/index.html",
257
+ "https://www.palestineremembered.com/OralHistory/Interviews-Listing/",
258
+ "https://www.palestineremembered.com/Acre/Famous-Zionist-Quotes/Story637.html",
259
+ "https://www.palestineremembered.com/Articles/General/Story2045.html",
260
+ "https://www.palestineremembered.com/AllTownsListing.html",
261
+ "https://www.palestineremembered.com/Articles/General/ar/",
262
+ "https://www.palestineremembered.com/SiteVideos.html"
263
+ ],
264
+ exactExcludeList: [
265
+ "https://www.palestineremembered.com/index.html",
266
+ "https://www.palestineremembered.com/ZionistFAQ.html"
267
+ ],
268
+ scrapResultPath: "./dataset/palestineremembered/website",
269
+ jsonlOutputPath: "./dataset/palestineremembered/train.jsonl",
270
+ textOutputPath: "./dataset/palestineremembered/texts",
271
+ csvOutputPath: "./dataset/palestineremembered/train.csv",
272
+ includeMetadata: true,
273
+ metadataFields: ["author", "title", "description", "dateScrapedDate"],
274
+ axiosProxy: {
275
+ host: "localhost",
276
+ port: 2080,
277
+ protocol: "http"
278
+ }
279
+ });
280
+ if ( enable )
281
+ {
282
+ await scraper.start();
283
+ }
284
+ return scraper;
285
+ }
280
286
 
281
287
  void async function main ()
282
288
  {
283
- const palianswersScraper = await palianswers( false );
284
- const decolonizepalestineScraper = await decolonizepalestine( false );
285
- const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( false );
286
- const electronicintifadaScraper = await electronicintifada( false );
287
- const standWithPalestineScraper = await standWithPalestine( false );
289
+ const palianswersScraper = await palianswers( true );
290
+ const decolonizepalestineScraper = await decolonizepalestine( true );
291
+ const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( true );
292
+ const electronicintifadaScraper = await electronicintifada( true );
293
+ const standWithPalestineScraper = await standWithPalestine( true );
288
294
  const mondoweisScraper = await mondoweiss( true );
289
295
  const bdsmovementScraper = await bdsmovement( false );
290
296
  const palestinerememberedScraper = await palestineremembered( false );
package/main.js CHANGED
@@ -16,6 +16,7 @@ class WebScraper
16
16
  maxArticles,
17
17
  concurrencyLimit,
18
18
  maxRetries,
19
+ retryDelay,
19
20
 
20
21
  // URL filtering
21
22
  excludeList = [],
@@ -53,6 +54,7 @@ class WebScraper
53
54
  this.maxArticles = maxArticles || Infinity;
54
55
  this.concurrencyLimit = concurrencyLimit || 2;
55
56
  this.maxRetries = maxRetries || 5;
57
+ this.retryDelay = retryDelay || 40000;
56
58
 
57
59
  // Output paths setup
58
60
  this.scrapResultPath = scrapResultPath;
@@ -357,6 +359,10 @@ class WebScraper
357
359
  {
358
360
  urlPath = "/index";
359
361
  }
362
+ else if ( urlPath.endsWith( "/" ) )
363
+ {
364
+ urlPath = urlPath.slice( 0, -1 );
365
+ }
360
366
  const filePath = path.join( __dirname, this.scrapResultPath, urlPath );
361
367
  const dir = path.dirname( filePath );
362
368
 
@@ -586,7 +592,7 @@ class WebScraper
586
592
  catch ( error )
587
593
  {
588
594
  if ( attempt >= this.maxRetries ) throw error;
589
- await WebScraper.sleep( 40000 * attempt );
595
+ await WebScraper.sleep( this.retryDelay * attempt );
590
596
  console.error( `Retrying request to ${url} (Attempt ${attempt + 1}/${this.maxRetries})`, error.message, error.code );
591
597
  }
592
598
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.8.1",
3
+ "version": "3.8.2",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",