clean-web-scraper 3.8.1 → 3.8.3

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
package/example-usage.js CHANGED
@@ -29,7 +29,8 @@ async function palianswers ( enable )
29
29
  textOutputPath: "./dataset/palianswers/texts",
30
30
  csvOutputPath: "./dataset/palianswers/train.csv",
31
31
  includeMetadata: true,
32
- metadataFields: ["author", "title", "description", "dateScrapedDate"]
32
+ metadataFields: ["author", "title", "description", "dateScrapedDate"],
33
+ retryDelay: 10000
33
34
  });
34
35
  if ( enable )
35
36
  {
@@ -54,7 +55,8 @@ async function khameneiIrFreePalestineTag ( enable )
54
55
  textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
55
56
  csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
56
57
  includeMetadata: true,
57
- metadataFields: ["author", "title", "description", "dateScrapedDate"]
58
+ metadataFields: ["author", "title", "description", "dateScrapedDate"],
59
+ retryDelay: 10000
58
60
  });
59
61
  if ( enable )
60
62
  {
@@ -84,40 +86,8 @@ async function decolonizepalestine ( enable )
84
86
  textOutputPath: "./dataset/decolonizepalestine/texts",
85
87
  csvOutputPath: "./dataset/decolonizepalestine/train.csv",
86
88
  includeMetadata: true,
87
- metadataFields: ["author", "title", "description", "dateScrapedDate"]
88
- });
89
- if ( enable )
90
- {
91
- await scraper.start();
92
- }
93
- return scraper;
94
- }
95
-
96
- async function bdsmovement ( enable )
97
- {
98
- // https://bdsmovement.net
99
- const scraper = new WebScraper({
100
- baseURL: "https://bdsmovement.net",
101
- excludeList: [
102
- "https://bdsmovement.net/press-area",
103
- "https://bdsmovement.net/privacy-policy",
104
- "https://bdsmovement.net/get-involved/join-a-bds-campaign",
105
- "https://bdsmovement.net/donate_",
106
- "https://bdsmovement.net/user",
107
- "https://bdsmovement.net/admin"
108
- ],
109
- scrapResultPath: "./dataset/bdsmovement/website",
110
- jsonlOutputPath: "./dataset/bdsmovement/train.jsonl",
111
- textOutputPath: "./dataset/bdsmovement/texts",
112
- csvOutputPath: "./dataset/bdsmovement/train.csv",
113
- includeMetadata: true,
114
89
  metadataFields: ["author", "title", "description", "dateScrapedDate"],
115
- puppeteerProxy: "socks5://127.0.0.1:2080",
116
- puppeteerExecutablePath: "/usr/bin/chromium",
117
- puppeteerRealProxy: {
118
- host: "socks5://127.0.0.1",
119
- port: "2080",
120
- },
90
+ retryDelay: 10000
121
91
  });
122
92
  if ( enable )
123
93
  {
@@ -155,53 +125,12 @@ async function electronicintifada ( enable )
155
125
  textOutputPath: "./dataset/electronicintifada/texts",
156
126
  csvOutputPath: "./dataset/electronicintifada/train.csv",
157
127
  includeMetadata: true,
128
+ metadataFields: ["author", "title", "description", "dateScrapedDate"],
129
+ maxDepth: 10,
158
130
  maxArticles: 2000,
131
+ concurrencyLimit: 4,
159
132
  axiosHeaders: headers,
160
- metadataFields: ["author", "title", "description", "dateScrapedDate"]
161
- });
162
- if ( enable )
163
- {
164
- await scraper.start();
165
- }
166
- return scraper;
167
- }
168
-
169
- async function palestineremembered ( enable )
170
- {
171
- // https://www.palestineremembered.com
172
- const scraper = new WebScraper({
173
- baseURL: "https://www.palestineremembered.com",
174
- startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
175
- excludeList: [
176
- "https://www.palestineremembered.com/GeoPoints",
177
- "https://www.palestineremembered.com/Donate",
178
- "https://www.palestineremembered.com/ContactUs.html",
179
- "https://www.palestineremembered.com/tags/Looting-Palestinian-properties.html",
180
- "https://www.palestineremembered.com/ar/",
181
- "https://www.palestineremembered.com/OldNewPictures.html",
182
- "https://www.palestineremembered.com/Maps/index.html",
183
- "https://www.palestineremembered.com/OralHistory/Interviews-Listing/",
184
- "https://www.palestineremembered.com/Acre/Famous-Zionist-Quotes/Story637.html",
185
- "https://www.palestineremembered.com/Articles/General/Story2045.html",
186
- "https://www.palestineremembered.com/AllTownsListing.html",
187
- "https://www.palestineremembered.com/Articles/General/ar/",
188
- "https://www.palestineremembered.com/SiteVideos.html"
189
- ],
190
- exactExcludeList: [
191
- "https://www.palestineremembered.com/index.html",
192
- "https://www.palestineremembered.com/ZionistFAQ.html"
193
- ],
194
- scrapResultPath: "./dataset/palestineremembered/website",
195
- jsonlOutputPath: "./dataset/palestineremembered/train.jsonl",
196
- textOutputPath: "./dataset/palestineremembered/texts",
197
- csvOutputPath: "./dataset/palestineremembered/train.csv",
198
- includeMetadata: true,
199
- metadataFields: ["author", "title", "description", "dateScrapedDate"],
200
- axiosProxy: {
201
- host: "localhost",
202
- port: 2080,
203
- protocol: "http"
204
- }
133
+ retryDelay: 10000
205
134
  });
206
135
  if ( enable )
207
136
  {
@@ -259,15 +188,18 @@ async function mondoweiss ( enable )
259
188
  jsonlOutputPath: "./dataset/mondoweiss/train.jsonl",
260
189
  textOutputPath: "./dataset/mondoweiss/texts",
261
190
  csvOutputPath: "./dataset/mondoweiss/train.csv",
262
- includeMetadata: true,
263
191
  maxArticles: 2500,
264
192
  maxRetries: 2,
193
+ concurrencyLimit: 4,
265
194
  axiosHeaders: headers,
266
195
  axiosProxy: {
267
196
  host: "localhost",
268
197
  port: 2080,
269
198
  protocol: "http"
270
199
  },
200
+ maxDepth: 10,
201
+ retryDelay: 10000,
202
+ includeMetadata: true,
271
203
  metadataFields: ["author", "title", "description", "dateScrapedDate"]
272
204
  });
273
205
  if ( enable )
@@ -277,14 +209,90 @@ async function mondoweiss ( enable )
277
209
  return scraper;
278
210
  }
279
211
 
212
+ async function bdsmovement ( enable )
213
+ {
214
+ // https://bdsmovement.net
215
+ const scraper = new WebScraper({
216
+ baseURL: "https://bdsmovement.net",
217
+ excludeList: [
218
+ "https://bdsmovement.net/press-area",
219
+ "https://bdsmovement.net/privacy-policy",
220
+ "https://bdsmovement.net/get-involved/join-a-bds-campaign",
221
+ "https://bdsmovement.net/donate_",
222
+ "https://bdsmovement.net/user",
223
+ "https://bdsmovement.net/admin"
224
+ ],
225
+ scrapResultPath: "./dataset/bdsmovement/website",
226
+ jsonlOutputPath: "./dataset/bdsmovement/train.jsonl",
227
+ textOutputPath: "./dataset/bdsmovement/texts",
228
+ csvOutputPath: "./dataset/bdsmovement/train.csv",
229
+ includeMetadata: true,
230
+ metadataFields: ["author", "title", "description", "dateScrapedDate"],
231
+ puppeteerProxy: "socks5://127.0.0.1:2080",
232
+ puppeteerExecutablePath: "/usr/bin/chromium",
233
+ puppeteerRealProxy: {
234
+ host: "socks5://127.0.0.1",
235
+ port: "2080",
236
+ },
237
+ });
238
+ if ( enable )
239
+ {
240
+ await scraper.start();
241
+ }
242
+ return scraper;
243
+ }
244
+
245
+ async function palestineremembered ( enable )
246
+ {
247
+ // https://www.palestineremembered.com
248
+ const scraper = new WebScraper({
249
+ baseURL: "https://www.palestineremembered.com",
250
+ startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
251
+ excludeList: [
252
+ "https://www.palestineremembered.com/GeoPoints",
253
+ "https://www.palestineremembered.com/Donate",
254
+ "https://www.palestineremembered.com/ContactUs.html",
255
+ "https://www.palestineremembered.com/tags/Looting-Palestinian-properties.html",
256
+ "https://www.palestineremembered.com/ar/",
257
+ "https://www.palestineremembered.com/OldNewPictures.html",
258
+ "https://www.palestineremembered.com/Maps/index.html",
259
+ "https://www.palestineremembered.com/OralHistory/Interviews-Listing/",
260
+ "https://www.palestineremembered.com/Acre/Famous-Zionist-Quotes/Story637.html",
261
+ "https://www.palestineremembered.com/Articles/General/Story2045.html",
262
+ "https://www.palestineremembered.com/AllTownsListing.html",
263
+ "https://www.palestineremembered.com/Articles/General/ar/",
264
+ "https://www.palestineremembered.com/SiteVideos.html"
265
+ ],
266
+ exactExcludeList: [
267
+ "https://www.palestineremembered.com/index.html",
268
+ "https://www.palestineremembered.com/ZionistFAQ.html"
269
+ ],
270
+ scrapResultPath: "./dataset/palestineremembered/website",
271
+ jsonlOutputPath: "./dataset/palestineremembered/train.jsonl",
272
+ textOutputPath: "./dataset/palestineremembered/texts",
273
+ csvOutputPath: "./dataset/palestineremembered/train.csv",
274
+ includeMetadata: true,
275
+ metadataFields: ["author", "title", "description", "dateScrapedDate"],
276
+ axiosProxy: {
277
+ host: "localhost",
278
+ port: 2080,
279
+ protocol: "http"
280
+ }
281
+ });
282
+ if ( enable )
283
+ {
284
+ await scraper.start();
285
+ }
286
+ return scraper;
287
+ }
280
288
 
281
289
  void async function main ()
282
290
  {
283
- const palianswersScraper = await palianswers( false );
284
- const decolonizepalestineScraper = await decolonizepalestine( false );
285
- const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( false );
286
- const electronicintifadaScraper = await electronicintifada( false );
287
- const standWithPalestineScraper = await standWithPalestine( false );
291
+ const palianswersScraper = await palianswers( true );
292
+ const decolonizepalestineScraper = await decolonizepalestine( true );
293
+ const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( true );
294
+ const electronicintifadaScraper = await electronicintifada( true );
295
+ const standWithPalestineScraper = await standWithPalestine( true );
288
296
  const mondoweisScraper = await mondoweiss( true );
289
297
  const bdsmovementScraper = await bdsmovement( false );
290
298
  const palestinerememberedScraper = await palestineremembered( false );
package/main.js CHANGED
@@ -16,6 +16,7 @@ class WebScraper
16
16
  maxArticles,
17
17
  concurrencyLimit,
18
18
  maxRetries,
19
+ retryDelay,
19
20
 
20
21
  // URL filtering
21
22
  excludeList = [],
@@ -53,6 +54,7 @@ class WebScraper
53
54
  this.maxArticles = maxArticles || Infinity;
54
55
  this.concurrencyLimit = concurrencyLimit || 2;
55
56
  this.maxRetries = maxRetries || 5;
57
+ this.retryDelay = retryDelay || 40000;
56
58
 
57
59
  // Output paths setup
58
60
  this.scrapResultPath = scrapResultPath;
@@ -357,6 +359,10 @@ class WebScraper
357
359
  {
358
360
  urlPath = "/index";
359
361
  }
362
+ else if ( urlPath.endsWith( "/" ) )
363
+ {
364
+ urlPath = urlPath.slice( 0, -1 );
365
+ }
360
366
  const filePath = path.join( __dirname, this.scrapResultPath, urlPath );
361
367
  const dir = path.dirname( filePath );
362
368
 
@@ -579,14 +585,14 @@ class WebScraper
579
585
  {
580
586
  if ( this.hasReachedMax( ) )
581
587
  {
582
- throw new Error( "Max reached" );
588
+ break;
583
589
  }
584
590
  return await axios.get( url, options );
585
591
  }
586
592
  catch ( error )
587
593
  {
588
594
  if ( attempt >= this.maxRetries ) throw error;
589
- await WebScraper.sleep( 40000 * attempt );
595
+ await WebScraper.sleep( this.retryDelay * attempt );
590
596
  console.error( `Retrying request to ${url} (Attempt ${attempt + 1}/${this.maxRetries})`, error.message, error.code );
591
597
  }
592
598
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.8.1",
3
+ "version": "3.8.3",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",