clean-web-scraper 3.8.0 → 3.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/example-usage.js +113 -91
  2. package/main.js +14 -2
  3. package/package.json +1 -1
package/example-usage.js CHANGED
@@ -29,7 +29,8 @@ async function palianswers ( enable )
29
29
  textOutputPath: "./dataset/palianswers/texts",
30
30
  csvOutputPath: "./dataset/palianswers/train.csv",
31
31
  includeMetadata: true,
32
- metadataFields: ["author", "title", "description", "dateScrapedDate"]
32
+ metadataFields: ["author", "title", "description", "dateScrapedDate"],
33
+ retryDelay: 10000
33
34
  });
34
35
  if ( enable )
35
36
  {
@@ -54,7 +55,8 @@ async function khameneiIrFreePalestineTag ( enable )
54
55
  textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
55
56
  csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
56
57
  includeMetadata: true,
57
- metadataFields: ["author", "title", "description", "dateScrapedDate"]
58
+ metadataFields: ["author", "title", "description", "dateScrapedDate"],
59
+ retryDelay: 10000
58
60
  });
59
61
  if ( enable )
60
62
  {
@@ -84,40 +86,8 @@ async function decolonizepalestine ( enable )
84
86
  textOutputPath: "./dataset/decolonizepalestine/texts",
85
87
  csvOutputPath: "./dataset/decolonizepalestine/train.csv",
86
88
  includeMetadata: true,
87
- metadataFields: ["author", "title", "description", "dateScrapedDate"]
88
- });
89
- if ( enable )
90
- {
91
- await scraper.start();
92
- }
93
- return scraper;
94
- }
95
-
96
- async function bdsmovement ( enable )
97
- {
98
- // https://bdsmovement.net
99
- const scraper = new WebScraper({
100
- baseURL: "https://bdsmovement.net",
101
- excludeList: [
102
- "https://bdsmovement.net/press-area",
103
- "https://bdsmovement.net/privacy-policy",
104
- "https://bdsmovement.net/get-involved/join-a-bds-campaign",
105
- "https://bdsmovement.net/donate_",
106
- "https://bdsmovement.net/user",
107
- "https://bdsmovement.net/admin"
108
- ],
109
- scrapResultPath: "./dataset/bdsmovement/website",
110
- jsonlOutputPath: "./dataset/bdsmovement/train.jsonl",
111
- textOutputPath: "./dataset/bdsmovement/texts",
112
- csvOutputPath: "./dataset/bdsmovement/train.csv",
113
- includeMetadata: true,
114
89
  metadataFields: ["author", "title", "description", "dateScrapedDate"],
115
- puppeteerProxy: "socks5://127.0.0.1:2080",
116
- puppeteerExecutablePath: "/usr/bin/chromium",
117
- puppeteerRealProxy: {
118
- host: "socks5://127.0.0.1",
119
- port: "2080",
120
- },
90
+ retryDelay: 10000
121
91
  });
122
92
  if ( enable )
123
93
  {
@@ -155,53 +125,11 @@ async function electronicintifada ( enable )
155
125
  textOutputPath: "./dataset/electronicintifada/texts",
156
126
  csvOutputPath: "./dataset/electronicintifada/train.csv",
157
127
  includeMetadata: true,
128
+ metadataFields: ["author", "title", "description", "dateScrapedDate"],
129
+ maxDepth: 10,
158
130
  maxArticles: 2000,
159
131
  axiosHeaders: headers,
160
- metadataFields: ["author", "title", "description", "dateScrapedDate"]
161
- });
162
- if ( enable )
163
- {
164
- await scraper.start();
165
- }
166
- return scraper;
167
- }
168
-
169
- async function palestineremembered ( enable )
170
- {
171
- // https://www.palestineremembered.com
172
- const scraper = new WebScraper({
173
- baseURL: "https://www.palestineremembered.com",
174
- startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
175
- excludeList: [
176
- "https://www.palestineremembered.com/GeoPoints",
177
- "https://www.palestineremembered.com/Donate",
178
- "https://www.palestineremembered.com/ContactUs.html",
179
- "https://www.palestineremembered.com/tags/Looting-Palestinian-properties.html",
180
- "https://www.palestineremembered.com/ar/",
181
- "https://www.palestineremembered.com/OldNewPictures.html",
182
- "https://www.palestineremembered.com/Maps/index.html",
183
- "https://www.palestineremembered.com/OralHistory/Interviews-Listing/",
184
- "https://www.palestineremembered.com/Acre/Famous-Zionist-Quotes/Story637.html",
185
- "https://www.palestineremembered.com/Articles/General/Story2045.html",
186
- "https://www.palestineremembered.com/AllTownsListing.html",
187
- "https://www.palestineremembered.com/Articles/General/ar/",
188
- "https://www.palestineremembered.com/SiteVideos.html"
189
- ],
190
- exactExcludeList: [
191
- "https://www.palestineremembered.com/index.html",
192
- "https://www.palestineremembered.com/ZionistFAQ.html"
193
- ],
194
- scrapResultPath: "./dataset/palestineremembered/website",
195
- jsonlOutputPath: "./dataset/palestineremembered/train.jsonl",
196
- textOutputPath: "./dataset/palestineremembered/texts",
197
- csvOutputPath: "./dataset/palestineremembered/train.csv",
198
- includeMetadata: true,
199
- metadataFields: ["author", "title", "description", "dateScrapedDate"],
200
- axiosProxy: {
201
- host: "localhost",
202
- port: 2080,
203
- protocol: "http"
204
- }
132
+ retryDelay: 10000
205
133
  });
206
134
  if ( enable )
207
135
  {
@@ -240,15 +168,36 @@ async function mondoweiss ( enable )
240
168
  "https://mondoweiss.net/donate",
241
169
  "https://mondoweiss.net/advertise/",
242
170
  "https://mondoweiss.net/contact/",
243
- "https://mondoweiss.net/recent-comments/"
171
+ "https://mondoweiss.net/recent-comments/",
172
+ "https://mondoweiss.net/email-newsletters",
173
+ "https://mondoweiss.net/author",
174
+ "https://mondoweiss.net/tag/"
175
+ ],
176
+ exactExcludeList: [
177
+ "https://mondoweiss.net",
178
+ "https://mondoweiss.net/news/",
179
+ "https://mondoweiss.net/opinion/",
180
+ "https://mondoweiss.net/ways-to-give/",
181
+ "https://mondoweiss.net/media-analysis/",
182
+ "https://mondoweiss.net/culture/",
183
+ "https://mondoweiss.net/activism/",
184
+ "https://mondoweiss.net/news-letters/"
244
185
  ],
245
186
  scrapResultPath: "./dataset/mondoweiss/website",
246
187
  jsonlOutputPath: "./dataset/mondoweiss/train.jsonl",
247
188
  textOutputPath: "./dataset/mondoweiss/texts",
248
189
  csvOutputPath: "./dataset/mondoweiss/train.csv",
249
- includeMetadata: true,
250
190
  maxArticles: 2500,
191
+ maxRetries: 2,
251
192
  axiosHeaders: headers,
193
+ axiosProxy: {
194
+ host: "localhost",
195
+ port: 2080,
196
+ protocol: "http"
197
+ },
198
+ maxDepth: 10,
199
+ retryDelay: 10000,
200
+ includeMetadata: true,
252
201
  metadataFields: ["author", "title", "description", "dateScrapedDate"]
253
202
  });
254
203
  if ( enable )
@@ -258,14 +207,90 @@ async function mondoweiss ( enable )
258
207
  return scraper;
259
208
  }
260
209
 
210
+ async function bdsmovement ( enable )
211
+ {
212
+ // https://bdsmovement.net
213
+ const scraper = new WebScraper({
214
+ baseURL: "https://bdsmovement.net",
215
+ excludeList: [
216
+ "https://bdsmovement.net/press-area",
217
+ "https://bdsmovement.net/privacy-policy",
218
+ "https://bdsmovement.net/get-involved/join-a-bds-campaign",
219
+ "https://bdsmovement.net/donate_",
220
+ "https://bdsmovement.net/user",
221
+ "https://bdsmovement.net/admin"
222
+ ],
223
+ scrapResultPath: "./dataset/bdsmovement/website",
224
+ jsonlOutputPath: "./dataset/bdsmovement/train.jsonl",
225
+ textOutputPath: "./dataset/bdsmovement/texts",
226
+ csvOutputPath: "./dataset/bdsmovement/train.csv",
227
+ includeMetadata: true,
228
+ metadataFields: ["author", "title", "description", "dateScrapedDate"],
229
+ puppeteerProxy: "socks5://127.0.0.1:2080",
230
+ puppeteerExecutablePath: "/usr/bin/chromium",
231
+ puppeteerRealProxy: {
232
+ host: "socks5://127.0.0.1",
233
+ port: "2080",
234
+ },
235
+ });
236
+ if ( enable )
237
+ {
238
+ await scraper.start();
239
+ }
240
+ return scraper;
241
+ }
242
+
243
+ async function palestineremembered ( enable )
244
+ {
245
+ // https://www.palestineremembered.com
246
+ const scraper = new WebScraper({
247
+ baseURL: "https://www.palestineremembered.com",
248
+ startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
249
+ excludeList: [
250
+ "https://www.palestineremembered.com/GeoPoints",
251
+ "https://www.palestineremembered.com/Donate",
252
+ "https://www.palestineremembered.com/ContactUs.html",
253
+ "https://www.palestineremembered.com/tags/Looting-Palestinian-properties.html",
254
+ "https://www.palestineremembered.com/ar/",
255
+ "https://www.palestineremembered.com/OldNewPictures.html",
256
+ "https://www.palestineremembered.com/Maps/index.html",
257
+ "https://www.palestineremembered.com/OralHistory/Interviews-Listing/",
258
+ "https://www.palestineremembered.com/Acre/Famous-Zionist-Quotes/Story637.html",
259
+ "https://www.palestineremembered.com/Articles/General/Story2045.html",
260
+ "https://www.palestineremembered.com/AllTownsListing.html",
261
+ "https://www.palestineremembered.com/Articles/General/ar/",
262
+ "https://www.palestineremembered.com/SiteVideos.html"
263
+ ],
264
+ exactExcludeList: [
265
+ "https://www.palestineremembered.com/index.html",
266
+ "https://www.palestineremembered.com/ZionistFAQ.html"
267
+ ],
268
+ scrapResultPath: "./dataset/palestineremembered/website",
269
+ jsonlOutputPath: "./dataset/palestineremembered/train.jsonl",
270
+ textOutputPath: "./dataset/palestineremembered/texts",
271
+ csvOutputPath: "./dataset/palestineremembered/train.csv",
272
+ includeMetadata: true,
273
+ metadataFields: ["author", "title", "description", "dateScrapedDate"],
274
+ axiosProxy: {
275
+ host: "localhost",
276
+ port: 2080,
277
+ protocol: "http"
278
+ }
279
+ });
280
+ if ( enable )
281
+ {
282
+ await scraper.start();
283
+ }
284
+ return scraper;
285
+ }
261
286
 
262
287
  void async function main ()
263
288
  {
264
- const palianswersScraper = await palianswers( false );
265
- const decolonizepalestineScraper = await decolonizepalestine( false );
266
- const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( false );
267
- const electronicintifadaScraper = await electronicintifada( false );
268
- const standWithPalestineScraper = await standWithPalestine( false );
289
+ const palianswersScraper = await palianswers( true );
290
+ const decolonizepalestineScraper = await decolonizepalestine( true );
291
+ const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( true );
292
+ const electronicintifadaScraper = await electronicintifada( true );
293
+ const standWithPalestineScraper = await standWithPalestine( true );
269
294
  const mondoweisScraper = await mondoweiss( true );
270
295
  const bdsmovementScraper = await bdsmovement( false );
271
296
  const palestinerememberedScraper = await palestineremembered( false );
@@ -278,7 +303,4 @@ void async function main ()
278
303
  standWithPalestineScraper,
279
304
  mondoweisScraper
280
305
  ] );
281
- }()
282
-
283
-
284
- // https://mondoweiss.net
306
+ }()
package/main.js CHANGED
@@ -16,12 +16,14 @@ class WebScraper
16
16
  maxArticles,
17
17
  concurrencyLimit,
18
18
  maxRetries,
19
+ retryDelay,
19
20
 
20
21
  // URL filtering
21
22
  excludeList = [],
22
23
  exactExcludeList = [],
23
24
  filterFileTypes,
24
25
  excludedFileTypes,
26
+ removeURLFragment,
25
27
 
26
28
  // Output paths
27
29
  scrapResultPath = "./dataset",
@@ -52,6 +54,7 @@ class WebScraper
52
54
  this.maxArticles = maxArticles || Infinity;
53
55
  this.concurrencyLimit = concurrencyLimit || 2;
54
56
  this.maxRetries = maxRetries || 5;
57
+ this.retryDelay = retryDelay || 40000;
55
58
 
56
59
  // Output paths setup
57
60
  this.scrapResultPath = scrapResultPath;
@@ -72,6 +75,7 @@ class WebScraper
72
75
  this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
73
76
  this.filterFileTypes = filterFileTypes || true;
74
77
  this.excludedFileTypes = excludedFileTypes || [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"];
78
+ this.removeURLFragment = removeURLFragment || true;
75
79
 
76
80
  // Network configuration
77
81
  this.axiosHeaders = axiosHeaders;
@@ -130,6 +134,10 @@ class WebScraper
130
134
 
131
135
  async fetchPage ( url, depth )
132
136
  {
137
+ if ( this.removeURLFragment )
138
+ {
139
+ url = url.split( "#" )[0];
140
+ }
133
141
  if ( this.hasReachedMax( depth ) )
134
142
  {
135
143
  return;
@@ -233,7 +241,7 @@ class WebScraper
233
241
  }
234
242
  catch ( error )
235
243
  {
236
- console.error( `Error fetching ${url}:`, error.message );
244
+ console.error( `Error fetching content ${url}:`, error.message );
237
245
  if ( error.status = 403 && this.usePuppeteer )
238
246
  {
239
247
  try
@@ -351,6 +359,10 @@ class WebScraper
351
359
  {
352
360
  urlPath = "/index";
353
361
  }
362
+ else if ( urlPath.endsWith( "/" ) )
363
+ {
364
+ urlPath = urlPath.slice( 0, -1 );
365
+ }
354
366
  const filePath = path.join( __dirname, this.scrapResultPath, urlPath );
355
367
  const dir = path.dirname( filePath );
356
368
 
@@ -580,7 +592,7 @@ class WebScraper
580
592
  catch ( error )
581
593
  {
582
594
  if ( attempt >= this.maxRetries ) throw error;
583
- await WebScraper.sleep( 40000 * attempt );
595
+ await WebScraper.sleep( this.retryDelay * attempt );
584
596
  console.error( `Retrying request to ${url} (Attempt ${attempt + 1}/${this.maxRetries})`, error.message, error.code );
585
597
  }
586
598
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.8.0",
3
+ "version": "3.8.2",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",