clean-web-scraper 4.2.3 → 4.3.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -91,7 +91,7 @@ const docsScraper = new WebScraper({
91
91
  scrapResultPath: './datasets/docs',
92
92
  maxDepth: 3, // Optional: Maximum depth for recursive crawling
93
93
  includeMetadata: true, // Optional: Include metadata in output files
94
- metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
94
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate"],
95
95
  // Optional: Specify metadata fields to include
96
96
  });
97
97
 
@@ -101,7 +101,7 @@ const blogScraper = new WebScraper({
101
101
  scrapResultPath: './datasets/blog',
102
102
  maxDepth: 3, // Optional: Maximum depth for recursive crawling
103
103
  includeMetadata: true, // Optional: Include metadata in output files
104
- metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
104
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate"],
105
105
  // Optional: Specify metadata fields to include
106
106
  });
107
107
 
package/example-usage.js CHANGED
@@ -37,9 +37,8 @@ async function palianswers ( enable )
37
37
  textOutputPath: "./dataset/palianswers/texts",
38
38
  csvOutputPath: "./dataset/palianswers/train.csv",
39
39
  includeMetadata: true,
40
- metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
40
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
41
41
  axiosRetryDelay: 10000,
42
- crawlingDelay: 0
43
42
  };
44
43
  return await runScraper( config, enable );
45
44
  }
@@ -50,7 +49,7 @@ async function khameneiIrFreePalestineTag ( enable )
50
49
  baseURL: "https://english.khamenei.ir/news",
51
50
  startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
52
51
  maxDepth: 1,
53
- maxArticles: 2,
52
+ maxArticles: 200,
54
53
  exactExcludeList: [
55
54
  "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
56
55
  "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100"
@@ -60,7 +59,7 @@ async function khameneiIrFreePalestineTag ( enable )
60
59
  textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
61
60
  csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
62
61
  includeMetadata: true,
63
- metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
62
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
64
63
  axiosRetryDelay: 10000,
65
64
  };
66
65
  return await runScraper( config, enable );
@@ -72,8 +71,8 @@ async function khameneiIrPalestineSpecialPage ( enable )
72
71
  const config = {
73
72
  baseURL: "https://english.khamenei.ir/news",
74
73
  startURL: "https://english.khamenei.ir/palestine-special-page",
75
- maxDepth: 2,
76
- maxArticles: 2,
74
+ maxDepth: 1,
75
+ maxArticles: 200,
77
76
  exactExcludeList: [
78
77
  "https://english.khamenei.ir/palestine-special-page/"
79
78
  ],
@@ -82,7 +81,7 @@ async function khameneiIrPalestineSpecialPage ( enable )
82
81
  textOutputPath: "./dataset/khamenei-ir-palestine-special-page/texts",
83
82
  csvOutputPath: "./dataset/khamenei-ir-palestine-special-page/train.csv",
84
83
  includeMetadata: true,
85
- metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
84
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
86
85
  axiosRetryDelay: 10000
87
86
  };
88
87
  return await runScraper( config, enable );
@@ -103,13 +102,13 @@ async function decolonizepalestine ( enable )
103
102
  "https://decolonizepalestine.com/rainbow-washing",
104
103
  "https://decolonizepalestine.com/"
105
104
  ],
106
- maxArticles: 2,
105
+ maxArticles: 400,
107
106
  scrapResultPath: "./dataset/decolonizepalestine/website",
108
107
  jsonlOutputPath: "./dataset/decolonizepalestine/train.jsonl",
109
108
  textOutputPath: "./dataset/decolonizepalestine/texts",
110
109
  csvOutputPath: "./dataset/decolonizepalestine/train.csv",
111
110
  includeMetadata: true,
112
- metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
111
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
113
112
  axiosRetryDelay: 10000,
114
113
  };
115
114
  return await runScraper( config, enable );
@@ -123,7 +122,6 @@ async function electronicintifada ( enable )
123
122
  "https://electronicintifada.net/updates",
124
123
  "https://electronicintifada.net/taxonomy/term/",
125
124
  "https://electronicintifada.net/tags/",
126
- "https://electronicintifada.net/blog",
127
125
  "https://electronicintifada.net/people",
128
126
  "https://electronicintifada.net/location",
129
127
  "https://electronicintifada.net/file",
@@ -134,11 +132,15 @@ async function electronicintifada ( enable )
134
132
  "https://electronicintifada.net/opinion",
135
133
  "https://electronicintifada.net/about-ei",
136
134
  "https://electronicintifada.net/review",
137
- "https://electronicintifada.net/artmusicculture"
135
+ "https://electronicintifada.net/artmusicculture",
136
+ "https://electronicintifada.net/blog/editors",
138
137
  ],
139
138
  exactExcludeList: [
140
- "https://electronicintifada.net",
141
139
  "https://electronicintifada.net/blog",
140
+ /^https:\/\/electronicintifada\.net\/blog\/.*/,
141
+ /^https:\/\/electronicintifada\.net\/blog\?page=\d+$/,
142
+ "https://electronicintifada.net",
143
+ "https://electronicintifada.net/blogs",
142
144
  "https://electronicintifada.net/review",
143
145
  ],
144
146
  scrapResultPath: "./dataset/electronicintifada/website",
@@ -146,19 +148,19 @@ async function electronicintifada ( enable )
146
148
  textOutputPath: "./dataset/electronicintifada/texts",
147
149
  csvOutputPath: "./dataset/electronicintifada/train.csv",
148
150
  includeMetadata: true,
149
- metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
150
- maxDepth: 16,
151
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
151
152
  maxArticles: 2000,
153
+ maxDepth: 16,
154
+ batchSize: 40,
152
155
  axiosHeaders: headers,
153
156
  axiosMaxRetries: 2,
154
- axiosRetryDelay: 10000,
157
+ axiosRetryDelay: 8000,
155
158
  axiosProxy: {
156
159
  host: "localhost",
157
160
  port: 2080,
158
161
  protocol: "http"
159
162
  },
160
163
  useProxyAsFallback: true,
161
- crawlingDelay: 1
162
164
  };
163
165
  return await runScraper( config, enable );
164
166
  }
@@ -175,7 +177,7 @@ async function standWithPalestine ( enable )
175
177
  csvOutputPath: "./dataset/stand-with-palestine/train.csv",
176
178
  axiosHeaders: headers,
177
179
  includeMetadata: true,
178
- metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"]
180
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"]
179
181
  };
180
182
  return await runScraper( config, enable );
181
183
  }
@@ -211,22 +213,26 @@ async function mondoweiss ( enable )
211
213
  "https://mondoweiss.net/activism/",
212
214
  "https://mondoweiss.net/news-letters/",
213
215
  "https://mondoweiss.net/newsletters",
214
- /^https:\/\/mondoweiss\.net\/\d{4}\/\d{2}$/,
215
- /^https:\/\/mondoweiss\.net\/\d{4}\/$/,
216
216
  "https://mondoweiss.net/daily-headlines",
217
217
  "https://mondoweiss.net/palestineletter",
218
218
  "https://mondoweiss.net/podcasts/",
219
219
  "https://mondoweiss.net/the-shift",
220
- "https://mondoweiss.net/weekly-briefing"
220
+ "https://mondoweiss.net/weekly-briefing",
221
+ "https://mondoweiss.net/contact/",
222
+ /^https:\/\/mondoweiss\.net\/\d{4}\/\d{2}\/?$/,
223
+ /^https:\/\/mondoweiss\.net\/\d{4}\/?$/
221
224
  ],
222
225
  scrapResultPath: "./dataset/mondoweiss/website",
223
226
  jsonlOutputPath: "./dataset/mondoweiss/train.jsonl",
224
227
  textOutputPath: "./dataset/mondoweiss/texts",
225
228
  csvOutputPath: "./dataset/mondoweiss/train.csv",
229
+ includeMetadata: true,
230
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
226
231
  maxArticles: 2500,
227
232
  maxDepth: 15,
233
+ batchSize: 20,
228
234
  axiosHeaders: headers,
229
- axiosMaxRetries: 3,
235
+ axiosMaxRetries: 2,
230
236
  axiosRetryDelay: 10000,
231
237
  axiosProxy: {
232
238
  host: "localhost",
@@ -234,10 +240,6 @@ async function mondoweiss ( enable )
234
240
  protocol: "http"
235
241
  },
236
242
  useProxyAsFallback: true,
237
- includeMetadata: true,
238
- metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
239
- crawlingDelay: 0,
240
- batchSize: 20
241
243
  };
242
244
  return await runScraper( config, enable );
243
245
  }
@@ -251,19 +253,36 @@ async function bdsmovement ( enable )
251
253
  "https://bdsmovement.net/privacy-policy",
252
254
  "https://bdsmovement.net/get-involved/join-a-bds-campaign",
253
255
  "https://bdsmovement.net/donate_",
256
+ "https://bdsmovement.net/donate",
254
257
  "https://bdsmovement.net/user",
255
- "https://bdsmovement.net/admin"
258
+ "https://bdsmovement.net/admin",
259
+ "https://bdsmovement.net/stay-updated",
260
+ "https://bdsmovement.net/join-a-bds-campaign",
261
+ "https://bdsmovement.net/contact-us",
262
+ "https://bdsmovement.net/taxonomy",
263
+ "https://bdsmovement.net/news-type",
264
+ "https://bdsmovement.net/cdn-cgi"
265
+ ],
266
+ exactExcludeList: [
267
+ "https://bdsmovement.net/",
268
+ "https://bdsmovement.net/shutdownnation",
269
+ "https://bdsmovement.net/campaigns",
270
+ "https://bdsmovement.net/resources",
271
+ /^https:\/\/bdsmovement\.net\/resources\?page=\d+$/,
272
+ /^https:\/\/bdsmovement\.net\/resources\?campaign=\d+$/,
273
+ /^https:\/\/bdsmovement\.net\/resources\?type=\d+$/,
274
+ /^https:\/\/bdsmovement\.net\/news\?type=\d+$/,
275
+ /^https:\/\/bdsmovement\.net\/news\?campaign=\d+$/,
256
276
  ],
257
277
  scrapResultPath: "./dataset/bdsmovement/website",
258
278
  jsonlOutputPath: "./dataset/bdsmovement/train.jsonl",
259
279
  textOutputPath: "./dataset/bdsmovement/texts",
260
280
  csvOutputPath: "./dataset/bdsmovement/train.csv",
261
281
  includeMetadata: true,
262
- metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
263
- puppeteerRealProxy: {
264
- host: "socks5://127.0.0.1",
265
- port: "2080",
266
- },
282
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
283
+ maxArticles: 2000,
284
+ maxDepth: 16,
285
+ batchSize: 20
267
286
  };
268
287
  return await runScraper( config, enable );
269
288
  }
@@ -297,7 +316,8 @@ async function palestineremembered ( enable )
297
316
  textOutputPath: "./dataset/palestineremembered/texts",
298
317
  csvOutputPath: "./dataset/palestineremembered/train.csv",
299
318
  includeMetadata: true,
300
- metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
319
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
320
+ batchSize: 10,
301
321
  axiosProxy: {
302
322
  host: "localhost",
303
323
  port: 2080,
@@ -309,23 +329,24 @@ async function palestineremembered ( enable )
309
329
 
310
330
  void async function main ()
311
331
  {
312
- // const palianswersScraper = await palianswers( true );
332
+ const palianswersScraper = await palianswers( true );
313
333
  const decolonizepalestineScraper = await decolonizepalestine( true );
314
334
  const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( true );
315
- // const khameneiIrPalestineSpecialPageScraper = await khameneiIrPalestineSpecialPage( true );
316
- // const electronicintifadaScraper = await electronicintifada( true );
317
- // const standWithPalestineScraper = await standWithPalestine( true );
318
- // const mondoweisScraper = await mondoweiss( true );
319
- // const bdsmovementScraper = await bdsmovement( false );
335
+ const khameneiIrPalestineSpecialPageScraper = await khameneiIrPalestineSpecialPage( true );
336
+ const electronicintifadaScraper = await electronicintifada( true );
337
+ const standWithPalestineScraper = await standWithPalestine( true );
338
+ const mondoweisScraper = await mondoweiss( true );
339
+ const bdsmovementScraper = await bdsmovement( true );
320
340
  // const palestinerememberedScraper = await palestineremembered( false );
321
341
 
322
342
  await WebScraper.combineResults( "./dataset/combined", [
323
- // palianswersScraper,
343
+ palianswersScraper,
324
344
  decolonizepalestineScraper,
325
345
  khameneiIrFreePalestineTagScraper,
326
- // khameneiIrPalestineSpecialPageScraper,
327
- // electronicintifadaScraper,
328
- // standWithPalestineScraper,
329
- // mondoweisScraper
346
+ khameneiIrPalestineSpecialPageScraper,
347
+ electronicintifadaScraper,
348
+ standWithPalestineScraper,
349
+ mondoweisScraper,
350
+ bdsmovementScraper,
330
351
  ] );
331
352
  }();
package/main.js CHANGED
@@ -18,6 +18,7 @@ class WebScraper
18
18
  this.maxArticles = config.maxArticles || Infinity;
19
19
  this.crawlingDelay = config.crawlingDelay ?? 1000;
20
20
  this.batchSize = config.batchSize || 5;
21
+ this.minContentLength = config.minContentLength || 400;
21
22
 
22
23
  // Output paths setup
23
24
  this.scrapResultPath = config.scrapResultPath || "./dataset";
@@ -469,7 +470,7 @@ class WebScraper
469
470
  let processed = content;
470
471
  // Remove unwanted fixed text
471
472
  processed = processed.replace( /\[You can read more about this here\]/g, "" ).trim();
472
- // Trim each line and remove extra newlines
473
+ processed = processed.replace( /Click on the image to view the large size/g, "" );
473
474
  processed = processed
474
475
  .split( "\n" )
475
476
  .map( line => { return line.trim() })
@@ -525,7 +526,7 @@ class WebScraper
525
526
  ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
526
527
  ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
527
528
  ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
528
- dateScrapedDate: new Date().toISOString(),
529
+ dataScrapedDate: new Date().toISOString(),
529
530
  originalHtml: html,
530
531
  };
531
532
  }
@@ -663,7 +664,7 @@ class WebScraper
663
664
 
664
665
  const hasInvalidPhrases = invalidPhrases.some( phrase => { return cleanContent.includes( phrase ) });
665
666
  // Check content length
666
- if ( cleanContent.length < 100 || hasInvalidPhrases )
667
+ if ( cleanContent.length < this.minContentLength || hasInvalidPhrases )
667
668
  {
668
669
  return false;
669
670
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "4.2.3",
3
+ "version": "4.3.1",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",
@@ -24,8 +24,8 @@
24
24
  "description": "",
25
25
  "dependencies": {
26
26
  "@mozilla/readability": "^0.6.0",
27
- "axios": "^1.8.3",
28
- "eslint": "^9.17.0",
27
+ "axios": "^1.8.4",
28
+ "eslint": "^9.23.0",
29
29
  "jsdom": "^26.0.0",
30
30
  "puppeteer": "^24.1.1",
31
31
  "puppeteer-real-browser": "^1.3.22"