clean-web-scraper 4.2.2 → 4.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -91,7 +91,7 @@ const docsScraper = new WebScraper({
91
91
  scrapResultPath: './datasets/docs',
92
92
  maxDepth: 3, // Optional: Maximum depth for recursive crawling
93
93
  includeMetadata: true, // Optional: Include metadata in output files
94
- metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
94
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate"],
95
95
  // Optional: Specify metadata fields to include
96
96
  });
97
97
 
@@ -101,7 +101,7 @@ const blogScraper = new WebScraper({
101
101
  scrapResultPath: './datasets/blog',
102
102
  maxDepth: 3, // Optional: Maximum depth for recursive crawling
103
103
  includeMetadata: true, // Optional: Include metadata in output files
104
- metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
104
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate"],
105
105
  // Optional: Specify metadata fields to include
106
106
  });
107
107
 
package/example-usage.js CHANGED
@@ -37,9 +37,8 @@ async function palianswers ( enable )
37
37
  textOutputPath: "./dataset/palianswers/texts",
38
38
  csvOutputPath: "./dataset/palianswers/train.csv",
39
39
  includeMetadata: true,
40
- metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
40
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
41
41
  axiosRetryDelay: 10000,
42
- crawlingDelay: 0
43
42
  };
44
43
  return await runScraper( config, enable );
45
44
  }
@@ -50,7 +49,7 @@ async function khameneiIrFreePalestineTag ( enable )
50
49
  baseURL: "https://english.khamenei.ir/news",
51
50
  startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
52
51
  maxDepth: 1,
53
- maxArticles: 2,
52
+ maxArticles: 200,
54
53
  exactExcludeList: [
55
54
  "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
56
55
  "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100"
@@ -60,7 +59,7 @@ async function khameneiIrFreePalestineTag ( enable )
60
59
  textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
61
60
  csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
62
61
  includeMetadata: true,
63
- metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
62
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
64
63
  axiosRetryDelay: 10000,
65
64
  };
66
65
  return await runScraper( config, enable );
@@ -72,8 +71,8 @@ async function khameneiIrPalestineSpecialPage ( enable )
72
71
  const config = {
73
72
  baseURL: "https://english.khamenei.ir/news",
74
73
  startURL: "https://english.khamenei.ir/palestine-special-page",
75
- maxDepth: 2,
76
- maxArticles: 2,
74
+ maxDepth: 1,
75
+ maxArticles: 200,
77
76
  exactExcludeList: [
78
77
  "https://english.khamenei.ir/palestine-special-page/"
79
78
  ],
@@ -82,7 +81,7 @@ async function khameneiIrPalestineSpecialPage ( enable )
82
81
  textOutputPath: "./dataset/khamenei-ir-palestine-special-page/texts",
83
82
  csvOutputPath: "./dataset/khamenei-ir-palestine-special-page/train.csv",
84
83
  includeMetadata: true,
85
- metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
84
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
86
85
  axiosRetryDelay: 10000
87
86
  };
88
87
  return await runScraper( config, enable );
@@ -103,13 +102,13 @@ async function decolonizepalestine ( enable )
103
102
  "https://decolonizepalestine.com/rainbow-washing",
104
103
  "https://decolonizepalestine.com/"
105
104
  ],
106
- maxArticles: 2,
105
+ maxArticles: 400,
107
106
  scrapResultPath: "./dataset/decolonizepalestine/website",
108
107
  jsonlOutputPath: "./dataset/decolonizepalestine/train.jsonl",
109
108
  textOutputPath: "./dataset/decolonizepalestine/texts",
110
109
  csvOutputPath: "./dataset/decolonizepalestine/train.csv",
111
110
  includeMetadata: true,
112
- metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
111
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
113
112
  axiosRetryDelay: 10000,
114
113
  };
115
114
  return await runScraper( config, enable );
@@ -123,7 +122,6 @@ async function electronicintifada ( enable )
123
122
  "https://electronicintifada.net/updates",
124
123
  "https://electronicintifada.net/taxonomy/term/",
125
124
  "https://electronicintifada.net/tags/",
126
- "https://electronicintifada.net/blog",
127
125
  "https://electronicintifada.net/people",
128
126
  "https://electronicintifada.net/location",
129
127
  "https://electronicintifada.net/file",
@@ -134,11 +132,13 @@ async function electronicintifada ( enable )
134
132
  "https://electronicintifada.net/opinion",
135
133
  "https://electronicintifada.net/about-ei",
136
134
  "https://electronicintifada.net/review",
137
- "https://electronicintifada.net/artmusicculture"
135
+ "https://electronicintifada.net/artmusicculture",
136
+ "https://electronicintifada.net/blog/editors",
137
+ "https://electronicintifada.net/blog"
138
138
  ],
139
139
  exactExcludeList: [
140
140
  "https://electronicintifada.net",
141
- "https://electronicintifada.net/blog",
141
+ "https://electronicintifada.net/blogs",
142
142
  "https://electronicintifada.net/review",
143
143
  ],
144
144
  scrapResultPath: "./dataset/electronicintifada/website",
@@ -146,11 +146,12 @@ async function electronicintifada ( enable )
146
146
  textOutputPath: "./dataset/electronicintifada/texts",
147
147
  csvOutputPath: "./dataset/electronicintifada/train.csv",
148
148
  includeMetadata: true,
149
- metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
150
- maxDepth: 16,
149
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
151
150
  maxArticles: 2000,
151
+ maxDepth: 16,
152
+ batchSize: 30,
152
153
  axiosHeaders: headers,
153
- axiosMaxRetries: 2,
154
+ axiosMaxRetries: 3,
154
155
  axiosRetryDelay: 10000,
155
156
  axiosProxy: {
156
157
  host: "localhost",
@@ -158,7 +159,6 @@ async function electronicintifada ( enable )
158
159
  protocol: "http"
159
160
  },
160
161
  useProxyAsFallback: true,
161
- crawlingDelay: 1
162
162
  };
163
163
  return await runScraper( config, enable );
164
164
  }
@@ -175,7 +175,7 @@ async function standWithPalestine ( enable )
175
175
  csvOutputPath: "./dataset/stand-with-palestine/train.csv",
176
176
  axiosHeaders: headers,
177
177
  includeMetadata: true,
178
- metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"]
178
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"]
179
179
  };
180
180
  return await runScraper( config, enable );
181
181
  }
@@ -211,20 +211,24 @@ async function mondoweiss ( enable )
211
211
  "https://mondoweiss.net/activism/",
212
212
  "https://mondoweiss.net/news-letters/",
213
213
  "https://mondoweiss.net/newsletters",
214
- /^https:\/\/mondoweiss\.net\/\d{4}\/\d{2}$/,
215
- /^https:\/\/mondoweiss\.net\/\d{4}\/$/,
216
214
  "https://mondoweiss.net/daily-headlines",
217
215
  "https://mondoweiss.net/palestineletter",
218
216
  "https://mondoweiss.net/podcasts/",
219
217
  "https://mondoweiss.net/the-shift",
220
- "https://mondoweiss.net/weekly-briefing"
218
+ "https://mondoweiss.net/weekly-briefing",
219
+ "https://mondoweiss.net/contact/",
220
+ /^https:\/\/mondoweiss\.net\/\d{4}\/\d{2}\/?$/,
221
+ /^https:\/\/mondoweiss\.net\/\d{4}\/?$/
221
222
  ],
222
223
  scrapResultPath: "./dataset/mondoweiss/website",
223
224
  jsonlOutputPath: "./dataset/mondoweiss/train.jsonl",
224
225
  textOutputPath: "./dataset/mondoweiss/texts",
225
226
  csvOutputPath: "./dataset/mondoweiss/train.csv",
227
+ includeMetadata: true,
228
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
226
229
  maxArticles: 2500,
227
230
  maxDepth: 15,
231
+ batchSize: 20,
228
232
  axiosHeaders: headers,
229
233
  axiosMaxRetries: 3,
230
234
  axiosRetryDelay: 10000,
@@ -234,10 +238,6 @@ async function mondoweiss ( enable )
234
238
  protocol: "http"
235
239
  },
236
240
  useProxyAsFallback: true,
237
- includeMetadata: true,
238
- metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
239
- crawlingDelay: 0,
240
- batchSize: 20
241
241
  };
242
242
  return await runScraper( config, enable );
243
243
  }
@@ -251,19 +251,36 @@ async function bdsmovement ( enable )
251
251
  "https://bdsmovement.net/privacy-policy",
252
252
  "https://bdsmovement.net/get-involved/join-a-bds-campaign",
253
253
  "https://bdsmovement.net/donate_",
254
+ "https://bdsmovement.net/donate",
254
255
  "https://bdsmovement.net/user",
255
- "https://bdsmovement.net/admin"
256
+ "https://bdsmovement.net/admin",
257
+ "https://bdsmovement.net/stay-updated",
258
+ "https://bdsmovement.net/join-a-bds-campaign",
259
+ "https://bdsmovement.net/contact-us",
260
+ "https://bdsmovement.net/taxonomy",
261
+ "https://bdsmovement.net/news-type",
262
+ "https://bdsmovement.net/cdn-cgi"
263
+ ],
264
+ exactExcludeList: [
265
+ "https://bdsmovement.net/",
266
+ "https://bdsmovement.net/shutdownnation",
267
+ "https://bdsmovement.net/campaigns",
268
+ "https://bdsmovement.net/resources",
269
+ /^https:\/\/bdsmovement\.net\/resources\?page=\d+$/,
270
+ /^https:\/\/bdsmovement\.net\/resources\?campaign=\d+$/,
271
+ /^https:\/\/bdsmovement\.net\/resources\?type=\d+$/,
272
+ /^https:\/\/bdsmovement\.net\/news\?type=\d+$/,
273
+ /^https:\/\/bdsmovement\.net\/news\?campaign=\d+$/,
256
274
  ],
257
275
  scrapResultPath: "./dataset/bdsmovement/website",
258
276
  jsonlOutputPath: "./dataset/bdsmovement/train.jsonl",
259
277
  textOutputPath: "./dataset/bdsmovement/texts",
260
278
  csvOutputPath: "./dataset/bdsmovement/train.csv",
261
279
  includeMetadata: true,
262
- metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
263
- puppeteerRealProxy: {
264
- host: "socks5://127.0.0.1",
265
- port: "2080",
266
- },
280
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
281
+ maxArticles: 2000,
282
+ maxDepth: 16,
283
+ batchSize: 20
267
284
  };
268
285
  return await runScraper( config, enable );
269
286
  }
@@ -297,7 +314,8 @@ async function palestineremembered ( enable )
297
314
  textOutputPath: "./dataset/palestineremembered/texts",
298
315
  csvOutputPath: "./dataset/palestineremembered/train.csv",
299
316
  includeMetadata: true,
300
- metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
317
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
318
+ batchSize: 10,
301
319
  axiosProxy: {
302
320
  host: "localhost",
303
321
  port: 2080,
@@ -309,23 +327,24 @@ async function palestineremembered ( enable )
309
327
 
310
328
  void async function main ()
311
329
  {
312
- // const palianswersScraper = await palianswers( true );
330
+ const palianswersScraper = await palianswers( true );
313
331
  const decolonizepalestineScraper = await decolonizepalestine( true );
314
332
  const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( true );
315
- // const khameneiIrPalestineSpecialPageScraper = await khameneiIrPalestineSpecialPage( true );
316
- // const electronicintifadaScraper = await electronicintifada( true );
317
- // const standWithPalestineScraper = await standWithPalestine( true );
318
- // const mondoweisScraper = await mondoweiss( true );
319
- // const bdsmovementScraper = await bdsmovement( false );
333
+ const khameneiIrPalestineSpecialPageScraper = await khameneiIrPalestineSpecialPage( true );
334
+ const electronicintifadaScraper = await electronicintifada( true );
335
+ const standWithPalestineScraper = await standWithPalestine( true );
336
+ const mondoweisScraper = await mondoweiss( true );
337
+ const bdsmovementScraper = await bdsmovement( true );
320
338
  // const palestinerememberedScraper = await palestineremembered( false );
321
339
 
322
340
  await WebScraper.combineResults( "./dataset/combined", [
323
- // palianswersScraper,
341
+ palianswersScraper,
324
342
  decolonizepalestineScraper,
325
343
  khameneiIrFreePalestineTagScraper,
326
- // khameneiIrPalestineSpecialPageScraper,
327
- // electronicintifadaScraper,
328
- // standWithPalestineScraper,
329
- // mondoweisScraper
344
+ khameneiIrPalestineSpecialPageScraper,
345
+ electronicintifadaScraper,
346
+ standWithPalestineScraper,
347
+ mondoweisScraper,
348
+ bdsmovementScraper,
330
349
  ] );
331
350
  }();
package/main.js CHANGED
@@ -18,6 +18,7 @@ class WebScraper
18
18
  this.maxArticles = config.maxArticles || Infinity;
19
19
  this.crawlingDelay = config.crawlingDelay ?? 1000;
20
20
  this.batchSize = config.batchSize || 5;
21
+ this.minContentLength = config.minContentLength || 400;
21
22
 
22
23
  // Output paths setup
23
24
  this.scrapResultPath = config.scrapResultPath || "./dataset";
@@ -469,7 +470,7 @@ class WebScraper
469
470
  let processed = content;
470
471
  // Remove unwanted fixed text
471
472
  processed = processed.replace( /\[You can read more about this here\]/g, "" ).trim();
472
- // Trim each line and remove extra newlines
473
+ processed = processed.replace( /Click on the image to view the large size/g, "" );
473
474
  processed = processed
474
475
  .split( "\n" )
475
476
  .map( line => { return line.trim() })
@@ -525,7 +526,7 @@ class WebScraper
525
526
  ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
526
527
  ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
527
528
  ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
528
- dateScrapedDate: new Date().toISOString(),
529
+ dataScrapedDate: new Date().toISOString(),
529
530
  originalHtml: html,
530
531
  };
531
532
  }
@@ -663,7 +664,7 @@ class WebScraper
663
664
 
664
665
  const hasInvalidPhrases = invalidPhrases.some( phrase => { return cleanContent.includes( phrase ) });
665
666
  // Check content length
666
- if ( cleanContent.length < 100 || hasInvalidPhrases )
667
+ if ( cleanContent.length < this.minContentLength || hasInvalidPhrases )
667
668
  {
668
669
  return false;
669
670
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "4.2.2",
3
+ "version": "4.3.0",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",
@@ -24,10 +24,10 @@
24
24
  "description": "",
25
25
  "dependencies": {
26
26
  "@mozilla/readability": "^0.6.0",
27
- "axios": "^1.7.9",
28
- "eslint": "^9.17.0",
27
+ "axios": "^1.8.4",
28
+ "eslint": "^9.23.0",
29
29
  "jsdom": "^26.0.0",
30
30
  "puppeteer": "^24.1.1",
31
31
  "puppeteer-real-browser": "^1.3.22"
32
32
  }
33
- }
33
+ }