clean-web-scraper 3.7.6 → 3.8.0

This diff shows the changes between two publicly available versions of this package, as they appear in the public registry to which they were released. It is provided for informational purposes only.
package/example-usage.js CHANGED
@@ -10,6 +10,7 @@ const headers = {
10
10
 
11
11
  async function palianswers ( enable )
12
12
  {
13
+ // https://palianswers.com
13
14
  const scraper = new WebScraper({
14
15
  baseURL: "https://palianswers.com",
15
16
  excludeList: [
@@ -28,7 +29,7 @@ async function palianswers ( enable )
28
29
  textOutputPath: "./dataset/palianswers/texts",
29
30
  csvOutputPath: "./dataset/palianswers/train.csv",
30
31
  includeMetadata: true,
31
- metadataFields: ["author", "title", "description"]
32
+ metadataFields: ["author", "title", "description", "dateScrapedDate"]
32
33
  });
33
34
  if ( enable )
34
35
  {
@@ -44,7 +45,7 @@ async function khameneiIrFreePalestineTag ( enable )
44
45
  const scraper = new WebScraper({
45
46
  baseURL: "https://english.khamenei.ir/news",
46
47
  startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
47
- maxDepth: 3,
48
+ maxDepth: 1,
48
49
  exactExcludeList: [
49
50
  "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#"
50
51
  ],
@@ -53,7 +54,7 @@ async function khameneiIrFreePalestineTag ( enable )
53
54
  textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
54
55
  csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
55
56
  includeMetadata: true,
56
- metadataFields: ["author", "title", "description"]
57
+ metadataFields: ["author", "title", "description", "dateScrapedDate"]
57
58
  });
58
59
  if ( enable )
59
60
  {
@@ -64,6 +65,7 @@ async function khameneiIrFreePalestineTag ( enable )
64
65
 
65
66
  async function decolonizepalestine ( enable )
66
67
  {
68
+ // https://decolonizepalestine.com
67
69
  const scraper = new WebScraper({
68
70
  baseURL: "https://decolonizepalestine.com",
69
71
  excludeList: [
@@ -82,7 +84,7 @@ async function decolonizepalestine ( enable )
82
84
  textOutputPath: "./dataset/decolonizepalestine/texts",
83
85
  csvOutputPath: "./dataset/decolonizepalestine/train.csv",
84
86
  includeMetadata: true,
85
- metadataFields: ["author", "title", "description"]
87
+ metadataFields: ["author", "title", "description", "dateScrapedDate"]
86
88
  });
87
89
  if ( enable )
88
90
  {
@@ -93,6 +95,7 @@ async function decolonizepalestine ( enable )
93
95
 
94
96
  async function bdsmovement ( enable )
95
97
  {
98
+ // https://bdsmovement.net
96
99
  const scraper = new WebScraper({
97
100
  baseURL: "https://bdsmovement.net",
98
101
  excludeList: [
@@ -108,7 +111,7 @@ async function bdsmovement ( enable )
108
111
  textOutputPath: "./dataset/bdsmovement/texts",
109
112
  csvOutputPath: "./dataset/bdsmovement/train.csv",
110
113
  includeMetadata: true,
111
- metadataFields: ["author", "title", "description"],
114
+ metadataFields: ["author", "title", "description", "dateScrapedDate"],
112
115
  puppeteerProxy: "socks5://127.0.0.1:2080",
113
116
  puppeteerExecutablePath: "/usr/bin/chromium",
114
117
  puppeteerRealProxy: {
@@ -125,6 +128,7 @@ async function bdsmovement ( enable )
125
128
 
126
129
  async function electronicintifada ( enable )
127
130
  {
131
+ // https://electronicintifada.net
128
132
  const scraper = new WebScraper({
129
133
  baseURL: "https://electronicintifada.net",
130
134
  excludeList: [
@@ -153,7 +157,7 @@ async function electronicintifada ( enable )
153
157
  includeMetadata: true,
154
158
  maxArticles: 2000,
155
159
  axiosHeaders: headers,
156
- metadataFields: ["author", "title", "description"]
160
+ metadataFields: ["author", "title", "description", "dateScrapedDate"]
157
161
  });
158
162
  if ( enable )
159
163
  {
@@ -192,7 +196,7 @@ async function palestineremembered ( enable )
192
196
  textOutputPath: "./dataset/palestineremembered/texts",
193
197
  csvOutputPath: "./dataset/palestineremembered/train.csv",
194
198
  includeMetadata: true,
195
- metadataFields: ["author", "title", "description"],
199
+ metadataFields: ["author", "title", "description", "dateScrapedDate"],
196
200
  axiosProxy: {
197
201
  host: "localhost",
198
202
  port: 2080,
@@ -206,13 +210,63 @@ async function palestineremembered ( enable )
206
210
  return scraper;
207
211
  }
208
212
 
213
+ async function standWithPalestine ( enable )
214
+ {
215
+ const scraper = new WebScraper({
216
+ baseURL: "https://stand-with-palestine.org/blogs",
217
+ startURL: "https://stand-with-palestine.org/blogs",
218
+ scrapResultPath: "./dataset/stand-with-palestine/website",
219
+ jsonlOutputPath: "./dataset/stand-with-palestine/train.jsonl",
220
+ textOutputPath: "./dataset/stand-with-palestine/texts",
221
+ csvOutputPath: "./dataset/stand-with-palestine/train.csv",
222
+ exactExcludeList: ["https://stand-with-palestine.org/blogs"],
223
+ axiosHeaders: headers,
224
+ includeMetadata: true,
225
+ metadataFields: ["author", "title", "description", "dateScrapedDate"]
226
+ });
227
+ if ( enable )
228
+ {
229
+ await scraper.start();
230
+ }
231
+ return scraper;
232
+ }
233
+
234
+ async function mondoweiss ( enable )
235
+ {
236
+ // https://mondoweiss.net
237
+ const scraper = new WebScraper({
238
+ baseURL: "https://mondoweiss.net",
239
+ excludeList: [
240
+ "https://mondoweiss.net/donate",
241
+ "https://mondoweiss.net/advertise/",
242
+ "https://mondoweiss.net/contact/",
243
+ "https://mondoweiss.net/recent-comments/"
244
+ ],
245
+ scrapResultPath: "./dataset/mondoweiss/website",
246
+ jsonlOutputPath: "./dataset/mondoweiss/train.jsonl",
247
+ textOutputPath: "./dataset/mondoweiss/texts",
248
+ csvOutputPath: "./dataset/mondoweiss/train.csv",
249
+ includeMetadata: true,
250
+ maxArticles: 2500,
251
+ axiosHeaders: headers,
252
+ metadataFields: ["author", "title", "description", "dateScrapedDate"]
253
+ });
254
+ if ( enable )
255
+ {
256
+ await scraper.start();
257
+ }
258
+ return scraper;
259
+ }
260
+
209
261
 
210
262
  void async function main ()
211
263
  {
212
264
  const palianswersScraper = await palianswers( false );
213
265
  const decolonizepalestineScraper = await decolonizepalestine( false );
214
266
  const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( false );
215
- const electronicintifadaScraper = await electronicintifada( true );
267
+ const electronicintifadaScraper = await electronicintifada( false );
268
+ const standWithPalestineScraper = await standWithPalestine( false );
269
+ const mondoweisScraper = await mondoweiss( true );
216
270
  const bdsmovementScraper = await bdsmovement( false );
217
271
  const palestinerememberedScraper = await palestineremembered( false );
218
272
 
@@ -221,10 +275,10 @@ void async function main ()
221
275
  decolonizepalestineScraper,
222
276
  khameneiIrFreePalestineTagScraper,
223
277
  electronicintifadaScraper,
224
- // bdsmovementScraper,
225
- // palestinerememberedScraper,
278
+ standWithPalestineScraper,
279
+ mondoweisScraper
226
280
  ] );
227
-
228
- // 7 https://stand-with-palestine.org/blogs
229
- // https://mondoweiss.net
230
281
  }()
282
+
283
+
284
+ // https://mondoweiss.net
package/main.js CHANGED
@@ -161,7 +161,6 @@ class WebScraper
161
161
  if ( this.hasValidPageContent( article.textContent ) )
162
162
  {
163
163
  const metadata = this.extractMetadata( url, document );
164
- metadata.depth = depth;
165
164
  this.saveArticle( url, article.textContent, metadata );
166
165
  }
167
166
  else
@@ -262,7 +261,7 @@ class WebScraper
262
261
  }
263
262
  }
264
263
 
265
- hasReachedMax ( depth )
264
+ hasReachedMax ( depth = 0 )
266
265
  {
267
266
  if ( this.allProcessedContent.length >= this.maxArticles || depth > this.maxDepth )
268
267
  {
@@ -555,7 +554,7 @@ class WebScraper
555
554
  ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
556
555
  ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
557
556
  ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
558
- dateScraped: new Date().toISOString()
557
+ dateScrapedDate: new Date().toISOString()
559
558
  };
560
559
  }
561
560
 
@@ -572,7 +571,7 @@ class WebScraper
572
571
  {
573
572
  try
574
573
  {
575
- if ( this.hasReachedMax( depth ) )
574
+ if ( this.hasReachedMax( ) )
576
575
  {
577
576
  throw new Error( "Max reached" );
578
577
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.7.6",
3
+ "version": "3.8.0",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",