clean-web-scraper 3.7.6 → 3.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/example-usage.js +84 -14
  2. package/main.js +10 -5
  3. package/package.json +1 -1
package/example-usage.js CHANGED
@@ -10,6 +10,7 @@ const headers = {
 
 async function palianswers ( enable )
 {
+	// https://palianswers.com
 	const scraper = new WebScraper({
 		baseURL: "https://palianswers.com",
 		excludeList: [
@@ -28,7 +29,7 @@ async function palianswers ( enable )
 		textOutputPath: "./dataset/palianswers/texts",
 		csvOutputPath: "./dataset/palianswers/train.csv",
 		includeMetadata: true,
-		metadataFields: ["author", "title", "description"]
+		metadataFields: ["author", "title", "description", "dateScrapedDate"]
 	});
 	if ( enable )
 	{
@@ -44,7 +45,7 @@ async function khameneiIrFreePalestineTag ( enable )
 	const scraper = new WebScraper({
 		baseURL: "https://english.khamenei.ir/news",
 		startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
-		maxDepth: 3,
+		maxDepth: 1,
 		exactExcludeList: [
 			"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#"
 		],
@@ -53,7 +54,7 @@ async function khameneiIrFreePalestineTag ( enable )
 		textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
 		csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
 		includeMetadata: true,
-		metadataFields: ["author", "title", "description"]
+		metadataFields: ["author", "title", "description", "dateScrapedDate"]
 	});
 	if ( enable )
 	{
@@ -64,6 +65,7 @@ async function khameneiIrFreePalestineTag ( enable )
 
 async function decolonizepalestine ( enable )
 {
+	// https://decolonizepalestine.com
 	const scraper = new WebScraper({
 		baseURL: "https://decolonizepalestine.com",
 		excludeList: [
@@ -82,7 +84,7 @@ async function decolonizepalestine ( enable )
 		textOutputPath: "./dataset/decolonizepalestine/texts",
 		csvOutputPath: "./dataset/decolonizepalestine/train.csv",
 		includeMetadata: true,
-		metadataFields: ["author", "title", "description"]
+		metadataFields: ["author", "title", "description", "dateScrapedDate"]
 	});
 	if ( enable )
 	{
@@ -93,6 +95,7 @@ async function decolonizepalestine ( enable )
 
 async function bdsmovement ( enable )
 {
+	// https://bdsmovement.net
 	const scraper = new WebScraper({
 		baseURL: "https://bdsmovement.net",
 		excludeList: [
@@ -108,7 +111,7 @@ async function bdsmovement ( enable )
 		textOutputPath: "./dataset/bdsmovement/texts",
 		csvOutputPath: "./dataset/bdsmovement/train.csv",
 		includeMetadata: true,
-		metadataFields: ["author", "title", "description"],
+		metadataFields: ["author", "title", "description", "dateScrapedDate"],
 		puppeteerProxy: "socks5://127.0.0.1:2080",
 		puppeteerExecutablePath: "/usr/bin/chromium",
 		puppeteerRealProxy: {
@@ -125,6 +128,7 @@ async function bdsmovement ( enable )
 
 async function electronicintifada ( enable )
 {
+	// https://electronicintifada.net
 	const scraper = new WebScraper({
 		baseURL: "https://electronicintifada.net",
 		excludeList: [
@@ -153,7 +157,7 @@ async function electronicintifada ( enable )
 		includeMetadata: true,
 		maxArticles: 2000,
 		axiosHeaders: headers,
-		metadataFields: ["author", "title", "description"]
+		metadataFields: ["author", "title", "description", "dateScrapedDate"]
 	});
 	if ( enable )
 	{
@@ -192,7 +196,7 @@ async function palestineremembered ( enable )
 		textOutputPath: "./dataset/palestineremembered/texts",
 		csvOutputPath: "./dataset/palestineremembered/train.csv",
 		includeMetadata: true,
-		metadataFields: ["author", "title", "description"],
+		metadataFields: ["author", "title", "description", "dateScrapedDate"],
 		axiosProxy: {
 			host: "localhost",
 			port: 2080,
@@ -206,13 +210,82 @@ async function palestineremembered ( enable )
 	return scraper;
 }
 
+async function standWithPalestine ( enable )
+{
+	const scraper = new WebScraper({
+		baseURL: "https://stand-with-palestine.org/blogs",
+		startURL: "https://stand-with-palestine.org/blogs",
+		scrapResultPath: "./dataset/stand-with-palestine/website",
+		jsonlOutputPath: "./dataset/stand-with-palestine/train.jsonl",
+		textOutputPath: "./dataset/stand-with-palestine/texts",
+		csvOutputPath: "./dataset/stand-with-palestine/train.csv",
+		exactExcludeList: ["https://stand-with-palestine.org/blogs"],
+		axiosHeaders: headers,
+		includeMetadata: true,
+		metadataFields: ["author", "title", "description", "dateScrapedDate"]
+	});
+	if ( enable )
+	{
+		await scraper.start();
+	}
+	return scraper;
+}
+
+async function mondoweiss ( enable )
+{
+	// https://mondoweiss.net
+	const scraper = new WebScraper({
+		baseURL: "https://mondoweiss.net",
+		excludeList: [
+			"https://mondoweiss.net/donate",
+			"https://mondoweiss.net/advertise/",
+			"https://mondoweiss.net/contact/",
+			"https://mondoweiss.net/recent-comments/",
+			"https://mondoweiss.net/email-newsletters",
+			"https://mondoweiss.net/author",
+			"https://mondoweiss.net/tag/"
+		],
+		exactExcludeList: [
+			"https://mondoweiss.net",
+			"https://mondoweiss.net/news/",
+			"https://mondoweiss.net/opinion/",
+			"https://mondoweiss.net/ways-to-give/",
+			"https://mondoweiss.net/media-analysis/",
+			"https://mondoweiss.net/culture/",
+			"https://mondoweiss.net/activism/",
+			"https://mondoweiss.net/news-letters/"
+		],
+		scrapResultPath: "./dataset/mondoweiss/website",
+		jsonlOutputPath: "./dataset/mondoweiss/train.jsonl",
+		textOutputPath: "./dataset/mondoweiss/texts",
+		csvOutputPath: "./dataset/mondoweiss/train.csv",
+		includeMetadata: true,
+		maxArticles: 2500,
+		maxRetries: 2,
+		axiosHeaders: headers,
+		axiosProxy: {
+			host: "localhost",
+			port: 2080,
+			protocol: "http"
+		},
+		metadataFields: ["author", "title", "description", "dateScrapedDate"]
+	});
+	if ( enable )
+	{
+		await scraper.start();
+	}
+	return scraper;
+}
+
 
 void async function main ()
 {
 	const palianswersScraper = await palianswers( false );
 	const decolonizepalestineScraper = await decolonizepalestine( false );
 	const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( false );
-	const electronicintifadaScraper = await electronicintifada( true );
+	const electronicintifadaScraper = await electronicintifada( false );
+	const standWithPalestineScraper = await standWithPalestine( false );
+	const mondoweisScraper = await mondoweiss( true );
 	const bdsmovementScraper = await bdsmovement( false );
 	const palestinerememberedScraper = await palestineremembered( false );
 
@@ -221,10 +294,7 @@ void async function main ()
 		decolonizepalestineScraper,
 		khameneiIrFreePalestineTagScraper,
 		electronicintifadaScraper,
-		// bdsmovementScraper,
-		// palestinerememberedScraper,
+		standWithPalestineScraper,
+		mondoweisScraper
 	] );
-
-	// 7 https://stand-with-palestine.org/blogs
-	// https://mondoweiss.net
-}()
+}()
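
Every configuration above now lists "dateScrapedDate" in metadataFields, which presumably needs to match the metadata key emitted by extractMetadata() after the rename in main.js (next file). A minimal sketch of the same pattern, assuming the package's main entry exports the WebScraper class; the target URL and output paths below are illustrative only, not taken from the package:

const WebScraper = require( "clean-web-scraper" ); // assumption: main.js exports the class

const scraper = new WebScraper({
	baseURL: "https://example.com",               // illustrative target
	scrapResultPath: "./dataset/example/website", // illustrative output path
	includeMetadata: true,
	// keep this field name in sync with the renamed key in extractMetadata()
	metadataFields: ["author", "title", "description", "dateScrapedDate"]
});

void async function ()
{
	await scraper.start(); // start() is the entry point used throughout example-usage.js
}();
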
package/main.js CHANGED
@@ -22,6 +22,7 @@ class WebScraper
 		exactExcludeList = [],
 		filterFileTypes,
 		excludedFileTypes,
+		removeURLFragment,
 
 		// Output paths
 		scrapResultPath = "./dataset",
@@ -72,6 +73,7 @@ class WebScraper
 		this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
 		this.filterFileTypes = filterFileTypes || true;
 		this.excludedFileTypes = excludedFileTypes || [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"];
+		this.removeURLFragment = removeURLFragment || true;
 
 		// Network configuration
 		this.axiosHeaders = axiosHeaders;
@@ -130,6 +132,10 @@ class WebScraper
 
 	async fetchPage ( url, depth )
 	{
+		if ( this.removeURLFragment )
+		{
+			url = url.split( "#" )[0];
+		}
 		if ( this.hasReachedMax( depth ) )
 		{
 			return;
@@ -161,7 +167,6 @@ class WebScraper
 			if ( this.hasValidPageContent( article.textContent ) )
 			{
 				const metadata = this.extractMetadata( url, document );
-				metadata.depth = depth;
 				this.saveArticle( url, article.textContent, metadata );
 			}
 			else
@@ -234,7 +239,7 @@ class WebScraper
 		}
 		catch ( error )
 		{
-			console.error( `Error fetching ${url}:`, error.message );
+			console.error( `Error fetching content ${url}:`, error.message );
 			if ( error.status = 403 && this.usePuppeteer )
 			{
 				try
@@ -262,7 +267,7 @@ class WebScraper
 		}
 	}
 
-	hasReachedMax ( depth )
+	hasReachedMax ( depth = 0 )
 	{
 		if ( this.allProcessedContent.length >= this.maxArticles || depth > this.maxDepth )
 		{
@@ -555,7 +560,7 @@ class WebScraper
 			ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
 			ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
 			ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
-			dateScraped: new Date().toISOString()
+			dateScrapedDate: new Date().toISOString()
 		};
 	}
 
@@ -572,7 +577,7 @@ class WebScraper
 	{
 		try
 		{
-			if ( this.hasReachedMax( depth ) )
+			if ( this.hasReachedMax( ) )
 			{
 				throw new Error( "Max reached" );
 			}
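
The new removeURLFragment option makes fetchPage() drop the "#fragment" part of a URL before the maxDepth / maxArticles checks run, so a page and its fragment-only variants are treated as the same URL. A minimal sketch of that normalization, reusing a URL from example-usage.js:

const url = "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#";
const normalized = url.split( "#" )[0]; // the same operation fetchPage() now applies
console.log( normalized );
// -> https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100
// Note: as written, `removeURLFragment || true` evaluates to true even when false is passed,
// so the fragment is currently always stripped.
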
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "clean-web-scraper",
-  "version": "3.7.6",
+  "version": "3.8.1",
   "main": "main.js",
   "scripts": {
     "start": "node main.js",