clean-web-scraper 4.0.0 → 4.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/example-usage.js +20 -15
  2. package/main.js +5 -10
  3. package/package.json +1 -1
package/example-usage.js CHANGED
@@ -37,7 +37,7 @@ async function palianswers ( enable )
37
37
  textOutputPath: "./dataset/palianswers/texts",
38
38
  csvOutputPath: "./dataset/palianswers/train.csv",
39
39
  includeMetadata: true,
40
- metadataFields: ["author", "title", "description", "dateScrapedDate"],
40
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
41
41
  axiosRetryDelay: 10000,
42
42
  crawlingDelay: 0
43
43
  };
@@ -51,14 +51,15 @@ async function khameneiIrFreePalestineTag ( enable )
51
51
  startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
52
52
  maxDepth: 1,
53
53
  exactExcludeList: [
54
- "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#"
54
+ "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
55
+ "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100"
55
56
  ],
56
57
  scrapResultPath: "./dataset/khamenei-ir-free-palestine-tag/website",
57
58
  jsonlOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.jsonl",
58
59
  textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
59
60
  csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
60
61
  includeMetadata: true,
61
- metadataFields: ["author", "title", "description", "dateScrapedDate"],
62
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
62
63
  axiosRetryDelay: 10000,
63
64
  };
64
65
  return await runScraper( config, enable );
@@ -84,7 +85,7 @@ async function decolonizepalestine ( enable )
84
85
  textOutputPath: "./dataset/decolonizepalestine/texts",
85
86
  csvOutputPath: "./dataset/decolonizepalestine/train.csv",
86
87
  includeMetadata: true,
87
- metadataFields: ["author", "title", "description", "dateScrapedDate"],
88
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
88
89
  axiosRetryDelay: 10000,
89
90
  };
90
91
  return await runScraper( config, enable );
@@ -107,6 +108,8 @@ async function electronicintifada ( enable )
107
108
  "https://electronicintifada.net/search/site/",
108
109
  "https://electronicintifada.net/news",
109
110
  "https://electronicintifada.net/opinion",
111
+ "https://electronicintifada.net/about-ei",
112
+ "https://electronicintifada.net/review"
110
113
  ],
111
114
  exactExcludeList: [
112
115
  "https://electronicintifada.net",
@@ -118,17 +121,19 @@ async function electronicintifada ( enable )
118
121
  textOutputPath: "./dataset/electronicintifada/texts",
119
122
  csvOutputPath: "./dataset/electronicintifada/train.csv",
120
123
  includeMetadata: true,
121
- metadataFields: ["author", "title", "description", "dateScrapedDate"],
124
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
122
125
  maxDepth: 16,
123
126
  maxArticles: 2000,
124
127
  axiosHeaders: headers,
128
+ axiosMaxRetries: 2,
125
129
  axiosRetryDelay: 10000,
126
130
  axiosProxy: {
127
131
  host: "localhost",
128
132
  port: 2080,
129
133
  protocol: "http"
130
134
  },
131
- useProxyAsFallback: true
135
+ useProxyAsFallback: true,
136
+ crawlingDelay: 0
132
137
  };
133
138
  return await runScraper( config, enable );
134
139
  }
@@ -138,14 +143,14 @@ async function standWithPalestine ( enable )
138
143
  const config = {
139
144
  baseURL: "https://stand-with-palestine.org/blogs",
140
145
  startURL: "https://stand-with-palestine.org/blogs",
146
+ exactExcludeList: ["https://stand-with-palestine.org/blogs"],
141
147
  scrapResultPath: "./dataset/stand-with-palestine/website",
142
148
  jsonlOutputPath: "./dataset/stand-with-palestine/train.jsonl",
143
149
  textOutputPath: "./dataset/stand-with-palestine/texts",
144
150
  csvOutputPath: "./dataset/stand-with-palestine/train.csv",
145
- exactExcludeList: ["https://stand-with-palestine.org/blogs"],
146
151
  axiosHeaders: headers,
147
152
  includeMetadata: true,
148
- metadataFields: ["author", "title", "description", "dateScrapedDate"]
153
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"]
149
154
  };
150
155
  return await runScraper( config, enable );
151
156
  }
@@ -178,18 +183,18 @@ async function mondoweiss ( enable )
178
183
  textOutputPath: "./dataset/mondoweiss/texts",
179
184
  csvOutputPath: "./dataset/mondoweiss/train.csv",
180
185
  maxArticles: 2500,
181
- axiosMaxRetries: 3,
186
+ maxDepth: 15,
182
187
  axiosHeaders: headers,
188
+ axiosMaxRetries: 3,
189
+ axiosRetryDelay: 10000,
183
190
  axiosProxy: {
184
191
  host: "localhost",
185
192
  port: 2080,
186
193
  protocol: "http"
187
194
  },
188
- maxDepth: 15,
189
- axiosRetryDelay: 10000,
195
+ useProxyAsFallback: true,
190
196
  includeMetadata: true,
191
- metadataFields: ["author", "title", "description", "dateScrapedDate"],
192
- useProxyAsFallback: true
197
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
193
198
  };
194
199
  return await runScraper( config, enable );
195
200
  }
@@ -211,7 +216,7 @@ async function bdsmovement ( enable )
211
216
  textOutputPath: "./dataset/bdsmovement/texts",
212
217
  csvOutputPath: "./dataset/bdsmovement/train.csv",
213
218
  includeMetadata: true,
214
- metadataFields: ["author", "title", "description", "dateScrapedDate"],
219
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
215
220
  puppeteerRealProxy: {
216
221
  host: "socks5://127.0.0.1",
217
222
  port: "2080",
@@ -249,7 +254,7 @@ async function palestineremembered ( enable )
249
254
  textOutputPath: "./dataset/palestineremembered/texts",
250
255
  csvOutputPath: "./dataset/palestineremembered/train.csv",
251
256
  includeMetadata: true,
252
- metadataFields: ["author", "title", "description", "dateScrapedDate"],
257
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
253
258
  axiosProxy: {
254
259
  host: "localhost",
255
260
  port: 2080,
package/main.js CHANGED
@@ -135,16 +135,14 @@ class WebScraper
135
135
 
136
136
  if ( !this.isExcluded( url ) )
137
137
  {
138
- const reader = new Readability( document, {
139
- charThreshold: 500,
140
- nbTopCandidates: 20
141
- });
138
+ const reader = new Readability( document );
142
139
  const article = reader.parse();
143
140
  if ( article )
144
141
  {
145
142
  if ( this.hasValidPageContent( article.textContent ) )
146
143
  {
147
144
  const metadata = this.extractMetadata( url, document );
145
+ metadata.articleTitle = article.title || "";
148
146
  this.saveArticle( url, article.textContent, metadata );
149
147
  }
150
148
  else
@@ -503,17 +501,14 @@ class WebScraper
503
501
  {
504
502
  return {
505
503
  url,
506
- title: document.title,
504
+ pageTitle: document.title,
507
505
  description: document.querySelector( "meta[name=\"description\"]" )?.content,
508
506
  keywords: document.querySelector( "meta[name=\"keywords\"]" )?.content,
509
507
  author: document.querySelector( "meta[name=\"author\"]" )?.content,
510
- language:
511
- document.documentElement.lang ||
512
- document.querySelector( "html" )?.getAttribute( "lang" ),
508
+ language: document.documentElement.lang || document.querySelector( "html" )?.getAttribute( "lang" ),
513
509
  canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
514
510
  ogTitle: document.querySelector( "meta[property=\"og:title\"]" )?.content,
515
- ogDescription: document.querySelector( "meta[property=\"og:description\"]" )
516
- ?.content,
511
+ ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
517
512
  ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
518
513
  ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
519
514
  dateScrapedDate: new Date().toISOString()
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "4.0.0",
3
+ "version": "4.0.2",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",