clean-web-scraper 4.0.0 → 4.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/example-usage.js +16 -14
  2. package/main.js +5 -10
  3. package/package.json +1 -1
package/example-usage.js CHANGED
@@ -37,7 +37,7 @@ async function palianswers ( enable )
37
37
  textOutputPath: "./dataset/palianswers/texts",
38
38
  csvOutputPath: "./dataset/palianswers/train.csv",
39
39
  includeMetadata: true,
40
- metadataFields: ["author", "title", "description", "dateScrapedDate"],
40
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
41
41
  axiosRetryDelay: 10000,
42
42
  crawlingDelay: 0
43
43
  };
@@ -58,7 +58,7 @@ async function khameneiIrFreePalestineTag ( enable )
58
58
  textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
59
59
  csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
60
60
  includeMetadata: true,
61
- metadataFields: ["author", "title", "description", "dateScrapedDate"],
61
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
62
62
  axiosRetryDelay: 10000,
63
63
  };
64
64
  return await runScraper( config, enable );
@@ -84,7 +84,7 @@ async function decolonizepalestine ( enable )
84
84
  textOutputPath: "./dataset/decolonizepalestine/texts",
85
85
  csvOutputPath: "./dataset/decolonizepalestine/train.csv",
86
86
  includeMetadata: true,
87
- metadataFields: ["author", "title", "description", "dateScrapedDate"],
87
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
88
88
  axiosRetryDelay: 10000,
89
89
  };
90
90
  return await runScraper( config, enable );
@@ -118,17 +118,19 @@ async function electronicintifada ( enable )
118
118
  textOutputPath: "./dataset/electronicintifada/texts",
119
119
  csvOutputPath: "./dataset/electronicintifada/train.csv",
120
120
  includeMetadata: true,
121
- metadataFields: ["author", "title", "description", "dateScrapedDate"],
121
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
122
122
  maxDepth: 16,
123
123
  maxArticles: 2000,
124
124
  axiosHeaders: headers,
125
+ axiosMaxRetries: 2,
125
126
  axiosRetryDelay: 10000,
126
127
  axiosProxy: {
127
128
  host: "localhost",
128
129
  port: 2080,
129
130
  protocol: "http"
130
131
  },
131
- useProxyAsFallback: true
132
+ useProxyAsFallback: true,
133
+ crawlingDelay: 0
132
134
  };
133
135
  return await runScraper( config, enable );
134
136
  }
@@ -138,14 +140,14 @@ async function standWithPalestine ( enable )
138
140
  const config = {
139
141
  baseURL: "https://stand-with-palestine.org/blogs",
140
142
  startURL: "https://stand-with-palestine.org/blogs",
143
+ exactExcludeList: ["https://stand-with-palestine.org/blogs"],
141
144
  scrapResultPath: "./dataset/stand-with-palestine/website",
142
145
  jsonlOutputPath: "./dataset/stand-with-palestine/train.jsonl",
143
146
  textOutputPath: "./dataset/stand-with-palestine/texts",
144
147
  csvOutputPath: "./dataset/stand-with-palestine/train.csv",
145
- exactExcludeList: ["https://stand-with-palestine.org/blogs"],
146
148
  axiosHeaders: headers,
147
149
  includeMetadata: true,
148
- metadataFields: ["author", "title", "description", "dateScrapedDate"]
150
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"]
149
151
  };
150
152
  return await runScraper( config, enable );
151
153
  }
@@ -178,18 +180,18 @@ async function mondoweiss ( enable )
178
180
  textOutputPath: "./dataset/mondoweiss/texts",
179
181
  csvOutputPath: "./dataset/mondoweiss/train.csv",
180
182
  maxArticles: 2500,
181
- axiosMaxRetries: 3,
183
+ maxDepth: 15,
182
184
  axiosHeaders: headers,
185
+ axiosMaxRetries: 3,
186
+ axiosRetryDelay: 10000,
183
187
  axiosProxy: {
184
188
  host: "localhost",
185
189
  port: 2080,
186
190
  protocol: "http"
187
191
  },
188
- maxDepth: 15,
189
- axiosRetryDelay: 10000,
192
+ useProxyAsFallback: true,
190
193
  includeMetadata: true,
191
- metadataFields: ["author", "title", "description", "dateScrapedDate"],
192
- useProxyAsFallback: true
194
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
193
195
  };
194
196
  return await runScraper( config, enable );
195
197
  }
@@ -211,7 +213,7 @@ async function bdsmovement ( enable )
211
213
  textOutputPath: "./dataset/bdsmovement/texts",
212
214
  csvOutputPath: "./dataset/bdsmovement/train.csv",
213
215
  includeMetadata: true,
214
- metadataFields: ["author", "title", "description", "dateScrapedDate"],
216
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
215
217
  puppeteerRealProxy: {
216
218
  host: "socks5://127.0.0.1",
217
219
  port: "2080",
@@ -249,7 +251,7 @@ async function palestineremembered ( enable )
249
251
  textOutputPath: "./dataset/palestineremembered/texts",
250
252
  csvOutputPath: "./dataset/palestineremembered/train.csv",
251
253
  includeMetadata: true,
252
- metadataFields: ["author", "title", "description", "dateScrapedDate"],
254
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
253
255
  axiosProxy: {
254
256
  host: "localhost",
255
257
  port: 2080,
package/main.js CHANGED
@@ -135,16 +135,14 @@ class WebScraper
135
135
 
136
136
  if ( !this.isExcluded( url ) )
137
137
  {
138
- const reader = new Readability( document, {
139
- charThreshold: 500,
140
- nbTopCandidates: 20
141
- });
138
+ const reader = new Readability( document );
142
139
  const article = reader.parse();
143
140
  if ( article )
144
141
  {
145
142
  if ( this.hasValidPageContent( article.textContent ) )
146
143
  {
147
144
  const metadata = this.extractMetadata( url, document );
145
+ metadata.articleTitle = article.title || "";
148
146
  this.saveArticle( url, article.textContent, metadata );
149
147
  }
150
148
  else
@@ -503,17 +501,14 @@ class WebScraper
503
501
  {
504
502
  return {
505
503
  url,
506
- title: document.title,
504
+ pageTitle: document.title,
507
505
  description: document.querySelector( "meta[name=\"description\"]" )?.content,
508
506
  keywords: document.querySelector( "meta[name=\"keywords\"]" )?.content,
509
507
  author: document.querySelector( "meta[name=\"author\"]" )?.content,
510
- language:
511
- document.documentElement.lang ||
512
- document.querySelector( "html" )?.getAttribute( "lang" ),
508
+ language: document.documentElement.lang || document.querySelector( "html" )?.getAttribute( "lang" ),
513
509
  canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
514
510
  ogTitle: document.querySelector( "meta[property=\"og:title\"]" )?.content,
515
- ogDescription: document.querySelector( "meta[property=\"og:description\"]" )
516
- ?.content,
511
+ ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
517
512
  ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
518
513
  ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
519
514
  dateScrapedDate: new Date().toISOString()
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "4.0.0",
3
+ "version": "4.0.1",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",