clean-web-scraper 3.8.3 → 3.8.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/example-usage.js CHANGED
@@ -30,7 +30,8 @@ async function palianswers ( enable )
30
30
  csvOutputPath: "./dataset/palianswers/train.csv",
31
31
  includeMetadata: true,
32
32
  metadataFields: ["author", "title", "description", "dateScrapedDate"],
33
- retryDelay: 10000
33
+ retryDelay: 10000,
34
+ concurrencyLimit: 4,
34
35
  });
35
36
  if ( enable )
36
37
  {
@@ -56,7 +57,8 @@ async function khameneiIrFreePalestineTag ( enable )
56
57
  csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
57
58
  includeMetadata: true,
58
59
  metadataFields: ["author", "title", "description", "dateScrapedDate"],
59
- retryDelay: 10000
60
+ retryDelay: 10000,
61
+ concurrencyLimit: 4,
60
62
  });
61
63
  if ( enable )
62
64
  {
@@ -87,7 +89,8 @@ async function decolonizepalestine ( enable )
87
89
  csvOutputPath: "./dataset/decolonizepalestine/train.csv",
88
90
  includeMetadata: true,
89
91
  metadataFields: ["author", "title", "description", "dateScrapedDate"],
90
- retryDelay: 10000
92
+ retryDelay: 10000,
93
+ concurrencyLimit: 4,
91
94
  });
92
95
  if ( enable )
93
96
  {
@@ -126,9 +129,9 @@ async function electronicintifada ( enable )
126
129
  csvOutputPath: "./dataset/electronicintifada/train.csv",
127
130
  includeMetadata: true,
128
131
  metadataFields: ["author", "title", "description", "dateScrapedDate"],
129
- maxDepth: 10,
132
+ maxDepth: 12,
130
133
  maxArticles: 2000,
131
- concurrencyLimit: 4,
134
+ concurrencyLimit: 3,
132
135
  axiosHeaders: headers,
133
136
  retryDelay: 10000
134
137
  });
package/main.js CHANGED
@@ -154,6 +154,7 @@ class WebScraper
154
154
  }
155
155
  try
156
156
  {
157
+ await WebScraper.sleep( 5000 );
157
158
  const data = await this.fetchContent( url );
158
159
  if ( !data ) return;
159
160
  const dom = new JSDOM( data, { url });
@@ -191,7 +192,6 @@ class WebScraper
191
192
  {
192
193
  return;
193
194
  }
194
- await WebScraper.sleep( 5000 );
195
195
  const batch = unvisitedLinks.slice( i, i + this.concurrencyLimit );
196
196
  const results = await Promise.allSettled( batch.map( link => { return this.fetchPage( link, depth + 1 ) }) );
197
197
 
@@ -575,7 +575,7 @@ class WebScraper
575
575
  const options = {
576
576
  responseType: "stream",
577
577
  maxRedirects: 5,
578
- timeout: 70000,
578
+ timeout: 20000,
579
579
  ...this.axiosOptions,
580
580
  };
581
581
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.8.3",
3
+ "version": "3.8.5",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",