clean-web-scraper 3.8.2 → 3.8.4

This diff shows the changes between publicly available versions of this package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions exactly as they appear in their respective public registries.
package/example-usage.js CHANGED
@@ -30,7 +30,8 @@ async function palianswers ( enable )
30
30
  csvOutputPath: "./dataset/palianswers/train.csv",
31
31
  includeMetadata: true,
32
32
  metadataFields: ["author", "title", "description", "dateScrapedDate"],
33
- retryDelay: 10000
33
+ retryDelay: 10000,
34
+ concurrencyLimit: 4,
34
35
  });
35
36
  if ( enable )
36
37
  {
@@ -56,7 +57,8 @@ async function khameneiIrFreePalestineTag ( enable )
56
57
  csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
57
58
  includeMetadata: true,
58
59
  metadataFields: ["author", "title", "description", "dateScrapedDate"],
59
- retryDelay: 10000
60
+ retryDelay: 10000,
61
+ concurrencyLimit: 4,
60
62
  });
61
63
  if ( enable )
62
64
  {
@@ -87,7 +89,8 @@ async function decolonizepalestine ( enable )
87
89
  csvOutputPath: "./dataset/decolonizepalestine/train.csv",
88
90
  includeMetadata: true,
89
91
  metadataFields: ["author", "title", "description", "dateScrapedDate"],
90
- retryDelay: 10000
92
+ retryDelay: 10000,
93
+ concurrencyLimit: 4,
91
94
  });
92
95
  if ( enable )
93
96
  {
@@ -128,6 +131,7 @@ async function electronicintifada ( enable )
128
131
  metadataFields: ["author", "title", "description", "dateScrapedDate"],
129
132
  maxDepth: 10,
130
133
  maxArticles: 2000,
134
+ concurrencyLimit: 4,
131
135
  axiosHeaders: headers,
132
136
  retryDelay: 10000
133
137
  });
@@ -189,6 +193,7 @@ async function mondoweiss ( enable )
189
193
  csvOutputPath: "./dataset/mondoweiss/train.csv",
190
194
  maxArticles: 2500,
191
195
  maxRetries: 2,
196
+ concurrencyLimit: 4,
192
197
  axiosHeaders: headers,
193
198
  axiosProxy: {
194
199
  host: "localhost",
package/main.js CHANGED
@@ -154,6 +154,7 @@ class WebScraper
154
154
  }
155
155
  try
156
156
  {
157
+ await WebScraper.sleep( 5000 );
157
158
  const data = await this.fetchContent( url );
158
159
  if ( !data ) return;
159
160
  const dom = new JSDOM( data, { url });
@@ -191,7 +192,6 @@ class WebScraper
191
192
  {
192
193
  return;
193
194
  }
194
- await WebScraper.sleep( 5000 );
195
195
  const batch = unvisitedLinks.slice( i, i + this.concurrencyLimit );
196
196
  const results = await Promise.allSettled( batch.map( link => { return this.fetchPage( link, depth + 1 ) }) );
197
197
 
@@ -585,7 +585,7 @@ class WebScraper
585
585
  {
586
586
  if ( this.hasReachedMax( ) )
587
587
  {
588
- throw new Error( "Max reached" );
588
+ break;
589
589
  }
590
590
  return await axios.get( url, options );
591
591
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.8.2",
3
+ "version": "3.8.4",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",