clean-web-scraper 3.9.3 → 3.9.5

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/example-usage.js CHANGED
@@ -30,8 +30,9 @@ async function palianswers ( enable )
30
30
  csvOutputPath: "./dataset/palianswers/train.csv",
31
31
  includeMetadata: true,
32
32
  metadataFields: ["author", "title", "description", "dateScrapedDate"],
33
- retryDelay: 10000,
33
+ axiosRetryDelay: 10000,
34
34
  concurrencyLimit: 4,
35
+ crawlingDelay: 0
35
36
  });
36
37
  if ( enable )
37
38
  {
@@ -57,7 +58,7 @@ async function khameneiIrFreePalestineTag ( enable )
57
58
  csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
58
59
  includeMetadata: true,
59
60
  metadataFields: ["author", "title", "description", "dateScrapedDate"],
60
- retryDelay: 10000,
61
+ axiosRetryDelay: 10000,
61
62
  concurrencyLimit: 4,
62
63
  });
63
64
  if ( enable )
@@ -89,7 +90,7 @@ async function decolonizepalestine ( enable )
89
90
  csvOutputPath: "./dataset/decolonizepalestine/train.csv",
90
91
  includeMetadata: true,
91
92
  metadataFields: ["author", "title", "description", "dateScrapedDate"],
92
- retryDelay: 10000,
93
+ axiosRetryDelay: 10000,
93
94
  concurrencyLimit: 4,
94
95
  });
95
96
  if ( enable )
@@ -133,7 +134,7 @@ async function electronicintifada ( enable )
133
134
  maxArticles: 2000,
134
135
  concurrencyLimit: 2,
135
136
  axiosHeaders: headers,
136
- retryDelay: 10000,
137
+ axiosRetryDelay: 10000,
137
138
  axiosProxy: {
138
139
  host: "localhost",
139
140
  port: 2080,
@@ -198,7 +199,7 @@ async function mondoweiss ( enable )
198
199
  textOutputPath: "./dataset/mondoweiss/texts",
199
200
  csvOutputPath: "./dataset/mondoweiss/train.csv",
200
201
  maxArticles: 2500,
201
- maxRetries: 3,
202
+ axiosMaxRetries: 3,
202
203
  concurrencyLimit: 3,
203
204
  axiosHeaders: headers,
204
205
  axiosProxy: {
@@ -207,7 +208,7 @@ async function mondoweiss ( enable )
207
208
  protocol: "http"
208
209
  },
209
210
  maxDepth: 15,
210
- retryDelay: 10000,
211
+ axiosRetryDelay: 10000,
211
212
  includeMetadata: true,
212
213
  metadataFields: ["author", "title", "description", "dateScrapedDate"],
213
214
  useProxyAsFallback: true
package/main.js CHANGED
@@ -16,7 +16,7 @@ class WebScraper
16
16
  this.maxDepth = config.maxDepth || Infinity;
17
17
  this.maxArticles = config.maxArticles || Infinity;
18
18
  this.concurrencyLimit = config.concurrencyLimit || 2;
19
- this.crawlingDelay = config.crawlingDelay || 1000;
19
+ this.crawlingDelay = config.crawlingDelay ?? 1000;
20
20
 
21
21
  // Output paths setup
22
22
  this.scrapResultPath = config.scrapResultPath || "./dataset";
@@ -126,7 +126,10 @@ class WebScraper
126
126
 
127
127
  try
128
128
  {
129
- await WebScraper.sleep( this.crawlingDelay );
129
+ if ( this.crawlingDelay )
130
+ {
131
+ await WebScraper.sleep( this.crawlingDelay );
132
+ }
130
133
  const data = await this.fetchContent( url );
131
134
  if ( !data ) continue;
132
135
 
@@ -690,7 +693,6 @@ class WebScraper
690
693
  this.ensureDirectory( path.join( __dirname, this.textOutputPathWithMeta ) );
691
694
  }
692
695
 
693
- // Helper method to ensure a directory exists
694
696
  ensureDirectory ( dirPath )
695
697
  {
696
698
  if ( !fs.existsSync( dirPath ) )
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.9.3",
3
+ "version": "3.9.5",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",