clean-web-scraper 3.9.3 → 3.9.5
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
- package/example-usage.js +7 -6
- package/main.js +5 -3
- package/package.json +1 -1
package/example-usage.js
CHANGED
@@ -30,8 +30,9 @@ async function palianswers ( enable )
|
|
30
30
|
csvOutputPath: "./dataset/palianswers/train.csv",
|
31
31
|
includeMetadata: true,
|
32
32
|
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
33
|
-
|
33
|
+
axiosRetryDelay: 10000,
|
34
34
|
concurrencyLimit: 4,
|
35
|
+
crawlingDelay: 0
|
35
36
|
});
|
36
37
|
if ( enable )
|
37
38
|
{
|
@@ -57,7 +58,7 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
57
58
|
csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
|
58
59
|
includeMetadata: true,
|
59
60
|
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
60
|
-
|
61
|
+
axiosRetryDelay: 10000,
|
61
62
|
concurrencyLimit: 4,
|
62
63
|
});
|
63
64
|
if ( enable )
|
@@ -89,7 +90,7 @@ async function decolonizepalestine ( enable )
|
|
89
90
|
csvOutputPath: "./dataset/decolonizepalestine/train.csv",
|
90
91
|
includeMetadata: true,
|
91
92
|
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
92
|
-
|
93
|
+
axiosRetryDelay: 10000,
|
93
94
|
concurrencyLimit: 4,
|
94
95
|
});
|
95
96
|
if ( enable )
|
@@ -133,7 +134,7 @@ async function electronicintifada ( enable )
|
|
133
134
|
maxArticles: 2000,
|
134
135
|
concurrencyLimit: 2,
|
135
136
|
axiosHeaders: headers,
|
136
|
-
|
137
|
+
axiosRetryDelay: 10000,
|
137
138
|
axiosProxy: {
|
138
139
|
host: "localhost",
|
139
140
|
port: 2080,
|
@@ -198,7 +199,7 @@ async function mondoweiss ( enable )
|
|
198
199
|
textOutputPath: "./dataset/mondoweiss/texts",
|
199
200
|
csvOutputPath: "./dataset/mondoweiss/train.csv",
|
200
201
|
maxArticles: 2500,
|
201
|
-
|
202
|
+
axiosMaxRetries: 3,
|
202
203
|
concurrencyLimit: 3,
|
203
204
|
axiosHeaders: headers,
|
204
205
|
axiosProxy: {
|
@@ -207,7 +208,7 @@ async function mondoweiss ( enable )
|
|
207
208
|
protocol: "http"
|
208
209
|
},
|
209
210
|
maxDepth: 15,
|
210
|
-
|
211
|
+
axiosRetryDelay: 10000,
|
211
212
|
includeMetadata: true,
|
212
213
|
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
213
214
|
useProxyAsFallback: true
|
package/main.js
CHANGED
@@ -16,7 +16,7 @@ class WebScraper
|
|
16
16
|
this.maxDepth = config.maxDepth || Infinity;
|
17
17
|
this.maxArticles = config.maxArticles || Infinity;
|
18
18
|
this.concurrencyLimit = config.concurrencyLimit || 2;
|
19
|
-
this.crawlingDelay = config.crawlingDelay
|
19
|
+
this.crawlingDelay = config.crawlingDelay ?? 1000;
|
20
20
|
|
21
21
|
// Output paths setup
|
22
22
|
this.scrapResultPath = config.scrapResultPath || "./dataset";
|
@@ -126,7 +126,10 @@ class WebScraper
|
|
126
126
|
|
127
127
|
try
|
128
128
|
{
|
129
|
-
|
129
|
+
if ( this.crawlingDelay )
|
130
|
+
{
|
131
|
+
await WebScraper.sleep( this.crawlingDelay );
|
132
|
+
}
|
130
133
|
const data = await this.fetchContent( url );
|
131
134
|
if ( !data ) continue;
|
132
135
|
|
@@ -690,7 +693,6 @@ class WebScraper
|
|
690
693
|
this.ensureDirectory( path.join( __dirname, this.textOutputPathWithMeta ) );
|
691
694
|
}
|
692
695
|
|
693
|
-
// Helper method to ensure a directory exists
|
694
696
|
ensureDirectory ( dirPath )
|
695
697
|
{
|
696
698
|
if ( !fs.existsSync( dirPath ) )
|