clean-web-scraper 3.9.4 → 3.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -1
- package/example-usage.js +7 -6
- package/main.js +5 -2
- package/package.json +1 -1
package/README.md
CHANGED
@@ -61,7 +61,11 @@ const scraper = new WebScraper({
|
|
61
61
|
|
62
62
|
// Network options
|
63
63
|
axiosHeaders: {}, // Optional: Custom HTTP headers
|
64
|
-
axiosProxy:
|
64
|
+
axiosProxy: { // Optional: HTTP/HTTPS proxy
|
65
|
+
host: "localhost",
|
66
|
+
port: 2080,
|
67
|
+
protocol: "http"
|
68
|
+
},
|
65
69
|
axiosMaxRetries: 5, // Optional: Max retry attempts
|
66
70
|
axiosRetryDelay: 40000, // Optional: Delay between retries (ms)
|
67
71
|
useProxyAsFallback: false, // Optional: Fallback to proxy on failure
|
package/example-usage.js
CHANGED
@@ -30,8 +30,9 @@ async function palianswers ( enable )
|
|
30
30
|
csvOutputPath: "./dataset/palianswers/train.csv",
|
31
31
|
includeMetadata: true,
|
32
32
|
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
33
|
-
|
33
|
+
axiosRetryDelay: 10000,
|
34
34
|
concurrencyLimit: 4,
|
35
|
+
crawlingDelay: 0
|
35
36
|
});
|
36
37
|
if ( enable )
|
37
38
|
{
|
@@ -57,7 +58,7 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
57
58
|
csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
|
58
59
|
includeMetadata: true,
|
59
60
|
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
60
|
-
|
61
|
+
axiosRetryDelay: 10000,
|
61
62
|
concurrencyLimit: 4,
|
62
63
|
});
|
63
64
|
if ( enable )
|
@@ -89,7 +90,7 @@ async function decolonizepalestine ( enable )
|
|
89
90
|
csvOutputPath: "./dataset/decolonizepalestine/train.csv",
|
90
91
|
includeMetadata: true,
|
91
92
|
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
92
|
-
|
93
|
+
axiosRetryDelay: 10000,
|
93
94
|
concurrencyLimit: 4,
|
94
95
|
});
|
95
96
|
if ( enable )
|
@@ -133,7 +134,7 @@ async function electronicintifada ( enable )
|
|
133
134
|
maxArticles: 2000,
|
134
135
|
concurrencyLimit: 2,
|
135
136
|
axiosHeaders: headers,
|
136
|
-
|
137
|
+
axiosRetryDelay: 10000,
|
137
138
|
axiosProxy: {
|
138
139
|
host: "localhost",
|
139
140
|
port: 2080,
|
@@ -198,7 +199,7 @@ async function mondoweiss ( enable )
|
|
198
199
|
textOutputPath: "./dataset/mondoweiss/texts",
|
199
200
|
csvOutputPath: "./dataset/mondoweiss/train.csv",
|
200
201
|
maxArticles: 2500,
|
201
|
-
|
202
|
+
axiosMaxRetries: 3,
|
202
203
|
concurrencyLimit: 3,
|
203
204
|
axiosHeaders: headers,
|
204
205
|
axiosProxy: {
|
@@ -207,7 +208,7 @@ async function mondoweiss ( enable )
|
|
207
208
|
protocol: "http"
|
208
209
|
},
|
209
210
|
maxDepth: 15,
|
210
|
-
|
211
|
+
axiosRetryDelay: 10000,
|
211
212
|
includeMetadata: true,
|
212
213
|
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
213
214
|
useProxyAsFallback: true
|
package/main.js
CHANGED
@@ -16,7 +16,7 @@ class WebScraper
|
|
16
16
|
this.maxDepth = config.maxDepth || Infinity;
|
17
17
|
this.maxArticles = config.maxArticles || Infinity;
|
18
18
|
this.concurrencyLimit = config.concurrencyLimit || 2;
|
19
|
-
this.crawlingDelay = config.crawlingDelay
|
19
|
+
this.crawlingDelay = config.crawlingDelay ?? 1000;
|
20
20
|
|
21
21
|
// Output paths setup
|
22
22
|
this.scrapResultPath = config.scrapResultPath || "./dataset";
|
@@ -126,7 +126,10 @@ class WebScraper
|
|
126
126
|
|
127
127
|
try
|
128
128
|
{
|
129
|
-
|
129
|
+
if ( this.crawlingDelay )
|
130
|
+
{
|
131
|
+
await WebScraper.sleep( this.crawlingDelay );
|
132
|
+
}
|
130
133
|
const data = await this.fetchContent( url );
|
131
134
|
if ( !data ) continue;
|
132
135
|
|