clean-web-scraper 3.8.3 → 3.8.5
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- package/example-usage.js +8 -5
- package/main.js +2 -2
- package/package.json +1 -1
package/example-usage.js
CHANGED
@@ -30,7 +30,8 @@ async function palianswers ( enable )
|
|
30
30
|
csvOutputPath: "./dataset/palianswers/train.csv",
|
31
31
|
includeMetadata: true,
|
32
32
|
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
33
|
-
retryDelay: 10000
|
33
|
+
retryDelay: 10000,
|
34
|
+
concurrencyLimit: 4,
|
34
35
|
});
|
35
36
|
if ( enable )
|
36
37
|
{
|
@@ -56,7 +57,8 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
56
57
|
csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
|
57
58
|
includeMetadata: true,
|
58
59
|
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
59
|
-
retryDelay: 10000
|
60
|
+
retryDelay: 10000,
|
61
|
+
concurrencyLimit: 4,
|
60
62
|
});
|
61
63
|
if ( enable )
|
62
64
|
{
|
@@ -87,7 +89,8 @@ async function decolonizepalestine ( enable )
|
|
87
89
|
csvOutputPath: "./dataset/decolonizepalestine/train.csv",
|
88
90
|
includeMetadata: true,
|
89
91
|
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
90
|
-
retryDelay: 10000
|
92
|
+
retryDelay: 10000,
|
93
|
+
concurrencyLimit: 4,
|
91
94
|
});
|
92
95
|
if ( enable )
|
93
96
|
{
|
@@ -126,9 +129,9 @@ async function electronicintifada ( enable )
|
|
126
129
|
csvOutputPath: "./dataset/electronicintifada/train.csv",
|
127
130
|
includeMetadata: true,
|
128
131
|
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
129
|
-
maxDepth:
|
132
|
+
maxDepth: 12,
|
130
133
|
maxArticles: 2000,
|
131
|
-
concurrencyLimit:
|
134
|
+
concurrencyLimit: 3,
|
132
135
|
axiosHeaders: headers,
|
133
136
|
retryDelay: 10000
|
134
137
|
});
|
package/main.js
CHANGED
@@ -154,6 +154,7 @@ class WebScraper
|
|
154
154
|
}
|
155
155
|
try
|
156
156
|
{
|
157
|
+
await WebScraper.sleep( 5000 );
|
157
158
|
const data = await this.fetchContent( url );
|
158
159
|
if ( !data ) return;
|
159
160
|
const dom = new JSDOM( data, { url });
|
@@ -191,7 +192,6 @@ class WebScraper
|
|
191
192
|
{
|
192
193
|
return;
|
193
194
|
}
|
194
|
-
await WebScraper.sleep( 5000 );
|
195
195
|
const batch = unvisitedLinks.slice( i, i + this.concurrencyLimit );
|
196
196
|
const results = await Promise.allSettled( batch.map( link => { return this.fetchPage( link, depth + 1 ) }) );
|
197
197
|
|
@@ -575,7 +575,7 @@ class WebScraper
|
|
575
575
|
const options = {
|
576
576
|
responseType: "stream",
|
577
577
|
maxRedirects: 5,
|
578
|
-
timeout:
|
578
|
+
timeout: 20000,
|
579
579
|
...this.axiosOptions,
|
580
580
|
};
|
581
581
|
|