clean-web-scraper 3.8.2 → 3.8.4
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/example-usage.js +8 -3
- package/main.js +2 -2
- package/package.json +1 -1
package/example-usage.js
CHANGED
@@ -30,7 +30,8 @@ async function palianswers ( enable )
|
|
30
30
|
csvOutputPath: "./dataset/palianswers/train.csv",
|
31
31
|
includeMetadata: true,
|
32
32
|
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
33
|
-
retryDelay: 10000
|
33
|
+
retryDelay: 10000,
|
34
|
+
concurrencyLimit: 4,
|
34
35
|
});
|
35
36
|
if ( enable )
|
36
37
|
{
|
@@ -56,7 +57,8 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
56
57
|
csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
|
57
58
|
includeMetadata: true,
|
58
59
|
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
59
|
-
retryDelay: 10000
|
60
|
+
retryDelay: 10000,
|
61
|
+
concurrencyLimit: 4,
|
60
62
|
});
|
61
63
|
if ( enable )
|
62
64
|
{
|
@@ -87,7 +89,8 @@ async function decolonizepalestine ( enable )
|
|
87
89
|
csvOutputPath: "./dataset/decolonizepalestine/train.csv",
|
88
90
|
includeMetadata: true,
|
89
91
|
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
90
|
-
retryDelay: 10000
|
92
|
+
retryDelay: 10000,
|
93
|
+
concurrencyLimit: 4,
|
91
94
|
});
|
92
95
|
if ( enable )
|
93
96
|
{
|
@@ -128,6 +131,7 @@ async function electronicintifada ( enable )
|
|
128
131
|
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
129
132
|
maxDepth: 10,
|
130
133
|
maxArticles: 2000,
|
134
|
+
concurrencyLimit: 4,
|
131
135
|
axiosHeaders: headers,
|
132
136
|
retryDelay: 10000
|
133
137
|
});
|
@@ -189,6 +193,7 @@ async function mondoweiss ( enable )
|
|
189
193
|
csvOutputPath: "./dataset/mondoweiss/train.csv",
|
190
194
|
maxArticles: 2500,
|
191
195
|
maxRetries: 2,
|
196
|
+
concurrencyLimit: 4,
|
192
197
|
axiosHeaders: headers,
|
193
198
|
axiosProxy: {
|
194
199
|
host: "localhost",
|
package/main.js
CHANGED
@@ -154,6 +154,7 @@ class WebScraper
|
|
154
154
|
}
|
155
155
|
try
|
156
156
|
{
|
157
|
+
await WebScraper.sleep( 5000 );
|
157
158
|
const data = await this.fetchContent( url );
|
158
159
|
if ( !data ) return;
|
159
160
|
const dom = new JSDOM( data, { url });
|
@@ -191,7 +192,6 @@ class WebScraper
|
|
191
192
|
{
|
192
193
|
return;
|
193
194
|
}
|
194
|
-
await WebScraper.sleep( 5000 );
|
195
195
|
const batch = unvisitedLinks.slice( i, i + this.concurrencyLimit );
|
196
196
|
const results = await Promise.allSettled( batch.map( link => { return this.fetchPage( link, depth + 1 ) }) );
|
197
197
|
|
@@ -585,7 +585,7 @@ class WebScraper
|
|
585
585
|
{
|
586
586
|
if ( this.hasReachedMax( ) )
|
587
587
|
{
|
588
|
-
|
588
|
+
break;
|
589
589
|
}
|
590
590
|
return await axios.get( url, options );
|
591
591
|
}
|