clean-web-scraper 3.6.2 → 3.6.4
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- package/example-usage.js +1 -1
- package/main.js +5 -5
- package/package.json +1 -1
package/example-usage.js
CHANGED
|
@@ -44,7 +44,7 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
|
44
44
|
const scraper = new WebScraper({
|
|
45
45
|
baseURL: "https://english.khamenei.ir/news",
|
|
46
46
|
startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
|
|
47
|
-
maxDepth:
|
|
47
|
+
maxDepth: 3,
|
|
48
48
|
exactExcludeList: [
|
|
49
49
|
"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#"
|
|
50
50
|
],
|
package/main.js
CHANGED
|
@@ -49,7 +49,7 @@ class WebScraper
|
|
|
49
49
|
this.strictBaseURL = strictBaseURL || true;
|
|
50
50
|
this.maxDepth = maxDepth || Infinity;
|
|
51
51
|
this.maxArticles = maxArticles || Infinity;
|
|
52
|
-
this.concurrencyLimit = concurrencyLimit ||
|
|
52
|
+
this.concurrencyLimit = concurrencyLimit || 7;
|
|
53
53
|
|
|
54
54
|
// Output paths setup
|
|
55
55
|
this.scrapResultPath = scrapResultPath;
|
|
@@ -129,7 +129,7 @@ class WebScraper
|
|
|
129
129
|
{
|
|
130
130
|
if ( this.allProcessedContent.length >= this.maxArticles || depth > this.maxDepth )
|
|
131
131
|
{
|
|
132
|
-
console.
|
|
132
|
+
console.error( `Reached maximum: ${this.allProcessedContent.length}, ${this.maxDepth}` );
|
|
133
133
|
return;
|
|
134
134
|
}
|
|
135
135
|
if ( this.visited.has( url ) )
|
|
@@ -551,7 +551,7 @@ class WebScraper
|
|
|
551
551
|
...this.axiosOptions,
|
|
552
552
|
};
|
|
553
553
|
|
|
554
|
-
let maxRetries =
|
|
554
|
+
let maxRetries = 11;
|
|
555
555
|
for ( let attempt = 1; attempt <= maxRetries; attempt++ )
|
|
556
556
|
{
|
|
557
557
|
try
|
|
@@ -561,8 +561,8 @@ class WebScraper
|
|
|
561
561
|
catch ( error )
|
|
562
562
|
{
|
|
563
563
|
if ( attempt === maxRetries ) throw error;
|
|
564
|
-
await WebScraper.sleep(
|
|
565
|
-
console.error( `Retrying request to ${url} (Attempt ${attempt + 1}/${maxRetries})
|
|
564
|
+
await WebScraper.sleep( 5000 * attempt );
|
|
565
|
+
console.error( `Retrying request to ${url} (Attempt ${attempt + 1}/${maxRetries})`, error.message, error.code );
|
|
566
566
|
}
|
|
567
567
|
}
|
|
568
568
|
}
|