clean-web-scraper 3.7.4 → 3.7.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/main.js +23 -5
- package/package.json +1 -1
package/main.js
CHANGED
@@ -130,9 +130,8 @@ class WebScraper
|
|
130
130
|
|
131
131
|
async fetchPage ( url, depth )
|
132
132
|
{
|
133
|
-
if ( this.
|
133
|
+
if ( this.hasReachedMax( depth ) )
|
134
134
|
{
|
135
|
-
console.error( `Reached maximum: ${this.allProcessedContent.length}, ${this.maxDepth}` );
|
136
135
|
return;
|
137
136
|
}
|
138
137
|
if ( this.visited.has( url ) )
|
@@ -181,6 +180,10 @@ class WebScraper
|
|
181
180
|
|
182
181
|
for ( let i = 0; i < unvisitedLinks.length; i += this.concurrencyLimit )
|
183
182
|
{
|
183
|
+
if ( this.hasReachedMax( depth ) )
|
184
|
+
{
|
185
|
+
return;
|
186
|
+
}
|
184
187
|
await WebScraper.sleep( 5000 );
|
185
188
|
const batch = unvisitedLinks.slice( i, i + this.concurrencyLimit );
|
186
189
|
const results = await Promise.allSettled( batch.map( link => { return this.fetchPage( link, depth + 1 ) }) );
|
@@ -205,8 +208,8 @@ class WebScraper
|
|
205
208
|
try
|
206
209
|
{
|
207
210
|
const response = await this.retryAxiosRequest( url )
|
208
|
-
const contentType = response
|
209
|
-
if ( !contentType
|
211
|
+
const contentType = response?.headers["content-type"] || "";
|
212
|
+
if ( !contentType?.startsWith( "text" ) )
|
210
213
|
{
|
211
214
|
console.log( `Skipping non-HTML content for ${url}: Content-Type is ${contentType}` );
|
212
215
|
response.data.destroy();
|
@@ -259,6 +262,16 @@ class WebScraper
|
|
259
262
|
}
|
260
263
|
}
|
261
264
|
|
265
|
+
hasReachedMax ( depth )
|
266
|
+
{
|
267
|
+
if ( this.allProcessedContent.length >= this.maxArticles || depth > this.maxDepth )
|
268
|
+
{
|
269
|
+
console.error( `Reached maximum: ${this.allProcessedContent.length}, ${this.maxDepth} , ${depth}` );
|
270
|
+
return true;
|
271
|
+
}
|
272
|
+
return false;
|
273
|
+
}
|
274
|
+
|
262
275
|
async navigateToPage ( url )
|
263
276
|
{
|
264
277
|
let pages = await this.puppeteerBrowser.pages();
|
@@ -559,15 +572,20 @@ class WebScraper
|
|
559
572
|
{
|
560
573
|
try
|
561
574
|
{
|
575
|
+
if ( this.hasReachedMax( depth ) )
|
576
|
+
{
|
577
|
+
throw new Error( "Max reached" );
|
578
|
+
}
|
562
579
|
return await axios.get( url, options );
|
563
580
|
}
|
564
581
|
catch ( error )
|
565
582
|
{
|
566
|
-
if ( attempt
|
583
|
+
if ( attempt >= this.maxRetries ) throw error;
|
567
584
|
await WebScraper.sleep( 40000 * attempt );
|
568
585
|
console.error( `Retrying request to ${url} (Attempt ${attempt + 1}/${this.maxRetries})`, error.message, error.code );
|
569
586
|
}
|
570
587
|
}
|
588
|
+
throw new Error( "Max retries reached" );
|
571
589
|
}
|
572
590
|
|
573
591
|
configurePuppeteer ( )
|