clean-web-scraper 3.7.4 → 3.7.6

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (2)
  1. package/main.js +23 -5
  2. package/package.json +1 -1
package/main.js CHANGED
@@ -130,9 +130,8 @@ class WebScraper
130
130
 
131
131
  async fetchPage ( url, depth )
132
132
  {
133
- if ( this.allProcessedContent.length >= this.maxArticles || depth > this.maxDepth )
133
+ if ( this.hasReachedMax( depth ) )
134
134
  {
135
- console.error( `Reached maximum: ${this.allProcessedContent.length}, ${this.maxDepth}` );
136
135
  return;
137
136
  }
138
137
  if ( this.visited.has( url ) )
@@ -181,6 +180,10 @@ class WebScraper
181
180
 
182
181
  for ( let i = 0; i < unvisitedLinks.length; i += this.concurrencyLimit )
183
182
  {
183
+ if ( this.hasReachedMax( depth ) )
184
+ {
185
+ return;
186
+ }
184
187
  await WebScraper.sleep( 5000 );
185
188
  const batch = unvisitedLinks.slice( i, i + this.concurrencyLimit );
186
189
  const results = await Promise.allSettled( batch.map( link => { return this.fetchPage( link, depth + 1 ) }) );
@@ -205,8 +208,8 @@ class WebScraper
205
208
  try
206
209
  {
207
210
  const response = await this.retryAxiosRequest( url )
208
- const contentType = response.headers["content-type"] || "";
209
- if ( !contentType.startsWith( "text" ) )
211
+ const contentType = response?.headers["content-type"] || "";
212
+ if ( !contentType?.startsWith( "text" ) )
210
213
  {
211
214
  console.log( `Skipping non-HTML content for ${url}: Content-Type is ${contentType}` );
212
215
  response.data.destroy();
@@ -259,6 +262,16 @@ class WebScraper
259
262
  }
260
263
  }
261
264
 
265
+ hasReachedMax ( depth )
266
+ {
267
+ if ( this.allProcessedContent.length >= this.maxArticles || depth > this.maxDepth )
268
+ {
269
+ console.error( `Reached maximum: ${this.allProcessedContent.length}, ${this.maxDepth} , ${depth}` );
270
+ return true;
271
+ }
272
+ return false;
273
+ }
274
+
262
275
  async navigateToPage ( url )
263
276
  {
264
277
  let pages = await this.puppeteerBrowser.pages();
@@ -559,15 +572,20 @@ class WebScraper
559
572
  {
560
573
  try
561
574
  {
575
+ if ( this.hasReachedMax( depth ) )
576
+ {
577
+ throw new Error( "Max reached" );
578
+ }
562
579
  return await axios.get( url, options );
563
580
  }
564
581
  catch ( error )
565
582
  {
566
- if ( attempt === this.maxRetries ) throw error;
583
+ if ( attempt >= this.maxRetries ) throw error;
567
584
  await WebScraper.sleep( 40000 * attempt );
568
585
  console.error( `Retrying request to ${url} (Attempt ${attempt + 1}/${this.maxRetries})`, error.message, error.code );
569
586
  }
570
587
  }
588
+ throw new Error( "Max retries reached" );
571
589
  }
572
590
 
573
591
  configurePuppeteer ( )
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.7.4",
3
+ "version": "3.7.6",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",