clean-web-scraper 3.6.0 → 3.6.2

Files changed (2)
  1. package/main.js +33 -18
  2. package/package.json +1 -1
package/main.js CHANGED
@@ -11,15 +11,16 @@ class WebScraper
 		// Base configuration
 		baseURL,
 		startURL,
-		strictBaseURL = true,
-		maxDepth = Infinity,
-		maxArticles = Infinity,
+		strictBaseURL,
+		maxDepth,
+		maxArticles,
+		concurrencyLimit,
 
 		// URL filtering
 		excludeList = [],
 		exactExcludeList = [],
-		filterFileTypes = true,
-		excludedFileTypes = [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"],
+		filterFileTypes,
+		excludedFileTypes,
 
 		// Output paths
 		scrapResultPath = "./dataset",
@@ -45,9 +46,10 @@ class WebScraper
 		// Base configuration
 		this.baseURL = baseURL;
 		this.startURL = startURL || baseURL;
-		this.strictBaseURL = strictBaseURL;
-		this.maxDepth = maxDepth;
-		this.maxArticles = maxArticles;
+		this.strictBaseURL = strictBaseURL || true;
+		this.maxDepth = maxDepth || Infinity;
+		this.maxArticles = maxArticles || Infinity;
+		this.concurrencyLimit = concurrencyLimit || 10;
 
 		// Output paths setup
 		this.scrapResultPath = scrapResultPath;
@@ -65,8 +67,8 @@ class WebScraper
 		this.visited = new Set();
 		this.excludeList = this.normalizeExcludeList( excludeList );
 		this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
-		this.filterFileTypes = filterFileTypes;
-		this.excludedFileTypes = excludedFileTypes;
+		this.filterFileTypes = filterFileTypes || true;
+		this.excludedFileTypes = excludedFileTypes || [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"];
 
 		// Network configuration
 		this.axiosHeaders = axiosHeaders;
@@ -130,6 +132,11 @@ class WebScraper
 			console.log( `Reached maximum: ${this.maxArticles}, ${this.maxDepth}` );
 			return;
 		}
+		if ( this.visited.has( url ) )
+		{
+			console.log( `Already visited: ${url}` );
+			return;
+		}
 		this.visited.add( url );
 		if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
 		{
@@ -167,12 +174,20 @@ class WebScraper
 			}
 
 			const links = this.extractLinks( data );
-			for ( const link of links )
+			const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
+
+			for ( let i = 0; i < unvisitedLinks.length; i += this.concurrencyLimit )
 			{
-				if ( !this.visited.has( link ) )
+				const batch = unvisitedLinks.slice( i, i + this.concurrencyLimit );
+				const results = await Promise.allSettled( batch.map( link => { return this.fetchPage( link, depth + 1 ) }) );
+
+				results.forEach( ( result, index ) =>
 				{
-					await this.fetchPage( link, depth + 1 );
-				}
+					if ( result.status === "rejected" )
+					{
+						console.error( `Failed to fetch ${batch[index]}: ${result.reason}` );
+					}
+				});
 			}
 		}
 		catch ( error )
@@ -185,7 +200,7 @@ class WebScraper
 	{
 		try
 		{
-			const response = await retryAxiosRequest( url )
+			const response = await this.retryAxiosRequest( url )
 			const contentType = response.headers["content-type"] || "";
 			if ( !contentType.startsWith( "text" ) )
 			{
@@ -536,7 +551,7 @@ class WebScraper
 			...this.axiosOptions,
 		};
 
-		let maxRetries = 3;
+		let maxRetries = 10;
 		for ( let attempt = 1; attempt <= maxRetries; attempt++ )
 		{
 			try
@@ -546,8 +561,8 @@ class WebScraper
 			catch ( error )
 			{
 				if ( attempt === maxRetries ) throw error;
-				await WebScraper.sleep( 1000 * attempt );
-				console.log( `Retrying request to ${url} (Attempt ${attempt + 1}/${maxRetries})` );
+				await WebScraper.sleep( 4000 * attempt );
+				console.error( `Retrying request to ${url} (Attempt ${attempt + 1}/${maxRetries})` );
 			}
 		}
 	}
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "clean-web-scraper",
-  "version": "3.6.0",
+  "version": "3.6.2",
   "main": "main.js",
   "scripts": {
     "start": "node main.js",