clean-web-scraper 3.6.0 → 3.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/main.js +33 -18
- package/package.json +1 -1
package/main.js
CHANGED
```diff
@@ -11,15 +11,16 @@ class WebScraper
 		// Base configuration
 		baseURL,
 		startURL,
-		strictBaseURL
-		maxDepth
-		maxArticles
+		strictBaseURL,
+		maxDepth,
+		maxArticles,
+		concurrencyLimit,

 		// URL filtering
 		excludeList = [],
 		exactExcludeList = [],
-		filterFileTypes
-		excludedFileTypes
+		filterFileTypes,
+		excludedFileTypes,

 		// Output paths
 		scrapResultPath = "./dataset",
```
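The headline change is the new `concurrencyLimit` constructor option, which caps how many links are fetched in parallel (defaulting to 10, per the constructor hunk below). A minimal usage sketch — the option names follow the destructured parameters above, but the package's exact export shape is an assumption here:

```js
// Hypothetical usage sketch: assumes clean-web-scraper exports the
// WebScraper class directly; adjust the require to the real export shape.
const WebScraper = require( "clean-web-scraper" );

const scraper = new WebScraper({
	baseURL: "https://example.com",
	startURL: "https://example.com/blog",
	maxDepth: 3,
	maxArticles: 100,
	concurrencyLimit: 5, // new in 3.6.x: fetch up to 5 links at a time
});
```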
```diff
@@ -45,9 +46,10 @@ class WebScraper
 		// Base configuration
 		this.baseURL = baseURL;
 		this.startURL = startURL || baseURL;
-		this.strictBaseURL = strictBaseURL;
-		this.maxDepth = maxDepth;
-		this.maxArticles = maxArticles;
+		this.strictBaseURL = strictBaseURL || true;
+		this.maxDepth = maxDepth || Infinity;
+		this.maxArticles = maxArticles || Infinity;
+		this.concurrencyLimit = concurrencyLimit || 10;

 		// Output paths setup
 		this.scrapResultPath = scrapResultPath;
```
```diff
@@ -65,8 +67,8 @@ class WebScraper
 		this.visited = new Set();
 		this.excludeList = this.normalizeExcludeList( excludeList );
 		this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
-		this.filterFileTypes = filterFileTypes;
-		this.excludedFileTypes = excludedFileTypes;
+		this.filterFileTypes = filterFileTypes || true;
+		this.excludedFileTypes = excludedFileTypes || [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"];

 		// Network configuration
 		this.axiosHeaders = axiosHeaders;
```
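One caveat with these defaults: `||` falls through on every falsy value, so `strictBaseURL || true` and `filterFileTypes || true` evaluate to `true` even when the caller explicitly passes `false` — as published, these options cannot be switched off. A sketch of the usual alternative (not part of this release) is nullish coalescing, which only substitutes the default for `undefined` or `null`:

```js
// Sketch only, not the published code.
const options = { strictBaseURL: false };

// With ||, the caller's explicit false is silently overwritten:
console.log( options.strictBaseURL || true ); // true
// With ??, the explicit false survives:
console.log( options.strictBaseURL ?? true ); // false
```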
```diff
@@ -130,6 +132,11 @@ class WebScraper
 			console.log( `Reached maximum: ${this.maxArticles}, ${this.maxDepth}` );
 			return;
 		}
+		if ( this.visited.has( url ) )
+		{
+			console.log( `Already visited: ${url}` );
+			return;
+		}
 		this.visited.add( url );
 		if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
 		{
```
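This early return matters once crawling is batched: the same URL is commonly linked from several pages, and without the guard each batch could re-fetch pages that an earlier batch had already recorded in `this.visited`.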
```diff
@@ -167,12 +174,20 @@ class WebScraper
 		}

 		const links = this.extractLinks( data );
-
+		const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
+
+		for ( let i = 0; i < unvisitedLinks.length; i += this.concurrencyLimit )
 		{
-
+			const batch = unvisitedLinks.slice( i, i + this.concurrencyLimit );
+			const results = await Promise.allSettled( batch.map( link => { return this.fetchPage( link, depth + 1 ) }) );
+
+			results.forEach( ( result, index ) =>
 			{
-
-
+				if ( result.status === "rejected" )
+				{
+					console.error( `Failed to fetch ${batch[index]}: ${result.reason}` );
+				}
+			});
 		}
 	}
 	catch ( error )
```
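The new crawl loop processes links in fixed-size batches: `Promise.allSettled` lets every fetch in a batch run to completion, rejected or fulfilled, before the next batch starts. The same pattern in isolation, as a minimal standalone sketch (`fetchOne` stands in for the scraper's `fetchPage`):

```js
// Batched-concurrency sketch: at most `limit` requests in flight at once.
async function crawlInBatches ( urls, limit, fetchOne )
{
	for ( let i = 0; i < urls.length; i += limit )
	{
		const batch = urls.slice( i, i + limit );
		const results = await Promise.allSettled( batch.map( url => { return fetchOne( url ) }) );
		results.forEach( ( result, index ) =>
		{
			if ( result.status === "rejected" )
			{
				console.error( `Failed: ${batch[index]}: ${result.reason}` );
			}
		});
	}
}
```

The trade-off of this design is that each batch waits for its slowest request, so throughput is lower than a sliding-window pool, but the implementation stays simple and in-flight requests are strictly bounded by `concurrencyLimit`.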
```diff
@@ -185,7 +200,7 @@ class WebScraper
 	{
 		try
 		{
-			const response = await retryAxiosRequest( url )
+			const response = await this.retryAxiosRequest( url )
 			const contentType = response.headers["content-type"] || "";
 			if ( !contentType.startsWith( "text" ) )
 			{
```
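For context: `retryAxiosRequest` is an instance method (its body appears in the next hunks and references `this.axiosOptions`), so the bare call in 3.6.0 would presumably have thrown a `ReferenceError` on every fetch; adding `this.` is the substantive fix here.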
```diff
@@ -536,7 +551,7 @@ class WebScraper
 			...this.axiosOptions,
 		};

-		let maxRetries =
+		let maxRetries = 10;
 		for ( let attempt = 1; attempt <= maxRetries; attempt++ )
 		{
 			try
```
```diff
@@ -546,8 +561,8 @@ class WebScraper
 			catch ( error )
 			{
 				if ( attempt === maxRetries ) throw error;
-				await WebScraper.sleep(
-				console.
+				await WebScraper.sleep( 4000 * attempt );
+				console.error( `Retrying request to ${url} (Attempt ${attempt + 1}/${maxRetries})` );
 			}
 		}
 	}
```
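Filled in here, the retry helper now sleeps `4000 * attempt` ms between attempts — linear backoff across up to 10 tries. The generic shape of that pattern as a standalone sketch (names and the 4 s base delay mirror `retryAxiosRequest` above but are illustrative, not the package's API):

```js
// Generic linear-backoff retry sketch.
const sleep = ( ms ) => { return new Promise( resolve => { return setTimeout( resolve, ms ) }) };

async function withRetries ( request, maxRetries = 10, baseDelayMs = 4000 )
{
	for ( let attempt = 1; attempt <= maxRetries; attempt++ )
	{
		try
		{
			return await request();
		}
		catch ( error )
		{
			if ( attempt === maxRetries ) throw error;
			await sleep( baseDelayMs * attempt ); // wait longer after each failure
		}
	}
}
```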