clean-web-scraper 4.0.3 → 4.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/example-usage.js +3 -2
- package/main.js +61 -55
- package/package.json +1 -1
package/README.md
CHANGED
```diff
@@ -57,7 +57,8 @@ const scraper = new WebScraper({
   maxDepth: Infinity, // Optional: Maximum crawling depth
   maxArticles: Infinity, // Optional: Maximum articles to scrape
   crawlingDelay: 1000, // Optional: Delay between requests (ms)
-
+  batchSize: 5, // Optional: Number of URLs to process concurrently
+
   // Network options
   axiosHeaders: {}, // Optional: Custom HTTP headers
   axiosProxy: { // Optional: HTTP/HTTPS proxy
```
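The new `batchSize` option slots into the constructor call shown in the hunk header (`const scraper = new WebScraper({ ... })`). Below is a minimal sketch of how a caller might combine it with the neighbouring options; the starting URL and the use of `crawl()` as the entry point are assumptions drawn from the `main.js` diff further down, not from the README itself:

```js
const WebScraper = require( "clean-web-scraper" ); // assumed export shape

async function main ()
{
    const scraper = new WebScraper({
        maxDepth: 2,         // stop two link hops from the start page
        maxArticles: 100,    // cap how many articles get saved
        crawlingDelay: 1000, // each worker still waits 1000 ms before fetching
        batchSize: 5         // new in 4.0.4: process up to 5 URLs concurrently
    });
    // crawl() is the method reworked in this release (see main.js below);
    // treating it as the public entry point is an assumption.
    await scraper.crawl( "https://example.com" );
}

main();
```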
package/example-usage.js
CHANGED
```diff
@@ -109,7 +109,8 @@ async function electronicintifada ( enable )
         "https://electronicintifada.net/news",
         "https://electronicintifada.net/opinion",
         "https://electronicintifada.net/about-ei",
-        "https://electronicintifada.net/review"
+        "https://electronicintifada.net/review",
+        "https://electronicintifada.net/artmusicculture"
     ],
     exactExcludeList: [
         "https://electronicintifada.net",
@@ -133,7 +134,7 @@ async function electronicintifada ( enable )
            protocol: "http"
        },
        useProxyAsFallback: true,
-       crawlingDelay:
+       crawlingDelay: 1
    };
    return await runScraper( config, enable );
 }
```
package/main.js
CHANGED
```diff
@@ -16,6 +16,7 @@ class WebScraper
        this.maxDepth = config.maxDepth || Infinity;
        this.maxArticles = config.maxArticles || Infinity;
        this.crawlingDelay = config.crawlingDelay ?? 1000;
+       this.batchSize = config.batchSize || 5;
 
        // Output paths setup
        this.scrapResultPath = config.scrapResultPath || "./dataset";
```
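Note the defaulting style: `batchSize` is defaulted with `||`, while `crawlingDelay` on the line above uses `??`, so the two options treat an explicit `0` differently:

```js
// `||` swaps out every falsy value, so batchSize can never be set to 0:
const batchSize = 0 || 5;        // → 5, falls back to the default

// `??` only swaps out null/undefined, so crawlingDelay: 0 disables the delay:
const crawlingDelay = 0 ?? 1000; // → 0, the explicit value survives
```

That is arguably deliberate here: with a batch size of 0, the `queue.splice( 0, 0 )` call in the reworked `crawl()` below would always return an empty batch and the `while` loop would never drain the queue.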
```diff
@@ -97,83 +98,88 @@ class WebScraper
    async crawl ( initialUrl, initialDepth = 0 )
    {
        const queue = [{ url: initialUrl, depth: initialDepth }];
-       while ( queue.length > 0 )
-       {
-           const { url, depth } = queue.shift();
-           console.log( `Processing URL: ${url}` );
-           if ( this.hasReachedMax( depth ) )
-           {
-               continue;
-           }
-           if ( this.removeURLFragment )
-           {
-               url = url.split( "#" )[0];
-           }
-           if ( this.visited.has( url ) )
-           {
-               console.log( `Already visited: ${url}` );
-               continue;
-           }
-           this.visited.add( url );
-
-           if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
-           {
-               continue;
-           }
-
-           try
-           {
-               if ( this.crawlingDelay )
-               {
-                   await WebScraper.sleep( this.crawlingDelay );
-               }
-               const data = await this.fetchContent( url );
-               if ( !data ) continue;
-
-               const dom = new JSDOM( data, { url });
-               const { document } = dom.window;
-
-               if ( !this.isExcluded( url ) )
-               {
-                   const reader = new Readability( document );
-                   const article = reader.parse();
-                   if ( article )
-                   {
-                       if ( this.hasValidPageContent( article.textContent ) )
-                       {
-                           const metadata = this.extractMetadata( url, document );
-                           metadata.articleTitle = article.title || "";
-                           this.saveArticle( url, article.textContent, metadata );
-                       }
-                       else
-                       {
-                           console.error( `Invalid content found at ${url}` );
-                       }
-                   }
-                   else
-                   {
-                       console.error( `No readable content found at ${url}` );
-                   }
-               }
-
-               const links = this.extractLinks( data );
-               const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
-               for ( const link of unvisitedLinks )
-               {
-                   if ( !this.hasReachedMax( depth ) )
-                   {
-                       queue.push({ url: link, depth: depth + 1 });
-                   }
-               }
-           }
-           catch ( error )
-           {
-               console.error( `Error fetching ${url}:`, error.message );
-           }
-       }
+       while ( queue.length > 0 )
+       {
+           const currentBatch = queue.splice( 0, this.batchSize );
+           await Promise.all( currentBatch.map( async ({ url, depth }) =>
+           {
+               await this.processUrl( url, depth, queue );
+           }) );
+       }
+   }
+
+   async processUrl ( url, depth, queue )
+   {
+       console.log( `Processing URL: ${url}` );
+       if ( this.hasReachedMax( depth ) )
+       {
+           return;
+       }
+       if ( this.removeURLFragment )
+       {
+           url = url.split( "#" )[0];
+       }
+       if ( this.visited.has( url ) )
+       {
+           console.log( `Already visited: ${url}` );
+           return;
+       }
+       this.visited.add( url );
+       if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
+       {
+           return;
+       }
+       try
+       {
+           if ( this.crawlingDelay )
+           {
+               await WebScraper.sleep( this.crawlingDelay );
+           }
+           const data = await this.fetchContent( url );
+           if ( !data )
+           {
+               return;
+           }
+           const dom = new JSDOM( data, { url });
+           const { document } = dom.window;
+           if ( !this.isExcluded( url ) )
+           {
+               const reader = new Readability( document );
+               const article = reader.parse();
+               if ( article )
+               {
+                   if ( this.hasValidPageContent( article.textContent ) )
+                   {
+                       const metadata = this.extractMetadata( url, document );
+                       metadata.articleTitle = article.title || "";
+                       this.saveArticle( url, article.textContent, metadata );
+                   }
+                   else
+                   {
+                       console.error( `Invalid content found at ${url}` );
+                   }
+               }
+               else
+               {
+                   console.error( `No readable content found at ${url}` );
+               }
+           }
+           const links = this.extractLinks( data );
+           const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
+           for ( const link of unvisitedLinks )
+           {
+               if ( !this.hasReachedMax( depth ) )
+               {
+                   queue.push({ url: link, depth: depth + 1 });
+               }
+           }
+       }
+       catch ( error )
+       {
+           console.error( `Error fetching ${url}:`, error.message, error.code );
+       }
    }
 
-
    async fetchContent ( url )
    {
        try
```
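The substance of this hunk is the move from a one-at-a-time `queue.shift()` loop to batched concurrency: `crawl()` now splices up to `batchSize` entries off the queue and runs them through the new `processUrl()` under `Promise.all`, with each worker pushing newly discovered links back onto the shared queue. A standalone sketch of that drain pattern (`drainInBatches` and `processItem` are illustrative names, not part of the package):

```js
// Batch-drain pattern behind the new crawl(): pull up to `batchSize` items
// off the front of the queue, process them concurrently, and only then
// slice off the next batch. Workers may enqueue follow-up work.
async function drainInBatches ( firstItem, processItem, batchSize = 5 )
{
    const queue = [firstItem];
    while ( queue.length > 0 )
    {
        const batch = queue.splice( 0, batchSize ); // removes the items in place
        await Promise.all( batch.map( async ( item ) =>
        {
            await processItem( item, queue ); // may push new items onto queue
        }) );
    }
}
```

Two consequences of this shape: each batch only finishes as fast as its slowest member, and duplicate URLs that land in the same batch are still filtered out, because `processUrl()` checks and updates `this.visited` before it hits its first `await`.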