clean-web-scraper 3.5.3 → 3.5.4
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/WebScraper.js +22 -25
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
|
@@ -73,6 +73,15 @@ class WebScraper
|
|
|
73
73
|
// Network configuration
|
|
74
74
|
this.axiosHeaders = axiosHeaders;
|
|
75
75
|
this.axiosProxy = axiosProxy;
|
|
76
|
+
this.axiosOptions = {};
|
|
77
|
+
if ( this.axiosHeaders )
|
|
78
|
+
{
|
|
79
|
+
axiosOptions.headers = this.axiosHeaders;
|
|
80
|
+
}
|
|
81
|
+
if ( this.axiosProxy )
|
|
82
|
+
{
|
|
83
|
+
axiosOptions.proxy = this.axiosProxy;
|
|
84
|
+
}
|
|
76
85
|
|
|
77
86
|
// Content storage
|
|
78
87
|
this.allProcessedContent = [];
|
|
@@ -138,7 +147,7 @@ class WebScraper
|
|
|
138
147
|
}
|
|
139
148
|
try
|
|
140
149
|
{
|
|
141
|
-
const data = await this.
|
|
150
|
+
const data = await this.fetchContent( url );
|
|
142
151
|
if ( !data ) return;
|
|
143
152
|
const dom = new JSDOM( data, { url });
|
|
144
153
|
const { document } = dom.window;
|
|
@@ -150,9 +159,9 @@ class WebScraper
|
|
|
150
159
|
|
|
151
160
|
if ( article )
|
|
152
161
|
{
|
|
153
|
-
if ( this.
|
|
162
|
+
if ( this.hasValidPageContent( article.textContent ) )
|
|
154
163
|
{
|
|
155
|
-
const metadata = this.
|
|
164
|
+
const metadata = this.extractMetadata( url, document );
|
|
156
165
|
metadata.depth = depth;
|
|
157
166
|
this.saveArticle( url, article.textContent, metadata );
|
|
158
167
|
}
|
|
@@ -182,35 +191,23 @@ class WebScraper
|
|
|
182
191
|
}
|
|
183
192
|
}
|
|
184
193
|
|
|
185
|
-
async
|
|
194
|
+
async fetchContent ( url )
|
|
186
195
|
{
|
|
187
196
|
try
|
|
188
197
|
{
|
|
189
|
-
let axiosOptions = {};
|
|
190
|
-
if ( this.axiosHeaders )
|
|
191
|
-
{
|
|
192
|
-
axiosOptions.headers = this.axiosHeaders;
|
|
193
|
-
}
|
|
194
|
-
if ( this.axiosProxy )
|
|
195
|
-
{
|
|
196
|
-
axiosOptions.proxy = this.axiosProxy;
|
|
197
|
-
}
|
|
198
|
-
|
|
199
|
-
// Step 1: Make a GET request with a small timeout and limited data download
|
|
200
198
|
const response = await axios.get( url, {
|
|
201
|
-
...axiosOptions,
|
|
202
199
|
responseType: "stream",
|
|
203
200
|
maxRedirects: 5,
|
|
204
|
-
timeout: 70000
|
|
201
|
+
timeout: 70000,
|
|
202
|
+
...axiosOptions,
|
|
205
203
|
});
|
|
206
204
|
|
|
207
|
-
// Step 2: Check the Content-Type header from the response
|
|
208
205
|
const contentType = response.headers["content-type"] || "";
|
|
209
206
|
if ( !contentType.startsWith( "text" ) )
|
|
210
207
|
{
|
|
211
208
|
console.log( `Skipping non-HTML content for ${url}: Content-Type is ${contentType}` );
|
|
212
|
-
response.data.destroy();
|
|
213
|
-
return null;
|
|
209
|
+
response.data.destroy();
|
|
210
|
+
return null;
|
|
214
211
|
}
|
|
215
212
|
|
|
216
213
|
// Step 3: If Content-Type is HTML, read the full response
|
|
@@ -242,8 +239,8 @@ class WebScraper
|
|
|
242
239
|
for ( let index = 0; index < 10; index++ )
|
|
243
240
|
{
|
|
244
241
|
console.log( `Please solve the CAPTCHA on the opened browser window for ${url}` );
|
|
245
|
-
result = await this.
|
|
246
|
-
if ( this.
|
|
242
|
+
result = await this.navigateToPage( url ) ;
|
|
243
|
+
if ( this.hasValidPageContent( result.htmlContent ) )
|
|
247
244
|
{
|
|
248
245
|
break
|
|
249
246
|
}
|
|
@@ -261,7 +258,7 @@ class WebScraper
|
|
|
261
258
|
}
|
|
262
259
|
}
|
|
263
260
|
|
|
264
|
-
async
|
|
261
|
+
async navigateToPage ( url )
|
|
265
262
|
{
|
|
266
263
|
let pages = await this.puppeteerBrowser.pages();
|
|
267
264
|
let page = pages[0];
|
|
@@ -530,7 +527,7 @@ class WebScraper
|
|
|
530
527
|
return filteredMetadata;
|
|
531
528
|
}
|
|
532
529
|
|
|
533
|
-
|
|
530
|
+
extractMetadata ( url, document )
|
|
534
531
|
{
|
|
535
532
|
return {
|
|
536
533
|
url,
|
|
@@ -634,7 +631,7 @@ class WebScraper
|
|
|
634
631
|
}
|
|
635
632
|
}
|
|
636
633
|
|
|
637
|
-
|
|
634
|
+
hasValidPageContent ( content )
|
|
638
635
|
{
|
|
639
636
|
// Remove whitespace and newlines for checking
|
|
640
637
|
const cleanContent = content.replace( /\s+/g, " " ).trim().toLowerCase();
|