clean-web-scraper 3.5.3 → 3.5.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/WebScraper.js +23 -30
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
|
@@ -73,6 +73,15 @@ class WebScraper
|
|
|
73
73
|
// Network configuration
|
|
74
74
|
this.axiosHeaders = axiosHeaders;
|
|
75
75
|
this.axiosProxy = axiosProxy;
|
|
76
|
+
this.axiosOptions = {};
|
|
77
|
+
if ( this.axiosHeaders )
|
|
78
|
+
{
|
|
79
|
+
axiosOptions.headers = this.axiosHeaders;
|
|
80
|
+
}
|
|
81
|
+
if ( this.axiosProxy )
|
|
82
|
+
{
|
|
83
|
+
axiosOptions.proxy = this.axiosProxy;
|
|
84
|
+
}
|
|
76
85
|
|
|
77
86
|
// Content storage
|
|
78
87
|
this.allProcessedContent = [];
|
|
@@ -128,17 +137,13 @@ class WebScraper
|
|
|
128
137
|
return;
|
|
129
138
|
}
|
|
130
139
|
this.visited.add( url );
|
|
131
|
-
if ( !this.isValidFileType( url ) )
|
|
132
|
-
{
|
|
133
|
-
return;
|
|
134
|
-
}
|
|
135
|
-
if ( !this.isValidDomain( url ) )
|
|
140
|
+
if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
|
|
136
141
|
{
|
|
137
142
|
return;
|
|
138
143
|
}
|
|
139
144
|
try
|
|
140
145
|
{
|
|
141
|
-
const data = await this.
|
|
146
|
+
const data = await this.fetchContent( url );
|
|
142
147
|
if ( !data ) return;
|
|
143
148
|
const dom = new JSDOM( data, { url });
|
|
144
149
|
const { document } = dom.window;
|
|
@@ -150,9 +155,9 @@ class WebScraper
|
|
|
150
155
|
|
|
151
156
|
if ( article )
|
|
152
157
|
{
|
|
153
|
-
if ( this.
|
|
158
|
+
if ( this.hasValidPageContent( article.textContent ) )
|
|
154
159
|
{
|
|
155
|
-
const metadata = this.
|
|
160
|
+
const metadata = this.extractMetadata( url, document );
|
|
156
161
|
metadata.depth = depth;
|
|
157
162
|
this.saveArticle( url, article.textContent, metadata );
|
|
158
163
|
}
|
|
@@ -182,35 +187,23 @@ class WebScraper
|
|
|
182
187
|
}
|
|
183
188
|
}
|
|
184
189
|
|
|
185
|
-
async
|
|
190
|
+
async fetchContent ( url )
|
|
186
191
|
{
|
|
187
192
|
try
|
|
188
193
|
{
|
|
189
|
-
let axiosOptions = {};
|
|
190
|
-
if ( this.axiosHeaders )
|
|
191
|
-
{
|
|
192
|
-
axiosOptions.headers = this.axiosHeaders;
|
|
193
|
-
}
|
|
194
|
-
if ( this.axiosProxy )
|
|
195
|
-
{
|
|
196
|
-
axiosOptions.proxy = this.axiosProxy;
|
|
197
|
-
}
|
|
198
|
-
|
|
199
|
-
// Step 1: Make a GET request with a small timeout and limited data download
|
|
200
194
|
const response = await axios.get( url, {
|
|
201
|
-
...axiosOptions,
|
|
202
195
|
responseType: "stream",
|
|
203
196
|
maxRedirects: 5,
|
|
204
|
-
timeout: 70000
|
|
197
|
+
timeout: 70000,
|
|
198
|
+
...axiosOptions,
|
|
205
199
|
});
|
|
206
200
|
|
|
207
|
-
// Step 2: Check the Content-Type header from the response
|
|
208
201
|
const contentType = response.headers["content-type"] || "";
|
|
209
202
|
if ( !contentType.startsWith( "text" ) )
|
|
210
203
|
{
|
|
211
204
|
console.log( `Skipping non-HTML content for ${url}: Content-Type is ${contentType}` );
|
|
212
|
-
response.data.destroy();
|
|
213
|
-
return null;
|
|
205
|
+
response.data.destroy();
|
|
206
|
+
return null;
|
|
214
207
|
}
|
|
215
208
|
|
|
216
209
|
// Step 3: If Content-Type is HTML, read the full response
|
|
@@ -242,8 +235,8 @@ class WebScraper
|
|
|
242
235
|
for ( let index = 0; index < 10; index++ )
|
|
243
236
|
{
|
|
244
237
|
console.log( `Please solve the CAPTCHA on the opened browser window for ${url}` );
|
|
245
|
-
result = await this.
|
|
246
|
-
if ( this.
|
|
238
|
+
result = await this.navigateToPage( url ) ;
|
|
239
|
+
if ( this.hasValidPageContent( result.htmlContent ) )
|
|
247
240
|
{
|
|
248
241
|
break
|
|
249
242
|
}
|
|
@@ -261,7 +254,7 @@ class WebScraper
|
|
|
261
254
|
}
|
|
262
255
|
}
|
|
263
256
|
|
|
264
|
-
async
|
|
257
|
+
async navigateToPage ( url )
|
|
265
258
|
{
|
|
266
259
|
let pages = await this.puppeteerBrowser.pages();
|
|
267
260
|
let page = pages[0];
|
|
@@ -530,7 +523,7 @@ class WebScraper
|
|
|
530
523
|
return filteredMetadata;
|
|
531
524
|
}
|
|
532
525
|
|
|
533
|
-
|
|
526
|
+
extractMetadata ( url, document )
|
|
534
527
|
{
|
|
535
528
|
return {
|
|
536
529
|
url,
|
|
@@ -634,7 +627,7 @@ class WebScraper
|
|
|
634
627
|
}
|
|
635
628
|
}
|
|
636
629
|
|
|
637
|
-
|
|
630
|
+
hasValidPageContent ( content )
|
|
638
631
|
{
|
|
639
632
|
// Remove whitespace and newlines for checking
|
|
640
633
|
const cleanContent = content.replace( /\s+/g, " " ).trim().toLowerCase();
|