clean-web-scraper 3.5.3 → 3.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/package.json +1 -1
  2. package/src/WebScraper.js +22 -25
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.5.3",
3
+ "version": "3.5.4",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -73,6 +73,15 @@ class WebScraper
73
73
  // Network configuration
74
74
  this.axiosHeaders = axiosHeaders;
75
75
  this.axiosProxy = axiosProxy;
76
+ this.axiosOptions = {};
77
+ if ( this.axiosHeaders )
78
+ {
79
+ axiosOptions.headers = this.axiosHeaders;
80
+ }
81
+ if ( this.axiosProxy )
82
+ {
83
+ axiosOptions.proxy = this.axiosProxy;
84
+ }
76
85
 
77
86
  // Content storage
78
87
  this.allProcessedContent = [];
@@ -138,7 +147,7 @@ class WebScraper
138
147
  }
139
148
  try
140
149
  {
141
- const data = await this.caller( url );
150
+ const data = await this.fetchContent( url );
142
151
  if ( !data ) return;
143
152
  const dom = new JSDOM( data, { url });
144
153
  const { document } = dom.window;
@@ -150,9 +159,9 @@ class WebScraper
150
159
 
151
160
  if ( article )
152
161
  {
153
- if ( this.isValidContent( article.textContent ) )
162
+ if ( this.hasValidPageContent( article.textContent ) )
154
163
  {
155
- const metadata = this.metadataextractor( url, document );
164
+ const metadata = this.extractMetadata( url, document );
156
165
  metadata.depth = depth;
157
166
  this.saveArticle( url, article.textContent, metadata );
158
167
  }
@@ -182,35 +191,23 @@ class WebScraper
182
191
  }
183
192
  }
184
193
 
185
- async caller ( url )
194
+ async fetchContent ( url )
186
195
  {
187
196
  try
188
197
  {
189
- let axiosOptions = {};
190
- if ( this.axiosHeaders )
191
- {
192
- axiosOptions.headers = this.axiosHeaders;
193
- }
194
- if ( this.axiosProxy )
195
- {
196
- axiosOptions.proxy = this.axiosProxy;
197
- }
198
-
199
- // Step 1: Make a GET request with a small timeout and limited data download
200
198
  const response = await axios.get( url, {
201
- ...axiosOptions,
202
199
  responseType: "stream",
203
200
  maxRedirects: 5,
204
- timeout: 70000
201
+ timeout: 70000,
202
+ ...axiosOptions,
205
203
  });
206
204
 
207
- // Step 2: Check the Content-Type header from the response
208
205
  const contentType = response.headers["content-type"] || "";
209
206
  if ( !contentType.startsWith( "text" ) )
210
207
  {
211
208
  console.log( `Skipping non-HTML content for ${url}: Content-Type is ${contentType}` );
212
- response.data.destroy(); // Destroy the stream to stop downloading further data
213
- return null; // Skip further processing for non-HTML content
209
+ response.data.destroy();
210
+ return null;
214
211
  }
215
212
 
216
213
  // Step 3: If Content-Type is HTML, read the full response
@@ -242,8 +239,8 @@ class WebScraper
242
239
  for ( let index = 0; index < 10; index++ )
243
240
  {
244
241
  console.log( `Please solve the CAPTCHA on the opened browser window for ${url}` );
245
- result = await this.goToUrl( url ) ;
246
- if ( this.isValidContent( result.htmlContent ) )
242
+ result = await this.navigateToPage( url ) ;
243
+ if ( this.hasValidPageContent( result.htmlContent ) )
247
244
  {
248
245
  break
249
246
  }
@@ -261,7 +258,7 @@ class WebScraper
261
258
  }
262
259
  }
263
260
 
264
- async goToUrl ( url )
261
+ async navigateToPage ( url )
265
262
  {
266
263
  let pages = await this.puppeteerBrowser.pages();
267
264
  let page = pages[0];
@@ -530,7 +527,7 @@ class WebScraper
530
527
  return filteredMetadata;
531
528
  }
532
529
 
533
- metadataextractor ( url, document )
530
+ extractMetadata ( url, document )
534
531
  {
535
532
  return {
536
533
  url,
@@ -634,7 +631,7 @@ class WebScraper
634
631
  }
635
632
  }
636
633
 
637
- isValidContent ( content )
634
+ hasValidPageContent ( content )
638
635
  {
639
636
  // Remove whitespace and newlines for checking
640
637
  const cleanContent = content.replace( /\s+/g, " " ).trim().toLowerCase();