clean-web-scraper 3.5.3 → 3.5.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/package.json +1 -1
  2. package/src/WebScraper.js +23 -30
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.5.3",
3
+ "version": "3.5.5",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -73,6 +73,15 @@ class WebScraper
73
73
  // Network configuration
74
74
  this.axiosHeaders = axiosHeaders;
75
75
  this.axiosProxy = axiosProxy;
76
+ this.axiosOptions = {};
77
+ if ( this.axiosHeaders )
78
+ {
79
+ axiosOptions.headers = this.axiosHeaders;
80
+ }
81
+ if ( this.axiosProxy )
82
+ {
83
+ axiosOptions.proxy = this.axiosProxy;
84
+ }
76
85
 
77
86
  // Content storage
78
87
  this.allProcessedContent = [];
@@ -128,17 +137,13 @@ class WebScraper
128
137
  return;
129
138
  }
130
139
  this.visited.add( url );
131
- if ( !this.isValidFileType( url ) )
132
- {
133
- return;
134
- }
135
- if ( !this.isValidDomain( url ) )
140
+ if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
136
141
  {
137
142
  return;
138
143
  }
139
144
  try
140
145
  {
141
- const data = await this.caller( url );
146
+ const data = await this.fetchContent( url );
142
147
  if ( !data ) return;
143
148
  const dom = new JSDOM( data, { url });
144
149
  const { document } = dom.window;
@@ -150,9 +155,9 @@ class WebScraper
150
155
 
151
156
  if ( article )
152
157
  {
153
- if ( this.isValidContent( article.textContent ) )
158
+ if ( this.hasValidPageContent( article.textContent ) )
154
159
  {
155
- const metadata = this.metadataextractor( url, document );
160
+ const metadata = this.extractMetadata( url, document );
156
161
  metadata.depth = depth;
157
162
  this.saveArticle( url, article.textContent, metadata );
158
163
  }
@@ -182,35 +187,23 @@ class WebScraper
182
187
  }
183
188
  }
184
189
 
185
- async caller ( url )
190
+ async fetchContent ( url )
186
191
  {
187
192
  try
188
193
  {
189
- let axiosOptions = {};
190
- if ( this.axiosHeaders )
191
- {
192
- axiosOptions.headers = this.axiosHeaders;
193
- }
194
- if ( this.axiosProxy )
195
- {
196
- axiosOptions.proxy = this.axiosProxy;
197
- }
198
-
199
- // Step 1: Make a GET request with a small timeout and limited data download
200
194
  const response = await axios.get( url, {
201
- ...axiosOptions,
202
195
  responseType: "stream",
203
196
  maxRedirects: 5,
204
- timeout: 70000
197
+ timeout: 70000,
198
+ ...axiosOptions,
205
199
  });
206
200
 
207
- // Step 2: Check the Content-Type header from the response
208
201
  const contentType = response.headers["content-type"] || "";
209
202
  if ( !contentType.startsWith( "text" ) )
210
203
  {
211
204
  console.log( `Skipping non-HTML content for ${url}: Content-Type is ${contentType}` );
212
- response.data.destroy(); // Destroy the stream to stop downloading further data
213
- return null; // Skip further processing for non-HTML content
205
+ response.data.destroy();
206
+ return null;
214
207
  }
215
208
 
216
209
  // Step 3: If Content-Type is HTML, read the full response
@@ -242,8 +235,8 @@ class WebScraper
242
235
  for ( let index = 0; index < 10; index++ )
243
236
  {
244
237
  console.log( `Please solve the CAPTCHA on the opened browser window for ${url}` );
245
- result = await this.goToUrl( url ) ;
246
- if ( this.isValidContent( result.htmlContent ) )
238
+ result = await this.navigateToPage( url ) ;
239
+ if ( this.hasValidPageContent( result.htmlContent ) )
247
240
  {
248
241
  break
249
242
  }
@@ -261,7 +254,7 @@ class WebScraper
261
254
  }
262
255
  }
263
256
 
264
- async goToUrl ( url )
257
+ async navigateToPage ( url )
265
258
  {
266
259
  let pages = await this.puppeteerBrowser.pages();
267
260
  let page = pages[0];
@@ -530,7 +523,7 @@ class WebScraper
530
523
  return filteredMetadata;
531
524
  }
532
525
 
533
- metadataextractor ( url, document )
526
+ extractMetadata ( url, document )
534
527
  {
535
528
  return {
536
529
  url,
@@ -634,7 +627,7 @@ class WebScraper
634
627
  }
635
628
  }
636
629
 
637
- isValidContent ( content )
630
+ hasValidPageContent ( content )
638
631
  {
639
632
  // Remove whitespace and newlines for checking
640
633
  const cleanContent = content.replace( /\s+/g, " " ).trim().toLowerCase();