clean-web-scraper 3.3.8 → 3.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/WebScraper.js +41 -6
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
|
@@ -118,13 +118,17 @@ class WebScraper
|
|
|
118
118
|
return;
|
|
119
119
|
}
|
|
120
120
|
this.visited.add( url );
|
|
121
|
+
if ( !this.isValidFileType( url ) )
|
|
122
|
+
{
|
|
123
|
+
return;
|
|
124
|
+
}
|
|
121
125
|
try
|
|
122
126
|
{
|
|
123
127
|
const data = await this.caller( url );
|
|
124
128
|
const dom = new JSDOM( data, { url });
|
|
125
129
|
const { document } = dom.window;
|
|
126
130
|
|
|
127
|
-
if ( !this.isExcluded( url )
|
|
131
|
+
if ( !this.isExcluded( url ) )
|
|
128
132
|
{
|
|
129
133
|
const reader = new Readability( document, { charThreshold: 500, nbTopCandidates: 20 });
|
|
130
134
|
const article = reader.parse();
|
|
@@ -159,7 +163,7 @@ class WebScraper
|
|
|
159
163
|
}
|
|
160
164
|
catch ( error )
|
|
161
165
|
{
|
|
162
|
-
console.error( `Error fetching ${url}:`, error.message );
|
|
166
|
+
console.error( `Error fetching ${url}:`, error.message, error.code );
|
|
163
167
|
}
|
|
164
168
|
}
|
|
165
169
|
|
|
@@ -167,13 +171,44 @@ class WebScraper
|
|
|
167
171
|
{
|
|
168
172
|
try
|
|
169
173
|
{
|
|
170
|
-
let
|
|
174
|
+
let axiosOptions = {};
|
|
171
175
|
if ( this.headers )
|
|
172
176
|
{
|
|
173
|
-
|
|
177
|
+
axiosOptions.headers = this.headers;
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
// Step 1: Make a GET request with a small timeout and limited data download
|
|
181
|
+
const response = await axios.get( url, {
|
|
182
|
+
...axiosOptions,
|
|
183
|
+
responseType: "stream",
|
|
184
|
+
maxRedirects: 5,
|
|
185
|
+
timeout: 70000
|
|
186
|
+
});
|
|
187
|
+
|
|
188
|
+
// Step 2: Check the Content-Type header from the response
|
|
189
|
+
const contentType = response.headers["content-type"] || "";
|
|
190
|
+
if ( !contentType.startsWith( "text" ) )
|
|
191
|
+
{
|
|
192
|
+
console.log( `Skipping non-HTML content for ${url}: Content-Type is ${contentType}` );
|
|
193
|
+
response.data.destroy(); // Destroy the stream to stop downloading further data
|
|
194
|
+
return null; // Skip further processing for non-HTML content
|
|
174
195
|
}
|
|
175
|
-
|
|
176
|
-
|
|
196
|
+
|
|
197
|
+
// Step 3: If Content-Type is HTML, read the full response
|
|
198
|
+
let htmlContent = "";
|
|
199
|
+
response.data.on( "data", ( chunk ) =>
|
|
200
|
+
{
|
|
201
|
+
htmlContent += chunk.toString();
|
|
202
|
+
});
|
|
203
|
+
|
|
204
|
+
// Wait for the stream to finish
|
|
205
|
+
await new Promise( ( resolve, reject ) =>
|
|
206
|
+
{
|
|
207
|
+
response.data.on( "end", resolve );
|
|
208
|
+
response.data.on( "error", reject );
|
|
209
|
+
});
|
|
210
|
+
|
|
211
|
+
return htmlContent;
|
|
177
212
|
}
|
|
178
213
|
catch ( error )
|
|
179
214
|
{
|