clean-web-scraper 3.3.7 → 3.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/example-usage.js +2 -4
- package/package.json +1 -1
- package/src/WebScraper.js +37 -6
package/example-usage.js
CHANGED
|
@@ -100,7 +100,8 @@ async function electronicintifada ()
|
|
|
100
100
|
"https://electronicintifada.net/tags/",
|
|
101
101
|
"https://electronicintifada.net/blog",
|
|
102
102
|
"https://electronicintifada.net/people",
|
|
103
|
-
"https://electronicintifada.net/location"
|
|
103
|
+
"https://electronicintifada.net/location",
|
|
104
|
+
"https://electronicintifada.net/file"
|
|
104
105
|
],
|
|
105
106
|
exactExcludeList: [
|
|
106
107
|
"https://electronicintifada.net",
|
|
@@ -133,9 +134,6 @@ void async function main ()
|
|
|
133
134
|
electronicintifadaScraper
|
|
134
135
|
] );
|
|
135
136
|
|
|
136
|
-
// 4
|
|
137
|
-
// https://electronicintifada.net/
|
|
138
|
-
|
|
139
137
|
// 5
|
|
140
138
|
// https://www.palestineremembered.com/ZionistFAQ.html
|
|
141
139
|
|
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
|
@@ -20,7 +20,7 @@ class WebScraper
|
|
|
20
20
|
textOutputPath,
|
|
21
21
|
csvOutputPath,
|
|
22
22
|
includeMetadata = false,
|
|
23
|
-
metadataFields = [], // ['title', 'description', 'author',
|
|
23
|
+
metadataFields = [], // ['title', 'description', 'author', etc.]
|
|
24
24
|
headers,
|
|
25
25
|
usePuppeteer,
|
|
26
26
|
puppeteerProxy, // e.g. http://127.0.0.1:2080
|
|
@@ -159,7 +159,7 @@ class WebScraper
|
|
|
159
159
|
}
|
|
160
160
|
catch ( error )
|
|
161
161
|
{
|
|
162
|
-
console.error( `Error fetching ${url}:`, error.message );
|
|
162
|
+
console.error( `Error fetching ${url}:`, error.message, error.code );
|
|
163
163
|
}
|
|
164
164
|
}
|
|
165
165
|
|
|
@@ -167,13 +167,44 @@ class WebScraper
|
|
|
167
167
|
{
|
|
168
168
|
try
|
|
169
169
|
{
|
|
170
|
-
let
|
|
170
|
+
let axiosOptions = {};
|
|
171
171
|
if ( this.headers )
|
|
172
172
|
{
|
|
173
|
-
|
|
173
|
+
axiosOptions.headers = this.headers;
|
|
174
174
|
}
|
|
175
|
-
|
|
176
|
-
|
|
175
|
+
|
|
176
|
+
// Step 1: Make a streaming GET request (70 s timeout) so the body is not buffered before the Content-Type is checked
|
|
177
|
+
const response = await axios.get( url, {
|
|
178
|
+
...axiosOptions,
|
|
179
|
+
responseType: "stream",
|
|
180
|
+
maxRedirects: 5,
|
|
181
|
+
timeout: 70000
|
|
182
|
+
});
|
|
183
|
+
|
|
184
|
+
// Step 2: Check the Content-Type header from the response
|
|
185
|
+
const contentType = response.headers["content-type"] || "";
|
|
186
|
+
if ( !contentType.startsWith( "text" ) )
|
|
187
|
+
{
|
|
188
|
+
console.log( `Skipping non-HTML content for ${url}: Content-Type is ${contentType}` );
|
|
189
|
+
response.data.destroy(); // Destroy the stream to stop downloading further data
|
|
190
|
+
return null; // Skip further processing for non-HTML content
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
// Step 3: Content-Type starts with "text" (e.g. text/html), so read the full response body
|
|
194
|
+
let htmlContent = "";
|
|
195
|
+
response.data.on( "data", ( chunk ) =>
|
|
196
|
+
{
|
|
197
|
+
htmlContent += chunk.toString();
|
|
198
|
+
});
|
|
199
|
+
|
|
200
|
+
// Wait for the stream to finish
|
|
201
|
+
await new Promise( ( resolve, reject ) =>
|
|
202
|
+
{
|
|
203
|
+
response.data.on( "end", resolve );
|
|
204
|
+
response.data.on( "error", reject );
|
|
205
|
+
});
|
|
206
|
+
|
|
207
|
+
return htmlContent;
|
|
177
208
|
}
|
|
178
209
|
catch ( error )
|
|
179
210
|
{
|