clean-web-scraper 3.3.7 → 3.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/example-usage.js CHANGED
@@ -100,7 +100,8 @@ async function electronicintifada ()
100
100
  "https://electronicintifada.net/tags/",
101
101
  "https://electronicintifada.net/blog",
102
102
  "https://electronicintifada.net/people",
103
- "https://electronicintifada.net/location"
103
+ "https://electronicintifada.net/location",
104
+ "https://electronicintifada.net/file"
104
105
  ],
105
106
  exactExcludeList: [
106
107
  "https://electronicintifada.net",
@@ -133,9 +134,6 @@ void async function main ()
133
134
  electronicintifadaScraper
134
135
  ] );
135
136
 
136
- // 4
137
- // https://electronicintifada.net/
138
-
139
137
  // 5
140
138
  // https://www.palestineremembered.com/ZionistFAQ.html
141
139
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.3.7",
3
+ "version": "3.4.0",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -20,7 +20,7 @@ class WebScraper
20
20
  textOutputPath,
21
21
  csvOutputPath,
22
22
  includeMetadata = false,
23
- metadataFields = [], // ['title', 'description', 'author', 'lastModified', etc.]
23
+ metadataFields = [], // ['title', 'description', 'author', etc.]
24
24
  headers,
25
25
  usePuppeteer,
26
26
  puppeteerProxy, // e.g. http://127.0.0.1:2080
@@ -159,7 +159,7 @@ class WebScraper
159
159
  }
160
160
  catch ( error )
161
161
  {
162
- console.error( `Error fetching ${url}:`, error.message );
162
+ console.error( `Error fetching ${url}:`, error.message, error.code );
163
163
  }
164
164
  }
165
165
 
@@ -167,13 +167,44 @@ class WebScraper
167
167
  {
168
168
  try
169
169
  {
170
- let axiosOptinos = {}
170
+ let axiosOptions = {};
171
171
  if ( this.headers )
172
172
  {
173
- axiosOptinos.headers = this.headers
173
+ axiosOptions.headers = this.headers;
174
174
  }
175
- const result = await axios.get( url, axiosOptinos );
176
- return result.data
175
+
176
+ // Step 1: Make a streaming GET request (70000 ms timeout, at most 5 redirects) so the download can be aborted once the headers are inspected
177
+ const response = await axios.get( url, {
178
+ ...axiosOptions,
179
+ responseType: "stream",
180
+ maxRedirects: 5,
181
+ timeout: 70000
182
+ });
183
+
184
+ // Step 2: Check the Content-Type header from the response
185
+ const contentType = response.headers["content-type"] || "";
186
+ if ( !contentType.startsWith( "text" ) )
187
+ {
188
+ console.log( `Skipping non-HTML content for ${url}: Content-Type is ${contentType}` );
189
+ response.data.destroy(); // Destroy the stream to stop downloading further data
190
+ return null; // Skip further processing for non-HTML content
191
+ }
192
+
193
+ // Step 3: Content-Type starts with "text" (HTML or any other text type), so read the full response body
194
+ let htmlContent = "";
195
+ response.data.on( "data", ( chunk ) =>
196
+ {
197
+ htmlContent += chunk.toString();
198
+ });
199
+
200
+ // Wait for the stream to finish
201
+ await new Promise( ( resolve, reject ) =>
202
+ {
203
+ response.data.on( "end", resolve );
204
+ response.data.on( "error", reject );
205
+ });
206
+
207
+ return htmlContent;
177
208
  }
178
209
  catch ( error )
179
210
  {