clean-web-scraper 3.8.0 → 3.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/example-usage.js +21 -5
- package/main.js +7 -1
- package/package.json +1 -1
package/example-usage.js
CHANGED
@@ -240,7 +240,20 @@ async function mondoweiss ( enable )
|
|
240
240
|
"https://mondoweiss.net/donate",
|
241
241
|
"https://mondoweiss.net/advertise/",
|
242
242
|
"https://mondoweiss.net/contact/",
|
243
|
-
"https://mondoweiss.net/recent-comments/"
|
243
|
+
"https://mondoweiss.net/recent-comments/",
|
244
|
+
"https://mondoweiss.net/email-newsletters",
|
245
|
+
"https://mondoweiss.net/author",
|
246
|
+
"https://mondoweiss.net/tag/"
|
247
|
+
],
|
248
|
+
exactExcludeList: [
|
249
|
+
"https://mondoweiss.net",
|
250
|
+
"https://mondoweiss.net/news/",
|
251
|
+
"https://mondoweiss.net/opinion/",
|
252
|
+
"https://mondoweiss.net/ways-to-give/",
|
253
|
+
"https://mondoweiss.net/media-analysis/",
|
254
|
+
"https://mondoweiss.net/culture/",
|
255
|
+
"https://mondoweiss.net/activism/",
|
256
|
+
"https://mondoweiss.net/news-letters/"
|
244
257
|
],
|
245
258
|
scrapResultPath: "./dataset/mondoweiss/website",
|
246
259
|
jsonlOutputPath: "./dataset/mondoweiss/train.jsonl",
|
@@ -248,7 +261,13 @@ async function mondoweiss ( enable )
|
|
248
261
|
csvOutputPath: "./dataset/mondoweiss/train.csv",
|
249
262
|
includeMetadata: true,
|
250
263
|
maxArticles: 2500,
|
264
|
+
maxRetries: 2,
|
251
265
|
axiosHeaders: headers,
|
266
|
+
axiosProxy: {
|
267
|
+
host: "localhost",
|
268
|
+
port: 2080,
|
269
|
+
protocol: "http"
|
270
|
+
},
|
252
271
|
metadataFields: ["author", "title", "description", "dateScrapedDate"]
|
253
272
|
});
|
254
273
|
if ( enable )
|
@@ -278,7 +297,4 @@ void async function main ()
|
|
278
297
|
standWithPalestineScraper,
|
279
298
|
mondoweisScraper
|
280
299
|
] );
|
281
|
-
}()
|
282
|
-
|
283
|
-
|
284
|
-
// https://mondoweiss.net
|
300
|
+
}()
|
package/main.js
CHANGED
@@ -22,6 +22,7 @@ class WebScraper
|
|
22
22
|
exactExcludeList = [],
|
23
23
|
filterFileTypes,
|
24
24
|
excludedFileTypes,
|
25
|
+
removeURLFragment,
|
25
26
|
|
26
27
|
// Output paths
|
27
28
|
scrapResultPath = "./dataset",
|
@@ -72,6 +73,7 @@ class WebScraper
|
|
72
73
|
this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
|
73
74
|
this.filterFileTypes = filterFileTypes || true;
|
74
75
|
this.excludedFileTypes = excludedFileTypes || [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"];
|
76
|
+
this.removeURLFragment = removeURLFragment || true;
|
75
77
|
|
76
78
|
// Network configuration
|
77
79
|
this.axiosHeaders = axiosHeaders;
|
@@ -130,6 +132,10 @@ class WebScraper
|
|
130
132
|
|
131
133
|
async fetchPage ( url, depth )
|
132
134
|
{
|
135
|
+
if ( this.removeURLFragment )
|
136
|
+
{
|
137
|
+
url = url.split( "#" )[0];
|
138
|
+
}
|
133
139
|
if ( this.hasReachedMax( depth ) )
|
134
140
|
{
|
135
141
|
return;
|
@@ -233,7 +239,7 @@ class WebScraper
|
|
233
239
|
}
|
234
240
|
catch ( error )
|
235
241
|
{
|
236
|
-
console.error( `Error fetching ${url}:`, error.message );
|
242
|
+
console.error( `Error fetching content ${url}:`, error.message );
|
237
243
|
if ( error.status = 403 && this.usePuppeteer )
|
238
244
|
{
|
239
245
|
try
|