clean-web-scraper 3.8.0 → 3.8.1

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/example-usage.js CHANGED
@@ -240,7 +240,20 @@ async function mondoweiss ( enable )
240
240
  "https://mondoweiss.net/donate",
241
241
  "https://mondoweiss.net/advertise/",
242
242
  "https://mondoweiss.net/contact/",
243
- "https://mondoweiss.net/recent-comments/"
243
+ "https://mondoweiss.net/recent-comments/",
244
+ "https://mondoweiss.net/email-newsletters",
245
+ "https://mondoweiss.net/author",
246
+ "https://mondoweiss.net/tag/"
247
+ ],
248
+ exactExcludeList: [
249
+ "https://mondoweiss.net",
250
+ "https://mondoweiss.net/news/",
251
+ "https://mondoweiss.net/opinion/",
252
+ "https://mondoweiss.net/ways-to-give/",
253
+ "https://mondoweiss.net/media-analysis/",
254
+ "https://mondoweiss.net/culture/",
255
+ "https://mondoweiss.net/activism/",
256
+ "https://mondoweiss.net/news-letters/"
244
257
  ],
245
258
  scrapResultPath: "./dataset/mondoweiss/website",
246
259
  jsonlOutputPath: "./dataset/mondoweiss/train.jsonl",
@@ -248,7 +261,13 @@ async function mondoweiss ( enable )
248
261
  csvOutputPath: "./dataset/mondoweiss/train.csv",
249
262
  includeMetadata: true,
250
263
  maxArticles: 2500,
264
+ maxRetries: 2,
251
265
  axiosHeaders: headers,
266
+ axiosProxy: {
267
+ host: "localhost",
268
+ port: 2080,
269
+ protocol: "http"
270
+ },
252
271
  metadataFields: ["author", "title", "description", "dateScrapedDate"]
253
272
  });
254
273
  if ( enable )
@@ -278,7 +297,4 @@ void async function main ()
278
297
  standWithPalestineScraper,
279
298
  mondoweisScraper
280
299
  ] );
281
- }()
282
-
283
-
284
- // https://mondoweiss.net
300
+ }()
package/main.js CHANGED
@@ -22,6 +22,7 @@ class WebScraper
22
22
  exactExcludeList = [],
23
23
  filterFileTypes,
24
24
  excludedFileTypes,
25
+ removeURLFragment,
25
26
 
26
27
  // Output paths
27
28
  scrapResultPath = "./dataset",
@@ -72,6 +73,7 @@ class WebScraper
72
73
  this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
73
74
  this.filterFileTypes = filterFileTypes || true;
74
75
  this.excludedFileTypes = excludedFileTypes || [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"];
76
+ this.removeURLFragment = removeURLFragment || true;
75
77
 
76
78
  // Network configuration
77
79
  this.axiosHeaders = axiosHeaders;
@@ -130,6 +132,10 @@ class WebScraper
130
132
 
131
133
  async fetchPage ( url, depth )
132
134
  {
135
+ if ( this.removeURLFragment )
136
+ {
137
+ url = url.split( "#" )[0];
138
+ }
133
139
  if ( this.hasReachedMax( depth ) )
134
140
  {
135
141
  return;
@@ -233,7 +239,7 @@ class WebScraper
233
239
  }
234
240
  catch ( error )
235
241
  {
236
- console.error( `Error fetching ${url}:`, error.message );
242
+ console.error( `Error fetching content ${url}:`, error.message );
237
243
  if ( error.status = 403 && this.usePuppeteer )
238
244
  {
239
245
  try
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.8.0",
3
+ "version": "3.8.1",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",