clean-web-scraper 3.3.5 → 3.3.7

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
package/example-usage.js CHANGED
@@ -83,7 +83,7 @@ async function bdsmovement ()
83
83
  host: "socks5://127.0.0.1",
84
84
  port: "2080",
85
85
  },
86
- usePuppeteer: true
86
+ // usePuppeteer: true
87
87
  });
88
88
  await scraper.start();
89
89
  return scraper;
@@ -95,12 +95,19 @@ async function electronicintifada ()
95
95
  const scraper = new WebScraper({
96
96
  baseURL: "https://electronicintifada.net",
97
97
  excludeList: [
98
- "https://electronicintifada.net/press-area",
99
- "https://electronicintifada.net/privacy-policy",
100
- "https://electronicintifada.net/get-involved/join-a-bds-campaign",
101
- "https://electronicintifada.net/donate_",
102
- "https://electronicintifada.net/user",
103
- "https://electronicintifada.net/admin"
98
+ "https://electronicintifada.net/updates",
99
+ "https://electronicintifada.net/taxonomy/term/",
100
+ "https://electronicintifada.net/tags/",
101
+ "https://electronicintifada.net/blog",
102
+ "https://electronicintifada.net/people",
103
+ "https://electronicintifada.net/location"
104
+ ],
105
+ exactExcludeList: [
106
+ "https://electronicintifada.net",
107
+ "https://electronicintifada.net/blog",
108
+ "https://electronicintifada.net/news",
109
+ "https://electronicintifada.net/opinion",
110
+ "https://electronicintifada.net/review",
104
111
  ],
105
112
  scrapResultPath: "./dataset/electronicintifada/website",
106
113
  jsonlOutputPath: "./dataset/electronicintifada/train.jsonl",
@@ -118,10 +125,12 @@ void async function main ()
118
125
  const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
119
126
  const decolonizepalestineScraper = await decolonizepalestine();
120
127
  const bdsmovementScraper = await bdsmovement();
128
+ const electronicintifadaScraper = await electronicintifada();
121
129
  await WebScraper.combineResults( "./dataset/combined", [
122
130
  khameneiIrFreePalestineTagScraper,
123
131
  decolonizepalestineScraper,
124
- bdsmovementScraper
132
+ bdsmovementScraper,
133
+ electronicintifadaScraper
125
134
  ] );
126
135
 
127
136
  // 4
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.3.5",
3
+ "version": "3.3.7",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -25,7 +25,9 @@ class WebScraper
25
25
  usePuppeteer,
26
26
  puppeteerProxy, // e.g. http://127.0.0.1:2080
27
27
  puppeteerExecutablePath,
28
- puppeteerRealProxy
28
+ puppeteerRealProxy,
29
+ filterFileTypes = true,
30
+ excludedFileTypes = [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"]
29
31
  })
30
32
  {
31
33
  this.baseURL = baseURL;
@@ -44,6 +46,8 @@ class WebScraper
44
46
  this.excludeList = this.normalizeExcludeList( excludeList );
45
47
  this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
46
48
  this.allProcessedContent = [];
49
+ this.filterFileTypes = filterFileTypes;
50
+ this.excludedFileTypes = excludedFileTypes;
47
51
  this.usePuppeteer = usePuppeteer || false;
48
52
  this.puppeteerOptions = {
49
53
  headless: false,
@@ -120,7 +124,7 @@ class WebScraper
120
124
  const dom = new JSDOM( data, { url });
121
125
  const { document } = dom.window;
122
126
 
123
- if ( !this.isExcluded( url ) )
127
+ if ( !this.isExcluded( url ) && this.isValidFileType( url ) )
124
128
  {
125
129
  const reader = new Readability( document, { charThreshold: 500, nbTopCandidates: 20 });
126
130
  const article = reader.parse();
@@ -437,6 +441,23 @@ class WebScraper
437
441
  // processed = processed.replace(/\[.*?\]/g, ''); // Removes all content within square brackets
438
442
  // processed = processed.replace(/\(.*?\)/g, ''); // Removes all content within parentheses
439
443
 
444
+ // Remove specified words from the end of content, handling multiple occurrences
445
+ const wordsToTrim = ["Facebook", "Twitter", "Donate Now", "Instagram"];
446
+ let changed = true;
447
+
448
+ while ( changed )
449
+ {
450
+ changed = false;
451
+ for ( let i = 0; i < wordsToTrim.length; i++ )
452
+ {
453
+ const oldProcessed = processed;
454
+ processed = processed.replace( new RegExp( `\\s*${wordsToTrim[i]}\\s*$`, "g" ), "" ).trim();
455
+ if ( oldProcessed !== processed )
456
+ {
457
+ changed = true;
458
+ }
459
+ }
460
+ }
440
461
  return processed;
441
462
  }
442
463
 
@@ -531,6 +552,13 @@ class WebScraper
531
552
  fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
532
553
  }
533
554
 
555
+ isValidFileType ( url )
556
+ {
557
+ if ( !this.filterFileTypes ) return true;
558
+ const urlPath = new URL( url ).pathname.toLowerCase();
559
+ return !this.excludedFileTypes.some( ext => { return urlPath.endsWith( ext ) });
560
+ }
561
+
534
562
  isValidContent ( content )
535
563
  {
536
564
  // Remove whitespace and newlines for checking