clean-web-scraper 3.3.5 → 3.3.6

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/example-usage.js CHANGED
@@ -83,7 +83,7 @@ async function bdsmovement ()
83
83
  host: "socks5://127.0.0.1",
84
84
  port: "2080",
85
85
  },
86
- usePuppeteer: true
86
+ // usePuppeteer: true
87
87
  });
88
88
  await scraper.start();
89
89
  return scraper;
@@ -95,12 +95,19 @@ async function electronicintifada ()
95
95
  const scraper = new WebScraper({
96
96
  baseURL: "https://electronicintifada.net",
97
97
  excludeList: [
98
- "https://electronicintifada.net/press-area",
99
- "https://electronicintifada.net/privacy-policy",
100
- "https://electronicintifada.net/get-involved/join-a-bds-campaign",
101
- "https://electronicintifada.net/donate_",
102
- "https://electronicintifada.net/user",
103
- "https://electronicintifada.net/admin"
98
+ "https://electronicintifada.net/updates",
99
+ "https://electronicintifada.net/taxonomy/term/",
100
+ "https://electronicintifada.net/tags/",
101
+ "https://electronicintifada.net/blog",
102
+ "https://electronicintifada.net/people",
103
+ "https://electronicintifada.net/location"
104
+ ],
105
+ exactExcludeList: [
106
+ "https://electronicintifada.net",
107
+ "https://electronicintifada.net/blog",
108
+ "https://electronicintifada.net/news",
109
+ "https://electronicintifada.net/opinion",
110
+ "https://electronicintifada.net/review",
104
111
  ],
105
112
  scrapResultPath: "./dataset/electronicintifada/website",
106
113
  jsonlOutputPath: "./dataset/electronicintifada/train.jsonl",
@@ -118,10 +125,12 @@ void async function main ()
118
125
  const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
119
126
  const decolonizepalestineScraper = await decolonizepalestine();
120
127
  const bdsmovementScraper = await bdsmovement();
128
+ const electronicintifadaScraper = await electronicintifada();
121
129
  await WebScraper.combineResults( "./dataset/combined", [
122
130
  khameneiIrFreePalestineTagScraper,
123
131
  decolonizepalestineScraper,
124
- bdsmovementScraper
132
+ bdsmovementScraper,
133
+ electronicintifadaScraper
125
134
  ] );
126
135
 
127
136
  // 4
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.3.5",
3
+ "version": "3.3.6",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -437,6 +437,23 @@ class WebScraper
437
437
  // processed = processed.replace(/\[.*?\]/g, ''); // Removes all content within square brackets
438
438
  // processed = processed.replace(/\(.*?\)/g, ''); // Removes all content within parentheses
439
439
 
440
+ // Remove specified words from the end of content, handling multiple occurrences
441
+ const wordsToTrim = ["Facebook", "Twitter", "Donate Now", "Instagram"];
442
+ let changed = true;
443
+
444
+ while ( changed )
445
+ {
446
+ changed = false;
447
+ for ( let i = 0; i < wordsToTrim.length; i++ )
448
+ {
449
+ const oldProcessed = processed;
450
+ processed = processed.replace( new RegExp( `\\s*${wordsToTrim[i]}\\s*$`, "g" ), "" ).trim();
451
+ if ( oldProcessed !== processed )
452
+ {
453
+ changed = true;
454
+ }
455
+ }
456
+ }
440
457
  return processed;
441
458
  }
442
459