clean-web-scraper 3.3.0 → 3.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/example-usage.js CHANGED
@@ -3,15 +3,12 @@ const WebScraper = require( "./src/WebScraper" );
3
3
 
4
4
  async function khameneiIrFreePalestineTag ()
5
5
  {
6
- // 1
7
6
  // https://english.khamenei.ir/Opinions/FreePalestine
8
7
  // https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#
9
8
  const scraper = new WebScraper({
10
9
  baseURL: "https://english.khamenei.ir/news",
11
10
  startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
12
11
  maxDepth: 1,
13
- excludeList: [
14
- ],
15
12
  exactExcludeList: [
16
13
  "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#"
17
14
  ],
@@ -20,7 +17,7 @@ async function khameneiIrFreePalestineTag ()
20
17
  textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
21
18
  csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
22
19
  includeMetadata: true,
23
- metadataFields: ["title", "description", "author", "lastModified", "language"]
20
+ metadataFields: ["title", "description", "author"]
24
21
  });
25
22
  await scraper.start();
26
23
  return scraper;
@@ -28,7 +25,6 @@ async function khameneiIrFreePalestineTag ()
28
25
 
29
26
  async function decolonizepalestine ()
30
27
  {
31
- // 2
32
28
  // https://decolonizepalestine.com
33
29
  const scraper = new WebScraper({
34
30
  baseURL: "https://decolonizepalestine.com",
@@ -48,23 +44,46 @@ async function decolonizepalestine ()
48
44
  textOutputPath: "./dataset/decolonizepalestine/texts",
49
45
  csvOutputPath: "./dataset/decolonizepalestine/train.csv",
50
46
  includeMetadata: true,
51
- metadataFields: ["title", "description", "author", "lastModified", "language"]
47
+ metadataFields: ["title", "description", "author"]
52
48
  });
53
49
  await scraper.start();
54
50
  return scraper;
55
51
  }
56
52
 
57
- void async function main ()
53
+ async function bdsmovement ()
58
54
  {
59
- const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
60
- const decolonizepalestineScraper = await decolonizepalestine();
61
- await WebScraper.combineResults( "./dataset/combined", [
62
- khameneiIrFreePalestineTagScraper,
63
- decolonizepalestineScraper
64
- ] );
55
+ // https://bdsmovement.net
56
+ const scraper = new WebScraper({
57
+ baseURL: "https://bdsmovement.net",
58
+ excludeList: [
59
+ "https://bdsmovement.net/press-area",
60
+ "https://bdsmovement.net/privacy-policy",
61
+ "https://bdsmovement.net/get-involved/join-a-bds-campaign",
62
+ "https://bdsmovement.net/donate_",
63
+ "https://bdsmovement.net/user",
64
+ "https://bdsmovement.net/admin"
65
+ ],
66
+ scrapResultPath: "./dataset/bdsmovement/website",
67
+ jsonlOutputPath: "./dataset/bdsmovement/train.jsonl",
68
+ textOutputPath: "./dataset/bdsmovement/texts",
69
+ csvOutputPath: "./dataset/bdsmovement/train.csv",
70
+ includeMetadata: true,
71
+ metadataFields: ["title", "description", "author"]
72
+ });
73
+ await scraper.start();
74
+ return scraper;
75
+ }
65
76
 
66
- // 3
67
- // https://bdsmovement.net
77
+ void async function main ()
78
+ {
79
+ // const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
80
+ // const decolonizepalestineScraper = await decolonizepalestine();
81
+ const bdsmovementScraper = await bdsmovement();
82
+ // await WebScraper.combineResults( "./dataset/combined", [
83
+ // khameneiIrFreePalestineTagScraper,
84
+ // decolonizepalestineScraper,
85
+ // bdsmovementScraper
86
+ // ] );
68
87
 
69
88
  // 4
70
89
  // https://electronicintifada.net/
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.3.0",
3
+ "version": "3.3.2",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -18,7 +18,12 @@ class WebScraper
18
18
  textOutputPath,
19
19
  csvOutputPath,
20
20
  includeMetadata = false,
21
- metadataFields = [] // ['title', 'description', 'author', 'lastModified', etc.]
21
+ metadataFields = [], // ['title', 'description', 'author', 'lastModified', etc.]
22
+ headers = {
23
+ "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
24
+ "Cache-Control": "private",
25
+ "Accept": "application/xml,application/xhtml+xml,text/html;q=0.9, text/plain;q=0.8,image/png,*/*;q=0.5"
26
+ }
22
27
  })
23
28
  {
24
29
  this.baseURL = baseURL;
@@ -30,10 +35,11 @@ class WebScraper
30
35
  this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
31
36
  this.jsonlOutputPathWithMeta = jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
32
37
  this.csvOutputPathWithMeta = csvOutputPath.replace( ".csv", "_with_metadata.csv" );
38
+ this.headers = headers;
33
39
  this.includeMetadata = includeMetadata;
34
40
  this.metadataFields = new Set( metadataFields );
35
41
  this.visited = new Set();
36
- this.excludeList = new Set( excludeList );
42
+ this.excludeList = this.normalizeExcludeList( excludeList );
37
43
  this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
38
44
  this.allProcessedContent = [];
39
45
  }
@@ -57,7 +63,9 @@ class WebScraper
57
63
  this.visited.add( url );
58
64
  try
59
65
  {
60
- const { data, headers } = await axios.get( url );
66
+ const { data, headers } = await axios.get( url, {
67
+ headers: this.headers,
68
+ });
61
69
  const dom = new JSDOM( data, { url });
62
70
  const { document } = dom.window;
63
71
 
@@ -68,9 +76,17 @@ class WebScraper
68
76
 
69
77
  if ( article )
70
78
  {
71
- const metadata = this.metadataextractor( url, document, headers );
72
- metadata.depth = depth;
73
- this.saveArticle( url, article.textContent, metadata );
79
+ if ( this.isValidContent( article.textContent ) )
80
+ {
81
+
82
+ const metadata = this.metadataextractor( url, document, headers );
83
+ metadata.depth = depth;
84
+ this.saveArticle( url, article.textContent, metadata );
85
+ }
86
+ else
87
+ {
88
+ console.error( `Invalid content found at ${url}` );
89
+ }
74
90
  }
75
91
  else
76
92
  {
@@ -334,7 +350,7 @@ class WebScraper
334
350
  };
335
351
  }
336
352
 
337
- normalizeExcludeList ( list )
353
+ normalizeExcludeList ( list = [] )
338
354
  {
339
355
  const normalizedSet = new Set();
340
356
  for ( let i = 0; i < list.length; i++ )
@@ -367,8 +383,27 @@ class WebScraper
367
383
  if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
368
384
  {
369
385
  fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
386
+ }
387
+ if ( fs.existsSync( path.join( __dirname, this.textOutputPath ) ) )
388
+ {
370
389
  fs.rmSync( path.join( __dirname, this.textOutputPath ), { recursive: true, force: true });
371
390
  }
391
+ if ( fs.existsSync( path.join( __dirname, this.csvOutputPath ) ) )
392
+ {
393
+ fs.rmSync( path.join( __dirname, this.csvOutputPath ), { recursive: true, force: true });
394
+ }
395
+ if ( fs.existsSync( path.join( __dirname, this.csvOutputPathWithMeta ) ) )
396
+ {
397
+ fs.rmSync( path.join( __dirname, this.csvOutputPathWithMeta ), { recursive: true, force: true });
398
+ }
399
+ if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPath ) ) )
400
+ {
401
+ fs.rmSync( path.join( __dirname, this.jsonlOutputPath ), { recursive: true, force: true });
402
+ }
403
+ if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPathWithMeta ) ) )
404
+ {
405
+ fs.rmSync( path.join( __dirname, this.jsonlOutputPathWithMeta ), { recursive: true, force: true });
406
+ }
372
407
  fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
373
408
  fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
374
409
  }
@@ -461,6 +496,30 @@ class WebScraper
461
496
  csvMetaOutput.end();
462
497
  }
463
498
 
499
+ isValidContent ( content )
500
+ {
501
+ // Remove whitespace and newlines for checking
502
+ const cleanContent = content.replace( /\s+/g, " " ).trim().toLowerCase();
503
+
504
+ // List of phrases that indicate invalid content
505
+ const invalidPhrases = [
506
+ "verifying that you are not a robot",
507
+ "checking if the site connection is secure",
508
+ "please wait while we verify",
509
+ "please enable javascript",
510
+ "access denied",
511
+ "captcha verification"
512
+ ];
513
+
514
+ const hasInvalidPhrases = invalidPhrases.some( phrase => { return cleanContent.includes( phrase ) });
515
+ // Check content length
516
+ if ( cleanContent.length < 100 && hasInvalidPhrases )
517
+ {
518
+ return false;
519
+ }
520
+ return true;
521
+ }
522
+
464
523
  static combineTextFiles ( fullOutputPath, websites )
465
524
  {
466
525
  let textFileCounter = 1;