clean-web-scraper 3.5.1 → 3.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/example-usage.js CHANGED
@@ -8,8 +8,36 @@ const WebScraper = require( "./src/WebScraper" );
8
8
  // "Cookie": cookies
9
9
  // }
10
10
 
11
+ async function palianswers ( enable )
12
+ {
13
+ const scraper = new WebScraper({
14
+ baseURL: "https://palianswers.com",
15
+ excludeList: [
16
+ "https://palianswers.com/chat/",
17
+ "https://palianswers.com/become-a-volunteer/",
18
+ "https://palianswers.com/other-resources/",
19
+ "https://palianswers.com/request-a-rebuttal/",
20
+ "https://palianswers.com/submit-a-rebuttal/",
21
+ "https://palianswers.com/themes/"
22
+ ],
23
+ exactExcludeList: [
24
+ "https://palianswers.com/",
25
+ ],
26
+ scrapResultPath: "./dataset/palianswers/website",
27
+ jsonlOutputPath: "./dataset/palianswers/train.jsonl",
28
+ textOutputPath: "./dataset/palianswers/texts",
29
+ csvOutputPath: "./dataset/palianswers/train.csv",
30
+ includeMetadata: true,
31
+ metadataFields: ["title", "description", "author"]
32
+ });
33
+ if ( enable )
34
+ {
35
+ await scraper.start();
36
+ }
37
+ return scraper;
38
+ }
11
39
 
12
- async function khameneiIrFreePalestineTag ()
40
+ async function khameneiIrFreePalestineTag ( enable )
13
41
  {
14
42
  // https://english.khamenei.ir/Opinions/FreePalestine
15
43
  // https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#
@@ -135,6 +163,7 @@ async function electronicintifada ( enable )
135
163
 
136
164
  async function palestineremembered ( enable )
137
165
  {
166
+ // https://www.palestineremembered.com
138
167
  const scraper = new WebScraper({
139
168
  baseURL: "https://www.palestineremembered.com",
140
169
  startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
@@ -179,24 +208,21 @@ async function palestineremembered ( enable )
179
208
 
180
209
  void async function main ()
181
210
  {
182
- const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( false );
183
- const decolonizepalestineScraper = await decolonizepalestine( false );
211
+ const palianswersScraper = await palianswers( true );
212
+ const decolonizepalestineScraper = await decolonizepalestine( true );
213
+ const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( true );
184
214
  const bdsmovementScraper = await bdsmovement( false );
185
- const electronicintifadaScraper = await electronicintifada( false );
186
- const palestinerememberedScraper = await palestineremembered( true );
215
+ const electronicintifadaScraper = await electronicintifada( true );
216
+ const palestinerememberedScraper = await palestineremembered( false );
217
+
187
218
  await WebScraper.combineResults( "./dataset/combined", [
188
- khameneiIrFreePalestineTagScraper,
219
+ palianswersScraper,
189
220
  decolonizepalestineScraper,
190
- bdsmovementScraper,
221
+ khameneiIrFreePalestineTagScraper,
191
222
  electronicintifadaScraper,
192
- palestinerememberedScraper
223
+ // bdsmovementScraper,
224
+ // palestinerememberedScraper,
193
225
  ] );
194
226
 
195
- // 5
196
- // https://www.palestineremembered.com/ZionistFAQ.html
197
-
198
- // 6 https://the-palestinian-side.vercel.app/
199
-
200
227
  // 7 https://stand-with-palestine.org/blogs
201
228
  }()
202
-
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.5.1",
3
+ "version": "3.5.3",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -10,78 +10,79 @@ const { connect } = require( "puppeteer-real-browser" )
10
10
  class WebScraper
11
11
  {
12
12
  constructor ({
13
+ // Base configuration
13
14
  baseURL,
14
15
  startURL,
16
+ strictBaseURL = true,
15
17
  maxDepth = Infinity,
16
- maxArticles = Infinity, // Add this line
17
- excludeList,
18
- exactExcludeList,
18
+ maxArticles = Infinity,
19
+
20
+ // URL filtering
21
+ excludeList = [],
22
+ exactExcludeList = [],
23
+ filterFileTypes = true,
24
+ excludedFileTypes = [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"],
25
+
26
+ // Output paths
19
27
  scrapResultPath = "./dataset",
20
28
  jsonlOutputPath,
21
29
  textOutputPath,
22
30
  csvOutputPath,
31
+
32
+ // Metadata options
23
33
  includeMetadata = false,
24
- metadataFields = [], // ['title', 'description', 'author', etc.]
34
+ metadataFields = [],
35
+
36
+ // Network options
25
37
  axiosHeaders,
26
38
  axiosProxy,
39
+
40
+ // Puppeteer options
27
41
  usePuppeteer,
28
42
  puppeteerProxy, // e.g. http://127.0.0.1:2080
29
43
  puppeteerExecutablePath,
30
- puppeteerRealProxy,
31
- filterFileTypes = true,
32
- excludedFileTypes = [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"]
44
+ puppeteerRealProxy
33
45
  })
34
46
  {
47
+ // Base configuration
35
48
  this.baseURL = baseURL;
36
49
  this.startURL = startURL || baseURL;
50
+ this.strictBaseURL = strictBaseURL;
37
51
  this.maxDepth = maxDepth;
38
- this.maxArticles = maxArticles; // Add this line
52
+ this.maxArticles = maxArticles;
53
+
54
+ // Output paths setup
39
55
  this.scrapResultPath = scrapResultPath;
40
56
  this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
41
57
  this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
42
58
  this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
43
- this.jsonlOutputPathWithMeta = jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
44
- this.csvOutputPathWithMeta = csvOutputPath.replace( ".csv", "_with_metadata.csv" );
45
- this.axiosHeaders = axiosHeaders;
59
+ this.jsonlOutputPathWithMeta = this.jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
60
+ this.csvOutputPathWithMeta = this.csvOutputPath.replace( ".csv", "_with_metadata.csv" );
61
+
62
+ // Metadata configuration
46
63
  this.includeMetadata = includeMetadata;
47
- this.metadataFields = new Set( metadataFields );
64
+ this.metadataFields = new Set( metadataFields );
65
+
66
+ // URL filtering setup
48
67
  this.visited = new Set();
49
68
  this.excludeList = this.normalizeExcludeList( excludeList );
50
69
  this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
51
- this.allProcessedContent = [];
52
70
  this.filterFileTypes = filterFileTypes;
53
71
  this.excludedFileTypes = excludedFileTypes;
72
+
73
+ // Network configuration
74
+ this.axiosHeaders = axiosHeaders;
54
75
  this.axiosProxy = axiosProxy;
76
+
77
+ // Content storage
78
+ this.allProcessedContent = [];
79
+
80
+ // Puppeteer configuration
55
81
  this.usePuppeteer = usePuppeteer || false;
56
- this.puppeteerOptions = {
57
- headless: false,
58
- userDataDir: "./tmp/browser",
59
- defaultViewport: null,
60
- args: [
61
- "--start-maximized"
62
- ],
63
- "ignoreDefaultArgs": true,
64
- }
65
- if ( puppeteerProxy )
66
- {
67
- this.puppeteerOptions.args.push( `--proxy-server=${puppeteerProxy}` );
68
- }
69
- if ( puppeteerExecutablePath )
70
- {
71
- this.puppeteerOptions.executablePath = puppeteerExecutablePath;
72
- }
73
- this.puppeteerRealOptions = {
74
- headless: false,
75
- args: [],
76
- customConfig: {},
77
- turnstile: true,
78
- connectOption: {},
79
- disableXvfb: false,
80
- ignoreAllFlags: false,
81
- proxy: puppeteerRealProxy
82
- }
83
- this.puppeteerBrowser = null;
84
- this.puppeteerPage = null;
82
+ this.puppeteerProxy = puppeteerProxy;
83
+ this.puppeteerExecutablePath = puppeteerExecutablePath;
84
+ this.puppeteerRealProxy = puppeteerRealProxy;
85
+ this.configurePuppeteer( );
85
86
  }
86
87
 
87
88
  async start ()
@@ -131,6 +132,10 @@ class WebScraper
131
132
  {
132
133
  return;
133
134
  }
135
+ if ( !this.isValidDomain( url ) )
136
+ {
137
+ return;
138
+ }
134
139
  try
135
140
  {
136
141
  const data = await this.caller( url );
@@ -543,6 +548,41 @@ class WebScraper
543
548
  };
544
549
  }
545
550
 
551
+ configurePuppeteer ( )
552
+ {
553
+ this.puppeteerOptions = {
554
+ headless: false,
555
+ userDataDir: "./tmp/browser",
556
+ defaultViewport: null,
557
+ args: ["--start-maximized"],
558
+ ignoreDefaultArgs: true
559
+ };
560
+
561
+ if ( this.puppeteerProxy )
562
+ {
563
+ this.puppeteerOptions.args.push( `--proxy-server=${this.puppeteerProxy}` );
564
+ }
565
+
566
+ if ( this.puppeteerExecutablePath )
567
+ {
568
+ this.puppeteerOptions.executablePath = this.puppeteerExecutablePath;
569
+ }
570
+
571
+ this.puppeteerRealOptions = {
572
+ headless: false,
573
+ args: [],
574
+ customConfig: {},
575
+ turnstile: true,
576
+ connectOption: {},
577
+ disableXvfb: false,
578
+ ignoreAllFlags: false,
579
+ proxy: this.puppeteerRealProxy
580
+ };
581
+
582
+ this.puppeteerBrowser = null;
583
+ this.puppeteerPage = null;
584
+ }
585
+
546
586
  normalizeExcludeList ( list = [] )
547
587
  {
548
588
  const normalizedSet = new Set();
@@ -571,36 +611,6 @@ class WebScraper
571
611
  return Array.from( this.excludeList ).some( excluded => { return url.startsWith( excluded ) });
572
612
  }
573
613
 
574
- createOutputDirectory ()
575
- {
576
- if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
577
- {
578
- fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
579
- }
580
- if ( fs.existsSync( path.join( __dirname, this.textOutputPath ) ) )
581
- {
582
- fs.rmSync( path.join( __dirname, this.textOutputPath ), { recursive: true, force: true });
583
- }
584
- if ( fs.existsSync( path.join( __dirname, this.csvOutputPath ) ) )
585
- {
586
- fs.rmSync( path.join( __dirname, this.csvOutputPath ), { recursive: true, force: true });
587
- }
588
- if ( fs.existsSync( path.join( __dirname, this.csvOutputPathWithMeta ) ) )
589
- {
590
- fs.rmSync( path.join( __dirname, this.csvOutputPathWithMeta ), { recursive: true, force: true });
591
- }
592
- if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPath ) ) )
593
- {
594
- fs.rmSync( path.join( __dirname, this.jsonlOutputPath ), { recursive: true, force: true });
595
- }
596
- if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPathWithMeta ) ) )
597
- {
598
- fs.rmSync( path.join( __dirname, this.jsonlOutputPathWithMeta ), { recursive: true, force: true });
599
- }
600
- fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
601
- fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
602
- }
603
-
604
614
  isValidFileType ( url )
605
615
  {
606
616
  if ( !this.filterFileTypes ) return true;
@@ -608,6 +618,22 @@ class WebScraper
608
618
  return !this.excludedFileTypes.some( ext => { return urlPath.endsWith( ext ) });
609
619
  }
610
620
 
621
+ isValidDomain ( url )
622
+ {
623
+ if ( !this.strictBaseURL ) return true;
624
+ try
625
+ {
626
+ const urlObj = new URL( url );
627
+ const baseURLObj = new URL( this.baseURL );
628
+ return urlObj.hostname === baseURLObj.hostname;
629
+ }
630
+ catch ( e )
631
+ {
632
+ console.log( `Invalid URL: ${url}` );
633
+ return false;
634
+ }
635
+ }
636
+
611
637
  isValidContent ( content )
612
638
  {
613
639
  // Remove whitespace and newlines for checking
@@ -635,6 +661,36 @@ class WebScraper
635
661
  return true;
636
662
  }
637
663
 
664
+ createOutputDirectory ()
665
+ {
666
+ if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
667
+ {
668
+ fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
669
+ }
670
+ if ( fs.existsSync( path.join( __dirname, this.textOutputPath ) ) )
671
+ {
672
+ fs.rmSync( path.join( __dirname, this.textOutputPath ), { recursive: true, force: true });
673
+ }
674
+ if ( fs.existsSync( path.join( __dirname, this.csvOutputPath ) ) )
675
+ {
676
+ fs.rmSync( path.join( __dirname, this.csvOutputPath ), { recursive: true, force: true });
677
+ }
678
+ if ( fs.existsSync( path.join( __dirname, this.csvOutputPathWithMeta ) ) )
679
+ {
680
+ fs.rmSync( path.join( __dirname, this.csvOutputPathWithMeta ), { recursive: true, force: true });
681
+ }
682
+ if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPath ) ) )
683
+ {
684
+ fs.rmSync( path.join( __dirname, this.jsonlOutputPath ), { recursive: true, force: true });
685
+ }
686
+ if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPathWithMeta ) ) )
687
+ {
688
+ fs.rmSync( path.join( __dirname, this.jsonlOutputPathWithMeta ), { recursive: true, force: true });
689
+ }
690
+ fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
691
+ fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
692
+ }
693
+
638
694
  static sleep ( ms )
639
695
  {
640
696
  return new Promise( resolve => { return setTimeout( resolve, ms ) });