clean-web-scraper 3.5.2 → 3.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/package.json +1 -1
  2. package/src/WebScraper.js +104 -70
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.5.2",
3
+ "version": "3.5.3",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -10,80 +10,79 @@ const { connect } = require( "puppeteer-real-browser" )
10
10
  class WebScraper
11
11
  {
12
12
  constructor ({
13
+ // Base configuration
13
14
  baseURL,
14
15
  startURL,
15
16
  strictBaseURL = true,
16
17
  maxDepth = Infinity,
17
18
  maxArticles = Infinity,
18
- excludeList,
19
- exactExcludeList,
19
+
20
+ // URL filtering
21
+ excludeList = [],
22
+ exactExcludeList = [],
23
+ filterFileTypes = true,
24
+ excludedFileTypes = [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"],
25
+
26
+ // Output paths
20
27
  scrapResultPath = "./dataset",
21
28
  jsonlOutputPath,
22
29
  textOutputPath,
23
30
  csvOutputPath,
31
+
32
+ // Metadata options
24
33
  includeMetadata = false,
25
- metadataFields = [], // ['title', 'description', 'author', etc.]
34
+ metadataFields = [],
35
+
36
+ // Network options
26
37
  axiosHeaders,
27
38
  axiosProxy,
39
+
40
+ // Puppeteer options
28
41
  usePuppeteer,
29
42
  puppeteerProxy, // e.g. http://127.0.0.1:2080
30
43
  puppeteerExecutablePath,
31
- puppeteerRealProxy,
32
- filterFileTypes = true,
33
- excludedFileTypes = [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"]
44
+ puppeteerRealProxy
34
45
  })
35
46
  {
47
+ // Base configuration
36
48
  this.baseURL = baseURL;
37
49
  this.startURL = startURL || baseURL;
38
50
  this.strictBaseURL = strictBaseURL;
39
51
  this.maxDepth = maxDepth;
40
52
  this.maxArticles = maxArticles;
53
+
54
+ // Output paths setup
41
55
  this.scrapResultPath = scrapResultPath;
42
56
  this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
43
57
  this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
44
58
  this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
45
- this.jsonlOutputPathWithMeta = jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
46
- this.csvOutputPathWithMeta = csvOutputPath.replace( ".csv", "_with_metadata.csv" );
47
- this.axiosHeaders = axiosHeaders;
59
+ this.jsonlOutputPathWithMeta = this.jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
60
+ this.csvOutputPathWithMeta = this.csvOutputPath.replace( ".csv", "_with_metadata.csv" );
61
+
62
+ // Metadata configuration
48
63
  this.includeMetadata = includeMetadata;
49
- this.metadataFields = new Set( metadataFields );
64
+ this.metadataFields = new Set( metadataFields );
65
+
66
+ // URL filtering setup
50
67
  this.visited = new Set();
51
68
  this.excludeList = this.normalizeExcludeList( excludeList );
52
69
  this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
53
- this.allProcessedContent = [];
54
70
  this.filterFileTypes = filterFileTypes;
55
71
  this.excludedFileTypes = excludedFileTypes;
72
+
73
+ // Network configuration
74
+ this.axiosHeaders = axiosHeaders;
56
75
  this.axiosProxy = axiosProxy;
76
+
77
+ // Content storage
78
+ this.allProcessedContent = [];
79
+
80
+ // Puppeteer configuration
57
81
  this.usePuppeteer = usePuppeteer || false;
58
- this.puppeteerOptions = {
59
- headless: false,
60
- userDataDir: "./tmp/browser",
61
- defaultViewport: null,
62
- args: [
63
- "--start-maximized"
64
- ],
65
- "ignoreDefaultArgs": true,
66
- }
67
- if ( puppeteerProxy )
68
- {
69
- this.puppeteerOptions.args.push( `--proxy-server=${puppeteerProxy}` );
70
- }
71
- if ( puppeteerExecutablePath )
72
- {
73
- this.puppeteerOptions.executablePath = puppeteerExecutablePath;
74
- }
75
- this.puppeteerRealOptions = {
76
- headless: false,
77
- args: [],
78
- customConfig: {},
79
- turnstile: true,
80
- connectOption: {},
81
- disableXvfb: false,
82
- ignoreAllFlags: false,
83
- proxy: puppeteerRealProxy
84
- }
85
- this.puppeteerBrowser = null;
86
- this.puppeteerPage = null;
82
+ this.puppeteerProxy = puppeteerProxy;
83
+ this.puppeteerExecutablePath = puppeteerExecutablePath;
84
+ this.puppeteerRealProxy = puppeteerRealProxy;
85
+ this.configurePuppeteer( );
87
86
  }
88
87
 
89
88
  async start ()
@@ -549,6 +548,41 @@ class WebScraper
549
548
  };
550
549
  }
551
550
 
551
+ configurePuppeteer ( )
552
+ {
553
+ this.puppeteerOptions = {
554
+ headless: false,
555
+ userDataDir: "./tmp/browser",
556
+ defaultViewport: null,
557
+ args: ["--start-maximized"],
558
+ ignoreDefaultArgs: true
559
+ };
560
+
561
+ if ( this.puppeteerProxy )
562
+ {
563
+ this.puppeteerOptions.args.push( `--proxy-server=${this.puppeteerProxy}` );
564
+ }
565
+
566
+ if ( this.puppeteerExecutablePath )
567
+ {
568
+ this.puppeteerOptions.executablePath = this.puppeteerExecutablePath;
569
+ }
570
+
571
+ this.puppeteerRealOptions = {
572
+ headless: false,
573
+ args: [],
574
+ customConfig: {},
575
+ turnstile: true,
576
+ connectOption: {},
577
+ disableXvfb: false,
578
+ ignoreAllFlags: false,
579
+ proxy: this.puppeteerRealProxy
580
+ };
581
+
582
+ this.puppeteerBrowser = null;
583
+ this.puppeteerPage = null;
584
+ }
585
+
552
586
  normalizeExcludeList ( list = [] )
553
587
  {
554
588
  const normalizedSet = new Set();
@@ -577,36 +611,6 @@ class WebScraper
577
611
  return Array.from( this.excludeList ).some( excluded => { return url.startsWith( excluded ) });
578
612
  }
579
613
 
580
- createOutputDirectory ()
581
- {
582
- if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
583
- {
584
- fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
585
- }
586
- if ( fs.existsSync( path.join( __dirname, this.textOutputPath ) ) )
587
- {
588
- fs.rmSync( path.join( __dirname, this.textOutputPath ), { recursive: true, force: true });
589
- }
590
- if ( fs.existsSync( path.join( __dirname, this.csvOutputPath ) ) )
591
- {
592
- fs.rmSync( path.join( __dirname, this.csvOutputPath ), { recursive: true, force: true });
593
- }
594
- if ( fs.existsSync( path.join( __dirname, this.csvOutputPathWithMeta ) ) )
595
- {
596
- fs.rmSync( path.join( __dirname, this.csvOutputPathWithMeta ), { recursive: true, force: true });
597
- }
598
- if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPath ) ) )
599
- {
600
- fs.rmSync( path.join( __dirname, this.jsonlOutputPath ), { recursive: true, force: true });
601
- }
602
- if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPathWithMeta ) ) )
603
- {
604
- fs.rmSync( path.join( __dirname, this.jsonlOutputPathWithMeta ), { recursive: true, force: true });
605
- }
606
- fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
607
- fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
608
- }
609
-
610
614
  isValidFileType ( url )
611
615
  {
612
616
  if ( !this.filterFileTypes ) return true;
@@ -657,6 +661,36 @@ class WebScraper
657
661
  return true;
658
662
  }
659
663
 
664
+ createOutputDirectory ()
665
+ {
666
+ if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
667
+ {
668
+ fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
669
+ }
670
+ if ( fs.existsSync( path.join( __dirname, this.textOutputPath ) ) )
671
+ {
672
+ fs.rmSync( path.join( __dirname, this.textOutputPath ), { recursive: true, force: true });
673
+ }
674
+ if ( fs.existsSync( path.join( __dirname, this.csvOutputPath ) ) )
675
+ {
676
+ fs.rmSync( path.join( __dirname, this.csvOutputPath ), { recursive: true, force: true });
677
+ }
678
+ if ( fs.existsSync( path.join( __dirname, this.csvOutputPathWithMeta ) ) )
679
+ {
680
+ fs.rmSync( path.join( __dirname, this.csvOutputPathWithMeta ), { recursive: true, force: true });
681
+ }
682
+ if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPath ) ) )
683
+ {
684
+ fs.rmSync( path.join( __dirname, this.jsonlOutputPath ), { recursive: true, force: true });
685
+ }
686
+ if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPathWithMeta ) ) )
687
+ {
688
+ fs.rmSync( path.join( __dirname, this.jsonlOutputPathWithMeta ), { recursive: true, force: true });
689
+ }
690
+ fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
691
+ fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
692
+ }
693
+
660
694
  static sleep ( ms )
661
695
  {
662
696
  return new Promise( resolve => { return setTimeout( resolve, ms ) });