clean-web-scraper 3.5.2 → 3.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/package.json +1 -1
  2. package/src/WebScraper.js +123 -92
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "clean-web-scraper",
-  "version": "3.5.2",
+  "version": "3.5.4",
   "main": "main.js",
   "scripts": {
     "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -10,80 +10,88 @@ const { connect } = require( "puppeteer-real-browser" )
 class WebScraper
 {
     constructor ({
+        // Base configuration
         baseURL,
         startURL,
         strictBaseURL = true,
         maxDepth = Infinity,
         maxArticles = Infinity,
-        excludeList,
-        exactExcludeList,
+
+        // URL filtering
+        excludeList = [],
+        exactExcludeList = [],
+        filterFileTypes = true,
+        excludedFileTypes = [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"],
+
+        // Output paths
         scrapResultPath = "./dataset",
         jsonlOutputPath,
         textOutputPath,
         csvOutputPath,
+
+        // Metadata options
         includeMetadata = false,
-        metadataFields = [], // ['title', 'description', 'author', etc.]
+        metadataFields = [],
+
+        // Network options
         axiosHeaders,
         axiosProxy,
+
+        // Puppeteer options
         usePuppeteer,
         puppeteerProxy, // e.g. http://127.0.0.1:2080
         puppeteerExecutablePath,
-        puppeteerRealProxy,
-        filterFileTypes = true,
-        excludedFileTypes = [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"]
+        puppeteerRealProxy
     })
     {
+        // Base configuration
         this.baseURL = baseURL;
         this.startURL = startURL || baseURL;
         this.strictBaseURL = strictBaseURL;
         this.maxDepth = maxDepth;
         this.maxArticles = maxArticles;
+
+        // Output paths setup
         this.scrapResultPath = scrapResultPath;
         this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
         this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
         this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
-        this.jsonlOutputPathWithMeta = jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
-        this.csvOutputPathWithMeta = csvOutputPath.replace( ".csv", "_with_metadata.csv" );
-        this.axiosHeaders = axiosHeaders;
+        this.jsonlOutputPathWithMeta = this.jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
+        this.csvOutputPathWithMeta = this.csvOutputPath.replace( ".csv", "_with_metadata.csv" );
+
+        // Metadata configuration
         this.includeMetadata = includeMetadata;
-        this.metadataFields = new Set( metadataFields );
+        this.metadataFields = new Set( metadataFields );
+
+        // URL filtering setup
         this.visited = new Set();
         this.excludeList = this.normalizeExcludeList( excludeList );
         this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
-        this.allProcessedContent = [];
         this.filterFileTypes = filterFileTypes;
         this.excludedFileTypes = excludedFileTypes;
+
+        // Network configuration
+        this.axiosHeaders = axiosHeaders;
         this.axiosProxy = axiosProxy;
-        this.usePuppeteer = usePuppeteer || false;
-        this.puppeteerOptions = {
-            headless: false,
-            userDataDir: "./tmp/browser",
-            defaultViewport: null,
-            args: [
-                "--start-maximized"
-            ],
-            "ignoreDefaultArgs": true,
-        }
-        if ( puppeteerProxy )
+        this.axiosOptions = {};
+        if ( this.axiosHeaders )
         {
-            this.puppeteerOptions.args.push( `--proxy-server=${puppeteerProxy}` );
+            axiosOptions.headers = this.axiosHeaders;
         }
-        if ( puppeteerExecutablePath )
+        if ( this.axiosProxy )
        {
-            this.puppeteerOptions.executablePath = puppeteerExecutablePath;
-        }
-        this.puppeteerRealOptions = {
-            headless: false,
-            args: [],
-            customConfig: {},
-            turnstile: true,
-            connectOption: {},
-            disableXvfb: false,
-            ignoreAllFlags: false,
-            proxy: puppeteerRealProxy
+            axiosOptions.proxy = this.axiosProxy;
         }
-        this.puppeteerBrowser = null;
-        this.puppeteerPage = null;
+
+        // Content storage
+        this.allProcessedContent = [];
+
+        // Puppeteer configuration
+        this.usePuppeteer = usePuppeteer || false;
+        this.puppeteerProxy = puppeteerProxy;
+        this.puppeteerExecutablePath = puppeteerExecutablePath;
+        this.puppeteerRealProxy = puppeteerRealProxy;
+        this.configurePuppeteer( );
     }
 
     async start ()
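
A note on the new axios options block above: the constructor pre-builds the request options, but both conditional assignments write to a bare axiosOptions identifier while the object itself is created as this.axiosOptions. Class bodies are always strict mode, so unless a module-scope axiosOptions exists elsewhere in the file, these lines throw a ReferenceError as soon as axiosHeaders or axiosProxy is supplied. A minimal corrected sketch, assuming the options are meant to live on the instance:

// Sketch only, not the published code; assumes the object is meant
// to be this.axiosOptions, as the first line suggests.
this.axiosOptions = {};
if ( this.axiosHeaders )
{
    this.axiosOptions.headers = this.axiosHeaders; // was: axiosOptions.headers
}
if ( this.axiosProxy )
{
    this.axiosOptions.proxy = this.axiosProxy; // was: axiosOptions.proxy
}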
@@ -139,7 +147,7 @@ class WebScraper
         }
         try
         {
-            const data = await this.caller( url );
+            const data = await this.fetchContent( url );
             if ( !data ) return;
             const dom = new JSDOM( data, { url });
             const { document } = dom.window;
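
For context, this hunk parses the fetched HTML with JSDOM, and the next hunk reads article.textContent; the article object itself is produced outside the lines shown. A sketch of the surrounding flow, assuming Mozilla's Readability supplies it (the extractArticle wrapper below is illustrative, not a method of this class):

// Illustrative sketch; fetchContent is real (renamed above), while the
// Readability usage is an assumption about the unshown parts of the file.
const { JSDOM } = require( "jsdom" );
const { Readability } = require( "@mozilla/readability" );

async function extractArticle ( scraper, url )
{
    const data = await scraper.fetchContent( url );
    if ( !data ) return null;
    const dom = new JSDOM( data, { url }); // passing url lets JSDOM resolve relative links
    const article = new Readability( dom.window.document ).parse();
    return article ? article.textContent : null; // parse() returns null when extraction fails
}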
@@ -151,9 +159,9 @@ class WebScraper
 
         if ( article )
         {
-            if ( this.isValidContent( article.textContent ) )
+            if ( this.hasValidPageContent( article.textContent ) )
             {
-                const metadata = this.metadataextractor( url, document );
+                const metadata = this.extractMetadata( url, document );
                 metadata.depth = depth;
                 this.saveArticle( url, article.textContent, metadata );
             }
@@ -183,35 +191,23 @@ class WebScraper
         }
     }
 
-    async caller ( url )
+    async fetchContent ( url )
     {
         try
         {
-            let axiosOptions = {};
-            if ( this.axiosHeaders )
-            {
-                axiosOptions.headers = this.axiosHeaders;
-            }
-            if ( this.axiosProxy )
-            {
-                axiosOptions.proxy = this.axiosProxy;
-            }
-
-            // Step 1: Make a GET request with a small timeout and limited data download
             const response = await axios.get( url, {
-                ...axiosOptions,
                 responseType: "stream",
                 maxRedirects: 5,
-                timeout: 70000
+                timeout: 70000,
+                ...axiosOptions,
             });
 
-            // Step 2: Check the Content-Type header from the response
             const contentType = response.headers["content-type"] || "";
             if ( !contentType.startsWith( "text" ) )
             {
                 console.log( `Skipping non-HTML content for ${url}: Content-Type is ${contentType}` );
-                response.data.destroy(); // Destroy the stream to stop downloading further data
-                return null; // Skip further processing for non-HTML content
+                response.data.destroy();
+                return null;
             }
 
             // Step 3: If Content-Type is HTML, read the full response
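
Two changes here beyond the rename: the per-call options assembly moved into the constructor, and the spread now sits after the fixed fields, so prebuilt keys would win on any conflict instead of losing. However, axiosOptions is no longer declared in this method; the constructor stores it as this.axiosOptions, so unless a module-scope axiosOptions exists, evaluating this spread throws a ReferenceError. A corrected sketch under that assumption:

// Sketch only; assumes the prebuilt options live on this.axiosOptions.
const response = await axios.get( url, {
    responseType: "stream", // stream the body so non-HTML responses can be aborted early
    maxRedirects: 5,
    timeout: 70000,
    ...this.axiosOptions,   // was: ...axiosOptions (undeclared in this scope)
});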
@@ -243,8 +239,8 @@ class WebScraper
         for ( let index = 0; index < 10; index++ )
         {
             console.log( `Please solve the CAPTCHA on the opened browser window for ${url}` );
-            result = await this.goToUrl( url ) ;
-            if ( this.isValidContent( result.htmlContent ) )
+            result = await this.navigateToPage( url ) ;
+            if ( this.hasValidPageContent( result.htmlContent ) )
             {
                 break
             }
@@ -262,7 +258,7 @@ class WebScraper
         }
     }
 
-    async goToUrl ( url )
+    async navigateToPage ( url )
     {
         let pages = await this.puppeteerBrowser.pages();
         let page = pages[0];
@@ -531,7 +527,7 @@ class WebScraper
         return filteredMetadata;
     }
 
-    metadataextractor ( url, document )
+    extractMetadata ( url, document )
     {
         return {
             url,
@@ -549,6 +545,41 @@ class WebScraper
         };
     }
 
+    configurePuppeteer ( )
+    {
+        this.puppeteerOptions = {
+            headless: false,
+            userDataDir: "./tmp/browser",
+            defaultViewport: null,
+            args: ["--start-maximized"],
+            ignoreDefaultArgs: true
+        };
+
+        if ( this.puppeteerProxy )
+        {
+            this.puppeteerOptions.args.push( `--proxy-server=${this.puppeteerProxy}` );
+        }
+
+        if ( this.puppeteerExecutablePath )
+        {
+            this.puppeteerOptions.executablePath = this.puppeteerExecutablePath;
+        }
+
+        this.puppeteerRealOptions = {
+            headless: false,
+            args: [],
+            customConfig: {},
+            turnstile: true,
+            connectOption: {},
+            disableXvfb: false,
+            ignoreAllFlags: false,
+            proxy: this.puppeteerRealProxy
+        };
+
+        this.puppeteerBrowser = null;
+        this.puppeteerPage = null;
+    }
+
     normalizeExcludeList ( list = [] )
     {
         const normalizedSet = new Set();
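
The 3.5.2 inline setup is now collected in configurePuppeteer; the option shapes are unchanged. The launch site is outside this diff, but given the puppeteer-real-browser require at the top of the file, puppeteerRealOptions is presumably consumed roughly as follows (a sketch, not code from the package):

// Sketch only; the real launch code is not shown in this diff.
const { connect } = require( "puppeteer-real-browser" );

async function launchRealBrowser ( scraper )
{
    // connect() accepts the fields collected in puppeteerRealOptions:
    // headless, args, customConfig, turnstile, connectOption,
    // disableXvfb, ignoreAllFlags, proxy.
    const { browser, page } = await connect( scraper.puppeteerRealOptions );
    scraper.puppeteerBrowser = browser;
    scraper.puppeteerPage = page;
}

puppeteerOptions, with executablePath and the --proxy-server flag wired in, matches the shape of a plain puppeteer launch call for the non-"real-browser" path.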
@@ -577,36 +608,6 @@ class WebScraper
         return Array.from( this.excludeList ).some( excluded => { return url.startsWith( excluded ) });
     }
 
-    createOutputDirectory ()
-    {
-        if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
-        {
-            fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
-        }
-        if ( fs.existsSync( path.join( __dirname, this.textOutputPath ) ) )
-        {
-            fs.rmSync( path.join( __dirname, this.textOutputPath ), { recursive: true, force: true });
-        }
-        if ( fs.existsSync( path.join( __dirname, this.csvOutputPath ) ) )
-        {
-            fs.rmSync( path.join( __dirname, this.csvOutputPath ), { recursive: true, force: true });
-        }
-        if ( fs.existsSync( path.join( __dirname, this.csvOutputPathWithMeta ) ) )
-        {
-            fs.rmSync( path.join( __dirname, this.csvOutputPathWithMeta ), { recursive: true, force: true });
-        }
-        if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPath ) ) )
-        {
-            fs.rmSync( path.join( __dirname, this.jsonlOutputPath ), { recursive: true, force: true });
-        }
-        if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPathWithMeta ) ) )
-        {
-            fs.rmSync( path.join( __dirname, this.jsonlOutputPathWithMeta ), { recursive: true, force: true });
-        }
-        fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
-        fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
-    }
-
     isValidFileType ( url )
     {
         if ( !this.filterFileTypes ) return true;
@@ -630,7 +631,7 @@ class WebScraper
         }
     }
 
-    isValidContent ( content )
+    hasValidPageContent ( content )
     {
         // Remove whitespace and newlines for checking
         const cleanContent = content.replace( /\s+/g, " " ).trim().toLowerCase();
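
The createOutputDirectory removal two hunks above is completed by the next hunk, which re-adds the method verbatim below hasValidPageContent: a pure move, not a behavioral change. Note that it still resolves every output path against __dirname (the package's own source directory) rather than process.cwd().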
@@ -657,6 +658,36 @@ class WebScraper
         return true;
     }
 
+    createOutputDirectory ()
+    {
+        if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
+        {
+            fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
+        }
+        if ( fs.existsSync( path.join( __dirname, this.textOutputPath ) ) )
+        {
+            fs.rmSync( path.join( __dirname, this.textOutputPath ), { recursive: true, force: true });
+        }
+        if ( fs.existsSync( path.join( __dirname, this.csvOutputPath ) ) )
+        {
+            fs.rmSync( path.join( __dirname, this.csvOutputPath ), { recursive: true, force: true });
+        }
+        if ( fs.existsSync( path.join( __dirname, this.csvOutputPathWithMeta ) ) )
+        {
+            fs.rmSync( path.join( __dirname, this.csvOutputPathWithMeta ), { recursive: true, force: true });
+        }
+        if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPath ) ) )
+        {
+            fs.rmSync( path.join( __dirname, this.jsonlOutputPath ), { recursive: true, force: true });
+        }
+        if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPathWithMeta ) ) )
+        {
+            fs.rmSync( path.join( __dirname, this.jsonlOutputPathWithMeta ), { recursive: true, force: true });
+        }
+        fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
+        fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
+    }
+
     static sleep ( ms )
     {
         return new Promise( resolve => { return setTimeout( resolve, ms ) });
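
The trailing context shows the class's static sleep helper, a standard promise-wrapped setTimeout; callers elsewhere in the package would await it, e.g. await WebScraper.sleep( 1000 ) to pause for a second between requests.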