clean-web-scraper 3.5.2 → 3.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/WebScraper.js +123 -92
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
|
@@ -10,80 +10,88 @@ const { connect } = require( "puppeteer-real-browser" )
|
|
|
10
10
|
class WebScraper
|
|
11
11
|
{
|
|
12
12
|
constructor ({
|
|
13
|
+
// Base configuration
|
|
13
14
|
baseURL,
|
|
14
15
|
startURL,
|
|
15
16
|
strictBaseURL = true,
|
|
16
17
|
maxDepth = Infinity,
|
|
17
18
|
maxArticles = Infinity,
|
|
18
|
-
|
|
19
|
-
|
|
19
|
+
|
|
20
|
+
// URL filtering
|
|
21
|
+
excludeList = [],
|
|
22
|
+
exactExcludeList = [],
|
|
23
|
+
filterFileTypes = true,
|
|
24
|
+
excludedFileTypes = [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"],
|
|
25
|
+
|
|
26
|
+
// Output paths
|
|
20
27
|
scrapResultPath = "./dataset",
|
|
21
28
|
jsonlOutputPath,
|
|
22
29
|
textOutputPath,
|
|
23
30
|
csvOutputPath,
|
|
31
|
+
|
|
32
|
+
// Metadata options
|
|
24
33
|
includeMetadata = false,
|
|
25
|
-
metadataFields = [],
|
|
34
|
+
metadataFields = [],
|
|
35
|
+
|
|
36
|
+
// Network options
|
|
26
37
|
axiosHeaders,
|
|
27
38
|
axiosProxy,
|
|
39
|
+
|
|
40
|
+
// Puppeteer options
|
|
28
41
|
usePuppeteer,
|
|
29
42
|
puppeteerProxy, // e.g. http://127.0.0.1:2080
|
|
30
43
|
puppeteerExecutablePath,
|
|
31
|
-
puppeteerRealProxy
|
|
32
|
-
filterFileTypes = true,
|
|
33
|
-
excludedFileTypes = [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"]
|
|
44
|
+
puppeteerRealProxy
|
|
34
45
|
})
|
|
35
46
|
{
|
|
47
|
+
// Base configuration
|
|
36
48
|
this.baseURL = baseURL;
|
|
37
49
|
this.startURL = startURL || baseURL;
|
|
38
50
|
this.strictBaseURL = strictBaseURL;
|
|
39
51
|
this.maxDepth = maxDepth;
|
|
40
52
|
this.maxArticles = maxArticles;
|
|
53
|
+
|
|
54
|
+
// Output paths setup
|
|
41
55
|
this.scrapResultPath = scrapResultPath;
|
|
42
56
|
this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
|
|
43
57
|
this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
|
|
44
58
|
this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
|
|
45
|
-
this.jsonlOutputPathWithMeta = jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
|
|
46
|
-
this.csvOutputPathWithMeta = csvOutputPath.replace( ".csv", "_with_metadata.csv" );
|
|
47
|
-
|
|
59
|
+
this.jsonlOutputPathWithMeta = this.jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
|
|
60
|
+
this.csvOutputPathWithMeta = this.csvOutputPath.replace( ".csv", "_with_metadata.csv" );
|
|
61
|
+
|
|
62
|
+
// Metadata configuration
|
|
48
63
|
this.includeMetadata = includeMetadata;
|
|
49
|
-
|
|
64
|
+
this.metadataFields = new Set( metadataFields );
|
|
65
|
+
|
|
66
|
+
// URL filtering setup
|
|
50
67
|
this.visited = new Set();
|
|
51
68
|
this.excludeList = this.normalizeExcludeList( excludeList );
|
|
52
69
|
this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
|
|
53
|
-
this.allProcessedContent = [];
|
|
54
70
|
this.filterFileTypes = filterFileTypes;
|
|
55
71
|
this.excludedFileTypes = excludedFileTypes;
|
|
72
|
+
|
|
73
|
+
// Network configuration
|
|
74
|
+
this.axiosHeaders = axiosHeaders;
|
|
56
75
|
this.axiosProxy = axiosProxy;
|
|
57
|
-
this.
|
|
58
|
-
this.
|
|
59
|
-
headless: false,
|
|
60
|
-
userDataDir: "./tmp/browser",
|
|
61
|
-
defaultViewport: null,
|
|
62
|
-
args: [
|
|
63
|
-
"--start-maximized"
|
|
64
|
-
],
|
|
65
|
-
"ignoreDefaultArgs": true,
|
|
66
|
-
}
|
|
67
|
-
if ( puppeteerProxy )
|
|
76
|
+
this.axiosOptions = {};
|
|
77
|
+
if ( this.axiosHeaders )
|
|
68
78
|
{
|
|
69
|
-
|
|
79
|
+
axiosOptions.headers = this.axiosHeaders;
|
|
70
80
|
}
|
|
71
|
-
if (
|
|
81
|
+
if ( this.axiosProxy )
|
|
72
82
|
{
|
|
73
|
-
|
|
74
|
-
}
|
|
75
|
-
this.puppeteerRealOptions = {
|
|
76
|
-
headless: false,
|
|
77
|
-
args: [],
|
|
78
|
-
customConfig: {},
|
|
79
|
-
turnstile: true,
|
|
80
|
-
connectOption: {},
|
|
81
|
-
disableXvfb: false,
|
|
82
|
-
ignoreAllFlags: false,
|
|
83
|
-
proxy: puppeteerRealProxy
|
|
83
|
+
axiosOptions.proxy = this.axiosProxy;
|
|
84
84
|
}
|
|
85
|
-
|
|
86
|
-
|
|
85
|
+
|
|
86
|
+
// Content storage
|
|
87
|
+
this.allProcessedContent = [];
|
|
88
|
+
|
|
89
|
+
// Puppeteer configuration
|
|
90
|
+
this.usePuppeteer = usePuppeteer || false;
|
|
91
|
+
this.puppeteerProxy = puppeteerProxy;
|
|
92
|
+
this.puppeteerExecutablePath = puppeteerExecutablePath;
|
|
93
|
+
this.puppeteerRealProxy = puppeteerRealProxy;
|
|
94
|
+
this.configurePuppeteer( );
|
|
87
95
|
}
|
|
88
96
|
|
|
89
97
|
async start ()
|
|
@@ -139,7 +147,7 @@ class WebScraper
|
|
|
139
147
|
}
|
|
140
148
|
try
|
|
141
149
|
{
|
|
142
|
-
const data = await this.
|
|
150
|
+
const data = await this.fetchContent( url );
|
|
143
151
|
if ( !data ) return;
|
|
144
152
|
const dom = new JSDOM( data, { url });
|
|
145
153
|
const { document } = dom.window;
|
|
@@ -151,9 +159,9 @@ class WebScraper
|
|
|
151
159
|
|
|
152
160
|
if ( article )
|
|
153
161
|
{
|
|
154
|
-
if ( this.
|
|
162
|
+
if ( this.hasValidPageContent( article.textContent ) )
|
|
155
163
|
{
|
|
156
|
-
const metadata = this.
|
|
164
|
+
const metadata = this.extractMetadata( url, document );
|
|
157
165
|
metadata.depth = depth;
|
|
158
166
|
this.saveArticle( url, article.textContent, metadata );
|
|
159
167
|
}
|
|
@@ -183,35 +191,23 @@ class WebScraper
|
|
|
183
191
|
}
|
|
184
192
|
}
|
|
185
193
|
|
|
186
|
-
async
|
|
194
|
+
async fetchContent ( url )
|
|
187
195
|
{
|
|
188
196
|
try
|
|
189
197
|
{
|
|
190
|
-
let axiosOptions = {};
|
|
191
|
-
if ( this.axiosHeaders )
|
|
192
|
-
{
|
|
193
|
-
axiosOptions.headers = this.axiosHeaders;
|
|
194
|
-
}
|
|
195
|
-
if ( this.axiosProxy )
|
|
196
|
-
{
|
|
197
|
-
axiosOptions.proxy = this.axiosProxy;
|
|
198
|
-
}
|
|
199
|
-
|
|
200
|
-
// Step 1: Make a GET request with a small timeout and limited data download
|
|
201
198
|
const response = await axios.get( url, {
|
|
202
|
-
...axiosOptions,
|
|
203
199
|
responseType: "stream",
|
|
204
200
|
maxRedirects: 5,
|
|
205
|
-
timeout: 70000
|
|
201
|
+
timeout: 70000,
|
|
202
|
+
...axiosOptions,
|
|
206
203
|
});
|
|
207
204
|
|
|
208
|
-
// Step 2: Check the Content-Type header from the response
|
|
209
205
|
const contentType = response.headers["content-type"] || "";
|
|
210
206
|
if ( !contentType.startsWith( "text" ) )
|
|
211
207
|
{
|
|
212
208
|
console.log( `Skipping non-HTML content for ${url}: Content-Type is ${contentType}` );
|
|
213
|
-
response.data.destroy();
|
|
214
|
-
return null;
|
|
209
|
+
response.data.destroy();
|
|
210
|
+
return null;
|
|
215
211
|
}
|
|
216
212
|
|
|
217
213
|
// Step 3: If Content-Type is HTML, read the full response
|
|
@@ -243,8 +239,8 @@ class WebScraper
|
|
|
243
239
|
for ( let index = 0; index < 10; index++ )
|
|
244
240
|
{
|
|
245
241
|
console.log( `Please solve the CAPTCHA on the opened browser window for ${url}` );
|
|
246
|
-
result = await this.
|
|
247
|
-
if ( this.
|
|
242
|
+
result = await this.navigateToPage( url ) ;
|
|
243
|
+
if ( this.hasValidPageContent( result.htmlContent ) )
|
|
248
244
|
{
|
|
249
245
|
break
|
|
250
246
|
}
|
|
@@ -262,7 +258,7 @@ class WebScraper
|
|
|
262
258
|
}
|
|
263
259
|
}
|
|
264
260
|
|
|
265
|
-
async
|
|
261
|
+
async navigateToPage ( url )
|
|
266
262
|
{
|
|
267
263
|
let pages = await this.puppeteerBrowser.pages();
|
|
268
264
|
let page = pages[0];
|
|
@@ -531,7 +527,7 @@ class WebScraper
|
|
|
531
527
|
return filteredMetadata;
|
|
532
528
|
}
|
|
533
529
|
|
|
534
|
-
|
|
530
|
+
extractMetadata ( url, document )
|
|
535
531
|
{
|
|
536
532
|
return {
|
|
537
533
|
url,
|
|
@@ -549,6 +545,41 @@ class WebScraper
|
|
|
549
545
|
};
|
|
550
546
|
}
|
|
551
547
|
|
|
548
|
+
configurePuppeteer ( )
|
|
549
|
+
{
|
|
550
|
+
this.puppeteerOptions = {
|
|
551
|
+
headless: false,
|
|
552
|
+
userDataDir: "./tmp/browser",
|
|
553
|
+
defaultViewport: null,
|
|
554
|
+
args: ["--start-maximized"],
|
|
555
|
+
ignoreDefaultArgs: true
|
|
556
|
+
};
|
|
557
|
+
|
|
558
|
+
if ( this.puppeteerProxy )
|
|
559
|
+
{
|
|
560
|
+
this.puppeteerOptions.args.push( `--proxy-server=${this.puppeteerProxy}` );
|
|
561
|
+
}
|
|
562
|
+
|
|
563
|
+
if ( this.puppeteerExecutablePath )
|
|
564
|
+
{
|
|
565
|
+
this.puppeteerOptions.executablePath = this.puppeteerExecutablePath;
|
|
566
|
+
}
|
|
567
|
+
|
|
568
|
+
this.puppeteerRealOptions = {
|
|
569
|
+
headless: false,
|
|
570
|
+
args: [],
|
|
571
|
+
customConfig: {},
|
|
572
|
+
turnstile: true,
|
|
573
|
+
connectOption: {},
|
|
574
|
+
disableXvfb: false,
|
|
575
|
+
ignoreAllFlags: false,
|
|
576
|
+
proxy: this.puppeteerRealProxy
|
|
577
|
+
};
|
|
578
|
+
|
|
579
|
+
this.puppeteerBrowser = null;
|
|
580
|
+
this.puppeteerPage = null;
|
|
581
|
+
}
|
|
582
|
+
|
|
552
583
|
normalizeExcludeList ( list = [] )
|
|
553
584
|
{
|
|
554
585
|
const normalizedSet = new Set();
|
|
@@ -577,36 +608,6 @@ class WebScraper
|
|
|
577
608
|
return Array.from( this.excludeList ).some( excluded => { return url.startsWith( excluded ) });
|
|
578
609
|
}
|
|
579
610
|
|
|
580
|
-
createOutputDirectory ()
|
|
581
|
-
{
|
|
582
|
-
if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
|
|
583
|
-
{
|
|
584
|
-
fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
|
|
585
|
-
}
|
|
586
|
-
if ( fs.existsSync( path.join( __dirname, this.textOutputPath ) ) )
|
|
587
|
-
{
|
|
588
|
-
fs.rmSync( path.join( __dirname, this.textOutputPath ), { recursive: true, force: true });
|
|
589
|
-
}
|
|
590
|
-
if ( fs.existsSync( path.join( __dirname, this.csvOutputPath ) ) )
|
|
591
|
-
{
|
|
592
|
-
fs.rmSync( path.join( __dirname, this.csvOutputPath ), { recursive: true, force: true });
|
|
593
|
-
}
|
|
594
|
-
if ( fs.existsSync( path.join( __dirname, this.csvOutputPathWithMeta ) ) )
|
|
595
|
-
{
|
|
596
|
-
fs.rmSync( path.join( __dirname, this.csvOutputPathWithMeta ), { recursive: true, force: true });
|
|
597
|
-
}
|
|
598
|
-
if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPath ) ) )
|
|
599
|
-
{
|
|
600
|
-
fs.rmSync( path.join( __dirname, this.jsonlOutputPath ), { recursive: true, force: true });
|
|
601
|
-
}
|
|
602
|
-
if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPathWithMeta ) ) )
|
|
603
|
-
{
|
|
604
|
-
fs.rmSync( path.join( __dirname, this.jsonlOutputPathWithMeta ), { recursive: true, force: true });
|
|
605
|
-
}
|
|
606
|
-
fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
|
|
607
|
-
fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
|
|
608
|
-
}
|
|
609
|
-
|
|
610
611
|
isValidFileType ( url )
|
|
611
612
|
{
|
|
612
613
|
if ( !this.filterFileTypes ) return true;
|
|
@@ -630,7 +631,7 @@ class WebScraper
|
|
|
630
631
|
}
|
|
631
632
|
}
|
|
632
633
|
|
|
633
|
-
|
|
634
|
+
hasValidPageContent ( content )
|
|
634
635
|
{
|
|
635
636
|
// Remove whitespace and newlines for checking
|
|
636
637
|
const cleanContent = content.replace( /\s+/g, " " ).trim().toLowerCase();
|
|
@@ -657,6 +658,36 @@ class WebScraper
|
|
|
657
658
|
return true;
|
|
658
659
|
}
|
|
659
660
|
|
|
661
|
+
createOutputDirectory ()
|
|
662
|
+
{
|
|
663
|
+
if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
|
|
664
|
+
{
|
|
665
|
+
fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
|
|
666
|
+
}
|
|
667
|
+
if ( fs.existsSync( path.join( __dirname, this.textOutputPath ) ) )
|
|
668
|
+
{
|
|
669
|
+
fs.rmSync( path.join( __dirname, this.textOutputPath ), { recursive: true, force: true });
|
|
670
|
+
}
|
|
671
|
+
if ( fs.existsSync( path.join( __dirname, this.csvOutputPath ) ) )
|
|
672
|
+
{
|
|
673
|
+
fs.rmSync( path.join( __dirname, this.csvOutputPath ), { recursive: true, force: true });
|
|
674
|
+
}
|
|
675
|
+
if ( fs.existsSync( path.join( __dirname, this.csvOutputPathWithMeta ) ) )
|
|
676
|
+
{
|
|
677
|
+
fs.rmSync( path.join( __dirname, this.csvOutputPathWithMeta ), { recursive: true, force: true });
|
|
678
|
+
}
|
|
679
|
+
if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPath ) ) )
|
|
680
|
+
{
|
|
681
|
+
fs.rmSync( path.join( __dirname, this.jsonlOutputPath ), { recursive: true, force: true });
|
|
682
|
+
}
|
|
683
|
+
if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPathWithMeta ) ) )
|
|
684
|
+
{
|
|
685
|
+
fs.rmSync( path.join( __dirname, this.jsonlOutputPathWithMeta ), { recursive: true, force: true });
|
|
686
|
+
}
|
|
687
|
+
fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
|
|
688
|
+
fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
|
|
689
|
+
}
|
|
690
|
+
|
|
660
691
|
static sleep ( ms )
|
|
661
692
|
{
|
|
662
693
|
return new Promise( resolve => { return setTimeout( resolve, ms ) });
|