clean-web-scraper 3.5.2 → 3.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/WebScraper.js +104 -70
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
|
@@ -10,80 +10,79 @@ const { connect } = require( "puppeteer-real-browser" )
|
|
|
10
10
|
class WebScraper
|
|
11
11
|
{
|
|
12
12
|
constructor ({
|
|
13
|
+
// Base configuration
|
|
13
14
|
baseURL,
|
|
14
15
|
startURL,
|
|
15
16
|
strictBaseURL = true,
|
|
16
17
|
maxDepth = Infinity,
|
|
17
18
|
maxArticles = Infinity,
|
|
18
|
-
|
|
19
|
-
|
|
19
|
+
|
|
20
|
+
// URL filtering
|
|
21
|
+
excludeList = [],
|
|
22
|
+
exactExcludeList = [],
|
|
23
|
+
filterFileTypes = true,
|
|
24
|
+
excludedFileTypes = [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"],
|
|
25
|
+
|
|
26
|
+
// Output paths
|
|
20
27
|
scrapResultPath = "./dataset",
|
|
21
28
|
jsonlOutputPath,
|
|
22
29
|
textOutputPath,
|
|
23
30
|
csvOutputPath,
|
|
31
|
+
|
|
32
|
+
// Metadata options
|
|
24
33
|
includeMetadata = false,
|
|
25
|
-
metadataFields = [],
|
|
34
|
+
metadataFields = [],
|
|
35
|
+
|
|
36
|
+
// Network options
|
|
26
37
|
axiosHeaders,
|
|
27
38
|
axiosProxy,
|
|
39
|
+
|
|
40
|
+
// Puppeteer options
|
|
28
41
|
usePuppeteer,
|
|
29
42
|
puppeteerProxy, // e.g. http://127.0.0.1:2080
|
|
30
43
|
puppeteerExecutablePath,
|
|
31
|
-
puppeteerRealProxy
|
|
32
|
-
filterFileTypes = true,
|
|
33
|
-
excludedFileTypes = [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"]
|
|
44
|
+
puppeteerRealProxy
|
|
34
45
|
})
|
|
35
46
|
{
|
|
47
|
+
// Base configuration
|
|
36
48
|
this.baseURL = baseURL;
|
|
37
49
|
this.startURL = startURL || baseURL;
|
|
38
50
|
this.strictBaseURL = strictBaseURL;
|
|
39
51
|
this.maxDepth = maxDepth;
|
|
40
52
|
this.maxArticles = maxArticles;
|
|
53
|
+
|
|
54
|
+
// Output paths setup
|
|
41
55
|
this.scrapResultPath = scrapResultPath;
|
|
42
56
|
this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
|
|
43
57
|
this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
|
|
44
58
|
this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
|
|
45
|
-
this.jsonlOutputPathWithMeta = jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
|
|
46
|
-
this.csvOutputPathWithMeta = csvOutputPath.replace( ".csv", "_with_metadata.csv" );
|
|
47
|
-
|
|
59
|
+
this.jsonlOutputPathWithMeta = this.jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
|
|
60
|
+
this.csvOutputPathWithMeta = this.csvOutputPath.replace( ".csv", "_with_metadata.csv" );
|
|
61
|
+
|
|
62
|
+
// Metadata configuration
|
|
48
63
|
this.includeMetadata = includeMetadata;
|
|
49
|
-
|
|
64
|
+
this.metadataFields = new Set( metadataFields );
|
|
65
|
+
|
|
66
|
+
// URL filtering setup
|
|
50
67
|
this.visited = new Set();
|
|
51
68
|
this.excludeList = this.normalizeExcludeList( excludeList );
|
|
52
69
|
this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
|
|
53
|
-
this.allProcessedContent = [];
|
|
54
70
|
this.filterFileTypes = filterFileTypes;
|
|
55
71
|
this.excludedFileTypes = excludedFileTypes;
|
|
72
|
+
|
|
73
|
+
// Network configuration
|
|
74
|
+
this.axiosHeaders = axiosHeaders;
|
|
56
75
|
this.axiosProxy = axiosProxy;
|
|
76
|
+
|
|
77
|
+
// Content storage
|
|
78
|
+
this.allProcessedContent = [];
|
|
79
|
+
|
|
80
|
+
// Puppeteer configuration
|
|
57
81
|
this.usePuppeteer = usePuppeteer || false;
|
|
58
|
-
this.
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
args: [
|
|
63
|
-
"--start-maximized"
|
|
64
|
-
],
|
|
65
|
-
"ignoreDefaultArgs": true,
|
|
66
|
-
}
|
|
67
|
-
if ( puppeteerProxy )
|
|
68
|
-
{
|
|
69
|
-
this.puppeteerOptions.args.push( `--proxy-server=${puppeteerProxy}` );
|
|
70
|
-
}
|
|
71
|
-
if ( puppeteerExecutablePath )
|
|
72
|
-
{
|
|
73
|
-
this.puppeteerOptions.executablePath = puppeteerExecutablePath;
|
|
74
|
-
}
|
|
75
|
-
this.puppeteerRealOptions = {
|
|
76
|
-
headless: false,
|
|
77
|
-
args: [],
|
|
78
|
-
customConfig: {},
|
|
79
|
-
turnstile: true,
|
|
80
|
-
connectOption: {},
|
|
81
|
-
disableXvfb: false,
|
|
82
|
-
ignoreAllFlags: false,
|
|
83
|
-
proxy: puppeteerRealProxy
|
|
84
|
-
}
|
|
85
|
-
this.puppeteerBrowser = null;
|
|
86
|
-
this.puppeteerPage = null;
|
|
82
|
+
this.puppeteerProxy = puppeteerProxy;
|
|
83
|
+
this.puppeteerExecutablePath = puppeteerExecutablePath;
|
|
84
|
+
this.puppeteerRealProxy = puppeteerRealProxy;
|
|
85
|
+
this.configurePuppeteer( );
|
|
87
86
|
}
|
|
88
87
|
|
|
89
88
|
async start ()
|
|
@@ -549,6 +548,41 @@ class WebScraper
|
|
|
549
548
|
};
|
|
550
549
|
}
|
|
551
550
|
|
|
551
|
+
/**
 * Derive the browser option objects from the Puppeteer settings stored on
 * the instance (`puppeteerProxy`, `puppeteerExecutablePath`,
 * `puppeteerRealProxy`) and reset the cached browser/page handles.
 *
 * Sets `this.puppeteerOptions`, `this.puppeteerRealOptions`,
 * `this.puppeteerBrowser` and `this.puppeteerPage`.
 * NOTE(review): `puppeteerRealOptions` looks shaped for puppeteer-real-browser's
 * `connect()` — confirm against the call site.
 */
configurePuppeteer ( )
{
	// Command-line flags; the proxy flag is appended only when a proxy is set.
	const launchArgs = ["--start-maximized"];
	if ( this.puppeteerProxy )
	{
		launchArgs.push( `--proxy-server=${this.puppeteerProxy}` );
	}

	const launchOptions = {
		headless: false,
		userDataDir: "./tmp/browser",
		defaultViewport: null,
		args: launchArgs,
		ignoreDefaultArgs: true
	};
	// executablePath is only present when a custom path was supplied.
	if ( this.puppeteerExecutablePath )
	{
		launchOptions.executablePath = this.puppeteerExecutablePath;
	}
	this.puppeteerOptions = launchOptions;

	const realBrowserOptions = {
		headless: false,
		args: [],
		customConfig: {},
		turnstile: true,
		connectOption: {},
		disableXvfb: false,
		ignoreAllFlags: false,
		proxy: this.puppeteerRealProxy
	};
	this.puppeteerRealOptions = realBrowserOptions;

	// No browser or page exists yet.
	this.puppeteerBrowser = null;
	this.puppeteerPage = null;
}
|
|
585
|
+
|
|
552
586
|
normalizeExcludeList ( list = [] )
|
|
553
587
|
{
|
|
554
588
|
const normalizedSet = new Set();
|
|
@@ -577,36 +611,6 @@ class WebScraper
|
|
|
577
611
|
return Array.from( this.excludeList ).some( excluded => { return url.startsWith( excluded ) });
|
|
578
612
|
}
|
|
579
613
|
|
|
580
|
-
createOutputDirectory ()
|
|
581
|
-
{
|
|
582
|
-
if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
|
|
583
|
-
{
|
|
584
|
-
fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
|
|
585
|
-
}
|
|
586
|
-
if ( fs.existsSync( path.join( __dirname, this.textOutputPath ) ) )
|
|
587
|
-
{
|
|
588
|
-
fs.rmSync( path.join( __dirname, this.textOutputPath ), { recursive: true, force: true });
|
|
589
|
-
}
|
|
590
|
-
if ( fs.existsSync( path.join( __dirname, this.csvOutputPath ) ) )
|
|
591
|
-
{
|
|
592
|
-
fs.rmSync( path.join( __dirname, this.csvOutputPath ), { recursive: true, force: true });
|
|
593
|
-
}
|
|
594
|
-
if ( fs.existsSync( path.join( __dirname, this.csvOutputPathWithMeta ) ) )
|
|
595
|
-
{
|
|
596
|
-
fs.rmSync( path.join( __dirname, this.csvOutputPathWithMeta ), { recursive: true, force: true });
|
|
597
|
-
}
|
|
598
|
-
if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPath ) ) )
|
|
599
|
-
{
|
|
600
|
-
fs.rmSync( path.join( __dirname, this.jsonlOutputPath ), { recursive: true, force: true });
|
|
601
|
-
}
|
|
602
|
-
if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPathWithMeta ) ) )
|
|
603
|
-
{
|
|
604
|
-
fs.rmSync( path.join( __dirname, this.jsonlOutputPathWithMeta ), { recursive: true, force: true });
|
|
605
|
-
}
|
|
606
|
-
fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
|
|
607
|
-
fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
|
|
608
|
-
}
|
|
609
|
-
|
|
610
614
|
isValidFileType ( url )
|
|
611
615
|
{
|
|
612
616
|
if ( !this.filterFileTypes ) return true;
|
|
@@ -657,6 +661,36 @@ class WebScraper
|
|
|
657
661
|
return true;
|
|
658
662
|
}
|
|
659
663
|
|
|
664
|
+
/**
 * Reset the output locations: delete any previous result directory, text
 * directory and CSV/JSONL files (plain and with-metadata variants), then
 * recreate the result and text directories.
 *
 * All paths are resolved relative to this module's directory (`__dirname`).
 */
createOutputDirectory ()
{
	const resolveOutput = ( relativePath ) => { return path.join( __dirname, relativePath ) };

	const stalePaths = [
		this.scrapResultPath,
		this.textOutputPath,
		this.csvOutputPath,
		this.csvOutputPathWithMeta,
		this.jsonlOutputPath,
		this.jsonlOutputPathWithMeta
	];
	for ( const stalePath of stalePaths )
	{
		// force: true already ignores missing paths, so no existsSync()
		// pre-check is needed. The pre-check also skipped dangling symlinks,
		// which could then make the mkdirSync() calls below fail.
		fs.rmSync( resolveOutput( stalePath ), { recursive: true, force: true });
	}

	for ( const outputDir of [this.scrapResultPath, this.textOutputPath] )
	{
		fs.mkdirSync( resolveOutput( outputDir ), { recursive: true });
	}
}
|
|
693
|
+
|
|
660
694
|
static sleep ( ms )
|
|
661
695
|
{
|
|
662
696
|
return new Promise( resolve => { return setTimeout( resolve, ms ) });
|