clean-web-scraper 3.5.1 → 3.5.3
This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- package/example-usage.js +40 -14
- package/package.json +1 -1
- package/src/WebScraper.js +128 -72
package/example-usage.js
CHANGED
|
@@ -8,8 +8,36 @@ const WebScraper = require( "./src/WebScraper" );
|
|
|
8
8
|
// "Cookie": cookies
|
|
9
9
|
// }
|
|
10
10
|
|
|
11
|
+
async function palianswers ( enable )
|
|
12
|
+
{
|
|
13
|
+
const scraper = new WebScraper({
|
|
14
|
+
baseURL: "https://palianswers.com",
|
|
15
|
+
excludeList: [
|
|
16
|
+
"https://palianswers.com/chat/",
|
|
17
|
+
"https://palianswers.com/become-a-volunteer/",
|
|
18
|
+
"https://palianswers.com/other-resources/",
|
|
19
|
+
"https://palianswers.com/request-a-rebuttal/",
|
|
20
|
+
"https://palianswers.com/submit-a-rebuttal/",
|
|
21
|
+
"https://palianswers.com/themes/"
|
|
22
|
+
],
|
|
23
|
+
exactExcludeList: [
|
|
24
|
+
"https://palianswers.com/",
|
|
25
|
+
],
|
|
26
|
+
scrapResultPath: "./dataset/palianswers/website",
|
|
27
|
+
jsonlOutputPath: "./dataset/palianswers/train.jsonl",
|
|
28
|
+
textOutputPath: "./dataset/palianswers/texts",
|
|
29
|
+
csvOutputPath: "./dataset/palianswers/train.csv",
|
|
30
|
+
includeMetadata: true,
|
|
31
|
+
metadataFields: ["title", "description", "author"]
|
|
32
|
+
});
|
|
33
|
+
if ( enable )
|
|
34
|
+
{
|
|
35
|
+
await scraper.start();
|
|
36
|
+
}
|
|
37
|
+
return scraper;
|
|
38
|
+
}
|
|
11
39
|
|
|
12
|
-
async function khameneiIrFreePalestineTag ()
|
|
40
|
+
async function khameneiIrFreePalestineTag ( enable )
|
|
13
41
|
{
|
|
14
42
|
// https://english.khamenei.ir/Opinions/FreePalestine
|
|
15
43
|
// https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#
|
|
@@ -135,6 +163,7 @@ async function electronicintifada ( enable )
|
|
|
135
163
|
|
|
136
164
|
async function palestineremembered ( enable )
|
|
137
165
|
{
|
|
166
|
+
// https://www.palestineremembered.com
|
|
138
167
|
const scraper = new WebScraper({
|
|
139
168
|
baseURL: "https://www.palestineremembered.com",
|
|
140
169
|
startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
|
|
@@ -179,24 +208,21 @@ async function palestineremembered ( enable )
|
|
|
179
208
|
|
|
180
209
|
void async function main ()
|
|
181
210
|
{
|
|
182
|
-
const
|
|
183
|
-
const decolonizepalestineScraper = await decolonizepalestine(
|
|
211
|
+
const palianswersScraper = await palianswers( true );
|
|
212
|
+
const decolonizepalestineScraper = await decolonizepalestine( true );
|
|
213
|
+
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( true );
|
|
184
214
|
const bdsmovementScraper = await bdsmovement( false );
|
|
185
|
-
const electronicintifadaScraper = await electronicintifada(
|
|
186
|
-
|
|
215
|
+
const electronicintifadaScraper = await electronicintifada( true );
|
|
216
|
+
const palestinerememberedScraper = await palestineremembered( false );
|
|
217
|
+
|
|
187
218
|
await WebScraper.combineResults( "./dataset/combined", [
|
|
188
|
-
|
|
219
|
+
palianswersScraper,
|
|
189
220
|
decolonizepalestineScraper,
|
|
190
|
-
|
|
221
|
+
khameneiIrFreePalestineTagScraper,
|
|
191
222
|
electronicintifadaScraper,
|
|
192
|
-
|
|
223
|
+
// bdsmovementScraper,
|
|
224
|
+
// palestinerememberedScraper,
|
|
193
225
|
] );
|
|
194
226
|
|
|
195
|
-
// 5
|
|
196
|
-
// https://www.palestineremembered.com/ZionistFAQ.html
|
|
197
|
-
|
|
198
|
-
// 6 https://the-palestinian-side.vercel.app/
|
|
199
|
-
|
|
200
227
|
// 7 https://stand-with-palestine.org/blogs
|
|
201
228
|
}()
|
|
202
|
-
|
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
|
@@ -10,78 +10,79 @@ const { connect } = require( "puppeteer-real-browser" )
|
|
|
10
10
|
class WebScraper
|
|
11
11
|
{
|
|
12
12
|
constructor ({
|
|
13
|
+
// Base configuration
|
|
13
14
|
baseURL,
|
|
14
15
|
startURL,
|
|
16
|
+
strictBaseURL = true,
|
|
15
17
|
maxDepth = Infinity,
|
|
16
|
-
maxArticles = Infinity,
|
|
17
|
-
|
|
18
|
-
|
|
18
|
+
maxArticles = Infinity,
|
|
19
|
+
|
|
20
|
+
// URL filtering
|
|
21
|
+
excludeList = [],
|
|
22
|
+
exactExcludeList = [],
|
|
23
|
+
filterFileTypes = true,
|
|
24
|
+
excludedFileTypes = [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"],
|
|
25
|
+
|
|
26
|
+
// Output paths
|
|
19
27
|
scrapResultPath = "./dataset",
|
|
20
28
|
jsonlOutputPath,
|
|
21
29
|
textOutputPath,
|
|
22
30
|
csvOutputPath,
|
|
31
|
+
|
|
32
|
+
// Metadata options
|
|
23
33
|
includeMetadata = false,
|
|
24
|
-
metadataFields = [],
|
|
34
|
+
metadataFields = [],
|
|
35
|
+
|
|
36
|
+
// Network options
|
|
25
37
|
axiosHeaders,
|
|
26
38
|
axiosProxy,
|
|
39
|
+
|
|
40
|
+
// Puppeteer options
|
|
27
41
|
usePuppeteer,
|
|
28
42
|
puppeteerProxy, // e.g. http://127.0.0.1:2080
|
|
29
43
|
puppeteerExecutablePath,
|
|
30
|
-
puppeteerRealProxy
|
|
31
|
-
filterFileTypes = true,
|
|
32
|
-
excludedFileTypes = [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"]
|
|
44
|
+
puppeteerRealProxy
|
|
33
45
|
})
|
|
34
46
|
{
|
|
47
|
+
// Base configuration
|
|
35
48
|
this.baseURL = baseURL;
|
|
36
49
|
this.startURL = startURL || baseURL;
|
|
50
|
+
this.strictBaseURL = strictBaseURL;
|
|
37
51
|
this.maxDepth = maxDepth;
|
|
38
|
-
this.maxArticles = maxArticles;
|
|
52
|
+
this.maxArticles = maxArticles;
|
|
53
|
+
|
|
54
|
+
// Output paths setup
|
|
39
55
|
this.scrapResultPath = scrapResultPath;
|
|
40
56
|
this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
|
|
41
57
|
this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
|
|
42
58
|
this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
|
|
43
|
-
this.jsonlOutputPathWithMeta = jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
|
|
44
|
-
this.csvOutputPathWithMeta = csvOutputPath.replace( ".csv", "_with_metadata.csv" );
|
|
45
|
-
|
|
59
|
+
this.jsonlOutputPathWithMeta = this.jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
|
|
60
|
+
this.csvOutputPathWithMeta = this.csvOutputPath.replace( ".csv", "_with_metadata.csv" );
|
|
61
|
+
|
|
62
|
+
// Metadata configuration
|
|
46
63
|
this.includeMetadata = includeMetadata;
|
|
47
|
-
|
|
64
|
+
this.metadataFields = new Set( metadataFields );
|
|
65
|
+
|
|
66
|
+
// URL filtering setup
|
|
48
67
|
this.visited = new Set();
|
|
49
68
|
this.excludeList = this.normalizeExcludeList( excludeList );
|
|
50
69
|
this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
|
|
51
|
-
this.allProcessedContent = [];
|
|
52
70
|
this.filterFileTypes = filterFileTypes;
|
|
53
71
|
this.excludedFileTypes = excludedFileTypes;
|
|
72
|
+
|
|
73
|
+
// Network configuration
|
|
74
|
+
this.axiosHeaders = axiosHeaders;
|
|
54
75
|
this.axiosProxy = axiosProxy;
|
|
76
|
+
|
|
77
|
+
// Content storage
|
|
78
|
+
this.allProcessedContent = [];
|
|
79
|
+
|
|
80
|
+
// Puppeteer configuration
|
|
55
81
|
this.usePuppeteer = usePuppeteer || false;
|
|
56
|
-
this.
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
args: [
|
|
61
|
-
"--start-maximized"
|
|
62
|
-
],
|
|
63
|
-
"ignoreDefaultArgs": true,
|
|
64
|
-
}
|
|
65
|
-
if ( puppeteerProxy )
|
|
66
|
-
{
|
|
67
|
-
this.puppeteerOptions.args.push( `--proxy-server=${puppeteerProxy}` );
|
|
68
|
-
}
|
|
69
|
-
if ( puppeteerExecutablePath )
|
|
70
|
-
{
|
|
71
|
-
this.puppeteerOptions.executablePath = puppeteerExecutablePath;
|
|
72
|
-
}
|
|
73
|
-
this.puppeteerRealOptions = {
|
|
74
|
-
headless: false,
|
|
75
|
-
args: [],
|
|
76
|
-
customConfig: {},
|
|
77
|
-
turnstile: true,
|
|
78
|
-
connectOption: {},
|
|
79
|
-
disableXvfb: false,
|
|
80
|
-
ignoreAllFlags: false,
|
|
81
|
-
proxy: puppeteerRealProxy
|
|
82
|
-
}
|
|
83
|
-
this.puppeteerBrowser = null;
|
|
84
|
-
this.puppeteerPage = null;
|
|
82
|
+
this.puppeteerProxy = puppeteerProxy;
|
|
83
|
+
this.puppeteerExecutablePath = puppeteerExecutablePath;
|
|
84
|
+
this.puppeteerRealProxy = puppeteerRealProxy;
|
|
85
|
+
this.configurePuppeteer( );
|
|
85
86
|
}
|
|
86
87
|
|
|
87
88
|
async start ()
|
|
@@ -131,6 +132,10 @@ class WebScraper
|
|
|
131
132
|
{
|
|
132
133
|
return;
|
|
133
134
|
}
|
|
135
|
+
if ( !this.isValidDomain( url ) )
|
|
136
|
+
{
|
|
137
|
+
return;
|
|
138
|
+
}
|
|
134
139
|
try
|
|
135
140
|
{
|
|
136
141
|
const data = await this.caller( url );
|
|
@@ -543,6 +548,41 @@ class WebScraper
|
|
|
543
548
|
};
|
|
544
549
|
}
|
|
545
550
|
|
|
551
|
+
configurePuppeteer ( )
|
|
552
|
+
{
|
|
553
|
+
this.puppeteerOptions = {
|
|
554
|
+
headless: false,
|
|
555
|
+
userDataDir: "./tmp/browser",
|
|
556
|
+
defaultViewport: null,
|
|
557
|
+
args: ["--start-maximized"],
|
|
558
|
+
ignoreDefaultArgs: true
|
|
559
|
+
};
|
|
560
|
+
|
|
561
|
+
if ( this.puppeteerProxy )
|
|
562
|
+
{
|
|
563
|
+
this.puppeteerOptions.args.push( `--proxy-server=${this.puppeteerProxy}` );
|
|
564
|
+
}
|
|
565
|
+
|
|
566
|
+
if ( this.puppeteerExecutablePath )
|
|
567
|
+
{
|
|
568
|
+
this.puppeteerOptions.executablePath = this.puppeteerExecutablePath;
|
|
569
|
+
}
|
|
570
|
+
|
|
571
|
+
this.puppeteerRealOptions = {
|
|
572
|
+
headless: false,
|
|
573
|
+
args: [],
|
|
574
|
+
customConfig: {},
|
|
575
|
+
turnstile: true,
|
|
576
|
+
connectOption: {},
|
|
577
|
+
disableXvfb: false,
|
|
578
|
+
ignoreAllFlags: false,
|
|
579
|
+
proxy: this.puppeteerRealProxy
|
|
580
|
+
};
|
|
581
|
+
|
|
582
|
+
this.puppeteerBrowser = null;
|
|
583
|
+
this.puppeteerPage = null;
|
|
584
|
+
}
|
|
585
|
+
|
|
546
586
|
normalizeExcludeList ( list = [] )
|
|
547
587
|
{
|
|
548
588
|
const normalizedSet = new Set();
|
|
@@ -571,36 +611,6 @@ class WebScraper
|
|
|
571
611
|
return Array.from( this.excludeList ).some( excluded => { return url.startsWith( excluded ) });
|
|
572
612
|
}
|
|
573
613
|
|
|
574
|
-
createOutputDirectory ()
|
|
575
|
-
{
|
|
576
|
-
if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
|
|
577
|
-
{
|
|
578
|
-
fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
|
|
579
|
-
}
|
|
580
|
-
if ( fs.existsSync( path.join( __dirname, this.textOutputPath ) ) )
|
|
581
|
-
{
|
|
582
|
-
fs.rmSync( path.join( __dirname, this.textOutputPath ), { recursive: true, force: true });
|
|
583
|
-
}
|
|
584
|
-
if ( fs.existsSync( path.join( __dirname, this.csvOutputPath ) ) )
|
|
585
|
-
{
|
|
586
|
-
fs.rmSync( path.join( __dirname, this.csvOutputPath ), { recursive: true, force: true });
|
|
587
|
-
}
|
|
588
|
-
if ( fs.existsSync( path.join( __dirname, this.csvOutputPathWithMeta ) ) )
|
|
589
|
-
{
|
|
590
|
-
fs.rmSync( path.join( __dirname, this.csvOutputPathWithMeta ), { recursive: true, force: true });
|
|
591
|
-
}
|
|
592
|
-
if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPath ) ) )
|
|
593
|
-
{
|
|
594
|
-
fs.rmSync( path.join( __dirname, this.jsonlOutputPath ), { recursive: true, force: true });
|
|
595
|
-
}
|
|
596
|
-
if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPathWithMeta ) ) )
|
|
597
|
-
{
|
|
598
|
-
fs.rmSync( path.join( __dirname, this.jsonlOutputPathWithMeta ), { recursive: true, force: true });
|
|
599
|
-
}
|
|
600
|
-
fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
|
|
601
|
-
fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
|
|
602
|
-
}
|
|
603
|
-
|
|
604
614
|
isValidFileType ( url )
|
|
605
615
|
{
|
|
606
616
|
if ( !this.filterFileTypes ) return true;
|
|
@@ -608,6 +618,22 @@ class WebScraper
|
|
|
608
618
|
return !this.excludedFileTypes.some( ext => { return urlPath.endsWith( ext ) });
|
|
609
619
|
}
|
|
610
620
|
|
|
621
|
+
isValidDomain ( url )
|
|
622
|
+
{
|
|
623
|
+
if ( !this.strictBaseURL ) return true;
|
|
624
|
+
try
|
|
625
|
+
{
|
|
626
|
+
const urlObj = new URL( url );
|
|
627
|
+
const baseURLObj = new URL( this.baseURL );
|
|
628
|
+
return urlObj.hostname === baseURLObj.hostname;
|
|
629
|
+
}
|
|
630
|
+
catch ( e )
|
|
631
|
+
{
|
|
632
|
+
console.log( `Invalid URL: ${url}` );
|
|
633
|
+
return false;
|
|
634
|
+
}
|
|
635
|
+
}
|
|
636
|
+
|
|
611
637
|
isValidContent ( content )
|
|
612
638
|
{
|
|
613
639
|
// Remove whitespace and newlines for checking
|
|
@@ -635,6 +661,36 @@ class WebScraper
|
|
|
635
661
|
return true;
|
|
636
662
|
}
|
|
637
663
|
|
|
664
|
+
createOutputDirectory ()
|
|
665
|
+
{
|
|
666
|
+
if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
|
|
667
|
+
{
|
|
668
|
+
fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
|
|
669
|
+
}
|
|
670
|
+
if ( fs.existsSync( path.join( __dirname, this.textOutputPath ) ) )
|
|
671
|
+
{
|
|
672
|
+
fs.rmSync( path.join( __dirname, this.textOutputPath ), { recursive: true, force: true });
|
|
673
|
+
}
|
|
674
|
+
if ( fs.existsSync( path.join( __dirname, this.csvOutputPath ) ) )
|
|
675
|
+
{
|
|
676
|
+
fs.rmSync( path.join( __dirname, this.csvOutputPath ), { recursive: true, force: true });
|
|
677
|
+
}
|
|
678
|
+
if ( fs.existsSync( path.join( __dirname, this.csvOutputPathWithMeta ) ) )
|
|
679
|
+
{
|
|
680
|
+
fs.rmSync( path.join( __dirname, this.csvOutputPathWithMeta ), { recursive: true, force: true });
|
|
681
|
+
}
|
|
682
|
+
if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPath ) ) )
|
|
683
|
+
{
|
|
684
|
+
fs.rmSync( path.join( __dirname, this.jsonlOutputPath ), { recursive: true, force: true });
|
|
685
|
+
}
|
|
686
|
+
if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPathWithMeta ) ) )
|
|
687
|
+
{
|
|
688
|
+
fs.rmSync( path.join( __dirname, this.jsonlOutputPathWithMeta ), { recursive: true, force: true });
|
|
689
|
+
}
|
|
690
|
+
fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
|
|
691
|
+
fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
|
|
692
|
+
}
|
|
693
|
+
|
|
638
694
|
static sleep ( ms )
|
|
639
695
|
{
|
|
640
696
|
return new Promise( resolve => { return setTimeout( resolve, ms ) });
|