clean-web-scraper 3.5.1 → 3.5.2

This diff shows the changes between publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
package/example-usage.js CHANGED
@@ -8,8 +8,36 @@ const WebScraper = require( "./src/WebScraper" );
 // "Cookie": cookies
 // }
 
+async function palianswers ( enable )
+{
+    const scraper = new WebScraper({
+        baseURL: "https://palianswers.com",
+        excludeList: [
+            "https://palianswers.com/chat/",
+            "https://palianswers.com/become-a-volunteer/",
+            "https://palianswers.com/other-resources/",
+            "https://palianswers.com/request-a-rebuttal/",
+            "https://palianswers.com/submit-a-rebuttal/",
+            "https://palianswers.com/themes/"
+        ],
+        exactExcludeList: [
+            "https://palianswers.com/",
+        ],
+        scrapResultPath: "./dataset/palianswers/website",
+        jsonlOutputPath: "./dataset/palianswers/train.jsonl",
+        textOutputPath: "./dataset/palianswers/texts",
+        csvOutputPath: "./dataset/palianswers/train.csv",
+        includeMetadata: true,
+        metadataFields: ["title", "description", "author"]
+    });
+    if ( enable )
+    {
+        await scraper.start();
+    }
+    return scraper;
+}
 
-async function khameneiIrFreePalestineTag ()
+async function khameneiIrFreePalestineTag ( enable )
 {
     // https://english.khamenei.ir/Opinions/FreePalestine
     // https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#
@@ -135,6 +163,7 @@ async function electronicintifada ( enable )
 
 async function palestineremembered ( enable )
 {
+    // https://www.palestineremembered.com
     const scraper = new WebScraper({
         baseURL: "https://www.palestineremembered.com",
         startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
@@ -179,24 +208,21 @@ async function palestineremembered ( enable )
 
 void async function main ()
 {
-    const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( false );
-    const decolonizepalestineScraper = await decolonizepalestine( false );
+    const palianswersScraper = await palianswers( true );
+    const decolonizepalestineScraper = await decolonizepalestine( true );
+    const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( true );
     const bdsmovementScraper = await bdsmovement( false );
-    const electronicintifadaScraper = await electronicintifada( false );
-    const palestinerememberedScraper = await palestineremembered( true );
+    const electronicintifadaScraper = await electronicintifada( true );
+    const palestinerememberedScraper = await palestineremembered( false );
+
     await WebScraper.combineResults( "./dataset/combined", [
-        khameneiIrFreePalestineTagScraper,
+        palianswersScraper,
         decolonizepalestineScraper,
-        bdsmovementScraper,
+        khameneiIrFreePalestineTagScraper,
         electronicintifadaScraper,
-        palestinerememberedScraper
+        // bdsmovementScraper,
+        // palestinerememberedScraper,
     ] );
 
-    // 5
-    // https://www.palestineremembered.com/ZionistFAQ.html
-
-    // 6 https://the-palestinian-side.vercel.app/
-
     // 7 https://stand-with-palestine.org/blogs
 }()
-
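Note: each source helper in example-usage.js now takes an enable flag. The WebScraper instance is always configured, so it can still be passed to WebScraper.combineResults, but start() only runs when the flag is true. A minimal sketch of that pattern, assuming the same API as example-usage.js above; the source name and URLs below are placeholders, not part of the package:

// Sketch only: mirrors the enable-flag pattern used in example-usage.js.
// "exampleSource" and its URLs are placeholders.
const WebScraper = require( "./src/WebScraper" );

async function exampleSource ( enable )
{
    const scraper = new WebScraper({
        baseURL: "https://example.com",
        scrapResultPath: "./dataset/example/website"
    });
    if ( enable )
    {
        await scraper.start(); // crawl only when enabled
    }
    return scraper; // the configured instance can still be passed to combineResults
}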
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
     "name": "clean-web-scraper",
-    "version": "3.5.1",
+    "version": "3.5.2",
     "main": "main.js",
     "scripts": {
         "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -12,8 +12,9 @@ class WebScraper
     constructor ({
         baseURL,
         startURL,
+        strictBaseURL = true,
         maxDepth = Infinity,
-        maxArticles = Infinity, // Add this line
+        maxArticles = Infinity,
         excludeList,
         exactExcludeList,
         scrapResultPath = "./dataset",
@@ -34,8 +35,9 @@ class WebScraper
     {
         this.baseURL = baseURL;
         this.startURL = startURL || baseURL;
+        this.strictBaseURL = strictBaseURL;
         this.maxDepth = maxDepth;
-        this.maxArticles = maxArticles; // Add this line
+        this.maxArticles = maxArticles;
         this.scrapResultPath = scrapResultPath;
         this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
         this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
@@ -131,6 +133,10 @@ class WebScraper
         {
             return;
         }
+        if ( !this.isValidDomain( url ) )
+        {
+            return;
+        }
         try
         {
             const data = await this.caller( url );
@@ -608,6 +614,22 @@ class WebScraper
         return !this.excludedFileTypes.some( ext => { return urlPath.endsWith( ext ) });
     }
 
+    isValidDomain ( url )
+    {
+        if ( !this.strictBaseURL ) return true;
+        try
+        {
+            const urlObj = new URL( url );
+            const baseURLObj = new URL( this.baseURL );
+            return urlObj.hostname === baseURLObj.hostname;
+        }
+        catch ( e )
+        {
+            console.log( `Invalid URL: ${url}` );
+            return false;
+        }
+    }
+
     isValidContent ( content )
     {
         // Remove whitespace and newlines for checking
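Note: the new strictBaseURL option (default true) is enforced by the isValidDomain method added above: while the option is on, the crawler returns early for any URL whose hostname differs from that of baseURL, and URLs that fail to parse are logged and skipped. A minimal sketch of passing the option, assuming only the constructor options visible in this diff; the URLs are placeholders:

// Sketch only: option names come from the diff above; URLs are placeholders.
const WebScraper = require( "./src/WebScraper" );

// Default behaviour: stay on the base domain (strictBaseURL defaults to true).
const sameDomainOnly = new WebScraper({
    baseURL: "https://example.com"
});

// Opting out: isValidDomain() then always returns true, so links to other
// hostnames are no longer filtered out by this check.
const crossDomain = new WebScraper({
    baseURL: "https://example.com",
    strictBaseURL: false
});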