clean-web-scraper 3.5.0 → 3.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/example-usage.js CHANGED
@@ -8,8 +8,36 @@ const WebScraper = require( "./src/WebScraper" );
8
8
  // "Cookie": cookies
9
9
  // }
10
10
 
11
+ async function palianswers ( enable )
12
+ {
13
+ const scraper = new WebScraper({
14
+ baseURL: "https://palianswers.com",
15
+ excludeList: [
16
+ "https://palianswers.com/chat/",
17
+ "https://palianswers.com/become-a-volunteer/",
18
+ "https://palianswers.com/other-resources/",
19
+ "https://palianswers.com/request-a-rebuttal/",
20
+ "https://palianswers.com/submit-a-rebuttal/",
21
+ "https://palianswers.com/themes/"
22
+ ],
23
+ exactExcludeList: [
24
+ "https://palianswers.com/",
25
+ ],
26
+ scrapResultPath: "./dataset/palianswers/website",
27
+ jsonlOutputPath: "./dataset/palianswers/train.jsonl",
28
+ textOutputPath: "./dataset/palianswers/texts",
29
+ csvOutputPath: "./dataset/palianswers/train.csv",
30
+ includeMetadata: true,
31
+ metadataFields: ["title", "description", "author"]
32
+ });
33
+ if ( enable )
34
+ {
35
+ await scraper.start();
36
+ }
37
+ return scraper;
38
+ }
11
39
 
12
- async function khameneiIrFreePalestineTag ()
40
+ async function khameneiIrFreePalestineTag ( enable )
13
41
  {
14
42
  // https://english.khamenei.ir/Opinions/FreePalestine
15
43
  // https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#
@@ -27,13 +55,15 @@ async function khameneiIrFreePalestineTag ()
27
55
  includeMetadata: true,
28
56
  metadataFields: ["title", "description", "author"]
29
57
  });
30
- await scraper.start();
58
+ if ( enable )
59
+ {
60
+ await scraper.start();
61
+ }
31
62
  return scraper;
32
63
  }
33
64
 
34
- async function decolonizepalestine ()
65
+ async function decolonizepalestine ( enable )
35
66
  {
36
- // https://decolonizepalestine.com
37
67
  const scraper = new WebScraper({
38
68
  baseURL: "https://decolonizepalestine.com",
39
69
  excludeList: [
@@ -54,13 +84,15 @@ async function decolonizepalestine ()
54
84
  includeMetadata: true,
55
85
  metadataFields: ["title", "description", "author"]
56
86
  });
57
- await scraper.start();
87
+ if ( enable )
88
+ {
89
+ await scraper.start();
90
+ }
58
91
  return scraper;
59
92
  }
60
93
 
61
- async function bdsmovement ()
94
+ async function bdsmovement ( enable )
62
95
  {
63
- // https://bdsmovement.net
64
96
  const scraper = new WebScraper({
65
97
  baseURL: "https://bdsmovement.net",
66
98
  excludeList: [
@@ -83,15 +115,16 @@ async function bdsmovement ()
83
115
  host: "socks5://127.0.0.1",
84
116
  port: "2080",
85
117
  },
86
- // usePuppeteer: true
87
118
  });
88
- await scraper.start();
119
+ if ( enable )
120
+ {
121
+ await scraper.start();
122
+ }
89
123
  return scraper;
90
124
  }
91
125
 
92
- async function electronicintifada ()
126
+ async function electronicintifada ( enable )
93
127
  {
94
- // https://electronicintifada.net
95
128
  const scraper = new WebScraper({
96
129
  baseURL: "https://electronicintifada.net",
97
130
  excludeList: [
@@ -103,7 +136,8 @@ async function electronicintifada ()
103
136
  "https://electronicintifada.net/location",
104
137
  "https://electronicintifada.net/file",
105
138
  "https://electronicintifada.net/bytopic/people",
106
- "https://electronicintifada.net/comment/"
139
+ "https://electronicintifada.net/comment/",
140
+ "https://electronicintifada.net/search/site/"
107
141
  ],
108
142
  exactExcludeList: [
109
143
  "https://electronicintifada.net",
@@ -117,31 +151,78 @@ async function electronicintifada ()
117
151
  textOutputPath: "./dataset/electronicintifada/texts",
118
152
  csvOutputPath: "./dataset/electronicintifada/train.csv",
119
153
  includeMetadata: true,
120
- maxArticles: 1000,
154
+ maxArticles: 2000,
121
155
  metadataFields: ["title", "description", "author"]
122
156
  });
123
- await scraper.start();
157
+ if ( enable )
158
+ {
159
+ await scraper.start();
160
+ }
161
+ return scraper;
162
+ }
163
+
164
+ async function palestineremembered ( enable )
165
+ {
166
+ // https://www.palestineremembered.com
167
+ const scraper = new WebScraper({
168
+ baseURL: "https://www.palestineremembered.com",
169
+ startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
170
+ excludeList: [
171
+ "https://www.palestineremembered.com/GeoPoints",
172
+ "https://www.palestineremembered.com/Donate",
173
+ "https://www.palestineremembered.com/ContactUs.html",
174
+ "https://www.palestineremembered.com/tags/Looting-Palestinian-properties.html",
175
+ "https://www.palestineremembered.com/ar/",
176
+ "https://www.palestineremembered.com/OldNewPictures.html",
177
+ "https://www.palestineremembered.com/Maps/index.html",
178
+ "https://www.palestineremembered.com/OralHistory/Interviews-Listing/",
179
+ "https://www.palestineremembered.com/Acre/Famous-Zionist-Quotes/Story637.html",
180
+ "https://www.palestineremembered.com/Articles/General/Story2045.html",
181
+ "https://www.palestineremembered.com/AllTownsListing.html",
182
+ "https://www.palestineremembered.com/Articles/General/ar/",
183
+ "https://www.palestineremembered.com/SiteVideos.html"
184
+ ],
185
+ exactExcludeList: [
186
+ "https://www.palestineremembered.com/index.html",
187
+ "https://www.palestineremembered.com/ZionistFAQ.html"
188
+ ],
189
+ scrapResultPath: "./dataset/palestineremembered/website",
190
+ jsonlOutputPath: "./dataset/palestineremembered/train.jsonl",
191
+ textOutputPath: "./dataset/palestineremembered/texts",
192
+ csvOutputPath: "./dataset/palestineremembered/train.csv",
193
+ includeMetadata: true,
194
+ metadataFields: ["title", "description", "author"],
195
+ axiosProxy: {
196
+ host: "localhost",
197
+ port: 2080,
198
+ protocol: "http"
199
+ }
200
+ });
201
+ if ( enable )
202
+ {
203
+ await scraper.start();
204
+ }
124
205
  return scraper;
125
206
  }
126
207
 
208
+
127
209
  void async function main ()
128
210
  {
129
- const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
130
- const decolonizepalestineScraper = await decolonizepalestine();
131
- const bdsmovementScraper = await bdsmovement();
132
- const electronicintifadaScraper = await electronicintifada();
211
+ const palianswersScraper = await palianswers( true );
212
+ const decolonizepalestineScraper = await decolonizepalestine( true );
213
+ const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( true );
214
+ const bdsmovementScraper = await bdsmovement( false );
215
+ const electronicintifadaScraper = await electronicintifada( true );
216
+ const palestinerememberedScraper = await palestineremembered( false );
217
+
133
218
  await WebScraper.combineResults( "./dataset/combined", [
134
- khameneiIrFreePalestineTagScraper,
219
+ palianswersScraper,
135
220
  decolonizepalestineScraper,
136
- bdsmovementScraper,
137
- electronicintifadaScraper
221
+ khameneiIrFreePalestineTagScraper,
222
+ electronicintifadaScraper,
223
+ // bdsmovementScraper,
224
+ // palestinerememberedScraper,
138
225
  ] );
139
226
 
140
- // 5
141
- // https://www.palestineremembered.com/ZionistFAQ.html
142
-
143
- // 6 https://the-palestinian-side.vercel.app/
144
-
145
227
  // 7 https://stand-with-palestine.org/blogs
146
228
  }()
147
-
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.5.0",
3
+ "version": "3.5.2",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -12,8 +12,9 @@ class WebScraper
12
12
  constructor ({
13
13
  baseURL,
14
14
  startURL,
15
+ strictBaseURL = true,
15
16
  maxDepth = Infinity,
16
- maxArticles = Infinity, // Add this line
17
+ maxArticles = Infinity,
17
18
  excludeList,
18
19
  exactExcludeList,
19
20
  scrapResultPath = "./dataset",
@@ -22,7 +23,8 @@ class WebScraper
22
23
  csvOutputPath,
23
24
  includeMetadata = false,
24
25
  metadataFields = [], // ['title', 'description', 'author', etc.]
25
- headers,
26
+ axiosHeaders,
27
+ axiosProxy,
26
28
  usePuppeteer,
27
29
  puppeteerProxy, // e.g. http://127.0.0.1:2080
28
30
  puppeteerExecutablePath,
@@ -33,15 +35,16 @@ class WebScraper
33
35
  {
34
36
  this.baseURL = baseURL;
35
37
  this.startURL = startURL || baseURL;
38
+ this.strictBaseURL = strictBaseURL;
36
39
  this.maxDepth = maxDepth;
37
- this.maxArticles = maxArticles; // Add this line
40
+ this.maxArticles = maxArticles;
38
41
  this.scrapResultPath = scrapResultPath;
39
42
  this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
40
43
  this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
41
44
  this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
42
45
  this.jsonlOutputPathWithMeta = jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
43
46
  this.csvOutputPathWithMeta = csvOutputPath.replace( ".csv", "_with_metadata.csv" );
44
- this.headers = headers;
47
+ this.axiosHeaders = axiosHeaders;
45
48
  this.includeMetadata = includeMetadata;
46
49
  this.metadataFields = new Set( metadataFields );
47
50
  this.visited = new Set();
@@ -50,6 +53,7 @@ class WebScraper
50
53
  this.allProcessedContent = [];
51
54
  this.filterFileTypes = filterFileTypes;
52
55
  this.excludedFileTypes = excludedFileTypes;
56
+ this.axiosProxy = axiosProxy;
53
57
  this.usePuppeteer = usePuppeteer || false;
54
58
  this.puppeteerOptions = {
55
59
  headless: false,
@@ -129,6 +133,10 @@ class WebScraper
129
133
  {
130
134
  return;
131
135
  }
136
+ if ( !this.isValidDomain( url ) )
137
+ {
138
+ return;
139
+ }
132
140
  try
133
141
  {
134
142
  const data = await this.caller( url );
@@ -180,9 +188,13 @@ class WebScraper
180
188
  try
181
189
  {
182
190
  let axiosOptions = {};
183
- if ( this.headers )
191
+ if ( this.axiosHeaders )
184
192
  {
185
- axiosOptions.headers = this.headers;
193
+ axiosOptions.headers = this.axiosHeaders;
194
+ }
195
+ if ( this.axiosProxy )
196
+ {
197
+ axiosOptions.proxy = this.axiosProxy;
186
198
  }
187
199
 
188
200
  // Step 1: Make a GET request with a small timeout and limited data download
@@ -602,6 +614,22 @@ class WebScraper
602
614
  return !this.excludedFileTypes.some( ext => { return urlPath.endsWith( ext ) });
603
615
  }
604
616
 
617
+ isValidDomain ( url )
618
+ {
619
+ if ( !this.strictBaseURL ) return true;
620
+ try
621
+ {
622
+ const urlObj = new URL( url );
623
+ const baseURLObj = new URL( this.baseURL );
624
+ return urlObj.hostname === baseURLObj.hostname;
625
+ }
626
+ catch ( e )
627
+ {
628
+ console.log( `Invalid URL: ${url}` );
629
+ return false;
630
+ }
631
+ }
632
+
605
633
  isValidContent ( content )
606
634
  {
607
635
  // Remove whitespace and newlines for checking