clean-web-scraper 3.10.0 → 4.0.0

This diff shows the changes between publicly released versions of the package as they appear in their public registries; it is provided for informational purposes only.
package/README.md CHANGED
@@ -56,7 +56,6 @@ const scraper = new WebScraper({
  strictBaseURL: true, // Optional: Only scrape URLs from same domain
  maxDepth: Infinity, // Optional: Maximum crawling depth
  maxArticles: Infinity, // Optional: Maximum articles to scrape
- concurrencyLimit: 2, // Optional: Limit concurrent requests
  crawlingDelay: 1000, // Optional: Delay between requests (ms)

  // Network options
@@ -72,8 +71,6 @@ const scraper = new WebScraper({

  // Puppeteer options for handling dynamic content
  usePuppeteer: false, // Optional: Enable Puppeteer browser
- puppeteerProxy: "http://127.0.0.1:2080", // Optional: Puppeteer proxy
- puppeteerExecutablePath: "/path/to/chrome", // Optional: Custom browser path
  });
  await scraper.start();
  ```
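
The README change above documents the breaking change behind the 4.0.0 major bump: `concurrencyLimit`, `puppeteerProxy`, and `puppeteerExecutablePath` are gone from the public options. A minimal migration sketch, assuming `crawlingDelay` is the remaining way to throttle requests and `puppeteerRealProxy` (which this diff leaves in place) is the surviving proxy path; the site URL and dataset path below are placeholders:

```js
const WebScraper = require( "clean-web-scraper" );

void async function ()
{
    // 3.10.0 options that no longer exist in 4.0.0:
    //   concurrencyLimit: 2,
    //   puppeteerProxy: "http://127.0.0.1:2080",
    //   puppeteerExecutablePath: "/path/to/chrome",
    const scraper = new WebScraper({
        baseURL: "https://example.com", // placeholder site
        scrapResultPath: "./dataset/example", // placeholder path
        crawlingDelay: 1000, // throttle via delay; no concurrency option remains
        usePuppeteer: true,
        puppeteerRealProxy: { // proxy option this diff leaves in place
            host: "socks5://127.0.0.1",
            port: "2080",
        },
    });
    await scraper.start();
}();
```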
package/example-usage.js CHANGED
@@ -1,17 +1,25 @@
  const WebScraper = require( "./main" );

- // const cookies = "cf_clearance=ENHJkpw.ycd1tZ_A.d0O27QdslTN0EHaNurhCznfimg-1738241402-1.2.1.1-BlO.WitkGwE3U3vSamX35xP.AgN1HyvHWL03Jhe.twbn4QWojiw1T4.0M4lE_TcIeZrQ6ErwV9kQBMBKmfU0S6lQth1BJx7UpWn4T6wtFm83LmF.cB13PQYSQgGFGsH7qOkGIjbBhMbceQNp.y2XZgLq_hdntGKSBMe0iCUotx_xsqlzkolQIqnUYID3BLEQXZqNvqJOwkzLZ7.kzrwP42VdEuWEvT4jt7F3TkTaU9rumAp8FSNO1.hnr76Tv23OITm17rPD3__Ghdu1D0E.4v693nEiVYO_KQYNf_8gk0vXP.KAvUKA2zQyBmDXkfW3M1MkoLjFNZCanx9FPRVO7g";
  const headers = {
  "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
  "Cache-Control": "private",
  "Accept": "application/xml,application/xhtml+xml,text/html;q=0.9, text/plain;q=0.8,image/png,*/*;q=0.5",
  // "Cookie": cookies
+ };
+
+ async function runScraper ( config, enable )
+ {
+ const scraper = new WebScraper( config );
+ if ( enable )
+ {
+ await scraper.start();
+ }
+ return scraper;
  }

  async function palianswers ( enable )
  {
- // https://palianswers.com
- const scraper = new WebScraper({
+ const config = {
  baseURL: "https://palianswers.com",
  excludeList: [
  "https://palianswers.com/chat/",
@@ -22,7 +30,7 @@ async function palianswers ( enable )
  "https://palianswers.com/themes/"
  ],
  exactExcludeList: [
- "https://palianswers.com/",
+ "https://palianswers.com/"
  ],
  scrapResultPath: "./dataset/palianswers/website",
  jsonlOutputPath: "./dataset/palianswers/train.jsonl",
@@ -31,21 +39,14 @@ async function palianswers ( enable )
  includeMetadata: true,
  metadataFields: ["author", "title", "description", "dateScrapedDate"],
  axiosRetryDelay: 10000,
- concurrencyLimit: 4,
  crawlingDelay: 0
- });
- if ( enable )
- {
- await scraper.start();
- }
- return scraper;
+ };
+ return await runScraper( config, enable );
  }

  async function khameneiIrFreePalestineTag ( enable )
  {
- // https://english.khamenei.ir/Opinions/FreePalestine
- // https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#
- const scraper = new WebScraper({
+ const config = {
  baseURL: "https://english.khamenei.ir/news",
  startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
  maxDepth: 1,
@@ -59,19 +60,13 @@ async function khameneiIrFreePalestineTag ( enable )
  includeMetadata: true,
  metadataFields: ["author", "title", "description", "dateScrapedDate"],
  axiosRetryDelay: 10000,
- concurrencyLimit: 4,
- });
- if ( enable )
- {
- await scraper.start();
- }
- return scraper;
+ };
+ return await runScraper( config, enable );
  }

  async function decolonizepalestine ( enable )
  {
- // https://decolonizepalestine.com
- const scraper = new WebScraper({
+ const config = {
  baseURL: "https://decolonizepalestine.com",
  excludeList: [
  "https://decolonizepalestine.com/cdn-cgi",
@@ -91,19 +86,13 @@ async function decolonizepalestine ( enable )
  includeMetadata: true,
  metadataFields: ["author", "title", "description", "dateScrapedDate"],
  axiosRetryDelay: 10000,
- concurrencyLimit: 4,
- });
- if ( enable )
- {
- await scraper.start();
- }
- return scraper;
+ };
+ return await runScraper( config, enable );
  }

  async function electronicintifada ( enable )
  {
- // https://electronicintifada.net
- const scraper = new WebScraper({
+ const config = {
  baseURL: "https://electronicintifada.net",
  excludeList: [
  "https://electronicintifada.net/updates",
@@ -132,7 +121,6 @@ async function electronicintifada ( enable )
  metadataFields: ["author", "title", "description", "dateScrapedDate"],
  maxDepth: 16,
  maxArticles: 2000,
- concurrencyLimit: 2,
  axiosHeaders: headers,
  axiosRetryDelay: 10000,
  axiosProxy: {
@@ -141,17 +129,13 @@ async function electronicintifada ( enable )
  protocol: "http"
  },
  useProxyAsFallback: true
- });
- if ( enable )
- {
- await scraper.start();
- }
- return scraper;
+ };
+ return await runScraper( config, enable );
  }

  async function standWithPalestine ( enable )
  {
- const scraper = new WebScraper({
+ const config = {
  baseURL: "https://stand-with-palestine.org/blogs",
  startURL: "https://stand-with-palestine.org/blogs",
  scrapResultPath: "./dataset/stand-with-palestine/website",
@@ -162,18 +146,13 @@ async function standWithPalestine ( enable )
  axiosHeaders: headers,
  includeMetadata: true,
  metadataFields: ["author", "title", "description", "dateScrapedDate"]
- });
- if ( enable )
- {
- await scraper.start();
- }
- return scraper;
+ };
+ return await runScraper( config, enable );
  }

  async function mondoweiss ( enable )
  {
- // https://mondoweiss.net
- const scraper = new WebScraper({
+ const config = {
  baseURL: "https://mondoweiss.net",
  excludeList: [
  "https://mondoweiss.net/donate",
@@ -200,7 +179,6 @@ async function mondoweiss ( enable )
  csvOutputPath: "./dataset/mondoweiss/train.csv",
  maxArticles: 2500,
  axiosMaxRetries: 3,
- concurrencyLimit: 3,
  axiosHeaders: headers,
  axiosProxy: {
  host: "localhost",
@@ -212,18 +190,13 @@ async function mondoweiss ( enable )
  includeMetadata: true,
  metadataFields: ["author", "title", "description", "dateScrapedDate"],
  useProxyAsFallback: true
- });
- if ( enable )
- {
- await scraper.start();
- }
- return scraper;
+ };
+ return await runScraper( config, enable );
  }

  async function bdsmovement ( enable )
  {
- // https://bdsmovement.net
- const scraper = new WebScraper({
+ const config = {
  baseURL: "https://bdsmovement.net",
  excludeList: [
  "https://bdsmovement.net/press-area",
@@ -239,24 +212,17 @@ async function bdsmovement ( enable )
  csvOutputPath: "./dataset/bdsmovement/train.csv",
  includeMetadata: true,
  metadataFields: ["author", "title", "description", "dateScrapedDate"],
- puppeteerProxy: "socks5://127.0.0.1:2080",
- puppeteerExecutablePath: "/usr/bin/chromium",
  puppeteerRealProxy: {
  host: "socks5://127.0.0.1",
  port: "2080",
  },
- });
- if ( enable )
- {
- await scraper.start();
- }
- return scraper;
+ };
+ return await runScraper( config, enable );
  }

  async function palestineremembered ( enable )
  {
- // https://www.palestineremembered.com
- const scraper = new WebScraper({
+ const config = {
  baseURL: "https://www.palestineremembered.com",
  startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
  excludeList: [
@@ -289,12 +255,8 @@ async function palestineremembered ( enable )
  port: 2080,
  protocol: "http"
  }
- });
- if ( enable )
- {
- await scraper.start();
- }
- return scraper;
+ };
+ return await runScraper( config, enable );
  }

  void async function main ()
@@ -316,4 +278,4 @@ void async function main ()
  standWithPalestineScraper,
  mondoweisScraper
  ] );
- }()
+ }();
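
Every site function in this file receives the same mechanical edit: build a plain `config` object, then delegate construction and the optional `start()` call to the new `runScraper` helper. For illustration, a site added after this refactor would follow the sketch below (the domain and dataset paths are hypothetical):

```js
// Hypothetical site function following the 4.0.0 runScraper pattern
async function exampleSite ( enable )
{
    const config = {
        baseURL: "https://example.com",
        scrapResultPath: "./dataset/example/website",
        jsonlOutputPath: "./dataset/example/train.jsonl",
        includeMetadata: true,
        metadataFields: ["author", "title", "description", "dateScrapedDate"]
    };
    return await runScraper( config, enable );
}
```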
package/main.js CHANGED
@@ -15,13 +15,12 @@ class WebScraper
  this.strictBaseURL = config.strictBaseURL || true;
  this.maxDepth = config.maxDepth || Infinity;
  this.maxArticles = config.maxArticles || Infinity;
- this.concurrencyLimit = config.concurrencyLimit || 2;
  this.crawlingDelay = config.crawlingDelay ?? 1000;

  // Output paths setup
  this.scrapResultPath = config.scrapResultPath || "./dataset";
  this.textOutputPath = config.textOutputPath || path.join( this.scrapResultPath, "texts" );
- this.textOutputPathWithMeta = `${this.textOutputPath }_with_metadata`;
+ this.textOutputPathWithMeta = `${this.textOutputPath}_with_metadata`;
  this.jsonlOutputPath = config.jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
  this.jsonlOutputPathWithMeta = this.jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
  this.csvOutputPath = config.csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
@@ -60,8 +59,6 @@ class WebScraper

  // Puppeteer configuration
  this.usePuppeteer = config.usePuppeteer || false;
- this.puppeteerProxy = config.puppeteerProxy; // http://127.0.0.1:2080
- this.puppeteerExecutablePath = config.puppeteerExecutablePath;
  this.puppeteerRealProxy = config.puppeteerRealProxy;
  this.configurePuppeteer();
  }
@@ -562,23 +559,6 @@ class WebScraper

  configurePuppeteer ( )
  {
- this.puppeteerOptions = {
- headless: false,
- userDataDir: "./tmp/browser",
- defaultViewport: null,
- args: ["--start-maximized"],
- ignoreDefaultArgs: true
- };
-
- if ( this.puppeteerProxy )
- {
- this.puppeteerOptions.args.push( `--proxy-server=${this.puppeteerProxy}` );
- }
- if ( this.puppeteerExecutablePath )
- {
- this.puppeteerOptions.executablePath = this.puppeteerExecutablePath;
- }
-
  this.puppeteerRealOptions = {
  headless: false,
  args: [],
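
With this hunk, `configurePuppeteer` no longer assembles the plain-Puppeteer `puppeteerOptions` object or wires a `--proxy-server` flag; only the `puppeteerRealOptions` setup survives. Reconstructed from the context lines above, the start of the 4.0.0 method body reads roughly as follows; the rest of the method falls outside this diff, so this is a partial sketch:

```js
configurePuppeteer ( )
{
    // 4.0.0: only the puppeteerRealOptions set is built here.
    this.puppeteerRealOptions = {
        headless: false,
        args: [],
        // ...remainder not shown in this diff (presumably proxy wiring via this.puppeteerRealProxy)
    };
}
```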
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "clean-web-scraper",
- "version": "3.10.0",
+ "version": "4.0.0",
  "main": "main.js",
  "scripts": {
  "start": "node main.js",