clean-web-scraper 3.10.0 → 4.0.1

This diff compares the publicly released contents of these two package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
package/README.md CHANGED
@@ -56,7 +56,6 @@ const scraper = new WebScraper({
   strictBaseURL: true, // Optional: Only scrape URLs from same domain
   maxDepth: Infinity, // Optional: Maximum crawling depth
   maxArticles: Infinity, // Optional: Maximum articles to scrape
-  concurrencyLimit: 2, // Optional: Limit concurrent requests
   crawlingDelay: 1000, // Optional: Delay between requests (ms)
 
   // Network options
@@ -72,8 +71,6 @@ const scraper = new WebScraper({
 
   // Puppeteer options for handling dynamic content
   usePuppeteer: false, // Optional: Enable Puppeteer browser
-  puppeteerProxy: "http://127.0.0.1:2080", // Optional: Puppeteer proxy
-  puppeteerExecutablePath: "/path/to/chrome", // Optional: Custom browser path
 });
 await scraper.start();
 ```
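
For orientation, below is a minimal sketch of a 4.x-style setup assembled only from options that appear in this diff; the site URL and proxy values are placeholders. `concurrencyLimit`, `puppeteerProxy`, and `puppeteerExecutablePath` are removed in 4.0, and Puppeteer proxying now goes through `puppeteerRealProxy`.

```js
const WebScraper = require( "clean-web-scraper" );

void async function ()
{
    // Placeholder config; option names follow the 4.x README above.
    const scraper = new WebScraper({
        baseURL: "https://example.com", // placeholder site
        strictBaseURL: true,
        maxDepth: Infinity,
        maxArticles: Infinity,
        crawlingDelay: 1000, // concurrencyLimit is gone in 4.0; pacing is per-request delay
        includeMetadata: true,
        // 4.0 splits the old "title" field into "pageTitle" and "articleTitle"
        metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
        usePuppeteer: false,
        // replaces the removed puppeteerProxy / puppeteerExecutablePath options
        puppeteerRealProxy: { host: "socks5://127.0.0.1", port: "2080" }
    });
    await scraper.start();
}();
```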
package/example-usage.js CHANGED
@@ -1,17 +1,25 @@
 const WebScraper = require( "./main" );
 
-// const cookies = "cf_clearance=ENHJkpw.ycd1tZ_A.d0O27QdslTN0EHaNurhCznfimg-1738241402-1.2.1.1-BlO.WitkGwE3U3vSamX35xP.AgN1HyvHWL03Jhe.twbn4QWojiw1T4.0M4lE_TcIeZrQ6ErwV9kQBMBKmfU0S6lQth1BJx7UpWn4T6wtFm83LmF.cB13PQYSQgGFGsH7qOkGIjbBhMbceQNp.y2XZgLq_hdntGKSBMe0iCUotx_xsqlzkolQIqnUYID3BLEQXZqNvqJOwkzLZ7.kzrwP42VdEuWEvT4jt7F3TkTaU9rumAp8FSNO1.hnr76Tv23OITm17rPD3__Ghdu1D0E.4v693nEiVYO_KQYNf_8gk0vXP.KAvUKA2zQyBmDXkfW3M1MkoLjFNZCanx9FPRVO7g";
 const headers = {
     "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
     "Cache-Control": "private",
     "Accept": "application/xml,application/xhtml+xml,text/html;q=0.9, text/plain;q=0.8,image/png,*/*;q=0.5",
     // "Cookie": cookies
+};
+
+async function runScraper ( config, enable )
+{
+    const scraper = new WebScraper( config );
+    if ( enable )
+    {
+        await scraper.start();
+    }
+    return scraper;
 }
 
 async function palianswers ( enable )
 {
-    // https://palianswers.com
-    const scraper = new WebScraper({
+    const config = {
         baseURL: "https://palianswers.com",
         excludeList: [
             "https://palianswers.com/chat/",
@@ -22,30 +30,23 @@ async function palianswers ( enable )
             "https://palianswers.com/themes/"
         ],
         exactExcludeList: [
-            "https://palianswers.com/",
+            "https://palianswers.com/"
         ],
         scrapResultPath: "./dataset/palianswers/website",
         jsonlOutputPath: "./dataset/palianswers/train.jsonl",
         textOutputPath: "./dataset/palianswers/texts",
         csvOutputPath: "./dataset/palianswers/train.csv",
         includeMetadata: true,
-        metadataFields: ["author", "title", "description", "dateScrapedDate"],
+        metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
         axiosRetryDelay: 10000,
-        concurrencyLimit: 4,
         crawlingDelay: 0
-    });
-    if ( enable )
-    {
-        await scraper.start();
-    }
-    return scraper;
+    };
+    return await runScraper( config, enable );
 }
 
 async function khameneiIrFreePalestineTag ( enable )
 {
-    // https://english.khamenei.ir/Opinions/FreePalestine
-    // https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#
-    const scraper = new WebScraper({
+    const config = {
         baseURL: "https://english.khamenei.ir/news",
         startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
         maxDepth: 1,
@@ -57,21 +58,15 @@ async function khameneiIrFreePalestineTag ( enable )
         textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
         csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
         includeMetadata: true,
-        metadataFields: ["author", "title", "description", "dateScrapedDate"],
+        metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
         axiosRetryDelay: 10000,
-        concurrencyLimit: 4,
-    });
-    if ( enable )
-    {
-        await scraper.start();
-    }
-    return scraper;
+    };
+    return await runScraper( config, enable );
 }
 
 async function decolonizepalestine ( enable )
 {
-    // https://decolonizepalestine.com
-    const scraper = new WebScraper({
+    const config = {
         baseURL: "https://decolonizepalestine.com",
         excludeList: [
             "https://decolonizepalestine.com/cdn-cgi",
@@ -89,21 +84,15 @@ async function decolonizepalestine ( enable )
         textOutputPath: "./dataset/decolonizepalestine/texts",
         csvOutputPath: "./dataset/decolonizepalestine/train.csv",
         includeMetadata: true,
-        metadataFields: ["author", "title", "description", "dateScrapedDate"],
+        metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
         axiosRetryDelay: 10000,
-        concurrencyLimit: 4,
-    });
-    if ( enable )
-    {
-        await scraper.start();
-    }
-    return scraper;
+    };
+    return await runScraper( config, enable );
 }
 
 async function electronicintifada ( enable )
 {
-    // https://electronicintifada.net
-    const scraper = new WebScraper({
+    const config = {
         baseURL: "https://electronicintifada.net",
         excludeList: [
             "https://electronicintifada.net/updates",
@@ -129,51 +118,43 @@ async function electronicintifada ( enable )
         textOutputPath: "./dataset/electronicintifada/texts",
         csvOutputPath: "./dataset/electronicintifada/train.csv",
         includeMetadata: true,
-        metadataFields: ["author", "title", "description", "dateScrapedDate"],
+        metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
         maxDepth: 16,
         maxArticles: 2000,
-        concurrencyLimit: 2,
         axiosHeaders: headers,
+        axiosMaxRetries: 2,
         axiosRetryDelay: 10000,
         axiosProxy: {
             host: "localhost",
             port: 2080,
             protocol: "http"
         },
-        useProxyAsFallback: true
-    });
-    if ( enable )
-    {
-        await scraper.start();
-    }
-    return scraper;
+        useProxyAsFallback: true,
+        crawlingDelay: 0
+    };
+    return await runScraper( config, enable );
 }
 
 async function standWithPalestine ( enable )
 {
-    const scraper = new WebScraper({
+    const config = {
         baseURL: "https://stand-with-palestine.org/blogs",
         startURL: "https://stand-with-palestine.org/blogs",
+        exactExcludeList: ["https://stand-with-palestine.org/blogs"],
         scrapResultPath: "./dataset/stand-with-palestine/website",
         jsonlOutputPath: "./dataset/stand-with-palestine/train.jsonl",
         textOutputPath: "./dataset/stand-with-palestine/texts",
         csvOutputPath: "./dataset/stand-with-palestine/train.csv",
-        exactExcludeList: ["https://stand-with-palestine.org/blogs"],
         axiosHeaders: headers,
         includeMetadata: true,
-        metadataFields: ["author", "title", "description", "dateScrapedDate"]
-    });
-    if ( enable )
-    {
-        await scraper.start();
-    }
-    return scraper;
+        metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"]
+    };
+    return await runScraper( config, enable );
 }
 
 async function mondoweiss ( enable )
 {
-    // https://mondoweiss.net
-    const scraper = new WebScraper({
+    const config = {
         baseURL: "https://mondoweiss.net",
         excludeList: [
             "https://mondoweiss.net/donate",
@@ -199,31 +180,25 @@ async function mondoweiss ( enable )
         textOutputPath: "./dataset/mondoweiss/texts",
         csvOutputPath: "./dataset/mondoweiss/train.csv",
         maxArticles: 2500,
-        axiosMaxRetries: 3,
-        concurrencyLimit: 3,
+        maxDepth: 15,
         axiosHeaders: headers,
+        axiosMaxRetries: 3,
+        axiosRetryDelay: 10000,
         axiosProxy: {
             host: "localhost",
             port: 2080,
             protocol: "http"
         },
-        maxDepth: 15,
-        axiosRetryDelay: 10000,
+        useProxyAsFallback: true,
         includeMetadata: true,
-        metadataFields: ["author", "title", "description", "dateScrapedDate"],
-        useProxyAsFallback: true
-    });
-    if ( enable )
-    {
-        await scraper.start();
-    }
-    return scraper;
+        metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
+    };
+    return await runScraper( config, enable );
 }
 
 async function bdsmovement ( enable )
 {
-    // https://bdsmovement.net
-    const scraper = new WebScraper({
+    const config = {
         baseURL: "https://bdsmovement.net",
         excludeList: [
             "https://bdsmovement.net/press-area",
@@ -238,25 +213,18 @@ async function bdsmovement ( enable )
         textOutputPath: "./dataset/bdsmovement/texts",
         csvOutputPath: "./dataset/bdsmovement/train.csv",
         includeMetadata: true,
-        metadataFields: ["author", "title", "description", "dateScrapedDate"],
-        puppeteerProxy: "socks5://127.0.0.1:2080",
-        puppeteerExecutablePath: "/usr/bin/chromium",
+        metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
         puppeteerRealProxy: {
             host: "socks5://127.0.0.1",
             port: "2080",
         },
-    });
-    if ( enable )
-    {
-        await scraper.start();
-    }
-    return scraper;
+    };
+    return await runScraper( config, enable );
 }
 
 async function palestineremembered ( enable )
 {
-    // https://www.palestineremembered.com
-    const scraper = new WebScraper({
+    const config = {
         baseURL: "https://www.palestineremembered.com",
         startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
         excludeList: [
@@ -283,18 +251,14 @@ async function palestineremembered ( enable )
         textOutputPath: "./dataset/palestineremembered/texts",
         csvOutputPath: "./dataset/palestineremembered/train.csv",
         includeMetadata: true,
-        metadataFields: ["author", "title", "description", "dateScrapedDate"],
+        metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
         axiosProxy: {
             host: "localhost",
             port: 2080,
             protocol: "http"
         }
-    });
-    if ( enable )
-    {
-        await scraper.start();
-    }
-    return scraper;
+    };
+    return await runScraper( config, enable );
 }
 
 void async function main ()
@@ -316,4 +280,4 @@ void async function main ()
         standWithPalestineScraper,
         mondoweisScraper
     ] );
-}()
+}();
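
Each site function above now just builds a plain config object and delegates to the shared `runScraper` helper; a short usage sketch of the resulting call pattern:

```js
// Inside an async function, as in main() above: construct and start the
// palianswers job; passing false would build the WebScraper without crawling.
const palianswersScraper = await palianswers( true );
```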
package/main.js CHANGED
@@ -15,13 +15,12 @@ class WebScraper
         this.strictBaseURL = config.strictBaseURL || true;
         this.maxDepth = config.maxDepth || Infinity;
         this.maxArticles = config.maxArticles || Infinity;
-        this.concurrencyLimit = config.concurrencyLimit || 2;
         this.crawlingDelay = config.crawlingDelay ?? 1000;
 
         // Output paths setup
         this.scrapResultPath = config.scrapResultPath || "./dataset";
         this.textOutputPath = config.textOutputPath || path.join( this.scrapResultPath, "texts" );
-        this.textOutputPathWithMeta = `${this.textOutputPath }_with_metadata`;
+        this.textOutputPathWithMeta = `${this.textOutputPath}_with_metadata`;
         this.jsonlOutputPath = config.jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
         this.jsonlOutputPathWithMeta = this.jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
         this.csvOutputPath = config.csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
@@ -60,8 +59,6 @@ class WebScraper
 
         // Puppeteer configuration
         this.usePuppeteer = config.usePuppeteer || false;
-        this.puppeteerProxy = config.puppeteerProxy; // http://127.0.0.1:2080
-        this.puppeteerExecutablePath = config.puppeteerExecutablePath;
         this.puppeteerRealProxy = config.puppeteerRealProxy;
         this.configurePuppeteer();
     }
@@ -138,16 +135,14 @@ class WebScraper
 
         if ( !this.isExcluded( url ) )
         {
-            const reader = new Readability( document, {
-                charThreshold: 500,
-                nbTopCandidates: 20
-            });
+            const reader = new Readability( document );
             const article = reader.parse();
             if ( article )
             {
                 if ( this.hasValidPageContent( article.textContent ) )
                 {
                     const metadata = this.extractMetadata( url, document );
+                    metadata.articleTitle = article.title || "";
                     this.saveArticle( url, article.textContent, metadata );
                 }
                 else
@@ -506,17 +501,14 @@ class WebScraper
     {
         return {
             url,
-            title: document.title,
+            pageTitle: document.title,
             description: document.querySelector( "meta[name=\"description\"]" )?.content,
             keywords: document.querySelector( "meta[name=\"keywords\"]" )?.content,
             author: document.querySelector( "meta[name=\"author\"]" )?.content,
-            language:
-                document.documentElement.lang ||
-                document.querySelector( "html" )?.getAttribute( "lang" ),
+            language: document.documentElement.lang || document.querySelector( "html" )?.getAttribute( "lang" ),
             canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
             ogTitle: document.querySelector( "meta[property=\"og:title\"]" )?.content,
-            ogDescription: document.querySelector( "meta[property=\"og:description\"]" )
-                ?.content,
+            ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
             ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
             ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
             dateScrapedDate: new Date().toISOString()
@@ -562,23 +554,6 @@ class WebScraper
 
     configurePuppeteer ( )
     {
-        this.puppeteerOptions = {
-            headless: false,
-            userDataDir: "./tmp/browser",
-            defaultViewport: null,
-            args: ["--start-maximized"],
-            ignoreDefaultArgs: true
-        };
-
-        if ( this.puppeteerProxy )
-        {
-            this.puppeteerOptions.args.push( `--proxy-server=${this.puppeteerProxy}` );
-        }
-        if ( this.puppeteerExecutablePath )
-        {
-            this.puppeteerOptions.executablePath = this.puppeteerExecutablePath;
-        }
-
         this.puppeteerRealOptions = {
             headless: false,
             args: [],
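
With the `extractMetadata` and Readability hunks above, 4.x records carry the HTML document title as `pageTitle` and Readability's parsed title as `articleTitle`. An illustrative record under that reading, with invented placeholder values:

```js
// Hypothetical 4.x metadata record; field names follow the extractMetadata
// return object and the metadata.articleTitle assignment above.
const exampleMetadata = {
    url: "https://example.com/post",
    pageTitle: "Example Post | Example Site", // was "title" in 3.x (document.title)
    articleTitle: "Example Post", // new in 4.0: Readability's article.title
    description: "An example description.",
    author: "Jane Doe",
    language: "en",
    dateScrapedDate: new Date().toISOString()
};
```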
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "clean-web-scraper",
-  "version": "3.10.0",
+  "version": "4.0.1",
   "main": "main.js",
   "scripts": {
     "start": "node main.js",