clean-web-scraper 3.3.2 → 3.3.4

This diff shows the published contents of clean-web-scraper 3.3.2 and 3.3.4 as they appear in their public registry, and is provided for informational purposes only.
package/README.md CHANGED
@@ -35,6 +35,7 @@ npm i clean-web-scraper
  
  git clone https://github.com/mlibre/Clean-Web-Scraper
  cd Clean-Web-Scraper
+ sudo pacman -S extra/xorg-server-xvfb chromium
  npm install
  ```
  
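The new Arch packages line up with the Puppeteer support added in this release: `chromium` provides a browser binary the scraper can be pointed at, and `xorg-server-xvfb` presumably supplies a virtual display for the non-headless browser sessions the new code opens (the `disableXvfb` option added in `WebScraper.js` below points the same way). A minimal sketch of wiring the system browser in; the values mirror `example-usage.js` and are illustrative, not required:

```js
const WebScraper = require( "./src/WebScraper" );

// Illustrative values: /usr/bin/chromium is where Arch's chromium package
// installs the binary; the SOCKS proxy is optional and assumed here.
const scraper = new WebScraper({
	baseURL: "https://example.com",
	scrapResultPath: "./dataset/example/website",
	puppeteerExecutablePath: "/usr/bin/chromium",
	puppeteerProxy: "socks5://127.0.0.1:2080"
});
```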
package/example-usage.js CHANGED
@@ -1,5 +1,13 @@
  const WebScraper = require( "./src/WebScraper" );
  
+ // const cookies = "cf_clearance=ENHJkpw.ycd1tZ_A.d0O27QdslTN0EHaNurhCznfimg-1738241402-1.2.1.1-BlO.WitkGwE3U3vSamX35xP.AgN1HyvHWL03Jhe.twbn4QWojiw1T4.0M4lE_TcIeZrQ6ErwV9kQBMBKmfU0S6lQth1BJx7UpWn4T6wtFm83LmF.cB13PQYSQgGFGsH7qOkGIjbBhMbceQNp.y2XZgLq_hdntGKSBMe0iCUotx_xsqlzkolQIqnUYID3BLEQXZqNvqJOwkzLZ7.kzrwP42VdEuWEvT4jt7F3TkTaU9rumAp8FSNO1.hnr76Tv23OITm17rPD3__Ghdu1D0E.4v693nEiVYO_KQYNf_8gk0vXP.KAvUKA2zQyBmDXkfW3M1MkoLjFNZCanx9FPRVO7g";
+ // const headers = {
+ // 	"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
+ // 	"Cache-Control": "private",
+ // 	"Accept": "application/xml,application/xhtml+xml,text/html;q=0.9, text/plain;q=0.8,image/png,*/*;q=0.5",
+ // 	"Cookie": cookies
+ // }
+
  
  async function khameneiIrFreePalestineTag ()
  {
@@ -52,9 +60,9 @@ async function decolonizepalestine ()
  
  async function bdsmovement ()
  {
- 	// https://bdsmovement.org
+ 	// https://bdsmovement.net
  	const scraper = new WebScraper({
- 		baseURL: "https://bdsmovement.org",
+ 		baseURL: "https://bdsmovement.net",
  		excludeList: [
  			"https://bdsmovement.net/press-area",
  			"https://bdsmovement.net/privacy-policy",
@@ -68,6 +76,36 @@ async function bdsmovement ()
  		textOutputPath: "./dataset/bdsmovement/texts",
  		csvOutputPath: "./dataset/bdsmovement/train.csv",
  		includeMetadata: true,
+ 		metadataFields: ["title", "description", "author"],
+ 		puppeteerProxy: "socks5://127.0.0.1:2080",
+ 		puppeteerExecutablePath: "/usr/bin/chromium",
+ 		puppeteerRealProxy: {
+ 			host: "socks5://127.0.0.1",
+ 			port: "2080",
+ 		}
+ 	});
+ 	await scraper.start();
+ 	return scraper;
+ }
+
+ async function electronicintifada ()
+ {
+ 	// https://electronicintifada.net
+ 	const scraper = new WebScraper({
+ 		baseURL: "https://electronicintifada.net",
+ 		excludeList: [
+ 			"https://electronicintifada.net/press-area",
+ 			"https://electronicintifada.net/privacy-policy",
+ 			"https://electronicintifada.net/get-involved/join-a-bds-campaign",
+ 			"https://electronicintifada.net/donate_",
+ 			"https://electronicintifada.net/user",
+ 			"https://electronicintifada.net/admin"
+ 		],
+ 		scrapResultPath: "./dataset/electronicintifada/website",
+ 		jsonlOutputPath: "./dataset/electronicintifada/train.jsonl",
+ 		textOutputPath: "./dataset/electronicintifada/texts",
+ 		csvOutputPath: "./dataset/electronicintifada/train.csv",
+ 		includeMetadata: true,
  		metadataFields: ["title", "description", "author"]
  	});
  	await scraper.start();
@@ -77,13 +115,13 @@ async function bdsmovement ()
  void async function main ()
  {
  	// const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
- 	// const decolonizepalestineScraper = await decolonizepalestine();
- 	const bdsmovementScraper = await bdsmovement();
- 	// await WebScraper.combineResults( "./dataset/combined", [
- 	// 	khameneiIrFreePalestineTagScraper,
- 	// 	decolonizepalestineScraper,
- 	// 	bdsmovementScraper
- 	// ] );
+ 	const decolonizepalestineScraper = await decolonizepalestine();
+ 	// const bdsmovementScraper = await bdsmovement();
+ 	await WebScraper.combineResults( "./dataset/combined", [
+ 		// khameneiIrFreePalestineTagScraper,
+ 		decolonizepalestineScraper,
+ 		// bdsmovementScraper
+ 	] );
  
  	// 4
  	// https://electronicintifada.net/
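A caveat when reading the example diff above: the browser fallback in the new `caller()` only triggers when `usePuppeteer` is true, and the constructor hardcodes `this.usePuppeteer = false` in this version. A sketch of opting in by flipping the flag after construction — an assumption about usage, not a documented API:

```js
const WebScraper = require( "./src/WebScraper" );

void async function ()
{
	const scraper = new WebScraper({ baseURL: "https://bdsmovement.net" });
	scraper.usePuppeteer = true; // assumption: no constructor option exposes this yet
	await scraper.start();
}();
```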
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  	"name": "clean-web-scraper",
- 	"version": "3.3.2",
+ 	"version": "3.3.4",
  	"main": "main.js",
  	"scripts": {
  		"start": "node main.js",
@@ -26,6 +26,8 @@
  		"@mozilla/readability": "^0.5.0",
  		"axios": "^1.7.9",
  		"eslint": "^9.17.0",
- 		"jsdom": "^26.0.0"
+ 		"jsdom": "^26.0.0",
+ 		"puppeteer": "^24.1.1",
+ 		"puppeteer-real-browser": "^1.3.22"
  	}
  }
package/src/WebScraper.js CHANGED
@@ -4,6 +4,8 @@ const { JSDOM } = jsdom;
  const { Readability } = require( "@mozilla/readability" );
  const fs = require( "fs" );
  const path = require( "path" );
+ const puppeteer = require( "puppeteer" );
+ const { connect } = require( "puppeteer-real-browser" );
  
  class WebScraper
  {
@@ -19,11 +21,10 @@ class WebScraper
  		csvOutputPath,
  		includeMetadata = false,
  		metadataFields = [], // ['title', 'description', 'author', 'lastModified', etc.]
- 		headers = {
- 			"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
- 			"Cache-Control": "private",
- 			"Accept": "application/xml,application/xhtml+xml,text/html;q=0.9, text/plain;q=0.8,image/png,*/*;q=0.5"
- 		}
+ 		headers,
+ 		puppeteerProxy, // e.g. http://127.0.0.1:2080
+ 		puppeteerExecutablePath,
+ 		puppeteerRealProxy
  	})
  	{
  		this.baseURL = baseURL;
@@ -42,6 +43,34 @@ class WebScraper
  		this.excludeList = this.normalizeExcludeList( excludeList );
  		this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
  		this.allProcessedContent = [];
+ 		this.usePuppeteer = false;
+ 		this.puppeteerOptions = {
+ 			headless: false,
+ 			userDataDir: "./tmp/browser",
+ 			defaultViewport: null,
+ 			args: [
+ 				"--start-maximized"
+ 			],
+ 			ignoreDefaultArgs: true
+ 		};
+ 		if ( puppeteerProxy )
+ 		{
+ 			this.puppeteerOptions.args.push( `--proxy-server=${puppeteerProxy}` );
+ 		}
+ 		if ( puppeteerExecutablePath )
+ 		{
+ 			this.puppeteerOptions.executablePath = puppeteerExecutablePath;
+ 		}
+ 		this.puppeteerRealOptions = {
+ 			headless: false,
+ 			args: [],
+ 			customConfig: {},
+ 			turnstile: true,
+ 			connectOption: {},
+ 			disableXvfb: false,
+ 			ignoreAllFlags: false,
+ 			proxy: puppeteerRealProxy
+ 		};
  	}
  
  	async start ()
@@ -63,9 +92,7 @@ class WebScraper
  		this.visited.add( url );
  		try
  		{
- 			const { data, headers } = await axios.get( url, {
- 				headers: this.headers,
- 			});
+ 			const data = await this.caller( url );
  			const dom = new JSDOM( data, { url });
  			const { document } = dom.window;
  
@@ -78,8 +105,7 @@ class WebScraper
  		{
  			if ( this.isValidContent( article.textContent ) )
  			{
-
- 				const metadata = this.metadataextractor( url, document, headers );
+ 				const metadata = this.metadataextractor( url, document );
  				metadata.depth = depth;
  				this.saveArticle( url, article.textContent, metadata );
  			}
@@ -109,6 +135,81 @@
  		}
  	}
  
+ 	async caller ( url )
+ 	{
+ 		try
+ 		{
+ 			const axiosOptions = {};
+ 			if ( this.headers )
+ 			{
+ 				axiosOptions.headers = this.headers;
+ 			}
+ 			const result = await axios.get( url, axiosOptions );
+ 			return result.data;
+ 		}
+ 		catch ( error )
+ 		{
+ 			console.error( `Error fetching ${url}:`, error.message );
+ 			if ( error.status === 403 && this.usePuppeteer )
+ 			{
+ 				let { browser, page } = await connect( this.puppeteerRealOptions );
+
+ 				// const browser = await puppeteer.launch( this.puppeteerOptions );
+ 				// const page = await browser.newPage();
+ 				try
+ 				{
+ 					let htmlContent;
+ 					for ( let index = 0; index < 10; index++ )
+ 					{
+ 						const pages = await browser.pages();
+ 						page = pages[0];
+ 						page.setDefaultNavigationTimeout( 30000 );
+ 						await page.goto( url );
+ 						console.log( `Please solve the CAPTCHA on the opened browser window for ${url}` );
+ 						await this.waitForPageToLoad( page );
+ 						htmlContent = await page.content();
+ 						if ( this.isValidContent( htmlContent ) )
+ 						{
+ 							break;
+ 						}
+ 						page = pages[0];
+ 						page.setDefaultNavigationTimeout( 30000 );
+ 						await this.waitForPageToLoad( page );
+ 						htmlContent = await page.content();
+ 						if ( this.isValidContent( htmlContent ) )
+ 						{
+ 							break;
+ 						}
+ 						await page.goto( url );
+ 					}
+ 					return htmlContent;
+ 				}
+ 				catch ( error )
+ 				{
+ 					console.error( `Error solving CAPTCHA for ${url}:`, error.message, error );
+ 					throw error;
+ 				}
+ 				finally
+ 				{
+ 					await browser.close(); // Close the browser after scraping
+ 				}
+ 			}
+ 			throw error;
+ 		}
+ 	}
+
+ 	async waitForPageToLoad ( page )
+ 	{
+ 		try
+ 		{
+ 			await page.waitForNavigation({ waitUntil: "networkidle0" });
+ 		}
+ 		catch ( error )
+ 		{
+ 			console.log( error );
+ 		}
+ 	}
+
  	extractLinks ( data )
  	{
  		const links = new Set();
@@ -329,7 +430,7 @@ class WebScraper
  		return filteredMetadata;
  	}
  
- 	metadataextractor ( url, document, headers )
+ 	metadataextractor ( url, document )
  	{
  		return {
  			url,
@@ -337,9 +438,6 @@ class WebScraper
  			description: document.querySelector( "meta[name=\"description\"]" )?.content,
  			keywords: document.querySelector( "meta[name=\"keywords\"]" )?.content,
  			author: document.querySelector( "meta[name=\"author\"]" )?.content,
- 			lastModified: headers["last-modified"],
- 			contentType: headers["content-type"],
- 			contentLength: headers["content-length"],
  			language: document.documentElement.lang || document.querySelector( "html" )?.getAttribute( "lang" ),
  			canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
  			ogTitle: document.querySelector( "meta[property=\"og:title\"]" )?.content,
@@ -408,6 +506,33 @@
  		fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
  	}
  
+ 	isValidContent ( content )
+ 	{
+ 		// Remove whitespace and newlines for checking
+ 		const cleanContent = content.replace( /\s+/g, " " ).trim().toLowerCase();
+
+ 		// List of phrases that indicate invalid content
+ 		const invalidPhrases = [
+ 			"verifying that you are not a robot",
+ 			"verifying you are human. this may take a few seconds.",
+ 			"verify you are human by completing the action below",
+ 			"checking if the site connection is secure",
+ 			"please wait while we verify",
+ 			"please enable javascript",
+ 			"access denied",
+ 			"verifying you are human",
+ 			"captcha verification"
+ 		];
+
+ 		const hasInvalidPhrases = invalidPhrases.some( phrase => { return cleanContent.includes( phrase ) });
+ 		// Check content length
+ 		if ( cleanContent.length < 100 || hasInvalidPhrases )
+ 		{
+ 			return false;
+ 		}
+ 		return true;
+ 	}
+
  	static sleep ( ms )
  	{
  		return new Promise( resolve => { return setTimeout( resolve, ms ) });
@@ -448,12 +573,18 @@ class WebScraper
  		for ( const website of websites )
  		{
  			const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
- 			jsonlOutput.write( jsonlContent );
+ 			if ( jsonlContent )
+ 			{
+ 				jsonlOutput.write( jsonlContent );
+ 			}
  
  			if ( website.includeMetadata )
  			{
  				const jsonlMetaContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPathWithMeta ), "utf-8" );
- 				jsonlMetaOutput.write( jsonlMetaContent );
+ 				if ( jsonlMetaContent )
+ 				{
+ 					jsonlMetaOutput.write( jsonlMetaContent );
+ 				}
  			}
  		}
  
@@ -480,7 +611,10 @@
  				.split( "\n" )
  				.slice( 1 )
  				.filter( line => { return line.trim() });
- 			csvOutput.write( `${csvContent.join( "\n" )}\n` );
+ 			if ( csvContent.length > 0 )
+ 			{
+ 				csvOutput.write( `${csvContent.join( "\n" )}\n` );
+ 			}
  
  			if ( website.includeMetadata )
  			{
@@ -488,7 +622,10 @@
  					.split( "\n" )
  					.slice( 1 )
  					.filter( line => { return line.trim() });
- 				csvMetaOutput.write( `${csvMetaContent.join( "\n" )}\n` );
+ 				if ( csvMetaContent.length > 0 )
+ 				{
+ 					csvMetaOutput.write( `${csvMetaContent.join( "\n" )}\n` );
+ 				}
  			}
  		}
  
@@ -496,30 +633,6 @@
  		csvMetaOutput.end();
  	}
  
- 	isValidContent ( content )
- 	{
- 		// Remove whitespace and newlines for checking
- 		const cleanContent = content.replace( /\s+/g, " " ).trim().toLowerCase();
-
- 		// List of phrases that indicate invalid content
- 		const invalidPhrases = [
- 			"verifying that you are not a robot",
- 			"checking if the site connection is secure",
- 			"please wait while we verify",
- 			"please enable javascript",
- 			"access denied",
- 			"captcha verification"
- 		];
-
- 		const hasInvalidPhrases = invalidPhrases.some( phrase => { return cleanContent.includes( phrase ) });
- 		// Check content length
- 		if ( cleanContent.length < 100 && hasInvalidPhrases )
- 		{
- 			return false;
- 		}
- 		return true;
- 	}
-
  	static combineTextFiles ( fullOutputPath, websites )
  	{
  		let textFileCounter = 1;
@@ -553,7 +666,6 @@ class WebScraper
  			}
  		}
  	}
-
  }
  
  module.exports = WebScraper;
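A behavioral note on the relocated `isValidContent`: besides the extra Cloudflare phrases, the condition flipped from `&&` to `||`, so a page is now rejected when it is short or contains a challenge phrase, where previously it had to be both. A standalone restatement of the two checks, for illustration only (sample text made up, phrase list shortened):

```js
const invalidPhrases = ["verifying you are human", "access denied", "captcha verification"];

function isValidOld ( content ) // 3.3.2: rejected only if short AND suspicious
{
	const clean = content.replace( /\s+/g, " " ).trim().toLowerCase();
	const suspicious = invalidPhrases.some( phrase => { return clean.includes( phrase ) });
	return !( clean.length < 100 && suspicious );
}

function isValidNew ( content ) // 3.3.4: rejected if short OR suspicious
{
	const clean = content.replace( /\s+/g, " " ).trim().toLowerCase();
	const suspicious = invalidPhrases.some( phrase => { return clean.includes( phrase ) });
	return !( clean.length < 100 || suspicious );
}

// A long Cloudflare-style interstitial slipped through the old check:
const challengePage = `Verifying you are human. This may take a few seconds. ${"x".repeat( 200 )}`;
console.log( isValidOld( challengePage ) ); // true  (wrongly accepted)
console.log( isValidNew( challengePage ) ); // false (now rejected)
```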