clean-web-scraper 3.3.3 → 3.3.5

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
package/README.md CHANGED
@@ -35,6 +35,7 @@ npm i clean-web-scraper
 
 git clone https://github.com/mlibre/Clean-Web-Scraper
 cd Clean-Web-Scraper
+sudo pacman -S extra/xorg-server-xvfb chromium
 npm install
 ```
 
package/example-usage.js CHANGED
@@ -1,5 +1,13 @@
 const WebScraper = require( "./src/WebScraper" );
 
+// const cookies = "cf_clearance=ENHJkpw.ycd1tZ_A.d0O27QdslTN0EHaNurhCznfimg-1738241402-1.2.1.1-BlO.WitkGwE3U3vSamX35xP.AgN1HyvHWL03Jhe.twbn4QWojiw1T4.0M4lE_TcIeZrQ6ErwV9kQBMBKmfU0S6lQth1BJx7UpWn4T6wtFm83LmF.cB13PQYSQgGFGsH7qOkGIjbBhMbceQNp.y2XZgLq_hdntGKSBMe0iCUotx_xsqlzkolQIqnUYID3BLEQXZqNvqJOwkzLZ7.kzrwP42VdEuWEvT4jt7F3TkTaU9rumAp8FSNO1.hnr76Tv23OITm17rPD3__Ghdu1D0E.4v693nEiVYO_KQYNf_8gk0vXP.KAvUKA2zQyBmDXkfW3M1MkoLjFNZCanx9FPRVO7g";
+// const headers = {
+// 	"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
+// 	"Cache-Control": "private",
+// 	"Accept": "application/xml,application/xhtml+xml,text/html;q=0.9, text/plain;q=0.8,image/png,*/*;q=0.5",
+// 	"Cookie": cookies
+// }
+
 
 async function khameneiIrFreePalestineTag ()
 {
@@ -52,9 +60,9 @@ async function decolonizepalestine ()
 
 async function bdsmovement ()
 {
-	// https://bdsmovement.org
+	// https://bdsmovement.net
 	const scraper = new WebScraper({
-		baseURL: "https://bdsmovement.org",
+		baseURL: "https://bdsmovement.net",
 		excludeList: [
 			"https://bdsmovement.net/press-area",
 			"https://bdsmovement.net/privacy-policy",
@@ -68,6 +76,37 @@ async function bdsmovement ()
 		textOutputPath: "./dataset/bdsmovement/texts",
 		csvOutputPath: "./dataset/bdsmovement/train.csv",
 		includeMetadata: true,
+		metadataFields: ["title", "description", "author"],
+		puppeteerProxy: "socks5://127.0.0.1:2080",
+		puppeteerExecutablePath: "/usr/bin/chromium",
+		puppeteerRealProxy: {
+			host: "socks5://127.0.0.1",
+			port: "2080",
+		},
+		usePuppeteer: true
+	});
+	await scraper.start();
+	return scraper;
+}
+
+async function electronicintifada ()
+{
+	// https://electronicintifada.net
+	const scraper = new WebScraper({
+		baseURL: "https://electronicintifada.net",
+		excludeList: [
+			"https://electronicintifada.net/press-area",
+			"https://electronicintifada.net/privacy-policy",
+			"https://electronicintifada.net/get-involved/join-a-bds-campaign",
+			"https://electronicintifada.net/donate_",
+			"https://electronicintifada.net/user",
+			"https://electronicintifada.net/admin"
+		],
+		scrapResultPath: "./dataset/electronicintifada/website",
+		jsonlOutputPath: "./dataset/electronicintifada/train.jsonl",
+		textOutputPath: "./dataset/electronicintifada/texts",
+		csvOutputPath: "./dataset/electronicintifada/train.csv",
+		includeMetadata: true,
 		metadataFields: ["title", "description", "author"]
 	});
 	await scraper.start();
@@ -78,11 +117,11 @@ void async function main ()
 {
 	const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
 	const decolonizepalestineScraper = await decolonizepalestine();
-	// const bdsmovementScraper = await bdsmovement();
+	const bdsmovementScraper = await bdsmovement();
 	await WebScraper.combineResults( "./dataset/combined", [
 		khameneiIrFreePalestineTagScraper,
 		decolonizepalestineScraper,
-		// bdsmovementScraper
+		bdsmovementScraper
 	] );
 
 	// 4
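The commented-out cookies/headers block added at the top of this file pairs with the `headers` constructor option, which (per the WebScraper.js changes below) no longer has a browser-like default and is now passed through as-is. A minimal sketch of wiring it up — the URL, paths, and cookie value here are placeholders, not from the package:

```js
const WebScraper = require( "./src/WebScraper" );

void async function ()
{
	// Hypothetical target and paths; a real cf_clearance cookie would have to
	// be copied from an actual browser session.
	const scraper = new WebScraper({
		baseURL: "https://example.com",
		scrapResultPath: "./dataset/example/website",
		headers: {
			"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
			"Cookie": "cf_clearance=..."
		}
	});
	await scraper.start();
}();
```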
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
 	"name": "clean-web-scraper",
-	"version": "3.3.3",
+	"version": "3.3.5",
 	"main": "main.js",
 	"scripts": {
 		"start": "node main.js",
@@ -26,6 +26,8 @@
 		"@mozilla/readability": "^0.5.0",
 		"axios": "^1.7.9",
 		"eslint": "^9.17.0",
-		"jsdom": "^26.0.0"
+		"jsdom": "^26.0.0",
+		"puppeteer": "^24.1.1",
+		"puppeteer-real-browser": "^1.3.22"
 	}
 }
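The two new dependencies back the 403/CAPTCHA fallback added in WebScraper.js below. As a rough sketch of what `puppeteer-real-browser`'s `connect` provides — the option names mirror the `puppeteerRealOptions` object in that diff, and the proxy values are illustrative:

```js
const { connect } = require( "puppeteer-real-browser" );

void async function demo ()
{
	// Mirrors this package's puppeteerRealOptions; proxy values are illustrative
	const { browser, page } = await connect({
		headless: false,
		turnstile: true, // try to pass Cloudflare Turnstile checks automatically
		proxy: { host: "socks5://127.0.0.1", port: "2080" }
	});
	await page.goto( "https://example.com" );
	console.log( ( await page.content() ).length );
	await browser.close();
}();
```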
package/src/WebScraper.js CHANGED
@@ -4,6 +4,8 @@ const { JSDOM } = jsdom;
 const { Readability } = require( "@mozilla/readability" );
 const fs = require( "fs" );
 const path = require( "path" );
+const puppeteer = require( "puppeteer" );
+const { connect } = require( "puppeteer-real-browser" );
 
 class WebScraper
 {
@@ -19,11 +21,11 @@ class WebScraper
 		csvOutputPath,
 		includeMetadata = false,
 		metadataFields = [], // ['title', 'description', 'author', 'lastModified', etc.]
-		headers = {
-			"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
-			"Cache-Control": "private",
-			"Accept": "application/xml,application/xhtml+xml,text/html;q=0.9, text/plain;q=0.8,image/png,*/*;q=0.5"
-		}
+		headers,
+		usePuppeteer,
+		puppeteerProxy, // e.g. http://127.0.0.1:2080
+		puppeteerExecutablePath,
+		puppeteerRealProxy
 	})
 	{
 		this.baseURL = baseURL;
@@ -42,16 +44,67 @@ class WebScraper
 		this.excludeList = this.normalizeExcludeList( excludeList );
 		this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
 		this.allProcessedContent = [];
+		this.usePuppeteer = usePuppeteer || false;
+		this.puppeteerOptions = {
+			headless: false,
+			userDataDir: "./tmp/browser",
+			defaultViewport: null,
+			args: [
+				"--start-maximized"
+			],
+			ignoreDefaultArgs: true,
+		};
+		if ( puppeteerProxy )
+		{
+			this.puppeteerOptions.args.push( `--proxy-server=${puppeteerProxy}` );
+		}
+		if ( puppeteerExecutablePath )
+		{
+			this.puppeteerOptions.executablePath = puppeteerExecutablePath;
+		}
+		this.puppeteerRealOptions = {
+			headless: false,
+			args: [],
+			customConfig: {},
+			turnstile: true,
+			connectOption: {},
+			disableXvfb: false,
+			ignoreAllFlags: false,
+			proxy: puppeteerRealProxy
+		};
+		this.puppeteerBrowser = null;
+		this.puppeteerPage = null;
 	}
 
 	async start ()
 	{
-		this.createOutputDirectory();
-		await this.fetchPage( this.startURL, 0 );
-		this.createJSONLFile();
-		this.saveNumberedTextFiles();
-		this.createCSVFile();
-		console.log( "Scraping completed." );
+		try
+		{
+			if ( this.usePuppeteer )
+			{
+				let { browser, page } = await connect( this.puppeteerRealOptions );
+				this.puppeteerBrowser = browser;
+				this.puppeteerPage = page;
+			}
+			this.createOutputDirectory();
+			await this.fetchPage( this.startURL, 0 );
+			this.createJSONLFile();
+			this.saveNumberedTextFiles();
+			this.createCSVFile();
+			console.log( "Scraping completed." );
+		}
+		catch ( error )
+		{
+			console.error( "Error:", error );
+			throw error;
+		}
+		finally
+		{
+			if ( this.puppeteerBrowser )
+			{
+				await this.puppeteerBrowser.close(); // Close the browser after scraping
+			}
+		}
 	}
 
 	async fetchPage ( url, depth )
@@ -63,9 +116,7 @@ class WebScraper
 		this.visited.add( url );
 		try
 		{
-			const { data, headers } = await axios.get( url, {
-				headers: this.headers,
-			});
+			const data = await this.caller( url );
 			const dom = new JSDOM( data, { url });
 			const { document } = dom.window;
 
@@ -78,8 +129,7 @@ class WebScraper
 			{
 				if ( this.isValidContent( article.textContent ) )
 				{
-
-					const metadata = this.metadataextractor( url, document, headers );
+					const metadata = this.metadataextractor( url, document );
 					metadata.depth = depth;
 					this.saveArticle( url, article.textContent, metadata );
 				}
@@ -109,6 +159,82 @@ class WebScraper
 		}
 	}
 
+	// Fetch a URL with axios; on a 403 (e.g. a Cloudflare challenge), fall back
+	// to the real browser window so a human can solve the CAPTCHA.
+	async caller ( url )
+	{
+		try
+		{
+			let axiosOptions = {};
+			if ( this.headers )
+			{
+				axiosOptions.headers = this.headers;
+			}
+			const result = await axios.get( url, axiosOptions );
+			return result.data;
+		}
+		catch ( error )
+		{
+			console.error( `Error fetching ${url}:`, error.message );
+			if ( error.status === 403 && this.usePuppeteer )
+			{
+				// const browser = await puppeteer.launch( this.puppeteerOptions );
+				// const page = await browser.newPage();
+				try
+				{
+					let result;
+					for ( let index = 0; index < 10; index++ )
+					{
+						console.log( `Please solve the CAPTCHA on the opened browser window for ${url}` );
+						result = await this.goToUrl( url );
+						if ( this.isValidContent( result.htmlContent ) )
+						{
+							break;
+						}
+					}
+					return result.htmlContent;
+				}
+				catch ( error )
+				{
+					console.error( `Error solving CAPTCHA for ${url}:`, error.message, error );
+					throw error;
+				}
+			}
+			throw error;
+		}
+	}
+
+	async goToUrl ( url )
+	{
+		let pages = await this.puppeteerBrowser.pages();
+		let page = pages[0];
+		page.setDefaultNavigationTimeout( 10000 );
+		await page.goto( url );
+		pages = await this.puppeteerBrowser.pages();
+		page = pages[0];
+		page.setDefaultNavigationTimeout( 10000 );
+		await this.waitForPageToLoad( page );
+		pages = await this.puppeteerBrowser.pages();
+		page = pages[0];
+		page.setDefaultNavigationTimeout( 10000 );
+		if ( page )
+		{
+			let htmlContent = await page.content();
+			return { pages, page, htmlContent };
+		}
+	}
+
+	async waitForPageToLoad ( page )
+	{
+		try
+		{
+			await page.waitForNavigation({ waitUntil: "networkidle0" });
+		}
+		catch ( error )
+		{
+			console.log( error );
+		}
+	}
+
 	extractLinks ( data )
 	{
 		const links = new Set();
@@ -329,7 +455,7 @@ class WebScraper
 		return filteredMetadata;
 	}
 
-	metadataextractor ( url, document, headers )
+	metadataextractor ( url, document )
 	{
 		return {
 			url,
@@ -337,9 +463,6 @@ class WebScraper
 			description: document.querySelector( "meta[name=\"description\"]" )?.content,
 			keywords: document.querySelector( "meta[name=\"keywords\"]" )?.content,
 			author: document.querySelector( "meta[name=\"author\"]" )?.content,
-			lastModified: headers["last-modified"],
-			contentType: headers["content-type"],
-			contentLength: headers["content-length"],
 			language: document.documentElement.lang || document.querySelector( "html" )?.getAttribute( "lang" ),
 			canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
 			ogTitle: document.querySelector( "meta[property=\"og:title\"]" )?.content,
@@ -416,16 +539,19 @@ class WebScraper
 		// List of phrases that indicate invalid content
 		const invalidPhrases = [
 			"verifying that you are not a robot",
+			"verifying you are human. this may take a few seconds.",
+			"verify you are human by completing the action below",
 			"checking if the site connection is secure",
 			"please wait while we verify",
 			"please enable javascript",
 			"access denied",
+			"verifying you are human",
 			"captcha verification"
 		];
 
 		const hasInvalidPhrases = invalidPhrases.some( phrase => { return cleanContent.includes( phrase ) });
 		// Check content length
-		if ( cleanContent.length < 100 && hasInvalidPhrases )
+		if ( cleanContent.length < 100 || hasInvalidPhrases )
 		{
 			return false;
 		}
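Behaviorally, the last hunk is the most significant change in this file: 3.3.3 discarded a page only when it was both short and contained a blocker phrase, while 3.3.5 discards it when either condition holds. A small illustration (not package code):

```js
// Hypothetical page text showing the 3.3.3 → 3.3.5 difference
const cleanContent = "a short but legitimate article snippet";
const hasInvalidPhrases = false;

const rejectedIn333 = cleanContent.length < 100 && hasInvalidPhrases; // false: page kept
const rejectedIn335 = cleanContent.length < 100 || hasInvalidPhrases; // true: page dropped as too short
```

Pages under 100 characters are now dropped unconditionally, which also catches CAPTCHA interstitials whose wording isn't covered by the phrase list.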