clean-web-scraper 3.3.3 → 3.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -35,6 +35,7 @@ npm i clean-web-scraper
 
 git clone https://github.com/mlibre/Clean-Web-Scraper
 cd Clean-Web-Scraper
+sudo pacman -S extra/xorg-server-xvfb chromium
 npm install
 ```
 
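The two new system packages back the Puppeteer fallback introduced in this release: `chromium` supplies the browser binary that example-usage.js points `puppeteerExecutablePath` at, and `xorg-server-xvfb` gives `puppeteer-real-browser` a virtual display on headless machines, since the scraper launches the browser with `headless: false`. The `pacman` command is Arch-specific; other distributions need their own equivalents of Xvfb and Chromium.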
package/example-usage.js CHANGED
@@ -1,5 +1,13 @@
 const WebScraper = require( "./src/WebScraper" );
 
+// const cookies = "cf_clearance=ENHJkpw.ycd1tZ_A.d0O27QdslTN0EHaNurhCznfimg-1738241402-1.2.1.1-BlO.WitkGwE3U3vSamX35xP.AgN1HyvHWL03Jhe.twbn4QWojiw1T4.0M4lE_TcIeZrQ6ErwV9kQBMBKmfU0S6lQth1BJx7UpWn4T6wtFm83LmF.cB13PQYSQgGFGsH7qOkGIjbBhMbceQNp.y2XZgLq_hdntGKSBMe0iCUotx_xsqlzkolQIqnUYID3BLEQXZqNvqJOwkzLZ7.kzrwP42VdEuWEvT4jt7F3TkTaU9rumAp8FSNO1.hnr76Tv23OITm17rPD3__Ghdu1D0E.4v693nEiVYO_KQYNf_8gk0vXP.KAvUKA2zQyBmDXkfW3M1MkoLjFNZCanx9FPRVO7g";
+// const headers = {
+// 	"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
+// 	"Cache-Control": "private",
+// 	"Accept": "application/xml,application/xhtml+xml,text/html;q=0.9, text/plain;q=0.8,image/png,*/*;q=0.5",
+// 	"Cookie": cookies
+// }
+
 
 async function khameneiIrFreePalestineTag ()
 {
@@ -52,9 +60,9 @@ async function decolonizepalestine ()
 
 async function bdsmovement ()
 {
-	// https://bdsmovement.org
+	// https://bdsmovement.net
 	const scraper = new WebScraper({
-		baseURL: "https://bdsmovement.org",
+		baseURL: "https://bdsmovement.net",
 		excludeList: [
 			"https://bdsmovement.net/press-area",
 			"https://bdsmovement.net/privacy-policy",
@@ -68,6 +76,36 @@ async function bdsmovement ()
 		textOutputPath: "./dataset/bdsmovement/texts",
 		csvOutputPath: "./dataset/bdsmovement/train.csv",
 		includeMetadata: true,
+		metadataFields: ["title", "description", "author"],
+		puppeteerProxy: "socks5://127.0.0.1:2080",
+		puppeteerExecutablePath: "/usr/bin/chromium",
+		puppeteerRealProxy: {
+			host: "socks5://127.0.0.1",
+			port: "2080",
+		}
+	});
+	await scraper.start();
+	return scraper;
+}
+
+async function electronicintifada ()
+{
+	// https://electronicintifada.net
+	const scraper = new WebScraper({
+		baseURL: "https://electronicintifada.net",
+		excludeList: [
+			"https://electronicintifada.net/press-area",
+			"https://electronicintifada.net/privacy-policy",
+			"https://electronicintifada.net/get-involved/join-a-bds-campaign",
+			"https://electronicintifada.net/donate_",
+			"https://electronicintifada.net/user",
+			"https://electronicintifada.net/admin"
+		],
+		scrapResultPath: "./dataset/electronicintifada/website",
+		jsonlOutputPath: "./dataset/electronicintifada/train.jsonl",
+		textOutputPath: "./dataset/electronicintifada/texts",
+		csvOutputPath: "./dataset/electronicintifada/train.csv",
+		includeMetadata: true,
 		metadataFields: ["title", "description", "author"]
 	});
 	await scraper.start();
@@ -76,11 +114,11 @@ async function bdsmovement ()
 
 void async function main ()
 {
-	const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
+	// const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
 	const decolonizepalestineScraper = await decolonizepalestine();
 	// const bdsmovementScraper = await bdsmovement();
 	await WebScraper.combineResults( "./dataset/combined", [
-		khameneiIrFreePalestineTagScraper,
+		// khameneiIrFreePalestineTagScraper,
 		decolonizepalestineScraper,
 		// bdsmovementScraper
 	] );
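The commented-out block at the top of the file hints at the intended workflow: since 3.3.4 no longer ships default request headers (see the WebScraper.js changes below), a Cloudflare `cf_clearance` cookie and a matching user-agent can be forwarded explicitly. A minimal sketch, assuming a consumer script sitting next to the package's src directory; the token value is a placeholder:

```js
const WebScraper = require( "./src/WebScraper" );

// Placeholder token: reuse a cf_clearance value captured from a real browser session
const cookies = "cf_clearance=YOUR_TOKEN_HERE";
const headers = {
	"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
	"Cache-Control": "private",
	"Accept": "application/xml,application/xhtml+xml,text/html;q=0.9, text/plain;q=0.8,image/png,*/*;q=0.5",
	"Cookie": cookies
};

const scraper = new WebScraper({
	baseURL: "https://example.com", // hypothetical target
	headers // forwarded verbatim to axios.get by the new caller() method
});
scraper.start();
```

Note that Cloudflare ties `cf_clearance` to the user-agent that earned it, which is why the example carries both together.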
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
 	"name": "clean-web-scraper",
-	"version": "3.3.3",
+	"version": "3.3.4",
 	"main": "main.js",
 	"scripts": {
 		"start": "node main.js",
@@ -26,6 +26,8 @@
 		"@mozilla/readability": "^0.5.0",
 		"axios": "^1.7.9",
 		"eslint": "^9.17.0",
-		"jsdom": "^26.0.0"
+		"jsdom": "^26.0.0",
+		"puppeteer": "^24.1.1",
+		"puppeteer-real-browser": "^1.3.22"
 	}
 }
package/src/WebScraper.js CHANGED
@@ -4,6 +4,8 @@ const { JSDOM } = jsdom;
 const { Readability } = require( "@mozilla/readability" );
 const fs = require( "fs" );
 const path = require( "path" );
+const puppeteer = require( "puppeteer" );
+const { connect } = require( "puppeteer-real-browser" );
 
 class WebScraper
 {
@@ -19,11 +21,10 @@ class WebScraper
 		csvOutputPath,
 		includeMetadata = false,
 		metadataFields = [], // ['title', 'description', 'author', 'lastModified', etc.]
-		headers = {
-			"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
-			"Cache-Control": "private",
-			"Accept": "application/xml,application/xhtml+xml,text/html;q=0.9, text/plain;q=0.8,image/png,*/*;q=0.5"
-		}
+		headers,
+		puppeteerProxy, // e.g. http://127.0.0.1:2080
+		puppeteerExecutablePath,
+		puppeteerRealProxy
 	})
 	{
 		this.baseURL = baseURL;
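With this change the built-in default header set is gone: unless a `headers` object is supplied, requests go out with axios's stock headers, and the old defaults survive only as the commented-out example at the top of example-usage.js.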
@@ -42,6 +43,34 @@ class WebScraper
 		this.excludeList = this.normalizeExcludeList( excludeList );
 		this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
 		this.allProcessedContent = [];
+		this.usePuppeteer = false;
+		this.puppeteerOptions = {
+			headless: false,
+			userDataDir: "./tmp/browser",
+			defaultViewport: null,
+			args: [
+				"--start-maximized"
+			],
+			ignoreDefaultArgs: true,
+		};
+		if ( puppeteerProxy )
+		{
+			this.puppeteerOptions.args.push( `--proxy-server=${puppeteerProxy}` );
+		}
+		if ( puppeteerExecutablePath )
+		{
+			this.puppeteerOptions.executablePath = puppeteerExecutablePath;
+		}
+		this.puppeteerRealOptions = {
+			headless: false,
+			args: [],
+			customConfig: {},
+			turnstile: true,
+			connectOption: {},
+			disableXvfb: false,
+			ignoreAllFlags: false,
+			proxy: puppeteerRealProxy
+		};
 	}
 
 	async start ()
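`puppeteerRealOptions` mirrors the option object of `puppeteer-real-browser`'s `connect()`: `turnstile: true` asks the library to handle Cloudflare Turnstile checkboxes automatically, and `disableXvfb: false` keeps its Xvfb virtual display enabled on Linux, which is what the README's new `xorg-server-xvfb` dependency supports. Both launch configurations use `headless: false` because the fallback expects a human to solve CAPTCHAs in a visible window.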
@@ -63,9 +92,7 @@ class WebScraper
 		this.visited.add( url );
 		try
 		{
-			const { data, headers } = await axios.get( url, {
-				headers: this.headers,
-			});
+			const data = await this.caller( url );
 			const dom = new JSDOM( data, { url });
 			const { document } = dom.window;
 
@@ -78,8 +105,7 @@ class WebScraper
 		{
 			if ( this.isValidContent( article.textContent ) )
 			{
-
-				const metadata = this.metadataextractor( url, document, headers );
+				const metadata = this.metadataextractor( url, document );
 				metadata.depth = depth;
 				this.saveArticle( url, article.textContent, metadata );
 			}
@@ -109,6 +135,81 @@ class WebScraper
 		}
 	}
 
+	async caller ( url )
+	{
+		try
+		{
+			let axiosOptions = {};
+			if ( this.headers )
+			{
+				axiosOptions.headers = this.headers;
+			}
+			const result = await axios.get( url, axiosOptions );
+			return result.data;
+		}
+		catch ( error )
+		{
+			console.error( `Error fetching ${url}:`, error.message );
+			if ( error.status === 403 && this.usePuppeteer )
+			{
+				let { browser, page } = await connect( this.puppeteerRealOptions );
+
+				// const browser = await puppeteer.launch( this.puppeteerOptions );
+				// const page = await browser.newPage();
+				try
+				{
+					let htmlContent;
+					for ( let index = 0; index < 10; index++ )
+					{
+						const pages = await browser.pages();
+						page = pages[0];
+						page.setDefaultNavigationTimeout( 30000 );
+						await page.goto( url );
+						console.log( `Please solve the CAPTCHA on the opened browser window for ${url}` );
+						await this.waitForPageToLoad( page );
+						htmlContent = await page.content();
+						if ( this.isValidContent( htmlContent ) )
+						{
+							break;
+						}
+						page = pages[0];
+						page.setDefaultNavigationTimeout( 30000 );
+						await this.waitForPageToLoad( page );
+						htmlContent = await page.content();
+						if ( this.isValidContent( htmlContent ) )
+						{
+							break;
+						}
+						await page.goto( url );
+					}
+					return htmlContent;
+				}
+				catch ( error )
+				{
+					console.error( `Error solving CAPTCHA for ${url}:`, error.message, error );
+					throw error;
+				}
+				finally
+				{
+					await browser.close(); // Close the browser after scraping
+				}
+			}
+			throw error;
+		}
+	}
+
+	async waitForPageToLoad ( page )
+	{
+		try
+		{
+			await page.waitForNavigation({ waitUntil: "networkidle0" });
+		}
+		catch ( error )
+		{
+			console.log( error );
+		}
+	}
+
 	extractLinks ( data )
 	{
 		const links = new Set();
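The new `caller()` method tries a plain `axios.get` first and falls back to `puppeteer-real-browser`'s `connect()` only on a 403, and only when `this.usePuppeteer` is true. (The published code had two slips, corrected above: `error.status = 403` assigned instead of comparing, and `axiosOptinos` was a typo.) Note that the constructor hard-codes `usePuppeteer = false` and no option sets it, so the fallback appears to require flipping the flag manually. A minimal sketch of opting in; the URL and proxy are hypothetical:

```js
const WebScraper = require( "./src/WebScraper" );

const scraper = new WebScraper({
	baseURL: "https://example.com", // hypothetical Cloudflare-protected site
	puppeteerRealProxy: { host: "socks5://127.0.0.1", port: "2080" }
});
scraper.usePuppeteer = true; // no constructor option exposes this yet

// On a 403, caller() opens a visible browser via connect(), prompts you to
// solve the CAPTCHA, and retries up to 10 times until isValidContent()
// accepts the page HTML.
scraper.start();
```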
@@ -329,7 +430,7 @@ class WebScraper
 		return filteredMetadata;
 	}
 
-	metadataextractor ( url, document, headers )
+	metadataextractor ( url, document )
 	{
 		return {
 			url,
@@ -337,9 +438,6 @@
 			description: document.querySelector( "meta[name=\"description\"]" )?.content,
 			keywords: document.querySelector( "meta[name=\"keywords\"]" )?.content,
 			author: document.querySelector( "meta[name=\"author\"]" )?.content,
-			lastModified: headers["last-modified"],
-			contentType: headers["content-type"],
-			contentLength: headers["content-length"],
 			language: document.documentElement.lang || document.querySelector( "html" )?.getAttribute( "lang" ),
 			canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
 			ogTitle: document.querySelector( "meta[property=\"og:title\"]" )?.content,
@@ -416,16 +514,19 @@
 		// List of phrases that indicate invalid content
 		const invalidPhrases = [
 			"verifying that you are not a robot",
+			"verifying you are human. this may take a few seconds.",
+			"verify you are human by completing the action below",
 			"checking if the site connection is secure",
 			"please wait while we verify",
 			"please enable javascript",
 			"access denied",
+			"verifying you are human",
 			"captcha verification"
 		];
 
 		const hasInvalidPhrases = invalidPhrases.some( phrase => { return cleanContent.includes( phrase ) });
 		// Check content length
-		if ( cleanContent.length < 100 && hasInvalidPhrases )
+		if ( cleanContent.length < 100 || hasInvalidPhrases )
 		{
 			return false;
 		}
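The operator change in this last hunk is a behavioral fix, not just housekeeping: with `&&`, a page was rejected only when it was both under 100 characters and contained a block-page phrase, so a long Cloudflare challenge page passed validation; with `||`, either condition alone rejects it. A quick illustration; the constructor argument is hypothetical, and the phrase matching is assumed case-normalized, as the all-lowercase phrase list implies:

```js
const WebScraper = require( "./src/WebScraper" );
const scraper = new WebScraper({ baseURL: "https://example.com" }); // hypothetical

// Long challenge page: >= 100 chars, so 3.3.3 accepted it; 3.3.4 rejects it
// because the phrase check alone now suffices.
const challenge = `verifying you are human. this may take a few seconds. ${ "x".repeat( 200 ) }`;
console.log( scraper.isValidContent( challenge ) ); // false in 3.3.4

// Short snippet with no blocked phrase: 3.3.3 accepted it; 3.3.4 rejects it
// on length alone.
console.log( scraper.isValidContent( "too short" ) ); // false in 3.3.4
```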