clean-web-scraper 3.3.3 → 3.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/example-usage.js +43 -4
- package/package.json +4 -2
- package/src/WebScraper.js +147 -21
package/README.md
CHANGED
package/example-usage.js
CHANGED
|
@@ -1,5 +1,13 @@
|
|
|
1
1
|
const WebScraper = require( "./src/WebScraper" );
|
|
2
2
|
|
|
3
|
+
// const cookies = "cf_clearance=ENHJkpw.ycd1tZ_A.d0O27QdslTN0EHaNurhCznfimg-1738241402-1.2.1.1-BlO.WitkGwE3U3vSamX35xP.AgN1HyvHWL03Jhe.twbn4QWojiw1T4.0M4lE_TcIeZrQ6ErwV9kQBMBKmfU0S6lQth1BJx7UpWn4T6wtFm83LmF.cB13PQYSQgGFGsH7qOkGIjbBhMbceQNp.y2XZgLq_hdntGKSBMe0iCUotx_xsqlzkolQIqnUYID3BLEQXZqNvqJOwkzLZ7.kzrwP42VdEuWEvT4jt7F3TkTaU9rumAp8FSNO1.hnr76Tv23OITm17rPD3__Ghdu1D0E.4v693nEiVYO_KQYNf_8gk0vXP.KAvUKA2zQyBmDXkfW3M1MkoLjFNZCanx9FPRVO7g";
|
|
4
|
+
// const headers = {
|
|
5
|
+
// "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
|
|
6
|
+
// "Cache-Control": "private",
|
|
7
|
+
// "Accept": "application/xml,application/xhtml+xml,text/html;q=0.9, text/plain;q=0.8,image/png,*/*;q=0.5",
|
|
8
|
+
// "Cookie": cookies
|
|
9
|
+
// }
|
|
10
|
+
|
|
3
11
|
|
|
4
12
|
async function khameneiIrFreePalestineTag ()
|
|
5
13
|
{
|
|
@@ -52,9 +60,9 @@ async function decolonizepalestine ()
|
|
|
52
60
|
|
|
53
61
|
async function bdsmovement ()
|
|
54
62
|
{
|
|
55
|
-
// https://bdsmovement.
|
|
63
|
+
// https://bdsmovement.net
|
|
56
64
|
const scraper = new WebScraper({
|
|
57
|
-
baseURL: "https://bdsmovement.
|
|
65
|
+
baseURL: "https://bdsmovement.net",
|
|
58
66
|
excludeList: [
|
|
59
67
|
"https://bdsmovement.net/press-area",
|
|
60
68
|
"https://bdsmovement.net/privacy-policy",
|
|
@@ -68,6 +76,37 @@ async function bdsmovement ()
|
|
|
68
76
|
textOutputPath: "./dataset/bdsmovement/texts",
|
|
69
77
|
csvOutputPath: "./dataset/bdsmovement/train.csv",
|
|
70
78
|
includeMetadata: true,
|
|
79
|
+
metadataFields: ["title", "description", "author"],
|
|
80
|
+
puppeteerProxy: "socks5://127.0.0.1:2080",
|
|
81
|
+
puppeteerExecutablePath: "/usr/bin/chromium",
|
|
82
|
+
puppeteerRealProxy: {
|
|
83
|
+
host: "socks5://127.0.0.1",
|
|
84
|
+
port: "2080",
|
|
85
|
+
},
|
|
86
|
+
usePuppeteer: true
|
|
87
|
+
});
|
|
88
|
+
await scraper.start();
|
|
89
|
+
return scraper;
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
async function electronicintifada ()
|
|
93
|
+
{
|
|
94
|
+
// https://electronicintifada.net
|
|
95
|
+
const scraper = new WebScraper({
|
|
96
|
+
baseURL: "https://electronicintifada.net",
|
|
97
|
+
excludeList: [
|
|
98
|
+
"https://electronicintifada.net/press-area",
|
|
99
|
+
"https://electronicintifada.net/privacy-policy",
|
|
100
|
+
"https://electronicintifada.net/get-involved/join-a-bds-campaign",
|
|
101
|
+
"https://electronicintifada.net/donate_",
|
|
102
|
+
"https://electronicintifada.net/user",
|
|
103
|
+
"https://electronicintifada.net/admin"
|
|
104
|
+
],
|
|
105
|
+
scrapResultPath: "./dataset/electronicintifada/website",
|
|
106
|
+
jsonlOutputPath: "./dataset/electronicintifada/train.jsonl",
|
|
107
|
+
textOutputPath: "./dataset/electronicintifada/texts",
|
|
108
|
+
csvOutputPath: "./dataset/electronicintifada/train.csv",
|
|
109
|
+
includeMetadata: true,
|
|
71
110
|
metadataFields: ["title", "description", "author"]
|
|
72
111
|
});
|
|
73
112
|
await scraper.start();
|
|
@@ -78,11 +117,11 @@ void async function main ()
|
|
|
78
117
|
{
|
|
79
118
|
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
|
|
80
119
|
const decolonizepalestineScraper = await decolonizepalestine();
|
|
81
|
-
|
|
120
|
+
const bdsmovementScraper = await bdsmovement();
|
|
82
121
|
await WebScraper.combineResults( "./dataset/combined", [
|
|
83
122
|
khameneiIrFreePalestineTagScraper,
|
|
84
123
|
decolonizepalestineScraper,
|
|
85
|
-
|
|
124
|
+
bdsmovementScraper
|
|
86
125
|
] );
|
|
87
126
|
|
|
88
127
|
// 4
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "clean-web-scraper",
|
|
3
|
-
"version": "3.3.
|
|
3
|
+
"version": "3.3.5",
|
|
4
4
|
"main": "main.js",
|
|
5
5
|
"scripts": {
|
|
6
6
|
"start": "node main.js",
|
|
@@ -26,6 +26,8 @@
|
|
|
26
26
|
"@mozilla/readability": "^0.5.0",
|
|
27
27
|
"axios": "^1.7.9",
|
|
28
28
|
"eslint": "^9.17.0",
|
|
29
|
-
"jsdom": "^26.0.0"
|
|
29
|
+
"jsdom": "^26.0.0",
|
|
30
|
+
"puppeteer": "^24.1.1",
|
|
31
|
+
"puppeteer-real-browser": "^1.3.22"
|
|
30
32
|
}
|
|
31
33
|
}
|
package/src/WebScraper.js
CHANGED
|
@@ -4,6 +4,8 @@ const { JSDOM } = jsdom;
|
|
|
4
4
|
const { Readability } = require( "@mozilla/readability" );
|
|
5
5
|
const fs = require( "fs" );
|
|
6
6
|
const path = require( "path" );
|
|
7
|
+
const puppeteer = require( "puppeteer" );
|
|
8
|
+
const { connect } = require( "puppeteer-real-browser" )
|
|
7
9
|
|
|
8
10
|
class WebScraper
|
|
9
11
|
{
|
|
@@ -19,11 +21,11 @@ class WebScraper
|
|
|
19
21
|
csvOutputPath,
|
|
20
22
|
includeMetadata = false,
|
|
21
23
|
metadataFields = [], // ['title', 'description', 'author', 'lastModified', etc.]
|
|
22
|
-
headers
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
24
|
+
headers,
|
|
25
|
+
usePuppeteer,
|
|
26
|
+
puppeteerProxy, // e.g. http://127.0.0.1:2080
|
|
27
|
+
puppeteerExecutablePath,
|
|
28
|
+
puppeteerRealProxy
|
|
27
29
|
})
|
|
28
30
|
{
|
|
29
31
|
this.baseURL = baseURL;
|
|
@@ -42,16 +44,67 @@ class WebScraper
|
|
|
42
44
|
this.excludeList = this.normalizeExcludeList( excludeList );
|
|
43
45
|
this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
|
|
44
46
|
this.allProcessedContent = [];
|
|
47
|
+
this.usePuppeteer = usePuppeteer || false;
|
|
48
|
+
this.puppeteerOptions = {
|
|
49
|
+
headless: false,
|
|
50
|
+
userDataDir: "./tmp/browser",
|
|
51
|
+
defaultViewport: null,
|
|
52
|
+
args: [
|
|
53
|
+
"--start-maximized"
|
|
54
|
+
],
|
|
55
|
+
"ignoreDefaultArgs": true,
|
|
56
|
+
}
|
|
57
|
+
if ( puppeteerProxy )
|
|
58
|
+
{
|
|
59
|
+
this.puppeteerOptions.args.push( `--proxy-server=${puppeteerProxy}` );
|
|
60
|
+
}
|
|
61
|
+
if ( puppeteerExecutablePath )
|
|
62
|
+
{
|
|
63
|
+
this.puppeteerOptions.executablePath = puppeteerExecutablePath;
|
|
64
|
+
}
|
|
65
|
+
this.puppeteerRealOptions = {
|
|
66
|
+
headless: false,
|
|
67
|
+
args: [],
|
|
68
|
+
customConfig: {},
|
|
69
|
+
turnstile: true,
|
|
70
|
+
connectOption: {},
|
|
71
|
+
disableXvfb: false,
|
|
72
|
+
ignoreAllFlags: false,
|
|
73
|
+
proxy: puppeteerRealProxy
|
|
74
|
+
}
|
|
75
|
+
this.puppeteerBrowser = null;
|
|
76
|
+
this.puppeteerPage = null;
|
|
45
77
|
}
|
|
46
78
|
|
|
47
79
|
async start ()
|
|
48
80
|
{
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
81
|
+
try
|
|
82
|
+
{
|
|
83
|
+
if ( this.usePuppeteer )
|
|
84
|
+
{
|
|
85
|
+
let { browser, page } = await connect( this.puppeteerRealOptions )
|
|
86
|
+
this.puppeteerBrowser = browser;
|
|
87
|
+
this.puppeteerPage = page;
|
|
88
|
+
}
|
|
89
|
+
this.createOutputDirectory();
|
|
90
|
+
await this.fetchPage( this.startURL, 0 );
|
|
91
|
+
this.createJSONLFile();
|
|
92
|
+
this.saveNumberedTextFiles();
|
|
93
|
+
this.createCSVFile();
|
|
94
|
+
console.log( "Scraping completed." );
|
|
95
|
+
}
|
|
96
|
+
catch ( error )
|
|
97
|
+
{
|
|
98
|
+
console.error( "Error:", error );
|
|
99
|
+
throw error;
|
|
100
|
+
}
|
|
101
|
+
finally
|
|
102
|
+
{
|
|
103
|
+
if ( this.puppeteerBrowser )
|
|
104
|
+
{
|
|
105
|
+
await this.puppeteerBrowser.close(); // Close the browser after scraping
|
|
106
|
+
}
|
|
107
|
+
}
|
|
55
108
|
}
|
|
56
109
|
|
|
57
110
|
async fetchPage ( url, depth )
|
|
@@ -63,9 +116,7 @@ class WebScraper
|
|
|
63
116
|
this.visited.add( url );
|
|
64
117
|
try
|
|
65
118
|
{
|
|
66
|
-
const
|
|
67
|
-
headers: this.headers,
|
|
68
|
-
});
|
|
119
|
+
const data = await this.caller( url );
|
|
69
120
|
const dom = new JSDOM( data, { url });
|
|
70
121
|
const { document } = dom.window;
|
|
71
122
|
|
|
@@ -78,8 +129,7 @@ class WebScraper
|
|
|
78
129
|
{
|
|
79
130
|
if ( this.isValidContent( article.textContent ) )
|
|
80
131
|
{
|
|
81
|
-
|
|
82
|
-
const metadata = this.metadataextractor( url, document, headers );
|
|
132
|
+
const metadata = this.metadataextractor( url, document );
|
|
83
133
|
metadata.depth = depth;
|
|
84
134
|
this.saveArticle( url, article.textContent, metadata );
|
|
85
135
|
}
|
|
@@ -109,6 +159,82 @@ class WebScraper
|
|
|
109
159
|
}
|
|
110
160
|
}
|
|
111
161
|
|
|
162
|
+
async caller ( url )
|
|
163
|
+
{
|
|
164
|
+
try
|
|
165
|
+
{
|
|
166
|
+
let axiosOptinos = {}
|
|
167
|
+
if ( this.headers )
|
|
168
|
+
{
|
|
169
|
+
axiosOptinos.headers = this.headers
|
|
170
|
+
}
|
|
171
|
+
const result = await axios.get( url, axiosOptinos );
|
|
172
|
+
return result.data
|
|
173
|
+
}
|
|
174
|
+
catch ( error )
|
|
175
|
+
{
|
|
176
|
+
console.error( `Error fetching ${url}:`, error.message );
|
|
177
|
+
if ( error.status = 403 && this.usePuppeteer )
|
|
178
|
+
{
|
|
179
|
+
// const browser = await puppeteer.launch( this.puppeteerOptions );
|
|
180
|
+
// const page = await browser.newPage();
|
|
181
|
+
try
|
|
182
|
+
{
|
|
183
|
+
let result;
|
|
184
|
+
for ( let index = 0; index < 10; index++ )
|
|
185
|
+
{
|
|
186
|
+
console.log( `Please solve the CAPTCHA on the opened browser window for ${url}` );
|
|
187
|
+
result = await this.goToUrl( url ) ;
|
|
188
|
+
if ( this.isValidContent( result.htmlContent ) )
|
|
189
|
+
{
|
|
190
|
+
break
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
return result.htmlContent;
|
|
194
|
+
}
|
|
195
|
+
catch ( error )
|
|
196
|
+
{
|
|
197
|
+
console.error( `Error solving CAPTCHA for ${url}:`, error.message, error );
|
|
198
|
+
throw error;
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
}
|
|
202
|
+
throw error;
|
|
203
|
+
}
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
async goToUrl ( url )
|
|
207
|
+
{
|
|
208
|
+
let pages = await this.puppeteerBrowser.pages();
|
|
209
|
+
let page = pages[0];
|
|
210
|
+
page.setDefaultNavigationTimeout( 10000 );
|
|
211
|
+
await page.goto( url );
|
|
212
|
+
pages = await this.puppeteerBrowser.pages();
|
|
213
|
+
page = pages[0];
|
|
214
|
+
page.setDefaultNavigationTimeout( 10000 );
|
|
215
|
+
await this.waitForPageToLoad( page );
|
|
216
|
+
pages = await this.puppeteerBrowser.pages();
|
|
217
|
+
page = pages[0];
|
|
218
|
+
page.setDefaultNavigationTimeout( 10000 );
|
|
219
|
+
if ( page )
|
|
220
|
+
{
|
|
221
|
+
let htmlContent = await page.content();
|
|
222
|
+
return { pages, page, htmlContent };
|
|
223
|
+
}
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
async waitForPageToLoad ( page )
|
|
227
|
+
{
|
|
228
|
+
try
|
|
229
|
+
{
|
|
230
|
+
await page.waitForNavigation({ waitUntil: "networkidle0" });
|
|
231
|
+
}
|
|
232
|
+
catch ( error )
|
|
233
|
+
{
|
|
234
|
+
console.log( error );
|
|
235
|
+
}
|
|
236
|
+
}
|
|
237
|
+
|
|
112
238
|
extractLinks ( data )
|
|
113
239
|
{
|
|
114
240
|
const links = new Set();
|
|
@@ -329,7 +455,7 @@ class WebScraper
|
|
|
329
455
|
return filteredMetadata;
|
|
330
456
|
}
|
|
331
457
|
|
|
332
|
-
metadataextractor ( url, document, headers )
|
|
458
|
+
metadataextractor ( url, document )
|
|
333
459
|
{
|
|
334
460
|
return {
|
|
335
461
|
url,
|
|
@@ -337,9 +463,6 @@ class WebScraper
|
|
|
337
463
|
description: document.querySelector( "meta[name=\"description\"]" )?.content,
|
|
338
464
|
keywords: document.querySelector( "meta[name=\"keywords\"]" )?.content,
|
|
339
465
|
author: document.querySelector( "meta[name=\"author\"]" )?.content,
|
|
340
|
-
lastModified: headers["last-modified"],
|
|
341
|
-
contentType: headers["content-type"],
|
|
342
|
-
contentLength: headers["content-length"],
|
|
343
466
|
language: document.documentElement.lang || document.querySelector( "html" )?.getAttribute( "lang" ),
|
|
344
467
|
canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
|
|
345
468
|
ogTitle: document.querySelector( "meta[property=\"og:title\"]" )?.content,
|
|
@@ -416,16 +539,19 @@ class WebScraper
|
|
|
416
539
|
// List of phrases that indicate invalid content
|
|
417
540
|
const invalidPhrases = [
|
|
418
541
|
"verifying that you are not a robot",
|
|
542
|
+
"verifying you are human. this may take a few seconds.",
|
|
543
|
+
"verify you are human by completing the action below",
|
|
419
544
|
"checking if the site connection is secure",
|
|
420
545
|
"please wait while we verify",
|
|
421
546
|
"please enable javascript",
|
|
422
547
|
"access denied",
|
|
548
|
+
"verifying you are human",
|
|
423
549
|
"captcha verification"
|
|
424
550
|
];
|
|
425
551
|
|
|
426
552
|
const hasInvalidPhrases = invalidPhrases.some( phrase => { return cleanContent.includes( phrase ) });
|
|
427
553
|
// Check content length
|
|
428
|
-
if ( cleanContent.length < 100
|
|
554
|
+
if ( cleanContent.length < 100 || hasInvalidPhrases )
|
|
429
555
|
{
|
|
430
556
|
return false;
|
|
431
557
|
}
|