clean-web-scraper 3.10.0 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -3
- package/example-usage.js +52 -88
- package/main.js +6 -31
- package/package.json +1 -1
package/README.md
CHANGED
@@ -56,7 +56,6 @@ const scraper = new WebScraper({
|
|
56
56
|
strictBaseURL: true, // Optional: Only scrape URLs from same domain
|
57
57
|
maxDepth: Infinity, // Optional: Maximum crawling depth
|
58
58
|
maxArticles: Infinity, // Optional: Maximum articles to scrape
|
59
|
-
concurrencyLimit: 2, // Optional: Limit concurrent requests
|
60
59
|
crawlingDelay: 1000, // Optional: Delay between requests (ms)
|
61
60
|
|
62
61
|
// Network options
|
@@ -72,8 +71,6 @@ const scraper = new WebScraper({
|
|
72
71
|
|
73
72
|
// Puppeteer options for handling dynamic content
|
74
73
|
usePuppeteer: false, // Optional: Enable Puppeteer browser
|
75
|
-
puppeteerProxy: "http://127.0.0.1:2080", // Optional: Puppeteer proxy
|
76
|
-
puppeteerExecutablePath: "/path/to/chrome", // Optional: Custom browser path
|
77
74
|
});
|
78
75
|
await scraper.start();
|
79
76
|
```
|
package/example-usage.js
CHANGED
@@ -1,17 +1,25 @@
|
|
1
1
|
const WebScraper = require( "./main" );
|
2
2
|
|
3
|
-
// const cookies = "cf_clearance=ENHJkpw.ycd1tZ_A.d0O27QdslTN0EHaNurhCznfimg-1738241402-1.2.1.1-BlO.WitkGwE3U3vSamX35xP.AgN1HyvHWL03Jhe.twbn4QWojiw1T4.0M4lE_TcIeZrQ6ErwV9kQBMBKmfU0S6lQth1BJx7UpWn4T6wtFm83LmF.cB13PQYSQgGFGsH7qOkGIjbBhMbceQNp.y2XZgLq_hdntGKSBMe0iCUotx_xsqlzkolQIqnUYID3BLEQXZqNvqJOwkzLZ7.kzrwP42VdEuWEvT4jt7F3TkTaU9rumAp8FSNO1.hnr76Tv23OITm17rPD3__Ghdu1D0E.4v693nEiVYO_KQYNf_8gk0vXP.KAvUKA2zQyBmDXkfW3M1MkoLjFNZCanx9FPRVO7g";
|
4
3
|
const headers = {
|
5
4
|
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
|
6
5
|
"Cache-Control": "private",
|
7
6
|
"Accept": "application/xml,application/xhtml+xml,text/html;q=0.9, text/plain;q=0.8,image/png,*/*;q=0.5",
|
8
7
|
// "Cookie": cookies
|
8
|
+
};
|
9
|
+
|
10
|
+
async function runScraper ( config, enable )
|
11
|
+
{
|
12
|
+
const scraper = new WebScraper( config );
|
13
|
+
if ( enable )
|
14
|
+
{
|
15
|
+
await scraper.start();
|
16
|
+
}
|
17
|
+
return scraper;
|
9
18
|
}
|
10
19
|
|
11
20
|
async function palianswers ( enable )
|
12
21
|
{
|
13
|
-
|
14
|
-
const scraper = new WebScraper({
|
22
|
+
const config = {
|
15
23
|
baseURL: "https://palianswers.com",
|
16
24
|
excludeList: [
|
17
25
|
"https://palianswers.com/chat/",
|
@@ -22,30 +30,23 @@ async function palianswers ( enable )
|
|
22
30
|
"https://palianswers.com/themes/"
|
23
31
|
],
|
24
32
|
exactExcludeList: [
|
25
|
-
"https://palianswers.com/"
|
33
|
+
"https://palianswers.com/"
|
26
34
|
],
|
27
35
|
scrapResultPath: "./dataset/palianswers/website",
|
28
36
|
jsonlOutputPath: "./dataset/palianswers/train.jsonl",
|
29
37
|
textOutputPath: "./dataset/palianswers/texts",
|
30
38
|
csvOutputPath: "./dataset/palianswers/train.csv",
|
31
39
|
includeMetadata: true,
|
32
|
-
metadataFields: ["author", "
|
40
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
|
33
41
|
axiosRetryDelay: 10000,
|
34
|
-
concurrencyLimit: 4,
|
35
42
|
crawlingDelay: 0
|
36
|
-
}
|
37
|
-
|
38
|
-
{
|
39
|
-
await scraper.start();
|
40
|
-
}
|
41
|
-
return scraper;
|
43
|
+
};
|
44
|
+
return await runScraper( config, enable );
|
42
45
|
}
|
43
46
|
|
44
47
|
async function khameneiIrFreePalestineTag ( enable )
|
45
48
|
{
|
46
|
-
|
47
|
-
// https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#
|
48
|
-
const scraper = new WebScraper({
|
49
|
+
const config = {
|
49
50
|
baseURL: "https://english.khamenei.ir/news",
|
50
51
|
startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
|
51
52
|
maxDepth: 1,
|
@@ -57,21 +58,15 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
57
58
|
textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
|
58
59
|
csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
|
59
60
|
includeMetadata: true,
|
60
|
-
metadataFields: ["author", "
|
61
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
|
61
62
|
axiosRetryDelay: 10000,
|
62
|
-
|
63
|
-
|
64
|
-
if ( enable )
|
65
|
-
{
|
66
|
-
await scraper.start();
|
67
|
-
}
|
68
|
-
return scraper;
|
63
|
+
};
|
64
|
+
return await runScraper( config, enable );
|
69
65
|
}
|
70
66
|
|
71
67
|
async function decolonizepalestine ( enable )
|
72
68
|
{
|
73
|
-
|
74
|
-
const scraper = new WebScraper({
|
69
|
+
const config = {
|
75
70
|
baseURL: "https://decolonizepalestine.com",
|
76
71
|
excludeList: [
|
77
72
|
"https://decolonizepalestine.com/cdn-cgi",
|
@@ -89,21 +84,15 @@ async function decolonizepalestine ( enable )
|
|
89
84
|
textOutputPath: "./dataset/decolonizepalestine/texts",
|
90
85
|
csvOutputPath: "./dataset/decolonizepalestine/train.csv",
|
91
86
|
includeMetadata: true,
|
92
|
-
metadataFields: ["author", "
|
87
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
|
93
88
|
axiosRetryDelay: 10000,
|
94
|
-
|
95
|
-
|
96
|
-
if ( enable )
|
97
|
-
{
|
98
|
-
await scraper.start();
|
99
|
-
}
|
100
|
-
return scraper;
|
89
|
+
};
|
90
|
+
return await runScraper( config, enable );
|
101
91
|
}
|
102
92
|
|
103
93
|
async function electronicintifada ( enable )
|
104
94
|
{
|
105
|
-
|
106
|
-
const scraper = new WebScraper({
|
95
|
+
const config = {
|
107
96
|
baseURL: "https://electronicintifada.net",
|
108
97
|
excludeList: [
|
109
98
|
"https://electronicintifada.net/updates",
|
@@ -129,51 +118,43 @@ async function electronicintifada ( enable )
|
|
129
118
|
textOutputPath: "./dataset/electronicintifada/texts",
|
130
119
|
csvOutputPath: "./dataset/electronicintifada/train.csv",
|
131
120
|
includeMetadata: true,
|
132
|
-
metadataFields: ["author", "
|
121
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
|
133
122
|
maxDepth: 16,
|
134
123
|
maxArticles: 2000,
|
135
|
-
concurrencyLimit: 2,
|
136
124
|
axiosHeaders: headers,
|
125
|
+
axiosMaxRetries: 2,
|
137
126
|
axiosRetryDelay: 10000,
|
138
127
|
axiosProxy: {
|
139
128
|
host: "localhost",
|
140
129
|
port: 2080,
|
141
130
|
protocol: "http"
|
142
131
|
},
|
143
|
-
useProxyAsFallback: true
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
await scraper.start();
|
148
|
-
}
|
149
|
-
return scraper;
|
132
|
+
useProxyAsFallback: true,
|
133
|
+
crawlingDelay: 0
|
134
|
+
};
|
135
|
+
return await runScraper( config, enable );
|
150
136
|
}
|
151
137
|
|
152
138
|
async function standWithPalestine ( enable )
|
153
139
|
{
|
154
|
-
const scraper = new WebScraper({
|
140
|
+
const config = {
|
155
141
|
baseURL: "https://stand-with-palestine.org/blogs",
|
156
142
|
startURL: "https://stand-with-palestine.org/blogs",
|
143
|
+
exactExcludeList: ["https://stand-with-palestine.org/blogs"],
|
157
144
|
scrapResultPath: "./dataset/stand-with-palestine/website",
|
158
145
|
jsonlOutputPath: "./dataset/stand-with-palestine/train.jsonl",
|
159
146
|
textOutputPath: "./dataset/stand-with-palestine/texts",
|
160
147
|
csvOutputPath: "./dataset/stand-with-palestine/train.csv",
|
161
|
-
exactExcludeList: ["https://stand-with-palestine.org/blogs"],
|
162
148
|
axiosHeaders: headers,
|
163
149
|
includeMetadata: true,
|
164
|
-
metadataFields: ["author", "
|
165
|
-
}
|
166
|
-
|
167
|
-
{
|
168
|
-
await scraper.start();
|
169
|
-
}
|
170
|
-
return scraper;
|
150
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"]
|
151
|
+
};
|
152
|
+
return await runScraper( config, enable );
|
171
153
|
}
|
172
154
|
|
173
155
|
async function mondoweiss ( enable )
|
174
156
|
{
|
175
|
-
|
176
|
-
const scraper = new WebScraper({
|
157
|
+
const config = {
|
177
158
|
baseURL: "https://mondoweiss.net",
|
178
159
|
excludeList: [
|
179
160
|
"https://mondoweiss.net/donate",
|
@@ -199,31 +180,25 @@ async function mondoweiss ( enable )
|
|
199
180
|
textOutputPath: "./dataset/mondoweiss/texts",
|
200
181
|
csvOutputPath: "./dataset/mondoweiss/train.csv",
|
201
182
|
maxArticles: 2500,
|
202
|
-
|
203
|
-
concurrencyLimit: 3,
|
183
|
+
maxDepth: 15,
|
204
184
|
axiosHeaders: headers,
|
185
|
+
axiosMaxRetries: 3,
|
186
|
+
axiosRetryDelay: 10000,
|
205
187
|
axiosProxy: {
|
206
188
|
host: "localhost",
|
207
189
|
port: 2080,
|
208
190
|
protocol: "http"
|
209
191
|
},
|
210
|
-
|
211
|
-
axiosRetryDelay: 10000,
|
192
|
+
useProxyAsFallback: true,
|
212
193
|
includeMetadata: true,
|
213
|
-
metadataFields: ["author", "
|
214
|
-
|
215
|
-
|
216
|
-
if ( enable )
|
217
|
-
{
|
218
|
-
await scraper.start();
|
219
|
-
}
|
220
|
-
return scraper;
|
194
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
|
195
|
+
};
|
196
|
+
return await runScraper( config, enable );
|
221
197
|
}
|
222
198
|
|
223
199
|
async function bdsmovement ( enable )
|
224
200
|
{
|
225
|
-
|
226
|
-
const scraper = new WebScraper({
|
201
|
+
const config = {
|
227
202
|
baseURL: "https://bdsmovement.net",
|
228
203
|
excludeList: [
|
229
204
|
"https://bdsmovement.net/press-area",
|
@@ -238,25 +213,18 @@ async function bdsmovement ( enable )
|
|
238
213
|
textOutputPath: "./dataset/bdsmovement/texts",
|
239
214
|
csvOutputPath: "./dataset/bdsmovement/train.csv",
|
240
215
|
includeMetadata: true,
|
241
|
-
metadataFields: ["author", "
|
242
|
-
puppeteerProxy: "socks5://127.0.0.1:2080",
|
243
|
-
puppeteerExecutablePath: "/usr/bin/chromium",
|
216
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
|
244
217
|
puppeteerRealProxy: {
|
245
218
|
host: "socks5://127.0.0.1",
|
246
219
|
port: "2080",
|
247
220
|
},
|
248
|
-
}
|
249
|
-
|
250
|
-
{
|
251
|
-
await scraper.start();
|
252
|
-
}
|
253
|
-
return scraper;
|
221
|
+
};
|
222
|
+
return await runScraper( config, enable );
|
254
223
|
}
|
255
224
|
|
256
225
|
async function palestineremembered ( enable )
|
257
226
|
{
|
258
|
-
|
259
|
-
const scraper = new WebScraper({
|
227
|
+
const config = {
|
260
228
|
baseURL: "https://www.palestineremembered.com",
|
261
229
|
startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
|
262
230
|
excludeList: [
|
@@ -283,18 +251,14 @@ async function palestineremembered ( enable )
|
|
283
251
|
textOutputPath: "./dataset/palestineremembered/texts",
|
284
252
|
csvOutputPath: "./dataset/palestineremembered/train.csv",
|
285
253
|
includeMetadata: true,
|
286
|
-
metadataFields: ["author", "
|
254
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
|
287
255
|
axiosProxy: {
|
288
256
|
host: "localhost",
|
289
257
|
port: 2080,
|
290
258
|
protocol: "http"
|
291
259
|
}
|
292
|
-
}
|
293
|
-
|
294
|
-
{
|
295
|
-
await scraper.start();
|
296
|
-
}
|
297
|
-
return scraper;
|
260
|
+
};
|
261
|
+
return await runScraper( config, enable );
|
298
262
|
}
|
299
263
|
|
300
264
|
void async function main ()
|
@@ -316,4 +280,4 @@ void async function main ()
|
|
316
280
|
standWithPalestineScraper,
|
317
281
|
mondoweisScraper
|
318
282
|
] );
|
319
|
-
}()
|
283
|
+
}();
|
package/main.js
CHANGED
@@ -15,13 +15,12 @@ class WebScraper
|
|
15
15
|
this.strictBaseURL = config.strictBaseURL || true;
|
16
16
|
this.maxDepth = config.maxDepth || Infinity;
|
17
17
|
this.maxArticles = config.maxArticles || Infinity;
|
18
|
-
this.concurrencyLimit = config.concurrencyLimit || 2;
|
19
18
|
this.crawlingDelay = config.crawlingDelay ?? 1000;
|
20
19
|
|
21
20
|
// Output paths setup
|
22
21
|
this.scrapResultPath = config.scrapResultPath || "./dataset";
|
23
22
|
this.textOutputPath = config.textOutputPath || path.join( this.scrapResultPath, "texts" );
|
24
|
-
this.textOutputPathWithMeta = `${this.textOutputPath
|
23
|
+
this.textOutputPathWithMeta = `${this.textOutputPath}_with_metadata`;
|
25
24
|
this.jsonlOutputPath = config.jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
|
26
25
|
this.jsonlOutputPathWithMeta = this.jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
|
27
26
|
this.csvOutputPath = config.csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
|
@@ -60,8 +59,6 @@ class WebScraper
|
|
60
59
|
|
61
60
|
// Puppeteer configuration
|
62
61
|
this.usePuppeteer = config.usePuppeteer || false;
|
63
|
-
this.puppeteerProxy = config.puppeteerProxy; // http://127.0.0.1:2080
|
64
|
-
this.puppeteerExecutablePath = config.puppeteerExecutablePath;
|
65
62
|
this.puppeteerRealProxy = config.puppeteerRealProxy;
|
66
63
|
this.configurePuppeteer();
|
67
64
|
}
|
@@ -138,16 +135,14 @@ class WebScraper
|
|
138
135
|
|
139
136
|
if ( !this.isExcluded( url ) )
|
140
137
|
{
|
141
|
-
const reader = new Readability( document
|
142
|
-
charThreshold: 500,
|
143
|
-
nbTopCandidates: 20
|
144
|
-
});
|
138
|
+
const reader = new Readability( document );
|
145
139
|
const article = reader.parse();
|
146
140
|
if ( article )
|
147
141
|
{
|
148
142
|
if ( this.hasValidPageContent( article.textContent ) )
|
149
143
|
{
|
150
144
|
const metadata = this.extractMetadata( url, document );
|
145
|
+
metadata.articleTitle = article.title || "";
|
151
146
|
this.saveArticle( url, article.textContent, metadata );
|
152
147
|
}
|
153
148
|
else
|
@@ -506,17 +501,14 @@ class WebScraper
|
|
506
501
|
{
|
507
502
|
return {
|
508
503
|
url,
|
509
|
-
|
504
|
+
pageTitle: document.title,
|
510
505
|
description: document.querySelector( "meta[name=\"description\"]" )?.content,
|
511
506
|
keywords: document.querySelector( "meta[name=\"keywords\"]" )?.content,
|
512
507
|
author: document.querySelector( "meta[name=\"author\"]" )?.content,
|
513
|
-
language:
|
514
|
-
document.documentElement.lang ||
|
515
|
-
document.querySelector( "html" )?.getAttribute( "lang" ),
|
508
|
+
language: document.documentElement.lang || document.querySelector( "html" )?.getAttribute( "lang" ),
|
516
509
|
canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
|
517
510
|
ogTitle: document.querySelector( "meta[property=\"og:title\"]" )?.content,
|
518
|
-
ogDescription: document.querySelector( "meta[property=\"og:description\"]" )
|
519
|
-
?.content,
|
511
|
+
ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
|
520
512
|
ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
|
521
513
|
ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
|
522
514
|
dateScrapedDate: new Date().toISOString()
|
@@ -562,23 +554,6 @@ class WebScraper
|
|
562
554
|
|
563
555
|
configurePuppeteer ( )
|
564
556
|
{
|
565
|
-
this.puppeteerOptions = {
|
566
|
-
headless: false,
|
567
|
-
userDataDir: "./tmp/browser",
|
568
|
-
defaultViewport: null,
|
569
|
-
args: ["--start-maximized"],
|
570
|
-
ignoreDefaultArgs: true
|
571
|
-
};
|
572
|
-
|
573
|
-
if ( this.puppeteerProxy )
|
574
|
-
{
|
575
|
-
this.puppeteerOptions.args.push( `--proxy-server=${this.puppeteerProxy}` );
|
576
|
-
}
|
577
|
-
if ( this.puppeteerExecutablePath )
|
578
|
-
{
|
579
|
-
this.puppeteerOptions.executablePath = this.puppeteerExecutablePath;
|
580
|
-
}
|
581
|
-
|
582
557
|
this.puppeteerRealOptions = {
|
583
558
|
headless: false,
|
584
559
|
args: [],
|