clean-web-scraper 3.10.0 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -3
- package/example-usage.js +36 -74
- package/main.js +1 -21
- package/package.json +1 -1
package/README.md
CHANGED
@@ -56,7 +56,6 @@ const scraper = new WebScraper({
|
|
56
56
|
strictBaseURL: true, // Optional: Only scrape URLs from same domain
|
57
57
|
maxDepth: Infinity, // Optional: Maximum crawling depth
|
58
58
|
maxArticles: Infinity, // Optional: Maximum articles to scrape
|
59
|
-
concurrencyLimit: 2, // Optional: Limit concurrent requests
|
60
59
|
crawlingDelay: 1000, // Optional: Delay between requests (ms)
|
61
60
|
|
62
61
|
// Network options
|
@@ -72,8 +71,6 @@ const scraper = new WebScraper({
|
|
72
71
|
|
73
72
|
// Puppeteer options for handling dynamic content
|
74
73
|
usePuppeteer: false, // Optional: Enable Puppeteer browser
|
75
|
-
puppeteerProxy: "http://127.0.0.1:2080", // Optional: Puppeteer proxy
|
76
|
-
puppeteerExecutablePath: "/path/to/chrome", // Optional: Custom browser path
|
77
74
|
});
|
78
75
|
await scraper.start();
|
79
76
|
```
|
package/example-usage.js
CHANGED
@@ -1,17 +1,25 @@
|
|
1
1
|
const WebScraper = require( "./main" );
|
2
2
|
|
3
|
-
// const cookies = "cf_clearance=ENHJkpw.ycd1tZ_A.d0O27QdslTN0EHaNurhCznfimg-1738241402-1.2.1.1-BlO.WitkGwE3U3vSamX35xP.AgN1HyvHWL03Jhe.twbn4QWojiw1T4.0M4lE_TcIeZrQ6ErwV9kQBMBKmfU0S6lQth1BJx7UpWn4T6wtFm83LmF.cB13PQYSQgGFGsH7qOkGIjbBhMbceQNp.y2XZgLq_hdntGKSBMe0iCUotx_xsqlzkolQIqnUYID3BLEQXZqNvqJOwkzLZ7.kzrwP42VdEuWEvT4jt7F3TkTaU9rumAp8FSNO1.hnr76Tv23OITm17rPD3__Ghdu1D0E.4v693nEiVYO_KQYNf_8gk0vXP.KAvUKA2zQyBmDXkfW3M1MkoLjFNZCanx9FPRVO7g";
|
4
3
|
const headers = {
|
5
4
|
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
|
6
5
|
"Cache-Control": "private",
|
7
6
|
"Accept": "application/xml,application/xhtml+xml,text/html;q=0.9, text/plain;q=0.8,image/png,*/*;q=0.5",
|
8
7
|
// "Cookie": cookies
|
8
|
+
};
|
9
|
+
|
10
|
+
async function runScraper ( config, enable )
|
11
|
+
{
|
12
|
+
const scraper = new WebScraper( config );
|
13
|
+
if ( enable )
|
14
|
+
{
|
15
|
+
await scraper.start();
|
16
|
+
}
|
17
|
+
return scraper;
|
9
18
|
}
|
10
19
|
|
11
20
|
async function palianswers ( enable )
|
12
21
|
{
|
13
|
-
|
14
|
-
const scraper = new WebScraper({
|
22
|
+
const config = {
|
15
23
|
baseURL: "https://palianswers.com",
|
16
24
|
excludeList: [
|
17
25
|
"https://palianswers.com/chat/",
|
@@ -22,7 +30,7 @@ async function palianswers ( enable )
|
|
22
30
|
"https://palianswers.com/themes/"
|
23
31
|
],
|
24
32
|
exactExcludeList: [
|
25
|
-
"https://palianswers.com/"
|
33
|
+
"https://palianswers.com/"
|
26
34
|
],
|
27
35
|
scrapResultPath: "./dataset/palianswers/website",
|
28
36
|
jsonlOutputPath: "./dataset/palianswers/train.jsonl",
|
@@ -31,21 +39,14 @@ async function palianswers ( enable )
|
|
31
39
|
includeMetadata: true,
|
32
40
|
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
33
41
|
axiosRetryDelay: 10000,
|
34
|
-
concurrencyLimit: 4,
|
35
42
|
crawlingDelay: 0
|
36
|
-
});
|
37
|
-
if ( enable )
|
38
|
-
{
|
39
|
-
await scraper.start();
|
40
|
-
}
|
41
|
-
return scraper;
|
43
|
+
};
|
44
|
+
return await runScraper( config, enable );
|
42
45
|
}
|
43
46
|
|
44
47
|
async function khameneiIrFreePalestineTag ( enable )
|
45
48
|
{
|
46
|
-
|
47
|
-
// https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#
|
48
|
-
const scraper = new WebScraper({
|
49
|
+
const config = {
|
49
50
|
baseURL: "https://english.khamenei.ir/news",
|
50
51
|
startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
|
51
52
|
maxDepth: 1,
|
@@ -59,19 +60,13 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
59
60
|
includeMetadata: true,
|
60
61
|
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
61
62
|
axiosRetryDelay: 10000,
|
62
|
-
|
63
|
-
|
64
|
-
if ( enable )
|
65
|
-
{
|
66
|
-
await scraper.start();
|
67
|
-
}
|
68
|
-
return scraper;
|
63
|
+
};
|
64
|
+
return await runScraper( config, enable );
|
69
65
|
}
|
70
66
|
|
71
67
|
async function decolonizepalestine ( enable )
|
72
68
|
{
|
73
|
-
|
74
|
-
const scraper = new WebScraper({
|
69
|
+
const config = {
|
75
70
|
baseURL: "https://decolonizepalestine.com",
|
76
71
|
excludeList: [
|
77
72
|
"https://decolonizepalestine.com/cdn-cgi",
|
@@ -91,19 +86,13 @@ async function decolonizepalestine ( enable )
|
|
91
86
|
includeMetadata: true,
|
92
87
|
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
93
88
|
axiosRetryDelay: 10000,
|
94
|
-
|
95
|
-
|
96
|
-
if ( enable )
|
97
|
-
{
|
98
|
-
await scraper.start();
|
99
|
-
}
|
100
|
-
return scraper;
|
89
|
+
};
|
90
|
+
return await runScraper( config, enable );
|
101
91
|
}
|
102
92
|
|
103
93
|
async function electronicintifada ( enable )
|
104
94
|
{
|
105
|
-
|
106
|
-
const scraper = new WebScraper({
|
95
|
+
const config = {
|
107
96
|
baseURL: "https://electronicintifada.net",
|
108
97
|
excludeList: [
|
109
98
|
"https://electronicintifada.net/updates",
|
@@ -132,7 +121,6 @@ async function electronicintifada ( enable )
|
|
132
121
|
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
133
122
|
maxDepth: 16,
|
134
123
|
maxArticles: 2000,
|
135
|
-
concurrencyLimit: 2,
|
136
124
|
axiosHeaders: headers,
|
137
125
|
axiosRetryDelay: 10000,
|
138
126
|
axiosProxy: {
|
@@ -141,17 +129,13 @@ async function electronicintifada ( enable )
|
|
141
129
|
protocol: "http"
|
142
130
|
},
|
143
131
|
useProxyAsFallback: true
|
144
|
-
});
|
145
|
-
if ( enable )
|
146
|
-
{
|
147
|
-
await scraper.start();
|
148
|
-
}
|
149
|
-
return scraper;
|
132
|
+
};
|
133
|
+
return await runScraper( config, enable );
|
150
134
|
}
|
151
135
|
|
152
136
|
async function standWithPalestine ( enable )
|
153
137
|
{
|
154
|
-
const scraper = new WebScraper({
|
138
|
+
const config = {
|
155
139
|
baseURL: "https://stand-with-palestine.org/blogs",
|
156
140
|
startURL: "https://stand-with-palestine.org/blogs",
|
157
141
|
scrapResultPath: "./dataset/stand-with-palestine/website",
|
@@ -162,18 +146,13 @@ async function standWithPalestine ( enable )
|
|
162
146
|
axiosHeaders: headers,
|
163
147
|
includeMetadata: true,
|
164
148
|
metadataFields: ["author", "title", "description", "dateScrapedDate"]
|
165
|
-
});
|
166
|
-
if ( enable )
|
167
|
-
{
|
168
|
-
await scraper.start();
|
169
|
-
}
|
170
|
-
return scraper;
|
149
|
+
};
|
150
|
+
return await runScraper( config, enable );
|
171
151
|
}
|
172
152
|
|
173
153
|
async function mondoweiss ( enable )
|
174
154
|
{
|
175
|
-
|
176
|
-
const scraper = new WebScraper({
|
155
|
+
const config = {
|
177
156
|
baseURL: "https://mondoweiss.net",
|
178
157
|
excludeList: [
|
179
158
|
"https://mondoweiss.net/donate",
|
@@ -200,7 +179,6 @@ async function mondoweiss ( enable )
|
|
200
179
|
csvOutputPath: "./dataset/mondoweiss/train.csv",
|
201
180
|
maxArticles: 2500,
|
202
181
|
axiosMaxRetries: 3,
|
203
|
-
concurrencyLimit: 3,
|
204
182
|
axiosHeaders: headers,
|
205
183
|
axiosProxy: {
|
206
184
|
host: "localhost",
|
@@ -212,18 +190,13 @@ async function mondoweiss ( enable )
|
|
212
190
|
includeMetadata: true,
|
213
191
|
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
214
192
|
useProxyAsFallback: true
|
215
|
-
});
|
216
|
-
if ( enable )
|
217
|
-
{
|
218
|
-
await scraper.start();
|
219
|
-
}
|
220
|
-
return scraper;
|
193
|
+
};
|
194
|
+
return await runScraper( config, enable );
|
221
195
|
}
|
222
196
|
|
223
197
|
async function bdsmovement ( enable )
|
224
198
|
{
|
225
|
-
|
226
|
-
const scraper = new WebScraper({
|
199
|
+
const config = {
|
227
200
|
baseURL: "https://bdsmovement.net",
|
228
201
|
excludeList: [
|
229
202
|
"https://bdsmovement.net/press-area",
|
@@ -239,24 +212,17 @@ async function bdsmovement ( enable )
|
|
239
212
|
csvOutputPath: "./dataset/bdsmovement/train.csv",
|
240
213
|
includeMetadata: true,
|
241
214
|
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
242
|
-
puppeteerProxy: "socks5://127.0.0.1:2080",
|
243
|
-
puppeteerExecutablePath: "/usr/bin/chromium",
|
244
215
|
puppeteerRealProxy: {
|
245
216
|
host: "socks5://127.0.0.1",
|
246
217
|
port: "2080",
|
247
218
|
},
|
248
|
-
});
|
249
|
-
if ( enable )
|
250
|
-
{
|
251
|
-
await scraper.start();
|
252
|
-
}
|
253
|
-
return scraper;
|
219
|
+
};
|
220
|
+
return await runScraper( config, enable );
|
254
221
|
}
|
255
222
|
|
256
223
|
async function palestineremembered ( enable )
|
257
224
|
{
|
258
|
-
|
259
|
-
const scraper = new WebScraper({
|
225
|
+
const config = {
|
260
226
|
baseURL: "https://www.palestineremembered.com",
|
261
227
|
startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
|
262
228
|
excludeList: [
|
@@ -289,12 +255,8 @@ async function palestineremembered ( enable )
|
|
289
255
|
port: 2080,
|
290
256
|
protocol: "http"
|
291
257
|
}
|
292
|
-
});
|
293
|
-
if ( enable )
|
294
|
-
{
|
295
|
-
await scraper.start();
|
296
|
-
}
|
297
|
-
return scraper;
|
258
|
+
};
|
259
|
+
return await runScraper( config, enable );
|
298
260
|
}
|
299
261
|
|
300
262
|
void async function main ()
|
@@ -316,4 +278,4 @@ void async function main ()
|
|
316
278
|
standWithPalestineScraper,
|
317
279
|
mondoweisScraper
|
318
280
|
] );
|
319
|
-
}()
|
281
|
+
}();
|
package/main.js
CHANGED
@@ -15,13 +15,12 @@ class WebScraper
|
|
15
15
|
this.strictBaseURL = config.strictBaseURL || true;
|
16
16
|
this.maxDepth = config.maxDepth || Infinity;
|
17
17
|
this.maxArticles = config.maxArticles || Infinity;
|
18
|
-
this.concurrencyLimit = config.concurrencyLimit || 2;
|
19
18
|
this.crawlingDelay = config.crawlingDelay ?? 1000;
|
20
19
|
|
21
20
|
// Output paths setup
|
22
21
|
this.scrapResultPath = config.scrapResultPath || "./dataset";
|
23
22
|
this.textOutputPath = config.textOutputPath || path.join( this.scrapResultPath, "texts" );
|
24
|
-
this.textOutputPathWithMeta = `${this.textOutputPath
|
23
|
+
this.textOutputPathWithMeta = `${this.textOutputPath}_with_metadata`;
|
25
24
|
this.jsonlOutputPath = config.jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
|
26
25
|
this.jsonlOutputPathWithMeta = this.jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
|
27
26
|
this.csvOutputPath = config.csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
|
@@ -60,8 +59,6 @@ class WebScraper
|
|
60
59
|
|
61
60
|
// Puppeteer configuration
|
62
61
|
this.usePuppeteer = config.usePuppeteer || false;
|
63
|
-
this.puppeteerProxy = config.puppeteerProxy; // http://127.0.0.1:2080
|
64
|
-
this.puppeteerExecutablePath = config.puppeteerExecutablePath;
|
65
62
|
this.puppeteerRealProxy = config.puppeteerRealProxy;
|
66
63
|
this.configurePuppeteer();
|
67
64
|
}
|
@@ -562,23 +559,6 @@ class WebScraper
|
|
562
559
|
|
563
560
|
configurePuppeteer ( )
|
564
561
|
{
|
565
|
-
this.puppeteerOptions = {
|
566
|
-
headless: false,
|
567
|
-
userDataDir: "./tmp/browser",
|
568
|
-
defaultViewport: null,
|
569
|
-
args: ["--start-maximized"],
|
570
|
-
ignoreDefaultArgs: true
|
571
|
-
};
|
572
|
-
|
573
|
-
if ( this.puppeteerProxy )
|
574
|
-
{
|
575
|
-
this.puppeteerOptions.args.push( `--proxy-server=${this.puppeteerProxy}` );
|
576
|
-
}
|
577
|
-
if ( this.puppeteerExecutablePath )
|
578
|
-
{
|
579
|
-
this.puppeteerOptions.executablePath = this.puppeteerExecutablePath;
|
580
|
-
}
|
581
|
-
|
582
562
|
this.puppeteerRealOptions = {
|
583
563
|
headless: false,
|
584
564
|
args: [],
|