clean-web-scraper 3.8.0 → 3.8.2
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/example-usage.js +113 -91
- package/main.js +14 -2
- package/package.json +1 -1
package/example-usage.js
CHANGED
@@ -29,7 +29,8 @@ async function palianswers ( enable )
|
|
29
29
|
textOutputPath: "./dataset/palianswers/texts",
|
30
30
|
csvOutputPath: "./dataset/palianswers/train.csv",
|
31
31
|
includeMetadata: true,
|
32
|
-
metadataFields: ["author", "title", "description", "dateScrapedDate"]
|
32
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
33
|
+
retryDelay: 10000
|
33
34
|
});
|
34
35
|
if ( enable )
|
35
36
|
{
|
@@ -54,7 +55,8 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
54
55
|
textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
|
55
56
|
csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
|
56
57
|
includeMetadata: true,
|
57
|
-
metadataFields: ["author", "title", "description", "dateScrapedDate"]
|
58
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
59
|
+
retryDelay: 10000
|
58
60
|
});
|
59
61
|
if ( enable )
|
60
62
|
{
|
@@ -84,40 +86,8 @@ async function decolonizepalestine ( enable )
|
|
84
86
|
textOutputPath: "./dataset/decolonizepalestine/texts",
|
85
87
|
csvOutputPath: "./dataset/decolonizepalestine/train.csv",
|
86
88
|
includeMetadata: true,
|
87
|
-
metadataFields: ["author", "title", "description", "dateScrapedDate"]
|
88
|
-
});
|
89
|
-
if ( enable )
|
90
|
-
{
|
91
|
-
await scraper.start();
|
92
|
-
}
|
93
|
-
return scraper;
|
94
|
-
}
|
95
|
-
|
96
|
-
async function bdsmovement ( enable )
|
97
|
-
{
|
98
|
-
// https://bdsmovement.net
|
99
|
-
const scraper = new WebScraper({
|
100
|
-
baseURL: "https://bdsmovement.net",
|
101
|
-
excludeList: [
|
102
|
-
"https://bdsmovement.net/press-area",
|
103
|
-
"https://bdsmovement.net/privacy-policy",
|
104
|
-
"https://bdsmovement.net/get-involved/join-a-bds-campaign",
|
105
|
-
"https://bdsmovement.net/donate_",
|
106
|
-
"https://bdsmovement.net/user",
|
107
|
-
"https://bdsmovement.net/admin"
|
108
|
-
],
|
109
|
-
scrapResultPath: "./dataset/bdsmovement/website",
|
110
|
-
jsonlOutputPath: "./dataset/bdsmovement/train.jsonl",
|
111
|
-
textOutputPath: "./dataset/bdsmovement/texts",
|
112
|
-
csvOutputPath: "./dataset/bdsmovement/train.csv",
|
113
|
-
includeMetadata: true,
|
114
89
|
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
115
|
-
|
116
|
-
puppeteerExecutablePath: "/usr/bin/chromium",
|
117
|
-
puppeteerRealProxy: {
|
118
|
-
host: "socks5://127.0.0.1",
|
119
|
-
port: "2080",
|
120
|
-
},
|
90
|
+
retryDelay: 10000
|
121
91
|
});
|
122
92
|
if ( enable )
|
123
93
|
{
|
@@ -155,53 +125,11 @@ async function electronicintifada ( enable )
|
|
155
125
|
textOutputPath: "./dataset/electronicintifada/texts",
|
156
126
|
csvOutputPath: "./dataset/electronicintifada/train.csv",
|
157
127
|
includeMetadata: true,
|
128
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
129
|
+
maxDepth: 10,
|
158
130
|
maxArticles: 2000,
|
159
131
|
axiosHeaders: headers,
|
160
|
-
|
161
|
-
});
|
162
|
-
if ( enable )
|
163
|
-
{
|
164
|
-
await scraper.start();
|
165
|
-
}
|
166
|
-
return scraper;
|
167
|
-
}
|
168
|
-
|
169
|
-
async function palestineremembered ( enable )
|
170
|
-
{
|
171
|
-
// https://www.palestineremembered.com
|
172
|
-
const scraper = new WebScraper({
|
173
|
-
baseURL: "https://www.palestineremembered.com",
|
174
|
-
startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
|
175
|
-
excludeList: [
|
176
|
-
"https://www.palestineremembered.com/GeoPoints",
|
177
|
-
"https://www.palestineremembered.com/Donate",
|
178
|
-
"https://www.palestineremembered.com/ContactUs.html",
|
179
|
-
"https://www.palestineremembered.com/tags/Looting-Palestinian-properties.html",
|
180
|
-
"https://www.palestineremembered.com/ar/",
|
181
|
-
"https://www.palestineremembered.com/OldNewPictures.html",
|
182
|
-
"https://www.palestineremembered.com/Maps/index.html",
|
183
|
-
"https://www.palestineremembered.com/OralHistory/Interviews-Listing/",
|
184
|
-
"https://www.palestineremembered.com/Acre/Famous-Zionist-Quotes/Story637.html",
|
185
|
-
"https://www.palestineremembered.com/Articles/General/Story2045.html",
|
186
|
-
"https://www.palestineremembered.com/AllTownsListing.html",
|
187
|
-
"https://www.palestineremembered.com/Articles/General/ar/",
|
188
|
-
"https://www.palestineremembered.com/SiteVideos.html"
|
189
|
-
],
|
190
|
-
exactExcludeList: [
|
191
|
-
"https://www.palestineremembered.com/index.html",
|
192
|
-
"https://www.palestineremembered.com/ZionistFAQ.html"
|
193
|
-
],
|
194
|
-
scrapResultPath: "./dataset/palestineremembered/website",
|
195
|
-
jsonlOutputPath: "./dataset/palestineremembered/train.jsonl",
|
196
|
-
textOutputPath: "./dataset/palestineremembered/texts",
|
197
|
-
csvOutputPath: "./dataset/palestineremembered/train.csv",
|
198
|
-
includeMetadata: true,
|
199
|
-
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
200
|
-
axiosProxy: {
|
201
|
-
host: "localhost",
|
202
|
-
port: 2080,
|
203
|
-
protocol: "http"
|
204
|
-
}
|
132
|
+
retryDelay: 10000
|
205
133
|
});
|
206
134
|
if ( enable )
|
207
135
|
{
|
@@ -240,15 +168,36 @@ async function mondoweiss ( enable )
|
|
240
168
|
"https://mondoweiss.net/donate",
|
241
169
|
"https://mondoweiss.net/advertise/",
|
242
170
|
"https://mondoweiss.net/contact/",
|
243
|
-
"https://mondoweiss.net/recent-comments/"
|
171
|
+
"https://mondoweiss.net/recent-comments/",
|
172
|
+
"https://mondoweiss.net/email-newsletters",
|
173
|
+
"https://mondoweiss.net/author",
|
174
|
+
"https://mondoweiss.net/tag/"
|
175
|
+
],
|
176
|
+
exactExcludeList: [
|
177
|
+
"https://mondoweiss.net",
|
178
|
+
"https://mondoweiss.net/news/",
|
179
|
+
"https://mondoweiss.net/opinion/",
|
180
|
+
"https://mondoweiss.net/ways-to-give/",
|
181
|
+
"https://mondoweiss.net/media-analysis/",
|
182
|
+
"https://mondoweiss.net/culture/",
|
183
|
+
"https://mondoweiss.net/activism/",
|
184
|
+
"https://mondoweiss.net/news-letters/"
|
244
185
|
],
|
245
186
|
scrapResultPath: "./dataset/mondoweiss/website",
|
246
187
|
jsonlOutputPath: "./dataset/mondoweiss/train.jsonl",
|
247
188
|
textOutputPath: "./dataset/mondoweiss/texts",
|
248
189
|
csvOutputPath: "./dataset/mondoweiss/train.csv",
|
249
|
-
includeMetadata: true,
|
250
190
|
maxArticles: 2500,
|
191
|
+
maxRetries: 2,
|
251
192
|
axiosHeaders: headers,
|
193
|
+
axiosProxy: {
|
194
|
+
host: "localhost",
|
195
|
+
port: 2080,
|
196
|
+
protocol: "http"
|
197
|
+
},
|
198
|
+
maxDepth: 10,
|
199
|
+
retryDelay: 10000,
|
200
|
+
includeMetadata: true,
|
252
201
|
metadataFields: ["author", "title", "description", "dateScrapedDate"]
|
253
202
|
});
|
254
203
|
if ( enable )
|
@@ -258,14 +207,90 @@ async function mondoweiss ( enable )
|
|
258
207
|
return scraper;
|
259
208
|
}
|
260
209
|
|
210
|
+
async function bdsmovement ( enable )
|
211
|
+
{
|
212
|
+
// https://bdsmovement.net
|
213
|
+
const scraper = new WebScraper({
|
214
|
+
baseURL: "https://bdsmovement.net",
|
215
|
+
excludeList: [
|
216
|
+
"https://bdsmovement.net/press-area",
|
217
|
+
"https://bdsmovement.net/privacy-policy",
|
218
|
+
"https://bdsmovement.net/get-involved/join-a-bds-campaign",
|
219
|
+
"https://bdsmovement.net/donate_",
|
220
|
+
"https://bdsmovement.net/user",
|
221
|
+
"https://bdsmovement.net/admin"
|
222
|
+
],
|
223
|
+
scrapResultPath: "./dataset/bdsmovement/website",
|
224
|
+
jsonlOutputPath: "./dataset/bdsmovement/train.jsonl",
|
225
|
+
textOutputPath: "./dataset/bdsmovement/texts",
|
226
|
+
csvOutputPath: "./dataset/bdsmovement/train.csv",
|
227
|
+
includeMetadata: true,
|
228
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
229
|
+
puppeteerProxy: "socks5://127.0.0.1:2080",
|
230
|
+
puppeteerExecutablePath: "/usr/bin/chromium",
|
231
|
+
puppeteerRealProxy: {
|
232
|
+
host: "socks5://127.0.0.1",
|
233
|
+
port: "2080",
|
234
|
+
},
|
235
|
+
});
|
236
|
+
if ( enable )
|
237
|
+
{
|
238
|
+
await scraper.start();
|
239
|
+
}
|
240
|
+
return scraper;
|
241
|
+
}
|
242
|
+
|
243
|
+
async function palestineremembered ( enable )
|
244
|
+
{
|
245
|
+
// https://www.palestineremembered.com
|
246
|
+
const scraper = new WebScraper({
|
247
|
+
baseURL: "https://www.palestineremembered.com",
|
248
|
+
startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
|
249
|
+
excludeList: [
|
250
|
+
"https://www.palestineremembered.com/GeoPoints",
|
251
|
+
"https://www.palestineremembered.com/Donate",
|
252
|
+
"https://www.palestineremembered.com/ContactUs.html",
|
253
|
+
"https://www.palestineremembered.com/tags/Looting-Palestinian-properties.html",
|
254
|
+
"https://www.palestineremembered.com/ar/",
|
255
|
+
"https://www.palestineremembered.com/OldNewPictures.html",
|
256
|
+
"https://www.palestineremembered.com/Maps/index.html",
|
257
|
+
"https://www.palestineremembered.com/OralHistory/Interviews-Listing/",
|
258
|
+
"https://www.palestineremembered.com/Acre/Famous-Zionist-Quotes/Story637.html",
|
259
|
+
"https://www.palestineremembered.com/Articles/General/Story2045.html",
|
260
|
+
"https://www.palestineremembered.com/AllTownsListing.html",
|
261
|
+
"https://www.palestineremembered.com/Articles/General/ar/",
|
262
|
+
"https://www.palestineremembered.com/SiteVideos.html"
|
263
|
+
],
|
264
|
+
exactExcludeList: [
|
265
|
+
"https://www.palestineremembered.com/index.html",
|
266
|
+
"https://www.palestineremembered.com/ZionistFAQ.html"
|
267
|
+
],
|
268
|
+
scrapResultPath: "./dataset/palestineremembered/website",
|
269
|
+
jsonlOutputPath: "./dataset/palestineremembered/train.jsonl",
|
270
|
+
textOutputPath: "./dataset/palestineremembered/texts",
|
271
|
+
csvOutputPath: "./dataset/palestineremembered/train.csv",
|
272
|
+
includeMetadata: true,
|
273
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
274
|
+
axiosProxy: {
|
275
|
+
host: "localhost",
|
276
|
+
port: 2080,
|
277
|
+
protocol: "http"
|
278
|
+
}
|
279
|
+
});
|
280
|
+
if ( enable )
|
281
|
+
{
|
282
|
+
await scraper.start();
|
283
|
+
}
|
284
|
+
return scraper;
|
285
|
+
}
|
261
286
|
|
262
287
|
void async function main ()
|
263
288
|
{
|
264
|
-
const palianswersScraper = await palianswers(
|
265
|
-
const decolonizepalestineScraper = await decolonizepalestine(
|
266
|
-
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag(
|
267
|
-
const electronicintifadaScraper = await electronicintifada(
|
268
|
-
const standWithPalestineScraper = await standWithPalestine(
|
289
|
+
const palianswersScraper = await palianswers( true );
|
290
|
+
const decolonizepalestineScraper = await decolonizepalestine( true );
|
291
|
+
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( true );
|
292
|
+
const electronicintifadaScraper = await electronicintifada( true );
|
293
|
+
const standWithPalestineScraper = await standWithPalestine( true );
|
269
294
|
const mondoweisScraper = await mondoweiss( true );
|
270
295
|
const bdsmovementScraper = await bdsmovement( false );
|
271
296
|
const palestinerememberedScraper = await palestineremembered( false );
|
@@ -278,7 +303,4 @@ void async function main ()
|
|
278
303
|
standWithPalestineScraper,
|
279
304
|
mondoweisScraper
|
280
305
|
] );
|
281
|
-
}()
|
282
|
-
|
283
|
-
|
284
|
-
// https://mondoweiss.net
|
306
|
+
}()
|
package/main.js
CHANGED
@@ -16,12 +16,14 @@ class WebScraper
|
|
16
16
|
maxArticles,
|
17
17
|
concurrencyLimit,
|
18
18
|
maxRetries,
|
19
|
+
retryDelay,
|
19
20
|
|
20
21
|
// URL filtering
|
21
22
|
excludeList = [],
|
22
23
|
exactExcludeList = [],
|
23
24
|
filterFileTypes,
|
24
25
|
excludedFileTypes,
|
26
|
+
removeURLFragment,
|
25
27
|
|
26
28
|
// Output paths
|
27
29
|
scrapResultPath = "./dataset",
|
@@ -52,6 +54,7 @@ class WebScraper
|
|
52
54
|
this.maxArticles = maxArticles || Infinity;
|
53
55
|
this.concurrencyLimit = concurrencyLimit || 2;
|
54
56
|
this.maxRetries = maxRetries || 5;
|
57
|
+
this.retryDelay = retryDelay || 40000;
|
55
58
|
|
56
59
|
// Output paths setup
|
57
60
|
this.scrapResultPath = scrapResultPath;
|
@@ -72,6 +75,7 @@ class WebScraper
|
|
72
75
|
this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
|
73
76
|
this.filterFileTypes = filterFileTypes || true;
|
74
77
|
this.excludedFileTypes = excludedFileTypes || [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"];
|
78
|
+
this.removeURLFragment = removeURLFragment || true;
|
75
79
|
|
76
80
|
// Network configuration
|
77
81
|
this.axiosHeaders = axiosHeaders;
|
@@ -130,6 +134,10 @@ class WebScraper
|
|
130
134
|
|
131
135
|
async fetchPage ( url, depth )
|
132
136
|
{
|
137
|
+
if ( this.removeURLFragment )
|
138
|
+
{
|
139
|
+
url = url.split( "#" )[0];
|
140
|
+
}
|
133
141
|
if ( this.hasReachedMax( depth ) )
|
134
142
|
{
|
135
143
|
return;
|
@@ -233,7 +241,7 @@ class WebScraper
|
|
233
241
|
}
|
234
242
|
catch ( error )
|
235
243
|
{
|
236
|
-
console.error( `Error fetching ${url}:`, error.message );
|
244
|
+
console.error( `Error fetching content ${url}:`, error.message );
|
237
245
|
if ( error.status = 403 && this.usePuppeteer )
|
238
246
|
{
|
239
247
|
try
|
@@ -351,6 +359,10 @@ class WebScraper
|
|
351
359
|
{
|
352
360
|
urlPath = "/index";
|
353
361
|
}
|
362
|
+
else if ( urlPath.endsWith( "/" ) )
|
363
|
+
{
|
364
|
+
urlPath = urlPath.slice( 0, -1 );
|
365
|
+
}
|
354
366
|
const filePath = path.join( __dirname, this.scrapResultPath, urlPath );
|
355
367
|
const dir = path.dirname( filePath );
|
356
368
|
|
@@ -580,7 +592,7 @@ class WebScraper
|
|
580
592
|
catch ( error )
|
581
593
|
{
|
582
594
|
if ( attempt >= this.maxRetries ) throw error;
|
583
|
-
await WebScraper.sleep(
|
595
|
+
await WebScraper.sleep( this.retryDelay * attempt );
|
584
596
|
console.error( `Retrying request to ${url} (Attempt ${attempt + 1}/${this.maxRetries})`, error.message, error.code );
|
585
597
|
}
|
586
598
|
}
|