clean-web-scraper 3.7.6 → 3.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/example-usage.js +84 -14
- package/main.js +10 -5
- package/package.json +1 -1
package/example-usage.js
CHANGED
@@ -10,6 +10,7 @@ const headers = {
|
|
10
10
|
|
11
11
|
async function palianswers ( enable )
|
12
12
|
{
|
13
|
+
// https://palianswers.com
|
13
14
|
const scraper = new WebScraper({
|
14
15
|
baseURL: "https://palianswers.com",
|
15
16
|
excludeList: [
|
@@ -28,7 +29,7 @@ async function palianswers ( enable )
|
|
28
29
|
textOutputPath: "./dataset/palianswers/texts",
|
29
30
|
csvOutputPath: "./dataset/palianswers/train.csv",
|
30
31
|
includeMetadata: true,
|
31
|
-
metadataFields: ["author", "title", "description"]
|
32
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"]
|
32
33
|
});
|
33
34
|
if ( enable )
|
34
35
|
{
|
@@ -44,7 +45,7 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
44
45
|
const scraper = new WebScraper({
|
45
46
|
baseURL: "https://english.khamenei.ir/news",
|
46
47
|
startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
|
47
|
-
maxDepth:
|
48
|
+
maxDepth: 1,
|
48
49
|
exactExcludeList: [
|
49
50
|
"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#"
|
50
51
|
],
|
@@ -53,7 +54,7 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
53
54
|
textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
|
54
55
|
csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
|
55
56
|
includeMetadata: true,
|
56
|
-
metadataFields: ["author", "title", "description"]
|
57
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"]
|
57
58
|
});
|
58
59
|
if ( enable )
|
59
60
|
{
|
@@ -64,6 +65,7 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
64
65
|
|
65
66
|
async function decolonizepalestine ( enable )
|
66
67
|
{
|
68
|
+
// https://decolonizepalestine.com
|
67
69
|
const scraper = new WebScraper({
|
68
70
|
baseURL: "https://decolonizepalestine.com",
|
69
71
|
excludeList: [
|
@@ -82,7 +84,7 @@ async function decolonizepalestine ( enable )
|
|
82
84
|
textOutputPath: "./dataset/decolonizepalestine/texts",
|
83
85
|
csvOutputPath: "./dataset/decolonizepalestine/train.csv",
|
84
86
|
includeMetadata: true,
|
85
|
-
metadataFields: ["author", "title", "description"]
|
87
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"]
|
86
88
|
});
|
87
89
|
if ( enable )
|
88
90
|
{
|
@@ -93,6 +95,7 @@ async function decolonizepalestine ( enable )
|
|
93
95
|
|
94
96
|
async function bdsmovement ( enable )
|
95
97
|
{
|
98
|
+
// https://bdsmovement.net
|
96
99
|
const scraper = new WebScraper({
|
97
100
|
baseURL: "https://bdsmovement.net",
|
98
101
|
excludeList: [
|
@@ -108,7 +111,7 @@ async function bdsmovement ( enable )
|
|
108
111
|
textOutputPath: "./dataset/bdsmovement/texts",
|
109
112
|
csvOutputPath: "./dataset/bdsmovement/train.csv",
|
110
113
|
includeMetadata: true,
|
111
|
-
metadataFields: ["author", "title", "description"],
|
114
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
112
115
|
puppeteerProxy: "socks5://127.0.0.1:2080",
|
113
116
|
puppeteerExecutablePath: "/usr/bin/chromium",
|
114
117
|
puppeteerRealProxy: {
|
@@ -125,6 +128,7 @@ async function bdsmovement ( enable )
|
|
125
128
|
|
126
129
|
async function electronicintifada ( enable )
|
127
130
|
{
|
131
|
+
// https://electronicintifada.net
|
128
132
|
const scraper = new WebScraper({
|
129
133
|
baseURL: "https://electronicintifada.net",
|
130
134
|
excludeList: [
|
@@ -153,7 +157,7 @@ async function electronicintifada ( enable )
|
|
153
157
|
includeMetadata: true,
|
154
158
|
maxArticles: 2000,
|
155
159
|
axiosHeaders: headers,
|
156
|
-
metadataFields: ["author", "title", "description"]
|
160
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"]
|
157
161
|
});
|
158
162
|
if ( enable )
|
159
163
|
{
|
@@ -192,7 +196,7 @@ async function palestineremembered ( enable )
|
|
192
196
|
textOutputPath: "./dataset/palestineremembered/texts",
|
193
197
|
csvOutputPath: "./dataset/palestineremembered/train.csv",
|
194
198
|
includeMetadata: true,
|
195
|
-
metadataFields: ["author", "title", "description"],
|
199
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
196
200
|
axiosProxy: {
|
197
201
|
host: "localhost",
|
198
202
|
port: 2080,
|
@@ -206,13 +210,82 @@ async function palestineremembered ( enable )
|
|
206
210
|
return scraper;
|
207
211
|
}
|
208
212
|
|
213
|
+
async function standWithPalestine ( enable )
|
214
|
+
{
|
215
|
+
const scraper = new WebScraper({
|
216
|
+
baseURL: "https://stand-with-palestine.org/blogs",
|
217
|
+
startURL: "https://stand-with-palestine.org/blogs",
|
218
|
+
scrapResultPath: "./dataset/stand-with-palestine/website",
|
219
|
+
jsonlOutputPath: "./dataset/stand-with-palestine/train.jsonl",
|
220
|
+
textOutputPath: "./dataset/stand-with-palestine/texts",
|
221
|
+
csvOutputPath: "./dataset/stand-with-palestine/train.csv",
|
222
|
+
exactExcludeList: ["https://stand-with-palestine.org/blogs"],
|
223
|
+
axiosHeaders: headers,
|
224
|
+
includeMetadata: true,
|
225
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"]
|
226
|
+
});
|
227
|
+
if ( enable )
|
228
|
+
{
|
229
|
+
await scraper.start();
|
230
|
+
}
|
231
|
+
return scraper;
|
232
|
+
}
|
233
|
+
|
234
|
+
async function mondoweiss ( enable )
|
235
|
+
{
|
236
|
+
// https://mondoweiss.net
|
237
|
+
const scraper = new WebScraper({
|
238
|
+
baseURL: "https://mondoweiss.net",
|
239
|
+
excludeList: [
|
240
|
+
"https://mondoweiss.net/donate",
|
241
|
+
"https://mondoweiss.net/advertise/",
|
242
|
+
"https://mondoweiss.net/contact/",
|
243
|
+
"https://mondoweiss.net/recent-comments/",
|
244
|
+
"https://mondoweiss.net/email-newsletters",
|
245
|
+
"https://mondoweiss.net/author",
|
246
|
+
"https://mondoweiss.net/tag/"
|
247
|
+
],
|
248
|
+
exactExcludeList: [
|
249
|
+
"https://mondoweiss.net",
|
250
|
+
"https://mondoweiss.net/news/",
|
251
|
+
"https://mondoweiss.net/opinion/",
|
252
|
+
"https://mondoweiss.net/ways-to-give/",
|
253
|
+
"https://mondoweiss.net/media-analysis/",
|
254
|
+
"https://mondoweiss.net/culture/",
|
255
|
+
"https://mondoweiss.net/activism/",
|
256
|
+
"https://mondoweiss.net/news-letters/"
|
257
|
+
],
|
258
|
+
scrapResultPath: "./dataset/mondoweiss/website",
|
259
|
+
jsonlOutputPath: "./dataset/mondoweiss/train.jsonl",
|
260
|
+
textOutputPath: "./dataset/mondoweiss/texts",
|
261
|
+
csvOutputPath: "./dataset/mondoweiss/train.csv",
|
262
|
+
includeMetadata: true,
|
263
|
+
maxArticles: 2500,
|
264
|
+
maxRetries: 2,
|
265
|
+
axiosHeaders: headers,
|
266
|
+
axiosProxy: {
|
267
|
+
host: "localhost",
|
268
|
+
port: 2080,
|
269
|
+
protocol: "http"
|
270
|
+
},
|
271
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"]
|
272
|
+
});
|
273
|
+
if ( enable )
|
274
|
+
{
|
275
|
+
await scraper.start();
|
276
|
+
}
|
277
|
+
return scraper;
|
278
|
+
}
|
279
|
+
|
209
280
|
|
210
281
|
void async function main ()
|
211
282
|
{
|
212
283
|
const palianswersScraper = await palianswers( false );
|
213
284
|
const decolonizepalestineScraper = await decolonizepalestine( false );
|
214
285
|
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( false );
|
215
|
-
const electronicintifadaScraper = await electronicintifada(
|
286
|
+
const electronicintifadaScraper = await electronicintifada( false );
|
287
|
+
const standWithPalestineScraper = await standWithPalestine( false );
|
288
|
+
const mondoweisScraper = await mondoweiss( true );
|
216
289
|
const bdsmovementScraper = await bdsmovement( false );
|
217
290
|
const palestinerememberedScraper = await palestineremembered( false );
|
218
291
|
|
@@ -221,10 +294,7 @@ void async function main ()
|
|
221
294
|
decolonizepalestineScraper,
|
222
295
|
khameneiIrFreePalestineTagScraper,
|
223
296
|
electronicintifadaScraper,
|
224
|
-
|
225
|
-
|
297
|
+
standWithPalestineScraper,
|
298
|
+
mondoweisScraper
|
226
299
|
] );
|
227
|
-
|
228
|
-
// 7 https://stand-with-palestine.org/blogs
|
229
|
-
// https://mondoweiss.net
|
230
|
-
}()
|
300
|
+
}()
|
package/main.js
CHANGED
@@ -22,6 +22,7 @@ class WebScraper
|
|
22
22
|
exactExcludeList = [],
|
23
23
|
filterFileTypes,
|
24
24
|
excludedFileTypes,
|
25
|
+
removeURLFragment,
|
25
26
|
|
26
27
|
// Output paths
|
27
28
|
scrapResultPath = "./dataset",
|
@@ -72,6 +73,7 @@ class WebScraper
|
|
72
73
|
this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
|
73
74
|
this.filterFileTypes = filterFileTypes || true;
|
74
75
|
this.excludedFileTypes = excludedFileTypes || [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"];
|
76
|
+
this.removeURLFragment = removeURLFragment || true;
|
75
77
|
|
76
78
|
// Network configuration
|
77
79
|
this.axiosHeaders = axiosHeaders;
|
@@ -130,6 +132,10 @@ class WebScraper
|
|
130
132
|
|
131
133
|
async fetchPage ( url, depth )
|
132
134
|
{
|
135
|
+
if ( this.removeURLFragment )
|
136
|
+
{
|
137
|
+
url = url.split( "#" )[0];
|
138
|
+
}
|
133
139
|
if ( this.hasReachedMax( depth ) )
|
134
140
|
{
|
135
141
|
return;
|
@@ -161,7 +167,6 @@ class WebScraper
|
|
161
167
|
if ( this.hasValidPageContent( article.textContent ) )
|
162
168
|
{
|
163
169
|
const metadata = this.extractMetadata( url, document );
|
164
|
-
metadata.depth = depth;
|
165
170
|
this.saveArticle( url, article.textContent, metadata );
|
166
171
|
}
|
167
172
|
else
|
@@ -234,7 +239,7 @@ class WebScraper
|
|
234
239
|
}
|
235
240
|
catch ( error )
|
236
241
|
{
|
237
|
-
console.error( `Error fetching ${url}:`, error.message );
|
242
|
+
console.error( `Error fetching content ${url}:`, error.message );
|
238
243
|
if ( error.status = 403 && this.usePuppeteer )
|
239
244
|
{
|
240
245
|
try
|
@@ -262,7 +267,7 @@ class WebScraper
|
|
262
267
|
}
|
263
268
|
}
|
264
269
|
|
265
|
-
hasReachedMax ( depth )
|
270
|
+
hasReachedMax ( depth = 0 )
|
266
271
|
{
|
267
272
|
if ( this.allProcessedContent.length >= this.maxArticles || depth > this.maxDepth )
|
268
273
|
{
|
@@ -555,7 +560,7 @@ class WebScraper
|
|
555
560
|
ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
|
556
561
|
ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
|
557
562
|
ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
|
558
|
-
|
563
|
+
dateScrapedDate: new Date().toISOString()
|
559
564
|
};
|
560
565
|
}
|
561
566
|
|
@@ -572,7 +577,7 @@ class WebScraper
|
|
572
577
|
{
|
573
578
|
try
|
574
579
|
{
|
575
|
-
if ( this.hasReachedMax(
|
580
|
+
if ( this.hasReachedMax( ) )
|
576
581
|
{
|
577
582
|
throw new Error( "Max reached" );
|
578
583
|
}
|