clean-web-scraper 4.2.3 → 4.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/example-usage.js +64 -43
- package/main.js +4 -3
- package/package.json +3 -3
package/README.md
CHANGED
@@ -91,7 +91,7 @@ const docsScraper = new WebScraper({
|
|
91
91
|
scrapResultPath: './datasets/docs',
|
92
92
|
maxDepth: 3, // Optional: Maximum depth for recursive crawling
|
93
93
|
includeMetadata: true, // Optional: Include metadata in output files
|
94
|
-
metadataFields: ["author", "articleTitle", "pageTitle", "description", "
|
94
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate"],
|
95
95
|
// Optional: Specify metadata fields to include
|
96
96
|
});
|
97
97
|
|
@@ -101,7 +101,7 @@ const blogScraper = new WebScraper({
|
|
101
101
|
scrapResultPath: './datasets/blog',
|
102
102
|
maxDepth: 3, // Optional: Maximum depth for recursive crawling
|
103
103
|
includeMetadata: true, // Optional: Include metadata in output files
|
104
|
-
metadataFields: ["author", "articleTitle", "pageTitle", "description", "
|
104
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate"],
|
105
105
|
// Optional: Specify metadata fields to include
|
106
106
|
});
|
107
107
|
|
package/example-usage.js
CHANGED
@@ -37,9 +37,8 @@ async function palianswers ( enable )
|
|
37
37
|
textOutputPath: "./dataset/palianswers/texts",
|
38
38
|
csvOutputPath: "./dataset/palianswers/train.csv",
|
39
39
|
includeMetadata: true,
|
40
|
-
metadataFields: ["author", "articleTitle", "pageTitle", "description", "
|
40
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
|
41
41
|
axiosRetryDelay: 10000,
|
42
|
-
crawlingDelay: 0
|
43
42
|
};
|
44
43
|
return await runScraper( config, enable );
|
45
44
|
}
|
@@ -50,7 +49,7 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
50
49
|
baseURL: "https://english.khamenei.ir/news",
|
51
50
|
startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
|
52
51
|
maxDepth: 1,
|
53
|
-
maxArticles:
|
52
|
+
maxArticles: 200,
|
54
53
|
exactExcludeList: [
|
55
54
|
"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
|
56
55
|
"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100"
|
@@ -60,7 +59,7 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
60
59
|
textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
|
61
60
|
csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
|
62
61
|
includeMetadata: true,
|
63
|
-
metadataFields: ["author", "articleTitle", "pageTitle", "description", "
|
62
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
|
64
63
|
axiosRetryDelay: 10000,
|
65
64
|
};
|
66
65
|
return await runScraper( config, enable );
|
@@ -72,8 +71,8 @@ async function khameneiIrPalestineSpecialPage ( enable )
|
|
72
71
|
const config = {
|
73
72
|
baseURL: "https://english.khamenei.ir/news",
|
74
73
|
startURL: "https://english.khamenei.ir/palestine-special-page",
|
75
|
-
maxDepth:
|
76
|
-
maxArticles:
|
74
|
+
maxDepth: 1,
|
75
|
+
maxArticles: 200,
|
77
76
|
exactExcludeList: [
|
78
77
|
"https://english.khamenei.ir/palestine-special-page/"
|
79
78
|
],
|
@@ -82,7 +81,7 @@ async function khameneiIrPalestineSpecialPage ( enable )
|
|
82
81
|
textOutputPath: "./dataset/khamenei-ir-palestine-special-page/texts",
|
83
82
|
csvOutputPath: "./dataset/khamenei-ir-palestine-special-page/train.csv",
|
84
83
|
includeMetadata: true,
|
85
|
-
metadataFields: ["author", "articleTitle", "pageTitle", "description", "
|
84
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
|
86
85
|
axiosRetryDelay: 10000
|
87
86
|
};
|
88
87
|
return await runScraper( config, enable );
|
@@ -103,13 +102,13 @@ async function decolonizepalestine ( enable )
|
|
103
102
|
"https://decolonizepalestine.com/rainbow-washing",
|
104
103
|
"https://decolonizepalestine.com/"
|
105
104
|
],
|
106
|
-
maxArticles:
|
105
|
+
maxArticles: 400,
|
107
106
|
scrapResultPath: "./dataset/decolonizepalestine/website",
|
108
107
|
jsonlOutputPath: "./dataset/decolonizepalestine/train.jsonl",
|
109
108
|
textOutputPath: "./dataset/decolonizepalestine/texts",
|
110
109
|
csvOutputPath: "./dataset/decolonizepalestine/train.csv",
|
111
110
|
includeMetadata: true,
|
112
|
-
metadataFields: ["author", "articleTitle", "pageTitle", "description", "
|
111
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
|
113
112
|
axiosRetryDelay: 10000,
|
114
113
|
};
|
115
114
|
return await runScraper( config, enable );
|
@@ -123,7 +122,6 @@ async function electronicintifada ( enable )
|
|
123
122
|
"https://electronicintifada.net/updates",
|
124
123
|
"https://electronicintifada.net/taxonomy/term/",
|
125
124
|
"https://electronicintifada.net/tags/",
|
126
|
-
"https://electronicintifada.net/blog",
|
127
125
|
"https://electronicintifada.net/people",
|
128
126
|
"https://electronicintifada.net/location",
|
129
127
|
"https://electronicintifada.net/file",
|
@@ -134,11 +132,15 @@ async function electronicintifada ( enable )
|
|
134
132
|
"https://electronicintifada.net/opinion",
|
135
133
|
"https://electronicintifada.net/about-ei",
|
136
134
|
"https://electronicintifada.net/review",
|
137
|
-
"https://electronicintifada.net/artmusicculture"
|
135
|
+
"https://electronicintifada.net/artmusicculture",
|
136
|
+
"https://electronicintifada.net/blog/editors",
|
138
137
|
],
|
139
138
|
exactExcludeList: [
|
140
|
-
"https://electronicintifada.net",
|
141
139
|
"https://electronicintifada.net/blog",
|
140
|
+
/^https:\/\/electronicintifada\.net\/blog\/.*/,
|
141
|
+
/^https:\/\/electronicintifada\.net\/blog\?page=\d+$/,
|
142
|
+
"https://electronicintifada.net",
|
143
|
+
"https://electronicintifada.net/blogs",
|
142
144
|
"https://electronicintifada.net/review",
|
143
145
|
],
|
144
146
|
scrapResultPath: "./dataset/electronicintifada/website",
|
@@ -146,19 +148,19 @@ async function electronicintifada ( enable )
|
|
146
148
|
textOutputPath: "./dataset/electronicintifada/texts",
|
147
149
|
csvOutputPath: "./dataset/electronicintifada/train.csv",
|
148
150
|
includeMetadata: true,
|
149
|
-
metadataFields: ["author", "articleTitle", "pageTitle", "description", "
|
150
|
-
maxDepth: 16,
|
151
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
|
151
152
|
maxArticles: 2000,
|
153
|
+
maxDepth: 16,
|
154
|
+
batchSize: 40,
|
152
155
|
axiosHeaders: headers,
|
153
156
|
axiosMaxRetries: 2,
|
154
|
-
axiosRetryDelay:
|
157
|
+
axiosRetryDelay: 8000,
|
155
158
|
axiosProxy: {
|
156
159
|
host: "localhost",
|
157
160
|
port: 2080,
|
158
161
|
protocol: "http"
|
159
162
|
},
|
160
163
|
useProxyAsFallback: true,
|
161
|
-
crawlingDelay: 1
|
162
164
|
};
|
163
165
|
return await runScraper( config, enable );
|
164
166
|
}
|
@@ -175,7 +177,7 @@ async function standWithPalestine ( enable )
|
|
175
177
|
csvOutputPath: "./dataset/stand-with-palestine/train.csv",
|
176
178
|
axiosHeaders: headers,
|
177
179
|
includeMetadata: true,
|
178
|
-
metadataFields: ["author", "articleTitle", "pageTitle", "description", "
|
180
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"]
|
179
181
|
};
|
180
182
|
return await runScraper( config, enable );
|
181
183
|
}
|
@@ -211,22 +213,26 @@ async function mondoweiss ( enable )
|
|
211
213
|
"https://mondoweiss.net/activism/",
|
212
214
|
"https://mondoweiss.net/news-letters/",
|
213
215
|
"https://mondoweiss.net/newsletters",
|
214
|
-
/^https:\/\/mondoweiss\.net\/\d{4}\/\d{2}$/,
|
215
|
-
/^https:\/\/mondoweiss\.net\/\d{4}\/$/,
|
216
216
|
"https://mondoweiss.net/daily-headlines",
|
217
217
|
"https://mondoweiss.net/palestineletter",
|
218
218
|
"https://mondoweiss.net/podcasts/",
|
219
219
|
"https://mondoweiss.net/the-shift",
|
220
|
-
"https://mondoweiss.net/weekly-briefing"
|
220
|
+
"https://mondoweiss.net/weekly-briefing",
|
221
|
+
"https://mondoweiss.net/contact/",
|
222
|
+
/^https:\/\/mondoweiss\.net\/\d{4}\/\d{2}\/?$/,
|
223
|
+
/^https:\/\/mondoweiss\.net\/\d{4}\/?$/
|
221
224
|
],
|
222
225
|
scrapResultPath: "./dataset/mondoweiss/website",
|
223
226
|
jsonlOutputPath: "./dataset/mondoweiss/train.jsonl",
|
224
227
|
textOutputPath: "./dataset/mondoweiss/texts",
|
225
228
|
csvOutputPath: "./dataset/mondoweiss/train.csv",
|
229
|
+
includeMetadata: true,
|
230
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
|
226
231
|
maxArticles: 2500,
|
227
232
|
maxDepth: 15,
|
233
|
+
batchSize: 20,
|
228
234
|
axiosHeaders: headers,
|
229
|
-
axiosMaxRetries:
|
235
|
+
axiosMaxRetries: 2,
|
230
236
|
axiosRetryDelay: 10000,
|
231
237
|
axiosProxy: {
|
232
238
|
host: "localhost",
|
@@ -234,10 +240,6 @@ async function mondoweiss ( enable )
|
|
234
240
|
protocol: "http"
|
235
241
|
},
|
236
242
|
useProxyAsFallback: true,
|
237
|
-
includeMetadata: true,
|
238
|
-
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
|
239
|
-
crawlingDelay: 0,
|
240
|
-
batchSize: 20
|
241
243
|
};
|
242
244
|
return await runScraper( config, enable );
|
243
245
|
}
|
@@ -251,19 +253,36 @@ async function bdsmovement ( enable )
|
|
251
253
|
"https://bdsmovement.net/privacy-policy",
|
252
254
|
"https://bdsmovement.net/get-involved/join-a-bds-campaign",
|
253
255
|
"https://bdsmovement.net/donate_",
|
256
|
+
"https://bdsmovement.net/donate",
|
254
257
|
"https://bdsmovement.net/user",
|
255
|
-
"https://bdsmovement.net/admin"
|
258
|
+
"https://bdsmovement.net/admin",
|
259
|
+
"https://bdsmovement.net/stay-updated",
|
260
|
+
"https://bdsmovement.net/join-a-bds-campaign",
|
261
|
+
"https://bdsmovement.net/contact-us",
|
262
|
+
"https://bdsmovement.net/taxonomy",
|
263
|
+
"https://bdsmovement.net/news-type",
|
264
|
+
"https://bdsmovement.net/cdn-cgi"
|
265
|
+
],
|
266
|
+
exactExcludeList: [
|
267
|
+
"https://bdsmovement.net/",
|
268
|
+
"https://bdsmovement.net/shutdownnation",
|
269
|
+
"https://bdsmovement.net/campaigns",
|
270
|
+
"https://bdsmovement.net/resources",
|
271
|
+
/^https:\/\/bdsmovement\.net\/resources\?page=\d+$/,
|
272
|
+
/^https:\/\/bdsmovement\.net\/resources\?campaign=\d+$/,
|
273
|
+
/^https:\/\/bdsmovement\.net\/resources\?type=\d+$/,
|
274
|
+
/^https:\/\/bdsmovement\.net\/news\?type=\d+$/,
|
275
|
+
/^https:\/\/bdsmovement\.net\/news\?campaign=\d+$/,
|
256
276
|
],
|
257
277
|
scrapResultPath: "./dataset/bdsmovement/website",
|
258
278
|
jsonlOutputPath: "./dataset/bdsmovement/train.jsonl",
|
259
279
|
textOutputPath: "./dataset/bdsmovement/texts",
|
260
280
|
csvOutputPath: "./dataset/bdsmovement/train.csv",
|
261
281
|
includeMetadata: true,
|
262
|
-
metadataFields: ["author", "articleTitle", "pageTitle", "description", "
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
},
|
282
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
|
283
|
+
maxArticles: 2000,
|
284
|
+
maxDepth: 16,
|
285
|
+
batchSize: 20
|
267
286
|
};
|
268
287
|
return await runScraper( config, enable );
|
269
288
|
}
|
@@ -297,7 +316,8 @@ async function palestineremembered ( enable )
|
|
297
316
|
textOutputPath: "./dataset/palestineremembered/texts",
|
298
317
|
csvOutputPath: "./dataset/palestineremembered/train.csv",
|
299
318
|
includeMetadata: true,
|
300
|
-
metadataFields: ["author", "articleTitle", "pageTitle", "description", "
|
319
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
|
320
|
+
batchSize: 10,
|
301
321
|
axiosProxy: {
|
302
322
|
host: "localhost",
|
303
323
|
port: 2080,
|
@@ -309,23 +329,24 @@ async function palestineremembered ( enable )
|
|
309
329
|
|
310
330
|
void async function main ()
|
311
331
|
{
|
312
|
-
|
332
|
+
const palianswersScraper = await palianswers( true );
|
313
333
|
const decolonizepalestineScraper = await decolonizepalestine( true );
|
314
334
|
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( true );
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
335
|
+
const khameneiIrPalestineSpecialPageScraper = await khameneiIrPalestineSpecialPage( true );
|
336
|
+
const electronicintifadaScraper = await electronicintifada( true );
|
337
|
+
const standWithPalestineScraper = await standWithPalestine( true );
|
338
|
+
const mondoweisScraper = await mondoweiss( true );
|
339
|
+
const bdsmovementScraper = await bdsmovement( true );
|
320
340
|
// const palestinerememberedScraper = await palestineremembered( false );
|
321
341
|
|
322
342
|
await WebScraper.combineResults( "./dataset/combined", [
|
323
|
-
|
343
|
+
palianswersScraper,
|
324
344
|
decolonizepalestineScraper,
|
325
345
|
khameneiIrFreePalestineTagScraper,
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
346
|
+
khameneiIrPalestineSpecialPageScraper,
|
347
|
+
electronicintifadaScraper,
|
348
|
+
standWithPalestineScraper,
|
349
|
+
mondoweisScraper,
|
350
|
+
bdsmovementScraper,
|
330
351
|
] );
|
331
352
|
}();
|
package/main.js
CHANGED
@@ -18,6 +18,7 @@ class WebScraper
|
|
18
18
|
this.maxArticles = config.maxArticles || Infinity;
|
19
19
|
this.crawlingDelay = config.crawlingDelay ?? 1000;
|
20
20
|
this.batchSize = config.batchSize || 5;
|
21
|
+
this.minContentLength = config.minContentLength || 400;
|
21
22
|
|
22
23
|
// Output paths setup
|
23
24
|
this.scrapResultPath = config.scrapResultPath || "./dataset";
|
@@ -469,7 +470,7 @@ class WebScraper
|
|
469
470
|
let processed = content;
|
470
471
|
// Remove unwanted fixed text
|
471
472
|
processed = processed.replace( /\[You can read more about this here\]/g, "" ).trim();
|
472
|
-
|
473
|
+
processed = processed.replace( /Click on the image to view the large size/g, "" );
|
473
474
|
processed = processed
|
474
475
|
.split( "\n" )
|
475
476
|
.map( line => { return line.trim() })
|
@@ -525,7 +526,7 @@ class WebScraper
|
|
525
526
|
ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
|
526
527
|
ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
|
527
528
|
ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
|
528
|
-
|
529
|
+
dataScrapedDate: new Date().toISOString(),
|
529
530
|
originalHtml: html,
|
530
531
|
};
|
531
532
|
}
|
@@ -663,7 +664,7 @@ class WebScraper
|
|
663
664
|
|
664
665
|
const hasInvalidPhrases = invalidPhrases.some( phrase => { return cleanContent.includes( phrase ) });
|
665
666
|
// Check content length
|
666
|
-
if ( cleanContent.length <
|
667
|
+
if ( cleanContent.length < this.minContentLength || hasInvalidPhrases )
|
667
668
|
{
|
668
669
|
return false;
|
669
670
|
}
|
package/package.json
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
{
|
2
2
|
"name": "clean-web-scraper",
|
3
|
-
"version": "4.
|
3
|
+
"version": "4.3.1",
|
4
4
|
"main": "main.js",
|
5
5
|
"scripts": {
|
6
6
|
"start": "node main.js",
|
@@ -24,8 +24,8 @@
|
|
24
24
|
"description": "",
|
25
25
|
"dependencies": {
|
26
26
|
"@mozilla/readability": "^0.6.0",
|
27
|
-
"axios": "^1.8.
|
28
|
-
"eslint": "^9.
|
27
|
+
"axios": "^1.8.4",
|
28
|
+
"eslint": "^9.23.0",
|
29
29
|
"jsdom": "^26.0.0",
|
30
30
|
"puppeteer": "^24.1.1",
|
31
31
|
"puppeteer-real-browser": "^1.3.22"
|