clean-web-scraper 4.2.2 → 4.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/example-usage.js +61 -42
- package/main.js +4 -3
- package/package.json +4 -4
package/README.md
CHANGED
@@ -91,7 +91,7 @@ const docsScraper = new WebScraper({
|
|
91
91
|
scrapResultPath: './datasets/docs',
|
92
92
|
maxDepth: 3, // Optional: Maximum depth for recursive crawling
|
93
93
|
includeMetadata: true, // Optional: Include metadata in output files
|
94
|
-
metadataFields: ["author", "articleTitle", "pageTitle", "description", "
|
94
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate"],
|
95
95
|
// Optional: Specify metadata fields to include
|
96
96
|
});
|
97
97
|
|
@@ -101,7 +101,7 @@ const blogScraper = new WebScraper({
|
|
101
101
|
scrapResultPath: './datasets/blog',
|
102
102
|
maxDepth: 3, // Optional: Maximum depth for recursive crawling
|
103
103
|
includeMetadata: true, // Optional: Include metadata in output files
|
104
|
-
metadataFields: ["author", "articleTitle", "pageTitle", "description", "
|
104
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate"],
|
105
105
|
// Optional: Specify metadata fields to include
|
106
106
|
});
|
107
107
|
|
package/example-usage.js
CHANGED
@@ -37,9 +37,8 @@ async function palianswers ( enable )
|
|
37
37
|
textOutputPath: "./dataset/palianswers/texts",
|
38
38
|
csvOutputPath: "./dataset/palianswers/train.csv",
|
39
39
|
includeMetadata: true,
|
40
|
-
metadataFields: ["author", "articleTitle", "pageTitle", "description", "
|
40
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
|
41
41
|
axiosRetryDelay: 10000,
|
42
|
-
crawlingDelay: 0
|
43
42
|
};
|
44
43
|
return await runScraper( config, enable );
|
45
44
|
}
|
@@ -50,7 +49,7 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
50
49
|
baseURL: "https://english.khamenei.ir/news",
|
51
50
|
startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
|
52
51
|
maxDepth: 1,
|
53
|
-
maxArticles:
|
52
|
+
maxArticles: 200,
|
54
53
|
exactExcludeList: [
|
55
54
|
"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
|
56
55
|
"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100"
|
@@ -60,7 +59,7 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
60
59
|
textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
|
61
60
|
csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
|
62
61
|
includeMetadata: true,
|
63
|
-
metadataFields: ["author", "articleTitle", "pageTitle", "description", "
|
62
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
|
64
63
|
axiosRetryDelay: 10000,
|
65
64
|
};
|
66
65
|
return await runScraper( config, enable );
|
@@ -72,8 +71,8 @@ async function khameneiIrPalestineSpecialPage ( enable )
|
|
72
71
|
const config = {
|
73
72
|
baseURL: "https://english.khamenei.ir/news",
|
74
73
|
startURL: "https://english.khamenei.ir/palestine-special-page",
|
75
|
-
maxDepth:
|
76
|
-
maxArticles:
|
74
|
+
maxDepth: 1,
|
75
|
+
maxArticles: 200,
|
77
76
|
exactExcludeList: [
|
78
77
|
"https://english.khamenei.ir/palestine-special-page/"
|
79
78
|
],
|
@@ -82,7 +81,7 @@ async function khameneiIrPalestineSpecialPage ( enable )
|
|
82
81
|
textOutputPath: "./dataset/khamenei-ir-palestine-special-page/texts",
|
83
82
|
csvOutputPath: "./dataset/khamenei-ir-palestine-special-page/train.csv",
|
84
83
|
includeMetadata: true,
|
85
|
-
metadataFields: ["author", "articleTitle", "pageTitle", "description", "
|
84
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
|
86
85
|
axiosRetryDelay: 10000
|
87
86
|
};
|
88
87
|
return await runScraper( config, enable );
|
@@ -103,13 +102,13 @@ async function decolonizepalestine ( enable )
|
|
103
102
|
"https://decolonizepalestine.com/rainbow-washing",
|
104
103
|
"https://decolonizepalestine.com/"
|
105
104
|
],
|
106
|
-
maxArticles:
|
105
|
+
maxArticles: 400,
|
107
106
|
scrapResultPath: "./dataset/decolonizepalestine/website",
|
108
107
|
jsonlOutputPath: "./dataset/decolonizepalestine/train.jsonl",
|
109
108
|
textOutputPath: "./dataset/decolonizepalestine/texts",
|
110
109
|
csvOutputPath: "./dataset/decolonizepalestine/train.csv",
|
111
110
|
includeMetadata: true,
|
112
|
-
metadataFields: ["author", "articleTitle", "pageTitle", "description", "
|
111
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
|
113
112
|
axiosRetryDelay: 10000,
|
114
113
|
};
|
115
114
|
return await runScraper( config, enable );
|
@@ -123,7 +122,6 @@ async function electronicintifada ( enable )
|
|
123
122
|
"https://electronicintifada.net/updates",
|
124
123
|
"https://electronicintifada.net/taxonomy/term/",
|
125
124
|
"https://electronicintifada.net/tags/",
|
126
|
-
"https://electronicintifada.net/blog",
|
127
125
|
"https://electronicintifada.net/people",
|
128
126
|
"https://electronicintifada.net/location",
|
129
127
|
"https://electronicintifada.net/file",
|
@@ -134,11 +132,13 @@ async function electronicintifada ( enable )
|
|
134
132
|
"https://electronicintifada.net/opinion",
|
135
133
|
"https://electronicintifada.net/about-ei",
|
136
134
|
"https://electronicintifada.net/review",
|
137
|
-
"https://electronicintifada.net/artmusicculture"
|
135
|
+
"https://electronicintifada.net/artmusicculture",
|
136
|
+
"https://electronicintifada.net/blog/editors",
|
137
|
+
"https://electronicintifada.net/blog"
|
138
138
|
],
|
139
139
|
exactExcludeList: [
|
140
140
|
"https://electronicintifada.net",
|
141
|
-
"https://electronicintifada.net/
|
141
|
+
"https://electronicintifada.net/blogs",
|
142
142
|
"https://electronicintifada.net/review",
|
143
143
|
],
|
144
144
|
scrapResultPath: "./dataset/electronicintifada/website",
|
@@ -146,11 +146,12 @@ async function electronicintifada ( enable )
|
|
146
146
|
textOutputPath: "./dataset/electronicintifada/texts",
|
147
147
|
csvOutputPath: "./dataset/electronicintifada/train.csv",
|
148
148
|
includeMetadata: true,
|
149
|
-
metadataFields: ["author", "articleTitle", "pageTitle", "description", "
|
150
|
-
maxDepth: 16,
|
149
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
|
151
150
|
maxArticles: 2000,
|
151
|
+
maxDepth: 16,
|
152
|
+
batchSize: 30,
|
152
153
|
axiosHeaders: headers,
|
153
|
-
axiosMaxRetries:
|
154
|
+
axiosMaxRetries: 3,
|
154
155
|
axiosRetryDelay: 10000,
|
155
156
|
axiosProxy: {
|
156
157
|
host: "localhost",
|
@@ -158,7 +159,6 @@ async function electronicintifada ( enable )
|
|
158
159
|
protocol: "http"
|
159
160
|
},
|
160
161
|
useProxyAsFallback: true,
|
161
|
-
crawlingDelay: 1
|
162
162
|
};
|
163
163
|
return await runScraper( config, enable );
|
164
164
|
}
|
@@ -175,7 +175,7 @@ async function standWithPalestine ( enable )
|
|
175
175
|
csvOutputPath: "./dataset/stand-with-palestine/train.csv",
|
176
176
|
axiosHeaders: headers,
|
177
177
|
includeMetadata: true,
|
178
|
-
metadataFields: ["author", "articleTitle", "pageTitle", "description", "
|
178
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"]
|
179
179
|
};
|
180
180
|
return await runScraper( config, enable );
|
181
181
|
}
|
@@ -211,20 +211,24 @@ async function mondoweiss ( enable )
|
|
211
211
|
"https://mondoweiss.net/activism/",
|
212
212
|
"https://mondoweiss.net/news-letters/",
|
213
213
|
"https://mondoweiss.net/newsletters",
|
214
|
-
/^https:\/\/mondoweiss\.net\/\d{4}\/\d{2}$/,
|
215
|
-
/^https:\/\/mondoweiss\.net\/\d{4}\/$/,
|
216
214
|
"https://mondoweiss.net/daily-headlines",
|
217
215
|
"https://mondoweiss.net/palestineletter",
|
218
216
|
"https://mondoweiss.net/podcasts/",
|
219
217
|
"https://mondoweiss.net/the-shift",
|
220
|
-
"https://mondoweiss.net/weekly-briefing"
|
218
|
+
"https://mondoweiss.net/weekly-briefing",
|
219
|
+
"https://mondoweiss.net/contact/",
|
220
|
+
/^https:\/\/mondoweiss\.net\/\d{4}\/\d{2}\/?$/,
|
221
|
+
/^https:\/\/mondoweiss\.net\/\d{4}\/?$/
|
221
222
|
],
|
222
223
|
scrapResultPath: "./dataset/mondoweiss/website",
|
223
224
|
jsonlOutputPath: "./dataset/mondoweiss/train.jsonl",
|
224
225
|
textOutputPath: "./dataset/mondoweiss/texts",
|
225
226
|
csvOutputPath: "./dataset/mondoweiss/train.csv",
|
227
|
+
includeMetadata: true,
|
228
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
|
226
229
|
maxArticles: 2500,
|
227
230
|
maxDepth: 15,
|
231
|
+
batchSize: 20,
|
228
232
|
axiosHeaders: headers,
|
229
233
|
axiosMaxRetries: 3,
|
230
234
|
axiosRetryDelay: 10000,
|
@@ -234,10 +238,6 @@ async function mondoweiss ( enable )
|
|
234
238
|
protocol: "http"
|
235
239
|
},
|
236
240
|
useProxyAsFallback: true,
|
237
|
-
includeMetadata: true,
|
238
|
-
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
|
239
|
-
crawlingDelay: 0,
|
240
|
-
batchSize: 20
|
241
241
|
};
|
242
242
|
return await runScraper( config, enable );
|
243
243
|
}
|
@@ -251,19 +251,36 @@ async function bdsmovement ( enable )
|
|
251
251
|
"https://bdsmovement.net/privacy-policy",
|
252
252
|
"https://bdsmovement.net/get-involved/join-a-bds-campaign",
|
253
253
|
"https://bdsmovement.net/donate_",
|
254
|
+
"https://bdsmovement.net/donate",
|
254
255
|
"https://bdsmovement.net/user",
|
255
|
-
"https://bdsmovement.net/admin"
|
256
|
+
"https://bdsmovement.net/admin",
|
257
|
+
"https://bdsmovement.net/stay-updated",
|
258
|
+
"https://bdsmovement.net/join-a-bds-campaign",
|
259
|
+
"https://bdsmovement.net/contact-us",
|
260
|
+
"https://bdsmovement.net/taxonomy",
|
261
|
+
"https://bdsmovement.net/news-type",
|
262
|
+
"https://bdsmovement.net/cdn-cgi"
|
263
|
+
],
|
264
|
+
exactExcludeList: [
|
265
|
+
"https://bdsmovement.net/",
|
266
|
+
"https://bdsmovement.net/shutdownnation",
|
267
|
+
"https://bdsmovement.net/campaigns",
|
268
|
+
"https://bdsmovement.net/resources",
|
269
|
+
/^https:\/\/bdsmovement\.net\/resources\?page=\d+$/,
|
270
|
+
/^https:\/\/bdsmovement\.net\/resources\?campaign=\d+$/,
|
271
|
+
/^https:\/\/bdsmovement\.net\/resources\?type=\d+$/,
|
272
|
+
/^https:\/\/bdsmovement\.net\/news\?type=\d+$/,
|
273
|
+
/^https:\/\/bdsmovement\.net\/news\?campaign=\d+$/,
|
256
274
|
],
|
257
275
|
scrapResultPath: "./dataset/bdsmovement/website",
|
258
276
|
jsonlOutputPath: "./dataset/bdsmovement/train.jsonl",
|
259
277
|
textOutputPath: "./dataset/bdsmovement/texts",
|
260
278
|
csvOutputPath: "./dataset/bdsmovement/train.csv",
|
261
279
|
includeMetadata: true,
|
262
|
-
metadataFields: ["author", "articleTitle", "pageTitle", "description", "
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
},
|
280
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
|
281
|
+
maxArticles: 2000,
|
282
|
+
maxDepth: 16,
|
283
|
+
batchSize: 20
|
267
284
|
};
|
268
285
|
return await runScraper( config, enable );
|
269
286
|
}
|
@@ -297,7 +314,8 @@ async function palestineremembered ( enable )
|
|
297
314
|
textOutputPath: "./dataset/palestineremembered/texts",
|
298
315
|
csvOutputPath: "./dataset/palestineremembered/train.csv",
|
299
316
|
includeMetadata: true,
|
300
|
-
metadataFields: ["author", "articleTitle", "pageTitle", "description", "
|
317
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
|
318
|
+
batchSize: 10,
|
301
319
|
axiosProxy: {
|
302
320
|
host: "localhost",
|
303
321
|
port: 2080,
|
@@ -309,23 +327,24 @@ async function palestineremembered ( enable )
|
|
309
327
|
|
310
328
|
void async function main ()
|
311
329
|
{
|
312
|
-
|
330
|
+
const palianswersScraper = await palianswers( true );
|
313
331
|
const decolonizepalestineScraper = await decolonizepalestine( true );
|
314
332
|
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( true );
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
333
|
+
const khameneiIrPalestineSpecialPageScraper = await khameneiIrPalestineSpecialPage( true );
|
334
|
+
const electronicintifadaScraper = await electronicintifada( true );
|
335
|
+
const standWithPalestineScraper = await standWithPalestine( true );
|
336
|
+
const mondoweisScraper = await mondoweiss( true );
|
337
|
+
const bdsmovementScraper = await bdsmovement( true );
|
320
338
|
// const palestinerememberedScraper = await palestineremembered( false );
|
321
339
|
|
322
340
|
await WebScraper.combineResults( "./dataset/combined", [
|
323
|
-
|
341
|
+
palianswersScraper,
|
324
342
|
decolonizepalestineScraper,
|
325
343
|
khameneiIrFreePalestineTagScraper,
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
344
|
+
khameneiIrPalestineSpecialPageScraper,
|
345
|
+
electronicintifadaScraper,
|
346
|
+
standWithPalestineScraper,
|
347
|
+
mondoweisScraper,
|
348
|
+
bdsmovementScraper,
|
330
349
|
] );
|
331
350
|
}();
|
package/main.js
CHANGED
@@ -18,6 +18,7 @@ class WebScraper
|
|
18
18
|
this.maxArticles = config.maxArticles || Infinity;
|
19
19
|
this.crawlingDelay = config.crawlingDelay ?? 1000;
|
20
20
|
this.batchSize = config.batchSize || 5;
|
21
|
+
this.minContentLength = config.minContentLength || 400;
|
21
22
|
|
22
23
|
// Output paths setup
|
23
24
|
this.scrapResultPath = config.scrapResultPath || "./dataset";
|
@@ -469,7 +470,7 @@ class WebScraper
|
|
469
470
|
let processed = content;
|
470
471
|
// Remove unwanted fixed text
|
471
472
|
processed = processed.replace( /\[You can read more about this here\]/g, "" ).trim();
|
472
|
-
|
473
|
+
processed = processed.replace( /Click on the image to view the large size/g, "" );
|
473
474
|
processed = processed
|
474
475
|
.split( "\n" )
|
475
476
|
.map( line => { return line.trim() })
|
@@ -525,7 +526,7 @@ class WebScraper
|
|
525
526
|
ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
|
526
527
|
ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
|
527
528
|
ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
|
528
|
-
|
529
|
+
dataScrapedDate: new Date().toISOString(),
|
529
530
|
originalHtml: html,
|
530
531
|
};
|
531
532
|
}
|
@@ -663,7 +664,7 @@ class WebScraper
|
|
663
664
|
|
664
665
|
const hasInvalidPhrases = invalidPhrases.some( phrase => { return cleanContent.includes( phrase ) });
|
665
666
|
// Check content length
|
666
|
-
if ( cleanContent.length <
|
667
|
+
if ( cleanContent.length < this.minContentLength || hasInvalidPhrases )
|
667
668
|
{
|
668
669
|
return false;
|
669
670
|
}
|
package/package.json
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
{
|
2
2
|
"name": "clean-web-scraper",
|
3
|
-
"version": "4.
|
3
|
+
"version": "4.3.0",
|
4
4
|
"main": "main.js",
|
5
5
|
"scripts": {
|
6
6
|
"start": "node main.js",
|
@@ -24,10 +24,10 @@
|
|
24
24
|
"description": "",
|
25
25
|
"dependencies": {
|
26
26
|
"@mozilla/readability": "^0.6.0",
|
27
|
-
"axios": "^1.
|
28
|
-
"eslint": "^9.
|
27
|
+
"axios": "^1.8.4",
|
28
|
+
"eslint": "^9.23.0",
|
29
29
|
"jsdom": "^26.0.0",
|
30
30
|
"puppeteer": "^24.1.1",
|
31
31
|
"puppeteer-real-browser": "^1.3.22"
|
32
32
|
}
|
33
|
-
}
|
33
|
+
}
|