clean-web-scraper 3.7.6 → 3.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/example-usage.js +67 -13
- package/main.js +3 -4
- package/package.json +1 -1
package/example-usage.js
CHANGED
@@ -10,6 +10,7 @@ const headers = {
|
|
10
10
|
|
11
11
|
async function palianswers ( enable )
|
12
12
|
{
|
13
|
+
// https://palianswers.com
|
13
14
|
const scraper = new WebScraper({
|
14
15
|
baseURL: "https://palianswers.com",
|
15
16
|
excludeList: [
|
@@ -28,7 +29,7 @@ async function palianswers ( enable )
|
|
28
29
|
textOutputPath: "./dataset/palianswers/texts",
|
29
30
|
csvOutputPath: "./dataset/palianswers/train.csv",
|
30
31
|
includeMetadata: true,
|
31
|
-
metadataFields: ["author", "title", "description"]
|
32
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"]
|
32
33
|
});
|
33
34
|
if ( enable )
|
34
35
|
{
|
@@ -44,7 +45,7 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
44
45
|
const scraper = new WebScraper({
|
45
46
|
baseURL: "https://english.khamenei.ir/news",
|
46
47
|
startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
|
47
|
-
maxDepth:
|
48
|
+
maxDepth: 1,
|
48
49
|
exactExcludeList: [
|
49
50
|
"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#"
|
50
51
|
],
|
@@ -53,7 +54,7 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
53
54
|
textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
|
54
55
|
csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
|
55
56
|
includeMetadata: true,
|
56
|
-
metadataFields: ["author", "title", "description"]
|
57
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"]
|
57
58
|
});
|
58
59
|
if ( enable )
|
59
60
|
{
|
@@ -64,6 +65,7 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
64
65
|
|
65
66
|
async function decolonizepalestine ( enable )
|
66
67
|
{
|
68
|
+
// https://decolonizepalestine.com
|
67
69
|
const scraper = new WebScraper({
|
68
70
|
baseURL: "https://decolonizepalestine.com",
|
69
71
|
excludeList: [
|
@@ -82,7 +84,7 @@ async function decolonizepalestine ( enable )
|
|
82
84
|
textOutputPath: "./dataset/decolonizepalestine/texts",
|
83
85
|
csvOutputPath: "./dataset/decolonizepalestine/train.csv",
|
84
86
|
includeMetadata: true,
|
85
|
-
metadataFields: ["author", "title", "description"]
|
87
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"]
|
86
88
|
});
|
87
89
|
if ( enable )
|
88
90
|
{
|
@@ -93,6 +95,7 @@ async function decolonizepalestine ( enable )
|
|
93
95
|
|
94
96
|
async function bdsmovement ( enable )
|
95
97
|
{
|
98
|
+
// https://bdsmovement.net
|
96
99
|
const scraper = new WebScraper({
|
97
100
|
baseURL: "https://bdsmovement.net",
|
98
101
|
excludeList: [
|
@@ -108,7 +111,7 @@ async function bdsmovement ( enable )
|
|
108
111
|
textOutputPath: "./dataset/bdsmovement/texts",
|
109
112
|
csvOutputPath: "./dataset/bdsmovement/train.csv",
|
110
113
|
includeMetadata: true,
|
111
|
-
metadataFields: ["author", "title", "description"],
|
114
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
112
115
|
puppeteerProxy: "socks5://127.0.0.1:2080",
|
113
116
|
puppeteerExecutablePath: "/usr/bin/chromium",
|
114
117
|
puppeteerRealProxy: {
|
@@ -125,6 +128,7 @@ async function bdsmovement ( enable )
|
|
125
128
|
|
126
129
|
async function electronicintifada ( enable )
|
127
130
|
{
|
131
|
+
// https://electronicintifada.net
|
128
132
|
const scraper = new WebScraper({
|
129
133
|
baseURL: "https://electronicintifada.net",
|
130
134
|
excludeList: [
|
@@ -153,7 +157,7 @@ async function electronicintifada ( enable )
|
|
153
157
|
includeMetadata: true,
|
154
158
|
maxArticles: 2000,
|
155
159
|
axiosHeaders: headers,
|
156
|
-
metadataFields: ["author", "title", "description"]
|
160
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"]
|
157
161
|
});
|
158
162
|
if ( enable )
|
159
163
|
{
|
@@ -192,7 +196,7 @@ async function palestineremembered ( enable )
|
|
192
196
|
textOutputPath: "./dataset/palestineremembered/texts",
|
193
197
|
csvOutputPath: "./dataset/palestineremembered/train.csv",
|
194
198
|
includeMetadata: true,
|
195
|
-
metadataFields: ["author", "title", "description"],
|
199
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
196
200
|
axiosProxy: {
|
197
201
|
host: "localhost",
|
198
202
|
port: 2080,
|
@@ -206,13 +210,63 @@ async function palestineremembered ( enable )
|
|
206
210
|
return scraper;
|
207
211
|
}
|
208
212
|
|
213
|
+
async function standWithPalestine ( enable )
|
214
|
+
{
|
215
|
+
const scraper = new WebScraper({
|
216
|
+
baseURL: "https://stand-with-palestine.org/blogs",
|
217
|
+
startURL: "https://stand-with-palestine.org/blogs",
|
218
|
+
scrapResultPath: "./dataset/stand-with-palestine/website",
|
219
|
+
jsonlOutputPath: "./dataset/stand-with-palestine/train.jsonl",
|
220
|
+
textOutputPath: "./dataset/stand-with-palestine/texts",
|
221
|
+
csvOutputPath: "./dataset/stand-with-palestine/train.csv",
|
222
|
+
exactExcludeList: ["https://stand-with-palestine.org/blogs"],
|
223
|
+
axiosHeaders: headers,
|
224
|
+
includeMetadata: true,
|
225
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"]
|
226
|
+
});
|
227
|
+
if ( enable )
|
228
|
+
{
|
229
|
+
await scraper.start();
|
230
|
+
}
|
231
|
+
return scraper;
|
232
|
+
}
|
233
|
+
|
234
|
+
async function mondoweiss ( enable )
|
235
|
+
{
|
236
|
+
// https://mondoweiss.net
|
237
|
+
const scraper = new WebScraper({
|
238
|
+
baseURL: "https://mondoweiss.net",
|
239
|
+
excludeList: [
|
240
|
+
"https://mondoweiss.net/donate",
|
241
|
+
"https://mondoweiss.net/advertise/",
|
242
|
+
"https://mondoweiss.net/contact/",
|
243
|
+
"https://mondoweiss.net/recent-comments/"
|
244
|
+
],
|
245
|
+
scrapResultPath: "./dataset/mondoweiss/website",
|
246
|
+
jsonlOutputPath: "./dataset/mondoweiss/train.jsonl",
|
247
|
+
textOutputPath: "./dataset/mondoweiss/texts",
|
248
|
+
csvOutputPath: "./dataset/mondoweiss/train.csv",
|
249
|
+
includeMetadata: true,
|
250
|
+
maxArticles: 2500,
|
251
|
+
axiosHeaders: headers,
|
252
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"]
|
253
|
+
});
|
254
|
+
if ( enable )
|
255
|
+
{
|
256
|
+
await scraper.start();
|
257
|
+
}
|
258
|
+
return scraper;
|
259
|
+
}
|
260
|
+
|
209
261
|
|
210
262
|
void async function main ()
|
211
263
|
{
|
212
264
|
const palianswersScraper = await palianswers( false );
|
213
265
|
const decolonizepalestineScraper = await decolonizepalestine( false );
|
214
266
|
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( false );
|
215
|
-
const electronicintifadaScraper = await electronicintifada(
|
267
|
+
const electronicintifadaScraper = await electronicintifada( false );
|
268
|
+
const standWithPalestineScraper = await standWithPalestine( false );
|
269
|
+
const mondoweisScraper = await mondoweiss( true );
|
216
270
|
const bdsmovementScraper = await bdsmovement( false );
|
217
271
|
const palestinerememberedScraper = await palestineremembered( false );
|
218
272
|
|
@@ -221,10 +275,10 @@ void async function main ()
|
|
221
275
|
decolonizepalestineScraper,
|
222
276
|
khameneiIrFreePalestineTagScraper,
|
223
277
|
electronicintifadaScraper,
|
224
|
-
|
225
|
-
|
278
|
+
standWithPalestineScraper,
|
279
|
+
mondoweisScraper
|
226
280
|
] );
|
227
|
-
|
228
|
-
// 7 https://stand-with-palestine.org/blogs
|
229
|
-
// https://mondoweiss.net
|
230
281
|
}()
|
282
|
+
|
283
|
+
|
284
|
+
// https://mondoweiss.net
|
package/main.js
CHANGED
@@ -161,7 +161,6 @@ class WebScraper
|
|
161
161
|
if ( this.hasValidPageContent( article.textContent ) )
|
162
162
|
{
|
163
163
|
const metadata = this.extractMetadata( url, document );
|
164
|
-
metadata.depth = depth;
|
165
164
|
this.saveArticle( url, article.textContent, metadata );
|
166
165
|
}
|
167
166
|
else
|
@@ -262,7 +261,7 @@ class WebScraper
|
|
262
261
|
}
|
263
262
|
}
|
264
263
|
|
265
|
-
hasReachedMax ( depth )
|
264
|
+
hasReachedMax ( depth = 0 )
|
266
265
|
{
|
267
266
|
if ( this.allProcessedContent.length >= this.maxArticles || depth > this.maxDepth )
|
268
267
|
{
|
@@ -555,7 +554,7 @@ class WebScraper
|
|
555
554
|
ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
|
556
555
|
ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
|
557
556
|
ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
|
558
|
-
|
557
|
+
dateScrapedDate: new Date().toISOString()
|
559
558
|
};
|
560
559
|
}
|
561
560
|
|
@@ -572,7 +571,7 @@ class WebScraper
|
|
572
571
|
{
|
573
572
|
try
|
574
573
|
{
|
575
|
-
if ( this.hasReachedMax(
|
574
|
+
if ( this.hasReachedMax( ) )
|
576
575
|
{
|
577
576
|
throw new Error( "Max reached" );
|
578
577
|
}
|