clean-web-scraper 4.0.0 → 4.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/example-usage.js +20 -15
- package/main.js +5 -10
- package/package.json +1 -1
package/example-usage.js
CHANGED
@@ -37,7 +37,7 @@ async function palianswers ( enable )
|
|
37
37
|
textOutputPath: "./dataset/palianswers/texts",
|
38
38
|
csvOutputPath: "./dataset/palianswers/train.csv",
|
39
39
|
includeMetadata: true,
|
40
|
-
metadataFields: ["author", "
|
40
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
|
41
41
|
axiosRetryDelay: 10000,
|
42
42
|
crawlingDelay: 0
|
43
43
|
};
|
@@ -51,14 +51,15 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
51
51
|
startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
|
52
52
|
maxDepth: 1,
|
53
53
|
exactExcludeList: [
|
54
|
-
"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#"
|
54
|
+
"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
|
55
|
+
"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100"
|
55
56
|
],
|
56
57
|
scrapResultPath: "./dataset/khamenei-ir-free-palestine-tag/website",
|
57
58
|
jsonlOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.jsonl",
|
58
59
|
textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
|
59
60
|
csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
|
60
61
|
includeMetadata: true,
|
61
|
-
metadataFields: ["author", "
|
62
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
|
62
63
|
axiosRetryDelay: 10000,
|
63
64
|
};
|
64
65
|
return await runScraper( config, enable );
|
@@ -84,7 +85,7 @@ async function decolonizepalestine ( enable )
|
|
84
85
|
textOutputPath: "./dataset/decolonizepalestine/texts",
|
85
86
|
csvOutputPath: "./dataset/decolonizepalestine/train.csv",
|
86
87
|
includeMetadata: true,
|
87
|
-
metadataFields: ["author", "
|
88
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
|
88
89
|
axiosRetryDelay: 10000,
|
89
90
|
};
|
90
91
|
return await runScraper( config, enable );
|
@@ -107,6 +108,8 @@ async function electronicintifada ( enable )
|
|
107
108
|
"https://electronicintifada.net/search/site/",
|
108
109
|
"https://electronicintifada.net/news",
|
109
110
|
"https://electronicintifada.net/opinion",
|
111
|
+
"https://electronicintifada.net/about-ei",
|
112
|
+
"https://electronicintifada.net/review"
|
110
113
|
],
|
111
114
|
exactExcludeList: [
|
112
115
|
"https://electronicintifada.net",
|
@@ -118,17 +121,19 @@ async function electronicintifada ( enable )
|
|
118
121
|
textOutputPath: "./dataset/electronicintifada/texts",
|
119
122
|
csvOutputPath: "./dataset/electronicintifada/train.csv",
|
120
123
|
includeMetadata: true,
|
121
|
-
metadataFields: ["author", "
|
124
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
|
122
125
|
maxDepth: 16,
|
123
126
|
maxArticles: 2000,
|
124
127
|
axiosHeaders: headers,
|
128
|
+
axiosMaxRetries: 2,
|
125
129
|
axiosRetryDelay: 10000,
|
126
130
|
axiosProxy: {
|
127
131
|
host: "localhost",
|
128
132
|
port: 2080,
|
129
133
|
protocol: "http"
|
130
134
|
},
|
131
|
-
useProxyAsFallback: true
|
135
|
+
useProxyAsFallback: true,
|
136
|
+
crawlingDelay: 0
|
132
137
|
};
|
133
138
|
return await runScraper( config, enable );
|
134
139
|
}
|
@@ -138,14 +143,14 @@ async function standWithPalestine ( enable )
|
|
138
143
|
const config = {
|
139
144
|
baseURL: "https://stand-with-palestine.org/blogs",
|
140
145
|
startURL: "https://stand-with-palestine.org/blogs",
|
146
|
+
exactExcludeList: ["https://stand-with-palestine.org/blogs"],
|
141
147
|
scrapResultPath: "./dataset/stand-with-palestine/website",
|
142
148
|
jsonlOutputPath: "./dataset/stand-with-palestine/train.jsonl",
|
143
149
|
textOutputPath: "./dataset/stand-with-palestine/texts",
|
144
150
|
csvOutputPath: "./dataset/stand-with-palestine/train.csv",
|
145
|
-
exactExcludeList: ["https://stand-with-palestine.org/blogs"],
|
146
151
|
axiosHeaders: headers,
|
147
152
|
includeMetadata: true,
|
148
|
-
metadataFields: ["author", "
|
153
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"]
|
149
154
|
};
|
150
155
|
return await runScraper( config, enable );
|
151
156
|
}
|
@@ -178,18 +183,18 @@ async function mondoweiss ( enable )
|
|
178
183
|
textOutputPath: "./dataset/mondoweiss/texts",
|
179
184
|
csvOutputPath: "./dataset/mondoweiss/train.csv",
|
180
185
|
maxArticles: 2500,
|
181
|
-
|
186
|
+
maxDepth: 15,
|
182
187
|
axiosHeaders: headers,
|
188
|
+
axiosMaxRetries: 3,
|
189
|
+
axiosRetryDelay: 10000,
|
183
190
|
axiosProxy: {
|
184
191
|
host: "localhost",
|
185
192
|
port: 2080,
|
186
193
|
protocol: "http"
|
187
194
|
},
|
188
|
-
|
189
|
-
axiosRetryDelay: 10000,
|
195
|
+
useProxyAsFallback: true,
|
190
196
|
includeMetadata: true,
|
191
|
-
metadataFields: ["author", "
|
192
|
-
useProxyAsFallback: true
|
197
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
|
193
198
|
};
|
194
199
|
return await runScraper( config, enable );
|
195
200
|
}
|
@@ -211,7 +216,7 @@ async function bdsmovement ( enable )
|
|
211
216
|
textOutputPath: "./dataset/bdsmovement/texts",
|
212
217
|
csvOutputPath: "./dataset/bdsmovement/train.csv",
|
213
218
|
includeMetadata: true,
|
214
|
-
metadataFields: ["author", "
|
219
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
|
215
220
|
puppeteerRealProxy: {
|
216
221
|
host: "socks5://127.0.0.1",
|
217
222
|
port: "2080",
|
@@ -249,7 +254,7 @@ async function palestineremembered ( enable )
|
|
249
254
|
textOutputPath: "./dataset/palestineremembered/texts",
|
250
255
|
csvOutputPath: "./dataset/palestineremembered/train.csv",
|
251
256
|
includeMetadata: true,
|
252
|
-
metadataFields: ["author", "
|
257
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
|
253
258
|
axiosProxy: {
|
254
259
|
host: "localhost",
|
255
260
|
port: 2080,
|
package/main.js
CHANGED
@@ -135,16 +135,14 @@ class WebScraper
|
|
135
135
|
|
136
136
|
if ( !this.isExcluded( url ) )
|
137
137
|
{
|
138
|
-
const reader = new Readability( document, {
|
139
|
-
charThreshold: 500,
|
140
|
-
nbTopCandidates: 20
|
141
|
-
});
|
138
|
+
const reader = new Readability( document );
|
142
139
|
const article = reader.parse();
|
143
140
|
if ( article )
|
144
141
|
{
|
145
142
|
if ( this.hasValidPageContent( article.textContent ) )
|
146
143
|
{
|
147
144
|
const metadata = this.extractMetadata( url, document );
|
145
|
+
metadata.articleTitle = article.title || "";
|
148
146
|
this.saveArticle( url, article.textContent, metadata );
|
149
147
|
}
|
150
148
|
else
|
@@ -503,17 +501,14 @@ class WebScraper
|
|
503
501
|
{
|
504
502
|
return {
|
505
503
|
url,
|
506
|
-
|
504
|
+
pageTitle: document.title,
|
507
505
|
description: document.querySelector( "meta[name=\"description\"]" )?.content,
|
508
506
|
keywords: document.querySelector( "meta[name=\"keywords\"]" )?.content,
|
509
507
|
author: document.querySelector( "meta[name=\"author\"]" )?.content,
|
510
|
-
language:
|
511
|
-
document.documentElement.lang ||
|
512
|
-
document.querySelector( "html" )?.getAttribute( "lang" ),
|
508
|
+
language: document.documentElement.lang || document.querySelector( "html" )?.getAttribute( "lang" ),
|
513
509
|
canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
|
514
510
|
ogTitle: document.querySelector( "meta[property=\"og:title\"]" )?.content,
|
515
|
-
ogDescription: document.querySelector( "meta[property=\"og:description\"]" )
|
516
|
-
?.content,
|
511
|
+
ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
|
517
512
|
ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
|
518
513
|
ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
|
519
514
|
dateScrapedDate: new Date().toISOString()
|