clean-web-scraper 4.0.0 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/example-usage.js +16 -14
- package/main.js +5 -10
- package/package.json +1 -1
package/example-usage.js
CHANGED
@@ -37,7 +37,7 @@ async function palianswers ( enable )
|
|
37
37
|
textOutputPath: "./dataset/palianswers/texts",
|
38
38
|
csvOutputPath: "./dataset/palianswers/train.csv",
|
39
39
|
includeMetadata: true,
|
40
|
-
metadataFields: ["author", "
|
40
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
|
41
41
|
axiosRetryDelay: 10000,
|
42
42
|
crawlingDelay: 0
|
43
43
|
};
|
@@ -58,7 +58,7 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
58
58
|
textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
|
59
59
|
csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
|
60
60
|
includeMetadata: true,
|
61
|
-
metadataFields: ["author", "
|
61
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
|
62
62
|
axiosRetryDelay: 10000,
|
63
63
|
};
|
64
64
|
return await runScraper( config, enable );
|
@@ -84,7 +84,7 @@ async function decolonizepalestine ( enable )
|
|
84
84
|
textOutputPath: "./dataset/decolonizepalestine/texts",
|
85
85
|
csvOutputPath: "./dataset/decolonizepalestine/train.csv",
|
86
86
|
includeMetadata: true,
|
87
|
-
metadataFields: ["author", "
|
87
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
|
88
88
|
axiosRetryDelay: 10000,
|
89
89
|
};
|
90
90
|
return await runScraper( config, enable );
|
@@ -118,17 +118,19 @@ async function electronicintifada ( enable )
|
|
118
118
|
textOutputPath: "./dataset/electronicintifada/texts",
|
119
119
|
csvOutputPath: "./dataset/electronicintifada/train.csv",
|
120
120
|
includeMetadata: true,
|
121
|
-
metadataFields: ["author", "
|
121
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
|
122
122
|
maxDepth: 16,
|
123
123
|
maxArticles: 2000,
|
124
124
|
axiosHeaders: headers,
|
125
|
+
axiosMaxRetries: 2,
|
125
126
|
axiosRetryDelay: 10000,
|
126
127
|
axiosProxy: {
|
127
128
|
host: "localhost",
|
128
129
|
port: 2080,
|
129
130
|
protocol: "http"
|
130
131
|
},
|
131
|
-
useProxyAsFallback: true
|
132
|
+
useProxyAsFallback: true,
|
133
|
+
crawlingDelay: 0
|
132
134
|
};
|
133
135
|
return await runScraper( config, enable );
|
134
136
|
}
|
@@ -138,14 +140,14 @@ async function standWithPalestine ( enable )
|
|
138
140
|
const config = {
|
139
141
|
baseURL: "https://stand-with-palestine.org/blogs",
|
140
142
|
startURL: "https://stand-with-palestine.org/blogs",
|
143
|
+
exactExcludeList: ["https://stand-with-palestine.org/blogs"],
|
141
144
|
scrapResultPath: "./dataset/stand-with-palestine/website",
|
142
145
|
jsonlOutputPath: "./dataset/stand-with-palestine/train.jsonl",
|
143
146
|
textOutputPath: "./dataset/stand-with-palestine/texts",
|
144
147
|
csvOutputPath: "./dataset/stand-with-palestine/train.csv",
|
145
|
-
exactExcludeList: ["https://stand-with-palestine.org/blogs"],
|
146
148
|
axiosHeaders: headers,
|
147
149
|
includeMetadata: true,
|
148
|
-
metadataFields: ["author", "
|
150
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"]
|
149
151
|
};
|
150
152
|
return await runScraper( config, enable );
|
151
153
|
}
|
@@ -178,18 +180,18 @@ async function mondoweiss ( enable )
|
|
178
180
|
textOutputPath: "./dataset/mondoweiss/texts",
|
179
181
|
csvOutputPath: "./dataset/mondoweiss/train.csv",
|
180
182
|
maxArticles: 2500,
|
181
|
-
|
183
|
+
maxDepth: 15,
|
182
184
|
axiosHeaders: headers,
|
185
|
+
axiosMaxRetries: 3,
|
186
|
+
axiosRetryDelay: 10000,
|
183
187
|
axiosProxy: {
|
184
188
|
host: "localhost",
|
185
189
|
port: 2080,
|
186
190
|
protocol: "http"
|
187
191
|
},
|
188
|
-
|
189
|
-
axiosRetryDelay: 10000,
|
192
|
+
useProxyAsFallback: true,
|
190
193
|
includeMetadata: true,
|
191
|
-
metadataFields: ["author", "
|
192
|
-
useProxyAsFallback: true
|
194
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
|
193
195
|
};
|
194
196
|
return await runScraper( config, enable );
|
195
197
|
}
|
@@ -211,7 +213,7 @@ async function bdsmovement ( enable )
|
|
211
213
|
textOutputPath: "./dataset/bdsmovement/texts",
|
212
214
|
csvOutputPath: "./dataset/bdsmovement/train.csv",
|
213
215
|
includeMetadata: true,
|
214
|
-
metadataFields: ["author", "
|
216
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
|
215
217
|
puppeteerRealProxy: {
|
216
218
|
host: "socks5://127.0.0.1",
|
217
219
|
port: "2080",
|
@@ -249,7 +251,7 @@ async function palestineremembered ( enable )
|
|
249
251
|
textOutputPath: "./dataset/palestineremembered/texts",
|
250
252
|
csvOutputPath: "./dataset/palestineremembered/train.csv",
|
251
253
|
includeMetadata: true,
|
252
|
-
metadataFields: ["author", "
|
254
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
|
253
255
|
axiosProxy: {
|
254
256
|
host: "localhost",
|
255
257
|
port: 2080,
|
package/main.js
CHANGED
@@ -135,16 +135,14 @@ class WebScraper
|
|
135
135
|
|
136
136
|
if ( !this.isExcluded( url ) )
|
137
137
|
{
|
138
|
-
const reader = new Readability( document, {
|
139
|
-
charThreshold: 500,
|
140
|
-
nbTopCandidates: 20
|
141
|
-
});
|
138
|
+
const reader = new Readability( document );
|
142
139
|
const article = reader.parse();
|
143
140
|
if ( article )
|
144
141
|
{
|
145
142
|
if ( this.hasValidPageContent( article.textContent ) )
|
146
143
|
{
|
147
144
|
const metadata = this.extractMetadata( url, document );
|
145
|
+
metadata.articleTitle = article.title || "";
|
148
146
|
this.saveArticle( url, article.textContent, metadata );
|
149
147
|
}
|
150
148
|
else
|
@@ -503,17 +501,14 @@ class WebScraper
|
|
503
501
|
{
|
504
502
|
return {
|
505
503
|
url,
|
506
|
-
|
504
|
+
pageTitle: document.title,
|
507
505
|
description: document.querySelector( "meta[name=\"description\"]" )?.content,
|
508
506
|
keywords: document.querySelector( "meta[name=\"keywords\"]" )?.content,
|
509
507
|
author: document.querySelector( "meta[name=\"author\"]" )?.content,
|
510
|
-
language:
|
511
|
-
document.documentElement.lang ||
|
512
|
-
document.querySelector( "html" )?.getAttribute( "lang" ),
|
508
|
+
language: document.documentElement.lang || document.querySelector( "html" )?.getAttribute( "lang" ),
|
513
509
|
canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
|
514
510
|
ogTitle: document.querySelector( "meta[property=\"og:title\"]" )?.content,
|
515
|
-
ogDescription: document.querySelector( "meta[property=\"og:description\"]" )
|
516
|
-
?.content,
|
511
|
+
ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
|
517
512
|
ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
|
518
513
|
ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
|
519
514
|
dateScrapedDate: new Date().toISOString()
|