clean-web-scraper 3.5.0 → 3.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/example-usage.js +73 -18
- package/package.json +1 -1
- package/src/WebScraper.js +10 -4
package/example-usage.js
CHANGED
|
@@ -27,13 +27,15 @@ async function khameneiIrFreePalestineTag ()
|
|
|
27
27
|
includeMetadata: true,
|
|
28
28
|
metadataFields: ["title", "description", "author"]
|
|
29
29
|
});
|
|
30
|
-
|
|
30
|
+
if ( enable )
|
|
31
|
+
{
|
|
32
|
+
await scraper.start();
|
|
33
|
+
}
|
|
31
34
|
return scraper;
|
|
32
35
|
}
|
|
33
36
|
|
|
34
|
-
async function decolonizepalestine ()
|
|
37
|
+
async function decolonizepalestine ( enable )
|
|
35
38
|
{
|
|
36
|
-
// https://decolonizepalestine.com
|
|
37
39
|
const scraper = new WebScraper({
|
|
38
40
|
baseURL: "https://decolonizepalestine.com",
|
|
39
41
|
excludeList: [
|
|
@@ -54,13 +56,15 @@ async function decolonizepalestine ()
|
|
|
54
56
|
includeMetadata: true,
|
|
55
57
|
metadataFields: ["title", "description", "author"]
|
|
56
58
|
});
|
|
57
|
-
|
|
59
|
+
if ( enable )
|
|
60
|
+
{
|
|
61
|
+
await scraper.start();
|
|
62
|
+
}
|
|
58
63
|
return scraper;
|
|
59
64
|
}
|
|
60
65
|
|
|
61
|
-
async function bdsmovement ()
|
|
66
|
+
async function bdsmovement ( enable )
|
|
62
67
|
{
|
|
63
|
-
// https://bdsmovement.net
|
|
64
68
|
const scraper = new WebScraper({
|
|
65
69
|
baseURL: "https://bdsmovement.net",
|
|
66
70
|
excludeList: [
|
|
@@ -83,15 +87,16 @@ async function bdsmovement ()
|
|
|
83
87
|
host: "socks5://127.0.0.1",
|
|
84
88
|
port: "2080",
|
|
85
89
|
},
|
|
86
|
-
// usePuppeteer: true
|
|
87
90
|
});
|
|
88
|
-
|
|
91
|
+
if ( enable )
|
|
92
|
+
{
|
|
93
|
+
await scraper.start();
|
|
94
|
+
}
|
|
89
95
|
return scraper;
|
|
90
96
|
}
|
|
91
97
|
|
|
92
|
-
async function electronicintifada ()
|
|
98
|
+
async function electronicintifada ( enable )
|
|
93
99
|
{
|
|
94
|
-
// https://electronicintifada.net
|
|
95
100
|
const scraper = new WebScraper({
|
|
96
101
|
baseURL: "https://electronicintifada.net",
|
|
97
102
|
excludeList: [
|
|
@@ -103,7 +108,8 @@ async function electronicintifada ()
|
|
|
103
108
|
"https://electronicintifada.net/location",
|
|
104
109
|
"https://electronicintifada.net/file",
|
|
105
110
|
"https://electronicintifada.net/bytopic/people",
|
|
106
|
-
"https://electronicintifada.net/comment/"
|
|
111
|
+
"https://electronicintifada.net/comment/",
|
|
112
|
+
"https://electronicintifada.net/search/site/"
|
|
107
113
|
],
|
|
108
114
|
exactExcludeList: [
|
|
109
115
|
"https://electronicintifada.net",
|
|
@@ -117,24 +123,73 @@ async function electronicintifada ()
|
|
|
117
123
|
textOutputPath: "./dataset/electronicintifada/texts",
|
|
118
124
|
csvOutputPath: "./dataset/electronicintifada/train.csv",
|
|
119
125
|
includeMetadata: true,
|
|
120
|
-
maxArticles:
|
|
126
|
+
maxArticles: 2000,
|
|
121
127
|
metadataFields: ["title", "description", "author"]
|
|
122
128
|
});
|
|
123
|
-
|
|
129
|
+
if ( enable )
|
|
130
|
+
{
|
|
131
|
+
await scraper.start();
|
|
132
|
+
}
|
|
124
133
|
return scraper;
|
|
125
134
|
}
|
|
126
135
|
|
|
136
|
+
async function palestineremembered ( enable )
|
|
137
|
+
{
|
|
138
|
+
const scraper = new WebScraper({
|
|
139
|
+
baseURL: "https://www.palestineremembered.com",
|
|
140
|
+
startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
|
|
141
|
+
excludeList: [
|
|
142
|
+
"https://www.palestineremembered.com/GeoPoints",
|
|
143
|
+
"https://www.palestineremembered.com/Donate",
|
|
144
|
+
"https://www.palestineremembered.com/ContactUs.html",
|
|
145
|
+
"https://www.palestineremembered.com/tags/Looting-Palestinian-properties.html",
|
|
146
|
+
"https://www.palestineremembered.com/ar/",
|
|
147
|
+
"https://www.palestineremembered.com/OldNewPictures.html",
|
|
148
|
+
"https://www.palestineremembered.com/Maps/index.html",
|
|
149
|
+
"https://www.palestineremembered.com/OralHistory/Interviews-Listing/",
|
|
150
|
+
"https://www.palestineremembered.com/Acre/Famous-Zionist-Quotes/Story637.html",
|
|
151
|
+
"https://www.palestineremembered.com/Articles/General/Story2045.html",
|
|
152
|
+
"https://www.palestineremembered.com/AllTownsListing.html",
|
|
153
|
+
"https://www.palestineremembered.com/Articles/General/ar/",
|
|
154
|
+
"https://www.palestineremembered.com/SiteVideos.html"
|
|
155
|
+
],
|
|
156
|
+
exactExcludeList: [
|
|
157
|
+
"https://www.palestineremembered.com/index.html",
|
|
158
|
+
"https://www.palestineremembered.com/ZionistFAQ.html"
|
|
159
|
+
],
|
|
160
|
+
scrapResultPath: "./dataset/palestineremembered/website",
|
|
161
|
+
jsonlOutputPath: "./dataset/palestineremembered/train.jsonl",
|
|
162
|
+
textOutputPath: "./dataset/palestineremembered/texts",
|
|
163
|
+
csvOutputPath: "./dataset/palestineremembered/train.csv",
|
|
164
|
+
includeMetadata: true,
|
|
165
|
+
metadataFields: ["title", "description", "author"],
|
|
166
|
+
axiosProxy: {
|
|
167
|
+
host: "localhost",
|
|
168
|
+
port: 2080,
|
|
169
|
+
protocol: "http"
|
|
170
|
+
}
|
|
171
|
+
});
|
|
172
|
+
if ( enable )
|
|
173
|
+
{
|
|
174
|
+
await scraper.start();
|
|
175
|
+
}
|
|
176
|
+
return scraper;
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
|
|
127
180
|
void async function main ()
|
|
128
181
|
{
|
|
129
|
-
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
|
|
130
|
-
const decolonizepalestineScraper = await decolonizepalestine();
|
|
131
|
-
const bdsmovementScraper = await bdsmovement();
|
|
132
|
-
const electronicintifadaScraper = await electronicintifada();
|
|
182
|
+
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( false );
|
|
183
|
+
const decolonizepalestineScraper = await decolonizepalestine( false );
|
|
184
|
+
const bdsmovementScraper = await bdsmovement( false );
|
|
185
|
+
const electronicintifadaScraper = await electronicintifada( false );
|
|
186
|
+
const palestinerememberedScraper = await palestineremembered( true );
|
|
133
187
|
await WebScraper.combineResults( "./dataset/combined", [
|
|
134
188
|
khameneiIrFreePalestineTagScraper,
|
|
135
189
|
decolonizepalestineScraper,
|
|
136
190
|
bdsmovementScraper,
|
|
137
|
-
electronicintifadaScraper
|
|
191
|
+
electronicintifadaScraper,
|
|
192
|
+
palestinerememberedScraper
|
|
138
193
|
] );
|
|
139
194
|
|
|
140
195
|
// 5
|
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
|
@@ -22,7 +22,8 @@ class WebScraper
|
|
|
22
22
|
csvOutputPath,
|
|
23
23
|
includeMetadata = false,
|
|
24
24
|
metadataFields = [], // ['title', 'description', 'author', etc.]
|
|
25
|
-
|
|
25
|
+
axiosHeaders,
|
|
26
|
+
axiosProxy,
|
|
26
27
|
usePuppeteer,
|
|
27
28
|
puppeteerProxy, // e.g. http://127.0.0.1:2080
|
|
28
29
|
puppeteerExecutablePath,
|
|
@@ -41,7 +42,7 @@ class WebScraper
|
|
|
41
42
|
this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
|
|
42
43
|
this.jsonlOutputPathWithMeta = jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
|
|
43
44
|
this.csvOutputPathWithMeta = csvOutputPath.replace( ".csv", "_with_metadata.csv" );
|
|
44
|
-
this.
|
|
45
|
+
this.axiosHeaders = axiosHeaders;
|
|
45
46
|
this.includeMetadata = includeMetadata;
|
|
46
47
|
this.metadataFields = new Set( metadataFields );
|
|
47
48
|
this.visited = new Set();
|
|
@@ -50,6 +51,7 @@ class WebScraper
|
|
|
50
51
|
this.allProcessedContent = [];
|
|
51
52
|
this.filterFileTypes = filterFileTypes;
|
|
52
53
|
this.excludedFileTypes = excludedFileTypes;
|
|
54
|
+
this.axiosProxy = axiosProxy;
|
|
53
55
|
this.usePuppeteer = usePuppeteer || false;
|
|
54
56
|
this.puppeteerOptions = {
|
|
55
57
|
headless: false,
|
|
@@ -180,9 +182,13 @@ class WebScraper
|
|
|
180
182
|
try
|
|
181
183
|
{
|
|
182
184
|
let axiosOptions = {};
|
|
183
|
-
if ( this.
|
|
185
|
+
if ( this.axiosHeaders )
|
|
184
186
|
{
|
|
185
|
-
axiosOptions.headers = this.
|
|
187
|
+
axiosOptions.headers = this.axiosHeaders;
|
|
188
|
+
}
|
|
189
|
+
if ( this.axiosProxy )
|
|
190
|
+
{
|
|
191
|
+
axiosOptions.proxy = this.axiosProxy;
|
|
186
192
|
}
|
|
187
193
|
|
|
188
194
|
// Step 1: Make a GET request with a small timeout and limited data download
|