clean-web-scraper 3.4.1 → 3.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/example-usage.js +75 -17
- package/package.json +1 -1
- package/src/WebScraper.js +18 -4
package/README.md
CHANGED
package/example-usage.js
CHANGED
|
@@ -27,13 +27,15 @@ async function khameneiIrFreePalestineTag ()
|
|
|
27
27
|
includeMetadata: true,
|
|
28
28
|
metadataFields: ["title", "description", "author"]
|
|
29
29
|
});
|
|
30
|
-
|
|
30
|
+
if ( enable )
|
|
31
|
+
{
|
|
32
|
+
await scraper.start();
|
|
33
|
+
}
|
|
31
34
|
return scraper;
|
|
32
35
|
}
|
|
33
36
|
|
|
34
|
-
async function decolonizepalestine ()
|
|
37
|
+
async function decolonizepalestine ( enable )
|
|
35
38
|
{
|
|
36
|
-
// https://decolonizepalestine.com
|
|
37
39
|
const scraper = new WebScraper({
|
|
38
40
|
baseURL: "https://decolonizepalestine.com",
|
|
39
41
|
excludeList: [
|
|
@@ -54,13 +56,15 @@ async function decolonizepalestine ()
|
|
|
54
56
|
includeMetadata: true,
|
|
55
57
|
metadataFields: ["title", "description", "author"]
|
|
56
58
|
});
|
|
57
|
-
|
|
59
|
+
if ( enable )
|
|
60
|
+
{
|
|
61
|
+
await scraper.start();
|
|
62
|
+
}
|
|
58
63
|
return scraper;
|
|
59
64
|
}
|
|
60
65
|
|
|
61
|
-
async function bdsmovement ()
|
|
66
|
+
async function bdsmovement ( enable )
|
|
62
67
|
{
|
|
63
|
-
// https://bdsmovement.net
|
|
64
68
|
const scraper = new WebScraper({
|
|
65
69
|
baseURL: "https://bdsmovement.net",
|
|
66
70
|
excludeList: [
|
|
@@ -83,15 +87,16 @@ async function bdsmovement ()
|
|
|
83
87
|
host: "socks5://127.0.0.1",
|
|
84
88
|
port: "2080",
|
|
85
89
|
},
|
|
86
|
-
// usePuppeteer: true
|
|
87
90
|
});
|
|
88
|
-
|
|
91
|
+
if ( enable )
|
|
92
|
+
{
|
|
93
|
+
await scraper.start();
|
|
94
|
+
}
|
|
89
95
|
return scraper;
|
|
90
96
|
}
|
|
91
97
|
|
|
92
|
-
async function electronicintifada ()
|
|
98
|
+
async function electronicintifada ( enable )
|
|
93
99
|
{
|
|
94
|
-
// https://electronicintifada.net
|
|
95
100
|
const scraper = new WebScraper({
|
|
96
101
|
baseURL: "https://electronicintifada.net",
|
|
97
102
|
excludeList: [
|
|
@@ -101,7 +106,10 @@ async function electronicintifada ()
|
|
|
101
106
|
"https://electronicintifada.net/blog",
|
|
102
107
|
"https://electronicintifada.net/people",
|
|
103
108
|
"https://electronicintifada.net/location",
|
|
104
|
-
"https://electronicintifada.net/file"
|
|
109
|
+
"https://electronicintifada.net/file",
|
|
110
|
+
"https://electronicintifada.net/bytopic/people",
|
|
111
|
+
"https://electronicintifada.net/comment/",
|
|
112
|
+
"https://electronicintifada.net/search/site/"
|
|
105
113
|
],
|
|
106
114
|
exactExcludeList: [
|
|
107
115
|
"https://electronicintifada.net",
|
|
@@ -115,23 +123,73 @@ async function electronicintifada ()
|
|
|
115
123
|
textOutputPath: "./dataset/electronicintifada/texts",
|
|
116
124
|
csvOutputPath: "./dataset/electronicintifada/train.csv",
|
|
117
125
|
includeMetadata: true,
|
|
126
|
+
maxArticles: 2000,
|
|
118
127
|
metadataFields: ["title", "description", "author"]
|
|
119
128
|
});
|
|
120
|
-
|
|
129
|
+
if ( enable )
|
|
130
|
+
{
|
|
131
|
+
await scraper.start();
|
|
132
|
+
}
|
|
121
133
|
return scraper;
|
|
122
134
|
}
|
|
123
135
|
|
|
136
|
+
async function palestineremembered ( enable )
|
|
137
|
+
{
|
|
138
|
+
const scraper = new WebScraper({
|
|
139
|
+
baseURL: "https://www.palestineremembered.com",
|
|
140
|
+
startURL: "https://www.palestineremembered.com/ZionistFAQ.html",
|
|
141
|
+
excludeList: [
|
|
142
|
+
"https://www.palestineremembered.com/GeoPoints",
|
|
143
|
+
"https://www.palestineremembered.com/Donate",
|
|
144
|
+
"https://www.palestineremembered.com/ContactUs.html",
|
|
145
|
+
"https://www.palestineremembered.com/tags/Looting-Palestinian-properties.html",
|
|
146
|
+
"https://www.palestineremembered.com/ar/",
|
|
147
|
+
"https://www.palestineremembered.com/OldNewPictures.html",
|
|
148
|
+
"https://www.palestineremembered.com/Maps/index.html",
|
|
149
|
+
"https://www.palestineremembered.com/OralHistory/Interviews-Listing/",
|
|
150
|
+
"https://www.palestineremembered.com/Acre/Famous-Zionist-Quotes/Story637.html",
|
|
151
|
+
"https://www.palestineremembered.com/Articles/General/Story2045.html",
|
|
152
|
+
"https://www.palestineremembered.com/AllTownsListing.html",
|
|
153
|
+
"https://www.palestineremembered.com/Articles/General/ar/",
|
|
154
|
+
"https://www.palestineremembered.com/SiteVideos.html"
|
|
155
|
+
],
|
|
156
|
+
exactExcludeList: [
|
|
157
|
+
"https://www.palestineremembered.com/index.html",
|
|
158
|
+
"https://www.palestineremembered.com/ZionistFAQ.html"
|
|
159
|
+
],
|
|
160
|
+
scrapResultPath: "./dataset/palestineremembered/website",
|
|
161
|
+
jsonlOutputPath: "./dataset/palestineremembered/train.jsonl",
|
|
162
|
+
textOutputPath: "./dataset/palestineremembered/texts",
|
|
163
|
+
csvOutputPath: "./dataset/palestineremembered/train.csv",
|
|
164
|
+
includeMetadata: true,
|
|
165
|
+
metadataFields: ["title", "description", "author"],
|
|
166
|
+
axiosProxy: {
|
|
167
|
+
host: "localhost",
|
|
168
|
+
port: 2080,
|
|
169
|
+
protocol: "http"
|
|
170
|
+
}
|
|
171
|
+
});
|
|
172
|
+
if ( enable )
|
|
173
|
+
{
|
|
174
|
+
await scraper.start();
|
|
175
|
+
}
|
|
176
|
+
return scraper;
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
|
|
124
180
|
void async function main ()
|
|
125
181
|
{
|
|
126
|
-
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
|
|
127
|
-
const decolonizepalestineScraper = await decolonizepalestine();
|
|
128
|
-
const bdsmovementScraper = await bdsmovement();
|
|
129
|
-
const electronicintifadaScraper = await electronicintifada();
|
|
182
|
+
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( false );
|
|
183
|
+
const decolonizepalestineScraper = await decolonizepalestine( false );
|
|
184
|
+
const bdsmovementScraper = await bdsmovement( false );
|
|
185
|
+
const electronicintifadaScraper = await electronicintifada( false );
|
|
186
|
+
const palestinerememberedScraper = await palestineremembered( true );
|
|
130
187
|
await WebScraper.combineResults( "./dataset/combined", [
|
|
131
188
|
khameneiIrFreePalestineTagScraper,
|
|
132
189
|
decolonizepalestineScraper,
|
|
133
190
|
bdsmovementScraper,
|
|
134
|
-
electronicintifadaScraper
|
|
191
|
+
electronicintifadaScraper,
|
|
192
|
+
palestinerememberedScraper
|
|
135
193
|
] );
|
|
136
194
|
|
|
137
195
|
// 5
|
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
|
@@ -13,6 +13,7 @@ class WebScraper
|
|
|
13
13
|
baseURL,
|
|
14
14
|
startURL,
|
|
15
15
|
maxDepth = Infinity,
|
|
16
|
+
maxArticles = Infinity, // Add this line
|
|
16
17
|
excludeList,
|
|
17
18
|
exactExcludeList,
|
|
18
19
|
scrapResultPath = "./dataset",
|
|
@@ -21,7 +22,8 @@ class WebScraper
|
|
|
21
22
|
csvOutputPath,
|
|
22
23
|
includeMetadata = false,
|
|
23
24
|
metadataFields = [], // ['title', 'description', 'author', etc.]
|
|
24
|
-
|
|
25
|
+
axiosHeaders,
|
|
26
|
+
axiosProxy,
|
|
25
27
|
usePuppeteer,
|
|
26
28
|
puppeteerProxy, // e.g. http://127.0.0.1:2080
|
|
27
29
|
puppeteerExecutablePath,
|
|
@@ -33,13 +35,14 @@ class WebScraper
|
|
|
33
35
|
this.baseURL = baseURL;
|
|
34
36
|
this.startURL = startURL || baseURL;
|
|
35
37
|
this.maxDepth = maxDepth;
|
|
38
|
+
this.maxArticles = maxArticles; // Add this line
|
|
36
39
|
this.scrapResultPath = scrapResultPath;
|
|
37
40
|
this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
|
|
38
41
|
this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
|
|
39
42
|
this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
|
|
40
43
|
this.jsonlOutputPathWithMeta = jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
|
|
41
44
|
this.csvOutputPathWithMeta = csvOutputPath.replace( ".csv", "_with_metadata.csv" );
|
|
42
|
-
this.
|
|
45
|
+
this.axiosHeaders = axiosHeaders;
|
|
43
46
|
this.includeMetadata = includeMetadata;
|
|
44
47
|
this.metadataFields = new Set( metadataFields );
|
|
45
48
|
this.visited = new Set();
|
|
@@ -48,6 +51,7 @@ class WebScraper
|
|
|
48
51
|
this.allProcessedContent = [];
|
|
49
52
|
this.filterFileTypes = filterFileTypes;
|
|
50
53
|
this.excludedFileTypes = excludedFileTypes;
|
|
54
|
+
this.axiosProxy = axiosProxy;
|
|
51
55
|
this.usePuppeteer = usePuppeteer || false;
|
|
52
56
|
this.puppeteerOptions = {
|
|
53
57
|
headless: false,
|
|
@@ -113,6 +117,11 @@ class WebScraper
|
|
|
113
117
|
|
|
114
118
|
async fetchPage ( url, depth )
|
|
115
119
|
{
|
|
120
|
+
if ( this.allProcessedContent.length >= this.maxArticles )
|
|
121
|
+
{
|
|
122
|
+
console.log( `Reached maximum number of articles (${this.maxArticles})` );
|
|
123
|
+
return;
|
|
124
|
+
}
|
|
116
125
|
if ( depth > this.maxDepth )
|
|
117
126
|
{
|
|
118
127
|
return;
|
|
@@ -125,6 +134,7 @@ class WebScraper
|
|
|
125
134
|
try
|
|
126
135
|
{
|
|
127
136
|
const data = await this.caller( url );
|
|
137
|
+
if ( !data ) return;
|
|
128
138
|
const dom = new JSDOM( data, { url });
|
|
129
139
|
const { document } = dom.window;
|
|
130
140
|
|
|
@@ -172,9 +182,13 @@ class WebScraper
|
|
|
172
182
|
try
|
|
173
183
|
{
|
|
174
184
|
let axiosOptions = {};
|
|
175
|
-
if ( this.
|
|
185
|
+
if ( this.axiosHeaders )
|
|
186
|
+
{
|
|
187
|
+
axiosOptions.headers = this.axiosHeaders;
|
|
188
|
+
}
|
|
189
|
+
if ( this.axiosProxy )
|
|
176
190
|
{
|
|
177
|
-
axiosOptions.
|
|
191
|
+
axiosOptions.proxy = this.axiosProxy;
|
|
178
192
|
}
|
|
179
193
|
|
|
180
194
|
// Step 1: Make a GET request with a small timeout and limited data download
|