clean-web-scraper 3.3.4 → 3.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/example-usage.js +21 -11
- package/package.json +1 -1
- package/src/WebScraper.js +73 -31
package/example-usage.js
CHANGED
|
@@ -82,7 +82,8 @@ async function bdsmovement ()
|
|
|
82
82
|
puppeteerRealProxy: {
|
|
83
83
|
host: "socks5://127.0.0.1",
|
|
84
84
|
port: "2080",
|
|
85
|
-
}
|
|
85
|
+
},
|
|
86
|
+
// usePuppeteer: true
|
|
86
87
|
});
|
|
87
88
|
await scraper.start();
|
|
88
89
|
return scraper;
|
|
@@ -94,12 +95,19 @@ async function electronicintifada ()
|
|
|
94
95
|
const scraper = new WebScraper({
|
|
95
96
|
baseURL: "https://electronicintifada.net",
|
|
96
97
|
excludeList: [
|
|
97
|
-
"https://electronicintifada.net/
|
|
98
|
-
"https://electronicintifada.net/
|
|
99
|
-
"https://electronicintifada.net/
|
|
100
|
-
"https://electronicintifada.net/
|
|
101
|
-
"https://electronicintifada.net/
|
|
102
|
-
"https://electronicintifada.net/
|
|
98
|
+
"https://electronicintifada.net/updates",
|
|
99
|
+
"https://electronicintifada.net/taxonomy/term/",
|
|
100
|
+
"https://electronicintifada.net/tags/",
|
|
101
|
+
"https://electronicintifada.net/blog",
|
|
102
|
+
"https://electronicintifada.net/people",
|
|
103
|
+
"https://electronicintifada.net/location"
|
|
104
|
+
],
|
|
105
|
+
exactExcludeList: [
|
|
106
|
+
"https://electronicintifada.net",
|
|
107
|
+
"https://electronicintifada.net/blog",
|
|
108
|
+
"https://electronicintifada.net/news",
|
|
109
|
+
"https://electronicintifada.net/opinion",
|
|
110
|
+
"https://electronicintifada.net/review",
|
|
103
111
|
],
|
|
104
112
|
scrapResultPath: "./dataset/electronicintifada/website",
|
|
105
113
|
jsonlOutputPath: "./dataset/electronicintifada/train.jsonl",
|
|
@@ -114,13 +122,15 @@ async function electronicintifada ()
|
|
|
114
122
|
|
|
115
123
|
void async function main ()
|
|
116
124
|
{
|
|
117
|
-
|
|
125
|
+
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
|
|
118
126
|
const decolonizepalestineScraper = await decolonizepalestine();
|
|
119
|
-
|
|
127
|
+
const bdsmovementScraper = await bdsmovement();
|
|
128
|
+
const electronicintifadaScraper = await electronicintifada();
|
|
120
129
|
await WebScraper.combineResults( "./dataset/combined", [
|
|
121
|
-
|
|
130
|
+
khameneiIrFreePalestineTagScraper,
|
|
122
131
|
decolonizepalestineScraper,
|
|
123
|
-
|
|
132
|
+
bdsmovementScraper,
|
|
133
|
+
electronicintifadaScraper
|
|
124
134
|
] );
|
|
125
135
|
|
|
126
136
|
// 4
|
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
|
@@ -22,6 +22,7 @@ class WebScraper
|
|
|
22
22
|
includeMetadata = false,
|
|
23
23
|
metadataFields = [], // ['title', 'description', 'author', 'lastModified', etc.]
|
|
24
24
|
headers,
|
|
25
|
+
usePuppeteer,
|
|
25
26
|
puppeteerProxy, // e.g. http://127.0.0.1:2080
|
|
26
27
|
puppeteerExecutablePath,
|
|
27
28
|
puppeteerRealProxy
|
|
@@ -43,7 +44,7 @@ class WebScraper
|
|
|
43
44
|
this.excludeList = this.normalizeExcludeList( excludeList );
|
|
44
45
|
this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
|
|
45
46
|
this.allProcessedContent = [];
|
|
46
|
-
this.usePuppeteer = false;
|
|
47
|
+
this.usePuppeteer = usePuppeteer || false;
|
|
47
48
|
this.puppeteerOptions = {
|
|
48
49
|
headless: false,
|
|
49
50
|
userDataDir: "./tmp/browser",
|
|
@@ -71,16 +72,39 @@ class WebScraper
|
|
|
71
72
|
ignoreAllFlags: false,
|
|
72
73
|
proxy: puppeteerRealProxy
|
|
73
74
|
}
|
|
75
|
+
this.puppeteerBrowser = null;
|
|
76
|
+
this.puppeteerPage = null;
|
|
74
77
|
}
|
|
75
78
|
|
|
76
79
|
async start ()
|
|
77
80
|
{
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
81
|
+
try
|
|
82
|
+
{
|
|
83
|
+
if ( this.usePuppeteer )
|
|
84
|
+
{
|
|
85
|
+
let { browser, page } = await connect( this.puppeteerRealOptions )
|
|
86
|
+
this.puppeteerBrowser = browser;
|
|
87
|
+
this.puppeteerPage = page;
|
|
88
|
+
}
|
|
89
|
+
this.createOutputDirectory();
|
|
90
|
+
await this.fetchPage( this.startURL, 0 );
|
|
91
|
+
this.createJSONLFile();
|
|
92
|
+
this.saveNumberedTextFiles();
|
|
93
|
+
this.createCSVFile();
|
|
94
|
+
console.log( "Scraping completed." );
|
|
95
|
+
}
|
|
96
|
+
catch ( error )
|
|
97
|
+
{
|
|
98
|
+
console.error( "Error:", error );
|
|
99
|
+
throw error;
|
|
100
|
+
}
|
|
101
|
+
finally
|
|
102
|
+
{
|
|
103
|
+
if ( this.puppeteerBrowser )
|
|
104
|
+
{
|
|
105
|
+
await this.puppeteerBrowser.close(); // Close the browser after scraping
|
|
106
|
+
}
|
|
107
|
+
}
|
|
84
108
|
}
|
|
85
109
|
|
|
86
110
|
async fetchPage ( url, depth )
|
|
@@ -152,52 +176,53 @@ class WebScraper
|
|
|
152
176
|
console.error( `Error fetching ${url}:`, error.message );
|
|
153
177
|
if ( error.status = 403 && this.usePuppeteer )
|
|
154
178
|
{
|
|
155
|
-
let { browser, page } = await connect( this.puppeteerRealOptions )
|
|
156
|
-
|
|
157
179
|
// const browser = await puppeteer.launch( this.puppeteerOptions );
|
|
158
180
|
// const page = await browser.newPage();
|
|
159
181
|
try
|
|
160
182
|
{
|
|
161
|
-
let
|
|
183
|
+
let result;
|
|
162
184
|
for ( let index = 0; index < 10; index++ )
|
|
163
185
|
{
|
|
164
|
-
const pages = await browser.pages();
|
|
165
|
-
page = pages[0];
|
|
166
|
-
page.setDefaultNavigationTimeout( 30000 )
|
|
167
|
-
await page.goto( url );
|
|
168
186
|
console.log( `Please solve the CAPTCHA on the opened browser window for ${url}` );
|
|
169
|
-
await this.
|
|
170
|
-
|
|
171
|
-
if ( this.isValidContent( htmlContent ) )
|
|
187
|
+
result = await this.goToUrl( url ) ;
|
|
188
|
+
if ( this.isValidContent( result.htmlContent ) )
|
|
172
189
|
{
|
|
173
190
|
break
|
|
174
191
|
}
|
|
175
|
-
page = pages[0];
|
|
176
|
-
page.setDefaultNavigationTimeout( 30000 )
|
|
177
|
-
await this.waitForPageToLoad( page );
|
|
178
|
-
htmlContent = await page.content();
|
|
179
|
-
if ( this.isValidContent( htmlContent ) )
|
|
180
|
-
{
|
|
181
|
-
break
|
|
182
|
-
}
|
|
183
|
-
await page.goto( url );
|
|
184
192
|
}
|
|
185
|
-
return htmlContent;
|
|
193
|
+
return result.htmlContent;
|
|
186
194
|
}
|
|
187
195
|
catch ( error )
|
|
188
196
|
{
|
|
189
197
|
console.error( `Error solving CAPTCHA for ${url}:`, error.message, error );
|
|
190
198
|
throw error;
|
|
191
199
|
}
|
|
192
|
-
|
|
193
|
-
{
|
|
194
|
-
await browser.close(); // Close the browser after scraping
|
|
195
|
-
}
|
|
200
|
+
|
|
196
201
|
}
|
|
197
202
|
throw error;
|
|
198
203
|
}
|
|
199
204
|
}
|
|
200
205
|
|
|
206
|
+
async goToUrl ( url )
|
|
207
|
+
{
|
|
208
|
+
let pages = await this.puppeteerBrowser.pages();
|
|
209
|
+
let page = pages[0];
|
|
210
|
+
page.setDefaultNavigationTimeout( 10000 );
|
|
211
|
+
await page.goto( url );
|
|
212
|
+
pages = await this.puppeteerBrowser.pages();
|
|
213
|
+
page = pages[0];
|
|
214
|
+
page.setDefaultNavigationTimeout( 10000 );
|
|
215
|
+
await this.waitForPageToLoad( page );
|
|
216
|
+
pages = await this.puppeteerBrowser.pages();
|
|
217
|
+
page = pages[0];
|
|
218
|
+
page.setDefaultNavigationTimeout( 10000 );
|
|
219
|
+
if ( page )
|
|
220
|
+
{
|
|
221
|
+
let htmlContent = await page.content();
|
|
222
|
+
return { pages, page, htmlContent };
|
|
223
|
+
}
|
|
224
|
+
}
|
|
225
|
+
|
|
201
226
|
async waitForPageToLoad ( page )
|
|
202
227
|
{
|
|
203
228
|
try
|
|
@@ -412,6 +437,23 @@ class WebScraper
|
|
|
412
437
|
// processed = processed.replace(/\[.*?\]/g, ''); // Removes all content within square brackets
|
|
413
438
|
// processed = processed.replace(/\(.*?\)/g, ''); // Removes all content within parentheses
|
|
414
439
|
|
|
440
|
+
// Remove specified words from the end of content, handling multiple occurrences
|
|
441
|
+
const wordsToTrim = ["Facebook", "Twitter", "Donate Now", "Instagram"];
|
|
442
|
+
let changed = true;
|
|
443
|
+
|
|
444
|
+
while ( changed )
|
|
445
|
+
{
|
|
446
|
+
changed = false;
|
|
447
|
+
for ( let i = 0; i < wordsToTrim.length; i++ )
|
|
448
|
+
{
|
|
449
|
+
const oldProcessed = processed;
|
|
450
|
+
processed = processed.replace( new RegExp( `\\s*${wordsToTrim[i]}\\s*$`, "g" ), "" ).trim();
|
|
451
|
+
if ( oldProcessed !== processed )
|
|
452
|
+
{
|
|
453
|
+
changed = true;
|
|
454
|
+
}
|
|
455
|
+
}
|
|
456
|
+
}
|
|
415
457
|
return processed;
|
|
416
458
|
}
|
|
417
459
|
|