clean-web-scraper 3.3.4 → 3.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/example-usage.js +6 -5
- package/package.json +1 -1
- package/src/WebScraper.js +56 -31
package/example-usage.js
CHANGED
|
@@ -82,7 +82,8 @@ async function bdsmovement ()
|
|
|
82
82
|
puppeteerRealProxy: {
|
|
83
83
|
host: "socks5://127.0.0.1",
|
|
84
84
|
port: "2080",
|
|
85
|
-
}
|
|
85
|
+
},
|
|
86
|
+
usePuppeteer: true
|
|
86
87
|
});
|
|
87
88
|
await scraper.start();
|
|
88
89
|
return scraper;
|
|
@@ -114,13 +115,13 @@ async function electronicintifada ()
|
|
|
114
115
|
|
|
115
116
|
void async function main ()
|
|
116
117
|
{
|
|
117
|
-
|
|
118
|
+
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
|
|
118
119
|
const decolonizepalestineScraper = await decolonizepalestine();
|
|
119
|
-
|
|
120
|
+
const bdsmovementScraper = await bdsmovement();
|
|
120
121
|
await WebScraper.combineResults( "./dataset/combined", [
|
|
121
|
-
|
|
122
|
+
khameneiIrFreePalestineTagScraper,
|
|
122
123
|
decolonizepalestineScraper,
|
|
123
|
-
|
|
124
|
+
bdsmovementScraper
|
|
124
125
|
] );
|
|
125
126
|
|
|
126
127
|
// 4
|
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
|
@@ -22,6 +22,7 @@ class WebScraper
|
|
|
22
22
|
includeMetadata = false,
|
|
23
23
|
metadataFields = [], // ['title', 'description', 'author', 'lastModified', etc.]
|
|
24
24
|
headers,
|
|
25
|
+
usePuppeteer,
|
|
25
26
|
puppeteerProxy, // e.g. http://127.0.0.1:2080
|
|
26
27
|
puppeteerExecutablePath,
|
|
27
28
|
puppeteerRealProxy
|
|
@@ -43,7 +44,7 @@ class WebScraper
|
|
|
43
44
|
this.excludeList = this.normalizeExcludeList( excludeList );
|
|
44
45
|
this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
|
|
45
46
|
this.allProcessedContent = [];
|
|
46
|
-
this.usePuppeteer = false;
|
|
47
|
+
this.usePuppeteer = usePuppeteer || false;
|
|
47
48
|
this.puppeteerOptions = {
|
|
48
49
|
headless: false,
|
|
49
50
|
userDataDir: "./tmp/browser",
|
|
@@ -71,16 +72,39 @@ class WebScraper
|
|
|
71
72
|
ignoreAllFlags: false,
|
|
72
73
|
proxy: puppeteerRealProxy
|
|
73
74
|
}
|
|
75
|
+
this.puppeteerBrowser = null;
|
|
76
|
+
this.puppeteerPage = null;
|
|
74
77
|
}
|
|
75
78
|
|
|
76
79
|
async start ()
|
|
77
80
|
{
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
81
|
+
try
|
|
82
|
+
{
|
|
83
|
+
if ( this.usePuppeteer )
|
|
84
|
+
{
|
|
85
|
+
let { browser, page } = await connect( this.puppeteerRealOptions )
|
|
86
|
+
this.puppeteerBrowser = browser;
|
|
87
|
+
this.puppeteerPage = page;
|
|
88
|
+
}
|
|
89
|
+
this.createOutputDirectory();
|
|
90
|
+
await this.fetchPage( this.startURL, 0 );
|
|
91
|
+
this.createJSONLFile();
|
|
92
|
+
this.saveNumberedTextFiles();
|
|
93
|
+
this.createCSVFile();
|
|
94
|
+
console.log( "Scraping completed." );
|
|
95
|
+
}
|
|
96
|
+
catch ( error )
|
|
97
|
+
{
|
|
98
|
+
console.error( "Error:", error );
|
|
99
|
+
throw error;
|
|
100
|
+
}
|
|
101
|
+
finally
|
|
102
|
+
{
|
|
103
|
+
if ( this.puppeteerBrowser )
|
|
104
|
+
{
|
|
105
|
+
await this.puppeteerBrowser.close(); // Close the browser after scraping
|
|
106
|
+
}
|
|
107
|
+
}
|
|
84
108
|
}
|
|
85
109
|
|
|
86
110
|
async fetchPage ( url, depth )
|
|
@@ -152,52 +176,53 @@ class WebScraper
|
|
|
152
176
|
console.error( `Error fetching ${url}:`, error.message );
|
|
153
177
|
if ( error.status = 403 && this.usePuppeteer )
|
|
154
178
|
{
|
|
155
|
-
let { browser, page } = await connect( this.puppeteerRealOptions )
|
|
156
|
-
|
|
157
179
|
// const browser = await puppeteer.launch( this.puppeteerOptions );
|
|
158
180
|
// const page = await browser.newPage();
|
|
159
181
|
try
|
|
160
182
|
{
|
|
161
|
-
let
|
|
183
|
+
let result;
|
|
162
184
|
for ( let index = 0; index < 10; index++ )
|
|
163
185
|
{
|
|
164
|
-
const pages = await browser.pages();
|
|
165
|
-
page = pages[0];
|
|
166
|
-
page.setDefaultNavigationTimeout( 30000 )
|
|
167
|
-
await page.goto( url );
|
|
168
186
|
console.log( `Please solve the CAPTCHA on the opened browser window for ${url}` );
|
|
169
|
-
await this.
|
|
170
|
-
|
|
171
|
-
if ( this.isValidContent( htmlContent ) )
|
|
187
|
+
result = await this.goToUrl( url ) ;
|
|
188
|
+
if ( this.isValidContent( result.htmlContent ) )
|
|
172
189
|
{
|
|
173
190
|
break
|
|
174
191
|
}
|
|
175
|
-
page = pages[0];
|
|
176
|
-
page.setDefaultNavigationTimeout( 30000 )
|
|
177
|
-
await this.waitForPageToLoad( page );
|
|
178
|
-
htmlContent = await page.content();
|
|
179
|
-
if ( this.isValidContent( htmlContent ) )
|
|
180
|
-
{
|
|
181
|
-
break
|
|
182
|
-
}
|
|
183
|
-
await page.goto( url );
|
|
184
192
|
}
|
|
185
|
-
return htmlContent;
|
|
193
|
+
return result.htmlContent;
|
|
186
194
|
}
|
|
187
195
|
catch ( error )
|
|
188
196
|
{
|
|
189
197
|
console.error( `Error solving CAPTCHA for ${url}:`, error.message, error );
|
|
190
198
|
throw error;
|
|
191
199
|
}
|
|
192
|
-
|
|
193
|
-
{
|
|
194
|
-
await browser.close(); // Close the browser after scraping
|
|
195
|
-
}
|
|
200
|
+
|
|
196
201
|
}
|
|
197
202
|
throw error;
|
|
198
203
|
}
|
|
199
204
|
}
|
|
200
205
|
|
|
206
|
+
async goToUrl ( url )
|
|
207
|
+
{
|
|
208
|
+
let pages = await this.puppeteerBrowser.pages();
|
|
209
|
+
let page = pages[0];
|
|
210
|
+
page.setDefaultNavigationTimeout( 10000 );
|
|
211
|
+
await page.goto( url );
|
|
212
|
+
pages = await this.puppeteerBrowser.pages();
|
|
213
|
+
page = pages[0];
|
|
214
|
+
page.setDefaultNavigationTimeout( 10000 );
|
|
215
|
+
await this.waitForPageToLoad( page );
|
|
216
|
+
pages = await this.puppeteerBrowser.pages();
|
|
217
|
+
page = pages[0];
|
|
218
|
+
page.setDefaultNavigationTimeout( 10000 );
|
|
219
|
+
if ( page )
|
|
220
|
+
{
|
|
221
|
+
let htmlContent = await page.content();
|
|
222
|
+
return { pages, page, htmlContent };
|
|
223
|
+
}
|
|
224
|
+
}
|
|
225
|
+
|
|
201
226
|
async waitForPageToLoad ( page )
|
|
202
227
|
{
|
|
203
228
|
try
|