clean-web-scraper 3.3.4 → 3.3.5

This diff shows the changes between publicly available versions of the package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/example-usage.js CHANGED
@@ -82,7 +82,8 @@ async function bdsmovement ()
82
82
  puppeteerRealProxy: {
83
83
  host: "socks5://127.0.0.1",
84
84
  port: "2080",
85
- }
85
+ },
86
+ usePuppeteer: true
86
87
  });
87
88
  await scraper.start();
88
89
  return scraper;
@@ -114,13 +115,13 @@ async function electronicintifada ()
114
115
 
115
116
  void async function main ()
116
117
  {
117
- // const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
118
+ const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
118
119
  const decolonizepalestineScraper = await decolonizepalestine();
119
- // const bdsmovementScraper = await bdsmovement();
120
+ const bdsmovementScraper = await bdsmovement();
120
121
  await WebScraper.combineResults( "./dataset/combined", [
121
- // khameneiIrFreePalestineTagScraper,
122
+ khameneiIrFreePalestineTagScraper,
122
123
  decolonizepalestineScraper,
123
- // bdsmovementScraper
124
+ bdsmovementScraper
124
125
  ] );
125
126
 
126
127
  // 4
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.3.4",
3
+ "version": "3.3.5",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -22,6 +22,7 @@ class WebScraper
22
22
  includeMetadata = false,
23
23
  metadataFields = [], // ['title', 'description', 'author', 'lastModified', etc.]
24
24
  headers,
25
+ usePuppeteer,
25
26
  puppeteerProxy, // e.g. http://127.0.0.1:2080
26
27
  puppeteerExecutablePath,
27
28
  puppeteerRealProxy
@@ -43,7 +44,7 @@ class WebScraper
43
44
  this.excludeList = this.normalizeExcludeList( excludeList );
44
45
  this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
45
46
  this.allProcessedContent = [];
46
- this.usePuppeteer = false;
47
+ this.usePuppeteer = usePuppeteer || false;
47
48
  this.puppeteerOptions = {
48
49
  headless: false,
49
50
  userDataDir: "./tmp/browser",
@@ -71,16 +72,39 @@ class WebScraper
71
72
  ignoreAllFlags: false,
72
73
  proxy: puppeteerRealProxy
73
74
  }
75
+ this.puppeteerBrowser = null;
76
+ this.puppeteerPage = null;
74
77
  }
75
78
 
76
79
  async start ()
77
80
  {
78
- this.createOutputDirectory();
79
- await this.fetchPage( this.startURL, 0 );
80
- this.createJSONLFile();
81
- this.saveNumberedTextFiles();
82
- this.createCSVFile();
83
- console.log( "Scraping completed." );
81
+ try
82
+ {
83
+ if ( this.usePuppeteer )
84
+ {
85
+ let { browser, page } = await connect( this.puppeteerRealOptions )
86
+ this.puppeteerBrowser = browser;
87
+ this.puppeteerPage = page;
88
+ }
89
+ this.createOutputDirectory();
90
+ await this.fetchPage( this.startURL, 0 );
91
+ this.createJSONLFile();
92
+ this.saveNumberedTextFiles();
93
+ this.createCSVFile();
94
+ console.log( "Scraping completed." );
95
+ }
96
+ catch ( error )
97
+ {
98
+ console.error( "Error:", error );
99
+ throw error;
100
+ }
101
+ finally
102
+ {
103
+ if ( this.puppeteerBrowser )
104
+ {
105
+ await this.puppeteerBrowser.close(); // Close the browser after scraping
106
+ }
107
+ }
84
108
  }
85
109
 
86
110
  async fetchPage ( url, depth )
@@ -152,52 +176,53 @@ class WebScraper
152
176
  console.error( `Error fetching ${url}:`, error.message );
153
177
  if ( error.status = 403 && this.usePuppeteer )
154
178
  {
155
- let { browser, page } = await connect( this.puppeteerRealOptions )
156
-
157
179
  // const browser = await puppeteer.launch( this.puppeteerOptions );
158
180
  // const page = await browser.newPage();
159
181
  try
160
182
  {
161
- let htmlContent;
183
+ let result;
162
184
  for ( let index = 0; index < 10; index++ )
163
185
  {
164
- const pages = await browser.pages();
165
- page = pages[0];
166
- page.setDefaultNavigationTimeout( 30000 )
167
- await page.goto( url );
168
186
  console.log( `Please solve the CAPTCHA on the opened browser window for ${url}` );
169
- await this.waitForPageToLoad( page );
170
- htmlContent = await page.content();
171
- if ( this.isValidContent( htmlContent ) )
187
+ result = await this.goToUrl( url ) ;
188
+ if ( this.isValidContent( result.htmlContent ) )
172
189
  {
173
190
  break
174
191
  }
175
- page = pages[0];
176
- page.setDefaultNavigationTimeout( 30000 )
177
- await this.waitForPageToLoad( page );
178
- htmlContent = await page.content();
179
- if ( this.isValidContent( htmlContent ) )
180
- {
181
- break
182
- }
183
- await page.goto( url );
184
192
  }
185
- return htmlContent;
193
+ return result.htmlContent;
186
194
  }
187
195
  catch ( error )
188
196
  {
189
197
  console.error( `Error solving CAPTCHA for ${url}:`, error.message, error );
190
198
  throw error;
191
199
  }
192
- finally
193
- {
194
- await browser.close(); // Close the browser after scraping
195
- }
200
+
196
201
  }
197
202
  throw error;
198
203
  }
199
204
  }
200
205
 
206
+ async goToUrl ( url )
207
+ {
208
+ let pages = await this.puppeteerBrowser.pages();
209
+ let page = pages[0];
210
+ page.setDefaultNavigationTimeout( 10000 );
211
+ await page.goto( url );
212
+ pages = await this.puppeteerBrowser.pages();
213
+ page = pages[0];
214
+ page.setDefaultNavigationTimeout( 10000 );
215
+ await this.waitForPageToLoad( page );
216
+ pages = await this.puppeteerBrowser.pages();
217
+ page = pages[0];
218
+ page.setDefaultNavigationTimeout( 10000 );
219
+ if ( page )
220
+ {
221
+ let htmlContent = await page.content();
222
+ return { pages, page, htmlContent };
223
+ }
224
+ }
225
+
201
226
  async waitForPageToLoad ( page )
202
227
  {
203
228
  try