clean-web-scraper 3.3.4 → 3.3.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/example-usage.js CHANGED
@@ -82,7 +82,8 @@ async function bdsmovement ()
82
82
  puppeteerRealProxy: {
83
83
  host: "socks5://127.0.0.1",
84
84
  port: "2080",
85
- }
85
+ },
86
+ // usePuppeteer: true
86
87
  });
87
88
  await scraper.start();
88
89
  return scraper;
@@ -94,12 +95,19 @@ async function electronicintifada ()
94
95
  const scraper = new WebScraper({
95
96
  baseURL: "https://electronicintifada.net",
96
97
  excludeList: [
97
- "https://electronicintifada.net/press-area",
98
- "https://electronicintifada.net/privacy-policy",
99
- "https://electronicintifada.net/get-involved/join-a-bds-campaign",
100
- "https://electronicintifada.net/donate_",
101
- "https://electronicintifada.net/user",
102
- "https://electronicintifada.net/admin"
98
+ "https://electronicintifada.net/updates",
99
+ "https://electronicintifada.net/taxonomy/term/",
100
+ "https://electronicintifada.net/tags/",
101
+ "https://electronicintifada.net/blog",
102
+ "https://electronicintifada.net/people",
103
+ "https://electronicintifada.net/location"
104
+ ],
105
+ exactExcludeList: [
106
+ "https://electronicintifada.net",
107
+ "https://electronicintifada.net/blog",
108
+ "https://electronicintifada.net/news",
109
+ "https://electronicintifada.net/opinion",
110
+ "https://electronicintifada.net/review",
103
111
  ],
104
112
  scrapResultPath: "./dataset/electronicintifada/website",
105
113
  jsonlOutputPath: "./dataset/electronicintifada/train.jsonl",
@@ -114,13 +122,15 @@ async function electronicintifada ()
114
122
 
115
123
  void async function main ()
116
124
  {
117
- // const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
125
+ const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
118
126
  const decolonizepalestineScraper = await decolonizepalestine();
119
- // const bdsmovementScraper = await bdsmovement();
127
+ const bdsmovementScraper = await bdsmovement();
128
+ const electronicintifadaScraper = await electronicintifada();
120
129
  await WebScraper.combineResults( "./dataset/combined", [
121
- // khameneiIrFreePalestineTagScraper,
130
+ khameneiIrFreePalestineTagScraper,
122
131
  decolonizepalestineScraper,
123
- // bdsmovementScraper
132
+ bdsmovementScraper,
133
+ electronicintifadaScraper
124
134
  ] );
125
135
 
126
136
  // 4
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.3.4",
3
+ "version": "3.3.6",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -22,6 +22,7 @@ class WebScraper
22
22
  includeMetadata = false,
23
23
  metadataFields = [], // ['title', 'description', 'author', 'lastModified', etc.]
24
24
  headers,
25
+ usePuppeteer,
25
26
  puppeteerProxy, // e.g. http://127.0.0.1:2080
26
27
  puppeteerExecutablePath,
27
28
  puppeteerRealProxy
@@ -43,7 +44,7 @@ class WebScraper
43
44
  this.excludeList = this.normalizeExcludeList( excludeList );
44
45
  this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
45
46
  this.allProcessedContent = [];
46
- this.usePuppeteer = false;
47
+ this.usePuppeteer = usePuppeteer || false;
47
48
  this.puppeteerOptions = {
48
49
  headless: false,
49
50
  userDataDir: "./tmp/browser",
@@ -71,16 +72,39 @@ class WebScraper
71
72
  ignoreAllFlags: false,
72
73
  proxy: puppeteerRealProxy
73
74
  }
75
+ this.puppeteerBrowser = null;
76
+ this.puppeteerPage = null;
74
77
  }
75
78
 
76
79
  async start ()
77
80
  {
78
- this.createOutputDirectory();
79
- await this.fetchPage( this.startURL, 0 );
80
- this.createJSONLFile();
81
- this.saveNumberedTextFiles();
82
- this.createCSVFile();
83
- console.log( "Scraping completed." );
81
+ try
82
+ {
83
+ if ( this.usePuppeteer )
84
+ {
85
+ let { browser, page } = await connect( this.puppeteerRealOptions )
86
+ this.puppeteerBrowser = browser;
87
+ this.puppeteerPage = page;
88
+ }
89
+ this.createOutputDirectory();
90
+ await this.fetchPage( this.startURL, 0 );
91
+ this.createJSONLFile();
92
+ this.saveNumberedTextFiles();
93
+ this.createCSVFile();
94
+ console.log( "Scraping completed." );
95
+ }
96
+ catch ( error )
97
+ {
98
+ console.error( "Error:", error );
99
+ throw error;
100
+ }
101
+ finally
102
+ {
103
+ if ( this.puppeteerBrowser )
104
+ {
105
+ await this.puppeteerBrowser.close(); // Close the browser after scraping
106
+ }
107
+ }
84
108
  }
85
109
 
86
110
  async fetchPage ( url, depth )
@@ -152,52 +176,53 @@ class WebScraper
152
176
  console.error( `Error fetching ${url}:`, error.message );
153
177
  if ( error.status = 403 && this.usePuppeteer )
154
178
  {
155
- let { browser, page } = await connect( this.puppeteerRealOptions )
156
-
157
179
  // const browser = await puppeteer.launch( this.puppeteerOptions );
158
180
  // const page = await browser.newPage();
159
181
  try
160
182
  {
161
- let htmlContent;
183
+ let result;
162
184
  for ( let index = 0; index < 10; index++ )
163
185
  {
164
- const pages = await browser.pages();
165
- page = pages[0];
166
- page.setDefaultNavigationTimeout( 30000 )
167
- await page.goto( url );
168
186
  console.log( `Please solve the CAPTCHA on the opened browser window for ${url}` );
169
- await this.waitForPageToLoad( page );
170
- htmlContent = await page.content();
171
- if ( this.isValidContent( htmlContent ) )
187
+ result = await this.goToUrl( url ) ;
188
+ if ( this.isValidContent( result.htmlContent ) )
172
189
  {
173
190
  break
174
191
  }
175
- page = pages[0];
176
- page.setDefaultNavigationTimeout( 30000 )
177
- await this.waitForPageToLoad( page );
178
- htmlContent = await page.content();
179
- if ( this.isValidContent( htmlContent ) )
180
- {
181
- break
182
- }
183
- await page.goto( url );
184
192
  }
185
- return htmlContent;
193
+ return result.htmlContent;
186
194
  }
187
195
  catch ( error )
188
196
  {
189
197
  console.error( `Error solving CAPTCHA for ${url}:`, error.message, error );
190
198
  throw error;
191
199
  }
192
- finally
193
- {
194
- await browser.close(); // Close the browser after scraping
195
- }
200
+
196
201
  }
197
202
  throw error;
198
203
  }
199
204
  }
200
205
 
206
+ async goToUrl ( url )
207
+ {
208
+ let pages = await this.puppeteerBrowser.pages();
209
+ let page = pages[0];
210
+ page.setDefaultNavigationTimeout( 10000 );
211
+ await page.goto( url );
212
+ pages = await this.puppeteerBrowser.pages();
213
+ page = pages[0];
214
+ page.setDefaultNavigationTimeout( 10000 );
215
+ await this.waitForPageToLoad( page );
216
+ pages = await this.puppeteerBrowser.pages();
217
+ page = pages[0];
218
+ page.setDefaultNavigationTimeout( 10000 );
219
+ if ( page )
220
+ {
221
+ let htmlContent = await page.content();
222
+ return { pages, page, htmlContent };
223
+ }
224
+ }
225
+
201
226
  async waitForPageToLoad ( page )
202
227
  {
203
228
  try
@@ -412,6 +437,23 @@ class WebScraper
412
437
  // processed = processed.replace(/\[.*?\]/g, ''); // Removes all content within square brackets
413
438
  // processed = processed.replace(/\(.*?\)/g, ''); // Removes all content within parentheses
414
439
 
440
+ // Remove specified words from the end of content, handling multiple occurrences
441
+ const wordsToTrim = ["Facebook", "Twitter", "Donate Now", "Instagram"];
442
+ let changed = true;
443
+
444
+ while ( changed )
445
+ {
446
+ changed = false;
447
+ for ( let i = 0; i < wordsToTrim.length; i++ )
448
+ {
449
+ const oldProcessed = processed;
450
+ processed = processed.replace( new RegExp( `\\s*${wordsToTrim[i]}\\s*$`, "g" ), "" ).trim();
451
+ if ( oldProcessed !== processed )
452
+ {
453
+ changed = true;
454
+ }
455
+ }
456
+ }
415
457
  return processed;
416
458
  }
417
459