clean-web-scraper 3.8.7 → 3.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/example-usage.js +7 -6
  2. package/main.js +74 -108
  3. package/package.json +1 -1
package/example-usage.js CHANGED
@@ -129,9 +129,9 @@ async function electronicintifada ( enable )
     csvOutputPath: "./dataset/electronicintifada/train.csv",
     includeMetadata: true,
     metadataFields: ["author", "title", "description", "dateScrapedDate"],
-    maxDepth: 13,
+    maxDepth: 16,
     maxArticles: 2000,
-    concurrencyLimit: 3,
+    concurrencyLimit: 2,
     axiosHeaders: headers,
     retryDelay: 10000,
     axiosProxy: {
@@ -198,18 +198,19 @@ async function mondoweiss ( enable )
     textOutputPath: "./dataset/mondoweiss/texts",
     csvOutputPath: "./dataset/mondoweiss/train.csv",
     maxArticles: 2500,
-    maxRetries: 2,
-    concurrencyLimit: 4,
+    maxRetries: 3,
+    concurrencyLimit: 3,
     axiosHeaders: headers,
     axiosProxy: {
       host: "localhost",
       port: 2080,
       protocol: "http"
     },
-    maxDepth: 10,
+    maxDepth: 15,
     retryDelay: 10000,
     includeMetadata: true,
-    metadataFields: ["author", "title", "description", "dateScrapedDate"]
+    metadataFields: ["author", "title", "description", "dateScrapedDate"],
+    useProxyAsFallback: true
   });
   if ( enable )
   {
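
Note on the mondoweiss changes: besides the deeper maxDepth and adjusted retry settings, the example now opts into useProxyAsFallback. Per the main.js retry changes further down, this keeps requests direct and only merges axiosProxy into the request options on the final retry attempt. A minimal illustrative config (only the proxy-related keys come from this diff; the baseURL is a hypothetical placeholder):

const scraper = new WebScraper({
    baseURL: "https://example.com", // hypothetical; this hunk does not show baseURL
    axiosProxy: { host: "localhost", port: 2080, protocol: "http" },
    useProxyAsFallback: true // proxy is applied only on the last retry attempt
});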
package/main.js CHANGED
@@ -7,45 +7,7 @@ const { connect } = require( "puppeteer-real-browser" )
 
 class WebScraper
 {
-  constructor ({
-    // Base configuration
-    baseURL,
-    startURL,
-    strictBaseURL,
-    maxDepth,
-    maxArticles,
-    concurrencyLimit,
-    maxRetries,
-    retryDelay,
-
-    // URL filtering
-    excludeList = [],
-    exactExcludeList = [],
-    filterFileTypes,
-    excludedFileTypes,
-    removeURLFragment,
-
-    // Output paths
-    scrapResultPath = "./dataset",
-    jsonlOutputPath,
-    textOutputPath,
-    csvOutputPath,
-
-    // Metadata options
-    includeMetadata = false,
-    metadataFields = [],
-
-    // Network options
-    axiosHeaders,
-    axiosProxy,
-    useProxyAsFallback,
-
-    // Puppeteer options
-    usePuppeteer,
-    puppeteerProxy, // e.g. http://127.0.0.1:2080
-    puppeteerExecutablePath,
-    puppeteerRealProxy
-  })
+  constructor ( config )
   {
     // Base configuration
     this.baseURL = baseURL;
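
The constructor now takes a single config object instead of a long destructured parameter list, so the call sites in example-usage.js are unchanged (they already pass one object literal). One thing to watch: the context lines in this hunk still assign from bare identifiers (this.baseURL = baseURL) with no visible destructuring of config between the opening brace and the first assignment, so either the destructuring happens in lines this diff does not show, or those reads would throw a ReferenceError at construction time. A sketch of the destructuring the body appears to rely on (illustrative, not shown in this diff):

constructor ( config )
{
    const { baseURL, startURL, strictBaseURL /* , ...the remaining options */ } = config;
    this.baseURL = baseURL;
    // ...
}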
@@ -54,11 +16,10 @@ class WebScraper
     this.maxDepth = maxDepth || Infinity;
     this.maxArticles = maxArticles || Infinity;
     this.concurrencyLimit = concurrencyLimit || 2;
-    this.maxRetries = maxRetries || 5;
-    this.retryDelay = retryDelay || 40000;
+    this.crawlingDelay = crawlingDelay || 1000;
 
     // Output paths setup
-    this.scrapResultPath = scrapResultPath;
+    this.scrapResultPath = scrapResultPath || "./dataset";
     this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
     this.textOutputPathWithMeta = `${this.textOutputPath }_with_metadata`;
     this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
@@ -67,8 +28,8 @@ class WebScraper
     this.csvOutputPathWithMeta = this.csvOutputPath.replace( ".csv", "_with_metadata.csv" );
 
     // Metadata configuration
-    this.includeMetadata = includeMetadata;
-    this.metadataFields = new Set( metadataFields );
+    this.includeMetadata = includeMetadata || false;
+    this.metadataFields = new Set( metadataFields || [] );
 
     // URL filtering setup
     this.visited = new Set();
@@ -81,6 +42,8 @@ class WebScraper
     // Network configuration
     this.axiosHeaders = axiosHeaders;
     this.axiosProxy = axiosProxy;
+    this.axiosMaxRetries = axiosMaxRetries || 5;
+    this.axiosRetryDelay = axiosRetryDelay || 40000;
     this.useProxyAsFallback = useProxyAsFallback || false;
     this.axiosOptions = {};
     if ( this.axiosHeaders )
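
Two renames land here: maxRetries becomes axiosMaxRetries (default 5) and retryDelay becomes axiosRetryDelay (default 40000 ms), scoping both clearly to the HTTP layer, while the new crawlingDelay (default 1000 ms, introduced earlier in the constructor) replaces the fixed 5000 ms sleep between page fetches. Note that example-usage.js above still passes maxRetries and retryDelay under their old names, which the renamed reads here would no longer pick up, assuming the constructor reads only the new names. A hedged sketch of a 3.9.0-style config using the new names:

const scraper = new WebScraper({
    crawlingDelay: 2000, // pause between page fetches; was a hardcoded 5000 ms sleep in 3.8.x
    axiosMaxRetries: 3, // was maxRetries
    axiosRetryDelay: 10000 // was retryDelay
});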
@@ -97,7 +60,7 @@ class WebScraper
 
     // Puppeteer configuration
     this.usePuppeteer = usePuppeteer || false;
-    this.puppeteerProxy = puppeteerProxy;
+    this.puppeteerProxy = puppeteerProxy; // http://127.0.0.1:2080
     this.puppeteerExecutablePath = puppeteerExecutablePath;
     this.puppeteerRealProxy = puppeteerRealProxy;
     this.configurePuppeteer( );
@@ -114,7 +77,7 @@ class WebScraper
       this.puppeteerPage = page;
     }
     this.createOutputDirectory();
-    await this.fetchPage( this.startURL, 0 );
+    await this.crawl( this.startURL, 0 );
     this.createJSONLFile();
     this.saveNumberedTextFiles();
     this.createCSVFile();
@@ -134,84 +97,87 @@ class WebScraper
     }
   }
 
-  async fetchPage ( url, depth )
+  async crawl ( initialUrl, initialDepth = 0 )
   {
-    if ( this.removeURLFragment )
-    {
-      url = url.split( "#" )[0];
-    }
-    if ( this.hasReachedMax( depth ) )
+    const queue = [{ url: initialUrl, depth: initialDepth }];
+    for ( let i = 0; i < queue.length; i++ )
     {
-      return;
-    }
-    if ( this.visited.has( url ) )
-    {
-      console.log( `Already visited: ${url}` );
-      return;
-    }
-    this.visited.add( url );
-    if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
-    {
-      return;
-    }
-    try
-    {
-      await WebScraper.sleep( 5000 );
-      const data = await this.fetchContent( url );
-      if ( !data ) return;
-      const dom = new JSDOM( data, { url });
-      const { document } = dom.window;
+      console.log( `Processing URL: ${queue[i].url}` );
+      let { url, depth } = queue[i];
+      if ( this.hasReachedMax( depth ) )
+      {
+        continue;
+      }
+      if ( this.removeURLFragment )
+      {
+        url = url.split( "#" )[0];
+      }
+      if ( this.visited.has( url ) )
+      {
+        console.log( `Already visited: ${url}` );
+        continue;
+      }
+      this.visited.add( url );
 
-      if ( !this.isExcluded( url ) )
+      if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
       {
-        const reader = new Readability( document, { charThreshold: 500, nbTopCandidates: 20 });
-        const article = reader.parse();
+        continue;
+      }
+
+      try
+      {
+        await WebScraper.sleep( this.crawlingDelay );
+        const data = await this.fetchContent( url );
+        if ( !data ) continue;
+
+        const dom = new JSDOM( data, { url });
+        const { document } = dom.window;
 
-        if ( article )
+        if ( !this.isExcluded( url ) )
         {
-          if ( this.hasValidPageContent( article.textContent ) )
+          const reader = new Readability( document, {
+            charThreshold: 500,
+            nbTopCandidates: 20
+          });
+          const article = reader.parse();
+
+          if ( article )
           {
-            const metadata = this.extractMetadata( url, document );
-            this.saveArticle( url, article.textContent, metadata );
+            if ( this.hasValidPageContent( article.textContent ) )
+            {
+              const metadata = this.extractMetadata( url, document );
+              this.saveArticle( url, article.textContent, metadata );
+            }
+            else
+            {
+              console.error( `Invalid content found at ${url}` );
+            }
           }
           else
           {
-            console.error( `Invalid content found at ${url}` );
+            console.error( `No readable content found at ${url}` );
           }
         }
-        else
-        {
-          console.error( `No readable content found at ${url}` );
-        }
-      }
 
-      const links = this.extractLinks( data );
-      const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
-
-      for ( let i = 0; i < unvisitedLinks.length; i += this.concurrencyLimit )
-      {
-        if ( this.hasReachedMax( depth ) )
-        {
-          return;
-        }
-        const batch = unvisitedLinks.slice( i, i + this.concurrencyLimit );
-        const results = await Promise.allSettled( batch.map( link => { return this.fetchPage( link, depth + 1 ) }) );
+        const links = this.extractLinks( data );
+        const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
 
-        results.forEach( ( result, index ) =>
+        for ( const link of unvisitedLinks )
         {
-          if ( result.status === "rejected" )
+          if ( !this.hasReachedMax( depth ) )
           {
-            console.error( `Failed to fetch ${batch[index]}: ${result.reason}` );
+            queue.push({ url: link, depth: depth + 1 });
           }
-        });
+        }
+      }
+      catch ( error )
+      {
+        console.error( `Error fetching ${url}:`, error.message, error.code );
       }
-    }
-    catch ( error )
-    {
-      console.error( `Error fetching ${url}:`, error.message, error.code );
-    }
   }
 
+
   async fetchContent ( url )
   {
     try
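
This is the core change of 3.9.0: the recursive fetchPage, which descended depth-first and fanned out each page's links in Promise.allSettled batches of size concurrencyLimit, is replaced by an iterative crawl that walks a growing array as a FIFO queue, one URL at a time. Crawling therefore becomes breadth-first and sequential, and in this hunk concurrencyLimit no longer drives parallel fetches. The queue pattern, reduced to a self-contained sketch (fetchPage and extractLinks here are caller-supplied stand-ins for the class methods):

// Breadth-first crawl: a plain array serves as the FIFO queue, and the
// index-based loop naturally picks up entries pushed during iteration.
async function crawl ( startUrl, maxDepth, fetchPage, extractLinks )
{
    const visited = new Set();
    const queue = [{ url: startUrl, depth: 0 }];
    for ( let i = 0; i < queue.length; i++ )
    {
        const { url, depth } = queue[i];
        if ( depth > maxDepth || visited.has( url ) ) continue;
        visited.add( url );
        const html = await fetchPage( url );
        if ( !html ) continue;
        for ( const link of extractLinks( html ) )
        {
            if ( !visited.has( link ) ) queue.push({ url: link, depth: depth + 1 });
        }
    }
}

One trade-off of this pattern is memory: processed entries are never shifted off the array, so the queue grows with every URL ever discovered.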
@@ -581,7 +547,7 @@ class WebScraper
       ...this.axiosOptions,
     };
 
-    for ( let attempt = 1; attempt <= this.maxRetries; attempt++ )
+    for ( let attempt = 1; attempt <= this.axiosMaxRetries; attempt++ )
     {
       try
       {
@@ -589,7 +555,7 @@ class WebScraper
         {
           break;
         }
-        if ( attempt === this.maxRetries && this.useProxyAsFallback && this.axiosProxy )
+        if ( attempt === this.axiosMaxRetries && this.useProxyAsFallback && this.axiosProxy )
         {
           options = {
             ...options,
@@ -601,9 +567,9 @@ class WebScraper
       }
       catch ( error )
       {
-        if ( attempt >= this.maxRetries ) throw error;
-        await WebScraper.sleep( this.retryDelay * attempt );
-        console.error( `Retrying request to ${url} (Attempt ${attempt + 1}/${this.maxRetries})`, error.message, error.code );
+        if ( attempt >= this.axiosMaxRetries ) throw error;
+        await WebScraper.sleep( this.axiosRetryDelay * attempt );
+        console.error( `Retrying request to ${url} (Attempt ${attempt + 1}/${this.axiosMaxRetries})`, error.message, error.code );
       }
     }
     throw new Error( "Max retries reached" );
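
The retry loop itself is unchanged apart from reading the renamed options: it backs off linearly (axiosRetryDelay * attempt) between attempts and merges axiosProxy into the request options only once the final attempt is reached with useProxyAsFallback set. A self-contained sketch of that pattern, assuming plain axios (requestWithRetry and cfg are illustrative names, not the package's API):

const axios = require( "axios" );

const sleep = ( ms ) => { return new Promise( ( resolve ) => { return setTimeout( resolve, ms ) }) };

async function requestWithRetry ( url, options, cfg )
{
    for ( let attempt = 1; attempt <= cfg.axiosMaxRetries; attempt++ )
    {
        if ( attempt === cfg.axiosMaxRetries && cfg.useProxyAsFallback && cfg.axiosProxy )
        {
            options = { ...options, proxy: cfg.axiosProxy }; // last attempt goes through the proxy
        }
        try
        {
            const response = await axios.get( url, options );
            return response.data;
        }
        catch ( error )
        {
            if ( attempt >= cfg.axiosMaxRetries ) throw error;
            await sleep( cfg.axiosRetryDelay * attempt ); // linear backoff: 1x, 2x, 3x the base delay
        }
    }
    throw new Error( "Max retries reached" );
}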
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "clean-web-scraper",
-  "version": "3.8.7",
+  "version": "3.9.0",
   "main": "main.js",
   "scripts": {
     "start": "node main.js",