clean-web-scraper 4.0.3 → 4.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -57,7 +57,8 @@ const scraper = new WebScraper({
  maxDepth: Infinity, // Optional: Maximum crawling depth
  maxArticles: Infinity, // Optional: Maximum articles to scrape
  crawlingDelay: 1000, // Optional: Delay between requests (ms)
-
+ batchSize: 5, // Optional: Number of URLs to process concurrently
+
  // Network options
  axiosHeaders: {}, // Optional: Custom HTTP headers
  axiosProxy: { // Optional: HTTP/HTTPS proxy
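
The new batchSize option sits alongside the existing crawl limits documented above. A minimal usage sketch, assuming the package's default export is the WebScraper class and that crawl() (refactored in main.js below) is the entry point; the URL and option values are illustrative:

const WebScraper = require( "clean-web-scraper" );

const scraper = new WebScraper({
	maxDepth: 2,         // stop after two link hops
	maxArticles: 50,     // cap on saved articles
	crawlingDelay: 1000, // 1 s pause before each fetch
	batchSize: 5         // new in 4.0.4: up to 5 URLs processed concurrently
});

// Starting URL is purely illustrative; see the crawl() refactor in main.js below.
scraper.crawl( "https://example.com" );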
package/example-usage.js CHANGED
@@ -109,7 +109,8 @@ async function electronicintifada ( enable )
  "https://electronicintifada.net/news",
  "https://electronicintifada.net/opinion",
  "https://electronicintifada.net/about-ei",
- "https://electronicintifada.net/review"
+ "https://electronicintifada.net/review",
+ "https://electronicintifada.net/artmusicculture"
  ],
  exactExcludeList: [
  "https://electronicintifada.net",
@@ -133,7 +134,7 @@ async function electronicintifada ( enable )
  protocol: "http"
  },
  useProxyAsFallback: true,
- crawlingDelay: 0
+ crawlingDelay: 1
  };
  return await runScraper( config, enable );
  }
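
On the crawlingDelay change: main.js stores the delay with config.crawlingDelay ?? 1000, so an explicit 0 survives the default, but the sleep itself sits behind a truthiness check, meaning 0 disables the pause entirely while 1 re-enables a minimal one. A standalone sketch of that interaction (effectiveDelay is a stand-in helper, values are illustrative):

// Mirrors the two lines visible in main.js: `??` keeps an explicit 0,
// but `if ( this.crawlingDelay )` treats 0 as "no delay at all".
function effectiveDelay ( configuredDelay )
{
	const crawlingDelay = configuredDelay ?? 1000; // default only when undefined/null
	return crawlingDelay ? crawlingDelay : 0;      // falsy values skip the sleep
}

console.log( effectiveDelay( undefined ) ); // 1000 (default)
console.log( effectiveDelay( 0 ) );         // 0 -> sleep skipped (old example config)
console.log( effectiveDelay( 1 ) );         // 1 -> ~1 ms pause per URL (new example config)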
package/main.js CHANGED
@@ -16,6 +16,7 @@ class WebScraper
  this.maxDepth = config.maxDepth || Infinity;
  this.maxArticles = config.maxArticles || Infinity;
  this.crawlingDelay = config.crawlingDelay ?? 1000;
+ this.batchSize = config.batchSize || 5;

  // Output paths setup
  this.scrapResultPath = config.scrapResultPath || "./dataset";
@@ -97,83 +98,88 @@ class WebScraper
  async crawl ( initialUrl, initialDepth = 0 )
  {
  const queue = [{ url: initialUrl, depth: initialDepth }];
- for ( let i = 0; i < queue.length; i++ )
+ while ( queue.length > 0 )
  {
- let { url, depth } = queue[i];
- console.log( `Processing URL: ${queue[i].url}` );
- if ( this.hasReachedMax( depth ) )
+ const currentBatch = queue.splice( 0, this.batchSize );
+ await Promise.all( currentBatch.map( async ({ url, depth }) =>
  {
- continue;
- }
- if ( this.removeURLFragment )
- {
- url = url.split( "#" )[0];
- }
- if ( this.visited.has( url ) )
+ await this.processUrl( url, depth, queue );
+ }) );
+ }
+ }
+
+ async processUrl ( url, depth, queue )
+ {
+ console.log( `Processing URL: ${url}` );
+ if ( this.hasReachedMax( depth ) )
+ {
+ return;
+ }
+ if ( this.removeURLFragment )
+ {
+ url = url.split( "#" )[0];
+ }
+ if ( this.visited.has( url ) )
+ {
+ console.log( `Already visited: ${url}` );
+ return;
+ }
+ this.visited.add( url );
+ if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
+ {
+ return;
+ }
+ try
+ {
+ if ( this.crawlingDelay )
  {
- console.log( `Already visited: ${url}` );
- continue;
+ await WebScraper.sleep( this.crawlingDelay );
  }
- this.visited.add( url );
-
- if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
+ const data = await this.fetchContent( url );
+ if ( !data )
  {
- continue;
+ return;
  }
-
- try
+ const dom = new JSDOM( data, { url });
+ const { document } = dom.window;
+ if ( !this.isExcluded( url ) )
  {
- if ( this.crawlingDelay )
- {
- await WebScraper.sleep( this.crawlingDelay );
- }
- const data = await this.fetchContent( url );
- if ( !data ) continue;
-
- const dom = new JSDOM( data, { url });
- const { document } = dom.window;
-
- if ( !this.isExcluded( url ) )
+ const reader = new Readability( document );
+ const article = reader.parse();
+ if ( article )
  {
- const reader = new Readability( document );
- const article = reader.parse();
- if ( article )
+ if ( this.hasValidPageContent( article.textContent ) )
  {
- if ( this.hasValidPageContent( article.textContent ) )
- {
- const metadata = this.extractMetadata( url, document );
- metadata.articleTitle = article.title || "";
- this.saveArticle( url, article.textContent, metadata );
- }
- else
- {
- console.error( `Invalid content found at ${url}` );
- }
+ const metadata = this.extractMetadata( url, document );
+ metadata.articleTitle = article.title || "";
+ this.saveArticle( url, article.textContent, metadata );
  }
  else
  {
- console.error( `No readable content found at ${url}` );
+ console.error( `Invalid content found at ${url}` );
  }
  }
-
- const links = this.extractLinks( data );
- const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
- for ( const link of unvisitedLinks )
+ else
  {
- if ( !this.hasReachedMax( depth ) )
- {
- queue.push({ url: link, depth: depth + 1 });
- }
+ console.error( `No readable content found at ${url}` );
  }
  }
- catch ( error )
+ const links = this.extractLinks( data );
+ const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
+ for ( const link of unvisitedLinks )
  {
- console.error( `Error fetching ${url}:`, error.message, error.code );
+ if ( !this.hasReachedMax( depth ) )
+ {
+ queue.push({ url: link, depth: depth + 1 });
+ }
  }
  }
+ catch ( error )
+ {
+ console.error( `Error fetching ${url}:`, error.message, error.code );
+ }
  }

-
  async fetchContent ( url )
  {
  try
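
The heart of this release is the crawl() rewrite above: instead of walking the queue with a sequential for loop, URLs are spliced off in groups of batchSize and processed concurrently with Promise.all, while each processUrl() call can push newly discovered links back onto the shared queue. A self-contained sketch of that pattern; drainInBatches and processItem are stand-ins, not part of the package:

// Generic batched-BFS pattern: take up to `batchSize` items off the queue,
// process them concurrently, and let each worker append newly discovered
// items back onto the shared queue before the next batch is spliced off.
async function drainInBatches ( initialItem, batchSize, processItem )
{
	const queue = [ initialItem ];
	while ( queue.length > 0 )
	{
		const currentBatch = queue.splice( 0, batchSize );
		await Promise.all( currentBatch.map( async ( item ) =>
		{
			const discovered = await processItem( item );
			queue.push( ...discovered ); // all pushes land before the next splice
		}) );
	}
}

// Illustrative usage: each "page" yields two children until depth 2.
drainInBatches({ url: "https://example.com", depth: 0 }, 5, async ({ url, depth }) =>
{
	console.log( `visiting ${url} at depth ${depth}` );
	if ( depth >= 2 ) return [];
	return [ 1, 2 ].map( n => { return { url: `${url}/${n}`, depth: depth + 1 } });
});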
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "clean-web-scraper",
- "version": "4.0.3",
+ "version": "4.0.4",
  "main": "main.js",
  "scripts": {
  "start": "node main.js",