clean-web-scraper 4.0.2 → 4.0.4

package/README.md CHANGED
@@ -57,7 +57,8 @@ const scraper = new WebScraper({
   maxDepth: Infinity, // Optional: Maximum crawling depth
   maxArticles: Infinity, // Optional: Maximum articles to scrape
   crawlingDelay: 1000, // Optional: Delay between requests (ms)
-
+  batchSize: 5, // Optional: Number of URLs to process concurrently
+
   // Network options
   axiosHeaders: {}, // Optional: Custom HTTP headers
   axiosProxy: { // Optional: HTTP/HTTPS proxy
@@ -86,7 +87,8 @@ const docsScraper = new WebScraper({
   scrapResultPath: './datasets/docs',
   maxDepth: 3, // Optional: Maximum depth for recursive crawling
   includeMetadata: true, // Optional: Include metadata in output files
-  metadataFields: ['title', 'description'] // Optional: Specify metadata fields to include
+  metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
+  // Optional: Specify metadata fields to include
 });
 
 // Scrape blog website
@@ -95,7 +97,8 @@ const blogScraper = new WebScraper({
   scrapResultPath: './datasets/blog',
   maxDepth: 3, // Optional: Maximum depth for recursive crawling
   includeMetadata: true, // Optional: Include metadata in output files
-  metadataFields: ['title', 'description'] // Optional: Specify metadata fields to include
+  metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
+  // Optional: Specify metadata fields to include
 });
 
 // Start scraping both sites
@@ -165,7 +168,7 @@ The actual article content starts here. This is the clean, processed text of the
 ### 📑 Text Files with Metadata (texts_with_metadata/*.txt)
 
 ```text
-title: My Awesome Page
+articleTitle: My Awesome Page
 description: This is a great article about coding
 author: John Doe
 language: en
@@ -186,8 +189,8 @@ The actual article content starts here. This is the clean, processed text of the
 ### 📈 JSONL with Metadata (train_with_metadata.jsonl)
 
 ```json
-{"text": "Article content", "metadata": {"title": "Page Title", "author": "John Doe"}}
-{"text": "Another article", "metadata": {"title": "Second Page", "author": "Jane Smith"}}
+{"text": "Article content", "metadata": {"articleTitle": "Page Title", "author": "John Doe"}}
+{"text": "Another article", "metadata": {"articleTitle": "Second Page", "author": "Jane Smith"}}
 ```
 
 ### 🗃️ JSON Files In Website Output (*.json)
@@ -212,7 +215,7 @@ text
 ### 📊 CSV with Metadata (train_with_metadata.csv)
 
 ```csv
-text,title,author,description
+text,articleTitle,author,description
 "Article content","Page Title","John Doe","Page description"
 "Another article","Second Page","Jane Smith","Another description"
 ```
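
The README changes above introduce the new `batchSize` option and expand the documented `metadataFields` list. As a rough sketch of how those options fit together — assuming a CommonJS import of the package's main export and omitting the start-URL option, whose name is not part of this diff:

```js
// Sketch only: option names come from the README hunks above; the import shape
// and anything not shown in this diff are assumptions.
const WebScraper = require( "clean-web-scraper" );

const docsScraper = new WebScraper({
  scrapResultPath: "./datasets/docs",   // where scraped output is written
  maxDepth: 3,                          // maximum depth for recursive crawling
  crawlingDelay: 1000,                  // delay between requests (ms)
  batchSize: 5,                         // new in 4.0.x: URLs processed concurrently
  includeMetadata: true,                // include metadata in output files
  metadataFields: [ "author", "articleTitle", "pageTitle", "description", "dateScrapedDate" ]
  // ...start URL and network options as documented in the README
});
```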
package/example-usage.js CHANGED
@@ -109,7 +109,8 @@ async function electronicintifada ( enable )
             "https://electronicintifada.net/news",
             "https://electronicintifada.net/opinion",
             "https://electronicintifada.net/about-ei",
-            "https://electronicintifada.net/review"
+            "https://electronicintifada.net/review",
+            "https://electronicintifada.net/artmusicculture"
         ],
         exactExcludeList: [
             "https://electronicintifada.net",
@@ -133,7 +134,7 @@ async function electronicintifada ( enable )
             protocol: "http"
         },
         useProxyAsFallback: true,
-        crawlingDelay: 0
+        crawlingDelay: 1
     };
     return await runScraper( config, enable );
 }
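
Besides the extra include URL, the behavioural tweak here is `crawlingDelay: 0` → `1`. In main.js (below) the delay sits behind a truthy check, so `0` skips the per-request pause entirely while `1` keeps it with the smallest possible wait. A hypothetical standalone illustration of that guard; `sleep` and `fetchWithDelay` are illustrative names, not part of the package's API:

```js
// Hypothetical sketch of the truthy-guarded delay used by the scraper's crawl loop.
const sleep = ( ms ) => { return new Promise( resolve => { return setTimeout( resolve, ms ); } ); };

async function fetchWithDelay ( url, crawlingDelay )
{
    if ( crawlingDelay )            // 0 is falsy: no pause; 1 (or more): pause before fetching
    {
        await sleep( crawlingDelay );
    }
    console.log( `fetching ${url}` );
}

// fetchWithDelay( "https://electronicintifada.net/news", 1 );
```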
package/main.js CHANGED
@@ -16,6 +16,7 @@ class WebScraper
         this.maxDepth = config.maxDepth || Infinity;
         this.maxArticles = config.maxArticles || Infinity;
         this.crawlingDelay = config.crawlingDelay ?? 1000;
+        this.batchSize = config.batchSize || 5;
 
         // Output paths setup
         this.scrapResultPath = config.scrapResultPath || "./dataset";
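
Note the defaulting operators above: `crawlingDelay` uses `??`, so an explicit `0` is respected, while the new `batchSize` uses `||`, so a configured `batchSize: 0` falls back to `5`. A quick standalone illustration of the difference:

```js
// Not package code; just the semantics of the two operators used above.
const config = { crawlingDelay: 0, batchSize: 0 };

console.log( config.crawlingDelay ?? 1000 ); // 0 -> ?? only substitutes null/undefined
console.log( config.batchSize || 5 );        // 5 -> || also substitutes falsy values like 0
```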
@@ -97,83 +98,88 @@ class WebScraper
     async crawl ( initialUrl, initialDepth = 0 )
     {
         const queue = [{ url: initialUrl, depth: initialDepth }];
-        for ( let i = 0; i < queue.length; i++ )
+        while ( queue.length > 0 )
         {
-            let { url, depth } = queue[i];
-            console.log( `Processing URL: ${queue[i].url}` );
-            if ( this.hasReachedMax( depth ) )
+            const currentBatch = queue.splice( 0, this.batchSize );
+            await Promise.all( currentBatch.map( async ({ url, depth }) =>
            {
-                continue;
-            }
-            if ( this.removeURLFragment )
-            {
-                url = url.split( "#" )[0];
-            }
-            if ( this.visited.has( url ) )
+                await this.processUrl( url, depth, queue );
+            }) );
+        }
+    }
+
+    async processUrl ( url, depth, queue )
+    {
+        console.log( `Processing URL: ${url}` );
+        if ( this.hasReachedMax( depth ) )
+        {
+            return;
+        }
+        if ( this.removeURLFragment )
+        {
+            url = url.split( "#" )[0];
+        }
+        if ( this.visited.has( url ) )
+        {
+            console.log( `Already visited: ${url}` );
+            return;
+        }
+        this.visited.add( url );
+        if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
+        {
+            return;
+        }
+        try
+        {
+            if ( this.crawlingDelay )
            {
-                console.log( `Already visited: ${url}` );
-                continue;
+                await WebScraper.sleep( this.crawlingDelay );
            }
-            this.visited.add( url );
-
-            if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
+            const data = await this.fetchContent( url );
+            if ( !data )
            {
-                continue;
+                return;
            }
-
-            try
+            const dom = new JSDOM( data, { url });
+            const { document } = dom.window;
+            if ( !this.isExcluded( url ) )
            {
-                if ( this.crawlingDelay )
-                {
-                    await WebScraper.sleep( this.crawlingDelay );
-                }
-                const data = await this.fetchContent( url );
-                if ( !data ) continue;
-
-                const dom = new JSDOM( data, { url });
-                const { document } = dom.window;
-
-                if ( !this.isExcluded( url ) )
+                const reader = new Readability( document );
+                const article = reader.parse();
+                if ( article )
                {
-                    const reader = new Readability( document );
-                    const article = reader.parse();
-                    if ( article )
+                    if ( this.hasValidPageContent( article.textContent ) )
                    {
-                        if ( this.hasValidPageContent( article.textContent ) )
-                        {
-                            const metadata = this.extractMetadata( url, document );
-                            metadata.articleTitle = article.title || "";
-                            this.saveArticle( url, article.textContent, metadata );
-                        }
-                        else
-                        {
-                            console.error( `Invalid content found at ${url}` );
-                        }
+                        const metadata = this.extractMetadata( url, document );
+                        metadata.articleTitle = article.title || "";
+                        this.saveArticle( url, article.textContent, metadata );
                    }
                    else
                    {
-                        console.error( `No readable content found at ${url}` );
+                        console.error( `Invalid content found at ${url}` );
                    }
                }
-
-                const links = this.extractLinks( data );
-                const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
-                for ( const link of unvisitedLinks )
+                else
                {
-                    if ( !this.hasReachedMax( depth ) )
-                    {
-                        queue.push({ url: link, depth: depth + 1 });
-                    }
+                    console.error( `No readable content found at ${url}` );
                }
            }
-            catch ( error )
+            const links = this.extractLinks( data );
+            const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
+            for ( const link of unvisitedLinks )
            {
-                console.error( `Error fetching ${url}:`, error.message, error.code );
+                if ( !this.hasReachedMax( depth ) )
+                {
+                    queue.push({ url: link, depth: depth + 1 });
+                }
            }
        }
+        catch ( error )
+        {
+            console.error( `Error fetching ${url}:`, error.message, error.code );
+        }
    }
 
-
     async fetchContent ( url )
     {
         try
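
The refactor above replaces the sequential `for` loop over the queue with a `while` loop that drains the queue in slices of `this.batchSize` and processes each slice concurrently via `Promise.all`, while the new `processUrl` method pushes newly discovered links back onto the shared queue. A minimal standalone sketch of that pattern; `processItem` is a placeholder for the real fetch/Readability/save pipeline and is not part of the package's API:

```js
// Batched breadth-first crawl, mirroring the crawl()/processUrl() split in this release.
async function crawlInBatches ( initialUrl, batchSize = 5 )
{
    const queue = [{ url: initialUrl, depth: 0 }];
    const visited = new Set();

    while ( queue.length > 0 )
    {
        // Take up to batchSize entries off the front of the queue...
        const currentBatch = queue.splice( 0, batchSize );
        // ...and work on them concurrently; each item may enqueue more URLs.
        await Promise.all( currentBatch.map( async ({ url, depth }) =>
        {
            if ( visited.has( url ) ) return;
            visited.add( url );
            const links = await processItem( url );      // placeholder for fetchContent + parsing
            for ( const link of links )
            {
                if ( !visited.has( link ) )
                {
                    queue.push({ url: link, depth: depth + 1 });
                }
            }
        }) );
    }
}

// Placeholder worker; in the package this is where Readability output gets saved.
async function processItem ( url )
{
    return [];
}
```

Because everything runs on a single event loop, pushing into the shared queue while a batch is in flight is safe, and the visited check on entry deduplicates any link that more than one batch member discovers.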
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "clean-web-scraper",
-  "version": "4.0.2",
+  "version": "4.0.4",
   "main": "main.js",
   "scripts": {
     "start": "node main.js",