clean-web-scraper 4.0.2 → 4.0.4
This diff shows the changes between publicly released versions of this package as they appear in their public registry. It is provided for informational purposes only.
- package/README.md +10 -7
- package/example-usage.js +3 -2
- package/main.js +61 -55
- package/package.json +1 -1
package/README.md
CHANGED
@@ -57,7 +57,8 @@ const scraper = new WebScraper({
   maxDepth: Infinity, // Optional: Maximum crawling depth
   maxArticles: Infinity, // Optional: Maximum articles to scrape
   crawlingDelay: 1000, // Optional: Delay between requests (ms)
-
+  batchSize: 5, // Optional: Number of URLs to process concurrently
+
   // Network options
   axiosHeaders: {}, // Optional: Custom HTTP headers
   axiosProxy: { // Optional: HTTP/HTTPS proxy
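The hunk above documents the new `batchSize` option, which caps how many queued URLs the crawler fetches concurrently per round. For orientation, here is a minimal constructor sketch combining it with the existing throttling options; the `require()` shape, the `crawl()` entry point, and the concrete values are assumptions inferred from the main.js changes further down, not statements from the full README:

```js
// Sketch only: assumes the package exports the WebScraper class from main.js.
const WebScraper = require( "clean-web-scraper" );

const scraper = new WebScraper({
  scrapResultPath: "./datasets/example", // hypothetical output folder
  maxDepth: 2, // stop queueing links discovered beyond depth 2
  crawlingDelay: 1000, // each worker awaits 1000 ms before fetching
  batchSize: 5 // up to 5 queued URLs are processed concurrently
});

// crawl() drains its queue in batches of `batchSize` (see main.js below).
scraper.crawl( "https://example.com" )
  .then( () => { return console.log( "crawl finished" ) });
```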
@@ -86,7 +87,8 @@ const docsScraper = new WebScraper({
   scrapResultPath: './datasets/docs',
   maxDepth: 3, // Optional: Maximum depth for recursive crawling
   includeMetadata: true, // Optional: Include metadata in output files
-  metadataFields: [
+  metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
+  // Optional: Specify metadata fields to include
 });

 // Scrape blog website
@@ -95,7 +97,8 @@ const blogScraper = new WebScraper({
   scrapResultPath: './datasets/blog',
   maxDepth: 3, // Optional: Maximum depth for recursive crawling
   includeMetadata: true, // Optional: Include metadata in output files
-  metadataFields: [
+  metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
+  // Optional: Specify metadata fields to include
 });

 // Start scraping both sites
@@ -165,7 +168,7 @@ The actual article content starts here. This is the clean, processed text of the
 ### 📑 Text Files with Metadata (texts_with_metadata/*.txt)

 ```text
-
+articleTitle: My Awesome Page
 description: This is a great article about coding
 author: John Doe
 language: en
@@ -186,8 +189,8 @@ The actual article content starts here. This is the clean, processed text of the
 ### 📈 JSONL with Metadata (train_with_metadata.jsonl)

 ```json
-{"text": "Article content", "metadata": {"
-{"text": "Another article", "metadata": {"
+{"text": "Article content", "metadata": {"articleTitle": "Page Title", "author": "John Doe"}}
+{"text": "Another article", "metadata": {"articleTitle": "Second Page", "author": "Jane Smith"}}
 ```

 ### 🗃️ JSON Files In Website Output (*.json)
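Because every line of `train_with_metadata.jsonl` is a self-contained JSON object, the file can be consumed as a stream. A small Node sketch of that; the file path is illustrative, assembled from the `scrapResultPath` and file name shown above, and the parsing itself is plain `fs`/`readline`:

```js
const fs = require( "fs" );
const readline = require( "readline" );

// Read a JSONL file into an array of { text, metadata } records.
async function readJsonl ( path )
{
  const rl = readline.createInterface({ input: fs.createReadStream( path ) });
  const records = [];
  for await ( const line of rl )
  {
    if ( line.trim() ) records.push( JSON.parse( line ) );
  }
  return records;
}

readJsonl( "./datasets/docs/train_with_metadata.jsonl" )
  .then( records => { return console.log( records[0].metadata.articleTitle ) });
```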
@@ -212,7 +215,7 @@ text
 ### 📊 CSV with Metadata (train_with_metadata.csv)

 ```csv
-text,
+text,articleTitle,author,description
 "Article content","Page Title","John Doe","Page description"
 "Another article","Second Page","Jane Smith","Another description"
 ```
package/example-usage.js
CHANGED
@@ -109,7 +109,8 @@ async function electronicintifada ( enable )
         "https://electronicintifada.net/news",
         "https://electronicintifada.net/opinion",
         "https://electronicintifada.net/about-ei",
-        "https://electronicintifada.net/review"
+        "https://electronicintifada.net/review",
+        "https://electronicintifada.net/artmusicculture"
     ],
     exactExcludeList: [
         "https://electronicintifada.net",
@@ -133,7 +134,7 @@ async function electronicintifada ( enable )
             protocol: "http"
         },
         useProxyAsFallback: true,
-        crawlingDelay:
+        crawlingDelay: 1
     };
     return await runScraper( config, enable );
 }
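Setting `crawlingDelay: 1` rather than `0` matters because of how main.js applies the delay: the constructor keeps explicit zeros (`config.crawlingDelay ?? 1000`), and the sleep call is guarded by a truthiness check. A tiny sketch of the resulting behavior; `sleep` here is a stand-in for the class's static `WebScraper.sleep` helper:

```js
// Stand-in for WebScraper.sleep from main.js.
const sleep = ms => { return new Promise( resolve => { return setTimeout( resolve, ms ) }) };

async function throttle ( crawlingDelay )
{
  if ( crawlingDelay ) // 0 is kept by `??` but skips the sleep entirely
  {
    await sleep( crawlingDelay ); // 1 still awaits ~1 ms, yielding between requests
  }
}
```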
package/main.js
CHANGED
@@ -16,6 +16,7 @@ class WebScraper
         this.maxDepth = config.maxDepth || Infinity;
         this.maxArticles = config.maxArticles || Infinity;
         this.crawlingDelay = config.crawlingDelay ?? 1000;
+        this.batchSize = config.batchSize || 5;

         // Output paths setup
         this.scrapResultPath = config.scrapResultPath || "./dataset";
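One detail in this hunk: the new `batchSize` default uses `||` while the neighboring `crawlingDelay` uses `??`, so the two options treat an explicit falsy value differently:

```js
// `||` falls back on any falsy value; `??` only on null/undefined.
const batchSize = 0 || 5; // 5 — a configured batchSize of 0 is silently replaced
const crawlingDelay = 0 ?? 1000; // 0 — a configured crawlingDelay of 0 is preserved
```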
@@ -97,83 +98,88 @@ class WebScraper
     async crawl ( initialUrl, initialDepth = 0 )
     {
         const queue = [{ url: initialUrl, depth: initialDepth }];
-
+        while ( queue.length > 0 )
         {
-
-
-            if ( this.hasReachedMax( depth ) )
+            const currentBatch = queue.splice( 0, this.batchSize );
+            await Promise.all( currentBatch.map( async ({ url, depth }) =>
             {
-
-            }
-
-
-
-
-
+                await this.processUrl( url, depth, queue );
+            }) );
+        }
+    }
+
+    async processUrl ( url, depth, queue )
+    {
+        console.log( `Processing URL: ${url}` );
+        if ( this.hasReachedMax( depth ) )
+        {
+            return;
+        }
+        if ( this.removeURLFragment )
+        {
+            url = url.split( "#" )[0];
+        }
+        if ( this.visited.has( url ) )
+        {
+            console.log( `Already visited: ${url}` );
+            return;
+        }
+        this.visited.add( url );
+        if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
+        {
+            return;
+        }
+        try
+        {
+            if ( this.crawlingDelay )
             {
-
-                continue;
+                await WebScraper.sleep( this.crawlingDelay );
             }
-            this.
-
-            if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
+            const data = await this.fetchContent( url );
+            if ( !data )
             {
-
+                return;
             }
-
-
+            const dom = new JSDOM( data, { url });
+            const { document } = dom.window;
+            if ( !this.isExcluded( url ) )
             {
-
-
-
-            }
-            const data = await this.fetchContent( url );
-            if ( !data ) continue;
-
-            const dom = new JSDOM( data, { url });
-            const { document } = dom.window;
-
-            if ( !this.isExcluded( url ) )
+                const reader = new Readability( document );
+                const article = reader.parse();
+                if ( article )
                 {
-
-                const article = reader.parse();
-                if ( article )
+                    if ( this.hasValidPageContent( article.textContent ) )
                     {
-
-
-
-                        metadata.articleTitle = article.title || "";
-                        this.saveArticle( url, article.textContent, metadata );
-                    }
-                    else
-                    {
-                        console.error( `Invalid content found at ${url}` );
-                    }
+                        const metadata = this.extractMetadata( url, document );
+                        metadata.articleTitle = article.title || "";
+                        this.saveArticle( url, article.textContent, metadata );
                     }
                     else
                     {
-                        console.error( `
+                        console.error( `Invalid content found at ${url}` );
                     }
                 }
-
-            const links = this.extractLinks( data );
-            const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
-            for ( const link of unvisitedLinks )
+                else
                 {
-
-                {
-                    queue.push({ url: link, depth: depth + 1 });
-                }
+                    console.error( `No readable content found at ${url}` );
                 }
            }
-
+            const links = this.extractLinks( data );
+            const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
+            for ( const link of unvisitedLinks )
             {
-
+                if ( !this.hasReachedMax( depth ) )
+                {
+                    queue.push({ url: link, depth: depth + 1 });
+                }
             }
         }
+        catch ( error )
+        {
+            console.error( `Error fetching ${url}:`, error.message, error.code );
+        }
     }

-
     async fetchContent ( url )
     {
         try