clean-web-scraper 3.8.7 → 3.9.0
This diff shows the published contents of two publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.
- package/example-usage.js +7 -6
- package/main.js +74 -108
- package/package.json +1 -1
package/example-usage.js
CHANGED
@@ -129,9 +129,9 @@ async function electronicintifada ( enable )
     csvOutputPath: "./dataset/electronicintifada/train.csv",
     includeMetadata: true,
     metadataFields: ["author", "title", "description", "dateScrapedDate"],
-    maxDepth:
+    maxDepth: 16,
     maxArticles: 2000,
-    concurrencyLimit:
+    concurrencyLimit: 2,
     axiosHeaders: headers,
     retryDelay: 10000,
     axiosProxy: {
@@ -198,18 +198,19 @@ async function mondoweiss ( enable )
     textOutputPath: "./dataset/mondoweiss/texts",
     csvOutputPath: "./dataset/mondoweiss/train.csv",
     maxArticles: 2500,
-    maxRetries:
-    concurrencyLimit:
+    maxRetries: 3,
+    concurrencyLimit: 3,
     axiosHeaders: headers,
     axiosProxy: {
         host: "localhost",
         port: 2080,
         protocol: "http"
     },
-    maxDepth:
+    maxDepth: 15,
     retryDelay: 10000,
     includeMetadata: true,
-    metadataFields: ["author", "title", "description", "dateScrapedDate"]
+    metadataFields: ["author", "title", "description", "dateScrapedDate"],
+    useProxyAsFallback: true
 });
 if ( enable )
 {
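
For orientation, here is a hypothetical 3.9.0 invocation using the option names that main.js reads after this release (crawlingDelay, axiosMaxRetries, axiosRetryDelay); the site URLs and values are illustrative, and the export shape is an assumption, not taken from the package:

const WebScraper = require( "clean-web-scraper" ); // assumption: the class is the module export

const scraper = new WebScraper({
    baseURL: "https://example.com",            // placeholder site
    startURL: "https://example.com/articles",  // placeholder entry point
    maxDepth: 10,
    maxArticles: 1000,
    crawlingDelay: 1000,     // new in 3.9.0: pause (ms) before each queued page
    axiosMaxRetries: 5,      // replaces maxRetries for HTTP fetches
    axiosRetryDelay: 40000,  // replaces retryDelay; scaled by the attempt number
    useProxyAsFallback: true // route only the final retry through axiosProxy
});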
package/main.js
CHANGED
@@ -7,45 +7,7 @@ const { connect } = require( "puppeteer-real-browser" )
 
 class WebScraper
 {
-    constructor ( {
-        // Base configuration
-        baseURL,
-        startURL,
-        strictBaseURL,
-        maxDepth,
-        maxArticles,
-        concurrencyLimit,
-        maxRetries,
-        retryDelay,
-
-        // URL filtering
-        excludeList = [],
-        exactExcludeList = [],
-        filterFileTypes,
-        excludedFileTypes,
-        removeURLFragment,
-
-        // Output paths
-        scrapResultPath = "./dataset",
-        jsonlOutputPath,
-        textOutputPath,
-        csvOutputPath,
-
-        // Metadata options
-        includeMetadata = false,
-        metadataFields = [],
-
-        // Network options
-        axiosHeaders,
-        axiosProxy,
-        useProxyAsFallback,
-
-        // Puppeteer options
-        usePuppeteer,
-        puppeteerProxy, // e.g. http://127.0.0.1:2080
-        puppeteerExecutablePath,
-        puppeteerRealProxy
-    })
+    constructor ( config )
     {
         // Base configuration
         this.baseURL = baseURL;
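
The hunk above collapses the long destructured parameter list into a single config argument, while assignments such as this.baseURL = baseURL; remain unchanged context, which implies the constructor destructures config near the top of its body (outside this hunk). A minimal sketch of that pattern, with hypothetical names:

class Example
{
    constructor ( config )
    {
        // One destructuring statement stands in for the old parameter list,
        // so later assignments keep referring to the bare names.
        const { baseURL, startURL, maxDepth } = config;
        this.baseURL = baseURL;
        this.startURL = startURL;
        this.maxDepth = maxDepth || Infinity;
    }
}

// Callers are unaffected: they were already passing a single options object.
const example = new Example({ baseURL: "https://example.com", startURL: "https://example.com/blog" });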
@@ -54,11 +16,10 @@ class WebScraper
         this.maxDepth = maxDepth || Infinity;
         this.maxArticles = maxArticles || Infinity;
         this.concurrencyLimit = concurrencyLimit || 2;
-        this.
-        this.retryDelay = retryDelay || 40000;
+        this.crawlingDelay = crawlingDelay || 1000;
 
         // Output paths setup
-        this.scrapResultPath = scrapResultPath;
+        this.scrapResultPath = scrapResultPath || "./dataset";
         this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
         this.textOutputPathWithMeta = `${this.textOutputPath }_with_metadata`;
         this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
@@ -67,8 +28,8 @@ class WebScraper
         this.csvOutputPathWithMeta = this.csvOutputPath.replace( ".csv", "_with_metadata.csv" );
 
         // Metadata configuration
-        this.includeMetadata = includeMetadata;
-        this.metadataFields = new Set( metadataFields );
+        this.includeMetadata = includeMetadata || false;
+        this.metadataFields = new Set( metadataFields || [] );
 
         // URL filtering setup
         this.visited = new Set();
@@ -81,6 +42,8 @@ class WebScraper
         // Network configuration
         this.axiosHeaders = axiosHeaders;
         this.axiosProxy = axiosProxy;
+        this.axiosMaxRetries = axiosMaxRetries || 5;
+        this.axiosRetryDelay = axiosRetryDelay || 40000;
         this.useProxyAsFallback = useProxyAsFallback || false;
         this.axiosOptions = {};
         if ( this.axiosHeaders )
@@ -97,7 +60,7 @@ class WebScraper
 
         // Puppeteer configuration
         this.usePuppeteer = usePuppeteer || false;
-        this.puppeteerProxy = puppeteerProxy;
+        this.puppeteerProxy = puppeteerProxy; // http://127.0.0.1:2080
         this.puppeteerExecutablePath = puppeteerExecutablePath;
         this.puppeteerRealProxy = puppeteerRealProxy;
         this.configurePuppeteer( );
@@ -114,7 +77,7 @@ class WebScraper
             this.puppeteerPage = page;
         }
         this.createOutputDirectory();
-        await this.
+        await this.crawl( this.startURL, 0 );
         this.createJSONLFile();
         this.saveNumberedTextFiles();
         this.createCSVFile();
@@ -134,84 +97,87 @@ class WebScraper
         }
     }
 
-    async
+    async crawl ( initialUrl, initialDepth = 0 )
     {
-
-
-        url = url.split( "#" )[0];
-        }
-        if ( this.hasReachedMax( depth ) )
+        const queue = [{ url: initialUrl, depth: initialDepth }];
+        for ( let i = 0; i < queue.length; i++ )
         {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        if ( !data ) return;
-        const dom = new JSDOM( data, { url });
-        const { document } = dom.window;
+            console.log( `Processing URL: ${queue[i].url}` );
+            let { url, depth } = queue[i];
+            if ( this.hasReachedMax( depth ) )
+            {
+                continue;
+            }
+            if ( this.removeURLFragment )
+            {
+                url = url.split( "#" )[0];
+            }
+            if ( this.visited.has( url ) )
+            {
+                console.log( `Already visited: ${url}` );
+                continue;
+            }
+            this.visited.add( url );
 
-        if ( !this.
+            if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
             {
-
-
+                continue;
+            }
+
+            try
+            {
+                await WebScraper.sleep( this.crawlingDelay );
+                const data = await this.fetchContent( url );
+                if ( !data ) continue;
+
+                const dom = new JSDOM( data, { url });
+                const { document } = dom.window;
 
-        if (
+                if ( !this.isExcluded( url ) )
                 {
-
+                    const reader = new Readability( document, {
+                        charThreshold: 500,
+                        nbTopCandidates: 20
+                    });
+                    const article = reader.parse();
+
+                    if ( article )
                     {
-
-
+                        if ( this.hasValidPageContent( article.textContent ) )
+                        {
+                            const metadata = this.extractMetadata( url, document );
+                            this.saveArticle( url, article.textContent, metadata );
+                        }
+                        else
+                        {
+                            console.error( `Invalid content found at ${url}` );
+                        }
                     }
                     else
                     {
-            console.error( `
+                        console.error( `No readable content found at ${url}` );
                     }
                 }
-        else
-        {
-            console.error( `No readable content found at ${url}` );
-        }
-    }
 
-
-
-
-        for ( let i = 0; i < unvisitedLinks.length; i += this.concurrencyLimit )
-        {
-            if ( this.hasReachedMax( depth ) )
-            {
-                return;
-            }
-            const batch = unvisitedLinks.slice( i, i + this.concurrencyLimit );
-            const results = await Promise.allSettled( batch.map( link => { return this.fetchPage( link, depth + 1 ) }) );
+                const links = this.extractLinks( data );
+                const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
 
-
+                for ( const link of unvisitedLinks )
                 {
-            if (
+                    if ( !this.hasReachedMax( depth ) )
                     {
-
+                        queue.push({ url: link, depth: depth + 1 });
                     }
-        }
+                }
+            }
+            catch ( error )
+            {
+                console.error( `Error fetching ${url}:`, error.message, error.code );
             }
-    }
-    catch ( error )
-    {
-        console.error( `Error fetching ${url}:`, error.message, error.code );
         }
     }
 
+
     async fetchContent ( url )
     {
         try
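
The rewrite above replaces the recursive fetchPage, which fanned links out through Promise.allSettled in batches of concurrencyLimit, with a single breadth-first loop over a growing queue that sleeps for crawlingDelay before each fetch. A self-contained sketch of the queue pattern, with the fetching stubbed out as fetchLinks:

const sleep = ( ms ) => { return new Promise( resolve => { return setTimeout( resolve, ms ) }) };

async function fetchLinks ( url )
{
    // Stub: a real implementation would fetch `url` and return the hrefs found on the page.
    return [];
}

async function crawl ( startUrl, maxDepth )
{
    const queue = [{ url: startUrl, depth: 0 }];
    const visited = new Set();
    for ( let i = 0; i < queue.length; i++ ) // items pushed mid-loop extend the walk in BFS order
    {
        const { url, depth } = queue[i];
        if ( depth > maxDepth || visited.has( url ) ) continue;
        visited.add( url );
        await sleep( 1000 ); // stand-in for crawlingDelay
        for ( const link of await fetchLinks( url ) )
        {
            queue.push({ url: link, depth: depth + 1 });
        }
    }
    return visited;
}

Indexing into the array instead of calling queue.shift() keeps every discovered URL around and avoids repeated re-shifting, at the cost of holding the full list in memory.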
@@ -581,7 +547,7 @@ class WebScraper
             ...this.axiosOptions,
         };
 
-        for ( let attempt = 1; attempt <= this.
+        for ( let attempt = 1; attempt <= this.axiosMaxRetries; attempt++ )
         {
             try
             {
@@ -589,7 +555,7 @@ class WebScraper
                 {
                     break;
                 }
-                if ( attempt === this.
+                if ( attempt === this.axiosMaxRetries && this.useProxyAsFallback && this.axiosProxy )
                 {
                     options = {
                         ...options,
@@ -601,9 +567,9 @@ class WebScraper
             }
             catch ( error )
             {
-                if ( attempt >= this.
-                await WebScraper.sleep( this.
-                console.error( `Retrying request to ${url} (Attempt ${attempt + 1}/${this.
+                if ( attempt >= this.axiosMaxRetries ) throw error;
+                await WebScraper.sleep( this.axiosRetryDelay * attempt );
+                console.error( `Retrying request to ${url} (Attempt ${attempt + 1}/${this.axiosMaxRetries})`, error.message, error.code );
             }
         }
         throw new Error( "Max retries reached" );
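
Together, the last three hunks rename the retry knobs to axiosMaxRetries and axiosRetryDelay, scale the sleep by the attempt number so the back-off grows linearly (40s, 80s, 120s, ... at the defaults), and use useProxyAsFallback to route only the final attempt through axiosProxy. A rough standalone sketch of that shape; request stands in for the axios call, and the exact ordering inside the package's loop may differ:

const sleep = ( ms ) => { return new Promise( resolve => { return setTimeout( resolve, ms ) }) };

async function fetchWithRetry ( url, request, { maxRetries = 5, retryDelay = 40000, proxy, useProxyAsFallback = false } = {})
{
    let options = {};
    for ( let attempt = 1; attempt <= maxRetries; attempt++ )
    {
        if ( attempt === maxRetries && useProxyAsFallback && proxy )
        {
            options = { ...options, proxy }; // last attempt: fall back to the proxy
        }
        try
        {
            return await request( url, options );
        }
        catch ( error )
        {
            if ( attempt >= maxRetries ) throw error;
            await sleep( retryDelay * attempt ); // linear back-off
        }
    }
    throw new Error( "Max retries reached" );
}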