clean-web-scraper 3.5.7 → 3.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/example-usage.js +7 -7
- package/main.js +832 -1
- package/package.json +1 -1
- package/src/WebScraper.js +0 -822
package/main.js
CHANGED
|
@@ -1,2 +1,833 @@
|
|
|
1
|
-
const
|
|
1
|
+
const axios = require( "axios" );
|
|
2
|
+
const { JSDOM } = require( "jsdom" );
|
|
3
|
+
const { Readability } = require( "@mozilla/readability" );
|
|
4
|
+
const fs = require( "fs" );
|
|
5
|
+
const path = require( "path" );
|
|
6
|
+
const { connect } = require( "puppeteer-real-browser" )
|
|
7
|
+
|
|
8
|
+
/**
 * Recursively crawls a website starting from `startURL`, extracts the readable
 * article text of each page with Mozilla Readability, and writes the results as
 * per-page text/JSON files, numbered text files, a JSONL file and a CSV file.
 *
 * All output paths are resolved relative to this module's directory (`__dirname`).
 */
class WebScraper
{
	/**
	 * @param {Object} options
	 * @param {string} options.baseURL - URL prefix that every followed link must start with.
	 * @param {string} [options.startURL] - First page to fetch; falls back to `baseURL`.
	 * @param {boolean} [options.strictBaseURL=true] - Only fetch URLs whose hostname equals the hostname of `baseURL`.
	 * @param {number} [options.maxDepth=Infinity] - Maximum link depth (the start page is depth 0).
	 * @param {number} [options.maxArticles=Infinity] - Stop fetching once this many articles were saved.
	 * @param {string[]} [options.excludeList=[]] - URL prefixes whose pages are crawled for links but not saved.
	 * @param {string[]} [options.exactExcludeList=[]] - Exact URLs whose pages are crawled for links but not saved.
	 * @param {boolean} [options.filterFileTypes=true] - Skip URLs whose path ends with one of `excludedFileTypes`.
	 * @param {string[]} [options.excludedFileTypes] - Lower-case file extensions to skip.
	 * @param {string} [options.scrapResultPath="./dataset"] - Directory for per-page output.
	 * @param {string} [options.jsonlOutputPath] - JSONL output file; defaults to `<scrapResultPath>/train.jsonl`.
	 * @param {string} [options.textOutputPath] - Directory for numbered text files; defaults to `<scrapResultPath>/texts`.
	 * @param {string} [options.csvOutputPath] - CSV output file; defaults to `<scrapResultPath>/train.csv`.
	 * @param {boolean} [options.includeMetadata=false] - Also write "_with_metadata" variants of every output.
	 * @param {string[]} [options.metadataFields=[]] - Metadata keys to keep in the metadata outputs.
	 * @param {Object} [options.axiosHeaders] - Passed to axios as `headers`.
	 * @param {Object} [options.axiosProxy] - Passed to axios as `proxy`.
	 * @param {boolean} [options.usePuppeteer] - Open a real browser and use it as a fallback for HTTP 403 responses.
	 * @param {string} [options.puppeteerProxy] - Proxy server URL, e.g. http://127.0.0.1:2080 (stored in `puppeteerOptions`).
	 * @param {string} [options.puppeteerExecutablePath] - Browser executable path (stored in `puppeteerOptions`).
	 * @param {Object} [options.puppeteerRealProxy] - Passed as `proxy` to puppeteer-real-browser's `connect`.
	 */
	constructor ({
		// Base configuration
		baseURL,
		startURL,
		strictBaseURL = true,
		maxDepth = Infinity,
		maxArticles = Infinity,

		// URL filtering
		excludeList = [],
		exactExcludeList = [],
		filterFileTypes = true,
		excludedFileTypes = [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"],

		// Output paths
		scrapResultPath = "./dataset",
		jsonlOutputPath,
		textOutputPath,
		csvOutputPath,

		// Metadata options
		includeMetadata = false,
		metadataFields = [],

		// Network options
		axiosHeaders,
		axiosProxy,

		// Puppeteer options
		usePuppeteer,
		puppeteerProxy, // e.g. http://127.0.0.1:2080
		puppeteerExecutablePath,
		puppeteerRealProxy
	})
	{
		// Base configuration
		this.baseURL = baseURL;
		this.startURL = startURL || baseURL;
		this.strictBaseURL = strictBaseURL;
		this.maxDepth = maxDepth;
		this.maxArticles = maxArticles;

		// Output paths setup
		this.scrapResultPath = scrapResultPath;
		this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
		this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
		this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
		this.jsonlOutputPathWithMeta = this.jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
		this.csvOutputPathWithMeta = this.csvOutputPath.replace( ".csv", "_with_metadata.csv" );

		// Metadata configuration
		this.includeMetadata = includeMetadata;
		this.metadataFields = new Set( metadataFields );

		// URL filtering setup
		this.visited = new Set();
		this.excludeList = this.normalizeExcludeList( excludeList );
		this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
		this.filterFileTypes = filterFileTypes;
		this.excludedFileTypes = excludedFileTypes;

		// Network configuration: only forward options the caller actually supplied
		this.axiosHeaders = axiosHeaders;
		this.axiosProxy = axiosProxy;
		this.axiosOptions = {};
		if ( this.axiosHeaders )
		{
			this.axiosOptions.headers = this.axiosHeaders;
		}
		if ( this.axiosProxy )
		{
			this.axiosOptions.proxy = this.axiosProxy;
		}

		// Content storage: one { simple, withMetadata } entry per saved article
		this.allProcessedContent = [];

		// Puppeteer configuration
		this.usePuppeteer = usePuppeteer || false;
		this.puppeteerProxy = puppeteerProxy;
		this.puppeteerExecutablePath = puppeteerExecutablePath;
		this.puppeteerRealProxy = puppeteerRealProxy;
		this.configurePuppeteer();
	}

	/**
	 * Runs the whole scrape: optionally opens the browser, resets the output
	 * directories, crawls from `startURL`, then writes the JSONL, numbered text
	 * and CSV outputs. The browser, if one was opened, is always closed.
	 *
	 * NOTE(review): the JSONL/CSV files are written through streams that are
	 * ended but not awaited, so they may still be flushing when this resolves.
	 *
	 * @returns {Promise<void>}
	 * @throws Rethrows any error after logging it.
	 */
	async start ()
	{
		try
		{
			if ( this.usePuppeteer )
			{
				const { browser, page } = await connect( this.puppeteerRealOptions );
				this.puppeteerBrowser = browser;
				this.puppeteerPage = page;
			}
			this.createOutputDirectory();
			await this.fetchPage( this.startURL, 0 );
			this.createJSONLFile();
			this.saveNumberedTextFiles();
			this.createCSVFile();
			console.log( "Scraping completed." );
		}
		catch ( error )
		{
			console.error( "Error:", error );
			throw error;
		}
		finally
		{
			if ( this.puppeteerBrowser )
			{
				await this.puppeteerBrowser.close(); // Close the browser after scraping
			}
		}
	}

	/**
	 * Fetches one page, saves its article (unless the URL is excluded) and then
	 * recursively fetches every not-yet-visited link found on it, one at a time.
	 * Errors are logged and never propagate, so one bad page cannot abort the
	 * crawl of its siblings.
	 *
	 * @param {string} url - Absolute URL to fetch.
	 * @param {number} depth - Link depth of `url` relative to the start page.
	 * @returns {Promise<void>}
	 */
	async fetchPage ( url, depth )
	{
		if ( this.allProcessedContent.length >= this.maxArticles || depth > this.maxDepth )
		{
			console.log( `Reached maximum: ${this.maxArticles}, ${this.maxDepth}` );
			return;
		}
		this.visited.add( url );
		try
		{
			// Validation lives inside the try block because isValidFileType parses
			// the URL and throws on malformed input.
			if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
			{
				return;
			}
			const data = await this.fetchContent( url );
			if ( !data ) return;
			const dom = new JSDOM( data, { url });
			const { document } = dom.window;

			if ( !this.isExcluded( url ) )
			{
				const reader = new Readability( document, { charThreshold: 500, nbTopCandidates: 20 });
				const article = reader.parse();

				if ( !article )
				{
					console.error( `No readable content found at ${url}` );
				}
				else if ( !this.hasValidPageContent( article.textContent ) )
				{
					console.error( `Invalid content found at ${url}` );
				}
				else
				{
					const metadata = this.extractMetadata( url, document );
					metadata.depth = depth;
					this.saveArticle( url, article.textContent, metadata );
				}
			}

			// Excluded pages are still crawled for links; only their content is skipped.
			for ( const link of this.extractLinks( data ) )
			{
				if ( !this.visited.has( link ) )
				{
					await this.fetchPage( link, depth + 1 );
				}
			}
		}
		catch ( error )
		{
			console.error( `Error fetching ${url}:`, error.message, error.code );
		}
	}

	/**
	 * Downloads the body of `url` as a string.
	 *
	 * When the request fails with HTTP 403 and `usePuppeteer` is enabled, the
	 * page is loaded in the browser instead (up to 10 attempts, so a human can
	 * solve a CAPTCHA) and the browser's HTML is returned.
	 *
	 * @param {string} url - Absolute URL to download.
	 * @returns {Promise<string|null>} The body, or `null` when the response
	 *   Content-Type does not start with "text".
	 * @throws Rethrows the request error when no fallback applies, or the
	 *   browser error when the fallback itself fails.
	 */
	async fetchContent ( url )
	{
		try
		{
			const response = await this.retryAxiosRequest( url );
			const contentType = response.headers["content-type"] || "";
			if ( !contentType.startsWith( "text" ) )
			{
				console.log( `Skipping non-HTML content for ${url}: Content-Type is ${contentType}` );
				response.data.destroy();
				return null;
			}

			// Collect raw chunks and decode once at the end: decoding chunk by chunk
			// would corrupt multi-byte UTF-8 characters split across two chunks.
			const chunks = [];
			for await ( const chunk of response.data )
			{
				chunks.push( Buffer.isBuffer( chunk ) ? chunk : Buffer.from( chunk ) );
			}
			return Buffer.concat( chunks ).toString();
		}
		catch ( error )
		{
			console.error( `Error fetching ${url}:`, error.message );
			const status = error.response?.status ?? error.status;
			if ( status === 403 && this.usePuppeteer )
			{
				try
				{
					let result;
					for ( let attempt = 0; attempt < 10; attempt++ )
					{
						console.log( `Please solve the CAPTCHA on the opened browser window for ${url}` );
						result = await this.navigateToPage( url );
						if ( this.hasValidPageContent( result.htmlContent ) )
						{
							break;
						}
					}
					// After the last attempt the content is returned even if it still looks invalid.
					return result.htmlContent;
				}
				catch ( captchaError )
				{
					console.error( `Error solving CAPTCHA for ${url}:`, captchaError.message, captchaError );
					throw captchaError;
				}
			}
			throw error;
		}
	}

	/**
	 * Loads `url` in the first tab of the Puppeteer browser and returns its HTML.
	 * The tab list is re-read after every step, as the original code did
	 * (presumably because the browser may swap the tab during a challenge;
	 * TODO confirm).
	 *
	 * @param {string} url - Absolute URL to open.
	 * @returns {Promise<{pages: Array, page: Object, htmlContent: string}>}
	 * @throws {Error} When the browser has no open page.
	 */
	async navigateToPage ( url )
	{
		const acquireFirstPage = async () =>
		{
			const pages = await this.puppeteerBrowser.pages();
			const [page] = pages;
			if ( !page )
			{
				throw new Error( `No open browser page available for ${url}` );
			}
			page.setDefaultNavigationTimeout( 10000 );
			return { pages, page };
		};

		const initial = await acquireFirstPage();
		await initial.page.goto( url );

		const loaded = await acquireFirstPage();
		await this.waitForPageToLoad( loaded.page );

		const { pages, page } = await acquireFirstPage();
		const htmlContent = await page.content();
		return { pages, page, htmlContent };
	}

	/**
	 * Waits for the next navigation of `page` to reach "networkidle0".
	 * A failure (e.g. a timeout) is logged and otherwise ignored.
	 *
	 * @param {Object} page - Puppeteer page.
	 * @returns {Promise<void>}
	 */
	async waitForPageToLoad ( page )
	{
		try
		{
			await page.waitForNavigation({ waitUntil: "networkidle0" });
		}
		catch ( error )
		{
			console.log( error );
		}
	}

	/**
	 * Extracts the links of an HTML document that belong to the crawled site.
	 * Root-relative hrefs are resolved against `baseURL`, one trailing slash is
	 * removed, and only results starting with `baseURL` are kept.
	 *
	 * @param {string} data - Raw HTML.
	 * @returns {Set<string>} Unique absolute URLs, in document order.
	 */
	extractLinks ( data )
	{
		const links = new Set();
		const anchorPattern = /<a\s+(?:[^>]*?\s+)?href=("|')(.*?)\1/gi;

		for ( const match of data.matchAll( anchorPattern ) )
		{
			let href = match[2];
			if ( href.startsWith( "/" ) )
			{
				try
				{
					href = new URL( href, this.baseURL ).href;
				}
				catch
				{
					// A single unparsable href must not discard the page's other links.
					continue;
				}
			}
			if ( href.endsWith( "/" ) )
			{
				href = href.slice( 0, -1 );
			}
			if ( href.startsWith( this.baseURL ) )
			{
				links.add( href );
			}
		}
		return links;
	}

	/**
	 * Cleans an article, records it in `allProcessedContent`, and writes
	 * `<path>.txt` (text) and `<path>.json` (full metadata) under
	 * `scrapResultPath`, where `<path>` mirrors the URL's pathname.
	 *
	 * @param {string} url - Page URL.
	 * @param {string} content - Raw article text.
	 * @param {Object} metadata - Metadata as returned by `extractMetadata`.
	 */
	saveArticle ( url, content, metadata )
	{
		const processedContent = this.processContent( content );
		const text = processedContent.trim();

		this.allProcessedContent.push({
			simple: { text },
			withMetadata: { text, metadata: this.filterMetadata( metadata ) }
		});

		const filePath = path.join( __dirname, this.scrapResultPath, WebScraper.urlToRelativePath( url ) );

		fs.mkdirSync( path.dirname( filePath ), { recursive: true });
		fs.writeFileSync( `${filePath}.txt`, processedContent, "utf-8" );
		fs.writeFileSync( `${filePath}.json`, JSON.stringify( metadata, null, 2 ), "utf-8" );
		console.log( `Saved: ${filePath}.txt` );
		console.log( `Saved: ${filePath}.json` );
	}

	/**
	 * Writes every collected article as one JSON object per line to
	 * `jsonlOutputPath`, and, when metadata is enabled, to `jsonlOutputPathWithMeta`.
	 * Write errors are logged, not thrown.
	 */
	createJSONLFile ()
	{
		const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
		writeStreamSimple.on( "error", ( err ) => { return console.error( "Error writing JSONL:", err ) });

		let writeStreamMeta;
		if ( this.includeMetadata )
		{
			writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPathWithMeta ) );
			writeStreamMeta.on( "error", ( err ) => { return console.error( "Error writing metadata JSONL:", err ) });
		}

		for ( const content of this.allProcessedContent )
		{
			writeStreamSimple.write( `${JSON.stringify( content.simple )}\n` );
			if ( writeStreamMeta )
			{
				writeStreamMeta.write( `${JSON.stringify( content.withMetadata )}\n` );
			}
		}

		writeStreamSimple.end();
		if ( writeStreamMeta )
		{
			writeStreamMeta.end();
			console.log( `Created JSONL file at: ${this.jsonlOutputPathWithMeta}` );
		}
		console.log( `Created JSONL file at: ${this.jsonlOutputPath}` );
	}

	/**
	 * Writes every collected article to `csvOutputPath` (single "text" column)
	 * and, when metadata is enabled, to `csvOutputPathWithMeta` with one extra
	 * column per configured metadata field. Write errors are logged, not thrown.
	 */
	createCSVFile ()
	{
		const metadataColumns = Array.from( this.metadataFields );

		// Simple version
		const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
		writeStreamSimple.on( "error", ( err ) => { return console.error( "Error writing CSV:", err ) });
		writeStreamSimple.write( "text\n" );

		// Metadata version, if requested
		let writeStreamMeta;
		if ( this.includeMetadata )
		{
			writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.csvOutputPathWithMeta ) );
			writeStreamMeta.on( "error", ( err ) => { return console.error( "Error writing metadata CSV:", err ) });
			writeStreamMeta.write( `${["text", ...metadataColumns].join( "," )}\n` );
		}

		for ( const content of this.allProcessedContent )
		{
			const textField = WebScraper.toCSVField( content.simple.text );
			writeStreamSimple.write( `${textField}\n` );

			if ( writeStreamMeta )
			{
				const { metadata } = content.withMetadata;
				const metadataValues = metadataColumns.map( field => { return WebScraper.toCSVField( metadata[field] ) });
				writeStreamMeta.write( `${textField},${metadataValues.join( "," )}\n` );
			}
		}

		writeStreamSimple.end();
		if ( writeStreamMeta )
		{
			writeStreamMeta.end();
		}
		console.log( `Created simple CSV file at: ${this.csvOutputPath}` );
		if ( this.includeMetadata )
		{
			console.log( `Created metadata CSV file at: ${this.csvOutputPathWithMeta}` );
		}
	}

	/**
	 * Writes every collected article as `1.txt`, `2.txt`, ... into
	 * `textOutputPath`. When metadata is enabled, a second copy prefixed with
	 * "field: value" header lines goes into `<textOutputPath>_with_metadata`.
	 */
	saveNumberedTextFiles ()
	{
		const baseTextPath = path.join( __dirname, this.textOutputPath );
		fs.mkdirSync( baseTextPath, { recursive: true });

		let metaTextPath = null;
		if ( this.includeMetadata )
		{
			metaTextPath = path.join( __dirname, `${this.textOutputPath }_with_metadata` );
			fs.mkdirSync( metaTextPath, { recursive: true });
		}

		this.allProcessedContent.forEach( ( content, index ) =>
		{
			const fileName = `${index + 1}.txt`;

			// Always save the simple version
			fs.writeFileSync( path.join( baseTextPath, fileName ), content.simple.text, "utf-8" );
			console.log( `Created simple text file: ${fileName}` );

			if ( !this.includeMetadata ) return;

			// Metadata version: header lines, a separator, then the text
			const { metadata, text } = content.withMetadata;
			let fileContent = "";
			for ( const field of this.metadataFields )
			{
				if ( metadata[field] )
				{
					fileContent += `${field}: ${metadata[field]}\n`;
				}
			}
			fileContent += "\n---\n\n";
			fileContent += text;

			fs.writeFileSync( path.join( metaTextPath, fileName ), fileContent, "utf-8" );
			console.log( `Created metadata text file: ${fileName}` );
		});
	}

	/**
	 * Normalises article text: removes the "[You can read more about this here]"
	 * marker, trims every line, collapses runs of three or more newlines into a
	 * single blank line, and strips trailing social/donation labels.
	 *
	 * @param {string} content - Raw article text.
	 * @returns {string} Cleaned text.
	 */
	processContent ( content )
	{
		let processed = content;

		// Remove the "[You can read more about this here]" marker
		processed = processed.replace( /\[You can read more about this here\]/g, "" ).trim();

		// Trim each line
		processed = processed.split( "\n" )
			.map( line => { return line.trim() })
			.join( "\n" );

		// Collapse 3 or more newlines into exactly two (i.e. one blank line)
		processed = processed.replace( /\n{3,}/g, "\n\n" );

		// Remove specified words from the end of the content. Repeat until a full
		// pass changes nothing, because removing one word can expose another.
		const wordsToTrim = ["Facebook", "Twitter", "Donate Now", "Instagram"];
		let changed = true;

		while ( changed )
		{
			changed = false;
			for ( const word of wordsToTrim )
			{
				const before = processed;
				processed = processed.replace( new RegExp( `\\s*${word}\\s*$`, "g" ), "" ).trim();
				if ( before !== processed )
				{
					changed = true;
				}
			}
		}
		return processed;
	}

	/**
	 * Keeps only the configured `metadataFields` whose value is a non-empty string.
	 *
	 * @param {Object} metadata - Full metadata object.
	 * @returns {Object} Filtered copy; empty when metadata output is disabled.
	 */
	filterMetadata ( metadata )
	{
		if ( !this.includeMetadata ) return {};

		const filteredMetadata = {};
		for ( const field of this.metadataFields )
		{
			const value = metadata[field];
			if ( value && typeof value === "string" )
			{
				filteredMetadata[field] = value;
			}
		}
		return filteredMetadata;
	}

	/**
	 * Reads common metadata from a parsed page (title, description/keywords/author
	 * meta tags, language, canonical link, Open Graph tags) plus the scrape time.
	 * Fields whose element is missing are `undefined`.
	 *
	 * @param {string} url - Page URL.
	 * @param {Document} document - Parsed DOM document.
	 * @returns {Object} Metadata object.
	 */
	extractMetadata ( url, document )
	{
		const metaContent = ( selector ) => { return document.querySelector( selector )?.content };

		return {
			url,
			title: document.title,
			description: metaContent( "meta[name=\"description\"]" ),
			keywords: metaContent( "meta[name=\"keywords\"]" ),
			author: metaContent( "meta[name=\"author\"]" ),
			language: document.documentElement.lang || document.querySelector( "html" )?.getAttribute( "lang" ),
			canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
			ogTitle: metaContent( "meta[property=\"og:title\"]" ),
			ogDescription: metaContent( "meta[property=\"og:description\"]" ),
			ogImage: metaContent( "meta[property=\"og:image\"]" ),
			ogType: metaContent( "meta[property=\"og:type\"]" ),
			dateScraped: new Date().toISOString()
		};
	}

	/**
	 * Issues a streaming GET request, retrying up to three times in total with a
	 * growing delay (1s, then 2s) between attempts.
	 *
	 * @param {string} url - Absolute URL to request.
	 * @returns {Promise<Object>} The axios response; `data` is a readable stream.
	 * @throws The error of the last failed attempt.
	 */
	async retryAxiosRequest ( url )
	{
		const options = {
			responseType: "stream",
			maxRedirects: 5,
			timeout: 70000,
			...this.axiosOptions,
		};

		const maxRetries = 3;
		for ( let attempt = 1; attempt <= maxRetries; attempt++ )
		{
			try
			{
				return await axios.get( url, options );
			}
			catch ( error )
			{
				if ( attempt === maxRetries ) throw error;
				await WebScraper.sleep( 1000 * attempt );
				console.log( `Retrying request to ${url} (Attempt ${attempt + 1}/${maxRetries})` );
			}
		}
	}

	/**
	 * Builds the browser option objects and resets the browser/page handles.
	 * `puppeteerRealOptions` is what `start` passes to `connect`;
	 * `puppeteerOptions` is built but not read anywhere in this class.
	 */
	configurePuppeteer ( )
	{
		this.puppeteerOptions = {
			headless: false,
			userDataDir: "./tmp/browser",
			defaultViewport: null,
			args: ["--start-maximized"],
			ignoreDefaultArgs: true
		};

		if ( this.puppeteerProxy )
		{
			this.puppeteerOptions.args.push( `--proxy-server=${this.puppeteerProxy}` );
		}

		if ( this.puppeteerExecutablePath )
		{
			this.puppeteerOptions.executablePath = this.puppeteerExecutablePath;
		}

		this.puppeteerRealOptions = {
			headless: false,
			args: [],
			customConfig: {},
			turnstile: true,
			connectOption: {},
			disableXvfb: false,
			ignoreAllFlags: false,
			proxy: this.puppeteerRealProxy
		};

		this.puppeteerBrowser = null;
		this.puppeteerPage = null;
	}

	/**
	 * Expands an exclude list so that every entry is present both with and
	 * without a trailing slash.
	 *
	 * @param {string[]} [list=[]] - URLs or URL prefixes.
	 * @returns {Set<string>} Normalised entries.
	 */
	normalizeExcludeList ( list = [] )
	{
		const normalizedSet = new Set();
		for ( const item of list )
		{
			const withoutSlash = item.endsWith( "/" ) ? item.slice( 0, -1 ) : item;
			normalizedSet.add( withoutSlash );
			normalizedSet.add( `${withoutSlash}/` );
		}
		return normalizedSet;
	}

	/**
	 * @param {string} url - Absolute URL.
	 * @returns {boolean} `true` when `url` is in the exact exclude list or starts
	 *   with any entry of the prefix exclude list.
	 */
	isExcluded ( url )
	{
		if ( this.exactExcludeList.has( url ) )
		{
			return true;
		}
		return [...this.excludeList].some( excluded => { return url.startsWith( excluded ) });
	}

	/**
	 * @param {string} url - Absolute URL.
	 * @returns {boolean} `false` when file-type filtering is on and the URL's
	 *   path ends with an excluded extension.
	 * @throws {TypeError} When filtering is on and `url` cannot be parsed.
	 */
	isValidFileType ( url )
	{
		if ( !this.filterFileTypes ) return true;
		const urlPath = new URL( url ).pathname.toLowerCase();
		return !this.excludedFileTypes.some( ext => { return urlPath.endsWith( ext ) });
	}

	/**
	 * @param {string} url - Absolute URL.
	 * @returns {boolean} `true` when strict mode is off or the hostname of `url`
	 *   equals the hostname of `baseURL`; `false` for unparsable URLs.
	 */
	isValidDomain ( url )
	{
		if ( !this.strictBaseURL ) return true;
		try
		{
			return new URL( url ).hostname === new URL( this.baseURL ).hostname;
		}
		catch ( e )
		{
			console.log( `Invalid URL: ${url}` );
			return false;
		}
	}

	/**
	 * Heuristic check that a page is real content and not a bot-check or error
	 * page: the whitespace-collapsed text must be at least 100 characters long
	 * and must not contain a known challenge phrase.
	 *
	 * @param {string} content - Page text or HTML.
	 * @returns {boolean} `false` for non-string input.
	 */
	hasValidPageContent ( content )
	{
		if ( typeof content !== "string" ) return false;

		// Collapse whitespace and newlines for checking
		const cleanContent = content.replace( /\s+/g, " " ).trim().toLowerCase();
		if ( cleanContent.length < 100 ) return false;

		// Phrases that indicate a challenge or error page
		const invalidPhrases = [
			"verifying that you are not a robot",
			"verifying you are human. this may take a few seconds.",
			"verify you are human by completing the action below",
			"checking if the site connection is secure",
			"please wait while we verify",
			"please enable javascript",
			"access denied",
			"verifying you are human",
			"captcha verification"
		];

		return !invalidPhrases.some( phrase => { return cleanContent.includes( phrase ) });
	}

	/**
	 * Deletes every output location of a previous run and recreates the result
	 * directory and the numbered-text directory.
	 */
	createOutputDirectory ()
	{
		const stalePaths = [
			this.scrapResultPath,
			this.textOutputPath,
			this.csvOutputPath,
			this.csvOutputPathWithMeta,
			this.jsonlOutputPath,
			this.jsonlOutputPathWithMeta
		];
		for ( const stalePath of stalePaths )
		{
			const fullPath = path.join( __dirname, stalePath );
			if ( fs.existsSync( fullPath ) )
			{
				fs.rmSync( fullPath, { recursive: true, force: true });
			}
		}
		fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
		fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
	}

	/**
	 * @param {number} ms - Delay in milliseconds.
	 * @returns {Promise<void>} Resolves after `ms` milliseconds.
	 */
	static sleep ( ms )
	{
		return new Promise( resolve => { return setTimeout( resolve, ms ) });
	}

	/**
	 * Maps a page URL to the relative output path used by `saveArticle`:
	 * the URL's pathname without trailing slashes, or "/index" for the root.
	 *
	 * @param {string} url - Absolute URL.
	 * @returns {string} Path beginning with "/".
	 */
	static urlToRelativePath ( url )
	{
		const trimmed = new URL( url ).pathname.replace( /\/+$/, "" );
		return trimmed === "" ? "/index" : trimmed;
	}

	/**
	 * Formats a value as a double-quoted CSV field, doubling embedded quotes.
	 * `null`, `undefined` and other falsy values become an empty field.
	 *
	 * @param {*} value - Field value.
	 * @returns {string} Quoted CSV field.
	 */
	static toCSVField ( value )
	{
		const text = value ? String( value ) : "";
		return `"${text.replace( /"/g, "\"\"" )}"`;
	}

	/**
	 * Removes the first (header) line of a CSV document and returns the rest
	 * verbatim, terminated by a newline. Only the header is cut, so line breaks
	 * and blank lines inside quoted multi-line fields are preserved.
	 *
	 * @param {string} csvText - Full CSV file content.
	 * @returns {string} Data rows, or "" when there are none.
	 */
	static stripCSVHeader ( csvText )
	{
		const headerEnd = csvText.indexOf( "\n" );
		if ( headerEnd === -1 ) return "";
		const body = csvText.slice( headerEnd + 1 );
		if ( body.trim() === "" ) return "";
		return body.endsWith( "\n" ) ? body : `${body}\n`;
	}

	/**
	 * Merges the outputs of several finished scrapers into one dataset under
	 * `outputPath` (relative to this module's directory).
	 *
	 * @param {string} outputPath - Target directory; recreated from scratch.
	 * @param {WebScraper[]} websites - Scrapers whose `start` has completed.
	 * @returns {Promise<void>}
	 */
	static async combineResults ( outputPath, websites )
	{
		// Presumably gives the scrapers' output streams time to flush — TODO confirm.
		await WebScraper.sleep( 1000 );
		const fullOutputPath = path.join( __dirname, outputPath );

		// Create output directories
		WebScraper.createCombinedDirectories( fullOutputPath );

		// Combine files by type
		WebScraper.combineJSONLFiles( fullOutputPath, websites );
		WebScraper.combineCSVFiles( fullOutputPath, websites );
		WebScraper.combineTextFiles( fullOutputPath, websites );
	}

	/**
	 * Recreates the combined output directory with its two text sub-directories.
	 *
	 * @param {string} fullOutputPath - Absolute target directory.
	 */
	static createCombinedDirectories ( fullOutputPath )
	{
		if ( fs.existsSync( fullOutputPath ) )
		{
			fs.rmSync( fullOutputPath, { recursive: true, force: true });
		}
		for ( const directory of [fullOutputPath, path.join( fullOutputPath, "texts" ), path.join( fullOutputPath, "texts_with_metadata" )] )
		{
			fs.mkdirSync( directory, { recursive: true });
		}
	}

	/**
	 * Concatenates every scraper's JSONL file into `combined.jsonl`, and every
	 * metadata JSONL file into `combined_with_metadata.jsonl`.
	 *
	 * @param {string} fullOutputPath - Absolute target directory.
	 * @param {WebScraper[]} websites - Scrapers to merge.
	 */
	static combineJSONLFiles ( fullOutputPath, websites )
	{
		const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) )
			.on( "error", ( err ) => { return console.error( "Error combining JSONL:", err ) });
		const jsonlMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.jsonl" ) )
			.on( "error", ( err ) => { return console.error( "Error combining metadata JSONL:", err ) });

		for ( const website of websites )
		{
			const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
			if ( jsonlContent )
			{
				jsonlOutput.write( jsonlContent );
			}

			if ( !website.includeMetadata ) continue;

			const jsonlMetaContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPathWithMeta ), "utf-8" );
			if ( jsonlMetaContent )
			{
				jsonlMetaOutput.write( jsonlMetaContent );
			}
		}

		jsonlOutput.end();
		jsonlMetaOutput.end();
	}

	/**
	 * Concatenates the data rows of every scraper's CSV file into `combined.csv`
	 * and `combined_with_metadata.csv`. The metadata header is taken from the
	 * first scraper that has metadata enabled.
	 *
	 * @param {string} fullOutputPath - Absolute target directory.
	 * @param {WebScraper[]} websites - Scrapers to merge.
	 */
	static combineCSVFiles ( fullOutputPath, websites )
	{
		const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) )
			.on( "error", ( err ) => { return console.error( "Error combining CSV:", err ) });
		const csvMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.csv" ) )
			.on( "error", ( err ) => { return console.error( "Error combining metadata CSV:", err ) });

		csvOutput.write( "text\n" );
		const metadataFields = websites.find( w => { return w.includeMetadata })?.metadataFields || new Set();

		if ( metadataFields.size > 0 )
		{
			csvMetaOutput.write( `text,${Array.from( metadataFields ).join( "," )}\n` );
		}

		for ( const website of websites )
		{
			const rows = WebScraper.stripCSVHeader( fs.readFileSync( path.join( __dirname, website.csvOutputPath ), "utf-8" ) );
			if ( rows )
			{
				csvOutput.write( rows );
			}

			if ( !website.includeMetadata ) continue;

			const metaRows = WebScraper.stripCSVHeader( fs.readFileSync( path.join( __dirname, website.csvOutputPathWithMeta ), "utf-8" ) );
			if ( metaRows )
			{
				csvMetaOutput.write( metaRows );
			}
		}

		csvOutput.end();
		csvMetaOutput.end();
	}

	/**
	 * Copies every scraper's numbered text files into the combined `texts`
	 * directory (and `texts_with_metadata` where enabled), renumbering them with
	 * one counter that runs across all scrapers.
	 *
	 * @param {string} fullOutputPath - Absolute target directory.
	 * @param {WebScraper[]} websites - Scrapers to merge.
	 */
	static combineTextFiles ( fullOutputPath, websites )
	{
		let textFileCounter = 1;

		for ( const website of websites )
		{
			const sourceDir = path.join( __dirname, website.textOutputPath );

			for ( const file of fs.readdirSync( sourceDir ) )
			{
				const targetName = `${textFileCounter}.txt`;
				const content = fs.readFileSync( path.join( sourceDir, file ), "utf-8" );
				fs.writeFileSync( path.join( fullOutputPath, "texts", targetName ), content, "utf-8" );

				if ( website.includeMetadata )
				{
					const metaContent = fs.readFileSync(
						path.join( __dirname, `${website.textOutputPath}_with_metadata`, file ),
						"utf-8"
					);
					fs.writeFileSync( path.join( fullOutputPath, "texts_with_metadata", targetName ), metaContent, "utf-8" );
				}
				textFileCounter++;
			}
		}
	}
}
|
|
832
|
+
|
|
2
833
|
// Export the WebScraper class itself as the module's value (CommonJS).
module.exports = WebScraper;
|