clean-web-scraper 3.8.7 → 3.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/example-usage.js +7 -6
- package/main.js +168 -213
- package/package.json +1 -1
package/example-usage.js
CHANGED
@@ -129,9 +129,9 @@ async function electronicintifada ( enable )
|
|
129
129
|
csvOutputPath: "./dataset/electronicintifada/train.csv",
|
130
130
|
includeMetadata: true,
|
131
131
|
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
132
|
-
maxDepth:
|
132
|
+
maxDepth: 16,
|
133
133
|
maxArticles: 2000,
|
134
|
-
concurrencyLimit:
|
134
|
+
concurrencyLimit: 2,
|
135
135
|
axiosHeaders: headers,
|
136
136
|
retryDelay: 10000,
|
137
137
|
axiosProxy: {
|
@@ -198,18 +198,19 @@ async function mondoweiss ( enable )
|
|
198
198
|
textOutputPath: "./dataset/mondoweiss/texts",
|
199
199
|
csvOutputPath: "./dataset/mondoweiss/train.csv",
|
200
200
|
maxArticles: 2500,
|
201
|
-
maxRetries:
|
202
|
-
concurrencyLimit:
|
201
|
+
maxRetries: 3,
|
202
|
+
concurrencyLimit: 3,
|
203
203
|
axiosHeaders: headers,
|
204
204
|
axiosProxy: {
|
205
205
|
host: "localhost",
|
206
206
|
port: 2080,
|
207
207
|
protocol: "http"
|
208
208
|
},
|
209
|
-
maxDepth:
|
209
|
+
maxDepth: 15,
|
210
210
|
retryDelay: 10000,
|
211
211
|
includeMetadata: true,
|
212
|
-
metadataFields: ["author", "title", "description", "dateScrapedDate"]
|
212
|
+
metadataFields: ["author", "title", "description", "dateScrapedDate"],
|
213
|
+
useProxyAsFallback: true
|
213
214
|
});
|
214
215
|
if ( enable )
|
215
216
|
{
|
package/main.js
CHANGED
@@ -3,49 +3,11 @@ const { JSDOM } = require( "jsdom" );
|
|
3
3
|
const { Readability } = require( "@mozilla/readability" );
|
4
4
|
const fs = require( "fs" );
|
5
5
|
const path = require( "path" );
|
6
|
-
const { connect } = require( "puppeteer-real-browser" )
|
6
|
+
const { connect } = require( "puppeteer-real-browser" );
|
7
7
|
|
8
8
|
class WebScraper
|
9
9
|
{
|
10
|
-
constructor ({
|
11
|
-
// Base configuration
|
12
|
-
baseURL,
|
13
|
-
startURL,
|
14
|
-
strictBaseURL,
|
15
|
-
maxDepth,
|
16
|
-
maxArticles,
|
17
|
-
concurrencyLimit,
|
18
|
-
maxRetries,
|
19
|
-
retryDelay,
|
20
|
-
|
21
|
-
// URL filtering
|
22
|
-
excludeList = [],
|
23
|
-
exactExcludeList = [],
|
24
|
-
filterFileTypes,
|
25
|
-
excludedFileTypes,
|
26
|
-
removeURLFragment,
|
27
|
-
|
28
|
-
// Output paths
|
29
|
-
scrapResultPath = "./dataset",
|
30
|
-
jsonlOutputPath,
|
31
|
-
textOutputPath,
|
32
|
-
csvOutputPath,
|
33
|
-
|
34
|
-
// Metadata options
|
35
|
-
includeMetadata = false,
|
36
|
-
metadataFields = [],
|
37
|
-
|
38
|
-
// Network options
|
39
|
-
axiosHeaders,
|
40
|
-
axiosProxy,
|
41
|
-
useProxyAsFallback,
|
42
|
-
|
43
|
-
// Puppeteer options
|
44
|
-
usePuppeteer,
|
45
|
-
puppeteerProxy, // e.g. http://127.0.0.1:2080
|
46
|
-
puppeteerExecutablePath,
|
47
|
-
puppeteerRealProxy
|
48
|
-
})
|
10
|
+
constructor ( config = {})
|
49
11
|
{
|
50
12
|
// Base configuration
|
51
13
|
this.baseURL = baseURL;
|
@@ -54,11 +16,10 @@ class WebScraper
|
|
54
16
|
this.maxDepth = maxDepth || Infinity;
|
55
17
|
this.maxArticles = maxArticles || Infinity;
|
56
18
|
this.concurrencyLimit = concurrencyLimit || 2;
|
57
|
-
this.
|
58
|
-
this.retryDelay = retryDelay || 40000;
|
19
|
+
this.crawlingDelay = crawlingDelay || 1000;
|
59
20
|
|
60
21
|
// Output paths setup
|
61
|
-
this.scrapResultPath = scrapResultPath;
|
22
|
+
this.scrapResultPath = scrapResultPath || "./dataset";
|
62
23
|
this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
|
63
24
|
this.textOutputPathWithMeta = `${this.textOutputPath }_with_metadata`;
|
64
25
|
this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
|
@@ -67,8 +28,8 @@ class WebScraper
|
|
67
28
|
this.csvOutputPathWithMeta = this.csvOutputPath.replace( ".csv", "_with_metadata.csv" );
|
68
29
|
|
69
30
|
// Metadata configuration
|
70
|
-
this.includeMetadata = includeMetadata;
|
71
|
-
this.metadataFields = new Set( metadataFields );
|
31
|
+
this.includeMetadata = includeMetadata || false;
|
32
|
+
this.metadataFields = new Set( metadataFields || [] );
|
72
33
|
|
73
34
|
// URL filtering setup
|
74
35
|
this.visited = new Set();
|
@@ -81,6 +42,8 @@ class WebScraper
|
|
81
42
|
// Network configuration
|
82
43
|
this.axiosHeaders = axiosHeaders;
|
83
44
|
this.axiosProxy = axiosProxy;
|
45
|
+
this.axiosMaxRetries = axiosMaxRetries || 5;
|
46
|
+
this.axiosRetryDelay = axiosRetryDelay || 40000;
|
84
47
|
this.useProxyAsFallback = useProxyAsFallback || false;
|
85
48
|
this.axiosOptions = {};
|
86
49
|
if ( this.axiosHeaders )
|
@@ -97,10 +60,10 @@ class WebScraper
|
|
97
60
|
|
98
61
|
// Puppeteer configuration
|
99
62
|
this.usePuppeteer = usePuppeteer || false;
|
100
|
-
this.puppeteerProxy = puppeteerProxy;
|
63
|
+
this.puppeteerProxy = puppeteerProxy; // http://127.0.0.1:2080
|
101
64
|
this.puppeteerExecutablePath = puppeteerExecutablePath;
|
102
65
|
this.puppeteerRealProxy = puppeteerRealProxy;
|
103
|
-
this.configurePuppeteer(
|
66
|
+
this.configurePuppeteer();
|
104
67
|
}
|
105
68
|
|
106
69
|
async start ()
|
@@ -109,12 +72,12 @@ class WebScraper
|
|
109
72
|
{
|
110
73
|
if ( this.usePuppeteer )
|
111
74
|
{
|
112
|
-
|
75
|
+
const { browser, page } = await connect( this.puppeteerRealOptions );
|
113
76
|
this.puppeteerBrowser = browser;
|
114
77
|
this.puppeteerPage = page;
|
115
78
|
}
|
116
79
|
this.createOutputDirectory();
|
117
|
-
await this.
|
80
|
+
await this.crawl( this.startURL, 0 );
|
118
81
|
this.createJSONLFile();
|
119
82
|
this.saveNumberedTextFiles();
|
120
83
|
this.createCSVFile();
|
@@ -129,94 +92,95 @@ class WebScraper
|
|
129
92
|
{
|
130
93
|
if ( this.puppeteerBrowser )
|
131
94
|
{
|
132
|
-
await this.puppeteerBrowser.close();
|
95
|
+
await this.puppeteerBrowser.close();
|
133
96
|
}
|
134
97
|
}
|
135
98
|
}
|
136
99
|
|
137
|
-
async
|
100
|
+
async crawl ( initialUrl, initialDepth = 0 )
|
138
101
|
{
|
139
|
-
|
140
|
-
|
141
|
-
url = url.split( "#" )[0];
|
142
|
-
}
|
143
|
-
if ( this.hasReachedMax( depth ) )
|
144
|
-
{
|
145
|
-
return;
|
146
|
-
}
|
147
|
-
if ( this.visited.has( url ) )
|
148
|
-
{
|
149
|
-
console.log( `Already visited: ${url}` );
|
150
|
-
return;
|
151
|
-
}
|
152
|
-
this.visited.add( url );
|
153
|
-
if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
|
154
|
-
{
|
155
|
-
return;
|
156
|
-
}
|
157
|
-
try
|
102
|
+
const queue = [{ url: initialUrl, depth: initialDepth }];
|
103
|
+
for ( let i = 0; i < queue.length; i++ )
|
158
104
|
{
|
159
|
-
|
160
|
-
|
161
|
-
if (
|
162
|
-
|
163
|
-
|
105
|
+
let { url, depth } = queue[i];
|
106
|
+
console.log( `Processing URL: ${queue[i].url}` );
|
107
|
+
if ( this.hasReachedMax( depth ) )
|
108
|
+
{
|
109
|
+
continue;
|
110
|
+
}
|
111
|
+
if ( this.removeURLFragment )
|
112
|
+
{
|
113
|
+
url = url.split( "#" )[0];
|
114
|
+
}
|
115
|
+
if ( this.visited.has( url ) )
|
116
|
+
{
|
117
|
+
console.log( `Already visited: ${url}` );
|
118
|
+
continue;
|
119
|
+
}
|
120
|
+
this.visited.add( url );
|
164
121
|
|
165
|
-
if ( !this.
|
122
|
+
if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
|
166
123
|
{
|
167
|
-
|
168
|
-
|
124
|
+
continue;
|
125
|
+
}
|
169
126
|
|
170
|
-
|
127
|
+
try
|
128
|
+
{
|
129
|
+
await WebScraper.sleep( this.crawlingDelay );
|
130
|
+
const data = await this.fetchContent( url );
|
131
|
+
if ( !data ) continue;
|
132
|
+
|
133
|
+
const dom = new JSDOM( data, { url });
|
134
|
+
const { document } = dom.window;
|
135
|
+
|
136
|
+
if ( !this.isExcluded( url ) )
|
171
137
|
{
|
172
|
-
|
138
|
+
const reader = new Readability( document, {
|
139
|
+
charThreshold: 500,
|
140
|
+
nbTopCandidates: 20
|
141
|
+
});
|
142
|
+
const article = reader.parse();
|
143
|
+
if ( article )
|
173
144
|
{
|
174
|
-
|
175
|
-
|
145
|
+
if ( this.hasValidPageContent( article.textContent ) )
|
146
|
+
{
|
147
|
+
const metadata = this.extractMetadata( url, document );
|
148
|
+
this.saveArticle( url, article.textContent, metadata );
|
149
|
+
}
|
150
|
+
else
|
151
|
+
{
|
152
|
+
console.error( `Invalid content found at ${url}` );
|
153
|
+
}
|
176
154
|
}
|
177
155
|
else
|
178
156
|
{
|
179
|
-
console.error( `
|
157
|
+
console.error( `No readable content found at ${url}` );
|
180
158
|
}
|
181
159
|
}
|
182
|
-
else
|
183
|
-
{
|
184
|
-
console.error( `No readable content found at ${url}` );
|
185
|
-
}
|
186
|
-
}
|
187
|
-
|
188
|
-
const links = this.extractLinks( data );
|
189
|
-
const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
|
190
|
-
|
191
|
-
for ( let i = 0; i < unvisitedLinks.length; i += this.concurrencyLimit )
|
192
|
-
{
|
193
|
-
if ( this.hasReachedMax( depth ) )
|
194
|
-
{
|
195
|
-
return;
|
196
|
-
}
|
197
|
-
const batch = unvisitedLinks.slice( i, i + this.concurrencyLimit );
|
198
|
-
const results = await Promise.allSettled( batch.map( link => { return this.fetchPage( link, depth + 1 ) }) );
|
199
160
|
|
200
|
-
|
161
|
+
const links = this.extractLinks( data );
|
162
|
+
const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
|
163
|
+
for ( const link of unvisitedLinks )
|
201
164
|
{
|
202
|
-
if (
|
165
|
+
if ( !this.hasReachedMax( depth ) )
|
203
166
|
{
|
204
|
-
|
167
|
+
queue.push({ url: link, depth: depth + 1 });
|
205
168
|
}
|
206
|
-
}
|
169
|
+
}
|
170
|
+
}
|
171
|
+
catch ( error )
|
172
|
+
{
|
173
|
+
console.error( `Error fetching ${url}:`, error.message, error.code );
|
207
174
|
}
|
208
|
-
}
|
209
|
-
catch ( error )
|
210
|
-
{
|
211
|
-
console.error( `Error fetching ${url}:`, error.message, error.code );
|
212
175
|
}
|
213
176
|
}
|
214
177
|
|
178
|
+
|
215
179
|
async fetchContent ( url )
|
216
180
|
{
|
217
181
|
try
|
218
182
|
{
|
219
|
-
const response = await this.retryAxiosRequest( url )
|
183
|
+
const response = await this.retryAxiosRequest( url );
|
220
184
|
const contentType = response?.headers["content-type"] || "";
|
221
185
|
if ( !contentType?.startsWith( "text" ) )
|
222
186
|
{
|
@@ -225,34 +189,30 @@ class WebScraper
|
|
225
189
|
return null;
|
226
190
|
}
|
227
191
|
|
228
|
-
// Step 3: If Content-Type is HTML, read the full response
|
229
192
|
let htmlContent = "";
|
230
|
-
response.data.on( "data",
|
193
|
+
response.data.on( "data", chunk =>
|
231
194
|
{
|
232
195
|
htmlContent += chunk.toString();
|
233
196
|
});
|
234
|
-
|
235
|
-
// Wait for the stream to finish
|
236
197
|
await new Promise( ( resolve, reject ) =>
|
237
198
|
{
|
238
199
|
response.data.on( "end", resolve );
|
239
200
|
response.data.on( "error", reject );
|
240
201
|
});
|
241
|
-
|
242
202
|
return htmlContent;
|
243
203
|
}
|
244
204
|
catch ( error )
|
245
205
|
{
|
246
206
|
console.error( `Error fetching content ${url}:`, error.message );
|
247
|
-
if ( error.status
|
207
|
+
if ( error.status === 403 && this.usePuppeteer )
|
248
208
|
{
|
249
209
|
try
|
250
210
|
{
|
251
211
|
let result;
|
252
|
-
for ( let
|
212
|
+
for ( let i = 0; i < 10; i++ )
|
253
213
|
{
|
254
214
|
console.log( `Please solve the CAPTCHA on the opened browser window for ${url}` );
|
255
|
-
result = await this.navigateToPage( url )
|
215
|
+
result = await this.navigateToPage( url );
|
256
216
|
if ( this.hasValidPageContent( result.htmlContent ) )
|
257
217
|
{
|
258
218
|
break
|
@@ -265,7 +225,6 @@ class WebScraper
|
|
265
225
|
console.error( `Error solving CAPTCHA for ${url}:`, error.message, error );
|
266
226
|
throw error;
|
267
227
|
}
|
268
|
-
|
269
228
|
}
|
270
229
|
throw error;
|
271
230
|
}
|
@@ -378,11 +337,10 @@ class WebScraper
|
|
378
337
|
createJSONLFile ()
|
379
338
|
{
|
380
339
|
const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
|
381
|
-
|
382
|
-
|
383
|
-
// Add error handlers
|
384
|
-
writeStreamSimple.on( "error", ( err ) => { return console.error( "Error writing JSONL:", err ) });
|
340
|
+
writeStreamSimple.on( "error", err =>
|
341
|
+
{ return console.error( "Error writing JSONL:", err ) });
|
385
342
|
|
343
|
+
let writeStreamMeta;
|
386
344
|
if ( this.includeMetadata )
|
387
345
|
{
|
388
346
|
writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPathWithMeta ) );
|
@@ -428,16 +386,18 @@ class WebScraper
|
|
428
386
|
|
429
387
|
for ( const content of this.allProcessedContent )
|
430
388
|
{
|
431
|
-
// Write simple version
|
432
389
|
const escapedText = content.simple.text.replace( /"/g, "\"\"" );
|
433
390
|
writeStreamSimple.write( `"${escapedText}"\n` );
|
434
391
|
|
435
|
-
// Write metadata version if requested
|
436
392
|
if ( this.includeMetadata )
|
437
393
|
{
|
438
394
|
const { metadata } = content.withMetadata;
|
439
|
-
const metadataValues = Array.from( this.metadataFields )
|
440
|
-
|
395
|
+
const metadataValues = Array.from( this.metadataFields ).map( field =>
|
396
|
+
{
|
397
|
+
return metadata[field]
|
398
|
+
? `"${metadata[field].replace( /"/g, "\"\"" )}"`
|
399
|
+
: "\"\""
|
400
|
+
});
|
441
401
|
writeStreamMeta.write( `"${escapedText}",${metadataValues.join( "," )}\n` );
|
442
402
|
}
|
443
403
|
}
|
@@ -456,10 +416,8 @@ class WebScraper
|
|
456
416
|
|
457
417
|
saveNumberedTextFiles ()
|
458
418
|
{
|
459
|
-
// Create base text folder for simple content
|
460
419
|
const baseTextPath = path.join( __dirname, this.textOutputPath );
|
461
420
|
|
462
|
-
// Create metadata text folder if needed
|
463
421
|
let metaTextPath = null;
|
464
422
|
if ( this.includeMetadata )
|
465
423
|
{
|
@@ -470,20 +428,15 @@ class WebScraper
|
|
470
428
|
this.allProcessedContent.forEach( ( content, index ) =>
|
471
429
|
{
|
472
430
|
const fileName = `${index + 1}.txt`;
|
473
|
-
|
474
|
-
// Always save simple version
|
475
431
|
const simpleFilePath = path.join( baseTextPath, fileName );
|
476
432
|
fs.writeFileSync( simpleFilePath, content.simple.text, "utf-8" );
|
477
433
|
console.log( `Created simple text file: ${fileName}` );
|
478
434
|
|
479
|
-
// Save metadata version if enabled
|
480
435
|
if ( this.includeMetadata )
|
481
436
|
{
|
482
437
|
const metaFilePath = path.join( metaTextPath, fileName );
|
483
438
|
let fileContent = "";
|
484
|
-
|
485
439
|
const { metadata } = content.withMetadata;
|
486
|
-
// Add metadata fields as headers
|
487
440
|
for ( const field of this.metadataFields )
|
488
441
|
{
|
489
442
|
if ( metadata[field] )
|
@@ -493,7 +446,6 @@ class WebScraper
|
|
493
446
|
}
|
494
447
|
fileContent += "\n---\n\n";
|
495
448
|
fileContent += content.withMetadata.text;
|
496
|
-
|
497
449
|
fs.writeFileSync( metaFilePath, fileContent, "utf-8" );
|
498
450
|
console.log( `Created metadata text file: ${fileName}` );
|
499
451
|
}
|
@@ -503,33 +455,27 @@ class WebScraper
|
|
503
455
|
processContent ( content )
|
504
456
|
{
|
505
457
|
let processed = content;
|
506
|
-
|
507
|
-
// Remove "[You can read more about this here]" and similar patterns
|
458
|
+
// Remove unwanted fixed text
|
508
459
|
processed = processed.replace( /\[You can read more about this here\]/g, "" ).trim();
|
509
|
-
|
510
|
-
|
511
|
-
|
460
|
+
// Trim each line and remove extra newlines
|
461
|
+
processed = processed
|
462
|
+
.split( "\n" )
|
512
463
|
.map( line => { return line.trim() })
|
513
|
-
.join( "\n" )
|
514
|
-
|
515
|
-
// Replace 3 or more newlines with a single newline
|
516
|
-
processed = processed.replace( /\n{3,}/g, "\n\n" );
|
464
|
+
.join( "\n" )
|
465
|
+
.replace( /\n{3,}/g, "\n\n" );
|
517
466
|
|
518
|
-
//
|
519
|
-
// processed = processed.replace(/\[.*?\]/g, ''); // Removes all content within square brackets
|
520
|
-
// processed = processed.replace(/\(.*?\)/g, ''); // Removes all content within parentheses
|
521
|
-
|
522
|
-
// Remove specified words from the end of content, handling multiple occurrences
|
467
|
+
// Remove specified words at the end (repeatedly)
|
523
468
|
const wordsToTrim = ["Facebook", "Twitter", "Donate Now", "Instagram"];
|
524
469
|
let changed = true;
|
525
|
-
|
526
470
|
while ( changed )
|
527
471
|
{
|
528
472
|
changed = false;
|
529
|
-
for (
|
473
|
+
for ( const word of wordsToTrim )
|
530
474
|
{
|
531
475
|
const oldProcessed = processed;
|
532
|
-
processed = processed
|
476
|
+
processed = processed
|
477
|
+
.replace( new RegExp( `\\s*${word}\\s*$`, "g" ), "" )
|
478
|
+
.trim();
|
533
479
|
if ( oldProcessed !== processed )
|
534
480
|
{
|
535
481
|
changed = true;
|
@@ -542,7 +488,6 @@ class WebScraper
|
|
542
488
|
filterMetadata ( metadata )
|
543
489
|
{
|
544
490
|
if ( !this.includeMetadata ) return {};
|
545
|
-
|
546
491
|
const filteredMetadata = {};
|
547
492
|
for ( const field of this.metadataFields )
|
548
493
|
{
|
@@ -562,10 +507,13 @@ class WebScraper
|
|
562
507
|
description: document.querySelector( "meta[name=\"description\"]" )?.content,
|
563
508
|
keywords: document.querySelector( "meta[name=\"keywords\"]" )?.content,
|
564
509
|
author: document.querySelector( "meta[name=\"author\"]" )?.content,
|
565
|
-
language:
|
510
|
+
language:
|
511
|
+
document.documentElement.lang ||
|
512
|
+
document.querySelector( "html" )?.getAttribute( "lang" ),
|
566
513
|
canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
|
567
514
|
ogTitle: document.querySelector( "meta[property=\"og:title\"]" )?.content,
|
568
|
-
ogDescription: document.querySelector( "meta[property=\"og:description\"]" )
|
515
|
+
ogDescription: document.querySelector( "meta[property=\"og:description\"]" )
|
516
|
+
?.content,
|
569
517
|
ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
|
570
518
|
ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
|
571
519
|
dateScrapedDate: new Date().toISOString()
|
@@ -581,7 +529,7 @@ class WebScraper
|
|
581
529
|
...this.axiosOptions,
|
582
530
|
};
|
583
531
|
|
584
|
-
for ( let attempt = 1; attempt <= this.
|
532
|
+
for ( let attempt = 1; attempt <= this.axiosMaxRetries; attempt++ )
|
585
533
|
{
|
586
534
|
try
|
587
535
|
{
|
@@ -589,7 +537,7 @@ class WebScraper
|
|
589
537
|
{
|
590
538
|
break;
|
591
539
|
}
|
592
|
-
if ( attempt === this.
|
540
|
+
if ( attempt === this.axiosMaxRetries && this.useProxyAsFallback && this.axiosProxy )
|
593
541
|
{
|
594
542
|
options = {
|
595
543
|
...options,
|
@@ -601,9 +549,9 @@ class WebScraper
|
|
601
549
|
}
|
602
550
|
catch ( error )
|
603
551
|
{
|
604
|
-
if ( attempt >= this.
|
605
|
-
await WebScraper.sleep( this.
|
606
|
-
console.error( `Retrying request to ${url} (Attempt ${attempt + 1}/${this.
|
552
|
+
if ( attempt >= this.axiosMaxRetries ) throw error;
|
553
|
+
await WebScraper.sleep( this.axiosRetryDelay * attempt );
|
554
|
+
console.error( `Retrying request to ${url} (Attempt ${attempt + 1}/${this.axiosMaxRetries})`, error.message, error.code );
|
607
555
|
}
|
608
556
|
}
|
609
557
|
throw new Error( "Max retries reached" );
|
@@ -623,7 +571,6 @@ class WebScraper
|
|
623
571
|
{
|
624
572
|
this.puppeteerOptions.args.push( `--proxy-server=${this.puppeteerProxy}` );
|
625
573
|
}
|
626
|
-
|
627
574
|
if ( this.puppeteerExecutablePath )
|
628
575
|
{
|
629
576
|
this.puppeteerOptions.executablePath = this.puppeteerExecutablePath;
|
@@ -697,10 +644,7 @@ class WebScraper
|
|
697
644
|
|
698
645
|
hasValidPageContent ( content )
|
699
646
|
{
|
700
|
-
// Remove whitespace and newlines for checking
|
701
647
|
const cleanContent = content.replace( /\s+/g, " " ).trim().toLowerCase();
|
702
|
-
|
703
|
-
// List of phrases that indicate invalid content
|
704
648
|
const invalidPhrases = [
|
705
649
|
"verifying that you are not a robot",
|
706
650
|
"verifying you are human. this may take a few seconds.",
|
@@ -724,37 +668,35 @@ class WebScraper
|
|
724
668
|
|
725
669
|
createOutputDirectory ()
|
726
670
|
{
|
727
|
-
|
728
|
-
|
729
|
-
|
730
|
-
|
731
|
-
|
732
|
-
|
733
|
-
|
734
|
-
|
735
|
-
|
736
|
-
|
737
|
-
fs.rmSync( path.join( __dirname, this.textOutputPathWithMeta ), { recursive: true, force: true });
|
738
|
-
}
|
739
|
-
if ( fs.existsSync( path.join( __dirname, this.csvOutputPath ) ) )
|
740
|
-
{
|
741
|
-
fs.rmSync( path.join( __dirname, this.csvOutputPath ), { recursive: true, force: true });
|
742
|
-
}
|
743
|
-
if ( fs.existsSync( path.join( __dirname, this.csvOutputPathWithMeta ) ) )
|
744
|
-
{
|
745
|
-
fs.rmSync( path.join( __dirname, this.csvOutputPathWithMeta ), { recursive: true, force: true });
|
746
|
-
}
|
747
|
-
if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPath ) ) )
|
671
|
+
const paths = [
|
672
|
+
path.join( __dirname, this.scrapResultPath ),
|
673
|
+
path.join( __dirname, this.textOutputPath ),
|
674
|
+
path.join( __dirname, this.textOutputPathWithMeta ),
|
675
|
+
path.join( __dirname, this.csvOutputPath ),
|
676
|
+
path.join( __dirname, this.csvOutputPathWithMeta ),
|
677
|
+
path.join( __dirname, this.jsonlOutputPath ),
|
678
|
+
path.join( __dirname, this.jsonlOutputPathWithMeta )
|
679
|
+
];
|
680
|
+
for ( const p of paths )
|
748
681
|
{
|
749
|
-
|
682
|
+
if ( fs.existsSync( p ) )
|
683
|
+
{
|
684
|
+
fs.rmSync( p, { recursive: true, force: true });
|
685
|
+
}
|
750
686
|
}
|
751
|
-
|
687
|
+
// Recreate directories needed for output
|
688
|
+
this.ensureDirectory( path.join( __dirname, this.scrapResultPath ) );
|
689
|
+
this.ensureDirectory( path.join( __dirname, this.textOutputPath ) );
|
690
|
+
this.ensureDirectory( path.join( __dirname, this.textOutputPathWithMeta ) );
|
691
|
+
}
|
692
|
+
|
693
|
+
// Helper method to ensure a directory exists
|
694
|
+
ensureDirectory ( dirPath )
|
695
|
+
{
|
696
|
+
if ( !fs.existsSync( dirPath ) )
|
752
697
|
{
|
753
|
-
fs.
|
698
|
+
fs.mkdirSync( dirPath, { recursive: true });
|
754
699
|
}
|
755
|
-
fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
|
756
|
-
fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
|
757
|
-
fs.mkdirSync( path.join( __dirname, this.textOutputPathWithMeta ), { recursive: true });
|
758
700
|
}
|
759
701
|
|
760
702
|
static async sleep ( ms )
|
@@ -766,11 +708,7 @@ class WebScraper
|
|
766
708
|
{
|
767
709
|
await WebScraper.sleep( 1000 );
|
768
710
|
const fullOutputPath = path.join( __dirname, outputPath );
|
769
|
-
|
770
|
-
// Create output directories
|
771
711
|
WebScraper.createCombinedDirectories( fullOutputPath );
|
772
|
-
|
773
|
-
// Combine files by type
|
774
712
|
WebScraper.combineJSONLFiles( fullOutputPath, websites );
|
775
713
|
WebScraper.combineCSVFiles( fullOutputPath, websites );
|
776
714
|
WebScraper.combineTextFiles( fullOutputPath, websites );
|
@@ -784,34 +722,44 @@ class WebScraper
|
|
784
722
|
}
|
785
723
|
fs.mkdirSync( fullOutputPath, { recursive: true });
|
786
724
|
fs.mkdirSync( path.join( fullOutputPath, "texts" ), { recursive: true });
|
787
|
-
fs.mkdirSync( path.join( fullOutputPath, "texts_with_metadata" ), {
|
725
|
+
fs.mkdirSync( path.join( fullOutputPath, "texts_with_metadata" ), {
|
726
|
+
recursive: true
|
727
|
+
});
|
788
728
|
}
|
789
729
|
|
790
730
|
static combineJSONLFiles ( fullOutputPath, websites )
|
791
731
|
{
|
792
|
-
const jsonlOutput = fs
|
793
|
-
.
|
794
|
-
|
795
|
-
|
732
|
+
const jsonlOutput = fs
|
733
|
+
.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) )
|
734
|
+
.on( "error", err =>
|
735
|
+
{ return console.error( "Error combining JSONL:", err ) });
|
736
|
+
const jsonlMetaOutput = fs
|
737
|
+
.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.jsonl" ) )
|
738
|
+
.on( "error", err =>
|
739
|
+
{ return console.error( "Error combining metadata JSONL:", err ) });
|
796
740
|
|
797
741
|
for ( const website of websites )
|
798
742
|
{
|
799
|
-
const jsonlContent = fs.readFileSync(
|
743
|
+
const jsonlContent = fs.readFileSync(
|
744
|
+
path.join( __dirname, website.jsonlOutputPath ),
|
745
|
+
"utf-8"
|
746
|
+
);
|
800
747
|
if ( jsonlContent )
|
801
748
|
{
|
802
749
|
jsonlOutput.write( jsonlContent );
|
803
750
|
}
|
804
|
-
|
805
751
|
if ( website.includeMetadata )
|
806
752
|
{
|
807
|
-
const jsonlMetaContent = fs.readFileSync(
|
753
|
+
const jsonlMetaContent = fs.readFileSync(
|
754
|
+
path.join( __dirname, website.jsonlOutputPathWithMeta ),
|
755
|
+
"utf-8"
|
756
|
+
);
|
808
757
|
if ( jsonlMetaContent )
|
809
758
|
{
|
810
759
|
jsonlMetaOutput.write( jsonlMetaContent );
|
811
760
|
}
|
812
761
|
}
|
813
762
|
}
|
814
|
-
|
815
763
|
jsonlOutput.end();
|
816
764
|
jsonlMetaOutput.end();
|
817
765
|
}
|
@@ -822,7 +770,8 @@ class WebScraper
|
|
822
770
|
const csvMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.csv" ) );
|
823
771
|
|
824
772
|
csvOutput.write( "text\n" );
|
825
|
-
const metadataFields =
|
773
|
+
const metadataFields =
|
774
|
+
websites.find( w => { return w.includeMetadata })?.metadataFields || new Set();
|
826
775
|
|
827
776
|
if ( metadataFields.size > 0 )
|
828
777
|
{
|
@@ -839,10 +788,13 @@ class WebScraper
|
|
839
788
|
{
|
840
789
|
csvOutput.write( `${csvContent.join( "\n" )}\n` );
|
841
790
|
}
|
842
|
-
|
843
791
|
if ( website.includeMetadata )
|
844
792
|
{
|
845
|
-
const csvMetaContent = fs
|
793
|
+
const csvMetaContent = fs
|
794
|
+
.readFileSync(
|
795
|
+
path.join( __dirname, website.csvOutputPathWithMeta ),
|
796
|
+
"utf-8"
|
797
|
+
)
|
846
798
|
.split( "\n" )
|
847
799
|
.slice( 1 )
|
848
800
|
.filter( line => { return line.trim() });
|
@@ -852,7 +804,6 @@ class WebScraper
|
|
852
804
|
}
|
853
805
|
}
|
854
806
|
}
|
855
|
-
|
856
807
|
csvOutput.end();
|
857
808
|
csvMetaOutput.end();
|
858
809
|
}
|
@@ -860,20 +811,20 @@ class WebScraper
|
|
860
811
|
static combineTextFiles ( fullOutputPath, websites )
|
861
812
|
{
|
862
813
|
let textFileCounter = 1;
|
863
|
-
|
864
814
|
for ( const website of websites )
|
865
815
|
{
|
866
816
|
const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
|
867
|
-
|
868
817
|
for ( const file of textFiles )
|
869
818
|
{
|
870
|
-
const content = fs.readFileSync(
|
819
|
+
const content = fs.readFileSync(
|
820
|
+
path.join( __dirname, website.textOutputPath, file ),
|
821
|
+
"utf-8"
|
822
|
+
);
|
871
823
|
fs.writeFileSync(
|
872
824
|
path.join( fullOutputPath, "texts", `${textFileCounter}.txt` ),
|
873
825
|
content,
|
874
826
|
"utf-8"
|
875
827
|
);
|
876
|
-
|
877
828
|
if ( website.includeMetadata )
|
878
829
|
{
|
879
830
|
const metaContent = fs.readFileSync(
|
@@ -881,7 +832,11 @@ class WebScraper
|
|
881
832
|
"utf-8"
|
882
833
|
);
|
883
834
|
fs.writeFileSync(
|
884
|
-
path.join(
|
835
|
+
path.join(
|
836
|
+
fullOutputPath,
|
837
|
+
"texts_with_metadata",
|
838
|
+
`${textFileCounter}.txt`
|
839
|
+
),
|
885
840
|
metaContent,
|
886
841
|
"utf-8"
|
887
842
|
);
|
@@ -892,4 +847,4 @@ class WebScraper
|
|
892
847
|
}
|
893
848
|
}
|
894
849
|
|
895
|
-
module.exports = WebScraper;
|
850
|
+
module.exports = WebScraper;
|