clean-web-scraper 3.9.0 → 3.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/main.js +124 -135
- package/package.json +1 -1
package/main.js
CHANGED
@@ -3,48 +3,48 @@ const { JSDOM } = require( "jsdom" );
 const { Readability } = require( "@mozilla/readability" );
 const fs = require( "fs" );
 const path = require( "path" );
-const { connect } = require( "puppeteer-real-browser" )
+const { connect } = require( "puppeteer-real-browser" );
 
 class WebScraper
 {
-constructor ( config )
+constructor ( config = {})
 {
 // Base configuration
-this.baseURL = baseURL;
-this.startURL = startURL || baseURL;
-this.strictBaseURL = strictBaseURL || true;
-this.maxDepth = maxDepth || Infinity;
-this.maxArticles = maxArticles || Infinity;
-this.concurrencyLimit = concurrencyLimit || 2;
-this.crawlingDelay = crawlingDelay || 1000;
+this.baseURL = config.baseURL;
+this.startURL = config.startURL || config.baseURL;
+this.strictBaseURL = config.strictBaseURL || true;
+this.maxDepth = config.maxDepth || Infinity;
+this.maxArticles = config.maxArticles || Infinity;
+this.concurrencyLimit = config.concurrencyLimit || 2;
+this.crawlingDelay = config.crawlingDelay || 1000;
 
 // Output paths setup
-this.scrapResultPath = scrapResultPath || "./dataset";
-this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
+this.scrapResultPath = config.scrapResultPath || "./dataset";
+this.textOutputPath = config.textOutputPath || path.join( this.scrapResultPath, "texts" );
 this.textOutputPathWithMeta = `${this.textOutputPath }_with_metadata`;
-this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
+this.jsonlOutputPath = config.jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
 this.jsonlOutputPathWithMeta = this.jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
-this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
+this.csvOutputPath = config.csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
 this.csvOutputPathWithMeta = this.csvOutputPath.replace( ".csv", "_with_metadata.csv" );
 
 // Metadata configuration
-this.includeMetadata = includeMetadata || false;
-this.metadataFields = new Set( metadataFields || [] );
+this.includeMetadata = config.includeMetadata || false;
+this.metadataFields = new Set( config.metadataFields || [] );
 
 // URL filtering setup
 this.visited = new Set();
-this.excludeList = this.normalizeExcludeList( excludeList );
-this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
-this.filterFileTypes = filterFileTypes
-this.excludedFileTypes = excludedFileTypes || [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"];
-this.removeURLFragment = removeURLFragment
+this.excludeList = this.normalizeExcludeList( config.excludeList );
+this.exactExcludeList = this.normalizeExcludeList( config.exactExcludeList );
+this.filterFileTypes = config.filterFileTypes ?? true;
+this.excludedFileTypes = config.excludedFileTypes || [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"];
+this.removeURLFragment = config.removeURLFragment ?? true;
 
 // Network configuration
-this.axiosHeaders = axiosHeaders;
-this.axiosProxy = axiosProxy;
-this.axiosMaxRetries = axiosMaxRetries || 5;
-this.axiosRetryDelay = axiosRetryDelay || 40000;
-this.useProxyAsFallback = useProxyAsFallback || false;
+this.axiosHeaders = config.axiosHeaders;
+this.axiosProxy = config.axiosProxy;
+this.axiosMaxRetries = config.axiosMaxRetries || 5;
+this.axiosRetryDelay = config.axiosRetryDelay || 40000;
+this.useProxyAsFallback = config.useProxyAsFallback || false;
 this.axiosOptions = {};
 if ( this.axiosHeaders )
 {
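Note: the 3.9.0 constructor read bare identifiers (baseURL, maxDepth, ...) that were never in scope; 3.9.2 reads every option off the config parameter, which now defaults to {}. Two defaulting styles coexist: || falls back on any falsy value, while the new ?? lines fall back only on null/undefined, so an explicit false is preserved for filterFileTypes and removeURLFragment. A quick illustration:

    const config = { filterFileTypes: false, maxDepth: 0 };
    const a = config.filterFileTypes ?? true; // false: an explicit false survives ??
    const b = config.maxDepth || Infinity;    // Infinity: || also discards the falsy 0
    const c = config.strictBaseURL || true;   // true: a ||-defaulted flag cannot be switched off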
@@ -59,11 +59,11 @@ class WebScraper
 this.allProcessedContent = [];
 
 // Puppeteer configuration
-this.usePuppeteer = usePuppeteer || false;
-this.puppeteerProxy = puppeteerProxy; // http://127.0.0.1:2080
-this.puppeteerExecutablePath = puppeteerExecutablePath;
-this.puppeteerRealProxy = puppeteerRealProxy;
-this.configurePuppeteer(
+this.usePuppeteer = config.usePuppeteer || false;
+this.puppeteerProxy = config.puppeteerProxy; // http://127.0.0.1:2080
+this.puppeteerExecutablePath = config.puppeteerExecutablePath;
+this.puppeteerRealProxy = config.puppeteerRealProxy;
+this.configurePuppeteer();
 }
 
 async start ()
@@ -72,7 +72,7 @@ class WebScraper
 {
 if ( this.usePuppeteer )
 {
-
+const { browser, page } = await connect( this.puppeteerRealOptions );
 this.puppeteerBrowser = browser;
 this.puppeteerPage = page;
 }
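Note: puppeteer-real-browser's connect() resolves to an object holding the live browser and page, which start() now destructures (the removed 3.9.0 line was truncated by the diff viewer). A minimal sketch, assuming connect() tolerates a bare options object in place of the puppeteerRealOptions built by configurePuppeteer():

    const { connect } = require( "puppeteer-real-browser" );

    async function openRealBrowser ()
    {
        const { browser, page } = await connect({}); // options object assumed acceptable when empty
        await page.goto( "https://example.com" );
        await browser.close();
    }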
@@ -92,7 +92,7 @@ class WebScraper
 {
 if ( this.puppeteerBrowser )
 {
-await this.puppeteerBrowser.close();
+await this.puppeteerBrowser.close();
 }
 }
 }
@@ -102,8 +102,8 @@ class WebScraper
 const queue = [{ url: initialUrl, depth: initialDepth }];
 for ( let i = 0; i < queue.length; i++ )
 {
-console.log( `Processing URL: ${queue[i].url}` );
 let { url, depth } = queue[i];
+console.log( `Processing URL: ${queue[i].url}` );
 if ( this.hasReachedMax( depth ) )
 {
 continue;
@@ -140,7 +140,6 @@ class WebScraper
 nbTopCandidates: 20
 });
 const article = reader.parse();
-
 if ( article )
 {
 if ( this.hasValidPageContent( article.textContent ) )
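Note: the surrounding parse step is the standard @mozilla/readability flow: build a DOM with jsdom, hand its document to Readability, and read textContent off the parsed article. A condensed sketch; nbTopCandidates: 20 mirrors the option shown above:

    const { JSDOM } = require( "jsdom" );
    const { Readability } = require( "@mozilla/readability" );

    function extractArticle ( html, url )
    {
        const dom = new JSDOM( html, { url });
        const reader = new Readability( dom.window.document, { nbTopCandidates: 20 });
        const article = reader.parse(); // returns null when nothing readable is found
        return article ? article.textContent : null;
    }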
@@ -161,7 +160,6 @@ class WebScraper
 
 const links = this.extractLinks( data );
 const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
-
 for ( const link of unvisitedLinks )
 {
 if ( !this.hasReachedMax( depth ) )
@@ -182,7 +180,7 @@ class WebScraper
 {
 try
 {
-const response = await this.retryAxiosRequest( url )
+const response = await this.retryAxiosRequest( url );
 const contentType = response?.headers["content-type"] || "";
 if ( !contentType?.startsWith( "text" ) )
 {
@@ -191,34 +189,30 @@ class WebScraper
 return null;
 }
 
-// Step 3: If Content-Type is HTML, read the full response
 let htmlContent = "";
-response.data.on( "data",
+response.data.on( "data", chunk =>
 {
 htmlContent += chunk.toString();
 });
-
-// Wait for the stream to finish
 await new Promise( ( resolve, reject ) =>
 {
 response.data.on( "end", resolve );
 response.data.on( "error", reject );
 });
-
 return htmlContent;
 }
 catch ( error )
 {
 console.error( `Error fetching content ${url}:`, error.message );
-if ( error.status
+if ( error.status === 403 && this.usePuppeteer )
 {
 try
 {
 let result;
-for ( let
+for ( let i = 0; i < 10; i++ )
 {
 console.log( `Please solve the CAPTCHA on the opened browser window for ${url}` );
-result = await this.navigateToPage( url )
+result = await this.navigateToPage( url );
 if ( this.hasValidPageContent( result.htmlContent ) )
 {
 break
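Note: the fetch path consumes the axios response as a stream: chunks are appended on "data" and a promise settles on "end" or "error". The same pattern in isolation, assuming the request was made with responseType: "stream":

    const axios = require( "axios" );

    async function fetchHTML ( url )
    {
        const response = await axios.get( url, { responseType: "stream" });
        let htmlContent = "";
        response.data.on( "data", chunk => { htmlContent += chunk.toString(); });
        await new Promise( ( resolve, reject ) =>
        {
            response.data.on( "end", resolve );
            response.data.on( "error", reject );
        });
        return htmlContent;
    }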
@@ -231,7 +225,6 @@ class WebScraper
 console.error( `Error solving CAPTCHA for ${url}:`, error.message, error );
 throw error;
 }
-
 }
 throw error;
 }
@@ -344,11 +337,10 @@ class WebScraper
 createJSONLFile ()
 {
 const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
-
-
-// Add error handlers
-writeStreamSimple.on( "error", ( err ) => { return console.error( "Error writing JSONL:", err ) });
+writeStreamSimple.on( "error", err =>
+{ return console.error( "Error writing JSONL:", err ) });
 
+let writeStreamMeta;
 if ( this.includeMetadata )
 {
 writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPathWithMeta ) );
@@ -394,16 +386,18 @@ class WebScraper
 
 for ( const content of this.allProcessedContent )
 {
-// Write simple version
 const escapedText = content.simple.text.replace( /"/g, "\"\"" );
 writeStreamSimple.write( `"${escapedText}"\n` );
 
-// Write metadata version if requested
 if ( this.includeMetadata )
 {
 const { metadata } = content.withMetadata;
-const metadataValues = Array.from( this.metadataFields )
-
+const metadataValues = Array.from( this.metadataFields ).map( field =>
+{
+return metadata[field]
+? `"${metadata[field].replace( /"/g, "\"\"" )}"`
+: "\"\""
+});
 writeStreamMeta.write( `"${escapedText}",${metadataValues.join( "," )}\n` );
 }
 }
@@ -422,10 +416,8 @@ class WebScraper
 
 saveNumberedTextFiles ()
 {
-// Create base text folder for simple content
 const baseTextPath = path.join( __dirname, this.textOutputPath );
 
-// Create metadata text folder if needed
 let metaTextPath = null;
 if ( this.includeMetadata )
 {
@@ -436,20 +428,15 @@ class WebScraper
 this.allProcessedContent.forEach( ( content, index ) =>
 {
 const fileName = `${index + 1}.txt`;
-
-// Always save simple version
 const simpleFilePath = path.join( baseTextPath, fileName );
 fs.writeFileSync( simpleFilePath, content.simple.text, "utf-8" );
 console.log( `Created simple text file: ${fileName}` );
 
-// Save metadata version if enabled
 if ( this.includeMetadata )
 {
 const metaFilePath = path.join( metaTextPath, fileName );
 let fileContent = "";
-
 const { metadata } = content.withMetadata;
-// Add metadata fields as headers
 for ( const field of this.metadataFields )
 {
 if ( metadata[field] )
@@ -459,7 +446,6 @@ class WebScraper
 }
 fileContent += "\n---\n\n";
 fileContent += content.withMetadata.text;
-
 fs.writeFileSync( metaFilePath, fileContent, "utf-8" );
 console.log( `Created metadata text file: ${fileName}` );
 }
@@ -469,33 +455,27 @@ class WebScraper
 processContent ( content )
 {
 let processed = content;
-
-// Remove "[You can read more about this here]" and similar patterns
+// Remove unwanted fixed text
 processed = processed.replace( /\[You can read more about this here\]/g, "" ).trim();
-
-
-
+// Trim each line and remove extra newlines
+processed = processed
+.split( "\n" )
 .map( line => { return line.trim() })
-.join( "\n" )
-
-// Replace 3 or more newlines with a single newline
-processed = processed.replace( /\n{3,}/g, "\n\n" );
-
-// Add more processing rules as needed:
-// processed = processed.replace(/\[.*?\]/g, ''); // Removes all content within square brackets
-// processed = processed.replace(/\(.*?\)/g, ''); // Removes all content within parentheses
+.join( "\n" )
+.replace( /\n{3,}/g, "\n\n" );
 
-// Remove specified words
+// Remove specified words at the end (repeatedly)
 const wordsToTrim = ["Facebook", "Twitter", "Donate Now", "Instagram"];
 let changed = true;
-
 while ( changed )
 {
 changed = false;
-for (
+for ( const word of wordsToTrim )
 {
 const oldProcessed = processed;
-processed = processed
+processed = processed
+.replace( new RegExp( `\\s*${word}\\s*$`, "g" ), "" )
+.trim();
 if ( oldProcessed !== processed )
 {
 changed = true;
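Note: the rebuilt loop strips any of the listed words from the end of the text and repeats until a full pass changes nothing, so stacked footer junk ("... Instagram Twitter Facebook") peels off one layer at a time. The core step in isolation; interpolating the word into a RegExp is safe here only because none of the entries contain regex metacharacters:

    const word = "Facebook";
    const out = "Real content Facebook".replace( new RegExp( `\\s*${word}\\s*$`, "g" ), "" ).trim();
    // out === "Real content"; the outer while loop retries until no listed word ends the text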
@@ -508,7 +488,6 @@ class WebScraper
 filterMetadata ( metadata )
 {
 if ( !this.includeMetadata ) return {};
-
 const filteredMetadata = {};
 for ( const field of this.metadataFields )
 {
@@ -528,10 +507,13 @@ class WebScraper
 description: document.querySelector( "meta[name=\"description\"]" )?.content,
 keywords: document.querySelector( "meta[name=\"keywords\"]" )?.content,
 author: document.querySelector( "meta[name=\"author\"]" )?.content,
-language:
+language:
+document.documentElement.lang ||
+document.querySelector( "html" )?.getAttribute( "lang" ),
 canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
 ogTitle: document.querySelector( "meta[property=\"og:title\"]" )?.content,
-ogDescription: document.querySelector( "meta[property=\"og:description\"]" )
+ogDescription: document.querySelector( "meta[property=\"og:description\"]" )
+?.content,
 ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
 ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
 dateScrapedDate: new Date().toISOString()
@@ -589,7 +571,6 @@ class WebScraper
 {
 this.puppeteerOptions.args.push( `--proxy-server=${this.puppeteerProxy}` );
 }
-
 if ( this.puppeteerExecutablePath )
 {
 this.puppeteerOptions.executablePath = this.puppeteerExecutablePath;
@@ -663,10 +644,7 @@ class WebScraper
 
 hasValidPageContent ( content )
 {
-// Remove whitespace and newlines for checking
 const cleanContent = content.replace( /\s+/g, " " ).trim().toLowerCase();
-
-// List of phrases that indicate invalid content
 const invalidPhrases = [
 "verifying that you are not a robot",
 "verifying you are human. this may take a few seconds.",
@@ -690,37 +668,35 @@ class WebScraper
 
 createOutputDirectory ()
 {
-
-
-
-
-
-
-
-
-
-
-fs.rmSync( path.join( __dirname, this.textOutputPathWithMeta ), { recursive: true, force: true });
-}
-if ( fs.existsSync( path.join( __dirname, this.csvOutputPath ) ) )
-{
-fs.rmSync( path.join( __dirname, this.csvOutputPath ), { recursive: true, force: true });
-}
-if ( fs.existsSync( path.join( __dirname, this.csvOutputPathWithMeta ) ) )
-{
-fs.rmSync( path.join( __dirname, this.csvOutputPathWithMeta ), { recursive: true, force: true });
-}
-if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPath ) ) )
+const paths = [
+path.join( __dirname, this.scrapResultPath ),
+path.join( __dirname, this.textOutputPath ),
+path.join( __dirname, this.textOutputPathWithMeta ),
+path.join( __dirname, this.csvOutputPath ),
+path.join( __dirname, this.csvOutputPathWithMeta ),
+path.join( __dirname, this.jsonlOutputPath ),
+path.join( __dirname, this.jsonlOutputPathWithMeta )
+];
+for ( const p of paths )
 {
-
+if ( fs.existsSync( p ) )
+{
+fs.rmSync( p, { recursive: true, force: true });
+}
 }
-
+// Recreate directories needed for output
+this.ensureDirectory( path.join( __dirname, this.scrapResultPath ) );
+this.ensureDirectory( path.join( __dirname, this.textOutputPath ) );
+this.ensureDirectory( path.join( __dirname, this.textOutputPathWithMeta ) );
+}
+
+// Helper method to ensure a directory exists
+ensureDirectory ( dirPath )
+{
+if ( !fs.existsSync( dirPath ) )
 {
-fs.
+fs.mkdirSync( dirPath, { recursive: true });
 }
-fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
-fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
-fs.mkdirSync( path.join( __dirname, this.textOutputPathWithMeta ), { recursive: true });
 }
 
 static async sleep ( ms )
@@ -732,11 +708,7 @@ class WebScraper
 {
 await WebScraper.sleep( 1000 );
 const fullOutputPath = path.join( __dirname, outputPath );
-
-// Create output directories
 WebScraper.createCombinedDirectories( fullOutputPath );
-
-// Combine files by type
 WebScraper.combineJSONLFiles( fullOutputPath, websites );
 WebScraper.combineCSVFiles( fullOutputPath, websites );
 WebScraper.combineTextFiles( fullOutputPath, websites );
@@ -750,34 +722,44 @@ class WebScraper
 }
 fs.mkdirSync( fullOutputPath, { recursive: true });
 fs.mkdirSync( path.join( fullOutputPath, "texts" ), { recursive: true });
-fs.mkdirSync( path.join( fullOutputPath, "texts_with_metadata" ), {
+fs.mkdirSync( path.join( fullOutputPath, "texts_with_metadata" ), {
+recursive: true
+});
 }
 
 static combineJSONLFiles ( fullOutputPath, websites )
 {
-const jsonlOutput = fs
-.
-
-
+const jsonlOutput = fs
+.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) )
+.on( "error", err =>
+{ return console.error( "Error combining JSONL:", err ) });
+const jsonlMetaOutput = fs
+.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.jsonl" ) )
+.on( "error", err =>
+{ return console.error( "Error combining metadata JSONL:", err ) });
 
 for ( const website of websites )
 {
-const jsonlContent = fs.readFileSync(
+const jsonlContent = fs.readFileSync(
+path.join( __dirname, website.jsonlOutputPath ),
+"utf-8"
+);
 if ( jsonlContent )
 {
 jsonlOutput.write( jsonlContent );
 }
-
 if ( website.includeMetadata )
 {
-const jsonlMetaContent = fs.readFileSync(
+const jsonlMetaContent = fs.readFileSync(
+path.join( __dirname, website.jsonlOutputPathWithMeta ),
+"utf-8"
+);
 if ( jsonlMetaContent )
 {
 jsonlMetaOutput.write( jsonlMetaContent );
 }
 }
 }
-
 jsonlOutput.end();
 jsonlMetaOutput.end();
 }
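Note: the chained assignment above works because .on() returns the emitter itself, so jsonlOutput holds the WriteStream rather than the listener's return value:

    const stream = fs.createWriteStream( "combined.jsonl" ).on( "error", console.error );
    stream.write( "still the WriteStream\n" ); // .on() returned the stream, enabling the chain
    stream.end();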
@@ -788,7 +770,8 @@ class WebScraper
 const csvMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.csv" ) );
 
 csvOutput.write( "text\n" );
-const metadataFields =
+const metadataFields =
+websites.find( w => { return w.includeMetadata })?.metadataFields || new Set();
 
 if ( metadataFields.size > 0 )
 {
@@ -805,10 +788,13 @@ class WebScraper
 {
 csvOutput.write( `${csvContent.join( "\n" )}\n` );
 }
-
 if ( website.includeMetadata )
 {
-const csvMetaContent = fs
+const csvMetaContent = fs
+.readFileSync(
+path.join( __dirname, website.csvOutputPathWithMeta ),
+"utf-8"
+)
 .split( "\n" )
 .slice( 1 )
 .filter( line => { return line.trim() });
@@ -818,7 +804,6 @@ class WebScraper
 }
 }
 }
-
 csvOutput.end();
 csvMetaOutput.end();
 }
@@ -826,20 +811,20 @@ class WebScraper
 static combineTextFiles ( fullOutputPath, websites )
 {
 let textFileCounter = 1;
-
 for ( const website of websites )
 {
 const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
-
 for ( const file of textFiles )
 {
-const content = fs.readFileSync(
+const content = fs.readFileSync(
+path.join( __dirname, website.textOutputPath, file ),
+"utf-8"
+);
 fs.writeFileSync(
 path.join( fullOutputPath, "texts", `${textFileCounter}.txt` ),
 content,
 "utf-8"
 );
-
 if ( website.includeMetadata )
 {
 const metaContent = fs.readFileSync(
@@ -847,7 +832,11 @@ class WebScraper
 "utf-8"
 );
 fs.writeFileSync(
-path.join(
+path.join(
+fullOutputPath,
+"texts_with_metadata",
+`${textFileCounter}.txt`
+),
 metaContent,
 "utf-8"
 );
@@ -858,4 +847,4 @@ class WebScraper
 }
 }
 
-module.exports = WebScraper;
+module.exports = WebScraper;
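For context, end-to-end usage of the exported class, assuming main.js is the package entry point; option values are illustrative:

    const WebScraper = require( "clean-web-scraper" );

    async function main ()
    {
        const scraper = new WebScraper({ baseURL: "https://example.com", maxArticles: 10 });
        await scraper.start(); // crawl; output paths default to ./dataset (texts, train.jsonl, train.csv)
    }

    main();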