clean-web-scraper 3.5.7 → 3.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/main.js CHANGED
@@ -1,2 +1,833 @@
1
- const WebScraper = require( "./src/WebScraper" );
1
+ const axios = require( "axios" );
2
+ const { JSDOM } = require( "jsdom" );
3
+ const { Readability } = require( "@mozilla/readability" );
4
+ const fs = require( "fs" );
5
+ const path = require( "path" );
6
+ const { connect } = require( "puppeteer-real-browser" )
7
+
8
+ class WebScraper
9
+ {
10
/**
 * Build a scraper instance from a single configuration object.
 * Only `baseURL` is effectively required; every other option has a
 * fallback. Output paths are derived from `scrapResultPath` when not
 * given explicitly.
 */
constructor ({
	// Base configuration
	baseURL,
	startURL,
	strictBaseURL = true,
	maxDepth = Infinity,
	maxArticles = Infinity,

	// URL filtering
	excludeList = [],
	exactExcludeList = [],
	filterFileTypes = true,
	excludedFileTypes = [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"],

	// Output paths
	scrapResultPath = "./dataset",
	jsonlOutputPath,
	textOutputPath,
	csvOutputPath,

	// Metadata options
	includeMetadata = false,
	metadataFields = [],

	// Network options
	axiosHeaders,
	axiosProxy,

	// Puppeteer options
	usePuppeteer,
	puppeteerProxy, // e.g. http://127.0.0.1:2080
	puppeteerExecutablePath,
	puppeteerRealProxy
})
{
	// Base configuration
	this.baseURL = baseURL;
	// Crawl entry point defaults to the site root.
	this.startURL = startURL || baseURL;
	this.strictBaseURL = strictBaseURL;
	this.maxDepth = maxDepth;
	this.maxArticles = maxArticles;

	// Output paths setup. NOTE: the *WithMeta paths are derived from the
	// resolved plain paths, so these assignments must stay in this order.
	this.scrapResultPath = scrapResultPath;
	this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
	this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
	this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
	this.jsonlOutputPathWithMeta = this.jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
	this.csvOutputPathWithMeta = this.csvOutputPath.replace( ".csv", "_with_metadata.csv" );

	// Metadata configuration
	this.includeMetadata = includeMetadata;
	this.metadataFields = new Set( metadataFields );

	// URL filtering setup
	this.visited = new Set();
	this.excludeList = this.normalizeExcludeList( excludeList );
	this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
	this.filterFileTypes = filterFileTypes;
	this.excludedFileTypes = excludedFileTypes;

	// Network configuration — headers/proxy are only attached when
	// provided, so axios defaults stay untouched otherwise.
	this.axiosHeaders = axiosHeaders;
	this.axiosProxy = axiosProxy;
	this.axiosOptions = {};
	if ( this.axiosHeaders )
	{
		this.axiosOptions.headers = this.axiosHeaders;
	}
	if ( this.axiosProxy )
	{
		this.axiosOptions.proxy = this.axiosProxy;
	}

	// Content storage — one entry per saved article (see saveArticle).
	this.allProcessedContent = [];

	// Puppeteer configuration
	this.usePuppeteer = usePuppeteer || false;
	this.puppeteerProxy = puppeteerProxy;
	this.puppeteerExecutablePath = puppeteerExecutablePath;
	this.puppeteerRealProxy = puppeteerRealProxy;
	this.configurePuppeteer( );
}
94
+
95
+ async start ()
96
+ {
97
+ try
98
+ {
99
+ if ( this.usePuppeteer )
100
+ {
101
+ let { browser, page } = await connect( this.puppeteerRealOptions )
102
+ this.puppeteerBrowser = browser;
103
+ this.puppeteerPage = page;
104
+ }
105
+ this.createOutputDirectory();
106
+ await this.fetchPage( this.startURL, 0 );
107
+ this.createJSONLFile();
108
+ this.saveNumberedTextFiles();
109
+ this.createCSVFile();
110
+ console.log( "Scraping completed." );
111
+ }
112
+ catch ( error )
113
+ {
114
+ console.error( "Error:", error );
115
+ throw error;
116
+ }
117
+ finally
118
+ {
119
+ if ( this.puppeteerBrowser )
120
+ {
121
+ await this.puppeteerBrowser.close(); // Close the browser after scraping
122
+ }
123
+ }
124
+ }
125
+
126
+ async fetchPage ( url, depth )
127
+ {
128
+ if ( this.allProcessedContent.length >= this.maxArticles || depth > this.maxDepth )
129
+ {
130
+ console.log( `Reached maximum: ${this.maxArticles}, ${this.maxDepth}` );
131
+ return;
132
+ }
133
+ this.visited.add( url );
134
+ if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
135
+ {
136
+ return;
137
+ }
138
+ try
139
+ {
140
+ const data = await this.fetchContent( url );
141
+ if ( !data ) return;
142
+ const dom = new JSDOM( data, { url });
143
+ const { document } = dom.window;
144
+
145
+ if ( !this.isExcluded( url ) )
146
+ {
147
+ const reader = new Readability( document, { charThreshold: 500, nbTopCandidates: 20 });
148
+ const article = reader.parse();
149
+
150
+ if ( article )
151
+ {
152
+ if ( this.hasValidPageContent( article.textContent ) )
153
+ {
154
+ const metadata = this.extractMetadata( url, document );
155
+ metadata.depth = depth;
156
+ this.saveArticle( url, article.textContent, metadata );
157
+ }
158
+ else
159
+ {
160
+ console.error( `Invalid content found at ${url}` );
161
+ }
162
+ }
163
+ else
164
+ {
165
+ console.error( `No readable content found at ${url}` );
166
+ }
167
+ }
168
+
169
+ const links = this.extractLinks( data );
170
+ for ( const link of links )
171
+ {
172
+ if ( !this.visited.has( link ) )
173
+ {
174
+ await this.fetchPage( link, depth + 1 );
175
+ }
176
+ }
177
+ }
178
+ catch ( error )
179
+ {
180
+ console.error( `Error fetching ${url}:`, error.message, error.code );
181
+ }
182
+ }
183
+
184
+ async fetchContent ( url )
185
+ {
186
+ try
187
+ {
188
+ const response = await retryAxiosRequest( url )
189
+ const contentType = response.headers["content-type"] || "";
190
+ if ( !contentType.startsWith( "text" ) )
191
+ {
192
+ console.log( `Skipping non-HTML content for ${url}: Content-Type is ${contentType}` );
193
+ response.data.destroy();
194
+ return null;
195
+ }
196
+
197
+ // Step 3: If Content-Type is HTML, read the full response
198
+ let htmlContent = "";
199
+ response.data.on( "data", ( chunk ) =>
200
+ {
201
+ htmlContent += chunk.toString();
202
+ });
203
+
204
+ // Wait for the stream to finish
205
+ await new Promise( ( resolve, reject ) =>
206
+ {
207
+ response.data.on( "end", resolve );
208
+ response.data.on( "error", reject );
209
+ });
210
+
211
+ return htmlContent;
212
+ }
213
+ catch ( error )
214
+ {
215
+ console.error( `Error fetching ${url}:`, error.message );
216
+ if ( error.status = 403 && this.usePuppeteer )
217
+ {
218
+ try
219
+ {
220
+ let result;
221
+ for ( let index = 0; index < 10; index++ )
222
+ {
223
+ console.log( `Please solve the CAPTCHA on the opened browser window for ${url}` );
224
+ result = await this.navigateToPage( url ) ;
225
+ if ( this.hasValidPageContent( result.htmlContent ) )
226
+ {
227
+ break
228
+ }
229
+ }
230
+ return result.htmlContent;
231
+ }
232
+ catch ( error )
233
+ {
234
+ console.error( `Error solving CAPTCHA for ${url}:`, error.message, error );
235
+ throw error;
236
+ }
237
+
238
+ }
239
+ throw error;
240
+ }
241
+ }
242
+
243
/**
 * Drive the real-browser tab to `url` and return its rendered HTML.
 * Returns { pages, page, htmlContent }, or undefined when no page is
 * available.
 */
async navigateToPage ( url )
{
	// NOTE(review): pages[0] is deliberately re-queried after goto() and
	// again after the load wait — presumably because CAPTCHA flows /
	// puppeteer-real-browser can replace the active tab. Confirm before
	// simplifying this into a single lookup.
	let pages = await this.puppeteerBrowser.pages();
	let page = pages[0];
	page.setDefaultNavigationTimeout( 10000 );
	await page.goto( url );
	pages = await this.puppeteerBrowser.pages();
	page = pages[0];
	page.setDefaultNavigationTimeout( 10000 );
	// Best-effort wait; waitForPageToLoad swallows navigation timeouts.
	await this.waitForPageToLoad( page );
	pages = await this.puppeteerBrowser.pages();
	page = pages[0];
	page.setDefaultNavigationTimeout( 10000 );
	if ( page )
	{
		let htmlContent = await page.content();
		return { pages, page, htmlContent };
	}
}
262
+
263
+ async waitForPageToLoad ( page )
264
+ {
265
+ try
266
+ {
267
+ await page.waitForNavigation({ waitUntil: "networkidle0" });
268
+ }
269
+ catch ( error )
270
+ {
271
+ console.log( error );
272
+ }
273
+ }
274
+
275
+ extractLinks ( data )
276
+ {
277
+ const links = new Set();
278
+ const regex = /<a\s+(?:[^>]*?\s+)?href=("|')(.*?)\1/gi;
279
+ let match;
280
+
281
+ while ( ( match = regex.exec( data ) ) !== null )
282
+ {
283
+ let href = match[2];
284
+ if ( href.startsWith( "/" ) )
285
+ {
286
+ href = new URL( href, this.baseURL ).href
287
+ }
288
+ if ( href.endsWith( "/" ) )
289
+ {
290
+ href = href.slice( 0, -1 );
291
+ }
292
+ if ( href.startsWith( this.baseURL ) )
293
+ {
294
+ links.add( href );
295
+ }
296
+ }
297
+ return links;
298
+ }
299
+
300
/**
 * Record one article in memory (plain and with-metadata variants for the
 * later JSONL/CSV/text writers) and mirror it to disk as
 * <urlPath>.txt plus a <urlPath>.json metadata file.
 */
saveArticle ( url, content, metadata )
{
	const processedContent = this.processContent( content );
	const trimmed = processedContent.trim();

	this.allProcessedContent.push({
		simple: { text: trimmed },
		withMetadata: {
			text: trimmed,
			metadata: this.filterMetadata( metadata )
		}
	});

	// Derive the on-disk location from the URL path; the site root maps to
	// "/index" so it does not collide with the output directory itself.
	let urlPath = new URL( url ).pathname;
	if ( urlPath === "/" )
	{
		urlPath = "/index";
	}
	const filePath = path.join( __dirname, this.scrapResultPath, urlPath );
	fs.mkdirSync( path.dirname( filePath ), { recursive: true });
	fs.writeFileSync( `${filePath}.txt`, processedContent, "utf-8" );
	fs.writeFileSync( `${filePath}.json`, JSON.stringify( metadata, null, 2 ), "utf-8" );
	console.log( `Saved: ${filePath}.txt` );
	console.log( `Saved: ${filePath}.json` );
}
332
+
333
/**
 * Write every collected article as one JSON object per line. When
 * metadata output is enabled, a parallel *_with_metadata.jsonl file is
 * produced as well.
 */
createJSONLFile ()
{
	const simpleStream = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
	simpleStream.on( "error", ( err ) => { return console.error( "Error writing JSONL:", err ) });

	let metaStream = null;
	if ( this.includeMetadata )
	{
		metaStream = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPathWithMeta ) );
		metaStream.on( "error", ( err ) => { return console.error( "Error writing metadata JSONL:", err ) });
	}

	for ( const content of this.allProcessedContent )
	{
		simpleStream.write( `${JSON.stringify( content.simple )}\n` );
		metaStream?.write( `${JSON.stringify( content.withMetadata )}\n` );
	}

	simpleStream.end();
	if ( metaStream )
	{
		metaStream.end();
		console.log( `Created JSONL file at: ${this.jsonlOutputPathWithMeta}` );
	}
	console.log( `Created JSONL file at: ${this.jsonlOutputPath}` );
}
362
+
363
/**
 * Emit the scraped corpus as CSV: a single-column "text" file, plus an
 * optional variant whose extra columns are the configured metadata
 * fields. Double quotes inside values are escaped by doubling them.
 */
createCSVFile ()
{
	// Simple, text-only CSV.
	const simpleStream = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
	simpleStream.on( "error", ( err ) => { return console.error( "Error writing CSV:", err ) });
	simpleStream.write( "text\n" );

	// Metadata CSV, only when requested.
	let metaStream = null;
	if ( this.includeMetadata )
	{
		metaStream = fs.createWriteStream( path.join( __dirname, this.csvOutputPathWithMeta ) );
		metaStream.on( "error", ( err ) => { return console.error( "Error writing metadata CSV:", err ) });
		const headers = ["text", ...Array.from( this.metadataFields )].join( "," );
		metaStream.write( `${headers}\n` );
	}

	const fields = Array.from( this.metadataFields );
	for ( const content of this.allProcessedContent )
	{
		const escapedText = content.simple.text.replace( /"/g, "\"\"" );
		simpleStream.write( `"${escapedText}"\n` );

		if ( metaStream )
		{
			const { metadata } = content.withMetadata;
			// Missing fields become empty quoted cells to keep columns aligned.
			const row = fields.map( field =>
			{
				return metadata[field] ? `"${metadata[field].replace( /"/g, "\"\"" )}"` : "\"\"";
			});
			metaStream.write( `"${escapedText}",${row.join( "," )}\n` );
		}
	}

	simpleStream.end();
	if ( metaStream )
	{
		metaStream.end();
	}
	console.log( `Created simple CSV file at: ${this.csvOutputPath}` );
	if ( this.includeMetadata )
	{
		console.log( `Created metadata CSV file at: ${this.csvOutputPathWithMeta}` );
	}
}
411
+
412
/**
 * Dump every article as sequentially numbered .txt files (1.txt, 2.txt,
 * …). When metadata is enabled, a sibling "<texts>_with_metadata" folder
 * receives the same files prefixed with "field: value" header lines and
 * a "---" separator.
 */
saveNumberedTextFiles ()
{
	// FIX: the original created only the metadata folder here and relied on
	// createOutputDirectory() having made the plain texts folder first;
	// create both defensively so this method also works standalone.
	const baseTextPath = path.join( __dirname, this.textOutputPath );
	fs.mkdirSync( baseTextPath, { recursive: true });

	// Create metadata text folder if needed
	let metaTextPath = null;
	if ( this.includeMetadata )
	{
		metaTextPath = path.join( __dirname, `${this.textOutputPath }_with_metadata` );
		fs.mkdirSync( metaTextPath, { recursive: true });
	}

	this.allProcessedContent.forEach( ( content, index ) =>
	{
		const fileName = `${index + 1}.txt`;

		// Always save simple version
		const simpleFilePath = path.join( baseTextPath, fileName );
		fs.writeFileSync( simpleFilePath, content.simple.text, "utf-8" );
		console.log( `Created simple text file: ${fileName}` );

		// Save metadata version if enabled
		if ( this.includeMetadata )
		{
			const metaFilePath = path.join( metaTextPath, fileName );
			let fileContent = "";

			const { metadata } = content.withMetadata;
			// Add metadata fields as "key: value" headers before the body.
			for ( const field of this.metadataFields )
			{
				if ( metadata[field] )
				{
					fileContent += `${field}: ${metadata[field]}\n`;
				}
			}
			fileContent += "\n---\n\n";
			fileContent += content.withMetadata.text;

			fs.writeFileSync( metaFilePath, fileContent, "utf-8" );
			console.log( `Created metadata text file: ${fileName}` );
		}
	});
}
457
+
458
+ processContent ( content )
459
+ {
460
+ let processed = content;
461
+
462
+ // Remove "[You can read more about this here]" and similar patterns
463
+ processed = processed.replace( /\[You can read more about this here\]/g, "" ).trim();
464
+
465
+ // Trim each line
466
+ processed = processed.split( "\n" )
467
+ .map( line => { return line.trim() })
468
+ .join( "\n" );
469
+
470
+ // Replace 3 or more newlines with a single newline
471
+ processed = processed.replace( /\n{3,}/g, "\n\n" );
472
+
473
+ // Add more processing rules as needed:
474
+ // processed = processed.replace(/\[.*?\]/g, ''); // Removes all content within square brackets
475
+ // processed = processed.replace(/\(.*?\)/g, ''); // Removes all content within parentheses
476
+
477
+ // Remove specified words from the end of content, handling multiple occurrences
478
+ const wordsToTrim = ["Facebook", "Twitter", "Donate Now", "Instagram"];
479
+ let changed = true;
480
+
481
+ while ( changed )
482
+ {
483
+ changed = false;
484
+ for ( let i = 0; i < wordsToTrim.length; i++ )
485
+ {
486
+ const oldProcessed = processed;
487
+ processed = processed.replace( new RegExp( `\\s*${wordsToTrim[i]}\\s*$`, "g" ), "" ).trim();
488
+ if ( oldProcessed !== processed )
489
+ {
490
+ changed = true;
491
+ }
492
+ }
493
+ }
494
+ return processed;
495
+ }
496
+
497
+ filterMetadata ( metadata )
498
+ {
499
+ if ( !this.includeMetadata ) return {};
500
+
501
+ const filteredMetadata = {};
502
+ for ( const field of this.metadataFields )
503
+ {
504
+ if ( metadata[field] && typeof metadata[field] === "string" )
505
+ {
506
+ filteredMetadata[field] = metadata[field];
507
+ }
508
+ }
509
+ return filteredMetadata;
510
+ }
511
+
512
+ extractMetadata ( url, document )
513
+ {
514
+ return {
515
+ url,
516
+ title: document.title,
517
+ description: document.querySelector( "meta[name=\"description\"]" )?.content,
518
+ keywords: document.querySelector( "meta[name=\"keywords\"]" )?.content,
519
+ author: document.querySelector( "meta[name=\"author\"]" )?.content,
520
+ language: document.documentElement.lang || document.querySelector( "html" )?.getAttribute( "lang" ),
521
+ canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
522
+ ogTitle: document.querySelector( "meta[property=\"og:title\"]" )?.content,
523
+ ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
524
+ ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
525
+ ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
526
+ dateScraped: new Date().toISOString()
527
+ };
528
+ }
529
+
530
+ async retryAxiosRequest ( url )
531
+ {
532
+ const options = {
533
+ responseType: "stream",
534
+ maxRedirects: 5,
535
+ timeout: 70000,
536
+ ...this.axiosOptions,
537
+ };
538
+
539
+ let maxRetries = 3;
540
+ for ( let attempt = 1; attempt <= maxRetries; attempt++ )
541
+ {
542
+ try
543
+ {
544
+ return await axios.get( url, options );
545
+ }
546
+ catch ( error )
547
+ {
548
+ if ( attempt === maxRetries ) throw error;
549
+ await WebScraper.sleep( 1000 * attempt );
550
+ console.log( `Retrying request to ${url} (Attempt ${attempt + 1}/${maxRetries})` );
551
+ }
552
+ }
553
+ }
554
+
555
/**
 * Precompute the option objects for both plain Puppeteer and
 * puppeteer-real-browser's connect(), and reset the browser/page
 * handles. Called once from the constructor.
 */
configurePuppeteer ( )
{
	const args = ["--start-maximized"];
	if ( this.puppeteerProxy )
	{
		args.push( `--proxy-server=${this.puppeteerProxy}` );
	}

	this.puppeteerOptions = {
		headless: false,
		userDataDir: "./tmp/browser",
		defaultViewport: null,
		args,
		ignoreDefaultArgs: true
	};
	if ( this.puppeteerExecutablePath )
	{
		this.puppeteerOptions.executablePath = this.puppeteerExecutablePath;
	}

	// Options consumed by puppeteer-real-browser's connect().
	this.puppeteerRealOptions = {
		headless: false,
		args: [],
		customConfig: {},
		turnstile: true,
		connectOption: {},
		disableXvfb: false,
		ignoreAllFlags: false,
		proxy: this.puppeteerRealProxy
	};

	this.puppeteerBrowser = null;
	this.puppeteerPage = null;
}
589
+
590
+ normalizeExcludeList ( list = [] )
591
+ {
592
+ const normalizedSet = new Set();
593
+ for ( let i = 0; i < list.length; i++ )
594
+ {
595
+ const item = list[i];
596
+ if ( item.endsWith( "/" ) )
597
+ {
598
+ normalizedSet.add( item.slice( 0, -1 ) );
599
+ }
600
+ else
601
+ {
602
+ normalizedSet.add( item );
603
+ }
604
+ normalizedSet.add( `${item.endsWith( "/" ) ? item : `${item }/`}` );
605
+ }
606
+ return normalizedSet;
607
+ }
608
+
609
+ isExcluded ( url )
610
+ {
611
+ if ( this.exactExcludeList.has( url ) )
612
+ {
613
+ return true;
614
+ }
615
+ return Array.from( this.excludeList ).some( excluded => { return url.startsWith( excluded ) });
616
+ }
617
+
618
+ isValidFileType ( url )
619
+ {
620
+ if ( !this.filterFileTypes ) return true;
621
+ const urlPath = new URL( url ).pathname.toLowerCase();
622
+ return !this.excludedFileTypes.some( ext => { return urlPath.endsWith( ext ) });
623
+ }
624
+
625
+ isValidDomain ( url )
626
+ {
627
+ if ( !this.strictBaseURL ) return true;
628
+ try
629
+ {
630
+ const urlObj = new URL( url );
631
+ const baseURLObj = new URL( this.baseURL );
632
+ return urlObj.hostname === baseURLObj.hostname;
633
+ }
634
+ catch ( e )
635
+ {
636
+ console.log( `Invalid URL: ${url}` );
637
+ return false;
638
+ }
639
+ }
640
+
641
+ hasValidPageContent ( content )
642
+ {
643
+ // Remove whitespace and newlines for checking
644
+ const cleanContent = content.replace( /\s+/g, " " ).trim().toLowerCase();
645
+
646
+ // List of phrases that indicate invalid content
647
+ const invalidPhrases = [
648
+ "verifying that you are not a robot",
649
+ "verifying you are human. this may take a few seconds.",
650
+ "verify you are human by completing the action below",
651
+ "checking if the site connection is secure",
652
+ "please wait while we verify",
653
+ "please enable javascript",
654
+ "access denied",
655
+ "verifying you are human",
656
+ "captcha verification"
657
+ ];
658
+
659
+ const hasInvalidPhrases = invalidPhrases.some( phrase => { return cleanContent.includes( phrase ) });
660
+ // Check content length
661
+ if ( cleanContent.length < 100 || hasInvalidPhrases )
662
+ {
663
+ return false;
664
+ }
665
+ return true;
666
+ }
667
+
668
/**
 * Start from a clean slate: remove every previous output artifact (the
 * dataset dir, the texts dir, and all four CSV/JSONL files including
 * metadata variants), then recreate the dataset and texts directories.
 * FIX: collapses six copy-pasted exists/rm pairs into one data-driven
 * loop — same paths, same behavior, no duplication.
 */
createOutputDirectory ()
{
	const outputs = [
		this.scrapResultPath,
		this.textOutputPath,
		this.csvOutputPath,
		this.csvOutputPathWithMeta,
		this.jsonlOutputPath,
		this.jsonlOutputPathWithMeta
	];
	for ( const relativePath of outputs )
	{
		const target = path.join( __dirname, relativePath );
		if ( fs.existsSync( target ) )
		{
			fs.rmSync( target, { recursive: true, force: true });
		}
	}
	fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
	fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
}
697
+
698
+ static sleep ( ms )
699
+ {
700
+ return new Promise( resolve => { return setTimeout( resolve, ms ) });
701
+ }
702
+
703
+ static async combineResults ( outputPath, websites )
704
+ {
705
+ await WebScraper.sleep( 1000 );
706
+ const fullOutputPath = path.join( __dirname, outputPath );
707
+
708
+ // Create output directories
709
+ WebScraper.createCombinedDirectories( fullOutputPath );
710
+
711
+ // Combine files by type
712
+ WebScraper.combineJSONLFiles( fullOutputPath, websites );
713
+ WebScraper.combineCSVFiles( fullOutputPath, websites );
714
+ WebScraper.combineTextFiles( fullOutputPath, websites );
715
+ }
716
+
717
+ static createCombinedDirectories ( fullOutputPath )
718
+ {
719
+ if ( fs.existsSync( fullOutputPath ) )
720
+ {
721
+ fs.rmSync( fullOutputPath, { recursive: true, force: true });
722
+ }
723
+ fs.mkdirSync( fullOutputPath, { recursive: true });
724
+ fs.mkdirSync( path.join( fullOutputPath, "texts" ), { recursive: true });
725
+ fs.mkdirSync( path.join( fullOutputPath, "texts_with_metadata" ), { recursive: true });
726
+ }
727
+
728
+ static combineJSONLFiles ( fullOutputPath, websites )
729
+ {
730
+ const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) )
731
+ .on( "error", ( err ) => { return console.error( "Error combining JSONL:", err ) });
732
+ const jsonlMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.jsonl" ) )
733
+ .on( "error", ( err ) => { return console.error( "Error combining metadata JSONL:", err ) });
734
+
735
+ for ( const website of websites )
736
+ {
737
+ const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
738
+ if ( jsonlContent )
739
+ {
740
+ jsonlOutput.write( jsonlContent );
741
+ }
742
+
743
+ if ( website.includeMetadata )
744
+ {
745
+ const jsonlMetaContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPathWithMeta ), "utf-8" );
746
+ if ( jsonlMetaContent )
747
+ {
748
+ jsonlMetaOutput.write( jsonlMetaContent );
749
+ }
750
+ }
751
+ }
752
+
753
+ jsonlOutput.end();
754
+ jsonlMetaOutput.end();
755
+ }
756
+
757
+ static combineCSVFiles ( fullOutputPath, websites )
758
+ {
759
+ const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
760
+ const csvMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.csv" ) );
761
+
762
+ csvOutput.write( "text\n" );
763
+ const metadataFields = websites.find( w => { return w.includeMetadata })?.metadataFields || new Set();
764
+
765
+ if ( metadataFields.size > 0 )
766
+ {
767
+ csvMetaOutput.write( `text,${Array.from( metadataFields ).join( "," )}\n` );
768
+ }
769
+
770
+ for ( const website of websites )
771
+ {
772
+ const csvContent = fs.readFileSync( path.join( __dirname, website.csvOutputPath ), "utf-8" )
773
+ .split( "\n" )
774
+ .slice( 1 )
775
+ .filter( line => { return line.trim() });
776
+ if ( csvContent.length > 0 )
777
+ {
778
+ csvOutput.write( `${csvContent.join( "\n" )}\n` );
779
+ }
780
+
781
+ if ( website.includeMetadata )
782
+ {
783
+ const csvMetaContent = fs.readFileSync( path.join( __dirname, website.csvOutputPathWithMeta ), "utf-8" )
784
+ .split( "\n" )
785
+ .slice( 1 )
786
+ .filter( line => { return line.trim() });
787
+ if ( csvMetaContent.length > 0 )
788
+ {
789
+ csvMetaOutput.write( `${csvMetaContent.join( "\n" )}\n` );
790
+ }
791
+ }
792
+ }
793
+
794
+ csvOutput.end();
795
+ csvMetaOutput.end();
796
+ }
797
+
798
+ static combineTextFiles ( fullOutputPath, websites )
799
+ {
800
+ let textFileCounter = 1;
801
+
802
+ for ( const website of websites )
803
+ {
804
+ const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
805
+
806
+ for ( const file of textFiles )
807
+ {
808
+ const content = fs.readFileSync( path.join( __dirname, website.textOutputPath, file ), "utf-8" );
809
+ fs.writeFileSync(
810
+ path.join( fullOutputPath, "texts", `${textFileCounter}.txt` ),
811
+ content,
812
+ "utf-8"
813
+ );
814
+
815
+ if ( website.includeMetadata )
816
+ {
817
+ const metaContent = fs.readFileSync(
818
+ path.join( __dirname, `${website.textOutputPath}_with_metadata`, file ),
819
+ "utf-8"
820
+ );
821
+ fs.writeFileSync(
822
+ path.join( fullOutputPath, "texts_with_metadata", `${textFileCounter}.txt` ),
823
+ metaContent,
824
+ "utf-8"
825
+ );
826
+ }
827
+ textFileCounter++;
828
+ }
829
+ }
830
+ }
831
+ }
832
+
2
833
  module.exports = WebScraper;