clean-web-scraper 3.5.6 → 3.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/WebScraper.js DELETED
@@ -1,822 +0,0 @@
- const axios = require( "axios" );
- const jsdom = require( "jsdom" );
- const { JSDOM } = jsdom;
- const { Readability } = require( "@mozilla/readability" );
- const fs = require( "fs" );
- const path = require( "path" );
- const puppeteer = require( "puppeteer" );
- const { connect } = require( "puppeteer-real-browser" );
-
- class WebScraper
- {
- constructor ({
- // Base configuration
- baseURL,
- startURL,
- strictBaseURL = true,
- maxDepth = Infinity,
- maxArticles = Infinity,
-
- // URL filtering
- excludeList = [],
- exactExcludeList = [],
- filterFileTypes = true,
- excludedFileTypes = [".mp3", ".mp4", ".wav", ".avi", ".mov", ".pdf", ".zip", ".rar"],
-
- // Output paths
- scrapResultPath = "./dataset",
- jsonlOutputPath,
- textOutputPath,
- csvOutputPath,
-
- // Metadata options
- includeMetadata = false,
- metadataFields = [],
-
- // Network options
- axiosHeaders,
- axiosProxy,
-
- // Puppeteer options
- usePuppeteer,
- puppeteerProxy, // e.g. http://127.0.0.1:2080
- puppeteerExecutablePath,
- puppeteerRealProxy
- })
- {
- // Base configuration
- this.baseURL = baseURL;
- this.startURL = startURL || baseURL;
- this.strictBaseURL = strictBaseURL;
- this.maxDepth = maxDepth;
- this.maxArticles = maxArticles;
-
- // Output paths setup
- this.scrapResultPath = scrapResultPath;
- this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
- this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
- this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
- this.jsonlOutputPathWithMeta = this.jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
- this.csvOutputPathWithMeta = this.csvOutputPath.replace( ".csv", "_with_metadata.csv" );
-
- // Metadata configuration
- this.includeMetadata = includeMetadata;
- this.metadataFields = new Set( metadataFields );
-
- // URL filtering setup
- this.visited = new Set();
- this.excludeList = this.normalizeExcludeList( excludeList );
- this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
- this.filterFileTypes = filterFileTypes;
- this.excludedFileTypes = excludedFileTypes;
-
- // Network configuration
- this.axiosHeaders = axiosHeaders;
- this.axiosProxy = axiosProxy;
- this.axiosOptions = {};
- if ( this.axiosHeaders )
- {
- this.axiosOptions.headers = this.axiosHeaders;
- }
- if ( this.axiosProxy )
- {
- this.axiosOptions.proxy = this.axiosProxy;
- }
-
- // Content storage
- this.allProcessedContent = [];
-
- // Puppeteer configuration
- this.usePuppeteer = usePuppeteer || false;
- this.puppeteerProxy = puppeteerProxy;
- this.puppeteerExecutablePath = puppeteerExecutablePath;
- this.puppeteerRealProxy = puppeteerRealProxy;
- this.configurePuppeteer( );
- }
-
- async start ()
- {
- try
- {
- if ( this.usePuppeteer )
- {
- let { browser, page } = await connect( this.puppeteerRealOptions );
- this.puppeteerBrowser = browser;
- this.puppeteerPage = page;
- }
- this.createOutputDirectory();
- await this.fetchPage( this.startURL, 0 );
- this.createJSONLFile();
- this.saveNumberedTextFiles();
- this.createCSVFile();
- console.log( "Scraping completed." );
- }
- catch ( error )
- {
- console.error( "Error:", error );
- throw error;
- }
- finally
- {
- if ( this.puppeteerBrowser )
- {
- await this.puppeteerBrowser.close(); // Close the browser after scraping
- }
- }
- }
-
- async fetchPage ( url, depth )
- {
- if ( this.allProcessedContent.length >= this.maxArticles )
- {
- console.log( `Reached maximum number of articles (${this.maxArticles})` );
- return;
- }
- if ( depth > this.maxDepth )
- {
- return;
- }
- this.visited.add( url );
- if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
- {
- return;
- }
- try
- {
- const data = await this.fetchContent( url );
- if ( !data ) return;
- const dom = new JSDOM( data, { url });
- const { document } = dom.window;
-
- if ( !this.isExcluded( url ) )
- {
- const reader = new Readability( document, { charThreshold: 500, nbTopCandidates: 20 });
- const article = reader.parse();
-
- if ( article )
- {
- if ( this.hasValidPageContent( article.textContent ) )
- {
- const metadata = this.extractMetadata( url, document );
- metadata.depth = depth;
- this.saveArticle( url, article.textContent, metadata );
- }
- else
- {
- console.error( `Invalid content found at ${url}` );
- }
- }
- else
- {
- console.error( `No readable content found at ${url}` );
- }
- }
-
- const links = this.extractLinks( data );
- for ( const link of links )
- {
- if ( !this.visited.has( link ) )
- {
- await this.fetchPage( link, depth + 1 );
- }
- }
- }
- catch ( error )
- {
- console.error( `Error fetching ${url}:`, error.message, error.code );
- }
- }
-
- async fetchContent ( url )
- {
- try
- {
- const response = await axios.get( url, {
- responseType: "stream",
- maxRedirects: 5,
- timeout: 70000,
- ...this.axiosOptions,
- });
-
- const contentType = response.headers["content-type"] || "";
- if ( !contentType.startsWith( "text" ) )
- {
- console.log( `Skipping non-HTML content for ${url}: Content-Type is ${contentType}` );
- response.data.destroy();
- return null;
- }
-
- // If the Content-Type is HTML, read the full response body
- let htmlContent = "";
- response.data.on( "data", ( chunk ) =>
- {
- htmlContent += chunk.toString();
- });
-
- // Wait for the stream to finish
- await new Promise( ( resolve, reject ) =>
- {
- response.data.on( "end", resolve );
- response.data.on( "error", reject );
- });
-
- return htmlContent;
- }
- catch ( error )
- {
- console.error( `Error fetching ${url}:`, error.message );
- if ( error.status === 403 && this.usePuppeteer )
- {
- // const browser = await puppeteer.launch( this.puppeteerOptions );
- // const page = await browser.newPage();
- try
- {
- let result;
- for ( let index = 0; index < 10; index++ )
- {
- console.log( `Please solve the CAPTCHA on the opened browser window for ${url}` );
- result = await this.navigateToPage( url );
- if ( this.hasValidPageContent( result.htmlContent ) )
- {
- break;
- }
- }
- return result.htmlContent;
- }
- catch ( error )
- {
- console.error( `Error solving CAPTCHA for ${url}:`, error.message, error );
- throw error;
- }
-
- }
- throw error;
- }
- }
-
- async navigateToPage ( url )
- {
- let pages = await this.puppeteerBrowser.pages();
- let page = pages[0];
- page.setDefaultNavigationTimeout( 10000 );
- await page.goto( url );
- pages = await this.puppeteerBrowser.pages();
- page = pages[0];
- page.setDefaultNavigationTimeout( 10000 );
- await this.waitForPageToLoad( page );
- pages = await this.puppeteerBrowser.pages();
- page = pages[0];
- page.setDefaultNavigationTimeout( 10000 );
- if ( page )
- {
- let htmlContent = await page.content();
- return { pages, page, htmlContent };
- }
- }
-
- async waitForPageToLoad ( page )
- {
- try
- {
- await page.waitForNavigation({ waitUntil: "networkidle0" });
- }
- catch ( error )
- {
- console.log( error );
- }
- }
-
- extractLinks ( data )
- {
- const links = new Set();
- const regex = /<a\s+(?:[^>]*?\s+)?href=("|')(.*?)\1/gi;
- let match;
-
- while ( ( match = regex.exec( data ) ) !== null )
- {
- let href = match[2];
- if ( href.startsWith( "/" ) )
- {
- href = new URL( href, this.baseURL ).href;
- }
- if ( href.endsWith( "/" ) )
- {
- href = href.slice( 0, -1 );
- }
- if ( href.startsWith( this.baseURL ) )
- {
- links.add( href );
- }
- }
- return links;
- }
-
- saveArticle ( url, content, metadata )
- {
- const processedContent = this.processContent( content );
-
- const simpleContent = {
- text: processedContent.trim()
- };
-
- const contentWithMetadata = {
- text: processedContent.trim(),
- metadata: this.filterMetadata( metadata )
- };
-
- this.allProcessedContent.push({
- simple: simpleContent,
- withMetadata: contentWithMetadata
- });
-
- let urlPath = new URL( url ).pathname;
- if ( urlPath === "/" )
- {
- urlPath = "/index";
- }
- const filePath = path.join( __dirname, this.scrapResultPath, urlPath );
- const dir = path.dirname( filePath );
-
- fs.mkdirSync( dir, { recursive: true });
- fs.writeFileSync( `${filePath}.txt`, processedContent, "utf-8" );
- fs.writeFileSync( `${filePath}.json`, JSON.stringify( metadata, null, 2 ), "utf-8" );
- console.log( `Saved: ${filePath}.txt` );
- console.log( `Saved: ${filePath}.json` );
- }
-
- createJSONLFile ()
- {
- const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
- let writeStreamMeta;
-
- // Add error handlers
- writeStreamSimple.on( "error", ( err ) => { return console.error( "Error writing JSONL:", err ) });
-
- if ( this.includeMetadata )
- {
- writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPathWithMeta ) );
- writeStreamMeta.on( "error", ( err ) => { return console.error( "Error writing metadata JSONL:", err ) });
- }
- for ( const content of this.allProcessedContent )
- {
- writeStreamSimple.write( `${JSON.stringify( content.simple )}\n` );
- if ( this.includeMetadata )
- {
- writeStreamMeta.write( `${JSON.stringify( content.withMetadata )}\n` );
- }
- }
- writeStreamSimple.end();
- if ( this.includeMetadata )
- {
- writeStreamMeta.end();
- console.log( `Created JSONL file at: ${this.jsonlOutputPathWithMeta}` );
- }
- console.log( `Created JSONL file at: ${this.jsonlOutputPath}` );
- }
-
- createCSVFile ()
- {
- // Create simple version
- const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
- writeStreamSimple.on( "error", ( err ) => { return console.error( "Error writing CSV:", err ) });
- writeStreamSimple.write( "text\n" );
-
- // Create metadata version if requested
- let writeStreamMeta;
- if ( this.includeMetadata )
- {
- writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.csvOutputPathWithMeta ) );
- writeStreamMeta.on( "error", ( err ) => { return console.error( "Error writing metadata CSV:", err ) });
- }
-
- if ( this.includeMetadata )
- {
- const headers = ["text", ...Array.from( this.metadataFields )].join( "," );
- writeStreamMeta.write( `${headers}\n` );
- }
-
- for ( const content of this.allProcessedContent )
- {
- // Write simple version
- const escapedText = content.simple.text.replace( /"/g, "\"\"" );
- writeStreamSimple.write( `"${escapedText}"\n` );
-
- // Write metadata version if requested
- if ( this.includeMetadata )
- {
- const { metadata } = content.withMetadata;
- const metadataValues = Array.from( this.metadataFields )
- .map( field => { return metadata[field] ? `"${metadata[field].replace( /"/g, "\"\"" )}"` : "\"\"" });
- writeStreamMeta.write( `"${escapedText}",${metadataValues.join( "," )}\n` );
- }
- }
-
- writeStreamSimple.end();
- if ( writeStreamMeta )
- {
- writeStreamMeta.end();
- }
- console.log( `Created simple CSV file at: ${this.csvOutputPath}` );
- if ( this.includeMetadata )
- {
- console.log( `Created metadata CSV file at: ${this.csvOutputPathWithMeta}` );
- }
- }
-
- saveNumberedTextFiles ()
- {
- // Create base text folder for simple content
- const baseTextPath = path.join( __dirname, this.textOutputPath );
-
- // Create metadata text folder if needed
- let metaTextPath = null;
- if ( this.includeMetadata )
- {
- metaTextPath = path.join( __dirname, `${this.textOutputPath}_with_metadata` );
- fs.mkdirSync( metaTextPath, { recursive: true });
- }
-
- this.allProcessedContent.forEach( ( content, index ) =>
- {
- const fileName = `${index + 1}.txt`;
-
- // Always save simple version
- const simpleFilePath = path.join( baseTextPath, fileName );
- fs.writeFileSync( simpleFilePath, content.simple.text, "utf-8" );
- console.log( `Created simple text file: ${fileName}` );
-
- // Save metadata version if enabled
- if ( this.includeMetadata )
- {
- const metaFilePath = path.join( metaTextPath, fileName );
- let fileContent = "";
-
- const { metadata } = content.withMetadata;
- // Add metadata fields as headers
- for ( const field of this.metadataFields )
- {
- if ( metadata[field] )
- {
- fileContent += `${field}: ${metadata[field]}\n`;
- }
- }
- fileContent += "\n---\n\n";
- fileContent += content.withMetadata.text;
-
- fs.writeFileSync( metaFilePath, fileContent, "utf-8" );
- console.log( `Created metadata text file: ${fileName}` );
- }
- });
- }
-
- processContent ( content )
- {
- let processed = content;
-
- // Remove "[You can read more about this here]" and similar patterns
- processed = processed.replace( /\[You can read more about this here\]/g, "" ).trim();
-
- // Trim each line
- processed = processed.split( "\n" )
- .map( line => { return line.trim() })
- .join( "\n" );
-
- // Collapse runs of three or more newlines into a double newline
- processed = processed.replace( /\n{3,}/g, "\n\n" );
-
- // Add more processing rules as needed:
- // processed = processed.replace(/\[.*?\]/g, ''); // Removes all content within square brackets
- // processed = processed.replace(/\(.*?\)/g, ''); // Removes all content within parentheses
-
- // Remove specified words from the end of content, handling multiple occurrences
- const wordsToTrim = ["Facebook", "Twitter", "Donate Now", "Instagram"];
- let changed = true;
-
- while ( changed )
- {
- changed = false;
- for ( let i = 0; i < wordsToTrim.length; i++ )
- {
- const oldProcessed = processed;
- processed = processed.replace( new RegExp( `\\s*${wordsToTrim[i]}\\s*$`, "g" ), "" ).trim();
- if ( oldProcessed !== processed )
- {
- changed = true;
- }
- }
- }
- return processed;
- }
-
- filterMetadata ( metadata )
- {
- if ( !this.includeMetadata ) return {};
-
- const filteredMetadata = {};
- for ( const field of this.metadataFields )
- {
- if ( metadata[field] && typeof metadata[field] === "string" )
- {
- filteredMetadata[field] = metadata[field];
- }
- }
- return filteredMetadata;
- }
-
- extractMetadata ( url, document )
- {
- return {
- url,
- title: document.title,
- description: document.querySelector( "meta[name=\"description\"]" )?.content,
- keywords: document.querySelector( "meta[name=\"keywords\"]" )?.content,
- author: document.querySelector( "meta[name=\"author\"]" )?.content,
- language: document.documentElement.lang || document.querySelector( "html" )?.getAttribute( "lang" ),
- canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
- ogTitle: document.querySelector( "meta[property=\"og:title\"]" )?.content,
- ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
- ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
- ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
- dateScraped: new Date().toISOString()
- };
- }
-
- configurePuppeteer ( )
- {
- this.puppeteerOptions = {
- headless: false,
- userDataDir: "./tmp/browser",
- defaultViewport: null,
- args: ["--start-maximized"],
- ignoreDefaultArgs: true
- };
-
- if ( this.puppeteerProxy )
- {
- this.puppeteerOptions.args.push( `--proxy-server=${this.puppeteerProxy}` );
- }
-
- if ( this.puppeteerExecutablePath )
- {
- this.puppeteerOptions.executablePath = this.puppeteerExecutablePath;
- }
-
- this.puppeteerRealOptions = {
- headless: false,
- args: [],
- customConfig: {},
- turnstile: true,
- connectOption: {},
- disableXvfb: false,
- ignoreAllFlags: false,
- proxy: this.puppeteerRealProxy
- };
-
- this.puppeteerBrowser = null;
- this.puppeteerPage = null;
- }
-
- normalizeExcludeList ( list = [] )
- {
- const normalizedSet = new Set();
- for ( let i = 0; i < list.length; i++ )
- {
- const item = list[i];
- if ( item.endsWith( "/" ) )
- {
- normalizedSet.add( item.slice( 0, -1 ) );
- }
- else
- {
- normalizedSet.add( item );
- }
- normalizedSet.add( `${item.endsWith( "/" ) ? item : `${item}/`}` );
- }
- return normalizedSet;
- }
-
- isExcluded ( url )
- {
- if ( this.exactExcludeList.has( url ) )
- {
- return true;
- }
- return Array.from( this.excludeList ).some( excluded => { return url.startsWith( excluded ) });
- }
-
- isValidFileType ( url )
- {
- if ( !this.filterFileTypes ) return true;
- const urlPath = new URL( url ).pathname.toLowerCase();
- return !this.excludedFileTypes.some( ext => { return urlPath.endsWith( ext ) });
- }
-
- isValidDomain ( url )
- {
- if ( !this.strictBaseURL ) return true;
- try
- {
- const urlObj = new URL( url );
- const baseURLObj = new URL( this.baseURL );
- return urlObj.hostname === baseURLObj.hostname;
- }
- catch ( e )
- {
- console.log( `Invalid URL: ${url}` );
- return false;
- }
- }
-
- hasValidPageContent ( content )
- {
- // Remove whitespace and newlines for checking
- const cleanContent = content.replace( /\s+/g, " " ).trim().toLowerCase();
-
- // List of phrases that indicate invalid content
- const invalidPhrases = [
- "verifying that you are not a robot",
- "verifying you are human. this may take a few seconds.",
- "verify you are human by completing the action below",
- "checking if the site connection is secure",
- "please wait while we verify",
- "please enable javascript",
- "access denied",
- "verifying you are human",
- "captcha verification"
- ];
-
- const hasInvalidPhrases = invalidPhrases.some( phrase => { return cleanContent.includes( phrase ) });
- // Check content length
- if ( cleanContent.length < 100 || hasInvalidPhrases )
- {
- return false;
- }
- return true;
- }
-
- createOutputDirectory ()
- {
- if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
- {
- fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
- }
- if ( fs.existsSync( path.join( __dirname, this.textOutputPath ) ) )
- {
- fs.rmSync( path.join( __dirname, this.textOutputPath ), { recursive: true, force: true });
- }
- if ( fs.existsSync( path.join( __dirname, this.csvOutputPath ) ) )
- {
- fs.rmSync( path.join( __dirname, this.csvOutputPath ), { recursive: true, force: true });
- }
- if ( fs.existsSync( path.join( __dirname, this.csvOutputPathWithMeta ) ) )
- {
- fs.rmSync( path.join( __dirname, this.csvOutputPathWithMeta ), { recursive: true, force: true });
- }
- if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPath ) ) )
- {
- fs.rmSync( path.join( __dirname, this.jsonlOutputPath ), { recursive: true, force: true });
- }
- if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPathWithMeta ) ) )
- {
- fs.rmSync( path.join( __dirname, this.jsonlOutputPathWithMeta ), { recursive: true, force: true });
- }
- fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
- fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
- }
-
- static sleep ( ms )
- {
- return new Promise( resolve => { return setTimeout( resolve, ms ) });
- }
-
- static async combineResults ( outputPath, websites )
- {
- await WebScraper.sleep( 1000 );
- const fullOutputPath = path.join( __dirname, outputPath );
-
- // Create output directories
- WebScraper.createCombinedDirectories( fullOutputPath );
-
- // Combine files by type
- WebScraper.combineJSONLFiles( fullOutputPath, websites );
- WebScraper.combineCSVFiles( fullOutputPath, websites );
- WebScraper.combineTextFiles( fullOutputPath, websites );
- }
-
- static createCombinedDirectories ( fullOutputPath )
- {
- if ( fs.existsSync( fullOutputPath ) )
- {
- fs.rmSync( fullOutputPath, { recursive: true, force: true });
- }
- fs.mkdirSync( fullOutputPath, { recursive: true });
- fs.mkdirSync( path.join( fullOutputPath, "texts" ), { recursive: true });
- fs.mkdirSync( path.join( fullOutputPath, "texts_with_metadata" ), { recursive: true });
- }
-
- static combineJSONLFiles ( fullOutputPath, websites )
- {
- const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) )
- .on( "error", ( err ) => { return console.error( "Error combining JSONL:", err ) });
- const jsonlMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.jsonl" ) )
- .on( "error", ( err ) => { return console.error( "Error combining metadata JSONL:", err ) });
-
- for ( const website of websites )
- {
- const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
- if ( jsonlContent )
- {
- jsonlOutput.write( jsonlContent );
- }
-
- if ( website.includeMetadata )
- {
- const jsonlMetaContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPathWithMeta ), "utf-8" );
- if ( jsonlMetaContent )
- {
- jsonlMetaOutput.write( jsonlMetaContent );
- }
- }
- }
-
- jsonlOutput.end();
- jsonlMetaOutput.end();
- }
-
- static combineCSVFiles ( fullOutputPath, websites )
- {
- const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
- const csvMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.csv" ) );
-
- csvOutput.write( "text\n" );
- const metadataFields = websites.find( w => { return w.includeMetadata })?.metadataFields || new Set();
-
- if ( metadataFields.size > 0 )
- {
- csvMetaOutput.write( `text,${Array.from( metadataFields ).join( "," )}\n` );
- }
-
- for ( const website of websites )
- {
- const csvContent = fs.readFileSync( path.join( __dirname, website.csvOutputPath ), "utf-8" )
- .split( "\n" )
- .slice( 1 )
- .filter( line => { return line.trim() });
- if ( csvContent.length > 0 )
- {
- csvOutput.write( `${csvContent.join( "\n" )}\n` );
- }
-
- if ( website.includeMetadata )
- {
- const csvMetaContent = fs.readFileSync( path.join( __dirname, website.csvOutputPathWithMeta ), "utf-8" )
- .split( "\n" )
- .slice( 1 )
- .filter( line => { return line.trim() });
- if ( csvMetaContent.length > 0 )
- {
- csvMetaOutput.write( `${csvMetaContent.join( "\n" )}\n` );
- }
- }
- }
-
- csvOutput.end();
- csvMetaOutput.end();
- }
-
- static combineTextFiles ( fullOutputPath, websites )
- {
- let textFileCounter = 1;
-
- for ( const website of websites )
- {
- const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
-
- for ( const file of textFiles )
- {
- const content = fs.readFileSync( path.join( __dirname, website.textOutputPath, file ), "utf-8" );
- fs.writeFileSync(
- path.join( fullOutputPath, "texts", `${textFileCounter}.txt` ),
- content,
- "utf-8"
- );
-
- if ( website.includeMetadata )
- {
- const metaContent = fs.readFileSync(
- path.join( __dirname, `${website.textOutputPath}_with_metadata`, file ),
- "utf-8"
- );
- fs.writeFileSync(
- path.join( fullOutputPath, "texts_with_metadata", `${textFileCounter}.txt` ),
- metaContent,
- "utf-8"
- );
- }
- textFileCounter++;
- }
- }
- }
- }
-
- module.exports = WebScraper;
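
For context, below is a minimal usage sketch of the WebScraper class removed in this diff, based only on the constructor options and methods shown above. The require path assumes the 3.5.x package entry point exported this class, and the URLs, depth limits, and output paths are illustrative placeholders rather than values taken from the package.

const WebScraper = require( "clean-web-scraper" ); // assumption: the 3.5.x main entry exported the WebScraper class shown above

async function main ()
{
	// Illustrative options only; see the constructor parameters in the deleted src/WebScraper.js
	const scraper = new WebScraper({
		baseURL: "https://example.com",
		startURL: "https://example.com/blog",
		maxDepth: 2,
		maxArticles: 50,
		scrapResultPath: "./dataset/example",
		includeMetadata: true,
		metadataFields: ["title", "description", "url"]
	});

	// Crawls from startURL, then writes train.jsonl, train.csv and numbered text files under scrapResultPath
	await scraper.start();
}

main().catch( console.error );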