clean-web-scraper 3.8.7 → 3.9.1

This diff covers the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (3)
  1. package/example-usage.js +7 -6
  2. package/main.js +168 -213
  3. package/package.json +1 -1
package/example-usage.js CHANGED
@@ -129,9 +129,9 @@ async function electronicintifada ( enable )
  csvOutputPath: "./dataset/electronicintifada/train.csv",
  includeMetadata: true,
  metadataFields: ["author", "title", "description", "dateScrapedDate"],
- maxDepth: 13,
+ maxDepth: 16,
  maxArticles: 2000,
- concurrencyLimit: 3,
+ concurrencyLimit: 2,
  axiosHeaders: headers,
  retryDelay: 10000,
  axiosProxy: {
@@ -198,18 +198,19 @@ async function mondoweiss ( enable )
  textOutputPath: "./dataset/mondoweiss/texts",
  csvOutputPath: "./dataset/mondoweiss/train.csv",
  maxArticles: 2500,
- maxRetries: 2,
- concurrencyLimit: 4,
+ maxRetries: 3,
+ concurrencyLimit: 3,
  axiosHeaders: headers,
  axiosProxy: {
  host: "localhost",
  port: 2080,
  protocol: "http"
  },
- maxDepth: 10,
+ maxDepth: 15,
  retryDelay: 10000,
  includeMetadata: true,
- metadataFields: ["author", "title", "description", "dateScrapedDate"]
+ metadataFields: ["author", "title", "description", "dateScrapedDate"],
+ useProxyAsFallback: true
  });
  if ( enable )
  {
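The example-usage.js changes are per-site tuning: deeper crawls (maxDepth 16 and 15), adjusted concurrency and retries, and, for mondoweiss, the new useProxyAsFallback flag; per the retryAxiosRequest change in main.js, the configured axiosProxy is then only applied on the final retry attempt. Below is a minimal sketch of a per-site configuration in the same style; the site URL and dataset paths are placeholders and the values are illustrative, not taken from the package.

// Sketch only: a per-site config in the style of example-usage.js after this release.
const WebScraper = require( "clean-web-scraper" );

const scraper = new WebScraper({
  baseURL: "https://example.com",            // placeholder site
  startURL: "https://example.com/news",      // placeholder start page
  textOutputPath: "./dataset/example/texts",
  csvOutputPath: "./dataset/example/train.csv",
  maxDepth: 15,
  maxArticles: 2500,
  concurrencyLimit: 3,
  includeMetadata: true,
  metadataFields: ["author", "title", "description", "dateScrapedDate"],
  axiosProxy: { host: "localhost", port: 2080, protocol: "http" },
  useProxyAsFallback: true // new in this release: proxy only on the last retry attempt
});

scraper.start();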
package/main.js CHANGED
@@ -3,49 +3,11 @@ const { JSDOM } = require( "jsdom" );
  const { Readability } = require( "@mozilla/readability" );
  const fs = require( "fs" );
  const path = require( "path" );
- const { connect } = require( "puppeteer-real-browser" )
+ const { connect } = require( "puppeteer-real-browser" );

  class WebScraper
  {
- constructor ({
- // Base configuration
- baseURL,
- startURL,
- strictBaseURL,
- maxDepth,
- maxArticles,
- concurrencyLimit,
- maxRetries,
- retryDelay,
-
- // URL filtering
- excludeList = [],
- exactExcludeList = [],
- filterFileTypes,
- excludedFileTypes,
- removeURLFragment,
-
- // Output paths
- scrapResultPath = "./dataset",
- jsonlOutputPath,
- textOutputPath,
- csvOutputPath,
-
- // Metadata options
- includeMetadata = false,
- metadataFields = [],
-
- // Network options
- axiosHeaders,
- axiosProxy,
- useProxyAsFallback,
-
- // Puppeteer options
- usePuppeteer,
- puppeteerProxy, // e.g. http://127.0.0.1:2080
- puppeteerExecutablePath,
- puppeteerRealProxy
- })
+ constructor ( config = {})
  {
  // Base configuration
  this.baseURL = baseURL;
@@ -54,11 +16,10 @@ class WebScraper
  this.maxDepth = maxDepth || Infinity;
  this.maxArticles = maxArticles || Infinity;
  this.concurrencyLimit = concurrencyLimit || 2;
- this.maxRetries = maxRetries || 5;
- this.retryDelay = retryDelay || 40000;
+ this.crawlingDelay = crawlingDelay || 1000;

  // Output paths setup
- this.scrapResultPath = scrapResultPath;
+ this.scrapResultPath = scrapResultPath || "./dataset";
  this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
  this.textOutputPathWithMeta = `${this.textOutputPath }_with_metadata`;
  this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
@@ -67,8 +28,8 @@ class WebScraper
  this.csvOutputPathWithMeta = this.csvOutputPath.replace( ".csv", "_with_metadata.csv" );

  // Metadata configuration
- this.includeMetadata = includeMetadata;
- this.metadataFields = new Set( metadataFields );
+ this.includeMetadata = includeMetadata || false;
+ this.metadataFields = new Set( metadataFields || [] );

  // URL filtering setup
  this.visited = new Set();
@@ -81,6 +42,8 @@ class WebScraper
  // Network configuration
  this.axiosHeaders = axiosHeaders;
  this.axiosProxy = axiosProxy;
+ this.axiosMaxRetries = axiosMaxRetries || 5;
+ this.axiosRetryDelay = axiosRetryDelay || 40000;
  this.useProxyAsFallback = useProxyAsFallback || false;
  this.axiosOptions = {};
  if ( this.axiosHeaders )
@@ -97,10 +60,10 @@ class WebScraper

  // Puppeteer configuration
  this.usePuppeteer = usePuppeteer || false;
- this.puppeteerProxy = puppeteerProxy;
+ this.puppeteerProxy = puppeteerProxy; // http://127.0.0.1:2080
  this.puppeteerExecutablePath = puppeteerExecutablePath;
  this.puppeteerRealProxy = puppeteerRealProxy;
- this.configurePuppeteer( );
+ this.configurePuppeteer();
  }

  async start ()
@@ -109,12 +72,12 @@ class WebScraper
  {
  if ( this.usePuppeteer )
  {
- let { browser, page } = await connect( this.puppeteerRealOptions )
+ const { browser, page } = await connect( this.puppeteerRealOptions );
  this.puppeteerBrowser = browser;
  this.puppeteerPage = page;
  }
  this.createOutputDirectory();
- await this.fetchPage( this.startURL, 0 );
+ await this.crawl( this.startURL, 0 );
  this.createJSONLFile();
  this.saveNumberedTextFiles();
  this.createCSVFile();
@@ -129,94 +92,95 @@ class WebScraper
  {
  if ( this.puppeteerBrowser )
  {
- await this.puppeteerBrowser.close(); // Close the browser after scraping
+ await this.puppeteerBrowser.close();
  }
  }
  }

- async fetchPage ( url, depth )
+ async crawl ( initialUrl, initialDepth = 0 )
  {
- if ( this.removeURLFragment )
- {
- url = url.split( "#" )[0];
- }
- if ( this.hasReachedMax( depth ) )
- {
- return;
- }
- if ( this.visited.has( url ) )
- {
- console.log( `Already visited: ${url}` );
- return;
- }
- this.visited.add( url );
- if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
- {
- return;
- }
- try
+ const queue = [{ url: initialUrl, depth: initialDepth }];
+ for ( let i = 0; i < queue.length; i++ )
  {
- await WebScraper.sleep( 5000 );
- const data = await this.fetchContent( url );
- if ( !data ) return;
- const dom = new JSDOM( data, { url });
- const { document } = dom.window;
+ let { url, depth } = queue[i];
+ console.log( `Processing URL: ${queue[i].url}` );
+ if ( this.hasReachedMax( depth ) )
+ {
+ continue;
+ }
+ if ( this.removeURLFragment )
+ {
+ url = url.split( "#" )[0];
+ }
+ if ( this.visited.has( url ) )
+ {
+ console.log( `Already visited: ${url}` );
+ continue;
+ }
+ this.visited.add( url );

- if ( !this.isExcluded( url ) )
+ if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
  {
- const reader = new Readability( document, { charThreshold: 500, nbTopCandidates: 20 });
- const article = reader.parse();
+ continue;
+ }

- if ( article )
+ try
+ {
+ await WebScraper.sleep( this.crawlingDelay );
+ const data = await this.fetchContent( url );
+ if ( !data ) continue;
+
+ const dom = new JSDOM( data, { url });
+ const { document } = dom.window;
+
+ if ( !this.isExcluded( url ) )
  {
- if ( this.hasValidPageContent( article.textContent ) )
+ const reader = new Readability( document, {
+ charThreshold: 500,
+ nbTopCandidates: 20
+ });
+ const article = reader.parse();
+ if ( article )
  {
- const metadata = this.extractMetadata( url, document );
- this.saveArticle( url, article.textContent, metadata );
+ if ( this.hasValidPageContent( article.textContent ) )
+ {
+ const metadata = this.extractMetadata( url, document );
+ this.saveArticle( url, article.textContent, metadata );
+ }
+ else
+ {
+ console.error( `Invalid content found at ${url}` );
+ }
  }
  else
  {
- console.error( `Invalid content found at ${url}` );
+ console.error( `No readable content found at ${url}` );
  }
  }
- else
- {
- console.error( `No readable content found at ${url}` );
- }
- }
-
- const links = this.extractLinks( data );
- const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
-
- for ( let i = 0; i < unvisitedLinks.length; i += this.concurrencyLimit )
- {
- if ( this.hasReachedMax( depth ) )
- {
- return;
- }
- const batch = unvisitedLinks.slice( i, i + this.concurrencyLimit );
- const results = await Promise.allSettled( batch.map( link => { return this.fetchPage( link, depth + 1 ) }) );

- results.forEach( ( result, index ) =>
+ const links = this.extractLinks( data );
+ const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
+ for ( const link of unvisitedLinks )
  {
- if ( result.status === "rejected" )
+ if ( !this.hasReachedMax( depth ) )
  {
- console.error( `Failed to fetch ${batch[index]}: ${result.reason}` );
+ queue.push({ url: link, depth: depth + 1 });
  }
- });
+ }
+ }
+ catch ( error )
+ {
+ console.error( `Error fetching ${url}:`, error.message, error.code );
  }
- }
- catch ( error )
- {
- console.error( `Error fetching ${url}:`, error.message, error.code );
  }
  }
  }
+
  async fetchContent ( url )
  {
  try
  {
- const response = await this.retryAxiosRequest( url )
+ const response = await this.retryAxiosRequest( url );
  const contentType = response?.headers["content-type"] || "";
  if ( !contentType?.startsWith( "text" ) )
  {
@@ -225,34 +189,30 @@ class WebScraper
  return null;
  }

- // Step 3: If Content-Type is HTML, read the full response
  let htmlContent = "";
- response.data.on( "data", ( chunk ) =>
+ response.data.on( "data", chunk =>
  {
  htmlContent += chunk.toString();
  });
-
- // Wait for the stream to finish
  await new Promise( ( resolve, reject ) =>
  {
  response.data.on( "end", resolve );
  response.data.on( "error", reject );
  });
-
  return htmlContent;
  }
  catch ( error )
  {
  console.error( `Error fetching content ${url}:`, error.message );
- if ( error.status = 403 && this.usePuppeteer )
+ if ( error.status === 403 && this.usePuppeteer )
  {
  try
  {
  let result;
- for ( let index = 0; index < 10; index++ )
+ for ( let i = 0; i < 10; i++ )
  {
  console.log( `Please solve the CAPTCHA on the opened browser window for ${url}` );
- result = await this.navigateToPage( url ) ;
+ result = await this.navigateToPage( url );
  if ( this.hasValidPageContent( result.htmlContent ) )
  {
  break
@@ -265,7 +225,6 @@ class WebScraper
  console.error( `Error solving CAPTCHA for ${url}:`, error.message, error );
  throw error;
  }
-
  }
  throw error;
  }
@@ -378,11 +337,10 @@ class WebScraper
  createJSONLFile ()
  {
  const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
- let writeStreamMeta
-
- // Add error handlers
- writeStreamSimple.on( "error", ( err ) => { return console.error( "Error writing JSONL:", err ) });
+ writeStreamSimple.on( "error", err =>
+ { return console.error( "Error writing JSONL:", err ) });

+ let writeStreamMeta;
  if ( this.includeMetadata )
  {
  writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPathWithMeta ) );
@@ -428,16 +386,18 @@ class WebScraper

  for ( const content of this.allProcessedContent )
  {
- // Write simple version
  const escapedText = content.simple.text.replace( /"/g, "\"\"" );
  writeStreamSimple.write( `"${escapedText}"\n` );

- // Write metadata version if requested
  if ( this.includeMetadata )
  {
  const { metadata } = content.withMetadata;
- const metadataValues = Array.from( this.metadataFields )
- .map( field => { return metadata[field] ? `"${metadata[field].replace( /"/g, "\"\"" )}"` : "\"\"" });
+ const metadataValues = Array.from( this.metadataFields ).map( field =>
+ {
+ return metadata[field]
+ ? `"${metadata[field].replace( /"/g, "\"\"" )}"`
+ : "\"\""
+ });
  writeStreamMeta.write( `"${escapedText}",${metadataValues.join( "," )}\n` );
  }
  }
@@ -456,10 +416,8 @@ class WebScraper

  saveNumberedTextFiles ()
  {
- // Create base text folder for simple content
  const baseTextPath = path.join( __dirname, this.textOutputPath );

- // Create metadata text folder if needed
  let metaTextPath = null;
  if ( this.includeMetadata )
  {
@@ -470,20 +428,15 @@ class WebScraper
  this.allProcessedContent.forEach( ( content, index ) =>
  {
  const fileName = `${index + 1}.txt`;
-
- // Always save simple version
  const simpleFilePath = path.join( baseTextPath, fileName );
  fs.writeFileSync( simpleFilePath, content.simple.text, "utf-8" );
  console.log( `Created simple text file: ${fileName}` );

- // Save metadata version if enabled
  if ( this.includeMetadata )
  {
  const metaFilePath = path.join( metaTextPath, fileName );
  let fileContent = "";
-
  const { metadata } = content.withMetadata;
- // Add metadata fields as headers
  for ( const field of this.metadataFields )
  {
  if ( metadata[field] )
@@ -493,7 +446,6 @@ class WebScraper
  }
  fileContent += "\n---\n\n";
  fileContent += content.withMetadata.text;
-
  fs.writeFileSync( metaFilePath, fileContent, "utf-8" );
  console.log( `Created metadata text file: ${fileName}` );
  }
@@ -503,33 +455,27 @@ class WebScraper
  processContent ( content )
  {
  let processed = content;
-
- // Remove "[You can read more about this here]" and similar patterns
+ // Remove unwanted fixed text
  processed = processed.replace( /\[You can read more about this here\]/g, "" ).trim();
-
- // Trim each line
- processed = processed.split( "\n" )
+ // Trim each line and remove extra newlines
+ processed = processed
+ .split( "\n" )
  .map( line => { return line.trim() })
- .join( "\n" );
-
- // Replace 3 or more newlines with a single newline
- processed = processed.replace( /\n{3,}/g, "\n\n" );
+ .join( "\n" )
+ .replace( /\n{3,}/g, "\n\n" );

- // Add more processing rules as needed:
- // processed = processed.replace(/\[.*?\]/g, ''); // Removes all content within square brackets
- // processed = processed.replace(/\(.*?\)/g, ''); // Removes all content within parentheses
-
- // Remove specified words from the end of content, handling multiple occurrences
+ // Remove specified words at the end (repeatedly)
  const wordsToTrim = ["Facebook", "Twitter", "Donate Now", "Instagram"];
  let changed = true;
-
  while ( changed )
  {
  changed = false;
- for ( let i = 0; i < wordsToTrim.length; i++ )
+ for ( const word of wordsToTrim )
  {
  const oldProcessed = processed;
- processed = processed.replace( new RegExp( `\\s*${wordsToTrim[i]}\\s*$`, "g" ), "" ).trim();
+ processed = processed
+ .replace( new RegExp( `\\s*${word}\\s*$`, "g" ), "" )
+ .trim();
  if ( oldProcessed !== processed )
  {
  changed = true;
@@ -542,7 +488,6 @@ class WebScraper
  filterMetadata ( metadata )
  {
  if ( !this.includeMetadata ) return {};
-
  const filteredMetadata = {};
  for ( const field of this.metadataFields )
  {
@@ -562,10 +507,13 @@ class WebScraper
  description: document.querySelector( "meta[name=\"description\"]" )?.content,
  keywords: document.querySelector( "meta[name=\"keywords\"]" )?.content,
  author: document.querySelector( "meta[name=\"author\"]" )?.content,
- language: document.documentElement.lang || document.querySelector( "html" )?.getAttribute( "lang" ),
+ language:
+ document.documentElement.lang ||
+ document.querySelector( "html" )?.getAttribute( "lang" ),
  canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
  ogTitle: document.querySelector( "meta[property=\"og:title\"]" )?.content,
- ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
+ ogDescription: document.querySelector( "meta[property=\"og:description\"]" )
+ ?.content,
  ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
  ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
  dateScrapedDate: new Date().toISOString()
@@ -581,7 +529,7 @@ class WebScraper
  ...this.axiosOptions,
  };

- for ( let attempt = 1; attempt <= this.maxRetries; attempt++ )
+ for ( let attempt = 1; attempt <= this.axiosMaxRetries; attempt++ )
  {
  try
  {
@@ -589,7 +537,7 @@ class WebScraper
  {
  break;
  }
- if ( attempt === this.maxRetries && this.useProxyAsFallback && this.axiosProxy )
+ if ( attempt === this.axiosMaxRetries && this.useProxyAsFallback && this.axiosProxy )
  {
  options = {
  ...options,
@@ -601,9 +549,9 @@ class WebScraper
  }
  catch ( error )
  {
- if ( attempt >= this.maxRetries ) throw error;
- await WebScraper.sleep( this.retryDelay * attempt );
- console.error( `Retrying request to ${url} (Attempt ${attempt + 1}/${this.maxRetries})`, error.message, error.code );
+ if ( attempt >= this.axiosMaxRetries ) throw error;
+ await WebScraper.sleep( this.axiosRetryDelay * attempt );
+ console.error( `Retrying request to ${url} (Attempt ${attempt + 1}/${this.axiosMaxRetries})`, error.message, error.code );
  }
  }
  throw new Error( "Max retries reached" );
@@ -623,7 +571,6 @@ class WebScraper
  {
  this.puppeteerOptions.args.push( `--proxy-server=${this.puppeteerProxy}` );
  }
-
  if ( this.puppeteerExecutablePath )
  {
  this.puppeteerOptions.executablePath = this.puppeteerExecutablePath;
@@ -697,10 +644,7 @@ class WebScraper

  hasValidPageContent ( content )
  {
- // Remove whitespace and newlines for checking
  const cleanContent = content.replace( /\s+/g, " " ).trim().toLowerCase();
-
- // List of phrases that indicate invalid content
  const invalidPhrases = [
  "verifying that you are not a robot",
  "verifying you are human. this may take a few seconds.",
@@ -724,37 +668,35 @@ class WebScraper

  createOutputDirectory ()
  {
- if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
- {
- fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
- }
- if ( fs.existsSync( path.join( __dirname, this.textOutputPath ) ) )
- {
- fs.rmSync( path.join( __dirname, this.textOutputPath ), { recursive: true, force: true });
- }
- if ( fs.existsSync( path.join( __dirname, this.textOutputPathWithMeta ) ) )
- {
- fs.rmSync( path.join( __dirname, this.textOutputPathWithMeta ), { recursive: true, force: true });
- }
- if ( fs.existsSync( path.join( __dirname, this.csvOutputPath ) ) )
- {
- fs.rmSync( path.join( __dirname, this.csvOutputPath ), { recursive: true, force: true });
- }
- if ( fs.existsSync( path.join( __dirname, this.csvOutputPathWithMeta ) ) )
- {
- fs.rmSync( path.join( __dirname, this.csvOutputPathWithMeta ), { recursive: true, force: true });
- }
- if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPath ) ) )
+ const paths = [
+ path.join( __dirname, this.scrapResultPath ),
+ path.join( __dirname, this.textOutputPath ),
+ path.join( __dirname, this.textOutputPathWithMeta ),
+ path.join( __dirname, this.csvOutputPath ),
+ path.join( __dirname, this.csvOutputPathWithMeta ),
+ path.join( __dirname, this.jsonlOutputPath ),
+ path.join( __dirname, this.jsonlOutputPathWithMeta )
+ ];
+ for ( const p of paths )
  {
- fs.rmSync( path.join( __dirname, this.jsonlOutputPath ), { recursive: true, force: true });
+ if ( fs.existsSync( p ) )
+ {
+ fs.rmSync( p, { recursive: true, force: true });
+ }
  }
- if ( fs.existsSync( path.join( __dirname, this.jsonlOutputPathWithMeta ) ) )
+ // Recreate directories needed for output
+ this.ensureDirectory( path.join( __dirname, this.scrapResultPath ) );
+ this.ensureDirectory( path.join( __dirname, this.textOutputPath ) );
+ this.ensureDirectory( path.join( __dirname, this.textOutputPathWithMeta ) );
+ }
+
+ // Helper method to ensure a directory exists
+ ensureDirectory ( dirPath )
+ {
+ if ( !fs.existsSync( dirPath ) )
  {
- fs.rmSync( path.join( __dirname, this.jsonlOutputPathWithMeta ), { recursive: true, force: true });
+ fs.mkdirSync( dirPath, { recursive: true });
  }
- fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
- fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
- fs.mkdirSync( path.join( __dirname, this.textOutputPathWithMeta ), { recursive: true });
  }

  static async sleep ( ms )
@@ -766,11 +708,7 @@ class WebScraper
  {
  await WebScraper.sleep( 1000 );
  const fullOutputPath = path.join( __dirname, outputPath );
-
- // Create output directories
  WebScraper.createCombinedDirectories( fullOutputPath );
-
- // Combine files by type
  WebScraper.combineJSONLFiles( fullOutputPath, websites );
  WebScraper.combineCSVFiles( fullOutputPath, websites );
  WebScraper.combineTextFiles( fullOutputPath, websites );
@@ -784,34 +722,44 @@ class WebScraper
  }
  fs.mkdirSync( fullOutputPath, { recursive: true });
  fs.mkdirSync( path.join( fullOutputPath, "texts" ), { recursive: true });
- fs.mkdirSync( path.join( fullOutputPath, "texts_with_metadata" ), { recursive: true });
+ fs.mkdirSync( path.join( fullOutputPath, "texts_with_metadata" ), {
+ recursive: true
+ });
  }

  static combineJSONLFiles ( fullOutputPath, websites )
  {
- const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) )
- .on( "error", ( err ) => { return console.error( "Error combining JSONL:", err ) });
- const jsonlMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.jsonl" ) )
- .on( "error", ( err ) => { return console.error( "Error combining metadata JSONL:", err ) });
+ const jsonlOutput = fs
+ .createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) )
+ .on( "error", err =>
+ { return console.error( "Error combining JSONL:", err ) });
+ const jsonlMetaOutput = fs
+ .createWriteStream( path.join( fullOutputPath, "combined_with_metadata.jsonl" ) )
+ .on( "error", err =>
+ { return console.error( "Error combining metadata JSONL:", err ) });

  for ( const website of websites )
  {
- const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
+ const jsonlContent = fs.readFileSync(
+ path.join( __dirname, website.jsonlOutputPath ),
+ "utf-8"
+ );
  if ( jsonlContent )
  {
  jsonlOutput.write( jsonlContent );
  }
-
  if ( website.includeMetadata )
  {
- const jsonlMetaContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPathWithMeta ), "utf-8" );
+ const jsonlMetaContent = fs.readFileSync(
+ path.join( __dirname, website.jsonlOutputPathWithMeta ),
+ "utf-8"
+ );
  if ( jsonlMetaContent )
  {
  jsonlMetaOutput.write( jsonlMetaContent );
  }
  }
  }
-
  jsonlOutput.end();
  jsonlMetaOutput.end();
  }
@@ -822,7 +770,8 @@ class WebScraper
  const csvMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.csv" ) );

  csvOutput.write( "text\n" );
- const metadataFields = websites.find( w => { return w.includeMetadata })?.metadataFields || new Set();
+ const metadataFields =
+ websites.find( w => { return w.includeMetadata })?.metadataFields || new Set();

  if ( metadataFields.size > 0 )
  {
@@ -839,10 +788,13 @@ class WebScraper
  {
  csvOutput.write( `${csvContent.join( "\n" )}\n` );
  }
-
  if ( website.includeMetadata )
  {
- const csvMetaContent = fs.readFileSync( path.join( __dirname, website.csvOutputPathWithMeta ), "utf-8" )
+ const csvMetaContent = fs
+ .readFileSync(
+ path.join( __dirname, website.csvOutputPathWithMeta ),
+ "utf-8"
+ )
  .split( "\n" )
  .slice( 1 )
  .filter( line => { return line.trim() });
@@ -852,7 +804,6 @@ class WebScraper
  }
  }
  }
-
  csvOutput.end();
  csvMetaOutput.end();
  }
@@ -860,20 +811,20 @@ class WebScraper
  static combineTextFiles ( fullOutputPath, websites )
  {
  let textFileCounter = 1;
-
  for ( const website of websites )
  {
  const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
-
  for ( const file of textFiles )
  {
- const content = fs.readFileSync( path.join( __dirname, website.textOutputPath, file ), "utf-8" );
+ const content = fs.readFileSync(
+ path.join( __dirname, website.textOutputPath, file ),
+ "utf-8"
+ );
  fs.writeFileSync(
  path.join( fullOutputPath, "texts", `${textFileCounter}.txt` ),
  content,
  "utf-8"
  );
-
  if ( website.includeMetadata )
  {
  const metaContent = fs.readFileSync(
@@ -881,7 +832,11 @@ class WebScraper
  "utf-8"
  );
  fs.writeFileSync(
- path.join( fullOutputPath, "texts_with_metadata", `${textFileCounter}.txt` ),
+ path.join(
+ fullOutputPath,
+ "texts_with_metadata",
+ `${textFileCounter}.txt`
+ ),
  metaContent,
  "utf-8"
  );
@@ -892,4 +847,4 @@ class WebScraper
  }
  }

- module.exports = WebScraper;
+ module.exports = WebScraper;
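The main.js change of note is that the recursive fetchPage, which fetched batches of links concurrently, is replaced by an iterative crawl that walks a breadth-first queue and pauses crawlingDelay milliseconds (default 1000) between pages; the axios retry settings are now read from axiosMaxRetries (default 5) and axiosRetryDelay (default 40000) rather than maxRetries and retryDelay. Below is a self-contained illustration of the queue pattern only; the in-memory site map and helper-free loop are stand-ins, not the package's code.

// Illustration only: breadth-first queue crawling as used by the new crawl().
// Links come from an in-memory map instead of the network, so this runs as-is.
const site = {
  "/": ["/a", "/b"],
  "/a": ["/b", "/c"],
  "/b": [],
  "/c": []
};

function crawl ( startURL, maxDepth )
{
  const queue = [{ url: startURL, depth: 0 }];
  const visited = new Set();
  for ( let i = 0; i < queue.length; i++ ) // the queue grows while we walk it
  {
    const { url, depth } = queue[i];
    if ( depth > maxDepth || visited.has( url ) ) continue;
    visited.add( url );
    // the real crawl would fetch the page, run Readability, and save the article here
    for ( const link of site[url] || [] )
    {
      if ( !visited.has( link ) ) queue.push({ url: link, depth: depth + 1 });
    }
  }
  return [...visited];
}

console.log( crawl( "/", 2 ) ); // [ '/', '/a', '/b', '/c' ]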
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "clean-web-scraper",
- "version": "3.8.7",
+ "version": "3.9.1",
  "main": "main.js",
  "scripts": {
  "start": "node main.js",