clean-web-scraper 3.1.0 → 3.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -57,10 +57,10 @@ const scraper = new WebScraper({
57
57
  includeMetadata: false, // Optional: Include metadata in output files
58
58
  metadataFields: ['title', 'description'] // Optional: Specify metadata fields to include
59
59
  });
60
- scraper.start();
60
+ await scraper.start();
61
61
 
62
62
  // Combine results from multiple scrapers
63
- WebScraper.combineResults('./combined-dataset', [scraper1, scraper2]);
63
+ await WebScraper.combineResults('./combined-dataset', [scraper1, scraper2]);
64
64
  ```
65
65
 
66
66
  ```bash
package/example-usage.js CHANGED
@@ -58,7 +58,7 @@ void async function main ()
58
58
  {
59
59
  const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
60
60
  const decolonizepalestineScraper = await decolonizepalestine();
61
- WebScraper.combineResults( "./dataset/combined", [
61
+ await WebScraper.combineResults( "./dataset/combined", [
62
62
  khameneiIrFreePalestineTagScraper,
63
63
  decolonizepalestineScraper
64
64
  ] );
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.1.0",
3
+ "version": "3.2.1",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -155,9 +155,14 @@ class WebScraper
155
155
  {
156
156
  const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
157
157
  let writeStreamMeta
158
+
159
+ // Add error handlers
160
+ writeStreamSimple.on( "error", ( err ) => { return console.error( "Error writing JSONL:", err ) });
161
+
158
162
  if ( this.includeMetadata )
159
163
  {
160
164
  writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPathWithMeta ) );
165
+ writeStreamMeta.on( "error", ( err ) => { return console.error( "Error writing metadata JSONL:", err ) });
161
166
  }
162
167
  for ( const content of this.allProcessedContent )
163
168
  {
@@ -171,6 +176,7 @@ class WebScraper
171
176
  if ( this.includeMetadata )
172
177
  {
173
178
  writeStreamMeta.end();
179
+ console.log( `Created JSONL file at: ${this.jsonlOutputPathWithMeta}` );
174
180
  }
175
181
  console.log( `Created JSONL file at: ${this.jsonlOutputPath}` );
176
182
  }
@@ -179,6 +185,7 @@ class WebScraper
179
185
  {
180
186
  // Create simple version
181
187
  const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
188
+ writeStreamSimple.on( "error", ( err ) => { return console.error( "Error writing CSV:", err ) });
182
189
  writeStreamSimple.write( "text\n" );
183
190
 
184
191
  // Create metadata version if requested
@@ -186,6 +193,7 @@ class WebScraper
186
193
  if ( this.includeMetadata )
187
194
  {
188
195
  writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.csvOutputPathWithMeta ) );
196
+ writeStreamMeta.on( "error", ( err ) => { return console.error( "Error writing metadata CSV:", err ) });
189
197
  }
190
198
 
191
199
  if ( this.includeMetadata )
@@ -365,8 +373,14 @@ class WebScraper
365
373
  fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
366
374
  }
367
375
 
368
- static combineResults ( outputPath, websites )
376
+ static sleep ( ms )
377
+ {
378
+ return new Promise( resolve => { return setTimeout( resolve, ms ) });
379
+ }
380
+
381
+ static async combineResults ( outputPath, websites )
369
382
  {
383
+ await WebScraper.sleep( 1000 );
370
384
  const fullOutputPath = path.join( __dirname, outputPath );
371
385
 
372
386
  // Create output directories
@@ -375,8 +389,10 @@ class WebScraper
375
389
  fs.mkdirSync( path.join( fullOutputPath, "texts_with_metadata" ), { recursive: true });
376
390
 
377
391
  // Combine regular JSONL files
378
- const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) );
379
- const jsonlMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.jsonl" ) );
392
+ const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) )
393
+ .on( "error", ( err ) => { return console.error( "Error combining JSONL:", err ) });
394
+ const jsonlMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.jsonl" ) )
395
+ .on( "error", ( err ) => { return console.error( "Error combining metadata JSONL:", err ) });
380
396
  const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
381
397
  const csvMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.csv" ) );
382
398
 
@@ -449,7 +465,6 @@ class WebScraper
449
465
  }
450
466
  }
451
467
  }
452
-
453
468
  }
454
469
 
455
470
  module.exports = WebScraper;