clean-web-scraper 3.1.0 → 3.2.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/example-usage.js CHANGED
@@ -22,7 +22,7 @@ async function khameneiIrFreePalestineTag ()
22
22
  includeMetadata: true,
23
23
  metadataFields: ["title", "description", "author", "lastModified", "language"]
24
24
  });
25
- await scraper.start();
25
+ // await scraper.start();
26
26
  return scraper;
27
27
  }
28
28
 
@@ -50,7 +50,7 @@ async function decolonizepalestine ()
50
50
  includeMetadata: true,
51
51
  metadataFields: ["title", "description", "author", "lastModified", "language"]
52
52
  });
53
- await scraper.start();
53
+ // await scraper.start();
54
54
  return scraper;
55
55
  }
56
56
 
@@ -58,6 +58,7 @@ void async function main ()
58
58
  {
59
59
  const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
60
60
  const decolonizepalestineScraper = await decolonizepalestine();
61
+ await WebScraper.sleep( 1000 ); // Sleeps for 1 second
61
62
  WebScraper.combineResults( "./dataset/combined", [
62
63
  khameneiIrFreePalestineTagScraper,
63
64
  decolonizepalestineScraper
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "3.1.0",
3
+ "version": "3.2.0",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -155,9 +155,14 @@ class WebScraper
155
155
  {
156
156
  const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
157
157
  let writeStreamMeta
158
+
159
+ // Add error handlers
160
+ writeStreamSimple.on( "error", ( err ) => { return console.error( "Error writing JSONL:", err ) });
161
+
158
162
  if ( this.includeMetadata )
159
163
  {
160
164
  writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPathWithMeta ) );
165
+ writeStreamMeta.on( "error", ( err ) => { return console.error( "Error writing metadata JSONL:", err ) });
161
166
  }
162
167
  for ( const content of this.allProcessedContent )
163
168
  {
@@ -171,6 +176,7 @@ class WebScraper
171
176
  if ( this.includeMetadata )
172
177
  {
173
178
  writeStreamMeta.end();
179
+ console.log( `Created JSONL file at: ${this.jsonlOutputPathWithMeta}` );
174
180
  }
175
181
  console.log( `Created JSONL file at: ${this.jsonlOutputPath}` );
176
182
  }
@@ -179,6 +185,7 @@ class WebScraper
179
185
  {
180
186
  // Create simple version
181
187
  const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
188
+ writeStreamSimple.on( "error", ( err ) => { return console.error( "Error writing CSV:", err ) });
182
189
  writeStreamSimple.write( "text\n" );
183
190
 
184
191
  // Create metadata version if requested
@@ -186,6 +193,7 @@ class WebScraper
186
193
  if ( this.includeMetadata )
187
194
  {
188
195
  writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.csvOutputPathWithMeta ) );
196
+ writeStreamMeta.on( "error", ( err ) => { return console.error( "Error writing metadata CSV:", err ) });
189
197
  }
190
198
 
191
199
  if ( this.includeMetadata )
@@ -365,6 +373,11 @@ class WebScraper
365
373
  fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
366
374
  }
367
375
 
376
+ static sleep ( ms )
377
+ {
378
+ return new Promise( resolve => { return setTimeout( resolve, ms ) });
379
+ }
380
+
368
381
  static combineResults ( outputPath, websites )
369
382
  {
370
383
  const fullOutputPath = path.join( __dirname, outputPath );
@@ -375,8 +388,10 @@ class WebScraper
375
388
  fs.mkdirSync( path.join( fullOutputPath, "texts_with_metadata" ), { recursive: true });
376
389
 
377
390
  // Combine regular JSONL files
378
- const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) );
379
- const jsonlMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.jsonl" ) );
391
+ const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) )
392
+ .on( "error", ( err ) => { return console.error( "Error combining JSONL:", err ) });
393
+ const jsonlMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.jsonl" ) )
394
+ .on( "error", ( err ) => { return console.error( "Error combining metadata JSONL:", err ) });
380
395
  const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
381
396
  const csvMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.csv" ) );
382
397
 
@@ -449,7 +464,6 @@ class WebScraper
449
464
  }
450
465
  }
451
466
  }
452
-
453
467
  }
454
468
 
455
469
  module.exports = WebScraper;