clean-web-scraper 3.0.0 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/example-usage.js CHANGED
@@ -20,9 +20,9 @@ async function khameneiIrFreePalestineTag ()
  textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
  csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
  includeMetadata: true,
- metadataFields: ["title", "description", "author", "lastModified", "language", "ogTags"]
+ metadataFields: ["title", "description", "author", "lastModified", "language"]
  });
- await scraper.start();
+ // await scraper.start();
  return scraper;
  }
 
@@ -50,7 +50,7 @@ async function decolonizepalestine ()
  includeMetadata: true,
  metadataFields: ["title", "description", "author", "lastModified", "language"]
  });
- await scraper.start();
+ // await scraper.start();
  return scraper;
  }
 
@@ -58,6 +58,7 @@ void async function main ()
  {
  const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
  const decolonizepalestineScraper = await decolonizepalestine();
+ await WebScraper.sleep( 1000 ); // Sleeps for 1 second
  WebScraper.combineResults( "./dataset/combined", [
  khameneiIrFreePalestineTagScraper,
  decolonizepalestineScraper
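
Both example scrapers are now constructed without being started (await scraper.start() is commented out), and main() pauses via the new WebScraper.sleep before combining results. Below is a minimal sketch of the flow once a start() call is re-enabled; the require path and the baseURL option name are assumptions for illustration, not taken from this diff.

const WebScraper = require( "clean-web-scraper" ); // assumes the package's main export is the WebScraper class

void async function main ()
{
	const scraper = new WebScraper({
		baseURL: "https://example.com", // hypothetical option name and site
		textOutputPath: "./dataset/example/texts",
		csvOutputPath: "./dataset/example/train.csv",
		includeMetadata: true,
		metadataFields: ["title", "description", "author", "lastModified", "language"]
	});
	await scraper.start();            // re-enabled, unlike the commented-out calls above
	await WebScraper.sleep( 1000 );   // the new helper; sleeps for one second
	WebScraper.combineResults( "./dataset/combined", [ scraper ] );
}();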
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "clean-web-scraper",
- "version": "3.0.0",
+ "version": "3.2.0",
  "main": "main.js",
  "scripts": {
  "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -155,9 +155,14 @@ class WebScraper
  {
  const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
  let writeStreamMeta
+
+ // Add error handlers
+ writeStreamSimple.on( "error", ( err ) => { return console.error( "Error writing JSONL:", err ) });
+
  if ( this.includeMetadata )
  {
  writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPathWithMeta ) );
+ writeStreamMeta.on( "error", ( err ) => { return console.error( "Error writing metadata JSONL:", err ) });
  }
  for ( const content of this.allProcessedContent )
  {
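
The new handlers follow Node's standard writable-stream error pattern: fs.createWriteStream returns a stream whose unhandled "error" event would otherwise crash the process, so each stream now logs the failure instead. A minimal standalone sketch of the same pattern; the file name is a placeholder, not one of the package's output paths.

const fs = require( "fs" );
const path = require( "path" );

// Hypothetical output path, for illustration only
const out = fs.createWriteStream( path.join( __dirname, "output.jsonl" ) );

// Without this listener, an "error" event (e.g. ENOSPC or EACCES) is thrown
// and terminates the process; with it, the failure is logged and execution continues.
out.on( "error", ( err ) => { return console.error( "Error writing JSONL:", err ) });

out.write( `${JSON.stringify({ text: "example line" })}\n` );
out.end();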
@@ -171,6 +176,7 @@ class WebScraper
  if ( this.includeMetadata )
  {
  writeStreamMeta.end();
+ console.log( `Created JSONL file at: ${this.jsonlOutputPathWithMeta}` );
  }
  console.log( `Created JSONL file at: ${this.jsonlOutputPath}` );
  }
@@ -179,6 +185,7 @@ class WebScraper
  {
  // Create simple version
  const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
+ writeStreamSimple.on( "error", ( err ) => { return console.error( "Error writing CSV:", err ) });
  writeStreamSimple.write( "text\n" );
 
  // Create metadata version if requested
@@ -186,6 +193,7 @@ class WebScraper
  if ( this.includeMetadata )
  {
  writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.csvOutputPathWithMeta ) );
+ writeStreamMeta.on( "error", ( err ) => { return console.error( "Error writing metadata CSV:", err ) });
  }
 
  if ( this.includeMetadata )
@@ -359,11 +367,17 @@ class WebScraper
  if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
  {
  fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
+ fs.rmSync( path.join( __dirname, this.textOutputPath ), { recursive: true, force: true });
  }
  fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
  fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
  }
 
+ static sleep ( ms )
+ {
+ return new Promise( resolve => { return setTimeout( resolve, ms ) });
+ }
+
  static combineResults ( outputPath, websites )
  {
  const fullOutputPath = path.join( __dirname, outputPath );
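
The new static sleep( ms ) is a promise-wrapped setTimeout, which is what lets example-usage.js await it. A standalone sketch of the same shape and how an await on it behaves; the timing log is illustrative.

// Same shape as the helper added above, shown outside the class for illustration
function sleep ( ms )
{
	return new Promise( resolve => { return setTimeout( resolve, ms ) });
}

void async function ()
{
	console.log( "before", new Date().toISOString() );
	await sleep( 1000 ); // resolves roughly one second later
	console.log( "after", new Date().toISOString() );
}();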
@@ -371,33 +385,58 @@ class WebScraper
  // Create output directories
  fs.mkdirSync( fullOutputPath, { recursive: true });
  fs.mkdirSync( path.join( fullOutputPath, "texts" ), { recursive: true });
+ fs.mkdirSync( path.join( fullOutputPath, "texts_with_metadata" ), { recursive: true });
+
+ // Combine regular JSONL files
+ const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) )
+ .on( "error", ( err ) => { return console.error( "Error combining JSONL:", err ) });
+ const jsonlMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.jsonl" ) )
+ .on( "error", ( err ) => { return console.error( "Error combining metadata JSONL:", err ) });
+ const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
+ const csvMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.csv" ) );
 
- // Combine JSONL files
- const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) );
+ csvOutput.write( "text\n" );
+ const metadataFields = websites.find( w => { return w.includeMetadata })?.metadataFields || new Set();
+ if ( metadataFields.size > 0 )
+ {
+ csvMetaOutput.write( `text,${Array.from( metadataFields ).join( "," )}\n` );
+ }
  for ( const website of websites )
  {
  const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
  jsonlOutput.write( jsonlContent );
- }
- jsonlOutput.end();
 
- // Combine CSV files
- const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
- csvOutput.write( "text\n" );
- for ( const website of websites )
- {
  const csvContent = fs.readFileSync( path.join( __dirname, website.csvOutputPath ), "utf-8" )
  .split( "\n" )
- .slice( 1 ) // Skip header
+ .slice( 1 )
  .filter( line => { return line.trim() });
- csvOutput.write( `${csvContent.join( "\n" ) }\n` );
+ csvOutput.write( `${csvContent.join( "\n" )}\n` );
+
+ // Combine metadata files if they exist
+ if ( website.includeMetadata )
+ {
+ const jsonlMetaContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPathWithMeta ), "utf-8" );
+ jsonlMetaOutput.write( jsonlMetaContent );
+
+ const csvMetaContent = fs.readFileSync( path.join( __dirname, website.csvOutputPathWithMeta ), "utf-8" )
+ .split( "\n" )
+ .slice( 1 )
+ .filter( line => { return line.trim() });
+ csvMetaOutput.write( `${csvMetaContent.join( "\n" )}\n` );
+ }
  }
+
+ // Close all streams
+ jsonlOutput.end();
+ jsonlMetaOutput.end();
  csvOutput.end();
+ csvMetaOutput.end();
 
- // Combine text files
+ // Combine text files (both regular and metadata versions)
  let textFileCounter = 1;
  for ( const website of websites )
  {
+ // Regular text files
  const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
  for ( const file of textFiles )
  {
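
The combined metadata CSV header is taken from the first website that has includeMetadata enabled, and the code treats the stored metadataFields as a Set (note the new Set() fallback, the .size check, and Array.from); presumably the constructor normalizes the array passed in example-usage.js into a Set. A small sketch of that header derivation in isolation, with hypothetical stand-in objects that model only the two fields used here.

// Hypothetical stand-ins for scraper instances; only includeMetadata and metadataFields are modeled
const websites = [
	{ includeMetadata: false },
	{ includeMetadata: true, metadataFields: new Set([ "title", "description", "language" ]) }
];

const metadataFields = websites.find( w => { return w.includeMetadata })?.metadataFields || new Set();
if ( metadataFields.size > 0 )
{
	// Prints: text,title,description,language
	console.log( `text,${Array.from( metadataFields ).join( "," )}` );
}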
@@ -407,6 +446,20 @@ class WebScraper
  content,
  "utf-8"
  );
+
+ // Metadata text files if they exist
+ if ( website.includeMetadata )
+ {
+ const metaContent = fs.readFileSync(
+ path.join( __dirname, `${website.textOutputPath}_with_metadata`, file ),
+ "utf-8"
+ );
+ fs.writeFileSync(
+ path.join( fullOutputPath, "texts_with_metadata", `${textFileCounter}.txt` ),
+ metaContent,
+ "utf-8"
+ );
+ }
  textFileCounter++;
  }
  }
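
Taken together, combineResults now writes combined.jsonl, combined_with_metadata.jsonl, combined.csv, combined_with_metadata.csv, and parallel texts/ and texts_with_metadata/ folders under the given output path (resolved against the package's src/ directory via __dirname). A minimal invocation sketch; WebScraper is assumed to be in scope, and the scrapers argument stands for instances whose start() runs have already written their per-site outputs to disk.

// Sketch only: scrapers is assumed to hold WebScraper instances with completed start() runs
async function combine ( scrapers )
{
	await WebScraper.sleep( 1000 ); // give any pending writes a moment, as in example-usage.js
	WebScraper.combineResults( "./dataset/combined", scrapers );
	// Expected outputs (per the streams and mkdirSync calls in this diff):
	//   combined.jsonl, combined_with_metadata.jsonl,
	//   combined.csv, combined_with_metadata.csv,
	//   texts/<n>.txt and texts_with_metadata/<n>.txt
}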