clean-web-scraper 3.0.0 → 3.1.0

package/example-usage.js CHANGED
@@ -20,7 +20,7 @@ async function khameneiIrFreePalestineTag ()
     textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
     csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
     includeMetadata: true,
-    metadataFields: ["title", "description", "author", "lastModified", "language", "ogTags"]
+    metadataFields: ["title", "description", "author", "lastModified", "language"]
   });
   await scraper.start();
   return scraper;
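
For context, the hunk shows only the tail of the options object. Here is a sketch of how the updated example plausibly reads in full; the require path, the baseURL value, and the constructor call are illustrative assumptions, since only the four option lines, the start() call, and the return are confirmed by this diff.

// Sketch only: the require path, baseURL, and constructor shape are
// assumptions; the option values match the hunk above.
const WebScraper = require( "./main" );

async function khameneiIrFreePalestineTag ()
{
    const scraper = new WebScraper({
        baseURL: "https://english.khamenei.ir", // assumed; not shown in the hunk
        textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
        csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
        includeMetadata: true,
        metadataFields: ["title", "description", "author", "lastModified", "language"]
    });
    await scraper.start();
    return scraper;
}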
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "clean-web-scraper",
-  "version": "3.0.0",
+  "version": "3.1.0",
   "main": "main.js",
   "scripts": {
     "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -359,6 +359,7 @@ class WebScraper
     if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
     {
       fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
+      fs.rmSync( path.join( __dirname, this.textOutputPath ), { recursive: true, force: true });
     }
     fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
     fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
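
The added rmSync call means 3.1.0 clears the plain-text output directory at startup along with the scrape results, so files left over from a previous run can no longer leak into a freshly generated dataset. The pattern reduces to reset-then-recreate; a minimal standalone sketch, where resetDir is a hypothetical helper name rather than part of WebScraper:

const fs = require( "fs" );

// Remove a directory tree if present, then recreate it empty.
function resetDir ( dir )
{
    // force: true makes rmSync a no-op when dir is missing, so the
    // existsSync guard seen in the hunk is defensive rather than required
    fs.rmSync( dir, { recursive: true, force: true });
    fs.mkdirSync( dir, { recursive: true });
}

resetDir( "./dataset/example-site/texts" ); // hypothetical path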
@@ -371,33 +372,56 @@ class WebScraper
     // Create output directories
     fs.mkdirSync( fullOutputPath, { recursive: true });
     fs.mkdirSync( path.join( fullOutputPath, "texts" ), { recursive: true });
+    fs.mkdirSync( path.join( fullOutputPath, "texts_with_metadata" ), { recursive: true });
 
-    // Combine JSONL files
+    // Combine regular JSONL files
     const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) );
+    const jsonlMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.jsonl" ) );
+    const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
+    const csvMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.csv" ) );
+
+    csvOutput.write( "text\n" );
+    const metadataFields = websites.find( w => { return w.includeMetadata })?.metadataFields || new Set();
+    if ( metadataFields.size > 0 )
+    {
+      csvMetaOutput.write( `text,${Array.from( metadataFields ).join( "," )}\n` );
+    }
     for ( const website of websites )
     {
       const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
       jsonlOutput.write( jsonlContent );
-    }
-    jsonlOutput.end();
 
-    // Combine CSV files
-    const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
-    csvOutput.write( "text\n" );
-    for ( const website of websites )
-    {
       const csvContent = fs.readFileSync( path.join( __dirname, website.csvOutputPath ), "utf-8" )
       .split( "\n" )
-      .slice( 1 ) // Skip header
+      .slice( 1 )
       .filter( line => { return line.trim() });
-      csvOutput.write( `${csvContent.join( "\n" ) }\n` );
+      csvOutput.write( `${csvContent.join( "\n" )}\n` );
+
+      // Combine metadata files if they exist
+      if ( website.includeMetadata )
+      {
+        const jsonlMetaContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPathWithMeta ), "utf-8" );
+        jsonlMetaOutput.write( jsonlMetaContent );
+
+        const csvMetaContent = fs.readFileSync( path.join( __dirname, website.csvOutputPathWithMeta ), "utf-8" )
+        .split( "\n" )
+        .slice( 1 )
+        .filter( line => { return line.trim() });
+        csvMetaOutput.write( `${csvMetaContent.join( "\n" )}\n` );
+      }
     }
+
+    // Close all streams
+    jsonlOutput.end();
+    jsonlMetaOutput.end();
     csvOutput.end();
+    csvMetaOutput.end();
 
-    // Combine text files
+    // Combine text files (both regular and metadata versions)
     let textFileCounter = 1;
     for ( const website of websites )
     {
+      // Regular text files
       const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
       for ( const file of textFiles )
       {
@@ -407,10 +431,25 @@ class WebScraper
           content,
           "utf-8"
         );
+
+        // Metadata text files if they exist
+        if ( website.includeMetadata )
+        {
+          const metaContent = fs.readFileSync(
+            path.join( __dirname, `${website.textOutputPath}_with_metadata`, file ),
+            "utf-8"
+          );
+          fs.writeFileSync(
+            path.join( fullOutputPath, "texts_with_metadata", `${textFileCounter}.txt` ),
+            metaContent,
+            "utf-8"
+          );
+        }
         textFileCounter++;
       }
     }
   }
+
 }
 
 module.exports = WebScraper;
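
One detail worth tracing in the new combine logic: the combined_with_metadata.csv header is written only when metadataFields.size > 0, and .size is a Set property. example-usage.js passes metadataFields as a plain array, whose .size is undefined, so under that configuration the guard is false and no header line is emitted. A minimal sketch of the guard, isolated with the Set form the code expects:

// Header guard from the combine hunk, isolated. With a Set the header
// prints as shown; with a plain array, .size is undefined, the branch
// is skipped, and combined_with_metadata.csv would start headerless.
const metadataFields = new Set([ "title", "description", "author", "lastModified", "language" ]);

if ( metadataFields.size > 0 )
{
    const header = `text,${Array.from( metadataFields ).join( "," )}`;
    console.log( header ); // -> text,title,description,author,lastModified,language
}

After the combine step, the output directory holds combined.jsonl and combined.csv, their _with_metadata counterparts, and sequentially numbered .txt files under texts/ and, for sites with includeMetadata set, texts_with_metadata/.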