clean-web-scraper 3.0.0 → 3.1.0
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/example-usage.js +1 -1
- package/package.json +1 -1
- package/src/WebScraper.js +50 -11
package/example-usage.js
CHANGED
|
@@ -20,7 +20,7 @@ async function khameneiIrFreePalestineTag ()
|
|
|
20
20
|
textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
|
|
21
21
|
csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
|
|
22
22
|
includeMetadata: true,
|
|
23
|
-
metadataFields: ["title", "description", "author", "lastModified", "language"
|
|
23
|
+
metadataFields: ["title", "description", "author", "lastModified", "language"]
|
|
24
24
|
});
|
|
25
25
|
await scraper.start();
|
|
26
26
|
return scraper;
|
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
|
@@ -359,6 +359,7 @@ class WebScraper
|
|
|
359
359
|
if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
|
|
360
360
|
{
|
|
361
361
|
fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
|
|
362
|
+
fs.rmSync( path.join( __dirname, this.textOutputPath ), { recursive: true, force: true });
|
|
362
363
|
}
|
|
363
364
|
fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
|
|
364
365
|
fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
|
|
@@ -371,33 +372,56 @@ class WebScraper
|
|
|
371
372
|
// Create output directories
|
|
372
373
|
fs.mkdirSync( fullOutputPath, { recursive: true });
|
|
373
374
|
fs.mkdirSync( path.join( fullOutputPath, "texts" ), { recursive: true });
|
|
375
|
+
fs.mkdirSync( path.join( fullOutputPath, "texts_with_metadata" ), { recursive: true });
|
|
374
376
|
|
|
375
|
-
// Combine JSONL files
|
|
377
|
+
// Combine regular JSONL files
|
|
376
378
|
const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) );
|
|
379
|
+
const jsonlMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.jsonl" ) );
|
|
380
|
+
const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
|
|
381
|
+
const csvMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.csv" ) );
|
|
382
|
+
|
|
383
|
+
csvOutput.write( "text\n" );
|
|
384
|
+
const metadataFields = websites.find( w => { return w.includeMetadata })?.metadataFields || new Set();
|
|
385
|
+
if ( metadataFields.size > 0 )
|
|
386
|
+
{
|
|
387
|
+
csvMetaOutput.write( `text,${Array.from( metadataFields ).join( "," )}\n` );
|
|
388
|
+
}
|
|
377
389
|
for ( const website of websites )
|
|
378
390
|
{
|
|
379
391
|
const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
|
|
380
392
|
jsonlOutput.write( jsonlContent );
|
|
381
|
-
}
|
|
382
|
-
jsonlOutput.end();
|
|
383
393
|
|
|
384
|
-
// Combine CSV files
|
|
385
|
-
const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
|
|
386
|
-
csvOutput.write( "text\n" );
|
|
387
|
-
for ( const website of websites )
|
|
388
|
-
{
|
|
389
394
|
const csvContent = fs.readFileSync( path.join( __dirname, website.csvOutputPath ), "utf-8" )
|
|
390
395
|
.split( "\n" )
|
|
391
|
-
.slice( 1 )
|
|
396
|
+
.slice( 1 )
|
|
392
397
|
.filter( line => { return line.trim() });
|
|
393
|
-
csvOutput.write( `${csvContent.join( "\n" )
|
|
398
|
+
csvOutput.write( `${csvContent.join( "\n" )}\n` );
|
|
399
|
+
|
|
400
|
+
// Combine metadata files if they exist
|
|
401
|
+
if ( website.includeMetadata )
|
|
402
|
+
{
|
|
403
|
+
const jsonlMetaContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPathWithMeta ), "utf-8" );
|
|
404
|
+
jsonlMetaOutput.write( jsonlMetaContent );
|
|
405
|
+
|
|
406
|
+
const csvMetaContent = fs.readFileSync( path.join( __dirname, website.csvOutputPathWithMeta ), "utf-8" )
|
|
407
|
+
.split( "\n" )
|
|
408
|
+
.slice( 1 )
|
|
409
|
+
.filter( line => { return line.trim() });
|
|
410
|
+
csvMetaOutput.write( `${csvMetaContent.join( "\n" )}\n` );
|
|
411
|
+
}
|
|
394
412
|
}
|
|
413
|
+
|
|
414
|
+
// Close all streams
|
|
415
|
+
jsonlOutput.end();
|
|
416
|
+
jsonlMetaOutput.end();
|
|
395
417
|
csvOutput.end();
|
|
418
|
+
csvMetaOutput.end();
|
|
396
419
|
|
|
397
|
-
// Combine text files
|
|
420
|
+
// Combine text files (both regular and metadata versions)
|
|
398
421
|
let textFileCounter = 1;
|
|
399
422
|
for ( const website of websites )
|
|
400
423
|
{
|
|
424
|
+
// Regular text files
|
|
401
425
|
const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
|
|
402
426
|
for ( const file of textFiles )
|
|
403
427
|
{
|
|
@@ -407,10 +431,25 @@ class WebScraper
|
|
|
407
431
|
content,
|
|
408
432
|
"utf-8"
|
|
409
433
|
);
|
|
434
|
+
|
|
435
|
+
// Metadata text files if they exist
|
|
436
|
+
if ( website.includeMetadata )
|
|
437
|
+
{
|
|
438
|
+
const metaContent = fs.readFileSync(
|
|
439
|
+
path.join( __dirname, `${website.textOutputPath}_with_metadata`, file ),
|
|
440
|
+
"utf-8"
|
|
441
|
+
);
|
|
442
|
+
fs.writeFileSync(
|
|
443
|
+
path.join( fullOutputPath, "texts_with_metadata", `${textFileCounter}.txt` ),
|
|
444
|
+
metaContent,
|
|
445
|
+
"utf-8"
|
|
446
|
+
);
|
|
447
|
+
}
|
|
410
448
|
textFileCounter++;
|
|
411
449
|
}
|
|
412
450
|
}
|
|
413
451
|
}
|
|
452
|
+
|
|
414
453
|
}
|
|
415
454
|
|
|
416
455
|
module.exports = WebScraper;
|