clean-web-scraper 3.0.0 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/example-usage.js +4 -3
- package/package.json +1 -1
- package/src/WebScraper.js +65 -12
package/example-usage.js
CHANGED
|
@@ -20,9 +20,9 @@ async function khameneiIrFreePalestineTag ()
|
|
|
20
20
|
textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
|
|
21
21
|
csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
|
|
22
22
|
includeMetadata: true,
|
|
23
|
-
metadataFields: ["title", "description", "author", "lastModified", "language"
|
|
23
|
+
metadataFields: ["title", "description", "author", "lastModified", "language"]
|
|
24
24
|
});
|
|
25
|
-
await scraper.start();
|
|
25
|
+
// await scraper.start();
|
|
26
26
|
return scraper;
|
|
27
27
|
}
|
|
28
28
|
|
|
@@ -50,7 +50,7 @@ async function decolonizepalestine ()
|
|
|
50
50
|
includeMetadata: true,
|
|
51
51
|
metadataFields: ["title", "description", "author", "lastModified", "language"]
|
|
52
52
|
});
|
|
53
|
-
await scraper.start();
|
|
53
|
+
// await scraper.start();
|
|
54
54
|
return scraper;
|
|
55
55
|
}
|
|
56
56
|
|
|
@@ -58,6 +58,7 @@ void async function main ()
|
|
|
58
58
|
{
|
|
59
59
|
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
|
|
60
60
|
const decolonizepalestineScraper = await decolonizepalestine();
|
|
61
|
+
await WebScraper.sleep( 1000 ); // Sleeps for 1 second
|
|
61
62
|
WebScraper.combineResults( "./dataset/combined", [
|
|
62
63
|
khameneiIrFreePalestineTagScraper,
|
|
63
64
|
decolonizepalestineScraper
|
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
|
@@ -155,9 +155,14 @@ class WebScraper
|
|
|
155
155
|
{
|
|
156
156
|
const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
|
|
157
157
|
let writeStreamMeta
|
|
158
|
+
|
|
159
|
+
// Add error handlers
|
|
160
|
+
writeStreamSimple.on( "error", ( err ) => { return console.error( "Error writing JSONL:", err ) });
|
|
161
|
+
|
|
158
162
|
if ( this.includeMetadata )
|
|
159
163
|
{
|
|
160
164
|
writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPathWithMeta ) );
|
|
165
|
+
writeStreamMeta.on( "error", ( err ) => { return console.error( "Error writing metadata JSONL:", err ) });
|
|
161
166
|
}
|
|
162
167
|
for ( const content of this.allProcessedContent )
|
|
163
168
|
{
|
|
@@ -171,6 +176,7 @@ class WebScraper
|
|
|
171
176
|
if ( this.includeMetadata )
|
|
172
177
|
{
|
|
173
178
|
writeStreamMeta.end();
|
|
179
|
+
console.log( `Created JSONL file at: ${this.jsonlOutputPathWithMeta}` );
|
|
174
180
|
}
|
|
175
181
|
console.log( `Created JSONL file at: ${this.jsonlOutputPath}` );
|
|
176
182
|
}
|
|
@@ -179,6 +185,7 @@ class WebScraper
|
|
|
179
185
|
{
|
|
180
186
|
// Create simple version
|
|
181
187
|
const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
|
|
188
|
+
writeStreamSimple.on( "error", ( err ) => { return console.error( "Error writing CSV:", err ) });
|
|
182
189
|
writeStreamSimple.write( "text\n" );
|
|
183
190
|
|
|
184
191
|
// Create metadata version if requested
|
|
@@ -186,6 +193,7 @@ class WebScraper
|
|
|
186
193
|
if ( this.includeMetadata )
|
|
187
194
|
{
|
|
188
195
|
writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.csvOutputPathWithMeta ) );
|
|
196
|
+
writeStreamMeta.on( "error", ( err ) => { return console.error( "Error writing metadata CSV:", err ) });
|
|
189
197
|
}
|
|
190
198
|
|
|
191
199
|
if ( this.includeMetadata )
|
|
@@ -359,11 +367,17 @@ class WebScraper
|
|
|
359
367
|
if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
|
|
360
368
|
{
|
|
361
369
|
fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
|
|
370
|
+
fs.rmSync( path.join( __dirname, this.textOutputPath ), { recursive: true, force: true });
|
|
362
371
|
}
|
|
363
372
|
fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
|
|
364
373
|
fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
|
|
365
374
|
}
|
|
366
375
|
|
|
376
|
+
static sleep ( ms )
|
|
377
|
+
{
|
|
378
|
+
return new Promise( resolve => { return setTimeout( resolve, ms ) });
|
|
379
|
+
}
|
|
380
|
+
|
|
367
381
|
static combineResults ( outputPath, websites )
|
|
368
382
|
{
|
|
369
383
|
const fullOutputPath = path.join( __dirname, outputPath );
|
|
@@ -371,33 +385,58 @@ class WebScraper
|
|
|
371
385
|
// Create output directories
|
|
372
386
|
fs.mkdirSync( fullOutputPath, { recursive: true });
|
|
373
387
|
fs.mkdirSync( path.join( fullOutputPath, "texts" ), { recursive: true });
|
|
388
|
+
fs.mkdirSync( path.join( fullOutputPath, "texts_with_metadata" ), { recursive: true });
|
|
389
|
+
|
|
390
|
+
// Combine regular JSONL files
|
|
391
|
+
const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) )
|
|
392
|
+
.on( "error", ( err ) => { return console.error( "Error combining JSONL:", err ) });
|
|
393
|
+
const jsonlMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.jsonl" ) )
|
|
394
|
+
.on( "error", ( err ) => { return console.error( "Error combining metadata JSONL:", err ) });
|
|
395
|
+
const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
|
|
396
|
+
const csvMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.csv" ) );
|
|
374
397
|
|
|
375
|
-
|
|
376
|
-
const
|
|
398
|
+
csvOutput.write( "text\n" );
|
|
399
|
+
const metadataFields = websites.find( w => { return w.includeMetadata })?.metadataFields || new Set();
|
|
400
|
+
if ( metadataFields.size > 0 )
|
|
401
|
+
{
|
|
402
|
+
csvMetaOutput.write( `text,${Array.from( metadataFields ).join( "," )}\n` );
|
|
403
|
+
}
|
|
377
404
|
for ( const website of websites )
|
|
378
405
|
{
|
|
379
406
|
const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
|
|
380
407
|
jsonlOutput.write( jsonlContent );
|
|
381
|
-
}
|
|
382
|
-
jsonlOutput.end();
|
|
383
408
|
|
|
384
|
-
// Combine CSV files
|
|
385
|
-
const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
|
|
386
|
-
csvOutput.write( "text\n" );
|
|
387
|
-
for ( const website of websites )
|
|
388
|
-
{
|
|
389
409
|
const csvContent = fs.readFileSync( path.join( __dirname, website.csvOutputPath ), "utf-8" )
|
|
390
410
|
.split( "\n" )
|
|
391
|
-
.slice( 1 )
|
|
411
|
+
.slice( 1 )
|
|
392
412
|
.filter( line => { return line.trim() });
|
|
393
|
-
csvOutput.write( `${csvContent.join( "\n" )
|
|
413
|
+
csvOutput.write( `${csvContent.join( "\n" )}\n` );
|
|
414
|
+
|
|
415
|
+
// Combine metadata files if they exist
|
|
416
|
+
if ( website.includeMetadata )
|
|
417
|
+
{
|
|
418
|
+
const jsonlMetaContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPathWithMeta ), "utf-8" );
|
|
419
|
+
jsonlMetaOutput.write( jsonlMetaContent );
|
|
420
|
+
|
|
421
|
+
const csvMetaContent = fs.readFileSync( path.join( __dirname, website.csvOutputPathWithMeta ), "utf-8" )
|
|
422
|
+
.split( "\n" )
|
|
423
|
+
.slice( 1 )
|
|
424
|
+
.filter( line => { return line.trim() });
|
|
425
|
+
csvMetaOutput.write( `${csvMetaContent.join( "\n" )}\n` );
|
|
426
|
+
}
|
|
394
427
|
}
|
|
428
|
+
|
|
429
|
+
// Close all streams
|
|
430
|
+
jsonlOutput.end();
|
|
431
|
+
jsonlMetaOutput.end();
|
|
395
432
|
csvOutput.end();
|
|
433
|
+
csvMetaOutput.end();
|
|
396
434
|
|
|
397
|
-
// Combine text files
|
|
435
|
+
// Combine text files (both regular and metadata versions)
|
|
398
436
|
let textFileCounter = 1;
|
|
399
437
|
for ( const website of websites )
|
|
400
438
|
{
|
|
439
|
+
// Regular text files
|
|
401
440
|
const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
|
|
402
441
|
for ( const file of textFiles )
|
|
403
442
|
{
|
|
@@ -407,6 +446,20 @@ class WebScraper
|
|
|
407
446
|
content,
|
|
408
447
|
"utf-8"
|
|
409
448
|
);
|
|
449
|
+
|
|
450
|
+
// Metadata text files if they exist
|
|
451
|
+
if ( website.includeMetadata )
|
|
452
|
+
{
|
|
453
|
+
const metaContent = fs.readFileSync(
|
|
454
|
+
path.join( __dirname, `${website.textOutputPath}_with_metadata`, file ),
|
|
455
|
+
"utf-8"
|
|
456
|
+
);
|
|
457
|
+
fs.writeFileSync(
|
|
458
|
+
path.join( fullOutputPath, "texts_with_metadata", `${textFileCounter}.txt` ),
|
|
459
|
+
metaContent,
|
|
460
|
+
"utf-8"
|
|
461
|
+
);
|
|
462
|
+
}
|
|
410
463
|
textFileCounter++;
|
|
411
464
|
}
|
|
412
465
|
}
|