clean-web-scraper 3.1.0 → 3.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/example-usage.js +1 -1
- package/package.json +1 -1
- package/src/WebScraper.js +19 -4
package/README.md
CHANGED
|
@@ -57,10 +57,10 @@ const scraper = new WebScraper({
|
|
|
57
57
|
includeMetadata: false, // Optional: Include metadata in output files
|
|
58
58
|
metadataFields: ['title', 'description'] // Optional: Specify metadata fields to include
|
|
59
59
|
});
|
|
60
|
-
scraper.start();
|
|
60
|
+
await scraper.start();
|
|
61
61
|
|
|
62
62
|
// Combine results from multiple scrapers
|
|
63
|
-
WebScraper.combineResults('./combined-dataset', [scraper1, scraper2]);
|
|
63
|
+
await WebScraper.combineResults('./combined-dataset', [scraper1, scraper2]);
|
|
64
64
|
```
|
|
65
65
|
|
|
66
66
|
```bash
|
package/example-usage.js
CHANGED
|
@@ -58,7 +58,7 @@ void async function main ()
|
|
|
58
58
|
{
|
|
59
59
|
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
|
|
60
60
|
const decolonizepalestineScraper = await decolonizepalestine();
|
|
61
|
-
WebScraper.combineResults( "./dataset/combined", [
|
|
61
|
+
await WebScraper.combineResults( "./dataset/combined", [
|
|
62
62
|
khameneiIrFreePalestineTagScraper,
|
|
63
63
|
decolonizepalestineScraper
|
|
64
64
|
] );
|
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
|
@@ -155,9 +155,14 @@ class WebScraper
|
|
|
155
155
|
{
|
|
156
156
|
const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
|
|
157
157
|
let writeStreamMeta
|
|
158
|
+
|
|
159
|
+
// Add error handlers
|
|
160
|
+
writeStreamSimple.on( "error", ( err ) => { return console.error( "Error writing JSONL:", err ) });
|
|
161
|
+
|
|
158
162
|
if ( this.includeMetadata )
|
|
159
163
|
{
|
|
160
164
|
writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPathWithMeta ) );
|
|
165
|
+
writeStreamMeta.on( "error", ( err ) => { return console.error( "Error writing metadata JSONL:", err ) });
|
|
161
166
|
}
|
|
162
167
|
for ( const content of this.allProcessedContent )
|
|
163
168
|
{
|
|
@@ -171,6 +176,7 @@ class WebScraper
|
|
|
171
176
|
if ( this.includeMetadata )
|
|
172
177
|
{
|
|
173
178
|
writeStreamMeta.end();
|
|
179
|
+
console.log( `Created JSONL file at: ${this.jsonlOutputPathWithMeta}` );
|
|
174
180
|
}
|
|
175
181
|
console.log( `Created JSONL file at: ${this.jsonlOutputPath}` );
|
|
176
182
|
}
|
|
@@ -179,6 +185,7 @@ class WebScraper
|
|
|
179
185
|
{
|
|
180
186
|
// Create simple version
|
|
181
187
|
const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
|
|
188
|
+
writeStreamSimple.on( "error", ( err ) => { return console.error( "Error writing CSV:", err ) });
|
|
182
189
|
writeStreamSimple.write( "text\n" );
|
|
183
190
|
|
|
184
191
|
// Create metadata version if requested
|
|
@@ -186,6 +193,7 @@ class WebScraper
|
|
|
186
193
|
if ( this.includeMetadata )
|
|
187
194
|
{
|
|
188
195
|
writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.csvOutputPathWithMeta ) );
|
|
196
|
+
writeStreamMeta.on( "error", ( err ) => { return console.error( "Error writing metadata CSV:", err ) });
|
|
189
197
|
}
|
|
190
198
|
|
|
191
199
|
if ( this.includeMetadata )
|
|
@@ -365,8 +373,14 @@ class WebScraper
|
|
|
365
373
|
fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
|
|
366
374
|
}
|
|
367
375
|
|
|
368
|
-
static
|
|
376
|
+
static sleep ( ms )
|
|
377
|
+
{
|
|
378
|
+
return new Promise( resolve => { return setTimeout( resolve, ms ) });
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
static async combineResults ( outputPath, websites )
|
|
369
382
|
{
|
|
383
|
+
await WebScraper.sleep( 1000 );
|
|
370
384
|
const fullOutputPath = path.join( __dirname, outputPath );
|
|
371
385
|
|
|
372
386
|
// Create output directories
|
|
@@ -375,8 +389,10 @@ class WebScraper
|
|
|
375
389
|
fs.mkdirSync( path.join( fullOutputPath, "texts_with_metadata" ), { recursive: true });
|
|
376
390
|
|
|
377
391
|
// Combine regular JSONL files
|
|
378
|
-
const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) )
|
|
379
|
-
|
|
392
|
+
const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) )
|
|
393
|
+
.on( "error", ( err ) => { return console.error( "Error combining JSONL:", err ) });
|
|
394
|
+
const jsonlMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.jsonl" ) )
|
|
395
|
+
.on( "error", ( err ) => { return console.error( "Error combining metadata JSONL:", err ) });
|
|
380
396
|
const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
|
|
381
397
|
const csvMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.csv" ) );
|
|
382
398
|
|
|
@@ -449,7 +465,6 @@ class WebScraper
|
|
|
449
465
|
}
|
|
450
466
|
}
|
|
451
467
|
}
|
|
452
|
-
|
|
453
468
|
}
|
|
454
469
|
|
|
455
470
|
module.exports = WebScraper;
|