clean-web-scraper 3.1.0 → 3.2.0
This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- package/example-usage.js +3 -2
- package/package.json +1 -1
- package/src/WebScraper.js +17 -3
package/example-usage.js
CHANGED
|
@@ -22,7 +22,7 @@ async function khameneiIrFreePalestineTag ()
|
|
|
22
22
|
includeMetadata: true,
|
|
23
23
|
metadataFields: ["title", "description", "author", "lastModified", "language"]
|
|
24
24
|
});
|
|
25
|
-
await scraper.start();
|
|
25
|
+
// await scraper.start();
|
|
26
26
|
return scraper;
|
|
27
27
|
}
|
|
28
28
|
|
|
@@ -50,7 +50,7 @@ async function decolonizepalestine ()
|
|
|
50
50
|
includeMetadata: true,
|
|
51
51
|
metadataFields: ["title", "description", "author", "lastModified", "language"]
|
|
52
52
|
});
|
|
53
|
-
await scraper.start();
|
|
53
|
+
// await scraper.start();
|
|
54
54
|
return scraper;
|
|
55
55
|
}
|
|
56
56
|
|
|
@@ -58,6 +58,7 @@ void async function main ()
|
|
|
58
58
|
{
|
|
59
59
|
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
|
|
60
60
|
const decolonizepalestineScraper = await decolonizepalestine();
|
|
61
|
+
await WebScraper.sleep( 1000 ); // Sleeps for 1 second
|
|
61
62
|
WebScraper.combineResults( "./dataset/combined", [
|
|
62
63
|
khameneiIrFreePalestineTagScraper,
|
|
63
64
|
decolonizepalestineScraper
|
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
|
@@ -155,9 +155,14 @@ class WebScraper
|
|
|
155
155
|
{
|
|
156
156
|
const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
|
|
157
157
|
let writeStreamMeta
|
|
158
|
+
|
|
159
|
+
// Add error handlers
|
|
160
|
+
writeStreamSimple.on( "error", ( err ) => { return console.error( "Error writing JSONL:", err ) });
|
|
161
|
+
|
|
158
162
|
if ( this.includeMetadata )
|
|
159
163
|
{
|
|
160
164
|
writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPathWithMeta ) );
|
|
165
|
+
writeStreamMeta.on( "error", ( err ) => { return console.error( "Error writing metadata JSONL:", err ) });
|
|
161
166
|
}
|
|
162
167
|
for ( const content of this.allProcessedContent )
|
|
163
168
|
{
|
|
@@ -171,6 +176,7 @@ class WebScraper
|
|
|
171
176
|
if ( this.includeMetadata )
|
|
172
177
|
{
|
|
173
178
|
writeStreamMeta.end();
|
|
179
|
+
console.log( `Created JSONL file at: ${this.jsonlOutputPathWithMeta}` );
|
|
174
180
|
}
|
|
175
181
|
console.log( `Created JSONL file at: ${this.jsonlOutputPath}` );
|
|
176
182
|
}
|
|
@@ -179,6 +185,7 @@ class WebScraper
|
|
|
179
185
|
{
|
|
180
186
|
// Create simple version
|
|
181
187
|
const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
|
|
188
|
+
writeStreamSimple.on( "error", ( err ) => { return console.error( "Error writing CSV:", err ) });
|
|
182
189
|
writeStreamSimple.write( "text\n" );
|
|
183
190
|
|
|
184
191
|
// Create metadata version if requested
|
|
@@ -186,6 +193,7 @@ class WebScraper
|
|
|
186
193
|
if ( this.includeMetadata )
|
|
187
194
|
{
|
|
188
195
|
writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.csvOutputPathWithMeta ) );
|
|
196
|
+
writeStreamMeta.on( "error", ( err ) => { return console.error( "Error writing metadata CSV:", err ) });
|
|
189
197
|
}
|
|
190
198
|
|
|
191
199
|
if ( this.includeMetadata )
|
|
@@ -365,6 +373,11 @@ class WebScraper
|
|
|
365
373
|
fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
|
|
366
374
|
}
|
|
367
375
|
|
|
376
|
+
static sleep ( ms )
|
|
377
|
+
{
|
|
378
|
+
return new Promise( resolve => { return setTimeout( resolve, ms ) });
|
|
379
|
+
}
|
|
380
|
+
|
|
368
381
|
static combineResults ( outputPath, websites )
|
|
369
382
|
{
|
|
370
383
|
const fullOutputPath = path.join( __dirname, outputPath );
|
|
@@ -375,8 +388,10 @@ class WebScraper
|
|
|
375
388
|
fs.mkdirSync( path.join( fullOutputPath, "texts_with_metadata" ), { recursive: true });
|
|
376
389
|
|
|
377
390
|
// Combine regular JSONL files
|
|
378
|
-
const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) )
|
|
379
|
-
|
|
391
|
+
const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) )
|
|
392
|
+
.on( "error", ( err ) => { return console.error( "Error combining JSONL:", err ) });
|
|
393
|
+
const jsonlMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.jsonl" ) )
|
|
394
|
+
.on( "error", ( err ) => { return console.error( "Error combining metadata JSONL:", err ) });
|
|
380
395
|
const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
|
|
381
396
|
const csvMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.csv" ) );
|
|
382
397
|
|
|
@@ -449,7 +464,6 @@ class WebScraper
|
|
|
449
464
|
}
|
|
450
465
|
}
|
|
451
466
|
}
|
|
452
|
-
|
|
453
467
|
}
|
|
454
468
|
|
|
455
469
|
module.exports = WebScraper;
|