clean-web-scraper 4.1.7 → 4.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/main.js +28 -27
- package/package.json +1 -1
package/main.js
CHANGED
@@ -1,8 +1,9 @@
+const process = require( "node:process" );
+const fs = require( "fs" );
+const path = require( "path" );
 const axios = require( "axios" );
 const { JSDOM } = require( "jsdom" );
 const { Readability } = require( "@mozilla/readability" );
-const fs = require( "fs" );
-const path = require( "path" );
 const { connect } = require( "puppeteer-real-browser" );

 class WebScraper
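The import block sets up the rest of the release: the built-ins `fs` and `path` move to the top of the module, and `process` is now required explicitly through the `node:` scheme instead of being used as a global. A minimal sketch of what that import provides (the comparison is illustrative):

```js
// `node:process` resolves unambiguously to the built-in module, so it cannot
// be shadowed by a userland package named "process".
const process = require( "node:process" );

// It is the same object as the implicit global, so call sites are unchanged:
console.log( process === globalThis.process ); // true
console.log( process.cwd() );                  // directory Node was launched from
```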
@@ -334,7 +335,7 @@ class WebScraper
 {
 urlPath = urlPath.slice( 0, -1 );
 }
-const filePath = path.join(
+const filePath = path.join( process.cwd(), this.scrapResultPath, urlPath );
 const dir = path.dirname( filePath );

 fs.mkdirSync( dir, { recursive: true });
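This hunk establishes the pattern repeated through the rest of the diff: every output location built from a relative fragment is now anchored to `process.cwd()`, so files land relative to wherever the scraper is launched rather than wherever the package happens to be installed. A minimal sketch of the resolution behavior, with example paths assumed (they are not taken from the package):

```js
const path = require( "node:path" );
const process = require( "node:process" );

// Assume Node was launched from /home/user/project and scrapResultPath
// is the relative fragment "scrap-result".
const filePath = path.join( process.cwd(), "scrap-result", "docs/intro" );
// -> /home/user/project/scrap-result/docs/intro

// For relative fragments this matches path.resolve():
path.resolve( "scrap-result", "docs/intro" ); // same result

// The two diverge on absolute inputs: join() concatenates, resolve() restarts.
path.join( process.cwd(), "/tmp/out" ); // /home/user/project/tmp/out
path.resolve( "/tmp/out" );             // /tmp/out
```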
@@ -347,14 +348,14 @@ class WebScraper

 createJSONLFile ()
 {
-const writeStreamSimple = fs.createWriteStream( path.join(
+const writeStreamSimple = fs.createWriteStream( path.join( process.cwd(), this.jsonlOutputPath ) );
 writeStreamSimple.on( "error", err =>
 { return console.error( "Error writing JSONL:", err ) });

 let writeStreamMeta;
 if ( this.includeMetadata )
 {
-writeStreamMeta = fs.createWriteStream( path.join(
+writeStreamMeta = fs.createWriteStream( path.join( process.cwd(), this.jsonlOutputPathWithMeta ) );
 writeStreamMeta.on( "error", ( err ) => { return console.error( "Error writing metadata JSONL:", err ) });
 }
 for ( const content of this.allProcessedContent )
@@ -377,7 +378,7 @@ class WebScraper
 createCSVFile ()
 {
 // Create simple version
-const writeStreamSimple = fs.createWriteStream( path.join(
+const writeStreamSimple = fs.createWriteStream( path.join( process.cwd(), this.csvOutputPath ) );
 writeStreamSimple.on( "error", ( err ) => { return console.error( "Error writing CSV:", err ) });
 writeStreamSimple.write( "text\n" );

@@ -385,7 +386,7 @@ class WebScraper
 let writeStreamMeta;
 if ( this.includeMetadata )
 {
-writeStreamMeta = fs.createWriteStream( path.join(
+writeStreamMeta = fs.createWriteStream( path.join( process.cwd(), this.csvOutputPathWithMeta ) );
 writeStreamMeta.on( "error", ( err ) => { return console.error( "Error writing metadata CSV:", err ) });
 }

@@ -427,12 +428,12 @@ class WebScraper

 saveNumberedTextFiles ()
 {
-const baseTextPath = path.join(
+const baseTextPath = path.join( process.cwd(), this.textOutputPath );

 let metaTextPath = null;
 if ( this.includeMetadata )
 {
-metaTextPath = path.join(
+metaTextPath = path.join( process.cwd(), this.textOutputPathWithMeta );
 fs.mkdirSync( metaTextPath, { recursive: true });
 }

@@ -672,13 +673,13 @@ class WebScraper
 createOutputDirectory ()
 {
 const paths = [
-path.join(
-path.join(
-path.join(
-path.join(
-path.join(
-path.join(
-path.join(
+path.join( process.cwd(), this.scrapResultPath ),
+path.join( process.cwd(), this.textOutputPath ),
+path.join( process.cwd(), this.textOutputPathWithMeta ),
+path.join( process.cwd(), this.csvOutputPath ),
+path.join( process.cwd(), this.csvOutputPathWithMeta ),
+path.join( process.cwd(), this.jsonlOutputPath ),
+path.join( process.cwd(), this.jsonlOutputPathWithMeta )
 ];
 for ( const p of paths )
 {
@@ -688,9 +689,9 @@ class WebScraper
 }
 }
 // Recreate directories needed for output
-this.ensureDirectory( path.join(
-this.ensureDirectory( path.join(
-this.ensureDirectory( path.join(
+this.ensureDirectory( path.join( process.cwd(), this.scrapResultPath ) );
+this.ensureDirectory( path.join( process.cwd(), this.textOutputPath ) );
+this.ensureDirectory( path.join( process.cwd(), this.textOutputPathWithMeta ) );
 }

 ensureDirectory ( dirPath )
@@ -709,7 +710,7 @@ class WebScraper
 static async combineResults ( outputPath, websites )
 {
 await WebScraper.sleep( 1000 );
-const fullOutputPath = path.join(
+const fullOutputPath = path.join( process.cwd(), outputPath );
 WebScraper.createCombinedDirectories( fullOutputPath );
 WebScraper.combineJSONLFiles( fullOutputPath, websites );
 WebScraper.combineCSVFiles( fullOutputPath, websites );
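Because `combineResults` now resolves `outputPath`, and every per-site `*OutputPath` in the hunks below, against the current working directory, combining has to run from the same directory the individual scrapes wrote into. A hypothetical usage sketch; `siteA`, `siteB`, and the `"combined"` path are assumptions, only the `combineResults( outputPath, websites )` signature appears in this diff:

```js
// siteA and siteB are previously-run WebScraper instances whose relative
// jsonlOutputPath / csvOutputPath / textOutputPath fields point at the
// files written during scraping.
await WebScraper.combineResults( "combined", [ siteA, siteB ] );
// reads:  <cwd>/<site.jsonlOutputPath>, <cwd>/<site.csvOutputPath>, ...
// writes: <cwd>/combined/...
```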
@@ -743,7 +744,7 @@ class WebScraper
 for ( const website of websites )
 {
 const jsonlContent = fs.readFileSync(
-path.join(
+path.join( process.cwd(), website.jsonlOutputPath ),
 "utf-8"
 );
 if ( jsonlContent )
@@ -753,7 +754,7 @@ class WebScraper
 if ( website.includeMetadata )
 {
 const jsonlMetaContent = fs.readFileSync(
-path.join(
+path.join( process.cwd(), website.jsonlOutputPathWithMeta ),
 "utf-8"
 );
 if ( jsonlMetaContent )
@@ -782,7 +783,7 @@ class WebScraper

 for ( const website of websites )
 {
-const csvContent = fs.readFileSync( path.join(
+const csvContent = fs.readFileSync( path.join( process.cwd(), website.csvOutputPath ), "utf-8" )
 .split( "\n" )
 .slice( 1 )
 .filter( line => { return line.trim() });
@@ -794,7 +795,7 @@ class WebScraper
 {
 const csvMetaContent = fs
 .readFileSync(
-path.join(
+path.join( process.cwd(), website.csvOutputPathWithMeta ),
 "utf-8"
 )
 .split( "\n" )
@@ -815,11 +816,11 @@ class WebScraper
 let textFileCounter = 1;
 for ( const website of websites )
 {
-const textFiles = fs.readdirSync( path.join(
+const textFiles = fs.readdirSync( path.join( process.cwd(), website.textOutputPath ) );
 for ( const file of textFiles )
 {
 const content = fs.readFileSync(
-path.join(
+path.join( process.cwd(), website.textOutputPath, file ),
 "utf-8"
 );
 fs.writeFileSync(
@@ -830,7 +831,7 @@ class WebScraper
 if ( website.includeMetadata )
 {
 const metaContent = fs.readFileSync(
-path.join(
+path.join( process.cwd(), website.textOutputPathWithMeta, file ),
 "utf-8"
 );
 fs.writeFileSync(