clean-web-scraper 4.1.7 → 4.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/example-usage.js +13 -10
- package/main.js +32 -43
- package/package.json +1 -1
package/example-usage.js
CHANGED
@@ -50,6 +50,7 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
50
50
|
baseURL: "https://english.khamenei.ir/news",
|
51
51
|
startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
|
52
52
|
maxDepth: 1,
|
53
|
+
maxArticles: 2,
|
53
54
|
exactExcludeList: [
|
54
55
|
"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
|
55
56
|
"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100"
|
@@ -72,6 +73,7 @@ async function khameneiIrPalestineSpecialPage ( enable )
|
|
72
73
|
baseURL: "https://english.khamenei.ir/news",
|
73
74
|
startURL: "https://english.khamenei.ir/palestine-special-page",
|
74
75
|
maxDepth: 2,
|
76
|
+
maxArticles: 2,
|
75
77
|
exactExcludeList: [
|
76
78
|
"https://english.khamenei.ir/palestine-special-page/"
|
77
79
|
],
|
@@ -101,6 +103,7 @@ async function decolonizepalestine ( enable )
|
|
101
103
|
"https://decolonizepalestine.com/rainbow-washing",
|
102
104
|
"https://decolonizepalestine.com/"
|
103
105
|
],
|
106
|
+
maxArticles: 2,
|
104
107
|
scrapResultPath: "./dataset/decolonizepalestine/website",
|
105
108
|
jsonlOutputPath: "./dataset/decolonizepalestine/train.jsonl",
|
106
109
|
textOutputPath: "./dataset/decolonizepalestine/texts",
|
@@ -307,7 +310,7 @@ async function palestineremembered ( enable )
|
|
307
310
|
void async function main ()
|
308
311
|
{
|
309
312
|
// const palianswersScraper = await palianswers( true );
|
310
|
-
|
313
|
+
const decolonizepalestineScraper = await decolonizepalestine( true );
|
311
314
|
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( true );
|
312
315
|
// const khameneiIrPalestineSpecialPageScraper = await khameneiIrPalestineSpecialPage( true );
|
313
316
|
// const electronicintifadaScraper = await electronicintifada( true );
|
@@ -316,13 +319,13 @@ void async function main ()
|
|
316
319
|
// const bdsmovementScraper = await bdsmovement( false );
|
317
320
|
// const palestinerememberedScraper = await palestineremembered( false );
|
318
321
|
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
322
|
+
await WebScraper.combineResults( "./dataset/combined", [
|
323
|
+
// palianswersScraper,
|
324
|
+
decolonizepalestineScraper,
|
325
|
+
khameneiIrFreePalestineTagScraper,
|
326
|
+
// khameneiIrPalestineSpecialPageScraper,
|
327
|
+
// electronicintifadaScraper,
|
328
|
+
// standWithPalestineScraper,
|
329
|
+
// mondoweisScraper
|
330
|
+
] );
|
328
331
|
}();
|
package/main.js
CHANGED
@@ -1,8 +1,9 @@
|
|
1
|
+
const process = require( "node:process" );
|
2
|
+
const fs = require( "fs" );
|
3
|
+
const path = require( "path" );
|
1
4
|
const axios = require( "axios" );
|
2
5
|
const { JSDOM } = require( "jsdom" );
|
3
6
|
const { Readability } = require( "@mozilla/readability" );
|
4
|
-
const fs = require( "fs" );
|
5
|
-
const path = require( "path" );
|
6
7
|
const { connect } = require( "puppeteer-real-browser" );
|
7
8
|
|
8
9
|
class WebScraper
|
@@ -334,7 +335,7 @@ class WebScraper
|
|
334
335
|
{
|
335
336
|
urlPath = urlPath.slice( 0, -1 );
|
336
337
|
}
|
337
|
-
const filePath = path.join(
|
338
|
+
const filePath = path.join( this.scrapResultPath, urlPath );
|
338
339
|
const dir = path.dirname( filePath );
|
339
340
|
|
340
341
|
fs.mkdirSync( dir, { recursive: true });
|
@@ -347,14 +348,14 @@ class WebScraper
|
|
347
348
|
|
348
349
|
createJSONLFile ()
|
349
350
|
{
|
350
|
-
const writeStreamSimple = fs.createWriteStream(
|
351
|
+
const writeStreamSimple = fs.createWriteStream( this.jsonlOutputPath );
|
351
352
|
writeStreamSimple.on( "error", err =>
|
352
353
|
{ return console.error( "Error writing JSONL:", err ) });
|
353
354
|
|
354
355
|
let writeStreamMeta;
|
355
356
|
if ( this.includeMetadata )
|
356
357
|
{
|
357
|
-
writeStreamMeta = fs.createWriteStream( path.join(
|
358
|
+
writeStreamMeta = fs.createWriteStream( path.join( process.cwd(), this.jsonlOutputPathWithMeta ) );
|
358
359
|
writeStreamMeta.on( "error", ( err ) => { return console.error( "Error writing metadata JSONL:", err ) });
|
359
360
|
}
|
360
361
|
for ( const content of this.allProcessedContent )
|
@@ -377,7 +378,7 @@ class WebScraper
|
|
377
378
|
createCSVFile ()
|
378
379
|
{
|
379
380
|
// Create simple version
|
380
|
-
const writeStreamSimple = fs.createWriteStream( path.join(
|
381
|
+
const writeStreamSimple = fs.createWriteStream( path.join( process.cwd(), this.csvOutputPath ) );
|
381
382
|
writeStreamSimple.on( "error", ( err ) => { return console.error( "Error writing CSV:", err ) });
|
382
383
|
writeStreamSimple.write( "text\n" );
|
383
384
|
|
@@ -385,7 +386,7 @@ class WebScraper
|
|
385
386
|
let writeStreamMeta;
|
386
387
|
if ( this.includeMetadata )
|
387
388
|
{
|
388
|
-
writeStreamMeta = fs.createWriteStream( path.join(
|
389
|
+
writeStreamMeta = fs.createWriteStream( path.join( process.cwd(), this.csvOutputPathWithMeta ) );
|
389
390
|
writeStreamMeta.on( "error", ( err ) => { return console.error( "Error writing metadata CSV:", err ) });
|
390
391
|
}
|
391
392
|
|
@@ -427,12 +428,12 @@ class WebScraper
|
|
427
428
|
|
428
429
|
saveNumberedTextFiles ()
|
429
430
|
{
|
430
|
-
const baseTextPath =
|
431
|
+
const baseTextPath = this.textOutputPath;
|
431
432
|
|
432
433
|
let metaTextPath = null;
|
433
434
|
if ( this.includeMetadata )
|
434
435
|
{
|
435
|
-
metaTextPath = path.join(
|
436
|
+
metaTextPath = path.join( process.cwd(), this.textOutputPathWithMeta );
|
436
437
|
fs.mkdirSync( metaTextPath, { recursive: true });
|
437
438
|
}
|
438
439
|
|
@@ -672,13 +673,13 @@ class WebScraper
|
|
672
673
|
createOutputDirectory ()
|
673
674
|
{
|
674
675
|
const paths = [
|
675
|
-
|
676
|
-
|
677
|
-
|
678
|
-
|
679
|
-
|
680
|
-
|
681
|
-
|
676
|
+
this.scrapResultPath,
|
677
|
+
this.textOutputPath,
|
678
|
+
this.textOutputPathWithMeta,
|
679
|
+
this.csvOutputPath,
|
680
|
+
this.csvOutputPathWithMeta,
|
681
|
+
this.jsonlOutputPath,
|
682
|
+
this.jsonlOutputPathWithMeta
|
682
683
|
];
|
683
684
|
for ( const p of paths )
|
684
685
|
{
|
@@ -688,9 +689,9 @@ class WebScraper
|
|
688
689
|
}
|
689
690
|
}
|
690
691
|
// Recreate directories needed for output
|
691
|
-
this.ensureDirectory( path.join(
|
692
|
-
this.ensureDirectory( path.join(
|
693
|
-
this.ensureDirectory( path.join(
|
692
|
+
this.ensureDirectory( path.join( process.cwd(), this.scrapResultPath ) );
|
693
|
+
this.ensureDirectory( path.join( process.cwd(), this.textOutputPath ) );
|
694
|
+
this.ensureDirectory( path.join( process.cwd(), this.textOutputPathWithMeta ) );
|
694
695
|
}
|
695
696
|
|
696
697
|
ensureDirectory ( dirPath )
|
@@ -709,7 +710,7 @@ class WebScraper
|
|
709
710
|
static async combineResults ( outputPath, websites )
|
710
711
|
{
|
711
712
|
await WebScraper.sleep( 1000 );
|
712
|
-
const fullOutputPath =
|
713
|
+
const fullOutputPath = outputPath;
|
713
714
|
WebScraper.createCombinedDirectories( fullOutputPath );
|
714
715
|
WebScraper.combineJSONLFiles( fullOutputPath, websites );
|
715
716
|
WebScraper.combineCSVFiles( fullOutputPath, websites );
|
@@ -742,20 +743,16 @@ class WebScraper
|
|
742
743
|
|
743
744
|
for ( const website of websites )
|
744
745
|
{
|
745
|
-
const jsonlContent = fs.readFileSync(
|
746
|
-
|
747
|
-
"utf-8"
|
748
|
-
);
|
746
|
+
const jsonlContent = fs.readFileSync( website.jsonlOutputPath, "utf-8" );
|
747
|
+
|
749
748
|
if ( jsonlContent )
|
750
749
|
{
|
751
750
|
jsonlOutput.write( jsonlContent );
|
752
751
|
}
|
753
752
|
if ( website.includeMetadata )
|
754
753
|
{
|
755
|
-
const jsonlMetaContent = fs.readFileSync(
|
756
|
-
|
757
|
-
"utf-8"
|
758
|
-
);
|
754
|
+
const jsonlMetaContent = fs.readFileSync( website.jsonlOutputPathWithMeta, "utf-8" );
|
755
|
+
|
759
756
|
if ( jsonlMetaContent )
|
760
757
|
{
|
761
758
|
jsonlMetaOutput.write( jsonlMetaContent );
|
@@ -782,7 +779,7 @@ class WebScraper
|
|
782
779
|
|
783
780
|
for ( const website of websites )
|
784
781
|
{
|
785
|
-
const csvContent = fs.readFileSync(
|
782
|
+
const csvContent = fs.readFileSync( website.csvOutputPath, "utf-8" )
|
786
783
|
.split( "\n" )
|
787
784
|
.slice( 1 )
|
788
785
|
.filter( line => { return line.trim() });
|
@@ -792,11 +789,8 @@ class WebScraper
|
|
792
789
|
}
|
793
790
|
if ( website.includeMetadata )
|
794
791
|
{
|
795
|
-
const csvMetaContent = fs
|
796
|
-
|
797
|
-
path.join( __dirname, website.csvOutputPathWithMeta ),
|
798
|
-
"utf-8"
|
799
|
-
)
|
792
|
+
const csvMetaContent = fs.readFileSync( website.csvOutputPathWithMeta, "utf-8" )
|
793
|
+
|
800
794
|
.split( "\n" )
|
801
795
|
.slice( 1 )
|
802
796
|
.filter( line => { return line.trim() });
|
@@ -815,13 +809,11 @@ class WebScraper
|
|
815
809
|
let textFileCounter = 1;
|
816
810
|
for ( const website of websites )
|
817
811
|
{
|
818
|
-
const textFiles = fs.readdirSync( path.join(
|
812
|
+
const textFiles = fs.readdirSync( path.join( process.cwd(), website.textOutputPath ) );
|
819
813
|
for ( const file of textFiles )
|
820
814
|
{
|
821
|
-
const content = fs.readFileSync(
|
822
|
-
|
823
|
-
"utf-8"
|
824
|
-
);
|
815
|
+
const content = fs.readFileSync( path.join( website.textOutputPath, file ), "utf-8" );
|
816
|
+
|
825
817
|
fs.writeFileSync(
|
826
818
|
path.join( fullOutputPath, "texts", `${textFileCounter}.txt` ),
|
827
819
|
content,
|
@@ -829,10 +821,7 @@ class WebScraper
|
|
829
821
|
);
|
830
822
|
if ( website.includeMetadata )
|
831
823
|
{
|
832
|
-
const metaContent = fs.readFileSync(
|
833
|
-
path.join( __dirname, website.textOutputPathWithMeta, file ),
|
834
|
-
"utf-8"
|
835
|
-
);
|
824
|
+
const metaContent = fs.readFileSync( path.join( website.textOutputPathWithMeta, file ), "utf-8" );
|
836
825
|
fs.writeFileSync(
|
837
826
|
path.join(
|
838
827
|
fullOutputPath,
|