clean-web-scraper 4.2.0 → 4.2.1
This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/example-usage.js +13 -10
- package/main.js +21 -33
- package/package.json +1 -1
package/example-usage.js
CHANGED
@@ -50,6 +50,7 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
50
50
|
baseURL: "https://english.khamenei.ir/news",
|
51
51
|
startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
|
52
52
|
maxDepth: 1,
|
53
|
+
maxArticles: 2,
|
53
54
|
exactExcludeList: [
|
54
55
|
"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
|
55
56
|
"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100"
|
@@ -72,6 +73,7 @@ async function khameneiIrPalestineSpecialPage ( enable )
|
|
72
73
|
baseURL: "https://english.khamenei.ir/news",
|
73
74
|
startURL: "https://english.khamenei.ir/palestine-special-page",
|
74
75
|
maxDepth: 2,
|
76
|
+
maxArticles: 2,
|
75
77
|
exactExcludeList: [
|
76
78
|
"https://english.khamenei.ir/palestine-special-page/"
|
77
79
|
],
|
@@ -101,6 +103,7 @@ async function decolonizepalestine ( enable )
|
|
101
103
|
"https://decolonizepalestine.com/rainbow-washing",
|
102
104
|
"https://decolonizepalestine.com/"
|
103
105
|
],
|
106
|
+
maxArticles: 2,
|
104
107
|
scrapResultPath: "./dataset/decolonizepalestine/website",
|
105
108
|
jsonlOutputPath: "./dataset/decolonizepalestine/train.jsonl",
|
106
109
|
textOutputPath: "./dataset/decolonizepalestine/texts",
|
@@ -307,7 +310,7 @@ async function palestineremembered ( enable )
|
|
307
310
|
void async function main ()
|
308
311
|
{
|
309
312
|
// const palianswersScraper = await palianswers( true );
|
310
|
-
|
313
|
+
const decolonizepalestineScraper = await decolonizepalestine( true );
|
311
314
|
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( true );
|
312
315
|
// const khameneiIrPalestineSpecialPageScraper = await khameneiIrPalestineSpecialPage( true );
|
313
316
|
// const electronicintifadaScraper = await electronicintifada( true );
|
@@ -316,13 +319,13 @@ void async function main ()
|
|
316
319
|
// const bdsmovementScraper = await bdsmovement( false );
|
317
320
|
// const palestinerememberedScraper = await palestineremembered( false );
|
318
321
|
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
322
|
+
await WebScraper.combineResults( "./dataset/combined", [
|
323
|
+
// palianswersScraper,
|
324
|
+
decolonizepalestineScraper,
|
325
|
+
khameneiIrFreePalestineTagScraper,
|
326
|
+
// khameneiIrPalestineSpecialPageScraper,
|
327
|
+
// electronicintifadaScraper,
|
328
|
+
// standWithPalestineScraper,
|
329
|
+
// mondoweisScraper
|
330
|
+
] );
|
328
331
|
}();
|
package/main.js
CHANGED
@@ -335,7 +335,7 @@ class WebScraper
|
|
335
335
|
{
|
336
336
|
urlPath = urlPath.slice( 0, -1 );
|
337
337
|
}
|
338
|
-
const filePath = path.join(
|
338
|
+
const filePath = path.join( this.scrapResultPath, urlPath );
|
339
339
|
const dir = path.dirname( filePath );
|
340
340
|
|
341
341
|
fs.mkdirSync( dir, { recursive: true });
|
@@ -348,7 +348,7 @@ class WebScraper
|
|
348
348
|
|
349
349
|
createJSONLFile ()
|
350
350
|
{
|
351
|
-
const writeStreamSimple = fs.createWriteStream(
|
351
|
+
const writeStreamSimple = fs.createWriteStream( this.jsonlOutputPath );
|
352
352
|
writeStreamSimple.on( "error", err =>
|
353
353
|
{ return console.error( "Error writing JSONL:", err ) });
|
354
354
|
|
@@ -428,7 +428,7 @@ class WebScraper
|
|
428
428
|
|
429
429
|
saveNumberedTextFiles ()
|
430
430
|
{
|
431
|
-
const baseTextPath =
|
431
|
+
const baseTextPath = this.textOutputPath;
|
432
432
|
|
433
433
|
let metaTextPath = null;
|
434
434
|
if ( this.includeMetadata )
|
@@ -673,13 +673,13 @@ class WebScraper
|
|
673
673
|
createOutputDirectory ()
|
674
674
|
{
|
675
675
|
const paths = [
|
676
|
-
|
677
|
-
|
678
|
-
|
679
|
-
|
680
|
-
|
681
|
-
|
682
|
-
|
676
|
+
this.scrapResultPath,
|
677
|
+
this.textOutputPath,
|
678
|
+
this.textOutputPathWithMeta,
|
679
|
+
this.csvOutputPath,
|
680
|
+
this.csvOutputPathWithMeta,
|
681
|
+
this.jsonlOutputPath,
|
682
|
+
this.jsonlOutputPathWithMeta
|
683
683
|
];
|
684
684
|
for ( const p of paths )
|
685
685
|
{
|
@@ -710,7 +710,7 @@ class WebScraper
|
|
710
710
|
static async combineResults ( outputPath, websites )
|
711
711
|
{
|
712
712
|
await WebScraper.sleep( 1000 );
|
713
|
-
const fullOutputPath =
|
713
|
+
const fullOutputPath = outputPath;
|
714
714
|
WebScraper.createCombinedDirectories( fullOutputPath );
|
715
715
|
WebScraper.combineJSONLFiles( fullOutputPath, websites );
|
716
716
|
WebScraper.combineCSVFiles( fullOutputPath, websites );
|
@@ -743,20 +743,16 @@ class WebScraper
|
|
743
743
|
|
744
744
|
for ( const website of websites )
|
745
745
|
{
|
746
|
-
const jsonlContent = fs.readFileSync(
|
747
|
-
|
748
|
-
"utf-8"
|
749
|
-
);
|
746
|
+
const jsonlContent = fs.readFileSync( website.jsonlOutputPath, "utf-8" );
|
747
|
+
|
750
748
|
if ( jsonlContent )
|
751
749
|
{
|
752
750
|
jsonlOutput.write( jsonlContent );
|
753
751
|
}
|
754
752
|
if ( website.includeMetadata )
|
755
753
|
{
|
756
|
-
const jsonlMetaContent = fs.readFileSync(
|
757
|
-
|
758
|
-
"utf-8"
|
759
|
-
);
|
754
|
+
const jsonlMetaContent = fs.readFileSync( website.jsonlOutputPathWithMeta, "utf-8" );
|
755
|
+
|
760
756
|
if ( jsonlMetaContent )
|
761
757
|
{
|
762
758
|
jsonlMetaOutput.write( jsonlMetaContent );
|
@@ -783,7 +779,7 @@ class WebScraper
|
|
783
779
|
|
784
780
|
for ( const website of websites )
|
785
781
|
{
|
786
|
-
const csvContent = fs.readFileSync(
|
782
|
+
const csvContent = fs.readFileSync( website.csvOutputPath, "utf-8" )
|
787
783
|
.split( "\n" )
|
788
784
|
.slice( 1 )
|
789
785
|
.filter( line => { return line.trim() });
|
@@ -793,11 +789,8 @@ class WebScraper
|
|
793
789
|
}
|
794
790
|
if ( website.includeMetadata )
|
795
791
|
{
|
796
|
-
const csvMetaContent = fs
|
797
|
-
|
798
|
-
path.join( process.cwd(), website.csvOutputPathWithMeta ),
|
799
|
-
"utf-8"
|
800
|
-
)
|
792
|
+
const csvMetaContent = fs.readFileSync( website.csvOutputPathWithMeta, "utf-8" )
|
793
|
+
|
801
794
|
.split( "\n" )
|
802
795
|
.slice( 1 )
|
803
796
|
.filter( line => { return line.trim() });
|
@@ -819,10 +812,8 @@ class WebScraper
|
|
819
812
|
const textFiles = fs.readdirSync( path.join( process.cwd(), website.textOutputPath ) );
|
820
813
|
for ( const file of textFiles )
|
821
814
|
{
|
822
|
-
const content = fs.readFileSync(
|
823
|
-
|
824
|
-
"utf-8"
|
825
|
-
);
|
815
|
+
const content = fs.readFileSync( path.join( website.textOutputPath, file ), "utf-8" );
|
816
|
+
|
826
817
|
fs.writeFileSync(
|
827
818
|
path.join( fullOutputPath, "texts", `${textFileCounter}.txt` ),
|
828
819
|
content,
|
@@ -830,10 +821,7 @@ class WebScraper
|
|
830
821
|
);
|
831
822
|
if ( website.includeMetadata )
|
832
823
|
{
|
833
|
-
const metaContent = fs.readFileSync(
|
834
|
-
path.join( process.cwd(), website.textOutputPathWithMeta, file ),
|
835
|
-
"utf-8"
|
836
|
-
);
|
824
|
+
const metaContent = fs.readFileSync( path.join( website.textOutputPathWithMeta, file ), "utf-8" );
|
837
825
|
fs.writeFileSync(
|
838
826
|
path.join(
|
839
827
|
fullOutputPath,
|