clean-web-scraper 4.1.7 → 4.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/example-usage.js +13 -10
  2. package/main.js +32 -43
  3. package/package.json +1 -1
package/example-usage.js CHANGED
@@ -50,6 +50,7 @@ async function khameneiIrFreePalestineTag ( enable )
 		baseURL: "https://english.khamenei.ir/news",
 		startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
 		maxDepth: 1,
+		maxArticles: 2,
 		exactExcludeList: [
 			"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
 			"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100"
@@ -72,6 +73,7 @@ async function khameneiIrPalestineSpecialPage ( enable )
 		baseURL: "https://english.khamenei.ir/news",
 		startURL: "https://english.khamenei.ir/palestine-special-page",
 		maxDepth: 2,
+		maxArticles: 2,
 		exactExcludeList: [
 			"https://english.khamenei.ir/palestine-special-page/"
 		],
@@ -101,6 +103,7 @@ async function decolonizepalestine ( enable )
 			"https://decolonizepalestine.com/rainbow-washing",
 			"https://decolonizepalestine.com/"
 		],
+		maxArticles: 2,
 		scrapResultPath: "./dataset/decolonizepalestine/website",
 		jsonlOutputPath: "./dataset/decolonizepalestine/train.jsonl",
 		textOutputPath: "./dataset/decolonizepalestine/texts",
@@ -307,7 +310,7 @@ async function palestineremembered ( enable )
 void async function main ()
 {
 	// const palianswersScraper = await palianswers( true );
-	// const decolonizepalestineScraper = await decolonizepalestine( true );
+	const decolonizepalestineScraper = await decolonizepalestine( true );
 	const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( true );
 	// const khameneiIrPalestineSpecialPageScraper = await khameneiIrPalestineSpecialPage( true );
 	// const electronicintifadaScraper = await electronicintifada( true );
@@ -316,13 +319,13 @@ void async function main ()
 	// const bdsmovementScraper = await bdsmovement( false );
 	// const palestinerememberedScraper = await palestineremembered( false );
 
-	// await WebScraper.combineResults( "./dataset/combined", [
-	// 	palianswersScraper,
-	// 	decolonizepalestineScraper,
-	// 	khameneiIrFreePalestineTagScraper,
-	// 	khameneiIrPalestineSpecialPageScraper,
-	// 	electronicintifadaScraper,
-	// 	standWithPalestineScraper,
-	// 	mondoweisScraper
-	// ] );
+	await WebScraper.combineResults( "./dataset/combined", [
+		// palianswersScraper,
+		decolonizepalestineScraper,
+		khameneiIrFreePalestineTagScraper,
+		// khameneiIrPalestineSpecialPageScraper,
+		// electronicintifadaScraper,
+		// standWithPalestineScraper,
+		// mondoweisScraper
+	] );
 }();
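
Note on the new maxArticles option: judging from the configs above, it caps how many articles a scraper collects from a site (set to 2 here, presumably for quick test runs). A minimal usage sketch, assuming main.js exports the WebScraper class and that a start()-style run method exists (neither is shown in this diff):

const WebScraper = require( "clean-web-scraper" );

const scraper = new WebScraper({
	baseURL: "https://example.com/news",  // hypothetical site
	startURL: "https://example.com/news", // hypothetical entry page
	maxDepth: 1,
	maxArticles: 2, // stop after two articles, as in the configs above
	scrapResultPath: "./dataset/example/website",
	jsonlOutputPath: "./dataset/example/train.jsonl",
	textOutputPath: "./dataset/example/texts"
});

void async function ()
{
	await scraper.start(); // assumed entry point; not visible in this diff
}();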
package/main.js CHANGED
@@ -1,8 +1,9 @@
+const process = require( "node:process" );
+const fs = require( "fs" );
+const path = require( "path" );
 const axios = require( "axios" );
 const { JSDOM } = require( "jsdom" );
 const { Readability } = require( "@mozilla/readability" );
-const fs = require( "fs" );
-const path = require( "path" );
 const { connect } = require( "puppeteer-real-browser" );
 
 class WebScraper
@@ -334,7 +335,7 @@ class WebScraper
 		{
 			urlPath = urlPath.slice( 0, -1 );
 		}
-		const filePath = path.join( __dirname, this.scrapResultPath, urlPath );
+		const filePath = path.join( this.scrapResultPath, urlPath );
 		const dir = path.dirname( filePath );
 
 		fs.mkdirSync( dir, { recursive: true });
@@ -347,14 +348,14 @@ class WebScraper
 
 	createJSONLFile ()
 	{
-		const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
+		const writeStreamSimple = fs.createWriteStream( this.jsonlOutputPath );
 		writeStreamSimple.on( "error", err =>
 		{ return console.error( "Error writing JSONL:", err ) });
 
 		let writeStreamMeta;
 		if ( this.includeMetadata )
 		{
-			writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPathWithMeta ) );
+			writeStreamMeta = fs.createWriteStream( path.join( process.cwd(), this.jsonlOutputPathWithMeta ) );
 			writeStreamMeta.on( "error", ( err ) => { return console.error( "Error writing metadata JSONL:", err ) });
 		}
 		for ( const content of this.allProcessedContent )
@@ -377,7 +378,7 @@ class WebScraper
 	createCSVFile ()
 	{
 		// Create simple version
-		const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
+		const writeStreamSimple = fs.createWriteStream( path.join( process.cwd(), this.csvOutputPath ) );
 		writeStreamSimple.on( "error", ( err ) => { return console.error( "Error writing CSV:", err ) });
 		writeStreamSimple.write( "text\n" );
 
@@ -385,7 +386,7 @@ class WebScraper
 		let writeStreamMeta;
 		if ( this.includeMetadata )
 		{
-			writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.csvOutputPathWithMeta ) );
+			writeStreamMeta = fs.createWriteStream( path.join( process.cwd(), this.csvOutputPathWithMeta ) );
 			writeStreamMeta.on( "error", ( err ) => { return console.error( "Error writing metadata CSV:", err ) });
 		}
 
@@ -427,12 +428,12 @@ class WebScraper
 
 	saveNumberedTextFiles ()
 	{
-		const baseTextPath = path.join( __dirname, this.textOutputPath );
+		const baseTextPath = this.textOutputPath;
 
 		let metaTextPath = null;
 		if ( this.includeMetadata )
 		{
-			metaTextPath = path.join( __dirname, this.textOutputPathWithMeta );
+			metaTextPath = path.join( process.cwd(), this.textOutputPathWithMeta );
 			fs.mkdirSync( metaTextPath, { recursive: true });
 		}
 
@@ -672,13 +673,13 @@ class WebScraper
 	createOutputDirectory ()
 	{
 		const paths = [
-			path.join( __dirname, this.scrapResultPath ),
-			path.join( __dirname, this.textOutputPath ),
-			path.join( __dirname, this.textOutputPathWithMeta ),
-			path.join( __dirname, this.csvOutputPath ),
-			path.join( __dirname, this.csvOutputPathWithMeta ),
-			path.join( __dirname, this.jsonlOutputPath ),
-			path.join( __dirname, this.jsonlOutputPathWithMeta )
+			this.scrapResultPath,
+			this.textOutputPath,
+			this.textOutputPathWithMeta,
+			this.csvOutputPath,
+			this.csvOutputPathWithMeta,
+			this.jsonlOutputPath,
+			this.jsonlOutputPathWithMeta
 		];
 		for ( const p of paths )
 		{
@@ -688,9 +689,9 @@ class WebScraper
 			}
 		}
 		// Recreate directories needed for output
-		this.ensureDirectory( path.join( __dirname, this.scrapResultPath ) );
-		this.ensureDirectory( path.join( __dirname, this.textOutputPath ) );
-		this.ensureDirectory( path.join( __dirname, this.textOutputPathWithMeta ) );
+		this.ensureDirectory( path.join( process.cwd(), this.scrapResultPath ) );
+		this.ensureDirectory( path.join( process.cwd(), this.textOutputPath ) );
+		this.ensureDirectory( path.join( process.cwd(), this.textOutputPathWithMeta ) );
 	}
 
 	ensureDirectory ( dirPath )
@@ -709,7 +710,7 @@ class WebScraper
 	static async combineResults ( outputPath, websites )
 	{
 		await WebScraper.sleep( 1000 );
-		const fullOutputPath = path.join( __dirname, outputPath );
+		const fullOutputPath = outputPath;
 		WebScraper.createCombinedDirectories( fullOutputPath );
 		WebScraper.combineJSONLFiles( fullOutputPath, websites );
 		WebScraper.combineCSVFiles( fullOutputPath, websites );
@@ -742,20 +743,16 @@ class WebScraper
 
 		for ( const website of websites )
 		{
-			const jsonlContent = fs.readFileSync(
-				path.join( __dirname, website.jsonlOutputPath ),
-				"utf-8"
-			);
+			const jsonlContent = fs.readFileSync( website.jsonlOutputPath, "utf-8" );
+
 			if ( jsonlContent )
 			{
 				jsonlOutput.write( jsonlContent );
 			}
 			if ( website.includeMetadata )
 			{
-				const jsonlMetaContent = fs.readFileSync(
-					path.join( __dirname, website.jsonlOutputPathWithMeta ),
-					"utf-8"
-				);
+				const jsonlMetaContent = fs.readFileSync( website.jsonlOutputPathWithMeta, "utf-8" );
+
 				if ( jsonlMetaContent )
 				{
 					jsonlMetaOutput.write( jsonlMetaContent );
@@ -782,7 +779,7 @@ class WebScraper
 
 		for ( const website of websites )
 		{
-			const csvContent = fs.readFileSync( path.join( __dirname, website.csvOutputPath ), "utf-8" )
+			const csvContent = fs.readFileSync( website.csvOutputPath, "utf-8" )
 				.split( "\n" )
 				.slice( 1 )
 				.filter( line => { return line.trim() });
@@ -792,11 +789,8 @@ class WebScraper
 			}
 			if ( website.includeMetadata )
 			{
-				const csvMetaContent = fs
-					.readFileSync(
-						path.join( __dirname, website.csvOutputPathWithMeta ),
-						"utf-8"
-					)
+				const csvMetaContent = fs.readFileSync( website.csvOutputPathWithMeta, "utf-8" )
+
 					.split( "\n" )
 					.slice( 1 )
 					.filter( line => { return line.trim() });
@@ -815,13 +809,11 @@ class WebScraper
 		let textFileCounter = 1;
 		for ( const website of websites )
 		{
-			const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
+			const textFiles = fs.readdirSync( path.join( process.cwd(), website.textOutputPath ) );
 			for ( const file of textFiles )
 			{
-				const content = fs.readFileSync(
-					path.join( __dirname, website.textOutputPath, file ),
-					"utf-8"
-				);
+				const content = fs.readFileSync( path.join( website.textOutputPath, file ), "utf-8" );
+
 				fs.writeFileSync(
 					path.join( fullOutputPath, "texts", `${textFileCounter}.txt` ),
 					content,
@@ -829,10 +821,7 @@ class WebScraper
 				);
 				if ( website.includeMetadata )
 				{
-					const metaContent = fs.readFileSync(
-						path.join( __dirname, website.textOutputPathWithMeta, file ),
-						"utf-8"
-					);
+					const metaContent = fs.readFileSync( path.join( website.textOutputPathWithMeta, file ), "utf-8" );
 					fs.writeFileSync(
 						path.join(
 							fullOutputPath,
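
Note on the recurring __dirname → process.cwd() change in this file: in 4.1.x every output path was joined onto __dirname, the directory of the installed main.js (inside node_modules when the package is used as a dependency), so results were written into the package's own folder. In 4.2.x paths are resolved against process.cwd(), or left relative so that Node's fs resolves them against the working directory, putting output where the consuming project runs. A short sketch of this standard Node.js resolution behavior, not code from the package itself:

const path = require( "node:path" );
const process = require( "node:process" );

// Suppose the package is installed at /app/node_modules/clean-web-scraper
// and the consumer runs `node index.js` from /app.
const out = "./dataset/train.jsonl";

// 4.1.x style: anchored to the installed package directory
path.join( __dirname, out );     // -> /app/node_modules/clean-web-scraper/dataset/train.jsonl

// 4.2.x style: anchored to the caller's working directory
path.join( process.cwd(), out ); // -> /app/dataset/train.jsonl

// Bare relative paths, e.g. fs.createWriteStream( out ), are likewise
// resolved against process.cwd() at the time of the call.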
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
 	"name": "clean-web-scraper",
-	"version": "4.1.7",
+	"version": "4.2.1",
 	"main": "main.js",
 	"scripts": {
 		"start": "node main.js",