clean-web-scraper 4.1.6 → 4.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/example-usage.js +17 -18
  2. package/main.js +28 -27
  3. package/package.json +1 -1
package/example-usage.js CHANGED
@@ -306,24 +306,23 @@ async function palestineremembered ( enable )
306
306
 
307
307
  void async function main ()
308
308
  {
309
- const palianswersScraper = await palianswers( true );
310
- const decolonizepalestineScraper = await decolonizepalestine( true );
309
+ // const palianswersScraper = await palianswers( true );
310
+ // const decolonizepalestineScraper = await decolonizepalestine( true );
311
311
  const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( true );
312
- const khameneiIrPalestineSpecialPageScraper = await khameneiIrPalestineSpecialPage( true );
313
- const electronicintifadaScraper = await electronicintifada( true );
314
- const standWithPalestineScraper = await standWithPalestine( true );
315
- const mondoweisScraper = await mondoweiss( true );
316
- const bdsmovementScraper = await bdsmovement( false );
317
- const palestinerememberedScraper = await palestineremembered( false );
312
+ // const khameneiIrPalestineSpecialPageScraper = await khameneiIrPalestineSpecialPage( true );
313
+ // const electronicintifadaScraper = await electronicintifada( true );
314
+ // const standWithPalestineScraper = await standWithPalestine( true );
315
+ // const mondoweisScraper = await mondoweiss( true );
316
+ // const bdsmovementScraper = await bdsmovement( false );
317
+ // const palestinerememberedScraper = await palestineremembered( false );
318
318
 
319
- await WebScraper.combineResults( "./dataset/combined", [
320
- palianswersScraper,
321
- decolonizepalestineScraper,
322
- khameneiIrFreePalestineTagScraper,
323
- khameneiIrPalestineSpecialPageScraper,
324
- electronicintifadaScraper,
325
- standWithPalestineScraper,
326
- mondoweisScraper
327
- ] );
328
- // QLoRA = LoRA with 4-bit quantization
319
+ // await WebScraper.combineResults( "./dataset/combined", [
320
+ // palianswersScraper,
321
+ // decolonizepalestineScraper,
322
+ // khameneiIrFreePalestineTagScraper,
323
+ // khameneiIrPalestineSpecialPageScraper,
324
+ // electronicintifadaScraper,
325
+ // standWithPalestineScraper,
326
+ // mondoweisScraper
327
+ // ] );
329
328
  }();
package/main.js CHANGED
@@ -1,8 +1,9 @@
1
+ const process = require( "node:process" );
2
+ const fs = require( "fs" );
3
+ const path = require( "path" );
1
4
  const axios = require( "axios" );
2
5
  const { JSDOM } = require( "jsdom" );
3
6
  const { Readability } = require( "@mozilla/readability" );
4
- const fs = require( "fs" );
5
- const path = require( "path" );
6
7
  const { connect } = require( "puppeteer-real-browser" );
7
8
 
8
9
  class WebScraper
@@ -334,7 +335,7 @@ class WebScraper
334
335
  {
335
336
  urlPath = urlPath.slice( 0, -1 );
336
337
  }
337
- const filePath = path.join( __dirname, this.scrapResultPath, urlPath );
338
+ const filePath = path.join( process.cwd(), this.scrapResultPath, urlPath );
338
339
  const dir = path.dirname( filePath );
339
340
 
340
341
  fs.mkdirSync( dir, { recursive: true });
@@ -347,14 +348,14 @@ class WebScraper
347
348
 
348
349
  createJSONLFile ()
349
350
  {
350
- const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
351
+ const writeStreamSimple = fs.createWriteStream( path.join( process.cwd(), this.jsonlOutputPath ) );
351
352
  writeStreamSimple.on( "error", err =>
352
353
  { return console.error( "Error writing JSONL:", err ) });
353
354
 
354
355
  let writeStreamMeta;
355
356
  if ( this.includeMetadata )
356
357
  {
357
- writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPathWithMeta ) );
358
+ writeStreamMeta = fs.createWriteStream( path.join( process.cwd(), this.jsonlOutputPathWithMeta ) );
358
359
  writeStreamMeta.on( "error", ( err ) => { return console.error( "Error writing metadata JSONL:", err ) });
359
360
  }
360
361
  for ( const content of this.allProcessedContent )
@@ -377,7 +378,7 @@ class WebScraper
377
378
  createCSVFile ()
378
379
  {
379
380
  // Create simple version
380
- const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
381
+ const writeStreamSimple = fs.createWriteStream( path.join( process.cwd(), this.csvOutputPath ) );
381
382
  writeStreamSimple.on( "error", ( err ) => { return console.error( "Error writing CSV:", err ) });
382
383
  writeStreamSimple.write( "text\n" );
383
384
 
@@ -385,7 +386,7 @@ class WebScraper
385
386
  let writeStreamMeta;
386
387
  if ( this.includeMetadata )
387
388
  {
388
- writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.csvOutputPathWithMeta ) );
389
+ writeStreamMeta = fs.createWriteStream( path.join( process.cwd(), this.csvOutputPathWithMeta ) );
389
390
  writeStreamMeta.on( "error", ( err ) => { return console.error( "Error writing metadata CSV:", err ) });
390
391
  }
391
392
 
@@ -427,12 +428,12 @@ class WebScraper
427
428
 
428
429
  saveNumberedTextFiles ()
429
430
  {
430
- const baseTextPath = path.join( __dirname, this.textOutputPath );
431
+ const baseTextPath = path.join( process.cwd(), this.textOutputPath );
431
432
 
432
433
  let metaTextPath = null;
433
434
  if ( this.includeMetadata )
434
435
  {
435
- metaTextPath = path.join( __dirname, this.textOutputPathWithMeta );
436
+ metaTextPath = path.join( process.cwd(), this.textOutputPathWithMeta );
436
437
  fs.mkdirSync( metaTextPath, { recursive: true });
437
438
  }
438
439
 
@@ -672,13 +673,13 @@ class WebScraper
672
673
  createOutputDirectory ()
673
674
  {
674
675
  const paths = [
675
- path.join( __dirname, this.scrapResultPath ),
676
- path.join( __dirname, this.textOutputPath ),
677
- path.join( __dirname, this.textOutputPathWithMeta ),
678
- path.join( __dirname, this.csvOutputPath ),
679
- path.join( __dirname, this.csvOutputPathWithMeta ),
680
- path.join( __dirname, this.jsonlOutputPath ),
681
- path.join( __dirname, this.jsonlOutputPathWithMeta )
676
+ path.join( process.cwd(), this.scrapResultPath ),
677
+ path.join( process.cwd(), this.textOutputPath ),
678
+ path.join( process.cwd(), this.textOutputPathWithMeta ),
679
+ path.join( process.cwd(), this.csvOutputPath ),
680
+ path.join( process.cwd(), this.csvOutputPathWithMeta ),
681
+ path.join( process.cwd(), this.jsonlOutputPath ),
682
+ path.join( process.cwd(), this.jsonlOutputPathWithMeta )
682
683
  ];
683
684
  for ( const p of paths )
684
685
  {
@@ -688,9 +689,9 @@ class WebScraper
688
689
  }
689
690
  }
690
691
  // Recreate directories needed for output
691
- this.ensureDirectory( path.join( __dirname, this.scrapResultPath ) );
692
- this.ensureDirectory( path.join( __dirname, this.textOutputPath ) );
693
- this.ensureDirectory( path.join( __dirname, this.textOutputPathWithMeta ) );
692
+ this.ensureDirectory( path.join( process.cwd(), this.scrapResultPath ) );
693
+ this.ensureDirectory( path.join( process.cwd(), this.textOutputPath ) );
694
+ this.ensureDirectory( path.join( process.cwd(), this.textOutputPathWithMeta ) );
694
695
  }
695
696
 
696
697
  ensureDirectory ( dirPath )
@@ -709,7 +710,7 @@ class WebScraper
709
710
  static async combineResults ( outputPath, websites )
710
711
  {
711
712
  await WebScraper.sleep( 1000 );
712
- const fullOutputPath = path.join( __dirname, outputPath );
713
+ const fullOutputPath = path.join( process.cwd(), outputPath );
713
714
  WebScraper.createCombinedDirectories( fullOutputPath );
714
715
  WebScraper.combineJSONLFiles( fullOutputPath, websites );
715
716
  WebScraper.combineCSVFiles( fullOutputPath, websites );
@@ -743,7 +744,7 @@ class WebScraper
743
744
  for ( const website of websites )
744
745
  {
745
746
  const jsonlContent = fs.readFileSync(
746
- path.join( __dirname, website.jsonlOutputPath ),
747
+ path.join( process.cwd(), website.jsonlOutputPath ),
747
748
  "utf-8"
748
749
  );
749
750
  if ( jsonlContent )
@@ -753,7 +754,7 @@ class WebScraper
753
754
  if ( website.includeMetadata )
754
755
  {
755
756
  const jsonlMetaContent = fs.readFileSync(
756
- path.join( __dirname, website.jsonlOutputPathWithMeta ),
757
+ path.join( process.cwd(), website.jsonlOutputPathWithMeta ),
757
758
  "utf-8"
758
759
  );
759
760
  if ( jsonlMetaContent )
@@ -782,7 +783,7 @@ class WebScraper
782
783
 
783
784
  for ( const website of websites )
784
785
  {
785
- const csvContent = fs.readFileSync( path.join( __dirname, website.csvOutputPath ), "utf-8" )
786
+ const csvContent = fs.readFileSync( path.join( process.cwd(), website.csvOutputPath ), "utf-8" )
786
787
  .split( "\n" )
787
788
  .slice( 1 )
788
789
  .filter( line => { return line.trim() });
@@ -794,7 +795,7 @@ class WebScraper
794
795
  {
795
796
  const csvMetaContent = fs
796
797
  .readFileSync(
797
- path.join( __dirname, website.csvOutputPathWithMeta ),
798
+ path.join( process.cwd(), website.csvOutputPathWithMeta ),
798
799
  "utf-8"
799
800
  )
800
801
  .split( "\n" )
@@ -815,11 +816,11 @@ class WebScraper
815
816
  let textFileCounter = 1;
816
817
  for ( const website of websites )
817
818
  {
818
- const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
819
+ const textFiles = fs.readdirSync( path.join( process.cwd(), website.textOutputPath ) );
819
820
  for ( const file of textFiles )
820
821
  {
821
822
  const content = fs.readFileSync(
822
- path.join( __dirname, website.textOutputPath, file ),
823
+ path.join( process.cwd(), website.textOutputPath, file ),
823
824
  "utf-8"
824
825
  );
825
826
  fs.writeFileSync(
@@ -830,7 +831,7 @@ class WebScraper
830
831
  if ( website.includeMetadata )
831
832
  {
832
833
  const metaContent = fs.readFileSync(
833
- path.join( __dirname, website.textOutputPathWithMeta, file ),
834
+ path.join( process.cwd(), website.textOutputPathWithMeta, file ),
834
835
  "utf-8"
835
836
  );
836
837
  fs.writeFileSync(
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "4.1.6",
3
+ "version": "4.2.0",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",