clean-web-scraper 4.2.0 → 4.2.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -172,13 +172,13 @@ The actual article content starts here. This is the clean, processed text of the
172
172
  ### 📑 Text Files with Metadata (texts_with_metadata/*.txt)
173
173
 
174
174
  ```text
175
- articleTitle: My Awesome Page
176
- description: This is a great article about coding
175
+ articleTitle: Palestine history
176
+ description: This is a great article about Palestine history
177
177
  author: John Doe
178
178
  language: en
179
179
  dateScraped: 2024-01-20T10:30:00Z
180
180
 
181
- \-\-\-
181
+ ---
182
182
 
183
183
  The actual article content starts here. This is the clean, processed text of the article that was extracted from the webpage.
184
184
  ```
package/example-usage.js CHANGED
@@ -50,6 +50,7 @@ async function khameneiIrFreePalestineTag ( enable )
50
50
  baseURL: "https://english.khamenei.ir/news",
51
51
  startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
52
52
  maxDepth: 1,
53
+ maxArticles: 2,
53
54
  exactExcludeList: [
54
55
  "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
55
56
  "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100"
@@ -72,6 +73,7 @@ async function khameneiIrPalestineSpecialPage ( enable )
72
73
  baseURL: "https://english.khamenei.ir/news",
73
74
  startURL: "https://english.khamenei.ir/palestine-special-page",
74
75
  maxDepth: 2,
76
+ maxArticles: 2,
75
77
  exactExcludeList: [
76
78
  "https://english.khamenei.ir/palestine-special-page/"
77
79
  ],
@@ -101,6 +103,7 @@ async function decolonizepalestine ( enable )
101
103
  "https://decolonizepalestine.com/rainbow-washing",
102
104
  "https://decolonizepalestine.com/"
103
105
  ],
106
+ maxArticles: 2,
104
107
  scrapResultPath: "./dataset/decolonizepalestine/website",
105
108
  jsonlOutputPath: "./dataset/decolonizepalestine/train.jsonl",
106
109
  textOutputPath: "./dataset/decolonizepalestine/texts",
@@ -307,7 +310,7 @@ async function palestineremembered ( enable )
307
310
  void async function main ()
308
311
  {
309
312
  // const palianswersScraper = await palianswers( true );
310
- // const decolonizepalestineScraper = await decolonizepalestine( true );
313
+ const decolonizepalestineScraper = await decolonizepalestine( true );
311
314
  const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( true );
312
315
  // const khameneiIrPalestineSpecialPageScraper = await khameneiIrPalestineSpecialPage( true );
313
316
  // const electronicintifadaScraper = await electronicintifada( true );
@@ -316,13 +319,13 @@ void async function main ()
316
319
  // const bdsmovementScraper = await bdsmovement( false );
317
320
  // const palestinerememberedScraper = await palestineremembered( false );
318
321
 
319
- // await WebScraper.combineResults( "./dataset/combined", [
320
- // palianswersScraper,
321
- // decolonizepalestineScraper,
322
- // khameneiIrFreePalestineTagScraper,
323
- // khameneiIrPalestineSpecialPageScraper,
324
- // electronicintifadaScraper,
325
- // standWithPalestineScraper,
326
- // mondoweisScraper
327
- // ] );
322
+ await WebScraper.combineResults( "./dataset/combined", [
323
+ // palianswersScraper,
324
+ decolonizepalestineScraper,
325
+ khameneiIrFreePalestineTagScraper,
326
+ // khameneiIrPalestineSpecialPageScraper,
327
+ // electronicintifadaScraper,
328
+ // standWithPalestineScraper,
329
+ // mondoweisScraper
330
+ ] );
328
331
  }();
package/main.js CHANGED
@@ -335,7 +335,7 @@ class WebScraper
335
335
  {
336
336
  urlPath = urlPath.slice( 0, -1 );
337
337
  }
338
- const filePath = path.join( process.cwd(), this.scrapResultPath, urlPath );
338
+ const filePath = path.join( this.scrapResultPath, urlPath );
339
339
  const dir = path.dirname( filePath );
340
340
 
341
341
  fs.mkdirSync( dir, { recursive: true });
@@ -348,7 +348,7 @@ class WebScraper
348
348
 
349
349
  createJSONLFile ()
350
350
  {
351
- const writeStreamSimple = fs.createWriteStream( path.join( process.cwd(), this.jsonlOutputPath ) );
351
+ const writeStreamSimple = fs.createWriteStream( this.jsonlOutputPath );
352
352
  writeStreamSimple.on( "error", err =>
353
353
  { return console.error( "Error writing JSONL:", err ) });
354
354
 
@@ -428,7 +428,7 @@ class WebScraper
428
428
 
429
429
  saveNumberedTextFiles ()
430
430
  {
431
- const baseTextPath = path.join( process.cwd(), this.textOutputPath );
431
+ const baseTextPath = this.textOutputPath;
432
432
 
433
433
  let metaTextPath = null;
434
434
  if ( this.includeMetadata )
@@ -673,13 +673,13 @@ class WebScraper
673
673
  createOutputDirectory ()
674
674
  {
675
675
  const paths = [
676
- path.join( process.cwd(), this.scrapResultPath ),
677
- path.join( process.cwd(), this.textOutputPath ),
678
- path.join( process.cwd(), this.textOutputPathWithMeta ),
679
- path.join( process.cwd(), this.csvOutputPath ),
680
- path.join( process.cwd(), this.csvOutputPathWithMeta ),
681
- path.join( process.cwd(), this.jsonlOutputPath ),
682
- path.join( process.cwd(), this.jsonlOutputPathWithMeta )
676
+ this.scrapResultPath,
677
+ this.textOutputPath,
678
+ this.textOutputPathWithMeta,
679
+ this.csvOutputPath,
680
+ this.csvOutputPathWithMeta,
681
+ this.jsonlOutputPath,
682
+ this.jsonlOutputPathWithMeta
683
683
  ];
684
684
  for ( const p of paths )
685
685
  {
@@ -710,7 +710,7 @@ class WebScraper
710
710
  static async combineResults ( outputPath, websites )
711
711
  {
712
712
  await WebScraper.sleep( 1000 );
713
- const fullOutputPath = path.join( process.cwd(), outputPath );
713
+ const fullOutputPath = outputPath;
714
714
  WebScraper.createCombinedDirectories( fullOutputPath );
715
715
  WebScraper.combineJSONLFiles( fullOutputPath, websites );
716
716
  WebScraper.combineCSVFiles( fullOutputPath, websites );
@@ -743,20 +743,16 @@ class WebScraper
743
743
 
744
744
  for ( const website of websites )
745
745
  {
746
- const jsonlContent = fs.readFileSync(
747
- path.join( process.cwd(), website.jsonlOutputPath ),
748
- "utf-8"
749
- );
746
+ const jsonlContent = fs.readFileSync( website.jsonlOutputPath, "utf-8" );
747
+
750
748
  if ( jsonlContent )
751
749
  {
752
750
  jsonlOutput.write( jsonlContent );
753
751
  }
754
752
  if ( website.includeMetadata )
755
753
  {
756
- const jsonlMetaContent = fs.readFileSync(
757
- path.join( process.cwd(), website.jsonlOutputPathWithMeta ),
758
- "utf-8"
759
- );
754
+ const jsonlMetaContent = fs.readFileSync( website.jsonlOutputPathWithMeta, "utf-8" );
755
+
760
756
  if ( jsonlMetaContent )
761
757
  {
762
758
  jsonlMetaOutput.write( jsonlMetaContent );
@@ -783,7 +779,7 @@ class WebScraper
783
779
 
784
780
  for ( const website of websites )
785
781
  {
786
- const csvContent = fs.readFileSync( path.join( process.cwd(), website.csvOutputPath ), "utf-8" )
782
+ const csvContent = fs.readFileSync( website.csvOutputPath, "utf-8" )
787
783
  .split( "\n" )
788
784
  .slice( 1 )
789
785
  .filter( line => { return line.trim() });
@@ -793,11 +789,8 @@ class WebScraper
793
789
  }
794
790
  if ( website.includeMetadata )
795
791
  {
796
- const csvMetaContent = fs
797
- .readFileSync(
798
- path.join( process.cwd(), website.csvOutputPathWithMeta ),
799
- "utf-8"
800
- )
792
+ const csvMetaContent = fs.readFileSync( website.csvOutputPathWithMeta, "utf-8" )
793
+
801
794
  .split( "\n" )
802
795
  .slice( 1 )
803
796
  .filter( line => { return line.trim() });
@@ -819,10 +812,8 @@ class WebScraper
819
812
  const textFiles = fs.readdirSync( path.join( process.cwd(), website.textOutputPath ) );
820
813
  for ( const file of textFiles )
821
814
  {
822
- const content = fs.readFileSync(
823
- path.join( process.cwd(), website.textOutputPath, file ),
824
- "utf-8"
825
- );
815
+ const content = fs.readFileSync( path.join( website.textOutputPath, file ), "utf-8" );
816
+
826
817
  fs.writeFileSync(
827
818
  path.join( fullOutputPath, "texts", `${textFileCounter}.txt` ),
828
819
  content,
@@ -830,10 +821,7 @@ class WebScraper
830
821
  );
831
822
  if ( website.includeMetadata )
832
823
  {
833
- const metaContent = fs.readFileSync(
834
- path.join( process.cwd(), website.textOutputPathWithMeta, file ),
835
- "utf-8"
836
- );
824
+ const metaContent = fs.readFileSync( path.join( website.textOutputPathWithMeta, file ), "utf-8" );
837
825
  fs.writeFileSync(
838
826
  path.join(
839
827
  fullOutputPath,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "4.2.0",
3
+ "version": "4.2.2",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",