clean-web-scraper 4.1.0 → 4.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -48,7 +48,8 @@ const scraper = new WebScraper({
48
48
  baseURL: 'https://example.com/news', // Required: The website base url to scrape
49
49
  startURL: 'https://example.com/blog', // Optional: Custom starting URL
50
50
  excludeList: ['/admin', '/private'], // Optional: Paths to exclude
51
- exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
51
+ exactExcludeList: ['/specific-page', // Optional: Exact URLs to exclude
52
+ /^https:\/\/host\.com\/\d{4}\/$/], // Optional: Regex patterns to exclude. This will exclude URLs like https://host.com/2023/
52
53
  scrapResultPath: './example.com/website', // Required: Where to save the content
53
54
  jsonlOutputPath: './example.com/train.jsonl', // Optional: Custom JSONL output path
54
55
  textOutputPath: "./example.com/texts", // Optional: Custom text output path
package/example-usage.js CHANGED
@@ -65,6 +65,26 @@ async function khameneiIrFreePalestineTag ( enable )
65
65
  return await runScraper( config, enable );
66
66
  }
67
67
 
68
+ async function khameneiIrPalestineSpecialPage ( enable )
69
+ {
70
+ // https://english.khamenei.ir/palestine-special-page/
71
+ const config = {
72
+ baseURL: "https://english.khamenei.ir/palestine-special-page/",
73
+ maxDepth: 2,
74
+ exactExcludeList: [
75
+ "https://english.khamenei.ir/palestine-special-page/"
76
+ ],
77
+ scrapResultPath: "./dataset/khamenei-ir-palestine-special-page/website",
78
+ jsonlOutputPath: "./dataset/khamenei-ir-palestine-special-page/train.jsonl",
79
+ textOutputPath: "./dataset/khamenei-ir-palestine-special-page/texts",
80
+ csvOutputPath: "./dataset/khamenei-ir-palestine-special-page/train.csv",
81
+ includeMetadata: true,
82
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
83
+ axiosRetryDelay: 10000
84
+ };
85
+ return await runScraper( config, enable );
86
+ }
87
+
68
88
  async function decolonizepalestine ( enable )
69
89
  {
70
90
  const config = {
@@ -187,26 +207,8 @@ async function mondoweiss ( enable )
187
207
  "https://mondoweiss.net/activism/",
188
208
  "https://mondoweiss.net/news-letters/",
189
209
  "https://mondoweiss.net/newsletters",
190
- "https://mondoweiss.net/2006/",
191
- "https://mondoweiss.net/2007/",
192
- "https://mondoweiss.net/2008/",
193
- "https://mondoweiss.net/2009/",
194
- "https://mondoweiss.net/2010/",
195
- "https://mondoweiss.net/2011/",
196
- "https://mondoweiss.net/2012/",
197
- "https://mondoweiss.net/2013/",
198
- "https://mondoweiss.net/2014/",
199
- "https://mondoweiss.net/2015/",
200
- "https://mondoweiss.net/2016/",
201
- "https://mondoweiss.net/2017/",
202
- "https://mondoweiss.net/2018/",
203
- "https://mondoweiss.net/2019/",
204
- "https://mondoweiss.net/2020/",
205
- "https://mondoweiss.net/2021/",
206
- "https://mondoweiss.net/2022/",
207
- "https://mondoweiss.net/2023/",
208
- "https://mondoweiss.net/2024/",
209
- "https://mondoweiss.net/2025/",
210
+ /^https:\/\/mondoweiss\.net\/\d{4}\/\d{2}$/,
211
+ /^https:\/\/mondoweiss\.net\/\d{4}\/$/,
210
212
  "https://mondoweiss.net/daily-headlines",
211
213
  "https://mondoweiss.net/palestineletter",
212
214
  "https://mondoweiss.net/podcasts/",
@@ -306,6 +308,7 @@ void async function main ()
306
308
  const palianswersScraper = await palianswers( false );
307
309
  const decolonizepalestineScraper = await decolonizepalestine( false );
308
310
  const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( false );
311
+ const khameneiIrPalestineSpecialPageScraper = await khameneiIrPalestineSpecialPage( false );
309
312
  const electronicintifadaScraper = await electronicintifada( false );
310
313
  const standWithPalestineScraper = await standWithPalestine( false );
311
314
  const mondoweisScraper = await mondoweiss( true );
@@ -316,8 +319,10 @@ void async function main ()
316
319
  palianswersScraper,
317
320
  decolonizepalestineScraper,
318
321
  khameneiIrFreePalestineTagScraper,
322
+ khameneiIrPalestineSpecialPageScraper,
319
323
  electronicintifadaScraper,
320
324
  standWithPalestineScraper,
321
325
  mondoweisScraper
322
326
  ] );
327
+ // QLoRA = LoRA with 4-bit quantization
323
328
  }();
package/main.js CHANGED
@@ -115,6 +115,7 @@ class WebScraper
115
115
  {
116
116
  return;
117
117
  }
118
+ let originalUrl = url;
118
119
  if ( this.removeURLFragment )
119
120
  {
120
121
  url = url.split( "#" )[0];
@@ -125,6 +126,7 @@ class WebScraper
125
126
  return;
126
127
  }
127
128
  this.visited.add( url );
129
+ this.visited.add( originalUrl );
128
130
  if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
129
131
  {
130
132
  return;
@@ -150,7 +152,7 @@ class WebScraper
150
152
  {
151
153
  if ( this.hasValidPageContent( article.textContent ) )
152
154
  {
153
- const metadata = this.extractMetadata( url, document );
155
+ const metadata = this.extractMetadata( url, document, data );
154
156
  metadata.articleTitle = article.title || "";
155
157
  this.saveArticle( url, article.textContent, metadata );
156
158
  }
@@ -164,6 +166,10 @@ class WebScraper
164
166
  console.error( `No readable content found at ${url}` );
165
167
  }
166
168
  }
169
+ else
170
+ {
171
+ console.log( `Skipping excluded URL: ${url}` );
172
+ }
167
173
  const links = this.extractLinks( data );
168
174
  const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
169
175
  for ( const link of unvisitedLinks )
@@ -334,6 +340,7 @@ class WebScraper
334
340
  fs.mkdirSync( dir, { recursive: true });
335
341
  fs.writeFileSync( `${filePath}.txt`, processedContent, "utf-8" );
336
342
  fs.writeFileSync( `${filePath}.json`, JSON.stringify( metadata, null, 2 ), "utf-8" );
343
+ fs.writeFileSync( `${filePath}.html`, metadata.originalHtml, "utf-8" );
337
344
  console.log( `Saved: ${filePath}.txt` );
338
345
  console.log( `Saved: ${filePath}.json` );
339
346
  }
@@ -503,7 +510,7 @@ class WebScraper
503
510
  return filteredMetadata;
504
511
  }
505
512
 
506
- extractMetadata ( url, document )
513
+ extractMetadata ( url, document, html )
507
514
  {
508
515
  return {
509
516
  url,
@@ -517,7 +524,8 @@ class WebScraper
517
524
  ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
518
525
  ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
519
526
  ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
520
- dateScrapedDate: new Date().toISOString()
527
+ dateScrapedDate: new Date().toISOString(),
528
+ originalHtml: html,
521
529
  };
522
530
  }
523
531
 
@@ -578,25 +586,36 @@ class WebScraper
578
586
  normalizeExcludeList ( list = [] )
579
587
  {
580
588
  const normalizedSet = new Set();
581
- for ( let i = 0; i < list.length; i++ )
589
+
590
+ for ( const item of list )
582
591
  {
583
- const item = list[i];
584
- if ( item.endsWith( "/" ) )
585
- {
586
- normalizedSet.add( item.slice( 0, -1 ) );
587
- }
588
- else
592
+ if ( item instanceof RegExp )
589
593
  {
590
594
  normalizedSet.add( item );
595
+ continue;
591
596
  }
592
- normalizedSet.add( `${item.endsWith( "/" ) ? item : `${item }/`}` );
597
+
598
+ const withSlash = item.endsWith( "/" ) ? item : `${item }/`;
599
+ const withoutSlash = item.endsWith( "/" ) ? item.slice( 0, -1 ) : item;
600
+
601
+ normalizedSet.add( withSlash );
602
+ normalizedSet.add( withoutSlash );
593
603
  }
604
+
594
605
  return normalizedSet;
595
606
  }
596
607
 
608
+
597
609
  isExcluded ( url )
598
610
  {
599
- if ( this.exactExcludeList.has( url ) )
611
+ if ( Array.from( this.exactExcludeList ).some( excluded =>
612
+ {
613
+ if ( excluded instanceof RegExp )
614
+ {
615
+ return excluded.test( url );
616
+ }
617
+ return url === excluded;
618
+ }) )
600
619
  {
601
620
  return true;
602
621
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "4.1.0",
3
+ "version": "4.1.2",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",