clean-web-scraper 4.1.1 → 4.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -48,7 +48,8 @@ const scraper = new WebScraper({
48
48
  baseURL: 'https://example.com/news', // Required: The website base url to scrape
49
49
  startURL: 'https://example.com/blog', // Optional: Custom starting URL
50
50
  excludeList: ['/admin', '/private'], // Optional: Paths to exclude
51
- exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
51
+ exactExcludeList: ['/specific-page', // Optional: Exact URLs to exclude
52
+ /^https:\/\/host\.com\/\d{4}\/$/], // Optional: Regex patterns to exclude. This one excludes URLs like https://host.com/2023/
52
53
  scrapResultPath: './example.com/website', // Required: Where to save the content
53
54
  jsonlOutputPath: './example.com/train.jsonl', // Optional: Custom JSONL output path
54
55
  textOutputPath: "./example.com/texts", // Optional: Custom text output path
package/example-usage.js CHANGED
@@ -65,6 +65,26 @@ async function khameneiIrFreePalestineTag ( enable )
65
65
  return await runScraper( config, enable );
66
66
  }
67
67
 
68
+ async function khameneiIrPalestineSpecialPage ( enable )
69
+ {
70
+ // https://english.khamenei.ir/palestine-special-page/
71
+ const config = {
72
+ baseURL: "https://english.khamenei.ir/palestine-special-page/",
73
+ maxDepth: 2,
74
+ exactExcludeList: [
75
+ "https://english.khamenei.ir/palestine-special-page/"
76
+ ],
77
+ scrapResultPath: "./dataset/khamenei-ir-palestine-special-page/website",
78
+ jsonlOutputPath: "./dataset/khamenei-ir-palestine-special-page/train.jsonl",
79
+ textOutputPath: "./dataset/khamenei-ir-palestine-special-page/texts",
80
+ csvOutputPath: "./dataset/khamenei-ir-palestine-special-page/train.csv",
81
+ includeMetadata: true,
82
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
83
+ axiosRetryDelay: 10000
84
+ };
85
+ return await runScraper( config, enable );
86
+ }
87
+
68
88
  async function decolonizepalestine ( enable )
69
89
  {
70
90
  const config = {
@@ -187,26 +207,8 @@ async function mondoweiss ( enable )
187
207
  "https://mondoweiss.net/activism/",
188
208
  "https://mondoweiss.net/news-letters/",
189
209
  "https://mondoweiss.net/newsletters",
190
- "https://mondoweiss.net/2006/",
191
- "https://mondoweiss.net/2007/",
192
- "https://mondoweiss.net/2008/",
193
- "https://mondoweiss.net/2009/",
194
- "https://mondoweiss.net/2010/",
195
- "https://mondoweiss.net/2011/",
196
- "https://mondoweiss.net/2012/",
197
- "https://mondoweiss.net/2013/",
198
- "https://mondoweiss.net/2014/",
199
- "https://mondoweiss.net/2015/",
200
- "https://mondoweiss.net/2016/",
201
- "https://mondoweiss.net/2017/",
202
- "https://mondoweiss.net/2018/",
203
- "https://mondoweiss.net/2019/",
204
- "https://mondoweiss.net/2020/",
205
- "https://mondoweiss.net/2021/",
206
- "https://mondoweiss.net/2022/",
207
- "https://mondoweiss.net/2023/",
208
- "https://mondoweiss.net/2024/",
209
- "https://mondoweiss.net/2025/",
210
+ /^https:\/\/mondoweiss\.net\/\d{4}\/\d{2}$/,
211
+ /^https:\/\/mondoweiss\.net\/\d{4}\/$/,
210
212
  "https://mondoweiss.net/daily-headlines",
211
213
  "https://mondoweiss.net/palestineletter",
212
214
  "https://mondoweiss.net/podcasts/",
@@ -306,6 +308,7 @@ void async function main ()
306
308
  const palianswersScraper = await palianswers( false );
307
309
  const decolonizepalestineScraper = await decolonizepalestine( false );
308
310
  const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( false );
311
+ const khameneiIrPalestineSpecialPageScraper = await khameneiIrPalestineSpecialPage( false );
309
312
  const electronicintifadaScraper = await electronicintifada( false );
310
313
  const standWithPalestineScraper = await standWithPalestine( false );
311
314
  const mondoweisScraper = await mondoweiss( true );
@@ -316,8 +319,10 @@ void async function main ()
316
319
  palianswersScraper,
317
320
  decolonizepalestineScraper,
318
321
  khameneiIrFreePalestineTagScraper,
322
+ khameneiIrPalestineSpecialPageScraper,
319
323
  electronicintifadaScraper,
320
324
  standWithPalestineScraper,
321
325
  mondoweisScraper
322
326
  ] );
327
+ // QLoRA = LoRA with 4-bit quantization
323
328
  }();
package/main.js CHANGED
@@ -152,7 +152,7 @@ class WebScraper
152
152
  {
153
153
  if ( this.hasValidPageContent( article.textContent ) )
154
154
  {
155
- const metadata = this.extractMetadata( url, document );
155
+ const metadata = this.extractMetadata( url, document, data );
156
156
  metadata.articleTitle = article.title || "";
157
157
  this.saveArticle( url, article.textContent, metadata );
158
158
  }
@@ -166,6 +166,10 @@ class WebScraper
166
166
  console.error( `No readable content found at ${url}` );
167
167
  }
168
168
  }
169
+ else
170
+ {
171
+ console.log( `Skipping excluded URL: ${url}` );
172
+ }
169
173
  const links = this.extractLinks( data );
170
174
  const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
171
175
  for ( const link of unvisitedLinks )
@@ -336,6 +340,7 @@ class WebScraper
336
340
  fs.mkdirSync( dir, { recursive: true });
337
341
  fs.writeFileSync( `${filePath}.txt`, processedContent, "utf-8" );
338
342
  fs.writeFileSync( `${filePath}.json`, JSON.stringify( metadata, null, 2 ), "utf-8" );
343
+ fs.writeFileSync( `${filePath}.html`, metadata.originalHtml, "utf-8" );
339
344
  console.log( `Saved: ${filePath}.txt` );
340
345
  console.log( `Saved: ${filePath}.json` );
341
346
  }
@@ -505,7 +510,7 @@ class WebScraper
505
510
  return filteredMetadata;
506
511
  }
507
512
 
508
- extractMetadata ( url, document )
513
+ extractMetadata ( url, document, html )
509
514
  {
510
515
  return {
511
516
  url,
@@ -519,7 +524,8 @@ class WebScraper
519
524
  ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
520
525
  ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
521
526
  ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
522
- dateScrapedDate: new Date().toISOString()
527
+ dateScrapedDate: new Date().toISOString(),
528
+ originalHtml: html,
523
529
  };
524
530
  }
525
531
 
@@ -580,25 +586,36 @@ class WebScraper
580
586
  normalizeExcludeList ( list = [] )
581
587
  {
582
588
  const normalizedSet = new Set();
583
- for ( let i = 0; i < list.length; i++ )
589
+
590
+ for ( const item of list )
584
591
  {
585
- const item = list[i];
586
- if ( item.endsWith( "/" ) )
587
- {
588
- normalizedSet.add( item.slice( 0, -1 ) );
589
- }
590
- else
592
+ if ( item instanceof RegExp )
591
593
  {
592
594
  normalizedSet.add( item );
595
+ continue;
593
596
  }
594
- normalizedSet.add( `${item.endsWith( "/" ) ? item : `${item }/`}` );
597
+
598
+ const withSlash = item.endsWith( "/" ) ? item : `${item }/`;
599
+ const withoutSlash = item.endsWith( "/" ) ? item.slice( 0, -1 ) : item;
600
+
601
+ normalizedSet.add( withSlash );
602
+ normalizedSet.add( withoutSlash );
595
603
  }
604
+
596
605
  return normalizedSet;
597
606
  }
598
607
 
608
+
599
609
  isExcluded ( url )
600
610
  {
601
- if ( this.exactExcludeList.has( url ) )
611
+ if ( Array.from( this.exactExcludeList ).some( excluded =>
612
+ {
613
+ if ( excluded instanceof RegExp )
614
+ {
615
+ return excluded.test( url );
616
+ }
617
+ return url === excluded;
618
+ }) )
602
619
  {
603
620
  return true;
604
621
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "4.1.1",
3
+ "version": "4.1.2",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",