clean-web-scraper 4.1.1 → 4.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -48,7 +48,8 @@ const scraper = new WebScraper({
48
48
  baseURL: 'https://example.com/news', // Required: The website base url to scrape
49
49
  startURL: 'https://example.com/blog', // Optional: Custom starting URL
50
50
  excludeList: ['/admin', '/private'], // Optional: Paths to exclude
51
- exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
51
+ exactExcludeList: ['/specific-page', // Optional: Exact URLs to exclude
52
+ /^https:\/\/host\.com\/\d{4}\/$/], // Optional: Regex patterns to exclude. This will exclude URLs like https://host.com/2023/
52
53
  scrapResultPath: './example.com/website', // Required: Where to save the content
53
54
  jsonlOutputPath: './example.com/train.jsonl', // Optional: Custom JSONL output path
54
55
  textOutputPath: "./example.com/texts", // Optional: Custom text output path
package/example-usage.js CHANGED
@@ -65,6 +65,27 @@ async function khameneiIrFreePalestineTag ( enable )
65
65
  return await runScraper( config, enable );
66
66
  }
67
67
 
68
+ async function khameneiIrPalestineSpecialPage ( enable )
69
+ {
70
+ // https://english.khamenei.ir/palestine-special-page/
71
+ const config = {
72
+ baseURL: "https://english.khamenei.ir/news",
73
+ startURL: "https://english.khamenei.ir/palestine-special-page",
74
+ maxDepth: 2,
75
+ exactExcludeList: [
76
+ "https://english.khamenei.ir/palestine-special-page/"
77
+ ],
78
+ scrapResultPath: "./dataset/khamenei-ir-palestine-special-page/website",
79
+ jsonlOutputPath: "./dataset/khamenei-ir-palestine-special-page/train.jsonl",
80
+ textOutputPath: "./dataset/khamenei-ir-palestine-special-page/texts",
81
+ csvOutputPath: "./dataset/khamenei-ir-palestine-special-page/train.csv",
82
+ includeMetadata: true,
83
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
84
+ axiosRetryDelay: 10000
85
+ };
86
+ return await runScraper( config, enable );
87
+ }
88
+
68
89
  async function decolonizepalestine ( enable )
69
90
  {
70
91
  const config = {
@@ -187,26 +208,8 @@ async function mondoweiss ( enable )
187
208
  "https://mondoweiss.net/activism/",
188
209
  "https://mondoweiss.net/news-letters/",
189
210
  "https://mondoweiss.net/newsletters",
190
- "https://mondoweiss.net/2006/",
191
- "https://mondoweiss.net/2007/",
192
- "https://mondoweiss.net/2008/",
193
- "https://mondoweiss.net/2009/",
194
- "https://mondoweiss.net/2010/",
195
- "https://mondoweiss.net/2011/",
196
- "https://mondoweiss.net/2012/",
197
- "https://mondoweiss.net/2013/",
198
- "https://mondoweiss.net/2014/",
199
- "https://mondoweiss.net/2015/",
200
- "https://mondoweiss.net/2016/",
201
- "https://mondoweiss.net/2017/",
202
- "https://mondoweiss.net/2018/",
203
- "https://mondoweiss.net/2019/",
204
- "https://mondoweiss.net/2020/",
205
- "https://mondoweiss.net/2021/",
206
- "https://mondoweiss.net/2022/",
207
- "https://mondoweiss.net/2023/",
208
- "https://mondoweiss.net/2024/",
209
- "https://mondoweiss.net/2025/",
211
+ /^https:\/\/mondoweiss\.net\/\d{4}\/\d{2}$/,
212
+ /^https:\/\/mondoweiss\.net\/\d{4}\/$/,
210
213
  "https://mondoweiss.net/daily-headlines",
211
214
  "https://mondoweiss.net/palestineletter",
212
215
  "https://mondoweiss.net/podcasts/",
@@ -303,11 +306,12 @@ async function palestineremembered ( enable )
303
306
 
304
307
  void async function main ()
305
308
  {
306
- const palianswersScraper = await palianswers( false );
307
- const decolonizepalestineScraper = await decolonizepalestine( false );
308
- const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( false );
309
- const electronicintifadaScraper = await electronicintifada( false );
310
- const standWithPalestineScraper = await standWithPalestine( false );
309
+ const palianswersScraper = await palianswers( true );
310
+ const decolonizepalestineScraper = await decolonizepalestine( true );
311
+ const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( true );
312
+ const khameneiIrPalestineSpecialPageScraper = await khameneiIrPalestineSpecialPage( true );
313
+ const electronicintifadaScraper = await electronicintifada( true );
314
+ const standWithPalestineScraper = await standWithPalestine( true );
311
315
  const mondoweisScraper = await mondoweiss( true );
312
316
  const bdsmovementScraper = await bdsmovement( false );
313
317
  const palestinerememberedScraper = await palestineremembered( false );
@@ -316,8 +320,10 @@ void async function main ()
316
320
  palianswersScraper,
317
321
  decolonizepalestineScraper,
318
322
  khameneiIrFreePalestineTagScraper,
323
+ khameneiIrPalestineSpecialPageScraper,
319
324
  electronicintifadaScraper,
320
325
  standWithPalestineScraper,
321
326
  mondoweisScraper
322
327
  ] );
328
+ // QLoRA = LoRA with 4-bit quantization
323
329
  }();
package/main.js CHANGED
@@ -152,7 +152,7 @@ class WebScraper
152
152
  {
153
153
  if ( this.hasValidPageContent( article.textContent ) )
154
154
  {
155
- const metadata = this.extractMetadata( url, document );
155
+ const metadata = this.extractMetadata( url, document, data );
156
156
  metadata.articleTitle = article.title || "";
157
157
  this.saveArticle( url, article.textContent, metadata );
158
158
  }
@@ -166,6 +166,10 @@ class WebScraper
166
166
  console.error( `No readable content found at ${url}` );
167
167
  }
168
168
  }
169
+ else
170
+ {
171
+ console.log( `Skipping excluded URL: ${url}` );
172
+ }
169
173
  const links = this.extractLinks( data );
170
174
  const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
171
175
  for ( const link of unvisitedLinks )
@@ -336,6 +340,7 @@ class WebScraper
336
340
  fs.mkdirSync( dir, { recursive: true });
337
341
  fs.writeFileSync( `${filePath}.txt`, processedContent, "utf-8" );
338
342
  fs.writeFileSync( `${filePath}.json`, JSON.stringify( metadata, null, 2 ), "utf-8" );
343
+ fs.writeFileSync( `${filePath}.html`, metadata.originalHtml, "utf-8" );
339
344
  console.log( `Saved: ${filePath}.txt` );
340
345
  console.log( `Saved: ${filePath}.json` );
341
346
  }
@@ -505,7 +510,7 @@ class WebScraper
505
510
  return filteredMetadata;
506
511
  }
507
512
 
508
- extractMetadata ( url, document )
513
+ extractMetadata ( url, document, html )
509
514
  {
510
515
  return {
511
516
  url,
@@ -519,7 +524,8 @@ class WebScraper
519
524
  ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
520
525
  ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
521
526
  ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
522
- dateScrapedDate: new Date().toISOString()
527
+ dateScrapedDate: new Date().toISOString(),
528
+ originalHtml: html,
523
529
  };
524
530
  }
525
531
 
@@ -580,25 +586,36 @@ class WebScraper
580
586
  normalizeExcludeList ( list = [] )
581
587
  {
582
588
  const normalizedSet = new Set();
583
- for ( let i = 0; i < list.length; i++ )
589
+
590
+ for ( const item of list )
584
591
  {
585
- const item = list[i];
586
- if ( item.endsWith( "/" ) )
587
- {
588
- normalizedSet.add( item.slice( 0, -1 ) );
589
- }
590
- else
592
+ if ( item instanceof RegExp )
591
593
  {
592
594
  normalizedSet.add( item );
595
+ continue;
593
596
  }
594
- normalizedSet.add( `${item.endsWith( "/" ) ? item : `${item }/`}` );
597
+
598
+ const withSlash = item.endsWith( "/" ) ? item : `${item }/`;
599
+ const withoutSlash = item.endsWith( "/" ) ? item.slice( 0, -1 ) : item;
600
+
601
+ normalizedSet.add( withSlash );
602
+ normalizedSet.add( withoutSlash );
595
603
  }
604
+
596
605
  return normalizedSet;
597
606
  }
598
607
 
608
+
599
609
  isExcluded ( url )
600
610
  {
601
- if ( this.exactExcludeList.has( url ) )
611
+ if ( Array.from( this.exactExcludeList ).some( excluded =>
612
+ {
613
+ if ( excluded instanceof RegExp )
614
+ {
615
+ return excluded.test( url );
616
+ }
617
+ return url === excluded;
618
+ }) )
602
619
  {
603
620
  return true;
604
621
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "4.1.1",
3
+ "version": "4.1.3",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",