clean-web-scraper 4.1.1 → 4.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/example-usage.js +25 -20
- package/main.js +29 -12
- package/package.json +1 -1
package/README.md
CHANGED
@@ -48,7 +48,8 @@ const scraper = new WebScraper({
|
|
48
48
|
baseURL: 'https://example.com/news', // Required: The website base URL to scrape
|
49
49
|
startURL: 'https://example.com/blog', // Optional: Custom starting URL
|
50
50
|
excludeList: ['/admin', '/private'], // Optional: Paths to exclude
|
51
|
-
exactExcludeList: ['/specific-page'
|
51
|
+
exactExcludeList: ['/specific-page', // Optional: Exact URLs to exclude
|
52
|
+
/^https:\/\/host\.com\/\d{4}\/$/], // Optional: Regex patterns to exclude. This will exclude URLs like https://host.com/2023/
|
52
53
|
scrapResultPath: './example.com/website', // Required: Where to save the content
|
53
54
|
jsonlOutputPath: './example.com/train.jsonl', // Optional: Custom JSONL output path
|
54
55
|
textOutputPath: "./example.com/texts", // Optional: Custom text output path
|
package/example-usage.js
CHANGED
@@ -65,6 +65,26 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
65
65
|
return await runScraper( config, enable );
|
66
66
|
}
|
67
67
|
|
68
|
+
async function khameneiIrPalestineSpecialPage ( enable )
|
69
|
+
{
|
70
|
+
// https://english.khamenei.ir/palestine-special-page/
|
71
|
+
const config = {
|
72
|
+
baseURL: "https://english.khamenei.ir/palestine-special-page/",
|
73
|
+
maxDepth: 2,
|
74
|
+
exactExcludeList: [
|
75
|
+
"https://english.khamenei.ir/palestine-special-page/"
|
76
|
+
],
|
77
|
+
scrapResultPath: "./dataset/khamenei-ir-palestine-special-page/website",
|
78
|
+
jsonlOutputPath: "./dataset/khamenei-ir-palestine-special-page/train.jsonl",
|
79
|
+
textOutputPath: "./dataset/khamenei-ir-palestine-special-page/texts",
|
80
|
+
csvOutputPath: "./dataset/khamenei-ir-palestine-special-page/train.csv",
|
81
|
+
includeMetadata: true,
|
82
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
|
83
|
+
axiosRetryDelay: 10000
|
84
|
+
};
|
85
|
+
return await runScraper( config, enable );
|
86
|
+
}
|
87
|
+
|
68
88
|
async function decolonizepalestine ( enable )
|
69
89
|
{
|
70
90
|
const config = {
|
@@ -187,26 +207,8 @@ async function mondoweiss ( enable )
|
|
187
207
|
"https://mondoweiss.net/activism/",
|
188
208
|
"https://mondoweiss.net/news-letters/",
|
189
209
|
"https://mondoweiss.net/newsletters",
|
190
|
-
|
191
|
-
|
192
|
-
"https://mondoweiss.net/2008/",
|
193
|
-
"https://mondoweiss.net/2009/",
|
194
|
-
"https://mondoweiss.net/2010/",
|
195
|
-
"https://mondoweiss.net/2011/",
|
196
|
-
"https://mondoweiss.net/2012/",
|
197
|
-
"https://mondoweiss.net/2013/",
|
198
|
-
"https://mondoweiss.net/2014/",
|
199
|
-
"https://mondoweiss.net/2015/",
|
200
|
-
"https://mondoweiss.net/2016/",
|
201
|
-
"https://mondoweiss.net/2017/",
|
202
|
-
"https://mondoweiss.net/2018/",
|
203
|
-
"https://mondoweiss.net/2019/",
|
204
|
-
"https://mondoweiss.net/2020/",
|
205
|
-
"https://mondoweiss.net/2021/",
|
206
|
-
"https://mondoweiss.net/2022/",
|
207
|
-
"https://mondoweiss.net/2023/",
|
208
|
-
"https://mondoweiss.net/2024/",
|
209
|
-
"https://mondoweiss.net/2025/",
|
210
|
+
/^https:\/\/mondoweiss\.net\/\d{4}\/\d{2}$/,
|
211
|
+
/^https:\/\/mondoweiss\.net\/\d{4}\/$/,
|
210
212
|
"https://mondoweiss.net/daily-headlines",
|
211
213
|
"https://mondoweiss.net/palestineletter",
|
212
214
|
"https://mondoweiss.net/podcasts/",
|
@@ -306,6 +308,7 @@ void async function main ()
|
|
306
308
|
const palianswersScraper = await palianswers( false );
|
307
309
|
const decolonizepalestineScraper = await decolonizepalestine( false );
|
308
310
|
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( false );
|
311
|
+
const khameneiIrPalestineSpecialPageScraper = await khameneiIrPalestineSpecialPage( false );
|
309
312
|
const electronicintifadaScraper = await electronicintifada( false );
|
310
313
|
const standWithPalestineScraper = await standWithPalestine( false );
|
311
314
|
const mondoweisScraper = await mondoweiss( true );
|
@@ -316,8 +319,10 @@ void async function main ()
|
|
316
319
|
palianswersScraper,
|
317
320
|
decolonizepalestineScraper,
|
318
321
|
khameneiIrFreePalestineTagScraper,
|
322
|
+
khameneiIrPalestineSpecialPageScraper,
|
319
323
|
electronicintifadaScraper,
|
320
324
|
standWithPalestineScraper,
|
321
325
|
mondoweisScraper
|
322
326
|
] );
|
327
|
+
// QLoRA = LoRA with 4-bit quantization
|
323
328
|
}();
|
package/main.js
CHANGED
@@ -152,7 +152,7 @@ class WebScraper
|
|
152
152
|
{
|
153
153
|
if ( this.hasValidPageContent( article.textContent ) )
|
154
154
|
{
|
155
|
-
const metadata = this.extractMetadata( url, document );
|
155
|
+
const metadata = this.extractMetadata( url, document, data );
|
156
156
|
metadata.articleTitle = article.title || "";
|
157
157
|
this.saveArticle( url, article.textContent, metadata );
|
158
158
|
}
|
@@ -166,6 +166,10 @@ class WebScraper
|
|
166
166
|
console.error( `No readable content found at ${url}` );
|
167
167
|
}
|
168
168
|
}
|
169
|
+
else
|
170
|
+
{
|
171
|
+
console.log( `Skipping excluded URL: ${url}` );
|
172
|
+
}
|
169
173
|
const links = this.extractLinks( data );
|
170
174
|
const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
|
171
175
|
for ( const link of unvisitedLinks )
|
@@ -336,6 +340,7 @@ class WebScraper
|
|
336
340
|
fs.mkdirSync( dir, { recursive: true });
|
337
341
|
fs.writeFileSync( `${filePath}.txt`, processedContent, "utf-8" );
|
338
342
|
fs.writeFileSync( `${filePath}.json`, JSON.stringify( metadata, null, 2 ), "utf-8" );
|
343
|
+
fs.writeFileSync( `${filePath}.html`, metadata.originalHtml, "utf-8" );
|
339
344
|
console.log( `Saved: ${filePath}.txt` );
|
340
345
|
console.log( `Saved: ${filePath}.json` );
|
341
346
|
}
|
@@ -505,7 +510,7 @@ class WebScraper
|
|
505
510
|
return filteredMetadata;
|
506
511
|
}
|
507
512
|
|
508
|
-
extractMetadata ( url, document )
|
513
|
+
extractMetadata ( url, document, html )
|
509
514
|
{
|
510
515
|
return {
|
511
516
|
url,
|
@@ -519,7 +524,8 @@ class WebScraper
|
|
519
524
|
ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
|
520
525
|
ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
|
521
526
|
ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
|
522
|
-
dateScrapedDate: new Date().toISOString()
|
527
|
+
dateScrapedDate: new Date().toISOString(),
|
528
|
+
originalHtml: html,
|
523
529
|
};
|
524
530
|
}
|
525
531
|
|
@@ -580,25 +586,36 @@ class WebScraper
|
|
580
586
|
normalizeExcludeList ( list = [] )
|
581
587
|
{
|
582
588
|
const normalizedSet = new Set();
|
583
|
-
|
589
|
+
|
590
|
+
for ( const item of list )
|
584
591
|
{
|
585
|
-
|
586
|
-
if ( item.endsWith( "/" ) )
|
587
|
-
{
|
588
|
-
normalizedSet.add( item.slice( 0, -1 ) );
|
589
|
-
}
|
590
|
-
else
|
592
|
+
if ( item instanceof RegExp )
|
591
593
|
{
|
592
594
|
normalizedSet.add( item );
|
595
|
+
continue;
|
593
596
|
}
|
594
|
-
|
597
|
+
|
598
|
+
const withSlash = item.endsWith( "/" ) ? item : `${item }/`;
|
599
|
+
const withoutSlash = item.endsWith( "/" ) ? item.slice( 0, -1 ) : item;
|
600
|
+
|
601
|
+
normalizedSet.add( withSlash );
|
602
|
+
normalizedSet.add( withoutSlash );
|
595
603
|
}
|
604
|
+
|
596
605
|
return normalizedSet;
|
597
606
|
}
|
598
607
|
|
608
|
+
|
599
609
|
isExcluded ( url )
|
600
610
|
{
|
601
|
-
if ( this.exactExcludeList.
|
611
|
+
if ( Array.from( this.exactExcludeList ).some( excluded =>
|
612
|
+
{
|
613
|
+
if ( excluded instanceof RegExp )
|
614
|
+
{
|
615
|
+
return excluded.test( url );
|
616
|
+
}
|
617
|
+
return url === excluded;
|
618
|
+
}) )
|
602
619
|
{
|
603
620
|
return true;
|
604
621
|
}
|