clean-web-scraper 4.1.0 → 4.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/example-usage.js +25 -20
- package/main.js +31 -12
- package/package.json +1 -1
package/README.md
CHANGED
@@ -48,7 +48,8 @@ const scraper = new WebScraper({
|
|
48
48
|
baseURL: 'https://example.com/news', // Required: The website base url to scrape
|
49
49
|
startURL: 'https://example.com/blog', // Optional: Custom starting URL
|
50
50
|
excludeList: ['/admin', '/private'], // Optional: Paths to exclude
|
51
|
-
exactExcludeList: ['/specific-page'
|
51
|
+
exactExcludeList: ['/specific-page', // Optional: Exact URLs to exclude
|
52
|
+
/^https:\/\/host\.com\/\d{4}\/$/], // Optional: Regex patterns to exclude. This will exclude URLs like https://host.com/2023/
|
52
53
|
scrapResultPath: './example.com/website', // Required: Where to save the content
|
53
54
|
jsonlOutputPath: './example.com/train.jsonl', // Optional: Custom JSONL output path
|
54
55
|
textOutputPath: "./example.com/texts", // Optional: Custom text output path
|
package/example-usage.js
CHANGED
@@ -65,6 +65,26 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
65
65
|
return await runScraper( config, enable );
|
66
66
|
}
|
67
67
|
|
68
|
+
async function khameneiIrPalestineSpecialPage ( enable )
|
69
|
+
{
|
70
|
+
// https://english.khamenei.ir/palestine-special-page/
|
71
|
+
const config = {
|
72
|
+
baseURL: "https://english.khamenei.ir/palestine-special-page/",
|
73
|
+
maxDepth: 2,
|
74
|
+
exactExcludeList: [
|
75
|
+
"https://english.khamenei.ir/palestine-special-page/"
|
76
|
+
],
|
77
|
+
scrapResultPath: "./dataset/khamenei-ir-palestine-special-page/website",
|
78
|
+
jsonlOutputPath: "./dataset/khamenei-ir-palestine-special-page/train.jsonl",
|
79
|
+
textOutputPath: "./dataset/khamenei-ir-palestine-special-page/texts",
|
80
|
+
csvOutputPath: "./dataset/khamenei-ir-palestine-special-page/train.csv",
|
81
|
+
includeMetadata: true,
|
82
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
|
83
|
+
axiosRetryDelay: 10000
|
84
|
+
};
|
85
|
+
return await runScraper( config, enable );
|
86
|
+
}
|
87
|
+
|
68
88
|
async function decolonizepalestine ( enable )
|
69
89
|
{
|
70
90
|
const config = {
|
@@ -187,26 +207,8 @@ async function mondoweiss ( enable )
|
|
187
207
|
"https://mondoweiss.net/activism/",
|
188
208
|
"https://mondoweiss.net/news-letters/",
|
189
209
|
"https://mondoweiss.net/newsletters",
|
190
|
-
|
191
|
-
|
192
|
-
"https://mondoweiss.net/2008/",
|
193
|
-
"https://mondoweiss.net/2009/",
|
194
|
-
"https://mondoweiss.net/2010/",
|
195
|
-
"https://mondoweiss.net/2011/",
|
196
|
-
"https://mondoweiss.net/2012/",
|
197
|
-
"https://mondoweiss.net/2013/",
|
198
|
-
"https://mondoweiss.net/2014/",
|
199
|
-
"https://mondoweiss.net/2015/",
|
200
|
-
"https://mondoweiss.net/2016/",
|
201
|
-
"https://mondoweiss.net/2017/",
|
202
|
-
"https://mondoweiss.net/2018/",
|
203
|
-
"https://mondoweiss.net/2019/",
|
204
|
-
"https://mondoweiss.net/2020/",
|
205
|
-
"https://mondoweiss.net/2021/",
|
206
|
-
"https://mondoweiss.net/2022/",
|
207
|
-
"https://mondoweiss.net/2023/",
|
208
|
-
"https://mondoweiss.net/2024/",
|
209
|
-
"https://mondoweiss.net/2025/",
|
210
|
+
/^https:\/\/mondoweiss\.net\/\d{4}\/\d{2}$/,
|
211
|
+
/^https:\/\/mondoweiss\.net\/\d{4}\/$/,
|
210
212
|
"https://mondoweiss.net/daily-headlines",
|
211
213
|
"https://mondoweiss.net/palestineletter",
|
212
214
|
"https://mondoweiss.net/podcasts/",
|
@@ -306,6 +308,7 @@ void async function main ()
|
|
306
308
|
const palianswersScraper = await palianswers( false );
|
307
309
|
const decolonizepalestineScraper = await decolonizepalestine( false );
|
308
310
|
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( false );
|
311
|
+
const khameneiIrPalestineSpecialPageScraper = await khameneiIrPalestineSpecialPage( false );
|
309
312
|
const electronicintifadaScraper = await electronicintifada( false );
|
310
313
|
const standWithPalestineScraper = await standWithPalestine( false );
|
311
314
|
const mondoweisScraper = await mondoweiss( true );
|
@@ -316,8 +319,10 @@ void async function main ()
|
|
316
319
|
palianswersScraper,
|
317
320
|
decolonizepalestineScraper,
|
318
321
|
khameneiIrFreePalestineTagScraper,
|
322
|
+
khameneiIrPalestineSpecialPageScraper,
|
319
323
|
electronicintifadaScraper,
|
320
324
|
standWithPalestineScraper,
|
321
325
|
mondoweisScraper
|
322
326
|
] );
|
327
|
+
// QLoRA = LoRA with 4-bit quantization
|
323
328
|
}();
|
package/main.js
CHANGED
@@ -115,6 +115,7 @@ class WebScraper
|
|
115
115
|
{
|
116
116
|
return;
|
117
117
|
}
|
118
|
+
let originalUrl = url;
|
118
119
|
if ( this.removeURLFragment )
|
119
120
|
{
|
120
121
|
url = url.split( "#" )[0];
|
@@ -125,6 +126,7 @@ class WebScraper
|
|
125
126
|
return;
|
126
127
|
}
|
127
128
|
this.visited.add( url );
|
129
|
+
this.visited.add( originalUrl );
|
128
130
|
if ( !this.isValidFileType( url ) || !this.isValidDomain( url ) )
|
129
131
|
{
|
130
132
|
return;
|
@@ -150,7 +152,7 @@ class WebScraper
|
|
150
152
|
{
|
151
153
|
if ( this.hasValidPageContent( article.textContent ) )
|
152
154
|
{
|
153
|
-
const metadata = this.extractMetadata( url, document );
|
155
|
+
const metadata = this.extractMetadata( url, document, data );
|
154
156
|
metadata.articleTitle = article.title || "";
|
155
157
|
this.saveArticle( url, article.textContent, metadata );
|
156
158
|
}
|
@@ -164,6 +166,10 @@ class WebScraper
|
|
164
166
|
console.error( `No readable content found at ${url}` );
|
165
167
|
}
|
166
168
|
}
|
169
|
+
else
|
170
|
+
{
|
171
|
+
console.log( `Skipping excluded URL: ${url}` );
|
172
|
+
}
|
167
173
|
const links = this.extractLinks( data );
|
168
174
|
const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
|
169
175
|
for ( const link of unvisitedLinks )
|
@@ -334,6 +340,7 @@ class WebScraper
|
|
334
340
|
fs.mkdirSync( dir, { recursive: true });
|
335
341
|
fs.writeFileSync( `${filePath}.txt`, processedContent, "utf-8" );
|
336
342
|
fs.writeFileSync( `${filePath}.json`, JSON.stringify( metadata, null, 2 ), "utf-8" );
|
343
|
+
fs.writeFileSync( `${filePath}.html`, metadata.originalHtml, "utf-8" );
|
337
344
|
console.log( `Saved: ${filePath}.txt` );
|
338
345
|
console.log( `Saved: ${filePath}.json` );
|
339
346
|
}
|
@@ -503,7 +510,7 @@ class WebScraper
|
|
503
510
|
return filteredMetadata;
|
504
511
|
}
|
505
512
|
|
506
|
-
extractMetadata ( url, document )
|
513
|
+
extractMetadata ( url, document, html )
|
507
514
|
{
|
508
515
|
return {
|
509
516
|
url,
|
@@ -517,7 +524,8 @@ class WebScraper
|
|
517
524
|
ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
|
518
525
|
ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
|
519
526
|
ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
|
520
|
-
dateScrapedDate: new Date().toISOString()
|
527
|
+
dateScrapedDate: new Date().toISOString(),
|
528
|
+
originalHtml: html,
|
521
529
|
};
|
522
530
|
}
|
523
531
|
|
@@ -578,25 +586,36 @@ class WebScraper
|
|
578
586
|
normalizeExcludeList ( list = [] )
|
579
587
|
{
|
580
588
|
const normalizedSet = new Set();
|
581
|
-
|
589
|
+
|
590
|
+
for ( const item of list )
|
582
591
|
{
|
583
|
-
|
584
|
-
if ( item.endsWith( "/" ) )
|
585
|
-
{
|
586
|
-
normalizedSet.add( item.slice( 0, -1 ) );
|
587
|
-
}
|
588
|
-
else
|
592
|
+
if ( item instanceof RegExp )
|
589
593
|
{
|
590
594
|
normalizedSet.add( item );
|
595
|
+
continue;
|
591
596
|
}
|
592
|
-
|
597
|
+
|
598
|
+
const withSlash = item.endsWith( "/" ) ? item : `${item }/`;
|
599
|
+
const withoutSlash = item.endsWith( "/" ) ? item.slice( 0, -1 ) : item;
|
600
|
+
|
601
|
+
normalizedSet.add( withSlash );
|
602
|
+
normalizedSet.add( withoutSlash );
|
593
603
|
}
|
604
|
+
|
594
605
|
return normalizedSet;
|
595
606
|
}
|
596
607
|
|
608
|
+
|
597
609
|
isExcluded ( url )
|
598
610
|
{
|
599
|
-
if ( this.exactExcludeList.
|
611
|
+
if ( Array.from( this.exactExcludeList ).some( excluded =>
|
612
|
+
{
|
613
|
+
if ( excluded instanceof RegExp )
|
614
|
+
{
|
615
|
+
return excluded.test( url );
|
616
|
+
}
|
617
|
+
return url === excluded;
|
618
|
+
}) )
|
600
619
|
{
|
601
620
|
return true;
|
602
621
|
}
|