clean-web-scraper 4.1.1 → 4.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/example-usage.js +31 -25
- package/main.js +29 -12
- package/package.json +1 -1
package/README.md
CHANGED
@@ -48,7 +48,8 @@ const scraper = new WebScraper({
|
|
48
48
|
baseURL: 'https://example.com/news', // Required: The website base url to scrape
|
49
49
|
startURL: 'https://example.com/blog', // Optional: Custom starting URL
|
50
50
|
excludeList: ['/admin', '/private'], // Optional: Paths to exclude
|
51
|
-
exactExcludeList: ['/specific-page'
|
51
|
+
exactExcludeList: ['/specific-page', // Optional: Exact URLs to exclude
|
52
|
+
/^https:\/\/host\.com\/\d{4}\/$/], // Optional: Regex patterns to exclude. This will exclude URLs like https://host.com/2023/
|
52
53
|
scrapResultPath: './example.com/website', // Required: Where to save the content
|
53
54
|
jsonlOutputPath: './example.com/train.jsonl', // Optional: Custom JSONL output path
|
54
55
|
textOutputPath: "./example.com/texts", // Optional: Custom text output path
|
package/example-usage.js
CHANGED
@@ -65,6 +65,27 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
65
65
|
return await runScraper( config, enable );
|
66
66
|
}
|
67
67
|
|
68
|
+
async function khameneiIrPalestineSpecialPage ( enable )
|
69
|
+
{
|
70
|
+
// https://english.khamenei.ir/palestine-special-page/
|
71
|
+
const config = {
|
72
|
+
baseURL: "https://english.khamenei.ir/news",
|
73
|
+
startURL: "https://english.khamenei.ir/palestine-special-page",
|
74
|
+
maxDepth: 2,
|
75
|
+
exactExcludeList: [
|
76
|
+
"https://english.khamenei.ir/palestine-special-page/"
|
77
|
+
],
|
78
|
+
scrapResultPath: "./dataset/khamenei-ir-palestine-special-page/website",
|
79
|
+
jsonlOutputPath: "./dataset/khamenei-ir-palestine-special-page/train.jsonl",
|
80
|
+
textOutputPath: "./dataset/khamenei-ir-palestine-special-page/texts",
|
81
|
+
csvOutputPath: "./dataset/khamenei-ir-palestine-special-page/train.csv",
|
82
|
+
includeMetadata: true,
|
83
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
|
84
|
+
axiosRetryDelay: 10000
|
85
|
+
};
|
86
|
+
return await runScraper( config, enable );
|
87
|
+
}
|
88
|
+
|
68
89
|
async function decolonizepalestine ( enable )
|
69
90
|
{
|
70
91
|
const config = {
|
@@ -187,26 +208,8 @@ async function mondoweiss ( enable )
|
|
187
208
|
"https://mondoweiss.net/activism/",
|
188
209
|
"https://mondoweiss.net/news-letters/",
|
189
210
|
"https://mondoweiss.net/newsletters",
|
190
|
-
|
191
|
-
|
192
|
-
"https://mondoweiss.net/2008/",
|
193
|
-
"https://mondoweiss.net/2009/",
|
194
|
-
"https://mondoweiss.net/2010/",
|
195
|
-
"https://mondoweiss.net/2011/",
|
196
|
-
"https://mondoweiss.net/2012/",
|
197
|
-
"https://mondoweiss.net/2013/",
|
198
|
-
"https://mondoweiss.net/2014/",
|
199
|
-
"https://mondoweiss.net/2015/",
|
200
|
-
"https://mondoweiss.net/2016/",
|
201
|
-
"https://mondoweiss.net/2017/",
|
202
|
-
"https://mondoweiss.net/2018/",
|
203
|
-
"https://mondoweiss.net/2019/",
|
204
|
-
"https://mondoweiss.net/2020/",
|
205
|
-
"https://mondoweiss.net/2021/",
|
206
|
-
"https://mondoweiss.net/2022/",
|
207
|
-
"https://mondoweiss.net/2023/",
|
208
|
-
"https://mondoweiss.net/2024/",
|
209
|
-
"https://mondoweiss.net/2025/",
|
211
|
+
/^https:\/\/mondoweiss\.net\/\d{4}\/\d{2}$/,
|
212
|
+
/^https:\/\/mondoweiss\.net\/\d{4}\/$/,
|
210
213
|
"https://mondoweiss.net/daily-headlines",
|
211
214
|
"https://mondoweiss.net/palestineletter",
|
212
215
|
"https://mondoweiss.net/podcasts/",
|
@@ -303,11 +306,12 @@ async function palestineremembered ( enable )
|
|
303
306
|
|
304
307
|
void async function main ()
|
305
308
|
{
|
306
|
-
const palianswersScraper = await palianswers(
|
307
|
-
const decolonizepalestineScraper = await decolonizepalestine(
|
308
|
-
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag(
|
309
|
-
const
|
310
|
-
const
|
309
|
+
const palianswersScraper = await palianswers( true );
|
310
|
+
const decolonizepalestineScraper = await decolonizepalestine( true );
|
311
|
+
const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag( true );
|
312
|
+
const khameneiIrPalestineSpecialPageScraper = await khameneiIrPalestineSpecialPage( true );
|
313
|
+
const electronicintifadaScraper = await electronicintifada( true );
|
314
|
+
const standWithPalestineScraper = await standWithPalestine( true );
|
311
315
|
const mondoweisScraper = await mondoweiss( true );
|
312
316
|
const bdsmovementScraper = await bdsmovement( false );
|
313
317
|
const palestinerememberedScraper = await palestineremembered( false );
|
@@ -316,8 +320,10 @@ void async function main ()
|
|
316
320
|
palianswersScraper,
|
317
321
|
decolonizepalestineScraper,
|
318
322
|
khameneiIrFreePalestineTagScraper,
|
323
|
+
khameneiIrPalestineSpecialPageScraper,
|
319
324
|
electronicintifadaScraper,
|
320
325
|
standWithPalestineScraper,
|
321
326
|
mondoweisScraper
|
322
327
|
] );
|
328
|
+
// QLoRA = LoRA with 4-bit quantization
|
323
329
|
}();
|
package/main.js
CHANGED
@@ -152,7 +152,7 @@ class WebScraper
|
|
152
152
|
{
|
153
153
|
if ( this.hasValidPageContent( article.textContent ) )
|
154
154
|
{
|
155
|
-
const metadata = this.extractMetadata( url, document );
|
155
|
+
const metadata = this.extractMetadata( url, document, data );
|
156
156
|
metadata.articleTitle = article.title || "";
|
157
157
|
this.saveArticle( url, article.textContent, metadata );
|
158
158
|
}
|
@@ -166,6 +166,10 @@ class WebScraper
|
|
166
166
|
console.error( `No readable content found at ${url}` );
|
167
167
|
}
|
168
168
|
}
|
169
|
+
else
|
170
|
+
{
|
171
|
+
console.log( `Skipping excluded URL: ${url}` );
|
172
|
+
}
|
169
173
|
const links = this.extractLinks( data );
|
170
174
|
const unvisitedLinks = Array.from( links ).filter( link => { return !this.visited.has( link ) });
|
171
175
|
for ( const link of unvisitedLinks )
|
@@ -336,6 +340,7 @@ class WebScraper
|
|
336
340
|
fs.mkdirSync( dir, { recursive: true });
|
337
341
|
fs.writeFileSync( `${filePath}.txt`, processedContent, "utf-8" );
|
338
342
|
fs.writeFileSync( `${filePath}.json`, JSON.stringify( metadata, null, 2 ), "utf-8" );
|
343
|
+
fs.writeFileSync( `${filePath}.html`, metadata.originalHtml, "utf-8" );
|
339
344
|
console.log( `Saved: ${filePath}.txt` );
|
340
345
|
console.log( `Saved: ${filePath}.json` );
|
341
346
|
}
|
@@ -505,7 +510,7 @@ class WebScraper
|
|
505
510
|
return filteredMetadata;
|
506
511
|
}
|
507
512
|
|
508
|
-
extractMetadata ( url, document )
|
513
|
+
extractMetadata ( url, document, html )
|
509
514
|
{
|
510
515
|
return {
|
511
516
|
url,
|
@@ -519,7 +524,8 @@ class WebScraper
|
|
519
524
|
ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
|
520
525
|
ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
|
521
526
|
ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
|
522
|
-
dateScrapedDate: new Date().toISOString()
|
527
|
+
dateScrapedDate: new Date().toISOString(),
|
528
|
+
originalHtml: html,
|
523
529
|
};
|
524
530
|
}
|
525
531
|
|
@@ -580,25 +586,36 @@ class WebScraper
|
|
580
586
|
normalizeExcludeList ( list = [] )
|
581
587
|
{
|
582
588
|
const normalizedSet = new Set();
|
583
|
-
|
589
|
+
|
590
|
+
for ( const item of list )
|
584
591
|
{
|
585
|
-
|
586
|
-
if ( item.endsWith( "/" ) )
|
587
|
-
{
|
588
|
-
normalizedSet.add( item.slice( 0, -1 ) );
|
589
|
-
}
|
590
|
-
else
|
592
|
+
if ( item instanceof RegExp )
|
591
593
|
{
|
592
594
|
normalizedSet.add( item );
|
595
|
+
continue;
|
593
596
|
}
|
594
|
-
|
597
|
+
|
598
|
+
const withSlash = item.endsWith( "/" ) ? item : `${item }/`;
|
599
|
+
const withoutSlash = item.endsWith( "/" ) ? item.slice( 0, -1 ) : item;
|
600
|
+
|
601
|
+
normalizedSet.add( withSlash );
|
602
|
+
normalizedSet.add( withoutSlash );
|
595
603
|
}
|
604
|
+
|
596
605
|
return normalizedSet;
|
597
606
|
}
|
598
607
|
|
608
|
+
|
599
609
|
isExcluded ( url )
|
600
610
|
{
|
601
|
-
if ( this.exactExcludeList.
|
611
|
+
if ( Array.from( this.exactExcludeList ).some( excluded =>
|
612
|
+
{
|
613
|
+
if ( excluded instanceof RegExp )
|
614
|
+
{
|
615
|
+
return excluded.test( url );
|
616
|
+
}
|
617
|
+
return url === excluded;
|
618
|
+
}) )
|
602
619
|
{
|
603
620
|
return true;
|
604
621
|
}
|