clean-web-scraper 2.2.0 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +19 -2
- package/example-usage.js +60 -26
- package/package.json +1 -1
- package/src/WebScraper.js +44 -27
package/README.md
CHANGED
@@ -13,6 +13,7 @@ A powerful Node.js web scraper that extracts clean, readable content from websites
 - 🎯 No duplicate page visits
 - 📊 Generates JSONL output file for ML training
 - 📊 AI-friendly clean text and csv output (perfect for LLM fine-tuning!)
+- 📊 Rich metadata extraction including:
 
 ## 🛠️ Prerequisites
 
@@ -44,10 +45,11 @@ const WebScraper = require('clean-web-scraper');
 
 const scraper = new WebScraper({
   baseURL: 'https://example.com', // Required: The website to scrape
-
+  startURL: 'https://example.com/blog', // Optional: Custom starting URL
   excludeList: ['/admin', '/private'], // Optional: Paths to exclude
   exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
-
+  scrapResultPath: './dataset', // Required: Where to save the content
+  jsonlPath: './dataset/train.jsonl', // Optional: Custom JSONL output path
   textOutputPath: "./dataset/texts", // Optional: Custom text output path
   csvPath: "./dataset/train.csv" // Optional: Custom CSV output path
 });
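For quick reference, a minimal sketch using only a crawl scope and an output root; it assumes the constructor defaults shown in the `src/WebScraper.js` diff below (`startURL` falls back to `baseURL`, and `jsonlPath`/`textOutputPath` fall back to paths under `scrapResultPath`):

```js
const WebScraper = require( "clean-web-scraper" );

// Minimal sketch — omitted options fall back to defaults derived from scrapResultPath.
const scraper = new WebScraper({
    baseURL: "https://example.com",  // crawl scope and default start page
    scrapResultPath: "./dataset"     // output root; the site host is appended internally
});

scraper.start().then( () => console.log( "Scraping finished" ) );
```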
@@ -70,6 +72,21 @@ Your AI-ready content is saved in a clean, structured format:
 - 📊 JSONL output for ML training
 - 📈 CSV output with clean text content
 
+```bash
+dataset/
+├── example.com/
+│   ├── page1.txt       # Clean text content
+│   ├── page1.json      # Full metadata
+│   ├── blog/
+│   │   ├── post1.txt
+│   │   ├── post1.json
+│   ├── texts/          # Numbered text files
+│   │   ├── 1.txt
+│   │   ├── 2.txt
+│   ├── train.jsonl     # Combined content
+│   └── train.csv       # Clean text in CSV format
+```
+
 ## 🤖 AI/LLM Training Ready
 
 The output is specifically formatted for AI training purposes:
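As a rough illustration of that claim, each line of the generated `train.jsonl` is a standalone JSON object with a `text` field and, from this release on, a `metadata` object, so it can be streamed line by line. A small consumption sketch (the dataset path is illustrative):

```js
const fs = require( "fs" );

// Sketch: read the scraper's JSONL output line by line (path is illustrative).
const lines = fs.readFileSync( "./dataset/example.com/train.jsonl", "utf8" )
    .split( "\n" )
    .filter( Boolean );

for ( const line of lines )
{
    const { text, metadata } = JSON.parse( line ); // one record per line
    console.log( metadata.url, text.length );
}
```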
package/example-usage.js
CHANGED
@@ -1,28 +1,62 @@
 const WebScraper = require( "./src/WebScraper" );
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+async function khameneiIrFreePalestineTag ()
+{
+    // 1
+    // https://english.khamenei.ir/Opinions/FreePalestine
+    // https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#
+    const scraper = new WebScraper({
+        baseURL: "https://english.khamenei.ir/news",
+        startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
+        excludeList: [
+        ],
+        exactExcludeList: [
+        ],
+        scrapResultPath: "./dataset/khamenei-ir-free-palestine-tag",
+        jsonlPath: "./dataset/khamenei-ir-free-palestine-tag/train.jsonl",
+        textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
+        csvPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv"
+    });
+    await scraper.start();
+}
+
+// decolonizepalestine
+async function decolonizepalestine ()
+{
+    // 2
+    // https://decolonizepalestine.com
+    const scraper = new WebScraper({
+        baseURL: "https://decolonizepalestine.com",
+        excludeList: [
+            "https://decolonizepalestine.com/cdn-cgi",
+            "https://decolonizepalestine.com/introduction-to-palestine",
+            "https://decolonizepalestine.com/myths",
+            "https://decolonizepalestine.com/reading-list",
+            "https://decolonizepalestine.com/support-us"
+        ],
+        exactExcludeList: [
+            "https://decolonizepalestine.com/rainbow-washing",
+            "https://decolonizepalestine.com/"
+        ],
+        scrapResultPath: "./dataset/decolonizepalestine",
+        jsonlPath: "./dataset/decolonizepalestine/train.jsonl",
+        textOutputPath: "./dataset/decolonizepalestine/texts",
+        csvPath: "./dataset/decolonizepalestine/train.csv"
+    });
+    await scraper.start();
+}
+
+void async function main ()
+{
+    await khameneiIrFreePalestineTag();
+    // await decolonizepalestine();
+
+
+    // 3
+    // https://bdsmovement.net
+
+    // 4
+    // https://electronicintifada.net/
+}()
+
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
@@ -9,6 +9,7 @@ class WebScraper
 {
     constructor ({
         baseURL,
+        startURL,
         excludeList,
         exactExcludeList,
         scrapResultPath = "./dataset",
@@ -18,6 +19,7 @@ class WebScraper
     })
     {
         this.baseURL = baseURL;
+        this.startURL = startURL || baseURL;
         this.scrapResultPath = path.join( scrapResultPath, baseURL.replace( /^(https?:\/\/)?(www\.)?/, "" ).replace( /\/$/, "" ) );
         this.jsonlPath = jsonlPath || path.join( this.scrapResultPath, "train.jsonl" );
        this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
@@ -25,14 +27,13 @@ class WebScraper
         this.visited = new Set();
         this.excludeList = new Set( excludeList );
         this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
-        this.
+        this.allProcessedContent = []; // Add this line
         this.createOutputDirectory();
     }
 
     async start ()
     {
-        this.
-        await this.fetchPage( this.baseURL );
+        await this.fetchPage( this.startURL );
         this.createJSONLFile();
         this.saveNumberedTextFiles();
         this.createCSVFile();
@@ -41,20 +42,22 @@ class WebScraper
 
     async fetchPage ( url )
     {
+        this.visited.add( url );
         try
         {
-            const { data } = await axios.get( url );
+            const { data, headers } = await axios.get( url );
             const dom = new JSDOM( data, { url });
+            const { document } = dom.window;
 
-            // Only save if the URL is not excluded
             if ( !this.isExcluded( url ) )
             {
-                const reader = new Readability(
+                const reader = new Readability( document, { charThreshold: 500, nbTopCandidates: 20 });
                 const article = reader.parse();
 
                 if ( article )
                 {
-                    this.
+                    const metadata = this.metadataextractor( url, document, headers );
+                    this.saveArticle( url, article.textContent, metadata );
                 }
                 else
                 {
@@ -67,7 +70,6 @@ class WebScraper
         {
             if ( !this.visited.has( link ) )
             {
-                this.visited.add( link );
                 await this.fetchPage( link );
             }
         }
@@ -87,6 +89,10 @@ class WebScraper
         while ( ( match = regex.exec( data ) ) !== null )
         {
             let href = match[2];
+            if ( href.startsWith( "/" ) )
+            {
+                href = new URL( href, this.baseURL ).href
+            }
             if ( href.endsWith( "/" ) )
             {
                 href = href.slice( 0, -1 );
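With this change, root-relative hrefs are resolved against `baseURL` as soon as they are matched, before the trailing-slash normalization that follows. A small sketch of the resulting normalization, with illustrative values:

```js
// Sketch of the href normalization performed in extractLinks (values are illustrative).
const baseURL = "https://example.com";
let href = "/blog/";

if ( href.startsWith( "/" ) )
{
    href = new URL( href, baseURL ).href; // "https://example.com/blog/"
}
if ( href.endsWith( "/" ) )
{
    href = href.slice( 0, -1 );           // "https://example.com/blog"
}
```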
@@ -95,21 +101,17 @@ class WebScraper
             {
                 links.add( href );
             }
-            else if ( href.startsWith( "/" ) )
-            {
-                links.add( new URL( href, this.baseURL ).href );
-            }
         }
-
         return links;
     }
 
-    saveArticle ( url, content )
+    saveArticle ( url, content, metadata )
     {
         const processedContent = this.processContent( content );
 
-        this.
-            text: processedContent.trim()
+        this.allProcessedContent.push({
+            text: processedContent.trim(),
+            metadata
         });
 
         let urlPath = new URL( url ).pathname;
@@ -120,14 +122,6 @@ class WebScraper
         const filePath = path.join( __dirname, this.scrapResultPath, urlPath );
         const dir = path.dirname( filePath );
 
-        // Create metadata object
-        const metadata = {
-            url,
-            dateScraped: new Date().toISOString(),
-            contentLength: processedContent.length,
-            fileName: `${path.basename( filePath )}.txt`
-        };
-
         // Create directory if it doesn't exist
         fs.mkdirSync( dir, { recursive: true });
 
@@ -145,7 +139,7 @@ class WebScraper
     {
         const writeStream = fs.createWriteStream( path.join( __dirname, this.jsonlPath ) );
 
-        for ( const content of this.
+        for ( const content of this.allProcessedContent )
         {
             const jsonLine = `${JSON.stringify( content )}\n`;
             writeStream.write( jsonLine );
@@ -161,7 +155,7 @@ class WebScraper
 
         writeStream.write( "text\n" );
 
-        for ( const content of this.
+        for ( const content of this.allProcessedContent )
         {
             const escapedText = content.text.replace( /"/g, "\"\"" );
             const csvLine = `"${escapedText}"\n`;
@@ -174,7 +168,7 @@ class WebScraper
 
     saveNumberedTextFiles ()
     {
-        this.
+        this.allProcessedContent.forEach( ( content, index ) =>
         {
             const fileName = `${index + 1}.txt`;
             const filePath = path.join( __dirname, this.textOutputPath, fileName );
@@ -205,6 +199,29 @@ class WebScraper
         return processed;
     }
 
+    metadataextractor ( url, document, headers )
+    {
+        return {
+            url,
+            title: document.title,
+            description: document.querySelector( "meta[name=\"description\"]" )?.content,
+            keywords: document.querySelector( "meta[name=\"keywords\"]" )?.content,
+            author: document.querySelector( "meta[name=\"author\"]" )?.content,
+            lastModified: headers["last-modified"],
+            contentType: headers["content-type"],
+            contentLength: headers["content-length"],
+            language: document.documentElement.lang || document.querySelector( "html" )?.getAttribute( "lang" ),
+            canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
+            ogTags: {
+                title: document.querySelector( "meta[property=\"og:title\"]" )?.content,
+                description: document.querySelector( "meta[property=\"og:description\"]" )?.content,
+                image: document.querySelector( "meta[property=\"og:image\"]" )?.content,
+                type: document.querySelector( "meta[property=\"og:type\"]" )?.content
+            },
+            dateScraped: new Date().toISOString()
+        };
+    }
+
     normalizeExcludeList ( list )
     {
         const normalizedSet = new Set();
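Together with the `saveArticle` change above, each JSONL record now pairs the cleaned text with this metadata object. A sketch of the shape of one record, with made-up values and only a subset of the fields (fields missing on a page are `undefined` and are dropped by `JSON.stringify`):

```js
// Illustrative shape of one { text, metadata } record written to train.jsonl.
const record = {
    text: "Clean, readable article text…",
    metadata: {
        url: "https://example.com/blog/post1",
        title: "Post 1",
        description: "Example description",
        language: "en",
        contentType: "text/html; charset=utf-8",
        canonicalUrl: "https://example.com/blog/post1",
        ogTags: { title: "Post 1", type: "article" },
        dateScraped: "2024-10-01T00:00:00.000Z"
    }
};
```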