clean-web-scraper 2.2.0 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -13,6 +13,7 @@ A powerful Node.js web scraper that extracts clean, readable content from websit
  - 🎯 No duplicate page visits
  - 📊 Generates JSONL output file for ML training
  - 📊 AI-friendly clean text and csv output (perfect for LLM fine-tuning!)
+ - 📊 Rich metadata extraction including:

  ## 🛠️ Prerequisites

@@ -44,10 +45,11 @@ const WebScraper = require('clean-web-scraper');

  const scraper = new WebScraper({
    baseURL: 'https://example.com', // Required: The website to scrape
-   scrapResultPath: './output', // Required: Where to save the content
+   startURL: 'https://example.com/blog', // Optional: Custom starting URL
    excludeList: ['/admin', '/private'], // Optional: Paths to exclude
    exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
-   jsonlPath: 'output.jsonl', // Optional: Custom JSONL output path
+   scrapResultPath: './dataset', // Required: Where to save the content
+   jsonlPath: './dataset/train.jsonl', // Optional: Custom JSONL output path
    textOutputPath: "./dataset/texts", // Optional: Custom text output path
    csvPath: "./dataset/train.csv" // Optional: Custom CSV output path
  });
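Note that `scrapResultPath` is a base directory: per the constructor in `package/src/WebScraper.js` (later in this diff), the scraped site's host name is appended after stripping the protocol and `www.`:

```js
// How the output directory is derived from baseURL (constructor logic):
"https://www.example.com/".replace( /^(https?:\/\/)?(www\.)?/, "" ).replace( /\/$/, "" );
// => "example.com", so content lands under "./dataset/example.com"
```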
@@ -70,6 +72,21 @@ Your AI-ready content is saved in a clean, structured format:
  - 📊 JSONL output for ML training
  - 📈 CSV output with clean text content

+ ```bash
+ dataset/
+ ├── example.com/
+ │   ├── page1.txt       # Clean text content
+ │   ├── page1.json      # Full metadata
+ │   ├── blog/
+ │   │   ├── post1.txt
+ │   │   ├── post1.json
+ │   ├── texts/          # Numbered text files
+ │   │   ├── 1.txt
+ │   │   ├── 2.txt
+ │   ├── train.jsonl     # Combined content
+ │   └── train.csv       # Clean text in CSV format
+ ```
+
  ## 🤖 AI/LLM Training Ready

  The output is specifically formatted for AI training purposes:
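Note: with this release each JSONL record gains a `metadata` field next to `text` (see `saveArticle` and `metadataextractor` in `package/src/WebScraper.js` below). A sketch of the resulting record shape; the values here are illustrative only:

```js
// Shape of one train.jsonl line as of 2.3.0 — values are illustrative.
// Fields mirror what metadataextractor() returns; keys a page leaves
// undefined (e.g. a missing meta tag) are dropped by JSON.stringify.
const record = {
  text: "Clean article text extracted by Readability...",
  metadata: {
    url: "https://example.com/blog/post1",
    title: "Post title",
    description: "Meta description",
    keywords: "example, keywords",
    author: "Author Name",
    lastModified: "Mon, 06 Jan 2025 10:00:00 GMT", // from response headers
    contentType: "text/html; charset=utf-8",
    contentLength: "51234",
    language: "en",
    canonicalUrl: "https://example.com/blog/post1",
    ogTags: { title: "Post title", description: "...", image: "...", type: "article" },
    dateScraped: "2025-01-06T10:00:00.000Z"
  }
};
```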
package/example-usage.js CHANGED
@@ -1,28 +1,62 @@
  const WebScraper = require( "./src/WebScraper" );

- // Configuration
- const baseURL = "https://decolonizepalestine.com";
- const scrapResultPath = "./dataset";
- const excludeList = [
-   "https://decolonizepalestine.com/cdn-cgi",
-   "https://decolonizepalestine.com/introduction-to-palestine",
-   "https://decolonizepalestine.com/myths",
-   "https://decolonizepalestine.com/reading-list",
-   "https://decolonizepalestine.com/support-us"
- ];
- const exactExcludeList = [
-   "https://decolonizepalestine.com/rainbow-washing",
-   "https://decolonizepalestine.com/"
- ]
-
- // Initialize scraper with all available options
- const scraper = new WebScraper({
-   baseURL,
-   scrapResultPath,
-   excludeList,
-   exactExcludeList,
-   jsonlPath: "./dataset/train.jsonl",
-   textOutputPath: "./dataset/texts",
-   csvPath: "./dataset/train.csv"
- });
- scraper.start();
+
+ async function khameneiIrFreePalestineTag ()
+ {
+   // 1
+   // https://english.khamenei.ir/Opinions/FreePalestine
+   // https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#
+   const scraper = new WebScraper({
+     baseURL: "https://english.khamenei.ir/news",
+     startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
+     excludeList: [
+     ],
+     exactExcludeList: [
+     ],
+     scrapResultPath: "./dataset/khamenei-ir-free-palestine-tag",
+     jsonlPath: "./dataset/khamenei-ir-free-palestine-tag/train.jsonl",
+     textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
+     csvPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv"
+   });
+   await scraper.start();
+ }
+
+ // decolonizepalestine
+ async function decolonizepalestine ()
+ {
+   // 2
+   // https://decolonizepalestine.com
+   const scraper = new WebScraper({
+     baseURL: "https://decolonizepalestine.com",
+     excludeList: [
+       "https://decolonizepalestine.com/cdn-cgi",
+       "https://decolonizepalestine.com/introduction-to-palestine",
+       "https://decolonizepalestine.com/myths",
+       "https://decolonizepalestine.com/reading-list",
+       "https://decolonizepalestine.com/support-us"
+     ],
+     exactExcludeList: [
+       "https://decolonizepalestine.com/rainbow-washing",
+       "https://decolonizepalestine.com/"
+     ],
+     scrapResultPath: "./dataset/decolonizepalestine",
+     jsonlPath: "./dataset/decolonizepalestine/train.jsonl",
+     textOutputPath: "./dataset/decolonizepalestine/texts",
+     csvPath: "./dataset/decolonizepalestine/train.csv"
+   });
+   await scraper.start();
+ }
+
+ void async function main ()
+ {
+   await khameneiIrFreePalestineTag();
+   // await decolonizepalestine();
+
+
+   // 3
+   // https://bdsmovement.net
+
+   // 4
+   // https://electronicintifada.net/
+ }()
+
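The numbered comments mark queued targets (3, 4). A new target would follow the same per-site function pattern; a hypothetical sketch for target 3 — this function does not exist in 2.3.0, and every path and option value below is a placeholder:

```js
// Hypothetical sketch only — bdsmovement.net is listed as target 3 above,
// but this function is not part of this release; all paths are placeholders.
async function bdsMovement ()
{
  const scraper = new WebScraper({
    baseURL: "https://bdsmovement.net",
    scrapResultPath: "./dataset/bdsmovement",
    jsonlPath: "./dataset/bdsmovement/train.jsonl",
    textOutputPath: "./dataset/bdsmovement/texts",
    csvPath: "./dataset/bdsmovement/train.csv"
  });
  await scraper.start();
}
```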
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "clean-web-scraper",
-   "version": "2.2.0",
+   "version": "2.3.0",
    "main": "main.js",
    "scripts": {
      "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -9,6 +9,7 @@ class WebScraper
  {
    constructor ({
      baseURL,
+     startURL,
      excludeList,
      exactExcludeList,
      scrapResultPath = "./dataset",
@@ -18,6 +19,7 @@ class WebScraper
    })
    {
      this.baseURL = baseURL;
+     this.startURL = startURL || baseURL;
      this.scrapResultPath = path.join( scrapResultPath, baseURL.replace( /^(https?:\/\/)?(www\.)?/, "" ).replace( /\/$/, "" ) );
      this.jsonlPath = jsonlPath || path.join( this.scrapResultPath, "train.jsonl" );
      this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
@@ -25,14 +27,13 @@ class WebScraper
      this.visited = new Set();
      this.excludeList = new Set( excludeList );
      this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
-     this.processedContent = []; // Add this line
+     this.allProcessedContent = []; // Add this line
      this.createOutputDirectory();
    }

    async start ()
    {
-     this.visited.add( this.baseURL );
-     await this.fetchPage( this.baseURL );
+     await this.fetchPage( this.startURL );
      this.createJSONLFile();
      this.saveNumberedTextFiles();
      this.createCSVFile();
@@ -41,20 +42,22 @@

    async fetchPage ( url )
    {
+     this.visited.add( url );
      try
      {
-       const { data } = await axios.get( url );
+       const { data, headers } = await axios.get( url );
        const dom = new JSDOM( data, { url });
+       const { document } = dom.window;

-       // Only save if the URL is not excluded
        if ( !this.isExcluded( url ) )
        {
-         const reader = new Readability( dom.window.document, { charThreshold: 500, nbTopCandidates: 20 });
+         const reader = new Readability( document, { charThreshold: 500, nbTopCandidates: 20 });
          const article = reader.parse();

          if ( article )
          {
-           this.saveArticle( url, article.textContent );
+           const metadata = this.metadataextractor( url, document, headers );
+           this.saveArticle( url, article.textContent, metadata );
          }
          else
          {
@@ -67,7 +70,6 @@
      {
        if ( !this.visited.has( link ) )
        {
-         this.visited.add( link );
          await this.fetchPage( link );
        }
      }
@@ -87,6 +89,10 @@
      while ( ( match = regex.exec( data ) ) !== null )
      {
        let href = match[2];
+       if ( href.startsWith( "/" ) )
+       {
+         href = new URL( href, this.baseURL ).href
+       }
        if ( href.endsWith( "/" ) )
        {
          href = href.slice( 0, -1 );
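Note on the relocated relative-link handling (this hunk and the next): root-relative hrefs are now made absolute before the trailing-slash trim, so a relative and an absolute spelling of the same page collapse to one entry in the link set. This relies on standard WHATWG URL resolution:

```js
// Root-relative hrefs resolve against baseURL before the trailing slash
// is trimmed, so "/blog/" and "https://example.com/blog" dedupe to one URL.
new URL( "/blog/", "https://example.com" ).href; // => "https://example.com/blog/"
```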
@@ -95,21 +101,17 @@
        {
          links.add( href );
        }
-       else if ( href.startsWith( "/" ) )
-       {
-         links.add( new URL( href, this.baseURL ).href );
-       }
      }
-
      return links;
    }

-   saveArticle ( url, content )
+   saveArticle ( url, content, metadata )
    {
      const processedContent = this.processContent( content );

-     this.processedContent.push({
-       text: processedContent.trim()
+     this.allProcessedContent.push({
+       text: processedContent.trim(),
+       metadata
      });

      let urlPath = new URL( url ).pathname;
@@ -120,14 +122,6 @@
      const filePath = path.join( __dirname, this.scrapResultPath, urlPath );
      const dir = path.dirname( filePath );

-     // Create metadata object
-     const metadata = {
-       url,
-       dateScraped: new Date().toISOString(),
-       contentLength: processedContent.length,
-       fileName: `${path.basename( filePath )}.txt`
-     };
-
      // Create directory if it doesn't exist
      fs.mkdirSync( dir, { recursive: true });

@@ -145,7 +139,7 @@
    {
      const writeStream = fs.createWriteStream( path.join( __dirname, this.jsonlPath ) );

-     for ( const content of this.processedContent )
+     for ( const content of this.allProcessedContent )
      {
        const jsonLine = `${JSON.stringify( content )}\n`;
        writeStream.write( jsonLine );
@@ -161,7 +155,7 @@

      writeStream.write( "text\n" );

-     for ( const content of this.processedContent )
+     for ( const content of this.allProcessedContent )
      {
        const escapedText = content.text.replace( /"/g, "\"\"" );
        const csvLine = `"${escapedText}"\n`;
@@ -174,7 +168,7 @@

    saveNumberedTextFiles ()
    {
-     this.processedContent.forEach( ( content, index ) =>
+     this.allProcessedContent.forEach( ( content, index ) =>
      {
        const fileName = `${index + 1}.txt`;
        const filePath = path.join( __dirname, this.textOutputPath, fileName );
@@ -205,6 +199,29 @@
      return processed;
    }

+   metadataextractor ( url, document, headers )
+   {
+     return {
+       url,
+       title: document.title,
+       description: document.querySelector( "meta[name=\"description\"]" )?.content,
+       keywords: document.querySelector( "meta[name=\"keywords\"]" )?.content,
+       author: document.querySelector( "meta[name=\"author\"]" )?.content,
+       lastModified: headers["last-modified"],
+       contentType: headers["content-type"],
+       contentLength: headers["content-length"],
+       language: document.documentElement.lang || document.querySelector( "html" )?.getAttribute( "lang" ),
+       canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
+       ogTags: {
+         title: document.querySelector( "meta[property=\"og:title\"]" )?.content,
+         description: document.querySelector( "meta[property=\"og:description\"]" )?.content,
+         image: document.querySelector( "meta[property=\"og:image\"]" )?.content,
+         type: document.querySelector( "meta[property=\"og:type\"]" )?.content
+       },
+       dateScraped: new Date().toISOString()
+     };
+   }
+
    normalizeExcludeList ( list )
    {
      const normalizedSet = new Set();
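End-of-diff note: since each record now carries `metadata`, the JSONL output can be filtered before training. A minimal consumer sketch — not part of the package; the path below assumes the README's default layout:

```js
// Minimal sketch of consuming train.jsonl downstream — not part of the
// package. The path assumes the default layout; adjust to your scrapResultPath.
const fs = require( "fs" );

const records = fs
  .readFileSync( "./dataset/example.com/train.jsonl", "utf8" )
  .split( "\n" )
  .filter( ( line ) => line.trim() !== "" )
  .map( ( line ) => JSON.parse( line ) );

// Example: keep only pages whose <html lang> starts with "en".
const english = records.filter( ( r ) => ( r.metadata?.language || "" ).startsWith( "en" ) );
console.log( `kept ${english.length} of ${records.length} records` );
```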