clean-web-scraper 2.2.1 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -13,6 +13,7 @@ A powerful Node.js web scraper that extracts clean, readable content from websit
 - 🎯 No duplicate page visits
 - 📊 Generates JSONL output file for ML training
 - 📊 AI-friendly clean text and csv output (perfect for LLM fine-tuning!)
+- 📊 Rich metadata extraction including:
 
 ## 🛠️ Prerequisites
 
@@ -44,6 +45,7 @@ const WebScraper = require('clean-web-scraper');
 
 const scraper = new WebScraper({
   baseURL: 'https://example.com', // Required: The website to scrape
+  startURL: 'https://example.com/blog', // Optional: Custom starting URL
   excludeList: ['/admin', '/private'], // Optional: Paths to exclude
   exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
   scrapResultPath: './dataset', // Required: Where to save the content
@@ -72,26 +74,17 @@ Your AI-ready content is saved in a clean, structured format:
 
 ```bash
 dataset/
-├── decolonizepalestine.com
-│   ├── faq.json
-│   ├── faq.txt
-│   ├── intro
-│   │   ├── bds-101.json
-│   │   ├── bds-101.txt
-│   ├── myth
-│   │   ├── a-land-without-a-people-for-a-people-without-a-land.json
-│   │   ├── a-land-without-a-people-for-a-people-without-a-land.txt
-└── rainbow-washing
-    ├── bluewashing.json
-│   ├── bluewashing.txt
-├── texts
-│   ├── 1.txt
-│   ├── 2.txt
-│   ├── 3.txt
-│   ├── 4.txt
-│   └── 5.txt
-├── train.csv
-└── train.jsonl
+├── example.com/
+│   ├── page1.txt          # Clean text content
+│   ├── page1.json         # Full metadata
+│   ├── blog/
+│   │   ├── post1.txt
+│   │   ├── post1.json
+│   ├── texts/             # Numbered text files
+│   │   ├── 1.txt
+│   │   ├── 2.txt
+├── train.jsonl            # Combined content
+└── train.csv              # Clean text in CSV format
 ```
 
 ## 🤖 AI/LLM Training Ready
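The headline README change is the new `startURL` option, which decouples where the crawl begins from the `baseURL` that scopes which links are followed and how output is organized. A minimal usage sketch, assuming the package's main export is the `WebScraper` class (the example.com URLs are illustrative, not from the package):

```js
const WebScraper = require( "clean-web-scraper" );

// Begin crawling at a listing page, while still only keeping
// pages that live under baseURL.
const scraper = new WebScraper({
	baseURL: "https://example.com/blog",          // scopes the crawl and the output layout
	startURL: "https://example.com/blog/archive", // entry point; falls back to baseURL
	scrapResultPath: "./dataset"
});

// start() is async: it crawls, then writes texts/, train.jsonl and train.csv.
scraper.start().then( () => console.log( "done" ) );
```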
package/example-usage.js CHANGED
@@ -1,28 +1,62 @@
 const WebScraper = require( "./src/WebScraper" );
 
-// Configuration
-const baseURL = "https://decolonizepalestine.com";
-const scrapResultPath = "./dataset";
-const excludeList = [
-	"https://decolonizepalestine.com/cdn-cgi",
-	"https://decolonizepalestine.com/introduction-to-palestine",
-	"https://decolonizepalestine.com/myths",
-	"https://decolonizepalestine.com/reading-list",
-	"https://decolonizepalestine.com/support-us"
-];
-const exactExcludeList = [
-	"https://decolonizepalestine.com/rainbow-washing",
-	"https://decolonizepalestine.com/"
-]
-
-// Initialize scraper with all available options
-const scraper = new WebScraper({
-	baseURL,
-	scrapResultPath,
-	excludeList,
-	exactExcludeList,
-	jsonlPath: "./dataset/train.jsonl",
-	textOutputPath: "./dataset/texts",
-	csvPath: "./dataset/train.csv"
-});
-scraper.start();
+
+async function khameneiIrFreePalestineTag ()
+{
+	// 1
+	// https://english.khamenei.ir/Opinions/FreePalestine
+	// https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#
+	const scraper = new WebScraper({
+		baseURL: "https://english.khamenei.ir/news",
+		startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
+		excludeList: [
+		],
+		exactExcludeList: [
+		],
+		scrapResultPath: "./dataset/khamenei-ir-free-palestine-tag",
+		jsonlPath: "./dataset/khamenei-ir-free-palestine-tag/train.jsonl",
+		textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
+		csvPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv"
+	});
+	await scraper.start();
+}
+
+// decolonizepalestine
+async function decolonizepalestine ()
+{
+	// 2
+	// https://decolonizepalestine.com
+	const scraper = new WebScraper({
+		baseURL: "https://decolonizepalestine.com",
+		excludeList: [
+			"https://decolonizepalestine.com/cdn-cgi",
+			"https://decolonizepalestine.com/introduction-to-palestine",
+			"https://decolonizepalestine.com/myths",
+			"https://decolonizepalestine.com/reading-list",
+			"https://decolonizepalestine.com/support-us"
+		],
+		exactExcludeList: [
+			"https://decolonizepalestine.com/rainbow-washing",
+			"https://decolonizepalestine.com/"
+		],
+		scrapResultPath: "./dataset/decolonizepalestine",
+		jsonlPath: "./dataset/decolonizepalestine/train.jsonl",
+		textOutputPath: "./dataset/decolonizepalestine/texts",
+		csvPath: "./dataset/decolonizepalestine/train.csv"
+	});
+	await scraper.start();
+}
+
+void async function main ()
+{
+	await khameneiIrFreePalestineTag();
+	// await decolonizepalestine();
+
+
+	// 3
+	// https://bdsmovement.net
+
+	// 4
+	// https://electronicintifada.net/
+}()
+
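Each site now lives in its own async function that awaits `scraper.start()`, so enabling several calls in `main` runs the scrapers strictly one after another, each writing to its own folder under `./dataset`. If one wanted the sites scraped concurrently instead (not what the shipped example does), a hypothetical variant of `main` would be:

```js
// Hypothetical variant, not in the package: run both scrapers at once.
// Safe only because each instance has its own scrapResultPath,
// jsonlPath, textOutputPath and csvPath.
void async function main ()
{
	await Promise.all( [
		khameneiIrFreePalestineTag(),
		decolonizepalestine()
	] );
}();
```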
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "clean-web-scraper",
-  "version": "2.2.1",
+  "version": "2.3.0",
   "main": "main.js",
   "scripts": {
     "start": "node main.js",
package/src/WebScraper.js CHANGED
@@ -9,6 +9,7 @@ class WebScraper
 {
 	constructor ({
 		baseURL,
+		startURL,
 		excludeList,
 		exactExcludeList,
 		scrapResultPath = "./dataset",
@@ -18,6 +19,7 @@ class WebScraper
 	})
 	{
 		this.baseURL = baseURL;
+		this.startURL = startURL || baseURL;
 		this.scrapResultPath = path.join( scrapResultPath, baseURL.replace( /^(https?:\/\/)?(www\.)?/, "" ).replace( /\/$/, "" ) );
 		this.jsonlPath = jsonlPath || path.join( this.scrapResultPath, "train.jsonl" );
 		this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
@@ -25,14 +27,13 @@ class WebScraper
 		this.visited = new Set();
 		this.excludeList = new Set( excludeList );
 		this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
-		this.processedContent = []; // Add this line
+		this.allProcessedContent = []; // Add this line
 		this.createOutputDirectory();
 	}
 
 	async start ()
 	{
-		this.visited.add( this.baseURL );
-		await this.fetchPage( this.baseURL );
+		await this.fetchPage( this.startURL );
 		this.createJSONLFile();
 		this.saveNumberedTextFiles();
 		this.createCSVFile();
@@ -41,20 +42,22 @@ class WebScraper
 
 	async fetchPage ( url )
 	{
+		this.visited.add( url );
 		try
 		{
-			const { data } = await axios.get( url );
+			const { data, headers } = await axios.get( url );
 			const dom = new JSDOM( data, { url });
+			const { document } = dom.window;
 
-			// Only save if the URL is not excluded
 			if ( !this.isExcluded( url ) )
 			{
-				const reader = new Readability( dom.window.document, { charThreshold: 500, nbTopCandidates: 20 });
+				const reader = new Readability( document, { charThreshold: 500, nbTopCandidates: 20 });
 				const article = reader.parse();
 
 				if ( article )
 				{
-					this.saveArticle( url, article.textContent );
+					const metadata = this.metadataextractor( url, document, headers );
+					this.saveArticle( url, article.textContent, metadata );
 				}
 				else
 				{
@@ -67,7 +70,6 @@ class WebScraper
 		{
 			if ( !this.visited.has( link ) )
 			{
-				this.visited.add( link );
 				await this.fetchPage( link );
 			}
 		}
@@ -87,6 +89,10 @@ class WebScraper
 		while ( ( match = regex.exec( data ) ) !== null )
 		{
 			let href = match[2];
+			if ( href.startsWith( "/" ) )
+			{
+				href = new URL( href, this.baseURL ).href
+			}
 			if ( href.endsWith( "/" ) )
 			{
 				href = href.slice( 0, -1 );
@@ -95,21 +101,17 @@ class WebScraper
 			{
 				links.add( href );
 			}
-			else if ( href.startsWith( "/" ) )
-			{
-				links.add( new URL( href, this.baseURL ).href );
-			}
 		}
-
 		return links;
 	}
 
-	saveArticle ( url, content )
+	saveArticle ( url, content, metadata )
 	{
 		const processedContent = this.processContent( content );
 
-		this.processedContent.push({
-			text: processedContent.trim()
+		this.allProcessedContent.push({
+			text: processedContent.trim(),
+			metadata
 		});
 
 		let urlPath = new URL( url ).pathname;
@@ -120,14 +122,6 @@ class WebScraper
 		const filePath = path.join( __dirname, this.scrapResultPath, urlPath );
 		const dir = path.dirname( filePath );
 
-		// Create metadata object
-		const metadata = {
-			url,
-			dateScraped: new Date().toISOString(),
-			contentLength: processedContent.length,
-			fileName: `${path.basename( filePath )}.txt`
-		};
-
 		// Create directory if it doesn't exist
 		fs.mkdirSync( dir, { recursive: true });
 
@@ -145,7 +139,7 @@ class WebScraper
 	{
 		const writeStream = fs.createWriteStream( path.join( __dirname, this.jsonlPath ) );
 
-		for ( const content of this.processedContent )
+		for ( const content of this.allProcessedContent )
 		{
 			const jsonLine = `${JSON.stringify( content )}\n`;
 			writeStream.write( jsonLine );
@@ -161,7 +155,7 @@ class WebScraper
 
 		writeStream.write( "text\n" );
 
-		for ( const content of this.processedContent )
+		for ( const content of this.allProcessedContent )
 		{
 			const escapedText = content.text.replace( /"/g, "\"\"" );
 			const csvLine = `"${escapedText}"\n`;
@@ -174,7 +168,7 @@ class WebScraper
 
 	saveNumberedTextFiles ()
 	{
-		this.processedContent.forEach( ( content, index ) =>
+		this.allProcessedContent.forEach( ( content, index ) =>
 		{
 			const fileName = `${index + 1}.txt`;
 			const filePath = path.join( __dirname, this.textOutputPath, fileName );
@@ -205,6 +199,29 @@ class WebScraper
 		return processed;
 	}
 
+	metadataextractor ( url, document, headers )
+	{
+		return {
+			url,
+			title: document.title,
+			description: document.querySelector( "meta[name=\"description\"]" )?.content,
+			keywords: document.querySelector( "meta[name=\"keywords\"]" )?.content,
+			author: document.querySelector( "meta[name=\"author\"]" )?.content,
+			lastModified: headers["last-modified"],
+			contentType: headers["content-type"],
+			contentLength: headers["content-length"],
+			language: document.documentElement.lang || document.querySelector( "html" )?.getAttribute( "lang" ),
+			canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
+			ogTags: {
+				title: document.querySelector( "meta[property=\"og:title\"]" )?.content,
+				description: document.querySelector( "meta[property=\"og:description\"]" )?.content,
+				image: document.querySelector( "meta[property=\"og:image\"]" )?.content,
+				type: document.querySelector( "meta[property=\"og:type\"]" )?.content
+			},
+			dateScraped: new Date().toISOString()
+		};
+	}
+
 	normalizeExcludeList ( list )
 	{
 		const normalizedSet = new Set();
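With `metadataextractor` in place, every entry pushed to `allProcessedContent`, and therefore every `train.jsonl` line, now carries page metadata next to its text; meta tags that are absent yield `undefined` fields, which `JSON.stringify` silently drops. An illustrative record shape, with invented values but keys mirroring the method above:

```js
const exampleRecord = {
	text: "Clean article text extracted by Readability...",
	metadata: {
		url: "https://example.com/blog/post1",
		title: "Post 1",
		description: "A sample description meta tag",
		keywords: "sample, tags",
		author: "Jane Doe",
		lastModified: "Tue, 01 Oct 2024 00:00:00 GMT", // from response headers
		contentType: "text/html; charset=utf-8",       // from response headers
		contentLength: "12345",
		language: "en",
		canonicalUrl: "https://example.com/blog/post1",
		ogTags: {
			title: "Post 1",
			description: "A sample description meta tag",
			image: "https://example.com/og.png",
			type: "article"
		},
		dateScraped: "2025-01-01T00:00:00.000Z"
	}
};
```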