clean-web-scraper 2.0.5 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -6
- package/example-usage.js +6 -4
- package/package.json +1 -1
- package/src/WebScraper.js +29 -10
package/README.md
CHANGED
```diff
@@ -11,7 +11,7 @@ A powerful Node.js web scraper that extracts clean, readable content from websites
 - 🚫 Excludes unwanted paths from scraping
 - 🔄 Handles relative and absolute URLs like a pro
 - 🎯 No duplicate page visits
-- 📊 Generates JSONL output file for ML training
+- 📊 Generates JSONL and raw text output file for ML training
 - 📊 AI-friendly clean text output (perfect for LLM fine-tuning!)
 
 ## 🛠️ Prerequisites
@@ -43,11 +43,12 @@ npm install
 const WebScraper = require('clean-web-scraper');
 
 const scraper = new WebScraper({
-  baseURL: 'https://example.com',
-
-  excludeList: ['/admin', '/private'],
-  exactExcludeList: ['/specific-page']
-  jsonlPath: 'output.jsonl' // Optional: Custom JSONL output path
+  baseURL: 'https://example.com', // Required: The website to scrape
+  scrapResultPath: './output', // Required: Where to save the content
+  excludeList: ['/admin', '/private'], // Optional: Paths to exclude
+  exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
+  jsonlPath: 'output.jsonl', // Optional: Custom JSONL output path
+  textOutputPath: "./dataset/texts" // Optional: Custom text output path
 });
 
 scraper.start();
```
package/example-usage.js
CHANGED
```diff
@@ -1,7 +1,8 @@
 const WebScraper = require( "./src/WebScraper" );
 
+// Configuration
 const baseURL = "https://decolonizepalestine.com";
-const
+const scrapResultPath = "./dataset";
 const excludeList = [
 	"https://decolonizepalestine.com/cdn-cgi",
 	"https://decolonizepalestine.com/introduction-to-palestine",
@@ -14,12 +15,13 @@ const exactExcludeList = [
 	"https://decolonizepalestine.com/"
 ]
 
-
+// Initialize scraper with all available options
 const scraper = new WebScraper({
 	baseURL,
-
+	scrapResultPath,
 	excludeList,
 	exactExcludeList,
-	jsonlPath: "./dataset/
+	jsonlPath: "./dataset/train.jsonl",
+	textOutputPath: "./dataset/texts"
 });
 scraper.start();
```
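With those options filled in, the example should run directly with `node example-usage.js` from the package root (assuming dependencies are installed), writing both `./dataset/train.jsonl` and the numbered text files under `./dataset/texts`.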
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
```diff
@@ -7,11 +7,19 @@ const path = require( "path" );
 
 class WebScraper
 {
-	constructor ({
+	constructor ({
+		baseURL,
+		excludeList,
+		exactExcludeList,
+		scrapResultPath = "./dataset",
+		jsonlPath,
+		textOutputPath
+	})
 	{
 		this.baseURL = baseURL;
-		this.
-		this.
+		this.scrapResultPath = path.join( scrapResultPath, baseURL.replace( /^(https?:\/\/)?(www\.)?/, "" ).replace( /\/$/, "" ) );
+		this.jsonlPath = jsonlPath || path.join( this.scrapResultPath, "train.jsonl" );
+		this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
 		this.visited = new Set();
 		this.excludeList = new Set( excludeList );
 		this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
```
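One behavioral change worth noting in this hunk: the output directory is now derived from `baseURL`, with the scheme, a leading `www.`, and any trailing slash stripped before joining onto `scrapResultPath`. A minimal sketch of what that replace chain produces (the `deriveDir` helper is hypothetical, but the two `.replace` calls are copied verbatim from the new constructor line):

```js
const path = require( "path" );

// Hypothetical helper; the replace chain matches the new constructor logic
function deriveDir ( scrapResultPath, baseURL )
{
	return path.join( scrapResultPath, baseURL.replace( /^(https?:\/\/)?(www\.)?/, "" ).replace( /\/$/, "" ) );
}

console.log( deriveDir( "./dataset", "https://www.example.com/" ) );        // dataset/example.com
console.log( deriveDir( "./dataset", "https://decolonizepalestine.com" ) ); // dataset/decolonizepalestine.com
```

With the defaults, `train.jsonl` and the `texts` folder then land inside that derived directory unless `jsonlPath` or `textOutputPath` are passed explicitly.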
```diff
@@ -24,6 +32,8 @@ class WebScraper
 		this.visited.add( this.baseURL );
 		await this.fetchPage( this.baseURL );
 		this.createJSONLFile();
+		this.saveNumberedTextFiles();
+		console.log( "Scraping completed." );
 	}
 
 	async fetchPage ( url )
@@ -104,7 +114,7 @@ class WebScraper
 		{
 			urlPath = "/index";
 		}
-		const filePath = path.join( __dirname, this.folderPath, urlPath );
+		const filePath = path.join( __dirname, this.scrapResultPath, urlPath );
 		const dir = path.dirname( filePath );
 
 		// Create metadata object
@@ -142,6 +152,17 @@ class WebScraper
 		console.log( `Created JSONL file at: ${this.jsonlPath}` );
 	}
 
+	saveNumberedTextFiles ()
+	{
+		this.processedContent.forEach( ( content, index ) =>
+		{
+			const fileName = `${index + 1}.txt`;
+			const filePath = path.join( __dirname, this.textOutputPath, fileName );
+			fs.writeFileSync( filePath, content.text, "utf-8" );
+			console.log( `Created numbered text file: ${fileName}` );
+		});
+	}
+
 	processContent ( content )
 	{
 		let processed = content;
```
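The new `saveNumberedTextFiles` method writes one page per file as `1.txt`, `2.txt`, … in scrape order. A small sketch of reading them back for downstream use (the directory path is an assumption based on the default `scrapResultPath`/`textOutputPath` logic above; note the numeric sort, since a lexicographic sort would put `10.txt` before `2.txt`):

```js
const fs = require( "fs" );
const path = require( "path" );

// Assumed location, following the constructor defaults shown above
const textsDir = "./dataset/example.com/texts";

const pages = fs.readdirSync( textsDir )
	.filter( name => name.endsWith( ".txt" ) )
	.sort( ( a, b ) => parseInt( a, 10 ) - parseInt( b, 10 ) )
	.map( name => fs.readFileSync( path.join( textsDir, name ), "utf-8" ) );

console.log( `Loaded ${pages.length} scraped pages` );
```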
```diff
@@ -194,14 +215,12 @@ class WebScraper
 
 	createOutputDirectory ()
 	{
-		if ( fs.existsSync( path.join( __dirname, this.folderPath ) ) )
-		{
-			fs.rmSync( path.join( __dirname, this.folderPath ), { recursive: true, force: true });
-		}
-		if ( !fs.existsSync( path.join( __dirname, this.folderPath ) ) )
+		if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
 		{
-			fs.mkdirSync( path.join( __dirname, this.folderPath ), { recursive: true });
+			fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
 		}
+		fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
+		fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
 	}
 }
 
```
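Note that `createOutputDirectory` still clears previous results on every run: any existing `scrapResultPath` directory is removed with `fs.rmSync` before both the result and text directories are recreated, so rerunning the scraper replaces earlier output rather than appending to it.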