clean-web-scraper 3.2.1 → 3.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +101 -6
- package/example-usage.js +7 -0
- package/package.json +1 -1
- package/src/WebScraper.js +47 -17
package/README.md
CHANGED

````diff
@@ -11,10 +11,10 @@ A powerful Node.js web scraper that extracts clean, readable content from websit
 - 🚫 Excludes unwanted paths from scraping
 - 🔄 Handles relative and absolute URLs like a pro
 - 🎯 No duplicate page visits
-
-- 📊 AI-friendly clean text and csv output (perfect for LLM fine-tuning!)
+- 🤖 AI-friendly output formats (JSONL, CSV, clean text)
 - 📊 Rich metadata extraction
 - 📁 Combine results from multiple scrapers into a unified dataset
+- 🎯 Turn any website into an AI training dataset
 
 ## 🛠️ Prerequisites
 
@@ -58,15 +58,37 @@ const scraper = new WebScraper({
   metadataFields: ['title', 'description'] // Optional: Specify metadata fields to include
 });
 await scraper.start();
-
-// Combine results from multiple scrapers
-await WebScraper.combineResults('./combined-dataset', [scraper1, scraper2]);
 ```
 
 ```bash
 node example-usage.js
 ```
 
+## 💻 Advanced Usage: Multi-Site Scraping
+
+```js
+const WebScraper = require('clean-web-scraper');
+
+// Scrape documentation website
+const docsScraper = new WebScraper({
+  baseURL: 'https://docs.example.com',
+  scrapResultPath: './datasets/docs'
+});
+
+// Scrape blog website
+const blogScraper = new WebScraper({
+  baseURL: 'https://blog.example.com',
+  scrapResultPath: './datasets/blog'
+});
+
+// Start scraping both sites
+await docsScraper.start();
+await blogScraper.start();
+
+// Combine all scraped content into a single dataset
+await WebScraper.combineResults('./combined', [docsScraper, blogScraper]);
+```
+
 ## 📤 Output
 
 Your AI-ready content is saved in a clean, structured format:
@@ -96,11 +118,84 @@ example.com/
 ├── train_with_metadata.jsonl      # When includeMetadata is true
 ├── train.csv                      # Clean text in CSV format
 └── train_with_metadata.csv        # When includeMetadata is true
+
+combined/
+├── texts/                         # Combined numbered text files
+│   ├── 1.txt
+│   ├── 2.txt
+│   └── n.txt
+├── texts_with_metadata/           # Combined metadata text files
+│   ├── 1.txt
+│   ├── 2.txt
+│   └── n.txt
+├── combined.jsonl                 # Combined JSONL content
+├── combined_with_metadata.jsonl
+├── combined.csv                   # Combined CSV content
+└── combined_with_metadata.csv
+```
+
+## 📄 Output File Formats
+
+### 📝 Text Files (*.txt)
+
+The actual article content starts here. This is the clean, processed text of the article that was extracted from the webpage
+
+### 📑 Text Files with Metadata (texts_with_metadata/*.txt)
+
+title: My Awesome Page
+description: This is a great article about coding
+author: John Doe
+language: en
+dateScraped: 2024-01-20T10:30:00Z
+
+\-\-\-
+
+The actual article content starts here. This is the clean, processed text of the article that was extracted from the webpage.
+
+### 📊 JSONL Files (train.jsonl)
+
+```json
+{"text": "Clean article content here"}
+{"text": "Another article content here"}
+```
+
+### 📈 JSONL with Metadata (train_with_metadata.jsonl)
+
+```json
+{"text": "Article content", "metadata": {"title": "Page Title", "author": "John Doe"}}
+{"text": "Another article", "metadata": {"title": "Second Page", "author": "Jane Smith"}}
+```
+
+### 🗃️ JSON Files In Website Output (*.json)
+
+```json
+{
+  "url": "<https://example.com/page>",
+  "title": "Page Title",
+  "description": "Page description",
+  "dateScraped": "2024-01-20T10:30:00Z"
+}
+```
+
+### 📋 CSV Files (train.csv)
+
+```csv
+text
+"Clean article content here"
+"Another article content here"
+```
+
+### 📊 CSV with Metadata (train_with_metadata.csv)
+
+```csv
+text,title,author,description
+"Article content","Page Title","John Doe","Page description"
+"Another article","Second Page","Jane Smith","Another description"
+```
 ```
 
 ## 🤖 AI/LLM Training Ready
 
-The output is specifically formatted for AI training purposes:
+The output is specifically formatted for AI training and fine-tuning purposes:
 
 - Clean, processed text without HTML markup
 - Multiple formats (JSONL, CSV, text files)
````
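The JSONL outputs documented in the new README sections are plain one-JSON-object-per-line text, so loading them downstream takes only a few lines of Node. A minimal consumer sketch (not part of the package; the file path assumes the combined/ layout shown above):

```js
const fs = require('fs');

// Read a JSONL output file: one JSON document per line.
const records = fs.readFileSync('./combined/combined.jsonl', 'utf-8')
  .split('\n')
  .filter(line => line.trim())       // drop blank/trailing lines
  .map(line => JSON.parse(line));    // each record is {"text": ...}

console.log(`${records.length} documents loaded`);
```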
package/example-usage.js
CHANGED
```diff
@@ -68,5 +68,12 @@ void async function main ()
 
 	// 4
 	// https://electronicintifada.net/
+
+	// 5
+	// https://www.palestineremembered.com/ZionistFAQ.html
+
+	// 6 https://the-palestinian-side.vercel.app/
+
+	// 7 https://stand-with-palestine.org/blogs
 }()
 
```
package/package.json
CHANGED
package/src/WebScraper.js
CHANGED
```diff
@@ -36,11 +36,11 @@ class WebScraper
 		this.excludeList = new Set( excludeList );
 		this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
 		this.allProcessedContent = [];
-		this.createOutputDirectory();
 	}
 
 	async start ()
 	{
+		this.createOutputDirectory();
 		await this.fetchPage( this.startURL, 0 );
 		this.createJSONLFile();
 		this.saveNumberedTextFiles();
```
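This hunk moves `createOutputDirectory()` out of the constructor and into `start()`, so constructing a `WebScraper` no longer writes to disk; output directories appear only once scraping actually begins. A small sketch of the resulting behavior (URL and path are illustrative, matching the README's examples):

```js
const WebScraper = require('clean-web-scraper');

// v3.2.3: no filesystem side effects at construction time
const scraper = new WebScraper({
  baseURL: 'https://docs.example.com',  // illustrative
  scrapResultPath: './datasets/docs'    // not created yet
});

// The output directory is created here, right before crawling
await scraper.start();
```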
```diff
@@ -384,41 +384,71 @@ class WebScraper
 		const fullOutputPath = path.join( __dirname, outputPath );
 
 		// Create output directories
+		WebScraper.createCombinedDirectories( fullOutputPath );
+
+		// Combine files by type
+		WebScraper.combineJSONLFiles( fullOutputPath, websites );
+		WebScraper.combineCSVFiles( fullOutputPath, websites );
+		WebScraper.combineTextFiles( fullOutputPath, websites );
+	}
+
+	static createCombinedDirectories ( fullOutputPath )
+	{
+		if ( fs.existsSync( fullOutputPath ) )
+		{
+			fs.rmSync( fullOutputPath, { recursive: true, force: true });
+		}
 		fs.mkdirSync( fullOutputPath, { recursive: true });
 		fs.mkdirSync( path.join( fullOutputPath, "texts" ), { recursive: true });
 		fs.mkdirSync( path.join( fullOutputPath, "texts_with_metadata" ), { recursive: true });
+	}
 
-
+	static combineJSONLFiles ( fullOutputPath, websites )
+	{
 		const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) )
-
+			.on( "error", ( err ) => { return console.error( "Error combining JSONL:", err ) });
 		const jsonlMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.jsonl" ) )
-
+			.on( "error", ( err ) => { return console.error( "Error combining metadata JSONL:", err ) });
+
+		for ( const website of websites )
+		{
+			const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
+			jsonlOutput.write( jsonlContent );
+
+			if ( website.includeMetadata )
+			{
+				const jsonlMetaContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPathWithMeta ), "utf-8" );
+				jsonlMetaOutput.write( jsonlMetaContent );
+			}
+		}
+
+		jsonlOutput.end();
+		jsonlMetaOutput.end();
+	}
+
+	static combineCSVFiles ( fullOutputPath, websites )
+	{
 		const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
 		const csvMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.csv" ) );
 
 		csvOutput.write( "text\n" );
 		const metadataFields = websites.find( w => { return w.includeMetadata })?.metadataFields || new Set();
+
 		if ( metadataFields.size > 0 )
 		{
 			csvMetaOutput.write( `text,${Array.from( metadataFields ).join( "," )}\n` );
 		}
+
 		for ( const website of websites )
 		{
-			const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
-			jsonlOutput.write( jsonlContent );
-
 			const csvContent = fs.readFileSync( path.join( __dirname, website.csvOutputPath ), "utf-8" )
 				.split( "\n" )
 				.slice( 1 )
 				.filter( line => { return line.trim() });
 			csvOutput.write( `${csvContent.join( "\n" )}\n` );
 
-			// Combine metadata files if they exist
 			if ( website.includeMetadata )
 			{
-				const jsonlMetaContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPathWithMeta ), "utf-8" );
-				jsonlMetaOutput.write( jsonlMetaContent );
-
 				const csvMetaContent = fs.readFileSync( path.join( __dirname, website.csvOutputPathWithMeta ), "utf-8" )
 					.split( "\n" )
 					.slice( 1 )
```
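To make the CSV merge above concrete: `combineCSVFiles` writes a single header row, then appends each site's train.csv with its own header dropped (`.slice( 1 )`) and blank lines filtered out, so only one header survives. Given two per-site files each holding the `text` header plus one row (values borrowed from the README samples), the resulting combined.csv would look like:

```csv
text
"Clean article content here"
"Another article content here"
```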
```diff
@@ -427,18 +457,18 @@ class WebScraper
 			}
 		}
 
-		// Close all streams
-		jsonlOutput.end();
-		jsonlMetaOutput.end();
 		csvOutput.end();
 		csvMetaOutput.end();
+	}
 
-
+	static combineTextFiles ( fullOutputPath, websites )
+	{
 		let textFileCounter = 1;
+
 		for ( const website of websites )
 		{
-			// Regular text files
 			const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
+
 			for ( const file of textFiles )
 			{
 				const content = fs.readFileSync( path.join( __dirname, website.textOutputPath, file ), "utf-8" );
@@ -448,7 +478,6 @@ class WebScraper
 					"utf-8"
 				);
 
-				// Metadata text files if they exist
 				if ( website.includeMetadata )
 				{
 					const metaContent = fs.readFileSync(
@@ -465,6 +494,7 @@ class WebScraper
 			}
 		}
 	}
+
 }
 
 module.exports = WebScraper;
```
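With this refactor, `combineResults` becomes a thin orchestrator over the three static helpers above, and `createCombinedDirectories` now deletes any existing combined output directory before recreating it. A short usage sketch (scrapers set up as in the README's multi-site example):

```js
// Merge outputs once both scrapers have finished.
// Note: as of 3.2.3 the target directory ('./combined' here) is
// removed and recreated on every run, so keep nothing else in it.
await WebScraper.combineResults('./combined', [docsScraper, blogScraper]);
```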