clean-web-scraper 3.2.2 → 3.2.4
- package/README.md +82 -6
- package/example-usage.js +7 -0
- package/package.json +1 -1
- package/src/WebScraper.js +47 -17
package/README.md
CHANGED
````diff
@@ -53,9 +53,6 @@ const scraper = new WebScraper({
   jsonlOutputPath: './example.com/train.jsonl', // Optional: Custom JSONL output path
   textOutputPath: "./example.com/texts", // Optional: Custom text output path
   csvOutputPath: "./example.com/train.csv", // Optional: Custom CSV output path
-  maxDepth: 3, // Optional: Maximum depth for recursive crawling
-  includeMetadata: false, // Optional: Include metadata in output files
-  metadataFields: ['title', 'description'] // Optional: Specify metadata fields to include
 });
 await scraper.start();
 ```
@@ -72,13 +69,19 @@ const WebScraper = require('clean-web-scraper');
 // Scrape documentation website
 const docsScraper = new WebScraper({
   baseURL: 'https://docs.example.com',
-  scrapResultPath: './datasets/docs'
+  scrapResultPath: './datasets/docs',
+  maxDepth: 3, // Optional: Maximum depth for recursive crawling
+  includeMetadata: true, // Optional: Include metadata in output files
+  metadataFields: ['title', 'description'] // Optional: Specify metadata fields to include
 });
 
 // Scrape blog website
 const blogScraper = new WebScraper({
   baseURL: 'https://blog.example.com',
-  scrapResultPath: './datasets/blog'
+  scrapResultPath: './datasets/blog',
+  maxDepth: 3, // Optional: Maximum depth for recursive crawling
+  includeMetadata: true, // Optional: Include metadata in output files
+  metadataFields: ['title', 'description'] // Optional: Specify metadata fields to include
 });
 
 // Start scraping both sites
@@ -86,7 +89,7 @@ await docsScraper.start();
 await blogScraper.start();
 
 // Combine all scraped content into a single dataset
-await WebScraper.combineResults('./combined
+await WebScraper.combineResults('./combined', [docsScraper, blogScraper]);
 ```
 
 ## 📤 Output
@@ -118,6 +121,79 @@ example.com/
 ├── train_with_metadata.jsonl    # When includeMetadata is true
 ├── train.csv                    # Clean text in CSV format
 └── train_with_metadata.csv      # When includeMetadata is true
+
+combined/
+├── texts/                       # Combined numbered text files
+│   ├── 1.txt
+│   ├── 2.txt
+│   └── n.txt
+├── texts_with_metadata/         # Combined metadata text files
+│   ├── 1.txt
+│   ├── 2.txt
+│   └── n.txt
+├── combined.jsonl               # Combined JSONL content
+├── combined_with_metadata.jsonl
+├── combined.csv                 # Combined CSV content
+└── combined_with_metadata.csv
+```
+
+## 📄 Output File Formats
+
+### 📝 Text Files (*.txt)
+
+The actual article content starts here. This is the clean, processed text of the article that was extracted from the webpage
+
+### 📑 Text Files with Metadata (texts_with_metadata/*.txt)
+
+title: My Awesome Page
+description: This is a great article about coding
+author: John Doe
+language: en
+dateScraped: 2024-01-20T10:30:00Z
+
+\-\-\-
+
+The actual article content starts here. This is the clean, processed text of the article that was extracted from the webpage.
+
+### 📊 JSONL Files (train.jsonl)
+
+```json
+{"text": "Clean article content here"}
+{"text": "Another article content here"}
+```
+
+### 📈 JSONL with Metadata (train_with_metadata.jsonl)
+
+```json
+{"text": "Article content", "metadata": {"title": "Page Title", "author": "John Doe"}}
+{"text": "Another article", "metadata": {"title": "Second Page", "author": "Jane Smith"}}
+```
+
+### 🗃️ JSON Files In Website Output (*.json)
+
+```json
+{
+  "url": "<https://example.com/page>",
+  "title": "Page Title",
+  "description": "Page description",
+  "dateScraped": "2024-01-20T10:30:00Z"
+}
+```
+
+### 📋 CSV Files (train.csv)
+
+```csv
+text
+"Clean article content here"
+"Another article content here"
+```
+
+### 📊 CSV with Metadata (train_with_metadata.csv)
+
+```csv
+text,title,author,description
+"Article content","Page Title","John Doe","Page description"
+"Another article","Second Page","Jane Smith","Another description"
 ```
 
 ## 🤖 AI/LLM Training Ready
````
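The headline README fix here is the `combineResults` example, which was previously truncated mid-call and now shows both arguments: the combined output directory and the array of scraper instances to merge. A minimal runnable sketch assembled from the updated examples (the `docs.example.com` URL and `./datasets/docs` path are the README's placeholders, not real targets):

```js
const WebScraper = require('clean-web-scraper');

void async function main ()
{
	// Placeholder site and paths, taken from the README examples
	const docsScraper = new WebScraper({
		baseURL: 'https://docs.example.com',
		scrapResultPath: './datasets/docs',
		maxDepth: 3,                             // limit recursive crawl depth
		includeMetadata: true,                   // also emit *_with_metadata outputs
		metadataFields: ['title', 'description'] // which metadata fields to keep
	});

	await docsScraper.start();

	// The corrected static call: output directory first, then the scrapers to merge
	await WebScraper.combineResults('./combined', [docsScraper]);
}()
```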
package/example-usage.js
CHANGED
```diff
@@ -68,5 +68,12 @@ void async function main ()
 
 	// 4
 	// https://electronicintifada.net/
+
+	// 5
+	// https://www.palestineremembered.com/ZionistFAQ.html
+
+	// 6 https://the-palestinian-side.vercel.app/
+
+	// 7 https://stand-with-palestine.org/blogs
 }()
 
```
package/package.json
CHANGED
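No hunk is shown for package.json, but the +1 −1 summary above, together with the 3.2.2 → 3.2.4 header, is consistent with a one-line version bump:

```diff
-  "version": "3.2.2",
+  "version": "3.2.4",
```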
package/src/WebScraper.js
CHANGED
```diff
@@ -36,11 +36,11 @@ class WebScraper
 		this.excludeList = new Set( excludeList );
 		this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
 		this.allProcessedContent = [];
-		this.createOutputDirectory();
 	}
 
 	async start ()
 	{
+		this.createOutputDirectory();
 		await this.fetchPage( this.startURL, 0 );
 		this.createJSONLFile();
 		this.saveNumberedTextFiles();
```
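This moves the output-directory side effect out of the constructor: in 3.2.2, merely instantiating a `WebScraper` created directories on disk; as of 3.2.4 nothing is written until `start()` runs. A sketch of the observable difference, in the style of the package's own `example-usage.js` (URL and path are placeholders):

```js
const WebScraper = require('clean-web-scraper');

void async function main ()
{
	// 3.2.4: constructing a scraper no longer creates its output directories
	const scraper = new WebScraper({
		baseURL: 'https://example.com',  // placeholder site
		scrapResultPath: './example.com' // placeholder output path
	});

	// Directories are created here, immediately before crawling begins
	await scraper.start();
}()
```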
```diff
@@ -384,41 +384,71 @@ class WebScraper
 		const fullOutputPath = path.join( __dirname, outputPath );
 
 		// Create output directories
+		WebScraper.createCombinedDirectories( fullOutputPath );
+
+		// Combine files by type
+		WebScraper.combineJSONLFiles( fullOutputPath, websites );
+		WebScraper.combineCSVFiles( fullOutputPath, websites );
+		WebScraper.combineTextFiles( fullOutputPath, websites );
+	}
+
+	static createCombinedDirectories ( fullOutputPath )
+	{
+		if ( fs.existsSync( fullOutputPath ) )
+		{
+			fs.rmSync( fullOutputPath, { recursive: true, force: true });
+		}
 		fs.mkdirSync( fullOutputPath, { recursive: true });
 		fs.mkdirSync( path.join( fullOutputPath, "texts" ), { recursive: true });
 		fs.mkdirSync( path.join( fullOutputPath, "texts_with_metadata" ), { recursive: true });
+	}
 
-
+	static combineJSONLFiles ( fullOutputPath, websites )
+	{
 		const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) )
-
+			.on( "error", ( err ) => { return console.error( "Error combining JSONL:", err ) });
 		const jsonlMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.jsonl" ) )
-
+			.on( "error", ( err ) => { return console.error( "Error combining metadata JSONL:", err ) });
+
+		for ( const website of websites )
+		{
+			const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
+			jsonlOutput.write( jsonlContent );
+
+			if ( website.includeMetadata )
+			{
+				const jsonlMetaContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPathWithMeta ), "utf-8" );
+				jsonlMetaOutput.write( jsonlMetaContent );
+			}
+		}
+
+		jsonlOutput.end();
+		jsonlMetaOutput.end();
+	}
+
+	static combineCSVFiles ( fullOutputPath, websites )
+	{
 		const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
 		const csvMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.csv" ) );
 
 		csvOutput.write( "text\n" );
 		const metadataFields = websites.find( w => { return w.includeMetadata })?.metadataFields || new Set();
+
 		if ( metadataFields.size > 0 )
 		{
 			csvMetaOutput.write( `text,${Array.from( metadataFields ).join( "," )}\n` );
 		}
+
 		for ( const website of websites )
 		{
-			const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
-			jsonlOutput.write( jsonlContent );
-
 			const csvContent = fs.readFileSync( path.join( __dirname, website.csvOutputPath ), "utf-8" )
 				.split( "\n" )
 				.slice( 1 )
 				.filter( line => { return line.trim() });
 			csvOutput.write( `${csvContent.join( "\n" )}\n` );
 
-			// Combine metadata files if they exist
 			if ( website.includeMetadata )
 			{
-				const jsonlMetaContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPathWithMeta ), "utf-8" );
-				jsonlMetaOutput.write( jsonlMetaContent );
-
 				const csvMetaContent = fs.readFileSync( path.join( __dirname, website.csvOutputPathWithMeta ), "utf-8" )
 					.split( "\n" )
 					.slice( 1 )
@@ -427,18 +457,18 @@ class WebScraper
 			}
 		}
 
-		// Close all streams
-		jsonlOutput.end();
-		jsonlMetaOutput.end();
 		csvOutput.end();
 		csvMetaOutput.end();
+	}
 
-
+	static combineTextFiles ( fullOutputPath, websites )
+	{
 		let textFileCounter = 1;
+
 		for ( const website of websites )
 		{
-			// Regular text files
 			const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
+
 			for ( const file of textFiles )
 			{
 				const content = fs.readFileSync( path.join( __dirname, website.textOutputPath, file ), "utf-8" );
@@ -448,7 +478,6 @@ class WebScraper
 					"utf-8"
 				);
 
-				// Metadata text files if they exist
 				if ( website.includeMetadata )
 				{
 					const metaContent = fs.readFileSync(
@@ -465,6 +494,7 @@ class WebScraper
 			}
 		}
 	}
+
 }
 
 module.exports = WebScraper;
```
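Two behavioral points stand out in this refactor: the monolithic `combineResults` is split into per-format static helpers, and `createCombinedDirectories` now wipes any existing combined output with `fs.rmSync` before recreating it, so reruns start clean but also destroy previous results. Note also that `outputPath` is joined to the package's own `__dirname`, not the caller's working directory. A condensed skeleton of the new call graph; the `combineResults` signature is inferred from the README usage, since the hunk shows only its body, and the combiner bodies are stubbed:

```js
const fs = require('fs');
const path = require('path');

// Shape of the 3.2.4 refactor (condensed paraphrase of the diff, not the full source)
class WebScraper
{
	static combineResults ( outputPath, websites )
	{
		// Resolved relative to the package directory, per the context line in the hunk
		const fullOutputPath = path.join( __dirname, outputPath );

		WebScraper.createCombinedDirectories( fullOutputPath );
		WebScraper.combineJSONLFiles( fullOutputPath, websites );
		WebScraper.combineCSVFiles( fullOutputPath, websites );
		WebScraper.combineTextFiles( fullOutputPath, websites );
	}

	static createCombinedDirectories ( fullOutputPath )
	{
		// Destructive reset: an existing combined output directory is deleted first
		if ( fs.existsSync( fullOutputPath ) )
		{
			fs.rmSync( fullOutputPath, { recursive: true, force: true });
		}
		fs.mkdirSync( fullOutputPath, { recursive: true });
		fs.mkdirSync( path.join( fullOutputPath, "texts" ), { recursive: true });
		fs.mkdirSync( path.join( fullOutputPath, "texts_with_metadata" ), { recursive: true });
	}

	// Bodies as in the hunks above; each helper now owns its write streams,
	// and the JSONL streams attach "error" handlers.
	static combineJSONLFiles () {}
	static combineCSVFiles () {}
	static combineTextFiles () {}
}
```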