clean-web-scraper 2.3.0 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +21 -19
- package/example-usage.js +10 -8
- package/package.json +1 -1
- package/src/WebScraper.js +34 -24
package/README.md (CHANGED)
````diff
@@ -44,14 +44,16 @@ npm install
 const WebScraper = require('clean-web-scraper');
 
 const scraper = new WebScraper({
-  baseURL: 'https://example.com',
-  startURL: 'https://example.com/blog',
-  excludeList: ['/admin', '/private'],
-  exactExcludeList: ['/specific-page'],
-  scrapResultPath: './
-
-  textOutputPath: "./
-
+  baseURL: 'https://example.com/news', // Required: The website base url to scrape
+  startURL: 'https://example.com/blog', // Optional: Custom starting URL
+  excludeList: ['/admin', '/private'], // Optional: Paths to exclude
+  exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
+  scrapResultPath: './example.com/website', // Required: Where to save the content
+  jsonlOutputPath: './example.com/train.jsonl', // Optional: Custom JSONL output path
+  textOutputPath: "./example.com/texts", // Optional: Custom text output path
+  csvOutputPath: "./example.com/train.csv" // Optional: Custom CSV output path
+  maxDepth: 3, // Optional: Maximum depth for recursive crawling
+  includeTitles: true, // Optional: Include page titles in outputs
 });
 
 scraper.start();
@@ -73,18 +75,18 @@ Your AI-ready content is saved in a clean, structured format:
 - 📈 CSV output with clean text content
 
 ```bash
-
-├──
+example.com/
+├── website/
 │   ├── page1.txt    # Clean text content
 │   ├── page1.json   # Full metadata
-│
-│
-│
-
-│
-│
-
-
+│   └── blog/
+│       ├── post1.txt
+│       └── post1.json
+│── texts/           # Numbered text files
+│   ├── 1.txt
+│   ├── 2.txt
+│── train.jsonl      # Combined content
+└── train.csv        # Clean text in CSV format
 ```
 
 ## 🤖 AI/LLM Training Ready
@@ -92,7 +94,7 @@ dataset/
 The output is specifically formatted for AI training purposes:
 
 - Clean, processed text without HTML markup
--
+- Multiple formats (JSONL, CSV, text files)
 - Structured content perfect for fine-tuning LLMs
 - Ready to use in your ML pipelines
 
````
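
The README's new JSONL output is the piece most likely to feed straight into a training pipeline. As a minimal sketch of consuming it in Node, assuming each line is a `{ text, metadata }` object as the `WebScraper.js` changes below suggest:

```js
const fs = require("fs");
const readline = require("readline");

// Read train.jsonl line by line; each non-empty line is one scraped page.
async function readDataset (jsonlPath)
{
	const rl = readline.createInterface({ input: fs.createReadStream(jsonlPath) });
	const entries = [];
	for await (const line of rl)
	{
		if (line.trim()) entries.push(JSON.parse(line));
	}
	return entries;
}

// Example: count pages and peek at the first title (path from the README example).
readDataset("./example.com/train.jsonl")
	.then((entries) => console.log(entries.length, entries[0]?.metadata?.title));
```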
package/example-usage.js (CHANGED)
```diff
@@ -9,19 +9,21 @@ async function khameneiIrFreePalestineTag ()
 	const scraper = new WebScraper({
 		baseURL: "https://english.khamenei.ir/news",
 		startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
+		maxDepth: 1,
 		excludeList: [
 		],
 		exactExcludeList: [
+			"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#"
 		],
-		scrapResultPath: "./dataset/khamenei-ir-free-palestine-tag",
-
+		scrapResultPath: "./dataset/khamenei-ir-free-palestine-tag/website",
+		jsonlOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.jsonl",
 		textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
-
+		csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
+		includeTitles: true
 	});
 	await scraper.start();
 }
 
-// decolonizepalestine
 async function decolonizepalestine ()
 {
 	// 2
@@ -39,10 +41,10 @@ async function decolonizepalestine ()
 			"https://decolonizepalestine.com/rainbow-washing",
 			"https://decolonizepalestine.com/"
 		],
-		scrapResultPath: "./dataset/decolonizepalestine",
-
+		scrapResultPath: "./dataset/decolonizepalestine/website",
+		jsonlOutputPath: "./dataset/decolonizepalestine/train.jsonl",
 		textOutputPath: "./dataset/decolonizepalestine/texts",
-
+		csvOutputPath: "./dataset/decolonizepalestine/train.csv"
 	});
 	await scraper.start();
 }
@@ -50,7 +52,7 @@ async function decolonizepalestine ()
 void async function main ()
 {
 	await khameneiIrFreePalestineTag();
-
+	await decolonizepalestine();
 
 
 	// 3
```
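
Both call sites now follow the same convention: `scrapResultPath` points at a `website/` subfolder, so the JSONL, CSV, and numbered text outputs sit beside it under one dataset directory. For the first example above, the resulting layout should look roughly like:

```bash
dataset/khamenei-ir-free-palestine-tag/
├── website/        # per-page .txt and .json files
├── texts/          # numbered text files (1.txt, 2.txt, ...)
├── train.jsonl
└── train.csv
```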
package/package.json (CHANGED)
package/src/WebScraper.js (CHANGED)
```diff
@@ -10,38 +10,46 @@ class WebScraper
 	constructor ({
 		baseURL,
 		startURL,
+		maxDepth = Infinity,
 		excludeList,
 		exactExcludeList,
 		scrapResultPath = "./dataset",
-
+		jsonlOutputPath,
 		textOutputPath,
-
+		csvOutputPath,
+		includeTitles = false
 	})
 	{
 		this.baseURL = baseURL;
 		this.startURL = startURL || baseURL;
-		this.
-		this.
+		this.maxDepth = maxDepth;
+		this.scrapResultPath = scrapResultPath;
+		this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
 		this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
-		this.
+		this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
+		this.includeTitles = includeTitles;
 		this.visited = new Set();
 		this.excludeList = new Set( excludeList );
 		this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
-		this.allProcessedContent = [];
+		this.allProcessedContent = [];
 		this.createOutputDirectory();
 	}
 
 	async start ()
 	{
-		await this.fetchPage( this.startURL );
+		await this.fetchPage( this.startURL, 0 );
 		this.createJSONLFile();
 		this.saveNumberedTextFiles();
 		this.createCSVFile();
 		console.log( "Scraping completed." );
 	}
 
-	async fetchPage ( url )
+	async fetchPage ( url, depth )
 	{
+		if ( depth > this.maxDepth )
+		{
+			return;
+		}
 		this.visited.add( url );
 		try
 		{
```
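
The constructor change is mostly about defaults: any of the new output paths may be omitted and is derived from `scrapResultPath`. A standalone sketch of that fallback rule (not the full class):

```js
const path = require("path");

// Mirrors the constructor above: omitted paths land next to scrapResultPath.
function resolveOutputs ({ scrapResultPath = "./dataset", jsonlOutputPath, textOutputPath, csvOutputPath })
{
	return {
		jsonl: jsonlOutputPath || path.join( scrapResultPath, "train.jsonl" ),
		texts: textOutputPath || path.join( scrapResultPath, "texts" ),
		csv: csvOutputPath || path.join( scrapResultPath, "train.csv" )
	};
}

console.log( resolveOutputs({ scrapResultPath: "./dataset/site" }) );
// -> { jsonl: 'dataset/site/train.jsonl', texts: 'dataset/site/texts', csv: 'dataset/site/train.csv' }
```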
```diff
@@ -57,6 +65,7 @@ class WebScraper
 			if ( article )
 			{
 				const metadata = this.metadataextractor( url, document, headers );
+				metadata.depth = depth;
 				this.saveArticle( url, article.textContent, metadata );
 			}
 			else
```
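
Recording the crawl depth in each page's metadata makes it cheap to filter a dataset after the fact, for instance keeping only pages close to the start URL. A sketch, assuming the `{ text, metadata }` entry shape used throughout this diff:

```js
// Keep entries found at most maxDepth hops from the start URL.
function filterByDepth (entries, maxDepth)
{
	return entries.filter( (entry) => entry.metadata.depth <= maxDepth );
}
```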
```diff
@@ -70,7 +79,7 @@ class WebScraper
 		{
 			if ( !this.visited.has( link ) )
 			{
-				await this.fetchPage( link );
+				await this.fetchPage( link, depth + 1 );
 			}
 		}
 	}
```
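
Taken together, the `fetchPage` changes implement a depth-limited crawl: the counter starts at 0, increments by one per hop, and the guard stops recursion past `maxDepth`. A minimal self-contained sketch of the same control flow, where `extractLinks` is a hypothetical stand-in for the class's real link extraction:

```js
// Hypothetical stand-in; the real class pulls links out of the fetched document.
async function extractLinks (url)
{
	return [];
}

async function crawl (url, depth, { maxDepth, visited })
{
	if ( depth > maxDepth ) return; // same guard as fetchPage
	visited.add( url );
	for ( const link of await extractLinks( url ) )
	{
		if ( !visited.has( link ) ) await crawl( link, depth + 1, { maxDepth, visited });
	}
}

// With maxDepth: 1 this visits the start URL and its direct links only:
// crawl( "https://example.com", 0, { maxDepth: 1, visited: new Set() });
```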
```diff
@@ -122,23 +131,16 @@ class WebScraper
 		const filePath = path.join( __dirname, this.scrapResultPath, urlPath );
 		const dir = path.dirname( filePath );
 
-		// Create directory if it doesn't exist
 		fs.mkdirSync( dir, { recursive: true });
-
-		// Save the text content
 		fs.writeFileSync( `${filePath}.txt`, processedContent, "utf-8" );
-
-		// Save the JSON metadata
 		fs.writeFileSync( `${filePath}.json`, JSON.stringify( metadata, null, 2 ), "utf-8" );
-
 		console.log( `Saved: ${filePath}.txt` );
 		console.log( `Saved: ${filePath}.json` );
 	}
 
 	createJSONLFile ()
 	{
-		const writeStream = fs.createWriteStream( path.join( __dirname, this.
-
+		const writeStream = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
 		for ( const content of this.allProcessedContent )
 		{
 			const jsonLine = `${JSON.stringify( content )}\n`;
```
```diff
@@ -146,24 +148,27 @@ class WebScraper
 		}
 
 		writeStream.end();
-		console.log( `Created JSONL file at: ${this.
+		console.log( `Created JSONL file at: ${this.jsonlOutputPath}` );
 	}
 
 	createCSVFile ()
 	{
-		const writeStream = fs.createWriteStream( path.join( __dirname, this.
-
+		const writeStream = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
 		writeStream.write( "text\n" );
-
 		for ( const content of this.allProcessedContent )
 		{
-
+			let fullText = content.text;
+			if ( this.includeTitles && content.metadata.title )
+			{
+				fullText = `Title: ${content.metadata.title}\n\n${content.text}`;
+			}
+			const escapedText = fullText.replace( /"/g, "\"\"" );
 			const csvLine = `"${escapedText}"\n`;
 			writeStream.write( csvLine );
 		}
 
 		writeStream.end();
-		console.log( `Created CSV file at: ${this.
+		console.log( `Created CSV file at: ${this.csvOutputPath}` );
 	}
 
 	saveNumberedTextFiles ()
```
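
The new CSV escaping doubles embedded double quotes and wraps each record in quotes, which is the standard (RFC 4180 style) way to keep quotes and newlines inside a single field. In isolation:

```js
// Double embedded quotes, then wrap the whole record; newlines stay inside the field.
function toCsvRecord (text)
{
	return `"${text.replace( /"/g, "\"\"" )}"\n`;
}

console.log( toCsvRecord( "He said \"hi\".\nSecond line." ) );
// "He said ""hi"".
// Second line."
```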
```diff
@@ -172,7 +177,12 @@ class WebScraper
 	{
 		const fileName = `${index + 1}.txt`;
 		const filePath = path.join( __dirname, this.textOutputPath, fileName );
-
+		let titlePrefix = "";
+		if ( this.includeTitles && content.metadata.title )
+		{
+			titlePrefix = `Title: ${content.metadata.title}\n\n`;
+		}
+		fs.writeFileSync( filePath, titlePrefix + content.text, "utf-8" );
 		console.log( `Created numbered text file: ${fileName}` );
 	});
 }
```
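
`saveNumberedTextFiles` and `createCSVFile` now apply the same `includeTitles` rule, so a title-prefixed record reads identically across outputs. The shared logic, extracted as a sketch:

```js
// Prefix a page's text with its title when includeTitles is on and a title exists.
function withTitle (text, title, includeTitles)
{
	return ( includeTitles && title ) ? `Title: ${title}\n\n${text}` : text;
}

console.log( withTitle( "Body text.", "Example post", true ) );
// Title: Example post
//
// Body text.
```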