clean-web-scraper 2.3.3 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -7
- package/example-usage.js +7 -4
- package/package.json +3 -2
- package/src/WebScraper.js +174 -42
package/README.md
CHANGED
|
@@ -52,9 +52,10 @@ const scraper = new WebScraper({
|
|
|
52
52
|
scrapResultPath: './example.com/website', // Required: Where to save the content
|
|
53
53
|
jsonlOutputPath: './example.com/train.jsonl', // Optional: Custom JSONL output path
|
|
54
54
|
textOutputPath: "./example.com/texts", // Optional: Custom text output path
|
|
55
|
-
csvOutputPath: "./example.com/train.csv"
|
|
55
|
+
csvOutputPath: "./example.com/train.csv", // Optional: Custom CSV output path
|
|
56
56
|
maxDepth: 3, // Optional: Maximum depth for recursive crawling
|
|
57
|
-
|
|
57
|
+
includeMetadata: false, // Optional: Include metadata in output files
|
|
58
|
+
metadataFields: ['title', 'description'] // Optional: Specify metadata fields to include
|
|
58
59
|
});
|
|
59
60
|
scraper.start();
|
|
60
61
|
|
|
@@ -85,11 +86,16 @@ example.com/
|
|
|
85
86
|
│ └── blog/
|
|
86
87
|
│ ├── post1.txt
|
|
87
88
|
│ └── post1.json
|
|
88
|
-
|
|
89
|
-
│
|
|
90
|
-
│
|
|
91
|
-
|
|
92
|
-
|
|
89
|
+
├── texts/ # Numbered text files
|
|
90
|
+
│ ├── 1.txt
|
|
91
|
+
│ └── 2.txt
|
|
92
|
+
├── texts_with_metadata/ # When includeMetadata is true
|
|
93
|
+
│ ├── 1.txt
|
|
94
|
+
│ └── 2.txt
|
|
95
|
+
├── train.jsonl # Combined content
|
|
96
|
+
├── train_with_metadata.jsonl # When includeMetadata is true
|
|
97
|
+
├── train.csv # Clean text in CSV format
|
|
98
|
+
└── train_with_metadata.csv # When includeMetadata is true
|
|
93
99
|
```
|
|
94
100
|
|
|
95
101
|
## 🤖 AI/LLM Training Ready
|
package/example-usage.js
CHANGED
|
@@ -19,9 +19,10 @@ async function khameneiIrFreePalestineTag ()
|
|
|
19
19
|
jsonlOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.jsonl",
|
|
20
20
|
textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
|
|
21
21
|
csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
|
|
22
|
-
|
|
22
|
+
includeMetadata: true,
|
|
23
|
+
metadataFields: ["title", "description", "author", "lastModified", "language"]
|
|
23
24
|
});
|
|
24
|
-
|
|
25
|
+
await scraper.start();
|
|
25
26
|
return scraper;
|
|
26
27
|
}
|
|
27
28
|
|
|
@@ -45,9 +46,11 @@ async function decolonizepalestine ()
|
|
|
45
46
|
scrapResultPath: "./dataset/decolonizepalestine/website",
|
|
46
47
|
jsonlOutputPath: "./dataset/decolonizepalestine/train.jsonl",
|
|
47
48
|
textOutputPath: "./dataset/decolonizepalestine/texts",
|
|
48
|
-
csvOutputPath: "./dataset/decolonizepalestine/train.csv"
|
|
49
|
+
csvOutputPath: "./dataset/decolonizepalestine/train.csv",
|
|
50
|
+
includeMetadata: true,
|
|
51
|
+
metadataFields: ["title", "description", "author", "lastModified", "language"]
|
|
49
52
|
});
|
|
50
|
-
|
|
53
|
+
await scraper.start();
|
|
51
54
|
return scraper;
|
|
52
55
|
}
|
|
53
56
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "clean-web-scraper",
|
|
3
|
-
"version": "
|
|
3
|
+
"version": "3.1.0",
|
|
4
4
|
"main": "main.js",
|
|
5
5
|
"scripts": {
|
|
6
6
|
"start": "node main.js",
|
|
@@ -16,7 +16,8 @@
|
|
|
16
16
|
"ai-ready-web-scraper",
|
|
17
17
|
"ai",
|
|
18
18
|
"fine-tune",
|
|
19
|
-
"data-processing"
|
|
19
|
+
"data-processing",
|
|
20
|
+
"dataset"
|
|
20
21
|
],
|
|
21
22
|
"author": "",
|
|
22
23
|
"license": "ISC",
|
package/src/WebScraper.js
CHANGED
|
@@ -17,7 +17,8 @@ class WebScraper
|
|
|
17
17
|
jsonlOutputPath,
|
|
18
18
|
textOutputPath,
|
|
19
19
|
csvOutputPath,
|
|
20
|
-
|
|
20
|
+
includeMetadata = false,
|
|
21
|
+
metadataFields = [] // ['title', 'description', 'author', 'lastModified', etc.]
|
|
21
22
|
})
|
|
22
23
|
{
|
|
23
24
|
this.baseURL = baseURL;
|
|
@@ -27,7 +28,10 @@ class WebScraper
|
|
|
27
28
|
this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
|
|
28
29
|
this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
|
|
29
30
|
this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
|
|
30
|
-
this.
|
|
31
|
+
this.jsonlOutputPathWithMeta = jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
|
|
32
|
+
this.csvOutputPathWithMeta = csvOutputPath.replace( ".csv", "_with_metadata.csv" );
|
|
33
|
+
this.includeMetadata = includeMetadata;
|
|
34
|
+
this.metadataFields = new Set( metadataFields );
|
|
31
35
|
this.visited = new Set();
|
|
32
36
|
this.excludeList = new Set( excludeList );
|
|
33
37
|
this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
|
|
@@ -118,9 +122,18 @@ class WebScraper
|
|
|
118
122
|
{
|
|
119
123
|
const processedContent = this.processContent( content );
|
|
120
124
|
|
|
121
|
-
|
|
125
|
+
const simpleContent = {
|
|
126
|
+
text: processedContent.trim()
|
|
127
|
+
};
|
|
128
|
+
|
|
129
|
+
const contentWithMetadata = {
|
|
122
130
|
text: processedContent.trim(),
|
|
123
|
-
metadata
|
|
131
|
+
metadata: this.filterMetadata( metadata )
|
|
132
|
+
};
|
|
133
|
+
|
|
134
|
+
this.allProcessedContent.push({
|
|
135
|
+
simple: simpleContent,
|
|
136
|
+
withMetadata: contentWithMetadata
|
|
124
137
|
});
|
|
125
138
|
|
|
126
139
|
let urlPath = new URL( url ).pathname;
|
|
@@ -140,50 +153,118 @@ class WebScraper
|
|
|
140
153
|
|
|
141
154
|
createJSONLFile ()
|
|
142
155
|
{
|
|
143
|
-
const
|
|
156
|
+
const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
|
|
157
|
+
let writeStreamMeta
|
|
158
|
+
if ( this.includeMetadata )
|
|
159
|
+
{
|
|
160
|
+
writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPathWithMeta ) );
|
|
161
|
+
}
|
|
144
162
|
for ( const content of this.allProcessedContent )
|
|
145
163
|
{
|
|
146
|
-
|
|
147
|
-
|
|
164
|
+
writeStreamSimple.write( `${JSON.stringify( content.simple )}\n` );
|
|
165
|
+
if ( this.includeMetadata )
|
|
166
|
+
{
|
|
167
|
+
writeStreamMeta.write( `${JSON.stringify( content.withMetadata )}\n` );
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
writeStreamSimple.end();
|
|
171
|
+
if ( this.includeMetadata )
|
|
172
|
+
{
|
|
173
|
+
writeStreamMeta.end();
|
|
148
174
|
}
|
|
149
|
-
|
|
150
|
-
writeStream.end();
|
|
151
175
|
console.log( `Created JSONL file at: ${this.jsonlOutputPath}` );
|
|
152
176
|
}
|
|
153
177
|
|
|
154
178
|
createCSVFile ()
|
|
155
179
|
{
|
|
156
|
-
|
|
157
|
-
|
|
180
|
+
// Create simple version
|
|
181
|
+
const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
|
|
182
|
+
writeStreamSimple.write( "text\n" );
|
|
183
|
+
|
|
184
|
+
// Create metadata version if requested
|
|
185
|
+
let writeStreamMeta;
|
|
186
|
+
if ( this.includeMetadata )
|
|
187
|
+
{
|
|
188
|
+
writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.csvOutputPathWithMeta ) );
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
if ( this.includeMetadata )
|
|
192
|
+
{
|
|
193
|
+
const headers = ["text", ...Array.from( this.metadataFields )].join( "," );
|
|
194
|
+
writeStreamMeta.write( `${headers}\n` );
|
|
195
|
+
}
|
|
196
|
+
|
|
158
197
|
for ( const content of this.allProcessedContent )
|
|
159
198
|
{
|
|
160
|
-
|
|
161
|
-
|
|
199
|
+
// Write simple version
|
|
200
|
+
const escapedText = content.simple.text.replace( /"/g, "\"\"" );
|
|
201
|
+
writeStreamSimple.write( `"${escapedText}"\n` );
|
|
202
|
+
|
|
203
|
+
// Write metadata version if requested
|
|
204
|
+
if ( this.includeMetadata )
|
|
162
205
|
{
|
|
163
|
-
|
|
206
|
+
const { metadata } = content.withMetadata;
|
|
207
|
+
const metadataValues = Array.from( this.metadataFields )
|
|
208
|
+
.map( field => { return metadata[field] ? `"${metadata[field].replace( /"/g, "\"\"" )}"` : "\"\"" });
|
|
209
|
+
writeStreamMeta.write( `"${escapedText}",${metadataValues.join( "," )}\n` );
|
|
164
210
|
}
|
|
165
|
-
const escapedText = fullText.replace( /"/g, "\"\"" );
|
|
166
|
-
const csvLine = `"${escapedText}"\n`;
|
|
167
|
-
writeStream.write( csvLine );
|
|
168
211
|
}
|
|
169
212
|
|
|
170
|
-
|
|
171
|
-
|
|
213
|
+
writeStreamSimple.end();
|
|
214
|
+
if ( writeStreamMeta )
|
|
215
|
+
{
|
|
216
|
+
writeStreamMeta.end();
|
|
217
|
+
}
|
|
218
|
+
console.log( `Created simple CSV file at: ${this.csvOutputPath}` );
|
|
219
|
+
if ( this.includeMetadata )
|
|
220
|
+
{
|
|
221
|
+
console.log( `Created metadata CSV file at: ${this.csvOutputPathWithMeta}` );
|
|
222
|
+
}
|
|
172
223
|
}
|
|
173
224
|
|
|
174
225
|
saveNumberedTextFiles ()
|
|
175
226
|
{
|
|
227
|
+
// Create base text folder for simple content
|
|
228
|
+
const baseTextPath = path.join( __dirname, this.textOutputPath );
|
|
229
|
+
|
|
230
|
+
// Create metadata text folder if needed
|
|
231
|
+
let metaTextPath = null;
|
|
232
|
+
if ( this.includeMetadata )
|
|
233
|
+
{
|
|
234
|
+
metaTextPath = path.join( __dirname, `${this.textOutputPath }_with_metadata` );
|
|
235
|
+
fs.mkdirSync( metaTextPath, { recursive: true });
|
|
236
|
+
}
|
|
237
|
+
|
|
176
238
|
this.allProcessedContent.forEach( ( content, index ) =>
|
|
177
239
|
{
|
|
178
240
|
const fileName = `${index + 1}.txt`;
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
241
|
+
|
|
242
|
+
// Always save simple version
|
|
243
|
+
const simpleFilePath = path.join( baseTextPath, fileName );
|
|
244
|
+
fs.writeFileSync( simpleFilePath, content.simple.text, "utf-8" );
|
|
245
|
+
console.log( `Created simple text file: ${fileName}` );
|
|
246
|
+
|
|
247
|
+
// Save metadata version if enabled
|
|
248
|
+
if ( this.includeMetadata )
|
|
182
249
|
{
|
|
183
|
-
|
|
250
|
+
const metaFilePath = path.join( metaTextPath, fileName );
|
|
251
|
+
let fileContent = "";
|
|
252
|
+
|
|
253
|
+
const { metadata } = content.withMetadata;
|
|
254
|
+
// Add metadata fields as headers
|
|
255
|
+
for ( const field of this.metadataFields )
|
|
256
|
+
{
|
|
257
|
+
if ( metadata[field] )
|
|
258
|
+
{
|
|
259
|
+
fileContent += `${field}: ${metadata[field]}\n`;
|
|
260
|
+
}
|
|
261
|
+
}
|
|
262
|
+
fileContent += "\n---\n\n";
|
|
263
|
+
fileContent += content.withMetadata.text;
|
|
264
|
+
|
|
265
|
+
fs.writeFileSync( metaFilePath, fileContent, "utf-8" );
|
|
266
|
+
console.log( `Created metadata text file: ${fileName}` );
|
|
184
267
|
}
|
|
185
|
-
fs.writeFileSync( filePath, titlePrefix + content.text, "utf-8" );
|
|
186
|
-
console.log( `Created numbered text file: ${fileName}` );
|
|
187
268
|
});
|
|
188
269
|
}
|
|
189
270
|
|
|
@@ -209,6 +290,21 @@ class WebScraper
|
|
|
209
290
|
return processed;
|
|
210
291
|
}
|
|
211
292
|
|
|
293
|
+
filterMetadata ( metadata )
|
|
294
|
+
{
|
|
295
|
+
if ( !this.includeMetadata ) return {};
|
|
296
|
+
|
|
297
|
+
const filteredMetadata = {};
|
|
298
|
+
for ( const field of this.metadataFields )
|
|
299
|
+
{
|
|
300
|
+
if ( metadata[field] && typeof metadata[field] === "string" )
|
|
301
|
+
{
|
|
302
|
+
filteredMetadata[field] = metadata[field];
|
|
303
|
+
}
|
|
304
|
+
}
|
|
305
|
+
return filteredMetadata;
|
|
306
|
+
}
|
|
307
|
+
|
|
212
308
|
metadataextractor ( url, document, headers )
|
|
213
309
|
{
|
|
214
310
|
return {
|
|
@@ -222,12 +318,10 @@ class WebScraper
|
|
|
222
318
|
contentLength: headers["content-length"],
|
|
223
319
|
language: document.documentElement.lang || document.querySelector( "html" )?.getAttribute( "lang" ),
|
|
224
320
|
canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
type: document.querySelector( "meta[property=\"og:type\"]" )?.content
|
|
230
|
-
},
|
|
321
|
+
ogTitle: document.querySelector( "meta[property=\"og:title\"]" )?.content,
|
|
322
|
+
ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
|
|
323
|
+
ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
|
|
324
|
+
ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
|
|
231
325
|
dateScraped: new Date().toISOString()
|
|
232
326
|
};
|
|
233
327
|
}
|
|
@@ -265,6 +359,7 @@ class WebScraper
|
|
|
265
359
|
if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
|
|
266
360
|
{
|
|
267
361
|
fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
|
|
362
|
+
fs.rmSync( path.join( __dirname, this.textOutputPath ), { recursive: true, force: true });
|
|
268
363
|
}
|
|
269
364
|
fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
|
|
270
365
|
fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
|
|
@@ -277,33 +372,56 @@ class WebScraper
|
|
|
277
372
|
// Create output directories
|
|
278
373
|
fs.mkdirSync( fullOutputPath, { recursive: true });
|
|
279
374
|
fs.mkdirSync( path.join( fullOutputPath, "texts" ), { recursive: true });
|
|
375
|
+
fs.mkdirSync( path.join( fullOutputPath, "texts_with_metadata" ), { recursive: true });
|
|
280
376
|
|
|
281
|
-
// Combine JSONL files
|
|
377
|
+
// Combine regular JSONL files
|
|
282
378
|
const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) );
|
|
379
|
+
const jsonlMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.jsonl" ) );
|
|
380
|
+
const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
|
|
381
|
+
const csvMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.csv" ) );
|
|
382
|
+
|
|
383
|
+
csvOutput.write( "text\n" );
|
|
384
|
+
const metadataFields = websites.find( w => { return w.includeMetadata })?.metadataFields || new Set();
|
|
385
|
+
if ( metadataFields.size > 0 )
|
|
386
|
+
{
|
|
387
|
+
csvMetaOutput.write( `text,${Array.from( metadataFields ).join( "," )}\n` );
|
|
388
|
+
}
|
|
283
389
|
for ( const website of websites )
|
|
284
390
|
{
|
|
285
391
|
const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
|
|
286
392
|
jsonlOutput.write( jsonlContent );
|
|
287
|
-
}
|
|
288
|
-
jsonlOutput.end();
|
|
289
393
|
|
|
290
|
-
// Combine CSV files
|
|
291
|
-
const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
|
|
292
|
-
csvOutput.write( "text\n" );
|
|
293
|
-
for ( const website of websites )
|
|
294
|
-
{
|
|
295
394
|
const csvContent = fs.readFileSync( path.join( __dirname, website.csvOutputPath ), "utf-8" )
|
|
296
395
|
.split( "\n" )
|
|
297
|
-
.slice( 1 )
|
|
396
|
+
.slice( 1 )
|
|
298
397
|
.filter( line => { return line.trim() });
|
|
299
|
-
csvOutput.write( `${csvContent.join( "\n" )
|
|
398
|
+
csvOutput.write( `${csvContent.join( "\n" )}\n` );
|
|
399
|
+
|
|
400
|
+
// Combine metadata files if they exist
|
|
401
|
+
if ( website.includeMetadata )
|
|
402
|
+
{
|
|
403
|
+
const jsonlMetaContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPathWithMeta ), "utf-8" );
|
|
404
|
+
jsonlMetaOutput.write( jsonlMetaContent );
|
|
405
|
+
|
|
406
|
+
const csvMetaContent = fs.readFileSync( path.join( __dirname, website.csvOutputPathWithMeta ), "utf-8" )
|
|
407
|
+
.split( "\n" )
|
|
408
|
+
.slice( 1 )
|
|
409
|
+
.filter( line => { return line.trim() });
|
|
410
|
+
csvMetaOutput.write( `${csvMetaContent.join( "\n" )}\n` );
|
|
411
|
+
}
|
|
300
412
|
}
|
|
413
|
+
|
|
414
|
+
// Close all streams
|
|
415
|
+
jsonlOutput.end();
|
|
416
|
+
jsonlMetaOutput.end();
|
|
301
417
|
csvOutput.end();
|
|
418
|
+
csvMetaOutput.end();
|
|
302
419
|
|
|
303
|
-
// Combine text files
|
|
420
|
+
// Combine text files (both regular and metadata versions)
|
|
304
421
|
let textFileCounter = 1;
|
|
305
422
|
for ( const website of websites )
|
|
306
423
|
{
|
|
424
|
+
// Regular text files
|
|
307
425
|
const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
|
|
308
426
|
for ( const file of textFiles )
|
|
309
427
|
{
|
|
@@ -313,6 +431,20 @@ class WebScraper
|
|
|
313
431
|
content,
|
|
314
432
|
"utf-8"
|
|
315
433
|
);
|
|
434
|
+
|
|
435
|
+
// Metadata text files if they exist
|
|
436
|
+
if ( website.includeMetadata )
|
|
437
|
+
{
|
|
438
|
+
const metaContent = fs.readFileSync(
|
|
439
|
+
path.join( __dirname, `${website.textOutputPath}_with_metadata`, file ),
|
|
440
|
+
"utf-8"
|
|
441
|
+
);
|
|
442
|
+
fs.writeFileSync(
|
|
443
|
+
path.join( fullOutputPath, "texts_with_metadata", `${textFileCounter}.txt` ),
|
|
444
|
+
metaContent,
|
|
445
|
+
"utf-8"
|
|
446
|
+
);
|
|
447
|
+
}
|
|
316
448
|
textFileCounter++;
|
|
317
449
|
}
|
|
318
450
|
}
|