clean-web-scraper 2.3.2 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -13,7 +13,8 @@ A powerful Node.js web scraper that extracts clean, readable content from websites
  - 🎯 No duplicate page visits
  - 📊 Generates JSONL output file for ML training
  - 📊 AI-friendly clean text and csv output (perfect for LLM fine-tuning!)
- - 📊 Rich metadata extraction including:
+ - 📊 Rich metadata extraction
+ - 📁 Combine results from multiple scrapers into a unified dataset

  ## 🛠️ Prerequisites

@@ -51,12 +52,15 @@ const scraper = new WebScraper({
    scrapResultPath: './example.com/website', // Required: Where to save the content
    jsonlOutputPath: './example.com/train.jsonl', // Optional: Custom JSONL output path
    textOutputPath: "./example.com/texts", // Optional: Custom text output path
-   csvOutputPath: "./example.com/train.csv" // Optional: Custom CSV output path
+   csvOutputPath: "./example.com/train.csv", // Optional: Custom CSV output path
    maxDepth: 3, // Optional: Maximum depth for recursive crawling
-   includeTitles: true, // Optional: Include page titles in outputs
+   includeMetadata: false, // Optional: Include metadata in output files
+   metadataFields: ['title', 'description'] // Optional: Specify metadata fields to include
  });
-
  scraper.start();
+
+ // Combine results from multiple scrapers
+ WebScraper.combineResults('./combined-dataset', [scraper1, scraper2]);
  ```

  ```bash
@@ -82,11 +86,16 @@ example.com/
  │   └── blog/
  │       ├── post1.txt
  │       └── post1.json
- │── texts/               # Numbered text files
- ├── 1.txt
- ├── 2.txt
- │── train.jsonl          # Combined content
- └── train.csv            # Clean text in CSV format
+ ├── texts/                     # Numbered text files
+ │   ├── 1.txt
+ │   └── 2.txt
+ ├── texts_with_metadata/       # When includeMetadata is true
+ │   ├── 1.txt
+ │   └── 2.txt
+ ├── train.jsonl                # Combined content
+ ├── train_with_metadata.jsonl  # When includeMetadata is true
+ ├── train.csv                  # Clean text in CSV format
+ └── train_with_metadata.csv    # When includeMetadata is true
  ```

  ## 🤖 AI/LLM Training Ready
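For context, each line of the generated train.jsonl is a standalone JSON object: `{ text }` in the simple file, `{ text, metadata }` in the `_with_metadata` variant. A minimal sketch of reading it back, assuming the default paths from the example above:

```js
const fs = require( "fs" );

// Illustrative path; match it to your jsonlOutputPath setting.
const lines = fs.readFileSync( "./example.com/train.jsonl", "utf-8" )
    .split( "\n" )
    .filter( line => { return line.trim() });

for ( const line of lines )
{
    const record = JSON.parse( line ); // { text: "..." }
    console.log( record.text.slice( 0, 80 ) );
}
```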
package/example-usage.js CHANGED
@@ -19,9 +19,11 @@ async function khameneiIrFreePalestineTag ()
          jsonlOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.jsonl",
          textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
          csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
-         includeTitles: true
+         includeMetadata: true,
+         metadataFields: ["title", "description", "author", "lastModified", "language", "ogTags"]
      });
      await scraper.start();
+     return scraper;
  }

  async function decolonizepalestine ()
@@ -44,16 +46,22 @@ async function decolonizepalestine ()
          scrapResultPath: "./dataset/decolonizepalestine/website",
          jsonlOutputPath: "./dataset/decolonizepalestine/train.jsonl",
          textOutputPath: "./dataset/decolonizepalestine/texts",
-         csvOutputPath: "./dataset/decolonizepalestine/train.csv"
+         csvOutputPath: "./dataset/decolonizepalestine/train.csv",
+         includeMetadata: true,
+         metadataFields: ["title", "description", "author", "lastModified", "language"]
      });
      await scraper.start();
+     return scraper;
  }

  void async function main ()
  {
-     await khameneiIrFreePalestineTag();
-     await decolonizepalestine();
-
+     const khameneiIrFreePalestineTagScraper = await khameneiIrFreePalestineTag();
+     const decolonizepalestineScraper = await decolonizepalestine();
+     WebScraper.combineResults( "./dataset/combined", [
+         khameneiIrFreePalestineTagScraper,
+         decolonizepalestineScraper
+     ] );

      // 3
      // https://bdsmovement.net
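With includeMetadata enabled as in both functions above, each file under texts_with_metadata/ leads with the whitelisted fields as `field: value` lines, separated from the body by a `---` marker (see saveNumberedTextFiles in the WebScraper.js diff below). An illustrative file, with placeholder values:

```
title: Example Article
description: A short example description
language: en

---

The cleaned article text follows here...
```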
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
      "name": "clean-web-scraper",
-     "version": "2.3.2",
+     "version": "3.0.0",
      "main": "main.js",
      "scripts": {
          "start": "node main.js",
@@ -16,7 +16,8 @@
          "ai-ready-web-scraper",
          "ai",
          "fine-tune",
-         "data-processing"
+         "data-processing",
+         "dataset"
      ],
      "author": "",
      "license": "ISC",
package/src/WebScraper.js CHANGED
@@ -17,7 +17,8 @@ class WebScraper
          jsonlOutputPath,
          textOutputPath,
          csvOutputPath,
-         includeTitles = false
+         includeMetadata = false,
+         metadataFields = [] // ['title', 'description', 'author', 'lastModified', etc.]
      })
      {
          this.baseURL = baseURL;
@@ -27,7 +28,10 @@ class WebScraper
          this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
          this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
          this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
-         this.includeTitles = includeTitles;
+         this.jsonlOutputPathWithMeta = jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
+         this.csvOutputPathWithMeta = csvOutputPath.replace( ".csv", "_with_metadata.csv" );
+         this.includeMetadata = includeMetadata;
+         this.metadataFields = new Set( metadataFields );
          this.visited = new Set();
          this.excludeList = new Set( excludeList );
          this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
@@ -118,9 +122,18 @@ class WebScraper
      {
          const processedContent = this.processContent( content );

-         this.allProcessedContent.push({
+         const simpleContent = {
+             text: processedContent.trim()
+         };
+
+         const contentWithMetadata = {
              text: processedContent.trim(),
-             metadata
+             metadata: this.filterMetadata( metadata )
+         };
+
+         this.allProcessedContent.push({
+             simple: simpleContent,
+             withMetadata: contentWithMetadata
          });

          let urlPath = new URL( url ).pathname;
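After this hunk, each allProcessedContent entry carries both variants side by side. An illustrative sketch of one entry, with placeholder values:

```js
// Placeholder data, not real scraper output.
const entry = {
    simple: { text: "Clean page text." },
    withMetadata: {
        text: "Clean page text.",
        // filterMetadata (added below) keeps only whitelisted string-valued fields
        metadata: { title: "Example title", description: "Example description" }
    }
};
```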
@@ -140,50 +153,118 @@

      createJSONLFile ()
      {
-         const writeStream = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
+         const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
+         let writeStreamMeta;
+         if ( this.includeMetadata )
+         {
+             writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPathWithMeta ) );
+         }
          for ( const content of this.allProcessedContent )
          {
-             const jsonLine = `${JSON.stringify( content )}\n`;
-             writeStream.write( jsonLine );
+             writeStreamSimple.write( `${JSON.stringify( content.simple )}\n` );
+             if ( this.includeMetadata )
+             {
+                 writeStreamMeta.write( `${JSON.stringify( content.withMetadata )}\n` );
+             }
+         }
+         writeStreamSimple.end();
+         if ( this.includeMetadata )
+         {
+             writeStreamMeta.end();
          }
-
-         writeStream.end();
          console.log( `Created JSONL file at: ${this.jsonlOutputPath}` );
      }

      createCSVFile ()
      {
-         const writeStream = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
-         writeStream.write( "text\n" );
+         // Create simple version
+         const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
+         writeStreamSimple.write( "text\n" );
+
+         // Create metadata version if requested
+         let writeStreamMeta;
+         if ( this.includeMetadata )
+         {
+             writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.csvOutputPathWithMeta ) );
+         }
+
+         if ( this.includeMetadata )
+         {
+             const headers = ["text", ...Array.from( this.metadataFields )].join( "," );
+             writeStreamMeta.write( `${headers}\n` );
+         }
+
          for ( const content of this.allProcessedContent )
          {
-             let fullText = content.text;
-             if ( this.includeTitles && content.metadata.title )
+             // Write simple version
+             const escapedText = content.simple.text.replace( /"/g, "\"\"" );
+             writeStreamSimple.write( `"${escapedText}"\n` );
+
+             // Write metadata version if requested
+             if ( this.includeMetadata )
              {
-                 fullText = `Title: ${content.metadata.title}\n\n${content.text}`;
+                 const { metadata } = content.withMetadata;
+                 const metadataValues = Array.from( this.metadataFields )
+                     .map( field => { return metadata[field] ? `"${metadata[field].replace( /"/g, "\"\"" )}"` : "\"\"" });
+                 writeStreamMeta.write( `"${escapedText}",${metadataValues.join( "," )}\n` );
              }
-             const escapedText = fullText.replace( /"/g, "\"\"" );
-             const csvLine = `"${escapedText}"\n`;
-             writeStream.write( csvLine );
          }

-         writeStream.end();
-         console.log( `Created CSV file at: ${this.csvOutputPath}` );
+         writeStreamSimple.end();
+         if ( writeStreamMeta )
+         {
+             writeStreamMeta.end();
+         }
+         console.log( `Created simple CSV file at: ${this.csvOutputPath}` );
+         if ( this.includeMetadata )
+         {
+             console.log( `Created metadata CSV file at: ${this.csvOutputPathWithMeta}` );
+         }
      }

      saveNumberedTextFiles ()
      {
+         // Create base text folder for simple content
+         const baseTextPath = path.join( __dirname, this.textOutputPath );
+
+         // Create metadata text folder if needed
+         let metaTextPath = null;
+         if ( this.includeMetadata )
+         {
+             metaTextPath = path.join( __dirname, `${this.textOutputPath}_with_metadata` );
+             fs.mkdirSync( metaTextPath, { recursive: true });
+         }
+
          this.allProcessedContent.forEach( ( content, index ) =>
          {
              const fileName = `${index + 1}.txt`;
-             const filePath = path.join( __dirname, this.textOutputPath, fileName );
-             let titlePrefix = "";
-             if ( this.includeTitles && content.metadata.title )
+
+             // Always save simple version
+             const simpleFilePath = path.join( baseTextPath, fileName );
+             fs.writeFileSync( simpleFilePath, content.simple.text, "utf-8" );
+             console.log( `Created simple text file: ${fileName}` );
+
+             // Save metadata version if enabled
+             if ( this.includeMetadata )
              {
-                 titlePrefix = `Title: ${content.metadata.title}\n\n`;
+                 const metaFilePath = path.join( metaTextPath, fileName );
+                 let fileContent = "";
+
+                 const { metadata } = content.withMetadata;
+                 // Add metadata fields as headers
+                 for ( const field of this.metadataFields )
+                 {
+                     if ( metadata[field] )
+                     {
+                         fileContent += `${field}: ${metadata[field]}\n`;
+                     }
+                 }
+                 fileContent += "\n---\n\n";
+                 fileContent += content.withMetadata.text;
+
+                 fs.writeFileSync( metaFilePath, fileContent, "utf-8" );
+                 console.log( `Created metadata text file: ${fileName}` );
              }
-             fs.writeFileSync( filePath, titlePrefix + content.text, "utf-8" );
-             console.log( `Created numbered text file: ${fileName}` );
          });
      }

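createCSVFile quotes every field and doubles any embedded quotes, for both the simple and metadata streams: standard RFC 4180 escaping. A quick illustration with a made-up value:

```js
const raw = "She said \"hello\" and left";
const escaped = raw.replace( /"/g, "\"\"" ); // She said ""hello"" and left
const csvField = `"${escaped}"`;             // safe to join with commas, newlines stay inside the quotes
```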
@@ -209,6 +290,21 @@ class WebScraper
          return processed;
      }

+     filterMetadata ( metadata )
+     {
+         if ( !this.includeMetadata ) return {};
+
+         const filteredMetadata = {};
+         for ( const field of this.metadataFields )
+         {
+             if ( metadata[field] && typeof metadata[field] === "string" )
+             {
+                 filteredMetadata[field] = metadata[field];
+             }
+         }
+         return filteredMetadata;
+     }
+
      metadataextractor ( url, document, headers )
      {
          return {
@@ -222,12 +318,10 @@ class WebScraper
              contentLength: headers["content-length"],
              language: document.documentElement.lang || document.querySelector( "html" )?.getAttribute( "lang" ),
              canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
-             ogTags: {
-                 title: document.querySelector( "meta[property=\"og:title\"]" )?.content,
-                 description: document.querySelector( "meta[property=\"og:description\"]" )?.content,
-                 image: document.querySelector( "meta[property=\"og:image\"]" )?.content,
-                 type: document.querySelector( "meta[property=\"og:type\"]" )?.content
-             },
+             ogTitle: document.querySelector( "meta[property=\"og:title\"]" )?.content,
+             ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
+             ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
+             ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
              dateScraped: new Date().toISOString()
          };
      }
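Flattening ogTags into top-level fields matters for filterMetadata above, which keeps only string values: the old nested ogTags object would have been silently dropped. The flat keys can now be whitelisted directly; a sketch, with a placeholder URL:

```js
const scraper = new WebScraper({
    baseURL: "https://example.com", // placeholder
    scrapResultPath: "./example.com/website",
    includeMetadata: true,
    metadataFields: ["title", "ogTitle", "ogDescription"] // flat og keys pass the string check
});
```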
@@ -269,6 +363,54 @@ class WebScraper
          fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
          fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
      }
+
+     static combineResults ( outputPath, websites )
+     {
+         const fullOutputPath = path.join( __dirname, outputPath );
+
+         // Create output directories
+         fs.mkdirSync( fullOutputPath, { recursive: true });
+         fs.mkdirSync( path.join( fullOutputPath, "texts" ), { recursive: true });
+
+         // Combine JSONL files
+         const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) );
+         for ( const website of websites )
+         {
+             const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
+             jsonlOutput.write( jsonlContent );
+         }
+         jsonlOutput.end();
+
+         // Combine CSV files
+         const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
+         csvOutput.write( "text\n" );
+         for ( const website of websites )
+         {
+             const csvContent = fs.readFileSync( path.join( __dirname, website.csvOutputPath ), "utf-8" )
+                 .split( "\n" )
+                 .slice( 1 ) // Skip header
+                 .filter( line => { return line.trim() });
+             csvOutput.write( `${csvContent.join( "\n" )}\n` );
+         }
+         csvOutput.end();
+
+         // Combine text files
+         let textFileCounter = 1;
+         for ( const website of websites )
+         {
+             const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
+             for ( const file of textFiles )
+             {
+                 const content = fs.readFileSync( path.join( __dirname, website.textOutputPath, file ), "utf-8" );
+                 fs.writeFileSync(
+                     path.join( fullOutputPath, "texts", `${textFileCounter}.txt` ),
+                     content,
+                     "utf-8"
+                 );
+                 textFileCounter++;
+             }
+         }
+     }
  }

  module.exports = WebScraper;
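Putting the 3.0.0 pieces together: combineResults reads each scraper's train.jsonl, train.csv, and texts/ from disk, so every start() must finish before it runs. A minimal end-to-end sketch (site URLs and dataset paths are placeholders, and the require assumes the package's main entry re-exports the WebScraper class):

```js
const WebScraper = require( "clean-web-scraper" ); // assumption: main.js re-exports the class

void async function main ()
{
    const scraperA = new WebScraper({
        baseURL: "https://site-a.example", // placeholder
        scrapResultPath: "./dataset/site-a/website",
        includeMetadata: true,
        metadataFields: ["title", "description"]
    });
    const scraperB = new WebScraper({
        baseURL: "https://site-b.example", // placeholder
        scrapResultPath: "./dataset/site-b/website"
    });

    await scraperA.start();
    await scraperB.start();

    // Produces combined.jsonl, combined.csv, and renumbered files under texts/
    WebScraper.combineResults( "./dataset/combined", [scraperA, scraperB] );
}();
```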