clean-web-scraper 2.3.3 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -52,9 +52,10 @@ const scraper = new WebScraper({
52
52
  scrapResultPath: './example.com/website', // Required: Where to save the content
53
53
  jsonlOutputPath: './example.com/train.jsonl', // Optional: Custom JSONL output path
54
54
  textOutputPath: "./example.com/texts", // Optional: Custom text output path
55
- csvOutputPath: "./example.com/train.csv" // Optional: Custom CSV output path
55
+ csvOutputPath: "./example.com/train.csv", // Optional: Custom CSV output path
56
56
  maxDepth: 3, // Optional: Maximum depth for recursive crawling
57
- includeTitles: true, // Optional: Include page titles in outputs
57
+ includeMetadata: false, // Optional: Include metadata in output files
58
+ metadataFields: ['title', 'description'] // Optional: Specify metadata fields to include
58
59
  });
59
60
  scraper.start();
60
61
 
@@ -85,11 +86,16 @@ example.com/
85
86
  │ └── blog/
86
87
  │ ├── post1.txt
87
88
  │ └── post1.json
88
- │── texts/ # Numbered text files
89
- ├── 1.txt
90
- ├── 2.txt
91
- │── train.jsonl # Combined content
92
- └── train.csv # Clean text in CSV format
89
+ ├── texts/ # Numbered text files
90
+ ├── 1.txt
91
+ └── 2.txt
92
+ ├── texts_with_metadata/ # When includeMetadata is true
93
+ │ ├── 1.txt
94
+ │ └── 2.txt
95
+ ├── train.jsonl # Combined content
96
+ ├── train_with_metadata.jsonl # When includeMetadata is true
97
+ ├── train.csv # Clean text in CSV format
98
+ └── train_with_metadata.csv # When includeMetadata is true
93
99
  ```
94
100
 
95
101
  ## 🤖 AI/LLM Training Ready
package/example-usage.js CHANGED
@@ -19,9 +19,10 @@ async function khameneiIrFreePalestineTag ()
19
19
  jsonlOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.jsonl",
20
20
  textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
21
21
  csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
22
- includeTitles: true
22
+ includeMetadata: true,
23
+ metadataFields: ["title", "description", "author", "lastModified", "language"]
23
24
  });
24
- // await scraper.start();
25
+ await scraper.start();
25
26
  return scraper;
26
27
  }
27
28
 
@@ -45,9 +46,11 @@ async function decolonizepalestine ()
45
46
  scrapResultPath: "./dataset/decolonizepalestine/website",
46
47
  jsonlOutputPath: "./dataset/decolonizepalestine/train.jsonl",
47
48
  textOutputPath: "./dataset/decolonizepalestine/texts",
48
- csvOutputPath: "./dataset/decolonizepalestine/train.csv"
49
+ csvOutputPath: "./dataset/decolonizepalestine/train.csv",
50
+ includeMetadata: true,
51
+ metadataFields: ["title", "description", "author", "lastModified", "language"]
49
52
  });
50
- // await scraper.start();
53
+ await scraper.start();
51
54
  return scraper;
52
55
  }
53
56
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "2.3.3",
3
+ "version": "3.1.0",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",
@@ -16,7 +16,8 @@
16
16
  "ai-ready-web-scraper",
17
17
  "ai",
18
18
  "fine-tune",
19
- "data-processing"
19
+ "data-processing",
20
+ "dataset"
20
21
  ],
21
22
  "author": "",
22
23
  "license": "ISC",
package/src/WebScraper.js CHANGED
@@ -17,7 +17,8 @@ class WebScraper
17
17
  jsonlOutputPath,
18
18
  textOutputPath,
19
19
  csvOutputPath,
20
- includeTitles = false
20
+ includeMetadata = false,
21
+ metadataFields = [] // ['title', 'description', 'author', 'lastModified', etc.]
21
22
  })
22
23
  {
23
24
  this.baseURL = baseURL;
@@ -27,7 +28,10 @@ class WebScraper
27
28
  this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
28
29
  this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
29
30
  this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
30
- this.includeTitles = includeTitles;
31
+ this.jsonlOutputPathWithMeta = jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
32
+ this.csvOutputPathWithMeta = csvOutputPath.replace( ".csv", "_with_metadata.csv" );
33
+ this.includeMetadata = includeMetadata;
34
+ this.metadataFields = new Set( metadataFields );
31
35
  this.visited = new Set();
32
36
  this.excludeList = new Set( excludeList );
33
37
  this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
@@ -118,9 +122,18 @@ class WebScraper
118
122
  {
119
123
  const processedContent = this.processContent( content );
120
124
 
121
- this.allProcessedContent.push({
125
+ const simpleContent = {
126
+ text: processedContent.trim()
127
+ };
128
+
129
+ const contentWithMetadata = {
122
130
  text: processedContent.trim(),
123
- metadata
131
+ metadata: this.filterMetadata( metadata )
132
+ };
133
+
134
+ this.allProcessedContent.push({
135
+ simple: simpleContent,
136
+ withMetadata: contentWithMetadata
124
137
  });
125
138
 
126
139
  let urlPath = new URL( url ).pathname;
@@ -140,50 +153,118 @@ class WebScraper
140
153
 
141
154
  createJSONLFile ()
142
155
  {
143
- const writeStream = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
156
+ const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
157
+ let writeStreamMeta
158
+ if ( this.includeMetadata )
159
+ {
160
+ writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPathWithMeta ) );
161
+ }
144
162
  for ( const content of this.allProcessedContent )
145
163
  {
146
- const jsonLine = `${JSON.stringify( content )}\n`;
147
- writeStream.write( jsonLine );
164
+ writeStreamSimple.write( `${JSON.stringify( content.simple )}\n` );
165
+ if ( this.includeMetadata )
166
+ {
167
+ writeStreamMeta.write( `${JSON.stringify( content.withMetadata )}\n` );
168
+ }
169
+ }
170
+ writeStreamSimple.end();
171
+ if ( this.includeMetadata )
172
+ {
173
+ writeStreamMeta.end();
148
174
  }
149
-
150
- writeStream.end();
151
175
  console.log( `Created JSONL file at: ${this.jsonlOutputPath}` );
152
176
  }
153
177
 
154
178
  createCSVFile ()
155
179
  {
156
- const writeStream = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
157
- writeStream.write( "text\n" );
180
+ // Create simple version
181
+ const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
182
+ writeStreamSimple.write( "text\n" );
183
+
184
+ // Create metadata version if requested
185
+ let writeStreamMeta;
186
+ if ( this.includeMetadata )
187
+ {
188
+ writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.csvOutputPathWithMeta ) );
189
+ }
190
+
191
+ if ( this.includeMetadata )
192
+ {
193
+ const headers = ["text", ...Array.from( this.metadataFields )].join( "," );
194
+ writeStreamMeta.write( `${headers}\n` );
195
+ }
196
+
158
197
  for ( const content of this.allProcessedContent )
159
198
  {
160
- let fullText = content.text;
161
- if ( this.includeTitles && content.metadata.title )
199
+ // Write simple version
200
+ const escapedText = content.simple.text.replace( /"/g, "\"\"" );
201
+ writeStreamSimple.write( `"${escapedText}"\n` );
202
+
203
+ // Write metadata version if requested
204
+ if ( this.includeMetadata )
162
205
  {
163
- fullText = `Title: ${content.metadata.title}\n\n${content.text}`;
206
+ const { metadata } = content.withMetadata;
207
+ const metadataValues = Array.from( this.metadataFields )
208
+ .map( field => { return metadata[field] ? `"${metadata[field].replace( /"/g, "\"\"" )}"` : "\"\"" });
209
+ writeStreamMeta.write( `"${escapedText}",${metadataValues.join( "," )}\n` );
164
210
  }
165
- const escapedText = fullText.replace( /"/g, "\"\"" );
166
- const csvLine = `"${escapedText}"\n`;
167
- writeStream.write( csvLine );
168
211
  }
169
212
 
170
- writeStream.end();
171
- console.log( `Created CSV file at: ${this.csvOutputPath}` );
213
+ writeStreamSimple.end();
214
+ if ( writeStreamMeta )
215
+ {
216
+ writeStreamMeta.end();
217
+ }
218
+ console.log( `Created simple CSV file at: ${this.csvOutputPath}` );
219
+ if ( this.includeMetadata )
220
+ {
221
+ console.log( `Created metadata CSV file at: ${this.csvOutputPathWithMeta}` );
222
+ }
172
223
  }
173
224
 
174
225
  saveNumberedTextFiles ()
175
226
  {
227
+ // Create base text folder for simple content
228
+ const baseTextPath = path.join( __dirname, this.textOutputPath );
229
+
230
+ // Create metadata text folder if needed
231
+ let metaTextPath = null;
232
+ if ( this.includeMetadata )
233
+ {
234
+ metaTextPath = path.join( __dirname, `${this.textOutputPath }_with_metadata` );
235
+ fs.mkdirSync( metaTextPath, { recursive: true });
236
+ }
237
+
176
238
  this.allProcessedContent.forEach( ( content, index ) =>
177
239
  {
178
240
  const fileName = `${index + 1}.txt`;
179
- const filePath = path.join( __dirname, this.textOutputPath, fileName );
180
- let titlePrefix = "";
181
- if ( this.includeTitles && content.metadata.title )
241
+
242
+ // Always save simple version
243
+ const simpleFilePath = path.join( baseTextPath, fileName );
244
+ fs.writeFileSync( simpleFilePath, content.simple.text, "utf-8" );
245
+ console.log( `Created simple text file: ${fileName}` );
246
+
247
+ // Save metadata version if enabled
248
+ if ( this.includeMetadata )
182
249
  {
183
- titlePrefix = `Title: ${content.metadata.title}\n\n`;
250
+ const metaFilePath = path.join( metaTextPath, fileName );
251
+ let fileContent = "";
252
+
253
+ const { metadata } = content.withMetadata;
254
+ // Add metadata fields as headers
255
+ for ( const field of this.metadataFields )
256
+ {
257
+ if ( metadata[field] )
258
+ {
259
+ fileContent += `${field}: ${metadata[field]}\n`;
260
+ }
261
+ }
262
+ fileContent += "\n---\n\n";
263
+ fileContent += content.withMetadata.text;
264
+
265
+ fs.writeFileSync( metaFilePath, fileContent, "utf-8" );
266
+ console.log( `Created metadata text file: ${fileName}` );
184
267
  }
185
- fs.writeFileSync( filePath, titlePrefix + content.text, "utf-8" );
186
- console.log( `Created numbered text file: ${fileName}` );
187
268
  });
188
269
  }
189
270
 
@@ -209,6 +290,21 @@ class WebScraper
209
290
  return processed;
210
291
  }
211
292
 
293
+ filterMetadata ( metadata )
294
+ {
295
+ if ( !this.includeMetadata ) return {};
296
+
297
+ const filteredMetadata = {};
298
+ for ( const field of this.metadataFields )
299
+ {
300
+ if ( metadata[field] && typeof metadata[field] === "string" )
301
+ {
302
+ filteredMetadata[field] = metadata[field];
303
+ }
304
+ }
305
+ return filteredMetadata;
306
+ }
307
+
212
308
  metadataextractor ( url, document, headers )
213
309
  {
214
310
  return {
@@ -222,12 +318,10 @@ class WebScraper
222
318
  contentLength: headers["content-length"],
223
319
  language: document.documentElement.lang || document.querySelector( "html" )?.getAttribute( "lang" ),
224
320
  canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
225
- ogTags: {
226
- title: document.querySelector( "meta[property=\"og:title\"]" )?.content,
227
- description: document.querySelector( "meta[property=\"og:description\"]" )?.content,
228
- image: document.querySelector( "meta[property=\"og:image\"]" )?.content,
229
- type: document.querySelector( "meta[property=\"og:type\"]" )?.content
230
- },
321
+ ogTitle: document.querySelector( "meta[property=\"og:title\"]" )?.content,
322
+ ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
323
+ ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
324
+ ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
231
325
  dateScraped: new Date().toISOString()
232
326
  };
233
327
  }
@@ -265,6 +359,7 @@ class WebScraper
265
359
  if ( fs.existsSync( path.join( __dirname, this.scrapResultPath ) ) )
266
360
  {
267
361
  fs.rmSync( path.join( __dirname, this.scrapResultPath ), { recursive: true, force: true });
362
+ fs.rmSync( path.join( __dirname, this.textOutputPath ), { recursive: true, force: true });
268
363
  }
269
364
  fs.mkdirSync( path.join( __dirname, this.scrapResultPath ), { recursive: true });
270
365
  fs.mkdirSync( path.join( __dirname, this.textOutputPath ), { recursive: true });
@@ -277,33 +372,56 @@ class WebScraper
277
372
  // Create output directories
278
373
  fs.mkdirSync( fullOutputPath, { recursive: true });
279
374
  fs.mkdirSync( path.join( fullOutputPath, "texts" ), { recursive: true });
375
+ fs.mkdirSync( path.join( fullOutputPath, "texts_with_metadata" ), { recursive: true });
280
376
 
281
- // Combine JSONL files
377
+ // Combine regular JSONL files
282
378
  const jsonlOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.jsonl" ) );
379
+ const jsonlMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.jsonl" ) );
380
+ const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
381
+ const csvMetaOutput = fs.createWriteStream( path.join( fullOutputPath, "combined_with_metadata.csv" ) );
382
+
383
+ csvOutput.write( "text\n" );
384
+ const metadataFields = websites.find( w => { return w.includeMetadata })?.metadataFields || new Set();
385
+ if ( metadataFields.size > 0 )
386
+ {
387
+ csvMetaOutput.write( `text,${Array.from( metadataFields ).join( "," )}\n` );
388
+ }
283
389
  for ( const website of websites )
284
390
  {
285
391
  const jsonlContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPath ), "utf-8" );
286
392
  jsonlOutput.write( jsonlContent );
287
- }
288
- jsonlOutput.end();
289
393
 
290
- // Combine CSV files
291
- const csvOutput = fs.createWriteStream( path.join( fullOutputPath, "combined.csv" ) );
292
- csvOutput.write( "text\n" );
293
- for ( const website of websites )
294
- {
295
394
  const csvContent = fs.readFileSync( path.join( __dirname, website.csvOutputPath ), "utf-8" )
296
395
  .split( "\n" )
297
- .slice( 1 ) // Skip header
396
+ .slice( 1 )
298
397
  .filter( line => { return line.trim() });
299
- csvOutput.write( `${csvContent.join( "\n" ) }\n` );
398
+ csvOutput.write( `${csvContent.join( "\n" )}\n` );
399
+
400
+ // Combine metadata files if they exist
401
+ if ( website.includeMetadata )
402
+ {
403
+ const jsonlMetaContent = fs.readFileSync( path.join( __dirname, website.jsonlOutputPathWithMeta ), "utf-8" );
404
+ jsonlMetaOutput.write( jsonlMetaContent );
405
+
406
+ const csvMetaContent = fs.readFileSync( path.join( __dirname, website.csvOutputPathWithMeta ), "utf-8" )
407
+ .split( "\n" )
408
+ .slice( 1 )
409
+ .filter( line => { return line.trim() });
410
+ csvMetaOutput.write( `${csvMetaContent.join( "\n" )}\n` );
411
+ }
300
412
  }
413
+
414
+ // Close all streams
415
+ jsonlOutput.end();
416
+ jsonlMetaOutput.end();
301
417
  csvOutput.end();
418
+ csvMetaOutput.end();
302
419
 
303
- // Combine text files
420
+ // Combine text files (both regular and metadata versions)
304
421
  let textFileCounter = 1;
305
422
  for ( const website of websites )
306
423
  {
424
+ // Regular text files
307
425
  const textFiles = fs.readdirSync( path.join( __dirname, website.textOutputPath ) );
308
426
  for ( const file of textFiles )
309
427
  {
@@ -313,6 +431,20 @@ class WebScraper
313
431
  content,
314
432
  "utf-8"
315
433
  );
434
+
435
+ // Metadata text files if they exist
436
+ if ( website.includeMetadata )
437
+ {
438
+ const metaContent = fs.readFileSync(
439
+ path.join( __dirname, `${website.textOutputPath}_with_metadata`, file ),
440
+ "utf-8"
441
+ );
442
+ fs.writeFileSync(
443
+ path.join( fullOutputPath, "texts_with_metadata", `${textFileCounter}.txt` ),
444
+ metaContent,
445
+ "utf-8"
446
+ );
447
+ }
316
448
  textFileCounter++;
317
449
  }
318
450
  }