clean-web-scraper 2.3.3 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -52,9 +52,10 @@ const scraper = new WebScraper({
52
52
  scrapResultPath: './example.com/website', // Required: Where to save the content
53
53
  jsonlOutputPath: './example.com/train.jsonl', // Optional: Custom JSONL output path
54
54
  textOutputPath: "./example.com/texts", // Optional: Custom text output path
55
- csvOutputPath: "./example.com/train.csv" // Optional: Custom CSV output path
55
+ csvOutputPath: "./example.com/train.csv", // Optional: Custom CSV output path
56
56
  maxDepth: 3, // Optional: Maximum depth for recursive crawling
57
- includeTitles: true, // Optional: Include page titles in outputs
57
+ includeMetadata: false, // Optional: Include metadata in output files
58
+ metadataFields: ['title', 'description'] // Optional: Specify metadata fields to include
58
59
  });
59
60
  scraper.start();
60
61
 
@@ -85,11 +86,16 @@ example.com/
85
86
  │ └── blog/
86
87
  │ ├── post1.txt
87
88
  │ └── post1.json
88
- │── texts/ # Numbered text files
89
- ├── 1.txt
90
- ├── 2.txt
91
- │── train.jsonl # Combined content
92
- └── train.csv # Clean text in CSV format
89
+ ├── texts/ # Numbered text files
90
+ ├── 1.txt
91
+ └── 2.txt
92
+ ├── texts_with_metadata/ # When includeMetadata is true
93
+ │ ├── 1.txt
94
+ │ └── 2.txt
95
+ ├── train.jsonl # Combined content
96
+ ├── train_with_metadata.jsonl # When includeMetadata is true
97
+ ├── train.csv # Clean text in CSV format
98
+ └── train_with_metadata.csv # When includeMetadata is true
93
99
  ```
94
100
 
95
101
  ## 🤖 AI/LLM Training Ready
package/example-usage.js CHANGED
@@ -19,9 +19,10 @@ async function khameneiIrFreePalestineTag ()
19
19
  jsonlOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.jsonl",
20
20
  textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
21
21
  csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
22
- includeTitles: true
22
+ includeMetadata: true,
23
+ metadataFields: ["title", "description", "author", "lastModified", "language", "ogTags"]
23
24
  });
24
- // await scraper.start();
25
+ await scraper.start();
25
26
  return scraper;
26
27
  }
27
28
 
@@ -45,9 +46,11 @@ async function decolonizepalestine ()
45
46
  scrapResultPath: "./dataset/decolonizepalestine/website",
46
47
  jsonlOutputPath: "./dataset/decolonizepalestine/train.jsonl",
47
48
  textOutputPath: "./dataset/decolonizepalestine/texts",
48
- csvOutputPath: "./dataset/decolonizepalestine/train.csv"
49
+ csvOutputPath: "./dataset/decolonizepalestine/train.csv",
50
+ includeMetadata: true,
51
+ metadataFields: ["title", "description", "author", "lastModified", "language"]
49
52
  });
50
- // await scraper.start();
53
+ await scraper.start();
51
54
  return scraper;
52
55
  }
53
56
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "2.3.3",
3
+ "version": "3.0.0",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",
@@ -16,7 +16,8 @@
16
16
  "ai-ready-web-scraper",
17
17
  "ai",
18
18
  "fine-tune",
19
- "data-processing"
19
+ "data-processing",
20
+ "dataset"
20
21
  ],
21
22
  "author": "",
22
23
  "license": "ISC",
package/src/WebScraper.js CHANGED
@@ -17,7 +17,8 @@ class WebScraper
17
17
  jsonlOutputPath,
18
18
  textOutputPath,
19
19
  csvOutputPath,
20
- includeTitles = false
20
+ includeMetadata = false,
21
+ metadataFields = [] // ['title', 'description', 'author', 'lastModified', etc.]
21
22
  })
22
23
  {
23
24
  this.baseURL = baseURL;
@@ -27,7 +28,10 @@ class WebScraper
27
28
  this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
28
29
  this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
29
30
  this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
30
- this.includeTitles = includeTitles;
31
+ this.jsonlOutputPathWithMeta = jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
32
+ this.csvOutputPathWithMeta = csvOutputPath.replace( ".csv", "_with_metadata.csv" );
33
+ this.includeMetadata = includeMetadata;
34
+ this.metadataFields = new Set( metadataFields );
31
35
  this.visited = new Set();
32
36
  this.excludeList = new Set( excludeList );
33
37
  this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
@@ -118,9 +122,18 @@ class WebScraper
118
122
  {
119
123
  const processedContent = this.processContent( content );
120
124
 
121
- this.allProcessedContent.push({
125
+ const simpleContent = {
126
+ text: processedContent.trim()
127
+ };
128
+
129
+ const contentWithMetadata = {
122
130
  text: processedContent.trim(),
123
- metadata
131
+ metadata: this.filterMetadata( metadata )
132
+ };
133
+
134
+ this.allProcessedContent.push({
135
+ simple: simpleContent,
136
+ withMetadata: contentWithMetadata
124
137
  });
125
138
 
126
139
  let urlPath = new URL( url ).pathname;
@@ -140,50 +153,118 @@ class WebScraper
140
153
 
141
154
  createJSONLFile ()
142
155
  {
143
- const writeStream = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
156
+ const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
157
+ let writeStreamMeta
158
+ if ( this.includeMetadata )
159
+ {
160
+ writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPathWithMeta ) );
161
+ }
144
162
  for ( const content of this.allProcessedContent )
145
163
  {
146
- const jsonLine = `${JSON.stringify( content )}\n`;
147
- writeStream.write( jsonLine );
164
+ writeStreamSimple.write( `${JSON.stringify( content.simple )}\n` );
165
+ if ( this.includeMetadata )
166
+ {
167
+ writeStreamMeta.write( `${JSON.stringify( content.withMetadata )}\n` );
168
+ }
169
+ }
170
+ writeStreamSimple.end();
171
+ if ( this.includeMetadata )
172
+ {
173
+ writeStreamMeta.end();
148
174
  }
149
-
150
- writeStream.end();
151
175
  console.log( `Created JSONL file at: ${this.jsonlOutputPath}` );
152
176
  }
153
177
 
154
178
  createCSVFile ()
155
179
  {
156
- const writeStream = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
157
- writeStream.write( "text\n" );
180
+ // Create simple version
181
+ const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
182
+ writeStreamSimple.write( "text\n" );
183
+
184
+ // Create metadata version if requested
185
+ let writeStreamMeta;
186
+ if ( this.includeMetadata )
187
+ {
188
+ writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.csvOutputPathWithMeta ) );
189
+ }
190
+
191
+ if ( this.includeMetadata )
192
+ {
193
+ const headers = ["text", ...Array.from( this.metadataFields )].join( "," );
194
+ writeStreamMeta.write( `${headers}\n` );
195
+ }
196
+
158
197
  for ( const content of this.allProcessedContent )
159
198
  {
160
- let fullText = content.text;
161
- if ( this.includeTitles && content.metadata.title )
199
+ // Write simple version
200
+ const escapedText = content.simple.text.replace( /"/g, "\"\"" );
201
+ writeStreamSimple.write( `"${escapedText}"\n` );
202
+
203
+ // Write metadata version if requested
204
+ if ( this.includeMetadata )
162
205
  {
163
- fullText = `Title: ${content.metadata.title}\n\n${content.text}`;
206
+ const { metadata } = content.withMetadata;
207
+ const metadataValues = Array.from( this.metadataFields )
208
+ .map( field => { return metadata[field] ? `"${metadata[field].replace( /"/g, "\"\"" )}"` : "\"\"" });
209
+ writeStreamMeta.write( `"${escapedText}",${metadataValues.join( "," )}\n` );
164
210
  }
165
- const escapedText = fullText.replace( /"/g, "\"\"" );
166
- const csvLine = `"${escapedText}"\n`;
167
- writeStream.write( csvLine );
168
211
  }
169
212
 
170
- writeStream.end();
171
- console.log( `Created CSV file at: ${this.csvOutputPath}` );
213
+ writeStreamSimple.end();
214
+ if ( writeStreamMeta )
215
+ {
216
+ writeStreamMeta.end();
217
+ }
218
+ console.log( `Created simple CSV file at: ${this.csvOutputPath}` );
219
+ if ( this.includeMetadata )
220
+ {
221
+ console.log( `Created metadata CSV file at: ${this.csvOutputPathWithMeta}` );
222
+ }
172
223
  }
173
224
 
174
225
  saveNumberedTextFiles ()
175
226
  {
227
+ // Create base text folder for simple content
228
+ const baseTextPath = path.join( __dirname, this.textOutputPath );
229
+
230
+ // Create metadata text folder if needed
231
+ let metaTextPath = null;
232
+ if ( this.includeMetadata )
233
+ {
234
+ metaTextPath = path.join( __dirname, `${this.textOutputPath }_with_metadata` );
235
+ fs.mkdirSync( metaTextPath, { recursive: true });
236
+ }
237
+
176
238
  this.allProcessedContent.forEach( ( content, index ) =>
177
239
  {
178
240
  const fileName = `${index + 1}.txt`;
179
- const filePath = path.join( __dirname, this.textOutputPath, fileName );
180
- let titlePrefix = "";
181
- if ( this.includeTitles && content.metadata.title )
241
+
242
+ // Always save simple version
243
+ const simpleFilePath = path.join( baseTextPath, fileName );
244
+ fs.writeFileSync( simpleFilePath, content.simple.text, "utf-8" );
245
+ console.log( `Created simple text file: ${fileName}` );
246
+
247
+ // Save metadata version if enabled
248
+ if ( this.includeMetadata )
182
249
  {
183
- titlePrefix = `Title: ${content.metadata.title}\n\n`;
250
+ const metaFilePath = path.join( metaTextPath, fileName );
251
+ let fileContent = "";
252
+
253
+ const { metadata } = content.withMetadata;
254
+ // Add metadata fields as headers
255
+ for ( const field of this.metadataFields )
256
+ {
257
+ if ( metadata[field] )
258
+ {
259
+ fileContent += `${field}: ${metadata[field]}\n`;
260
+ }
261
+ }
262
+ fileContent += "\n---\n\n";
263
+ fileContent += content.withMetadata.text;
264
+
265
+ fs.writeFileSync( metaFilePath, fileContent, "utf-8" );
266
+ console.log( `Created metadata text file: ${fileName}` );
184
267
  }
185
- fs.writeFileSync( filePath, titlePrefix + content.text, "utf-8" );
186
- console.log( `Created numbered text file: ${fileName}` );
187
268
  });
188
269
  }
189
270
 
@@ -209,6 +290,21 @@ class WebScraper
209
290
  return processed;
210
291
  }
211
292
 
293
+ filterMetadata ( metadata )
294
+ {
295
+ if ( !this.includeMetadata ) return {};
296
+
297
+ const filteredMetadata = {};
298
+ for ( const field of this.metadataFields )
299
+ {
300
+ if ( metadata[field] && typeof metadata[field] === "string" )
301
+ {
302
+ filteredMetadata[field] = metadata[field];
303
+ }
304
+ }
305
+ return filteredMetadata;
306
+ }
307
+
212
308
  metadataextractor ( url, document, headers )
213
309
  {
214
310
  return {
@@ -222,12 +318,10 @@ class WebScraper
222
318
  contentLength: headers["content-length"],
223
319
  language: document.documentElement.lang || document.querySelector( "html" )?.getAttribute( "lang" ),
224
320
  canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
225
- ogTags: {
226
- title: document.querySelector( "meta[property=\"og:title\"]" )?.content,
227
- description: document.querySelector( "meta[property=\"og:description\"]" )?.content,
228
- image: document.querySelector( "meta[property=\"og:image\"]" )?.content,
229
- type: document.querySelector( "meta[property=\"og:type\"]" )?.content
230
- },
321
+ ogTitle: document.querySelector( "meta[property=\"og:title\"]" )?.content,
322
+ ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
323
+ ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
324
+ ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
231
325
  dateScraped: new Date().toISOString()
232
326
  };
233
327
  }
@@ -317,7 +411,6 @@ class WebScraper
317
411
  }
318
412
  }
319
413
  }
320
-
321
414
  }
322
415
 
323
416
  module.exports = WebScraper;