clean-web-scraper 2.3.3 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -7
- package/example-usage.js +7 -4
- package/package.json +3 -2
- package/src/WebScraper.js +125 -32
package/README.md
CHANGED
@@ -52,9 +52,10 @@ const scraper = new WebScraper({
   scrapResultPath: './example.com/website', // Required: Where to save the content
   jsonlOutputPath: './example.com/train.jsonl', // Optional: Custom JSONL output path
   textOutputPath: "./example.com/texts", // Optional: Custom text output path
-  csvOutputPath: "./example.com/train.csv"
+  csvOutputPath: "./example.com/train.csv", // Optional: Custom CSV output path
   maxDepth: 3, // Optional: Maximum depth for recursive crawling
-
+  includeMetadata: false, // Optional: Include metadata in output files
+  metadataFields: ['title', 'description'] // Optional: Specify metadata fields to include
 });
 scraper.start();
 
@@ -85,11 +86,16 @@ example.com/
 │   └── blog/
 │       ├── post1.txt
 │       └── post1.json
-
-│
-│
-
-
+├── texts/                      # Numbered text files
+│   ├── 1.txt
+│   └── 2.txt
+├── texts_with_metadata/        # When includeMetadata is true
+│   ├── 1.txt
+│   └── 2.txt
+├── train.jsonl                 # Combined content
+├── train_with_metadata.jsonl   # When includeMetadata is true
+├── train.csv                   # Clean text in CSV format
+└── train_with_metadata.csv     # When includeMetadata is true
 ```
 
 ## 🤖 AI/LLM Training Ready
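A minimal sketch (not part of the package) of consuming the new metadata-enabled JSONL output: the path follows the README example above, and the { text, metadata } record shape follows the createJSONLFile changes in this release.

const fs = require( "fs" );

// Read the metadata-enabled JSONL written next to the plain train.jsonl
const lines = fs.readFileSync( "./example.com/train_with_metadata.jsonl", "utf-8" )
    .split( "\n" )
    .filter( line => { return line.trim().length > 0 });

for ( const line of lines )
{
    const record = JSON.parse( line );
    // record.text is the cleaned page text; record.metadata holds the requested fields
    console.log( record.text.slice( 0, 80 ), record.metadata );
}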
package/example-usage.js
CHANGED
@@ -19,9 +19,10 @@ async function khameneiIrFreePalestineTag ()
         jsonlOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.jsonl",
         textOutputPath: "./dataset/khamenei-ir-free-palestine-tag/texts",
         csvOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.csv",
-
+        includeMetadata: true,
+        metadataFields: ["title", "description", "author", "lastModified", "language", "ogTags"]
     });
-
+    await scraper.start();
     return scraper;
 }
 
@@ -45,9 +46,11 @@ async function decolonizepalestine ()
         scrapResultPath: "./dataset/decolonizepalestine/website",
         jsonlOutputPath: "./dataset/decolonizepalestine/train.jsonl",
         textOutputPath: "./dataset/decolonizepalestine/texts",
-        csvOutputPath: "./dataset/decolonizepalestine/train.csv"
+        csvOutputPath: "./dataset/decolonizepalestine/train.csv",
+        includeMetadata: true,
+        metadataFields: ["title", "description", "author", "lastModified", "language"]
     });
-
+    await scraper.start();
     return scraper;
 }
 
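Because scraper.start() is now awaited inside these helpers, a caller can run them sequentially from an async entry point. A hedged sketch, assuming the two functions shown in the hunks above are in scope (the package's actual main.js may wire this differently):

async function main ()
{
    // Each helper now resolves only after its crawl and file output finish
    await khameneiIrFreePalestineTag();
    await decolonizepalestine();
}

main().catch( error => { return console.error( error ) });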
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "clean-web-scraper",
-  "version": "2.3.3",
+  "version": "3.0.0",
   "main": "main.js",
   "scripts": {
     "start": "node main.js",
@@ -16,7 +16,8 @@
     "ai-ready-web-scraper",
     "ai",
     "fine-tune",
-    "data-processing"
+    "data-processing",
+    "dataset"
   ],
   "author": "",
   "license": "ISC",
package/src/WebScraper.js
CHANGED
@@ -17,7 +17,8 @@ class WebScraper
         jsonlOutputPath,
         textOutputPath,
         csvOutputPath,
-
+        includeMetadata = false,
+        metadataFields = [] // ['title', 'description', 'author', 'lastModified', etc.]
     })
     {
         this.baseURL = baseURL;
@@ -27,7 +28,10 @@ class WebScraper
         this.jsonlOutputPath = jsonlOutputPath || path.join( this.scrapResultPath, "train.jsonl" );
         this.textOutputPath = textOutputPath || path.join( this.scrapResultPath, "texts" );
         this.csvOutputPath = csvOutputPath || path.join( this.scrapResultPath, "train.csv" );
-        this.
+        this.jsonlOutputPathWithMeta = jsonlOutputPath.replace( ".jsonl", "_with_metadata.jsonl" );
+        this.csvOutputPathWithMeta = csvOutputPath.replace( ".csv", "_with_metadata.csv" );
+        this.includeMetadata = includeMetadata;
+        this.metadataFields = new Set( metadataFields );
         this.visited = new Set();
         this.excludeList = new Set( excludeList );
         this.exactExcludeList = this.normalizeExcludeList( exactExcludeList );
@@ -118,9 +122,18 @@ class WebScraper
     {
         const processedContent = this.processContent( content );
 
-
+        const simpleContent = {
+            text: processedContent.trim()
+        };
+
+        const contentWithMetadata = {
             text: processedContent.trim(),
-            metadata
+            metadata: this.filterMetadata( metadata )
+        };
+
+        this.allProcessedContent.push({
+            simple: simpleContent,
+            withMetadata: contentWithMetadata
         });
 
         let urlPath = new URL( url ).pathname;
@@ -140,50 +153,118 @@ class WebScraper
 
     createJSONLFile ()
     {
-        const
+        const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPath ) );
+        let writeStreamMeta
+        if ( this.includeMetadata )
+        {
+            writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.jsonlOutputPathWithMeta ) );
+        }
         for ( const content of this.allProcessedContent )
         {
-
-
+            writeStreamSimple.write( `${JSON.stringify( content.simple )}\n` );
+            if ( this.includeMetadata )
+            {
+                writeStreamMeta.write( `${JSON.stringify( content.withMetadata )}\n` );
+            }
+        }
+        writeStreamSimple.end();
+        if ( this.includeMetadata )
+        {
+            writeStreamMeta.end();
         }
-
-        writeStream.end();
         console.log( `Created JSONL file at: ${this.jsonlOutputPath}` );
     }
 
     createCSVFile ()
     {
-
-
+        // Create simple version
+        const writeStreamSimple = fs.createWriteStream( path.join( __dirname, this.csvOutputPath ) );
+        writeStreamSimple.write( "text\n" );
+
+        // Create metadata version if requested
+        let writeStreamMeta;
+        if ( this.includeMetadata )
+        {
+            writeStreamMeta = fs.createWriteStream( path.join( __dirname, this.csvOutputPathWithMeta ) );
+        }
+
+        if ( this.includeMetadata )
+        {
+            const headers = ["text", ...Array.from( this.metadataFields )].join( "," );
+            writeStreamMeta.write( `${headers}\n` );
+        }
+
         for ( const content of this.allProcessedContent )
         {
-
-
+            // Write simple version
+            const escapedText = content.simple.text.replace( /"/g, "\"\"" );
+            writeStreamSimple.write( `"${escapedText}"\n` );
+
+            // Write metadata version if requested
+            if ( this.includeMetadata )
             {
-
+                const { metadata } = content.withMetadata;
+                const metadataValues = Array.from( this.metadataFields )
+                    .map( field => { return metadata[field] ? `"${metadata[field].replace( /"/g, "\"\"" )}"` : "\"\"" });
+                writeStreamMeta.write( `"${escapedText}",${metadataValues.join( "," )}\n` );
             }
-            const escapedText = fullText.replace( /"/g, "\"\"" );
-            const csvLine = `"${escapedText}"\n`;
-            writeStream.write( csvLine );
         }
 
-
-
+        writeStreamSimple.end();
+        if ( writeStreamMeta )
+        {
+            writeStreamMeta.end();
+        }
+        console.log( `Created simple CSV file at: ${this.csvOutputPath}` );
+        if ( this.includeMetadata )
+        {
+            console.log( `Created metadata CSV file at: ${this.csvOutputPathWithMeta}` );
+        }
     }
 
     saveNumberedTextFiles ()
     {
+        // Create base text folder for simple content
+        const baseTextPath = path.join( __dirname, this.textOutputPath );
+
+        // Create metadata text folder if needed
+        let metaTextPath = null;
+        if ( this.includeMetadata )
+        {
+            metaTextPath = path.join( __dirname, `${this.textOutputPath }_with_metadata` );
+            fs.mkdirSync( metaTextPath, { recursive: true });
+        }
+
         this.allProcessedContent.forEach( ( content, index ) =>
         {
             const fileName = `${index + 1}.txt`;
-
-
-
+
+            // Always save simple version
+            const simpleFilePath = path.join( baseTextPath, fileName );
+            fs.writeFileSync( simpleFilePath, content.simple.text, "utf-8" );
+            console.log( `Created simple text file: ${fileName}` );
+
+            // Save metadata version if enabled
+            if ( this.includeMetadata )
             {
-
+                const metaFilePath = path.join( metaTextPath, fileName );
+                let fileContent = "";
+
+                const { metadata } = content.withMetadata;
+                // Add metadata fields as headers
+                for ( const field of this.metadataFields )
+                {
+                    if ( metadata[field] )
+                    {
+                        fileContent += `${field}: ${metadata[field]}\n`;
+                    }
+                }
+                fileContent += "\n---\n\n";
+                fileContent += content.withMetadata.text;
+
+                fs.writeFileSync( metaFilePath, fileContent, "utf-8" );
+                console.log( `Created metadata text file: ${fileName}` );
             }
-            fs.writeFileSync( filePath, titlePrefix + content.text, "utf-8" );
-            console.log( `Created numbered text file: ${fileName}` );
         });
     }
 
@@ -209,6 +290,21 @@ class WebScraper
         return processed;
     }
 
+    filterMetadata ( metadata )
+    {
+        if ( !this.includeMetadata ) return {};
+
+        const filteredMetadata = {};
+        for ( const field of this.metadataFields )
+        {
+            if ( metadata[field] && typeof metadata[field] === "string" )
+            {
+                filteredMetadata[field] = metadata[field];
+            }
+        }
+        return filteredMetadata;
+    }
+
     metadataextractor ( url, document, headers )
     {
         return {
@@ -222,12 +318,10 @@ class WebScraper
             contentLength: headers["content-length"],
             language: document.documentElement.lang || document.querySelector( "html" )?.getAttribute( "lang" ),
             canonicalUrl: document.querySelector( "link[rel=\"canonical\"]" )?.href,
-
-
-
-
-                type: document.querySelector( "meta[property=\"og:type\"]" )?.content
-            },
+            ogTitle: document.querySelector( "meta[property=\"og:title\"]" )?.content,
+            ogDescription: document.querySelector( "meta[property=\"og:description\"]" )?.content,
+            ogImage: document.querySelector( "meta[property=\"og:image\"]" )?.content,
+            ogType: document.querySelector( "meta[property=\"og:type\"]" )?.content,
             dateScraped: new Date().toISOString()
         };
     }
@@ -317,7 +411,6 @@ class WebScraper
             }
         }
     }
-
 }
 
 module.exports = WebScraper;
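For reference, a standalone restatement of the filtering rule introduced by filterMetadata above: only fields that are both requested in metadataFields and present as strings make it into the *_with_metadata outputs (the includeMetadata guard from the class method is omitted here). Illustrative sketch only; the sample metadata object is hypothetical.

// Mirrors the filterMetadata logic added in this release, outside the class for illustration
function filterMetadata ( metadata, metadataFields )
{
    const filteredMetadata = {};
    for ( const field of metadataFields )
    {
        if ( metadata[field] && typeof metadata[field] === "string" )
        {
            filteredMetadata[field] = metadata[field];
        }
    }
    return filteredMetadata;
}

console.log( filterMetadata(
    {
        title: "Example page",
        description: "A short description",
        ogImage: "https://example.com/img.png", // dropped: not requested below
        contentLength: 1234                     // dropped even if requested: not a string
    },
    ["title", "description", "contentLength"]
) );
// -> { title: "Example page", description: "A short description" }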