clean-web-scraper 4.0.2 → 4.0.3

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/README.md +8 -6
  2. package/package.json +1 -1
package/README.md CHANGED
@@ -86,7 +86,8 @@ const docsScraper = new WebScraper({
86
86
  scrapResultPath: './datasets/docs',
87
87
  maxDepth: 3, // Optional: Maximum depth for recursive crawling
88
88
  includeMetadata: true, // Optional: Include metadata in output files
89
- metadataFields: ['title', 'description'] // Optional: Specify metadata fields to include
89
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
90
+ // Optional: Specify metadata fields to include
90
91
  });
91
92
 
92
93
  // Scrape blog website
@@ -95,7 +96,8 @@ const blogScraper = new WebScraper({
95
96
  scrapResultPath: './datasets/blog',
96
97
  maxDepth: 3, // Optional: Maximum depth for recursive crawling
97
98
  includeMetadata: true, // Optional: Include metadata in output files
98
- metadataFields: ['title', 'description'] // Optional: Specify metadata fields to include
99
+ metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
100
+ // Optional: Specify metadata fields to include
99
101
  });
100
102
 
101
103
  // Start scraping both sites
@@ -165,7 +167,7 @@ The actual article content starts here. This is the clean, processed text of the
165
167
  ### 📑 Text Files with Metadata (texts_with_metadata/*.txt)
166
168
 
167
169
  ```text
168
- title: My Awesome Page
170
+ articleTitle: My Awesome Page
169
171
  description: This is a great article about coding
170
172
  author: John Doe
171
173
  language: en
@@ -186,8 +188,8 @@ The actual article content starts here. This is the clean, processed text of the
186
188
  ### 📈 JSONL with Metadata (train_with_metadata.jsonl)
187
189
 
188
190
  ```json
189
- {"text": "Article content", "metadata": {"title": "Page Title", "author": "John Doe"}}
190
- {"text": "Another article", "metadata": {"title": "Second Page", "author": "Jane Smith"}}
191
+ {"text": "Article content", "metadata": {"articleTitle": "Page Title", "author": "John Doe"}}
192
+ {"text": "Another article", "metadata": {"articleTitle": "Second Page", "author": "Jane Smith"}}
191
193
  ```
192
194
 
193
195
  ### 🗃️ JSON Files In Website Output (*.json)
@@ -212,7 +214,7 @@ text
212
214
  ### 📊 CSV with Metadata (train_with_metadata.csv)
213
215
 
214
216
  ```csv
215
- text,title,author,description
217
+ text,articleTitle,author,description
216
218
  "Article content","Page Title","John Doe","Page description"
217
219
  "Another article","Second Page","Jane Smith","Another description"
218
220
  ```
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "4.0.2",
3
+ "version": "4.0.3",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",