clean-web-scraper 4.0.1 → 4.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -6
- package/example-usage.js +4 -1
- package/package.json +1 -1
package/README.md
CHANGED
@@ -86,7 +86,8 @@ const docsScraper = new WebScraper({
|
|
86
86
|
scrapResultPath: './datasets/docs',
|
87
87
|
maxDepth: 3, // Optional: Maximum depth for recursive crawling
|
88
88
|
includeMetadata: true, // Optional: Include metadata in output files
|
89
|
-
metadataFields: [
|
89
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
|
90
|
+
// Optional: Specify metadata fields to include
|
90
91
|
});
|
91
92
|
|
92
93
|
// Scrape blog website
|
@@ -95,7 +96,8 @@ const blogScraper = new WebScraper({
|
|
95
96
|
scrapResultPath: './datasets/blog',
|
96
97
|
maxDepth: 3, // Optional: Maximum depth for recursive crawling
|
97
98
|
includeMetadata: true, // Optional: Include metadata in output files
|
98
|
-
metadataFields: [
|
99
|
+
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dateScrapedDate"],
|
100
|
+
// Optional: Specify metadata fields to include
|
99
101
|
});
|
100
102
|
|
101
103
|
// Start scraping both sites
|
@@ -165,7 +167,7 @@ The actual article content starts here. This is the clean, processed text of the
|
|
165
167
|
### 📑 Text Files with Metadata (texts_with_metadata/*.txt)
|
166
168
|
|
167
169
|
```text
|
168
|
-
|
170
|
+
articleTitle: My Awesome Page
|
169
171
|
description: This is a great article about coding
|
170
172
|
author: John Doe
|
171
173
|
language: en
|
@@ -186,8 +188,8 @@ The actual article content starts here. This is the clean, processed text of the
|
|
186
188
|
### 📈 JSONL with Metadata (train_with_metadata.jsonl)
|
187
189
|
|
188
190
|
```json
|
189
|
-
{"text": "Article content", "metadata": {"
|
190
|
-
{"text": "Another article", "metadata": {"
|
191
|
+
{"text": "Article content", "metadata": {"articleTitle": "Page Title", "author": "John Doe"}}
|
192
|
+
{"text": "Another article", "metadata": {"articleTitle": "Second Page", "author": "Jane Smith"}}
|
191
193
|
```
|
192
194
|
|
193
195
|
### 🗃️ JSON Files In Website Output (*.json)
|
@@ -212,7 +214,7 @@ text
|
|
212
214
|
### 📊 CSV with Metadata (train_with_metadata.csv)
|
213
215
|
|
214
216
|
```csv
|
215
|
-
text,
|
217
|
+
text,articleTitle,author,description
|
216
218
|
"Article content","Page Title","John Doe","Page description"
|
217
219
|
"Another article","Second Page","Jane Smith","Another description"
|
218
220
|
```
|
package/example-usage.js
CHANGED
@@ -51,7 +51,8 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
51
51
|
startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
|
52
52
|
maxDepth: 1,
|
53
53
|
exactExcludeList: [
|
54
|
-
"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#"
|
54
|
+
"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
|
55
|
+
"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100"
|
55
56
|
],
|
56
57
|
scrapResultPath: "./dataset/khamenei-ir-free-palestine-tag/website",
|
57
58
|
jsonlOutputPath: "./dataset/khamenei-ir-free-palestine-tag/train.jsonl",
|
@@ -107,6 +108,8 @@ async function electronicintifada ( enable )
|
|
107
108
|
"https://electronicintifada.net/search/site/",
|
108
109
|
"https://electronicintifada.net/news",
|
109
110
|
"https://electronicintifada.net/opinion",
|
111
|
+
"https://electronicintifada.net/about-ei",
|
112
|
+
"https://electronicintifada.net/review"
|
110
113
|
],
|
111
114
|
exactExcludeList: [
|
112
115
|
"https://electronicintifada.net",
|