clean-web-scraper 2.2.0 → 2.2.1
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/README.md +26 -2
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -44,10 +44,10 @@ const WebScraper = require('clean-web-scraper');
|
|
|
44
44
|
|
|
45
45
|
const scraper = new WebScraper({
|
|
46
46
|
baseURL: 'https://example.com', // Required: The website to scrape
|
|
47
|
-
scrapResultPath: './output', // Required: Where to save the content
|
|
48
47
|
excludeList: ['/admin', '/private'], // Optional: Paths to exclude
|
|
49
48
|
exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
|
|
50
|
-
|
|
49
|
+
scrapResultPath: './dataset', // Required: Where to save the content
|
|
50
|
+
jsonlPath: './dataset/train.jsonl', // Optional: Custom JSONL output path
|
|
51
51
|
textOutputPath: "./dataset/texts", // Optional: Custom text output path
|
|
52
52
|
csvPath: "./dataset/train.csv" // Optional: Custom CSV output path
|
|
53
53
|
});
|
|
@@ -70,6 +70,30 @@ Your AI-ready content is saved in a clean, structured format:
|
|
|
70
70
|
- 📊 JSONL output for ML training
|
|
71
71
|
- 📈 CSV output with clean text content
|
|
72
72
|
|
|
73
|
+
```bash
|
|
74
|
+
dataset/
|
|
75
|
+
├── decolonizepalestine.com
|
|
76
|
+
│ ├── faq.json
|
|
77
|
+
│ ├── faq.txt
|
|
78
|
+
│ ├── intro
|
|
79
|
+
│ │ ├── bds-101.json
|
|
80
|
+
│ │ ├── bds-101.txt
|
|
81
|
+
│ ├── myth
|
|
82
|
+
│ │ ├── a-land-without-a-people-for-a-people-without-a-land.json
|
|
83
|
+
│ │ ├── a-land-without-a-people-for-a-people-without-a-land.txt
|
|
84
|
+
│ └── rainbow-washing
|
|
85
|
+
│ ├── bluewashing.json
|
|
86
|
+
│ ├── bluewashing.txt
|
|
87
|
+
├── texts
|
|
88
|
+
│ ├── 1.txt
|
|
89
|
+
│ ├── 2.txt
|
|
90
|
+
│ ├── 3.txt
|
|
91
|
+
│ ├── 4.txt
|
|
92
|
+
│ └── 5.txt
|
|
93
|
+
├── train.csv
|
|
94
|
+
└── train.jsonl
|
|
95
|
+
```
|
|
96
|
+
|
|
73
97
|
## 🤖 AI/LLM Training Ready
|
|
74
98
|
|
|
75
99
|
The output is specifically formatted for AI training purposes:
|