clean-web-scraper 2.2.0 → 2.2.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (2)
  1. package/README.md +26 -2
  2. package/package.json +1 -1
package/README.md CHANGED
@@ -44,10 +44,10 @@ const WebScraper = require('clean-web-scraper');
44
44
 
45
45
  const scraper = new WebScraper({
46
46
  baseURL: 'https://example.com', // Required: The website to scrape
47
- scrapResultPath: './output', // Required: Where to save the content
48
47
  excludeList: ['/admin', '/private'], // Optional: Paths to exclude
49
48
  exactExcludeList: ['/specific-page'], // Optional: Exact URLs to exclude
50
- jsonlPath: 'output.jsonl', // Optional: Custom JSONL output path
49
+ scrapResultPath: './dataset', // Required: Where to save the content
50
+ jsonlPath: './dataset/train.jsonl', // Optional: Custom JSONL output path
51
51
  textOutputPath: "./dataset/texts", // Optional: Custom text output path
52
52
  csvPath: "./dataset/train.csv" // Optional: Custom CSV output path
53
53
  });
@@ -70,6 +70,30 @@ Your AI-ready content is saved in a clean, structured format:
70
70
  - 📊 JSONL output for ML training
71
71
  - 📈 CSV output with clean text content
72
72
 
73
+ ```bash
74
+ dataset/
75
+ ├── decolonizepalestine.com
76
+ │ ├── faq.json
77
+ │ ├── faq.txt
78
+ │ ├── intro
79
+ │ │ ├── bds-101.json
80
+ │ │ ├── bds-101.txt
81
+ │ ├── myth
82
+ │ │ ├── a-land-without-a-people-for-a-people-without-a-land.json
83
+ │ │ ├── a-land-without-a-people-for-a-people-without-a-land.txt
84
+ │ └── rainbow-washing
85
+ │ ├── bluewashing.json
86
+ │ ├── bluewashing.txt
87
+ ├── texts
88
+ │ ├── 1.txt
89
+ │ ├── 2.txt
90
+ │ ├── 3.txt
91
+ │ ├── 4.txt
92
+ │ └── 5.txt
93
+ ├── train.csv
94
+ └── train.jsonl
95
+ ```
96
+
73
97
  ## 🤖 AI/LLM Training Ready
74
98
 
75
99
  The output is specifically formatted for AI training purposes:
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clean-web-scraper",
3
- "version": "2.2.0",
3
+ "version": "2.2.1",
4
4
  "main": "main.js",
5
5
  "scripts": {
6
6
  "start": "node main.js",