vesper-wizard 2.0.4 → 2.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (205) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +300 -37
  3. package/build/cache/cdn.js +34 -0
  4. package/build/cache/service.js +63 -0
  5. package/build/cleaning/cleaner.js +81 -0
  6. package/build/cleaning/evaluator.js +89 -0
  7. package/build/cleaning/executor.js +62 -0
  8. package/build/cleaning/exporter.js +87 -0
  9. package/build/cleaning/planner.js +127 -0
  10. package/build/cleaning/rules.js +57 -0
  11. package/build/cleaning/types.js +1 -0
  12. package/build/cloud/adapters/local.js +37 -0
  13. package/build/cloud/adapters/s3.js +24 -0
  14. package/build/cloud/adapters/supabase.js +49 -0
  15. package/build/cloud/storage-manager.js +26 -0
  16. package/build/cloud/types.js +1 -0
  17. package/build/compliance/service.js +73 -0
  18. package/build/compliance/store.js +80 -0
  19. package/build/compliance/types.js +1 -0
  20. package/build/config/config-manager.js +221 -0
  21. package/build/config/secure-keys.js +51 -0
  22. package/build/config/user-config.js +48 -0
  23. package/build/data/processing-worker.js +23 -0
  24. package/build/data/streaming.js +38 -0
  25. package/build/data/worker-pool.js +39 -0
  26. package/build/export/exporter.js +69 -0
  27. package/build/export/packager.js +100 -0
  28. package/build/export/types.js +1 -0
  29. package/build/fusion/aligner.js +56 -0
  30. package/build/fusion/deduplicator.js +69 -0
  31. package/build/fusion/engine.js +69 -0
  32. package/build/fusion/harmonizer.js +39 -0
  33. package/build/fusion/orchestrator.js +86 -0
  34. package/build/fusion/types.js +1 -0
  35. package/build/gateway/unified-dataset-gateway.js +409 -0
  36. package/build/index.js +2704 -0
  37. package/build/ingestion/hf-downloader.js +171 -0
  38. package/build/ingestion/ingestor.js +271 -0
  39. package/build/ingestion/kaggle-downloader.js +102 -0
  40. package/build/install/install-service.js +41 -0
  41. package/build/jobs/manager.js +136 -0
  42. package/build/jobs/queue.js +59 -0
  43. package/build/jobs/types.js +1 -0
  44. package/build/lib/supabase.js +3 -0
  45. package/build/metadata/dataworld-source.js +89 -0
  46. package/build/metadata/domain.js +147 -0
  47. package/build/metadata/github-scraper.js +47 -0
  48. package/build/metadata/institutional-scrapers.js +49 -0
  49. package/build/metadata/kaggle-scraper.js +182 -0
  50. package/build/metadata/kaggle-source.js +70 -0
  51. package/build/metadata/license.js +68 -0
  52. package/build/metadata/monitoring-service.js +107 -0
  53. package/build/metadata/monitoring-store.js +78 -0
  54. package/build/metadata/monitoring-types.js +1 -0
  55. package/build/metadata/openml-source.js +87 -0
  56. package/build/metadata/quality.js +48 -0
  57. package/build/metadata/rate-limiter.js +128 -0
  58. package/build/metadata/scraper.js +377 -0
  59. package/build/metadata/store.js +340 -0
  60. package/build/metadata/types.js +1 -0
  61. package/build/metadata/uci-scraper.js +49 -0
  62. package/build/monitoring/observability.js +76 -0
  63. package/build/preparation/target-detector.js +75 -0
  64. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  65. package/build/python/asset_downloader_engine.py +92 -0
  66. package/build/python/cleaner.py +226 -0
  67. package/build/python/config.py +263 -0
  68. package/build/python/dataworld_engine.py +208 -0
  69. package/build/python/export_engine.py +243 -0
  70. package/build/python/framework_adapters.py +100 -0
  71. package/build/python/fusion_engine.py +368 -0
  72. package/build/python/github_adapter.py +106 -0
  73. package/build/python/hf_fallback.py +298 -0
  74. package/build/python/image_engine.py +86 -0
  75. package/build/python/kaggle_engine.py +295 -0
  76. package/build/python/media_engine.py +133 -0
  77. package/build/python/nasa_adapter.py +82 -0
  78. package/build/python/openml_engine.py +146 -0
  79. package/build/python/quality_engine.py +267 -0
  80. package/build/python/row_count.py +54 -0
  81. package/build/python/splitter_engine.py +283 -0
  82. package/build/python/target_engine.py +154 -0
  83. package/build/python/test_framework_adapters.py +61 -0
  84. package/build/python/test_fusion_engine.py +89 -0
  85. package/build/python/uci_adapter.py +94 -0
  86. package/build/python/vesper/__init__.py +1 -0
  87. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  88. package/build/python/vesper/core/__init__.py +1 -0
  89. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  90. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  91. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  92. package/build/python/vesper/core/asset_downloader.py +675 -0
  93. package/build/python/vesper/core/download_recipe.py +104 -0
  94. package/build/python/worldbank_adapter.py +99 -0
  95. package/build/quality/analyzer.js +93 -0
  96. package/build/quality/image-analyzer.js +114 -0
  97. package/build/quality/media-analyzer.js +115 -0
  98. package/build/quality/quality-orchestrator.js +162 -0
  99. package/build/quality/types.js +1 -0
  100. package/build/scripts/build-index.js +54 -0
  101. package/build/scripts/check-db.js +73 -0
  102. package/build/scripts/check-jobs.js +24 -0
  103. package/build/scripts/check-naruto.js +17 -0
  104. package/build/scripts/cleanup-kaggle.js +41 -0
  105. package/build/scripts/demo-full-pipeline.js +62 -0
  106. package/build/scripts/demo-ui.js +58 -0
  107. package/build/scripts/e2e-demo.js +72 -0
  108. package/build/scripts/massive-scrape.js +103 -0
  109. package/build/scripts/ops-dashboard.js +33 -0
  110. package/build/scripts/repro-bug.js +37 -0
  111. package/build/scripts/repro-export-bug.js +56 -0
  112. package/build/scripts/scrape-metadata.js +100 -0
  113. package/build/scripts/search-cli.js +26 -0
  114. package/build/scripts/test-bias.js +45 -0
  115. package/build/scripts/test-caching.js +51 -0
  116. package/build/scripts/test-cleaning.js +76 -0
  117. package/build/scripts/test-cloud-storage.js +48 -0
  118. package/build/scripts/test-compliance.js +58 -0
  119. package/build/scripts/test-conversion.js +64 -0
  120. package/build/scripts/test-custom-rules.js +58 -0
  121. package/build/scripts/test-db-opt.js +63 -0
  122. package/build/scripts/test-export-custom.js +33 -0
  123. package/build/scripts/test-exporter.js +53 -0
  124. package/build/scripts/test-fusion.js +61 -0
  125. package/build/scripts/test-github.js +27 -0
  126. package/build/scripts/test-group-split.js +52 -0
  127. package/build/scripts/test-hf-download.js +29 -0
  128. package/build/scripts/test-holdout-manager.js +61 -0
  129. package/build/scripts/test-hybrid-search.js +41 -0
  130. package/build/scripts/test-image-analysis.js +50 -0
  131. package/build/scripts/test-ingestion-infra.js +39 -0
  132. package/build/scripts/test-install.js +40 -0
  133. package/build/scripts/test-institutional.js +26 -0
  134. package/build/scripts/test-integrity.js +41 -0
  135. package/build/scripts/test-jit.js +42 -0
  136. package/build/scripts/test-job-queue.js +62 -0
  137. package/build/scripts/test-kaggle-download.js +34 -0
  138. package/build/scripts/test-large-data.js +50 -0
  139. package/build/scripts/test-mcp-v5.js +74 -0
  140. package/build/scripts/test-media-analysis.js +61 -0
  141. package/build/scripts/test-monitoring.js +91 -0
  142. package/build/scripts/test-observability.js +106 -0
  143. package/build/scripts/test-packager.js +55 -0
  144. package/build/scripts/test-pipeline.js +50 -0
  145. package/build/scripts/test-planning.js +64 -0
  146. package/build/scripts/test-privacy.js +38 -0
  147. package/build/scripts/test-production-sync.js +36 -0
  148. package/build/scripts/test-quality.js +43 -0
  149. package/build/scripts/test-robust-ingestion.js +41 -0
  150. package/build/scripts/test-schema.js +45 -0
  151. package/build/scripts/test-split-validation.js +40 -0
  152. package/build/scripts/test-splitter.js +93 -0
  153. package/build/scripts/test-target-detector.js +29 -0
  154. package/build/scripts/test-uci.js +27 -0
  155. package/build/scripts/test-unified-quality.js +86 -0
  156. package/build/scripts/test-write.js +14 -0
  157. package/build/scripts/verify-integration.js +57 -0
  158. package/build/scripts/verify-priority.js +33 -0
  159. package/build/search/embedder.js +34 -0
  160. package/build/search/engine.js +152 -0
  161. package/build/search/jit-orchestrator.js +258 -0
  162. package/build/search/vector-store.js +123 -0
  163. package/build/splitting/splitter.js +82 -0
  164. package/build/splitting/types.js +1 -0
  165. package/build/tools/formatter.js +251 -0
  166. package/build/utils/downloader.js +52 -0
  167. package/build/utils/selector.js +69 -0
  168. package/mcp-config-template.json +18 -0
  169. package/package.json +101 -29
  170. package/scripts/postinstall.cjs +114 -0
  171. package/scripts/preindex_registry.cjs +157 -0
  172. package/scripts/refresh-index.cjs +87 -0
  173. package/{wizard.js → scripts/wizard.js} +148 -32
  174. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  175. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  176. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  177. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  178. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  179. package/src/python/asset_downloader_engine.py +92 -0
  180. package/src/python/cleaner.py +226 -0
  181. package/src/python/config.py +263 -0
  182. package/src/python/dataworld_engine.py +208 -0
  183. package/src/python/export_engine.py +243 -0
  184. package/src/python/framework_adapters.py +100 -0
  185. package/src/python/fusion_engine.py +368 -0
  186. package/src/python/github_adapter.py +106 -0
  187. package/src/python/hf_fallback.py +298 -0
  188. package/src/python/image_engine.py +86 -0
  189. package/src/python/kaggle_engine.py +295 -0
  190. package/src/python/media_engine.py +133 -0
  191. package/src/python/nasa_adapter.py +82 -0
  192. package/src/python/openml_engine.py +146 -0
  193. package/src/python/quality_engine.py +267 -0
  194. package/src/python/row_count.py +54 -0
  195. package/src/python/splitter_engine.py +283 -0
  196. package/src/python/target_engine.py +154 -0
  197. package/src/python/test_framework_adapters.py +61 -0
  198. package/src/python/test_fusion_engine.py +89 -0
  199. package/src/python/uci_adapter.py +94 -0
  200. package/src/python/vesper/__init__.py +1 -0
  201. package/src/python/vesper/core/__init__.py +1 -0
  202. package/src/python/vesper/core/asset_downloader.py +675 -0
  203. package/src/python/vesper/core/download_recipe.py +104 -0
  204. package/src/python/worldbank_adapter.py +99 -0
  205. package/vesper-mcp-config.json +0 -6
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Vesper Team
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md CHANGED
@@ -1,60 +1,323 @@
1
- # vesper-wizard
1
+ # Vesper MCP Server 🚀
2
2
 
3
- Zero-friction setup wizard for [Vesper](https://github.com/vesper/mcp-server) your local MCP-native dataset intelligence layer.
3
+ **AI-powered dataset discovery, quality analysis, and preparation** with multimodal support (text, image, audio, video).
4
4
 
5
- ## Install
5
+ Vesper is a Model Context Protocol (MCP) server that helps you find, analyze, and prepare high-quality datasets for machine learning projects. It integrates seamlessly with AI assistants like Claude, providing autonomous dataset workflows.
6
+
7
+ ## ✨ Features
8
+
9
+ ### 🔍 **Dataset Discovery**
10
+ - Search across HuggingFace, Kaggle, UCI ML Repository, and more
11
+ - Intelligent ranking based on quality, safety, and relevance
12
+ - Automatic metadata extraction and enrichment
13
+
14
+ ### 📊 **Quality Analysis**
15
+ - **Text**: Missing data, duplicates, column profiling
16
+ - **Images**: Resolution, corruption, blur detection
17
+ - **Audio**: Sample rate, duration, silence detection
18
+ - **Video**: FPS, frame validation, corruption risk
19
+ - **Unified Reports**: Consolidated quality scores (0-100) with recommendations
20
+
21
+ ### 🛠️ **Data Preparation**
22
+ - Automated cleaning pipelines
23
+ - Format conversion (CSV, JSON, Parquet)
24
+ - Train/test/validation splitting
25
+ - Automatic installation to project directories
26
+
27
+ ### 🎯 **Multimodal Support**
28
+ - Analyze mixed datasets (text + images + audio)
29
+ - Media-specific quality metrics
30
+ - Intelligent modality detection
31
+
32
+ ## 📦 Installation
33
+
34
+ ## 🚀 Quick Start (VS Code + Copilot)
35
+
36
+ The fastest way to install Vesper and configure it for **GitHub Copilot Chat** or **Cursor** is to run the automated setup:
37
+
38
+ ```bash
39
+ npx -y -p @vespermcp/mcp-server@latest vespermcp --setup
40
+ ```
41
+
42
+ 1. Select **Visual Studio Code (Settings.json)** from the list.
43
+ 2. Restart VS Code.
44
+ 3. Open Copilot Chat and look for the **MCP Servers** section.
45
+
46
+ ## 🛠️ Configuration
47
+ Vesper supports:
48
+ - **GitHub Copilot Chat**: Automated setup via `settings.json`.
49
+ - **Cursor**: Automated setup via `mcp.json`.
50
+ - **Claude Desktop**: Automated setup via `claude_desktop_config.json`.
51
+
52
+ ### Manual Python Setup (if needed)
53
+
54
+ ```bash
55
+ pip install opencv-python pillow numpy librosa soundfile
56
+ ```
57
+
58
+ ## ⚙️ MCP Configuration
59
+
60
+ ### For Cursor
61
+ 1. Go to **Settings** > **Features** > **MCP**
62
+ 2. Click **Add New MCP Server**
63
+ 3. Enter:
64
+ - **Name**: `vesper`
65
+ - **Type**: `command`
66
+ - **Command**: `vesper`
67
+
68
+ ### For Claude Desktop
69
+ Vesper attempts to auto-configure itself! Restart Claude and check. If not:
70
+
71
+ ```json
72
+ {
73
+ "mcpServers": {
74
+ "vesper": {
75
+ "command": "vesper",
76
+ "args": [],
77
+ "env": {
78
+ "HF_TOKEN": "your-huggingface-token"
79
+ }
80
+ }
81
+ }
82
+ }
83
+ ```
84
+
85
+ > **Note**: If the `vesper` command isn't found on your `PATH`, set `command` to the absolute path of the `vesper` executable instead.
86
+
87
+ ### Environment Variables (Optional)
88
+
89
+ - `KAGGLE_USERNAME` & `KAGGLE_KEY`: For Kaggle dataset access
90
+ - `HF_TOKEN`: For private HuggingFace datasets
91
+
92
+ ### Optional Kaggle Setup (Not Required)
93
+
94
+ Core Vesper works without any API keys. Keys are only needed when you explicitly use Kaggle or gated Hugging Face.
95
+
96
+ Install optional Kaggle client only if you need Kaggle source access:
6
97
 
7
98
  ```bash
8
- npx vesper-wizard@latest
99
+ pip install kaggle
9
100
  ```
10
101
 
11
- That's it. The wizard handles everything:
102
+ ```bash
103
+ vespermcp config keys
104
+ ```
12
105
 
13
- 1. Creates `~/.vesper/` directories and local API key
14
- 2. Initializes a local credentials vault in unified-key mode (no external API keys required)
15
- 3. Installs `@vespermcp/mcp-server` and auto-configures MCP for all detected agents (Claude, Cursor, VS Code, Codex, Gemini CLI)
16
- 4. Verifies the installation
106
+ The setup wizard lets you skip any prompt and stores keys securely via the OS keyring when available,
107
+ with fallback to `~/.vesper/config.toml`.
17
108
 
18
- ## What you get
109
+ Alternatively, use Kaggle's native credentials file:
19
110
 
20
- After the wizard finishes, your AI assistant can immediately use Vesper tools:
111
+ - `~/.kaggle/kaggle.json`
21
112
 
22
- | Tool | Description |
23
- |------|-------------|
24
- | `vesper_search` | Search 16,000+ datasets via natural language |
25
- | `discover_datasets` | Discover from HuggingFace, Kaggle, OpenML, data.world |
26
- | `download_dataset` | Download any dataset to local storage |
27
- | `prepare_dataset` | Full pipeline: analyze → clean → split → export |
28
- | `analyze_quality` | Deep quality analysis with recommendations |
29
- | `export_dataset` | Export to parquet, csv, feather, jsonl, arrow |
30
- | `fuse_datasets` | Combine multiple datasets with quality checks |
113
+ If credentials are missing and you run Kaggle commands, Vesper shows:
31
114
 
32
- ## Security
115
+ `Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds).`
33
116
 
34
- - **Local-only**: Uses one local key in `~/.vesper/config.toml`
35
- - **Keyring-backed**: Uses OS keyring when available, falls back to local TOML
36
- - **No cloud**: Zero external API calls during setup
37
- - **No external keys**: No HuggingFace/Kaggle/Nia key prompts during setup
117
+ ### CLI Examples
118
+
119
+ ```bash
120
+ vespermcp discover --source kaggle "credit risk" --limit 10
121
+ vespermcp discover --source huggingface "credit risk" --limit 10
122
+ vespermcp download kaggle username/dataset-name
123
+ vespermcp download kaggle https://www.kaggle.com/datasets/username/dataset-name --target-dir ./data
124
+ ```
38
125
 
39
- ## Config file
126
+ ## 🚀 Quick Start
40
127
 
41
- The wizard generates `~/.vesper/config.toml`:
128
+ After installation and configuration, restart your AI assistant and try:
129
+
130
+ ```
131
+ search_datasets(query="sentiment analysis", limit=5)
132
+ ```
133
+
134
+ ```
135
+ prepare_dataset(query="image classification cats vs dogs")
136
+ ```
42
137
 
43
- ```toml
44
- api_key = "vesper_sk_local_..."
45
- auth_mode = "local_unified"
46
138
  ```
139
+ generate_quality_report(
140
+ dataset_id="huggingface:imdb",
141
+ dataset_path="/path/to/data"
142
+ )
143
+ ```
144
+
145
+ ## 📚 Available Tools
47
146
 
48
- ## Post-setup
147
+ ### Dataset Discovery
49
148
 
50
- Restart your IDE and try in your AI assistant:
149
+ #### `unified_dataset_api`
150
+ Single facade over multiple dataset backends. Use one tool for provider capability inspection, dataset discovery, dataset download, and dataset info lookup. The gateway prefers public/keyless providers and can also use server-managed credentials for connectors like Kaggle or data.world when configured by the operator.
51
151
 
152
+ **Parameters:**
153
+ - `operation` (string): `providers`, `discover`, `download`, or `info`
154
+ - `source` (string, optional): `auto`, `huggingface`, `openml`, `kaggle`, `dataworld`, `s3`, `bigquery`
155
+ - `query` (string, required for `discover`)
156
+ - `dataset_id` (string, required for `download`/`info`)
157
+ - `limit` (number, optional)
158
+ - `target_dir` (string, optional)
159
+ - `public_only` (boolean, optional)
160
+
161
+ **Examples:**
52
162
  ```
53
- vesper_search(query="sentiment analysis")
54
- prepare_dataset(query="image classification cats dogs")
55
- analyze_quality(dataset_id="imdb")
163
+ unified_dataset_api(operation="providers")
56
164
  ```
57
165
 
58
- ## License
166
+ ```
167
+ unified_dataset_api(operation="discover", query="credit risk", source="auto")
168
+ ```
169
+
170
+ ```
171
+ unified_dataset_api(operation="download", dataset_id="huggingface:imdb")
172
+ ```
173
+
174
+ ---
175
+
176
+ #### `search_datasets`
177
+ Search for datasets across multiple sources.
178
+
179
+ **Parameters:**
180
+ - `query` (string): Search query
181
+ - `limit` (number, optional): Max results (default: 10)
182
+ - `min_quality_score` (number, optional): Minimum quality threshold
183
+
184
+ **Example:**
185
+ ```
186
+ search_datasets(query="medical imaging", limit=5, min_quality_score=70)
187
+ ```
188
+
189
+ ---
190
+
191
+ ### Data Preparation
192
+
193
+ #### `prepare_dataset`
194
+ Download, analyze, and prepare a dataset for use.
195
+
196
+ **Parameters:**
197
+ - `query` (string): Dataset search query or ID
198
+
199
+ **Example:**
200
+ ```
201
+ prepare_dataset(query="squad")
202
+ ```
203
+
204
+ ---
205
+
206
+ #### `export_dataset`
207
+ Export a prepared dataset to a custom directory with format conversion.
208
+
209
+ **Parameters:**
210
+ - `dataset_id` (string): Dataset identifier
211
+ - `target_dir` (string): Export directory
212
+ - `format` (string, optional): Output format (csv, json, parquet)
213
+
214
+ **Example:**
215
+ ```
216
+ export_dataset(
217
+ dataset_id="huggingface:imdb",
218
+ target_dir="./my-data",
219
+ format="csv"
220
+ )
221
+ ```
222
+
223
+ ---
224
+
225
+ ### Quality Analysis
226
+
227
+ #### `analyze_image_quality`
228
+ Analyze image datasets for resolution, corruption, and blur.
229
+
230
+ **Parameters:**
231
+ - `path` (string): Path to image file or folder
232
+
233
+ **Example:**
234
+ ```
235
+ analyze_image_quality(path="/path/to/images")
236
+ ```
237
+
238
+ ---
239
+
240
+ #### `analyze_media_quality`
241
+ Analyze audio/video files for quality metrics.
242
+
243
+ **Parameters:**
244
+ - `path` (string): Path to media file or folder
245
+
246
+ **Example:**
247
+ ```
248
+ analyze_media_quality(path="/path/to/audio")
249
+ ```
250
+
251
+ ---
252
+
253
+ #### `generate_quality_report`
254
+ Generate a comprehensive unified quality report for multimodal datasets.
255
+
256
+ **Parameters:**
257
+ - `dataset_id` (string): Dataset identifier
258
+ - `dataset_path` (string): Path to dataset directory
259
+
260
+ **Example:**
261
+ ```
262
+ generate_quality_report(
263
+ dataset_id="my-dataset",
264
+ dataset_path="/path/to/data"
265
+ )
266
+ ```
267
+
268
+ ---
269
+
270
+ ### Data Splitting
271
+
272
+ #### `split_dataset`
273
+ Split a dataset into train/test/validation sets.
274
+
275
+ **Parameters:**
276
+ - `dataset_id` (string): Dataset identifier
277
+ - `train_ratio` (number): Training set ratio (0-1)
278
+ - `test_ratio` (number): Test set ratio (0-1)
279
+ - `val_ratio` (number, optional): Validation set ratio (0-1)
280
+
281
+ **Example:**
282
+ ```
283
+ split_dataset(
284
+ dataset_id="my-dataset",
285
+ train_ratio=0.7,
286
+ test_ratio=0.2,
287
+ val_ratio=0.1
288
+ )
289
+ ```
290
+
291
+ ## 🏗️ Architecture
292
+
293
+ Vesper is built with:
294
+ - **TypeScript** for the MCP server
295
+ - **Python** for image/audio/video processing
296
+ - **SQLite** for metadata storage
297
+ - **Transformers.js** for semantic search
298
+
299
+ ## 🤝 Contributing
300
+
301
+ Contributions are welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
302
+
303
+ ## 📄 License
304
+
305
+ MIT License - see [LICENSE](LICENSE) for details.
306
+
307
+ ## 🐛 Issues & Support
308
+
309
+ - **Issues**: https://github.com/vesper/mcp-server/issues
310
+ - **Discussions**: https://github.com/vesper/mcp-server/discussions
311
+
312
+ ## 🌟 Acknowledgments
313
+
314
+ Built with:
315
+ - [Model Context Protocol](https://modelcontextprotocol.io/)
316
+ - [HuggingFace Hub](https://huggingface.co/)
317
+ - [Kaggle API](https://www.kaggle.com/docs/api)
318
+ - [OpenCV](https://opencv.org/)
319
+ - [librosa](https://librosa.org/)
320
+
321
+ ---
59
322
 
60
- MIT
323
+ Made with ❤️ by the Vesper Team
@@ -0,0 +1,34 @@
1
+ import fs from "fs";
2
+ import path from "path";
3
/**
 * Mock CDN backed by a local directory.
 *
 * Files are written beneath `baseDir`, and the "public" URL handed back to
 * callers is simply `baseUrl` joined with the file name; nothing is uploaded
 * over the network.
 */
export class CDNService {
    baseDir;
    baseUrl;
    /**
     * @param {string} baseDir Directory used as CDN storage (created if missing).
     * @param {string} baseUrl URL prefix used to build the returned file URLs.
     */
    constructor(baseDir = "data/cdn_mock", baseUrl = "https://cdn.vesper.ai") {
        this.baseDir = path.resolve(baseDir);
        this.baseUrl = baseUrl;
        if (!fs.existsSync(this.baseDir)) {
            fs.mkdirSync(this.baseDir, { recursive: true });
        }
    }
    /**
     * Maps a CDN file name to an absolute path and guarantees the result
     * stays strictly inside `baseDir`.
     *
     * @param {string} fileName Relative file name (may contain sub-directories).
     * @returns {string} Absolute path of the file inside the storage directory.
     * @throws {TypeError} If `fileName` is not a non-empty string.
     * @throws {Error} If the name resolves to `baseDir` itself or outside of it.
     */
    resolveInsideBase(fileName) {
        if (typeof fileName !== "string" || fileName.length === 0) {
            throw new TypeError("fileName must be a non-empty string");
        }
        const filePath = path.join(this.baseDir, fileName);
        const relative = path.relative(this.baseDir, filePath);
        // `path.join` normalises ".." segments, so a traversal attempt shows up
        // as a relative path that starts with ".." (or is empty / absolute).
        const escapesBase = relative === "" ||
            relative === ".." ||
            relative.startsWith(`..${path.sep}`) ||
            path.isAbsolute(relative);
        if (escapesBase) {
            throw new Error(`Invalid CDN file name (outside storage directory): ${fileName}`);
        }
        return filePath;
    }
    /**
     * Uploads a file to the CDN.
     * @param fileName Name of the file (including extension)
     * @param content String or Buffer content
     * @returns The public URL of the file
     * @throws If `fileName` would escape the storage directory (the returned
     *         promise rejects).
     */
    async upload(fileName, content) {
        const filePath = this.resolveInsideBase(fileName);
        // Nested names such as "reports/a.json" need their parent directory.
        fs.mkdirSync(path.dirname(filePath), { recursive: true });
        fs.writeFileSync(filePath, content);
        // Return a simulated cloud URL
        return `${this.baseUrl}/${fileName}`;
    }
    /**
     * Deletes a file from the CDN. Deleting a file that does not exist is a
     * no-op.
     * @param fileName Name of the file to remove
     * @throws If `fileName` would escape the storage directory (the returned
     *         promise rejects).
     */
    async delete(fileName) {
        const filePath = this.resolveInsideBase(fileName);
        try {
            // Unlink directly instead of exists-then-unlink to avoid a race.
            fs.unlinkSync(filePath);
        }
        catch (err) {
            // Both codes mean "there is no such file", which is not an error here.
            if (err.code !== "ENOENT" && err.code !== "ENOTDIR") {
                throw err;
            }
        }
    }
}
@@ -0,0 +1,63 @@
1
+ import crypto from "crypto";
2
/**
 * In-memory stand-in for a Redis-style key/value cache, for demonstration.
 *
 * Each entry is stored as `{ value, expiry }`, where `expiry` is an epoch
 * timestamp in milliseconds or `null` when the entry never expires. Expired
 * entries are evicted lazily, the next time they are read.
 */
export class MockRedisProvider {
    store = new Map();
    /**
     * Looks up a key.
     * @returns The stored value, or `null` when the key is absent or expired.
     */
    async get(key) {
        const entry = this.store.get(key);
        if (!entry) {
            return null;
        }
        const hasExpired = Boolean(entry.expiry) && Date.now() > entry.expiry;
        if (!hasExpired) {
            return entry.value;
        }
        this.store.delete(key);
        return null;
    }
    /**
     * Stores a value. A falsy `ttlSeconds` (omitted, 0, ...) means no expiry.
     */
    async set(key, value, ttlSeconds) {
        let expiry = null;
        if (ttlSeconds) {
            expiry = Date.now() + ttlSeconds * 1000;
        }
        this.store.set(key, { value, expiry });
    }
    /**
     * Removes a key; removing an unknown key is a no-op.
     */
    async delete(key) {
        this.store.delete(key);
    }
}
25
/**
 * Typed convenience layer over a string key/value cache provider.
 *
 * The provider must expose async `get(key)` and `set(key, value, ttlSeconds)`;
 * values are serialised to JSON before being handed to it.
 */
export class CacheService {
    provider;
    /**
     * @param provider Cache backend with async `get` / `set` methods.
     */
    constructor(provider) {
        this.provider = provider;
    }
    /**
     * Caches quality reports (TTL: 24h)
     * @returns The cached report, or `null` on a cache miss.
     */
    async getReport(datasetId) {
        const key = `report:${datasetId}`;
        const data = await this.provider.get(key);
        return data ? JSON.parse(data) : null;
    }
    /**
     * Stores a quality report for `datasetId` for 24 hours.
     */
    async saveReport(datasetId, report) {
        const key = `report:${datasetId}`;
        await this.provider.set(key, JSON.stringify(report), 86400); // 24 hours
    }
    /**
     * Caches cleaning plans by dataset ID and configuration hash
     * @returns The cached plan, or `null` on a cache miss.
     */
    async getPlan(datasetId, config) {
        const hash = this.generateHash(config);
        const key = `plan:${datasetId}:${hash}`;
        const data = await this.provider.get(key);
        return data ? JSON.parse(data) : null;
    }
    /**
     * Stores a cleaning plan for the given dataset/configuration pair for
     * one hour.
     */
    async savePlan(datasetId, config, plan) {
        const hash = this.generateHash(config);
        const key = `plan:${datasetId}:${hash}`;
        await this.provider.set(key, JSON.stringify(plan), 3600); // 1 hour
    }
    /**
     * Derives a short, stable fingerprint (first 16 hex chars of SHA-256) for
     * a configuration value.
     *
     * Object keys are sorted before serialising so that two configurations
     * with the same content but different property insertion order map to
     * the same cache key. Array order is preserved because it is significant.
     *
     * @param obj Any JSON-serialisable value; `undefined` is accepted.
     * @returns {string} 16-character lowercase hex digest prefix.
     */
    generateHash(obj) {
        const sortObjectKeys = (_key, value) => {
            if (value === null || typeof value !== "object" || Array.isArray(value)) {
                return value;
            }
            const sortedEntries = Object.entries(value)
                .sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0));
            return Object.fromEntries(sortedEntries);
        };
        // JSON.stringify returns undefined for undefined/function input, which
        // Hash#update rejects; fall back to a fixed token instead of throwing.
        const canonical = JSON.stringify(obj, sortObjectKeys) ?? "undefined";
        return crypto
            .createHash("sha256")
            .update(canonical)
            .digest("hex")
            .substring(0, 16);
    }
}
@@ -0,0 +1,81 @@
1
+ import { spawn } from "child_process";
2
+ import path from "path";
3
+ import fs from "fs";
4
/**
 * Runs the `cleaner.py` helper script in a child process and parses the JSON
 * document it prints on stdout.
 */
export class DataCleaner {
    pythonPath = "python";
    scriptPath;
    /**
     * Locates `cleaner.py`. Candidates are probed in order and the first one
     * that exists wins:
     *   1. `<home>/.vesper/python/cleaner.py`
     *   2. `<buildDir>/python/cleaner.py`
     *   3. `<buildDir>/../src/python/cleaner.py`
     *   4. `<buildDir>/../python/cleaner.py`
     * If none exists, the first candidate is kept as a final fallback.
     *
     * @param {string} buildDir Directory used as the base for candidates 2-4.
     */
    constructor(buildDir = process.cwd()) {
        const homeDir = process.env.HOME || process.env.USERPROFILE || buildDir;
        const dataRoot = path.join(homeDir, ".vesper");
        const candidates = [
            path.resolve(dataRoot, "python", "cleaner.py"),
            path.resolve(buildDir, "python", "cleaner.py"),
            path.resolve(buildDir, "..", "src", "python", "cleaner.py"),
            path.resolve(buildDir, "..", "python", "cleaner.py"),
        ];
        this.scriptPath = candidates.find((candidate) => fs.existsSync(candidate)) ?? candidates[0];
        // Detect Python command (Windows may use 'py' instead of 'python')
        if (process.platform === "win32") {
            this.pythonPath = "py";
        }
    }
    /**
     * Execute a list of cleaning operations on a file
     *
     * @param {string} filePath Path of the file to clean.
     * @param operations Operation list; passed to the script as a JSON string.
     * @param {string} [format] Optional extra argument forwarded to the script.
     * @returns Resolves with `{ success, rows_affected, columns_affected,
     *          output_path, logs }` taken from the script's JSON output.
     * @throws Rejects when the interpreter cannot be started, the script exits
     *         with a non-zero code, reports `success: false`, or prints
     *         output that is not valid JSON.
     */
    async clean(filePath, operations, format) {
        return new Promise((resolve, reject) => {
            const args = [
                this.scriptPath,
                filePath,
                JSON.stringify(operations)
            ];
            if (format)
                args.push(format);
            // Named `child` so the global `process` is not shadowed.
            const child = spawn(this.pythonPath, args);
            let stdout = "";
            let stderr = "";
            // Without this listener a missing interpreter (ENOENT) surfaces as
            // an unhandled 'error' event and takes down the host process.
            child.on("error", (err) => {
                reject(new Error(`Data Cleaner failed to start (${this.pythonPath}): ${err.message}`, { cause: err }));
            });
            child.stdout.on("data", (data) => {
                stdout += data.toString();
            });
            child.stderr.on("data", (data) => {
                stderr += data.toString();
            });
            child.on("close", (code) => {
                // If 'error' already rejected the promise, these calls are no-ops.
                if (code !== 0) {
                    reject(new Error(`Data Cleaner failed (code ${code}): ${stderr}`));
                    return;
                }
                try {
                    const result = JSON.parse(stdout);
                    if (!result.success) {
                        reject(new Error(result.error));
                    }
                    else {
                        resolve({
                            success: true,
                            rows_affected: Number(result.rows_affected ?? 0),
                            columns_affected: Number(result.columns_affected ?? 0),
                            output_path: result.output_path,
                            logs: Array.isArray(result.logs) ? result.logs : [],
                        });
                    }
                }
                catch (e) {
                    reject(new Error(`Failed to parse cleaner output: ${stdout}`, { cause: e }));
                }
            });
        });
    }
}
@@ -0,0 +1,89 @@
1
+ import * as crypto from "crypto";
2
/**
 * Evaluates cleaning rules against plain-object records.
 *
 * A condition has the shape `{ column, operator, value }`, where `column`
 * may be `"*"` to mean "every column of the record". A rule pairs such a
 * condition with an `action` of the shape `{ type, params }`.
 */
export class RuleEvaluator {
    /**
     * Checks if a record matches a rule's condition.
     *
     * Missing (`undefined`) values are skipped for every operator except
     * `is_null`, where a missing column counts as null.
     *
     * @returns {boolean} `true` if at least one tested column satisfies the condition.
     */
    matches(record, condition) {
        const columnsToTest = condition.column === "*"
            ? Object.keys(record)
            : [condition.column];
        const treatsMissingAsNull = condition.operator === "is_null";
        for (const col of columnsToTest) {
            const val = record[col];
            if (val === undefined && !treatsMissingAsNull)
                continue;
            if (this.testValue(val, condition)) {
                return true;
            }
        }
        return false;
    }
    /**
     * Applies a rule action to a record.
     *
     * The input record is not mutated; a shallow copy with the transformed
     * columns is returned. With a wildcard column the condition is re-tested
     * per column so only matching columns are changed; with a named column
     * the action is applied without re-testing.
     *
     * Supported action types:
     *  - `Replace`: global regex replace (`params.pattern`, `params.replacement`) on string values.
     *  - `CustomMask`: replaces non-null values using `maskValue`.
     *  - `NormalizeText`: lower-cases string values when `params.case === "lower"`, upper-cases otherwise.
     * Unknown action types leave the record unchanged.
     *
     * @returns A new record object.
     */
    apply(record, rule) {
        const newRecord = { ...record };
        const action = rule.action;
        const condition = rule.condition;
        const columnsToApply = condition.column === "*"
            ? Object.keys(record)
            : [condition.column];
        for (const col of columnsToApply) {
            const val = record[col];
            if (val === undefined)
                continue;
            // Optional: Re-test condition per column if using wildcard
            if (condition.column === "*" && !this.testValue(val, condition)) {
                continue;
            }
            switch (action.type) {
                case "Replace":
                    if (typeof val === "string") {
                        const regex = new RegExp(action.params.pattern, "g");
                        newRecord[col] = val.replace(regex, action.params.replacement);
                    }
                    break;
                case "CustomMask":
                    if (val !== null) {
                        newRecord[col] = this.maskValue(String(val), action.params);
                    }
                    break;
                case "NormalizeText":
                    if (typeof val === "string") {
                        newRecord[col] = action.params.case === "lower"
                            ? val.toLowerCase()
                            : val.toUpperCase();
                    }
                    break;
                // Add more handlers as needed...
            }
        }
        return newRecord;
    }
    /**
     * Tests a single value against a condition's operator/value pair.
     *
     * `equals` uses strict equality and `is_null` compares nullishness with
     * the boolean `condition.value`. The remaining operators work on the
     * string form of the value and never match `null`/`undefined`, so a null
     * cell is not mistaken for the text "null".
     *
     * @returns {boolean} Whether the value satisfies the condition; unknown operators yield `false`.
     */
    testValue(val, condition) {
        const { operator, value } = condition;
        const isNullish = val === null || val === undefined;
        if (operator === "equals") {
            return val === value;
        }
        if (operator === "is_null") {
            return isNullish === value;
        }
        if (isNullish) {
            return false;
        }
        const strVal = String(val);
        switch (operator) {
            case "contains":
                return strVal.includes(String(value));
            case "starts_with":
                return strVal.startsWith(String(value));
            case "ends_with":
                return strVal.endsWith(String(value));
            case "matches_regex":
                return new RegExp(String(value)).test(strVal);
            default:
                return false;
        }
    }
    /**
     * Produces the masked form of a value.
     *
     * With `params.method === "hash"` the result is the first 12 hex chars of
     * SHA-256(`val + salt`) followed by `"..."`; any other method yields the
     * literal `"MASKED"`.
     *
     * @param {string} val Value to mask (already converted to a string).
     * @param params Mask options: `method` and optional `salt`.
     * @returns {string} The masked value.
     */
    maskValue(val, params) {
        if (params.method === "hash") {
            const salt = params.salt || "";
            return crypto.createHash("sha256").update(val + salt).digest("hex").substring(0, 12) + "...";
        }
        return "MASKED";
    }
}