@vespermcp/mcp-server 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. package/LICENSE +21 -0
  2. package/README.md +259 -0
  3. package/build/cache/cdn.js +34 -0
  4. package/build/cache/service.js +63 -0
  5. package/build/cleaning/cleaner.js +50 -0
  6. package/build/cleaning/evaluator.js +89 -0
  7. package/build/cleaning/executor.js +60 -0
  8. package/build/cleaning/exporter.js +87 -0
  9. package/build/cleaning/planner.js +111 -0
  10. package/build/cleaning/rules.js +57 -0
  11. package/build/cleaning/types.js +1 -0
  12. package/build/cloud/adapters/local.js +37 -0
  13. package/build/cloud/adapters/s3.js +24 -0
  14. package/build/cloud/storage-manager.js +20 -0
  15. package/build/cloud/types.js +1 -0
  16. package/build/compliance/service.js +73 -0
  17. package/build/compliance/store.js +80 -0
  18. package/build/compliance/types.js +1 -0
  19. package/build/data/processing-worker.js +23 -0
  20. package/build/data/streaming.js +38 -0
  21. package/build/data/worker-pool.js +39 -0
  22. package/build/export/exporter.js +45 -0
  23. package/build/export/packager.js +100 -0
  24. package/build/export/types.js +1 -0
  25. package/build/fusion/aligner.js +56 -0
  26. package/build/fusion/deduplicator.js +69 -0
  27. package/build/fusion/harmonizer.js +39 -0
  28. package/build/fusion/orchestrator.js +86 -0
  29. package/build/fusion/types.js +1 -0
  30. package/build/index.js +632 -0
  31. package/build/ingestion/hf-downloader.js +64 -0
  32. package/build/ingestion/ingestor.js +96 -0
  33. package/build/ingestion/kaggle-downloader.js +79 -0
  34. package/build/install/install-service.js +41 -0
  35. package/build/jobs/manager.js +129 -0
  36. package/build/jobs/queue.js +59 -0
  37. package/build/jobs/types.js +1 -0
  38. package/build/metadata/domain.js +147 -0
  39. package/build/metadata/github-scraper.js +47 -0
  40. package/build/metadata/institutional-scrapers.js +49 -0
  41. package/build/metadata/kaggle-scraper.js +182 -0
  42. package/build/metadata/license.js +68 -0
  43. package/build/metadata/monitoring-service.js +107 -0
  44. package/build/metadata/monitoring-store.js +78 -0
  45. package/build/metadata/monitoring-types.js +1 -0
  46. package/build/metadata/quality.js +48 -0
  47. package/build/metadata/rate-limiter.js +128 -0
  48. package/build/metadata/scraper.js +353 -0
  49. package/build/metadata/store.js +325 -0
  50. package/build/metadata/types.js +1 -0
  51. package/build/metadata/uci-scraper.js +49 -0
  52. package/build/monitoring/observability.js +76 -0
  53. package/build/quality/analyzer.js +57 -0
  54. package/build/quality/image-analyzer.js +46 -0
  55. package/build/quality/media-analyzer.js +46 -0
  56. package/build/quality/quality-orchestrator.js +162 -0
  57. package/build/quality/types.js +1 -0
  58. package/build/scripts/build-index.js +54 -0
  59. package/build/scripts/check-db.js +73 -0
  60. package/build/scripts/check-jobs.js +24 -0
  61. package/build/scripts/check-naruto.js +17 -0
  62. package/build/scripts/demo-full-pipeline.js +62 -0
  63. package/build/scripts/demo-ui.js +58 -0
  64. package/build/scripts/e2e-demo.js +72 -0
  65. package/build/scripts/massive-scrape.js +103 -0
  66. package/build/scripts/ops-dashboard.js +33 -0
  67. package/build/scripts/scrape-metadata.js +100 -0
  68. package/build/scripts/search-cli.js +26 -0
  69. package/build/scripts/test-bias.js +45 -0
  70. package/build/scripts/test-caching.js +51 -0
  71. package/build/scripts/test-cleaning.js +76 -0
  72. package/build/scripts/test-cloud-storage.js +48 -0
  73. package/build/scripts/test-compliance.js +58 -0
  74. package/build/scripts/test-conversion.js +64 -0
  75. package/build/scripts/test-custom-rules.js +58 -0
  76. package/build/scripts/test-db-opt.js +63 -0
  77. package/build/scripts/test-export-custom.js +33 -0
  78. package/build/scripts/test-exporter.js +53 -0
  79. package/build/scripts/test-fusion.js +61 -0
  80. package/build/scripts/test-github.js +27 -0
  81. package/build/scripts/test-group-split.js +52 -0
  82. package/build/scripts/test-hf-download.js +29 -0
  83. package/build/scripts/test-holdout-manager.js +61 -0
  84. package/build/scripts/test-hybrid-search.js +41 -0
  85. package/build/scripts/test-image-analysis.js +50 -0
  86. package/build/scripts/test-ingestion-infra.js +39 -0
  87. package/build/scripts/test-install.js +40 -0
  88. package/build/scripts/test-institutional.js +26 -0
  89. package/build/scripts/test-integrity.js +41 -0
  90. package/build/scripts/test-jit.js +42 -0
  91. package/build/scripts/test-job-queue.js +62 -0
  92. package/build/scripts/test-kaggle-download.js +34 -0
  93. package/build/scripts/test-large-data.js +50 -0
  94. package/build/scripts/test-mcp-v5.js +73 -0
  95. package/build/scripts/test-media-analysis.js +61 -0
  96. package/build/scripts/test-monitoring.js +91 -0
  97. package/build/scripts/test-observability.js +106 -0
  98. package/build/scripts/test-packager.js +55 -0
  99. package/build/scripts/test-pipeline.js +50 -0
  100. package/build/scripts/test-planning.js +64 -0
  101. package/build/scripts/test-privacy.js +38 -0
  102. package/build/scripts/test-quality.js +43 -0
  103. package/build/scripts/test-robust-ingestion.js +41 -0
  104. package/build/scripts/test-schema.js +45 -0
  105. package/build/scripts/test-split-validation.js +40 -0
  106. package/build/scripts/test-splitter.js +93 -0
  107. package/build/scripts/test-uci.js +27 -0
  108. package/build/scripts/test-unified-quality.js +86 -0
  109. package/build/search/embedder.js +34 -0
  110. package/build/search/engine.js +129 -0
  111. package/build/search/jit-orchestrator.js +232 -0
  112. package/build/search/vector-store.js +105 -0
  113. package/build/splitting/splitter.js +57 -0
  114. package/build/splitting/types.js +1 -0
  115. package/build/tools/formatter.js +227 -0
  116. package/build/utils/downloader.js +52 -0
  117. package/mcp-config-template.json +15 -0
  118. package/package.json +84 -0
  119. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  120. package/src/python/cleaner.py +196 -0
  121. package/src/python/export_engine.py +112 -0
  122. package/src/python/framework_adapters.py +100 -0
  123. package/src/python/github_adapter.py +106 -0
  124. package/src/python/image_engine.py +86 -0
  125. package/src/python/media_engine.py +133 -0
  126. package/src/python/nasa_adapter.py +82 -0
  127. package/src/python/quality_engine.py +243 -0
  128. package/src/python/splitter_engine.py +283 -0
  129. package/src/python/test_framework_adapters.py +61 -0
  130. package/src/python/uci_adapter.py +94 -0
  131. package/src/python/worldbank_adapter.py +99 -0
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Vesper Team
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,259 @@
1
+ # Vesper MCP Server 🚀
2
+
3
+ **AI-powered dataset discovery, quality analysis, and preparation** with multimodal support (text, image, audio, video).
4
+
5
+ Vesper is a Model Context Protocol (MCP) server that helps you find, analyze, and prepare high-quality datasets for machine learning projects. It integrates seamlessly with AI assistants like Claude, providing autonomous dataset workflows.
6
+
7
+ ## ✨ Features
8
+
9
+ ### 🔍 **Dataset Discovery**
10
+ - Search across HuggingFace, Kaggle, UCI ML Repository, and more
11
+ - Intelligent ranking based on quality, safety, and relevance
12
+ - Automatic metadata extraction and enrichment
13
+
14
+ ### 📊 **Quality Analysis**
15
+ - **Text**: Missing data, duplicates, column profiling
16
+ - **Images**: Resolution, corruption, blur detection
17
+ - **Audio**: Sample rate, duration, silence detection
18
+ - **Video**: FPS, frame validation, corruption risk
19
+ - **Unified Reports**: Consolidated quality scores (0-100) with recommendations
20
+
21
+ ### 🛠️ **Data Preparation**
22
+ - Automated cleaning pipelines
23
+ - Format conversion (CSV, JSON, Parquet)
24
+ - Train/test/validation splitting
25
+ - Automatic installation to project directories
26
+
27
+ ### 🎯 **Multimodal Support**
28
+ - Analyze mixed datasets (text + images + audio)
29
+ - Media-specific quality metrics
30
+ - Intelligent modality detection
31
+
32
+ ## 📦 Installation
33
+
34
+ ### Option A: Install via Git (Recommended)
35
+ Install directly from the repository without waiting for npm publishing:
36
+ ```bash
37
+ npm install -g git+https://github.com/vesper/mcp-server.git
38
+ ```
39
+
40
+ ### Option B: Install Globally from Source
41
+ 1. Clone the repository
42
+ 2. Run install:
43
+ ```bash
44
+ npm install -g .
45
+ ```
46
+
47
+ The postinstall script will automatically:
48
+ - Install Python dependencies (opencv-python, librosa, etc.)
49
+ - Create data directories in `~/.vesper`
50
+ - Display setup instructions
51
+
52
+ ### Manual Python Setup (if needed)
53
+
54
+ ```bash
55
+ pip install opencv-python pillow numpy librosa soundfile
56
+ ```
57
+
58
+ ## ⚙️ MCP Configuration
59
+
60
+ Add Vesper to your MCP settings file:
61
+
62
+ **Claude Desktop** (`~/Library/Application Support/Claude/claude_desktop_config.json` on macOS):
63
+
64
+ ```json
65
+ {
66
+ "mcpServers": {
67
+ "vesper": {
68
+ "command": "node",
69
+ "args": [
70
+ "vesper"
71
+ ],
72
+ "env": {
73
+ "KAGGLE_USERNAME": "your-username",
74
+ "KAGGLE_KEY": "your-api-key",
75
+ "HF_TOKEN": "your-huggingface-token"
76
+ }
77
+ }
78
+ }
79
+ }
80
+ ```
81
+
82
+ > **Note**: Update the path in `args` if `vesper` is not in your PATH. You can use the full path: `/usr/local/lib/node_modules/@vespermcp/mcp-server/build/index.js` (use `npm root -g` to check location).
83
+
84
+ ### Environment Variables (Optional)
85
+
86
+ - `KAGGLE_USERNAME` & `KAGGLE_KEY`: For Kaggle dataset access
87
+ - `HF_TOKEN`: For private HuggingFace datasets
88
+
89
+ ## 🚀 Quick Start
90
+
91
+ After installation and configuration, restart your AI assistant and try:
92
+
93
+ ```
94
+ search_datasets(query="sentiment analysis", limit=5)
95
+ ```
96
+
97
+ ```
98
+ prepare_dataset(query="image classification cats vs dogs")
99
+ ```
100
+
101
+ ```
102
+ generate_quality_report(
103
+ dataset_id="huggingface:imdb",
104
+ dataset_path="/path/to/data"
105
+ )
106
+ ```
107
+
108
+ ## 📚 Available Tools
109
+
110
+ ### Dataset Discovery
111
+
112
+ #### `search_datasets`
113
+ Search for datasets across multiple sources.
114
+
115
+ **Parameters:**
116
+ - `query` (string): Search query
117
+ - `limit` (number, optional): Max results (default: 10)
118
+ - `min_quality_score` (number, optional): Minimum quality threshold
119
+
120
+ **Example:**
121
+ ```
122
+ search_datasets(query="medical imaging", limit=5, min_quality_score=70)
123
+ ```
124
+
125
+ ---
126
+
127
+ ### Data Preparation
128
+
129
+ #### `prepare_dataset`
130
+ Download, analyze, and prepare a dataset for use.
131
+
132
+ **Parameters:**
133
+ - `query` (string): Dataset search query or ID
134
+
135
+ **Example:**
136
+ ```
137
+ prepare_dataset(query="squad")
138
+ ```
139
+
140
+ ---
141
+
142
+ #### `export_dataset`
143
+ Export a prepared dataset to a custom directory with format conversion.
144
+
145
+ **Parameters:**
146
+ - `dataset_id` (string): Dataset identifier
147
+ - `target_dir` (string): Export directory
148
+ - `format` (string, optional): Output format (csv, json, parquet)
149
+
150
+ **Example:**
151
+ ```
152
+ export_dataset(
153
+ dataset_id="huggingface:imdb",
154
+ target_dir="./my-data",
155
+ format="csv"
156
+ )
157
+ ```
158
+
159
+ ---
160
+
161
+ ### Quality Analysis
162
+
163
+ #### `analyze_image_quality`
164
+ Analyze image datasets for resolution, corruption, and blur.
165
+
166
+ **Parameters:**
167
+ - `path` (string): Path to image file or folder
168
+
169
+ **Example:**
170
+ ```
171
+ analyze_image_quality(path="/path/to/images")
172
+ ```
173
+
174
+ ---
175
+
176
+ #### `analyze_media_quality`
177
+ Analyze audio/video files for quality metrics.
178
+
179
+ **Parameters:**
180
+ - `path` (string): Path to media file or folder
181
+
182
+ **Example:**
183
+ ```
184
+ analyze_media_quality(path="/path/to/audio")
185
+ ```
186
+
187
+ ---
188
+
189
+ #### `generate_quality_report`
190
+ Generate a comprehensive unified quality report for multimodal datasets.
191
+
192
+ **Parameters:**
193
+ - `dataset_id` (string): Dataset identifier
194
+ - `dataset_path` (string): Path to dataset directory
195
+
196
+ **Example:**
197
+ ```
198
+ generate_quality_report(
199
+ dataset_id="my-dataset",
200
+ dataset_path="/path/to/data"
201
+ )
202
+ ```
203
+
204
+ ---
205
+
206
+ ### Data Splitting
207
+
208
+ #### `split_dataset`
209
+ Split a dataset into train/test/validation sets.
210
+
211
+ **Parameters:**
212
+ - `dataset_id` (string): Dataset identifier
213
+ - `train_ratio` (number): Training set ratio (0-1)
214
+ - `test_ratio` (number): Test set ratio (0-1)
215
+ - `val_ratio` (number, optional): Validation set ratio (0-1)
216
+
217
+ **Example:**
218
+ ```
219
+ split_dataset(
220
+ dataset_id="my-dataset",
221
+ train_ratio=0.7,
222
+ test_ratio=0.2,
223
+ val_ratio=0.1
224
+ )
225
+ ```
226
+
227
+ ## 🏗️ Architecture
228
+
229
+ Vesper is built with:
230
+ - **TypeScript** for the MCP server
231
+ - **Python** for image/audio/video processing
232
+ - **SQLite** for metadata storage
233
+ - **Transformers.js** for semantic search
234
+
235
+ ## 🤝 Contributing
236
+
237
+ Contributions are welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
238
+
239
+ ## 📄 License
240
+
241
+ MIT License - see [LICENSE](LICENSE) for details.
242
+
243
+ ## 🐛 Issues & Support
244
+
245
+ - **Issues**: https://github.com/vesper/mcp-server/issues
246
+ - **Discussions**: https://github.com/vesper/mcp-server/discussions
247
+
248
+ ## 🌟 Acknowledgments
249
+
250
+ Built with:
251
+ - [Model Context Protocol](https://modelcontextprotocol.io/)
252
+ - [HuggingFace Hub](https://huggingface.co/)
253
+ - [Kaggle API](https://www.kaggle.com/docs/api)
254
+ - [OpenCV](https://opencv.org/)
255
+ - [librosa](https://librosa.org/)
256
+
257
+ ---
258
+
259
+ Made with ❤️ by the Vesper Team
@@ -0,0 +1,34 @@
import fs from "fs";
import path from "path";

/**
 * Mock CDN backed by the local filesystem. Files are written under
 * `baseDir` and addressed by simulated public URLs under `baseUrl`.
 */
export class CDNService {
    baseDir;
    baseUrl;
    constructor(baseDir = "data/cdn_mock", baseUrl = "https://cdn.vesper.ai") {
        this.baseDir = path.resolve(baseDir);
        this.baseUrl = baseUrl;
        // Recursive mkdir is a no-op when the directory already exists,
        // so no existsSync pre-check is needed.
        fs.mkdirSync(this.baseDir, { recursive: true });
    }
    /**
     * Resolves `fileName` inside the CDN root, rejecting path-traversal
     * names (e.g. "../secret") that would escape `baseDir`.
     * @throws {Error} if the resolved path lies outside the CDN root
     */
    #resolveSafe(fileName) {
        const filePath = path.resolve(this.baseDir, fileName);
        if (filePath !== this.baseDir && !filePath.startsWith(this.baseDir + path.sep)) {
            throw new Error(`Invalid CDN file name (path traversal): ${fileName}`);
        }
        return filePath;
    }
    /**
     * Uploads a file to the CDN.
     * @param fileName Name of the file (including extension)
     * @param content String or Buffer content
     * @returns The public URL of the file
     */
    async upload(fileName, content) {
        const filePath = this.#resolveSafe(fileName);
        fs.writeFileSync(filePath, content);
        // Return a simulated cloud URL
        return `${this.baseUrl}/${fileName}`;
    }
    /**
     * Deletes a file from the CDN. Missing files are silently ignored.
     */
    async delete(fileName) {
        const filePath = this.#resolveSafe(fileName);
        if (fs.existsSync(filePath)) {
            fs.unlinkSync(filePath);
        }
    }
}
@@ -0,0 +1,63 @@
import crypto from "crypto";

/**
 * A simple in-memory cache provider simulating Redis for demonstration.
 * Entries carry an optional absolute expiry timestamp and are evicted
 * lazily when read.
 */
export class MockRedisProvider {
    store = new Map();
    async get(key) {
        const entry = this.store.get(key);
        if (!entry) {
            return null;
        }
        if (entry.expiry && entry.expiry < Date.now()) {
            // Expired: drop the entry and report a miss.
            this.store.delete(key);
            return null;
        }
        return entry.value;
    }
    async set(key, value, ttlSeconds) {
        let expiry = null;
        if (ttlSeconds) {
            expiry = Date.now() + ttlSeconds * 1000;
        }
        this.store.set(key, { value, expiry });
    }
    async delete(key) {
        this.store.delete(key);
    }
}

/**
 * Caching layer for quality reports and cleaning plans, backed by any
 * provider exposing get/set/delete (e.g. MockRedisProvider).
 */
export class CacheService {
    provider;
    constructor(provider) {
        this.provider = provider;
    }
    /**
     * Caches quality reports (TTL: 24h)
     */
    async getReport(datasetId) {
        const raw = await this.provider.get(`report:${datasetId}`);
        if (!raw) {
            return null;
        }
        return JSON.parse(raw);
    }
    async saveReport(datasetId, report) {
        await this.provider.set(`report:${datasetId}`, JSON.stringify(report), 86400); // 24 hours
    }
    /**
     * Caches cleaning plans by dataset ID and configuration hash
     */
    async getPlan(datasetId, config) {
        const key = `plan:${datasetId}:${this.generateHash(config)}`;
        const raw = await this.provider.get(key);
        if (!raw) {
            return null;
        }
        return JSON.parse(raw);
    }
    async savePlan(datasetId, config, plan) {
        const key = `plan:${datasetId}:${this.generateHash(config)}`;
        await this.provider.set(key, JSON.stringify(plan), 3600); // 1 hour
    }
    // First 16 hex characters of the SHA-256 of the JSON-serialized object.
    generateHash(obj) {
        const digest = crypto.createHash("sha256").update(JSON.stringify(obj)).digest("hex");
        return digest.substring(0, 16);
    }
}
@@ -0,0 +1,50 @@
import { spawn } from "child_process";
import path from "path";

/**
 * Runs the Python data-cleaning engine (src/python/cleaner.py) as a
 * subprocess and returns its parsed JSON result.
 */
export class DataCleaner {
    pythonPath = "python";
    scriptPath;
    constructor(projectRoot = process.cwd()) {
        this.scriptPath = path.join(projectRoot, "src", "python", "cleaner.py");
    }
    /**
     * Execute a list of cleaning operations on a file.
     * @param filePath Path of the dataset file to clean
     * @param operations Operation descriptors, passed to the script as JSON
     * @param format Optional output format forwarded as the final CLI argument
     * @returns The parsed result object (resolves only when result.success is truthy)
     * @throws {Error} when the interpreter cannot be spawned, exits non-zero,
     *                 reports failure, or emits unparseable output
     */
    async clean(filePath, operations, format) {
        return new Promise((resolve, reject) => {
            const args = [
                this.scriptPath,
                filePath,
                JSON.stringify(operations)
            ];
            if (format)
                args.push(format);
            // Named `child` so it does not shadow the global `process` object.
            const child = spawn(this.pythonPath, args);
            let stdout = "";
            let stderr = "";
            child.stdout.on("data", (data) => {
                stdout += data.toString();
            });
            child.stderr.on("data", (data) => {
                stderr += data.toString();
            });
            // Without this handler a missing interpreter raises an unhandled
            // 'error' event and leaves the promise pending forever.
            child.on("error", (err) => {
                reject(new Error(`Failed to start Data Cleaner (${this.pythonPath}): ${err.message}`));
            });
            child.on("close", (code) => {
                if (code !== 0) {
                    reject(new Error(`Data Cleaner failed (code ${code}): ${stderr}`));
                    return;
                }
                try {
                    const result = JSON.parse(stdout);
                    if (!result.success) {
                        reject(new Error(result.error));
                    }
                    else {
                        resolve(result);
                    }
                }
                catch (e) {
                    reject(new Error(`Failed to parse cleaner output: ${stdout}`));
                }
            });
        });
    }
}
@@ -0,0 +1,89 @@
import * as crypto from "crypto";

/**
 * Evaluates custom cleaning rules against individual records: tests rule
 * conditions and applies rule actions (replace / mask / normalize).
 */
export class RuleEvaluator {
    /** Resolve which columns a condition addresses ("*" means every key). */
    #targetColumns(record, condition) {
        if (condition.column === "*") {
            return Object.keys(record);
        }
        return [condition.column];
    }
    /**
     * Checks if a record matches a rule's condition. Columns whose value is
     * undefined are skipped; "*" tests every key present on the record.
     */
    matches(record, condition) {
        return this.#targetColumns(record, condition).some((col) => {
            const value = record[col];
            return value !== undefined && this.testValue(value, condition);
        });
    }
    /**
     * Applies a rule action to a record, returning a shallow copy with the
     * affected columns rewritten. With a "*" column the condition is
     * re-tested per column so only matching values are touched.
     */
    apply(record, rule) {
        const result = { ...record };
        const { action, condition } = rule;
        for (const col of this.#targetColumns(record, condition)) {
            const current = record[col];
            if (current === undefined) {
                continue;
            }
            if (condition.column === "*" && !this.testValue(current, condition)) {
                continue;
            }
            if (action.type === "Replace") {
                if (typeof current === "string") {
                    result[col] = current.replace(new RegExp(action.params.pattern, "g"), action.params.replacement);
                }
            }
            else if (action.type === "CustomMask") {
                if (current !== null) {
                    result[col] = this.maskValue(String(current), action.params);
                }
            }
            else if (action.type === "NormalizeText") {
                if (typeof current === "string") {
                    result[col] = action.params.case === "lower" ? current.toLowerCase() : current.toUpperCase();
                }
            }
            // Other action types are not handled yet.
        }
        return result;
    }
    /** Tests a single value against a condition's operator/value pair. */
    testValue(val, condition) {
        const { operator, value } = condition;
        const text = String(val);
        if (operator === "contains")
            return text.includes(String(value));
        if (operator === "equals")
            return val === value;
        if (operator === "starts_with")
            return text.startsWith(String(value));
        if (operator === "ends_with")
            return text.endsWith(String(value));
        if (operator === "is_null")
            return (val === null || val === undefined) === value;
        if (operator === "matches_regex")
            return new RegExp(String(value)).test(text);
        return false;
    }
    /**
     * Masks a value: "hash" method yields a truncated salted SHA-256 digest,
     * anything else the constant "MASKED".
     */
    maskValue(val, params) {
        if (params.method !== "hash") {
            return "MASKED";
        }
        const salted = val + (params.salt || "");
        const digest = crypto.createHash("sha256").update(salted).digest("hex");
        return digest.substring(0, 12) + "...";
    }
}
@@ -0,0 +1,60 @@
import { QualityAnalyzer } from "../quality/analyzer.js";
import { CleaningPlanner } from "./planner.js";
import { DataCleaner } from "./cleaner.js";

/**
 * Orchestrates the auto-cleaning pipeline: quality analysis, plan
 * generation, then plan execution (including format conversion).
 */
export class PipelineExecutor {
    analyzer;
    planner;
    cleaner;
    constructor(projectRoot = process.cwd()) {
        this.analyzer = new QualityAnalyzer(undefined, projectRoot);
        this.planner = new CleaningPlanner();
        this.cleaner = new DataCleaner(projectRoot);
    }
    /**
     * Run the full Auto-Cleaning Pipeline on a dataset file.
     * @param datasetId Identifier used for plan generation and logging
     * @param filePath Path of the dataset file to analyze and clean
     * @param outputFormat Target output format (default "csv")
     * @param onProgress Optional callback receiving each progress message
     * @returns Object with initial_quality, plan, cleaning_result and
     *          (when cleaning ran) final_output_path
     */
    async runPipeline(datasetId, filePath, outputFormat = "csv", onProgress) {
        const log = (message) => {
            console.log(`[Pipeline] ${message}`);
            onProgress?.(message);
        };
        // 1. Analyze quality
        log(`Analyzing quality for ${datasetId}...`);
        const qualityReport = await this.analyzer.analyze(filePath);
        // 2. Generate Plan
        log(`Generating cleaning plan...`);
        const plan = await this.planner.generatePlan(datasetId, qualityReport);
        // Even with an empty plan, a differing extension still requires conversion.
        const alreadyInFormat = filePath.toLowerCase().endsWith(`.${outputFormat}`);
        if (plan.operations.length === 0 && alreadyInFormat) {
            log(`No cleaning or conversion needed.`);
            return {
                initial_quality: qualityReport,
                plan,
                cleaning_result: {
                    success: true,
                    rows_affected: 0,
                    columns_affected: 0,
                    logs: ["No operations generated."]
                }
            };
        }
        // 3. Execute Plan (includes conversion if requested)
        log(`Executing ${plan.operations.length} operations (Format: ${outputFormat})...`);
        for (const op of plan.operations) {
            console.log(` - ${op.type}: ${op.reason}`);
        }
        const cleaningResult = await this.cleaner.clean(filePath, plan.operations, outputFormat);
        if (cleaningResult.success) {
            log(`Cleaning complete. Output: ${cleaningResult.output_path}`);
        }
        else {
            log(`Cleaning failed: ${cleaningResult.error}`);
        }
        return {
            initial_quality: qualityReport,
            plan,
            cleaning_result: cleaningResult,
            final_output_path: cleaningResult.output_path
        };
    }
}
@@ -0,0 +1,87 @@
/**
 * Emits standalone, reproducible Python (polars) cleaning scripts from a
 * Vesper cleaning plan, so users can re-run the cleaning outside Vesper.
 */
export class ScriptGenerator {
    /**
     * Generate a standalone Python script for the cleaning plan
     *
     * @param plan Cleaning plan with `dataset_id` and an `operations` array
     * @param inputPath Dataset path baked into the script as the default input
     * @returns The Python source code as a string
     */
    generatePythonScript(plan, inputPath) {
        // Date-only stamp (YYYY-MM-DD) for the generated header.
        const timestamp = new Date().toISOString().split('T')[0];
        // Script header: module docstring, imports, and data loading.
        // NOTE(review): single braces like {file_path} are Python f-string
        // placeholders, not JS interpolation — only ${...} is interpolated here.
        let script = `"""
Vesper Auto-Cleaning Script
Generated: ${timestamp}
Dataset ID: ${plan.dataset_id}
"""

import polars as pl
import numpy as np

def clean_dataset(file_path):
    print(f"Loading {file_path}...")

    # Load Data
    if file_path.endswith(".csv"):
        df = pl.read_csv(file_path, ignore_errors=True)
    elif file_path.endswith(".parquet"):
        df = pl.read_parquet(file_path)
    else:
        raise ValueError("Unsupported format")

    print(f"Initial shape: {df.shape}")

`;
        // Generate code for each operation
        plan.operations.forEach((op, index) => {
            script += `    # Step ${index + 1}: ${op.type}\n`;
            script += `    # Reason: ${op.reason}\n`;
            script += this.generateOpCode(op);
            script += `    print(f"After Step ${index + 1} (${op.type}): {df.shape}")\n\n`;
        });
        // Footer: write the cleaned output next to the input and add a CLI
        // entry point with the generation-time input path as the default.
        script += `    # Save Output
    output_path = file_path.replace(".csv", "_cleaned_repro.csv").replace(".parquet", "_cleaned_repro.parquet")
    if file_path.endswith(".csv"):
        df.write_csv(output_path)
    else:
        df.write_parquet(output_path)

    print(f"Done! Saved to {output_path}")

if __name__ == "__main__":
    # Default input path from generation time, can be overridden
    INPUT_PATH = r"${inputPath}"
    clean_dataset(INPUT_PATH)
`;
        return script;
    }
    /**
     * Translate a single cleaning operation into indented Python (polars)
     * statements; unsupported operations become explanatory comments.
     */
    generateOpCode(op) {
        const p = op.params;
        switch (op.type) {
            case "RemoveDuplicates":
                return `    df = df.unique()\n`;
            case "DropColumns":
                // Only drop columns that actually exist in the frame.
                return `    cols_to_drop = ${JSON.stringify(p.columns)}\n    existing_cols = [c for c in cols_to_drop if c in df.columns]\n    if existing_cols:\n        df = df.drop(existing_cols)\n`;
            case "FillMissing":
                if (p.method === "constant") {
                    // Quote string constants so the generated Python is valid.
                    const val = typeof p.value === 'string' ? `"${p.value}"` : p.value;
                    return `    df = df.with_columns(pl.col("${p.column}").fill_null(${val}))\n`;
                }
                else if (p.method === "mean") {
                    return `    mean_val = df["${p.column}"].mean()\n    df = df.with_columns(pl.col("${p.column}").fill_null(mean_val))\n`;
                }
                else if (p.method === "median") {
                    return `    median_val = df["${p.column}"].median()\n    df = df.with_columns(pl.col("${p.column}").fill_null(median_val))\n`;
                }
                return `    # Unknown fill method for ${p.column}\n`;
            case "FixTypes":
                if (p.type === "float")
                    return `    df = df.with_columns(pl.col("${p.column}").cast(pl.Float64, strict=False))\n`;
                if (p.type === "int")
                    return `    df = df.with_columns(pl.col("${p.column}").cast(pl.Int64, strict=False))\n`;
                if (p.type === "string")
                    return `    df = df.with_columns(pl.col("${p.column}").cast(pl.Utf8))\n`;
                return `    # Unknown type conversion for ${p.column}\n`;
            case "RemoveOutliers":
                // IQR implementation inline
                return `    q1 = df["${p.column}"].quantile(0.25)\n    q3 = df["${p.column}"].quantile(0.75)\n    iqr = q3 - q1\n    lower = q1 - (1.5 * iqr)\n    upper = q3 + (1.5 * iqr)\n    df = df.filter((pl.col("${p.column}") >= lower) & (pl.col("${p.column}") <= upper))\n`;
            default:
                return `    # Operation ${op.type} not fully supported in export yet\n`;
        }
    }
}