@vespermcp/mcp-server 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. package/LICENSE +21 -0
  2. package/README.md +259 -0
  3. package/build/cache/cdn.js +34 -0
  4. package/build/cache/service.js +63 -0
  5. package/build/cleaning/cleaner.js +50 -0
  6. package/build/cleaning/evaluator.js +89 -0
  7. package/build/cleaning/executor.js +60 -0
  8. package/build/cleaning/exporter.js +87 -0
  9. package/build/cleaning/planner.js +111 -0
  10. package/build/cleaning/rules.js +57 -0
  11. package/build/cleaning/types.js +1 -0
  12. package/build/cloud/adapters/local.js +37 -0
  13. package/build/cloud/adapters/s3.js +24 -0
  14. package/build/cloud/storage-manager.js +20 -0
  15. package/build/cloud/types.js +1 -0
  16. package/build/compliance/service.js +73 -0
  17. package/build/compliance/store.js +80 -0
  18. package/build/compliance/types.js +1 -0
  19. package/build/data/processing-worker.js +23 -0
  20. package/build/data/streaming.js +38 -0
  21. package/build/data/worker-pool.js +39 -0
  22. package/build/export/exporter.js +45 -0
  23. package/build/export/packager.js +100 -0
  24. package/build/export/types.js +1 -0
  25. package/build/fusion/aligner.js +56 -0
  26. package/build/fusion/deduplicator.js +69 -0
  27. package/build/fusion/harmonizer.js +39 -0
  28. package/build/fusion/orchestrator.js +86 -0
  29. package/build/fusion/types.js +1 -0
  30. package/build/index.js +632 -0
  31. package/build/ingestion/hf-downloader.js +64 -0
  32. package/build/ingestion/ingestor.js +96 -0
  33. package/build/ingestion/kaggle-downloader.js +79 -0
  34. package/build/install/install-service.js +41 -0
  35. package/build/jobs/manager.js +129 -0
  36. package/build/jobs/queue.js +59 -0
  37. package/build/jobs/types.js +1 -0
  38. package/build/metadata/domain.js +147 -0
  39. package/build/metadata/github-scraper.js +47 -0
  40. package/build/metadata/institutional-scrapers.js +49 -0
  41. package/build/metadata/kaggle-scraper.js +182 -0
  42. package/build/metadata/license.js +68 -0
  43. package/build/metadata/monitoring-service.js +107 -0
  44. package/build/metadata/monitoring-store.js +78 -0
  45. package/build/metadata/monitoring-types.js +1 -0
  46. package/build/metadata/quality.js +48 -0
  47. package/build/metadata/rate-limiter.js +128 -0
  48. package/build/metadata/scraper.js +353 -0
  49. package/build/metadata/store.js +325 -0
  50. package/build/metadata/types.js +1 -0
  51. package/build/metadata/uci-scraper.js +49 -0
  52. package/build/monitoring/observability.js +76 -0
  53. package/build/quality/analyzer.js +57 -0
  54. package/build/quality/image-analyzer.js +46 -0
  55. package/build/quality/media-analyzer.js +46 -0
  56. package/build/quality/quality-orchestrator.js +162 -0
  57. package/build/quality/types.js +1 -0
  58. package/build/scripts/build-index.js +54 -0
  59. package/build/scripts/check-db.js +73 -0
  60. package/build/scripts/check-jobs.js +24 -0
  61. package/build/scripts/check-naruto.js +17 -0
  62. package/build/scripts/demo-full-pipeline.js +62 -0
  63. package/build/scripts/demo-ui.js +58 -0
  64. package/build/scripts/e2e-demo.js +72 -0
  65. package/build/scripts/massive-scrape.js +103 -0
  66. package/build/scripts/ops-dashboard.js +33 -0
  67. package/build/scripts/scrape-metadata.js +100 -0
  68. package/build/scripts/search-cli.js +26 -0
  69. package/build/scripts/test-bias.js +45 -0
  70. package/build/scripts/test-caching.js +51 -0
  71. package/build/scripts/test-cleaning.js +76 -0
  72. package/build/scripts/test-cloud-storage.js +48 -0
  73. package/build/scripts/test-compliance.js +58 -0
  74. package/build/scripts/test-conversion.js +64 -0
  75. package/build/scripts/test-custom-rules.js +58 -0
  76. package/build/scripts/test-db-opt.js +63 -0
  77. package/build/scripts/test-export-custom.js +33 -0
  78. package/build/scripts/test-exporter.js +53 -0
  79. package/build/scripts/test-fusion.js +61 -0
  80. package/build/scripts/test-github.js +27 -0
  81. package/build/scripts/test-group-split.js +52 -0
  82. package/build/scripts/test-hf-download.js +29 -0
  83. package/build/scripts/test-holdout-manager.js +61 -0
  84. package/build/scripts/test-hybrid-search.js +41 -0
  85. package/build/scripts/test-image-analysis.js +50 -0
  86. package/build/scripts/test-ingestion-infra.js +39 -0
  87. package/build/scripts/test-install.js +40 -0
  88. package/build/scripts/test-institutional.js +26 -0
  89. package/build/scripts/test-integrity.js +41 -0
  90. package/build/scripts/test-jit.js +42 -0
  91. package/build/scripts/test-job-queue.js +62 -0
  92. package/build/scripts/test-kaggle-download.js +34 -0
  93. package/build/scripts/test-large-data.js +50 -0
  94. package/build/scripts/test-mcp-v5.js +73 -0
  95. package/build/scripts/test-media-analysis.js +61 -0
  96. package/build/scripts/test-monitoring.js +91 -0
  97. package/build/scripts/test-observability.js +106 -0
  98. package/build/scripts/test-packager.js +55 -0
  99. package/build/scripts/test-pipeline.js +50 -0
  100. package/build/scripts/test-planning.js +64 -0
  101. package/build/scripts/test-privacy.js +38 -0
  102. package/build/scripts/test-quality.js +43 -0
  103. package/build/scripts/test-robust-ingestion.js +41 -0
  104. package/build/scripts/test-schema.js +45 -0
  105. package/build/scripts/test-split-validation.js +40 -0
  106. package/build/scripts/test-splitter.js +93 -0
  107. package/build/scripts/test-uci.js +27 -0
  108. package/build/scripts/test-unified-quality.js +86 -0
  109. package/build/search/embedder.js +34 -0
  110. package/build/search/engine.js +129 -0
  111. package/build/search/jit-orchestrator.js +232 -0
  112. package/build/search/vector-store.js +105 -0
  113. package/build/splitting/splitter.js +57 -0
  114. package/build/splitting/types.js +1 -0
  115. package/build/tools/formatter.js +227 -0
  116. package/build/utils/downloader.js +52 -0
  117. package/mcp-config-template.json +15 -0
  118. package/package.json +84 -0
  119. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  120. package/src/python/cleaner.py +196 -0
  121. package/src/python/export_engine.py +112 -0
  122. package/src/python/framework_adapters.py +100 -0
  123. package/src/python/github_adapter.py +106 -0
  124. package/src/python/image_engine.py +86 -0
  125. package/src/python/media_engine.py +133 -0
  126. package/src/python/nasa_adapter.py +82 -0
  127. package/src/python/quality_engine.py +243 -0
  128. package/src/python/splitter_engine.py +283 -0
  129. package/src/python/test_framework_adapters.py +61 -0
  130. package/src/python/uci_adapter.py +94 -0
  131. package/src/python/worldbank_adapter.py +99 -0
package/build/tools/formatter.js ADDED
@@ -0,0 +1,227 @@
+ /**
+  * Format job status for visual representation
+  */
+ export function formatJobStatus(job) {
+     const emojiMap = {
+         "pending": "⏳",
+         "queued": "📋",
+         "running": "🔄",
+         "completed": "✅",
+         "failed": "❌",
+         "retrying": "🔁"
+     };
+     const emoji = emojiMap[job.status] || "❓";
+     const barWidth = 20;
+     const filledWidth = Math.round((job.progress / 100) * barWidth);
+     const emptyWidth = barWidth - filledWidth;
+     const bar = "█".repeat(filledWidth) + "░".repeat(emptyWidth);
+     let output = `═ Job Status: ${job.type.toUpperCase()} ═\n`;
+     output += `ID: ${job.id}\n`;
+     output += `Status: ${emoji} ${job.status.toUpperCase()}\n`;
+     output += `Progress: ${bar} ${job.progress}%\n`;
+     output += `Activity: ${job.status_text}\n`;
+     if (job.result_url) {
+         output += `Result: ${job.result_url}\n`;
+     }
+     if (job.error) {
+         output += `Error: ${job.error}\n`;
+     }
+     output += `Updated: ${new Date(job.updated_at).toLocaleTimeString()}\n`;
+     output += "═".repeat(25) + "\n";
+     return output;
+ }
+ /**
+  * Format dataset search results for human-readable display
+  */
+ export function formatSearchResults(results) {
+     if (results.length === 0) {
+         return "No datasets found matching your query.";
+     }
+     let output = `Found ${results.length} dataset(s):\n\n`;
+     output += "═".repeat(80) + "\n\n";
+     results.forEach((ds, index) => {
+         const relevanceScore = ds.relevance_score || 0;
+         // Source badge
+         const sourceBadge = ds.source === "huggingface" ? "HuggingFace" : "Kaggle";
+         // Safety indicator
+         let safetyIndicator = "";
+         if (ds.license.category === "safe") {
+             safetyIndicator = "Safe";
+         }
+         else if (ds.license.category === "restricted") {
+             safetyIndicator = "Restricted";
+         }
+         else {
+             safetyIndicator = "Unknown License";
+         }
+         // Header
+         output += `${index + 1}. ${ds.name}\n`;
+         output += `   ${sourceBadge} | ${safetyIndicator} | Relevance: ${(relevanceScore * 100).toFixed(0)}%\n`;
+         output += `   ID: ${ds.id}\n\n`;
+         // Description
+         if (ds.description && ds.description.length > 0) {
+             const shortDesc = ds.description.length > 200
+                 ? ds.description.substring(0, 200) + "..."
+                 : ds.description;
+             output += `   ${shortDesc}\n\n`;
+         }
+         // Quality warnings
+         if (ds.quality_warnings && ds.quality_warnings.length > 0) {
+             output += `   Quality Warnings:\n`;
+             ds.quality_warnings.forEach(warning => {
+                 output += `     • ${warning}\n`;
+             });
+             output += "\n";
+         }
+         // Key stats
+         output += `   Stats:\n`;
+         if (ds.downloads)
+             output += `     Downloads: ${ds.downloads.toLocaleString()}\n`;
+         if (ds.likes)
+             output += `     Likes: ${ds.likes}\n`;
+         if (ds.total_examples)
+             output += `     Examples: ${ds.total_examples.toLocaleString()}\n`;
+         if (ds.total_size_mb)
+             output += `     Size: ${ds.total_size_mb} MB\n`;
+         output += `     Domain: ${ds.domain || "unknown"}\n`;
+         output += `     Task: ${ds.task || "unknown"}\n`;
+         // Data splits
+         if (ds.splits && ds.splits.length > 0) {
+             const splitNames = ds.splits.map(s => s.name).join(", ");
+             output += `     Splits: ${splitNames}\n`;
+         }
+         // License details
+         output += `\n   License: ${ds.license.id || "Unknown"}\n`;
+         if (ds.license.warnings && ds.license.warnings.length > 0) {
+             ds.license.warnings.forEach(warning => {
+                 output += `     WARNING: ${warning}\n`;
+             });
+         }
+         if (ds.license.commercial_use !== undefined) {
+             output += `   Commercial use: ${ds.license.commercial_use ? "Yes" : "No"}\n`;
+         }
+         // Download link
+         output += `\n   ${ds.download_url}\n`;
+         output += "\n" + "─".repeat(80) + "\n\n";
+     });
+     return output;
+ }
+ /**
+  * Format detailed dataset info
+  */
+ export function formatDatasetInfo(ds) {
+     let output = "";
+     // Header
+     output += "═".repeat(80) + "\n";
+     output += `${ds.name}\n`;
+     output += "═".repeat(80) + "\n\n";
+     // Source and safety
+     const sourceBadge = ds.source === "huggingface" ? "HuggingFace" : "Kaggle";
+     let safetyIndicator = "";
+     if (ds.license.category === "safe") {
+         safetyIndicator = "Safe for use";
+     }
+     else if (ds.license.category === "restricted") {
+         safetyIndicator = "Restricted - Review license carefully";
+     }
+     else {
+         safetyIndicator = "Unknown license - Use with caution";
+     }
+     output += `Source: ${sourceBadge}\n`;
+     output += `Safety: ${safetyIndicator}\n`;
+     output += `ID: ${ds.id}\n\n`;
+     // Description
+     if (ds.description) {
+         output += "Description:\n";
+         output += `${ds.description}\n\n`;
+     }
+     // Quality warnings
+     if (ds.quality_warnings && ds.quality_warnings.length > 0) {
+         output += "Quality Warnings:\n";
+         ds.quality_warnings.forEach(warning => {
+             output += `  • ${warning}\n`;
+         });
+         output += "\n";
+     }
+     // Metadata
+     output += "Metadata:\n";
+     output += `  Downloads: ${ds.downloads?.toLocaleString() || "N/A"}\n`;
+     output += `  Likes: ${ds.likes || 0}\n`;
+     output += `  Quality Score: ${ds.quality_score}/100\n`;
+     output += `  Domain: ${ds.domain || "unknown"}\n`;
+     output += `  Task: ${ds.task || "unknown"}\n`;
+     output += `  Languages: ${ds.languages?.join(", ") || "N/A"}\n`;
+     output += `  Last Updated: ${new Date(ds.last_updated).toLocaleDateString()}\n\n`;
+     // Data characteristics
+     output += "Data Characteristics:\n";
+     output += `  Total Examples: ${ds.total_examples?.toLocaleString() || "N/A"}\n`;
+     output += `  Total Size: ${ds.total_size_mb ? ds.total_size_mb + " MB" : "N/A"}\n`;
+     output += `  Structured: ${ds.is_structured ? "Yes" : "No"}\n`;
+     output += `  Has Target Column: ${ds.has_target_column ? "Yes" : "No"}\n`;
+     output += `  Format: ${ds.format || "N/A"}\n\n`;
+     // Splits
+     if (ds.splits && ds.splits.length > 0) {
+         output += "Data Splits:\n";
+         ds.splits.forEach(split => {
+             output += `  • ${split.name}: ${split.num_examples?.toLocaleString() || "?"} examples`;
+             if (split.size_bytes) {
+                 output += ` (${(split.size_bytes / (1024 * 1024)).toFixed(2)} MB)`;
+             }
+             output += "\n";
+         });
+         output += "\n";
+     }
+     // Columns
+     if (ds.columns && ds.columns.length > 0) {
+         output += "Columns:\n";
+         ds.columns.slice(0, 10).forEach(col => {
+             const targetMarker = col.is_target ? " [TARGET]" : "";
+             output += `  • ${col.name}${targetMarker}`;
+             if (col.type)
+                 output += ` (${col.type})`;
+             output += "\n";
+         });
+         if (ds.columns.length > 10) {
+             output += `  ... and ${ds.columns.length - 10} more columns\n`;
+         }
+         output += "\n";
+     }
+     // License
+     output += "License Information:\n";
+     output += `  License: ${ds.license.id || "Unknown"}\n`;
+     output += `  Category: ${ds.license.category}\n`;
+     output += `  Commercial Use: ${ds.license.commercial_use ? "Allowed" : "Not allowed"}\n`;
+     if (ds.license.warnings && ds.license.warnings.length > 0) {
+         output += `  Warnings:\n`;
+         ds.license.warnings.forEach(warning => {
+             output += `    WARNING: ${warning}\n`;
+         });
+     }
+     if (ds.license.usage_restrictions && ds.license.usage_restrictions.length > 0) {
+         output += `  Restrictions:\n`;
+         ds.license.usage_restrictions.forEach(restriction => {
+             output += `    • ${restriction}\n`;
+         });
+     }
+     output += "\n";
+     // Safety flags
+     output += "Safety Flags:\n";
+     output += `  Safe Source: ${ds.is_safe_source ? "Yes" : "No"}\n`;
+     output += `  Has Personal Data: ${ds.has_personal_data ? "Yes" : "No"}\n`;
+     output += `  Paywalled: ${ds.is_paywalled ? "Yes" : "No"}\n`;
+     output += `  Scraped Web Data: ${ds.is_scraped_web_data ? "Yes" : "No"}\n\n`;
+     // Tags
+     if (ds.tags && ds.tags.length > 0) {
+         output += "Tags:\n";
+         output += `  ${ds.tags.slice(0, 15).join(", ")}`;
+         if (ds.tags.length > 15) {
+             output += ` ... and ${ds.tags.length - 15} more`;
+         }
+         output += "\n\n";
+     }
+     // Download link
+     output += "Download:\n";
+     output += `  ${ds.download_url}\n\n`;
+     output += "═".repeat(80) + "\n";
+     return output;
+ }
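
For orientation, a minimal usage sketch for formatJobStatus; the job object shape is inferred from the fields the formatter reads (id, type, status, progress, status_text, updated_at), and the deep import path and field values are assumptions rather than documented API.

    import { formatJobStatus } from "@vespermcp/mcp-server/build/tools/formatter.js";

    // Hypothetical job object; only fields the formatter actually reads.
    const job = {
        id: "job-42",
        type: "ingestion",
        status: "running",
        progress: 65,
        status_text: "Downloading shards",
        updated_at: Date.now(),
    };

    console.log(formatJobStatus(job));
    // ═ Job Status: INGESTION ═
    // ID: job-42
    // Status: 🔄 RUNNING
    // Progress: █████████████░░░░░░░ 65%
    // Activity: Downloading shards
    // Updated: <local time>
    // ═════════════════════════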
package/build/utils/downloader.js ADDED
@@ -0,0 +1,52 @@
+ import fs from "fs";
+ import { Readable } from "stream";
+ import { finished } from "stream/promises";
+ import { retryWithBackoff } from "../metadata/rate-limiter.js";
+ export class RobustDownloader {
+     /**
+      * Downloads a file with automatic retries and resume support
+      */
+     async download(url, targetPath, options = {}) {
+         await retryWithBackoff(async () => {
+             let startByte = 0;
+             const headers = { ...(options.headers || {}) };
+             // Handle resume logic
+             if (options.resume && fs.existsSync(targetPath)) {
+                 startByte = fs.statSync(targetPath).size;
+                 if (startByte > 0) {
+                     headers["Range"] = `bytes=${startByte}-`;
+                     console.log(`[Downloader] Resuming from byte ${startByte}`);
+                 }
+             }
+             const response = await fetch(url, { headers });
+             if (response.status === 416) {
+                 // Requested range not satisfiable - likely already finished
+                 console.log("[Downloader] Range not satisfiable, file might be complete.");
+                 return;
+             }
+             if (!response.ok && response.status !== 206) {
+                 throw new Error(`Download failed: ${response.statusText} (${response.status})`);
+             }
+             const contentLength = response.headers.get("content-length");
+             const totalSize = (contentLength ? parseInt(contentLength, 10) : 0) + startByte;
+             const reader = response.body;
+             if (!reader)
+                 throw new Error("Response body is empty");
+             // Open stream in append mode if resuming
+             const fileStream = fs.createWriteStream(targetPath, { flags: startByte > 0 ? "a" : "w" });
+             const nodeReadable = Readable.fromWeb(reader);
+             let downloadedBytes = startByte;
+             let lastProgressTime = 0;
+             nodeReadable.on("data", (chunk) => {
+                 downloadedBytes += chunk.length;
+                 // Throttle progress updates
+                 const now = Date.now();
+                 if (options.onProgress && (now - lastProgressTime > 500 || downloadedBytes === totalSize)) {
+                     options.onProgress(downloadedBytes, totalSize);
+                     lastProgressTime = now;
+                 }
+             });
+             await finished(nodeReadable.pipe(fileStream));
+         }, { maxRetries: 5, initialDelay: 2000 });
+     }
+ }
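
A minimal usage sketch for RobustDownloader, assuming a placeholder URL and target path; the onProgress contract (bytes downloaded, total bytes) is taken from the data handler above, and the deep import path is an assumption.

    import { RobustDownloader } from "@vespermcp/mcp-server/build/utils/downloader.js";

    const downloader = new RobustDownloader();
    await downloader.download(
        "https://example.com/data/train.parquet", // placeholder URL
        "/tmp/train.parquet",
        {
            resume: true, // a re-run appends from the existing partial file
            onProgress: (done, total) => {
                const pct = total ? ((done / total) * 100).toFixed(1) : "?";
                process.stdout.write(`\rDownloaded ${done}/${total} bytes (${pct}%)`);
            },
        }
    );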
package/mcp-config-template.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "mcpServers": {
+     "vesper": {
+       "command": "node",
+       "args": [
+         "/path/to/global/node_modules/@vespermcp/mcp-server/build/index.js"
+       ],
+       "env": {
+         "KAGGLE_USERNAME": "your-kaggle-username",
+         "KAGGLE_KEY": "your-kaggle-api-key",
+         "HF_TOKEN": "your-huggingface-token"
+       }
+     }
+   }
+ }
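
This template follows the standard MCP client configuration shape. The /path/to/global placeholder is left to the user; on most setups `npm root -g` prints the global node_modules directory to substitute in. The env block carries the Kaggle and Hugging Face credentials that the package's scrapers and downloaders expect.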
package/package.json ADDED
@@ -0,0 +1,84 @@
+ {
+   "name": "@vespermcp/mcp-server",
+   "version": "1.0.0",
+   "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
+   "type": "module",
+   "main": "build/index.js",
+   "bin": {
+     "vesper": "./build/index.js"
+   },
+   "files": [
+     "build/**/*",
+     "src/python/**/*",
+     "README.md",
+     "LICENSE",
+     "mcp-config-template.json"
+   ],
+   "scripts": {
+     "build": "tsc",
+     "dev": "tsx watch src/index.ts",
+     "postinstall": "node scripts/postinstall.cjs",
+     "scrape": "tsx src/scripts/scrape-metadata.ts",
+     "massive-scrape": "tsx src/scripts/massive-scrape.ts",
+     "index": "tsx src/scripts/build-index.ts",
+     "search-cli": "tsx src/scripts/search-cli.ts",
+     "check-db": "tsx src/scripts/check-db.ts",
+     "test-jit": "tsx src/scripts/test-jit.ts",
+     "demo-ui": "tsx src/scripts/demo-ui.ts",
+     "test": "vitest"
+   },
+   "keywords": [
+     "mcp",
+     "model-context-protocol",
+     "dataset",
+     "machine-learning",
+     "data-quality",
+     "huggingface",
+     "kaggle",
+     "multimodal",
+     "image-analysis",
+     "audio-analysis",
+     "video-analysis",
+     "data-preparation",
+     "ai",
+     "ml"
+   ],
+   "author": "Vesper Team",
+   "license": "MIT",
+   "repository": {
+     "type": "git",
+     "url": "https://github.com/vesper/mcp-server"
+   },
+   "engines": {
+     "node": ">=18.0.0",
+     "npm": ">=8.0.0"
+   },
+   "dependencies": {
+     "@huggingface/hub": "^2.7.1",
+     "@modelcontextprotocol/sdk": "^1.25.2",
+     "@xenova/transformers": "^2.17.2",
+     "adm-zip": "^0.5.16",
+     "ajv": "^8.17.1",
+     "ajv-formats": "^3.0.1",
+     "better-sqlite3": "^12.6.0",
+     "lodash": "^4.17.21",
+     "uuid": "^13.0.0",
+     "zod": "^4.3.5",
+     "zod-to-json-schema": "^3.25.1"
+   },
+   "devDependencies": {
+     "@types/adm-zip": "^0.5.7",
+     "@types/better-sqlite3": "^7.6.13",
+     "@types/lodash": "^4.17.23",
+     "@types/node": "^25.0.9",
+     "@types/uuid": "^10.0.0",
+     "@typescript-eslint/eslint-plugin": "^8.53.0",
+     "@typescript-eslint/parser": "^8.53.0",
+     "eslint": "^9.39.2",
+     "eslint-config-prettier": "^10.1.8",
+     "prettier": "^3.8.0",
+     "tsx": "^4.21.0",
+     "typescript": "^5.9.3",
+     "vitest": "^4.0.17"
+   }
+ }
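
Note that the bin field maps a vesper command to build/index.js, so a global install should also expose the server as a plain `vesper` executable; in principle that command (or `npx @vespermcp/mcp-server`) could stand in for the absolute node path in the config template above.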
package/src/python/cleaner.py ADDED
@@ -0,0 +1,196 @@
+ import sys
+ import json
+ import polars as pl
+ import numpy as np
+
+ # --- Operations Library ---
+
+ def op_remove_duplicates(df, params):
+     subset = params.get("subset", None)  # List of cols or None
+     before = len(df)
+     if subset:
+         df = df.unique(subset=subset)
+     else:
+         df = df.unique()
+     return df, {"rows_removed": before - len(df)}
+
+ def op_drop_columns(df, params):
+     cols = params.get("columns", [])
+     before = len(df.columns)
+     # Filter only existing cols to avoid errors
+     cols_to_drop = [c for c in cols if c in df.columns]
+     df = df.drop(cols_to_drop)
+     return df, {"columns_dropped": len(cols_to_drop)}
+
+ def op_fill_missing(df, params):
+     col = params["column"]
+     method = params.get("method", "mean")  # mean, median, mode, constant
+     value = params.get("value", None)
+
+     if col not in df.columns:
+         return df, {"error": f"Column {col} not found"}
+
+     affected = df[col].null_count()
+
+     if method == "constant":
+         df = df.with_columns(pl.col(col).fill_null(value))
+     elif method == "mean":
+         mean_val = df[col].mean()
+         df = df.with_columns(pl.col(col).fill_null(mean_val))
+     elif method == "median":
+         median_val = df[col].median()
+         df = df.with_columns(pl.col(col).fill_null(median_val))
+
+     return df, {"rows_imputed": affected}
+
+ def op_fix_types(df, params):
+     col = params["column"]
+     target_type = params["type"]  # "int", "float", "string", "date"
+
+     if col not in df.columns:
+         return df, {"error": f"Column {col} not found"}
+
+     try:
+         if target_type == "int":
+             df = df.with_columns(pl.col(col).cast(pl.Int64, strict=False))
+         elif target_type == "float":
+             df = df.with_columns(pl.col(col).cast(pl.Float64, strict=False))
+         elif target_type == "string":
+             df = df.with_columns(pl.col(col).cast(pl.Utf8))
+         elif target_type == "date":
+             df = df.with_columns(pl.col(col).str.to_date(strict=False))
+
+         return df, {"status": "Converted"}
+     except Exception as e:
+         return df, {"error": str(e)}
+
+ def op_remove_outliers(df, params):
+     col = params["column"]
+     method = params.get("method", "iqr")
+     threshold = params.get("threshold", 1.5)
+
+     if col not in df.columns:
+         return df, {"error": f"Column {col} not found"}
+
+     before = len(df)
+
+     if method == "iqr":
+         q1 = df[col].quantile(0.25)
+         q3 = df[col].quantile(0.75)
+         iqr = q3 - q1
+         lower = q1 - (threshold * iqr)
+         upper = q3 + (threshold * iqr)
+
+         df = df.filter((pl.col(col) >= lower) & (pl.col(col) <= upper))
+
+     return df, {"rows_removed": before - len(df)}
+
+ def op_encode_categories(df, params):
+     col = params["column"]
+     method = params.get("method", "label")  # label, onehot
+
+     if col not in df.columns:
+         return df, {"error": f"Column {col} not found"}
+
+     if method == "label":
+         # Polars dense_rank acts similar to label encoding
+         df = df.with_columns(pl.col(col).rank("dense").alias(f"{col}_encoded"))
+     elif method == "onehot":
+         dummies = df[col].to_dummies()
+         df = pl.concat([df, dummies], how="horizontal")
+
+     return df, {"status": f"Encoded using {method}"}
+
+ # --- Registry ---
+
+ OPERATIONS = {
+     "RemoveDuplicates": op_remove_duplicates,
+     "DropColumns": op_drop_columns,
+     "FillMissing": op_fill_missing,
+     "FixTypes": op_fix_types,
+     "RemoveOutliers": op_remove_outliers,
+     "EncodeCategories": op_encode_categories
+ }
+
+ def main():
+     if len(sys.argv) < 3:
+         print(json.dumps({"error": "Usage: cleaner.py <file_path> <operations_json>"}), file=sys.stderr)
+         sys.exit(1)
+
+     file_path = sys.argv[1]
+     ops_json = sys.argv[2]
+
+     try:
+         operations = json.loads(ops_json)
+
+         # Load Data
+         file_path_lower = file_path.lower()
+         if file_path_lower.endswith(".csv"):
+             df = pl.read_csv(file_path, ignore_errors=True)
+         elif file_path_lower.endswith(".parquet"):
+             df = pl.read_parquet(file_path)
+         elif file_path_lower.endswith(".jsonl") or file_path_lower.endswith(".ndjson"):
+             # Explicit NDJSON
+             df = pl.read_ndjson(file_path)
+         elif file_path_lower.endswith(".json"):
+             # Ambiguous .json: try standard JSON first, then NDJSON
+             try:
+                 df = pl.read_json(file_path)
+             except Exception:
+                 try:
+                     df = pl.read_ndjson(file_path)
+                 except Exception as e:
+                     raise ValueError(f"Failed to read JSON: {str(e)}")
+         else:
+             raise ValueError(f"Unsupported format: {file_path}")
+
+         logs = []
+         total_rows_affected = 0
+
+         # Execute Pipeline
+         for op in operations:
+             op_type = op["type"]
+             params = op.get("params", {})
+
+             if op_type in OPERATIONS:
+                 try:
+                     df, stats = OPERATIONS[op_type](df, params)
+                     logs.append(f"Executed {op_type}: {stats}")
+                     total_rows_affected += stats.get("rows_removed", 0)
+                 except Exception as e:
+                     logs.append(f"Failed {op_type}: {str(e)}")
+             else:
+                 logs.append(f"Unknown operation: {op_type}")
+
+         # Save Result (overwrite or new file)
+         output_format = sys.argv[3] if len(sys.argv) > 3 else None
+
+         if not output_format:
+             # Legacy logic: preserve CSV or default to parquet
+             if file_path_lower.endswith(".csv"):
+                 output_format = "csv"
+             else:
+                 output_format = "parquet"
+
+         base_name = file_path.rsplit(".", 1)[0]
+         if output_format == "csv":
+             output_path = f"{base_name}_cleaned.csv"
+             df.write_csv(output_path)
+         else:
+             output_path = f"{base_name}_cleaned.parquet"
+             df.write_parquet(output_path)
+
+         print(json.dumps({
+             "success": True,
+             "output_path": output_path,
+             "rows_affected": total_rows_affected,
+             "logs": logs
+         }))
+
+     except Exception as e:
+         print(json.dumps({"success": False, "error": str(e)}))
+         sys.exit(1)
+
+ if __name__ == "__main__":
+     main()
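
For orientation, a hedged sketch of how the Node side might invoke this script. The actual wiring lives in build/cleaning/executor.js, which is not shown in this diff, so the python3 binary name, paths, and the operation payload below are assumptions; only the operation names come from the registry above.

    import { execFile } from "child_process";
    import { promisify } from "util";

    const run = promisify(execFile);

    // Hypothetical cleaning plan using operation names from the OPERATIONS registry.
    const operations = [
        { type: "RemoveDuplicates", params: {} },
        { type: "FillMissing", params: { column: "age", method: "median" } },
        { type: "RemoveOutliers", params: { column: "income", method: "iqr", threshold: 1.5 } },
    ];

    const { stdout } = await run("python3", [
        "src/python/cleaner.py",
        "/tmp/dataset.csv", // placeholder input
        JSON.stringify(operations),
    ]);
    console.log(JSON.parse(stdout)); // { success: true, output_path: "/tmp/dataset_cleaned.csv", ... }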
package/src/python/export_engine.py ADDED
@@ -0,0 +1,112 @@
+ import sys
+ import json
+ import polars as pl
+ import os
+
+ # Optional TensorFlow import for TFRecord support
+ try:
+     import tensorflow as tf
+     HAS_TENSORFLOW = True
+ except ImportError:
+     HAS_TENSORFLOW = False
+
+ def export_data(file_path, output_path, format, options=None):
+     options = options or {}
+
+     # Load Data
+     try:
+         if file_path.endswith(".csv"):
+             df = pl.read_csv(file_path, ignore_errors=True)
+         elif file_path.endswith(".parquet"):
+             df = pl.read_parquet(file_path)
+         else:
+             return {"error": f"Unsupported input format: {file_path}"}
+     except Exception as e:
+         return {"error": f"Failed to load input file: {str(e)}"}
+
+     output_dir = os.path.dirname(output_path)
+     if output_dir and not os.path.exists(output_dir):
+         os.makedirs(output_dir, exist_ok=True)
+
+     try:
+         # Export Logic
+         if format == "csv":
+             df.write_csv(output_path)
+
+         elif format == "parquet":
+             compression = options.get("compression", "snappy")
+             df.write_parquet(output_path, compression=compression)
+
+         elif format == "jsonl":
+             df.write_ndjson(output_path)
+
+         elif format == "arrow" or format == "ipc":
+             compression = options.get("compression", "uncompressed")
+             if compression == "uncompressed":
+                 compression = None
+             df.write_ipc(output_path, compression=compression)
+
+         elif format == "tfrecord":
+             if not HAS_TENSORFLOW:
+                 return {"error": "TensorFlow is not installed. Cannot export to TFRecord."}
+
+             # TFRecord Export Logic (using TensorFlow)
+             with tf.io.TFRecordWriter(output_path) as writer:
+                 # Convert Polars -> Pandas for iteration (simpler for now)
+                 # TODO: Optimize with Arrow -> Tensor conversion if needed for massive data
+                 pdf = df.to_pandas()
+                 for _, row in pdf.iterrows():
+                     feature = {}
+                     for col, value in row.items():
+                         if value is None:
+                             continue
+
+                         # Type inference for TFRecord features
+                         if isinstance(value, int):
+                             feature[col] = tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
+                         elif isinstance(value, float):
+                             feature[col] = tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
+                         elif isinstance(value, str):
+                             feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.encode('utf-8')]))
+                         elif isinstance(value, bytes):
+                             feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
+                         else:
+                             # Fallback to string for unknown types
+                             feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[str(value).encode('utf-8')]))
+
+                     example = tf.train.Example(features=tf.train.Features(feature=feature))
+                     writer.write(example.SerializeToString())
+
+         else:
+             return {"error": f"Unknown export format: {format}"}
+
+         return {
+             "success": True,
+             "output_path": output_path,
+             "rows": len(df),
+             "format": format
+         }
+
+     except Exception as e:
+         return {"error": f"Export failed: {str(e)}"}
+
+ def main():
+     if len(sys.argv) < 4:
+         print(json.dumps({"error": "Usage: export_engine.py <input_file> <output_file> <format> [options_json]"}), file=sys.stderr)
+         sys.exit(1)
+
+     input_file = sys.argv[1]
+     output_file = sys.argv[2]
+     fmt = sys.argv[3]
+
+     options = {}
+     if len(sys.argv) > 4:
+         try:
+             options = json.loads(sys.argv[4])
+         except json.JSONDecodeError:
+             pass
+
+     result = export_data(input_file, output_file, fmt, options)
+     print(json.dumps(result))
+
+ if __name__ == "__main__":
+     main()
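
The same spawn pattern as the cleaner applies here; a hedged sketch under the same assumptions (placeholder paths, python3 binary name). The fourth argument is the optional options JSON; "zstd" is a compression value polars accepts for write_parquet, though how the package itself chooses options is not shown in this diff.

    import { execFile } from "child_process";
    import { promisify } from "util";

    const run = promisify(execFile);

    // argv: <input_file> <output_file> <format> [options_json]
    const { stdout } = await run("python3", [
        "src/python/export_engine.py",
        "/tmp/dataset_cleaned.csv",    // placeholder input
        "/tmp/export/dataset.parquet", // placeholder output
        "parquet",
        JSON.stringify({ compression: "zstd" }),
    ]);
    console.log(JSON.parse(stdout)); // { success: true, rows: ..., format: "parquet" }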