@vespermcp/mcp-server 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. package/LICENSE +21 -0
  2. package/README.md +259 -0
  3. package/build/cache/cdn.js +34 -0
  4. package/build/cache/service.js +63 -0
  5. package/build/cleaning/cleaner.js +50 -0
  6. package/build/cleaning/evaluator.js +89 -0
  7. package/build/cleaning/executor.js +60 -0
  8. package/build/cleaning/exporter.js +87 -0
  9. package/build/cleaning/planner.js +111 -0
  10. package/build/cleaning/rules.js +57 -0
  11. package/build/cleaning/types.js +1 -0
  12. package/build/cloud/adapters/local.js +37 -0
  13. package/build/cloud/adapters/s3.js +24 -0
  14. package/build/cloud/storage-manager.js +20 -0
  15. package/build/cloud/types.js +1 -0
  16. package/build/compliance/service.js +73 -0
  17. package/build/compliance/store.js +80 -0
  18. package/build/compliance/types.js +1 -0
  19. package/build/data/processing-worker.js +23 -0
  20. package/build/data/streaming.js +38 -0
  21. package/build/data/worker-pool.js +39 -0
  22. package/build/export/exporter.js +45 -0
  23. package/build/export/packager.js +100 -0
  24. package/build/export/types.js +1 -0
  25. package/build/fusion/aligner.js +56 -0
  26. package/build/fusion/deduplicator.js +69 -0
  27. package/build/fusion/harmonizer.js +39 -0
  28. package/build/fusion/orchestrator.js +86 -0
  29. package/build/fusion/types.js +1 -0
  30. package/build/index.js +632 -0
  31. package/build/ingestion/hf-downloader.js +64 -0
  32. package/build/ingestion/ingestor.js +96 -0
  33. package/build/ingestion/kaggle-downloader.js +79 -0
  34. package/build/install/install-service.js +41 -0
  35. package/build/jobs/manager.js +129 -0
  36. package/build/jobs/queue.js +59 -0
  37. package/build/jobs/types.js +1 -0
  38. package/build/metadata/domain.js +147 -0
  39. package/build/metadata/github-scraper.js +47 -0
  40. package/build/metadata/institutional-scrapers.js +49 -0
  41. package/build/metadata/kaggle-scraper.js +182 -0
  42. package/build/metadata/license.js +68 -0
  43. package/build/metadata/monitoring-service.js +107 -0
  44. package/build/metadata/monitoring-store.js +78 -0
  45. package/build/metadata/monitoring-types.js +1 -0
  46. package/build/metadata/quality.js +48 -0
  47. package/build/metadata/rate-limiter.js +128 -0
  48. package/build/metadata/scraper.js +353 -0
  49. package/build/metadata/store.js +325 -0
  50. package/build/metadata/types.js +1 -0
  51. package/build/metadata/uci-scraper.js +49 -0
  52. package/build/monitoring/observability.js +76 -0
  53. package/build/quality/analyzer.js +57 -0
  54. package/build/quality/image-analyzer.js +46 -0
  55. package/build/quality/media-analyzer.js +46 -0
  56. package/build/quality/quality-orchestrator.js +162 -0
  57. package/build/quality/types.js +1 -0
  58. package/build/scripts/build-index.js +54 -0
  59. package/build/scripts/check-db.js +73 -0
  60. package/build/scripts/check-jobs.js +24 -0
  61. package/build/scripts/check-naruto.js +17 -0
  62. package/build/scripts/demo-full-pipeline.js +62 -0
  63. package/build/scripts/demo-ui.js +58 -0
  64. package/build/scripts/e2e-demo.js +72 -0
  65. package/build/scripts/massive-scrape.js +103 -0
  66. package/build/scripts/ops-dashboard.js +33 -0
  67. package/build/scripts/scrape-metadata.js +100 -0
  68. package/build/scripts/search-cli.js +26 -0
  69. package/build/scripts/test-bias.js +45 -0
  70. package/build/scripts/test-caching.js +51 -0
  71. package/build/scripts/test-cleaning.js +76 -0
  72. package/build/scripts/test-cloud-storage.js +48 -0
  73. package/build/scripts/test-compliance.js +58 -0
  74. package/build/scripts/test-conversion.js +64 -0
  75. package/build/scripts/test-custom-rules.js +58 -0
  76. package/build/scripts/test-db-opt.js +63 -0
  77. package/build/scripts/test-export-custom.js +33 -0
  78. package/build/scripts/test-exporter.js +53 -0
  79. package/build/scripts/test-fusion.js +61 -0
  80. package/build/scripts/test-github.js +27 -0
  81. package/build/scripts/test-group-split.js +52 -0
  82. package/build/scripts/test-hf-download.js +29 -0
  83. package/build/scripts/test-holdout-manager.js +61 -0
  84. package/build/scripts/test-hybrid-search.js +41 -0
  85. package/build/scripts/test-image-analysis.js +50 -0
  86. package/build/scripts/test-ingestion-infra.js +39 -0
  87. package/build/scripts/test-install.js +40 -0
  88. package/build/scripts/test-institutional.js +26 -0
  89. package/build/scripts/test-integrity.js +41 -0
  90. package/build/scripts/test-jit.js +42 -0
  91. package/build/scripts/test-job-queue.js +62 -0
  92. package/build/scripts/test-kaggle-download.js +34 -0
  93. package/build/scripts/test-large-data.js +50 -0
  94. package/build/scripts/test-mcp-v5.js +73 -0
  95. package/build/scripts/test-media-analysis.js +61 -0
  96. package/build/scripts/test-monitoring.js +91 -0
  97. package/build/scripts/test-observability.js +106 -0
  98. package/build/scripts/test-packager.js +55 -0
  99. package/build/scripts/test-pipeline.js +50 -0
  100. package/build/scripts/test-planning.js +64 -0
  101. package/build/scripts/test-privacy.js +38 -0
  102. package/build/scripts/test-quality.js +43 -0
  103. package/build/scripts/test-robust-ingestion.js +41 -0
  104. package/build/scripts/test-schema.js +45 -0
  105. package/build/scripts/test-split-validation.js +40 -0
  106. package/build/scripts/test-splitter.js +93 -0
  107. package/build/scripts/test-uci.js +27 -0
  108. package/build/scripts/test-unified-quality.js +86 -0
  109. package/build/search/embedder.js +34 -0
  110. package/build/search/engine.js +129 -0
  111. package/build/search/jit-orchestrator.js +232 -0
  112. package/build/search/vector-store.js +105 -0
  113. package/build/splitting/splitter.js +57 -0
  114. package/build/splitting/types.js +1 -0
  115. package/build/tools/formatter.js +227 -0
  116. package/build/utils/downloader.js +52 -0
  117. package/mcp-config-template.json +15 -0
  118. package/package.json +84 -0
  119. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  120. package/src/python/cleaner.py +196 -0
  121. package/src/python/export_engine.py +112 -0
  122. package/src/python/framework_adapters.py +100 -0
  123. package/src/python/github_adapter.py +106 -0
  124. package/src/python/image_engine.py +86 -0
  125. package/src/python/media_engine.py +133 -0
  126. package/src/python/nasa_adapter.py +82 -0
  127. package/src/python/quality_engine.py +243 -0
  128. package/src/python/splitter_engine.py +283 -0
  129. package/src/python/test_framework_adapters.py +61 -0
  130. package/src/python/uci_adapter.py +94 -0
  131. package/src/python/worldbank_adapter.py +99 -0
package/build/tools/formatter.js ADDED
@@ -0,0 +1,227 @@
+ /**
+  * Format job status for visual representation
+  */
+ export function formatJobStatus(job) {
+     const emojiMap = {
+         "pending": "⏳",
+         "queued": "📋",
+         "running": "🔄",
+         "completed": "✅",
+         "failed": "❌",
+         "retrying": "🔁"
+     };
+     const emoji = emojiMap[job.status] || "❓";
+     const barWidth = 20;
+     const filledWidth = Math.round((job.progress / 100) * barWidth);
+     const emptyWidth = barWidth - filledWidth;
+     const bar = "█".repeat(filledWidth) + "░".repeat(emptyWidth);
+     let output = `═ Job Status: ${job.type.toUpperCase()} ═\n`;
+     output += `ID: ${job.id}\n`;
+     output += `Status: ${emoji} ${job.status.toUpperCase()}\n`;
+     output += `Progress: ${bar} ${job.progress}%\n`;
+     output += `Activity: ${job.status_text}\n`;
+     if (job.result_url) {
+         output += `Result: ${job.result_url}\n`;
+     }
+     if (job.error) {
+         output += `Error: ${job.error}\n`;
+     }
+     output += `Updated: ${new Date(job.updated_at).toLocaleTimeString()}\n`;
+     output += "═".repeat(25) + "\n";
+     return output;
+ }
+ /**
+  * Format dataset search results for human-readable display
+  */
+ export function formatSearchResults(results) {
+     if (results.length === 0) {
+         return "No datasets found matching your query.";
+     }
+     let output = `Found ${results.length} dataset(s):\n\n`;
+     output += "═".repeat(80) + "\n\n";
+     results.forEach((ds, index) => {
+         const relevanceScore = ds.relevance_score || 0;
+         // Source badge
+         const sourceBadge = ds.source === "huggingface" ? "HuggingFace" : "Kaggle";
+         // Safety indicator
+         let safetyIndicator = "";
+         if (ds.license.category === "safe") {
+             safetyIndicator = "Safe";
+         }
+         else if (ds.license.category === "restricted") {
+             safetyIndicator = "Restricted";
+         }
+         else {
+             safetyIndicator = "Unknown License";
+         }
+         // Header
+         output += `${index + 1}. ${ds.name}\n`;
+         output += `   ${sourceBadge} | ${safetyIndicator} | Relevance: ${(relevanceScore * 100).toFixed(0)}%\n`;
+         output += `   ID: ${ds.id}\n\n`;
+         // Description
+         if (ds.description && ds.description.length > 0) {
+             const shortDesc = ds.description.length > 200
+                 ? ds.description.substring(0, 200) + "..."
+                 : ds.description;
+             output += `   ${shortDesc}\n\n`;
+         }
+         // Quality warnings
+         if (ds.quality_warnings && ds.quality_warnings.length > 0) {
+             output += `   Quality Warnings:\n`;
+             ds.quality_warnings.forEach(warning => {
+                 output += `     • ${warning}\n`;
+             });
+             output += "\n";
+         }
+         // Key stats
+         output += `   Stats:\n`;
+         if (ds.downloads)
+             output += `     Downloads: ${ds.downloads.toLocaleString()}\n`;
+         if (ds.likes)
+             output += `     Likes: ${ds.likes}\n`;
+         if (ds.total_examples)
+             output += `     Examples: ${ds.total_examples.toLocaleString()}\n`;
+         if (ds.total_size_mb)
+             output += `     Size: ${ds.total_size_mb} MB\n`;
+         output += `     Domain: ${ds.domain || "unknown"}\n`;
+         output += `     Task: ${ds.task || "unknown"}\n`;
+         // Data splits
+         if (ds.splits && ds.splits.length > 0) {
+             const splitNames = ds.splits.map(s => s.name).join(", ");
+             output += `     Splits: ${splitNames}\n`;
+         }
+         // License details
+         output += `\n   License: ${ds.license.id || "Unknown"}\n`;
+         if (ds.license.warnings && ds.license.warnings.length > 0) {
+             ds.license.warnings.forEach(warning => {
+                 output += `     WARNING: ${warning}\n`;
+             });
+         }
+         if (ds.license.commercial_use !== undefined) {
+             output += `   Commercial use: ${ds.license.commercial_use ? "Yes" : "No"}\n`;
+         }
+         // Download link
+         output += `\n   ${ds.download_url}\n`;
+         output += "\n" + "─".repeat(80) + "\n\n";
+     });
+     return output;
+ }
+ /**
+  * Format detailed dataset info
+  */
+ export function formatDatasetInfo(ds) {
+     let output = "";
+     // Header
+     output += "═".repeat(80) + "\n";
+     output += `${ds.name}\n`;
+     output += "═".repeat(80) + "\n\n";
+     // Source and safety
+     const sourceBadge = ds.source === "huggingface" ? "HuggingFace" : "Kaggle";
+     let safetyIndicator = "";
+     if (ds.license.category === "safe") {
+         safetyIndicator = "Safe for use";
+     }
+     else if (ds.license.category === "restricted") {
+         safetyIndicator = "Restricted - Review license carefully";
+     }
+     else {
+         safetyIndicator = "Unknown license - Use with caution";
+     }
+     output += `Source: ${sourceBadge}\n`;
+     output += `Safety: ${safetyIndicator}\n`;
+     output += `ID: ${ds.id}\n\n`;
+     // Description
+     if (ds.description) {
+         output += "Description:\n";
+         output += `${ds.description}\n\n`;
+     }
+     // Quality warnings
+     if (ds.quality_warnings && ds.quality_warnings.length > 0) {
+         output += "Quality Warnings:\n";
+         ds.quality_warnings.forEach(warning => {
+             output += `  • ${warning}\n`;
+         });
+         output += "\n";
+     }
+     // Metadata
+     output += "Metadata:\n";
+     output += `  Downloads: ${ds.downloads?.toLocaleString() || "N/A"}\n`;
+     output += `  Likes: ${ds.likes || 0}\n`;
+     output += `  Quality Score: ${ds.quality_score}/100\n`;
+     output += `  Domain: ${ds.domain || "unknown"}\n`;
+     output += `  Task: ${ds.task || "unknown"}\n`;
+     output += `  Languages: ${ds.languages?.join(", ") || "N/A"}\n`;
+     output += `  Last Updated: ${new Date(ds.last_updated).toLocaleDateString()}\n\n`;
+     // Data characteristics
+     output += "Data Characteristics:\n";
+     output += `  Total Examples: ${ds.total_examples?.toLocaleString() || "N/A"}\n`;
+     output += `  Total Size: ${ds.total_size_mb ? ds.total_size_mb + " MB" : "N/A"}\n`;
+     output += `  Structured: ${ds.is_structured ? "Yes" : "No"}\n`;
+     output += `  Has Target Column: ${ds.has_target_column ? "Yes" : "No"}\n`;
+     output += `  Format: ${ds.format || "N/A"}\n\n`;
+     // Splits
+     if (ds.splits && ds.splits.length > 0) {
+         output += "Data Splits:\n";
+         ds.splits.forEach(split => {
+             output += `  • ${split.name}: ${split.num_examples?.toLocaleString() || "?"} examples`;
+             if (split.size_bytes) {
+                 output += ` (${(split.size_bytes / (1024 * 1024)).toFixed(2)} MB)`;
+             }
+             output += "\n";
+         });
+         output += "\n";
+     }
+     // Columns
+     if (ds.columns && ds.columns.length > 0) {
+         output += "Columns:\n";
+         ds.columns.slice(0, 10).forEach(col => {
+             const targetMarker = col.is_target ? " [TARGET]" : "";
+             output += `  • ${col.name}${targetMarker}`;
+             if (col.type)
+                 output += ` (${col.type})`;
+             output += "\n";
+         });
+         if (ds.columns.length > 10) {
+             output += `  ... and ${ds.columns.length - 10} more columns\n`;
+         }
+         output += "\n";
+     }
+     // License
+     output += "License Information:\n";
+     output += `  License: ${ds.license.id || "Unknown"}\n`;
+     output += `  Category: ${ds.license.category}\n`;
+     output += `  Commercial Use: ${ds.license.commercial_use ? "Allowed" : "Not allowed"}\n`;
+     if (ds.license.warnings && ds.license.warnings.length > 0) {
+         output += `  Warnings:\n`;
+         ds.license.warnings.forEach(warning => {
+             output += `    WARNING: ${warning}\n`;
+         });
+     }
+     if (ds.license.usage_restrictions && ds.license.usage_restrictions.length > 0) {
+         output += `  Restrictions:\n`;
+         ds.license.usage_restrictions.forEach(restriction => {
+             output += `    • ${restriction}\n`;
+         });
+     }
+     output += "\n";
+     // Safety flags
+     output += "Safety Flags:\n";
+     output += `  Safe Source: ${ds.is_safe_source ? "Yes" : "No"}\n`;
+     output += `  Has Personal Data: ${ds.has_personal_data ? "Yes" : "No"}\n`;
+     output += `  Paywalled: ${ds.is_paywalled ? "Yes" : "No"}\n`;
+     output += `  Scraped Web Data: ${ds.is_scraped_web_data ? "Yes" : "No"}\n\n`;
+     // Tags
+     if (ds.tags && ds.tags.length > 0) {
+         output += "Tags:\n";
+         output += `  ${ds.tags.slice(0, 15).join(", ")}`;
+         if (ds.tags.length > 15) {
+             output += ` ... and ${ds.tags.length - 15} more`;
+         }
+         output += "\n\n";
+     }
+     // Download link
+     output += "Download:\n";
+     output += `  ${ds.download_url}\n\n`;
+     output += "═".repeat(80) + "\n";
+     return output;
+ }
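
For orientation, a minimal usage sketch for formatJobStatus; the job object shape is inferred from the fields the formatter reads (id, type, status, progress, status_text, updated_at), and the deep import path and field values are assumptions rather than documented API.

    import { formatJobStatus } from "@vespermcp/mcp-server/build/tools/formatter.js";

    // Hypothetical job object; only fields the formatter actually reads.
    const job = {
        id: "job-42",
        type: "ingestion",
        status: "running",
        progress: 65,
        status_text: "Downloading shards",
        updated_at: Date.now(),
    };

    console.log(formatJobStatus(job));
    // ═ Job Status: INGESTION ═
    // ID: job-42
    // Status: 🔄 RUNNING
    // Progress: █████████████░░░░░░░ 65%
    // Activity: Downloading shards
    // Updated: <local time>
    // ═════════════════════════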
package/build/utils/downloader.js ADDED
@@ -0,0 +1,52 @@
+ import fs from "fs";
+ import { Readable } from "stream";
+ import { finished } from "stream/promises";
+ import { retryWithBackoff } from "../metadata/rate-limiter.js";
+ export class RobustDownloader {
+     /**
+      * Downloads a file with automatic retries and resume support
+      */
+     async download(url, targetPath, options = {}) {
+         await retryWithBackoff(async () => {
+             let startByte = 0;
+             const headers = { ...(options.headers || {}) };
+             // Handle resume logic
+             if (options.resume && fs.existsSync(targetPath)) {
+                 startByte = fs.statSync(targetPath).size;
+                 if (startByte > 0) {
+                     headers["Range"] = `bytes=${startByte}-`;
+                     console.log(`[Downloader] Resuming from byte ${startByte}`);
+                 }
+             }
+             const response = await fetch(url, { headers });
+             if (response.status === 416) {
+                 // Requested range not satisfiable - likely already finished
+                 console.log("[Downloader] Range not satisfiable, file might be complete.");
+                 return;
+             }
+             if (!response.ok && response.status !== 206) {
+                 throw new Error(`Download failed: ${response.statusText} (${response.status})`);
+             }
+             const contentLength = response.headers.get("content-length");
+             const totalSize = (contentLength ? parseInt(contentLength, 10) : 0) + startByte;
+             const reader = response.body;
+             if (!reader)
+                 throw new Error("Response body is empty");
+             // Open stream in append mode if resuming
+             const fileStream = fs.createWriteStream(targetPath, { flags: startByte > 0 ? "a" : "w" });
+             const nodeReadable = Readable.fromWeb(reader);
+             let downloadedBytes = startByte;
+             let lastProgressTime = 0;
+             nodeReadable.on("data", (chunk) => {
+                 downloadedBytes += chunk.length;
+                 // Throttle progress updates
+                 const now = Date.now();
+                 if (options.onProgress && (now - lastProgressTime > 500 || downloadedBytes === totalSize)) {
+                     options.onProgress(downloadedBytes, totalSize);
+                     lastProgressTime = now;
+                 }
+             });
+             await finished(nodeReadable.pipe(fileStream));
+         }, { maxRetries: 5, initialDelay: 2000 });
+     }
+ }
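
A minimal usage sketch for RobustDownloader, assuming a placeholder URL and target path; the onProgress contract (bytes downloaded, total bytes) is taken from the data handler above, and the deep import path is an assumption.

    import { RobustDownloader } from "@vespermcp/mcp-server/build/utils/downloader.js";

    const downloader = new RobustDownloader();
    await downloader.download(
        "https://example.com/data/train.parquet", // placeholder URL
        "/tmp/train.parquet",
        {
            resume: true, // a re-run appends from the existing partial file
            onProgress: (done, total) => {
                const pct = total ? ((done / total) * 100).toFixed(1) : "?";
                process.stdout.write(`\rDownloaded ${done}/${total} bytes (${pct}%)`);
            },
        }
    );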
package/mcp-config-template.json ADDED
@@ -0,0 +1,15 @@
+ {
+   "mcpServers": {
+     "vesper": {
+       "command": "node",
+       "args": [
+         "/path/to/global/node_modules/@vespermcp/mcp-server/build/index.js"
+       ],
+       "env": {
+         "KAGGLE_USERNAME": "your-kaggle-username",
+         "KAGGLE_KEY": "your-kaggle-api-key",
+         "HF_TOKEN": "your-huggingface-token"
+       }
+     }
+   }
+ }
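
This template follows the standard MCP client configuration shape. The /path/to/global placeholder is left to the user; on most setups `npm root -g` prints the global node_modules directory to substitute in. The env block carries the Kaggle and Hugging Face credentials that the package's scrapers and downloaders expect.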
package/package.json ADDED
@@ -0,0 +1,84 @@
+ {
+   "name": "@vespermcp/mcp-server",
+   "version": "1.0.0",
+   "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
+   "type": "module",
+   "main": "build/index.js",
+   "bin": {
+     "vesper": "./build/index.js"
+   },
+   "files": [
+     "build/**/*",
+     "src/python/**/*",
+     "README.md",
+     "LICENSE",
+     "mcp-config-template.json"
+   ],
+   "scripts": {
+     "build": "tsc",
+     "dev": "tsx watch src/index.ts",
+     "postinstall": "node scripts/postinstall.cjs",
+     "scrape": "tsx src/scripts/scrape-metadata.ts",
+     "massive-scrape": "tsx src/scripts/massive-scrape.ts",
+     "index": "tsx src/scripts/build-index.ts",
+     "search-cli": "tsx src/scripts/search-cli.ts",
+     "check-db": "tsx src/scripts/check-db.ts",
+     "test-jit": "tsx src/scripts/test-jit.ts",
+     "demo-ui": "tsx src/scripts/demo-ui.ts",
+     "test": "vitest"
+   },
+   "keywords": [
+     "mcp",
+     "model-context-protocol",
+     "dataset",
+     "machine-learning",
+     "data-quality",
+     "huggingface",
+     "kaggle",
+     "multimodal",
+     "image-analysis",
+     "audio-analysis",
+     "video-analysis",
+     "data-preparation",
+     "ai",
+     "ml"
+   ],
+   "author": "Vesper Team",
+   "license": "MIT",
+   "repository": {
+     "type": "git",
+     "url": "https://github.com/vesper/mcp-server"
+   },
+   "engines": {
+     "node": ">=18.0.0",
+     "npm": ">=8.0.0"
+   },
+   "dependencies": {
+     "@huggingface/hub": "^2.7.1",
+     "@modelcontextprotocol/sdk": "^1.25.2",
+     "@xenova/transformers": "^2.17.2",
+     "adm-zip": "^0.5.16",
+     "ajv": "^8.17.1",
+     "ajv-formats": "^3.0.1",
+     "better-sqlite3": "^12.6.0",
+     "lodash": "^4.17.21",
+     "uuid": "^13.0.0",
+     "zod": "^4.3.5",
+     "zod-to-json-schema": "^3.25.1"
+   },
+   "devDependencies": {
+     "@types/adm-zip": "^0.5.7",
+     "@types/better-sqlite3": "^7.6.13",
+     "@types/lodash": "^4.17.23",
+     "@types/node": "^25.0.9",
+     "@types/uuid": "^10.0.0",
+     "@typescript-eslint/eslint-plugin": "^8.53.0",
+     "@typescript-eslint/parser": "^8.53.0",
+     "eslint": "^9.39.2",
+     "eslint-config-prettier": "^10.1.8",
+     "prettier": "^3.8.0",
+     "tsx": "^4.21.0",
+     "typescript": "^5.9.3",
+     "vitest": "^4.0.17"
+   }
+ }
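
Note that the bin field maps a vesper command to build/index.js, so a global install should also expose the server as a plain `vesper` executable; in principle that command (or `npx @vespermcp/mcp-server`) could stand in for the absolute node path in the config template above.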
package/src/python/cleaner.py ADDED
@@ -0,0 +1,196 @@
+ import sys
+ import json
+ import polars as pl
+ import numpy as np
+
+ # --- Operations Library ---
+
+ def op_remove_duplicates(df, params):
+     subset = params.get("subset", None)  # List of cols or None
+     before = len(df)
+     if subset:
+         df = df.unique(subset=subset)
+     else:
+         df = df.unique()
+     return df, {"rows_removed": before - len(df)}
+
+ def op_drop_columns(df, params):
+     cols = params.get("columns", [])
+     before = len(df.columns)
+     # Filter only existing cols to avoid errors
+     cols_to_drop = [c for c in cols if c in df.columns]
+     df = df.drop(cols_to_drop)
+     return df, {"columns_dropped": len(cols_to_drop)}
+
+ def op_fill_missing(df, params):
+     col = params["column"]
+     method = params.get("method", "mean")  # mean, median, mode, constant
+     value = params.get("value", None)
+
+     if col not in df.columns:
+         return df, {"error": f"Column {col} not found"}
+
+     affected = df[col].null_count()
+
+     if method == "constant":
+         df = df.with_columns(pl.col(col).fill_null(value))
+     elif method == "mean":
+         mean_val = df[col].mean()
+         df = df.with_columns(pl.col(col).fill_null(mean_val))
+     elif method == "median":
+         median_val = df[col].median()
+         df = df.with_columns(pl.col(col).fill_null(median_val))
+
+     return df, {"rows_imputed": affected}
+
+ def op_fix_types(df, params):
+     col = params["column"]
+     target_type = params["type"]  # "int", "float", "string", "date"
+
+     if col not in df.columns:
+         return df, {"error": f"Column {col} not found"}
+
+     try:
+         if target_type == "int":
+             df = df.with_columns(pl.col(col).cast(pl.Int64, strict=False))
+         elif target_type == "float":
+             df = df.with_columns(pl.col(col).cast(pl.Float64, strict=False))
+         elif target_type == "string":
+             df = df.with_columns(pl.col(col).cast(pl.Utf8))
+         elif target_type == "date":
+             df = df.with_columns(pl.col(col).str.to_date(strict=False))
+
+         return df, {"status": "Converted"}
+     except Exception as e:
+         return df, {"error": str(e)}
+
+ def op_remove_outliers(df, params):
+     col = params["column"]
+     method = params.get("method", "iqr")
+     threshold = params.get("threshold", 1.5)
+
+     if col not in df.columns:
+         return df, {"error": f"Column {col} not found"}
+
+     before = len(df)
+
+     if method == "iqr":
+         q1 = df[col].quantile(0.25)
+         q3 = df[col].quantile(0.75)
+         iqr = q3 - q1
+         lower = q1 - (threshold * iqr)
+         upper = q3 + (threshold * iqr)
+
+         df = df.filter((pl.col(col) >= lower) & (pl.col(col) <= upper))
+
+     return df, {"rows_removed": before - len(df)}
+
+ def op_encode_categories(df, params):
+     col = params["column"]
+     method = params.get("method", "label")  # label, onehot
+
+     if col not in df.columns:
+         return df, {"error": f"Column {col} not found"}
+
+     if method == "label":
+         # Polars dense_rank acts similar to label encoding
+         df = df.with_columns(pl.col(col).rank("dense").alias(f"{col}_encoded"))
+     elif method == "onehot":
+         dummies = df[col].to_dummies()
+         df = pl.concat([df, dummies], how="horizontal")
+
+     return df, {"status": f"Encoded using {method}"}
+
+ # --- Registry ---
+
+ OPERATIONS = {
+     "RemoveDuplicates": op_remove_duplicates,
+     "DropColumns": op_drop_columns,
+     "FillMissing": op_fill_missing,
+     "FixTypes": op_fix_types,
+     "RemoveOutliers": op_remove_outliers,
+     "EncodeCategories": op_encode_categories
+ }
+
+ def main():
+     if len(sys.argv) < 3:
+         print(json.dumps({"error": "Usage: cleaner.py <file_path> <operations_json>"}), file=sys.stderr)
+         sys.exit(1)
+
+     file_path = sys.argv[1]
+     ops_json = sys.argv[2]
+
+     try:
+         operations = json.loads(ops_json)
+
+         # Load Data
+         file_path_lower = file_path.lower()
+         if file_path_lower.endswith(".csv"):
+             df = pl.read_csv(file_path, ignore_errors=True)
+         elif file_path_lower.endswith(".parquet"):
+             df = pl.read_parquet(file_path)
+         elif file_path_lower.endswith(".jsonl") or file_path_lower.endswith(".ndjson"):
+             # Explicit NDJSON
+             df = pl.read_ndjson(file_path)
+         elif file_path_lower.endswith(".json"):
+             # Ambiguous .json: try standard JSON first, then NDJSON
+             try:
+                 df = pl.read_json(file_path)
+             except Exception:
+                 try:
+                     df = pl.read_ndjson(file_path)
+                 except Exception as e:
+                     raise ValueError(f"Failed to read JSON: {str(e)}")
+         else:
+             raise ValueError(f"Unsupported format: {file_path}")
+
+         logs = []
+         total_rows_affected = 0
+
+         # Execute Pipeline
+         for op in operations:
+             op_type = op["type"]
+             params = op.get("params", {})
+
+             if op_type in OPERATIONS:
+                 try:
+                     df, stats = OPERATIONS[op_type](df, params)
+                     logs.append(f"Executed {op_type}: {stats}")
+                     total_rows_affected += stats.get("rows_removed", 0)
+                 except Exception as e:
+                     logs.append(f"Failed {op_type}: {str(e)}")
+             else:
+                 logs.append(f"Unknown operation: {op_type}")
+
+         # Save Result (overwrite or new file)
+         output_format = sys.argv[3] if len(sys.argv) > 3 else None
+
+         if not output_format:
+             # Legacy logic: preserve CSV or default to parquet
+             if file_path_lower.endswith(".csv"):
+                 output_format = "csv"
+             else:
+                 output_format = "parquet"
+
+         base_name = file_path.rsplit(".", 1)[0]
+         if output_format == "csv":
+             output_path = f"{base_name}_cleaned.csv"
+             df.write_csv(output_path)
+         else:
+             output_path = f"{base_name}_cleaned.parquet"
+             df.write_parquet(output_path)
+
+         print(json.dumps({
+             "success": True,
+             "output_path": output_path,
+             "rows_affected": total_rows_affected,
+             "logs": logs
+         }))
+
+     except Exception as e:
+         print(json.dumps({"success": False, "error": str(e)}))
+         sys.exit(1)
+
+ if __name__ == "__main__":
+     main()
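
For orientation, a hedged sketch of how the Node side might invoke this script. The actual wiring lives in build/cleaning/executor.js, which is not shown in this diff, so the python3 binary name, paths, and the operation payload below are assumptions; only the operation names come from the registry above.

    import { execFile } from "child_process";
    import { promisify } from "util";

    const run = promisify(execFile);

    // Hypothetical cleaning plan using operation names from the OPERATIONS registry.
    const operations = [
        { type: "RemoveDuplicates", params: {} },
        { type: "FillMissing", params: { column: "age", method: "median" } },
        { type: "RemoveOutliers", params: { column: "income", method: "iqr", threshold: 1.5 } },
    ];

    const { stdout } = await run("python3", [
        "src/python/cleaner.py",
        "/tmp/dataset.csv", // placeholder input
        JSON.stringify(operations),
    ]);
    console.log(JSON.parse(stdout)); // { success: true, output_path: "/tmp/dataset_cleaned.csv", ... }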
package/src/python/export_engine.py ADDED
@@ -0,0 +1,112 @@
+ import sys
+ import json
+ import polars as pl
+ import os
+
+ # Optional TensorFlow import for TFRecord support
+ try:
+     import tensorflow as tf
+     HAS_TENSORFLOW = True
+ except ImportError:
+     HAS_TENSORFLOW = False
+
+ def export_data(file_path, output_path, format, options=None):
+     options = options or {}
+
+     # Load Data
+     try:
+         if file_path.endswith(".csv"):
+             df = pl.read_csv(file_path, ignore_errors=True)
+         elif file_path.endswith(".parquet"):
+             df = pl.read_parquet(file_path)
+         else:
+             return {"error": f"Unsupported input format: {file_path}"}
+     except Exception as e:
+         return {"error": f"Failed to load input file: {str(e)}"}
+
+     output_dir = os.path.dirname(output_path)
+     if output_dir and not os.path.exists(output_dir):
+         os.makedirs(output_dir, exist_ok=True)
+
+     try:
+         # Export Logic
+         if format == "csv":
+             df.write_csv(output_path)
+
+         elif format == "parquet":
+             compression = options.get("compression", "snappy")
+             df.write_parquet(output_path, compression=compression)
+
+         elif format == "jsonl":
+             df.write_ndjson(output_path)
+
+         elif format == "arrow" or format == "ipc":
+             compression = options.get("compression", "uncompressed")
+             if compression == "uncompressed":
+                 compression = None
+             df.write_ipc(output_path, compression=compression)
+
+         elif format == "tfrecord":
+             if not HAS_TENSORFLOW:
+                 return {"error": "TensorFlow is not installed. Cannot export to TFRecord."}
+
+             # TFRecord Export Logic (using TensorFlow)
+             with tf.io.TFRecordWriter(output_path) as writer:
+                 # Convert Polars -> Pandas for iteration (simpler for now)
+                 # TODO: Optimize with Arrow -> Tensor conversion if needed for massive data
+                 pdf = df.to_pandas()
+                 for _, row in pdf.iterrows():
+                     feature = {}
+                     for col, value in row.items():
+                         if value is None:
+                             continue
+
+                         # Type inference for TFRecord features
+                         if isinstance(value, int):
+                             feature[col] = tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
+                         elif isinstance(value, float):
+                             feature[col] = tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
+                         elif isinstance(value, str):
+                             feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.encode('utf-8')]))
+                         elif isinstance(value, bytes):
+                             feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
+                         else:
+                             # Fallback to string for unknown types
+                             feature[col] = tf.train.Feature(bytes_list=tf.train.BytesList(value=[str(value).encode('utf-8')]))
+
+                     example = tf.train.Example(features=tf.train.Features(feature=feature))
+                     writer.write(example.SerializeToString())
+
+         else:
+             return {"error": f"Unknown export format: {format}"}
+
+         return {
+             "success": True,
+             "output_path": output_path,
+             "rows": len(df),
+             "format": format
+         }
+
+     except Exception as e:
+         return {"error": f"Export failed: {str(e)}"}
+
+ def main():
+     if len(sys.argv) < 4:
+         print(json.dumps({"error": "Usage: export_engine.py <input_file> <output_file> <format> [options_json]"}), file=sys.stderr)
+         sys.exit(1)
+
+     input_file = sys.argv[1]
+     output_file = sys.argv[2]
+     fmt = sys.argv[3]
+
+     options = {}
+     if len(sys.argv) > 4:
+         try:
+             options = json.loads(sys.argv[4])
+         except json.JSONDecodeError:
+             pass
+
+     result = export_data(input_file, output_file, fmt, options)
+     print(json.dumps(result))
+
+ if __name__ == "__main__":
+     main()
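
The same spawn pattern as the cleaner applies here; a hedged sketch under the same assumptions (placeholder paths, python3 binary name). The fourth argument is the optional options JSON; "zstd" is a compression value polars accepts for write_parquet, though how the package itself chooses options is not shown in this diff.

    import { execFile } from "child_process";
    import { promisify } from "util";

    const run = promisify(execFile);

    // argv: <input_file> <output_file> <format> [options_json]
    const { stdout } = await run("python3", [
        "src/python/export_engine.py",
        "/tmp/dataset_cleaned.csv",    // placeholder input
        "/tmp/export/dataset.parquet", // placeholder output
        "parquet",
        JSON.stringify({ compression: "zstd" }),
    ]);
    console.log(JSON.parse(stdout)); // { success: true, rows: ..., format: "parquet" }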