npm - @vespermcp/mcp-server - Versions diffs - 1.2.28 → 1.2.30 - Mend

@vespermcp/mcp-server 1.2.28 → 1.2.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/README.MCP_MIGRATION.md ADDED Viewed

@@ -0,0 +1,116 @@
+# Vesper MCP Migration Guide (Pre-Production)
+This guide documents the MCP surface consolidation for Vesper as a general-purpose data layer.
+## What Changed
+- Removed: `configure_kaggle`
+- Merged quality tools into `quality_analyze`
+- Merged fusion tools into `fuse`
+- Merged lineage tools into `lineage`
+- Kept separate by design: `vesper_normalize_schema` and `vesper_convert_format`
+## Deprecation Map
+- `configure_kaggle` -> `configure_keys`
+  - Map: `username` -> `kaggle_username`, `key` -> `kaggle_key`
+- `analyze_quality` -> `quality_analyze` with `operation="dataset"`
+- `analyze_image_quality` -> `quality_analyze` with `operation="image"`
+- `analyze_media_quality` -> `quality_analyze` with `operation="media"`
+- `generate_quality_report` -> `quality_analyze` with `operation="report"`
+- `fuse_datasets` -> `fuse` with `operation="tabular"`
+- `vesper_fuse` -> `fuse` with `operation="web"`
+- `get_lineage` -> `lineage` with `operation="get"`
+- `diff_lineage_versions` -> `lineage` with `operation="diff"`
+## Migration Examples
+### Credentials
+Old:
+```json
+{ "name": "configure_kaggle", "arguments": { "username": "u", "key": "k" } }
+```
+New:
+```json
+{
+  "name": "configure_keys",
+  "arguments": {
+    "kaggle_username": "u",
+    "kaggle_key": "k"
+  }
+}
+```
+### Quality
+Old:
+```json
+{ "name": "analyze_quality", "arguments": { "dataset_id": "my_ds" } }
+```
+New:
+```json
+{
+  "name": "quality_analyze",
+  "arguments": {
+    "operation": "dataset",
+    "dataset_id": "my_ds"
+  }
+}
+```
+### Fusion
+Old:
+```json
+{ "name": "fuse_datasets", "arguments": { "sources": ["a", "b"], "strategy": "concat" } }
+```
+New:
+```json
+{
+  "name": "fuse",
+  "arguments": {
+    "operation": "tabular",
+    "sources": ["a", "b"],
+    "strategy": "concat"
+  }
+}
+```
+### Lineage
+Old:
+```json
+{ "name": "get_lineage", "arguments": { "dataset_id": "my_ds" } }
+```
+New:
+```json
+{
+  "name": "lineage",
+  "arguments": {
+    "operation": "get",
+    "dataset_id": "my_ds"
+  }
+}
+```
+## Notes for Agent Builders
+- Prefer the new unified tools for all new integrations.
+- Do not merge `vesper_normalize_schema` and `vesper_convert_format` unless your client can present explicit operation-specific schemas.
+- If you have old prompts/tool maps, migrate now before production rollout.

package/build/index.js CHANGED Viewed

@@ -339,6 +339,36 @@ function ensureLocalPipelineSource(sourcePath, datasetId, targetDir) {
     }
     return stagedPath;
 }
+function cleanupIntermediateArtifacts(artifactPaths, finalOutputPath) { // Best-effort delete of intermediate files and their `.lineage.json` sidecars; the final output and its sidecar are never removed. Returns nothing.
+    const finalResolved = path.resolve(finalOutputPath); // absolute form, so the equality checks below are not fooled by relative spellings
+    const finalLineage = `${finalResolved}.lineage.json`; // lineage sidecar of the final output; protected as well
+    for (const candidate of artifactPaths) { // consumed with for...of, so any iterable of path strings is accepted
+        if (!candidate) // skip empty / null / undefined entries
+            continue;
+        const resolved = path.resolve(candidate);
+        if (resolved === finalResolved || resolved === finalLineage) // never delete the final output or its lineage sidecar
+            continue;
+        try {
+            if (fs.existsSync(resolved) && fs.statSync(resolved).isFile()) { // only paths that stat as files are unlinked; directories are left in place
+                fs.unlinkSync(resolved);
+            }
+        }
+        catch {
+            // Best-effort cleanup: a failed stat/unlink is ignored so the sidecar and the remaining candidates are still processed.
+        }
+        const sidecar = `${resolved}.lineage.json`; // lineage sidecar that would sit next to this candidate
+        if (sidecar === finalLineage) // NOTE(review): looks unreachable - this equality implies resolved === finalResolved, which was already skipped above
+            continue;
+        try {
+            if (fs.existsSync(sidecar) && fs.statSync(sidecar).isFile()) { // same file-only rule as for the candidate itself
+                fs.unlinkSync(sidecar);
+            }
+        }
+        catch {
+            // Best-effort cleanup: sidecar removal failures are ignored too.
+        }
+    }
+}
 function resolveDatasetLocalPath(datasetIdOrPath, preferredDirs = []) {
     if (fs.existsSync(datasetIdOrPath)) {
         return ensureExportableLocalPath(datasetIdOrPath);
@@ -1463,55 +1493,6 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
                     required: ["query"],
                 },
             },
-            {
-                name: "vesper.fuse",
-                description: "Phase 2 Data Fusion: fuse results from multiple web-native sources into one unified, deduplicated corpus (provenance via source_chain).",
-                inputSchema: {
-                    type: "object",
-                    properties: {
-                        sources: {
-                            type: "array",
-                            description: "Web sources to collect from, each with its own query.",
-                            items: {
-                                type: "object",
-                                properties: {
-                                    type: { type: "string", enum: ["arxiv", "github", "semantic_scholar", "hackernews", "s3"] },
-                                    query: { type: "string", description: "Query for this source." },
-                                    max_results: { type: "number", description: "Max results for this source (optional)." },
-                                    min_stars: { type: "number", description: "Optional popularity filter (GitHub) based on stars/proxy fields." },
-                                    bucket: { type: "string", description: "S3 bucket (for type='s3')." },
-                                    path: { type: "string", description: "S3 prefix/path (for type='s3')." },
-                                    region: { type: "string", description: "AWS region (for type='s3')." },
-                                    credentials: {
-                                        type: "object",
-                                        description: "Pass-through AWS credentials (optional; not persisted).",
-                                        properties: {
-                                            accessKeyId: { type: "string" },
-                                            secretAccessKey: { type: "string" },
-                                            sessionToken: { type: "string" },
-                                            roleArn: { type: "string" },
-                                        }
-                                    },
-                                },
-                                required: ["type", "query"],
-                            },
-                        },
-                        merge_strategy: {
-                            type: "string",
-                            enum: ["union", "dedup"],
-                            description: "How to merge collected documents.",
-                        },
-                        deduplication: {
-                            type: "string",
-                            enum: ["semantic", "exact", "none"],
-                            description: "How to deduplicate across sources.",
-                        },
-                        agent_id: { type: "string", description: "Strongly recommended: caller agent identity for lineage/audit." },
-                        pipeline_id: { type: "string", description: "Strongly recommended: workflow/pipeline identifier for lineage/audit." },
-                    },
-                    required: ["sources"],
-                },
-            },
             {
                 name: "vesper.extract_web",
                 description: "Phase 3 Structured Web extraction. Whitelist-only domains, deterministic extraction (tables/lists/infobox), schema validation, and cache fallback on live extraction failure.",
@@ -1606,18 +1587,6 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
                     required: ["dataset_id", "source"],
                 },
             },
-            {
-                name: "configure_kaggle",
-                description: "Optionally store Kaggle API credentials for Kaggle discover/download. Core Vesper works without this.",
-                inputSchema: {
-                    type: "object",
-                    properties: {
-                        username: { type: "string", description: "Kaggle username" },
-                        key: { type: "string", description: "Kaggle API key" }
-                    },
-                    required: ["username", "key"],
-                },
-            },
             {
                 name: "configure_keys",
                 description: "One-time optional key setup for external sources (Kaggle, data.world, gated HF). Core tools do not require keys.",
@@ -1646,17 +1615,29 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
                 },
             },
             {
-                name: "analyze_quality",
-                description: "Perform a deep quality check on a dataset. Returns a detailed report including duplicates, outliers, and schema issues.",
+                name: "quality_analyze",
+                description: "Unified quality tool. operation='dataset' (tabular quality), 'image', 'media', or 'report' (multimodal report).",
                 inputSchema: {
                     type: "object",
                     properties: {
+                        operation: {
+                            type: "string",
+                            enum: ["dataset", "image", "media", "report"],
+                            description: "Quality analysis mode. Defaults to 'dataset'.",
+                        },
                         dataset_id: {
                             type: "string",
-                            description: "The dataset ID to analyze.",
+                            description: "Dataset ID for operation='dataset' or operation='report'.",
+                        },
+                        dataset_path: {
+                            type: "string",
+                            description: "Absolute dataset directory path for operation='report'.",
+                        },
+                        path: {
+                            type: "string",
+                            description: "Absolute file/folder path for operation='image' or operation='media'.",
                         },
                     },
-                    required: ["dataset_id"],
                 },
             },
             {
@@ -1808,39 +1789,30 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
                 },
             },
             {
-                name: "get_lineage",
-                description: "Get version history and full lineage/provenance for a dataset (sources, steps, inputs/outputs, trigger metadata).",
+                name: "lineage",
+                description: "Unified lineage tool. operation='get' returns lineage/provenance history, operation='diff' compares two versions.",
                 inputSchema: {
                     type: "object",
                     properties: {
-                        dataset_id: {
+                        operation: {
                             type: "string",
-                            description: "Dataset ID (base or versioned, e.g. my_dataset or my_dataset_v2).",
+                            enum: ["get", "diff"],
+                            description: "Lineage operation. Defaults to 'get'.",
                         },
-                    },
-                    required: ["dataset_id"],
-                },
-            },
-            {
-                name: "diff_lineage_versions",
-                description: "Diff two lineage versions for one dataset and return structured changes (schema, rows, steps, actor identity).",
-                inputSchema: {
-                    type: "object",
-                    properties: {
                         dataset_id: {
                             type: "string",
                             description: "Dataset ID (base or versioned).",
                         },
                         from_version: {
                             type: "number",
-                            description: "Source lineage version number (e.g., 1).",
+                            description: "Source lineage version number (required for operation='diff').",
                         },
                         to_version: {
                             type: "number",
-                            description: "Target lineage version number (e.g., 2).",
+                            description: "Target lineage version number (required for operation='diff').",
                         },
                     },
-                    required: ["dataset_id", "from_version", "to_version"],
+                    required: ["dataset_id"],
                 },
             },
             {
@@ -1915,109 +1887,55 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
                 },
             },
             {
-                name: "fuse_datasets",
-                description: "Fuse/combine multiple datasets via concat or join. Optionally runs quality & leakage checks afterward.",
+                name: "fuse",
+                description: "Unified fusion tool. operation='tabular' for row/column dataset fusion, operation='web' for web-native multi-source fusion.",
                 inputSchema: {
                     type: "object",
                     properties: {
-                        sources: {
-                            type: "array",
-                            items: { type: "string" },
-                            description: "List of dataset IDs and/or local file paths to fuse.",
-                        },
-                        strategy: {
-                            type: "string",
-                            enum: ["concat", "join"],
-                            description: "Fusion strategy. concat appends rows; join merges on key(s).",
-                        },
-                        join_on: {
-                            oneOf: [
-                                { type: "string" },
-                                { type: "array", items: { type: "string" } }
-                            ],
-                            description: "Join key(s). Required when strategy='join'.",
-                        },
-                        how: {
-                            type: "string",
-                            enum: ["inner", "left", "outer"],
-                            description: "Join mode (only for strategy='join').",
-                        },
-                        dedup: {
-                            type: "boolean",
-                            description: "Drop exact duplicate rows after fusion.",
-                        },
-                        run_quality_after: {
-                            type: "boolean",
-                            description: "Run quality analysis on the fused output.",
-                        },
-                        leakage_check: {
-                            type: "boolean",
-                            description: "Run leakage/overlap checks across fused sources.",
-                        },
-                        output_format: {
-                            type: "string",
-                            enum: ["feather", "parquet", "csv", "jsonl", "arrow"],
-                            description: "Output format (default: parquet).",
-                        },
-                        compression: {
+                        operation: {
                             type: "string",
-                            enum: ["lz4", "zstd", "snappy", "gzip", "uncompressed"],
-                            description: "Compression algorithm for binary outputs.",
+                            enum: ["tabular", "web"],
+                            description: "Fusion operation mode. Defaults to 'tabular'.",
                         },
-                        preview: {
-                            type: "boolean",
-                            description: "Generate a small preview CSV of fused output.",
+                        sources: {
+                            type: "array",
+                            description: "For tabular: dataset IDs/paths. For web: source query objects.",
+                            items: {
+                                oneOf: [
+                                    { type: "string" },
+                                    {
+                                        type: "object",
+                                        properties: {
+                                            type: { type: "string", enum: ["arxiv", "github", "semantic_scholar", "hackernews", "s3"] },
+                                            query: { type: "string" },
+                                            max_results: { type: "number" },
+                                            min_stars: { type: "number" },
+                                            bucket: { type: "string" },
+                                            path: { type: "string" },
+                                            region: { type: "string" },
+                                        },
+                                        required: ["type", "query"],
+                                    },
+                                ],
+                            },
                         },
+                        strategy: { type: "string", enum: ["concat", "join"] },
+                        join_on: { oneOf: [{ type: "string" }, { type: "array", items: { type: "string" } }] },
+                        how: { type: "string", enum: ["inner", "left", "outer"] },
+                        dedup: { type: "boolean" },
+                        run_quality_after: { type: "boolean" },
+                        leakage_check: { type: "boolean" },
+                        output_format: { type: "string", enum: ["feather", "parquet", "csv", "jsonl", "arrow"] },
+                        compression: { type: "string", enum: ["lz4", "zstd", "snappy", "gzip", "uncompressed"] },
+                        preview: { type: "boolean" },
+                        merge_strategy: { type: "string", enum: ["union", "dedup"] },
+                        deduplication: { type: "string", enum: ["semantic", "exact", "none"] },
+                        agent_id: { type: "string", description: "Strongly recommended: caller agent identity for lineage/audit." },
+                        pipeline_id: { type: "string", description: "Strongly recommended: workflow/pipeline identifier for lineage/audit." },
                     },
                     required: ["sources"],
                 },
             },
-            {
-                name: "analyze_image_quality",
-                description: "Analyze image quality (resolution, blur, corruption) for a folder or single image.",
-                inputSchema: {
-                    type: "object",
-                    properties: {
-                        path: {
-                            type: "string",
-                            description: "Absolute path to the image file or folder.",
-                        },
-                    },
-                    required: ["path"],
-                },
-            },
-            {
-                name: "analyze_media_quality",
-                description: "Analyze audio/video quality (sample rate, duration, FPS, corruption) for a folder or single file.",
-                inputSchema: {
-                    type: "object",
-                    properties: {
-                        path: {
-                            type: "string",
-                            description: "Absolute path to the audio/video file or folder.",
-                        },
-                    },
-                    required: ["path"],
-                },
-            },
-            {
-                name: "generate_quality_report",
-                description: "Generate a comprehensive unified quality report for a multimodal dataset (text, image, audio, video).",
-                inputSchema: {
-                    type: "object",
-                    properties: {
-                        dataset_id: {
-                            type: "string",
-                            description: "Dataset identifier.",
-                        },
-                        dataset_path: {
-                            type: "string",
-                            description: "Absolute path to the dataset directory.",
-                        },
-                    },
-                    required: ["dataset_id", "dataset_path"],
-                },
-            },
         ],
     };
 });
@@ -2084,6 +2002,101 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
             markStepComplete(String(datasetId), String(step));
         }
         switch (request.params.name) {
+            case "lineage":
+            case "get_lineage":
+            case "diff_lineage_versions": {
+                const operation = request.params.name === "get_lineage"
+                    ? "get"
+                    : request.params.name === "diff_lineage_versions"
+                        ? "diff"
+                        : String(request.params.arguments?.operation || "get").toLowerCase();
+                const datasetId = String(request.params.arguments?.dataset_id || "").trim();
+                if (!datasetId) {
+                    throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
+                }
+                if (operation === "get") {
+                    const base = toBaseDatasetId(datasetId);
+                    const record = readLineageRecord(base);
+                    if (!record.versions || record.versions.length === 0) {
+                        return {
+                            content: [{ type: "text", text: `No lineage found for '${datasetId}' yet.` }]
+                        };
+                    }
+                    return {
+                        content: [{ type: "text", text: JSON.stringify(record, null, 2) }]
+                    };
+                }
+                if (operation !== "diff") {
+                    throw new McpError(ErrorCode.InvalidParams, "operation must be 'get' or 'diff'");
+                }
+                const fromVersion = Number(request.params.arguments?.from_version);
+                const toVersion = Number(request.params.arguments?.to_version);
+                if (!Number.isInteger(fromVersion) || fromVersion <= 0) {
+                    throw new McpError(ErrorCode.InvalidParams, "from_version must be a positive integer");
+                }
+                if (!Number.isInteger(toVersion) || toVersion <= 0) {
+                    throw new McpError(ErrorCode.InvalidParams, "to_version must be a positive integer");
+                }
+                const base = toBaseDatasetId(datasetId);
+                const record = readLineageRecord(base);
+                const fromV = record.versions.find((v) => v.version === fromVersion);
+                const toV = record.versions.find((v) => v.version === toVersion);
+                if (!fromV || !toV) {
+                    return {
+                        content: [{ type: "text", text: `ERROR: Could not find both versions in lineage for '${datasetId}'.` }],
+                        isError: true,
+                    };
+                }
+                const fromSchema = (toV.output?.schema_before && toVersion > fromVersion)
+                    ? fromV.output?.schema_after || fromV.output?.schema_before || {}
+                    : fromV.output?.schema_after || fromV.output?.schema_before || {};
+                const toSchema = toV.output?.schema_after || toV.output?.schema_before || {};
+                const fromCols = Array.isArray(fromSchema.columns) ? fromSchema.columns.map((c) => String(c)) : [];
+                const toCols = Array.isArray(toSchema.columns) ? toSchema.columns.map((c) => String(c)) : [];
+                const fromDtypes = (fromSchema.dtypes && typeof fromSchema.dtypes === "object") ? fromSchema.dtypes : {};
+                const toDtypes = (toSchema.dtypes && typeof toSchema.dtypes === "object") ? toSchema.dtypes : {};
+                const schemaDiff = diffSchemaMaps(fromCols, toCols, fromDtypes, toDtypes);
+                const fromRows = typeof fromSchema.rows === "number" ? fromSchema.rows : (typeof fromV.output?.rows === "number" ? fromV.output.rows : undefined);
+                const toRows = typeof toSchema.rows === "number" ? toSchema.rows : (typeof toV.output?.rows === "number" ? toV.output.rows : undefined);
+                const fromSteps = new Set((fromV.steps || []).map((s) => String(s.step)));
+                const toSteps = new Set((toV.steps || []).map((s) => String(s.step)));
+                return {
+                    content: [{
+                            type: "text",
+                            text: JSON.stringify({
+                                dataset_id_base: base,
+                                from_version: fromVersion,
+                                to_version: toVersion,
+                                schema_diff: schemaDiff,
+                                row_count_delta: {
+                                    from: fromRows,
+                                    to: toRows,
+                                    delta: (typeof fromRows === "number" && typeof toRows === "number") ? (toRows - fromRows) : undefined,
+                                },
+                                steps_diff: {
+                                    added: Array.from(toSteps).filter((s) => !fromSteps.has(s)),
+                                    removed: Array.from(fromSteps).filter((s) => !toSteps.has(s)),
+                                    from_steps: Array.from(fromSteps),
+                                    to_steps: Array.from(toSteps),
+                                },
+                                actor_diff: {
+                                    changed: String(fromV.triggered_by?.agent_id || "") !== String(toV.triggered_by?.agent_id || "") ||
+                                        String(fromV.triggered_by?.pipeline_id || "") !== String(toV.triggered_by?.pipeline_id || ""),
+                                    from: {
+                                        tool: fromV.triggered_by?.tool,
+                                        agent_id: fromV.triggered_by?.agent_id,
+                                        pipeline_id: fromV.triggered_by?.pipeline_id,
+                                    },
+                                    to: {
+                                        tool: toV.triggered_by?.tool,
+                                        agent_id: toV.triggered_by?.agent_id,
+                                        pipeline_id: toV.triggered_by?.pipeline_id,
+                                    },
+                                },
+                            }, null, 2),
+                        }],
+                };
+            }
             case "vesper_web_find": {
                 hydrateExternalKeys();
                 const query = String(request.params.arguments?.query || "").trim();
@@ -2568,20 +2581,6 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
                     };
                 }
             }
-            case "configure_kaggle": {
-                const username = String(request.params.arguments?.username || "").trim();
-                const key = String(request.params.arguments?.key || "").trim();
-                if (!username || !key) {
-                    throw new McpError(ErrorCode.InvalidParams, "username and key are required");
-                }
-                const r1 = secureKeys.set("kaggle_username", username);
-                const r2 = secureKeys.set("kaggle_key", key);
-                process.env.KAGGLE_USERNAME = username;
-                process.env.KAGGLE_KEY = key;
-                return {
-                    content: [{ type: "text", text: `Kaggle credentials saved securely (${r1.method || "store"}/${r2.method || "store"}). You can now use Kaggle datasets.` }]
-                };
-            }
             case "configure_keys": {
                 const hfToken = String(request.params.arguments?.hf_token || "").trim();
                 const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
@@ -2687,8 +2686,56 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
                 const formattedOutput = formatDatasetInfo(dataset);
                 return { content: [{ type: "text", text: formattedOutput }] };
             }
-            case "analyze_quality": {
-                const datasetId = String(request.params.arguments?.dataset_id);
+            case "quality_analyze":
+            case "analyze_quality":
+            case "analyze_image_quality":
+            case "analyze_media_quality":
+            case "generate_quality_report": {
+                const resolvedOperation = request.params.name === "analyze_image_quality"
+                    ? "image"
+                    : request.params.name === "analyze_media_quality"
+                        ? "media"
+                        : request.params.name === "generate_quality_report"
+                            ? "report"
+                            : String(request.params.arguments?.operation || "dataset").toLowerCase();
+                if (resolvedOperation === "image") {
+                    const inputPath = String(request.params.arguments?.path || "").trim();
+                    if (!inputPath || !fs.existsSync(inputPath)) {
+                        throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
+                    }
+                    const report = await imageAnalyzer.analyze(inputPath);
+                    return { content: [{ type: "text", text: JSON.stringify(report, null, 2) }] };
+                }
+                if (resolvedOperation === "media") {
+                    const inputPath = String(request.params.arguments?.path || "").trim();
+                    if (!inputPath || !fs.existsSync(inputPath)) {
+                        throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
+                    }
+                    const report = await mediaAnalyzer.analyze(inputPath);
+                    return { content: [{ type: "text", text: JSON.stringify(report, null, 2) }] };
+                }
+                if (resolvedOperation === "report") {
+                    const datasetId = String(request.params.arguments?.dataset_id || "").trim();
+                    const datasetPath = String(request.params.arguments?.dataset_path || "").trim();
+                    if (!datasetId) {
+                        throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='report'");
+                    }
+                    if (!datasetPath || !fs.existsSync(datasetPath)) {
+                        throw new McpError(ErrorCode.InvalidParams, `Dataset path not found: ${datasetPath}`);
+                    }
+                    const metadata = await metadataStore.getDataset(datasetId);
+                    const textQuality = null;
+                    const report = await qualityOrchestrator.generateReport(datasetId, datasetPath, textQuality);
+                    if (metadata) {
+                        metadata.unified_quality_report = report;
+                        await metadataStore.saveDataset(metadata);
+                    }
+                    return { content: [{ type: "text", text: JSON.stringify(report, null, 2) }] };
+                }
+                const datasetId = String(request.params.arguments?.dataset_id || "").trim();
+                if (!datasetId) {
+                    throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='dataset'");
+                }
                 const safeId = toSafeDatasetPathFragment(datasetId);
                 const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
                 const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
@@ -2898,6 +2945,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
             case "export_dataset": {
                 const datasetId = String(request.params.arguments?.dataset_id);
                 const isDirectLocalInput = isDirectLocalDatasetReference(datasetId);
+                const intermediateArtifacts = new Set();
                 const requestedTargetDir = request.params.arguments?.target_dir
                     ? String(request.params.arguments?.target_dir).trim()
                     : request.params.arguments?.output_dir
@@ -2967,9 +3015,16 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
                     else if (currentExt !== pipelineFmt) {
                         console.error(`[Export] Running quality/cleaning pipeline (use fast=true to skip)...`);
                         try {
+                            const beforeStagingPath = sourcePath;
                             sourcePath = ensureLocalPipelineSource(sourcePath, datasetId, targetDir);
+                            if (path.resolve(beforeStagingPath) !== path.resolve(sourcePath)) {
+                                intermediateArtifacts.add(sourcePath);
+                            }
                             const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
                             if (pipelineResult.final_output_path) {
+                                if (path.resolve(pipelineResult.final_output_path) !== path.resolve(sourcePath)) {
+                                    intermediateArtifacts.add(pipelineResult.final_output_path);
+                                }
                                 sourcePath = pipelineResult.final_output_path;
                                 try {
                                     // Update registry to point to pipeline's final output
@@ -3058,6 +3113,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
                         msg += `  Python: \`pd.read_parquet('${result.output_path}').head()\`\n`;
                         msg += `  DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
                     }
+                    cleanupIntermediateArtifacts(intermediateArtifacts, result.output_path);
                     return { content: [{ type: "text", text: msg }] };
                 }
                 catch (error) {
@@ -3084,100 +3140,6 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
                     content: [{ type: "text", text: `**Vesper Registry** (${entries.length} dataset${entries.length !== 1 ? "s" : ""}):\n\n${lines.join("\n\n")}` }]
                 };
             }
-            case "get_lineage": {
-                const datasetId = String(request.params.arguments?.dataset_id || "").trim();
-                if (!datasetId) {
-                    throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
-                }
-                const base = toBaseDatasetId(datasetId);
-                const record = readLineageRecord(base);
-                if (!record.versions || record.versions.length === 0) {
-                    return {
-                        content: [{ type: "text", text: `No lineage found for '${datasetId}' yet.` }]
-                    };
-                }
-                return {
-                    content: [{ type: "text", text: JSON.stringify(record, null, 2) }]
-                };
-            }
-            case "diff_lineage_versions": {
-                const datasetId = String(request.params.arguments?.dataset_id || "").trim();
-                const fromVersion = Number(request.params.arguments?.from_version);
-                const toVersion = Number(request.params.arguments?.to_version);
-                if (!datasetId) {
-                    throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
-                }
-                if (!Number.isInteger(fromVersion) || fromVersion <= 0) {
-                    throw new McpError(ErrorCode.InvalidParams, "from_version must be a positive integer");
-                }
-                if (!Number.isInteger(toVersion) || toVersion <= 0) {
-                    throw new McpError(ErrorCode.InvalidParams, "to_version must be a positive integer");
-                }
-                const base = toBaseDatasetId(datasetId);
-                const record = readLineageRecord(base);
-                const fromV = record.versions.find((v) => v.version === fromVersion);
-                const toV = record.versions.find((v) => v.version === toVersion);
-                if (!fromV || !toV) {
-                    return {
-                        content: [{ type: "text", text: `ERROR: Could not find both versions in lineage for '${datasetId}'.` }],
-                        isError: true,
-                    };
-                }
-                const fromSchema = (toV.output?.schema_before && toVersion > fromVersion)
-                    ? fromV.output?.schema_after || fromV.output?.schema_before || {}
-                    : fromV.output?.schema_after || fromV.output?.schema_before || {};
-                const toSchema = toV.output?.schema_after || toV.output?.schema_before || {};
-                const fromCols = Array.isArray(fromSchema.columns) ? fromSchema.columns.map((c) => String(c)) : [];
-                const toCols = Array.isArray(toSchema.columns) ? toSchema.columns.map((c) => String(c)) : [];
-                const fromDtypes = (fromSchema.dtypes && typeof fromSchema.dtypes === "object") ? fromSchema.dtypes : {};
-                const toDtypes = (toSchema.dtypes && typeof toSchema.dtypes === "object") ? toSchema.dtypes : {};
-                const schemaDiff = diffSchemaMaps(fromCols, toCols, fromDtypes, toDtypes);
-                const fromRows = typeof fromSchema.rows === "number"
-                    ? fromSchema.rows
-                    : (typeof fromV.output?.rows === "number" ? fromV.output.rows : undefined);
-                const toRows = typeof toSchema.rows === "number"
-                    ? toSchema.rows
-                    : (typeof toV.output?.rows === "number" ? toV.output.rows : undefined);
-                const fromSteps = new Set((fromV.steps || []).map((s) => String(s.step)));
-                const toSteps = new Set((toV.steps || []).map((s) => String(s.step)));
-                const addedSteps = Array.from(toSteps).filter((s) => !fromSteps.has(s));
-                const removedSteps = Array.from(fromSteps).filter((s) => !toSteps.has(s));
-                const actorDiff = {
-                    changed: String(fromV.triggered_by?.agent_id || "") !== String(toV.triggered_by?.agent_id || "") ||
-                        String(fromV.triggered_by?.pipeline_id || "") !== String(toV.triggered_by?.pipeline_id || ""),
-                    from: {
-                        tool: fromV.triggered_by?.tool,
-                        agent_id: fromV.triggered_by?.agent_id,
-                        pipeline_id: fromV.triggered_by?.pipeline_id,
-                    },
-                    to: {
-                        tool: toV.triggered_by?.tool,
-                        agent_id: toV.triggered_by?.agent_id,
-                        pipeline_id: toV.triggered_by?.pipeline_id,
-                    },
-                };
-                const diffResult = {
-                    dataset_id_base: base,
-                    from_version: fromVersion,
-                    to_version: toVersion,
-                    schema_diff: schemaDiff,
-                    row_count_delta: {
-                        from: fromRows,
-                        to: toRows,
-                        delta: (typeof fromRows === "number" && typeof toRows === "number") ? (toRows - fromRows) : undefined,
-                    },
-                    steps_diff: {
-                        added: addedSteps,
-                        removed: removedSteps,
-                        from_steps: Array.from(fromSteps),
-                        to_steps: Array.from(toSteps),
-                    },
-                    actor_diff: actorDiff,
-                };
-                return {
-                    content: [{ type: "text", text: JSON.stringify(diffResult, null, 2) }],
-                };
-            }
             case "vesper_convert_format": {
                 const filePath = String(request.params.arguments?.file_path || "").trim();
                 const targetFormat = String(request.params.arguments?.target_format || "").trim().toLowerCase();
@@ -3340,7 +3302,57 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
                     return { content: [{ type: "text", text: `ERROR: Schema normalization failed: ${error.message}` }], isError: true };
                 }
             }
+            case "fuse":
             case "fuse_datasets": {
+                const operation = request.params.name === "fuse_datasets"
+                    ? "tabular"
+                    : String(request.params.arguments?.operation || "tabular").toLowerCase();
+                if (operation === "web") {
+                    hydrateExternalKeys();
+                    const webSources = Array.isArray(request.params.arguments?.sources)
+                        ? request.params.arguments?.sources
+                        : undefined;
+                    if (!webSources || !Array.isArray(webSources)) {
+                        return {
+                            content: [{ type: "text", text: "ERROR: fuse(operation='web') requires 'sources' array." }],
+                            isError: true,
+                        };
+                    }
+                    const mergeStrategyRaw = request.params.arguments?.merge_strategy
+                        ? String(request.params.arguments?.merge_strategy).toLowerCase()
+                        : undefined;
+                    const dedupRaw = request.params.arguments?.deduplication
+                        ? String(request.params.arguments?.deduplication).toLowerCase()
+                        : undefined;
+                    const merge_strategy = mergeStrategyRaw && ["union", "dedup"].includes(mergeStrategyRaw)
+                        ? mergeStrategyRaw
+                        : undefined;
+                    const deduplication = dedupRaw && ["semantic", "exact", "none"].includes(dedupRaw)
+                        ? dedupRaw
+                        : undefined;
+                    const webResult = await webFusionEngine.fuse({
+                        sources: webSources.map((s) => ({
+                            type: String(s?.type || "").trim().toLowerCase(),
+                            query: String(s?.query || "").trim(),
+                            max_results: s?.max_results !== undefined ? Number(s.max_results) : undefined,
+                            min_stars: s?.min_stars !== undefined ? Number(s.min_stars) : undefined,
+                            bucket: s?.bucket !== undefined ? String(s.bucket) : undefined,
+                            path: s?.path !== undefined ? String(s.path) : undefined,
+                            region: s?.region !== undefined ? String(s.region) : undefined,
+                            credentials: s?.credentials ? {
+                                accessKeyId: s.credentials.accessKeyId !== undefined ? String(s.credentials.accessKeyId) : undefined,
+                                secretAccessKey: s.credentials.secretAccessKey !== undefined ? String(s.credentials.secretAccessKey) : undefined,
+                                sessionToken: s.credentials.sessionToken !== undefined ? String(s.credentials.sessionToken) : undefined,
+                                roleArn: s.credentials.roleArn !== undefined ? String(s.credentials.roleArn) : undefined,
+                            } : undefined,
+                        })),
+                        merge_strategy,
+                        deduplication,
+                    });
+                    return {
+                        content: [{ type: "text", text: JSON.stringify(webResult, null, 2) }],
+                    };
+                }
                 const rawSources = request.params.arguments?.sources;
                 if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
                     throw new McpError(ErrorCode.InvalidParams, "sources must contain at least 2 dataset IDs/paths");
@@ -3454,142 +3466,6 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
                     };
                 }
             }
-            case "analyze_image_quality": {
-                const inputPath = String(request.params.arguments?.path);
-                if (!fs.existsSync(inputPath)) {
-                    throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
-                }
-                try {
-                    const report = await imageAnalyzer.analyze(inputPath);
-                    let output = `## Image Quality Report\n\n`;
-                    output += `- **Total Images**: ${report.total_images}\n`;
-                    output += `- **Corrupted**: ${report.corrupted_count}\n`;
-                    output += `- **Average Resolution**: ${Math.round(report.average_width)}x${Math.round(report.average_height)}\n`;
-                    output += `- **Blurry Images**: ${report.blurry_count}\n\n`;
-                    if (report.individual_results.length > 0) {
-                        output += `### Sample Detail (Top 5)\n`;
-                        report.individual_results.slice(0, 5).forEach(img => {
-                            const statusLabel = img.status === "ok" ? (img.is_blurry ? "[warn]" : "[ok]") : "[error]";
-                            output += `${statusLabel} **${img.filename}**: ${img.width}x${img.height} | Mode: ${img.mode} | Blur: ${img.blur_score?.toFixed(1)}\n`;
-                        });
-                    }
-                    return {
-                        content: [{ type: "text", text: output }]
-                    };
-                }
-                catch (error) {
-                    return {
-                        content: [{ type: "text", text: `ERROR: Image analysis failed: ${error.message}` }],
-                        isError: true
-                    };
-                }
-            }
-            case "analyze_media_quality": {
-                const inputPath = String(request.params.arguments?.path);
-                if (!fs.existsSync(inputPath)) {
-                    throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
-                }
-                try {
-                    const report = await mediaAnalyzer.analyze(inputPath);
-                    let output = `## Media Quality Report\n\n`;
-                    output += `- **Total Files**: ${report.total_files}\n`;
-                    output += `- **OK Files**: ${report.ok_files}\n`;
-                    output += `- **Failed Files**: ${report.failed_files}\n`;
-                    if ('avg_audio_duration' in report && report.avg_audio_duration) {
-                        output += `- **Average Audio Duration**: ${report.avg_audio_duration.toFixed(2)}s\n`;
-                    }
-                    if ('avg_video_duration' in report && report.avg_video_duration) {
-                        output += `- **Average Video Duration**: ${report.avg_video_duration.toFixed(2)}s\n`;
-                        output += `- **Average FPS**: ${report.avg_fps?.toFixed(1)}\n`;
-                    }
-                    output += `\n### Sample Detail (Top 5)\n`;
-                    report.details.slice(0, 5).forEach(item => {
-                        const statusLabel = item.status === "ok" ? "[ok]" : "[error]";
-                        if (item.type === "audio" && 'sample_rate' in item) {
-                            output += `${statusLabel} **${item.filename}**: ${item.duration?.toFixed(1)}s | SR: ${item.sample_rate}Hz | Silent: ${item.is_silent}\n`;
-                        }
-                        else if (item.type === "video" && 'width' in item) {
-                            output += `${statusLabel} **${item.filename}**: ${item.width}x${item.height} | ${item.fps?.toFixed(1)}fps | Risk: ${item.corruption_risk}\n`;
-                        }
-                        else {
-                            output += `${statusLabel} **${item.filename}**: ${item.error}\n`;
-                        }
-                    });
-                    return {
-                        content: [{ type: "text", text: output }]
-                    };
-                }
-                catch (error) {
-                    return {
-                        content: [{ type: "text", text: `ERROR: Media analysis failed: ${error.message}` }],
-                        isError: true
-                    };
-                }
-            }
-            case "generate_quality_report": {
-                const datasetId = String(request.params.arguments?.dataset_id);
-                const datasetPath = String(request.params.arguments?.dataset_path);
-                if (!fs.existsSync(datasetPath)) {
-                    throw new McpError(ErrorCode.InvalidParams, `Dataset path not found: ${datasetPath}`);
-                }
-                try {
-                    // Optionally load text quality from metadata if available
-                    const metadata = await metadataStore.getDataset(datasetId);
-                    // TODO: Integrate text quality analysis when available
-                    const textQuality = null;
-                    const report = await qualityOrchestrator.generateReport(datasetId, datasetPath, textQuality);
-                    // Save report to metadata
-                    if (metadata) {
-                        metadata.unified_quality_report = report;
-                        await metadataStore.saveDataset(metadata);
-                    }
-                    let output = `# Unified Quality Report\n\n`;
-                    output += `**Dataset**: ${datasetId}\n`;
-                    output += `**Modalities**: ${report.modalities.join(", ")}\n`;
-                    output += `**Overall Quality Score**: ${report.overall_quality_score}/100\n\n`;
-                    if (report.text_quality) {
-                        output += `## Text Quality\n`;
-                        output += `- Rows: ${report.text_quality.row_count}\n`;
-                        output += `- Columns: ${report.text_quality.column_count}\n`;
-                        output += `- Missing: ${report.text_quality.missing_percentage.toFixed(1)}%\n`;
-                        output += `- Duplicates: ${report.text_quality.duplicate_percentage.toFixed(1)}%\n\n`;
-                    }
-                    if (report.image_quality) {
-                        output += `## Image Quality\n`;
-                        output += `- Total Images: ${report.image_quality.total_images}\n`;
-                        output += `- Corrupted: ${report.image_quality.corrupted_count}\n`;
-                        output += `- Avg Resolution: ${report.image_quality.avg_resolution}\n`;
-                        output += `- Blurry: ${report.image_quality.blurry_percentage.toFixed(1)}%\n\n`;
-                    }
-                    if (report.audio_quality) {
-                        output += `## Audio Quality\n`;
-                        output += `- Total Files: ${report.audio_quality.total_files}\n`;
-                        output += `- Avg Duration: ${report.audio_quality.avg_duration.toFixed(1)}s\n`;
-                        output += `- Avg Sample Rate: ${report.audio_quality.avg_sample_rate.toFixed(0)}Hz\n`;
-                        output += `- Silent: ${report.audio_quality.silent_percentage.toFixed(1)}%\n\n`;
-                    }
-                    if (report.video_quality) {
-                        output += `## Video Quality\n`;
-                        output += `- Total Files: ${report.video_quality.total_files}\n`;
-                        output += `- Avg Duration: ${report.video_quality.avg_duration.toFixed(1)}s\n`;
-                        output += `- Avg FPS: ${report.video_quality.avg_fps.toFixed(1)}\n`;
-                        output += `- High Corruption Risk: ${report.video_quality.corruption_risk_high}\n\n`;
-                    }
-                    output += `## Recommendations\n`;
-                    report.recommendations.forEach(rec => {
-                        output += `- ${rec}\n`;
-                    });
-                    return {
-                        content: [{ type: "text", text: output }]
-                    };
-                }
-                catch (error) {
-                    return {
-                        content: [{ type: "text", text: `ERROR: Quality report generation failed: ${error.message}` }],
-                        isError: true
-                    };
-                }
-            }
             default:
                 throw new McpError(ErrorCode.MethodNotFound, "Tool not found");
         }
@@ -3933,6 +3809,7 @@ async function runExportCli(args) {
     const fastMode = args.includes("--fast");
     const preview = args.includes("--preview");
     const isDirectLocalInput = isDirectLocalDatasetReference(datasetId);
+    const intermediateArtifacts = new Set();
     const resolvedTargetDir = path.resolve(targetDir || process.cwd());
     let sourcePath = resolveDatasetLocalPath(datasetId, [resolvedTargetDir, process.cwd()]);
     if (!sourcePath) {
@@ -3954,9 +3831,16 @@ async function runExportCli(args) {
         const pipelineCompatibleInput = currentExt === "csv" || currentExt === "parquet";
         if (pipelineCompatibleInput && currentExt !== pipelineFmt) {
             try {
+                const beforeStagingPath = sourcePath;
                 sourcePath = ensureLocalPipelineSource(sourcePath, datasetId, resolvedTargetDir);
+                if (path.resolve(beforeStagingPath) !== path.resolve(sourcePath)) {
+                    intermediateArtifacts.add(sourcePath);
+                }
                 const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
                 if (pipelineResult.final_output_path) {
+                    if (path.resolve(pipelineResult.final_output_path) !== path.resolve(sourcePath)) {
+                        intermediateArtifacts.add(pipelineResult.final_output_path);
+                    }
                     sourcePath = pipelineResult.final_output_path;
                     if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
                         upsertRegistry(datasetId, sourcePath, "completed");
@@ -3987,6 +3871,7 @@ async function runExportCli(args) {
     console.error(`[Export] Resolved output directory: ${outDir}`);
     console.error(`[Export] Output file: ${outputFile}`);
     const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
+    cleanupIntermediateArtifacts(intermediateArtifacts, result.output_path);
     console.log(`Export complete: ${result.output_path}`);
     console.log(`Format: ${result.format}${result.compression ? ` (${result.compression})` : ""}`);
     if (result.rows !== undefined)

package/build/python/cleaner.py CHANGED Viewed

@@ -182,6 +182,8 @@ def main():
                 output_format = "parquet"
         base_name = file_path.rsplit(".", 1)[0]
+        if base_name.endswith("_cleaned"):
+            base_name = base_name[:-8]
         if output_format == "csv":
             output_path = f"{base_name}_cleaned.csv"
             # Stringify ANY column that might not be CSV-safe (List, Struct, Object, etc.)

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@vespermcp/mcp-server",
-  "version": "1.2.28",
+  "version": "1.2.30",
   "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
   "type": "module",
   "main": "build/index.js",

package/src/python/cleaner.py CHANGED Viewed

@@ -182,6 +182,8 @@ def main():
                 output_format = "parquet"
         base_name = file_path.rsplit(".", 1)[0]
+        if base_name.endswith("_cleaned"):
+            base_name = base_name[:-8]
         if output_format == "csv":
             output_path = f"{base_name}_cleaned.csv"
             # Stringify ANY column that might not be CSV-safe (List, Struct, Object, etc.)