@vespermcp/mcp-server 1.2.30 → 1.3.1

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -467,6 +467,8 @@ import readline from "readline";
467
467
  import http from "http";
468
468
  import https from "https";
469
469
  import os from "os";
470
+ import { enforcePlanGateForTool } from "./lib/plan-resolve.js";
471
+ import { recordMcpToolAnalyticsAfterCall } from "./lib/mcp-analytics.js";
470
472
  // Determine absolute paths relative to the compiled script
471
473
  const __filename = fileURLToPath(import.meta.url);
472
474
  const __dirname = path.dirname(__filename);
@@ -1942,6 +1944,13 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1942
1944
  // Call Tool — all requests are serialized through a queue to prevent crashes from parallel calls
1943
1945
  server.setRequestHandler(CallToolRequestSchema, async (request) => {
1944
1946
  return requestQueue.enqueue(async () => {
1947
+ const planGate = await enforcePlanGateForTool(String(request.params.name), request.params.arguments);
1948
+ if (!planGate.ok) {
1949
+ return {
1950
+ content: [{ type: "text", text: `ERROR: ${planGate.message}` }],
1951
+ isError: true,
1952
+ };
1953
+ }
1945
1954
  // --- Pipeline Enforcement ---
1946
1955
  // Map tool names to pipeline steps
1947
1956
  const toolToStep = {
@@ -2001,1474 +2010,1482 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
2001
2010
  // Mark this step as complete
2002
2011
  markStepComplete(String(datasetId), String(step));
2003
2012
  }
2004
- switch (request.params.name) {
2005
- case "lineage":
2006
- case "get_lineage":
2007
- case "diff_lineage_versions": {
2008
- const operation = request.params.name === "get_lineage"
2009
- ? "get"
2010
- : request.params.name === "diff_lineage_versions"
2011
- ? "diff"
2012
- : String(request.params.arguments?.operation || "get").toLowerCase();
2013
- const datasetId = String(request.params.arguments?.dataset_id || "").trim();
2014
- if (!datasetId) {
2015
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
2016
- }
2017
- if (operation === "get") {
2013
+ const toolResponse = await (async () => {
2014
+ switch (request.params.name) {
2015
+ case "lineage":
2016
+ case "get_lineage":
2017
+ case "diff_lineage_versions": {
2018
+ const operation = request.params.name === "get_lineage"
2019
+ ? "get"
2020
+ : request.params.name === "diff_lineage_versions"
2021
+ ? "diff"
2022
+ : String(request.params.arguments?.operation || "get").toLowerCase();
2023
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
2024
+ if (!datasetId) {
2025
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
2026
+ }
2027
+ if (operation === "get") {
2028
+ const base = toBaseDatasetId(datasetId);
2029
+ const record = readLineageRecord(base);
2030
+ if (!record.versions || record.versions.length === 0) {
2031
+ return {
2032
+ content: [{ type: "text", text: `No lineage found for '${datasetId}' yet.` }]
2033
+ };
2034
+ }
2035
+ return {
2036
+ content: [{ type: "text", text: JSON.stringify(record, null, 2) }]
2037
+ };
2038
+ }
2039
+ if (operation !== "diff") {
2040
+ throw new McpError(ErrorCode.InvalidParams, "operation must be 'get' or 'diff'");
2041
+ }
2042
+ const fromVersion = Number(request.params.arguments?.from_version);
2043
+ const toVersion = Number(request.params.arguments?.to_version);
2044
+ if (!Number.isInteger(fromVersion) || fromVersion <= 0) {
2045
+ throw new McpError(ErrorCode.InvalidParams, "from_version must be a positive integer");
2046
+ }
2047
+ if (!Number.isInteger(toVersion) || toVersion <= 0) {
2048
+ throw new McpError(ErrorCode.InvalidParams, "to_version must be a positive integer");
2049
+ }
2018
2050
  const base = toBaseDatasetId(datasetId);
2019
2051
  const record = readLineageRecord(base);
2020
- if (!record.versions || record.versions.length === 0) {
2052
+ const fromV = record.versions.find((v) => v.version === fromVersion);
2053
+ const toV = record.versions.find((v) => v.version === toVersion);
2054
+ if (!fromV || !toV) {
2021
2055
  return {
2022
- content: [{ type: "text", text: `No lineage found for '${datasetId}' yet.` }]
2056
+ content: [{ type: "text", text: `ERROR: Could not find both versions in lineage for '${datasetId}'.` }],
2057
+ isError: true,
2023
2058
  };
2024
2059
  }
2060
+ const fromSchema = (toV.output?.schema_before && toVersion > fromVersion)
2061
+ ? fromV.output?.schema_after || fromV.output?.schema_before || {}
2062
+ : fromV.output?.schema_after || fromV.output?.schema_before || {};
2063
+ const toSchema = toV.output?.schema_after || toV.output?.schema_before || {};
2064
+ const fromCols = Array.isArray(fromSchema.columns) ? fromSchema.columns.map((c) => String(c)) : [];
2065
+ const toCols = Array.isArray(toSchema.columns) ? toSchema.columns.map((c) => String(c)) : [];
2066
+ const fromDtypes = (fromSchema.dtypes && typeof fromSchema.dtypes === "object") ? fromSchema.dtypes : {};
2067
+ const toDtypes = (toSchema.dtypes && typeof toSchema.dtypes === "object") ? toSchema.dtypes : {};
2068
+ const schemaDiff = diffSchemaMaps(fromCols, toCols, fromDtypes, toDtypes);
2069
+ const fromRows = typeof fromSchema.rows === "number" ? fromSchema.rows : (typeof fromV.output?.rows === "number" ? fromV.output.rows : undefined);
2070
+ const toRows = typeof toSchema.rows === "number" ? toSchema.rows : (typeof toV.output?.rows === "number" ? toV.output.rows : undefined);
2071
+ const fromSteps = new Set((fromV.steps || []).map((s) => String(s.step)));
2072
+ const toSteps = new Set((toV.steps || []).map((s) => String(s.step)));
2025
2073
  return {
2026
- content: [{ type: "text", text: JSON.stringify(record, null, 2) }]
2027
- };
2028
- }
2029
- if (operation !== "diff") {
2030
- throw new McpError(ErrorCode.InvalidParams, "operation must be 'get' or 'diff'");
2031
- }
2032
- const fromVersion = Number(request.params.arguments?.from_version);
2033
- const toVersion = Number(request.params.arguments?.to_version);
2034
- if (!Number.isInteger(fromVersion) || fromVersion <= 0) {
2035
- throw new McpError(ErrorCode.InvalidParams, "from_version must be a positive integer");
2036
- }
2037
- if (!Number.isInteger(toVersion) || toVersion <= 0) {
2038
- throw new McpError(ErrorCode.InvalidParams, "to_version must be a positive integer");
2039
- }
2040
- const base = toBaseDatasetId(datasetId);
2041
- const record = readLineageRecord(base);
2042
- const fromV = record.versions.find((v) => v.version === fromVersion);
2043
- const toV = record.versions.find((v) => v.version === toVersion);
2044
- if (!fromV || !toV) {
2045
- return {
2046
- content: [{ type: "text", text: `ERROR: Could not find both versions in lineage for '${datasetId}'.` }],
2047
- isError: true,
2048
- };
2049
- }
2050
- const fromSchema = (toV.output?.schema_before && toVersion > fromVersion)
2051
- ? fromV.output?.schema_after || fromV.output?.schema_before || {}
2052
- : fromV.output?.schema_after || fromV.output?.schema_before || {};
2053
- const toSchema = toV.output?.schema_after || toV.output?.schema_before || {};
2054
- const fromCols = Array.isArray(fromSchema.columns) ? fromSchema.columns.map((c) => String(c)) : [];
2055
- const toCols = Array.isArray(toSchema.columns) ? toSchema.columns.map((c) => String(c)) : [];
2056
- const fromDtypes = (fromSchema.dtypes && typeof fromSchema.dtypes === "object") ? fromSchema.dtypes : {};
2057
- const toDtypes = (toSchema.dtypes && typeof toSchema.dtypes === "object") ? toSchema.dtypes : {};
2058
- const schemaDiff = diffSchemaMaps(fromCols, toCols, fromDtypes, toDtypes);
2059
- const fromRows = typeof fromSchema.rows === "number" ? fromSchema.rows : (typeof fromV.output?.rows === "number" ? fromV.output.rows : undefined);
2060
- const toRows = typeof toSchema.rows === "number" ? toSchema.rows : (typeof toV.output?.rows === "number" ? toV.output.rows : undefined);
2061
- const fromSteps = new Set((fromV.steps || []).map((s) => String(s.step)));
2062
- const toSteps = new Set((toV.steps || []).map((s) => String(s.step)));
2063
- return {
2064
- content: [{
2065
- type: "text",
2066
- text: JSON.stringify({
2067
- dataset_id_base: base,
2068
- from_version: fromVersion,
2069
- to_version: toVersion,
2070
- schema_diff: schemaDiff,
2071
- row_count_delta: {
2072
- from: fromRows,
2073
- to: toRows,
2074
- delta: (typeof fromRows === "number" && typeof toRows === "number") ? (toRows - fromRows) : undefined,
2075
- },
2076
- steps_diff: {
2077
- added: Array.from(toSteps).filter((s) => !fromSteps.has(s)),
2078
- removed: Array.from(fromSteps).filter((s) => !toSteps.has(s)),
2079
- from_steps: Array.from(fromSteps),
2080
- to_steps: Array.from(toSteps),
2081
- },
2082
- actor_diff: {
2083
- changed: String(fromV.triggered_by?.agent_id || "") !== String(toV.triggered_by?.agent_id || "") ||
2084
- String(fromV.triggered_by?.pipeline_id || "") !== String(toV.triggered_by?.pipeline_id || ""),
2085
- from: {
2086
- tool: fromV.triggered_by?.tool,
2087
- agent_id: fromV.triggered_by?.agent_id,
2088
- pipeline_id: fromV.triggered_by?.pipeline_id,
2074
+ content: [{
2075
+ type: "text",
2076
+ text: JSON.stringify({
2077
+ dataset_id_base: base,
2078
+ from_version: fromVersion,
2079
+ to_version: toVersion,
2080
+ schema_diff: schemaDiff,
2081
+ row_count_delta: {
2082
+ from: fromRows,
2083
+ to: toRows,
2084
+ delta: (typeof fromRows === "number" && typeof toRows === "number") ? (toRows - fromRows) : undefined,
2089
2085
  },
2090
- to: {
2091
- tool: toV.triggered_by?.tool,
2092
- agent_id: toV.triggered_by?.agent_id,
2093
- pipeline_id: toV.triggered_by?.pipeline_id,
2086
+ steps_diff: {
2087
+ added: Array.from(toSteps).filter((s) => !fromSteps.has(s)),
2088
+ removed: Array.from(fromSteps).filter((s) => !toSteps.has(s)),
2089
+ from_steps: Array.from(fromSteps),
2090
+ to_steps: Array.from(toSteps),
2094
2091
  },
2095
- },
2096
- }, null, 2),
2097
- }],
2098
- };
2099
- }
2100
- case "vesper_web_find": {
2101
- hydrateExternalKeys();
2102
- const query = String(request.params.arguments?.query || "").trim();
2103
- const limit = Number(request.params.arguments?.limit || 10);
2104
- const sources = Array.isArray(request.params.arguments?.sources)
2105
- ? (request.params.arguments?.sources).map(s => String(s).trim().toLowerCase()).filter(Boolean)
2106
- : undefined;
2107
- try {
2108
- const result = await webCoreEngine.find({
2109
- query,
2110
- sources: sources,
2111
- limit,
2112
- arxiv_full_text: request.params.arguments?.arxiv_full_text === true,
2113
- github_include_readme: request.params.arguments?.github_include_readme === true,
2114
- });
2115
- try {
2116
- appendLineageVersion({
2117
- datasetIdBase: `webfind_${query || "query"}`,
2118
- tool: "vesper_web_find",
2119
- requestArgs: request.params.arguments,
2120
- output: {
2121
- rows: Array.isArray(result.results) ? result.results.length : undefined,
2122
- },
2123
- sources: Array.isArray(result.results)
2124
- ? result.results.slice(0, 200).map((r) => ({
2125
- source: String(r?.source_type || "unknown"),
2126
- url: typeof r?.source_url === "string" ? r.source_url : undefined,
2127
- at: typeof r?.collected_at === "string" ? r.collected_at : undefined,
2128
- }))
2129
- : [],
2130
- steps: [
2131
- { step: "web_find_discover", at: new Date().toISOString(), params: { query, sources, limit } },
2132
- { step: "web_find_complete", at: new Date().toISOString(), metrics: { result_count: Array.isArray(result.results) ? result.results.length : 0 } },
2133
- ],
2134
- });
2135
- }
2136
- catch (e) {
2137
- console.error(`[Lineage] vesper_web_find append failed: ${e?.message || e}`);
2138
- }
2139
- return {
2140
- content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
2141
- };
2142
- }
2143
- catch (error) {
2144
- return {
2145
- content: [{ type: "text", text: `ERROR: web_find failed: ${error.message}` }],
2146
- isError: true,
2147
- };
2148
- }
2149
- }
2150
- case "vesper.fuse": {
2151
- hydrateExternalKeys();
2152
- const sources = Array.isArray(request.params.arguments?.sources)
2153
- ? request.params.arguments?.sources
2154
- : undefined;
2155
- if (!sources || !Array.isArray(sources)) {
2156
- return {
2157
- content: [{ type: "text", text: "ERROR: vesper.fuse requires 'sources' array." }],
2158
- isError: true,
2092
+ actor_diff: {
2093
+ changed: String(fromV.triggered_by?.agent_id || "") !== String(toV.triggered_by?.agent_id || "") ||
2094
+ String(fromV.triggered_by?.pipeline_id || "") !== String(toV.triggered_by?.pipeline_id || ""),
2095
+ from: {
2096
+ tool: fromV.triggered_by?.tool,
2097
+ agent_id: fromV.triggered_by?.agent_id,
2098
+ pipeline_id: fromV.triggered_by?.pipeline_id,
2099
+ },
2100
+ to: {
2101
+ tool: toV.triggered_by?.tool,
2102
+ agent_id: toV.triggered_by?.agent_id,
2103
+ pipeline_id: toV.triggered_by?.pipeline_id,
2104
+ },
2105
+ },
2106
+ }, null, 2),
2107
+ }],
2159
2108
  };
2160
2109
  }
2161
- try {
2162
- const mergeStrategyRaw = request.params.arguments?.merge_strategy
2163
- ? String(request.params.arguments?.merge_strategy).toLowerCase()
2164
- : undefined;
2165
- const dedupRaw = request.params.arguments?.deduplication
2166
- ? String(request.params.arguments?.deduplication).toLowerCase()
2167
- : undefined;
2168
- const merge_strategy = mergeStrategyRaw && ["union", "dedup"].includes(mergeStrategyRaw)
2169
- ? mergeStrategyRaw
2170
- : undefined;
2171
- const deduplication = dedupRaw && ["semantic", "exact", "none"].includes(dedupRaw)
2172
- ? dedupRaw
2110
+ case "vesper_web_find": {
2111
+ hydrateExternalKeys();
2112
+ const query = String(request.params.arguments?.query || "").trim();
2113
+ const limit = Number(request.params.arguments?.limit || 10);
2114
+ const sources = Array.isArray(request.params.arguments?.sources)
2115
+ ? (request.params.arguments?.sources).map(s => String(s).trim().toLowerCase()).filter(Boolean)
2173
2116
  : undefined;
2174
- const result = await webFusionEngine.fuse({
2175
- sources: sources.map((s) => ({
2176
- type: String(s?.type || "").trim().toLowerCase(),
2177
- query: String(s?.query || "").trim(),
2178
- max_results: s?.max_results !== undefined ? Number(s.max_results) : undefined,
2179
- min_stars: s?.min_stars !== undefined ? Number(s.min_stars) : undefined,
2180
- bucket: s?.bucket !== undefined ? String(s.bucket) : undefined,
2181
- path: s?.path !== undefined ? String(s.path) : undefined,
2182
- region: s?.region !== undefined ? String(s.region) : undefined,
2183
- credentials: s?.credentials ? {
2184
- accessKeyId: s.credentials.accessKeyId !== undefined ? String(s.credentials.accessKeyId) : undefined,
2185
- secretAccessKey: s.credentials.secretAccessKey !== undefined ? String(s.credentials.secretAccessKey) : undefined,
2186
- sessionToken: s.credentials.sessionToken !== undefined ? String(s.credentials.sessionToken) : undefined,
2187
- roleArn: s.credentials.roleArn !== undefined ? String(s.credentials.roleArn) : undefined,
2188
- } : undefined,
2189
- })),
2190
- merge_strategy,
2191
- deduplication,
2192
- });
2193
- return {
2194
- content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
2195
- };
2196
- }
2197
- catch (error) {
2198
- return {
2199
- content: [{ type: "text", text: `ERROR: vesper.fuse failed: ${error.message}` }],
2200
- isError: true,
2201
- };
2202
- }
2203
- }
2204
- case "vesper.extract_web": {
2205
- hydrateExternalKeys();
2206
- const url = String(request.params.arguments?.url || "").trim();
2207
- const mode = request.params.arguments?.mode
2208
- ? String(request.params.arguments?.mode).trim().toLowerCase()
2209
- : "auto";
2210
- const schema = request.params.arguments?.schema && typeof request.params.arguments.schema === "object"
2211
- ? request.params.arguments.schema
2212
- : undefined;
2213
- if (!url) {
2214
- return {
2215
- content: [{ type: "text", text: "ERROR: vesper.extract_web requires 'url'." }],
2216
- isError: true,
2217
- };
2218
- }
2219
- try {
2220
- const out = await webExtractorEngine.extract({
2221
- url,
2222
- mode: mode,
2223
- strict_schema: request.params.arguments?.strict_schema !== false,
2224
- schema: schema,
2225
- });
2226
- return {
2227
- content: [{ type: "text", text: JSON.stringify(out, null, 2) }],
2228
- };
2229
- }
2230
- catch (error) {
2231
- return {
2232
- content: [{ type: "text", text: `ERROR: vesper.extract_web failed: ${error.message}` }],
2233
- isError: true,
2234
- };
2235
- }
2236
- }
2237
- case "unified_dataset_api": {
2238
- hydrateExternalKeys();
2239
- const operation = String(request.params.arguments?.operation || "").trim().toLowerCase();
2240
- const source = String(request.params.arguments?.source || "auto").trim().toLowerCase();
2241
- const includeUnavailable = request.params.arguments?.include_unavailable === true;
2242
- const publicOnly = request.params.arguments?.public_only !== false;
2243
- try {
2244
- if (operation === "providers") {
2245
- return {
2246
- content: [{ type: "text", text: JSON.stringify({ providers: unifiedDatasetGateway.getProviderStatuses(includeUnavailable) }, null, 2) }],
2247
- };
2248
- }
2249
- if (operation === "discover") {
2250
- const query = String(request.params.arguments?.query || "").trim();
2251
- if (!query) {
2252
- throw new McpError(ErrorCode.InvalidParams, "query is required for operation='discover'");
2253
- }
2254
- const result = await unifiedDatasetGateway.discover({
2117
+ try {
2118
+ const result = await webCoreEngine.find({
2255
2119
  query,
2256
- source,
2257
- limit: Number(request.params.arguments?.limit || 10),
2258
- publicOnly,
2120
+ sources: sources,
2121
+ limit,
2122
+ arxiv_full_text: request.params.arguments?.arxiv_full_text === true,
2123
+ github_include_readme: request.params.arguments?.github_include_readme === true,
2259
2124
  });
2260
2125
  try {
2261
2126
  appendLineageVersion({
2262
- datasetIdBase: `discover_${source}_${query || "query"}`,
2263
- tool: "unified_dataset_api.discover",
2127
+ datasetIdBase: `webfind_${query || "query"}`,
2128
+ tool: "vesper_web_find",
2264
2129
  requestArgs: request.params.arguments,
2265
- output: { rows: Array.isArray(result.results) ? result.results.length : undefined },
2130
+ output: {
2131
+ rows: Array.isArray(result.results) ? result.results.length : undefined,
2132
+ },
2266
2133
  sources: Array.isArray(result.results)
2267
2134
  ? result.results.slice(0, 200).map((r) => ({
2268
- source: String(r?.source || source || "unknown"),
2269
- url: typeof r?.download_url === "string"
2270
- ? r.download_url
2271
- : (typeof r?.metadata_url === "string" ? r.metadata_url : undefined),
2272
- at: new Date().toISOString(),
2135
+ source: String(r?.source_type || "unknown"),
2136
+ url: typeof r?.source_url === "string" ? r.source_url : undefined,
2137
+ at: typeof r?.collected_at === "string" ? r.collected_at : undefined,
2273
2138
  }))
2274
2139
  : [],
2275
2140
  steps: [
2276
- { step: "discover_requested", at: new Date().toISOString(), params: { query, source, limit: Number(request.params.arguments?.limit || 10), publicOnly } },
2277
- { step: "discover_completed", at: new Date().toISOString(), metrics: { result_count: Array.isArray(result.results) ? result.results.length : 0 } },
2141
+ { step: "web_find_discover", at: new Date().toISOString(), params: { query, sources, limit } },
2142
+ { step: "web_find_complete", at: new Date().toISOString(), metrics: { result_count: Array.isArray(result.results) ? result.results.length : 0 } },
2278
2143
  ],
2279
2144
  });
2280
2145
  }
2281
2146
  catch (e) {
2282
- console.error(`[Lineage] unified discover append failed: ${e?.message || e}`);
2147
+ console.error(`[Lineage] vesper_web_find append failed: ${e?.message || e}`);
2283
2148
  }
2284
2149
  return {
2285
2150
  content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
2286
2151
  };
2287
2152
  }
2288
- if (operation === "download") {
2289
- const datasetId = String(request.params.arguments?.dataset_id || "").trim();
2290
- if (!datasetId) {
2291
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='download'");
2292
- }
2293
- const requestedTargetDir = request.params.arguments?.target_dir
2294
- ? String(request.params.arguments.target_dir).trim()
2295
- : request.params.arguments?.output_dir
2296
- ? String(request.params.arguments.output_dir).trim()
2297
- : "";
2298
- const targetDir = requestedTargetDir || process.cwd();
2299
- try {
2300
- await ensurePythonModules([{ module: "datasets", packageName: "datasets" }]);
2301
- }
2302
- catch {
2303
- // best effort; non-HF providers do not require this
2304
- }
2305
- const result = await unifiedDatasetGateway.download({
2306
- datasetId,
2307
- source,
2308
- targetDir,
2309
- });
2310
- try {
2311
- upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
2312
- }
2313
- catch (e) {
2314
- console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
2315
- }
2316
- try {
2317
- const schemaAfter = await getSchemaSnapshot(result.copied_to || result.local_path);
2318
- const lineage = appendLineageVersion({
2319
- datasetIdBase: result.dataset_id,
2320
- tool: "unified_dataset_api.download",
2321
- requestArgs: request.params.arguments,
2322
- outputPath: result.copied_to || result.local_path,
2323
- output: {
2324
- local_path: result.copied_to || result.local_path,
2325
- format: path.extname(result.copied_to || result.local_path).replace(".", ""),
2326
- schema_after: schemaAfter,
2327
- },
2328
- sources: [{
2329
- source: source,
2330
- url: typeof result.dataset_id === "string" ? result.dataset_id : undefined,
2331
- at: new Date().toISOString(),
2332
- }],
2333
- steps: [
2334
- { step: "download_requested", at: new Date().toISOString(), params: { datasetId, source, targetDir } },
2335
- { step: "download_completed", at: new Date().toISOString(), metrics: { local_path: result.copied_to || result.local_path } },
2336
- ],
2337
- });
2338
- try {
2339
- upsertRegistry(lineage.datasetVersionId, result.copied_to || result.local_path, "completed");
2340
- }
2341
- catch { }
2342
- }
2343
- catch (e) {
2344
- console.error(`[Lineage] unified download append failed: ${e?.message || e}`);
2345
- }
2153
+ catch (error) {
2346
2154
  return {
2347
- content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
2155
+ content: [{ type: "text", text: `ERROR: web_find failed: ${error.message}` }],
2156
+ isError: true,
2348
2157
  };
2349
2158
  }
2350
- if (operation === "info") {
2351
- const datasetId = String(request.params.arguments?.dataset_id || "").trim();
2352
- if (!datasetId) {
2353
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='info'");
2354
- }
2355
- const result = await unifiedDatasetGateway.info({
2356
- datasetId,
2357
- source,
2358
- publicOnly,
2159
+ }
2160
+ case "vesper.fuse": {
2161
+ hydrateExternalKeys();
2162
+ const sources = Array.isArray(request.params.arguments?.sources)
2163
+ ? request.params.arguments?.sources
2164
+ : undefined;
2165
+ if (!sources || !Array.isArray(sources)) {
2166
+ return {
2167
+ content: [{ type: "text", text: "ERROR: vesper.fuse requires 'sources' array." }],
2168
+ isError: true,
2169
+ };
2170
+ }
2171
+ try {
2172
+ const mergeStrategyRaw = request.params.arguments?.merge_strategy
2173
+ ? String(request.params.arguments?.merge_strategy).toLowerCase()
2174
+ : undefined;
2175
+ const dedupRaw = request.params.arguments?.deduplication
2176
+ ? String(request.params.arguments?.deduplication).toLowerCase()
2177
+ : undefined;
2178
+ const merge_strategy = mergeStrategyRaw && ["union", "dedup"].includes(mergeStrategyRaw)
2179
+ ? mergeStrategyRaw
2180
+ : undefined;
2181
+ const deduplication = dedupRaw && ["semantic", "exact", "none"].includes(dedupRaw)
2182
+ ? dedupRaw
2183
+ : undefined;
2184
+ const result = await webFusionEngine.fuse({
2185
+ sources: sources.map((s) => ({
2186
+ type: String(s?.type || "").trim().toLowerCase(),
2187
+ query: String(s?.query || "").trim(),
2188
+ max_results: s?.max_results !== undefined ? Number(s.max_results) : undefined,
2189
+ min_stars: s?.min_stars !== undefined ? Number(s.min_stars) : undefined,
2190
+ bucket: s?.bucket !== undefined ? String(s.bucket) : undefined,
2191
+ path: s?.path !== undefined ? String(s.path) : undefined,
2192
+ region: s?.region !== undefined ? String(s.region) : undefined,
2193
+ credentials: s?.credentials ? {
2194
+ accessKeyId: s.credentials.accessKeyId !== undefined ? String(s.credentials.accessKeyId) : undefined,
2195
+ secretAccessKey: s.credentials.secretAccessKey !== undefined ? String(s.credentials.secretAccessKey) : undefined,
2196
+ sessionToken: s.credentials.sessionToken !== undefined ? String(s.credentials.sessionToken) : undefined,
2197
+ roleArn: s.credentials.roleArn !== undefined ? String(s.credentials.roleArn) : undefined,
2198
+ } : undefined,
2199
+ })),
2200
+ merge_strategy,
2201
+ deduplication,
2359
2202
  });
2360
2203
  return {
2361
2204
  content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
2362
2205
  };
2363
2206
  }
2364
- throw new McpError(ErrorCode.InvalidParams, `Unsupported operation: ${operation}`);
2365
- }
2366
- catch (error) {
2367
- return {
2368
- content: [{ type: "text", text: `ERROR: unified gateway failed: ${error.message}` }],
2369
- isError: true,
2370
- };
2371
- }
2372
- }
2373
- case "vesper_search": {
2374
- const query = String(request.params.arguments?.query);
2375
- const limit = 5;
2376
- const safeOnly = true; // Enable safe filter by default
2377
- const enableJIT = request.params.arguments?.enable_jit === true;
2378
- if (!query) {
2379
- throw new McpError(ErrorCode.InvalidParams, "Query is required");
2380
- }
2381
- const results = await searchEngine.search(query, { limit, safeOnly, enableJIT });
2382
- const formattedOutput = formatSearchResults(results);
2383
- return {
2384
- content: [
2385
- {
2386
- type: "text",
2387
- text: formattedOutput,
2388
- },
2389
- ],
2390
- };
2391
- }
2392
- case "discover_datasets": {
2393
- hydrateExternalKeys();
2394
- const query = String(request.params.arguments?.query || "").trim();
2395
- const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
2396
- const limit = Number(request.params.arguments?.limit || 10);
2397
- if (!query) {
2398
- throw new McpError(ErrorCode.InvalidParams, "query is required");
2399
- }
2400
- try {
2401
- const gatewayResult = await unifiedDatasetGateway.discover({
2402
- query,
2403
- source,
2404
- limit,
2405
- publicOnly: false,
2406
- });
2407
- const results = gatewayResult.results;
2408
- const recipeScript = path.join(dataRoot, "python", "asset_downloader_engine.py");
2409
- for (const ds of results.slice(0, limit)) {
2410
- const info = {
2411
- dataset_id: ds.id,
2412
- id: ds.id,
2413
- source: ds.source,
2414
- repo_id: ds.id,
2415
- total_images: ds.total_examples || 0,
2416
- image_column: undefined,
2417
- recipes_dir: path.join(dataRoot, "recipes"),
2207
+ catch (error) {
2208
+ return {
2209
+ content: [{ type: "text", text: `ERROR: vesper.fuse failed: ${error.message}` }],
2210
+ isError: true,
2418
2211
  };
2419
- try {
2420
- await runPythonJson(recipeScript, ["build_recipe", JSON.stringify(info)]);
2421
- }
2422
- catch {
2423
- // best-effort recipe generation; ignore discovery-time recipe failures
2424
- }
2425
2212
  }
2426
- const formattedOutput = formatSearchResults(results.slice(0, limit));
2427
- const noteBlock = gatewayResult.notes.length > 0
2428
- ? `\nGateway notes:\n- ${gatewayResult.notes.join("\n- ")}`
2429
- : "";
2430
- return {
2431
- content: [{ type: "text", text: `${formattedOutput}${noteBlock}` }]
2432
- };
2433
- }
2434
- catch (error) {
2435
- return {
2436
- content: [{ type: "text", text: `ERROR: discover failed: ${error.message}` }],
2437
- isError: true,
2438
- };
2439
2213
  }
2440
- }
2441
- case "download_dataset": {
2442
- hydrateExternalKeys();
2443
- const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
2444
- const datasetId = String(request.params.arguments?.dataset_id || "").trim();
2445
- const requestedTargetDir = request.params.arguments?.target_dir
2446
- ? String(request.params.arguments.target_dir).trim()
2447
- : request.params.arguments?.output_dir
2448
- ? String(request.params.arguments.output_dir).trim()
2449
- : "";
2450
- const targetDir = requestedTargetDir || process.cwd();
2451
- if (!datasetId) {
2452
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
2453
- }
2454
- // Pre-install Python datasets library for HuggingFace fallback
2455
- if (source === "huggingface") {
2456
- try {
2457
- await ensurePythonModules([
2458
- { module: "datasets", packageName: "datasets" },
2459
- ]);
2460
- }
2461
- catch {
2462
- // Continue - direct download may still work
2214
+ case "vesper.extract_web": {
2215
+ hydrateExternalKeys();
2216
+ const url = String(request.params.arguments?.url || "").trim();
2217
+ const mode = request.params.arguments?.mode
2218
+ ? String(request.params.arguments?.mode).trim().toLowerCase()
2219
+ : "auto";
2220
+ const schema = request.params.arguments?.schema && typeof request.params.arguments.schema === "object"
2221
+ ? request.params.arguments.schema
2222
+ : undefined;
2223
+ if (!url) {
2224
+ return {
2225
+ content: [{ type: "text", text: "ERROR: vesper.extract_web requires 'url'." }],
2226
+ isError: true,
2227
+ };
2463
2228
  }
2464
- }
2465
- try {
2466
- const result = await unifiedDatasetGateway.download({
2467
- datasetId,
2468
- source,
2469
- targetDir,
2470
- });
2471
2229
  try {
2472
- upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
2230
+ const out = await webExtractorEngine.extract({
2231
+ url,
2232
+ mode: mode,
2233
+ strict_schema: request.params.arguments?.strict_schema !== false,
2234
+ schema: schema,
2235
+ });
2236
+ return {
2237
+ content: [{ type: "text", text: JSON.stringify(out, null, 2) }],
2238
+ };
2473
2239
  }
2474
- catch (e) {
2475
- console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
2240
+ catch (error) {
2241
+ return {
2242
+ content: [{ type: "text", text: `ERROR: vesper.extract_web failed: ${error.message}` }],
2243
+ isError: true,
2244
+ };
2476
2245
  }
2477
- const noteBlock = result.notes.length > 0 ? `\nNotes:\n- ${result.notes.join("\n- ")}` : "";
2478
- return {
2479
- content: [{ type: "text", text: `Download complete: ${result.copied_to || result.local_path}${noteBlock}` }]
2480
- };
2481
- }
2482
- catch (error) {
2483
- return {
2484
- content: [{ type: "text", text: `ERROR: download failed: ${error.message}` }],
2485
- isError: true,
2486
- };
2487
- }
2488
- }
2489
- case "vesper_download_assets": {
2490
- hydrateExternalKeys();
2491
- const datasetId = String(request.params.arguments?.dataset_id || "").trim();
2492
- const source = String(request.params.arguments?.source || "").trim().toLowerCase();
2493
- // Auto-infer repo_id from dataset_id if not provided (common for HuggingFace)
2494
- const repoId = request.params.arguments?.repo_id
2495
- ? String(request.params.arguments.repo_id)
2496
- : (source === "huggingface" && datasetId.includes("/") ? datasetId : undefined);
2497
- const kaggleRef = request.params.arguments?.kaggle_ref ? String(request.params.arguments.kaggle_ref) : undefined;
2498
- const urls = Array.isArray(request.params.arguments?.urls)
2499
- ? (request.params.arguments?.urls).map(v => String(v))
2500
- : undefined;
2501
- const outputFormat = String(request.params.arguments?.output_format || "webdataset");
2502
- const requestedOutputDir = request.params.arguments?.target_dir
2503
- ? String(request.params.arguments.target_dir).trim()
2504
- : request.params.arguments?.output_dir
2505
- ? String(request.params.arguments.output_dir).trim()
2506
- : undefined;
2507
- const maxItems = request.params.arguments?.max_items ? Number(request.params.arguments.max_items) : undefined;
2508
- const workers = request.params.arguments?.workers ? Number(request.params.arguments.workers) : 8;
2509
- const imageColumn = request.params.arguments?.image_column ? String(request.params.arguments.image_column) : undefined;
2510
- if (!datasetId || !source) {
2511
- throw new McpError(ErrorCode.InvalidParams, "dataset_id and source are required");
2512
- }
2513
- if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
2514
- return {
2515
- content: [{ type: "text", text: "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds)." }],
2516
- isError: true,
2517
- };
2518
- }
2519
- const requiredModules = [
2520
- { module: "aiohttp", packageName: "aiohttp" },
2521
- ];
2522
- if (source === "url") {
2523
- requiredModules.push({ module: "aiofiles", packageName: "aiofiles" });
2524
- }
2525
- if (source === "huggingface") {
2526
- requiredModules.push({ module: "datasets", packageName: "datasets" });
2527
- requiredModules.push({ module: "PIL", packageName: "Pillow" });
2528
- }
2529
- if (source === "kaggle") {
2530
- requiredModules.push({ module: "kaggle", packageName: "kaggle" });
2531
- }
2532
- try {
2533
- await ensurePythonModules(requiredModules);
2534
2246
  }
2535
- catch (error) {
2536
- return {
2537
- content: [{ type: "text", text: `ERROR: Python dependency setup failed: ${error.message}` }],
2538
- isError: true,
2539
- };
2540
- }
2541
- const scriptPath = path.join(dataRoot, "python", "asset_downloader_engine.py");
2542
- const payload = {
2543
- dataset_id: datasetId,
2544
- source,
2545
- repo_id: repoId,
2546
- kaggle_ref: kaggleRef,
2547
- urls,
2548
- output_format: outputFormat,
2549
- output_dir: requestedOutputDir,
2550
- max_items: maxItems,
2551
- workers,
2552
- image_column: imageColumn,
2553
- output_root: requestedOutputDir || process.cwd(),
2554
- recipes_dir: path.join(dataRoot, "recipes"),
2555
- };
2556
- try {
2557
- const result = await runPythonJson(scriptPath, ["download", JSON.stringify(payload)]);
2558
- if (!result?.ok) {
2559
- const errMsg = result?.error || "Unknown error";
2560
- // Enhance error messages for common failures
2561
- let hint = "";
2562
- if (errMsg.includes("No image column")) {
2563
- hint = "\n\nHint: Specify image_column parameter with the column name containing images/URLs.";
2247
+ case "unified_dataset_api": {
2248
+ hydrateExternalKeys();
2249
+ const operation = String(request.params.arguments?.operation || "").trim().toLowerCase();
2250
+ const source = String(request.params.arguments?.source || "auto").trim().toLowerCase();
2251
+ const includeUnavailable = request.params.arguments?.include_unavailable === true;
2252
+ const publicOnly = request.params.arguments?.public_only !== false;
2253
+ try {
2254
+ if (operation === "providers") {
2255
+ return {
2256
+ content: [{ type: "text", text: JSON.stringify({ providers: unifiedDatasetGateway.getProviderStatuses(includeUnavailable) }, null, 2) }],
2257
+ };
2258
+ }
2259
+ if (operation === "discover") {
2260
+ const query = String(request.params.arguments?.query || "").trim();
2261
+ if (!query) {
2262
+ throw new McpError(ErrorCode.InvalidParams, "query is required for operation='discover'");
2263
+ }
2264
+ const result = await unifiedDatasetGateway.discover({
2265
+ query,
2266
+ source,
2267
+ limit: Number(request.params.arguments?.limit || 10),
2268
+ publicOnly,
2269
+ });
2270
+ try {
2271
+ appendLineageVersion({
2272
+ datasetIdBase: `discover_${source}_${query || "query"}`,
2273
+ tool: "unified_dataset_api.discover",
2274
+ requestArgs: request.params.arguments,
2275
+ output: { rows: Array.isArray(result.results) ? result.results.length : undefined },
2276
+ sources: Array.isArray(result.results)
2277
+ ? result.results.slice(0, 200).map((r) => ({
2278
+ source: String(r?.source || source || "unknown"),
2279
+ url: typeof r?.download_url === "string"
2280
+ ? r.download_url
2281
+ : (typeof r?.metadata_url === "string" ? r.metadata_url : undefined),
2282
+ at: new Date().toISOString(),
2283
+ }))
2284
+ : [],
2285
+ steps: [
2286
+ { step: "discover_requested", at: new Date().toISOString(), params: { query, source, limit: Number(request.params.arguments?.limit || 10), publicOnly } },
2287
+ { step: "discover_completed", at: new Date().toISOString(), metrics: { result_count: Array.isArray(result.results) ? result.results.length : 0 } },
2288
+ ],
2289
+ });
2290
+ }
2291
+ catch (e) {
2292
+ console.error(`[Lineage] unified discover append failed: ${e?.message || e}`);
2293
+ }
2294
+ return {
2295
+ content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
2296
+ };
2297
+ }
2298
+ if (operation === "download") {
2299
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
2300
+ if (!datasetId) {
2301
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='download'");
2302
+ }
2303
+ const requestedTargetDir = request.params.arguments?.target_dir
2304
+ ? String(request.params.arguments.target_dir).trim()
2305
+ : request.params.arguments?.output_dir
2306
+ ? String(request.params.arguments.output_dir).trim()
2307
+ : "";
2308
+ const targetDir = requestedTargetDir || process.cwd();
2309
+ try {
2310
+ await ensurePythonModules([{ module: "datasets", packageName: "datasets" }]);
2311
+ }
2312
+ catch {
2313
+ // best effort; non-HF providers do not require this
2314
+ }
2315
+ const result = await unifiedDatasetGateway.download({
2316
+ datasetId,
2317
+ source,
2318
+ targetDir,
2319
+ });
2320
+ try {
2321
+ upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
2322
+ }
2323
+ catch (e) {
2324
+ console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
2325
+ }
2326
+ try {
2327
+ const schemaAfter = await getSchemaSnapshot(result.copied_to || result.local_path);
2328
+ const lineage = appendLineageVersion({
2329
+ datasetIdBase: result.dataset_id,
2330
+ tool: "unified_dataset_api.download",
2331
+ requestArgs: request.params.arguments,
2332
+ outputPath: result.copied_to || result.local_path,
2333
+ output: {
2334
+ local_path: result.copied_to || result.local_path,
2335
+ format: path.extname(result.copied_to || result.local_path).replace(".", ""),
2336
+ schema_after: schemaAfter,
2337
+ },
2338
+ sources: [{
2339
+ source: source,
2340
+ url: typeof result.dataset_id === "string" ? result.dataset_id : undefined,
2341
+ at: new Date().toISOString(),
2342
+ }],
2343
+ steps: [
2344
+ { step: "download_requested", at: new Date().toISOString(), params: { datasetId, source, targetDir } },
2345
+ { step: "download_completed", at: new Date().toISOString(), metrics: { local_path: result.copied_to || result.local_path } },
2346
+ ],
2347
+ });
2348
+ try {
2349
+ upsertRegistry(lineage.datasetVersionId, result.copied_to || result.local_path, "completed");
2350
+ }
2351
+ catch { }
2352
+ }
2353
+ catch (e) {
2354
+ console.error(`[Lineage] unified download append failed: ${e?.message || e}`);
2355
+ }
2356
+ return {
2357
+ content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
2358
+ };
2564
2359
  }
2565
- else if (errMsg.includes("Authentication") || errMsg.includes("401") || errMsg.includes("403")) {
2566
- hint = "\n\nHint: Use configure_keys tool to set HF_TOKEN for gated/private datasets.";
2360
+ if (operation === "info") {
2361
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
2362
+ if (!datasetId) {
2363
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='info'");
2364
+ }
2365
+ const result = await unifiedDatasetGateway.info({
2366
+ datasetId,
2367
+ source,
2368
+ publicOnly,
2369
+ });
2370
+ return {
2371
+ content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
2372
+ };
2567
2373
  }
2374
+ throw new McpError(ErrorCode.InvalidParams, `Unsupported operation: ${operation}`);
2375
+ }
2376
+ catch (error) {
2568
2377
  return {
2569
- content: [{ type: "text", text: `ERROR: asset download failed: ${errMsg}${hint}` }],
2378
+ content: [{ type: "text", text: `ERROR: unified gateway failed: ${error.message}` }],
2570
2379
  isError: true,
2571
2380
  };
2572
2381
  }
2573
- return {
2574
- content: [{ type: "text", text: JSON.stringify(result.result, null, 2) }],
2575
- };
2576
2382
  }
2577
- catch (error) {
2383
+ case "vesper_search": {
2384
+ const query = String(request.params.arguments?.query);
2385
+ const limit = 5;
2386
+ const safeOnly = true; // Enable safe filter by default
2387
+ const enableJIT = request.params.arguments?.enable_jit === true;
2388
+ if (!query) {
2389
+ throw new McpError(ErrorCode.InvalidParams, "Query is required");
2390
+ }
2391
+ const results = await searchEngine.search(query, { limit, safeOnly, enableJIT });
2392
+ const formattedOutput = formatSearchResults(results);
2578
2393
  return {
2579
- content: [{ type: "text", text: `ERROR: asset downloader execution failed: ${error.message}` }],
2580
- isError: true,
2394
+ content: [
2395
+ {
2396
+ type: "text",
2397
+ text: formattedOutput,
2398
+ },
2399
+ ],
2581
2400
  };
2582
2401
  }
2583
- }
2584
- case "configure_keys": {
2585
- const hfToken = String(request.params.arguments?.hf_token || "").trim();
2586
- const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
2587
- const kaggleKey = String(request.params.arguments?.kaggle_key || "").trim();
2588
- const dataworldToken = String(request.params.arguments?.dataworld_token || "").trim();
2589
- const saved = [];
2590
- const methods = [];
2591
- if (hfToken) {
2592
- const r = secureKeys.set("hf_token", hfToken);
2593
- if (r.ok) {
2594
- process.env.HF_TOKEN = hfToken;
2595
- saved.push("HF token");
2596
- if (r.method)
2597
- methods.push(r.method);
2598
- }
2599
- }
2600
- if (kaggleUsername) {
2601
- const r = secureKeys.set("kaggle_username", kaggleUsername);
2602
- if (r.ok) {
2603
- process.env.KAGGLE_USERNAME = kaggleUsername;
2604
- saved.push("Kaggle username");
2605
- if (r.method)
2606
- methods.push(r.method);
2402
+ case "discover_datasets": {
2403
+ hydrateExternalKeys();
2404
+ const query = String(request.params.arguments?.query || "").trim();
2405
+ const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
2406
+ const limit = Number(request.params.arguments?.limit || 10);
2407
+ if (!query) {
2408
+ throw new McpError(ErrorCode.InvalidParams, "query is required");
2607
2409
  }
2608
- }
2609
- if (kaggleKey) {
2610
- const r = secureKeys.set("kaggle_key", kaggleKey);
2611
- if (r.ok) {
2612
- process.env.KAGGLE_KEY = kaggleKey;
2613
- saved.push("Kaggle key");
2614
- if (r.method)
2615
- methods.push(r.method);
2410
+ try {
2411
+ const gatewayResult = await unifiedDatasetGateway.discover({
2412
+ query,
2413
+ source,
2414
+ limit,
2415
+ publicOnly: false,
2416
+ });
2417
+ const results = gatewayResult.results;
2418
+ const recipeScript = path.join(dataRoot, "python", "asset_downloader_engine.py");
2419
+ for (const ds of results.slice(0, limit)) {
2420
+ const info = {
2421
+ dataset_id: ds.id,
2422
+ id: ds.id,
2423
+ source: ds.source,
2424
+ repo_id: ds.id,
2425
+ total_images: ds.total_examples || 0,
2426
+ image_column: undefined,
2427
+ recipes_dir: path.join(dataRoot, "recipes"),
2428
+ };
2429
+ try {
2430
+ await runPythonJson(recipeScript, ["build_recipe", JSON.stringify(info)]);
2431
+ }
2432
+ catch {
2433
+ // best-effort recipe generation; ignore discovery-time recipe failures
2434
+ }
2435
+ }
2436
+ const formattedOutput = formatSearchResults(results.slice(0, limit));
2437
+ const noteBlock = gatewayResult.notes.length > 0
2438
+ ? `\nGateway notes:\n- ${gatewayResult.notes.join("\n- ")}`
2439
+ : "";
2440
+ return {
2441
+ content: [{ type: "text", text: `${formattedOutput}${noteBlock}` }]
2442
+ };
2616
2443
  }
2617
- }
2618
- if (dataworldToken) {
2619
- const r = secureKeys.set("dataworld_token", dataworldToken);
2620
- if (r.ok) {
2621
- process.env.DW_AUTH_TOKEN = dataworldToken;
2622
- saved.push("data.world token");
2623
- if (r.method)
2624
- methods.push(r.method);
2444
+ catch (error) {
2445
+ return {
2446
+ content: [{ type: "text", text: `ERROR: discover failed: ${error.message}` }],
2447
+ isError: true,
2448
+ };
2625
2449
  }
2626
2450
  }
2627
- if (saved.length === 0) {
2628
- return {
2629
- content: [{ type: "text", text: "No keys provided. Core Vesper tools continue to work without API keys." }]
2630
- };
2631
- }
2632
- return {
2633
- content: [{ type: "text", text: `Key saved securely. Updated: ${saved.join(", ")}.` }]
2634
- };
2635
- }
2636
- case "get_dataset_info": {
2637
- const datasetId = String(request.params.arguments?.dataset_id);
2638
- if (!datasetId) {
2639
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
2640
- }
2641
- const dataset = metadataStore.getDataset(datasetId);
2642
- if (!dataset) {
2643
- // Fallback: check the registry for local path info
2644
- const regEntry = getRegistryEntry(datasetId);
2645
- const regPath = regEntry?.local_path || regEntry?.path;
2646
- if (regEntry) {
2647
- const exists = regPath && fs.existsSync(regPath);
2648
- return {
2649
- content: [{ type: "text", text: `**${datasetId}** (from registry)\n- Local path: ${regPath || "unknown"}\n- Status: ${regEntry.status || "unknown"}${exists ? "" : " (file missing)"}\n\nNote: Full metadata not available in metadata store. Use prepare_dataset to get full details.` }],
2650
- };
2451
+ case "download_dataset": {
2452
+ hydrateExternalKeys();
2453
+ const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
2454
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
2455
+ const requestedTargetDir = request.params.arguments?.target_dir
2456
+ ? String(request.params.arguments.target_dir).trim()
2457
+ : request.params.arguments?.output_dir
2458
+ ? String(request.params.arguments.output_dir).trim()
2459
+ : "";
2460
+ const targetDir = requestedTargetDir || process.cwd();
2461
+ if (!datasetId) {
2462
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
2651
2463
  }
2652
- return {
2653
- content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}. Use vesper_list_datasets to see available datasets, or prepare_dataset to add new ones.` }],
2654
- isError: true,
2655
- };
2656
- }
2657
- // Enrich: if total_examples is 0-ish, try the HF datasets-server /size API
2658
- if ((!dataset.total_examples || dataset.total_examples === 0) && dataset.source === "huggingface") {
2659
- try {
2660
- const sizeResp = await fetch(`https://datasets-server.huggingface.co/size?dataset=${encodeURIComponent(dataset.id)}`);
2661
- if (sizeResp.ok) {
2662
- const sizeData = await sizeResp.json();
2663
- const numRows = sizeData?.size?.dataset?.num_rows;
2664
- if (numRows && numRows > 0) {
2665
- dataset.total_examples = numRows;
2666
- // Also backfill splits
2667
- if (sizeData?.size?.splits && Array.isArray(sizeData.size.splits)) {
2668
- dataset.splits = sizeData.size.splits.map((s) => ({
2669
- name: s.split,
2670
- num_examples: s.num_rows || 0,
2671
- size_bytes: s.num_bytes_parquet_files || 0,
2672
- }));
2673
- dataset.has_train_split = dataset.splits.some((s) => s.name === "train");
2674
- dataset.has_test_split = dataset.splits.some((s) => s.name === "test");
2675
- dataset.has_validation_split = dataset.splits.some((s) => s.name === "validation" || s.name === "val");
2676
- }
2677
- // Persist enriched metadata
2678
- metadataStore.saveDataset(dataset);
2679
- }
2464
+ // Pre-install Python datasets library for HuggingFace fallback
2465
+ if (source === "huggingface") {
2466
+ try {
2467
+ await ensurePythonModules([
2468
+ { module: "datasets", packageName: "datasets" },
2469
+ ]);
2470
+ }
2471
+ catch {
2472
+ // Continue - direct download may still work
2680
2473
  }
2681
2474
  }
2682
- catch {
2683
- // Enrichment is best-effort; continue with whatever we have
2684
- }
2685
- }
2686
- const formattedOutput = formatDatasetInfo(dataset);
2687
- return { content: [{ type: "text", text: formattedOutput }] };
2688
- }
2689
- case "quality_analyze":
2690
- case "analyze_quality":
2691
- case "analyze_image_quality":
2692
- case "analyze_media_quality":
2693
- case "generate_quality_report": {
2694
- const resolvedOperation = request.params.name === "analyze_image_quality"
2695
- ? "image"
2696
- : request.params.name === "analyze_media_quality"
2697
- ? "media"
2698
- : request.params.name === "generate_quality_report"
2699
- ? "report"
2700
- : String(request.params.arguments?.operation || "dataset").toLowerCase();
2701
- if (resolvedOperation === "image") {
2702
- const inputPath = String(request.params.arguments?.path || "").trim();
2703
- if (!inputPath || !fs.existsSync(inputPath)) {
2704
- throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
2475
+ try {
2476
+ const result = await unifiedDatasetGateway.download({
2477
+ datasetId,
2478
+ source,
2479
+ targetDir,
2480
+ });
2481
+ try {
2482
+ upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
2483
+ }
2484
+ catch (e) {
2485
+ console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
2486
+ }
2487
+ const noteBlock = result.notes.length > 0 ? `\nNotes:\n- ${result.notes.join("\n- ")}` : "";
2488
+ return {
2489
+ content: [{ type: "text", text: `Download complete: ${result.copied_to || result.local_path}${noteBlock}` }]
2490
+ };
2705
2491
  }
2706
- const report = await imageAnalyzer.analyze(inputPath);
2707
- return { content: [{ type: "text", text: JSON.stringify(report, null, 2) }] };
2708
- }
2709
- if (resolvedOperation === "media") {
2710
- const inputPath = String(request.params.arguments?.path || "").trim();
2711
- if (!inputPath || !fs.existsSync(inputPath)) {
2712
- throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
2492
+ catch (error) {
2493
+ return {
2494
+ content: [{ type: "text", text: `ERROR: download failed: ${error.message}` }],
2495
+ isError: true,
2496
+ };
2713
2497
  }
2714
- const report = await mediaAnalyzer.analyze(inputPath);
2715
- return { content: [{ type: "text", text: JSON.stringify(report, null, 2) }] };
2716
2498
  }
2717
- if (resolvedOperation === "report") {
2499
+ case "vesper_download_assets": {
2500
+ hydrateExternalKeys();
2718
2501
  const datasetId = String(request.params.arguments?.dataset_id || "").trim();
2719
- const datasetPath = String(request.params.arguments?.dataset_path || "").trim();
2720
- if (!datasetId) {
2721
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='report'");
2502
+ const source = String(request.params.arguments?.source || "").trim().toLowerCase();
2503
+ // Auto-infer repo_id from dataset_id if not provided (common for HuggingFace)
2504
+ const repoId = request.params.arguments?.repo_id
2505
+ ? String(request.params.arguments.repo_id)
2506
+ : (source === "huggingface" && datasetId.includes("/") ? datasetId : undefined);
2507
+ const kaggleRef = request.params.arguments?.kaggle_ref ? String(request.params.arguments.kaggle_ref) : undefined;
2508
+ const urls = Array.isArray(request.params.arguments?.urls)
2509
+ ? (request.params.arguments?.urls).map(v => String(v))
2510
+ : undefined;
2511
+ const outputFormat = String(request.params.arguments?.output_format || "webdataset");
2512
+ const requestedOutputDir = request.params.arguments?.target_dir
2513
+ ? String(request.params.arguments.target_dir).trim()
2514
+ : request.params.arguments?.output_dir
2515
+ ? String(request.params.arguments.output_dir).trim()
2516
+ : undefined;
2517
+ const maxItems = request.params.arguments?.max_items ? Number(request.params.arguments.max_items) : undefined;
2518
+ const workers = request.params.arguments?.workers ? Number(request.params.arguments.workers) : 8;
2519
+ const imageColumn = request.params.arguments?.image_column ? String(request.params.arguments.image_column) : undefined;
2520
+ if (!datasetId || !source) {
2521
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id and source are required");
2522
+ }
2523
+ if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
2524
+ return {
2525
+ content: [{ type: "text", text: "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds)." }],
2526
+ isError: true,
2527
+ };
2722
2528
  }
2723
- if (!datasetPath || !fs.existsSync(datasetPath)) {
2724
- throw new McpError(ErrorCode.InvalidParams, `Dataset path not found: ${datasetPath}`);
2529
+ const requiredModules = [
2530
+ { module: "aiohttp", packageName: "aiohttp" },
2531
+ ];
2532
+ if (source === "url") {
2533
+ requiredModules.push({ module: "aiofiles", packageName: "aiofiles" });
2725
2534
  }
2726
- const metadata = await metadataStore.getDataset(datasetId);
2727
- const textQuality = null;
2728
- const report = await qualityOrchestrator.generateReport(datasetId, datasetPath, textQuality);
2729
- if (metadata) {
2730
- metadata.unified_quality_report = report;
2731
- await metadataStore.saveDataset(metadata);
2535
+ if (source === "huggingface") {
2536
+ requiredModules.push({ module: "datasets", packageName: "datasets" });
2537
+ requiredModules.push({ module: "PIL", packageName: "Pillow" });
2732
2538
  }
2733
- return { content: [{ type: "text", text: JSON.stringify(report, null, 2) }] };
2734
- }
2735
- const datasetId = String(request.params.arguments?.dataset_id || "").trim();
2736
- if (!datasetId) {
2737
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='dataset'");
2738
- }
2739
- const safeId = toSafeDatasetPathFragment(datasetId);
2740
- const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
2741
- const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
2742
- let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
2743
- // Demo Fallback for easy testing
2744
- if (datasetId === "demo" || !fs.existsSync(filePath)) {
2745
- const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
2746
- const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
2747
- if (fs.existsSync(demoParquetPath)) {
2748
- filePath = demoParquetPath;
2539
+ if (source === "kaggle") {
2540
+ requiredModules.push({ module: "kaggle", packageName: "kaggle" });
2749
2541
  }
2750
- else if (fs.existsSync(demoCsvPath)) {
2751
- filePath = demoCsvPath;
2542
+ try {
2543
+ await ensurePythonModules(requiredModules);
2752
2544
  }
2753
- else if (datasetId !== "demo") {
2545
+ catch (error) {
2754
2546
  return {
2755
- content: [{ type: "text", text: `ERROR: Local data file for ${datasetId} not found. Try running prepare_dataset first, or use 'demo' as the dataset_id.` }],
2756
- isError: true
2547
+ content: [{ type: "text", text: `ERROR: Python dependency setup failed: ${error.message}` }],
2548
+ isError: true,
2757
2549
  };
2758
2550
  }
2759
- }
2760
- const report = await qualityAnalyzer.analyze(filePath);
2761
- return {
2762
- content: [{ type: "text", text: JSON.stringify(report, null, 2) }]
2763
- };
2764
- }
2765
- case "preview_cleaning": {
2766
- const datasetId = String(request.params.arguments?.dataset_id);
2767
- const safeId = toSafeDatasetPathFragment(datasetId);
2768
- const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
2769
- const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
2770
- let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
2771
- if (datasetId === "demo" || !fs.existsSync(filePath)) {
2772
- const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
2773
- const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
2774
- if (fs.existsSync(demoParquetPath)) {
2775
- filePath = demoParquetPath;
2776
- }
2777
- else if (fs.existsSync(demoCsvPath)) {
2778
- filePath = demoCsvPath;
2779
- }
2780
- else {
2781
- throw new McpError(ErrorCode.InvalidParams, `Local data file not found for ${datasetId}. Please run prepare_dataset first.`);
2782
- }
2783
- }
2784
- const report = await qualityAnalyzer.analyze(filePath);
2785
- // Phase 1: Target Detection
2786
- // We use the same TargetDetector instance inside CleaningPlanner now?
2787
- // Actually, we instantiated it inside CleaningPlanner, so we just need to pass the file path
2788
- // OR let the planner handle it if we update its signature to accept filePath.
2789
- // Let's check `CleaningPlanner.generatePlan` signature again.
2790
- // We updated it to accept `targetInfo`.
2791
- // So we need to run detection HERE and pass it.
2792
- // But `TargetDetector` is not exposed in `index.ts` scope yet.
2793
- // Let's create a global instance or use the one inside planner if exposed (it's private).
2794
- // Better approach: Instantiate TargetDetector here in index.ts for the tool content.
2795
- // Quick fix: Instantiate local detector or make global.
2796
- // I'll make a global `targetDetector` constant in index.ts
2797
- // But wait, I updated `CleaningPlanner` to instantiate its own detector.
2798
- // Does `CleaningPlanner` use it? No, I commented out the logic because it needed `filePath`.
2799
- // RETRY STRATEGY:
2800
- // 1. Instantiate `targetDetector` in `index.ts`.
2801
- // 2. Run `detectTarget(filePath)`.
2802
- // 3. Pass result to `cleaningPlanner.generatePlan(..., targetInfo)`.
2803
- // I need to add `const targetDetector = new TargetDetector(__dirname);` to imports/init section first.
2804
- // But since I'm in this tool, I can't look back.
2805
- // I will assume I can add it, or just do it inside the case for now.
2806
- // To do it properly, I should have added `targetDetector` to the global scope in previous step.
2807
- // Let's do that in a separate step if needed.
2808
- // For now, I'll instantiate it here.
2809
- const { TargetDetector } = await import("./preparation/target-detector.js");
2810
- const detector = new TargetDetector(__dirname);
2811
- const targetResult = await detector.detectTarget(filePath);
2812
- const targetInfo = targetResult.target_column ? {
2813
- target: targetResult.target_column,
2814
- confidence: targetResult.confidence
2815
- } : undefined;
2816
- const plan = await cleaningPlanner.generatePlan(datasetId, report, undefined, targetInfo);
2817
- let explanation = `### Cleaning Plan for ${datasetId}\n\n`;
2818
- if (targetInfo && targetInfo.target !== "target" && targetInfo.confidence > 0.7) {
2819
- explanation += `**Target Detected**: '${targetInfo.target}' (Confidence: ${targetInfo.confidence.toFixed(2)})\n`;
2820
- explanation += ` - **Action**: Renaming to 'target' for consistency.\n\n`;
2821
- }
2822
- explanation += `Estimated Quality Improvement: +${plan.estimated_impact.quality_score_improvement} points\n\n`;
2823
- if (plan.operations.length === 0) {
2824
- explanation += "No cleaning operations required.";
2825
- }
2826
- else {
2827
- plan.operations.forEach((op, i) => {
2828
- explanation += `${i + 1}. **${op.type}**: ${op.reason}\n`;
2829
- });
2830
- }
2831
- return {
2832
- content: [{ type: "text", text: explanation }]
2833
- };
2834
- }
2835
- case "custom_clean": {
2836
- const datasetId = String(request.params.arguments?.dataset_id);
2837
- const ops = request.params.arguments?.operations;
2838
- if (!datasetId || datasetId === "undefined") {
2839
- throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
2840
- }
2841
- if (!ops || !Array.isArray(ops) || ops.length === 0) {
2842
- throw new McpError(ErrorCode.InvalidParams, "operations array is required and must not be empty. Supported operations: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories");
2843
- }
2844
- // Pre-check: verify dataset file exists before starting the job
2845
- const cleanRegEntry = getRegistryEntry(datasetId);
2846
- const cleanRegPath = cleanRegEntry?.local_path || cleanRegEntry?.path;
2847
- const cleanDlStatus = metadataStore.getDownloadStatus(datasetId);
2848
- const cleanSafeId = toSafeDatasetPathFragment(datasetId);
2849
- const cleanDataExists = (cleanRegPath && fs.existsSync(cleanRegPath)) ||
2850
- (cleanDlStatus?.local_path && fs.existsSync(cleanDlStatus.local_path)) ||
2851
- fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.parquet`)) ||
2852
- fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.csv`)) ||
2853
- fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.feather`)) ||
2854
- fs.existsSync(datasetId);
2855
- if (!cleanDataExists) {
2856
- return {
2857
- content: [{ type: "text", text: `Dataset '${datasetId}' not found locally. Download it first using download_dataset or prepare_dataset, then run custom_clean.` }],
2858
- isError: true,
2859
- };
2860
- }
2861
- const job = jobManager.createJob("clean", 0, { datasetId, ops });
2862
- return {
2863
- content: [{ type: "text", text: `Cleaning job started. ID: ${job.id}. Use check_job_status to monitor progress.` }]
2864
- };
2865
- }
2866
- case "prepare_dataset": {
2867
- hydrateExternalKeys();
2868
- const query = String(request.params.arguments?.query);
2869
- const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
2870
- const downloadImages = request.params.arguments?.download_images === true;
2871
- const requestedOutputDir = request.params.arguments?.target_dir
2872
- ? String(request.params.arguments.target_dir).trim()
2873
- : request.params.arguments?.output_dir
2874
- ? String(request.params.arguments.output_dir).trim()
2875
- : "";
2876
- const outputDir = requestedOutputDir || process.cwd();
2877
- if (!query || query === "undefined") {
2878
- throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
2879
- }
2880
- const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages, outputDir });
2881
- return {
2882
- content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
2883
- };
2884
- }
2885
- case "compare_datasets": {
2886
- const datasetIds = request.params.arguments?.dataset_ids;
2887
- const datasets = datasetIds.map(id => metadataStore.getDataset(id)).filter(d => !!d);
2888
- let comparison = "| Metric | " + datasets.map(d => d.name).join(" | ") + " |\n";
2889
- comparison += "| :--- | " + datasets.map(() => " :---: ").join("|") + " |\n";
2890
- comparison += "| Quality Score | " + datasets.map(d => `${d.quality_score}/100`).join(" | ") + " |\n";
2891
- comparison += "| License | " + datasets.map(d => d.license.category).join(" | ") + " |\n";
2892
- comparison += "| Downloads | " + datasets.map(d => d.downloads).join(" | ") + " |\n";
2893
- comparison += "| Domain | " + datasets.map(d => d.domain).join(" | ") + " |\n";
2894
- return {
2895
- content: [{ type: "text", text: comparison }]
2896
- };
2897
- }
2898
- case "check_job_status": {
2899
- const jobId = String(request.params.arguments?.job_id);
2900
- const job = metadataStore.getJob(jobId);
2901
- if (!job) {
2902
- throw new McpError(ErrorCode.InvalidParams, `Job not found: ${jobId}`);
2903
- }
2904
- const activeStatuses = new Set(["pending", "queued", "running", "retrying"]);
2905
- const now = Date.now();
2906
- const last = jobStatusLastPoll[jobId] || 0;
2907
- const minPollMs = 3000;
2908
- if (activeStatuses.has(job.status) && (now - last) < minPollMs) {
2909
- const waitMs = minPollMs - (now - last);
2910
- return {
2911
- content: [{ type: "text", text: `No significant status change yet. Please poll again in ~${Math.ceil(waitMs / 1000)}s.` }]
2551
+ const scriptPath = path.join(dataRoot, "python", "asset_downloader_engine.py");
2552
+ const payload = {
2553
+ dataset_id: datasetId,
2554
+ source,
2555
+ repo_id: repoId,
2556
+ kaggle_ref: kaggleRef,
2557
+ urls,
2558
+ output_format: outputFormat,
2559
+ output_dir: requestedOutputDir,
2560
+ max_items: maxItems,
2561
+ workers,
2562
+ image_column: imageColumn,
2563
+ output_root: requestedOutputDir || process.cwd(),
2564
+ recipes_dir: path.join(dataRoot, "recipes"),
2912
2565
  };
2913
- }
2914
- jobStatusLastPoll[jobId] = now;
2915
- if (job.status === "completed") {
2916
2566
  try {
2917
- const meta = job.metadata ? JSON.parse(job.metadata) : {};
2918
- const baseId = String(meta?.datasetId || meta?.dataset_id || meta?.query || job.id);
2919
- const outPath = typeof job.result_url === "string" ? job.result_url : undefined;
2920
- appendLineageVersion({
2921
- datasetIdBase: baseId,
2922
- tool: `job:${job.type}`,
2923
- requestArgs: {
2924
- dataset_id: meta?.datasetId || meta?.dataset_id,
2925
- query: meta?.query,
2926
- pipeline_id: meta?.pipeline_id,
2927
- agent_id: meta?.agent_id,
2928
- },
2929
- outputPath: outPath,
2930
- output: {},
2931
- steps: [
2932
- { step: `${job.type}_started`, at: job.created_at, params: meta || {} },
2933
- { step: `${job.type}_completed`, at: job.updated_at || new Date().toISOString(), metrics: { progress: job.progress } },
2934
- ],
2935
- });
2567
+ const result = await runPythonJson(scriptPath, ["download", JSON.stringify(payload)]);
2568
+ if (!result?.ok) {
2569
+ const errMsg = result?.error || "Unknown error";
2570
+ // Enhance error messages for common failures
2571
+ let hint = "";
2572
+ if (errMsg.includes("No image column")) {
2573
+ hint = "\n\nHint: Specify image_column parameter with the column name containing images/URLs.";
2574
+ }
2575
+ else if (errMsg.includes("Authentication") || errMsg.includes("401") || errMsg.includes("403")) {
2576
+ hint = "\n\nHint: Use configure_keys tool to set HF_TOKEN for gated/private datasets.";
2577
+ }
2578
+ return {
2579
+ content: [{ type: "text", text: `ERROR: asset download failed: ${errMsg}${hint}` }],
2580
+ isError: true,
2581
+ };
2582
+ }
2583
+ return {
2584
+ content: [{ type: "text", text: JSON.stringify(result.result, null, 2) }],
2585
+ };
2936
2586
  }
2937
- catch (e) {
2938
- console.error(`[Lineage] check_job_status append failed: ${e?.message || e}`);
2587
+ catch (error) {
2588
+ return {
2589
+ content: [{ type: "text", text: `ERROR: asset downloader execution failed: ${error.message}` }],
2590
+ isError: true,
2591
+ };
2939
2592
  }
2940
2593
  }
2941
- return {
2942
- content: [{ type: "text", text: formatJobStatus(job) }]
2943
- };
2944
- }
2945
- case "export_dataset": {
2946
- const datasetId = String(request.params.arguments?.dataset_id);
2947
- const isDirectLocalInput = isDirectLocalDatasetReference(datasetId);
2948
- const intermediateArtifacts = new Set();
2949
- const requestedTargetDir = request.params.arguments?.target_dir
2950
- ? String(request.params.arguments?.target_dir).trim()
2951
- : request.params.arguments?.output_dir
2952
- ? String(request.params.arguments?.output_dir).trim()
2953
- : "";
2954
- const targetDir = path.resolve(requestedTargetDir || process.cwd());
2955
- const requestedFormat = String(request.params.arguments?.format || "feather");
2956
- const fastMode = request.params.arguments?.fast === true;
2957
- const preview = request.params.arguments?.preview === true;
2958
- const sampleRows = request.params.arguments?.sample_rows ? Number(request.params.arguments.sample_rows) : undefined;
2959
- const columns = request.params.arguments?.columns;
2960
- const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
2961
- // Use Metadata or Registry to find the actual local file
2962
- const preferredLookupDirs = [targetDir, process.cwd()];
2963
- let sourcePath = resolveDatasetLocalPath(datasetId, preferredLookupDirs);
2964
- if (!sourcePath) {
2965
- console.error(`[Export] No local data found for ${datasetId}. Attempting to prepare automatically...`);
2966
- // Start a prepare job for this dataset id (acts like calling prepare_dataset)
2967
- try {
2968
- jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false, outputDir: process.cwd() });
2594
+ case "configure_keys": {
2595
+ const hfToken = String(request.params.arguments?.hf_token || "").trim();
2596
+ const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
2597
+ const kaggleKey = String(request.params.arguments?.kaggle_key || "").trim();
2598
+ const dataworldToken = String(request.params.arguments?.dataworld_token || "").trim();
2599
+ const saved = [];
2600
+ const methods = [];
2601
+ if (hfToken) {
2602
+ const r = secureKeys.set("hf_token", hfToken);
2603
+ if (r.ok) {
2604
+ process.env.HF_TOKEN = hfToken;
2605
+ saved.push("HF token");
2606
+ if (r.method)
2607
+ methods.push(r.method);
2608
+ }
2969
2609
  }
2970
- catch (e) {
2971
- console.error(`[Export] Failed to start prepare job for ${datasetId}: ${e?.message || e}`);
2610
+ if (kaggleUsername) {
2611
+ const r = secureKeys.set("kaggle_username", kaggleUsername);
2612
+ if (r.ok) {
2613
+ process.env.KAGGLE_USERNAME = kaggleUsername;
2614
+ saved.push("Kaggle username");
2615
+ if (r.method)
2616
+ methods.push(r.method);
2617
+ }
2972
2618
  }
2973
- // Poll for download status or registry entry until local_path appears or timeout
2974
- const wait = (ms) => new Promise(res => setTimeout(res, ms));
2975
- const maxWait = 120_000; // 120s
2976
- const interval = 2000;
2977
- let waited = 0;
2978
- while (waited < maxWait) {
2979
- const resolved = resolveDatasetLocalPath(datasetId, preferredLookupDirs);
2980
- if (resolved) {
2981
- sourcePath = resolved;
2982
- console.error(`[Export] Local data is now available for ${datasetId}: ${sourcePath}`);
2983
- break;
2619
+ if (kaggleKey) {
2620
+ const r = secureKeys.set("kaggle_key", kaggleKey);
2621
+ if (r.ok) {
2622
+ process.env.KAGGLE_KEY = kaggleKey;
2623
+ saved.push("Kaggle key");
2624
+ if (r.method)
2625
+ methods.push(r.method);
2984
2626
  }
2985
- await wait(interval);
2986
- waited += interval;
2987
2627
  }
2988
- // If still no sourcePath, return helpful error listing prepared datasets
2989
- if (!sourcePath) {
2990
- const entries = readRegistry();
2991
- const listText = entries.length === 0 ? "(no prepared datasets found)" : entries.map(e => `- ${e.dataset_id || e.id || "unknown"}: ${e.local_path || e.path || "unknown"}`).slice(0, 10).join("\n") + (entries.length > 10 ? "\n...and " + (entries.length - 10) + " more" : "");
2628
+ if (dataworldToken) {
2629
+ const r = secureKeys.set("dataworld_token", dataworldToken);
2630
+ if (r.ok) {
2631
+ process.env.DW_AUTH_TOKEN = dataworldToken;
2632
+ saved.push("data.world token");
2633
+ if (r.method)
2634
+ methods.push(r.method);
2635
+ }
2636
+ }
2637
+ if (saved.length === 0) {
2992
2638
  return {
2993
- content: [{ type: "text", text: `ERROR: No local data found for ${datasetId} after attempting prepare. Check credentials and try running prepare_dataset manually. Prepared datasets:\n${listText}` }],
2994
- isError: true
2639
+ content: [{ type: "text", text: "No keys provided. Core Vesper tools continue to work without API keys." }]
2995
2640
  };
2996
2641
  }
2642
+ return {
2643
+ content: [{ type: "text", text: `Key saved securely. Updated: ${saved.join(", ")}.` }]
2644
+ };
2997
2645
  }
2998
- sourcePath = ensureExportableLocalPath(sourcePath);
2999
- try {
3000
- if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
3001
- upsertRegistry(datasetId, sourcePath, "completed");
2646
+ case "get_dataset_info": {
2647
+ const datasetId = String(request.params.arguments?.dataset_id);
2648
+ if (!datasetId) {
2649
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
3002
2650
  }
3003
- }
3004
- catch (e) {
3005
- console.error(`[Registry] Failed to normalize registry path for ${datasetId}: ${e?.message || e}`);
3006
- }
3007
- // If NOT fast mode, run quality/cleaning pipeline first (only for csv/parquet compat)
3008
- if (!fastMode) {
3009
- const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
3010
- const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "parquet";
3011
- const pipelineCompatibleInput = currentExt === "csv" || currentExt === "parquet";
3012
- if (!pipelineCompatibleInput) {
3013
- console.error(`[Export] Skipping pipeline for ${currentExt} input; using direct exporter conversion.`);
2651
+ const dataset = metadataStore.getDataset(datasetId);
2652
+ if (!dataset) {
2653
+ // Fallback: check the registry for local path info
2654
+ const regEntry = getRegistryEntry(datasetId);
2655
+ const regPath = regEntry?.local_path || regEntry?.path;
2656
+ if (regEntry) {
2657
+ const exists = regPath && fs.existsSync(regPath);
2658
+ return {
2659
+ content: [{ type: "text", text: `**${datasetId}** (from registry)\n- Local path: ${regPath || "unknown"}\n- Status: ${regEntry.status || "unknown"}${exists ? "" : " (file missing)"}\n\nNote: Full metadata not available in metadata store. Use prepare_dataset to get full details.` }],
2660
+ };
2661
+ }
2662
+ return {
2663
+ content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}. Use vesper_list_datasets to see available datasets, or prepare_dataset to add new ones.` }],
2664
+ isError: true,
2665
+ };
3014
2666
  }
3015
- else if (currentExt !== pipelineFmt) {
3016
- console.error(`[Export] Running quality/cleaning pipeline (use fast=true to skip)...`);
2667
+ // Enrich: if total_examples is 0-ish, try the HF datasets-server /size API
2668
+ if ((!dataset.total_examples || dataset.total_examples === 0) && dataset.source === "huggingface") {
3017
2669
  try {
3018
- const beforeStagingPath = sourcePath;
3019
- sourcePath = ensureLocalPipelineSource(sourcePath, datasetId, targetDir);
3020
- if (path.resolve(beforeStagingPath) !== path.resolve(sourcePath)) {
3021
- intermediateArtifacts.add(sourcePath);
3022
- }
3023
- const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
3024
- if (pipelineResult.final_output_path) {
3025
- if (path.resolve(pipelineResult.final_output_path) !== path.resolve(sourcePath)) {
3026
- intermediateArtifacts.add(pipelineResult.final_output_path);
3027
- }
3028
- sourcePath = pipelineResult.final_output_path;
3029
- try {
3030
- // Update registry to point to pipeline's final output
3031
- if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
3032
- upsertRegistry(datasetId, sourcePath, "completed");
2670
+ const sizeResp = await fetch(`https://datasets-server.huggingface.co/size?dataset=${encodeURIComponent(dataset.id)}`);
2671
+ if (sizeResp.ok) {
2672
+ const sizeData = await sizeResp.json();
2673
+ const numRows = sizeData?.size?.dataset?.num_rows;
2674
+ if (numRows && numRows > 0) {
2675
+ dataset.total_examples = numRows;
2676
+ // Also backfill splits
2677
+ if (sizeData?.size?.splits && Array.isArray(sizeData.size.splits)) {
2678
+ dataset.splits = sizeData.size.splits.map((s) => ({
2679
+ name: s.split,
2680
+ num_examples: s.num_rows || 0,
2681
+ size_bytes: s.num_bytes_parquet_files || 0,
2682
+ }));
2683
+ dataset.has_train_split = dataset.splits.some((s) => s.name === "train");
2684
+ dataset.has_test_split = dataset.splits.some((s) => s.name === "test");
2685
+ dataset.has_validation_split = dataset.splits.some((s) => s.name === "validation" || s.name === "val");
3033
2686
  }
3034
- }
3035
- catch (e) {
3036
- console.error(`[Registry] Failed to update registry for ${datasetId}: ${e?.message || e}`);
2687
+ // Persist enriched metadata
2688
+ metadataStore.saveDataset(dataset);
3037
2689
  }
3038
2690
  }
3039
2691
  }
3040
- catch (err) {
3041
- console.error(`[Export] Pipeline warning: ${err.message}. Continuing with raw file.`);
2692
+ catch {
2693
+ // Enrichment is best-effort; continue with whatever we have
3042
2694
  }
3043
2695
  }
2696
+ const formattedOutput = formatDatasetInfo(dataset);
2697
+ return { content: [{ type: "text", text: formattedOutput }] };
3044
2698
  }
3045
- else {
3046
- console.error(`[Export] Fast mode - skipping quality analysis and cleaning`);
3047
- }
3048
- // Build export options
3049
- const exportOpts = {};
3050
- if (compression)
3051
- exportOpts.compression = compression;
3052
- if (preview)
3053
- exportOpts.preview = true;
3054
- if (sampleRows)
3055
- exportOpts.sample_rows = sampleRows;
3056
- if (columns)
3057
- exportOpts.columns = columns;
3058
- try {
3059
- // Determine output file name
3060
- const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
3061
- const ext = extMap[requestedFormat] || ".feather";
3062
- const safeName = getExportFileStem(datasetId);
3063
- const outDir = targetDir;
3064
- if (!fs.existsSync(outDir))
3065
- fs.mkdirSync(outDir, { recursive: true });
3066
- const outputFile = path.join(outDir, `${safeName}${ext}`);
3067
- const schemaBefore = await getSchemaSnapshot(sourcePath);
3068
- const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
3069
- const schemaAfter = await getSchemaSnapshot(result.output_path);
3070
- const lineage = appendLineageVersion({
3071
- datasetIdBase: datasetId,
3072
- tool: "export_dataset",
3073
- requestArgs: request.params.arguments,
3074
- outputPath: result.output_path,
3075
- output: {
3076
- rows: result.rows,
3077
- columns: result.columns,
3078
- format: requestedFormat,
3079
- size_mb: result.file_size_mb,
3080
- schema_before: schemaBefore,
3081
- schema_after: schemaAfter,
3082
- },
3083
- steps: [
3084
- { step: "source_resolved", at: new Date().toISOString(), params: { sourcePath } },
3085
- { step: "exported", at: new Date().toISOString(), params: { format: requestedFormat, compression }, metrics: { rows: result.rows, columns: result.columns } },
3086
- ],
3087
- });
3088
- try {
3089
- upsertRegistry(lineage.datasetVersionId, result.output_path, "completed");
2699
+ case "quality_analyze":
2700
+ case "analyze_quality":
2701
+ case "analyze_image_quality":
2702
+ case "analyze_media_quality":
2703
+ case "generate_quality_report": {
2704
+ const resolvedOperation = request.params.name === "analyze_image_quality"
2705
+ ? "image"
2706
+ : request.params.name === "analyze_media_quality"
2707
+ ? "media"
2708
+ : request.params.name === "generate_quality_report"
2709
+ ? "report"
2710
+ : String(request.params.arguments?.operation || "dataset").toLowerCase();
2711
+ if (resolvedOperation === "image") {
2712
+ const inputPath = String(request.params.arguments?.path || "").trim();
2713
+ if (!inputPath || !fs.existsSync(inputPath)) {
2714
+ throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
2715
+ }
2716
+ const report = await imageAnalyzer.analyze(inputPath);
2717
+ return { content: [{ type: "text", text: JSON.stringify(report, null, 2) }] };
3090
2718
  }
3091
- catch { }
3092
- // Build rich response
3093
- let msg = `**Export complete**\n`;
3094
- msg += `- **File**: ${result.output_path}\n`;
3095
- msg += `- **Version**: ${lineage.datasetVersionId}\n`;
3096
- msg += `- **Lineage**: ${lineage.lineagePath}\n`;
3097
- msg += `- **Format**: ${result.format}${result.compression ? ` (${result.compression})` : ""}\n`;
3098
- msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
3099
- if (result.file_size_mb !== undefined)
3100
- msg += `- **Size**: ${result.file_size_mb} MB\n`;
3101
- if (result.elapsed_seconds !== undefined)
3102
- msg += `- **Time**: ${result.elapsed_seconds}s\n`;
3103
- if (result.preview_path)
3104
- msg += `- **Preview**: ${result.preview_path}\n`;
3105
- msg += `\n`;
3106
- if (requestedFormat === "feather") {
3107
- msg += `**Inspect with:**\n`;
3108
- msg += ` Python: \`pd.read_feather('${result.output_path}').head()\`\n`;
3109
- msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
2719
+ if (resolvedOperation === "media") {
2720
+ const inputPath = String(request.params.arguments?.path || "").trim();
2721
+ if (!inputPath || !fs.existsSync(inputPath)) {
2722
+ throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
2723
+ }
2724
+ const report = await mediaAnalyzer.analyze(inputPath);
2725
+ return { content: [{ type: "text", text: JSON.stringify(report, null, 2) }] };
3110
2726
  }
3111
- else if (requestedFormat === "parquet") {
3112
- msg += `**Inspect with:**\n`;
3113
- msg += ` Python: \`pd.read_parquet('${result.output_path}').head()\`\n`;
3114
- msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
2727
+ if (resolvedOperation === "report") {
2728
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
2729
+ const datasetPath = String(request.params.arguments?.dataset_path || "").trim();
2730
+ if (!datasetId) {
2731
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='report'");
2732
+ }
2733
+ if (!datasetPath || !fs.existsSync(datasetPath)) {
2734
+ throw new McpError(ErrorCode.InvalidParams, `Dataset path not found: ${datasetPath}`);
2735
+ }
2736
+ const metadata = await metadataStore.getDataset(datasetId);
2737
+ const textQuality = null;
2738
+ const report = await qualityOrchestrator.generateReport(datasetId, datasetPath, textQuality);
2739
+ if (metadata) {
2740
+ metadata.unified_quality_report = report;
2741
+ await metadataStore.saveDataset(metadata);
2742
+ }
2743
+ return { content: [{ type: "text", text: JSON.stringify(report, null, 2) }] };
3115
2744
  }
3116
- cleanupIntermediateArtifacts(intermediateArtifacts, result.output_path);
3117
- return { content: [{ type: "text", text: msg }] };
3118
- }
3119
- catch (error) {
2745
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
2746
+ if (!datasetId) {
2747
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='dataset'");
2748
+ }
2749
+ const safeId = toSafeDatasetPathFragment(datasetId);
2750
+ const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
2751
+ const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
2752
+ let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
2753
+ // Demo Fallback for easy testing
2754
+ if (datasetId === "demo" || !fs.existsSync(filePath)) {
2755
+ const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
2756
+ const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
2757
+ if (fs.existsSync(demoParquetPath)) {
2758
+ filePath = demoParquetPath;
2759
+ }
2760
+ else if (fs.existsSync(demoCsvPath)) {
2761
+ filePath = demoCsvPath;
2762
+ }
2763
+ else if (datasetId !== "demo") {
2764
+ return {
2765
+ content: [{ type: "text", text: `ERROR: Local data file for ${datasetId} not found. Try running prepare_dataset first, or use 'demo' as the dataset_id.` }],
2766
+ isError: true
2767
+ };
2768
+ }
2769
+ }
2770
+ const report = await qualityAnalyzer.analyze(filePath);
3120
2771
  return {
3121
- content: [{ type: "text", text: `ERROR: Export failed: ${error.message}` }],
3122
- isError: true
2772
+ content: [{ type: "text", text: JSON.stringify(report, null, 2) }]
3123
2773
  };
3124
2774
  }
3125
- }
3126
- case "vesper_list_datasets": {
3127
- const entries = readRegistry();
3128
- if (entries.length === 0) {
2775
+ case "preview_cleaning": {
2776
+ const datasetId = String(request.params.arguments?.dataset_id);
2777
+ const safeId = toSafeDatasetPathFragment(datasetId);
2778
+ const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
2779
+ const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
2780
+ let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
2781
+ if (datasetId === "demo" || !fs.existsSync(filePath)) {
2782
+ const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
2783
+ const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
2784
+ if (fs.existsSync(demoParquetPath)) {
2785
+ filePath = demoParquetPath;
2786
+ }
2787
+ else if (fs.existsSync(demoCsvPath)) {
2788
+ filePath = demoCsvPath;
2789
+ }
2790
+ else {
2791
+ throw new McpError(ErrorCode.InvalidParams, `Local data file not found for ${datasetId}. Please run prepare_dataset first.`);
2792
+ }
2793
+ }
2794
+ const report = await qualityAnalyzer.analyze(filePath);
2795
+ // Phase 1: Target Detection
2796
+ // We use the same TargetDetector instance inside CleaningPlanner now?
2797
+ // Actually, we instantiated it inside CleaningPlanner, so we just need to pass the file path
2798
+ // OR let the planner handle it if we update its signature to accept filePath.
2799
+ // Let's check `CleaningPlanner.generatePlan` signature again.
2800
+ // We updated it to accept `targetInfo`.
2801
+ // So we need to run detection HERE and pass it.
2802
+ // But `TargetDetector` is not exposed in `index.ts` scope yet.
2803
+ // Let's create a global instance or use the one inside planner if exposed (it's private).
2804
+ // Better approach: Instantiate TargetDetector here in index.ts for the tool content.
2805
+ // Quick fix: Instantiate local detector or make global.
2806
+ // I'll make a global `targetDetector` constant in index.ts
2807
+ // But wait, I updated `CleaningPlanner` to instantiate its own detector.
2808
+ // Does `CleaningPlanner` use it? No, I commented out the logic because it needed `filePath`.
2809
+ // RETRY STRATEGY:
2810
+ // 1. Instantiate `targetDetector` in `index.ts`.
2811
+ // 2. Run `detectTarget(filePath)`.
2812
+ // 3. Pass result to `cleaningPlanner.generatePlan(..., targetInfo)`.
2813
+ // I need to add `const targetDetector = new TargetDetector(__dirname);` to imports/init section first.
2814
+ // But since I'm in this tool, I can't look back.
2815
+ // I will assume I can add it, or just do it inside the case for now.
2816
+ // To do it properly, I should have added `targetDetector` to the global scope in previous step.
2817
+ // Let's do that in a separate step if needed.
2818
+ // For now, I'll instantiate it here.
2819
+ const { TargetDetector } = await import("./preparation/target-detector.js");
2820
+ const detector = new TargetDetector(__dirname);
2821
+ const targetResult = await detector.detectTarget(filePath);
2822
+ const targetInfo = targetResult.target_column ? {
2823
+ target: targetResult.target_column,
2824
+ confidence: targetResult.confidence
2825
+ } : undefined;
2826
+ const plan = await cleaningPlanner.generatePlan(datasetId, report, undefined, targetInfo);
2827
+ let explanation = `### Cleaning Plan for ${datasetId}\n\n`;
2828
+ if (targetInfo && targetInfo.target !== "target" && targetInfo.confidence > 0.7) {
2829
+ explanation += `**Target Detected**: '${targetInfo.target}' (Confidence: ${targetInfo.confidence.toFixed(2)})\n`;
2830
+ explanation += ` - **Action**: Renaming to 'target' for consistency.\n\n`;
2831
+ }
2832
+ explanation += `Estimated Quality Improvement: +${plan.estimated_impact.quality_score_improvement} points\n\n`;
2833
+ if (plan.operations.length === 0) {
2834
+ explanation += "No cleaning operations required.";
2835
+ }
2836
+ else {
2837
+ plan.operations.forEach((op, i) => {
2838
+ explanation += `${i + 1}. **${op.type}**: ${op.reason}\n`;
2839
+ });
2840
+ }
3129
2841
  return {
3130
- content: [{ type: "text", text: "No prepared datasets found in the Vesper registry. Use prepare_dataset or download_dataset to add datasets." }]
2842
+ content: [{ type: "text", text: explanation }]
3131
2843
  };
3132
2844
  }
3133
- const lines = entries.map((e, i) => {
3134
- const id = e.dataset_id || e.id || "unknown";
3135
- const localPath = e.local_path || e.path || "unknown";
3136
- const exists = typeof localPath === "string" && localPath !== "unknown" && fs.existsSync(localPath);
3137
- return `${i + 1}. **${id}**\n Path: ${localPath}\n Status: ${e.status || "unknown"}${exists ? "" : " (file missing)"}`;
3138
- });
3139
- return {
3140
- content: [{ type: "text", text: `**Vesper Registry** (${entries.length} dataset${entries.length !== 1 ? "s" : ""}):\n\n${lines.join("\n\n")}` }]
3141
- };
3142
- }
3143
- case "vesper_convert_format": {
3144
- const filePath = String(request.params.arguments?.file_path || "").trim();
3145
- const targetFormat = String(request.params.arguments?.target_format || "").trim().toLowerCase();
3146
- if (!filePath) {
3147
- throw new McpError(ErrorCode.InvalidParams, "file_path is required");
3148
- }
3149
- if (!["csv", "parquet", "json", "jsonl"].includes(targetFormat)) {
3150
- throw new McpError(ErrorCode.InvalidParams, "target_format must be one of: csv, parquet, json, jsonl");
2845
+ case "custom_clean": {
2846
+ const datasetId = String(request.params.arguments?.dataset_id);
2847
+ const ops = request.params.arguments?.operations;
2848
+ if (!datasetId || datasetId === "undefined") {
2849
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
2850
+ }
2851
+ if (!ops || !Array.isArray(ops) || ops.length === 0) {
2852
+ throw new McpError(ErrorCode.InvalidParams, "operations array is required and must not be empty. Supported operations: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories");
2853
+ }
2854
+ // Pre-check: verify dataset file exists before starting the job
2855
+ const cleanRegEntry = getRegistryEntry(datasetId);
2856
+ const cleanRegPath = cleanRegEntry?.local_path || cleanRegEntry?.path;
2857
+ const cleanDlStatus = metadataStore.getDownloadStatus(datasetId);
2858
+ const cleanSafeId = toSafeDatasetPathFragment(datasetId);
2859
+ const cleanDataExists = (cleanRegPath && fs.existsSync(cleanRegPath)) ||
2860
+ (cleanDlStatus?.local_path && fs.existsSync(cleanDlStatus.local_path)) ||
2861
+ fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.parquet`)) ||
2862
+ fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.csv`)) ||
2863
+ fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.feather`)) ||
2864
+ fs.existsSync(datasetId);
2865
+ if (!cleanDataExists) {
2866
+ return {
2867
+ content: [{ type: "text", text: `Dataset '${datasetId}' not found locally. Download it first using download_dataset or prepare_dataset, then run custom_clean.` }],
2868
+ isError: true,
2869
+ };
2870
+ }
2871
+ const job = jobManager.createJob("clean", 0, { datasetId, ops });
2872
+ return {
2873
+ content: [{ type: "text", text: `Cleaning job started. ID: ${job.id}. Use check_job_status to monitor progress.` }]
2874
+ };
3151
2875
  }
3152
- if (!fs.existsSync(filePath)) {
2876
+ case "prepare_dataset": {
2877
+ hydrateExternalKeys();
2878
+ const query = String(request.params.arguments?.query);
2879
+ const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
2880
+ const downloadImages = request.params.arguments?.download_images === true;
2881
+ const requestedOutputDir = request.params.arguments?.target_dir
2882
+ ? String(request.params.arguments.target_dir).trim()
2883
+ : request.params.arguments?.output_dir
2884
+ ? String(request.params.arguments.output_dir).trim()
2885
+ : "";
2886
+ const outputDir = requestedOutputDir || process.cwd();
2887
+ if (!query || query === "undefined") {
2888
+ throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
2889
+ }
2890
+ const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages, outputDir });
3153
2891
  return {
3154
- content: [{ type: "text", text: `ERROR: File not found: ${filePath}` }],
3155
- isError: true,
2892
+ content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
3156
2893
  };
3157
2894
  }
3158
- const inputExt = path.extname(filePath).toLowerCase();
3159
- const extMap = { csv: ".csv", parquet: ".parquet", json: ".json", jsonl: ".jsonl" };
3160
- const outputExt = extMap[targetFormat];
3161
- if (inputExt === outputExt) {
2895
+ case "compare_datasets": {
2896
+ const datasetIds = request.params.arguments?.dataset_ids;
2897
+ const datasets = datasetIds.map(id => metadataStore.getDataset(id)).filter(d => !!d);
2898
+ let comparison = "| Metric | " + datasets.map(d => d.name).join(" | ") + " |\n";
2899
+ comparison += "| :--- | " + datasets.map(() => " :---: ").join("|") + " |\n";
2900
+ comparison += "| Quality Score | " + datasets.map(d => `${d.quality_score}/100`).join(" | ") + " |\n";
2901
+ comparison += "| License | " + datasets.map(d => d.license.category).join(" | ") + " |\n";
2902
+ comparison += "| Downloads | " + datasets.map(d => d.downloads).join(" | ") + " |\n";
2903
+ comparison += "| Domain | " + datasets.map(d => d.domain).join(" | ") + " |\n";
3162
2904
  return {
3163
- content: [{ type: "text", text: `File is already in ${targetFormat} format: ${filePath}` }],
2905
+ content: [{ type: "text", text: comparison }]
3164
2906
  };
3165
2907
  }
3166
- const outputPath = filePath.replace(/\.[^.]+$/, outputExt);
3167
- try {
3168
- await ensurePythonModules([{ module: "polars", packageName: "polars" }]);
3169
- const convertScript = path.join(dataRoot, "python", "convert_engine.py");
3170
- const schemaBefore = await getSchemaSnapshot(filePath);
3171
- const result = await runPythonJson(convertScript, [filePath, outputPath]);
3172
- const schemaAfter = await getSchemaSnapshot(outputPath);
3173
- if (!result.ok) {
2908
+ case "check_job_status": {
2909
+ const jobId = String(request.params.arguments?.job_id);
2910
+ const job = metadataStore.getJob(jobId);
2911
+ if (!job) {
2912
+ throw new McpError(ErrorCode.InvalidParams, `Job not found: ${jobId}`);
2913
+ }
2914
+ const activeStatuses = new Set(["pending", "queued", "running", "retrying"]);
2915
+ const now = Date.now();
2916
+ const last = jobStatusLastPoll[jobId] || 0;
2917
+ const minPollMs = 3000;
2918
+ if (activeStatuses.has(job.status) && (now - last) < minPollMs) {
2919
+ const waitMs = minPollMs - (now - last);
3174
2920
  return {
3175
- content: [{ type: "text", text: `ERROR: Conversion failed: ${result.error}` }],
3176
- isError: true,
2921
+ content: [{ type: "text", text: `No significant status change yet. Please poll again in ~${Math.ceil(waitMs / 1000)}s.` }]
3177
2922
  };
3178
2923
  }
3179
- // Register converted file in the registry
3180
- const datasetId = path.basename(outputPath, outputExt);
3181
- try {
3182
- upsertRegistry(datasetId, outputPath, "completed");
3183
- }
3184
- catch (e) {
3185
- console.error(`[Convert] Registry write failed: ${e?.message || e}`);
3186
- }
3187
- const lineage = appendLineageVersion({
3188
- datasetIdBase: datasetId,
3189
- tool: "vesper_convert_format",
3190
- requestArgs: request.params.arguments,
3191
- outputPath,
3192
- output: {
3193
- rows: result.rows,
3194
- columns: result.columns,
3195
- format: targetFormat,
3196
- size_mb: result.size_mb,
3197
- schema_before: schemaBefore,
3198
- schema_after: schemaAfter,
3199
- },
3200
- steps: [
3201
- { step: "converted", at: new Date().toISOString(), params: { from: inputExt, to: outputExt } },
3202
- ],
3203
- });
3204
- try {
3205
- upsertRegistry(lineage.datasetVersionId, outputPath, "completed");
2924
+ jobStatusLastPoll[jobId] = now;
2925
+ if (job.status === "completed") {
2926
+ try {
2927
+ const meta = job.metadata ? JSON.parse(job.metadata) : {};
2928
+ const baseId = String(meta?.datasetId || meta?.dataset_id || meta?.query || job.id);
2929
+ const outPath = typeof job.result_url === "string" ? job.result_url : undefined;
2930
+ appendLineageVersion({
2931
+ datasetIdBase: baseId,
2932
+ tool: `job:${job.type}`,
2933
+ requestArgs: {
2934
+ dataset_id: meta?.datasetId || meta?.dataset_id,
2935
+ query: meta?.query,
2936
+ pipeline_id: meta?.pipeline_id,
2937
+ agent_id: meta?.agent_id,
2938
+ },
2939
+ outputPath: outPath,
2940
+ output: {},
2941
+ steps: [
2942
+ { step: `${job.type}_started`, at: job.created_at, params: meta || {} },
2943
+ { step: `${job.type}_completed`, at: job.updated_at || new Date().toISOString(), metrics: { progress: job.progress } },
2944
+ ],
2945
+ });
2946
+ }
2947
+ catch (e) {
2948
+ console.error(`[Lineage] check_job_status append failed: ${e?.message || e}`);
2949
+ }
3206
2950
  }
3207
- catch { }
3208
- let msg = `**Conversion complete**\n`;
3209
- msg += `- **Input**: ${filePath} (${inputExt.slice(1)})\n`;
3210
- msg += `- **Output**: ${result.output_path} (${targetFormat})\n`;
3211
- msg += `- **Version**: ${lineage.datasetVersionId}\n`;
3212
- msg += `- **Lineage**: ${lineage.lineagePath}\n`;
3213
- msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
3214
- if (result.size_mb !== undefined)
3215
- msg += `- **Size**: ${result.size_mb} MB\n`;
3216
- return { content: [{ type: "text", text: msg }] };
3217
- }
3218
- catch (error) {
3219
2951
  return {
3220
- content: [{ type: "text", text: `ERROR: Conversion failed: ${error.message}` }],
3221
- isError: true,
2952
+ content: [{ type: "text", text: formatJobStatus(job) }]
3222
2953
  };
3223
2954
  }
3224
- }
3225
- case "vesper_normalize_schema": {
3226
- const filePath = String(request.params.arguments?.file_path || "").trim();
3227
- const outputFormat = String(request.params.arguments?.output_format || "jsonl").trim().toLowerCase();
3228
- const outputDirRaw = request.params.arguments?.output_dir ? String(request.params.arguments.output_dir).trim() : "";
3229
- const flattenMetadataJson = request.params.arguments?.flatten_metadata_json !== false;
3230
- const maxKeys = Number(request.params.arguments?.max_keys ?? 200);
3231
- const extrasMode = String(request.params.arguments?.extras_mode || "blob").trim().toLowerCase();
3232
- if (!filePath) {
3233
- throw new McpError(ErrorCode.InvalidParams, "file_path is required");
3234
- }
3235
- if (!["jsonl", "json"].includes(outputFormat)) {
3236
- throw new McpError(ErrorCode.InvalidParams, "output_format must be one of: jsonl, json");
3237
- }
3238
- if (!fs.existsSync(filePath)) {
3239
- return { content: [{ type: "text", text: `ERROR: File not found: ${filePath}` }], isError: true };
3240
- }
3241
- const outDir = outputDirRaw || path.join(dataRoot, "data", "normalized_schema");
3242
- if (!fs.existsSync(outDir))
3243
- fs.mkdirSync(outDir, { recursive: true });
3244
- const baseName = path.parse(filePath).name || `normalized_${Date.now()}`;
3245
- const outputPath = path.join(outDir, `${baseName}.normalized.${outputFormat}`);
3246
- try {
3247
- const scriptPath = path.join(dataRoot, "python", "normalize_schema_engine.py");
3248
- const options = {
3249
- flatten_metadata_json: !!flattenMetadataJson,
3250
- max_keys: Number.isFinite(maxKeys) ? maxKeys : 200,
3251
- extras_mode: ["blob", "drop"].includes(extrasMode) ? extrasMode : "blob",
3252
- };
3253
- const schemaBefore = await getSchemaSnapshot(filePath);
3254
- const result = await runPythonJson(scriptPath, [filePath, outputPath, JSON.stringify(options)]);
3255
- const schemaAfter = await getSchemaSnapshot(outputPath);
3256
- if (!result.ok) {
3257
- return { content: [{ type: "text", text: `ERROR: Schema normalization failed: ${result.error}` }], isError: true };
2955
+ case "export_dataset": {
2956
+ const datasetId = String(request.params.arguments?.dataset_id);
2957
+ const isDirectLocalInput = isDirectLocalDatasetReference(datasetId);
2958
+ const intermediateArtifacts = new Set();
2959
+ const requestedTargetDir = request.params.arguments?.target_dir
2960
+ ? String(request.params.arguments?.target_dir).trim()
2961
+ : request.params.arguments?.output_dir
2962
+ ? String(request.params.arguments?.output_dir).trim()
2963
+ : "";
2964
+ const targetDir = path.resolve(requestedTargetDir || process.cwd());
2965
+ const requestedFormat = String(request.params.arguments?.format || "feather");
2966
+ const fastMode = request.params.arguments?.fast === true;
2967
+ const preview = request.params.arguments?.preview === true;
2968
+ const sampleRows = request.params.arguments?.sample_rows ? Number(request.params.arguments.sample_rows) : undefined;
2969
+ const columns = request.params.arguments?.columns;
2970
+ const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
2971
+ // Use Metadata or Registry to find the actual local file
2972
+ const preferredLookupDirs = [targetDir, process.cwd()];
2973
+ let sourcePath = resolveDatasetLocalPath(datasetId, preferredLookupDirs);
2974
+ if (!sourcePath) {
2975
+ console.error(`[Export] No local data found for ${datasetId}. Attempting to prepare automatically...`);
2976
+ // Start a prepare job for this dataset id (acts like calling prepare_dataset)
2977
+ try {
2978
+ jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false, outputDir: process.cwd() });
2979
+ }
2980
+ catch (e) {
2981
+ console.error(`[Export] Failed to start prepare job for ${datasetId}: ${e?.message || e}`);
2982
+ }
2983
+ // Poll for download status or registry entry until local_path appears or timeout
2984
+ const wait = (ms) => new Promise(res => setTimeout(res, ms));
2985
+ const maxWait = 120_000; // 120s
2986
+ const interval = 2000;
2987
+ let waited = 0;
2988
+ while (waited < maxWait) {
2989
+ const resolved = resolveDatasetLocalPath(datasetId, preferredLookupDirs);
2990
+ if (resolved) {
2991
+ sourcePath = resolved;
2992
+ console.error(`[Export] Local data is now available for ${datasetId}: ${sourcePath}`);
2993
+ break;
2994
+ }
2995
+ await wait(interval);
2996
+ waited += interval;
2997
+ }
2998
+ // If still no sourcePath, return helpful error listing prepared datasets
2999
+ if (!sourcePath) {
3000
+ const entries = readRegistry();
3001
+ const listText = entries.length === 0 ? "(no prepared datasets found)" : entries.map(e => `- ${e.dataset_id || e.id || "unknown"}: ${e.local_path || e.path || "unknown"}`).slice(0, 10).join("\n") + (entries.length > 10 ? "\n...and " + (entries.length - 10) + " more" : "");
3002
+ return {
3003
+ content: [{ type: "text", text: `ERROR: No local data found for ${datasetId} after attempting prepare. Check credentials and try running prepare_dataset manually. Prepared datasets:\n${listText}` }],
3004
+ isError: true
3005
+ };
3006
+ }
3258
3007
  }
3259
- // Register normalized file to make follow-up conversion easier.
3008
+ sourcePath = ensureExportableLocalPath(sourcePath);
3260
3009
  try {
3261
- const datasetId = path.basename(outputPath, path.extname(outputPath));
3262
- upsertRegistry(datasetId, outputPath, "completed");
3010
+ if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
3011
+ upsertRegistry(datasetId, sourcePath, "completed");
3012
+ }
3263
3013
  }
3264
3014
  catch (e) {
3265
- console.error(`[NormalizeSchema] Registry write failed: ${e?.message || e}`);
3015
+ console.error(`[Registry] Failed to normalize registry path for ${datasetId}: ${e?.message || e}`);
3266
3016
  }
3267
- const lineage = appendLineageVersion({
3268
- datasetIdBase: path.basename(outputPath, path.extname(outputPath)),
3269
- tool: "vesper_normalize_schema",
3270
- requestArgs: request.params.arguments,
3271
- outputPath,
3272
- output: {
3273
- rows: result.rows,
3274
- columns: result.columns,
3275
- format: outputFormat,
3276
- schema_before: schemaBefore,
3277
- schema_after: schemaAfter,
3278
- },
3279
- steps: [
3280
- { step: "schema_normalized", at: new Date().toISOString(), params: options, metrics: { flattened_keys: result.flattened_keys } },
3281
- ],
3282
- });
3017
+ // If NOT fast mode, run quality/cleaning pipeline first (only for csv/parquet compat)
3018
+ if (!fastMode) {
3019
+ const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
3020
+ const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "parquet";
3021
+ const pipelineCompatibleInput = currentExt === "csv" || currentExt === "parquet";
3022
+ if (!pipelineCompatibleInput) {
3023
+ console.error(`[Export] Skipping pipeline for ${currentExt} input; using direct exporter conversion.`);
3024
+ }
3025
+ else if (currentExt !== pipelineFmt) {
3026
+ console.error(`[Export] Running quality/cleaning pipeline (use fast=true to skip)...`);
3027
+ try {
3028
+ const beforeStagingPath = sourcePath;
3029
+ sourcePath = ensureLocalPipelineSource(sourcePath, datasetId, targetDir);
3030
+ if (path.resolve(beforeStagingPath) !== path.resolve(sourcePath)) {
3031
+ intermediateArtifacts.add(sourcePath);
3032
+ }
3033
+ const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
3034
+ if (pipelineResult.final_output_path) {
3035
+ if (path.resolve(pipelineResult.final_output_path) !== path.resolve(sourcePath)) {
3036
+ intermediateArtifacts.add(pipelineResult.final_output_path);
3037
+ }
3038
+ sourcePath = pipelineResult.final_output_path;
3039
+ try {
3040
+ // Update registry to point to pipeline's final output
3041
+ if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
3042
+ upsertRegistry(datasetId, sourcePath, "completed");
3043
+ }
3044
+ }
3045
+ catch (e) {
3046
+ console.error(`[Registry] Failed to update registry for ${datasetId}: ${e?.message || e}`);
3047
+ }
3048
+ }
3049
+ }
3050
+ catch (err) {
3051
+ console.error(`[Export] Pipeline warning: ${err.message}. Continuing with raw file.`);
3052
+ }
3053
+ }
3054
+ }
3055
+ else {
3056
+ console.error(`[Export] Fast mode - skipping quality analysis and cleaning`);
3057
+ }
3058
+ // Build export options
3059
+ const exportOpts = {};
3060
+ if (compression)
3061
+ exportOpts.compression = compression;
3062
+ if (preview)
3063
+ exportOpts.preview = true;
3064
+ if (sampleRows)
3065
+ exportOpts.sample_rows = sampleRows;
3066
+ if (columns)
3067
+ exportOpts.columns = columns;
3283
3068
  try {
3284
- upsertRegistry(lineage.datasetVersionId, outputPath, "completed");
3069
+ // Determine output file name
3070
+ const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
3071
+ const ext = extMap[requestedFormat] || ".feather";
3072
+ const safeName = getExportFileStem(datasetId);
3073
+ const outDir = targetDir;
3074
+ if (!fs.existsSync(outDir))
3075
+ fs.mkdirSync(outDir, { recursive: true });
3076
+ const outputFile = path.join(outDir, `${safeName}${ext}`);
3077
+ const schemaBefore = await getSchemaSnapshot(sourcePath);
3078
+ const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
3079
+ const schemaAfter = await getSchemaSnapshot(result.output_path);
3080
+ const lineage = appendLineageVersion({
3081
+ datasetIdBase: datasetId,
3082
+ tool: "export_dataset",
3083
+ requestArgs: request.params.arguments,
3084
+ outputPath: result.output_path,
3085
+ output: {
3086
+ rows: result.rows,
3087
+ columns: result.columns,
3088
+ format: requestedFormat,
3089
+ size_mb: result.file_size_mb,
3090
+ schema_before: schemaBefore,
3091
+ schema_after: schemaAfter,
3092
+ },
3093
+ steps: [
3094
+ { step: "source_resolved", at: new Date().toISOString(), params: { sourcePath } },
3095
+ { step: "exported", at: new Date().toISOString(), params: { format: requestedFormat, compression }, metrics: { rows: result.rows, columns: result.columns } },
3096
+ ],
3097
+ });
3098
+ try {
3099
+ upsertRegistry(lineage.datasetVersionId, result.output_path, "completed");
3100
+ }
3101
+ catch { }
3102
+ // Build rich response
3103
+ let msg = `**Export complete**\n`;
3104
+ msg += `- **File**: ${result.output_path}\n`;
3105
+ msg += `- **Version**: ${lineage.datasetVersionId}\n`;
3106
+ msg += `- **Lineage**: ${lineage.lineagePath}\n`;
3107
+ msg += `- **Format**: ${result.format}${result.compression ? ` (${result.compression})` : ""}\n`;
3108
+ msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
3109
+ if (result.file_size_mb !== undefined)
3110
+ msg += `- **Size**: ${result.file_size_mb} MB\n`;
3111
+ if (result.elapsed_seconds !== undefined)
3112
+ msg += `- **Time**: ${result.elapsed_seconds}s\n`;
3113
+ if (result.preview_path)
3114
+ msg += `- **Preview**: ${result.preview_path}\n`;
3115
+ msg += `\n`;
3116
+ if (requestedFormat === "feather") {
3117
+ msg += `**Inspect with:**\n`;
3118
+ msg += ` Python: \`pd.read_feather('${result.output_path}').head()\`\n`;
3119
+ msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
3120
+ }
3121
+ else if (requestedFormat === "parquet") {
3122
+ msg += `**Inspect with:**\n`;
3123
+ msg += ` Python: \`pd.read_parquet('${result.output_path}').head()\`\n`;
3124
+ msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
3125
+ }
3126
+ cleanupIntermediateArtifacts(intermediateArtifacts, result.output_path);
3127
+ return { content: [{ type: "text", text: msg }] };
3128
+ }
3129
+ catch (error) {
3130
+ return {
3131
+ content: [{ type: "text", text: `ERROR: Export failed: ${error.message}` }],
3132
+ isError: true
3133
+ };
3285
3134
  }
3286
- catch { }
3287
- let msg = `**Schema normalization complete**\n`;
3288
- msg += `- **Input**: ${filePath}\n`;
3289
- msg += `- **Output**: ${result.output_path}\n`;
3290
- msg += `- **Version**: ${lineage.datasetVersionId}\n`;
3291
- msg += `- **Lineage**: ${lineage.lineagePath}\n`;
3292
- msg += `- **Rows**: ${result.rows?.toLocaleString?.() ?? result.rows}\n`;
3293
- msg += `- **Columns**: ${result.columns}\n`;
3294
- msg += `- **Flattened keys**: ${result.flattened_keys}\n`;
3295
- msg += `- **Extras mode**: ${result.extras_mode}\n`;
3296
- if (result.extras_rows !== undefined)
3297
- msg += `- **Rows with extras**: ${result.extras_rows}\n`;
3298
- msg += `\nNext: run \`vesper_convert_format\` on the output to convert to parquet.\n`;
3299
- return { content: [{ type: "text", text: msg }] };
3300
- }
3301
- catch (error) {
3302
- return { content: [{ type: "text", text: `ERROR: Schema normalization failed: ${error.message}` }], isError: true };
3303
3135
  }
3304
- }
3305
- case "fuse":
3306
- case "fuse_datasets": {
3307
- const operation = request.params.name === "fuse_datasets"
3308
- ? "tabular"
3309
- : String(request.params.arguments?.operation || "tabular").toLowerCase();
3310
- if (operation === "web") {
3311
- hydrateExternalKeys();
3312
- const webSources = Array.isArray(request.params.arguments?.sources)
3313
- ? request.params.arguments?.sources
3314
- : undefined;
3315
- if (!webSources || !Array.isArray(webSources)) {
3136
+ case "vesper_list_datasets": {
3137
+ const entries = readRegistry();
3138
+ if (entries.length === 0) {
3316
3139
  return {
3317
- content: [{ type: "text", text: "ERROR: fuse(operation='web') requires 'sources' array." }],
3318
- isError: true,
3140
+ content: [{ type: "text", text: "No prepared datasets found in the Vesper registry. Use prepare_dataset or download_dataset to add datasets." }]
3319
3141
  };
3320
3142
  }
3321
- const mergeStrategyRaw = request.params.arguments?.merge_strategy
3322
- ? String(request.params.arguments?.merge_strategy).toLowerCase()
3323
- : undefined;
3324
- const dedupRaw = request.params.arguments?.deduplication
3325
- ? String(request.params.arguments?.deduplication).toLowerCase()
3326
- : undefined;
3327
- const merge_strategy = mergeStrategyRaw && ["union", "dedup"].includes(mergeStrategyRaw)
3328
- ? mergeStrategyRaw
3329
- : undefined;
3330
- const deduplication = dedupRaw && ["semantic", "exact", "none"].includes(dedupRaw)
3331
- ? dedupRaw
3332
- : undefined;
3333
- const webResult = await webFusionEngine.fuse({
3334
- sources: webSources.map((s) => ({
3335
- type: String(s?.type || "").trim().toLowerCase(),
3336
- query: String(s?.query || "").trim(),
3337
- max_results: s?.max_results !== undefined ? Number(s.max_results) : undefined,
3338
- min_stars: s?.min_stars !== undefined ? Number(s.min_stars) : undefined,
3339
- bucket: s?.bucket !== undefined ? String(s.bucket) : undefined,
3340
- path: s?.path !== undefined ? String(s.path) : undefined,
3341
- region: s?.region !== undefined ? String(s.region) : undefined,
3342
- credentials: s?.credentials ? {
3343
- accessKeyId: s.credentials.accessKeyId !== undefined ? String(s.credentials.accessKeyId) : undefined,
3344
- secretAccessKey: s.credentials.secretAccessKey !== undefined ? String(s.credentials.secretAccessKey) : undefined,
3345
- sessionToken: s.credentials.sessionToken !== undefined ? String(s.credentials.sessionToken) : undefined,
3346
- roleArn: s.credentials.roleArn !== undefined ? String(s.credentials.roleArn) : undefined,
3347
- } : undefined,
3348
- })),
3349
- merge_strategy,
3350
- deduplication,
3143
+ const lines = entries.map((e, i) => {
3144
+ const id = e.dataset_id || e.id || "unknown";
3145
+ const localPath = e.local_path || e.path || "unknown";
3146
+ const exists = typeof localPath === "string" && localPath !== "unknown" && fs.existsSync(localPath);
3147
+ return `${i + 1}. **${id}**\n Path: ${localPath}\n Status: ${e.status || "unknown"}${exists ? "" : " (file missing)"}`;
3351
3148
  });
3352
3149
  return {
3353
- content: [{ type: "text", text: JSON.stringify(webResult, null, 2) }],
3150
+ content: [{ type: "text", text: `**Vesper Registry** (${entries.length} dataset${entries.length !== 1 ? "s" : ""}):\n\n${lines.join("\n\n")}` }]
3354
3151
  };
3355
3152
  }
3356
- const rawSources = request.params.arguments?.sources;
3357
- if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
3358
- throw new McpError(ErrorCode.InvalidParams, "sources must contain at least 2 dataset IDs/paths");
3359
- }
3360
- const strategy = request.params.arguments?.strategy || "concat";
3361
- const joinOn = request.params.arguments?.join_on;
3362
- const how = request.params.arguments?.how || "inner";
3363
- const dedup = request.params.arguments?.dedup !== false;
3364
- const runQualityAfter = request.params.arguments?.run_quality_after !== false;
3365
- const leakageCheck = request.params.arguments?.leakage_check !== false;
3366
- const outputFormat = request.params.arguments?.output_format || "feather";
3367
- const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
3368
- const preview = request.params.arguments?.preview !== false;
3369
- const resolvedPaths = [];
3370
- const unresolved = [];
3371
- for (const src of rawSources) {
3372
- if (fs.existsSync(src)) {
3373
- resolvedPaths.push(src);
3374
- continue;
3153
+ case "vesper_convert_format": {
3154
+ const filePath = String(request.params.arguments?.file_path || "").trim();
3155
+ const targetFormat = String(request.params.arguments?.target_format || "").trim().toLowerCase();
3156
+ if (!filePath) {
3157
+ throw new McpError(ErrorCode.InvalidParams, "file_path is required");
3375
3158
  }
3376
- const status = metadataStore.getDownloadStatus(src);
3377
- if (status?.local_path && fs.existsSync(status.local_path)) {
3378
- resolvedPaths.push(status.local_path);
3379
- continue;
3159
+ if (!["csv", "parquet", "json", "jsonl"].includes(targetFormat)) {
3160
+ throw new McpError(ErrorCode.InvalidParams, "target_format must be one of: csv, parquet, json, jsonl");
3161
+ }
3162
+ if (!fs.existsSync(filePath)) {
3163
+ return {
3164
+ content: [{ type: "text", text: `ERROR: File not found: ${filePath}` }],
3165
+ isError: true,
3166
+ };
3167
+ }
3168
+ const inputExt = path.extname(filePath).toLowerCase();
3169
+ const extMap = { csv: ".csv", parquet: ".parquet", json: ".json", jsonl: ".jsonl" };
3170
+ const outputExt = extMap[targetFormat];
3171
+ if (inputExt === outputExt) {
3172
+ return {
3173
+ content: [{ type: "text", text: `File is already in ${targetFormat} format: ${filePath}` }],
3174
+ };
3175
+ }
3176
+ const outputPath = filePath.replace(/\.[^.]+$/, outputExt);
3177
+ try {
3178
+ await ensurePythonModules([{ module: "polars", packageName: "polars" }]);
3179
+ const convertScript = path.join(dataRoot, "python", "convert_engine.py");
3180
+ const schemaBefore = await getSchemaSnapshot(filePath);
3181
+ const result = await runPythonJson(convertScript, [filePath, outputPath]);
3182
+ const schemaAfter = await getSchemaSnapshot(outputPath);
3183
+ if (!result.ok) {
3184
+ return {
3185
+ content: [{ type: "text", text: `ERROR: Conversion failed: ${result.error}` }],
3186
+ isError: true,
3187
+ };
3188
+ }
3189
+ // Register converted file in the registry
3190
+ const datasetId = path.basename(outputPath, outputExt);
3191
+ try {
3192
+ upsertRegistry(datasetId, outputPath, "completed");
3193
+ }
3194
+ catch (e) {
3195
+ console.error(`[Convert] Registry write failed: ${e?.message || e}`);
3196
+ }
3197
+ const lineage = appendLineageVersion({
3198
+ datasetIdBase: datasetId,
3199
+ tool: "vesper_convert_format",
3200
+ requestArgs: request.params.arguments,
3201
+ outputPath,
3202
+ output: {
3203
+ rows: result.rows,
3204
+ columns: result.columns,
3205
+ format: targetFormat,
3206
+ size_mb: result.size_mb,
3207
+ schema_before: schemaBefore,
3208
+ schema_after: schemaAfter,
3209
+ },
3210
+ steps: [
3211
+ { step: "converted", at: new Date().toISOString(), params: { from: inputExt, to: outputExt } },
3212
+ ],
3213
+ });
3214
+ try {
3215
+ upsertRegistry(lineage.datasetVersionId, outputPath, "completed");
3216
+ }
3217
+ catch { }
3218
+ let msg = `**Conversion complete**\n`;
3219
+ msg += `- **Input**: ${filePath} (${inputExt.slice(1)})\n`;
3220
+ msg += `- **Output**: ${result.output_path} (${targetFormat})\n`;
3221
+ msg += `- **Version**: ${lineage.datasetVersionId}\n`;
3222
+ msg += `- **Lineage**: ${lineage.lineagePath}\n`;
3223
+ msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
3224
+ if (result.size_mb !== undefined)
3225
+ msg += `- **Size**: ${result.size_mb} MB\n`;
3226
+ return { content: [{ type: "text", text: msg }] };
3227
+ }
3228
+ catch (error) {
3229
+ return {
3230
+ content: [{ type: "text", text: `ERROR: Conversion failed: ${error.message}` }],
3231
+ isError: true,
3232
+ };
3380
3233
  }
3381
- unresolved.push(src);
3382
- }
3383
- if (unresolved.length > 0) {
3384
- return {
3385
- content: [{
3386
- type: "text",
3387
- text: `ERROR: Could not resolve these sources to local files: ${unresolved.join(", ")}. Provide local paths or run prepare_dataset first.`
3388
- }],
3389
- isError: true
3390
- };
3391
3234
  }
3392
- try {
3393
- const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow" };
3394
- const ext = extMap[outputFormat] || ".feather";
3395
- const outDir = process.cwd();
3235
+ case "vesper_normalize_schema": {
3236
+ const filePath = String(request.params.arguments?.file_path || "").trim();
3237
+ const outputFormat = String(request.params.arguments?.output_format || "jsonl").trim().toLowerCase();
3238
+ const outputDirRaw = request.params.arguments?.output_dir ? String(request.params.arguments.output_dir).trim() : "";
3239
+ const flattenMetadataJson = request.params.arguments?.flatten_metadata_json !== false;
3240
+ const maxKeys = Number(request.params.arguments?.max_keys ?? 200);
3241
+ const extrasMode = String(request.params.arguments?.extras_mode || "blob").trim().toLowerCase();
3242
+ if (!filePath) {
3243
+ throw new McpError(ErrorCode.InvalidParams, "file_path is required");
3244
+ }
3245
+ if (!["jsonl", "json"].includes(outputFormat)) {
3246
+ throw new McpError(ErrorCode.InvalidParams, "output_format must be one of: jsonl, json");
3247
+ }
3248
+ if (!fs.existsSync(filePath)) {
3249
+ return { content: [{ type: "text", text: `ERROR: File not found: ${filePath}` }], isError: true };
3250
+ }
3251
+ const outDir = outputDirRaw || path.join(dataRoot, "data", "normalized_schema");
3396
3252
  if (!fs.existsSync(outDir))
3397
3253
  fs.mkdirSync(outDir, { recursive: true });
3398
- const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
3399
- console.error(`[Fusion] Resolved output directory: ${outDir}`);
3400
- const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
3401
- strategy,
3402
- join_on: joinOn,
3403
- how,
3404
- dedup,
3405
- run_quality_after: runQualityAfter,
3406
- leakage_check: leakageCheck,
3407
- output_format: outputFormat,
3408
- compression: compression,
3409
- preview,
3410
- });
3411
- const nullDelta = result.stats.null_delta;
3412
- const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
3413
- // Register fused dataset under a generated id so users can export it easily
3414
- const fusedId = `fused_${Date.now()}`;
3254
+ const baseName = path.parse(filePath).name || `normalized_${Date.now()}`;
3255
+ const outputPath = path.join(outDir, `${baseName}.normalized.${outputFormat}`);
3415
3256
  try {
3416
- upsertRegistry(fusedId, result.output_path, "completed");
3257
+ const scriptPath = path.join(dataRoot, "python", "normalize_schema_engine.py");
3258
+ const options = {
3259
+ flatten_metadata_json: !!flattenMetadataJson,
3260
+ max_keys: Number.isFinite(maxKeys) ? maxKeys : 200,
3261
+ extras_mode: ["blob", "drop"].includes(extrasMode) ? extrasMode : "blob",
3262
+ };
3263
+ const schemaBefore = await getSchemaSnapshot(filePath);
3264
+ const result = await runPythonJson(scriptPath, [filePath, outputPath, JSON.stringify(options)]);
3265
+ const schemaAfter = await getSchemaSnapshot(outputPath);
3266
+ if (!result.ok) {
3267
+ return { content: [{ type: "text", text: `ERROR: Schema normalization failed: ${result.error}` }], isError: true };
3268
+ }
3269
+ // Register normalized file to make follow-up conversion easier.
3270
+ try {
3271
+ const datasetId = path.basename(outputPath, path.extname(outputPath));
3272
+ upsertRegistry(datasetId, outputPath, "completed");
3273
+ }
3274
+ catch (e) {
3275
+ console.error(`[NormalizeSchema] Registry write failed: ${e?.message || e}`);
3276
+ }
3277
+ const lineage = appendLineageVersion({
3278
+ datasetIdBase: path.basename(outputPath, path.extname(outputPath)),
3279
+ tool: "vesper_normalize_schema",
3280
+ requestArgs: request.params.arguments,
3281
+ outputPath,
3282
+ output: {
3283
+ rows: result.rows,
3284
+ columns: result.columns,
3285
+ format: outputFormat,
3286
+ schema_before: schemaBefore,
3287
+ schema_after: schemaAfter,
3288
+ },
3289
+ steps: [
3290
+ { step: "schema_normalized", at: new Date().toISOString(), params: options, metrics: { flattened_keys: result.flattened_keys } },
3291
+ ],
3292
+ });
3293
+ try {
3294
+ upsertRegistry(lineage.datasetVersionId, outputPath, "completed");
3295
+ }
3296
+ catch { }
3297
+ let msg = `**Schema normalization complete**\n`;
3298
+ msg += `- **Input**: ${filePath}\n`;
3299
+ msg += `- **Output**: ${result.output_path}\n`;
3300
+ msg += `- **Version**: ${lineage.datasetVersionId}\n`;
3301
+ msg += `- **Lineage**: ${lineage.lineagePath}\n`;
3302
+ msg += `- **Rows**: ${result.rows?.toLocaleString?.() ?? result.rows}\n`;
3303
+ msg += `- **Columns**: ${result.columns}\n`;
3304
+ msg += `- **Flattened keys**: ${result.flattened_keys}\n`;
3305
+ msg += `- **Extras mode**: ${result.extras_mode}\n`;
3306
+ if (result.extras_rows !== undefined)
3307
+ msg += `- **Rows with extras**: ${result.extras_rows}\n`;
3308
+ msg += `\nNext: run \`vesper_convert_format\` on the output to convert to parquet.\n`;
3309
+ return { content: [{ type: "text", text: msg }] };
3417
3310
  }
3418
- catch (e) {
3419
- console.error(`[Registry] Failed to register fused dataset ${fusedId}: ${e?.message || e}`);
3311
+ catch (error) {
3312
+ return { content: [{ type: "text", text: `ERROR: Schema normalization failed: ${error.message}` }], isError: true };
3420
3313
  }
3421
- const inputSchemaSnapshots = await Promise.all(resolvedPaths.map((p) => getSchemaSnapshot(p)));
3422
- const schemaBefore = mergeSchemaSnapshots(inputSchemaSnapshots);
3423
- const schemaAfter = await getSchemaSnapshot(result.output_path);
3424
- const lineage = appendLineageVersion({
3425
- datasetIdBase: fusedId,
3426
- tool: "fuse_datasets",
3427
- requestArgs: request.params.arguments,
3428
- outputPath: result.output_path,
3429
- output: {
3430
- rows: result.stats.rows_after,
3431
- format: outputFormat,
3432
- schema_before: schemaBefore,
3433
- schema_after: schemaAfter,
3434
- },
3435
- sources: resolvedPaths.map((p) => ({ source: "local", url: p, at: new Date().toISOString() })),
3436
- steps: [
3437
- { step: "fused", at: new Date().toISOString(), params: { strategy, dedup, how }, metrics: { rows_before: result.stats.rows_before, rows_after: result.stats.rows_after, duplicates_removed: result.stats.duplicates_removed } },
3438
- ],
3439
- });
3440
- try {
3441
- upsertRegistry(lineage.datasetVersionId, result.output_path, "completed");
3314
+ }
3315
+ case "fuse":
3316
+ case "fuse_datasets": {
3317
+ const operation = request.params.name === "fuse_datasets"
3318
+ ? "tabular"
3319
+ : String(request.params.arguments?.operation || "tabular").toLowerCase();
3320
+ if (operation === "web") {
3321
+ hydrateExternalKeys();
3322
+ const webSources = Array.isArray(request.params.arguments?.sources)
3323
+ ? request.params.arguments?.sources
3324
+ : undefined;
3325
+ if (!webSources || !Array.isArray(webSources)) {
3326
+ return {
3327
+ content: [{ type: "text", text: "ERROR: fuse(operation='web') requires 'sources' array." }],
3328
+ isError: true,
3329
+ };
3330
+ }
3331
+ const mergeStrategyRaw = request.params.arguments?.merge_strategy
3332
+ ? String(request.params.arguments?.merge_strategy).toLowerCase()
3333
+ : undefined;
3334
+ const dedupRaw = request.params.arguments?.deduplication
3335
+ ? String(request.params.arguments?.deduplication).toLowerCase()
3336
+ : undefined;
3337
+ const merge_strategy = mergeStrategyRaw && ["union", "dedup"].includes(mergeStrategyRaw)
3338
+ ? mergeStrategyRaw
3339
+ : undefined;
3340
+ const deduplication = dedupRaw && ["semantic", "exact", "none"].includes(dedupRaw)
3341
+ ? dedupRaw
3342
+ : undefined;
3343
+ const webResult = await webFusionEngine.fuse({
3344
+ sources: webSources.map((s) => ({
3345
+ type: String(s?.type || "").trim().toLowerCase(),
3346
+ query: String(s?.query || "").trim(),
3347
+ max_results: s?.max_results !== undefined ? Number(s.max_results) : undefined,
3348
+ min_stars: s?.min_stars !== undefined ? Number(s.min_stars) : undefined,
3349
+ bucket: s?.bucket !== undefined ? String(s.bucket) : undefined,
3350
+ path: s?.path !== undefined ? String(s.path) : undefined,
3351
+ region: s?.region !== undefined ? String(s.region) : undefined,
3352
+ credentials: s?.credentials ? {
3353
+ accessKeyId: s.credentials.accessKeyId !== undefined ? String(s.credentials.accessKeyId) : undefined,
3354
+ secretAccessKey: s.credentials.secretAccessKey !== undefined ? String(s.credentials.secretAccessKey) : undefined,
3355
+ sessionToken: s.credentials.sessionToken !== undefined ? String(s.credentials.sessionToken) : undefined,
3356
+ roleArn: s.credentials.roleArn !== undefined ? String(s.credentials.roleArn) : undefined,
3357
+ } : undefined,
3358
+ })),
3359
+ merge_strategy,
3360
+ deduplication,
3361
+ });
3362
+ return {
3363
+ content: [{ type: "text", text: JSON.stringify(webResult, null, 2) }],
3364
+ };
3365
+ }
3366
+ const rawSources = request.params.arguments?.sources;
3367
+ if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
3368
+ throw new McpError(ErrorCode.InvalidParams, "sources must contain at least 2 dataset IDs/paths");
3442
3369
  }
3443
- catch { }
3444
- let msg = `Fused ${result.stats.sources_count} sources -> ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).\n`;
3445
- msg += `- Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}\n`;
3446
- msg += `- Null change: ${nullText}\n`;
3447
- msg += `- Output: ${result.output_path}\n`;
3448
- msg += `- Version: ${lineage.datasetVersionId}\n`;
3449
- msg += `- Lineage: ${lineage.lineagePath}\n`;
3450
- if (result.preview_path)
3451
- msg += `- Preview: ${result.preview_path}\n`;
3452
- if (result.leakage_report) {
3453
- msg += `- Leakage: ${result.leakage_report.leakage_detected ? "detected" : "none"}`;
3454
- if (result.leakage_report.leakage_count) {
3455
- msg += ` (${result.leakage_report.leakage_count})`;
3370
+ const strategy = request.params.arguments?.strategy || "concat";
3371
+ const joinOn = request.params.arguments?.join_on;
3372
+ const how = request.params.arguments?.how || "inner";
3373
+ const dedup = request.params.arguments?.dedup !== false;
3374
+ const runQualityAfter = request.params.arguments?.run_quality_after !== false;
3375
+ const leakageCheck = request.params.arguments?.leakage_check !== false;
3376
+ const outputFormat = request.params.arguments?.output_format || "feather";
3377
+ const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
3378
+ const preview = request.params.arguments?.preview !== false;
3379
+ const resolvedPaths = [];
3380
+ const unresolved = [];
3381
+ for (const src of rawSources) {
3382
+ if (fs.existsSync(src)) {
3383
+ resolvedPaths.push(src);
3384
+ continue;
3385
+ }
3386
+ const status = metadataStore.getDownloadStatus(src);
3387
+ if (status?.local_path && fs.existsSync(status.local_path)) {
3388
+ resolvedPaths.push(status.local_path);
3389
+ continue;
3456
3390
  }
3457
- msg += "\n";
3391
+ unresolved.push(src);
3392
+ }
3393
+ if (unresolved.length > 0) {
3394
+ return {
3395
+ content: [{
3396
+ type: "text",
3397
+ text: `ERROR: Could not resolve these sources to local files: ${unresolved.join(", ")}. Provide local paths or run prepare_dataset first.`
3398
+ }],
3399
+ isError: true
3400
+ };
3401
+ }
3402
+ try {
3403
+ const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow" };
3404
+ const ext = extMap[outputFormat] || ".feather";
3405
+ const outDir = process.cwd();
3406
+ if (!fs.existsSync(outDir))
3407
+ fs.mkdirSync(outDir, { recursive: true });
3408
+ const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
3409
+ console.error(`[Fusion] Resolved output directory: ${outDir}`);
3410
+ const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
3411
+ strategy,
3412
+ join_on: joinOn,
3413
+ how,
3414
+ dedup,
3415
+ run_quality_after: runQualityAfter,
3416
+ leakage_check: leakageCheck,
3417
+ output_format: outputFormat,
3418
+ compression: compression,
3419
+ preview,
3420
+ });
3421
+ const nullDelta = result.stats.null_delta;
3422
+ const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
3423
+ // Register fused dataset under a generated id so users can export it easily
3424
+ const fusedId = `fused_${Date.now()}`;
3425
+ try {
3426
+ upsertRegistry(fusedId, result.output_path, "completed");
3427
+ }
3428
+ catch (e) {
3429
+ console.error(`[Registry] Failed to register fused dataset ${fusedId}: ${e?.message || e}`);
3430
+ }
3431
+ const inputSchemaSnapshots = await Promise.all(resolvedPaths.map((p) => getSchemaSnapshot(p)));
3432
+ const schemaBefore = mergeSchemaSnapshots(inputSchemaSnapshots);
3433
+ const schemaAfter = await getSchemaSnapshot(result.output_path);
3434
+ const lineage = appendLineageVersion({
3435
+ datasetIdBase: fusedId,
3436
+ tool: "fuse_datasets",
3437
+ requestArgs: request.params.arguments,
3438
+ outputPath: result.output_path,
3439
+ output: {
3440
+ rows: result.stats.rows_after,
3441
+ format: outputFormat,
3442
+ schema_before: schemaBefore,
3443
+ schema_after: schemaAfter,
3444
+ },
3445
+ sources: resolvedPaths.map((p) => ({ source: "local", url: p, at: new Date().toISOString() })),
3446
+ steps: [
3447
+ { step: "fused", at: new Date().toISOString(), params: { strategy, dedup, how }, metrics: { rows_before: result.stats.rows_before, rows_after: result.stats.rows_after, duplicates_removed: result.stats.duplicates_removed } },
3448
+ ],
3449
+ });
3450
+ try {
3451
+ upsertRegistry(lineage.datasetVersionId, result.output_path, "completed");
3452
+ }
3453
+ catch { }
3454
+ let msg = `Fused ${result.stats.sources_count} sources -> ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).\n`;
3455
+ msg += `- Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}\n`;
3456
+ msg += `- Null change: ${nullText}\n`;
3457
+ msg += `- Output: ${result.output_path}\n`;
3458
+ msg += `- Version: ${lineage.datasetVersionId}\n`;
3459
+ msg += `- Lineage: ${lineage.lineagePath}\n`;
3460
+ if (result.preview_path)
3461
+ msg += `- Preview: ${result.preview_path}\n`;
3462
+ if (result.leakage_report) {
3463
+ msg += `- Leakage: ${result.leakage_report.leakage_detected ? "detected" : "none"}`;
3464
+ if (result.leakage_report.leakage_count) {
3465
+ msg += ` (${result.leakage_report.leakage_count})`;
3466
+ }
3467
+ msg += "\n";
3468
+ }
3469
+ msg += `\nNext: run split_dataset/export_dataset on fused output. Registered fused dataset id: ${fusedId}`;
3470
+ return { content: [{ type: "text", text: msg }] };
3471
+ }
3472
+ catch (error) {
3473
+ return {
3474
+ content: [{ type: "text", text: `ERROR: Fusion failed: ${error.message}` }],
3475
+ isError: true
3476
+ };
3458
3477
  }
3459
- msg += `\nNext: run split_dataset/export_dataset on fused output. Registered fused dataset id: ${fusedId}`;
3460
- return { content: [{ type: "text", text: msg }] };
3461
- }
3462
- catch (error) {
3463
- return {
3464
- content: [{ type: "text", text: `ERROR: Fusion failed: ${error.message}` }],
3465
- isError: true
3466
- };
3467
3478
  }
3479
+ default:
3480
+ throw new McpError(ErrorCode.MethodNotFound, "Tool not found");
3468
3481
  }
3469
- default:
3470
- throw new McpError(ErrorCode.MethodNotFound, "Tool not found");
3471
- }
3482
+ })();
3483
+ void recordMcpToolAnalyticsAfterCall({
3484
+ toolName: String(request.params.name),
3485
+ args: request.params.arguments,
3486
+ result: toolResponse,
3487
+ }).catch((err) => console.error("[mcp-analytics]", err));
3488
+ return toolResponse;
3472
3489
  }); // end requestQueue.enqueue
3473
3490
  });
3474
3491
  async function main() {