@vespermcp/mcp-server 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +1327 -1318
- package/build/lib/mcp-analytics.js +164 -0
- package/build/lib/plan-resolve.js +10 -2
- package/mcp-config-template.json +2 -3
- package/package.json +2 -3
- package/scripts/postinstall.cjs +45 -9
- package/scripts/wizard.cjs +45 -7
- package/scripts/wizard.js +2 -2
- package/wizard.cjs +0 -0
package/build/index.js
CHANGED
|
@@ -468,6 +468,7 @@ import http from "http";
|
|
|
468
468
|
import https from "https";
|
|
469
469
|
import os from "os";
|
|
470
470
|
import { enforcePlanGateForTool } from "./lib/plan-resolve.js";
|
|
471
|
+
import { recordMcpToolAnalyticsAfterCall } from "./lib/mcp-analytics.js";
|
|
471
472
|
// Determine absolute paths relative to the compiled script
|
|
472
473
|
const __filename = fileURLToPath(import.meta.url);
|
|
473
474
|
const __dirname = path.dirname(__filename);
|
|
@@ -2009,1474 +2010,1482 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2009
2010
|
// Mark this step as complete
|
|
2010
2011
|
markStepComplete(String(datasetId), String(step));
|
|
2011
2012
|
}
|
|
2012
|
-
|
|
2013
|
-
|
|
2014
|
-
|
|
2015
|
-
|
|
2016
|
-
|
|
2017
|
-
|
|
2018
|
-
|
|
2019
|
-
|
|
2020
|
-
|
|
2021
|
-
|
|
2022
|
-
|
|
2023
|
-
|
|
2024
|
-
|
|
2025
|
-
|
|
2013
|
+
const toolResponse = await (async () => {
|
|
2014
|
+
switch (request.params.name) {
|
|
2015
|
+
case "lineage":
|
|
2016
|
+
case "get_lineage":
|
|
2017
|
+
case "diff_lineage_versions": {
|
|
2018
|
+
const operation = request.params.name === "get_lineage"
|
|
2019
|
+
? "get"
|
|
2020
|
+
: request.params.name === "diff_lineage_versions"
|
|
2021
|
+
? "diff"
|
|
2022
|
+
: String(request.params.arguments?.operation || "get").toLowerCase();
|
|
2023
|
+
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
2024
|
+
if (!datasetId) {
|
|
2025
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
2026
|
+
}
|
|
2027
|
+
if (operation === "get") {
|
|
2028
|
+
const base = toBaseDatasetId(datasetId);
|
|
2029
|
+
const record = readLineageRecord(base);
|
|
2030
|
+
if (!record.versions || record.versions.length === 0) {
|
|
2031
|
+
return {
|
|
2032
|
+
content: [{ type: "text", text: `No lineage found for '${datasetId}' yet.` }]
|
|
2033
|
+
};
|
|
2034
|
+
}
|
|
2035
|
+
return {
|
|
2036
|
+
content: [{ type: "text", text: JSON.stringify(record, null, 2) }]
|
|
2037
|
+
};
|
|
2038
|
+
}
|
|
2039
|
+
if (operation !== "diff") {
|
|
2040
|
+
throw new McpError(ErrorCode.InvalidParams, "operation must be 'get' or 'diff'");
|
|
2041
|
+
}
|
|
2042
|
+
const fromVersion = Number(request.params.arguments?.from_version);
|
|
2043
|
+
const toVersion = Number(request.params.arguments?.to_version);
|
|
2044
|
+
if (!Number.isInteger(fromVersion) || fromVersion <= 0) {
|
|
2045
|
+
throw new McpError(ErrorCode.InvalidParams, "from_version must be a positive integer");
|
|
2046
|
+
}
|
|
2047
|
+
if (!Number.isInteger(toVersion) || toVersion <= 0) {
|
|
2048
|
+
throw new McpError(ErrorCode.InvalidParams, "to_version must be a positive integer");
|
|
2049
|
+
}
|
|
2026
2050
|
const base = toBaseDatasetId(datasetId);
|
|
2027
2051
|
const record = readLineageRecord(base);
|
|
2028
|
-
|
|
2052
|
+
const fromV = record.versions.find((v) => v.version === fromVersion);
|
|
2053
|
+
const toV = record.versions.find((v) => v.version === toVersion);
|
|
2054
|
+
if (!fromV || !toV) {
|
|
2029
2055
|
return {
|
|
2030
|
-
content: [{ type: "text", text: `No lineage found for '${datasetId}' yet.` }]
|
|
2056
|
+
content: [{ type: "text", text: `ERROR: Could not find both versions in lineage for '${datasetId}'.` }],
|
|
2057
|
+
isError: true,
|
|
2031
2058
|
};
|
|
2032
2059
|
}
|
|
2060
|
+
const fromSchema = (toV.output?.schema_before && toVersion > fromVersion)
|
|
2061
|
+
? fromV.output?.schema_after || fromV.output?.schema_before || {}
|
|
2062
|
+
: fromV.output?.schema_after || fromV.output?.schema_before || {};
|
|
2063
|
+
const toSchema = toV.output?.schema_after || toV.output?.schema_before || {};
|
|
2064
|
+
const fromCols = Array.isArray(fromSchema.columns) ? fromSchema.columns.map((c) => String(c)) : [];
|
|
2065
|
+
const toCols = Array.isArray(toSchema.columns) ? toSchema.columns.map((c) => String(c)) : [];
|
|
2066
|
+
const fromDtypes = (fromSchema.dtypes && typeof fromSchema.dtypes === "object") ? fromSchema.dtypes : {};
|
|
2067
|
+
const toDtypes = (toSchema.dtypes && typeof toSchema.dtypes === "object") ? toSchema.dtypes : {};
|
|
2068
|
+
const schemaDiff = diffSchemaMaps(fromCols, toCols, fromDtypes, toDtypes);
|
|
2069
|
+
const fromRows = typeof fromSchema.rows === "number" ? fromSchema.rows : (typeof fromV.output?.rows === "number" ? fromV.output.rows : undefined);
|
|
2070
|
+
const toRows = typeof toSchema.rows === "number" ? toSchema.rows : (typeof toV.output?.rows === "number" ? toV.output.rows : undefined);
|
|
2071
|
+
const fromSteps = new Set((fromV.steps || []).map((s) => String(s.step)));
|
|
2072
|
+
const toSteps = new Set((toV.steps || []).map((s) => String(s.step)));
|
|
2033
2073
|
return {
|
|
2034
|
-
content: [{ type: "text", text: JSON.stringify(record, null, 2) }]
|
|
2035
|
-
|
|
2036
|
-
|
|
2037
|
-
|
|
2038
|
-
|
|
2039
|
-
|
|
2040
|
-
|
|
2041
|
-
|
|
2042
|
-
|
|
2043
|
-
|
|
2044
|
-
|
|
2045
|
-
if (!Number.isInteger(toVersion) || toVersion <= 0) {
|
|
2046
|
-
throw new McpError(ErrorCode.InvalidParams, "to_version must be a positive integer");
|
|
2047
|
-
}
|
|
2048
|
-
const base = toBaseDatasetId(datasetId);
|
|
2049
|
-
const record = readLineageRecord(base);
|
|
2050
|
-
const fromV = record.versions.find((v) => v.version === fromVersion);
|
|
2051
|
-
const toV = record.versions.find((v) => v.version === toVersion);
|
|
2052
|
-
if (!fromV || !toV) {
|
|
2053
|
-
return {
|
|
2054
|
-
content: [{ type: "text", text: `ERROR: Could not find both versions in lineage for '${datasetId}'.` }],
|
|
2055
|
-
isError: true,
|
|
2056
|
-
};
|
|
2057
|
-
}
|
|
2058
|
-
const fromSchema = (toV.output?.schema_before && toVersion > fromVersion)
|
|
2059
|
-
? fromV.output?.schema_after || fromV.output?.schema_before || {}
|
|
2060
|
-
: fromV.output?.schema_after || fromV.output?.schema_before || {};
|
|
2061
|
-
const toSchema = toV.output?.schema_after || toV.output?.schema_before || {};
|
|
2062
|
-
const fromCols = Array.isArray(fromSchema.columns) ? fromSchema.columns.map((c) => String(c)) : [];
|
|
2063
|
-
const toCols = Array.isArray(toSchema.columns) ? toSchema.columns.map((c) => String(c)) : [];
|
|
2064
|
-
const fromDtypes = (fromSchema.dtypes && typeof fromSchema.dtypes === "object") ? fromSchema.dtypes : {};
|
|
2065
|
-
const toDtypes = (toSchema.dtypes && typeof toSchema.dtypes === "object") ? toSchema.dtypes : {};
|
|
2066
|
-
const schemaDiff = diffSchemaMaps(fromCols, toCols, fromDtypes, toDtypes);
|
|
2067
|
-
const fromRows = typeof fromSchema.rows === "number" ? fromSchema.rows : (typeof fromV.output?.rows === "number" ? fromV.output.rows : undefined);
|
|
2068
|
-
const toRows = typeof toSchema.rows === "number" ? toSchema.rows : (typeof toV.output?.rows === "number" ? toV.output.rows : undefined);
|
|
2069
|
-
const fromSteps = new Set((fromV.steps || []).map((s) => String(s.step)));
|
|
2070
|
-
const toSteps = new Set((toV.steps || []).map((s) => String(s.step)));
|
|
2071
|
-
return {
|
|
2072
|
-
content: [{
|
|
2073
|
-
type: "text",
|
|
2074
|
-
text: JSON.stringify({
|
|
2075
|
-
dataset_id_base: base,
|
|
2076
|
-
from_version: fromVersion,
|
|
2077
|
-
to_version: toVersion,
|
|
2078
|
-
schema_diff: schemaDiff,
|
|
2079
|
-
row_count_delta: {
|
|
2080
|
-
from: fromRows,
|
|
2081
|
-
to: toRows,
|
|
2082
|
-
delta: (typeof fromRows === "number" && typeof toRows === "number") ? (toRows - fromRows) : undefined,
|
|
2083
|
-
},
|
|
2084
|
-
steps_diff: {
|
|
2085
|
-
added: Array.from(toSteps).filter((s) => !fromSteps.has(s)),
|
|
2086
|
-
removed: Array.from(fromSteps).filter((s) => !toSteps.has(s)),
|
|
2087
|
-
from_steps: Array.from(fromSteps),
|
|
2088
|
-
to_steps: Array.from(toSteps),
|
|
2089
|
-
},
|
|
2090
|
-
actor_diff: {
|
|
2091
|
-
changed: String(fromV.triggered_by?.agent_id || "") !== String(toV.triggered_by?.agent_id || "") ||
|
|
2092
|
-
String(fromV.triggered_by?.pipeline_id || "") !== String(toV.triggered_by?.pipeline_id || ""),
|
|
2093
|
-
from: {
|
|
2094
|
-
tool: fromV.triggered_by?.tool,
|
|
2095
|
-
agent_id: fromV.triggered_by?.agent_id,
|
|
2096
|
-
pipeline_id: fromV.triggered_by?.pipeline_id,
|
|
2074
|
+
content: [{
|
|
2075
|
+
type: "text",
|
|
2076
|
+
text: JSON.stringify({
|
|
2077
|
+
dataset_id_base: base,
|
|
2078
|
+
from_version: fromVersion,
|
|
2079
|
+
to_version: toVersion,
|
|
2080
|
+
schema_diff: schemaDiff,
|
|
2081
|
+
row_count_delta: {
|
|
2082
|
+
from: fromRows,
|
|
2083
|
+
to: toRows,
|
|
2084
|
+
delta: (typeof fromRows === "number" && typeof toRows === "number") ? (toRows - fromRows) : undefined,
|
|
2097
2085
|
},
|
|
2098
|
-
|
|
2099
|
-
|
|
2100
|
-
|
|
2101
|
-
|
|
2086
|
+
steps_diff: {
|
|
2087
|
+
added: Array.from(toSteps).filter((s) => !fromSteps.has(s)),
|
|
2088
|
+
removed: Array.from(fromSteps).filter((s) => !toSteps.has(s)),
|
|
2089
|
+
from_steps: Array.from(fromSteps),
|
|
2090
|
+
to_steps: Array.from(toSteps),
|
|
2102
2091
|
},
|
|
2103
|
-
|
|
2104
|
-
|
|
2105
|
-
|
|
2106
|
-
|
|
2107
|
-
|
|
2108
|
-
|
|
2109
|
-
|
|
2110
|
-
|
|
2111
|
-
|
|
2112
|
-
|
|
2113
|
-
|
|
2114
|
-
|
|
2115
|
-
|
|
2116
|
-
|
|
2117
|
-
|
|
2118
|
-
|
|
2119
|
-
limit,
|
|
2120
|
-
arxiv_full_text: request.params.arguments?.arxiv_full_text === true,
|
|
2121
|
-
github_include_readme: request.params.arguments?.github_include_readme === true,
|
|
2122
|
-
});
|
|
2123
|
-
try {
|
|
2124
|
-
appendLineageVersion({
|
|
2125
|
-
datasetIdBase: `webfind_${query || "query"}`,
|
|
2126
|
-
tool: "vesper_web_find",
|
|
2127
|
-
requestArgs: request.params.arguments,
|
|
2128
|
-
output: {
|
|
2129
|
-
rows: Array.isArray(result.results) ? result.results.length : undefined,
|
|
2130
|
-
},
|
|
2131
|
-
sources: Array.isArray(result.results)
|
|
2132
|
-
? result.results.slice(0, 200).map((r) => ({
|
|
2133
|
-
source: String(r?.source_type || "unknown"),
|
|
2134
|
-
url: typeof r?.source_url === "string" ? r.source_url : undefined,
|
|
2135
|
-
at: typeof r?.collected_at === "string" ? r.collected_at : undefined,
|
|
2136
|
-
}))
|
|
2137
|
-
: [],
|
|
2138
|
-
steps: [
|
|
2139
|
-
{ step: "web_find_discover", at: new Date().toISOString(), params: { query, sources, limit } },
|
|
2140
|
-
{ step: "web_find_complete", at: new Date().toISOString(), metrics: { result_count: Array.isArray(result.results) ? result.results.length : 0 } },
|
|
2141
|
-
],
|
|
2142
|
-
});
|
|
2143
|
-
}
|
|
2144
|
-
catch (e) {
|
|
2145
|
-
console.error(`[Lineage] vesper_web_find append failed: ${e?.message || e}`);
|
|
2146
|
-
}
|
|
2147
|
-
return {
|
|
2148
|
-
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
2149
|
-
};
|
|
2150
|
-
}
|
|
2151
|
-
catch (error) {
|
|
2152
|
-
return {
|
|
2153
|
-
content: [{ type: "text", text: `ERROR: web_find failed: ${error.message}` }],
|
|
2154
|
-
isError: true,
|
|
2155
|
-
};
|
|
2156
|
-
}
|
|
2157
|
-
}
|
|
2158
|
-
case "vesper.fuse": {
|
|
2159
|
-
hydrateExternalKeys();
|
|
2160
|
-
const sources = Array.isArray(request.params.arguments?.sources)
|
|
2161
|
-
? request.params.arguments?.sources
|
|
2162
|
-
: undefined;
|
|
2163
|
-
if (!sources || !Array.isArray(sources)) {
|
|
2164
|
-
return {
|
|
2165
|
-
content: [{ type: "text", text: "ERROR: vesper.fuse requires 'sources' array." }],
|
|
2166
|
-
isError: true,
|
|
2092
|
+
actor_diff: {
|
|
2093
|
+
changed: String(fromV.triggered_by?.agent_id || "") !== String(toV.triggered_by?.agent_id || "") ||
|
|
2094
|
+
String(fromV.triggered_by?.pipeline_id || "") !== String(toV.triggered_by?.pipeline_id || ""),
|
|
2095
|
+
from: {
|
|
2096
|
+
tool: fromV.triggered_by?.tool,
|
|
2097
|
+
agent_id: fromV.triggered_by?.agent_id,
|
|
2098
|
+
pipeline_id: fromV.triggered_by?.pipeline_id,
|
|
2099
|
+
},
|
|
2100
|
+
to: {
|
|
2101
|
+
tool: toV.triggered_by?.tool,
|
|
2102
|
+
agent_id: toV.triggered_by?.agent_id,
|
|
2103
|
+
pipeline_id: toV.triggered_by?.pipeline_id,
|
|
2104
|
+
},
|
|
2105
|
+
},
|
|
2106
|
+
}, null, 2),
|
|
2107
|
+
}],
|
|
2167
2108
|
};
|
|
2168
2109
|
}
|
|
2169
|
-
|
|
2170
|
-
|
|
2171
|
-
|
|
2172
|
-
|
|
2173
|
-
const dedupRaw = request.params.arguments?.deduplication
|
|
2174
|
-
? String(request.params.arguments?.deduplication).toLowerCase()
|
|
2175
|
-
: undefined;
|
|
2176
|
-
const merge_strategy = mergeStrategyRaw && ["union", "dedup"].includes(mergeStrategyRaw)
|
|
2177
|
-
? mergeStrategyRaw
|
|
2178
|
-
: undefined;
|
|
2179
|
-
const deduplication = dedupRaw && ["semantic", "exact", "none"].includes(dedupRaw)
|
|
2180
|
-
? dedupRaw
|
|
2110
|
+
case "vesper_web_find": {
|
|
2111
|
+
hydrateExternalKeys();
|
|
2112
|
+
const query = String(request.params.arguments?.query || "").trim();
|
|
2113
|
+
const limit = Number(request.params.arguments?.limit || 10);
|
|
2114
|
+
const sources = Array.isArray(request.params.arguments?.sources)
|
|
2115
|
+
? (request.params.arguments?.sources).map(s => String(s).trim().toLowerCase()).filter(Boolean)
|
|
2181
2116
|
: undefined;
|
|
2182
|
-
|
|
2183
|
-
|
|
2184
|
-
type: String(s?.type || "").trim().toLowerCase(),
|
|
2185
|
-
query: String(s?.query || "").trim(),
|
|
2186
|
-
max_results: s?.max_results !== undefined ? Number(s.max_results) : undefined,
|
|
2187
|
-
min_stars: s?.min_stars !== undefined ? Number(s.min_stars) : undefined,
|
|
2188
|
-
bucket: s?.bucket !== undefined ? String(s.bucket) : undefined,
|
|
2189
|
-
path: s?.path !== undefined ? String(s.path) : undefined,
|
|
2190
|
-
region: s?.region !== undefined ? String(s.region) : undefined,
|
|
2191
|
-
credentials: s?.credentials ? {
|
|
2192
|
-
accessKeyId: s.credentials.accessKeyId !== undefined ? String(s.credentials.accessKeyId) : undefined,
|
|
2193
|
-
secretAccessKey: s.credentials.secretAccessKey !== undefined ? String(s.credentials.secretAccessKey) : undefined,
|
|
2194
|
-
sessionToken: s.credentials.sessionToken !== undefined ? String(s.credentials.sessionToken) : undefined,
|
|
2195
|
-
roleArn: s.credentials.roleArn !== undefined ? String(s.credentials.roleArn) : undefined,
|
|
2196
|
-
} : undefined,
|
|
2197
|
-
})),
|
|
2198
|
-
merge_strategy,
|
|
2199
|
-
deduplication,
|
|
2200
|
-
});
|
|
2201
|
-
return {
|
|
2202
|
-
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
2203
|
-
};
|
|
2204
|
-
}
|
|
2205
|
-
catch (error) {
|
|
2206
|
-
return {
|
|
2207
|
-
content: [{ type: "text", text: `ERROR: vesper.fuse failed: ${error.message}` }],
|
|
2208
|
-
isError: true,
|
|
2209
|
-
};
|
|
2210
|
-
}
|
|
2211
|
-
}
|
|
2212
|
-
case "vesper.extract_web": {
|
|
2213
|
-
hydrateExternalKeys();
|
|
2214
|
-
const url = String(request.params.arguments?.url || "").trim();
|
|
2215
|
-
const mode = request.params.arguments?.mode
|
|
2216
|
-
? String(request.params.arguments?.mode).trim().toLowerCase()
|
|
2217
|
-
: "auto";
|
|
2218
|
-
const schema = request.params.arguments?.schema && typeof request.params.arguments.schema === "object"
|
|
2219
|
-
? request.params.arguments.schema
|
|
2220
|
-
: undefined;
|
|
2221
|
-
if (!url) {
|
|
2222
|
-
return {
|
|
2223
|
-
content: [{ type: "text", text: "ERROR: vesper.extract_web requires 'url'." }],
|
|
2224
|
-
isError: true,
|
|
2225
|
-
};
|
|
2226
|
-
}
|
|
2227
|
-
try {
|
|
2228
|
-
const out = await webExtractorEngine.extract({
|
|
2229
|
-
url,
|
|
2230
|
-
mode: mode,
|
|
2231
|
-
strict_schema: request.params.arguments?.strict_schema !== false,
|
|
2232
|
-
schema: schema,
|
|
2233
|
-
});
|
|
2234
|
-
return {
|
|
2235
|
-
content: [{ type: "text", text: JSON.stringify(out, null, 2) }],
|
|
2236
|
-
};
|
|
2237
|
-
}
|
|
2238
|
-
catch (error) {
|
|
2239
|
-
return {
|
|
2240
|
-
content: [{ type: "text", text: `ERROR: vesper.extract_web failed: ${error.message}` }],
|
|
2241
|
-
isError: true,
|
|
2242
|
-
};
|
|
2243
|
-
}
|
|
2244
|
-
}
|
|
2245
|
-
case "unified_dataset_api": {
|
|
2246
|
-
hydrateExternalKeys();
|
|
2247
|
-
const operation = String(request.params.arguments?.operation || "").trim().toLowerCase();
|
|
2248
|
-
const source = String(request.params.arguments?.source || "auto").trim().toLowerCase();
|
|
2249
|
-
const includeUnavailable = request.params.arguments?.include_unavailable === true;
|
|
2250
|
-
const publicOnly = request.params.arguments?.public_only !== false;
|
|
2251
|
-
try {
|
|
2252
|
-
if (operation === "providers") {
|
|
2253
|
-
return {
|
|
2254
|
-
content: [{ type: "text", text: JSON.stringify({ providers: unifiedDatasetGateway.getProviderStatuses(includeUnavailable) }, null, 2) }],
|
|
2255
|
-
};
|
|
2256
|
-
}
|
|
2257
|
-
if (operation === "discover") {
|
|
2258
|
-
const query = String(request.params.arguments?.query || "").trim();
|
|
2259
|
-
if (!query) {
|
|
2260
|
-
throw new McpError(ErrorCode.InvalidParams, "query is required for operation='discover'");
|
|
2261
|
-
}
|
|
2262
|
-
const result = await unifiedDatasetGateway.discover({
|
|
2117
|
+
try {
|
|
2118
|
+
const result = await webCoreEngine.find({
|
|
2263
2119
|
query,
|
|
2264
|
-
|
|
2265
|
-
limit
|
|
2266
|
-
|
|
2120
|
+
sources: sources,
|
|
2121
|
+
limit,
|
|
2122
|
+
arxiv_full_text: request.params.arguments?.arxiv_full_text === true,
|
|
2123
|
+
github_include_readme: request.params.arguments?.github_include_readme === true,
|
|
2267
2124
|
});
|
|
2268
2125
|
try {
|
|
2269
2126
|
appendLineageVersion({
|
|
2270
|
-
datasetIdBase: `
|
|
2271
|
-
tool: "
|
|
2127
|
+
datasetIdBase: `webfind_${query || "query"}`,
|
|
2128
|
+
tool: "vesper_web_find",
|
|
2272
2129
|
requestArgs: request.params.arguments,
|
|
2273
|
-
output: {
|
|
2130
|
+
output: {
|
|
2131
|
+
rows: Array.isArray(result.results) ? result.results.length : undefined,
|
|
2132
|
+
},
|
|
2274
2133
|
sources: Array.isArray(result.results)
|
|
2275
2134
|
? result.results.slice(0, 200).map((r) => ({
|
|
2276
|
-
source: String(r?.
|
|
2277
|
-
url: typeof r?.
|
|
2278
|
-
|
|
2279
|
-
: (typeof r?.metadata_url === "string" ? r.metadata_url : undefined),
|
|
2280
|
-
at: new Date().toISOString(),
|
|
2135
|
+
source: String(r?.source_type || "unknown"),
|
|
2136
|
+
url: typeof r?.source_url === "string" ? r.source_url : undefined,
|
|
2137
|
+
at: typeof r?.collected_at === "string" ? r.collected_at : undefined,
|
|
2281
2138
|
}))
|
|
2282
2139
|
: [],
|
|
2283
2140
|
steps: [
|
|
2284
|
-
{ step: "
|
|
2285
|
-
{ step: "
|
|
2141
|
+
{ step: "web_find_discover", at: new Date().toISOString(), params: { query, sources, limit } },
|
|
2142
|
+
{ step: "web_find_complete", at: new Date().toISOString(), metrics: { result_count: Array.isArray(result.results) ? result.results.length : 0 } },
|
|
2286
2143
|
],
|
|
2287
2144
|
});
|
|
2288
2145
|
}
|
|
2289
2146
|
catch (e) {
|
|
2290
|
-
console.error(`[Lineage]
|
|
2147
|
+
console.error(`[Lineage] vesper_web_find append failed: ${e?.message || e}`);
|
|
2291
2148
|
}
|
|
2292
2149
|
return {
|
|
2293
2150
|
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
2294
2151
|
};
|
|
2295
2152
|
}
|
|
2296
|
-
|
|
2297
|
-
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
2298
|
-
if (!datasetId) {
|
|
2299
|
-
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='download'");
|
|
2300
|
-
}
|
|
2301
|
-
const requestedTargetDir = request.params.arguments?.target_dir
|
|
2302
|
-
? String(request.params.arguments.target_dir).trim()
|
|
2303
|
-
: request.params.arguments?.output_dir
|
|
2304
|
-
? String(request.params.arguments.output_dir).trim()
|
|
2305
|
-
: "";
|
|
2306
|
-
const targetDir = requestedTargetDir || process.cwd();
|
|
2307
|
-
try {
|
|
2308
|
-
await ensurePythonModules([{ module: "datasets", packageName: "datasets" }]);
|
|
2309
|
-
}
|
|
2310
|
-
catch {
|
|
2311
|
-
// best effort; non-HF providers do not require this
|
|
2312
|
-
}
|
|
2313
|
-
const result = await unifiedDatasetGateway.download({
|
|
2314
|
-
datasetId,
|
|
2315
|
-
source,
|
|
2316
|
-
targetDir,
|
|
2317
|
-
});
|
|
2318
|
-
try {
|
|
2319
|
-
upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
|
|
2320
|
-
}
|
|
2321
|
-
catch (e) {
|
|
2322
|
-
console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
|
|
2323
|
-
}
|
|
2324
|
-
try {
|
|
2325
|
-
const schemaAfter = await getSchemaSnapshot(result.copied_to || result.local_path);
|
|
2326
|
-
const lineage = appendLineageVersion({
|
|
2327
|
-
datasetIdBase: result.dataset_id,
|
|
2328
|
-
tool: "unified_dataset_api.download",
|
|
2329
|
-
requestArgs: request.params.arguments,
|
|
2330
|
-
outputPath: result.copied_to || result.local_path,
|
|
2331
|
-
output: {
|
|
2332
|
-
local_path: result.copied_to || result.local_path,
|
|
2333
|
-
format: path.extname(result.copied_to || result.local_path).replace(".", ""),
|
|
2334
|
-
schema_after: schemaAfter,
|
|
2335
|
-
},
|
|
2336
|
-
sources: [{
|
|
2337
|
-
source: source,
|
|
2338
|
-
url: typeof result.dataset_id === "string" ? result.dataset_id : undefined,
|
|
2339
|
-
at: new Date().toISOString(),
|
|
2340
|
-
}],
|
|
2341
|
-
steps: [
|
|
2342
|
-
{ step: "download_requested", at: new Date().toISOString(), params: { datasetId, source, targetDir } },
|
|
2343
|
-
{ step: "download_completed", at: new Date().toISOString(), metrics: { local_path: result.copied_to || result.local_path } },
|
|
2344
|
-
],
|
|
2345
|
-
});
|
|
2346
|
-
try {
|
|
2347
|
-
upsertRegistry(lineage.datasetVersionId, result.copied_to || result.local_path, "completed");
|
|
2348
|
-
}
|
|
2349
|
-
catch { }
|
|
2350
|
-
}
|
|
2351
|
-
catch (e) {
|
|
2352
|
-
console.error(`[Lineage] unified download append failed: ${e?.message || e}`);
|
|
2353
|
-
}
|
|
2153
|
+
catch (error) {
|
|
2354
2154
|
return {
|
|
2355
|
-
content: [{ type: "text", text:
|
|
2155
|
+
content: [{ type: "text", text: `ERROR: web_find failed: ${error.message}` }],
|
|
2156
|
+
isError: true,
|
|
2356
2157
|
};
|
|
2357
2158
|
}
|
|
2358
|
-
|
|
2359
|
-
|
|
2360
|
-
|
|
2361
|
-
|
|
2362
|
-
|
|
2363
|
-
|
|
2364
|
-
|
|
2365
|
-
|
|
2366
|
-
|
|
2159
|
+
}
|
|
2160
|
+
case "vesper.fuse": {
|
|
2161
|
+
hydrateExternalKeys();
|
|
2162
|
+
const sources = Array.isArray(request.params.arguments?.sources)
|
|
2163
|
+
? request.params.arguments?.sources
|
|
2164
|
+
: undefined;
|
|
2165
|
+
if (!sources || !Array.isArray(sources)) {
|
|
2166
|
+
return {
|
|
2167
|
+
content: [{ type: "text", text: "ERROR: vesper.fuse requires 'sources' array." }],
|
|
2168
|
+
isError: true,
|
|
2169
|
+
};
|
|
2170
|
+
}
|
|
2171
|
+
try {
|
|
2172
|
+
const mergeStrategyRaw = request.params.arguments?.merge_strategy
|
|
2173
|
+
? String(request.params.arguments?.merge_strategy).toLowerCase()
|
|
2174
|
+
: undefined;
|
|
2175
|
+
const dedupRaw = request.params.arguments?.deduplication
|
|
2176
|
+
? String(request.params.arguments?.deduplication).toLowerCase()
|
|
2177
|
+
: undefined;
|
|
2178
|
+
const merge_strategy = mergeStrategyRaw && ["union", "dedup"].includes(mergeStrategyRaw)
|
|
2179
|
+
? mergeStrategyRaw
|
|
2180
|
+
: undefined;
|
|
2181
|
+
const deduplication = dedupRaw && ["semantic", "exact", "none"].includes(dedupRaw)
|
|
2182
|
+
? dedupRaw
|
|
2183
|
+
: undefined;
|
|
2184
|
+
const result = await webFusionEngine.fuse({
|
|
2185
|
+
sources: sources.map((s) => ({
|
|
2186
|
+
type: String(s?.type || "").trim().toLowerCase(),
|
|
2187
|
+
query: String(s?.query || "").trim(),
|
|
2188
|
+
max_results: s?.max_results !== undefined ? Number(s.max_results) : undefined,
|
|
2189
|
+
min_stars: s?.min_stars !== undefined ? Number(s.min_stars) : undefined,
|
|
2190
|
+
bucket: s?.bucket !== undefined ? String(s.bucket) : undefined,
|
|
2191
|
+
path: s?.path !== undefined ? String(s.path) : undefined,
|
|
2192
|
+
region: s?.region !== undefined ? String(s.region) : undefined,
|
|
2193
|
+
credentials: s?.credentials ? {
|
|
2194
|
+
accessKeyId: s.credentials.accessKeyId !== undefined ? String(s.credentials.accessKeyId) : undefined,
|
|
2195
|
+
secretAccessKey: s.credentials.secretAccessKey !== undefined ? String(s.credentials.secretAccessKey) : undefined,
|
|
2196
|
+
sessionToken: s.credentials.sessionToken !== undefined ? String(s.credentials.sessionToken) : undefined,
|
|
2197
|
+
roleArn: s.credentials.roleArn !== undefined ? String(s.credentials.roleArn) : undefined,
|
|
2198
|
+
} : undefined,
|
|
2199
|
+
})),
|
|
2200
|
+
merge_strategy,
|
|
2201
|
+
deduplication,
|
|
2367
2202
|
});
|
|
2368
2203
|
return {
|
|
2369
2204
|
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
2370
2205
|
};
|
|
2371
2206
|
}
|
|
2372
|
-
|
|
2373
|
-
|
|
2374
|
-
|
|
2375
|
-
|
|
2376
|
-
content: [{ type: "text", text: `ERROR: unified gateway failed: ${error.message}` }],
|
|
2377
|
-
isError: true,
|
|
2378
|
-
};
|
|
2379
|
-
}
|
|
2380
|
-
}
|
|
2381
|
-
case "vesper_search": {
|
|
2382
|
-
const query = String(request.params.arguments?.query);
|
|
2383
|
-
const limit = 5;
|
|
2384
|
-
const safeOnly = true; // Enable safe filter by default
|
|
2385
|
-
const enableJIT = request.params.arguments?.enable_jit === true;
|
|
2386
|
-
if (!query) {
|
|
2387
|
-
throw new McpError(ErrorCode.InvalidParams, "Query is required");
|
|
2388
|
-
}
|
|
2389
|
-
const results = await searchEngine.search(query, { limit, safeOnly, enableJIT });
|
|
2390
|
-
const formattedOutput = formatSearchResults(results);
|
|
2391
|
-
return {
|
|
2392
|
-
content: [
|
|
2393
|
-
{
|
|
2394
|
-
type: "text",
|
|
2395
|
-
text: formattedOutput,
|
|
2396
|
-
},
|
|
2397
|
-
],
|
|
2398
|
-
};
|
|
2399
|
-
}
|
|
2400
|
-
case "discover_datasets": {
|
|
2401
|
-
hydrateExternalKeys();
|
|
2402
|
-
const query = String(request.params.arguments?.query || "").trim();
|
|
2403
|
-
const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
|
|
2404
|
-
const limit = Number(request.params.arguments?.limit || 10);
|
|
2405
|
-
if (!query) {
|
|
2406
|
-
throw new McpError(ErrorCode.InvalidParams, "query is required");
|
|
2407
|
-
}
|
|
2408
|
-
try {
|
|
2409
|
-
const gatewayResult = await unifiedDatasetGateway.discover({
|
|
2410
|
-
query,
|
|
2411
|
-
source,
|
|
2412
|
-
limit,
|
|
2413
|
-
publicOnly: false,
|
|
2414
|
-
});
|
|
2415
|
-
const results = gatewayResult.results;
|
|
2416
|
-
const recipeScript = path.join(dataRoot, "python", "asset_downloader_engine.py");
|
|
2417
|
-
for (const ds of results.slice(0, limit)) {
|
|
2418
|
-
const info = {
|
|
2419
|
-
dataset_id: ds.id,
|
|
2420
|
-
id: ds.id,
|
|
2421
|
-
source: ds.source,
|
|
2422
|
-
repo_id: ds.id,
|
|
2423
|
-
total_images: ds.total_examples || 0,
|
|
2424
|
-
image_column: undefined,
|
|
2425
|
-
recipes_dir: path.join(dataRoot, "recipes"),
|
|
2207
|
+
catch (error) {
|
|
2208
|
+
return {
|
|
2209
|
+
content: [{ type: "text", text: `ERROR: vesper.fuse failed: ${error.message}` }],
|
|
2210
|
+
isError: true,
|
|
2426
2211
|
};
|
|
2427
|
-
try {
|
|
2428
|
-
await runPythonJson(recipeScript, ["build_recipe", JSON.stringify(info)]);
|
|
2429
|
-
}
|
|
2430
|
-
catch {
|
|
2431
|
-
// best-effort recipe generation; ignore discovery-time recipe failures
|
|
2432
|
-
}
|
|
2433
2212
|
}
|
|
2434
|
-
const formattedOutput = formatSearchResults(results.slice(0, limit));
|
|
2435
|
-
const noteBlock = gatewayResult.notes.length > 0
|
|
2436
|
-
? `\nGateway notes:\n- ${gatewayResult.notes.join("\n- ")}`
|
|
2437
|
-
: "";
|
|
2438
|
-
return {
|
|
2439
|
-
content: [{ type: "text", text: `${formattedOutput}${noteBlock}` }]
|
|
2440
|
-
};
|
|
2441
2213
|
}
|
|
2442
|
-
|
|
2443
|
-
|
|
2444
|
-
|
|
2445
|
-
|
|
2446
|
-
|
|
2447
|
-
|
|
2448
|
-
|
|
2449
|
-
|
|
2450
|
-
|
|
2451
|
-
|
|
2452
|
-
|
|
2453
|
-
|
|
2454
|
-
|
|
2455
|
-
|
|
2456
|
-
? String(request.params.arguments.output_dir).trim()
|
|
2457
|
-
: "";
|
|
2458
|
-
const targetDir = requestedTargetDir || process.cwd();
|
|
2459
|
-
if (!datasetId) {
|
|
2460
|
-
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
2461
|
-
}
|
|
2462
|
-
// Pre-install Python datasets library for HuggingFace fallback
|
|
2463
|
-
if (source === "huggingface") {
|
|
2464
|
-
try {
|
|
2465
|
-
await ensurePythonModules([
|
|
2466
|
-
{ module: "datasets", packageName: "datasets" },
|
|
2467
|
-
]);
|
|
2468
|
-
}
|
|
2469
|
-
catch {
|
|
2470
|
-
// Continue - direct download may still work
|
|
2214
|
+
case "vesper.extract_web": {
|
|
2215
|
+
hydrateExternalKeys();
|
|
2216
|
+
const url = String(request.params.arguments?.url || "").trim();
|
|
2217
|
+
const mode = request.params.arguments?.mode
|
|
2218
|
+
? String(request.params.arguments?.mode).trim().toLowerCase()
|
|
2219
|
+
: "auto";
|
|
2220
|
+
const schema = request.params.arguments?.schema && typeof request.params.arguments.schema === "object"
|
|
2221
|
+
? request.params.arguments.schema
|
|
2222
|
+
: undefined;
|
|
2223
|
+
if (!url) {
|
|
2224
|
+
return {
|
|
2225
|
+
content: [{ type: "text", text: "ERROR: vesper.extract_web requires 'url'." }],
|
|
2226
|
+
isError: true,
|
|
2227
|
+
};
|
|
2471
2228
|
}
|
|
2472
|
-
}
|
|
2473
|
-
try {
|
|
2474
|
-
const result = await unifiedDatasetGateway.download({
|
|
2475
|
-
datasetId,
|
|
2476
|
-
source,
|
|
2477
|
-
targetDir,
|
|
2478
|
-
});
|
|
2479
2229
|
try {
|
|
2480
|
-
|
|
2230
|
+
const out = await webExtractorEngine.extract({
|
|
2231
|
+
url,
|
|
2232
|
+
mode: mode,
|
|
2233
|
+
strict_schema: request.params.arguments?.strict_schema !== false,
|
|
2234
|
+
schema: schema,
|
|
2235
|
+
});
|
|
2236
|
+
return {
|
|
2237
|
+
content: [{ type: "text", text: JSON.stringify(out, null, 2) }],
|
|
2238
|
+
};
|
|
2481
2239
|
}
|
|
2482
|
-
catch (
|
|
2483
|
-
|
|
2240
|
+
catch (error) {
|
|
2241
|
+
return {
|
|
2242
|
+
content: [{ type: "text", text: `ERROR: vesper.extract_web failed: ${error.message}` }],
|
|
2243
|
+
isError: true,
|
|
2244
|
+
};
|
|
2484
2245
|
}
|
|
2485
|
-
const noteBlock = result.notes.length > 0 ? `\nNotes:\n- ${result.notes.join("\n- ")}` : "";
|
|
2486
|
-
return {
|
|
2487
|
-
content: [{ type: "text", text: `Download complete: ${result.copied_to || result.local_path}${noteBlock}` }]
|
|
2488
|
-
};
|
|
2489
|
-
}
|
|
2490
|
-
catch (error) {
|
|
2491
|
-
return {
|
|
2492
|
-
content: [{ type: "text", text: `ERROR: download failed: ${error.message}` }],
|
|
2493
|
-
isError: true,
|
|
2494
|
-
};
|
|
2495
|
-
}
|
|
2496
|
-
}
|
|
2497
|
-
case "vesper_download_assets": {
|
|
2498
|
-
hydrateExternalKeys();
|
|
2499
|
-
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
2500
|
-
const source = String(request.params.arguments?.source || "").trim().toLowerCase();
|
|
2501
|
-
// Auto-infer repo_id from dataset_id if not provided (common for HuggingFace)
|
|
2502
|
-
const repoId = request.params.arguments?.repo_id
|
|
2503
|
-
? String(request.params.arguments.repo_id)
|
|
2504
|
-
: (source === "huggingface" && datasetId.includes("/") ? datasetId : undefined);
|
|
2505
|
-
const kaggleRef = request.params.arguments?.kaggle_ref ? String(request.params.arguments.kaggle_ref) : undefined;
|
|
2506
|
-
const urls = Array.isArray(request.params.arguments?.urls)
|
|
2507
|
-
? (request.params.arguments?.urls).map(v => String(v))
|
|
2508
|
-
: undefined;
|
|
2509
|
-
const outputFormat = String(request.params.arguments?.output_format || "webdataset");
|
|
2510
|
-
const requestedOutputDir = request.params.arguments?.target_dir
|
|
2511
|
-
? String(request.params.arguments.target_dir).trim()
|
|
2512
|
-
: request.params.arguments?.output_dir
|
|
2513
|
-
? String(request.params.arguments.output_dir).trim()
|
|
2514
|
-
: undefined;
|
|
2515
|
-
const maxItems = request.params.arguments?.max_items ? Number(request.params.arguments.max_items) : undefined;
|
|
2516
|
-
const workers = request.params.arguments?.workers ? Number(request.params.arguments.workers) : 8;
|
|
2517
|
-
const imageColumn = request.params.arguments?.image_column ? String(request.params.arguments.image_column) : undefined;
|
|
2518
|
-
if (!datasetId || !source) {
|
|
2519
|
-
throw new McpError(ErrorCode.InvalidParams, "dataset_id and source are required");
|
|
2520
|
-
}
|
|
2521
|
-
if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
|
|
2522
|
-
return {
|
|
2523
|
-
content: [{ type: "text", text: "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds)." }],
|
|
2524
|
-
isError: true,
|
|
2525
|
-
};
|
|
2526
|
-
}
|
|
2527
|
-
const requiredModules = [
|
|
2528
|
-
{ module: "aiohttp", packageName: "aiohttp" },
|
|
2529
|
-
];
|
|
2530
|
-
if (source === "url") {
|
|
2531
|
-
requiredModules.push({ module: "aiofiles", packageName: "aiofiles" });
|
|
2532
|
-
}
|
|
2533
|
-
if (source === "huggingface") {
|
|
2534
|
-
requiredModules.push({ module: "datasets", packageName: "datasets" });
|
|
2535
|
-
requiredModules.push({ module: "PIL", packageName: "Pillow" });
|
|
2536
|
-
}
|
|
2537
|
-
if (source === "kaggle") {
|
|
2538
|
-
requiredModules.push({ module: "kaggle", packageName: "kaggle" });
|
|
2539
|
-
}
|
|
2540
|
-
try {
|
|
2541
|
-
await ensurePythonModules(requiredModules);
|
|
2542
2246
|
}
|
|
2543
|
-
|
|
2544
|
-
|
|
2545
|
-
|
|
2546
|
-
|
|
2547
|
-
|
|
2548
|
-
|
|
2549
|
-
|
|
2550
|
-
|
|
2551
|
-
|
|
2552
|
-
|
|
2553
|
-
|
|
2554
|
-
kaggle_ref: kaggleRef,
|
|
2555
|
-
urls,
|
|
2556
|
-
output_format: outputFormat,
|
|
2557
|
-
output_dir: requestedOutputDir,
|
|
2558
|
-
max_items: maxItems,
|
|
2559
|
-
workers,
|
|
2560
|
-
image_column: imageColumn,
|
|
2561
|
-
output_root: requestedOutputDir || process.cwd(),
|
|
2562
|
-
recipes_dir: path.join(dataRoot, "recipes"),
|
|
2563
|
-
};
|
|
2564
|
-
try {
|
|
2565
|
-
const result = await runPythonJson(scriptPath, ["download", JSON.stringify(payload)]);
|
|
2566
|
-
if (!result?.ok) {
|
|
2567
|
-
const errMsg = result?.error || "Unknown error";
|
|
2568
|
-
// Enhance error messages for common failures
|
|
2569
|
-
let hint = "";
|
|
2570
|
-
if (errMsg.includes("No image column")) {
|
|
2571
|
-
hint = "\n\nHint: Specify image_column parameter with the column name containing images/URLs.";
|
|
2247
|
+
case "unified_dataset_api": {
|
|
2248
|
+
hydrateExternalKeys();
|
|
2249
|
+
const operation = String(request.params.arguments?.operation || "").trim().toLowerCase();
|
|
2250
|
+
const source = String(request.params.arguments?.source || "auto").trim().toLowerCase();
|
|
2251
|
+
const includeUnavailable = request.params.arguments?.include_unavailable === true;
|
|
2252
|
+
const publicOnly = request.params.arguments?.public_only !== false;
|
|
2253
|
+
try {
|
|
2254
|
+
if (operation === "providers") {
|
|
2255
|
+
return {
|
|
2256
|
+
content: [{ type: "text", text: JSON.stringify({ providers: unifiedDatasetGateway.getProviderStatuses(includeUnavailable) }, null, 2) }],
|
|
2257
|
+
};
|
|
2572
2258
|
}
|
|
2573
|
-
|
|
2574
|
-
|
|
2259
|
+
if (operation === "discover") {
|
|
2260
|
+
const query = String(request.params.arguments?.query || "").trim();
|
|
2261
|
+
if (!query) {
|
|
2262
|
+
throw new McpError(ErrorCode.InvalidParams, "query is required for operation='discover'");
|
|
2263
|
+
}
|
|
2264
|
+
const result = await unifiedDatasetGateway.discover({
|
|
2265
|
+
query,
|
|
2266
|
+
source,
|
|
2267
|
+
limit: Number(request.params.arguments?.limit || 10),
|
|
2268
|
+
publicOnly,
|
|
2269
|
+
});
|
|
2270
|
+
try {
|
|
2271
|
+
appendLineageVersion({
|
|
2272
|
+
datasetIdBase: `discover_${source}_${query || "query"}`,
|
|
2273
|
+
tool: "unified_dataset_api.discover",
|
|
2274
|
+
requestArgs: request.params.arguments,
|
|
2275
|
+
output: { rows: Array.isArray(result.results) ? result.results.length : undefined },
|
|
2276
|
+
sources: Array.isArray(result.results)
|
|
2277
|
+
? result.results.slice(0, 200).map((r) => ({
|
|
2278
|
+
source: String(r?.source || source || "unknown"),
|
|
2279
|
+
url: typeof r?.download_url === "string"
|
|
2280
|
+
? r.download_url
|
|
2281
|
+
: (typeof r?.metadata_url === "string" ? r.metadata_url : undefined),
|
|
2282
|
+
at: new Date().toISOString(),
|
|
2283
|
+
}))
|
|
2284
|
+
: [],
|
|
2285
|
+
steps: [
|
|
2286
|
+
{ step: "discover_requested", at: new Date().toISOString(), params: { query, source, limit: Number(request.params.arguments?.limit || 10), publicOnly } },
|
|
2287
|
+
{ step: "discover_completed", at: new Date().toISOString(), metrics: { result_count: Array.isArray(result.results) ? result.results.length : 0 } },
|
|
2288
|
+
],
|
|
2289
|
+
});
|
|
2290
|
+
}
|
|
2291
|
+
catch (e) {
|
|
2292
|
+
console.error(`[Lineage] unified discover append failed: ${e?.message || e}`);
|
|
2293
|
+
}
|
|
2294
|
+
return {
|
|
2295
|
+
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
2296
|
+
};
|
|
2575
2297
|
}
|
|
2298
|
+
if (operation === "download") {
|
|
2299
|
+
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
2300
|
+
if (!datasetId) {
|
|
2301
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='download'");
|
|
2302
|
+
}
|
|
2303
|
+
const requestedTargetDir = request.params.arguments?.target_dir
|
|
2304
|
+
? String(request.params.arguments.target_dir).trim()
|
|
2305
|
+
: request.params.arguments?.output_dir
|
|
2306
|
+
? String(request.params.arguments.output_dir).trim()
|
|
2307
|
+
: "";
|
|
2308
|
+
const targetDir = requestedTargetDir || process.cwd();
|
|
2309
|
+
try {
|
|
2310
|
+
await ensurePythonModules([{ module: "datasets", packageName: "datasets" }]);
|
|
2311
|
+
}
|
|
2312
|
+
catch {
|
|
2313
|
+
// best effort; non-HF providers do not require this
|
|
2314
|
+
}
|
|
2315
|
+
const result = await unifiedDatasetGateway.download({
|
|
2316
|
+
datasetId,
|
|
2317
|
+
source,
|
|
2318
|
+
targetDir,
|
|
2319
|
+
});
|
|
2320
|
+
try {
|
|
2321
|
+
upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
|
|
2322
|
+
}
|
|
2323
|
+
catch (e) {
|
|
2324
|
+
console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
|
|
2325
|
+
}
|
|
2326
|
+
try {
|
|
2327
|
+
const schemaAfter = await getSchemaSnapshot(result.copied_to || result.local_path);
|
|
2328
|
+
const lineage = appendLineageVersion({
|
|
2329
|
+
datasetIdBase: result.dataset_id,
|
|
2330
|
+
tool: "unified_dataset_api.download",
|
|
2331
|
+
requestArgs: request.params.arguments,
|
|
2332
|
+
outputPath: result.copied_to || result.local_path,
|
|
2333
|
+
output: {
|
|
2334
|
+
local_path: result.copied_to || result.local_path,
|
|
2335
|
+
format: path.extname(result.copied_to || result.local_path).replace(".", ""),
|
|
2336
|
+
schema_after: schemaAfter,
|
|
2337
|
+
},
|
|
2338
|
+
sources: [{
|
|
2339
|
+
source: source,
|
|
2340
|
+
url: typeof result.dataset_id === "string" ? result.dataset_id : undefined,
|
|
2341
|
+
at: new Date().toISOString(),
|
|
2342
|
+
}],
|
|
2343
|
+
steps: [
|
|
2344
|
+
{ step: "download_requested", at: new Date().toISOString(), params: { datasetId, source, targetDir } },
|
|
2345
|
+
{ step: "download_completed", at: new Date().toISOString(), metrics: { local_path: result.copied_to || result.local_path } },
|
|
2346
|
+
],
|
|
2347
|
+
});
|
|
2348
|
+
try {
|
|
2349
|
+
upsertRegistry(lineage.datasetVersionId, result.copied_to || result.local_path, "completed");
|
|
2350
|
+
}
|
|
2351
|
+
catch { }
|
|
2352
|
+
}
|
|
2353
|
+
catch (e) {
|
|
2354
|
+
console.error(`[Lineage] unified download append failed: ${e?.message || e}`);
|
|
2355
|
+
}
|
|
2356
|
+
return {
|
|
2357
|
+
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
2358
|
+
};
|
|
2359
|
+
}
|
|
2360
|
+
if (operation === "info") {
|
|
2361
|
+
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
2362
|
+
if (!datasetId) {
|
|
2363
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='info'");
|
|
2364
|
+
}
|
|
2365
|
+
const result = await unifiedDatasetGateway.info({
|
|
2366
|
+
datasetId,
|
|
2367
|
+
source,
|
|
2368
|
+
publicOnly,
|
|
2369
|
+
});
|
|
2370
|
+
return {
|
|
2371
|
+
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
2372
|
+
};
|
|
2373
|
+
}
|
|
2374
|
+
throw new McpError(ErrorCode.InvalidParams, `Unsupported operation: ${operation}`);
|
|
2375
|
+
}
|
|
2376
|
+
catch (error) {
|
|
2576
2377
|
return {
|
|
2577
|
-
content: [{ type: "text", text: `ERROR:
|
|
2378
|
+
content: [{ type: "text", text: `ERROR: unified gateway failed: ${error.message}` }],
|
|
2578
2379
|
isError: true,
|
|
2579
2380
|
};
|
|
2580
2381
|
}
|
|
2581
|
-
return {
|
|
2582
|
-
content: [{ type: "text", text: JSON.stringify(result.result, null, 2) }],
|
|
2583
|
-
};
|
|
2584
|
-
}
|
|
2585
|
-
catch (error) {
|
|
2586
|
-
return {
|
|
2587
|
-
content: [{ type: "text", text: `ERROR: asset downloader execution failed: ${error.message}` }],
|
|
2588
|
-
isError: true,
|
|
2589
|
-
};
|
|
2590
2382
|
}
|
|
2591
|
-
|
|
2592
|
-
|
|
2593
|
-
|
|
2594
|
-
|
|
2595
|
-
|
|
2596
|
-
|
|
2597
|
-
|
|
2598
|
-
const methods = [];
|
|
2599
|
-
if (hfToken) {
|
|
2600
|
-
const r = secureKeys.set("hf_token", hfToken);
|
|
2601
|
-
if (r.ok) {
|
|
2602
|
-
process.env.HF_TOKEN = hfToken;
|
|
2603
|
-
saved.push("HF token");
|
|
2604
|
-
if (r.method)
|
|
2605
|
-
methods.push(r.method);
|
|
2383
|
+
case "vesper_search": {
|
|
2384
|
+
const query = String(request.params.arguments?.query);
|
|
2385
|
+
const limit = 5;
|
|
2386
|
+
const safeOnly = true; // Enable safe filter by default
|
|
2387
|
+
const enableJIT = request.params.arguments?.enable_jit === true;
|
|
2388
|
+
if (!query) {
|
|
2389
|
+
throw new McpError(ErrorCode.InvalidParams, "Query is required");
|
|
2606
2390
|
}
|
|
2607
|
-
|
|
2608
|
-
|
|
2609
|
-
const r = secureKeys.set("kaggle_username", kaggleUsername);
|
|
2610
|
-
if (r.ok) {
|
|
2611
|
-
process.env.KAGGLE_USERNAME = kaggleUsername;
|
|
2612
|
-
saved.push("Kaggle username");
|
|
2613
|
-
if (r.method)
|
|
2614
|
-
methods.push(r.method);
|
|
2615
|
-
}
|
|
2616
|
-
}
|
|
2617
|
-
if (kaggleKey) {
|
|
2618
|
-
const r = secureKeys.set("kaggle_key", kaggleKey);
|
|
2619
|
-
if (r.ok) {
|
|
2620
|
-
process.env.KAGGLE_KEY = kaggleKey;
|
|
2621
|
-
saved.push("Kaggle key");
|
|
2622
|
-
if (r.method)
|
|
2623
|
-
methods.push(r.method);
|
|
2624
|
-
}
|
|
2625
|
-
}
|
|
2626
|
-
if (dataworldToken) {
|
|
2627
|
-
const r = secureKeys.set("dataworld_token", dataworldToken);
|
|
2628
|
-
if (r.ok) {
|
|
2629
|
-
process.env.DW_AUTH_TOKEN = dataworldToken;
|
|
2630
|
-
saved.push("data.world token");
|
|
2631
|
-
if (r.method)
|
|
2632
|
-
methods.push(r.method);
|
|
2633
|
-
}
|
|
2634
|
-
}
|
|
2635
|
-
if (saved.length === 0) {
|
|
2391
|
+
const results = await searchEngine.search(query, { limit, safeOnly, enableJIT });
|
|
2392
|
+
const formattedOutput = formatSearchResults(results);
|
|
2636
2393
|
return {
|
|
2637
|
-
content: [
|
|
2394
|
+
content: [
|
|
2395
|
+
{
|
|
2396
|
+
type: "text",
|
|
2397
|
+
text: formattedOutput,
|
|
2398
|
+
},
|
|
2399
|
+
],
|
|
2638
2400
|
};
|
|
2639
2401
|
}
|
|
2640
|
-
|
|
2641
|
-
|
|
2642
|
-
|
|
2643
|
-
|
|
2644
|
-
|
|
2645
|
-
|
|
2646
|
-
|
|
2647
|
-
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
2648
|
-
}
|
|
2649
|
-
const dataset = metadataStore.getDataset(datasetId);
|
|
2650
|
-
if (!dataset) {
|
|
2651
|
-
// Fallback: check the registry for local path info
|
|
2652
|
-
const regEntry = getRegistryEntry(datasetId);
|
|
2653
|
-
const regPath = regEntry?.local_path || regEntry?.path;
|
|
2654
|
-
if (regEntry) {
|
|
2655
|
-
const exists = regPath && fs.existsSync(regPath);
|
|
2656
|
-
return {
|
|
2657
|
-
content: [{ type: "text", text: `**${datasetId}** (from registry)\n- Local path: ${regPath || "unknown"}\n- Status: ${regEntry.status || "unknown"}${exists ? "" : " (file missing)"}\n\nNote: Full metadata not available in metadata store. Use prepare_dataset to get full details.` }],
|
|
2658
|
-
};
|
|
2402
|
+
case "discover_datasets": {
|
|
2403
|
+
hydrateExternalKeys();
|
|
2404
|
+
const query = String(request.params.arguments?.query || "").trim();
|
|
2405
|
+
const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
|
|
2406
|
+
const limit = Number(request.params.arguments?.limit || 10);
|
|
2407
|
+
if (!query) {
|
|
2408
|
+
throw new McpError(ErrorCode.InvalidParams, "query is required");
|
|
2659
2409
|
}
|
|
2660
|
-
return {
|
|
2661
|
-
content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}. Use vesper_list_datasets to see available datasets, or prepare_dataset to add new ones.` }],
|
|
2662
|
-
isError: true,
|
|
2663
|
-
};
|
|
2664
|
-
}
|
|
2665
|
-
// Enrich: if total_examples is 0-ish, try the HF datasets-server /size API
|
|
2666
|
-
if ((!dataset.total_examples || dataset.total_examples === 0) && dataset.source === "huggingface") {
|
|
2667
2410
|
try {
|
|
2668
|
-
const
|
|
2669
|
-
|
|
2670
|
-
|
|
2671
|
-
|
|
2672
|
-
|
|
2673
|
-
|
|
2674
|
-
|
|
2675
|
-
|
|
2676
|
-
|
|
2677
|
-
|
|
2678
|
-
|
|
2679
|
-
|
|
2680
|
-
|
|
2681
|
-
|
|
2682
|
-
|
|
2683
|
-
|
|
2684
|
-
|
|
2685
|
-
|
|
2686
|
-
|
|
2411
|
+
const gatewayResult = await unifiedDatasetGateway.discover({
|
|
2412
|
+
query,
|
|
2413
|
+
source,
|
|
2414
|
+
limit,
|
|
2415
|
+
publicOnly: false,
|
|
2416
|
+
});
|
|
2417
|
+
const results = gatewayResult.results;
|
|
2418
|
+
const recipeScript = path.join(dataRoot, "python", "asset_downloader_engine.py");
|
|
2419
|
+
for (const ds of results.slice(0, limit)) {
|
|
2420
|
+
const info = {
|
|
2421
|
+
dataset_id: ds.id,
|
|
2422
|
+
id: ds.id,
|
|
2423
|
+
source: ds.source,
|
|
2424
|
+
repo_id: ds.id,
|
|
2425
|
+
total_images: ds.total_examples || 0,
|
|
2426
|
+
image_column: undefined,
|
|
2427
|
+
recipes_dir: path.join(dataRoot, "recipes"),
|
|
2428
|
+
};
|
|
2429
|
+
try {
|
|
2430
|
+
await runPythonJson(recipeScript, ["build_recipe", JSON.stringify(info)]);
|
|
2431
|
+
}
|
|
2432
|
+
catch {
|
|
2433
|
+
// best-effort recipe generation; ignore discovery-time recipe failures
|
|
2687
2434
|
}
|
|
2688
2435
|
}
|
|
2436
|
+
const formattedOutput = formatSearchResults(results.slice(0, limit));
|
|
2437
|
+
const noteBlock = gatewayResult.notes.length > 0
|
|
2438
|
+
? `\nGateway notes:\n- ${gatewayResult.notes.join("\n- ")}`
|
|
2439
|
+
: "";
|
|
2440
|
+
return {
|
|
2441
|
+
content: [{ type: "text", text: `${formattedOutput}${noteBlock}` }]
|
|
2442
|
+
};
|
|
2689
2443
|
}
|
|
2690
|
-
catch {
|
|
2691
|
-
|
|
2692
|
-
|
|
2693
|
-
|
|
2694
|
-
|
|
2695
|
-
return { content: [{ type: "text", text: formattedOutput }] };
|
|
2696
|
-
}
|
|
2697
|
-
case "quality_analyze":
|
|
2698
|
-
case "analyze_quality":
|
|
2699
|
-
case "analyze_image_quality":
|
|
2700
|
-
case "analyze_media_quality":
|
|
2701
|
-
case "generate_quality_report": {
|
|
2702
|
-
const resolvedOperation = request.params.name === "analyze_image_quality"
|
|
2703
|
-
? "image"
|
|
2704
|
-
: request.params.name === "analyze_media_quality"
|
|
2705
|
-
? "media"
|
|
2706
|
-
: request.params.name === "generate_quality_report"
|
|
2707
|
-
? "report"
|
|
2708
|
-
: String(request.params.arguments?.operation || "dataset").toLowerCase();
|
|
2709
|
-
if (resolvedOperation === "image") {
|
|
2710
|
-
const inputPath = String(request.params.arguments?.path || "").trim();
|
|
2711
|
-
if (!inputPath || !fs.existsSync(inputPath)) {
|
|
2712
|
-
throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
|
|
2713
|
-
}
|
|
2714
|
-
const report = await imageAnalyzer.analyze(inputPath);
|
|
2715
|
-
return { content: [{ type: "text", text: JSON.stringify(report, null, 2) }] };
|
|
2716
|
-
}
|
|
2717
|
-
if (resolvedOperation === "media") {
|
|
2718
|
-
const inputPath = String(request.params.arguments?.path || "").trim();
|
|
2719
|
-
if (!inputPath || !fs.existsSync(inputPath)) {
|
|
2720
|
-
throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
|
|
2444
|
+
catch (error) {
|
|
2445
|
+
return {
|
|
2446
|
+
content: [{ type: "text", text: `ERROR: discover failed: ${error.message}` }],
|
|
2447
|
+
isError: true,
|
|
2448
|
+
};
|
|
2721
2449
|
}
|
|
2722
|
-
const report = await mediaAnalyzer.analyze(inputPath);
|
|
2723
|
-
return { content: [{ type: "text", text: JSON.stringify(report, null, 2) }] };
|
|
2724
2450
|
}
|
|
2725
|
-
|
|
2451
|
+
case "download_dataset": {
|
|
2452
|
+
hydrateExternalKeys();
|
|
2453
|
+
const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
|
|
2726
2454
|
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
2727
|
-
const
|
|
2455
|
+
const requestedTargetDir = request.params.arguments?.target_dir
|
|
2456
|
+
? String(request.params.arguments.target_dir).trim()
|
|
2457
|
+
: request.params.arguments?.output_dir
|
|
2458
|
+
? String(request.params.arguments.output_dir).trim()
|
|
2459
|
+
: "";
|
|
2460
|
+
const targetDir = requestedTargetDir || process.cwd();
|
|
2728
2461
|
if (!datasetId) {
|
|
2729
|
-
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required
|
|
2462
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
2463
|
+
}
|
|
2464
|
+
// Pre-install Python datasets library for HuggingFace fallback
|
|
2465
|
+
if (source === "huggingface") {
|
|
2466
|
+
try {
|
|
2467
|
+
await ensurePythonModules([
|
|
2468
|
+
{ module: "datasets", packageName: "datasets" },
|
|
2469
|
+
]);
|
|
2470
|
+
}
|
|
2471
|
+
catch {
|
|
2472
|
+
// Continue - direct download may still work
|
|
2473
|
+
}
|
|
2730
2474
|
}
|
|
2731
|
-
|
|
2732
|
-
|
|
2475
|
+
try {
|
|
2476
|
+
const result = await unifiedDatasetGateway.download({
|
|
2477
|
+
datasetId,
|
|
2478
|
+
source,
|
|
2479
|
+
targetDir,
|
|
2480
|
+
});
|
|
2481
|
+
try {
|
|
2482
|
+
upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
|
|
2483
|
+
}
|
|
2484
|
+
catch (e) {
|
|
2485
|
+
console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
|
|
2486
|
+
}
|
|
2487
|
+
const noteBlock = result.notes.length > 0 ? `\nNotes:\n- ${result.notes.join("\n- ")}` : "";
|
|
2488
|
+
return {
|
|
2489
|
+
content: [{ type: "text", text: `Download complete: ${result.copied_to || result.local_path}${noteBlock}` }]
|
|
2490
|
+
};
|
|
2733
2491
|
}
|
|
2734
|
-
|
|
2735
|
-
|
|
2736
|
-
|
|
2737
|
-
|
|
2738
|
-
|
|
2739
|
-
await metadataStore.saveDataset(metadata);
|
|
2492
|
+
catch (error) {
|
|
2493
|
+
return {
|
|
2494
|
+
content: [{ type: "text", text: `ERROR: download failed: ${error.message}` }],
|
|
2495
|
+
isError: true,
|
|
2496
|
+
};
|
|
2740
2497
|
}
|
|
2741
|
-
return { content: [{ type: "text", text: JSON.stringify(report, null, 2) }] };
|
|
2742
|
-
}
|
|
2743
|
-
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
2744
|
-
if (!datasetId) {
|
|
2745
|
-
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='dataset'");
|
|
2746
2498
|
}
|
|
2747
|
-
|
|
2748
|
-
|
|
2749
|
-
|
|
2750
|
-
|
|
2751
|
-
|
|
2752
|
-
|
|
2753
|
-
|
|
2754
|
-
|
|
2755
|
-
|
|
2756
|
-
|
|
2757
|
-
|
|
2758
|
-
|
|
2759
|
-
|
|
2499
|
+
case "vesper_download_assets": {
|
|
2500
|
+
hydrateExternalKeys();
|
|
2501
|
+
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
2502
|
+
const source = String(request.params.arguments?.source || "").trim().toLowerCase();
|
|
2503
|
+
// Auto-infer repo_id from dataset_id if not provided (common for HuggingFace)
|
|
2504
|
+
const repoId = request.params.arguments?.repo_id
|
|
2505
|
+
? String(request.params.arguments.repo_id)
|
|
2506
|
+
: (source === "huggingface" && datasetId.includes("/") ? datasetId : undefined);
|
|
2507
|
+
const kaggleRef = request.params.arguments?.kaggle_ref ? String(request.params.arguments.kaggle_ref) : undefined;
|
|
2508
|
+
const urls = Array.isArray(request.params.arguments?.urls)
|
|
2509
|
+
? (request.params.arguments?.urls).map(v => String(v))
|
|
2510
|
+
: undefined;
|
|
2511
|
+
const outputFormat = String(request.params.arguments?.output_format || "webdataset");
|
|
2512
|
+
const requestedOutputDir = request.params.arguments?.target_dir
|
|
2513
|
+
? String(request.params.arguments.target_dir).trim()
|
|
2514
|
+
: request.params.arguments?.output_dir
|
|
2515
|
+
? String(request.params.arguments.output_dir).trim()
|
|
2516
|
+
: undefined;
|
|
2517
|
+
const maxItems = request.params.arguments?.max_items ? Number(request.params.arguments.max_items) : undefined;
|
|
2518
|
+
const workers = request.params.arguments?.workers ? Number(request.params.arguments.workers) : 8;
|
|
2519
|
+
const imageColumn = request.params.arguments?.image_column ? String(request.params.arguments.image_column) : undefined;
|
|
2520
|
+
if (!datasetId || !source) {
|
|
2521
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id and source are required");
|
|
2760
2522
|
}
|
|
2761
|
-
|
|
2523
|
+
if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
|
|
2762
2524
|
return {
|
|
2763
|
-
content: [{ type: "text", text:
|
|
2764
|
-
isError: true
|
|
2525
|
+
content: [{ type: "text", text: "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds)." }],
|
|
2526
|
+
isError: true,
|
|
2765
2527
|
};
|
|
2766
2528
|
}
|
|
2767
|
-
|
|
2768
|
-
|
|
2769
|
-
|
|
2770
|
-
|
|
2771
|
-
|
|
2772
|
-
}
|
|
2773
|
-
case "preview_cleaning": {
|
|
2774
|
-
const datasetId = String(request.params.arguments?.dataset_id);
|
|
2775
|
-
const safeId = toSafeDatasetPathFragment(datasetId);
|
|
2776
|
-
const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
|
|
2777
|
-
const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
|
|
2778
|
-
let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
|
|
2779
|
-
if (datasetId === "demo" || !fs.existsSync(filePath)) {
|
|
2780
|
-
const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
|
|
2781
|
-
const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
|
|
2782
|
-
if (fs.existsSync(demoParquetPath)) {
|
|
2783
|
-
filePath = demoParquetPath;
|
|
2529
|
+
const requiredModules = [
|
|
2530
|
+
{ module: "aiohttp", packageName: "aiohttp" },
|
|
2531
|
+
];
|
|
2532
|
+
if (source === "url") {
|
|
2533
|
+
requiredModules.push({ module: "aiofiles", packageName: "aiofiles" });
|
|
2784
2534
|
}
|
|
2785
|
-
|
|
2786
|
-
|
|
2535
|
+
if (source === "huggingface") {
|
|
2536
|
+
requiredModules.push({ module: "datasets", packageName: "datasets" });
|
|
2537
|
+
requiredModules.push({ module: "PIL", packageName: "Pillow" });
|
|
2787
2538
|
}
|
|
2788
|
-
|
|
2789
|
-
|
|
2539
|
+
if (source === "kaggle") {
|
|
2540
|
+
requiredModules.push({ module: "kaggle", packageName: "kaggle" });
|
|
2790
2541
|
}
|
|
2791
|
-
|
|
2792
|
-
|
|
2793
|
-
|
|
2794
|
-
|
|
2795
|
-
|
|
2796
|
-
|
|
2797
|
-
|
|
2798
|
-
|
|
2799
|
-
|
|
2800
|
-
|
|
2801
|
-
|
|
2802
|
-
|
|
2803
|
-
|
|
2804
|
-
|
|
2805
|
-
|
|
2806
|
-
|
|
2807
|
-
|
|
2808
|
-
|
|
2809
|
-
|
|
2810
|
-
|
|
2811
|
-
|
|
2812
|
-
|
|
2813
|
-
|
|
2814
|
-
// To do it properly, I should have added `targetDetector` to the global scope in previous step.
|
|
2815
|
-
// Let's do that in a separate step if needed.
|
|
2816
|
-
// For now, I'll instantiate it here.
|
|
2817
|
-
const { TargetDetector } = await import("./preparation/target-detector.js");
|
|
2818
|
-
const detector = new TargetDetector(__dirname);
|
|
2819
|
-
const targetResult = await detector.detectTarget(filePath);
|
|
2820
|
-
const targetInfo = targetResult.target_column ? {
|
|
2821
|
-
target: targetResult.target_column,
|
|
2822
|
-
confidence: targetResult.confidence
|
|
2823
|
-
} : undefined;
|
|
2824
|
-
const plan = await cleaningPlanner.generatePlan(datasetId, report, undefined, targetInfo);
|
|
2825
|
-
let explanation = `### Cleaning Plan for ${datasetId}\n\n`;
|
|
2826
|
-
if (targetInfo && targetInfo.target !== "target" && targetInfo.confidence > 0.7) {
|
|
2827
|
-
explanation += `**Target Detected**: '${targetInfo.target}' (Confidence: ${targetInfo.confidence.toFixed(2)})\n`;
|
|
2828
|
-
explanation += ` - **Action**: Renaming to 'target' for consistency.\n\n`;
|
|
2829
|
-
}
|
|
2830
|
-
explanation += `Estimated Quality Improvement: +${plan.estimated_impact.quality_score_improvement} points\n\n`;
|
|
2831
|
-
if (plan.operations.length === 0) {
|
|
2832
|
-
explanation += "No cleaning operations required.";
|
|
2833
|
-
}
|
|
2834
|
-
else {
|
|
2835
|
-
plan.operations.forEach((op, i) => {
|
|
2836
|
-
explanation += `${i + 1}. **${op.type}**: ${op.reason}\n`;
|
|
2837
|
-
});
|
|
2838
|
-
}
|
|
2839
|
-
return {
|
|
2840
|
-
content: [{ type: "text", text: explanation }]
|
|
2841
|
-
};
|
|
2842
|
-
}
|
|
2843
|
-
case "custom_clean": {
|
|
2844
|
-
const datasetId = String(request.params.arguments?.dataset_id);
|
|
2845
|
-
const ops = request.params.arguments?.operations;
|
|
2846
|
-
if (!datasetId || datasetId === "undefined") {
|
|
2847
|
-
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
2848
|
-
}
|
|
2849
|
-
if (!ops || !Array.isArray(ops) || ops.length === 0) {
|
|
2850
|
-
throw new McpError(ErrorCode.InvalidParams, "operations array is required and must not be empty. Supported operations: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories");
|
|
2851
|
-
}
|
|
2852
|
-
// Pre-check: verify dataset file exists before starting the job
|
|
2853
|
-
const cleanRegEntry = getRegistryEntry(datasetId);
|
|
2854
|
-
const cleanRegPath = cleanRegEntry?.local_path || cleanRegEntry?.path;
|
|
2855
|
-
const cleanDlStatus = metadataStore.getDownloadStatus(datasetId);
|
|
2856
|
-
const cleanSafeId = toSafeDatasetPathFragment(datasetId);
|
|
2857
|
-
const cleanDataExists = (cleanRegPath && fs.existsSync(cleanRegPath)) ||
|
|
2858
|
-
(cleanDlStatus?.local_path && fs.existsSync(cleanDlStatus.local_path)) ||
|
|
2859
|
-
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.parquet`)) ||
|
|
2860
|
-
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.csv`)) ||
|
|
2861
|
-
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.feather`)) ||
|
|
2862
|
-
fs.existsSync(datasetId);
|
|
2863
|
-
if (!cleanDataExists) {
|
|
2864
|
-
return {
|
|
2865
|
-
content: [{ type: "text", text: `Dataset '${datasetId}' not found locally. Download it first using download_dataset or prepare_dataset, then run custom_clean.` }],
|
|
2866
|
-
isError: true,
|
|
2867
|
-
};
|
|
2868
|
-
}
|
|
2869
|
-
const job = jobManager.createJob("clean", 0, { datasetId, ops });
|
|
2870
|
-
return {
|
|
2871
|
-
content: [{ type: "text", text: `Cleaning job started. ID: ${job.id}. Use check_job_status to monitor progress.` }]
|
|
2872
|
-
};
|
|
2873
|
-
}
|
|
2874
|
-
case "prepare_dataset": {
|
|
2875
|
-
hydrateExternalKeys();
|
|
2876
|
-
const query = String(request.params.arguments?.query);
|
|
2877
|
-
const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
|
|
2878
|
-
const downloadImages = request.params.arguments?.download_images === true;
|
|
2879
|
-
const requestedOutputDir = request.params.arguments?.target_dir
|
|
2880
|
-
? String(request.params.arguments.target_dir).trim()
|
|
2881
|
-
: request.params.arguments?.output_dir
|
|
2882
|
-
? String(request.params.arguments.output_dir).trim()
|
|
2883
|
-
: "";
|
|
2884
|
-
const outputDir = requestedOutputDir || process.cwd();
|
|
2885
|
-
if (!query || query === "undefined") {
|
|
2886
|
-
throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
|
|
2887
|
-
}
|
|
2888
|
-
const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages, outputDir });
|
|
2889
|
-
return {
|
|
2890
|
-
content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
|
|
2891
|
-
};
|
|
2892
|
-
}
|
|
2893
|
-
case "compare_datasets": {
|
|
2894
|
-
const datasetIds = request.params.arguments?.dataset_ids;
|
|
2895
|
-
const datasets = datasetIds.map(id => metadataStore.getDataset(id)).filter(d => !!d);
|
|
2896
|
-
let comparison = "| Metric | " + datasets.map(d => d.name).join(" | ") + " |\n";
|
|
2897
|
-
comparison += "| :--- | " + datasets.map(() => " :---: ").join("|") + " |\n";
|
|
2898
|
-
comparison += "| Quality Score | " + datasets.map(d => `${d.quality_score}/100`).join(" | ") + " |\n";
|
|
2899
|
-
comparison += "| License | " + datasets.map(d => d.license.category).join(" | ") + " |\n";
|
|
2900
|
-
comparison += "| Downloads | " + datasets.map(d => d.downloads).join(" | ") + " |\n";
|
|
2901
|
-
comparison += "| Domain | " + datasets.map(d => d.domain).join(" | ") + " |\n";
|
|
2902
|
-
return {
|
|
2903
|
-
content: [{ type: "text", text: comparison }]
|
|
2904
|
-
};
|
|
2905
|
-
}
|
|
2906
|
-
case "check_job_status": {
|
|
2907
|
-
const jobId = String(request.params.arguments?.job_id);
|
|
2908
|
-
const job = metadataStore.getJob(jobId);
|
|
2909
|
-
if (!job) {
|
|
2910
|
-
throw new McpError(ErrorCode.InvalidParams, `Job not found: ${jobId}`);
|
|
2911
|
-
}
|
|
2912
|
-
const activeStatuses = new Set(["pending", "queued", "running", "retrying"]);
|
|
2913
|
-
const now = Date.now();
|
|
2914
|
-
const last = jobStatusLastPoll[jobId] || 0;
|
|
2915
|
-
const minPollMs = 3000;
|
|
2916
|
-
if (activeStatuses.has(job.status) && (now - last) < minPollMs) {
|
|
2917
|
-
const waitMs = minPollMs - (now - last);
|
|
2918
|
-
return {
|
|
2919
|
-
content: [{ type: "text", text: `No significant status change yet. Please poll again in ~${Math.ceil(waitMs / 1000)}s.` }]
|
|
2542
|
+
try {
|
|
2543
|
+
await ensurePythonModules(requiredModules);
|
|
2544
|
+
}
|
|
2545
|
+
catch (error) {
|
|
2546
|
+
return {
|
|
2547
|
+
content: [{ type: "text", text: `ERROR: Python dependency setup failed: ${error.message}` }],
|
|
2548
|
+
isError: true,
|
|
2549
|
+
};
|
|
2550
|
+
}
|
|
2551
|
+
const scriptPath = path.join(dataRoot, "python", "asset_downloader_engine.py");
|
|
2552
|
+
const payload = {
|
|
2553
|
+
dataset_id: datasetId,
|
|
2554
|
+
source,
|
|
2555
|
+
repo_id: repoId,
|
|
2556
|
+
kaggle_ref: kaggleRef,
|
|
2557
|
+
urls,
|
|
2558
|
+
output_format: outputFormat,
|
|
2559
|
+
output_dir: requestedOutputDir,
|
|
2560
|
+
max_items: maxItems,
|
|
2561
|
+
workers,
|
|
2562
|
+
image_column: imageColumn,
|
|
2563
|
+
output_root: requestedOutputDir || process.cwd(),
|
|
2564
|
+
recipes_dir: path.join(dataRoot, "recipes"),
|
|
2920
2565
|
};
|
|
2921
|
-
}
|
|
2922
|
-
jobStatusLastPoll[jobId] = now;
|
|
2923
|
-
if (job.status === "completed") {
|
|
2924
2566
|
try {
|
|
2925
|
-
const
|
|
2926
|
-
|
|
2927
|
-
|
|
2928
|
-
|
|
2929
|
-
|
|
2930
|
-
|
|
2931
|
-
|
|
2932
|
-
|
|
2933
|
-
|
|
2934
|
-
|
|
2935
|
-
|
|
2936
|
-
|
|
2937
|
-
|
|
2938
|
-
|
|
2939
|
-
|
|
2940
|
-
|
|
2941
|
-
|
|
2942
|
-
],
|
|
2943
|
-
}
|
|
2567
|
+
const result = await runPythonJson(scriptPath, ["download", JSON.stringify(payload)]);
|
|
2568
|
+
if (!result?.ok) {
|
|
2569
|
+
const errMsg = result?.error || "Unknown error";
|
|
2570
|
+
// Enhance error messages for common failures
|
|
2571
|
+
let hint = "";
|
|
2572
|
+
if (errMsg.includes("No image column")) {
|
|
2573
|
+
hint = "\n\nHint: Specify image_column parameter with the column name containing images/URLs.";
|
|
2574
|
+
}
|
|
2575
|
+
else if (errMsg.includes("Authentication") || errMsg.includes("401") || errMsg.includes("403")) {
|
|
2576
|
+
hint = "\n\nHint: Use configure_keys tool to set HF_TOKEN for gated/private datasets.";
|
|
2577
|
+
}
|
|
2578
|
+
return {
|
|
2579
|
+
content: [{ type: "text", text: `ERROR: asset download failed: ${errMsg}${hint}` }],
|
|
2580
|
+
isError: true,
|
|
2581
|
+
};
|
|
2582
|
+
}
|
|
2583
|
+
return {
|
|
2584
|
+
content: [{ type: "text", text: JSON.stringify(result.result, null, 2) }],
|
|
2585
|
+
};
|
|
2944
2586
|
}
|
|
2945
|
-
catch (
|
|
2946
|
-
|
|
2587
|
+
catch (error) {
|
|
2588
|
+
return {
|
|
2589
|
+
content: [{ type: "text", text: `ERROR: asset downloader execution failed: ${error.message}` }],
|
|
2590
|
+
isError: true,
|
|
2591
|
+
};
|
|
2592
|
+
}
|
|
2593
|
+
}
|
|
2594
|
+
case "configure_keys": {
|
|
2595
|
+
const hfToken = String(request.params.arguments?.hf_token || "").trim();
|
|
2596
|
+
const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
|
|
2597
|
+
const kaggleKey = String(request.params.arguments?.kaggle_key || "").trim();
|
|
2598
|
+
const dataworldToken = String(request.params.arguments?.dataworld_token || "").trim();
|
|
2599
|
+
const saved = [];
|
|
2600
|
+
const methods = [];
|
|
2601
|
+
if (hfToken) {
|
|
2602
|
+
const r = secureKeys.set("hf_token", hfToken);
|
|
2603
|
+
if (r.ok) {
|
|
2604
|
+
process.env.HF_TOKEN = hfToken;
|
|
2605
|
+
saved.push("HF token");
|
|
2606
|
+
if (r.method)
|
|
2607
|
+
methods.push(r.method);
|
|
2608
|
+
}
|
|
2947
2609
|
}
|
|
2948
|
-
|
|
2949
|
-
|
|
2950
|
-
|
|
2951
|
-
|
|
2952
|
-
|
|
2953
|
-
|
|
2954
|
-
|
|
2955
|
-
|
|
2956
|
-
const intermediateArtifacts = new Set();
|
|
2957
|
-
const requestedTargetDir = request.params.arguments?.target_dir
|
|
2958
|
-
? String(request.params.arguments?.target_dir).trim()
|
|
2959
|
-
: request.params.arguments?.output_dir
|
|
2960
|
-
? String(request.params.arguments?.output_dir).trim()
|
|
2961
|
-
: "";
|
|
2962
|
-
const targetDir = path.resolve(requestedTargetDir || process.cwd());
|
|
2963
|
-
const requestedFormat = String(request.params.arguments?.format || "feather");
|
|
2964
|
-
const fastMode = request.params.arguments?.fast === true;
|
|
2965
|
-
const preview = request.params.arguments?.preview === true;
|
|
2966
|
-
const sampleRows = request.params.arguments?.sample_rows ? Number(request.params.arguments.sample_rows) : undefined;
|
|
2967
|
-
const columns = request.params.arguments?.columns;
|
|
2968
|
-
const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
|
|
2969
|
-
// Use Metadata or Registry to find the actual local file
|
|
2970
|
-
const preferredLookupDirs = [targetDir, process.cwd()];
|
|
2971
|
-
let sourcePath = resolveDatasetLocalPath(datasetId, preferredLookupDirs);
|
|
2972
|
-
if (!sourcePath) {
|
|
2973
|
-
console.error(`[Export] No local data found for ${datasetId}. Attempting to prepare automatically...`);
|
|
2974
|
-
// Start a prepare job for this dataset id (acts like calling prepare_dataset)
|
|
2975
|
-
try {
|
|
2976
|
-
jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false, outputDir: process.cwd() });
|
|
2610
|
+
if (kaggleUsername) {
|
|
2611
|
+
const r = secureKeys.set("kaggle_username", kaggleUsername);
|
|
2612
|
+
if (r.ok) {
|
|
2613
|
+
process.env.KAGGLE_USERNAME = kaggleUsername;
|
|
2614
|
+
saved.push("Kaggle username");
|
|
2615
|
+
if (r.method)
|
|
2616
|
+
methods.push(r.method);
|
|
2617
|
+
}
|
|
2977
2618
|
}
|
|
2978
|
-
|
|
2979
|
-
|
|
2619
|
+
if (kaggleKey) {
|
|
2620
|
+
const r = secureKeys.set("kaggle_key", kaggleKey);
|
|
2621
|
+
if (r.ok) {
|
|
2622
|
+
process.env.KAGGLE_KEY = kaggleKey;
|
|
2623
|
+
saved.push("Kaggle key");
|
|
2624
|
+
if (r.method)
|
|
2625
|
+
methods.push(r.method);
|
|
2626
|
+
}
|
|
2980
2627
|
}
|
|
2981
|
-
|
|
2982
|
-
|
|
2983
|
-
|
|
2984
|
-
|
|
2985
|
-
|
|
2986
|
-
|
|
2987
|
-
|
|
2988
|
-
if (resolved) {
|
|
2989
|
-
sourcePath = resolved;
|
|
2990
|
-
console.error(`[Export] Local data is now available for ${datasetId}: ${sourcePath}`);
|
|
2991
|
-
break;
|
|
2628
|
+
if (dataworldToken) {
|
|
2629
|
+
const r = secureKeys.set("dataworld_token", dataworldToken);
|
|
2630
|
+
if (r.ok) {
|
|
2631
|
+
process.env.DW_AUTH_TOKEN = dataworldToken;
|
|
2632
|
+
saved.push("data.world token");
|
|
2633
|
+
if (r.method)
|
|
2634
|
+
methods.push(r.method);
|
|
2992
2635
|
}
|
|
2993
|
-
await wait(interval);
|
|
2994
|
-
waited += interval;
|
|
2995
2636
|
}
|
|
2996
|
-
|
|
2997
|
-
if (!sourcePath) {
|
|
2998
|
-
const entries = readRegistry();
|
|
2999
|
-
const listText = entries.length === 0 ? "(no prepared datasets found)" : entries.map(e => `- ${e.dataset_id || e.id || "unknown"}: ${e.local_path || e.path || "unknown"}`).slice(0, 10).join("\n") + (entries.length > 10 ? "\n...and " + (entries.length - 10) + " more" : "");
|
|
2637
|
+
if (saved.length === 0) {
|
|
3000
2638
|
return {
|
|
3001
|
-
content: [{ type: "text", text:
|
|
3002
|
-
isError: true
|
|
2639
|
+
content: [{ type: "text", text: "No keys provided. Core Vesper tools continue to work without API keys." }]
|
|
3003
2640
|
};
|
|
3004
2641
|
}
|
|
2642
|
+
return {
|
|
2643
|
+
content: [{ type: "text", text: `Key saved securely. Updated: ${saved.join(", ")}.` }]
|
|
2644
|
+
};
|
|
3005
2645
|
}
|
|
3006
|
-
|
|
3007
|
-
|
|
3008
|
-
if (!
|
|
3009
|
-
|
|
2646
|
+
case "get_dataset_info": {
|
|
2647
|
+
const datasetId = String(request.params.arguments?.dataset_id);
|
|
2648
|
+
if (!datasetId) {
|
|
2649
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
3010
2650
|
}
|
|
3011
|
-
|
|
3012
|
-
|
|
3013
|
-
|
|
3014
|
-
|
|
3015
|
-
|
|
3016
|
-
|
|
3017
|
-
|
|
3018
|
-
|
|
3019
|
-
|
|
3020
|
-
|
|
3021
|
-
|
|
2651
|
+
const dataset = metadataStore.getDataset(datasetId);
|
|
2652
|
+
if (!dataset) {
|
|
2653
|
+
// Fallback: check the registry for local path info
|
|
2654
|
+
const regEntry = getRegistryEntry(datasetId);
|
|
2655
|
+
const regPath = regEntry?.local_path || regEntry?.path;
|
|
2656
|
+
if (regEntry) {
|
|
2657
|
+
const exists = regPath && fs.existsSync(regPath);
|
|
2658
|
+
return {
|
|
2659
|
+
content: [{ type: "text", text: `**${datasetId}** (from registry)\n- Local path: ${regPath || "unknown"}\n- Status: ${regEntry.status || "unknown"}${exists ? "" : " (file missing)"}\n\nNote: Full metadata not available in metadata store. Use prepare_dataset to get full details.` }],
|
|
2660
|
+
};
|
|
2661
|
+
}
|
|
2662
|
+
return {
|
|
2663
|
+
content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}. Use vesper_list_datasets to see available datasets, or prepare_dataset to add new ones.` }],
|
|
2664
|
+
isError: true,
|
|
2665
|
+
};
|
|
3022
2666
|
}
|
|
3023
|
-
|
|
3024
|
-
|
|
2667
|
+
// Enrich: if total_examples is 0-ish, try the HF datasets-server /size API
|
|
2668
|
+
if ((!dataset.total_examples || dataset.total_examples === 0) && dataset.source === "huggingface") {
|
|
3025
2669
|
try {
|
|
3026
|
-
const
|
|
3027
|
-
|
|
3028
|
-
|
|
3029
|
-
|
|
3030
|
-
|
|
3031
|
-
|
|
3032
|
-
|
|
3033
|
-
|
|
3034
|
-
|
|
3035
|
-
|
|
3036
|
-
|
|
3037
|
-
|
|
3038
|
-
|
|
3039
|
-
|
|
3040
|
-
|
|
2670
|
+
const sizeResp = await fetch(`https://datasets-server.huggingface.co/size?dataset=${encodeURIComponent(dataset.id)}`);
|
|
2671
|
+
if (sizeResp.ok) {
|
|
2672
|
+
const sizeData = await sizeResp.json();
|
|
2673
|
+
const numRows = sizeData?.size?.dataset?.num_rows;
|
|
2674
|
+
if (numRows && numRows > 0) {
|
|
2675
|
+
dataset.total_examples = numRows;
|
|
2676
|
+
// Also backfill splits
|
|
2677
|
+
if (sizeData?.size?.splits && Array.isArray(sizeData.size.splits)) {
|
|
2678
|
+
dataset.splits = sizeData.size.splits.map((s) => ({
|
|
2679
|
+
name: s.split,
|
|
2680
|
+
num_examples: s.num_rows || 0,
|
|
2681
|
+
size_bytes: s.num_bytes_parquet_files || 0,
|
|
2682
|
+
}));
|
|
2683
|
+
dataset.has_train_split = dataset.splits.some((s) => s.name === "train");
|
|
2684
|
+
dataset.has_test_split = dataset.splits.some((s) => s.name === "test");
|
|
2685
|
+
dataset.has_validation_split = dataset.splits.some((s) => s.name === "validation" || s.name === "val");
|
|
3041
2686
|
}
|
|
3042
|
-
|
|
3043
|
-
|
|
3044
|
-
console.error(`[Registry] Failed to update registry for ${datasetId}: ${e?.message || e}`);
|
|
2687
|
+
// Persist enriched metadata
|
|
2688
|
+
metadataStore.saveDataset(dataset);
|
|
3045
2689
|
}
|
|
3046
2690
|
}
|
|
3047
2691
|
}
|
|
3048
|
-
catch
|
|
3049
|
-
|
|
2692
|
+
catch {
|
|
2693
|
+
// Enrichment is best-effort; continue with whatever we have
|
|
3050
2694
|
}
|
|
3051
2695
|
}
|
|
2696
|
+
const formattedOutput = formatDatasetInfo(dataset);
|
|
2697
|
+
return { content: [{ type: "text", text: formattedOutput }] };
|
|
3052
2698
|
}
|
|
3053
|
-
|
|
3054
|
-
|
|
3055
|
-
|
|
3056
|
-
|
|
3057
|
-
|
|
3058
|
-
|
|
3059
|
-
|
|
3060
|
-
|
|
3061
|
-
|
|
3062
|
-
|
|
3063
|
-
|
|
3064
|
-
|
|
3065
|
-
|
|
3066
|
-
|
|
3067
|
-
|
|
3068
|
-
|
|
3069
|
-
|
|
3070
|
-
|
|
3071
|
-
|
|
3072
|
-
|
|
3073
|
-
|
|
3074
|
-
|
|
3075
|
-
|
|
3076
|
-
|
|
3077
|
-
|
|
3078
|
-
|
|
3079
|
-
|
|
3080
|
-
tool: "export_dataset",
|
|
3081
|
-
requestArgs: request.params.arguments,
|
|
3082
|
-
outputPath: result.output_path,
|
|
3083
|
-
output: {
|
|
3084
|
-
rows: result.rows,
|
|
3085
|
-
columns: result.columns,
|
|
3086
|
-
format: requestedFormat,
|
|
3087
|
-
size_mb: result.file_size_mb,
|
|
3088
|
-
schema_before: schemaBefore,
|
|
3089
|
-
schema_after: schemaAfter,
|
|
3090
|
-
},
|
|
3091
|
-
steps: [
|
|
3092
|
-
{ step: "source_resolved", at: new Date().toISOString(), params: { sourcePath } },
|
|
3093
|
-
{ step: "exported", at: new Date().toISOString(), params: { format: requestedFormat, compression }, metrics: { rows: result.rows, columns: result.columns } },
|
|
3094
|
-
],
|
|
3095
|
-
});
|
|
3096
|
-
try {
|
|
3097
|
-
upsertRegistry(lineage.datasetVersionId, result.output_path, "completed");
|
|
2699
|
+
case "quality_analyze":
|
|
2700
|
+
case "analyze_quality":
|
|
2701
|
+
case "analyze_image_quality":
|
|
2702
|
+
case "analyze_media_quality":
|
|
2703
|
+
case "generate_quality_report": {
|
|
2704
|
+
const resolvedOperation = request.params.name === "analyze_image_quality"
|
|
2705
|
+
? "image"
|
|
2706
|
+
: request.params.name === "analyze_media_quality"
|
|
2707
|
+
? "media"
|
|
2708
|
+
: request.params.name === "generate_quality_report"
|
|
2709
|
+
? "report"
|
|
2710
|
+
: String(request.params.arguments?.operation || "dataset").toLowerCase();
|
|
2711
|
+
if (resolvedOperation === "image") {
|
|
2712
|
+
const inputPath = String(request.params.arguments?.path || "").trim();
|
|
2713
|
+
if (!inputPath || !fs.existsSync(inputPath)) {
|
|
2714
|
+
throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
|
|
2715
|
+
}
|
|
2716
|
+
const report = await imageAnalyzer.analyze(inputPath);
|
|
2717
|
+
return { content: [{ type: "text", text: JSON.stringify(report, null, 2) }] };
|
|
2718
|
+
}
|
|
2719
|
+
if (resolvedOperation === "media") {
|
|
2720
|
+
const inputPath = String(request.params.arguments?.path || "").trim();
|
|
2721
|
+
if (!inputPath || !fs.existsSync(inputPath)) {
|
|
2722
|
+
throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
|
|
2723
|
+
}
|
|
2724
|
+
const report = await mediaAnalyzer.analyze(inputPath);
|
|
2725
|
+
return { content: [{ type: "text", text: JSON.stringify(report, null, 2) }] };
|
|
3098
2726
|
}
|
|
3099
|
-
|
|
3100
|
-
|
|
3101
|
-
|
|
3102
|
-
|
|
3103
|
-
|
|
3104
|
-
|
|
3105
|
-
|
|
3106
|
-
|
|
3107
|
-
|
|
3108
|
-
|
|
3109
|
-
|
|
3110
|
-
|
|
3111
|
-
|
|
3112
|
-
|
|
3113
|
-
|
|
3114
|
-
|
|
3115
|
-
|
|
3116
|
-
msg += ` Python: \`pd.read_feather('${result.output_path}').head()\`\n`;
|
|
3117
|
-
msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
|
|
2727
|
+
if (resolvedOperation === "report") {
|
|
2728
|
+
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
2729
|
+
const datasetPath = String(request.params.arguments?.dataset_path || "").trim();
|
|
2730
|
+
if (!datasetId) {
|
|
2731
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='report'");
|
|
2732
|
+
}
|
|
2733
|
+
if (!datasetPath || !fs.existsSync(datasetPath)) {
|
|
2734
|
+
throw new McpError(ErrorCode.InvalidParams, `Dataset path not found: ${datasetPath}`);
|
|
2735
|
+
}
|
|
2736
|
+
const metadata = await metadataStore.getDataset(datasetId);
|
|
2737
|
+
const textQuality = null;
|
|
2738
|
+
const report = await qualityOrchestrator.generateReport(datasetId, datasetPath, textQuality);
|
|
2739
|
+
if (metadata) {
|
|
2740
|
+
metadata.unified_quality_report = report;
|
|
2741
|
+
await metadataStore.saveDataset(metadata);
|
|
2742
|
+
}
|
|
2743
|
+
return { content: [{ type: "text", text: JSON.stringify(report, null, 2) }] };
|
|
3118
2744
|
}
|
|
3119
|
-
|
|
3120
|
-
|
|
3121
|
-
|
|
3122
|
-
msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
|
|
2745
|
+
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
2746
|
+
if (!datasetId) {
|
|
2747
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='dataset'");
|
|
3123
2748
|
}
|
|
3124
|
-
|
|
3125
|
-
|
|
3126
|
-
|
|
3127
|
-
|
|
2749
|
+
const safeId = toSafeDatasetPathFragment(datasetId);
|
|
2750
|
+
const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
|
|
2751
|
+
const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
|
|
2752
|
+
let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
|
|
2753
|
+
// Demo Fallback for easy testing
|
|
2754
|
+
if (datasetId === "demo" || !fs.existsSync(filePath)) {
|
|
2755
|
+
const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
|
|
2756
|
+
const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
|
|
2757
|
+
if (fs.existsSync(demoParquetPath)) {
|
|
2758
|
+
filePath = demoParquetPath;
|
|
2759
|
+
}
|
|
2760
|
+
else if (fs.existsSync(demoCsvPath)) {
|
|
2761
|
+
filePath = demoCsvPath;
|
|
2762
|
+
}
|
|
2763
|
+
else if (datasetId !== "demo") {
|
|
2764
|
+
return {
|
|
2765
|
+
content: [{ type: "text", text: `ERROR: Local data file for ${datasetId} not found. Try running prepare_dataset first, or use 'demo' as the dataset_id.` }],
|
|
2766
|
+
isError: true
|
|
2767
|
+
};
|
|
2768
|
+
}
|
|
2769
|
+
}
|
|
2770
|
+
const report = await qualityAnalyzer.analyze(filePath);
|
|
3128
2771
|
return {
|
|
3129
|
-
content: [{ type: "text", text:
|
|
3130
|
-
isError: true
|
|
2772
|
+
content: [{ type: "text", text: JSON.stringify(report, null, 2) }]
|
|
3131
2773
|
};
|
|
3132
2774
|
}
|
|
3133
|
-
|
|
3134
|
-
|
|
3135
|
-
|
|
3136
|
-
|
|
2775
|
+
case "preview_cleaning": {
|
|
2776
|
+
const datasetId = String(request.params.arguments?.dataset_id);
|
|
2777
|
+
const safeId = toSafeDatasetPathFragment(datasetId);
|
|
2778
|
+
const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
|
|
2779
|
+
const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
|
|
2780
|
+
let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
|
|
2781
|
+
if (datasetId === "demo" || !fs.existsSync(filePath)) {
|
|
2782
|
+
const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
|
|
2783
|
+
const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
|
|
2784
|
+
if (fs.existsSync(demoParquetPath)) {
|
|
2785
|
+
filePath = demoParquetPath;
|
|
2786
|
+
}
|
|
2787
|
+
else if (fs.existsSync(demoCsvPath)) {
|
|
2788
|
+
filePath = demoCsvPath;
|
|
2789
|
+
}
|
|
2790
|
+
else {
|
|
2791
|
+
throw new McpError(ErrorCode.InvalidParams, `Local data file not found for ${datasetId}. Please run prepare_dataset first.`);
|
|
2792
|
+
}
|
|
2793
|
+
}
|
|
2794
|
+
const report = await qualityAnalyzer.analyze(filePath);
|
|
2795
|
+
// Phase 1: Target Detection
|
|
2796
|
+
// We use the same TargetDetector instance inside CleaningPlanner now?
|
|
2797
|
+
// Actually, we instantiated it inside CleaningPlanner, so we just need to pass the file path
|
|
2798
|
+
// OR let the planner handle it if we update its signature to accept filePath.
|
|
2799
|
+
// Let's check `CleaningPlanner.generatePlan` signature again.
|
|
2800
|
+
// We updated it to accept `targetInfo`.
|
|
2801
|
+
// So we need to run detection HERE and pass it.
|
|
2802
|
+
// But `TargetDetector` is not exposed in `index.ts` scope yet.
|
|
2803
|
+
// Let's create a global instance or use the one inside planner if exposed (it's private).
|
|
2804
|
+
// Better approach: Instantiate TargetDetector here in index.ts for the tool content.
|
|
2805
|
+
// Quick fix: Instantiate local detector or make global.
|
|
2806
|
+
// I'll make a global `targetDetector` constant in index.ts
|
|
2807
|
+
// But wait, I updated `CleaningPlanner` to instantiate its own detector.
|
|
2808
|
+
// Does `CleaningPlanner` use it? No, I commented out the logic because it needed `filePath`.
|
|
2809
|
+
// RETRY STRATEGY:
|
|
2810
|
+
// 1. Instantiate `targetDetector` in `index.ts`.
|
|
2811
|
+
// 2. Run `detectTarget(filePath)`.
|
|
2812
|
+
// 3. Pass result to `cleaningPlanner.generatePlan(..., targetInfo)`.
|
|
2813
|
+
// I need to add `const targetDetector = new TargetDetector(__dirname);` to imports/init section first.
|
|
2814
|
+
// But since I'm in this tool, I can't look back.
|
|
2815
|
+
// I will assume I can add it, or just do it inside the case for now.
|
|
2816
|
+
// To do it properly, I should have added `targetDetector` to the global scope in previous step.
|
|
2817
|
+
// Let's do that in a separate step if needed.
|
|
2818
|
+
// For now, I'll instantiate it here.
|
|
2819
|
+
const { TargetDetector } = await import("./preparation/target-detector.js");
|
|
2820
|
+
const detector = new TargetDetector(__dirname);
|
|
2821
|
+
const targetResult = await detector.detectTarget(filePath);
|
|
2822
|
+
const targetInfo = targetResult.target_column ? {
|
|
2823
|
+
target: targetResult.target_column,
|
|
2824
|
+
confidence: targetResult.confidence
|
|
2825
|
+
} : undefined;
|
|
2826
|
+
const plan = await cleaningPlanner.generatePlan(datasetId, report, undefined, targetInfo);
|
|
2827
|
+
let explanation = `### Cleaning Plan for ${datasetId}\n\n`;
|
|
2828
|
+
if (targetInfo && targetInfo.target !== "target" && targetInfo.confidence > 0.7) {
|
|
2829
|
+
explanation += `**Target Detected**: '${targetInfo.target}' (Confidence: ${targetInfo.confidence.toFixed(2)})\n`;
|
|
2830
|
+
explanation += ` - **Action**: Renaming to 'target' for consistency.\n\n`;
|
|
2831
|
+
}
|
|
2832
|
+
explanation += `Estimated Quality Improvement: +${plan.estimated_impact.quality_score_improvement} points\n\n`;
|
|
2833
|
+
if (plan.operations.length === 0) {
|
|
2834
|
+
explanation += "No cleaning operations required.";
|
|
2835
|
+
}
|
|
2836
|
+
else {
|
|
2837
|
+
plan.operations.forEach((op, i) => {
|
|
2838
|
+
explanation += `${i + 1}. **${op.type}**: ${op.reason}\n`;
|
|
2839
|
+
});
|
|
2840
|
+
}
|
|
3137
2841
|
return {
|
|
3138
|
-
content: [{ type: "text", text:
|
|
2842
|
+
content: [{ type: "text", text: explanation }]
|
|
3139
2843
|
};
|
|
3140
2844
|
}
|
|
3141
|
-
|
|
3142
|
-
const
|
|
3143
|
-
const
|
|
3144
|
-
|
|
3145
|
-
|
|
3146
|
-
|
|
3147
|
-
|
|
3148
|
-
|
|
3149
|
-
|
|
3150
|
-
|
|
3151
|
-
|
|
3152
|
-
|
|
3153
|
-
|
|
3154
|
-
|
|
3155
|
-
|
|
3156
|
-
|
|
3157
|
-
|
|
3158
|
-
|
|
2845
|
+
case "custom_clean": {
|
|
2846
|
+
const datasetId = String(request.params.arguments?.dataset_id);
|
|
2847
|
+
const ops = request.params.arguments?.operations;
|
|
2848
|
+
if (!datasetId || datasetId === "undefined") {
|
|
2849
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
2850
|
+
}
|
|
2851
|
+
if (!ops || !Array.isArray(ops) || ops.length === 0) {
|
|
2852
|
+
throw new McpError(ErrorCode.InvalidParams, "operations array is required and must not be empty. Supported operations: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories");
|
|
2853
|
+
}
|
|
2854
|
+
// Pre-check: verify dataset file exists before starting the job
|
|
2855
|
+
const cleanRegEntry = getRegistryEntry(datasetId);
|
|
2856
|
+
const cleanRegPath = cleanRegEntry?.local_path || cleanRegEntry?.path;
|
|
2857
|
+
const cleanDlStatus = metadataStore.getDownloadStatus(datasetId);
|
|
2858
|
+
const cleanSafeId = toSafeDatasetPathFragment(datasetId);
|
|
2859
|
+
const cleanDataExists = (cleanRegPath && fs.existsSync(cleanRegPath)) ||
|
|
2860
|
+
(cleanDlStatus?.local_path && fs.existsSync(cleanDlStatus.local_path)) ||
|
|
2861
|
+
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.parquet`)) ||
|
|
2862
|
+
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.csv`)) ||
|
|
2863
|
+
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.feather`)) ||
|
|
2864
|
+
fs.existsSync(datasetId);
|
|
2865
|
+
if (!cleanDataExists) {
|
|
2866
|
+
return {
|
|
2867
|
+
content: [{ type: "text", text: `Dataset '${datasetId}' not found locally. Download it first using download_dataset or prepare_dataset, then run custom_clean.` }],
|
|
2868
|
+
isError: true,
|
|
2869
|
+
};
|
|
2870
|
+
}
|
|
2871
|
+
const job = jobManager.createJob("clean", 0, { datasetId, ops });
|
|
2872
|
+
return {
|
|
2873
|
+
content: [{ type: "text", text: `Cleaning job started. ID: ${job.id}. Use check_job_status to monitor progress.` }]
|
|
2874
|
+
};
|
|
3159
2875
|
}
|
|
3160
|
-
|
|
2876
|
+
case "prepare_dataset": {
|
|
2877
|
+
hydrateExternalKeys();
|
|
2878
|
+
const query = String(request.params.arguments?.query);
|
|
2879
|
+
const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
|
|
2880
|
+
const downloadImages = request.params.arguments?.download_images === true;
|
|
2881
|
+
const requestedOutputDir = request.params.arguments?.target_dir
|
|
2882
|
+
? String(request.params.arguments.target_dir).trim()
|
|
2883
|
+
: request.params.arguments?.output_dir
|
|
2884
|
+
? String(request.params.arguments.output_dir).trim()
|
|
2885
|
+
: "";
|
|
2886
|
+
const outputDir = requestedOutputDir || process.cwd();
|
|
2887
|
+
if (!query || query === "undefined") {
|
|
2888
|
+
throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
|
|
2889
|
+
}
|
|
2890
|
+
const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages, outputDir });
|
|
3161
2891
|
return {
|
|
3162
|
-
content: [{ type: "text", text: `
|
|
3163
|
-
isError: true,
|
|
2892
|
+
content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
|
|
3164
2893
|
};
|
|
3165
2894
|
}
|
|
3166
|
-
|
|
3167
|
-
|
|
3168
|
-
|
|
3169
|
-
|
|
2895
|
+
case "compare_datasets": {
|
|
2896
|
+
const datasetIds = request.params.arguments?.dataset_ids;
|
|
2897
|
+
const datasets = datasetIds.map(id => metadataStore.getDataset(id)).filter(d => !!d);
|
|
2898
|
+
let comparison = "| Metric | " + datasets.map(d => d.name).join(" | ") + " |\n";
|
|
2899
|
+
comparison += "| :--- | " + datasets.map(() => " :---: ").join("|") + " |\n";
|
|
2900
|
+
comparison += "| Quality Score | " + datasets.map(d => `${d.quality_score}/100`).join(" | ") + " |\n";
|
|
2901
|
+
comparison += "| License | " + datasets.map(d => d.license.category).join(" | ") + " |\n";
|
|
2902
|
+
comparison += "| Downloads | " + datasets.map(d => d.downloads).join(" | ") + " |\n";
|
|
2903
|
+
comparison += "| Domain | " + datasets.map(d => d.domain).join(" | ") + " |\n";
|
|
3170
2904
|
return {
|
|
3171
|
-
content: [{ type: "text", text:
|
|
2905
|
+
content: [{ type: "text", text: comparison }]
|
|
3172
2906
|
};
|
|
3173
2907
|
}
|
|
3174
|
-
|
|
3175
|
-
|
|
3176
|
-
|
|
3177
|
-
|
|
3178
|
-
|
|
3179
|
-
|
|
3180
|
-
const
|
|
3181
|
-
|
|
2908
|
+
case "check_job_status": {
|
|
2909
|
+
const jobId = String(request.params.arguments?.job_id);
|
|
2910
|
+
const job = metadataStore.getJob(jobId);
|
|
2911
|
+
if (!job) {
|
|
2912
|
+
throw new McpError(ErrorCode.InvalidParams, `Job not found: ${jobId}`);
|
|
2913
|
+
}
|
|
2914
|
+
const activeStatuses = new Set(["pending", "queued", "running", "retrying"]);
|
|
2915
|
+
const now = Date.now();
|
|
2916
|
+
const last = jobStatusLastPoll[jobId] || 0;
|
|
2917
|
+
const minPollMs = 3000;
|
|
2918
|
+
if (activeStatuses.has(job.status) && (now - last) < minPollMs) {
|
|
2919
|
+
const waitMs = minPollMs - (now - last);
|
|
3182
2920
|
return {
|
|
3183
|
-
content: [{ type: "text", text: `
|
|
3184
|
-
isError: true,
|
|
2921
|
+
content: [{ type: "text", text: `No significant status change yet. Please poll again in ~${Math.ceil(waitMs / 1000)}s.` }]
|
|
3185
2922
|
};
|
|
3186
2923
|
}
|
|
3187
|
-
|
|
3188
|
-
|
|
3189
|
-
|
|
3190
|
-
|
|
3191
|
-
|
|
3192
|
-
|
|
3193
|
-
|
|
3194
|
-
|
|
3195
|
-
|
|
3196
|
-
|
|
3197
|
-
|
|
3198
|
-
|
|
3199
|
-
|
|
3200
|
-
|
|
3201
|
-
|
|
3202
|
-
|
|
3203
|
-
|
|
3204
|
-
|
|
3205
|
-
|
|
3206
|
-
|
|
3207
|
-
|
|
3208
|
-
|
|
3209
|
-
|
|
3210
|
-
|
|
3211
|
-
|
|
3212
|
-
|
|
3213
|
-
upsertRegistry(lineage.datasetVersionId, outputPath, "completed");
|
|
2924
|
+
jobStatusLastPoll[jobId] = now;
|
|
2925
|
+
if (job.status === "completed") {
|
|
2926
|
+
try {
|
|
2927
|
+
const meta = job.metadata ? JSON.parse(job.metadata) : {};
|
|
2928
|
+
const baseId = String(meta?.datasetId || meta?.dataset_id || meta?.query || job.id);
|
|
2929
|
+
const outPath = typeof job.result_url === "string" ? job.result_url : undefined;
|
|
2930
|
+
appendLineageVersion({
|
|
2931
|
+
datasetIdBase: baseId,
|
|
2932
|
+
tool: `job:${job.type}`,
|
|
2933
|
+
requestArgs: {
|
|
2934
|
+
dataset_id: meta?.datasetId || meta?.dataset_id,
|
|
2935
|
+
query: meta?.query,
|
|
2936
|
+
pipeline_id: meta?.pipeline_id,
|
|
2937
|
+
agent_id: meta?.agent_id,
|
|
2938
|
+
},
|
|
2939
|
+
outputPath: outPath,
|
|
2940
|
+
output: {},
|
|
2941
|
+
steps: [
|
|
2942
|
+
{ step: `${job.type}_started`, at: job.created_at, params: meta || {} },
|
|
2943
|
+
{ step: `${job.type}_completed`, at: job.updated_at || new Date().toISOString(), metrics: { progress: job.progress } },
|
|
2944
|
+
],
|
|
2945
|
+
});
|
|
2946
|
+
}
|
|
2947
|
+
catch (e) {
|
|
2948
|
+
console.error(`[Lineage] check_job_status append failed: ${e?.message || e}`);
|
|
2949
|
+
}
|
|
3214
2950
|
}
|
|
3215
|
-
catch { }
|
|
3216
|
-
let msg = `**Conversion complete**\n`;
|
|
3217
|
-
msg += `- **Input**: ${filePath} (${inputExt.slice(1)})\n`;
|
|
3218
|
-
msg += `- **Output**: ${result.output_path} (${targetFormat})\n`;
|
|
3219
|
-
msg += `- **Version**: ${lineage.datasetVersionId}\n`;
|
|
3220
|
-
msg += `- **Lineage**: ${lineage.lineagePath}\n`;
|
|
3221
|
-
msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
|
|
3222
|
-
if (result.size_mb !== undefined)
|
|
3223
|
-
msg += `- **Size**: ${result.size_mb} MB\n`;
|
|
3224
|
-
return { content: [{ type: "text", text: msg }] };
|
|
3225
|
-
}
|
|
3226
|
-
catch (error) {
|
|
3227
2951
|
return {
|
|
3228
|
-
content: [{ type: "text", text:
|
|
3229
|
-
isError: true,
|
|
2952
|
+
content: [{ type: "text", text: formatJobStatus(job) }]
|
|
3230
2953
|
};
|
|
3231
2954
|
}
|
|
3232
|
-
|
|
3233
|
-
|
|
3234
|
-
|
|
3235
|
-
|
|
3236
|
-
|
|
3237
|
-
|
|
3238
|
-
|
|
3239
|
-
|
|
3240
|
-
|
|
3241
|
-
|
|
3242
|
-
|
|
3243
|
-
|
|
3244
|
-
|
|
3245
|
-
|
|
3246
|
-
|
|
3247
|
-
|
|
3248
|
-
|
|
3249
|
-
|
|
3250
|
-
|
|
3251
|
-
|
|
3252
|
-
|
|
3253
|
-
|
|
3254
|
-
|
|
3255
|
-
|
|
3256
|
-
|
|
3257
|
-
|
|
3258
|
-
|
|
3259
|
-
|
|
3260
|
-
|
|
3261
|
-
|
|
3262
|
-
|
|
3263
|
-
|
|
3264
|
-
|
|
3265
|
-
|
|
2955
|
+
case "export_dataset": {
|
|
2956
|
+
const datasetId = String(request.params.arguments?.dataset_id);
|
|
2957
|
+
const isDirectLocalInput = isDirectLocalDatasetReference(datasetId);
|
|
2958
|
+
const intermediateArtifacts = new Set();
|
|
2959
|
+
const requestedTargetDir = request.params.arguments?.target_dir
|
|
2960
|
+
? String(request.params.arguments?.target_dir).trim()
|
|
2961
|
+
: request.params.arguments?.output_dir
|
|
2962
|
+
? String(request.params.arguments?.output_dir).trim()
|
|
2963
|
+
: "";
|
|
2964
|
+
const targetDir = path.resolve(requestedTargetDir || process.cwd());
|
|
2965
|
+
const requestedFormat = String(request.params.arguments?.format || "feather");
|
|
2966
|
+
const fastMode = request.params.arguments?.fast === true;
|
|
2967
|
+
const preview = request.params.arguments?.preview === true;
|
|
2968
|
+
const sampleRows = request.params.arguments?.sample_rows ? Number(request.params.arguments.sample_rows) : undefined;
|
|
2969
|
+
const columns = request.params.arguments?.columns;
|
|
2970
|
+
const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
|
|
2971
|
+
// Use Metadata or Registry to find the actual local file
|
|
2972
|
+
const preferredLookupDirs = [targetDir, process.cwd()];
|
|
2973
|
+
let sourcePath = resolveDatasetLocalPath(datasetId, preferredLookupDirs);
|
|
2974
|
+
if (!sourcePath) {
|
|
2975
|
+
console.error(`[Export] No local data found for ${datasetId}. Attempting to prepare automatically...`);
|
|
2976
|
+
// Start a prepare job for this dataset id (acts like calling prepare_dataset)
|
|
2977
|
+
try {
|
|
2978
|
+
jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false, outputDir: process.cwd() });
|
|
2979
|
+
}
|
|
2980
|
+
catch (e) {
|
|
2981
|
+
console.error(`[Export] Failed to start prepare job for ${datasetId}: ${e?.message || e}`);
|
|
2982
|
+
}
|
|
2983
|
+
// Poll for download status or registry entry until local_path appears or timeout
|
|
2984
|
+
const wait = (ms) => new Promise(res => setTimeout(res, ms));
|
|
2985
|
+
const maxWait = 120_000; // 120s
|
|
2986
|
+
const interval = 2000;
|
|
2987
|
+
let waited = 0;
|
|
2988
|
+
while (waited < maxWait) {
|
|
2989
|
+
const resolved = resolveDatasetLocalPath(datasetId, preferredLookupDirs);
|
|
2990
|
+
if (resolved) {
|
|
2991
|
+
sourcePath = resolved;
|
|
2992
|
+
console.error(`[Export] Local data is now available for ${datasetId}: ${sourcePath}`);
|
|
2993
|
+
break;
|
|
2994
|
+
}
|
|
2995
|
+
await wait(interval);
|
|
2996
|
+
waited += interval;
|
|
2997
|
+
}
|
|
2998
|
+
// If still no sourcePath, return helpful error listing prepared datasets
|
|
2999
|
+
if (!sourcePath) {
|
|
3000
|
+
const entries = readRegistry();
|
|
3001
|
+
const listText = entries.length === 0 ? "(no prepared datasets found)" : entries.map(e => `- ${e.dataset_id || e.id || "unknown"}: ${e.local_path || e.path || "unknown"}`).slice(0, 10).join("\n") + (entries.length > 10 ? "\n...and " + (entries.length - 10) + " more" : "");
|
|
3002
|
+
return {
|
|
3003
|
+
content: [{ type: "text", text: `ERROR: No local data found for ${datasetId} after attempting prepare. Check credentials and try running prepare_dataset manually. Prepared datasets:\n${listText}` }],
|
|
3004
|
+
isError: true
|
|
3005
|
+
};
|
|
3006
|
+
}
|
|
3266
3007
|
}
|
|
3267
|
-
|
|
3008
|
+
sourcePath = ensureExportableLocalPath(sourcePath);
|
|
3268
3009
|
try {
|
|
3269
|
-
|
|
3270
|
-
|
|
3010
|
+
if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
|
|
3011
|
+
upsertRegistry(datasetId, sourcePath, "completed");
|
|
3012
|
+
}
|
|
3271
3013
|
}
|
|
3272
3014
|
catch (e) {
|
|
3273
|
-
console.error(`[
|
|
3015
|
+
console.error(`[Registry] Failed to normalize registry path for ${datasetId}: ${e?.message || e}`);
|
|
3274
3016
|
}
|
|
3275
|
-
|
|
3276
|
-
|
|
3277
|
-
|
|
3278
|
-
|
|
3279
|
-
|
|
3280
|
-
|
|
3281
|
-
|
|
3282
|
-
|
|
3283
|
-
|
|
3284
|
-
|
|
3285
|
-
|
|
3286
|
-
|
|
3287
|
-
|
|
3288
|
-
|
|
3289
|
-
|
|
3290
|
-
|
|
3017
|
+
// If NOT fast mode, run quality/cleaning pipeline first (only for csv/parquet compat)
|
|
3018
|
+
if (!fastMode) {
|
|
3019
|
+
const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
|
|
3020
|
+
const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "parquet";
|
|
3021
|
+
const pipelineCompatibleInput = currentExt === "csv" || currentExt === "parquet";
|
|
3022
|
+
if (!pipelineCompatibleInput) {
|
|
3023
|
+
console.error(`[Export] Skipping pipeline for ${currentExt} input; using direct exporter conversion.`);
|
|
3024
|
+
}
|
|
3025
|
+
else if (currentExt !== pipelineFmt) {
|
|
3026
|
+
console.error(`[Export] Running quality/cleaning pipeline (use fast=true to skip)...`);
|
|
3027
|
+
try {
|
|
3028
|
+
const beforeStagingPath = sourcePath;
|
|
3029
|
+
sourcePath = ensureLocalPipelineSource(sourcePath, datasetId, targetDir);
|
|
3030
|
+
if (path.resolve(beforeStagingPath) !== path.resolve(sourcePath)) {
|
|
3031
|
+
intermediateArtifacts.add(sourcePath);
|
|
3032
|
+
}
|
|
3033
|
+
const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
|
|
3034
|
+
if (pipelineResult.final_output_path) {
|
|
3035
|
+
if (path.resolve(pipelineResult.final_output_path) !== path.resolve(sourcePath)) {
|
|
3036
|
+
intermediateArtifacts.add(pipelineResult.final_output_path);
|
|
3037
|
+
}
|
|
3038
|
+
sourcePath = pipelineResult.final_output_path;
|
|
3039
|
+
try {
|
|
3040
|
+
// Update registry to point to pipeline's final output
|
|
3041
|
+
if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
|
|
3042
|
+
upsertRegistry(datasetId, sourcePath, "completed");
|
|
3043
|
+
}
|
|
3044
|
+
}
|
|
3045
|
+
catch (e) {
|
|
3046
|
+
console.error(`[Registry] Failed to update registry for ${datasetId}: ${e?.message || e}`);
|
|
3047
|
+
}
|
|
3048
|
+
}
|
|
3049
|
+
}
|
|
3050
|
+
catch (err) {
|
|
3051
|
+
console.error(`[Export] Pipeline warning: ${err.message}. Continuing with raw file.`);
|
|
3052
|
+
}
|
|
3053
|
+
}
|
|
3054
|
+
}
|
|
3055
|
+
else {
|
|
3056
|
+
console.error(`[Export] Fast mode - skipping quality analysis and cleaning`);
|
|
3057
|
+
}
|
|
3058
|
+
// Build export options
|
|
3059
|
+
const exportOpts = {};
|
|
3060
|
+
if (compression)
|
|
3061
|
+
exportOpts.compression = compression;
|
|
3062
|
+
if (preview)
|
|
3063
|
+
exportOpts.preview = true;
|
|
3064
|
+
if (sampleRows)
|
|
3065
|
+
exportOpts.sample_rows = sampleRows;
|
|
3066
|
+
if (columns)
|
|
3067
|
+
exportOpts.columns = columns;
|
|
3291
3068
|
try {
|
|
3292
|
-
|
|
3069
|
+
// Determine output file name
|
|
3070
|
+
const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
|
|
3071
|
+
const ext = extMap[requestedFormat] || ".feather";
|
|
3072
|
+
const safeName = getExportFileStem(datasetId);
|
|
3073
|
+
const outDir = targetDir;
|
|
3074
|
+
if (!fs.existsSync(outDir))
|
|
3075
|
+
fs.mkdirSync(outDir, { recursive: true });
|
|
3076
|
+
const outputFile = path.join(outDir, `${safeName}${ext}`);
|
|
3077
|
+
const schemaBefore = await getSchemaSnapshot(sourcePath);
|
|
3078
|
+
const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
|
|
3079
|
+
const schemaAfter = await getSchemaSnapshot(result.output_path);
|
|
3080
|
+
const lineage = appendLineageVersion({
|
|
3081
|
+
datasetIdBase: datasetId,
|
|
3082
|
+
tool: "export_dataset",
|
|
3083
|
+
requestArgs: request.params.arguments,
|
|
3084
|
+
outputPath: result.output_path,
|
|
3085
|
+
output: {
|
|
3086
|
+
rows: result.rows,
|
|
3087
|
+
columns: result.columns,
|
|
3088
|
+
format: requestedFormat,
|
|
3089
|
+
size_mb: result.file_size_mb,
|
|
3090
|
+
schema_before: schemaBefore,
|
|
3091
|
+
schema_after: schemaAfter,
|
|
3092
|
+
},
|
|
3093
|
+
steps: [
|
|
3094
|
+
{ step: "source_resolved", at: new Date().toISOString(), params: { sourcePath } },
|
|
3095
|
+
{ step: "exported", at: new Date().toISOString(), params: { format: requestedFormat, compression }, metrics: { rows: result.rows, columns: result.columns } },
|
|
3096
|
+
],
|
|
3097
|
+
});
|
|
3098
|
+
try {
|
|
3099
|
+
upsertRegistry(lineage.datasetVersionId, result.output_path, "completed");
|
|
3100
|
+
}
|
|
3101
|
+
catch { }
|
|
3102
|
+
// Build rich response
|
|
3103
|
+
let msg = `**Export complete**\n`;
|
|
3104
|
+
msg += `- **File**: ${result.output_path}\n`;
|
|
3105
|
+
msg += `- **Version**: ${lineage.datasetVersionId}\n`;
|
|
3106
|
+
msg += `- **Lineage**: ${lineage.lineagePath}\n`;
|
|
3107
|
+
msg += `- **Format**: ${result.format}${result.compression ? ` (${result.compression})` : ""}\n`;
|
|
3108
|
+
msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
|
|
3109
|
+
if (result.file_size_mb !== undefined)
|
|
3110
|
+
msg += `- **Size**: ${result.file_size_mb} MB\n`;
|
|
3111
|
+
if (result.elapsed_seconds !== undefined)
|
|
3112
|
+
msg += `- **Time**: ${result.elapsed_seconds}s\n`;
|
|
3113
|
+
if (result.preview_path)
|
|
3114
|
+
msg += `- **Preview**: ${result.preview_path}\n`;
|
|
3115
|
+
msg += `\n`;
|
|
3116
|
+
if (requestedFormat === "feather") {
|
|
3117
|
+
msg += `**Inspect with:**\n`;
|
|
3118
|
+
msg += ` Python: \`pd.read_feather('${result.output_path}').head()\`\n`;
|
|
3119
|
+
msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
|
|
3120
|
+
}
|
|
3121
|
+
else if (requestedFormat === "parquet") {
|
|
3122
|
+
msg += `**Inspect with:**\n`;
|
|
3123
|
+
msg += ` Python: \`pd.read_parquet('${result.output_path}').head()\`\n`;
|
|
3124
|
+
msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
|
|
3125
|
+
}
|
|
3126
|
+
cleanupIntermediateArtifacts(intermediateArtifacts, result.output_path);
|
|
3127
|
+
return { content: [{ type: "text", text: msg }] };
|
|
3128
|
+
}
|
|
3129
|
+
catch (error) {
|
|
3130
|
+
return {
|
|
3131
|
+
content: [{ type: "text", text: `ERROR: Export failed: ${error.message}` }],
|
|
3132
|
+
isError: true
|
|
3133
|
+
};
|
|
3293
3134
|
}
|
|
3294
|
-
catch { }
|
|
3295
|
-
let msg = `**Schema normalization complete**\n`;
|
|
3296
|
-
msg += `- **Input**: ${filePath}\n`;
|
|
3297
|
-
msg += `- **Output**: ${result.output_path}\n`;
|
|
3298
|
-
msg += `- **Version**: ${lineage.datasetVersionId}\n`;
|
|
3299
|
-
msg += `- **Lineage**: ${lineage.lineagePath}\n`;
|
|
3300
|
-
msg += `- **Rows**: ${result.rows?.toLocaleString?.() ?? result.rows}\n`;
|
|
3301
|
-
msg += `- **Columns**: ${result.columns}\n`;
|
|
3302
|
-
msg += `- **Flattened keys**: ${result.flattened_keys}\n`;
|
|
3303
|
-
msg += `- **Extras mode**: ${result.extras_mode}\n`;
|
|
3304
|
-
if (result.extras_rows !== undefined)
|
|
3305
|
-
msg += `- **Rows with extras**: ${result.extras_rows}\n`;
|
|
3306
|
-
msg += `\nNext: run \`vesper_convert_format\` on the output to convert to parquet.\n`;
|
|
3307
|
-
return { content: [{ type: "text", text: msg }] };
|
|
3308
|
-
}
|
|
3309
|
-
catch (error) {
|
|
3310
|
-
return { content: [{ type: "text", text: `ERROR: Schema normalization failed: ${error.message}` }], isError: true };
|
|
3311
3135
|
}
|
|
3312
|
-
|
|
3313
|
-
|
|
3314
|
-
|
|
3315
|
-
const operation = request.params.name === "fuse_datasets"
|
|
3316
|
-
? "tabular"
|
|
3317
|
-
: String(request.params.arguments?.operation || "tabular").toLowerCase();
|
|
3318
|
-
if (operation === "web") {
|
|
3319
|
-
hydrateExternalKeys();
|
|
3320
|
-
const webSources = Array.isArray(request.params.arguments?.sources)
|
|
3321
|
-
? request.params.arguments?.sources
|
|
3322
|
-
: undefined;
|
|
3323
|
-
if (!webSources || !Array.isArray(webSources)) {
|
|
3136
|
+
case "vesper_list_datasets": {
|
|
3137
|
+
const entries = readRegistry();
|
|
3138
|
+
if (entries.length === 0) {
|
|
3324
3139
|
return {
|
|
3325
|
-
content: [{ type: "text", text: "
|
|
3326
|
-
isError: true,
|
|
3140
|
+
content: [{ type: "text", text: "No prepared datasets found in the Vesper registry. Use prepare_dataset or download_dataset to add datasets." }]
|
|
3327
3141
|
};
|
|
3328
3142
|
}
|
|
3329
|
-
const
|
|
3330
|
-
|
|
3331
|
-
|
|
3332
|
-
|
|
3333
|
-
|
|
3334
|
-
: undefined;
|
|
3335
|
-
const merge_strategy = mergeStrategyRaw && ["union", "dedup"].includes(mergeStrategyRaw)
|
|
3336
|
-
? mergeStrategyRaw
|
|
3337
|
-
: undefined;
|
|
3338
|
-
const deduplication = dedupRaw && ["semantic", "exact", "none"].includes(dedupRaw)
|
|
3339
|
-
? dedupRaw
|
|
3340
|
-
: undefined;
|
|
3341
|
-
const webResult = await webFusionEngine.fuse({
|
|
3342
|
-
sources: webSources.map((s) => ({
|
|
3343
|
-
type: String(s?.type || "").trim().toLowerCase(),
|
|
3344
|
-
query: String(s?.query || "").trim(),
|
|
3345
|
-
max_results: s?.max_results !== undefined ? Number(s.max_results) : undefined,
|
|
3346
|
-
min_stars: s?.min_stars !== undefined ? Number(s.min_stars) : undefined,
|
|
3347
|
-
bucket: s?.bucket !== undefined ? String(s.bucket) : undefined,
|
|
3348
|
-
path: s?.path !== undefined ? String(s.path) : undefined,
|
|
3349
|
-
region: s?.region !== undefined ? String(s.region) : undefined,
|
|
3350
|
-
credentials: s?.credentials ? {
|
|
3351
|
-
accessKeyId: s.credentials.accessKeyId !== undefined ? String(s.credentials.accessKeyId) : undefined,
|
|
3352
|
-
secretAccessKey: s.credentials.secretAccessKey !== undefined ? String(s.credentials.secretAccessKey) : undefined,
|
|
3353
|
-
sessionToken: s.credentials.sessionToken !== undefined ? String(s.credentials.sessionToken) : undefined,
|
|
3354
|
-
roleArn: s.credentials.roleArn !== undefined ? String(s.credentials.roleArn) : undefined,
|
|
3355
|
-
} : undefined,
|
|
3356
|
-
})),
|
|
3357
|
-
merge_strategy,
|
|
3358
|
-
deduplication,
|
|
3143
|
+
const lines = entries.map((e, i) => {
|
|
3144
|
+
const id = e.dataset_id || e.id || "unknown";
|
|
3145
|
+
const localPath = e.local_path || e.path || "unknown";
|
|
3146
|
+
const exists = typeof localPath === "string" && localPath !== "unknown" && fs.existsSync(localPath);
|
|
3147
|
+
return `${i + 1}. **${id}**\n Path: ${localPath}\n Status: ${e.status || "unknown"}${exists ? "" : " (file missing)"}`;
|
|
3359
3148
|
});
|
|
3360
3149
|
return {
|
|
3361
|
-
content: [{ type: "text", text:
|
|
3150
|
+
content: [{ type: "text", text: `**Vesper Registry** (${entries.length} dataset${entries.length !== 1 ? "s" : ""}):\n\n${lines.join("\n\n")}` }]
|
|
3362
3151
|
};
|
|
3363
3152
|
}
|
|
3364
|
-
|
|
3365
|
-
|
|
3366
|
-
|
|
3367
|
-
|
|
3368
|
-
|
|
3369
|
-
const joinOn = request.params.arguments?.join_on;
|
|
3370
|
-
const how = request.params.arguments?.how || "inner";
|
|
3371
|
-
const dedup = request.params.arguments?.dedup !== false;
|
|
3372
|
-
const runQualityAfter = request.params.arguments?.run_quality_after !== false;
|
|
3373
|
-
const leakageCheck = request.params.arguments?.leakage_check !== false;
|
|
3374
|
-
const outputFormat = request.params.arguments?.output_format || "feather";
|
|
3375
|
-
const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
|
|
3376
|
-
const preview = request.params.arguments?.preview !== false;
|
|
3377
|
-
const resolvedPaths = [];
|
|
3378
|
-
const unresolved = [];
|
|
3379
|
-
for (const src of rawSources) {
|
|
3380
|
-
if (fs.existsSync(src)) {
|
|
3381
|
-
resolvedPaths.push(src);
|
|
3382
|
-
continue;
|
|
3153
|
+
case "vesper_convert_format": {
|
|
3154
|
+
const filePath = String(request.params.arguments?.file_path || "").trim();
|
|
3155
|
+
const targetFormat = String(request.params.arguments?.target_format || "").trim().toLowerCase();
|
|
3156
|
+
if (!filePath) {
|
|
3157
|
+
throw new McpError(ErrorCode.InvalidParams, "file_path is required");
|
|
3383
3158
|
}
|
|
3384
|
-
|
|
3385
|
-
|
|
3386
|
-
|
|
3387
|
-
|
|
3159
|
+
if (!["csv", "parquet", "json", "jsonl"].includes(targetFormat)) {
|
|
3160
|
+
throw new McpError(ErrorCode.InvalidParams, "target_format must be one of: csv, parquet, json, jsonl");
|
|
3161
|
+
}
|
|
3162
|
+
if (!fs.existsSync(filePath)) {
|
|
3163
|
+
return {
|
|
3164
|
+
content: [{ type: "text", text: `ERROR: File not found: ${filePath}` }],
|
|
3165
|
+
isError: true,
|
|
3166
|
+
};
|
|
3167
|
+
}
|
|
3168
|
+
const inputExt = path.extname(filePath).toLowerCase();
|
|
3169
|
+
const extMap = { csv: ".csv", parquet: ".parquet", json: ".json", jsonl: ".jsonl" };
|
|
3170
|
+
const outputExt = extMap[targetFormat];
|
|
3171
|
+
if (inputExt === outputExt) {
|
|
3172
|
+
return {
|
|
3173
|
+
content: [{ type: "text", text: `File is already in ${targetFormat} format: ${filePath}` }],
|
|
3174
|
+
};
|
|
3175
|
+
}
|
|
3176
|
+
const outputPath = filePath.replace(/\.[^.]+$/, outputExt);
|
|
3177
|
+
try {
|
|
3178
|
+
await ensurePythonModules([{ module: "polars", packageName: "polars" }]);
|
|
3179
|
+
const convertScript = path.join(dataRoot, "python", "convert_engine.py");
|
|
3180
|
+
const schemaBefore = await getSchemaSnapshot(filePath);
|
|
3181
|
+
const result = await runPythonJson(convertScript, [filePath, outputPath]);
|
|
3182
|
+
const schemaAfter = await getSchemaSnapshot(outputPath);
|
|
3183
|
+
if (!result.ok) {
|
|
3184
|
+
return {
|
|
3185
|
+
content: [{ type: "text", text: `ERROR: Conversion failed: ${result.error}` }],
|
|
3186
|
+
isError: true,
|
|
3187
|
+
};
|
|
3188
|
+
}
|
|
3189
|
+
// Register converted file in the registry
|
|
3190
|
+
const datasetId = path.basename(outputPath, outputExt);
|
|
3191
|
+
try {
|
|
3192
|
+
upsertRegistry(datasetId, outputPath, "completed");
|
|
3193
|
+
}
|
|
3194
|
+
catch (e) {
|
|
3195
|
+
console.error(`[Convert] Registry write failed: ${e?.message || e}`);
|
|
3196
|
+
}
|
|
3197
|
+
const lineage = appendLineageVersion({
|
|
3198
|
+
datasetIdBase: datasetId,
|
|
3199
|
+
tool: "vesper_convert_format",
|
|
3200
|
+
requestArgs: request.params.arguments,
|
|
3201
|
+
outputPath,
|
|
3202
|
+
output: {
|
|
3203
|
+
rows: result.rows,
|
|
3204
|
+
columns: result.columns,
|
|
3205
|
+
format: targetFormat,
|
|
3206
|
+
size_mb: result.size_mb,
|
|
3207
|
+
schema_before: schemaBefore,
|
|
3208
|
+
schema_after: schemaAfter,
|
|
3209
|
+
},
|
|
3210
|
+
steps: [
|
|
3211
|
+
{ step: "converted", at: new Date().toISOString(), params: { from: inputExt, to: outputExt } },
|
|
3212
|
+
],
|
|
3213
|
+
});
|
|
3214
|
+
try {
|
|
3215
|
+
upsertRegistry(lineage.datasetVersionId, outputPath, "completed");
|
|
3216
|
+
}
|
|
3217
|
+
catch { }
|
|
3218
|
+
let msg = `**Conversion complete**\n`;
|
|
3219
|
+
msg += `- **Input**: ${filePath} (${inputExt.slice(1)})\n`;
|
|
3220
|
+
msg += `- **Output**: ${result.output_path} (${targetFormat})\n`;
|
|
3221
|
+
msg += `- **Version**: ${lineage.datasetVersionId}\n`;
|
|
3222
|
+
msg += `- **Lineage**: ${lineage.lineagePath}\n`;
|
|
3223
|
+
msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
|
|
3224
|
+
if (result.size_mb !== undefined)
|
|
3225
|
+
msg += `- **Size**: ${result.size_mb} MB\n`;
|
|
3226
|
+
return { content: [{ type: "text", text: msg }] };
|
|
3227
|
+
}
|
|
3228
|
+
catch (error) {
|
|
3229
|
+
return {
|
|
3230
|
+
content: [{ type: "text", text: `ERROR: Conversion failed: ${error.message}` }],
|
|
3231
|
+
isError: true,
|
|
3232
|
+
};
|
|
3388
3233
|
}
|
|
3389
|
-
unresolved.push(src);
|
|
3390
|
-
}
|
|
3391
|
-
if (unresolved.length > 0) {
|
|
3392
|
-
return {
|
|
3393
|
-
content: [{
|
|
3394
|
-
type: "text",
|
|
3395
|
-
text: `ERROR: Could not resolve these sources to local files: ${unresolved.join(", ")}. Provide local paths or run prepare_dataset first.`
|
|
3396
|
-
}],
|
|
3397
|
-
isError: true
|
|
3398
|
-
};
|
|
3399
3234
|
}
|
|
3400
|
-
|
|
3401
|
-
const
|
|
3402
|
-
const
|
|
3403
|
-
const
|
|
3235
|
+
case "vesper_normalize_schema": {
|
|
3236
|
+
const filePath = String(request.params.arguments?.file_path || "").trim();
|
|
3237
|
+
const outputFormat = String(request.params.arguments?.output_format || "jsonl").trim().toLowerCase();
|
|
3238
|
+
const outputDirRaw = request.params.arguments?.output_dir ? String(request.params.arguments.output_dir).trim() : "";
|
|
3239
|
+
const flattenMetadataJson = request.params.arguments?.flatten_metadata_json !== false;
|
|
3240
|
+
const maxKeys = Number(request.params.arguments?.max_keys ?? 200);
|
|
3241
|
+
const extrasMode = String(request.params.arguments?.extras_mode || "blob").trim().toLowerCase();
|
|
3242
|
+
if (!filePath) {
|
|
3243
|
+
throw new McpError(ErrorCode.InvalidParams, "file_path is required");
|
|
3244
|
+
}
|
|
3245
|
+
if (!["jsonl", "json"].includes(outputFormat)) {
|
|
3246
|
+
throw new McpError(ErrorCode.InvalidParams, "output_format must be one of: jsonl, json");
|
|
3247
|
+
}
|
|
3248
|
+
if (!fs.existsSync(filePath)) {
|
|
3249
|
+
return { content: [{ type: "text", text: `ERROR: File not found: ${filePath}` }], isError: true };
|
|
3250
|
+
}
|
|
3251
|
+
const outDir = outputDirRaw || path.join(dataRoot, "data", "normalized_schema");
|
|
3404
3252
|
if (!fs.existsSync(outDir))
|
|
3405
3253
|
fs.mkdirSync(outDir, { recursive: true });
|
|
3406
|
-
const
|
|
3407
|
-
|
|
3408
|
-
const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
|
|
3409
|
-
strategy,
|
|
3410
|
-
join_on: joinOn,
|
|
3411
|
-
how,
|
|
3412
|
-
dedup,
|
|
3413
|
-
run_quality_after: runQualityAfter,
|
|
3414
|
-
leakage_check: leakageCheck,
|
|
3415
|
-
output_format: outputFormat,
|
|
3416
|
-
compression: compression,
|
|
3417
|
-
preview,
|
|
3418
|
-
});
|
|
3419
|
-
const nullDelta = result.stats.null_delta;
|
|
3420
|
-
const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
|
|
3421
|
-
// Register fused dataset under a generated id so users can export it easily
|
|
3422
|
-
const fusedId = `fused_${Date.now()}`;
|
|
3254
|
+
const baseName = path.parse(filePath).name || `normalized_${Date.now()}`;
|
|
3255
|
+
const outputPath = path.join(outDir, `${baseName}.normalized.${outputFormat}`);
|
|
3423
3256
|
try {
|
|
3424
|
-
|
|
3257
|
+
const scriptPath = path.join(dataRoot, "python", "normalize_schema_engine.py");
|
|
3258
|
+
const options = {
|
|
3259
|
+
flatten_metadata_json: !!flattenMetadataJson,
|
|
3260
|
+
max_keys: Number.isFinite(maxKeys) ? maxKeys : 200,
|
|
3261
|
+
extras_mode: ["blob", "drop"].includes(extrasMode) ? extrasMode : "blob",
|
|
3262
|
+
};
|
|
3263
|
+
const schemaBefore = await getSchemaSnapshot(filePath);
|
|
3264
|
+
const result = await runPythonJson(scriptPath, [filePath, outputPath, JSON.stringify(options)]);
|
|
3265
|
+
const schemaAfter = await getSchemaSnapshot(outputPath);
|
|
3266
|
+
if (!result.ok) {
|
|
3267
|
+
return { content: [{ type: "text", text: `ERROR: Schema normalization failed: ${result.error}` }], isError: true };
|
|
3268
|
+
}
|
|
3269
|
+
// Register normalized file to make follow-up conversion easier.
|
|
3270
|
+
try {
|
|
3271
|
+
const datasetId = path.basename(outputPath, path.extname(outputPath));
|
|
3272
|
+
upsertRegistry(datasetId, outputPath, "completed");
|
|
3273
|
+
}
|
|
3274
|
+
catch (e) {
|
|
3275
|
+
console.error(`[NormalizeSchema] Registry write failed: ${e?.message || e}`);
|
|
3276
|
+
}
|
|
3277
|
+
const lineage = appendLineageVersion({
|
|
3278
|
+
datasetIdBase: path.basename(outputPath, path.extname(outputPath)),
|
|
3279
|
+
tool: "vesper_normalize_schema",
|
|
3280
|
+
requestArgs: request.params.arguments,
|
|
3281
|
+
outputPath,
|
|
3282
|
+
output: {
|
|
3283
|
+
rows: result.rows,
|
|
3284
|
+
columns: result.columns,
|
|
3285
|
+
format: outputFormat,
|
|
3286
|
+
schema_before: schemaBefore,
|
|
3287
|
+
schema_after: schemaAfter,
|
|
3288
|
+
},
|
|
3289
|
+
steps: [
|
|
3290
|
+
{ step: "schema_normalized", at: new Date().toISOString(), params: options, metrics: { flattened_keys: result.flattened_keys } },
|
|
3291
|
+
],
|
|
3292
|
+
});
|
|
3293
|
+
try {
|
|
3294
|
+
upsertRegistry(lineage.datasetVersionId, outputPath, "completed");
|
|
3295
|
+
}
|
|
3296
|
+
catch { }
|
|
3297
|
+
let msg = `**Schema normalization complete**\n`;
|
|
3298
|
+
msg += `- **Input**: ${filePath}\n`;
|
|
3299
|
+
msg += `- **Output**: ${result.output_path}\n`;
|
|
3300
|
+
msg += `- **Version**: ${lineage.datasetVersionId}\n`;
|
|
3301
|
+
msg += `- **Lineage**: ${lineage.lineagePath}\n`;
|
|
3302
|
+
msg += `- **Rows**: ${result.rows?.toLocaleString?.() ?? result.rows}\n`;
|
|
3303
|
+
msg += `- **Columns**: ${result.columns}\n`;
|
|
3304
|
+
msg += `- **Flattened keys**: ${result.flattened_keys}\n`;
|
|
3305
|
+
msg += `- **Extras mode**: ${result.extras_mode}\n`;
|
|
3306
|
+
if (result.extras_rows !== undefined)
|
|
3307
|
+
msg += `- **Rows with extras**: ${result.extras_rows}\n`;
|
|
3308
|
+
msg += `\nNext: run \`vesper_convert_format\` on the output to convert to parquet.\n`;
|
|
3309
|
+
return { content: [{ type: "text", text: msg }] };
|
|
3425
3310
|
}
|
|
3426
|
-
catch (
|
|
3427
|
-
|
|
3311
|
+
catch (error) {
|
|
3312
|
+
return { content: [{ type: "text", text: `ERROR: Schema normalization failed: ${error.message}` }], isError: true };
|
|
3428
3313
|
}
|
|
3429
|
-
|
|
3430
|
-
|
|
3431
|
-
|
|
3432
|
-
const
|
|
3433
|
-
|
|
3434
|
-
|
|
3435
|
-
|
|
3436
|
-
|
|
3437
|
-
|
|
3438
|
-
|
|
3439
|
-
|
|
3440
|
-
|
|
3441
|
-
|
|
3442
|
-
|
|
3443
|
-
|
|
3444
|
-
|
|
3445
|
-
|
|
3446
|
-
|
|
3447
|
-
|
|
3448
|
-
|
|
3449
|
-
|
|
3314
|
+
}
|
|
3315
|
+
case "fuse":
|
|
3316
|
+
case "fuse_datasets": {
|
|
3317
|
+
const operation = request.params.name === "fuse_datasets"
|
|
3318
|
+
? "tabular"
|
|
3319
|
+
: String(request.params.arguments?.operation || "tabular").toLowerCase();
|
|
3320
|
+
if (operation === "web") {
|
|
3321
|
+
hydrateExternalKeys();
|
|
3322
|
+
const webSources = Array.isArray(request.params.arguments?.sources)
|
|
3323
|
+
? request.params.arguments?.sources
|
|
3324
|
+
: undefined;
|
|
3325
|
+
if (!webSources || !Array.isArray(webSources)) {
|
|
3326
|
+
return {
|
|
3327
|
+
content: [{ type: "text", text: "ERROR: fuse(operation='web') requires 'sources' array." }],
|
|
3328
|
+
isError: true,
|
|
3329
|
+
};
|
|
3330
|
+
}
|
|
3331
|
+
const mergeStrategyRaw = request.params.arguments?.merge_strategy
|
|
3332
|
+
? String(request.params.arguments?.merge_strategy).toLowerCase()
|
|
3333
|
+
: undefined;
|
|
3334
|
+
const dedupRaw = request.params.arguments?.deduplication
|
|
3335
|
+
? String(request.params.arguments?.deduplication).toLowerCase()
|
|
3336
|
+
: undefined;
|
|
3337
|
+
const merge_strategy = mergeStrategyRaw && ["union", "dedup"].includes(mergeStrategyRaw)
|
|
3338
|
+
? mergeStrategyRaw
|
|
3339
|
+
: undefined;
|
|
3340
|
+
const deduplication = dedupRaw && ["semantic", "exact", "none"].includes(dedupRaw)
|
|
3341
|
+
? dedupRaw
|
|
3342
|
+
: undefined;
|
|
3343
|
+
const webResult = await webFusionEngine.fuse({
|
|
3344
|
+
sources: webSources.map((s) => ({
|
|
3345
|
+
type: String(s?.type || "").trim().toLowerCase(),
|
|
3346
|
+
query: String(s?.query || "").trim(),
|
|
3347
|
+
max_results: s?.max_results !== undefined ? Number(s.max_results) : undefined,
|
|
3348
|
+
min_stars: s?.min_stars !== undefined ? Number(s.min_stars) : undefined,
|
|
3349
|
+
bucket: s?.bucket !== undefined ? String(s.bucket) : undefined,
|
|
3350
|
+
path: s?.path !== undefined ? String(s.path) : undefined,
|
|
3351
|
+
region: s?.region !== undefined ? String(s.region) : undefined,
|
|
3352
|
+
credentials: s?.credentials ? {
|
|
3353
|
+
accessKeyId: s.credentials.accessKeyId !== undefined ? String(s.credentials.accessKeyId) : undefined,
|
|
3354
|
+
secretAccessKey: s.credentials.secretAccessKey !== undefined ? String(s.credentials.secretAccessKey) : undefined,
|
|
3355
|
+
sessionToken: s.credentials.sessionToken !== undefined ? String(s.credentials.sessionToken) : undefined,
|
|
3356
|
+
roleArn: s.credentials.roleArn !== undefined ? String(s.credentials.roleArn) : undefined,
|
|
3357
|
+
} : undefined,
|
|
3358
|
+
})),
|
|
3359
|
+
merge_strategy,
|
|
3360
|
+
deduplication,
|
|
3361
|
+
});
|
|
3362
|
+
return {
|
|
3363
|
+
content: [{ type: "text", text: JSON.stringify(webResult, null, 2) }],
|
|
3364
|
+
};
|
|
3365
|
+
}
|
|
3366
|
+
const rawSources = request.params.arguments?.sources;
|
|
3367
|
+
if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
|
|
3368
|
+
throw new McpError(ErrorCode.InvalidParams, "sources must contain at least 2 dataset IDs/paths");
|
|
3450
3369
|
}
|
|
3451
|
-
|
|
3452
|
-
|
|
3453
|
-
|
|
3454
|
-
|
|
3455
|
-
|
|
3456
|
-
|
|
3457
|
-
|
|
3458
|
-
|
|
3459
|
-
|
|
3460
|
-
|
|
3461
|
-
|
|
3462
|
-
|
|
3463
|
-
|
|
3370
|
+
const strategy = request.params.arguments?.strategy || "concat";
|
|
3371
|
+
const joinOn = request.params.arguments?.join_on;
|
|
3372
|
+
const how = request.params.arguments?.how || "inner";
|
|
3373
|
+
const dedup = request.params.arguments?.dedup !== false;
|
|
3374
|
+
const runQualityAfter = request.params.arguments?.run_quality_after !== false;
|
|
3375
|
+
const leakageCheck = request.params.arguments?.leakage_check !== false;
|
|
3376
|
+
const outputFormat = request.params.arguments?.output_format || "feather";
|
|
3377
|
+
const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
|
|
3378
|
+
const preview = request.params.arguments?.preview !== false;
|
|
3379
|
+
const resolvedPaths = [];
|
|
3380
|
+
const unresolved = [];
|
|
3381
|
+
for (const src of rawSources) {
|
|
3382
|
+
if (fs.existsSync(src)) {
|
|
3383
|
+
resolvedPaths.push(src);
|
|
3384
|
+
continue;
|
|
3464
3385
|
}
|
|
3465
|
-
|
|
3386
|
+
const status = metadataStore.getDownloadStatus(src);
|
|
3387
|
+
if (status?.local_path && fs.existsSync(status.local_path)) {
|
|
3388
|
+
resolvedPaths.push(status.local_path);
|
|
3389
|
+
continue;
|
|
3390
|
+
}
|
|
3391
|
+
unresolved.push(src);
|
|
3392
|
+
}
|
|
3393
|
+
if (unresolved.length > 0) {
|
|
3394
|
+
return {
|
|
3395
|
+
content: [{
|
|
3396
|
+
type: "text",
|
|
3397
|
+
text: `ERROR: Could not resolve these sources to local files: ${unresolved.join(", ")}. Provide local paths or run prepare_dataset first.`
|
|
3398
|
+
}],
|
|
3399
|
+
isError: true
|
|
3400
|
+
};
|
|
3401
|
+
}
|
|
3402
|
+
try {
|
|
3403
|
+
const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow" };
|
|
3404
|
+
const ext = extMap[outputFormat] || ".feather";
|
|
3405
|
+
const outDir = process.cwd();
|
|
3406
|
+
if (!fs.existsSync(outDir))
|
|
3407
|
+
fs.mkdirSync(outDir, { recursive: true });
|
|
3408
|
+
const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
|
|
3409
|
+
console.error(`[Fusion] Resolved output directory: ${outDir}`);
|
|
3410
|
+
const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
|
|
3411
|
+
strategy,
|
|
3412
|
+
join_on: joinOn,
|
|
3413
|
+
how,
|
|
3414
|
+
dedup,
|
|
3415
|
+
run_quality_after: runQualityAfter,
|
|
3416
|
+
leakage_check: leakageCheck,
|
|
3417
|
+
output_format: outputFormat,
|
|
3418
|
+
compression: compression,
|
|
3419
|
+
preview,
|
|
3420
|
+
});
|
|
3421
|
+
const nullDelta = result.stats.null_delta;
|
|
3422
|
+
const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
|
|
3423
|
+
// Register fused dataset under a generated id so users can export it easily
|
|
3424
|
+
const fusedId = `fused_${Date.now()}`;
|
|
3425
|
+
try {
|
|
3426
|
+
upsertRegistry(fusedId, result.output_path, "completed");
|
|
3427
|
+
}
|
|
3428
|
+
catch (e) {
|
|
3429
|
+
console.error(`[Registry] Failed to register fused dataset ${fusedId}: ${e?.message || e}`);
|
|
3430
|
+
}
|
|
3431
|
+
const inputSchemaSnapshots = await Promise.all(resolvedPaths.map((p) => getSchemaSnapshot(p)));
|
|
3432
|
+
const schemaBefore = mergeSchemaSnapshots(inputSchemaSnapshots);
|
|
3433
|
+
const schemaAfter = await getSchemaSnapshot(result.output_path);
|
|
3434
|
+
const lineage = appendLineageVersion({
|
|
3435
|
+
datasetIdBase: fusedId,
|
|
3436
|
+
tool: "fuse_datasets",
|
|
3437
|
+
requestArgs: request.params.arguments,
|
|
3438
|
+
outputPath: result.output_path,
|
|
3439
|
+
output: {
|
|
3440
|
+
rows: result.stats.rows_after,
|
|
3441
|
+
format: outputFormat,
|
|
3442
|
+
schema_before: schemaBefore,
|
|
3443
|
+
schema_after: schemaAfter,
|
|
3444
|
+
},
|
|
3445
|
+
sources: resolvedPaths.map((p) => ({ source: "local", url: p, at: new Date().toISOString() })),
|
|
3446
|
+
steps: [
|
|
3447
|
+
{ step: "fused", at: new Date().toISOString(), params: { strategy, dedup, how }, metrics: { rows_before: result.stats.rows_before, rows_after: result.stats.rows_after, duplicates_removed: result.stats.duplicates_removed } },
|
|
3448
|
+
],
|
|
3449
|
+
});
|
|
3450
|
+
try {
|
|
3451
|
+
upsertRegistry(lineage.datasetVersionId, result.output_path, "completed");
|
|
3452
|
+
}
|
|
3453
|
+
catch { }
|
|
3454
|
+
let msg = `Fused ${result.stats.sources_count} sources -> ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).\n`;
|
|
3455
|
+
msg += `- Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}\n`;
|
|
3456
|
+
msg += `- Null change: ${nullText}\n`;
|
|
3457
|
+
msg += `- Output: ${result.output_path}\n`;
|
|
3458
|
+
msg += `- Version: ${lineage.datasetVersionId}\n`;
|
|
3459
|
+
msg += `- Lineage: ${lineage.lineagePath}\n`;
|
|
3460
|
+
if (result.preview_path)
|
|
3461
|
+
msg += `- Preview: ${result.preview_path}\n`;
|
|
3462
|
+
if (result.leakage_report) {
|
|
3463
|
+
msg += `- Leakage: ${result.leakage_report.leakage_detected ? "detected" : "none"}`;
|
|
3464
|
+
if (result.leakage_report.leakage_count) {
|
|
3465
|
+
msg += ` (${result.leakage_report.leakage_count})`;
|
|
3466
|
+
}
|
|
3467
|
+
msg += "\n";
|
|
3468
|
+
}
|
|
3469
|
+
msg += `\nNext: run split_dataset/export_dataset on fused output. Registered fused dataset id: ${fusedId}`;
|
|
3470
|
+
return { content: [{ type: "text", text: msg }] };
|
|
3471
|
+
}
|
|
3472
|
+
catch (error) {
|
|
3473
|
+
return {
|
|
3474
|
+
content: [{ type: "text", text: `ERROR: Fusion failed: ${error.message}` }],
|
|
3475
|
+
isError: true
|
|
3476
|
+
};
|
|
3466
3477
|
}
|
|
3467
|
-
msg += `\nNext: run split_dataset/export_dataset on fused output. Registered fused dataset id: ${fusedId}`;
|
|
3468
|
-
return { content: [{ type: "text", text: msg }] };
|
|
3469
|
-
}
|
|
3470
|
-
catch (error) {
|
|
3471
|
-
return {
|
|
3472
|
-
content: [{ type: "text", text: `ERROR: Fusion failed: ${error.message}` }],
|
|
3473
|
-
isError: true
|
|
3474
|
-
};
|
|
3475
3478
|
}
|
|
3479
|
+
default:
|
|
3480
|
+
throw new McpError(ErrorCode.MethodNotFound, "Tool not found");
|
|
3476
3481
|
}
|
|
3477
|
-
|
|
3478
|
-
|
|
3479
|
-
|
|
3482
|
+
})();
|
|
3483
|
+
void recordMcpToolAnalyticsAfterCall({
|
|
3484
|
+
toolName: String(request.params.name),
|
|
3485
|
+
args: request.params.arguments,
|
|
3486
|
+
result: toolResponse,
|
|
3487
|
+
}).catch((err) => console.error("[mcp-analytics]", err));
|
|
3488
|
+
return toolResponse;
|
|
3480
3489
|
}); // end requestQueue.enqueue
|
|
3481
3490
|
});
|
|
3482
3491
|
async function main() {
|