@vespermcp/mcp-server 1.2.26 → 1.2.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +43 -0
- package/build/index.js +904 -4
- package/build/metadata/semantic-scholar-source.js +64 -11
- package/build/python/normalize_schema_engine.py +224 -0
- package/build/web/fusion-engine.js +3 -0
- package/build/web/web-core.js +2 -0
- package/package.json +6 -1
- package/src/python/normalize_schema_engine.py +224 -0
package/build/index.js
CHANGED
|
@@ -74,18 +74,156 @@ function getRegistryEntry(dataset_id) {
|
|
|
74
74
|
console.error(`[Registry] Lookup keys: ${aliases.join(", ")}`);
|
|
75
75
|
return readRegistry().find(e => aliases.includes((e.dataset_id || e.id)));
|
|
76
76
|
}
|
|
77
|
+
/**
 * Resolve the directory that holds lineage records, creating it on first use.
 *
 * @returns {string} Path of the `lineage` folder located under `dataRoot`.
 */
function getLineageDir() {
    const lineageDir = path.join(dataRoot, "lineage");
    const alreadyPresent = fs.existsSync(lineageDir);
    if (alreadyPresent) {
        return lineageDir;
    }
    // `recursive: true` also creates any missing parent folders.
    fs.mkdirSync(lineageDir, { recursive: true });
    return lineageDir;
}
|
|
83
|
+
/**
 * Turn a (possibly versioned) dataset id into its base id.
 *
 * The input is stringified (falsy values become ""), trimmed, and passed through
 * `normalize_dataset_id`; a trailing `_v<digits>` suffix (case-insensitive) is
 * then removed. Only one suffix is stripped, so `x_v1_v2` becomes `x_v1`.
 *
 * @param {unknown} datasetId - Base or versioned id, e.g. `my_dataset_v2`.
 * @returns {string} The normalized id without its version suffix.
 */
function toBaseDatasetId(datasetId) {
    const trimmed = String(datasetId || "").trim();
    const normalized = normalize_dataset_id(trimmed);
    const versionSuffix = /_v\d+$/i;
    return normalized.replace(versionSuffix, "");
}
|
|
87
|
+
/**
 * Build the path of the lineage record file for a dataset.
 *
 * @param {unknown} datasetIdBase - Base or versioned dataset id; it is reduced
 *   to its base form with `toBaseDatasetId` before being used as the file stem.
 * @returns {string} `<lineage dir>/<base id>.lineage.json`.
 */
function getLineageRecordPath(datasetIdBase) {
    // Resolve the directory first: getLineageDir() may create it as a side effect.
    const lineageDir = getLineageDir();
    const fileName = `${toBaseDatasetId(datasetIdBase)}.lineage.json`;
    return path.join(lineageDir, fileName);
}
|
|
90
|
+
/**
 * Load the lineage record of a dataset from disk.
 *
 * Always returns an object whose `versions` property is an array, so callers
 * can use `record.versions.find(...)` / `.push(...)` without further checks.
 * An empty record (`latest_version: 0`, no versions) is returned when the file
 * does not exist, cannot be read or parsed, or does not contain an object with
 * a `versions` array. The unreadable/malformed cases are reported on stderr so
 * that a reset history does not go unnoticed.
 *
 * @param {unknown} datasetIdBase - Base or versioned dataset id.
 * @returns {{dataset_id_base: string, latest_version: number, updated_at: string, versions: Array<object>}}
 */
function readLineageRecord(datasetIdBase) {
    const emptyRecord = () => ({
        dataset_id_base: toBaseDatasetId(datasetIdBase),
        latest_version: 0,
        updated_at: new Date().toISOString(),
        versions: [],
    });
    const recordPath = getLineageRecordPath(datasetIdBase);
    if (!fs.existsSync(recordPath)) {
        return emptyRecord();
    }
    let parsed;
    try {
        parsed = JSON.parse(fs.readFileSync(recordPath, "utf-8"));
    }
    catch (e) {
        console.error(`[Lineage] Could not read ${recordPath}: ${e?.message || e}`);
        return emptyRecord();
    }
    // Valid JSON is not necessarily a lineage record (e.g. `null`, `[]`, `{}`).
    const hasRecordShape = parsed !== null
        && typeof parsed === "object"
        && !Array.isArray(parsed)
        && Array.isArray(parsed.versions);
    if (!hasRecordShape) {
        console.error(`[Lineage] Ignoring malformed lineage record at ${recordPath}`);
        return emptyRecord();
    }
    // The record is written back under `dataset_id_base`, so make sure it is usable.
    if (typeof parsed.dataset_id_base !== "string" || parsed.dataset_id_base === "") {
        parsed.dataset_id_base = toBaseDatasetId(datasetIdBase);
    }
    return parsed;
}
|
|
112
|
+
/**
 * Persist a lineage record as pretty-printed JSON.
 *
 * The JSON is first written to a temporary sibling file and then renamed over
 * the final path, so a failure in the middle of the write cannot leave a
 * truncated record behind (a truncated file would later fail to parse and the
 * dataset's history would look empty).
 *
 * @param {{dataset_id_base: string}} record - Record to store; its
 *   `dataset_id_base` selects the target file via `getLineageRecordPath`.
 * @throws Propagates file-system errors after removing the temporary file.
 */
function writeLineageRecord(record) {
    const recordPath = getLineageRecordPath(record.dataset_id_base);
    // Include the pid so two processes never share a temporary file.
    const tempPath = `${recordPath}.${process.pid}.tmp`;
    try {
        fs.writeFileSync(tempPath, JSON.stringify(record, null, 2));
        fs.renameSync(tempPath, recordPath);
    }
    catch (err) {
        try {
            fs.unlinkSync(tempPath);
        }
        catch {
            // The temporary file may never have been created; nothing to clean up.
        }
        throw err;
    }
}
|
|
116
|
+
/**
 * Append a new version entry to a dataset's lineage record.
 *
 * Steps, in order:
 *  1. Load the record for the base id of `input.datasetIdBase`.
 *  2. If `input.outputPath` is already recorded as the `output.local_path` of a
 *     version, return that version unchanged (no new entry is written).
 *  3. Otherwise build entry `latest_version + 1`, store it with
 *     `writeLineageRecord`, optionally notify `VESPER_TELEMETRY_ENDPOINT`, and,
 *     when the output file exists, write a `<outputPath>.lineage.json` sidecar
 *     on a best-effort basis.
 *
 * Secrets: `requestArgs.api_key` is never stored or transmitted verbatim. Only
 * a masked form (`***` plus the last four characters) is kept in
 * `triggered_by.api_key` and in the default `steps[].params`, because the
 * lineage file, the sidecar and the telemetry payload are all plain JSON.
 *
 * @param {object} input
 * @param {unknown} input.datasetIdBase - Base or versioned dataset id.
 * @param {string} input.tool - Name of the tool that produced this version.
 * @param {object} [input.requestArgs] - Original tool arguments (agent_id,
 *   pipeline_id, api_key, dataset_id, query, file_path, source_urls are read).
 * @param {string} [input.outputPath] - Path of the produced file, if any.
 * @param {object} [input.output] - Output metrics (rows, columns, format,
 *   size_mb, quality_score, schema_before, schema_after); values of an
 *   unexpected type are dropped.
 * @param {Array<object>} [input.sources] - Provenance entries; defaults to [].
 * @param {Array<object>} [input.steps] - Processing steps; defaults to a single
 *   step named after `input.tool`.
 * @returns {{datasetVersionId: string, version: number, lineagePath: string}}
 */
function appendLineageVersion(input) {
    const asOptionalString = (value) => (value ? String(value) : undefined);
    const asOptionalNumber = (value) => (typeof value === "number" ? value : undefined);
    // Keep just enough of a secret to tell two keys apart.
    const maskSecret = (value) => {
        if (!value) {
            return undefined;
        }
        const text = String(value);
        return text.length > 4 ? `***${text.slice(-4)}` : "***";
    };
    // Shared by schema_before / schema_after: coerce to plain strings/numbers.
    const toSchemaSnapshot = (schema) => {
        if (!schema || typeof schema !== "object") {
            return undefined;
        }
        return {
            rows: asOptionalNumber(schema.rows),
            columns: Array.isArray(schema.columns) ? schema.columns.map((c) => String(c)) : undefined,
            dtypes: schema.dtypes && typeof schema.dtypes === "object"
                ? Object.fromEntries(Object.entries(schema.dtypes).map(([name, dtype]) => [String(name), String(dtype)]))
                : undefined,
        };
    };
    const base = toBaseDatasetId(input.datasetIdBase);
    const record = readLineageRecord(base);
    if (!Array.isArray(record.versions)) {
        // Tolerate a record that was stored without a usable version list.
        record.versions = [];
    }
    const outputPath = input.outputPath;
    if (outputPath) {
        const existing = record.versions.find((entry) => entry.output?.local_path === outputPath);
        if (existing) {
            return { datasetVersionId: existing.dataset_id, version: existing.version, lineagePath: getLineageRecordPath(base) };
        }
    }
    const version = (record.latest_version || 0) + 1;
    const datasetVersionId = `${base}_v${version}`;
    const now = new Date().toISOString();
    const sidecarPath = outputPath && fs.existsSync(outputPath) ? `${outputPath}.lineage.json` : undefined;
    const requestArgs = input.requestArgs;
    // Copy instead of mutating the caller's arguments when masking the key.
    const loggedArgs = requestArgs && typeof requestArgs === "object" && requestArgs.api_key
        ? { ...requestArgs, api_key: maskSecret(requestArgs.api_key) }
        : (requestArgs || {});
    const v = {
        version,
        dataset_id: datasetVersionId,
        created_at: now,
        triggered_by: {
            tool: input.tool,
            agent_id: asOptionalString(requestArgs?.agent_id),
            pipeline_id: asOptionalString(requestArgs?.pipeline_id),
            api_key: maskSecret(requestArgs?.api_key),
        },
        input: {
            dataset_id: asOptionalString(requestArgs?.dataset_id),
            query: asOptionalString(requestArgs?.query),
            source_path: asOptionalString(requestArgs?.file_path),
            source_urls: Array.isArray(requestArgs?.source_urls)
                ? requestArgs.source_urls.map((u) => String(u))
                : undefined,
        },
        output: {
            local_path: outputPath,
            rows: asOptionalNumber(input.output?.rows),
            columns: asOptionalNumber(input.output?.columns),
            format: typeof input.output?.format === "string" ? input.output.format : undefined,
            size_mb: asOptionalNumber(input.output?.size_mb),
            quality_score: asOptionalNumber(input.output?.quality_score),
            schema_before: toSchemaSnapshot(input.output?.schema_before),
            schema_after: toSchemaSnapshot(input.output?.schema_after),
        },
        sources: input.sources || [],
        steps: input.steps || [{ step: input.tool, at: now, params: loggedArgs }],
    };
    record.latest_version = version;
    record.updated_at = now;
    record.versions.push(v);
    writeLineageRecord(record);
    const telemetryEndpoint = process.env.VESPER_TELEMETRY_ENDPOINT?.trim();
    if (telemetryEndpoint) {
        postJsonNonBlocking(telemetryEndpoint, {
            event: "lineage.version.appended",
            sent_at: now,
            host: os.hostname(),
            dataset_id_base: base,
            version: v.version,
            dataset_id: v.dataset_id,
            // Already masked above, so no raw key leaves the machine.
            triggered_by: v.triggered_by,
            output: {
                local_path: v.output?.local_path,
                rows: v.output?.rows,
                columns: v.output?.columns,
                format: v.output?.format,
            },
        });
    }
    if (sidecarPath) {
        try {
            fs.writeFileSync(sidecarPath, JSON.stringify(v, null, 2));
        }
        catch {
            // best effort sidecar write
        }
    }
    return { datasetVersionId, version, lineagePath: getLineageRecordPath(base) };
}
|
|
77
212
|
const STRUCTURED_FILE_EXTENSIONS = [".parquet", ".csv", ".jsonl", ".json", ".feather", ".arrow", ".tsv", ".txt"];
|
|
78
213
|
const IMAGE_FILE_EXTENSIONS = new Set([".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff", ".tif", ".svg"]);
|
|
79
|
-
function walkFilesRecursive(rootDir) {
|
|
214
|
+
function walkFilesRecursive(rootDir, maxDepth = Number.POSITIVE_INFINITY) {
|
|
80
215
|
const out = [];
|
|
81
|
-
const stack = [rootDir];
|
|
216
|
+
const stack = [{ dir: rootDir, depth: 0 }];
|
|
82
217
|
while (stack.length > 0) {
|
|
83
|
-
const
|
|
218
|
+
const current = stack.pop();
|
|
219
|
+
const currentDir = current.dir;
|
|
84
220
|
const entries = fs.readdirSync(currentDir, { withFileTypes: true });
|
|
85
221
|
for (const entry of entries) {
|
|
86
222
|
const fullPath = path.join(currentDir, entry.name);
|
|
87
223
|
if (entry.isDirectory()) {
|
|
88
|
-
|
|
224
|
+
if (current.depth < maxDepth) {
|
|
225
|
+
stack.push({ dir: fullPath, depth: current.depth + 1 });
|
|
226
|
+
}
|
|
89
227
|
}
|
|
90
228
|
else if (entry.isFile()) {
|
|
91
229
|
out.push(fullPath);
|
|
@@ -296,6 +434,8 @@ import { QualityOrchestrator } from "./quality/quality-orchestrator.js";
|
|
|
296
434
|
import { ConfigManager } from "./config/config-manager.js";
|
|
297
435
|
import { SecureKeysManager } from "./config/secure-keys.js";
|
|
298
436
|
import readline from "readline";
|
|
437
|
+
import http from "http";
|
|
438
|
+
import https from "https";
|
|
299
439
|
import os from "os";
|
|
300
440
|
// Determine absolute paths relative to the compiled script
|
|
301
441
|
const __filename = fileURLToPath(import.meta.url);
|
|
@@ -321,6 +461,34 @@ function logError(err, context) {
|
|
|
321
461
|
fs.appendFileSync(errorLogPath, msg);
|
|
322
462
|
console.error(`[Vesper] Critical error logged to ${errorLogPath}`);
|
|
323
463
|
}
|
|
464
|
+
/**
 * Fire-and-forget JSON POST used for best-effort telemetry.
 *
 * The request is started and the function returns immediately; the response
 * body is drained and discarded, errors are ignored, and the request is
 * destroyed after 3 seconds of socket inactivity. Nothing is ever thrown.
 *
 * Only `http:` and `https:` URLs are sent; any other scheme is ignored rather
 * than being silently downgraded to plain HTTP. The parsed URL object is handed
 * to Node directly so that IPv6 literals, default ports and embedded
 * credentials are handled by the standard library.
 *
 * @param {string} urlRaw - Absolute endpoint URL.
 * @param {unknown} body - JSON-serialisable payload.
 * @returns {void}
 */
function postJsonNonBlocking(urlRaw, body) {
    try {
        const target = new URL(urlRaw);
        let transport;
        if (target.protocol === "https:") {
            transport = https;
        }
        else if (target.protocol === "http:") {
            transport = http;
        }
        else {
            // Unsupported scheme (ftp:, file:, ...): telemetry is simply skipped.
            return;
        }
        const payload = JSON.stringify(body);
        const req = transport.request(target, {
            method: "POST",
            headers: {
                "content-type": "application/json",
                // Byte length, not string length: the payload may contain non-ASCII text.
                "content-length": Buffer.byteLength(payload),
            },
            timeout: 3000,
        }, (res) => {
            // Drain the response so the socket can be released.
            res.resume();
        });
        req.on("error", () => { });
        // 'timeout' only signals inactivity; the request has to be torn down explicitly.
        req.on("timeout", () => req.destroy());
        req.write(payload);
        req.end();
    }
    catch {
        // best effort telemetry only
    }
}
|
|
324
492
|
// --- Request Queue: serialize all MCP tool calls to prevent crashes ---
|
|
325
493
|
class RequestQueue {
|
|
326
494
|
queue = [];
|
|
@@ -535,6 +703,102 @@ function runPythonJson(scriptPath, args) {
|
|
|
535
703
|
});
|
|
536
704
|
});
|
|
537
705
|
}
|
|
706
|
+
/**
 * Read a data file with polars (in a Python subprocess) and report its schema.
 *
 * The reader is chosen from the file extension: CSV, NDJSON/JSONL, JSON,
 * Parquet and Arrow IPC/Feather are recognised; anything else is attempted as
 * CSV. The function is best-effort: it resolves to `undefined` when the path is
 * empty or missing, when the subprocess exits with a non-zero code, or when any
 * step throws.
 *
 * @param {string} filePath - Path of the file to inspect.
 * @returns {Promise<{rows: number, columns: string[], dtypes: Object<string, string>} | undefined>}
 */
async function getSchemaSnapshot(filePath) {
    try {
        if (!filePath || !fs.existsSync(filePath))
            return undefined;
        await ensurePythonModules([{ module: "polars", packageName: "polars" }]);
        // The script must be newline-separated: Python does not allow compound
        // statements (if/elif/else) to follow `;` on the same logical line.
        const pyCode = [
            "import json, os, sys",
            "import polars as pl",
            "p = sys.argv[1]",
            "ext = os.path.splitext(p)[1].lower()",
            "if ext == '.csv': df = pl.read_csv(p, infer_schema_length=10000, ignore_errors=True)",
            "elif ext in ('.jsonl', '.ndjson'): df = pl.read_ndjson(p)",
            "elif ext == '.json': df = pl.read_json(p)",
            "elif ext in ('.parquet', '.pq'): df = pl.read_parquet(p)",
            "elif ext in ('.feather', '.ftr', '.arrow', '.ipc'): df = pl.read_ipc(p)",
            "else: df = pl.read_csv(p, infer_schema_length=10000, ignore_errors=True)",
            "print(json.dumps({'rows': int(df.height), 'columns': [str(c) for c in df.columns], 'dtypes': {str(c): str(t) for c,t in zip(df.columns, df.dtypes)}}))",
        ].join("\n");
        // NOTE(review): the second argument looks like a timeout in milliseconds — confirm against runPythonProcess.
        const proc = await runPythonProcess(["-c", pyCode, filePath], 120000);
        if (proc.code !== 0)
            return undefined;
        return JSON.parse((proc.stdout || "{}").trim());
    }
    catch {
        return undefined;
    }
}
|
|
732
|
+
/**
 * Combine several schema snapshots into one summary.
 *
 * - `rows` is the sum of all row counts, or `undefined` as soon as any snapshot
 *   lacks a numeric count. The result therefore does not depend on the order
 *   of the snapshots and a partial sum is never reported as a total.
 * - `columns` is the sorted union of all column names.
 * - `dtypes` maps each column to its dtype, or to `mixed(a|b|...)` when the
 *   snapshots disagree (dtypes listed in first-seen order).
 *
 * Falsy entries are skipped; `undefined` is returned when nothing usable is
 * left (including when `snapshots` itself is null/undefined).
 *
 * @param {Array<{rows?: number, columns?: unknown[], dtypes?: object} | null | undefined>} snapshots
 * @returns {{rows: number | undefined, columns: string[], dtypes: Object<string, string>} | undefined}
 */
function mergeSchemaSnapshots(snapshots) {
    const valid = (snapshots || []).filter(Boolean);
    if (valid.length === 0)
        return undefined;
    let rows = 0;
    const columnNames = new Set();
    // A Map keeps column names such as "constructor" or "__proto__" from
    // colliding with members inherited from Object.prototype.
    const dtypesByColumn = new Map();
    for (const snapshot of valid) {
        if (rows !== undefined && typeof snapshot.rows === "number") {
            rows += snapshot.rows;
        }
        else {
            // Unknown stays unknown, whatever comes afterwards.
            rows = undefined;
        }
        for (const column of snapshot.columns || [])
            columnNames.add(String(column));
        for (const [column, dtype] of Object.entries(snapshot.dtypes || {})) {
            let seen = dtypesByColumn.get(column);
            if (!seen) {
                seen = new Set();
                dtypesByColumn.set(column, seen);
            }
            seen.add(String(dtype));
        }
    }
    const dtypes = Object.fromEntries(Array.from(dtypesByColumn, ([column, seen]) => {
        const list = Array.from(seen);
        return [column, list.length <= 1 ? list[0] : `mixed(${list.join("|")})`];
    }));
    return {
        rows,
        columns: Array.from(columnNames).sort(),
        dtypes,
    };
}
|
|
765
|
+
/**
 * Compare two schemas and list what changed between them.
 *
 * A missing (non-array) column list is treated as empty and a missing dtype map
 * as having no entries, so partially recorded schemas can still be compared.
 * Dtypes are looked up as own properties only; a column called "constructor"
 * or "toString" is therefore never matched against Object.prototype members.
 *
 * @param {string[]} fromColumns - Column names of the older schema.
 * @param {string[]} toColumns - Column names of the newer schema.
 * @param {Object<string, string>} [fromDtypes] - Column -> dtype, older schema.
 * @param {Object<string, string>} [toDtypes] - Column -> dtype, newer schema.
 * @returns {{added_columns: string[], removed_columns: string[], changed_dtypes: Array<{column: string, from: string | undefined, to: string | undefined}>}}
 *   Added columns follow the order of `toColumns`; removed and changed columns
 *   follow the order of `fromColumns`.
 */
function diffSchemaMaps(fromColumns, toColumns, fromDtypes, toDtypes) {
    const before = Array.isArray(fromColumns) ? fromColumns : [];
    const after = Array.isArray(toColumns) ? toColumns : [];
    const dtypeOf = (dtypes, column) => (dtypes != null && Object.prototype.hasOwnProperty.call(dtypes, column)
        ? dtypes[column]
        : undefined);
    const beforeSet = new Set(before);
    const afterSet = new Set(after);
    const added_columns = after.filter((c) => !beforeSet.has(c));
    const removed_columns = before.filter((c) => !afterSet.has(c));
    const changed_dtypes = before
        .filter((c) => afterSet.has(c))
        .map((c) => ({ column: c, from: dtypeOf(fromDtypes, c), to: dtypeOf(toDtypes, c) }))
        // An absent dtype and an empty one count as the same thing.
        .filter((change) => String(change.from || "") !== String(change.to || ""));
    return { added_columns, removed_columns, changed_dtypes };
}
|
|
776
|
+
/**
 * Structural check for a lineage record.
 *
 * @param {unknown} value - Candidate value, e.g. freshly parsed JSON.
 * @returns {boolean} `true` when `value` is a non-null object with a string
 *   `dataset_id_base` and an array `versions`.
 */
function isLineageRecordShape(value) {
    if (!value || typeof value !== "object") {
        return false;
    }
    if (typeof value.dataset_id_base !== "string") {
        return false;
    }
    return Array.isArray(value.versions);
}
|
|
779
|
+
/**
 * Extract recent entries from an error log.
 *
 * Only lines of the form `[<timestamp>] ERROR in <message>` are considered. An
 * entry is kept when its timestamp is parseable by `Date.parse` and is not
 * older than `withinDays` days; at most the last 20 kept entries are returned,
 * in file order. A missing file or any read failure yields an empty list.
 *
 * @param {string} filePath - Path of the log file.
 * @param {number} withinDays - Size of the look-back window in days.
 * @returns {Array<{at: string, message: string}>} `at` is the timestamp text as
 *   it appears in the log.
 */
function parseErrorLogLines(filePath, withinDays) {
    const entryPattern = /^\[(.+?)\]\s+ERROR\s+in\s+(.+)$/;
    const millisPerDay = 24 * 60 * 60 * 1000;
    try {
        if (!fs.existsSync(filePath)) {
            return [];
        }
        const nonEmptyLines = fs.readFileSync(filePath, "utf-8")
            .split(/\r?\n/)
            .filter(Boolean);
        const cutoff = Date.now() - withinDays * millisPerDay;
        const recent = nonEmptyLines
            .map((line) => line.match(entryPattern))
            .filter((match) => {
                if (!match) {
                    return false;
                }
                const stamp = Date.parse(match[1]);
                return Number.isFinite(stamp) && stamp >= cutoff;
            })
            .map(([, at, message]) => ({ at, message }));
        return recent.slice(-20);
    }
    catch {
        return [];
    }
}
|
|
538
802
|
async function countRows(filePath) {
|
|
539
803
|
const scriptPath = path.join(dataRoot, "python", "row_count.py");
|
|
540
804
|
const result = await runPythonJson(scriptPath, [filePath]);
|
|
@@ -1166,6 +1430,14 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1166
1430
|
type: "boolean",
|
|
1167
1431
|
description: "When true, operation='providers' includes connectors that are scaffolded but not currently configured.",
|
|
1168
1432
|
},
|
|
1433
|
+
agent_id: {
|
|
1434
|
+
type: "string",
|
|
1435
|
+
description: "Strongly recommended: caller agent identity for lineage/audit.",
|
|
1436
|
+
},
|
|
1437
|
+
pipeline_id: {
|
|
1438
|
+
type: "string",
|
|
1439
|
+
description: "Strongly recommended: workflow/pipeline identifier for lineage/audit.",
|
|
1440
|
+
},
|
|
1169
1441
|
},
|
|
1170
1442
|
required: ["operation"],
|
|
1171
1443
|
},
|
|
@@ -1185,6 +1457,8 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1185
1457
|
limit: { type: "number", description: "Max documents to return (default 10, max 50)." },
|
|
1186
1458
|
arxiv_full_text: { type: "boolean", description: "When true, fetch and parse ArXiv PDFs and return full text as document content (slower)." },
|
|
1187
1459
|
github_include_readme: { type: "boolean", description: "When true, fetch and include GitHub README.md text as document content (slower)." },
|
|
1460
|
+
agent_id: { type: "string", description: "Strongly recommended: caller agent identity for lineage/audit." },
|
|
1461
|
+
pipeline_id: { type: "string", description: "Strongly recommended: workflow/pipeline identifier for lineage/audit." },
|
|
1188
1462
|
},
|
|
1189
1463
|
required: ["query"],
|
|
1190
1464
|
},
|
|
@@ -1232,6 +1506,8 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1232
1506
|
enum: ["semantic", "exact", "none"],
|
|
1233
1507
|
description: "How to deduplicate across sources.",
|
|
1234
1508
|
},
|
|
1509
|
+
agent_id: { type: "string", description: "Strongly recommended: caller agent identity for lineage/audit." },
|
|
1510
|
+
pipeline_id: { type: "string", description: "Strongly recommended: workflow/pipeline identifier for lineage/audit." },
|
|
1235
1511
|
},
|
|
1236
1512
|
required: ["sources"],
|
|
1237
1513
|
},
|
|
@@ -1435,6 +1711,8 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1435
1711
|
download_images: { type: "boolean", description: "When true, enables post-prepare smart asset downloading for image/media datasets." },
|
|
1436
1712
|
cleaning_options: { type: "object" },
|
|
1437
1713
|
split_config: { type: "object" },
|
|
1714
|
+
agent_id: { type: "string", description: "Strongly recommended: caller agent identity for lineage/audit." },
|
|
1715
|
+
pipeline_id: { type: "string", description: "Strongly recommended: workflow/pipeline identifier for lineage/audit." },
|
|
1438
1716
|
},
|
|
1439
1717
|
required: ["query"],
|
|
1440
1718
|
},
|
|
@@ -1509,6 +1787,14 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1509
1787
|
items: { type: "string" },
|
|
1510
1788
|
description: "Export only these columns (faster for wide datasets).",
|
|
1511
1789
|
},
|
|
1790
|
+
agent_id: {
|
|
1791
|
+
type: "string",
|
|
1792
|
+
description: "Strongly recommended: caller agent identity for lineage/audit.",
|
|
1793
|
+
},
|
|
1794
|
+
pipeline_id: {
|
|
1795
|
+
type: "string",
|
|
1796
|
+
description: "Strongly recommended: workflow/pipeline identifier for lineage/audit.",
|
|
1797
|
+
},
|
|
1512
1798
|
},
|
|
1513
1799
|
required: ["dataset_id"],
|
|
1514
1800
|
},
|
|
@@ -1521,6 +1807,42 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1521
1807
|
properties: {},
|
|
1522
1808
|
},
|
|
1523
1809
|
},
|
|
1810
|
+
{
|
|
1811
|
+
name: "get_lineage",
|
|
1812
|
+
description: "Get version history and full lineage/provenance for a dataset (sources, steps, inputs/outputs, trigger metadata).",
|
|
1813
|
+
inputSchema: {
|
|
1814
|
+
type: "object",
|
|
1815
|
+
properties: {
|
|
1816
|
+
dataset_id: {
|
|
1817
|
+
type: "string",
|
|
1818
|
+
description: "Dataset ID (base or versioned, e.g. my_dataset or my_dataset_v2).",
|
|
1819
|
+
},
|
|
1820
|
+
},
|
|
1821
|
+
required: ["dataset_id"],
|
|
1822
|
+
},
|
|
1823
|
+
},
|
|
1824
|
+
{
|
|
1825
|
+
name: "diff_lineage_versions",
|
|
1826
|
+
description: "Diff two lineage versions for one dataset and return structured changes (schema, rows, steps, actor identity).",
|
|
1827
|
+
inputSchema: {
|
|
1828
|
+
type: "object",
|
|
1829
|
+
properties: {
|
|
1830
|
+
dataset_id: {
|
|
1831
|
+
type: "string",
|
|
1832
|
+
description: "Dataset ID (base or versioned).",
|
|
1833
|
+
},
|
|
1834
|
+
from_version: {
|
|
1835
|
+
type: "number",
|
|
1836
|
+
description: "Source lineage version number (e.g., 1).",
|
|
1837
|
+
},
|
|
1838
|
+
to_version: {
|
|
1839
|
+
type: "number",
|
|
1840
|
+
description: "Target lineage version number (e.g., 2).",
|
|
1841
|
+
},
|
|
1842
|
+
},
|
|
1843
|
+
required: ["dataset_id", "from_version", "to_version"],
|
|
1844
|
+
},
|
|
1845
|
+
},
|
|
1524
1846
|
{
|
|
1525
1847
|
name: "vesper_convert_format",
|
|
1526
1848
|
description: "Convert a dataset file between formats (CSV, Parquet, JSON, JSONL). Auto-detects input format from extension. Saves output in the same directory with the new extension and registers it in the Vesper registry.",
|
|
@@ -1536,10 +1858,62 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1536
1858
|
enum: ["csv", "parquet", "json", "jsonl"],
|
|
1537
1859
|
description: "The desired output format.",
|
|
1538
1860
|
},
|
|
1861
|
+
agent_id: {
|
|
1862
|
+
type: "string",
|
|
1863
|
+
description: "Strongly recommended: caller agent identity for lineage/audit.",
|
|
1864
|
+
},
|
|
1865
|
+
pipeline_id: {
|
|
1866
|
+
type: "string",
|
|
1867
|
+
description: "Strongly recommended: workflow/pipeline identifier for lineage/audit.",
|
|
1868
|
+
},
|
|
1539
1869
|
},
|
|
1540
1870
|
required: ["file_path", "target_format"],
|
|
1541
1871
|
},
|
|
1542
1872
|
},
|
|
1873
|
+
{
|
|
1874
|
+
name: "vesper_normalize_schema",
|
|
1875
|
+
description: "Normalize ragged JSON/JSONL rows into a schema-uniform JSONL (or JSON) by flattening metadata_json into stable columns (fills missing values with null). Useful before converting fused WebCore JSON to Parquet.",
|
|
1876
|
+
inputSchema: {
|
|
1877
|
+
type: "object",
|
|
1878
|
+
properties: {
|
|
1879
|
+
file_path: {
|
|
1880
|
+
type: "string",
|
|
1881
|
+
description: "Absolute path to the input file (.json or .jsonl). If it's a fused Vesper output JSON, tool will extract results[].",
|
|
1882
|
+
},
|
|
1883
|
+
output_format: {
|
|
1884
|
+
type: "string",
|
|
1885
|
+
enum: ["jsonl", "json"],
|
|
1886
|
+
description: "Output format for normalized rows. Default: jsonl.",
|
|
1887
|
+
},
|
|
1888
|
+
output_dir: {
|
|
1889
|
+
type: "string",
|
|
1890
|
+
description: "Directory to write normalized output. Default: ~/.vesper/data/normalized_schema",
|
|
1891
|
+
},
|
|
1892
|
+
flatten_metadata_json: {
|
|
1893
|
+
type: "boolean",
|
|
1894
|
+
description: "Flatten metadata_json into metadata__* columns. Default: true.",
|
|
1895
|
+
},
|
|
1896
|
+
max_keys: {
|
|
1897
|
+
type: "number",
|
|
1898
|
+
description: "Max number of metadata_json keys to materialize as columns. Extra keys go into metadata_json_blob (if extras_mode='blob'). Default: 200.",
|
|
1899
|
+
},
|
|
1900
|
+
extras_mode: {
|
|
1901
|
+
type: "string",
|
|
1902
|
+
enum: ["blob", "drop"],
|
|
1903
|
+
description: "How to handle metadata_json keys beyond max_keys. blob keeps them in metadata_json_blob; drop discards them. Default: blob.",
|
|
1904
|
+
},
|
|
1905
|
+
agent_id: {
|
|
1906
|
+
type: "string",
|
|
1907
|
+
description: "Strongly recommended: caller agent identity for lineage/audit.",
|
|
1908
|
+
},
|
|
1909
|
+
pipeline_id: {
|
|
1910
|
+
type: "string",
|
|
1911
|
+
description: "Strongly recommended: workflow/pipeline identifier for lineage/audit.",
|
|
1912
|
+
},
|
|
1913
|
+
},
|
|
1914
|
+
required: ["file_path"],
|
|
1915
|
+
},
|
|
1916
|
+
},
|
|
1543
1917
|
{
|
|
1544
1918
|
name: "fuse_datasets",
|
|
1545
1919
|
description: "Fuse/combine multiple datasets via concat or join. Optionally runs quality & leakage checks afterward.",
|
|
@@ -1725,6 +2099,30 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1725
2099
|
arxiv_full_text: request.params.arguments?.arxiv_full_text === true,
|
|
1726
2100
|
github_include_readme: request.params.arguments?.github_include_readme === true,
|
|
1727
2101
|
});
|
|
2102
|
+
try {
|
|
2103
|
+
appendLineageVersion({
|
|
2104
|
+
datasetIdBase: `webfind_${query || "query"}`,
|
|
2105
|
+
tool: "vesper_web_find",
|
|
2106
|
+
requestArgs: request.params.arguments,
|
|
2107
|
+
output: {
|
|
2108
|
+
rows: Array.isArray(result.results) ? result.results.length : undefined,
|
|
2109
|
+
},
|
|
2110
|
+
sources: Array.isArray(result.results)
|
|
2111
|
+
? result.results.slice(0, 200).map((r) => ({
|
|
2112
|
+
source: String(r?.source_type || "unknown"),
|
|
2113
|
+
url: typeof r?.source_url === "string" ? r.source_url : undefined,
|
|
2114
|
+
at: typeof r?.collected_at === "string" ? r.collected_at : undefined,
|
|
2115
|
+
}))
|
|
2116
|
+
: [],
|
|
2117
|
+
steps: [
|
|
2118
|
+
{ step: "web_find_discover", at: new Date().toISOString(), params: { query, sources, limit } },
|
|
2119
|
+
{ step: "web_find_complete", at: new Date().toISOString(), metrics: { result_count: Array.isArray(result.results) ? result.results.length : 0 } },
|
|
2120
|
+
],
|
|
2121
|
+
});
|
|
2122
|
+
}
|
|
2123
|
+
catch (e) {
|
|
2124
|
+
console.error(`[Lineage] vesper_web_find append failed: ${e?.message || e}`);
|
|
2125
|
+
}
|
|
1728
2126
|
return {
|
|
1729
2127
|
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
1730
2128
|
};
|
|
@@ -1846,6 +2244,30 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1846
2244
|
limit: Number(request.params.arguments?.limit || 10),
|
|
1847
2245
|
publicOnly,
|
|
1848
2246
|
});
|
|
2247
|
+
try {
|
|
2248
|
+
appendLineageVersion({
|
|
2249
|
+
datasetIdBase: `discover_${source}_${query || "query"}`,
|
|
2250
|
+
tool: "unified_dataset_api.discover",
|
|
2251
|
+
requestArgs: request.params.arguments,
|
|
2252
|
+
output: { rows: Array.isArray(result.results) ? result.results.length : undefined },
|
|
2253
|
+
sources: Array.isArray(result.results)
|
|
2254
|
+
? result.results.slice(0, 200).map((r) => ({
|
|
2255
|
+
source: String(r?.source || source || "unknown"),
|
|
2256
|
+
url: typeof r?.download_url === "string"
|
|
2257
|
+
? r.download_url
|
|
2258
|
+
: (typeof r?.metadata_url === "string" ? r.metadata_url : undefined),
|
|
2259
|
+
at: new Date().toISOString(),
|
|
2260
|
+
}))
|
|
2261
|
+
: [],
|
|
2262
|
+
steps: [
|
|
2263
|
+
{ step: "discover_requested", at: new Date().toISOString(), params: { query, source, limit: Number(request.params.arguments?.limit || 10), publicOnly } },
|
|
2264
|
+
{ step: "discover_completed", at: new Date().toISOString(), metrics: { result_count: Array.isArray(result.results) ? result.results.length : 0 } },
|
|
2265
|
+
],
|
|
2266
|
+
});
|
|
2267
|
+
}
|
|
2268
|
+
catch (e) {
|
|
2269
|
+
console.error(`[Lineage] unified discover append failed: ${e?.message || e}`);
|
|
2270
|
+
}
|
|
1849
2271
|
return {
|
|
1850
2272
|
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
1851
2273
|
};
|
|
@@ -1878,6 +2300,36 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1878
2300
|
catch (e) {
|
|
1879
2301
|
console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
|
|
1880
2302
|
}
|
|
2303
|
+
try {
|
|
2304
|
+
const schemaAfter = await getSchemaSnapshot(result.copied_to || result.local_path);
|
|
2305
|
+
const lineage = appendLineageVersion({
|
|
2306
|
+
datasetIdBase: result.dataset_id,
|
|
2307
|
+
tool: "unified_dataset_api.download",
|
|
2308
|
+
requestArgs: request.params.arguments,
|
|
2309
|
+
outputPath: result.copied_to || result.local_path,
|
|
2310
|
+
output: {
|
|
2311
|
+
local_path: result.copied_to || result.local_path,
|
|
2312
|
+
format: path.extname(result.copied_to || result.local_path).replace(".", ""),
|
|
2313
|
+
schema_after: schemaAfter,
|
|
2314
|
+
},
|
|
2315
|
+
sources: [{
|
|
2316
|
+
source: source,
|
|
2317
|
+
url: typeof result.dataset_id === "string" ? result.dataset_id : undefined,
|
|
2318
|
+
at: new Date().toISOString(),
|
|
2319
|
+
}],
|
|
2320
|
+
steps: [
|
|
2321
|
+
{ step: "download_requested", at: new Date().toISOString(), params: { datasetId, source, targetDir } },
|
|
2322
|
+
{ step: "download_completed", at: new Date().toISOString(), metrics: { local_path: result.copied_to || result.local_path } },
|
|
2323
|
+
],
|
|
2324
|
+
});
|
|
2325
|
+
try {
|
|
2326
|
+
upsertRegistry(lineage.datasetVersionId, result.copied_to || result.local_path, "completed");
|
|
2327
|
+
}
|
|
2328
|
+
catch { }
|
|
2329
|
+
}
|
|
2330
|
+
catch (e) {
|
|
2331
|
+
console.error(`[Lineage] unified download append failed: ${e?.message || e}`);
|
|
2332
|
+
}
|
|
1881
2333
|
return {
|
|
1882
2334
|
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
1883
2335
|
};
|
|
@@ -2413,6 +2865,32 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2413
2865
|
};
|
|
2414
2866
|
}
|
|
2415
2867
|
jobStatusLastPoll[jobId] = now;
|
|
2868
|
+
if (job.status === "completed") {
|
|
2869
|
+
try {
|
|
2870
|
+
const meta = job.metadata ? JSON.parse(job.metadata) : {};
|
|
2871
|
+
const baseId = String(meta?.datasetId || meta?.dataset_id || meta?.query || job.id);
|
|
2872
|
+
const outPath = typeof job.result_url === "string" ? job.result_url : undefined;
|
|
2873
|
+
appendLineageVersion({
|
|
2874
|
+
datasetIdBase: baseId,
|
|
2875
|
+
tool: `job:${job.type}`,
|
|
2876
|
+
requestArgs: {
|
|
2877
|
+
dataset_id: meta?.datasetId || meta?.dataset_id,
|
|
2878
|
+
query: meta?.query,
|
|
2879
|
+
pipeline_id: meta?.pipeline_id,
|
|
2880
|
+
agent_id: meta?.agent_id,
|
|
2881
|
+
},
|
|
2882
|
+
outputPath: outPath,
|
|
2883
|
+
output: {},
|
|
2884
|
+
steps: [
|
|
2885
|
+
{ step: `${job.type}_started`, at: job.created_at, params: meta || {} },
|
|
2886
|
+
{ step: `${job.type}_completed`, at: job.updated_at || new Date().toISOString(), metrics: { progress: job.progress } },
|
|
2887
|
+
],
|
|
2888
|
+
});
|
|
2889
|
+
}
|
|
2890
|
+
catch (e) {
|
|
2891
|
+
console.error(`[Lineage] check_job_status append failed: ${e?.message || e}`);
|
|
2892
|
+
}
|
|
2893
|
+
}
|
|
2416
2894
|
return {
|
|
2417
2895
|
content: [{ type: "text", text: formatJobStatus(job) }]
|
|
2418
2896
|
};
|
|
@@ -2531,10 +3009,36 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2531
3009
|
if (!fs.existsSync(outDir))
|
|
2532
3010
|
fs.mkdirSync(outDir, { recursive: true });
|
|
2533
3011
|
const outputFile = path.join(outDir, `${safeName}${ext}`);
|
|
3012
|
+
const schemaBefore = await getSchemaSnapshot(sourcePath);
|
|
2534
3013
|
const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
|
|
3014
|
+
const schemaAfter = await getSchemaSnapshot(result.output_path);
|
|
3015
|
+
const lineage = appendLineageVersion({
|
|
3016
|
+
datasetIdBase: datasetId,
|
|
3017
|
+
tool: "export_dataset",
|
|
3018
|
+
requestArgs: request.params.arguments,
|
|
3019
|
+
outputPath: result.output_path,
|
|
3020
|
+
output: {
|
|
3021
|
+
rows: result.rows,
|
|
3022
|
+
columns: result.columns,
|
|
3023
|
+
format: requestedFormat,
|
|
3024
|
+
size_mb: result.file_size_mb,
|
|
3025
|
+
schema_before: schemaBefore,
|
|
3026
|
+
schema_after: schemaAfter,
|
|
3027
|
+
},
|
|
3028
|
+
steps: [
|
|
3029
|
+
{ step: "source_resolved", at: new Date().toISOString(), params: { sourcePath } },
|
|
3030
|
+
{ step: "exported", at: new Date().toISOString(), params: { format: requestedFormat, compression }, metrics: { rows: result.rows, columns: result.columns } },
|
|
3031
|
+
],
|
|
3032
|
+
});
|
|
3033
|
+
try {
|
|
3034
|
+
upsertRegistry(lineage.datasetVersionId, result.output_path, "completed");
|
|
3035
|
+
}
|
|
3036
|
+
catch { }
|
|
2535
3037
|
// Build rich response
|
|
2536
3038
|
let msg = `**Export complete**\n`;
|
|
2537
3039
|
msg += `- **File**: ${result.output_path}\n`;
|
|
3040
|
+
msg += `- **Version**: ${lineage.datasetVersionId}\n`;
|
|
3041
|
+
msg += `- **Lineage**: ${lineage.lineagePath}\n`;
|
|
2538
3042
|
msg += `- **Format**: ${result.format}${result.compression ? ` (${result.compression})` : ""}\n`;
|
|
2539
3043
|
msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
|
|
2540
3044
|
if (result.file_size_mb !== undefined)
|
|
@@ -2580,6 +3084,100 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2580
3084
|
content: [{ type: "text", text: `**Vesper Registry** (${entries.length} dataset${entries.length !== 1 ? "s" : ""}):\n\n${lines.join("\n\n")}` }]
|
|
2581
3085
|
};
|
|
2582
3086
|
}
|
|
3087
|
+
case "get_lineage": {
|
|
3088
|
+
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
3089
|
+
if (!datasetId) {
|
|
3090
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
3091
|
+
}
|
|
3092
|
+
const base = toBaseDatasetId(datasetId);
|
|
3093
|
+
const record = readLineageRecord(base);
|
|
3094
|
+
if (!record.versions || record.versions.length === 0) {
|
|
3095
|
+
return {
|
|
3096
|
+
content: [{ type: "text", text: `No lineage found for '${datasetId}' yet.` }]
|
|
3097
|
+
};
|
|
3098
|
+
}
|
|
3099
|
+
return {
|
|
3100
|
+
content: [{ type: "text", text: JSON.stringify(record, null, 2) }]
|
|
3101
|
+
};
|
|
3102
|
+
}
|
|
3103
|
+
case "diff_lineage_versions": {
|
|
3104
|
+
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
3105
|
+
const fromVersion = Number(request.params.arguments?.from_version);
|
|
3106
|
+
const toVersion = Number(request.params.arguments?.to_version);
|
|
3107
|
+
if (!datasetId) {
|
|
3108
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
3109
|
+
}
|
|
3110
|
+
if (!Number.isInteger(fromVersion) || fromVersion <= 0) {
|
|
3111
|
+
throw new McpError(ErrorCode.InvalidParams, "from_version must be a positive integer");
|
|
3112
|
+
}
|
|
3113
|
+
if (!Number.isInteger(toVersion) || toVersion <= 0) {
|
|
3114
|
+
throw new McpError(ErrorCode.InvalidParams, "to_version must be a positive integer");
|
|
3115
|
+
}
|
|
3116
|
+
const base = toBaseDatasetId(datasetId);
|
|
3117
|
+
const record = readLineageRecord(base);
|
|
3118
|
+
const fromV = record.versions.find((v) => v.version === fromVersion);
|
|
3119
|
+
const toV = record.versions.find((v) => v.version === toVersion);
|
|
3120
|
+
if (!fromV || !toV) {
|
|
3121
|
+
return {
|
|
3122
|
+
content: [{ type: "text", text: `ERROR: Could not find both versions in lineage for '${datasetId}'.` }],
|
|
3123
|
+
isError: true,
|
|
3124
|
+
};
|
|
3125
|
+
}
|
|
3126
|
+
const fromSchema = (toV.output?.schema_before && toVersion > fromVersion)
|
|
3127
|
+
? fromV.output?.schema_after || fromV.output?.schema_before || {}
|
|
3128
|
+
: fromV.output?.schema_after || fromV.output?.schema_before || {};
|
|
3129
|
+
const toSchema = toV.output?.schema_after || toV.output?.schema_before || {};
|
|
3130
|
+
const fromCols = Array.isArray(fromSchema.columns) ? fromSchema.columns.map((c) => String(c)) : [];
|
|
3131
|
+
const toCols = Array.isArray(toSchema.columns) ? toSchema.columns.map((c) => String(c)) : [];
|
|
3132
|
+
const fromDtypes = (fromSchema.dtypes && typeof fromSchema.dtypes === "object") ? fromSchema.dtypes : {};
|
|
3133
|
+
const toDtypes = (toSchema.dtypes && typeof toSchema.dtypes === "object") ? toSchema.dtypes : {};
|
|
3134
|
+
const schemaDiff = diffSchemaMaps(fromCols, toCols, fromDtypes, toDtypes);
|
|
3135
|
+
const fromRows = typeof fromSchema.rows === "number"
|
|
3136
|
+
? fromSchema.rows
|
|
3137
|
+
: (typeof fromV.output?.rows === "number" ? fromV.output.rows : undefined);
|
|
3138
|
+
const toRows = typeof toSchema.rows === "number"
|
|
3139
|
+
? toSchema.rows
|
|
3140
|
+
: (typeof toV.output?.rows === "number" ? toV.output.rows : undefined);
|
|
3141
|
+
const fromSteps = new Set((fromV.steps || []).map((s) => String(s.step)));
|
|
3142
|
+
const toSteps = new Set((toV.steps || []).map((s) => String(s.step)));
|
|
3143
|
+
const addedSteps = Array.from(toSteps).filter((s) => !fromSteps.has(s));
|
|
3144
|
+
const removedSteps = Array.from(fromSteps).filter((s) => !toSteps.has(s));
|
|
3145
|
+
const actorDiff = {
|
|
3146
|
+
changed: String(fromV.triggered_by?.agent_id || "") !== String(toV.triggered_by?.agent_id || "") ||
|
|
3147
|
+
String(fromV.triggered_by?.pipeline_id || "") !== String(toV.triggered_by?.pipeline_id || ""),
|
|
3148
|
+
from: {
|
|
3149
|
+
tool: fromV.triggered_by?.tool,
|
|
3150
|
+
agent_id: fromV.triggered_by?.agent_id,
|
|
3151
|
+
pipeline_id: fromV.triggered_by?.pipeline_id,
|
|
3152
|
+
},
|
|
3153
|
+
to: {
|
|
3154
|
+
tool: toV.triggered_by?.tool,
|
|
3155
|
+
agent_id: toV.triggered_by?.agent_id,
|
|
3156
|
+
pipeline_id: toV.triggered_by?.pipeline_id,
|
|
3157
|
+
},
|
|
3158
|
+
};
|
|
3159
|
+
const diffResult = {
|
|
3160
|
+
dataset_id_base: base,
|
|
3161
|
+
from_version: fromVersion,
|
|
3162
|
+
to_version: toVersion,
|
|
3163
|
+
schema_diff: schemaDiff,
|
|
3164
|
+
row_count_delta: {
|
|
3165
|
+
from: fromRows,
|
|
3166
|
+
to: toRows,
|
|
3167
|
+
delta: (typeof fromRows === "number" && typeof toRows === "number") ? (toRows - fromRows) : undefined,
|
|
3168
|
+
},
|
|
3169
|
+
steps_diff: {
|
|
3170
|
+
added: addedSteps,
|
|
3171
|
+
removed: removedSteps,
|
|
3172
|
+
from_steps: Array.from(fromSteps),
|
|
3173
|
+
to_steps: Array.from(toSteps),
|
|
3174
|
+
},
|
|
3175
|
+
actor_diff: actorDiff,
|
|
3176
|
+
};
|
|
3177
|
+
return {
|
|
3178
|
+
content: [{ type: "text", text: JSON.stringify(diffResult, null, 2) }],
|
|
3179
|
+
};
|
|
3180
|
+
}
|
|
2583
3181
|
case "vesper_convert_format": {
|
|
2584
3182
|
const filePath = String(request.params.arguments?.file_path || "").trim();
|
|
2585
3183
|
const targetFormat = String(request.params.arguments?.target_format || "").trim().toLowerCase();
|
|
@@ -2607,7 +3205,9 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2607
3205
|
try {
|
|
2608
3206
|
await ensurePythonModules([{ module: "polars", packageName: "polars" }]);
|
|
2609
3207
|
const convertScript = path.join(dataRoot, "python", "convert_engine.py");
|
|
3208
|
+
const schemaBefore = await getSchemaSnapshot(filePath);
|
|
2610
3209
|
const result = await runPythonJson(convertScript, [filePath, outputPath]);
|
|
3210
|
+
const schemaAfter = await getSchemaSnapshot(outputPath);
|
|
2611
3211
|
if (!result.ok) {
|
|
2612
3212
|
return {
|
|
2613
3213
|
content: [{ type: "text", text: `ERROR: Conversion failed: ${result.error}` }],
|
|
@@ -2622,9 +3222,32 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2622
3222
|
catch (e) {
|
|
2623
3223
|
console.error(`[Convert] Registry write failed: ${e?.message || e}`);
|
|
2624
3224
|
}
|
|
3225
|
+
const lineage = appendLineageVersion({
|
|
3226
|
+
datasetIdBase: datasetId,
|
|
3227
|
+
tool: "vesper_convert_format",
|
|
3228
|
+
requestArgs: request.params.arguments,
|
|
3229
|
+
outputPath,
|
|
3230
|
+
output: {
|
|
3231
|
+
rows: result.rows,
|
|
3232
|
+
columns: result.columns,
|
|
3233
|
+
format: targetFormat,
|
|
3234
|
+
size_mb: result.size_mb,
|
|
3235
|
+
schema_before: schemaBefore,
|
|
3236
|
+
schema_after: schemaAfter,
|
|
3237
|
+
},
|
|
3238
|
+
steps: [
|
|
3239
|
+
{ step: "converted", at: new Date().toISOString(), params: { from: inputExt, to: outputExt } },
|
|
3240
|
+
],
|
|
3241
|
+
});
|
|
3242
|
+
try {
|
|
3243
|
+
upsertRegistry(lineage.datasetVersionId, outputPath, "completed");
|
|
3244
|
+
}
|
|
3245
|
+
catch { }
|
|
2625
3246
|
let msg = `**Conversion complete**\n`;
|
|
2626
3247
|
msg += `- **Input**: ${filePath} (${inputExt.slice(1)})\n`;
|
|
2627
3248
|
msg += `- **Output**: ${result.output_path} (${targetFormat})\n`;
|
|
3249
|
+
msg += `- **Version**: ${lineage.datasetVersionId}\n`;
|
|
3250
|
+
msg += `- **Lineage**: ${lineage.lineagePath}\n`;
|
|
2628
3251
|
msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
|
|
2629
3252
|
if (result.size_mb !== undefined)
|
|
2630
3253
|
msg += `- **Size**: ${result.size_mb} MB\n`;
|
|
@@ -2637,6 +3260,86 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2637
3260
|
};
|
|
2638
3261
|
}
|
|
2639
3262
|
}
|
|
3263
|
+
case "vesper_normalize_schema": {
|
|
3264
|
+
const filePath = String(request.params.arguments?.file_path || "").trim();
|
|
3265
|
+
const outputFormat = String(request.params.arguments?.output_format || "jsonl").trim().toLowerCase();
|
|
3266
|
+
const outputDirRaw = request.params.arguments?.output_dir ? String(request.params.arguments.output_dir).trim() : "";
|
|
3267
|
+
const flattenMetadataJson = request.params.arguments?.flatten_metadata_json !== false;
|
|
3268
|
+
const maxKeys = Number(request.params.arguments?.max_keys ?? 200);
|
|
3269
|
+
const extrasMode = String(request.params.arguments?.extras_mode || "blob").trim().toLowerCase();
|
|
3270
|
+
if (!filePath) {
|
|
3271
|
+
throw new McpError(ErrorCode.InvalidParams, "file_path is required");
|
|
3272
|
+
}
|
|
3273
|
+
if (!["jsonl", "json"].includes(outputFormat)) {
|
|
3274
|
+
throw new McpError(ErrorCode.InvalidParams, "output_format must be one of: jsonl, json");
|
|
3275
|
+
}
|
|
3276
|
+
if (!fs.existsSync(filePath)) {
|
|
3277
|
+
return { content: [{ type: "text", text: `ERROR: File not found: ${filePath}` }], isError: true };
|
|
3278
|
+
}
|
|
3279
|
+
const outDir = outputDirRaw || path.join(dataRoot, "data", "normalized_schema");
|
|
3280
|
+
if (!fs.existsSync(outDir))
|
|
3281
|
+
fs.mkdirSync(outDir, { recursive: true });
|
|
3282
|
+
const baseName = path.parse(filePath).name || `normalized_${Date.now()}`;
|
|
3283
|
+
const outputPath = path.join(outDir, `${baseName}.normalized.${outputFormat}`);
|
|
3284
|
+
try {
|
|
3285
|
+
const scriptPath = path.join(dataRoot, "python", "normalize_schema_engine.py");
|
|
3286
|
+
const options = {
|
|
3287
|
+
flatten_metadata_json: !!flattenMetadataJson,
|
|
3288
|
+
max_keys: Number.isFinite(maxKeys) ? maxKeys : 200,
|
|
3289
|
+
extras_mode: ["blob", "drop"].includes(extrasMode) ? extrasMode : "blob",
|
|
3290
|
+
};
|
|
3291
|
+
const schemaBefore = await getSchemaSnapshot(filePath);
|
|
3292
|
+
const result = await runPythonJson(scriptPath, [filePath, outputPath, JSON.stringify(options)]);
|
|
3293
|
+
const schemaAfter = await getSchemaSnapshot(outputPath);
|
|
3294
|
+
if (!result.ok) {
|
|
3295
|
+
return { content: [{ type: "text", text: `ERROR: Schema normalization failed: ${result.error}` }], isError: true };
|
|
3296
|
+
}
|
|
3297
|
+
// Register normalized file to make follow-up conversion easier.
|
|
3298
|
+
try {
|
|
3299
|
+
const datasetId = path.basename(outputPath, path.extname(outputPath));
|
|
3300
|
+
upsertRegistry(datasetId, outputPath, "completed");
|
|
3301
|
+
}
|
|
3302
|
+
catch (e) {
|
|
3303
|
+
console.error(`[NormalizeSchema] Registry write failed: ${e?.message || e}`);
|
|
3304
|
+
}
|
|
3305
|
+
const lineage = appendLineageVersion({
|
|
3306
|
+
datasetIdBase: path.basename(outputPath, path.extname(outputPath)),
|
|
3307
|
+
tool: "vesper_normalize_schema",
|
|
3308
|
+
requestArgs: request.params.arguments,
|
|
3309
|
+
outputPath,
|
|
3310
|
+
output: {
|
|
3311
|
+
rows: result.rows,
|
|
3312
|
+
columns: result.columns,
|
|
3313
|
+
format: outputFormat,
|
|
3314
|
+
schema_before: schemaBefore,
|
|
3315
|
+
schema_after: schemaAfter,
|
|
3316
|
+
},
|
|
3317
|
+
steps: [
|
|
3318
|
+
{ step: "schema_normalized", at: new Date().toISOString(), params: options, metrics: { flattened_keys: result.flattened_keys } },
|
|
3319
|
+
],
|
|
3320
|
+
});
|
|
3321
|
+
try {
|
|
3322
|
+
upsertRegistry(lineage.datasetVersionId, outputPath, "completed");
|
|
3323
|
+
}
|
|
3324
|
+
catch { }
|
|
3325
|
+
let msg = `**Schema normalization complete**\n`;
|
|
3326
|
+
msg += `- **Input**: ${filePath}\n`;
|
|
3327
|
+
msg += `- **Output**: ${result.output_path}\n`;
|
|
3328
|
+
msg += `- **Version**: ${lineage.datasetVersionId}\n`;
|
|
3329
|
+
msg += `- **Lineage**: ${lineage.lineagePath}\n`;
|
|
3330
|
+
msg += `- **Rows**: ${result.rows?.toLocaleString?.() ?? result.rows}\n`;
|
|
3331
|
+
msg += `- **Columns**: ${result.columns}\n`;
|
|
3332
|
+
msg += `- **Flattened keys**: ${result.flattened_keys}\n`;
|
|
3333
|
+
msg += `- **Extras mode**: ${result.extras_mode}\n`;
|
|
3334
|
+
if (result.extras_rows !== undefined)
|
|
3335
|
+
msg += `- **Rows with extras**: ${result.extras_rows}\n`;
|
|
3336
|
+
msg += `\nNext: run \`vesper_convert_format\` on the output to convert to parquet.\n`;
|
|
3337
|
+
return { content: [{ type: "text", text: msg }] };
|
|
3338
|
+
}
|
|
3339
|
+
catch (error) {
|
|
3340
|
+
return { content: [{ type: "text", text: `ERROR: Schema normalization failed: ${error.message}` }], isError: true };
|
|
3341
|
+
}
|
|
3342
|
+
}
|
|
2640
3343
|
case "fuse_datasets": {
|
|
2641
3344
|
const rawSources = request.params.arguments?.sources;
|
|
2642
3345
|
if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
|
|
@@ -2703,10 +3406,35 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2703
3406
|
catch (e) {
|
|
2704
3407
|
console.error(`[Registry] Failed to register fused dataset ${fusedId}: ${e?.message || e}`);
|
|
2705
3408
|
}
|
|
3409
|
+
const inputSchemaSnapshots = await Promise.all(resolvedPaths.map((p) => getSchemaSnapshot(p)));
|
|
3410
|
+
const schemaBefore = mergeSchemaSnapshots(inputSchemaSnapshots);
|
|
3411
|
+
const schemaAfter = await getSchemaSnapshot(result.output_path);
|
|
3412
|
+
const lineage = appendLineageVersion({
|
|
3413
|
+
datasetIdBase: fusedId,
|
|
3414
|
+
tool: "fuse_datasets",
|
|
3415
|
+
requestArgs: request.params.arguments,
|
|
3416
|
+
outputPath: result.output_path,
|
|
3417
|
+
output: {
|
|
3418
|
+
rows: result.stats.rows_after,
|
|
3419
|
+
format: outputFormat,
|
|
3420
|
+
schema_before: schemaBefore,
|
|
3421
|
+
schema_after: schemaAfter,
|
|
3422
|
+
},
|
|
3423
|
+
sources: resolvedPaths.map((p) => ({ source: "local", url: p, at: new Date().toISOString() })),
|
|
3424
|
+
steps: [
|
|
3425
|
+
{ step: "fused", at: new Date().toISOString(), params: { strategy, dedup, how }, metrics: { rows_before: result.stats.rows_before, rows_after: result.stats.rows_after, duplicates_removed: result.stats.duplicates_removed } },
|
|
3426
|
+
],
|
|
3427
|
+
});
|
|
3428
|
+
try {
|
|
3429
|
+
upsertRegistry(lineage.datasetVersionId, result.output_path, "completed");
|
|
3430
|
+
}
|
|
3431
|
+
catch { }
|
|
2706
3432
|
let msg = `Fused ${result.stats.sources_count} sources -> ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).\n`;
|
|
2707
3433
|
msg += `- Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}\n`;
|
|
2708
3434
|
msg += `- Null change: ${nullText}\n`;
|
|
2709
3435
|
msg += `- Output: ${result.output_path}\n`;
|
|
3436
|
+
msg += `- Version: ${lineage.datasetVersionId}\n`;
|
|
3437
|
+
msg += `- Lineage: ${lineage.lineagePath}\n`;
|
|
2710
3438
|
if (result.preview_path)
|
|
2711
3439
|
msg += `- Preview: ${result.preview_path}\n`;
|
|
2712
3440
|
if (result.leakage_report) {
|
|
@@ -2874,6 +3602,7 @@ async function main() {
|
|
|
2874
3602
|
const isDiscover = args.includes("discover");
|
|
2875
3603
|
const isDownload = args.includes("download");
|
|
2876
3604
|
const isExport = args.includes("export");
|
|
3605
|
+
const isStatus = args.includes("status");
|
|
2877
3606
|
const isConfig = args.includes("config") || args.includes("configure");
|
|
2878
3607
|
const isSetup = args.includes("--setup") || args.includes("setup");
|
|
2879
3608
|
const isSilent = args.includes("--silent");
|
|
@@ -2900,6 +3629,10 @@ async function main() {
|
|
|
2900
3629
|
await runExportCli(args);
|
|
2901
3630
|
return;
|
|
2902
3631
|
}
|
|
3632
|
+
if (isStatus) {
|
|
3633
|
+
await runStatusCli(args);
|
|
3634
|
+
return;
|
|
3635
|
+
}
|
|
2903
3636
|
// If run in explicit setup mode, show setup wizard (do not auto-run on server startup)
|
|
2904
3637
|
if (isSetup) {
|
|
2905
3638
|
await runSetupWizard(isSilent);
|
|
@@ -3320,6 +4053,173 @@ async function runFuseCli(args) {
|
|
|
3320
4053
|
console.log(`Preview saved: ${result.preview_path}`);
|
|
3321
4054
|
console.log("Next: run vespermcp split/export on the fused dataset");
|
|
3322
4055
|
}
|
|
4056
|
+
/**
 * CLI handler for the `status` command: prints a lineage dashboard to stdout.
 *
 * Scans a directory tree for `*.lineage.json` files and renders four sections:
 * a per-dataset summary, a row-count trend, dtype changes between adjacent
 * versions, and recent errors (from lineage steps and the error log).
 *
 * Flags read from `args`:
 *   --dir <path>      directory to scan (default: `<dataRoot>/lineage`)
 *   --max-depth <n>   depth passed to walkFilesRecursive (default: 4; invalid
 *                     or negative values fall back to 4)
 *
 * @param {string[]} args - Raw CLI arguments.
 * @returns {Promise<void>}
 */
async function runStatusCli(args) {
    const [{ default: chalk }, { default: Table }] = await Promise.all([
        import("chalk"),
        import("cli-table3"),
    ]);
    // Returns the token following `name`, or undefined when absent.
    const getArgValue = (name) => {
        const idx = args.indexOf(name);
        return idx >= 0 && idx + 1 < args.length ? args[idx + 1] : undefined;
    };
    const defaultDir = path.join(dataRoot, "lineage");
    const scanDir = path.resolve(getArgValue("--dir") || defaultDir);
    const maxDepthRaw = getArgValue("--max-depth");
    const maxDepthParsed = maxDepthRaw !== undefined ? Number(maxDepthRaw) : 4;
    const maxDepth = Number.isFinite(maxDepthParsed) && maxDepthParsed >= 0
        ? Math.floor(maxDepthParsed)
        : 4;
    if (!fs.existsSync(scanDir)) {
        console.log(`Lineage directory not found: ${scanDir}`);
        console.log("Tip: use --dir <path> to scan a custom location.");
        return;
    }
    const lineageFiles = walkFilesRecursive(scanDir, maxDepth).filter((p) => p.toLowerCase().endsWith(".lineage.json"));
    const records = [];
    for (const filePath of lineageFiles) {
        try {
            const data = JSON.parse(fs.readFileSync(filePath, "utf-8"));
            if (isLineageRecordShape(data)) {
                records.push(data);
            }
        }
        catch {
            // ignore malformed files
        }
    }
    if (records.length === 0) {
        console.log("No lineage records found.");
        console.log("Tip: default scan is ~/.vesper/lineage. Use --dir <path> for project-local lineage files.");
        return;
    }
    // Sort datasets by name and each dataset's versions by number once, then
    // reuse the result for every section below.
    records.sort((a, b) => (a.dataset_id_base || "").localeCompare(b.dataset_id_base || ""));
    const datasets = records.map((record) => ({
        record,
        versions: [...(record.versions || [])].sort((a, b) => (a.version || 0) - (b.version || 0)),
    }));
    const allVersions = records.flatMap((r) => r.versions || []);
    const sevenDaysAgo = Date.now() - 7 * 24 * 60 * 60 * 1000;
    // Versions with a missing or unparseable created_at yield NaN and are not counted.
    const operationsLast7d = allVersions.filter((v) => Date.parse(v.created_at || "") >= sevenDaysAgo).length;
    console.log(chalk.bold.cyan("\nVesper Lineage Status"));
    console.log(chalk.gray(`Scan dir: ${scanDir}`));
    console.log(chalk.gray(`Max depth: ${maxDepth}`));
    console.log(chalk.gray(`Lineage records: ${records.length}`));
    console.log(chalk.gray(`Total operations: ${allVersions.length} (${operationsLast7d} in last 7 days)\n`));

    // Section 1: per-dataset summary.
    const perDatasetTable = new Table({
        head: ["Dataset", "Versions", "Last Modified", "Last Actor"],
        colWidths: [34, 10, 28, 28],
        wordWrap: true,
    });
    for (const { record, versions } of datasets) {
        const last = versions[versions.length - 1];
        const actor = last?.triggered_by?.agent_id || last?.triggered_by?.pipeline_id || "-";
        perDatasetTable.push([
            record.dataset_id_base,
            String(versions.length),
            last?.created_at || "-",
            actor,
        ]);
    }
    console.log(chalk.bold("Per-dataset summary"));
    console.log(perDatasetTable.toString());

    // Section 2: row-count trend, comparing the first and last version that
    // carry a numeric row count.
    const trendTable = new Table({
        head: ["Dataset", "Rows Trend", "Details"],
        colWidths: [34, 14, 52],
        wordWrap: true,
    });
    for (const { record, versions } of datasets) {
        const series = versions
            .map((v) => ({ version: v.version, rows: v.output?.schema_after?.rows ?? v.output?.rows }))
            .filter((x) => typeof x.rows === "number");
        if (series.length < 2) {
            trendTable.push([record.dataset_id_base, "-", "insufficient row snapshots"]);
            continue;
        }
        const first = series[0].rows;
        const last = series[series.length - 1].rows;
        const trend = last > first ? chalk.green("growing") : last < first ? chalk.yellow("shrinking") : "flat";
        const details = series.map((x) => `v${x.version}:${x.rows}`).join(" -> ");
        trendTable.push([record.dataset_id_base, trend, details]);
    }
    console.log(chalk.bold("\nQuality trend (schema_after.rows)"));
    console.log(trendTable.toString());

    // Section 3: dtype changes between adjacent versions. Schema fields come
    // from files on disk, so validate their shape before diffing (same
    // normalization the diff_lineage_versions tool applies).
    const columnsOf = (schema) => (Array.isArray(schema?.columns) ? schema.columns.map((c) => String(c)) : []);
    const dtypesOf = (schema) => ((schema?.dtypes && typeof schema.dtypes === "object") ? schema.dtypes : {});
    const dtypeWarnings = [];
    for (const { record, versions } of datasets) {
        for (let i = 1; i < versions.length; i++) {
            const prev = versions[i - 1];
            const curr = versions[i];
            const prevSchema = prev.output?.schema_after || prev.output?.schema_before;
            const currSchema = curr.output?.schema_after || curr.output?.schema_before;
            const diff = diffSchemaMaps(columnsOf(prevSchema), columnsOf(currSchema), dtypesOf(prevSchema), dtypesOf(currSchema));
            if (diff.changed_dtypes.length > 0) {
                dtypeWarnings.push({
                    dataset: record.dataset_id_base,
                    from: prev.version,
                    to: curr.version,
                    // Keep each table cell short: at most four changes per pair.
                    changes: diff.changed_dtypes.slice(0, 4).map((d) => `${d.column}:${d.from}->${d.to}`),
                });
            }
        }
    }
    console.log(chalk.bold("\nDtype warnings"));
    if (dtypeWarnings.length === 0) {
        console.log(chalk.green("No dtype changes detected across adjacent versions."));
    }
    else {
        const warningTable = new Table({
            head: ["Dataset", "Versions", "Changed dtypes"],
            colWidths: [34, 14, 52],
            wordWrap: true,
        });
        for (const w of dtypeWarnings.slice(-20)) {
            warningTable.push([
                w.dataset,
                `v${w.from}->v${w.to}`,
                w.changes.join(", "),
            ]);
        }
        console.log(warningTable.toString());
    }

    // Section 4: recent errors, gathered from lineage steps and the error log.
    const lineageErrors = [];
    for (const record of records) {
        for (const v of record.versions || []) {
            for (const step of v.steps || []) {
                const errMsg = (typeof step.metrics?.error === "string" && step.metrics.error) ||
                    (typeof step.params?.error === "string" && step.params.error) ||
                    undefined;
                if (errMsg) {
                    lineageErrors.push({ at: step.at, where: `${record.dataset_id_base}/v${v.version}:${step.step}`, message: errMsg });
                }
            }
        }
    }
    const logErrors = parseErrorLogLines(errorLogPath, 7).map((e) => ({
        at: e.at,
        where: "vesper_errors.log",
        message: e.message,
    }));
    // Order both sources chronologically before keeping the last 20; otherwise
    // log entries always push lineage errors out regardless of age. Entries
    // whose timestamp cannot be parsed sort first (treated as oldest), and the
    // stable sort keeps their relative order.
    const timeOf = (e) => {
        const t = Date.parse(String(e.at ?? ""));
        return Number.isNaN(t) ? Number.NEGATIVE_INFINITY : t;
    };
    const byTime = (a, b) => {
        const ta = timeOf(a);
        const tb = timeOf(b);
        if (ta < tb) {
            return -1;
        }
        return ta > tb ? 1 : 0;
    };
    const recentErrors = [...lineageErrors, ...logErrors].sort(byTime).slice(-20);
    console.log(chalk.bold("\nRecent errors"));
    if (recentErrors.length === 0) {
        console.log(chalk.green("No recent lineage-linked errors found."));
    }
    else {
        const errTable = new Table({
            head: ["At", "Where", "Error"],
            colWidths: [28, 36, 46],
            wordWrap: true,
        });
        for (const e of recentErrors) {
            errTable.push([e.at || "-", e.where, e.message]);
        }
        console.log(errTable.toString());
    }
}
|
|
3323
4223
|
async function runSetupWizard(silent = false) {
|
|
3324
4224
|
if (!silent && process.stdin.isTTY) {
|
|
3325
4225
|
const wizardCandidates = [
|