@vespermcp/mcp-server 1.2.27 → 1.2.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +43 -0
- package/build/index.js +860 -4
- package/build/python/cleaner.py +2 -0
- package/package.json +6 -1
- package/src/python/cleaner.py +2 -0
package/README.md
CHANGED
|
@@ -88,6 +88,47 @@ Vesper attempts to auto-configure itself! Restart Claude and check. If not:
|
|
|
88
88
|
|
|
89
89
|
- `KAGGLE_USERNAME` & `KAGGLE_KEY`: For Kaggle dataset access
|
|
90
90
|
- `HF_TOKEN`: For private HuggingFace datasets
|
|
91
|
+
- `VESPER_TELEMETRY_ENDPOINT`: Optional HTTP endpoint for lineage telemetry events (`lineage.version.appended`)
|
|
92
|
+
|
|
93
|
+
### Telemetry Transparency (Opt-in)
|
|
94
|
+
|
|
95
|
+
Vesper does **not** send telemetry unless `VESPER_TELEMETRY_ENDPOINT` is explicitly set.
|
|
96
|
+
|
|
97
|
+
When enabled, Vesper sends only lineage event metadata on version append:
|
|
98
|
+
- dataset base/version IDs
|
|
99
|
+
- tool name + actor metadata (`agent_id`, `pipeline_id` when provided)
|
|
100
|
+
- basic output metadata (`local_path`, rows/columns, format)
|
|
101
|
+
- timestamp + host name
|
|
102
|
+
|
|
103
|
+
It does **not** upload dataset file contents.
|
|
104
|
+
|
|
105
|
+
### Lineage Receiver (for web dashboard backend)
|
|
106
|
+
|
|
107
|
+
Vesper includes a tiny ingestion server for lineage telemetry events:
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
npm run telemetry:receiver
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
Storage backends:
|
|
114
|
+
- **Postgres**: set `DATABASE_URL`
|
|
115
|
+
- **SQLite**: set `SQLITE_PATH` (for lightweight/local deployments)
|
|
116
|
+
|
|
117
|
+
Optional env vars:
|
|
118
|
+
- `PORT` (default `8787`)
|
|
119
|
+
- `LINEAGE_INGEST_PATH` (default `/vesper/lineage`)
|
|
120
|
+
|
|
121
|
+
Example for hosted backend:
|
|
122
|
+
- ingest URL: `https://getvesper.dev/vesper/lineage`
|
|
123
|
+
- client env: `VESPER_TELEMETRY_ENDPOINT=https://getvesper.dev/vesper/lineage`
|
|
124
|
+
|
|
125
|
+
DDL files:
|
|
126
|
+
- `telemetry/sql/lineage_events.postgres.sql`
|
|
127
|
+
- `telemetry/sql/lineage_events.sqlite.sql`
|
|
128
|
+
|
|
129
|
+
Stats endpoint for web dashboard bootstrap:
|
|
130
|
+
- `GET /vesper/lineage/stats?days=30`
|
|
131
|
+
- Returns JSON: overview, by-tool counts, by-day counts, top datasets, recent activity.
|
|
91
132
|
|
|
92
133
|
### Optional Kaggle Setup (Not Required)
|
|
93
134
|
|
|
@@ -121,6 +162,8 @@ vespermcp discover --source kaggle "credit risk" --limit 10
|
|
|
121
162
|
vespermcp discover --source huggingface "credit risk" --limit 10
|
|
122
163
|
vespermcp download kaggle username/dataset-name
|
|
123
164
|
vespermcp download kaggle https://www.kaggle.com/datasets/username/dataset-name --target-dir ./data
|
|
165
|
+
vespermcp status
|
|
166
|
+
vespermcp status --dir ./some/project --max-depth 3
|
|
124
167
|
```
|
|
125
168
|
|
|
126
169
|
## 🚀 Quick Start
|
package/build/index.js
CHANGED
|
@@ -74,18 +74,156 @@ function getRegistryEntry(dataset_id) {
|
|
|
74
74
|
console.error(`[Registry] Lookup keys: ${aliases.join(", ")}`);
|
|
75
75
|
return readRegistry().find(e => aliases.includes((e.dataset_id || e.id)));
|
|
76
76
|
}
|
|
77
|
+
/**
 * Resolve the directory that holds lineage records, creating it on first use.
 *
 * @returns {string} Path of the `lineage` directory under `dataRoot`.
 */
function getLineageDir() {
    const lineageDir = path.join(dataRoot, "lineage");
    const alreadyPresent = fs.existsSync(lineageDir);
    if (alreadyPresent) {
        return lineageDir;
    }
    // Create missing parent directories as well.
    fs.mkdirSync(lineageDir, { recursive: true });
    return lineageDir;
}
|
|
83
|
+
/**
 * Reduce a dataset id (base or versioned) to its base form.
 *
 * The id is stringified, trimmed and passed through `normalize_dataset_id`;
 * one trailing `_v<digits>` suffix (case-insensitive) is then removed.
 *
 * @param {*} datasetId Dataset id; falsy values are treated as an empty string.
 * @returns {string} The normalized id without a version suffix.
 */
function toBaseDatasetId(datasetId) {
    const rawId = String(datasetId || "").trim();
    const normalizedId = normalize_dataset_id(rawId);
    const versionSuffix = /_v\d+$/i;
    return normalizedId.replace(versionSuffix, "");
}
|
|
87
|
+
/**
 * Build the path of the lineage record file for a dataset.
 *
 * @param {*} datasetIdBase Dataset id (base or versioned); reduced with `toBaseDatasetId`.
 * @returns {string} `<lineage dir>/<base id>.lineage.json`.
 */
function getLineageRecordPath(datasetIdBase) {
    // Resolve the directory first: getLineageDir() creates it when missing.
    const lineageDir = getLineageDir();
    const recordFileName = `${toBaseDatasetId(datasetIdBase)}.lineage.json`;
    return path.join(lineageDir, recordFileName);
}
|
|
90
|
+
/**
 * Load the lineage record for a dataset, or a fresh empty record when none is usable.
 *
 * An empty record (`latest_version: 0`, no versions) is returned when the record
 * file does not exist, cannot be read or parsed, or does not contain an object
 * with a `versions` array. Callers index into `versions` directly, so anything
 * else would crash them.
 *
 * @param {*} datasetIdBase Dataset id (base or versioned).
 * @returns {{dataset_id_base: string, latest_version: number, updated_at: string, versions: Array}}
 */
function readLineageRecord(datasetIdBase) {
    const emptyRecord = () => ({
        dataset_id_base: toBaseDatasetId(datasetIdBase),
        latest_version: 0,
        updated_at: new Date().toISOString(),
        versions: [],
    });
    const recordPath = getLineageRecordPath(datasetIdBase);
    if (!fs.existsSync(recordPath)) {
        return emptyRecord();
    }
    try {
        const parsed = JSON.parse(fs.readFileSync(recordPath, "utf-8"));
        const usable = parsed !== null && typeof parsed === "object" && Array.isArray(parsed.versions);
        if (usable) {
            return parsed;
        }
        // Valid JSON but not a lineage record (e.g. `null`, or `versions` missing).
        console.error(`[Lineage] Ignoring malformed lineage record: ${recordPath}`);
    }
    catch (e) {
        // Unreadable or unparsable file: start over rather than failing the tool call.
        console.error(`[Lineage] Failed to read lineage record ${recordPath}: ${e?.message || e}`);
    }
    return emptyRecord();
}
|
|
112
|
+
/**
 * Persist a lineage record as pretty-printed JSON (2-space indent).
 *
 * @param {{dataset_id_base: string}} record Record to write; its `dataset_id_base`
 *   determines the destination file.
 */
function writeLineageRecord(record) {
    const destination = getLineageRecordPath(record.dataset_id_base);
    const serialized = JSON.stringify(record, null, 2);
    fs.writeFileSync(destination, serialized);
}
|
|
116
|
+
/**
 * Append a new version entry to a dataset's lineage record.
 *
 * Steps:
 *  1. If `input.outputPath` is set and an existing version already recorded that
 *     exact path, return that version instead of creating a duplicate.
 *  2. Otherwise build version `latest_version + 1`, persist the record, and, when
 *     the output path exists on disk, write a `<outputPath>.lineage.json` sidecar
 *     (best effort).
 *  3. If `VESPER_TELEMETRY_ENDPOINT` is set, post a `lineage.version.appended`
 *     event containing ids, tool/agent/pipeline identity, and basic output metadata.
 *
 * @param {object} input
 * @param {*} input.datasetIdBase Dataset id (base or versioned).
 * @param {string} input.tool Name of the tool that produced this version.
 * @param {object} [input.requestArgs] Raw tool arguments (agent_id, pipeline_id, query, ...).
 * @param {string} [input.outputPath] Local path of the produced artifact.
 * @param {object} [input.output] Output metadata (rows, columns, format, size_mb,
 *   quality_score, schema_before, schema_after).
 * @param {Array} [input.sources] Source descriptors; defaults to `[]`.
 * @param {Array} [input.steps] Step log; defaults to a single step for `input.tool`.
 * @returns {{datasetVersionId: string, version: number, lineagePath: string}}
 */
function appendLineageVersion(input) {
    const args = input.requestArgs;
    const optionalString = (value) => (value ? String(value) : undefined);
    const optionalNumber = (value) => (typeof value === "number" ? value : undefined);
    // Normalizes a schema snapshot to { rows, columns, dtypes } with string-only names/types.
    const sanitizeSchema = (schema) => {
        if (!schema || typeof schema !== "object") {
            return undefined;
        }
        return {
            rows: optionalNumber(schema.rows),
            columns: Array.isArray(schema.columns) ? schema.columns.map((c) => String(c)) : undefined,
            dtypes: schema.dtypes && typeof schema.dtypes === "object"
                ? Object.fromEntries(Object.entries(schema.dtypes).map(([k, t]) => [String(k), String(t)]))
                : undefined,
        };
    };
    const base = toBaseDatasetId(input.datasetIdBase);
    const loaded = readLineageRecord(base);
    // Guard against a record file holding valid JSON of the wrong shape.
    const record = loaded && typeof loaded === "object"
        ? loaded
        : { dataset_id_base: base, latest_version: 0, versions: [] };
    if (!Array.isArray(record.versions)) {
        record.versions = [];
    }
    if (input.outputPath) {
        const existing = record.versions.find((entry) => entry?.output?.local_path === input.outputPath);
        if (existing) {
            return { datasetVersionId: existing.dataset_id, version: existing.version, lineagePath: getLineageRecordPath(base) };
        }
    }
    const version = (record.latest_version || 0) + 1;
    const datasetVersionId = `${base}_v${version}`;
    const now = new Date().toISOString();
    const outputPath = input.outputPath;
    let sidecarPath;
    if (outputPath && fs.existsSync(outputPath)) {
        sidecarPath = `${outputPath}.lineage.json`;
    }
    const v = {
        version,
        dataset_id: datasetVersionId,
        created_at: now,
        triggered_by: {
            tool: input.tool,
            agent_id: optionalString(args?.agent_id),
            pipeline_id: optionalString(args?.pipeline_id),
            // NOTE(review): the raw api_key is persisted in the local record and sidecar
            // (unchanged from before) — confirm this is intended or store a fingerprint instead.
            api_key: optionalString(args?.api_key),
        },
        input: {
            dataset_id: optionalString(args?.dataset_id),
            query: optionalString(args?.query),
            source_path: optionalString(args?.file_path),
            source_urls: Array.isArray(args?.source_urls)
                ? args.source_urls.map((u) => String(u))
                : undefined,
        },
        output: {
            local_path: outputPath,
            rows: optionalNumber(input.output?.rows),
            columns: optionalNumber(input.output?.columns),
            format: typeof input.output?.format === "string" ? input.output.format : undefined,
            size_mb: optionalNumber(input.output?.size_mb),
            quality_score: optionalNumber(input.output?.quality_score),
            schema_before: sanitizeSchema(input.output?.schema_before),
            schema_after: sanitizeSchema(input.output?.schema_after),
        },
        sources: input.sources || [],
        steps: input.steps || [{ step: input.tool, at: now, params: args || {} }],
    };
    record.latest_version = version;
    record.updated_at = now;
    record.versions.push(v);
    writeLineageRecord(record);
    const telemetryEndpoint = process.env.VESPER_TELEMETRY_ENDPOINT?.trim();
    if (telemetryEndpoint) {
        postJsonNonBlocking(telemetryEndpoint, {
            event: "lineage.version.appended",
            sent_at: now,
            host: os.hostname(),
            dataset_id_base: base,
            version: v.version,
            dataset_id: v.dataset_id,
            // Send only the documented actor metadata; the caller's api_key must
            // never leave the machine.
            triggered_by: {
                tool: v.triggered_by.tool,
                agent_id: v.triggered_by.agent_id,
                pipeline_id: v.triggered_by.pipeline_id,
            },
            output: {
                local_path: v.output.local_path,
                rows: v.output.rows,
                columns: v.output.columns,
                format: v.output.format,
            },
        });
    }
    if (sidecarPath) {
        try {
            fs.writeFileSync(sidecarPath, JSON.stringify(v, null, 2));
        }
        catch {
            // best effort sidecar write
        }
    }
    return { datasetVersionId, version, lineagePath: getLineageRecordPath(base) };
}
|
|
77
212
|
const STRUCTURED_FILE_EXTENSIONS = [".parquet", ".csv", ".jsonl", ".json", ".feather", ".arrow", ".tsv", ".txt"];
|
|
78
213
|
const IMAGE_FILE_EXTENSIONS = new Set([".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff", ".tif", ".svg"]);
|
|
79
|
-
function walkFilesRecursive(rootDir) {
|
|
214
|
+
function walkFilesRecursive(rootDir, maxDepth = Number.POSITIVE_INFINITY) {
|
|
80
215
|
const out = [];
|
|
81
|
-
const stack = [rootDir];
|
|
216
|
+
const stack = [{ dir: rootDir, depth: 0 }];
|
|
82
217
|
while (stack.length > 0) {
|
|
83
|
-
const
|
|
218
|
+
const current = stack.pop();
|
|
219
|
+
const currentDir = current.dir;
|
|
84
220
|
const entries = fs.readdirSync(currentDir, { withFileTypes: true });
|
|
85
221
|
for (const entry of entries) {
|
|
86
222
|
const fullPath = path.join(currentDir, entry.name);
|
|
87
223
|
if (entry.isDirectory()) {
|
|
88
|
-
|
|
224
|
+
if (current.depth < maxDepth) {
|
|
225
|
+
stack.push({ dir: fullPath, depth: current.depth + 1 });
|
|
226
|
+
}
|
|
89
227
|
}
|
|
90
228
|
else if (entry.isFile()) {
|
|
91
229
|
out.push(fullPath);
|
|
@@ -201,6 +339,36 @@ function ensureLocalPipelineSource(sourcePath, datasetId, targetDir) {
|
|
|
201
339
|
}
|
|
202
340
|
return stagedPath;
|
|
203
341
|
}
|
|
342
|
+
/**
 * Delete intermediate files produced on the way to a final output, together with
 * their `.lineage.json` sidecars. The final output and its sidecar are never removed.
 *
 * Only regular files are deleted; missing paths, directories and any filesystem
 * error are ignored (best-effort cleanup).
 *
 * @param {Iterable<string>} artifactPaths Candidate paths; falsy entries are skipped.
 * @param {string} finalOutputPath Path of the artifact that must be kept.
 */
function cleanupIntermediateArtifacts(artifactPaths, finalOutputPath) {
    const keptOutput = path.resolve(finalOutputPath);
    const keptSidecar = `${keptOutput}.lineage.json`;
    // Removes `target` if it is an existing regular file; failures are swallowed.
    const removeFileQuietly = (target) => {
        try {
            const isRegularFile = fs.existsSync(target) && fs.statSync(target).isFile();
            if (isRegularFile) {
                fs.unlinkSync(target);
            }
        }
        catch {
            // Best-effort cleanup.
        }
    };
    for (const artifact of artifactPaths) {
        if (!artifact) {
            continue;
        }
        const artifactPath = path.resolve(artifact);
        const isProtected = artifactPath === keptOutput || artifactPath === keptSidecar;
        if (isProtected) {
            continue;
        }
        removeFileQuietly(artifactPath);
        const artifactSidecar = `${artifactPath}.lineage.json`;
        if (artifactSidecar !== keptSidecar) {
            removeFileQuietly(artifactSidecar);
        }
    }
}
|
|
204
372
|
function resolveDatasetLocalPath(datasetIdOrPath, preferredDirs = []) {
|
|
205
373
|
if (fs.existsSync(datasetIdOrPath)) {
|
|
206
374
|
return ensureExportableLocalPath(datasetIdOrPath);
|
|
@@ -296,6 +464,8 @@ import { QualityOrchestrator } from "./quality/quality-orchestrator.js";
|
|
|
296
464
|
import { ConfigManager } from "./config/config-manager.js";
|
|
297
465
|
import { SecureKeysManager } from "./config/secure-keys.js";
|
|
298
466
|
import readline from "readline";
|
|
467
|
+
import http from "http";
|
|
468
|
+
import https from "https";
|
|
299
469
|
import os from "os";
|
|
300
470
|
// Determine absolute paths relative to the compiled script
|
|
301
471
|
const __filename = fileURLToPath(import.meta.url);
|
|
@@ -321,6 +491,34 @@ function logError(err, context) {
|
|
|
321
491
|
fs.appendFileSync(errorLogPath, msg);
|
|
322
492
|
console.error(`[Vesper] Critical error logged to ${errorLogPath}`);
|
|
323
493
|
}
|
|
494
|
+
/**
 * Fire-and-forget JSON POST used for optional telemetry.
 *
 * Never throws and never reports failure: invalid URLs, unsupported schemes,
 * unserializable bodies, network errors and timeouts are all dropped silently,
 * and the response body is discarded.
 *
 * @param {string} urlRaw Absolute `http:` or `https:` URL. Any other scheme is ignored
 *   rather than being sent over plain HTTP.
 * @param {*} body Value to serialize with `JSON.stringify`.
 * @returns {void}
 */
function postJsonNonBlocking(urlRaw, body) {
    try {
        const target = new URL(urlRaw);
        const isHttps = target.protocol === "https:";
        if (!isHttps && target.protocol !== "http:") {
            // e.g. ftp:// or file:// — previously these fell through to the http transport.
            return;
        }
        const payload = JSON.stringify(body);
        const transport = isHttps ? https : http;
        // URL.hostname keeps the brackets around IPv6 literals; http.request expects them removed.
        const hostname = target.hostname.startsWith("[") ? target.hostname.slice(1, -1) : target.hostname;
        const req = transport.request({
            method: "POST",
            hostname,
            port: target.port ? Number(target.port) : (isHttps ? 443 : 80),
            path: `${target.pathname}${target.search || ""}`,
            headers: {
                "content-type": "application/json",
                "content-length": Buffer.byteLength(payload),
            },
            timeout: 3000,
        }, (res) => {
            // Drain the response so the socket can be released.
            res.resume();
        });
        req.on("error", () => { });
        // The "timeout" event only signals; the request must be destroyed explicitly.
        req.on("timeout", () => req.destroy());
        req.write(payload);
        req.end();
    }
    catch {
        // best effort telemetry only
    }
}
|
|
324
522
|
// --- Request Queue: serialize all MCP tool calls to prevent crashes ---
|
|
325
523
|
class RequestQueue {
|
|
326
524
|
queue = [];
|
|
@@ -535,6 +733,102 @@ function runPythonJson(scriptPath, args) {
|
|
|
535
733
|
});
|
|
536
734
|
});
|
|
537
735
|
}
|
|
736
|
+
/**
 * Read a tabular file with polars (in a Python subprocess) and return its schema.
 *
 * The reader is chosen by file extension (.csv, .jsonl/.ndjson, .json,
 * .parquet/.pq, .feather/.ftr/.arrow/.ipc); anything else is read as CSV.
 *
 * @param {string} filePath Path of the file to inspect.
 * @returns {Promise<{rows: number, columns: string[], dtypes: Object<string, string>}|undefined>}
 *   The snapshot, or `undefined` when the path is missing, the subprocess exits
 *   non-zero, or anything throws along the way (best effort).
 */
async function getSchemaSnapshot(filePath) {
    try {
        if (!filePath || !fs.existsSync(filePath))
            return undefined;
        await ensurePythonModules([{ module: "polars", packageName: "polars" }]);
        // The statements must be newline-separated: Python does not allow compound
        // statements (if/elif/else) after a ';', so a ';'-joined script is a
        // SyntaxError and the snapshot would always come back undefined.
        // NOTE(review): assumes runPythonProcess passes arguments without a shell — confirm.
        const pyCode = [
            "import json, os, polars as pl",
            "p = __import__('sys').argv[1]",
            "ext = os.path.splitext(p)[1].lower()",
            "if ext == '.csv': df = pl.read_csv(p, infer_schema_length=10000, ignore_errors=True)",
            "elif ext in ('.jsonl', '.ndjson'): df = pl.read_ndjson(p)",
            "elif ext == '.json': df = pl.read_json(p)",
            "elif ext in ('.parquet', '.pq'): df = pl.read_parquet(p)",
            "elif ext in ('.feather', '.ftr', '.arrow', '.ipc'): df = pl.read_ipc(p)",
            "else: df = pl.read_csv(p, infer_schema_length=10000, ignore_errors=True)",
            "print(json.dumps({'rows': int(df.height), 'columns': [str(c) for c in df.columns], 'dtypes': {str(c): str(t) for c,t in zip(df.columns, df.dtypes)}}))",
        ].join("\n");
        const proc = await runPythonProcess(["-c", pyCode, filePath], 120000);
        if (proc.code !== 0)
            return undefined;
        return JSON.parse((proc.stdout || "{}").trim());
    }
    catch {
        // Schema capture is optional metadata; never fail the calling tool over it.
        return undefined;
    }
}
|
|
762
|
+
/**
 * Combine several schema snapshots into one.
 *
 * - `rows` is the sum of all row counts, or `undefined` as soon as any snapshot
 *   lacks a numeric count (a partial sum would be misleading).
 * - `columns` is the sorted union of column names.
 * - `dtypes` maps each column to its dtype, or to `mixed(a|b|...)` when the
 *   snapshots disagree.
 *
 * @param {Array<object|null|undefined>} snapshots Snapshots; falsy entries are ignored.
 * @returns {{rows: number|undefined, columns: string[], dtypes: Object<string, string>}|undefined}
 *   `undefined` when no usable snapshot was given.
 */
function mergeSchemaSnapshots(snapshots) {
    const valid = snapshots.filter(Boolean);
    if (valid.length === 0)
        return undefined;
    let totalRows = 0;
    let rowsKnown = true;
    const columnNames = new Set();
    // A Map (not a plain object) so column names like "constructor" or
    // "__proto__" cannot collide with Object.prototype members.
    const dtypesByColumn = new Map();
    for (const snapshot of valid) {
        if (typeof snapshot.rows === "number") {
            totalRows += snapshot.rows;
        }
        else {
            rowsKnown = false;
        }
        for (const column of snapshot.columns || [])
            columnNames.add(String(column));
        for (const [column, dtype] of Object.entries(snapshot.dtypes || {})) {
            let seen = dtypesByColumn.get(column);
            if (!seen) {
                seen = new Set();
                dtypesByColumn.set(column, seen);
            }
            seen.add(String(dtype));
        }
    }
    const dtypes = Object.fromEntries(Array.from(dtypesByColumn, ([column, seen]) => {
        const list = Array.from(seen);
        return [column, list.length <= 1 ? list[0] : `mixed(${list.join("|")})`];
    }));
    return {
        rows: rowsKnown ? totalRows : undefined,
        columns: Array.from(columnNames).sort(),
        dtypes,
    };
}
|
|
795
|
+
/**
 * Compare two schemas and report what changed.
 *
 * Missing (`null`/`undefined`) column lists or dtype maps are treated as empty,
 * so versions recorded without schema information can still be diffed.
 *
 * @param {string[]} [fromColumns] Column names of the source version.
 * @param {string[]} [toColumns] Column names of the target version.
 * @param {Object<string, string>} [fromDtypes] Column -> dtype for the source version.
 * @param {Object<string, string>} [toDtypes] Column -> dtype for the target version.
 * @returns {{added_columns: string[], removed_columns: string[],
 *   changed_dtypes: Array<{column: string, from: *, to: *}>}}
 *   `changed_dtypes` covers only columns present on both sides.
 */
function diffSchemaMaps(fromColumns, toColumns, fromDtypes, toDtypes) {
    const before = fromColumns ?? [];
    const after = toColumns ?? [];
    const beforeTypes = fromDtypes ?? {};
    const afterTypes = toDtypes ?? {};
    // Own-property lookup so a column named e.g. "constructor" never resolves
    // to an inherited Object.prototype member.
    const dtypeOf = (dtypes, column) => (Object.prototype.hasOwnProperty.call(dtypes, column) ? dtypes[column] : undefined);
    const beforeSet = new Set(before);
    const afterSet = new Set(after);
    const added_columns = after.filter((c) => !beforeSet.has(c));
    const removed_columns = before.filter((c) => !afterSet.has(c));
    const changed_dtypes = before
        .filter((c) => afterSet.has(c))
        .filter((c) => String(dtypeOf(beforeTypes, c) || "") !== String(dtypeOf(afterTypes, c) || ""))
        .map((c) => ({ column: c, from: dtypeOf(beforeTypes, c), to: dtypeOf(afterTypes, c) }));
    return { added_columns, removed_columns, changed_dtypes };
}
|
|
806
|
+
/**
 * Structural check for a lineage record.
 *
 * @param {*} value Candidate value (e.g. parsed JSON).
 * @returns {boolean} `true` when `value` is a non-null object with a string
 *   `dataset_id_base` and an array `versions`.
 */
function isLineageRecordShape(value) {
    if (!value || typeof value !== "object") {
        return false;
    }
    const hasBaseId = typeof value.dataset_id_base === "string";
    return hasBaseId && Array.isArray(value.versions);
}
|
|
809
|
+
/**
 * Extract recent error entries from a log file.
 *
 * A line counts as an entry when it looks like `[<timestamp>] ERROR in <message>`,
 * its timestamp is parseable by `Date.parse`, and it is no older than
 * `withinDays` days. At most the last 20 matching entries are returned.
 *
 * @param {string} filePath Path of the log file.
 * @param {number} withinDays Size of the look-back window in days.
 * @returns {Array<{at: string, message: string}>} Matching entries in file order;
 *   `[]` when the file is missing or cannot be read.
 */
function parseErrorLogLines(filePath, withinDays) {
    const entryPattern = /^\[(.+?)\]\s+ERROR\s+in\s+(.+)$/;
    try {
        if (!fs.existsSync(filePath)) {
            return [];
        }
        const logText = fs.readFileSync(filePath, "utf-8");
        const cutoff = Date.now() - withinDays * 24 * 60 * 60 * 1000;
        const recentEntries = logText
            .split(/\r?\n/)
            .filter(Boolean)
            .map((line) => line.match(entryPattern))
            .filter((match) => {
                if (!match) {
                    return false;
                }
                const stamp = Date.parse(match[1]);
                return Number.isFinite(stamp) && stamp >= cutoff;
            })
            .map(([, at, message]) => ({ at, message }));
        return recentEntries.slice(-20);
    }
    catch {
        return [];
    }
}
|
|
538
832
|
async function countRows(filePath) {
|
|
539
833
|
const scriptPath = path.join(dataRoot, "python", "row_count.py");
|
|
540
834
|
const result = await runPythonJson(scriptPath, [filePath]);
|
|
@@ -1166,6 +1460,14 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1166
1460
|
type: "boolean",
|
|
1167
1461
|
description: "When true, operation='providers' includes connectors that are scaffolded but not currently configured.",
|
|
1168
1462
|
},
|
|
1463
|
+
agent_id: {
|
|
1464
|
+
type: "string",
|
|
1465
|
+
description: "Strongly recommended: caller agent identity for lineage/audit.",
|
|
1466
|
+
},
|
|
1467
|
+
pipeline_id: {
|
|
1468
|
+
type: "string",
|
|
1469
|
+
description: "Strongly recommended: workflow/pipeline identifier for lineage/audit.",
|
|
1470
|
+
},
|
|
1169
1471
|
},
|
|
1170
1472
|
required: ["operation"],
|
|
1171
1473
|
},
|
|
@@ -1185,6 +1487,8 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1185
1487
|
limit: { type: "number", description: "Max documents to return (default 10, max 50)." },
|
|
1186
1488
|
arxiv_full_text: { type: "boolean", description: "When true, fetch and parse ArXiv PDFs and return full text as document content (slower)." },
|
|
1187
1489
|
github_include_readme: { type: "boolean", description: "When true, fetch and include GitHub README.md text as document content (slower)." },
|
|
1490
|
+
agent_id: { type: "string", description: "Strongly recommended: caller agent identity for lineage/audit." },
|
|
1491
|
+
pipeline_id: { type: "string", description: "Strongly recommended: workflow/pipeline identifier for lineage/audit." },
|
|
1188
1492
|
},
|
|
1189
1493
|
required: ["query"],
|
|
1190
1494
|
},
|
|
@@ -1232,6 +1536,8 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1232
1536
|
enum: ["semantic", "exact", "none"],
|
|
1233
1537
|
description: "How to deduplicate across sources.",
|
|
1234
1538
|
},
|
|
1539
|
+
agent_id: { type: "string", description: "Strongly recommended: caller agent identity for lineage/audit." },
|
|
1540
|
+
pipeline_id: { type: "string", description: "Strongly recommended: workflow/pipeline identifier for lineage/audit." },
|
|
1235
1541
|
},
|
|
1236
1542
|
required: ["sources"],
|
|
1237
1543
|
},
|
|
@@ -1435,6 +1741,8 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1435
1741
|
download_images: { type: "boolean", description: "When true, enables post-prepare smart asset downloading for image/media datasets." },
|
|
1436
1742
|
cleaning_options: { type: "object" },
|
|
1437
1743
|
split_config: { type: "object" },
|
|
1744
|
+
agent_id: { type: "string", description: "Strongly recommended: caller agent identity for lineage/audit." },
|
|
1745
|
+
pipeline_id: { type: "string", description: "Strongly recommended: workflow/pipeline identifier for lineage/audit." },
|
|
1438
1746
|
},
|
|
1439
1747
|
required: ["query"],
|
|
1440
1748
|
},
|
|
@@ -1509,6 +1817,14 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1509
1817
|
items: { type: "string" },
|
|
1510
1818
|
description: "Export only these columns (faster for wide datasets).",
|
|
1511
1819
|
},
|
|
1820
|
+
agent_id: {
|
|
1821
|
+
type: "string",
|
|
1822
|
+
description: "Strongly recommended: caller agent identity for lineage/audit.",
|
|
1823
|
+
},
|
|
1824
|
+
pipeline_id: {
|
|
1825
|
+
type: "string",
|
|
1826
|
+
description: "Strongly recommended: workflow/pipeline identifier for lineage/audit.",
|
|
1827
|
+
},
|
|
1512
1828
|
},
|
|
1513
1829
|
required: ["dataset_id"],
|
|
1514
1830
|
},
|
|
@@ -1521,6 +1837,42 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1521
1837
|
properties: {},
|
|
1522
1838
|
},
|
|
1523
1839
|
},
|
|
1840
|
+
{
|
|
1841
|
+
name: "get_lineage",
|
|
1842
|
+
description: "Get version history and full lineage/provenance for a dataset (sources, steps, inputs/outputs, trigger metadata).",
|
|
1843
|
+
inputSchema: {
|
|
1844
|
+
type: "object",
|
|
1845
|
+
properties: {
|
|
1846
|
+
dataset_id: {
|
|
1847
|
+
type: "string",
|
|
1848
|
+
description: "Dataset ID (base or versioned, e.g. my_dataset or my_dataset_v2).",
|
|
1849
|
+
},
|
|
1850
|
+
},
|
|
1851
|
+
required: ["dataset_id"],
|
|
1852
|
+
},
|
|
1853
|
+
},
|
|
1854
|
+
{
|
|
1855
|
+
name: "diff_lineage_versions",
|
|
1856
|
+
description: "Diff two lineage versions for one dataset and return structured changes (schema, rows, steps, actor identity).",
|
|
1857
|
+
inputSchema: {
|
|
1858
|
+
type: "object",
|
|
1859
|
+
properties: {
|
|
1860
|
+
dataset_id: {
|
|
1861
|
+
type: "string",
|
|
1862
|
+
description: "Dataset ID (base or versioned).",
|
|
1863
|
+
},
|
|
1864
|
+
from_version: {
|
|
1865
|
+
type: "number",
|
|
1866
|
+
description: "Source lineage version number (e.g., 1).",
|
|
1867
|
+
},
|
|
1868
|
+
to_version: {
|
|
1869
|
+
type: "number",
|
|
1870
|
+
description: "Target lineage version number (e.g., 2).",
|
|
1871
|
+
},
|
|
1872
|
+
},
|
|
1873
|
+
required: ["dataset_id", "from_version", "to_version"],
|
|
1874
|
+
},
|
|
1875
|
+
},
|
|
1524
1876
|
{
|
|
1525
1877
|
name: "vesper_convert_format",
|
|
1526
1878
|
description: "Convert a dataset file between formats (CSV, Parquet, JSON, JSONL). Auto-detects input format from extension. Saves output in the same directory with the new extension and registers it in the Vesper registry.",
|
|
@@ -1536,6 +1888,14 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1536
1888
|
enum: ["csv", "parquet", "json", "jsonl"],
|
|
1537
1889
|
description: "The desired output format.",
|
|
1538
1890
|
},
|
|
1891
|
+
agent_id: {
|
|
1892
|
+
type: "string",
|
|
1893
|
+
description: "Strongly recommended: caller agent identity for lineage/audit.",
|
|
1894
|
+
},
|
|
1895
|
+
pipeline_id: {
|
|
1896
|
+
type: "string",
|
|
1897
|
+
description: "Strongly recommended: workflow/pipeline identifier for lineage/audit.",
|
|
1898
|
+
},
|
|
1539
1899
|
},
|
|
1540
1900
|
required: ["file_path", "target_format"],
|
|
1541
1901
|
},
|
|
@@ -1572,6 +1932,14 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1572
1932
|
enum: ["blob", "drop"],
|
|
1573
1933
|
description: "How to handle metadata_json keys beyond max_keys. blob keeps them in metadata_json_blob; drop discards them. Default: blob.",
|
|
1574
1934
|
},
|
|
1935
|
+
agent_id: {
|
|
1936
|
+
type: "string",
|
|
1937
|
+
description: "Strongly recommended: caller agent identity for lineage/audit.",
|
|
1938
|
+
},
|
|
1939
|
+
pipeline_id: {
|
|
1940
|
+
type: "string",
|
|
1941
|
+
description: "Strongly recommended: workflow/pipeline identifier for lineage/audit.",
|
|
1942
|
+
},
|
|
1575
1943
|
},
|
|
1576
1944
|
required: ["file_path"],
|
|
1577
1945
|
},
|
|
@@ -1761,6 +2129,30 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1761
2129
|
arxiv_full_text: request.params.arguments?.arxiv_full_text === true,
|
|
1762
2130
|
github_include_readme: request.params.arguments?.github_include_readme === true,
|
|
1763
2131
|
});
|
|
2132
|
+
try {
|
|
2133
|
+
appendLineageVersion({
|
|
2134
|
+
datasetIdBase: `webfind_${query || "query"}`,
|
|
2135
|
+
tool: "vesper_web_find",
|
|
2136
|
+
requestArgs: request.params.arguments,
|
|
2137
|
+
output: {
|
|
2138
|
+
rows: Array.isArray(result.results) ? result.results.length : undefined,
|
|
2139
|
+
},
|
|
2140
|
+
sources: Array.isArray(result.results)
|
|
2141
|
+
? result.results.slice(0, 200).map((r) => ({
|
|
2142
|
+
source: String(r?.source_type || "unknown"),
|
|
2143
|
+
url: typeof r?.source_url === "string" ? r.source_url : undefined,
|
|
2144
|
+
at: typeof r?.collected_at === "string" ? r.collected_at : undefined,
|
|
2145
|
+
}))
|
|
2146
|
+
: [],
|
|
2147
|
+
steps: [
|
|
2148
|
+
{ step: "web_find_discover", at: new Date().toISOString(), params: { query, sources, limit } },
|
|
2149
|
+
{ step: "web_find_complete", at: new Date().toISOString(), metrics: { result_count: Array.isArray(result.results) ? result.results.length : 0 } },
|
|
2150
|
+
],
|
|
2151
|
+
});
|
|
2152
|
+
}
|
|
2153
|
+
catch (e) {
|
|
2154
|
+
console.error(`[Lineage] vesper_web_find append failed: ${e?.message || e}`);
|
|
2155
|
+
}
|
|
1764
2156
|
return {
|
|
1765
2157
|
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
1766
2158
|
};
|
|
@@ -1882,6 +2274,30 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1882
2274
|
limit: Number(request.params.arguments?.limit || 10),
|
|
1883
2275
|
publicOnly,
|
|
1884
2276
|
});
|
|
2277
|
+
try {
|
|
2278
|
+
appendLineageVersion({
|
|
2279
|
+
datasetIdBase: `discover_${source}_${query || "query"}`,
|
|
2280
|
+
tool: "unified_dataset_api.discover",
|
|
2281
|
+
requestArgs: request.params.arguments,
|
|
2282
|
+
output: { rows: Array.isArray(result.results) ? result.results.length : undefined },
|
|
2283
|
+
sources: Array.isArray(result.results)
|
|
2284
|
+
? result.results.slice(0, 200).map((r) => ({
|
|
2285
|
+
source: String(r?.source || source || "unknown"),
|
|
2286
|
+
url: typeof r?.download_url === "string"
|
|
2287
|
+
? r.download_url
|
|
2288
|
+
: (typeof r?.metadata_url === "string" ? r.metadata_url : undefined),
|
|
2289
|
+
at: new Date().toISOString(),
|
|
2290
|
+
}))
|
|
2291
|
+
: [],
|
|
2292
|
+
steps: [
|
|
2293
|
+
{ step: "discover_requested", at: new Date().toISOString(), params: { query, source, limit: Number(request.params.arguments?.limit || 10), publicOnly } },
|
|
2294
|
+
{ step: "discover_completed", at: new Date().toISOString(), metrics: { result_count: Array.isArray(result.results) ? result.results.length : 0 } },
|
|
2295
|
+
],
|
|
2296
|
+
});
|
|
2297
|
+
}
|
|
2298
|
+
catch (e) {
|
|
2299
|
+
console.error(`[Lineage] unified discover append failed: ${e?.message || e}`);
|
|
2300
|
+
}
|
|
1885
2301
|
return {
|
|
1886
2302
|
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
1887
2303
|
};
|
|
@@ -1914,6 +2330,36 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1914
2330
|
catch (e) {
|
|
1915
2331
|
console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
|
|
1916
2332
|
}
|
|
2333
|
+
try {
|
|
2334
|
+
const schemaAfter = await getSchemaSnapshot(result.copied_to || result.local_path);
|
|
2335
|
+
const lineage = appendLineageVersion({
|
|
2336
|
+
datasetIdBase: result.dataset_id,
|
|
2337
|
+
tool: "unified_dataset_api.download",
|
|
2338
|
+
requestArgs: request.params.arguments,
|
|
2339
|
+
outputPath: result.copied_to || result.local_path,
|
|
2340
|
+
output: {
|
|
2341
|
+
local_path: result.copied_to || result.local_path,
|
|
2342
|
+
format: path.extname(result.copied_to || result.local_path).replace(".", ""),
|
|
2343
|
+
schema_after: schemaAfter,
|
|
2344
|
+
},
|
|
2345
|
+
sources: [{
|
|
2346
|
+
source: source,
|
|
2347
|
+
url: typeof result.dataset_id === "string" ? result.dataset_id : undefined,
|
|
2348
|
+
at: new Date().toISOString(),
|
|
2349
|
+
}],
|
|
2350
|
+
steps: [
|
|
2351
|
+
{ step: "download_requested", at: new Date().toISOString(), params: { datasetId, source, targetDir } },
|
|
2352
|
+
{ step: "download_completed", at: new Date().toISOString(), metrics: { local_path: result.copied_to || result.local_path } },
|
|
2353
|
+
],
|
|
2354
|
+
});
|
|
2355
|
+
try {
|
|
2356
|
+
upsertRegistry(lineage.datasetVersionId, result.copied_to || result.local_path, "completed");
|
|
2357
|
+
}
|
|
2358
|
+
catch { }
|
|
2359
|
+
}
|
|
2360
|
+
catch (e) {
|
|
2361
|
+
console.error(`[Lineage] unified download append failed: ${e?.message || e}`);
|
|
2362
|
+
}
|
|
1917
2363
|
return {
|
|
1918
2364
|
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
1919
2365
|
};
|
|
@@ -2449,6 +2895,32 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2449
2895
|
};
|
|
2450
2896
|
}
|
|
2451
2897
|
jobStatusLastPoll[jobId] = now;
|
|
2898
|
+
if (job.status === "completed") {
|
|
2899
|
+
try {
|
|
2900
|
+
const meta = job.metadata ? JSON.parse(job.metadata) : {};
|
|
2901
|
+
const baseId = String(meta?.datasetId || meta?.dataset_id || meta?.query || job.id);
|
|
2902
|
+
const outPath = typeof job.result_url === "string" ? job.result_url : undefined;
|
|
2903
|
+
appendLineageVersion({
|
|
2904
|
+
datasetIdBase: baseId,
|
|
2905
|
+
tool: `job:${job.type}`,
|
|
2906
|
+
requestArgs: {
|
|
2907
|
+
dataset_id: meta?.datasetId || meta?.dataset_id,
|
|
2908
|
+
query: meta?.query,
|
|
2909
|
+
pipeline_id: meta?.pipeline_id,
|
|
2910
|
+
agent_id: meta?.agent_id,
|
|
2911
|
+
},
|
|
2912
|
+
outputPath: outPath,
|
|
2913
|
+
output: {},
|
|
2914
|
+
steps: [
|
|
2915
|
+
{ step: `${job.type}_started`, at: job.created_at, params: meta || {} },
|
|
2916
|
+
{ step: `${job.type}_completed`, at: job.updated_at || new Date().toISOString(), metrics: { progress: job.progress } },
|
|
2917
|
+
],
|
|
2918
|
+
});
|
|
2919
|
+
}
|
|
2920
|
+
catch (e) {
|
|
2921
|
+
console.error(`[Lineage] check_job_status append failed: ${e?.message || e}`);
|
|
2922
|
+
}
|
|
2923
|
+
}
|
|
2452
2924
|
return {
|
|
2453
2925
|
content: [{ type: "text", text: formatJobStatus(job) }]
|
|
2454
2926
|
};
|
|
@@ -2456,6 +2928,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2456
2928
|
case "export_dataset": {
|
|
2457
2929
|
const datasetId = String(request.params.arguments?.dataset_id);
|
|
2458
2930
|
const isDirectLocalInput = isDirectLocalDatasetReference(datasetId);
|
|
2931
|
+
const intermediateArtifacts = new Set();
|
|
2459
2932
|
const requestedTargetDir = request.params.arguments?.target_dir
|
|
2460
2933
|
? String(request.params.arguments?.target_dir).trim()
|
|
2461
2934
|
: request.params.arguments?.output_dir
|
|
@@ -2525,9 +2998,16 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2525
2998
|
else if (currentExt !== pipelineFmt) {
|
|
2526
2999
|
console.error(`[Export] Running quality/cleaning pipeline (use fast=true to skip)...`);
|
|
2527
3000
|
try {
|
|
3001
|
+
const beforeStagingPath = sourcePath;
|
|
2528
3002
|
sourcePath = ensureLocalPipelineSource(sourcePath, datasetId, targetDir);
|
|
3003
|
+
if (path.resolve(beforeStagingPath) !== path.resolve(sourcePath)) {
|
|
3004
|
+
intermediateArtifacts.add(sourcePath);
|
|
3005
|
+
}
|
|
2529
3006
|
const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
|
|
2530
3007
|
if (pipelineResult.final_output_path) {
|
|
3008
|
+
if (path.resolve(pipelineResult.final_output_path) !== path.resolve(sourcePath)) {
|
|
3009
|
+
intermediateArtifacts.add(pipelineResult.final_output_path);
|
|
3010
|
+
}
|
|
2531
3011
|
sourcePath = pipelineResult.final_output_path;
|
|
2532
3012
|
try {
|
|
2533
3013
|
// Update registry to point to pipeline's final output
|
|
@@ -2567,10 +3047,36 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2567
3047
|
if (!fs.existsSync(outDir))
|
|
2568
3048
|
fs.mkdirSync(outDir, { recursive: true });
|
|
2569
3049
|
const outputFile = path.join(outDir, `${safeName}${ext}`);
|
|
3050
|
+
const schemaBefore = await getSchemaSnapshot(sourcePath);
|
|
2570
3051
|
const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
|
|
3052
|
+
const schemaAfter = await getSchemaSnapshot(result.output_path);
|
|
3053
|
+
const lineage = appendLineageVersion({
|
|
3054
|
+
datasetIdBase: datasetId,
|
|
3055
|
+
tool: "export_dataset",
|
|
3056
|
+
requestArgs: request.params.arguments,
|
|
3057
|
+
outputPath: result.output_path,
|
|
3058
|
+
output: {
|
|
3059
|
+
rows: result.rows,
|
|
3060
|
+
columns: result.columns,
|
|
3061
|
+
format: requestedFormat,
|
|
3062
|
+
size_mb: result.file_size_mb,
|
|
3063
|
+
schema_before: schemaBefore,
|
|
3064
|
+
schema_after: schemaAfter,
|
|
3065
|
+
},
|
|
3066
|
+
steps: [
|
|
3067
|
+
{ step: "source_resolved", at: new Date().toISOString(), params: { sourcePath } },
|
|
3068
|
+
{ step: "exported", at: new Date().toISOString(), params: { format: requestedFormat, compression }, metrics: { rows: result.rows, columns: result.columns } },
|
|
3069
|
+
],
|
|
3070
|
+
});
|
|
3071
|
+
try {
|
|
3072
|
+
upsertRegistry(lineage.datasetVersionId, result.output_path, "completed");
|
|
3073
|
+
}
|
|
3074
|
+
catch { }
|
|
2571
3075
|
// Build rich response
|
|
2572
3076
|
let msg = `**Export complete**\n`;
|
|
2573
3077
|
msg += `- **File**: ${result.output_path}\n`;
|
|
3078
|
+
msg += `- **Version**: ${lineage.datasetVersionId}\n`;
|
|
3079
|
+
msg += `- **Lineage**: ${lineage.lineagePath}\n`;
|
|
2574
3080
|
msg += `- **Format**: ${result.format}${result.compression ? ` (${result.compression})` : ""}\n`;
|
|
2575
3081
|
msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
|
|
2576
3082
|
if (result.file_size_mb !== undefined)
|
|
@@ -2590,6 +3096,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2590
3096
|
msg += ` Python: \`pd.read_parquet('${result.output_path}').head()\`\n`;
|
|
2591
3097
|
msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
|
|
2592
3098
|
}
|
|
3099
|
+
cleanupIntermediateArtifacts(intermediateArtifacts, result.output_path);
|
|
2593
3100
|
return { content: [{ type: "text", text: msg }] };
|
|
2594
3101
|
}
|
|
2595
3102
|
catch (error) {
|
|
@@ -2616,6 +3123,100 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2616
3123
|
content: [{ type: "text", text: `**Vesper Registry** (${entries.length} dataset${entries.length !== 1 ? "s" : ""}):\n\n${lines.join("\n\n")}` }]
|
|
2617
3124
|
};
|
|
2618
3125
|
}
|
|
3126
|
+
case "get_lineage": {
|
|
3127
|
+
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
3128
|
+
if (!datasetId) {
|
|
3129
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
3130
|
+
}
|
|
3131
|
+
const base = toBaseDatasetId(datasetId);
|
|
3132
|
+
const record = readLineageRecord(base);
|
|
3133
|
+
if (!record.versions || record.versions.length === 0) {
|
|
3134
|
+
return {
|
|
3135
|
+
content: [{ type: "text", text: `No lineage found for '${datasetId}' yet.` }]
|
|
3136
|
+
};
|
|
3137
|
+
}
|
|
3138
|
+
return {
|
|
3139
|
+
content: [{ type: "text", text: JSON.stringify(record, null, 2) }]
|
|
3140
|
+
};
|
|
3141
|
+
}
|
|
3142
|
+
case "diff_lineage_versions": {
|
|
3143
|
+
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
3144
|
+
const fromVersion = Number(request.params.arguments?.from_version);
|
|
3145
|
+
const toVersion = Number(request.params.arguments?.to_version);
|
|
3146
|
+
if (!datasetId) {
|
|
3147
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
3148
|
+
}
|
|
3149
|
+
if (!Number.isInteger(fromVersion) || fromVersion <= 0) {
|
|
3150
|
+
throw new McpError(ErrorCode.InvalidParams, "from_version must be a positive integer");
|
|
3151
|
+
}
|
|
3152
|
+
if (!Number.isInteger(toVersion) || toVersion <= 0) {
|
|
3153
|
+
throw new McpError(ErrorCode.InvalidParams, "to_version must be a positive integer");
|
|
3154
|
+
}
|
|
3155
|
+
const base = toBaseDatasetId(datasetId);
|
|
3156
|
+
const record = readLineageRecord(base);
|
|
3157
|
+
const fromV = record.versions.find((v) => v.version === fromVersion);
|
|
3158
|
+
const toV = record.versions.find((v) => v.version === toVersion);
|
|
3159
|
+
if (!fromV || !toV) {
|
|
3160
|
+
return {
|
|
3161
|
+
content: [{ type: "text", text: `ERROR: Could not find both versions in lineage for '${datasetId}'.` }],
|
|
3162
|
+
isError: true,
|
|
3163
|
+
};
|
|
3164
|
+
}
|
|
3165
|
+
const fromSchema = (toV.output?.schema_before && toVersion > fromVersion)
|
|
3166
|
+
? fromV.output?.schema_after || fromV.output?.schema_before || {}
|
|
3167
|
+
: fromV.output?.schema_after || fromV.output?.schema_before || {};
|
|
3168
|
+
const toSchema = toV.output?.schema_after || toV.output?.schema_before || {};
|
|
3169
|
+
const fromCols = Array.isArray(fromSchema.columns) ? fromSchema.columns.map((c) => String(c)) : [];
|
|
3170
|
+
const toCols = Array.isArray(toSchema.columns) ? toSchema.columns.map((c) => String(c)) : [];
|
|
3171
|
+
const fromDtypes = (fromSchema.dtypes && typeof fromSchema.dtypes === "object") ? fromSchema.dtypes : {};
|
|
3172
|
+
const toDtypes = (toSchema.dtypes && typeof toSchema.dtypes === "object") ? toSchema.dtypes : {};
|
|
3173
|
+
const schemaDiff = diffSchemaMaps(fromCols, toCols, fromDtypes, toDtypes);
|
|
3174
|
+
const fromRows = typeof fromSchema.rows === "number"
|
|
3175
|
+
? fromSchema.rows
|
|
3176
|
+
: (typeof fromV.output?.rows === "number" ? fromV.output.rows : undefined);
|
|
3177
|
+
const toRows = typeof toSchema.rows === "number"
|
|
3178
|
+
? toSchema.rows
|
|
3179
|
+
: (typeof toV.output?.rows === "number" ? toV.output.rows : undefined);
|
|
3180
|
+
const fromSteps = new Set((fromV.steps || []).map((s) => String(s.step)));
|
|
3181
|
+
const toSteps = new Set((toV.steps || []).map((s) => String(s.step)));
|
|
3182
|
+
const addedSteps = Array.from(toSteps).filter((s) => !fromSteps.has(s));
|
|
3183
|
+
const removedSteps = Array.from(fromSteps).filter((s) => !toSteps.has(s));
|
|
3184
|
+
const actorDiff = {
|
|
3185
|
+
changed: String(fromV.triggered_by?.agent_id || "") !== String(toV.triggered_by?.agent_id || "") ||
|
|
3186
|
+
String(fromV.triggered_by?.pipeline_id || "") !== String(toV.triggered_by?.pipeline_id || ""),
|
|
3187
|
+
from: {
|
|
3188
|
+
tool: fromV.triggered_by?.tool,
|
|
3189
|
+
agent_id: fromV.triggered_by?.agent_id,
|
|
3190
|
+
pipeline_id: fromV.triggered_by?.pipeline_id,
|
|
3191
|
+
},
|
|
3192
|
+
to: {
|
|
3193
|
+
tool: toV.triggered_by?.tool,
|
|
3194
|
+
agent_id: toV.triggered_by?.agent_id,
|
|
3195
|
+
pipeline_id: toV.triggered_by?.pipeline_id,
|
|
3196
|
+
},
|
|
3197
|
+
};
|
|
3198
|
+
const diffResult = {
|
|
3199
|
+
dataset_id_base: base,
|
|
3200
|
+
from_version: fromVersion,
|
|
3201
|
+
to_version: toVersion,
|
|
3202
|
+
schema_diff: schemaDiff,
|
|
3203
|
+
row_count_delta: {
|
|
3204
|
+
from: fromRows,
|
|
3205
|
+
to: toRows,
|
|
3206
|
+
delta: (typeof fromRows === "number" && typeof toRows === "number") ? (toRows - fromRows) : undefined,
|
|
3207
|
+
},
|
|
3208
|
+
steps_diff: {
|
|
3209
|
+
added: addedSteps,
|
|
3210
|
+
removed: removedSteps,
|
|
3211
|
+
from_steps: Array.from(fromSteps),
|
|
3212
|
+
to_steps: Array.from(toSteps),
|
|
3213
|
+
},
|
|
3214
|
+
actor_diff: actorDiff,
|
|
3215
|
+
};
|
|
3216
|
+
return {
|
|
3217
|
+
content: [{ type: "text", text: JSON.stringify(diffResult, null, 2) }],
|
|
3218
|
+
};
|
|
3219
|
+
}
|
|
2619
3220
|
case "vesper_convert_format": {
|
|
2620
3221
|
const filePath = String(request.params.arguments?.file_path || "").trim();
|
|
2621
3222
|
const targetFormat = String(request.params.arguments?.target_format || "").trim().toLowerCase();
|
|
@@ -2643,7 +3244,9 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2643
3244
|
try {
|
|
2644
3245
|
await ensurePythonModules([{ module: "polars", packageName: "polars" }]);
|
|
2645
3246
|
const convertScript = path.join(dataRoot, "python", "convert_engine.py");
|
|
3247
|
+
const schemaBefore = await getSchemaSnapshot(filePath);
|
|
2646
3248
|
const result = await runPythonJson(convertScript, [filePath, outputPath]);
|
|
3249
|
+
const schemaAfter = await getSchemaSnapshot(outputPath);
|
|
2647
3250
|
if (!result.ok) {
|
|
2648
3251
|
return {
|
|
2649
3252
|
content: [{ type: "text", text: `ERROR: Conversion failed: ${result.error}` }],
|
|
@@ -2658,9 +3261,32 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2658
3261
|
catch (e) {
|
|
2659
3262
|
console.error(`[Convert] Registry write failed: ${e?.message || e}`);
|
|
2660
3263
|
}
|
|
3264
|
+
const lineage = appendLineageVersion({
|
|
3265
|
+
datasetIdBase: datasetId,
|
|
3266
|
+
tool: "vesper_convert_format",
|
|
3267
|
+
requestArgs: request.params.arguments,
|
|
3268
|
+
outputPath,
|
|
3269
|
+
output: {
|
|
3270
|
+
rows: result.rows,
|
|
3271
|
+
columns: result.columns,
|
|
3272
|
+
format: targetFormat,
|
|
3273
|
+
size_mb: result.size_mb,
|
|
3274
|
+
schema_before: schemaBefore,
|
|
3275
|
+
schema_after: schemaAfter,
|
|
3276
|
+
},
|
|
3277
|
+
steps: [
|
|
3278
|
+
{ step: "converted", at: new Date().toISOString(), params: { from: inputExt, to: outputExt } },
|
|
3279
|
+
],
|
|
3280
|
+
});
|
|
3281
|
+
try {
|
|
3282
|
+
upsertRegistry(lineage.datasetVersionId, outputPath, "completed");
|
|
3283
|
+
}
|
|
3284
|
+
catch { }
|
|
2661
3285
|
let msg = `**Conversion complete**\n`;
|
|
2662
3286
|
msg += `- **Input**: ${filePath} (${inputExt.slice(1)})\n`;
|
|
2663
3287
|
msg += `- **Output**: ${result.output_path} (${targetFormat})\n`;
|
|
3288
|
+
msg += `- **Version**: ${lineage.datasetVersionId}\n`;
|
|
3289
|
+
msg += `- **Lineage**: ${lineage.lineagePath}\n`;
|
|
2664
3290
|
msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
|
|
2665
3291
|
if (result.size_mb !== undefined)
|
|
2666
3292
|
msg += `- **Size**: ${result.size_mb} MB\n`;
|
|
@@ -2701,7 +3327,9 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2701
3327
|
max_keys: Number.isFinite(maxKeys) ? maxKeys : 200,
|
|
2702
3328
|
extras_mode: ["blob", "drop"].includes(extrasMode) ? extrasMode : "blob",
|
|
2703
3329
|
};
|
|
3330
|
+
const schemaBefore = await getSchemaSnapshot(filePath);
|
|
2704
3331
|
const result = await runPythonJson(scriptPath, [filePath, outputPath, JSON.stringify(options)]);
|
|
3332
|
+
const schemaAfter = await getSchemaSnapshot(outputPath);
|
|
2705
3333
|
if (!result.ok) {
|
|
2706
3334
|
return { content: [{ type: "text", text: `ERROR: Schema normalization failed: ${result.error}` }], isError: true };
|
|
2707
3335
|
}
|
|
@@ -2713,9 +3341,31 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2713
3341
|
catch (e) {
|
|
2714
3342
|
console.error(`[NormalizeSchema] Registry write failed: ${e?.message || e}`);
|
|
2715
3343
|
}
|
|
3344
|
+
const lineage = appendLineageVersion({
|
|
3345
|
+
datasetIdBase: path.basename(outputPath, path.extname(outputPath)),
|
|
3346
|
+
tool: "vesper_normalize_schema",
|
|
3347
|
+
requestArgs: request.params.arguments,
|
|
3348
|
+
outputPath,
|
|
3349
|
+
output: {
|
|
3350
|
+
rows: result.rows,
|
|
3351
|
+
columns: result.columns,
|
|
3352
|
+
format: outputFormat,
|
|
3353
|
+
schema_before: schemaBefore,
|
|
3354
|
+
schema_after: schemaAfter,
|
|
3355
|
+
},
|
|
3356
|
+
steps: [
|
|
3357
|
+
{ step: "schema_normalized", at: new Date().toISOString(), params: options, metrics: { flattened_keys: result.flattened_keys } },
|
|
3358
|
+
],
|
|
3359
|
+
});
|
|
3360
|
+
try {
|
|
3361
|
+
upsertRegistry(lineage.datasetVersionId, outputPath, "completed");
|
|
3362
|
+
}
|
|
3363
|
+
catch { }
|
|
2716
3364
|
let msg = `**Schema normalization complete**\n`;
|
|
2717
3365
|
msg += `- **Input**: ${filePath}\n`;
|
|
2718
3366
|
msg += `- **Output**: ${result.output_path}\n`;
|
|
3367
|
+
msg += `- **Version**: ${lineage.datasetVersionId}\n`;
|
|
3368
|
+
msg += `- **Lineage**: ${lineage.lineagePath}\n`;
|
|
2719
3369
|
msg += `- **Rows**: ${result.rows?.toLocaleString?.() ?? result.rows}\n`;
|
|
2720
3370
|
msg += `- **Columns**: ${result.columns}\n`;
|
|
2721
3371
|
msg += `- **Flattened keys**: ${result.flattened_keys}\n`;
|
|
@@ -2795,10 +3445,35 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
2795
3445
|
catch (e) {
|
|
2796
3446
|
console.error(`[Registry] Failed to register fused dataset ${fusedId}: ${e?.message || e}`);
|
|
2797
3447
|
}
|
|
3448
|
+
const inputSchemaSnapshots = await Promise.all(resolvedPaths.map((p) => getSchemaSnapshot(p)));
|
|
3449
|
+
const schemaBefore = mergeSchemaSnapshots(inputSchemaSnapshots);
|
|
3450
|
+
const schemaAfter = await getSchemaSnapshot(result.output_path);
|
|
3451
|
+
const lineage = appendLineageVersion({
|
|
3452
|
+
datasetIdBase: fusedId,
|
|
3453
|
+
tool: "fuse_datasets",
|
|
3454
|
+
requestArgs: request.params.arguments,
|
|
3455
|
+
outputPath: result.output_path,
|
|
3456
|
+
output: {
|
|
3457
|
+
rows: result.stats.rows_after,
|
|
3458
|
+
format: outputFormat,
|
|
3459
|
+
schema_before: schemaBefore,
|
|
3460
|
+
schema_after: schemaAfter,
|
|
3461
|
+
},
|
|
3462
|
+
sources: resolvedPaths.map((p) => ({ source: "local", url: p, at: new Date().toISOString() })),
|
|
3463
|
+
steps: [
|
|
3464
|
+
{ step: "fused", at: new Date().toISOString(), params: { strategy, dedup, how }, metrics: { rows_before: result.stats.rows_before, rows_after: result.stats.rows_after, duplicates_removed: result.stats.duplicates_removed } },
|
|
3465
|
+
],
|
|
3466
|
+
});
|
|
3467
|
+
try {
|
|
3468
|
+
upsertRegistry(lineage.datasetVersionId, result.output_path, "completed");
|
|
3469
|
+
}
|
|
3470
|
+
catch { }
|
|
2798
3471
|
let msg = `Fused ${result.stats.sources_count} sources -> ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).\n`;
|
|
2799
3472
|
msg += `- Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}\n`;
|
|
2800
3473
|
msg += `- Null change: ${nullText}\n`;
|
|
2801
3474
|
msg += `- Output: ${result.output_path}\n`;
|
|
3475
|
+
msg += `- Version: ${lineage.datasetVersionId}\n`;
|
|
3476
|
+
msg += `- Lineage: ${lineage.lineagePath}\n`;
|
|
2802
3477
|
if (result.preview_path)
|
|
2803
3478
|
msg += `- Preview: ${result.preview_path}\n`;
|
|
2804
3479
|
if (result.leakage_report) {
|
|
@@ -2966,6 +3641,7 @@ async function main() {
|
|
|
2966
3641
|
const isDiscover = args.includes("discover");
|
|
2967
3642
|
const isDownload = args.includes("download");
|
|
2968
3643
|
const isExport = args.includes("export");
|
|
3644
|
+
const isStatus = args.includes("status");
|
|
2969
3645
|
const isConfig = args.includes("config") || args.includes("configure");
|
|
2970
3646
|
const isSetup = args.includes("--setup") || args.includes("setup");
|
|
2971
3647
|
const isSilent = args.includes("--silent");
|
|
@@ -2992,6 +3668,10 @@ async function main() {
|
|
|
2992
3668
|
await runExportCli(args);
|
|
2993
3669
|
return;
|
|
2994
3670
|
}
|
|
3671
|
+
if (isStatus) {
|
|
3672
|
+
await runStatusCli(args);
|
|
3673
|
+
return;
|
|
3674
|
+
}
|
|
2995
3675
|
// If run in explicit setup mode, show setup wizard (do not auto-run on server startup)
|
|
2996
3676
|
if (isSetup) {
|
|
2997
3677
|
await runSetupWizard(isSilent);
|
|
@@ -3292,6 +3972,7 @@ async function runExportCli(args) {
|
|
|
3292
3972
|
const fastMode = args.includes("--fast");
|
|
3293
3973
|
const preview = args.includes("--preview");
|
|
3294
3974
|
const isDirectLocalInput = isDirectLocalDatasetReference(datasetId);
|
|
3975
|
+
const intermediateArtifacts = new Set();
|
|
3295
3976
|
const resolvedTargetDir = path.resolve(targetDir || process.cwd());
|
|
3296
3977
|
let sourcePath = resolveDatasetLocalPath(datasetId, [resolvedTargetDir, process.cwd()]);
|
|
3297
3978
|
if (!sourcePath) {
|
|
@@ -3313,9 +3994,16 @@ async function runExportCli(args) {
|
|
|
3313
3994
|
const pipelineCompatibleInput = currentExt === "csv" || currentExt === "parquet";
|
|
3314
3995
|
if (pipelineCompatibleInput && currentExt !== pipelineFmt) {
|
|
3315
3996
|
try {
|
|
3997
|
+
const beforeStagingPath = sourcePath;
|
|
3316
3998
|
sourcePath = ensureLocalPipelineSource(sourcePath, datasetId, resolvedTargetDir);
|
|
3999
|
+
if (path.resolve(beforeStagingPath) !== path.resolve(sourcePath)) {
|
|
4000
|
+
intermediateArtifacts.add(sourcePath);
|
|
4001
|
+
}
|
|
3317
4002
|
const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
|
|
3318
4003
|
if (pipelineResult.final_output_path) {
|
|
4004
|
+
if (path.resolve(pipelineResult.final_output_path) !== path.resolve(sourcePath)) {
|
|
4005
|
+
intermediateArtifacts.add(pipelineResult.final_output_path);
|
|
4006
|
+
}
|
|
3319
4007
|
sourcePath = pipelineResult.final_output_path;
|
|
3320
4008
|
if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
|
|
3321
4009
|
upsertRegistry(datasetId, sourcePath, "completed");
|
|
@@ -3346,6 +4034,7 @@ async function runExportCli(args) {
|
|
|
3346
4034
|
console.error(`[Export] Resolved output directory: ${outDir}`);
|
|
3347
4035
|
console.error(`[Export] Output file: ${outputFile}`);
|
|
3348
4036
|
const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
|
|
4037
|
+
cleanupIntermediateArtifacts(intermediateArtifacts, result.output_path);
|
|
3349
4038
|
console.log(`Export complete: ${result.output_path}`);
|
|
3350
4039
|
console.log(`Format: ${result.format}${result.compression ? ` (${result.compression})` : ""}`);
|
|
3351
4040
|
if (result.rows !== undefined)
|
|
@@ -3412,6 +4101,173 @@ async function runFuseCli(args) {
|
|
|
3412
4101
|
console.log(`Preview saved: ${result.preview_path}`);
|
|
3413
4102
|
console.log("Next: run vespermcp split/export on the fused dataset");
|
|
3414
4103
|
}
|
|
4104
|
+
/**
 * CLI handler for the `status` subcommand: scans a directory tree for
 * `*.lineage.json` files and prints a lineage report to stdout.
 *
 * The report has five parts: a header with totals, a per-dataset summary,
 * a row-count trend table, dtype changes between adjacent versions, and
 * recent errors (taken from lineage step metadata and from the error log).
 *
 * Recognized flags in `args`:
 *   --dir <path>      directory to scan (default: `<dataRoot>/lineage`)
 *   --max-depth <n>   depth handed to walkFilesRecursive (default 4; negative
 *                     or non-numeric values fall back to 4, fractions are floored)
 *
 * @param {string[]} args Raw CLI arguments.
 * @returns {Promise<void>} Resolves once the report has been printed.
 */
async function runStatusCli(args) {
    // chalk and cli-table3 are pulled in on demand via dynamic import.
    const [{ default: chalk }, { default: Table }] = await Promise.all([
        import("chalk"),
        import("cli-table3"),
    ]);
    // Returns the argument following `name`, or undefined when absent/last.
    const getArgValue = (name) => {
        const idx = args.indexOf(name);
        return idx >= 0 && idx + 1 < args.length ? args[idx + 1] : undefined;
    };
    const byVersion = (a, b) => (a.version || 0) - (b.version || 0);
    const byDatasetId = (a, b) => (a.dataset_id_base || "").localeCompare(b.dataset_id_base || "");
    // Preferred schema snapshot of a version: after the operation, else before.
    const schemaOf = (v) => v.output?.schema_after || v.output?.schema_before;

    // ---- argument parsing -------------------------------------------------
    const scanDir = path.resolve(getArgValue("--dir") || path.join(dataRoot, "lineage"));
    const maxDepthRaw = getArgValue("--max-depth");
    const maxDepthParsed = maxDepthRaw !== undefined ? Number(maxDepthRaw) : 4;
    const maxDepth = Number.isFinite(maxDepthParsed) && maxDepthParsed >= 0
        ? Math.floor(maxDepthParsed)
        : 4;
    if (!fs.existsSync(scanDir)) {
        console.log(`Lineage directory not found: ${scanDir}`);
        console.log("Tip: use --dir <path> to scan a custom location.");
        return;
    }

    // ---- load lineage records ---------------------------------------------
    const records = [];
    for (const filePath of walkFilesRecursive(scanDir, maxDepth)) {
        if (!filePath.toLowerCase().endsWith(".lineage.json")) {
            continue;
        }
        try {
            const data = JSON.parse(fs.readFileSync(filePath, "utf-8"));
            if (isLineageRecordShape(data)) {
                records.push(data);
            }
        }
        catch {
            // Best-effort scan: unreadable or malformed files are skipped.
        }
    }
    if (records.length === 0) {
        console.log("No lineage records found.");
        console.log("Tip: default scan is ~/.vesper/lineage. Use --dir <path> for project-local lineage files.");
        return;
    }

    // ---- header -----------------------------------------------------------
    const allVersions = records.flatMap((r) => r.versions || []);
    const sevenDaysAgo = Date.now() - 7 * 24 * 60 * 60 * 1000;
    // Versions with a missing/unparseable created_at yield NaN and are not counted.
    const operationsLast7d = allVersions.filter((v) => Date.parse(v.created_at || "") >= sevenDaysAgo).length;
    console.log(chalk.bold.cyan("\nVesper Lineage Status"));
    console.log(chalk.gray(`Scan dir: ${scanDir}`));
    console.log(chalk.gray(`Max depth: ${maxDepth}`));
    console.log(chalk.gray(`Lineage records: ${records.length}`));
    console.log(chalk.gray(`Total operations: ${allVersions.length} (${operationsLast7d} in last 7 days)\n`));

    // Records ordered by dataset id, each paired with its versions ordered by
    // version number. Computed once and shared by every section below.
    const datasets = [...records].sort(byDatasetId).map((record) => ({
        record,
        versions: [...(record.versions || [])].sort(byVersion),
    }));

    // ---- per-dataset summary ----------------------------------------------
    const perDatasetTable = new Table({
        head: ["Dataset", "Versions", "Last Modified", "Last Actor"],
        colWidths: [34, 10, 28, 28],
        wordWrap: true,
    });
    for (const { record, versions } of datasets) {
        const latest = versions[versions.length - 1];
        const actor = latest?.triggered_by?.agent_id || latest?.triggered_by?.pipeline_id || "-";
        perDatasetTable.push([
            record.dataset_id_base,
            String(versions.length),
            latest?.created_at || "-",
            actor,
        ]);
    }
    console.log(chalk.bold("Per-dataset summary"));
    console.log(perDatasetTable.toString());

    // ---- row-count trend --------------------------------------------------
    const trendTable = new Table({
        head: ["Dataset", "Rows Trend", "Details"],
        colWidths: [34, 14, 52],
        wordWrap: true,
    });
    for (const { record, versions } of datasets) {
        // Only versions that recorded a numeric row count take part in the trend.
        const series = versions
            .map((v) => ({ version: v.version, rows: v.output?.schema_after?.rows ?? v.output?.rows }))
            .filter((x) => typeof x.rows === "number");
        if (series.length < 2) {
            trendTable.push([record.dataset_id_base, "-", "insufficient row snapshots"]);
            continue;
        }
        const first = series[0].rows;
        const last = series[series.length - 1].rows;
        const trend = last > first ? chalk.green("growing") : last < first ? chalk.yellow("shrinking") : "flat";
        const details = series.map((x) => `v${x.version}:${x.rows}`).join(" -> ");
        trendTable.push([record.dataset_id_base, trend, details]);
    }
    console.log(chalk.bold("\nQuality trend (schema_after.rows)"));
    console.log(trendTable.toString());

    // ---- dtype changes between adjacent versions --------------------------
    const dtypeWarnings = [];
    for (const { record, versions } of datasets) {
        for (let i = 1; i < versions.length; i++) {
            const prev = versions[i - 1];
            const curr = versions[i];
            const prevSchema = schemaOf(prev);
            const currSchema = schemaOf(curr);
            const diff = diffSchemaMaps(prevSchema?.columns || [], currSchema?.columns || [], prevSchema?.dtypes || {}, currSchema?.dtypes || {});
            if (diff.changed_dtypes.length > 0) {
                dtypeWarnings.push({
                    dataset: record.dataset_id_base,
                    from: prev.version,
                    to: curr.version,
                    // At most four changes are listed per version pair.
                    changes: diff.changed_dtypes.slice(0, 4).map((d) => `${d.column}:${d.from}->${d.to}`),
                });
            }
        }
    }
    console.log(chalk.bold("\nDtype warnings"));
    if (dtypeWarnings.length === 0) {
        console.log(chalk.green("No dtype changes detected across adjacent versions."));
    }
    else {
        const warningTable = new Table({
            head: ["Dataset", "Versions", "Changed dtypes"],
            colWidths: [34, 14, 52],
            wordWrap: true,
        });
        for (const w of dtypeWarnings.slice(-20)) {
            warningTable.push([
                w.dataset,
                `v${w.from}->v${w.to}`,
                w.changes.join(", "),
            ]);
        }
        console.log(warningTable.toString());
    }

    // ---- recent errors ----------------------------------------------------
    // Lineage errors are collected in stored (not version-sorted) order.
    const lineageErrors = [];
    for (const { record } of datasets) {
        for (const v of record.versions || []) {
            for (const step of v.steps || []) {
                const errMsg = (typeof step.metrics?.error === "string" && step.metrics.error) ||
                    (typeof step.params?.error === "string" && step.params.error) ||
                    undefined;
                if (errMsg) {
                    lineageErrors.push({ at: step.at, where: `${record.dataset_id_base}/v${v.version}:${step.step}`, message: errMsg });
                }
            }
        }
    }
    const logErrors = parseErrorLogLines(errorLogPath, 7).map((e) => ({
        at: e.at,
        where: "vesper_errors.log",
        message: e.message,
    }));
    // NOTE(review): the two lists are concatenated, not merged by timestamp, so
    // the 20-entry cap drops lineage errors first regardless of their age.
    const recentErrors = [...lineageErrors, ...logErrors].slice(-20);
    console.log(chalk.bold("\nRecent errors"));
    if (recentErrors.length === 0) {
        console.log(chalk.green("No recent lineage-linked errors found."));
    }
    else {
        const errTable = new Table({
            head: ["At", "Where", "Error"],
            colWidths: [28, 36, 46],
            wordWrap: true,
        });
        for (const e of recentErrors) {
            errTable.push([e.at || "-", e.where, e.message]);
        }
        console.log(errTable.toString());
    }
}
|
|
3415
4271
|
async function runSetupWizard(silent = false) {
|
|
3416
4272
|
if (!silent && process.stdin.isTTY) {
|
|
3417
4273
|
const wizardCandidates = [
|
package/build/python/cleaner.py
CHANGED
|
@@ -182,6 +182,8 @@ def main():
|
|
|
182
182
|
output_format = "parquet"
|
|
183
183
|
|
|
184
184
|
base_name = file_path.rsplit(".", 1)[0]
|
|
185
|
+
if base_name.endswith("_cleaned"):
|
|
186
|
+
base_name = base_name[:-8]
|
|
185
187
|
if output_format == "csv":
|
|
186
188
|
output_path = f"{base_name}_cleaned.csv"
|
|
187
189
|
# Stringify ANY column that might not be CSV-safe (List, Struct, Object, etc.)
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vespermcp/mcp-server",
|
|
3
|
-
"version": "1.2.27",
|
|
3
|
+
"version": "1.2.29",
|
|
4
4
|
"description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "build/index.js",
|
|
@@ -37,6 +37,7 @@
|
|
|
37
37
|
"setup": "node build/index.js --setup",
|
|
38
38
|
"setup:silent": "node build/index.js --setup --silent",
|
|
39
39
|
"refresh-index": "node scripts/refresh-index.cjs",
|
|
40
|
+
"telemetry:receiver": "tsx telemetry/lineage-receiver.ts",
|
|
40
41
|
"test": "vitest",
|
|
41
42
|
"start": "node build/index.js"
|
|
42
43
|
},
|
|
@@ -79,9 +80,13 @@
|
|
|
79
80
|
"ajv": "^8.17.1",
|
|
80
81
|
"ajv-formats": "^3.0.1",
|
|
81
82
|
"better-sqlite3": "^12.6.0",
|
|
83
|
+
"chalk": "^5.6.2",
|
|
84
|
+
"cli-table3": "^0.6.5",
|
|
85
|
+
"express": "^5.1.0",
|
|
82
86
|
"inquirer": "^13.3.0",
|
|
83
87
|
"lodash": "^4.17.21",
|
|
84
88
|
"pdf-parse": "^2.4.5",
|
|
89
|
+
"pg": "^8.16.3",
|
|
85
90
|
"uuid": "^13.0.0",
|
|
86
91
|
"zod": "^4.3.5",
|
|
87
92
|
"zod-to-json-schema": "^3.25.1"
|
package/src/python/cleaner.py
CHANGED
|
@@ -182,6 +182,8 @@ def main():
|
|
|
182
182
|
output_format = "parquet"
|
|
183
183
|
|
|
184
184
|
base_name = file_path.rsplit(".", 1)[0]
|
|
185
|
+
if base_name.endswith("_cleaned"):
|
|
186
|
+
base_name = base_name[:-8]
|
|
185
187
|
if output_format == "csv":
|
|
186
188
|
output_path = f"{base_name}_cleaned.csv"
|
|
187
189
|
# Stringify ANY column that might not be CSV-safe (List, Struct, Object, etc.)
|