@vespermcp/mcp-server 1.2.27 → 1.2.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/README.md +43 -0
  2. package/build/index.js +812 -4
  3. package/package.json +6 -1
package/README.md CHANGED
@@ -88,6 +88,47 @@ Vesper attempts to auto-configure itself! Restart Claude and check. If not:
88
88
 
89
89
  - `KAGGLE_USERNAME` & `KAGGLE_KEY`: For Kaggle dataset access
90
90
  - `HF_TOKEN`: For private HuggingFace datasets
91
+ - `VESPER_TELEMETRY_ENDPOINT`: Optional HTTP endpoint for lineage telemetry events (`lineage.version.appended`)
92
+
93
+ ### Telemetry Transparency (Opt-in)
94
+
95
+ Vesper does **not** send telemetry unless `VESPER_TELEMETRY_ENDPOINT` is explicitly set.
96
+
97
+ When enabled, Vesper sends only lineage event metadata on version append:
98
+ - dataset base/version IDs
99
+ - tool name + actor metadata (`agent_id`, `pipeline_id` when provided)
100
+ - basic output metadata (`local_path`, rows/columns, format)
101
+ - timestamp + host name
102
+
103
+ It does **not** upload dataset file contents.
104
+
105
+ ### Lineage Receiver (for web dashboard backend)
106
+
107
+ Vesper includes a tiny ingestion server for lineage telemetry events:
108
+
109
+ ```bash
110
+ npm run telemetry:receiver
111
+ ```
112
+
113
+ Storage backends:
114
+ - **Postgres**: set `DATABASE_URL`
115
+ - **SQLite**: set `SQLITE_PATH` (for lightweight/local deployments)
116
+
117
+ Optional env vars:
118
+ - `PORT` (default `8787`)
119
+ - `LINEAGE_INGEST_PATH` (default `/vesper/lineage`)
120
+
121
+ Example for hosted backend:
122
+ - ingest URL: `https://getvesper.dev/vesper/lineage`
123
+ - client env: `VESPER_TELEMETRY_ENDPOINT=https://getvesper.dev/vesper/lineage`
124
+
125
+ DDL files:
126
+ - `telemetry/sql/lineage_events.postgres.sql`
127
+ - `telemetry/sql/lineage_events.sqlite.sql`
128
+
129
+ Stats endpoint for web dashboard bootstrap:
130
+ - `GET /vesper/lineage/stats?days=30`
131
+ - Returns JSON: overview, by-tool counts, by-day counts, top datasets, recent activity.
91
132
 
92
133
  ### Optional Kaggle Setup (Not Required)
93
134
 
@@ -121,6 +162,8 @@ vespermcp discover --source kaggle "credit risk" --limit 10
121
162
  vespermcp discover --source huggingface "credit risk" --limit 10
122
163
  vespermcp download kaggle username/dataset-name
123
164
  vespermcp download kaggle https://www.kaggle.com/datasets/username/dataset-name --target-dir ./data
165
+ vespermcp status
166
+ vespermcp status --dir ./some/project --max-depth 3
124
167
  ```
125
168
 
126
169
  ## 🚀 Quick Start
package/build/index.js CHANGED
@@ -74,18 +74,156 @@ function getRegistryEntry(dataset_id) {
74
74
  console.error(`[Registry] Lookup keys: ${aliases.join(", ")}`);
75
75
  return readRegistry().find(e => aliases.includes((e.dataset_id || e.id)));
76
76
  }
77
+ function getLineageDir() {
78
+ const p = path.join(dataRoot, "lineage");
79
+ if (!fs.existsSync(p))
80
+ fs.mkdirSync(p, { recursive: true });
81
+ return p;
82
+ }
83
+ function toBaseDatasetId(datasetId) {
84
+ const safe = normalize_dataset_id(String(datasetId || "").trim());
85
+ return safe.replace(/_v\d+$/i, "");
86
+ }
87
+ function getLineageRecordPath(datasetIdBase) {
88
+ return path.join(getLineageDir(), `${toBaseDatasetId(datasetIdBase)}.lineage.json`);
89
+ }
90
+ function readLineageRecord(datasetIdBase) {
91
+ const p = getLineageRecordPath(datasetIdBase);
92
+ if (!fs.existsSync(p)) {
93
+ return {
94
+ dataset_id_base: toBaseDatasetId(datasetIdBase),
95
+ latest_version: 0,
96
+ updated_at: new Date().toISOString(),
97
+ versions: [],
98
+ };
99
+ }
100
+ try {
101
+ return JSON.parse(fs.readFileSync(p, "utf-8"));
102
+ }
103
+ catch {
104
+ return {
105
+ dataset_id_base: toBaseDatasetId(datasetIdBase),
106
+ latest_version: 0,
107
+ updated_at: new Date().toISOString(),
108
+ versions: [],
109
+ };
110
+ }
111
+ }
112
+ function writeLineageRecord(record) {
113
+ const p = getLineageRecordPath(record.dataset_id_base);
114
+ fs.writeFileSync(p, JSON.stringify(record, null, 2));
115
+ }
116
+ function appendLineageVersion(input) {
117
+ const base = toBaseDatasetId(input.datasetIdBase);
118
+ const record = readLineageRecord(base);
119
+ if (input.outputPath) {
120
+ const existing = record.versions.find(v => v.output?.local_path === input.outputPath);
121
+ if (existing) {
122
+ return { datasetVersionId: existing.dataset_id, version: existing.version, lineagePath: getLineageRecordPath(base) };
123
+ }
124
+ }
125
+ const version = (record.latest_version || 0) + 1;
126
+ const datasetVersionId = `${base}_v${version}`;
127
+ const now = new Date().toISOString();
128
+ const outputPath = input.outputPath;
129
+ let sidecarPath;
130
+ if (outputPath && fs.existsSync(outputPath)) {
131
+ sidecarPath = `${outputPath}.lineage.json`;
132
+ }
133
+ const v = {
134
+ version,
135
+ dataset_id: datasetVersionId,
136
+ created_at: now,
137
+ triggered_by: {
138
+ tool: input.tool,
139
+ agent_id: input.requestArgs?.agent_id ? String(input.requestArgs.agent_id) : undefined,
140
+ pipeline_id: input.requestArgs?.pipeline_id ? String(input.requestArgs.pipeline_id) : undefined,
141
+ api_key: input.requestArgs?.api_key ? String(input.requestArgs.api_key) : undefined,
142
+ },
143
+ input: {
144
+ dataset_id: input.requestArgs?.dataset_id ? String(input.requestArgs.dataset_id) : undefined,
145
+ query: input.requestArgs?.query ? String(input.requestArgs.query) : undefined,
146
+ source_path: input.requestArgs?.file_path ? String(input.requestArgs.file_path) : undefined,
147
+ source_urls: Array.isArray(input.requestArgs?.source_urls)
148
+ ? input.requestArgs.source_urls.map((u) => String(u))
149
+ : undefined,
150
+ },
151
+ output: {
152
+ local_path: outputPath,
153
+ rows: typeof input.output?.rows === "number" ? input.output.rows : undefined,
154
+ columns: typeof input.output?.columns === "number" ? input.output.columns : undefined,
155
+ format: typeof input.output?.format === "string" ? input.output.format : undefined,
156
+ size_mb: typeof input.output?.size_mb === "number" ? input.output.size_mb : undefined,
157
+ quality_score: typeof input.output?.quality_score === "number" ? input.output.quality_score : undefined,
158
+ schema_before: input.output?.schema_before && typeof input.output.schema_before === "object"
159
+ ? {
160
+ rows: typeof input.output.schema_before.rows === "number" ? input.output.schema_before.rows : undefined,
161
+ columns: Array.isArray(input.output.schema_before.columns) ? input.output.schema_before.columns.map((c) => String(c)) : undefined,
162
+ dtypes: input.output.schema_before.dtypes && typeof input.output.schema_before.dtypes === "object"
163
+ ? Object.fromEntries(Object.entries(input.output.schema_before.dtypes).map(([k, v]) => [String(k), String(v)]))
164
+ : undefined,
165
+ }
166
+ : undefined,
167
+ schema_after: input.output?.schema_after && typeof input.output.schema_after === "object"
168
+ ? {
169
+ rows: typeof input.output.schema_after.rows === "number" ? input.output.schema_after.rows : undefined,
170
+ columns: Array.isArray(input.output.schema_after.columns) ? input.output.schema_after.columns.map((c) => String(c)) : undefined,
171
+ dtypes: input.output.schema_after.dtypes && typeof input.output.schema_after.dtypes === "object"
172
+ ? Object.fromEntries(Object.entries(input.output.schema_after.dtypes).map(([k, v]) => [String(k), String(v)]))
173
+ : undefined,
174
+ }
175
+ : undefined,
176
+ },
177
+ sources: input.sources || [],
178
+ steps: input.steps || [{ step: input.tool, at: now, params: input.requestArgs || {} }],
179
+ };
180
+ record.latest_version = version;
181
+ record.updated_at = now;
182
+ record.versions.push(v);
183
+ writeLineageRecord(record);
184
+ const telemetryEndpoint = process.env.VESPER_TELEMETRY_ENDPOINT?.trim();
185
+ if (telemetryEndpoint) {
186
+ postJsonNonBlocking(telemetryEndpoint, {
187
+ event: "lineage.version.appended",
188
+ sent_at: now,
189
+ host: os.hostname(),
190
+ dataset_id_base: base,
191
+ version: v.version,
192
+ dataset_id: v.dataset_id,
193
+ triggered_by: v.triggered_by,
194
+ output: {
195
+ local_path: v.output?.local_path,
196
+ rows: v.output?.rows,
197
+ columns: v.output?.columns,
198
+ format: v.output?.format,
199
+ },
200
+ });
201
+ }
202
+ if (sidecarPath) {
203
+ try {
204
+ fs.writeFileSync(sidecarPath, JSON.stringify(v, null, 2));
205
+ }
206
+ catch {
207
+ // best effort sidecar write
208
+ }
209
+ }
210
+ return { datasetVersionId, version, lineagePath: getLineageRecordPath(base) };
211
+ }
77
212
  const STRUCTURED_FILE_EXTENSIONS = [".parquet", ".csv", ".jsonl", ".json", ".feather", ".arrow", ".tsv", ".txt"];
78
213
  const IMAGE_FILE_EXTENSIONS = new Set([".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff", ".tif", ".svg"]);
79
- function walkFilesRecursive(rootDir) {
214
+ function walkFilesRecursive(rootDir, maxDepth = Number.POSITIVE_INFINITY) {
80
215
  const out = [];
81
- const stack = [rootDir];
216
+ const stack = [{ dir: rootDir, depth: 0 }];
82
217
  while (stack.length > 0) {
83
- const currentDir = stack.pop();
218
+ const current = stack.pop();
219
+ const currentDir = current.dir;
84
220
  const entries = fs.readdirSync(currentDir, { withFileTypes: true });
85
221
  for (const entry of entries) {
86
222
  const fullPath = path.join(currentDir, entry.name);
87
223
  if (entry.isDirectory()) {
88
- stack.push(fullPath);
224
+ if (current.depth < maxDepth) {
225
+ stack.push({ dir: fullPath, depth: current.depth + 1 });
226
+ }
89
227
  }
90
228
  else if (entry.isFile()) {
91
229
  out.push(fullPath);
@@ -296,6 +434,8 @@ import { QualityOrchestrator } from "./quality/quality-orchestrator.js";
296
434
  import { ConfigManager } from "./config/config-manager.js";
297
435
  import { SecureKeysManager } from "./config/secure-keys.js";
298
436
  import readline from "readline";
437
+ import http from "http";
438
+ import https from "https";
299
439
  import os from "os";
300
440
  // Determine absolute paths relative to the compiled script
301
441
  const __filename = fileURLToPath(import.meta.url);
@@ -321,6 +461,34 @@ function logError(err, context) {
321
461
  fs.appendFileSync(errorLogPath, msg);
322
462
  console.error(`[Vesper] Critical error logged to ${errorLogPath}`);
323
463
  }
464
+ function postJsonNonBlocking(urlRaw, body) {
465
+ try {
466
+ const u = new URL(urlRaw);
467
+ const payload = JSON.stringify(body);
468
+ const isHttps = u.protocol === "https:";
469
+ const transport = isHttps ? https : http;
470
+ const req = transport.request({
471
+ method: "POST",
472
+ hostname: u.hostname,
473
+ port: u.port ? Number(u.port) : (isHttps ? 443 : 80),
474
+ path: `${u.pathname}${u.search || ""}`,
475
+ headers: {
476
+ "content-type": "application/json",
477
+ "content-length": Buffer.byteLength(payload),
478
+ },
479
+ timeout: 3000,
480
+ }, (res) => {
481
+ res.resume();
482
+ });
483
+ req.on("error", () => { });
484
+ req.on("timeout", () => req.destroy());
485
+ req.write(payload);
486
+ req.end();
487
+ }
488
+ catch {
489
+ // best effort telemetry only
490
+ }
491
+ }
324
492
  // --- Request Queue: serialize all MCP tool calls to prevent crashes ---
325
493
  class RequestQueue {
326
494
  queue = [];
@@ -535,6 +703,102 @@ function runPythonJson(scriptPath, args) {
535
703
  });
536
704
  });
537
705
  }
706
+ async function getSchemaSnapshot(filePath) {
707
+ try {
708
+ if (!filePath || !fs.existsSync(filePath))
709
+ return undefined;
710
+ await ensurePythonModules([{ module: "polars", packageName: "polars" }]);
711
+ const pyCode = [
712
+ "import json, os, polars as pl",
713
+ "p = __import__('sys').argv[1]",
714
+ "ext = os.path.splitext(p)[1].lower()",
715
+ "if ext == '.csv': df = pl.read_csv(p, infer_schema_length=10000, ignore_errors=True)",
716
+ "elif ext in ('.jsonl', '.ndjson'): df = pl.read_ndjson(p)",
717
+ "elif ext == '.json': df = pl.read_json(p)",
718
+ "elif ext in ('.parquet', '.pq'): df = pl.read_parquet(p)",
719
+ "elif ext in ('.feather', '.ftr', '.arrow', '.ipc'): df = pl.read_ipc(p)",
720
+ "else: df = pl.read_csv(p, infer_schema_length=10000, ignore_errors=True)",
721
+ "print(json.dumps({'rows': int(df.height), 'columns': [str(c) for c in df.columns], 'dtypes': {str(c): str(t) for c,t in zip(df.columns, df.dtypes)}}))",
722
+ ].join(";");
723
+ const proc = await runPythonProcess(["-c", pyCode, filePath], 120000);
724
+ if (proc.code !== 0)
725
+ return undefined;
726
+ return JSON.parse((proc.stdout || "{}").trim());
727
+ }
728
+ catch {
729
+ return undefined;
730
+ }
731
+ }
732
+ function mergeSchemaSnapshots(snapshots) {
733
+ const valid = snapshots.filter(Boolean);
734
+ if (valid.length === 0)
735
+ return undefined;
736
+ let rows = 0;
737
+ const colSet = new Set();
738
+ const dtypeMulti = {};
739
+ for (const s of valid) {
740
+ if (typeof s.rows === "number") {
741
+ rows = (rows ?? 0) + s.rows;
742
+ }
743
+ else {
744
+ rows = undefined;
745
+ }
746
+ for (const c of s.columns || [])
747
+ colSet.add(String(c));
748
+ for (const [k, v] of Object.entries(s.dtypes || {})) {
749
+ if (!dtypeMulti[k])
750
+ dtypeMulti[k] = new Set();
751
+ dtypeMulti[k].add(String(v));
752
+ }
753
+ }
754
+ const dtypes = {};
755
+ for (const [k, values] of Object.entries(dtypeMulti)) {
756
+ const list = Array.from(values);
757
+ dtypes[k] = list.length <= 1 ? list[0] : `mixed(${list.join("|")})`;
758
+ }
759
+ return {
760
+ rows,
761
+ columns: Array.from(colSet).sort(),
762
+ dtypes,
763
+ };
764
+ }
765
+ function diffSchemaMaps(fromColumns, toColumns, fromDtypes, toDtypes) {
766
+ const fromSet = new Set(fromColumns);
767
+ const toSet = new Set(toColumns);
768
+ const added_columns = toColumns.filter((c) => !fromSet.has(c));
769
+ const removed_columns = fromColumns.filter((c) => !toSet.has(c));
770
+ const common = fromColumns.filter((c) => toSet.has(c));
771
+ const changed_dtypes = common
772
+ .filter((c) => String(fromDtypes[c] || "") !== String(toDtypes[c] || ""))
773
+ .map((c) => ({ column: c, from: fromDtypes[c], to: toDtypes[c] }));
774
+ return { added_columns, removed_columns, changed_dtypes };
775
+ }
776
+ function isLineageRecordShape(value) {
777
+ return !!value && typeof value === "object" && typeof value.dataset_id_base === "string" && Array.isArray(value.versions);
778
+ }
779
+ function parseErrorLogLines(filePath, withinDays) {
780
+ try {
781
+ if (!fs.existsSync(filePath))
782
+ return [];
783
+ const text = fs.readFileSync(filePath, "utf-8");
784
+ const lines = text.split(/\r?\n/).filter(Boolean);
785
+ const minTs = Date.now() - withinDays * 24 * 60 * 60 * 1000;
786
+ const out = [];
787
+ for (const line of lines) {
788
+ const m = line.match(/^\[(.+?)\]\s+ERROR\s+in\s+(.+)$/);
789
+ if (!m)
790
+ continue;
791
+ const ts = Date.parse(m[1]);
792
+ if (Number.isFinite(ts) && ts >= minTs) {
793
+ out.push({ at: m[1], message: m[2] });
794
+ }
795
+ }
796
+ return out.slice(-20);
797
+ }
798
+ catch {
799
+ return [];
800
+ }
801
+ }
538
802
  async function countRows(filePath) {
539
803
  const scriptPath = path.join(dataRoot, "python", "row_count.py");
540
804
  const result = await runPythonJson(scriptPath, [filePath]);
@@ -1166,6 +1430,14 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1166
1430
  type: "boolean",
1167
1431
  description: "When true, operation='providers' includes connectors that are scaffolded but not currently configured.",
1168
1432
  },
1433
+ agent_id: {
1434
+ type: "string",
1435
+ description: "Strongly recommended: caller agent identity for lineage/audit.",
1436
+ },
1437
+ pipeline_id: {
1438
+ type: "string",
1439
+ description: "Strongly recommended: workflow/pipeline identifier for lineage/audit.",
1440
+ },
1169
1441
  },
1170
1442
  required: ["operation"],
1171
1443
  },
@@ -1185,6 +1457,8 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1185
1457
  limit: { type: "number", description: "Max documents to return (default 10, max 50)." },
1186
1458
  arxiv_full_text: { type: "boolean", description: "When true, fetch and parse ArXiv PDFs and return full text as document content (slower)." },
1187
1459
  github_include_readme: { type: "boolean", description: "When true, fetch and include GitHub README.md text as document content (slower)." },
1460
+ agent_id: { type: "string", description: "Strongly recommended: caller agent identity for lineage/audit." },
1461
+ pipeline_id: { type: "string", description: "Strongly recommended: workflow/pipeline identifier for lineage/audit." },
1188
1462
  },
1189
1463
  required: ["query"],
1190
1464
  },
@@ -1232,6 +1506,8 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1232
1506
  enum: ["semantic", "exact", "none"],
1233
1507
  description: "How to deduplicate across sources.",
1234
1508
  },
1509
+ agent_id: { type: "string", description: "Strongly recommended: caller agent identity for lineage/audit." },
1510
+ pipeline_id: { type: "string", description: "Strongly recommended: workflow/pipeline identifier for lineage/audit." },
1235
1511
  },
1236
1512
  required: ["sources"],
1237
1513
  },
@@ -1435,6 +1711,8 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1435
1711
  download_images: { type: "boolean", description: "When true, enables post-prepare smart asset downloading for image/media datasets." },
1436
1712
  cleaning_options: { type: "object" },
1437
1713
  split_config: { type: "object" },
1714
+ agent_id: { type: "string", description: "Strongly recommended: caller agent identity for lineage/audit." },
1715
+ pipeline_id: { type: "string", description: "Strongly recommended: workflow/pipeline identifier for lineage/audit." },
1438
1716
  },
1439
1717
  required: ["query"],
1440
1718
  },
@@ -1509,6 +1787,14 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1509
1787
  items: { type: "string" },
1510
1788
  description: "Export only these columns (faster for wide datasets).",
1511
1789
  },
1790
+ agent_id: {
1791
+ type: "string",
1792
+ description: "Strongly recommended: caller agent identity for lineage/audit.",
1793
+ },
1794
+ pipeline_id: {
1795
+ type: "string",
1796
+ description: "Strongly recommended: workflow/pipeline identifier for lineage/audit.",
1797
+ },
1512
1798
  },
1513
1799
  required: ["dataset_id"],
1514
1800
  },
@@ -1521,6 +1807,42 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1521
1807
  properties: {},
1522
1808
  },
1523
1809
  },
1810
+ {
1811
+ name: "get_lineage",
1812
+ description: "Get version history and full lineage/provenance for a dataset (sources, steps, inputs/outputs, trigger metadata).",
1813
+ inputSchema: {
1814
+ type: "object",
1815
+ properties: {
1816
+ dataset_id: {
1817
+ type: "string",
1818
+ description: "Dataset ID (base or versioned, e.g. my_dataset or my_dataset_v2).",
1819
+ },
1820
+ },
1821
+ required: ["dataset_id"],
1822
+ },
1823
+ },
1824
+ {
1825
+ name: "diff_lineage_versions",
1826
+ description: "Diff two lineage versions for one dataset and return structured changes (schema, rows, steps, actor identity).",
1827
+ inputSchema: {
1828
+ type: "object",
1829
+ properties: {
1830
+ dataset_id: {
1831
+ type: "string",
1832
+ description: "Dataset ID (base or versioned).",
1833
+ },
1834
+ from_version: {
1835
+ type: "number",
1836
+ description: "Source lineage version number (e.g., 1).",
1837
+ },
1838
+ to_version: {
1839
+ type: "number",
1840
+ description: "Target lineage version number (e.g., 2).",
1841
+ },
1842
+ },
1843
+ required: ["dataset_id", "from_version", "to_version"],
1844
+ },
1845
+ },
1524
1846
  {
1525
1847
  name: "vesper_convert_format",
1526
1848
  description: "Convert a dataset file between formats (CSV, Parquet, JSON, JSONL). Auto-detects input format from extension. Saves output in the same directory with the new extension and registers it in the Vesper registry.",
@@ -1536,6 +1858,14 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1536
1858
  enum: ["csv", "parquet", "json", "jsonl"],
1537
1859
  description: "The desired output format.",
1538
1860
  },
1861
+ agent_id: {
1862
+ type: "string",
1863
+ description: "Strongly recommended: caller agent identity for lineage/audit.",
1864
+ },
1865
+ pipeline_id: {
1866
+ type: "string",
1867
+ description: "Strongly recommended: workflow/pipeline identifier for lineage/audit.",
1868
+ },
1539
1869
  },
1540
1870
  required: ["file_path", "target_format"],
1541
1871
  },
@@ -1572,6 +1902,14 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
1572
1902
  enum: ["blob", "drop"],
1573
1903
  description: "How to handle metadata_json keys beyond max_keys. blob keeps them in metadata_json_blob; drop discards them. Default: blob.",
1574
1904
  },
1905
+ agent_id: {
1906
+ type: "string",
1907
+ description: "Strongly recommended: caller agent identity for lineage/audit.",
1908
+ },
1909
+ pipeline_id: {
1910
+ type: "string",
1911
+ description: "Strongly recommended: workflow/pipeline identifier for lineage/audit.",
1912
+ },
1575
1913
  },
1576
1914
  required: ["file_path"],
1577
1915
  },
@@ -1761,6 +2099,30 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1761
2099
  arxiv_full_text: request.params.arguments?.arxiv_full_text === true,
1762
2100
  github_include_readme: request.params.arguments?.github_include_readme === true,
1763
2101
  });
2102
+ try {
2103
+ appendLineageVersion({
2104
+ datasetIdBase: `webfind_${query || "query"}`,
2105
+ tool: "vesper_web_find",
2106
+ requestArgs: request.params.arguments,
2107
+ output: {
2108
+ rows: Array.isArray(result.results) ? result.results.length : undefined,
2109
+ },
2110
+ sources: Array.isArray(result.results)
2111
+ ? result.results.slice(0, 200).map((r) => ({
2112
+ source: String(r?.source_type || "unknown"),
2113
+ url: typeof r?.source_url === "string" ? r.source_url : undefined,
2114
+ at: typeof r?.collected_at === "string" ? r.collected_at : undefined,
2115
+ }))
2116
+ : [],
2117
+ steps: [
2118
+ { step: "web_find_discover", at: new Date().toISOString(), params: { query, sources, limit } },
2119
+ { step: "web_find_complete", at: new Date().toISOString(), metrics: { result_count: Array.isArray(result.results) ? result.results.length : 0 } },
2120
+ ],
2121
+ });
2122
+ }
2123
+ catch (e) {
2124
+ console.error(`[Lineage] vesper_web_find append failed: ${e?.message || e}`);
2125
+ }
1764
2126
  return {
1765
2127
  content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1766
2128
  };
@@ -1882,6 +2244,30 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1882
2244
  limit: Number(request.params.arguments?.limit || 10),
1883
2245
  publicOnly,
1884
2246
  });
2247
+ try {
2248
+ appendLineageVersion({
2249
+ datasetIdBase: `discover_${source}_${query || "query"}`,
2250
+ tool: "unified_dataset_api.discover",
2251
+ requestArgs: request.params.arguments,
2252
+ output: { rows: Array.isArray(result.results) ? result.results.length : undefined },
2253
+ sources: Array.isArray(result.results)
2254
+ ? result.results.slice(0, 200).map((r) => ({
2255
+ source: String(r?.source || source || "unknown"),
2256
+ url: typeof r?.download_url === "string"
2257
+ ? r.download_url
2258
+ : (typeof r?.metadata_url === "string" ? r.metadata_url : undefined),
2259
+ at: new Date().toISOString(),
2260
+ }))
2261
+ : [],
2262
+ steps: [
2263
+ { step: "discover_requested", at: new Date().toISOString(), params: { query, source, limit: Number(request.params.arguments?.limit || 10), publicOnly } },
2264
+ { step: "discover_completed", at: new Date().toISOString(), metrics: { result_count: Array.isArray(result.results) ? result.results.length : 0 } },
2265
+ ],
2266
+ });
2267
+ }
2268
+ catch (e) {
2269
+ console.error(`[Lineage] unified discover append failed: ${e?.message || e}`);
2270
+ }
1885
2271
  return {
1886
2272
  content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1887
2273
  };
@@ -1914,6 +2300,36 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1914
2300
  catch (e) {
1915
2301
  console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
1916
2302
  }
2303
+ try {
2304
+ const schemaAfter = await getSchemaSnapshot(result.copied_to || result.local_path);
2305
+ const lineage = appendLineageVersion({
2306
+ datasetIdBase: result.dataset_id,
2307
+ tool: "unified_dataset_api.download",
2308
+ requestArgs: request.params.arguments,
2309
+ outputPath: result.copied_to || result.local_path,
2310
+ output: {
2311
+ local_path: result.copied_to || result.local_path,
2312
+ format: path.extname(result.copied_to || result.local_path).replace(".", ""),
2313
+ schema_after: schemaAfter,
2314
+ },
2315
+ sources: [{
2316
+ source: source,
2317
+ url: typeof result.dataset_id === "string" ? result.dataset_id : undefined,
2318
+ at: new Date().toISOString(),
2319
+ }],
2320
+ steps: [
2321
+ { step: "download_requested", at: new Date().toISOString(), params: { datasetId, source, targetDir } },
2322
+ { step: "download_completed", at: new Date().toISOString(), metrics: { local_path: result.copied_to || result.local_path } },
2323
+ ],
2324
+ });
2325
+ try {
2326
+ upsertRegistry(lineage.datasetVersionId, result.copied_to || result.local_path, "completed");
2327
+ }
2328
+ catch { }
2329
+ }
2330
+ catch (e) {
2331
+ console.error(`[Lineage] unified download append failed: ${e?.message || e}`);
2332
+ }
1917
2333
  return {
1918
2334
  content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
1919
2335
  };
@@ -2449,6 +2865,32 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
2449
2865
  };
2450
2866
  }
2451
2867
  jobStatusLastPoll[jobId] = now;
2868
+ if (job.status === "completed") {
2869
+ try {
2870
+ const meta = job.metadata ? JSON.parse(job.metadata) : {};
2871
+ const baseId = String(meta?.datasetId || meta?.dataset_id || meta?.query || job.id);
2872
+ const outPath = typeof job.result_url === "string" ? job.result_url : undefined;
2873
+ appendLineageVersion({
2874
+ datasetIdBase: baseId,
2875
+ tool: `job:${job.type}`,
2876
+ requestArgs: {
2877
+ dataset_id: meta?.datasetId || meta?.dataset_id,
2878
+ query: meta?.query,
2879
+ pipeline_id: meta?.pipeline_id,
2880
+ agent_id: meta?.agent_id,
2881
+ },
2882
+ outputPath: outPath,
2883
+ output: {},
2884
+ steps: [
2885
+ { step: `${job.type}_started`, at: job.created_at, params: meta || {} },
2886
+ { step: `${job.type}_completed`, at: job.updated_at || new Date().toISOString(), metrics: { progress: job.progress } },
2887
+ ],
2888
+ });
2889
+ }
2890
+ catch (e) {
2891
+ console.error(`[Lineage] check_job_status append failed: ${e?.message || e}`);
2892
+ }
2893
+ }
2452
2894
  return {
2453
2895
  content: [{ type: "text", text: formatJobStatus(job) }]
2454
2896
  };
@@ -2567,10 +3009,36 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
2567
3009
  if (!fs.existsSync(outDir))
2568
3010
  fs.mkdirSync(outDir, { recursive: true });
2569
3011
  const outputFile = path.join(outDir, `${safeName}${ext}`);
3012
+ const schemaBefore = await getSchemaSnapshot(sourcePath);
2570
3013
  const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
3014
+ const schemaAfter = await getSchemaSnapshot(result.output_path);
3015
+ const lineage = appendLineageVersion({
3016
+ datasetIdBase: datasetId,
3017
+ tool: "export_dataset",
3018
+ requestArgs: request.params.arguments,
3019
+ outputPath: result.output_path,
3020
+ output: {
3021
+ rows: result.rows,
3022
+ columns: result.columns,
3023
+ format: requestedFormat,
3024
+ size_mb: result.file_size_mb,
3025
+ schema_before: schemaBefore,
3026
+ schema_after: schemaAfter,
3027
+ },
3028
+ steps: [
3029
+ { step: "source_resolved", at: new Date().toISOString(), params: { sourcePath } },
3030
+ { step: "exported", at: new Date().toISOString(), params: { format: requestedFormat, compression }, metrics: { rows: result.rows, columns: result.columns } },
3031
+ ],
3032
+ });
3033
+ try {
3034
+ upsertRegistry(lineage.datasetVersionId, result.output_path, "completed");
3035
+ }
3036
+ catch { }
2571
3037
  // Build rich response
2572
3038
  let msg = `**Export complete**\n`;
2573
3039
  msg += `- **File**: ${result.output_path}\n`;
3040
+ msg += `- **Version**: ${lineage.datasetVersionId}\n`;
3041
+ msg += `- **Lineage**: ${lineage.lineagePath}\n`;
2574
3042
  msg += `- **Format**: ${result.format}${result.compression ? ` (${result.compression})` : ""}\n`;
2575
3043
  msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
2576
3044
  if (result.file_size_mb !== undefined)
@@ -2616,6 +3084,100 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
2616
3084
  content: [{ type: "text", text: `**Vesper Registry** (${entries.length} dataset${entries.length !== 1 ? "s" : ""}):\n\n${lines.join("\n\n")}` }]
2617
3085
  };
2618
3086
  }
3087
+ case "get_lineage": {
3088
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
3089
+ if (!datasetId) {
3090
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
3091
+ }
3092
+ const base = toBaseDatasetId(datasetId);
3093
+ const record = readLineageRecord(base);
3094
+ if (!record.versions || record.versions.length === 0) {
3095
+ return {
3096
+ content: [{ type: "text", text: `No lineage found for '${datasetId}' yet.` }]
3097
+ };
3098
+ }
3099
+ return {
3100
+ content: [{ type: "text", text: JSON.stringify(record, null, 2) }]
3101
+ };
3102
+ }
3103
+ case "diff_lineage_versions": {
3104
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
3105
+ const fromVersion = Number(request.params.arguments?.from_version);
3106
+ const toVersion = Number(request.params.arguments?.to_version);
3107
+ if (!datasetId) {
3108
+ throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
3109
+ }
3110
+ if (!Number.isInteger(fromVersion) || fromVersion <= 0) {
3111
+ throw new McpError(ErrorCode.InvalidParams, "from_version must be a positive integer");
3112
+ }
3113
+ if (!Number.isInteger(toVersion) || toVersion <= 0) {
3114
+ throw new McpError(ErrorCode.InvalidParams, "to_version must be a positive integer");
3115
+ }
3116
+ const base = toBaseDatasetId(datasetId);
3117
+ const record = readLineageRecord(base);
3118
+ const fromV = record.versions.find((v) => v.version === fromVersion);
3119
+ const toV = record.versions.find((v) => v.version === toVersion);
3120
+ if (!fromV || !toV) {
3121
+ return {
3122
+ content: [{ type: "text", text: `ERROR: Could not find both versions in lineage for '${datasetId}'.` }],
3123
+ isError: true,
3124
+ };
3125
+ }
3126
+ const fromSchema = (toV.output?.schema_before && toVersion > fromVersion)
3127
+ ? fromV.output?.schema_after || fromV.output?.schema_before || {}
3128
+ : fromV.output?.schema_after || fromV.output?.schema_before || {};
3129
+ const toSchema = toV.output?.schema_after || toV.output?.schema_before || {};
3130
+ const fromCols = Array.isArray(fromSchema.columns) ? fromSchema.columns.map((c) => String(c)) : [];
3131
+ const toCols = Array.isArray(toSchema.columns) ? toSchema.columns.map((c) => String(c)) : [];
3132
+ const fromDtypes = (fromSchema.dtypes && typeof fromSchema.dtypes === "object") ? fromSchema.dtypes : {};
3133
+ const toDtypes = (toSchema.dtypes && typeof toSchema.dtypes === "object") ? toSchema.dtypes : {};
3134
+ const schemaDiff = diffSchemaMaps(fromCols, toCols, fromDtypes, toDtypes);
3135
+ const fromRows = typeof fromSchema.rows === "number"
3136
+ ? fromSchema.rows
3137
+ : (typeof fromV.output?.rows === "number" ? fromV.output.rows : undefined);
3138
+ const toRows = typeof toSchema.rows === "number"
3139
+ ? toSchema.rows
3140
+ : (typeof toV.output?.rows === "number" ? toV.output.rows : undefined);
3141
+ const fromSteps = new Set((fromV.steps || []).map((s) => String(s.step)));
3142
+ const toSteps = new Set((toV.steps || []).map((s) => String(s.step)));
3143
+ const addedSteps = Array.from(toSteps).filter((s) => !fromSteps.has(s));
3144
+ const removedSteps = Array.from(fromSteps).filter((s) => !toSteps.has(s));
3145
+ const actorDiff = {
3146
+ changed: String(fromV.triggered_by?.agent_id || "") !== String(toV.triggered_by?.agent_id || "") ||
3147
+ String(fromV.triggered_by?.pipeline_id || "") !== String(toV.triggered_by?.pipeline_id || ""),
3148
+ from: {
3149
+ tool: fromV.triggered_by?.tool,
3150
+ agent_id: fromV.triggered_by?.agent_id,
3151
+ pipeline_id: fromV.triggered_by?.pipeline_id,
3152
+ },
3153
+ to: {
3154
+ tool: toV.triggered_by?.tool,
3155
+ agent_id: toV.triggered_by?.agent_id,
3156
+ pipeline_id: toV.triggered_by?.pipeline_id,
3157
+ },
3158
+ };
3159
+ const diffResult = {
3160
+ dataset_id_base: base,
3161
+ from_version: fromVersion,
3162
+ to_version: toVersion,
3163
+ schema_diff: schemaDiff,
3164
+ row_count_delta: {
3165
+ from: fromRows,
3166
+ to: toRows,
3167
+ delta: (typeof fromRows === "number" && typeof toRows === "number") ? (toRows - fromRows) : undefined,
3168
+ },
3169
+ steps_diff: {
3170
+ added: addedSteps,
3171
+ removed: removedSteps,
3172
+ from_steps: Array.from(fromSteps),
3173
+ to_steps: Array.from(toSteps),
3174
+ },
3175
+ actor_diff: actorDiff,
3176
+ };
3177
+ return {
3178
+ content: [{ type: "text", text: JSON.stringify(diffResult, null, 2) }],
3179
+ };
3180
+ }
2619
3181
  case "vesper_convert_format": {
2620
3182
  const filePath = String(request.params.arguments?.file_path || "").trim();
2621
3183
  const targetFormat = String(request.params.arguments?.target_format || "").trim().toLowerCase();
@@ -2643,7 +3205,9 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
2643
3205
  try {
2644
3206
  await ensurePythonModules([{ module: "polars", packageName: "polars" }]);
2645
3207
  const convertScript = path.join(dataRoot, "python", "convert_engine.py");
3208
+ const schemaBefore = await getSchemaSnapshot(filePath);
2646
3209
  const result = await runPythonJson(convertScript, [filePath, outputPath]);
3210
+ const schemaAfter = await getSchemaSnapshot(outputPath);
2647
3211
  if (!result.ok) {
2648
3212
  return {
2649
3213
  content: [{ type: "text", text: `ERROR: Conversion failed: ${result.error}` }],
@@ -2658,9 +3222,32 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
2658
3222
  catch (e) {
2659
3223
  console.error(`[Convert] Registry write failed: ${e?.message || e}`);
2660
3224
  }
3225
+ const lineage = appendLineageVersion({
3226
+ datasetIdBase: datasetId,
3227
+ tool: "vesper_convert_format",
3228
+ requestArgs: request.params.arguments,
3229
+ outputPath,
3230
+ output: {
3231
+ rows: result.rows,
3232
+ columns: result.columns,
3233
+ format: targetFormat,
3234
+ size_mb: result.size_mb,
3235
+ schema_before: schemaBefore,
3236
+ schema_after: schemaAfter,
3237
+ },
3238
+ steps: [
3239
+ { step: "converted", at: new Date().toISOString(), params: { from: inputExt, to: outputExt } },
3240
+ ],
3241
+ });
3242
+ try {
3243
+ upsertRegistry(lineage.datasetVersionId, outputPath, "completed");
3244
+ }
3245
+ catch { }
2661
3246
  let msg = `**Conversion complete**\n`;
2662
3247
  msg += `- **Input**: ${filePath} (${inputExt.slice(1)})\n`;
2663
3248
  msg += `- **Output**: ${result.output_path} (${targetFormat})\n`;
3249
+ msg += `- **Version**: ${lineage.datasetVersionId}\n`;
3250
+ msg += `- **Lineage**: ${lineage.lineagePath}\n`;
2664
3251
  msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
2665
3252
  if (result.size_mb !== undefined)
2666
3253
  msg += `- **Size**: ${result.size_mb} MB\n`;
@@ -2701,7 +3288,9 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
2701
3288
  max_keys: Number.isFinite(maxKeys) ? maxKeys : 200,
2702
3289
  extras_mode: ["blob", "drop"].includes(extrasMode) ? extrasMode : "blob",
2703
3290
  };
3291
+ const schemaBefore = await getSchemaSnapshot(filePath);
2704
3292
  const result = await runPythonJson(scriptPath, [filePath, outputPath, JSON.stringify(options)]);
3293
+ const schemaAfter = await getSchemaSnapshot(outputPath);
2705
3294
  if (!result.ok) {
2706
3295
  return { content: [{ type: "text", text: `ERROR: Schema normalization failed: ${result.error}` }], isError: true };
2707
3296
  }
@@ -2713,9 +3302,31 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
2713
3302
  catch (e) {
2714
3303
  console.error(`[NormalizeSchema] Registry write failed: ${e?.message || e}`);
2715
3304
  }
3305
+ const lineage = appendLineageVersion({
3306
+ datasetIdBase: path.basename(outputPath, path.extname(outputPath)),
3307
+ tool: "vesper_normalize_schema",
3308
+ requestArgs: request.params.arguments,
3309
+ outputPath,
3310
+ output: {
3311
+ rows: result.rows,
3312
+ columns: result.columns,
3313
+ format: outputFormat,
3314
+ schema_before: schemaBefore,
3315
+ schema_after: schemaAfter,
3316
+ },
3317
+ steps: [
3318
+ { step: "schema_normalized", at: new Date().toISOString(), params: options, metrics: { flattened_keys: result.flattened_keys } },
3319
+ ],
3320
+ });
3321
+ try {
3322
+ upsertRegistry(lineage.datasetVersionId, outputPath, "completed");
3323
+ }
3324
+ catch { }
2716
3325
  let msg = `**Schema normalization complete**\n`;
2717
3326
  msg += `- **Input**: ${filePath}\n`;
2718
3327
  msg += `- **Output**: ${result.output_path}\n`;
3328
+ msg += `- **Version**: ${lineage.datasetVersionId}\n`;
3329
+ msg += `- **Lineage**: ${lineage.lineagePath}\n`;
2719
3330
  msg += `- **Rows**: ${result.rows?.toLocaleString?.() ?? result.rows}\n`;
2720
3331
  msg += `- **Columns**: ${result.columns}\n`;
2721
3332
  msg += `- **Flattened keys**: ${result.flattened_keys}\n`;
@@ -2795,10 +3406,35 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
2795
3406
  catch (e) {
2796
3407
  console.error(`[Registry] Failed to register fused dataset ${fusedId}: ${e?.message || e}`);
2797
3408
  }
3409
+ const inputSchemaSnapshots = await Promise.all(resolvedPaths.map((p) => getSchemaSnapshot(p)));
3410
+ const schemaBefore = mergeSchemaSnapshots(inputSchemaSnapshots);
3411
+ const schemaAfter = await getSchemaSnapshot(result.output_path);
3412
+ const lineage = appendLineageVersion({
3413
+ datasetIdBase: fusedId,
3414
+ tool: "fuse_datasets",
3415
+ requestArgs: request.params.arguments,
3416
+ outputPath: result.output_path,
3417
+ output: {
3418
+ rows: result.stats.rows_after,
3419
+ format: outputFormat,
3420
+ schema_before: schemaBefore,
3421
+ schema_after: schemaAfter,
3422
+ },
3423
+ sources: resolvedPaths.map((p) => ({ source: "local", url: p, at: new Date().toISOString() })),
3424
+ steps: [
3425
+ { step: "fused", at: new Date().toISOString(), params: { strategy, dedup, how }, metrics: { rows_before: result.stats.rows_before, rows_after: result.stats.rows_after, duplicates_removed: result.stats.duplicates_removed } },
3426
+ ],
3427
+ });
3428
+ try {
3429
+ upsertRegistry(lineage.datasetVersionId, result.output_path, "completed");
3430
+ }
3431
+ catch { }
2798
3432
  let msg = `Fused ${result.stats.sources_count} sources -> ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).\n`;
2799
3433
  msg += `- Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}\n`;
2800
3434
  msg += `- Null change: ${nullText}\n`;
2801
3435
  msg += `- Output: ${result.output_path}\n`;
3436
+ msg += `- Version: ${lineage.datasetVersionId}\n`;
3437
+ msg += `- Lineage: ${lineage.lineagePath}\n`;
2802
3438
  if (result.preview_path)
2803
3439
  msg += `- Preview: ${result.preview_path}\n`;
2804
3440
  if (result.leakage_report) {
@@ -2966,6 +3602,7 @@ async function main() {
2966
3602
  const isDiscover = args.includes("discover");
2967
3603
  const isDownload = args.includes("download");
2968
3604
  const isExport = args.includes("export");
3605
+ const isStatus = args.includes("status");
2969
3606
  const isConfig = args.includes("config") || args.includes("configure");
2970
3607
  const isSetup = args.includes("--setup") || args.includes("setup");
2971
3608
  const isSilent = args.includes("--silent");
@@ -2992,6 +3629,10 @@ async function main() {
2992
3629
  await runExportCli(args);
2993
3630
  return;
2994
3631
  }
3632
+ if (isStatus) {
3633
+ await runStatusCli(args);
3634
+ return;
3635
+ }
2995
3636
  // If run in explicit setup mode, show setup wizard (do not auto-run on server startup)
2996
3637
  if (isSetup) {
2997
3638
  await runSetupWizard(isSilent);
@@ -3412,6 +4053,173 @@ async function runFuseCli(args) {
3412
4053
  console.log(`Preview saved: ${result.preview_path}`);
3413
4054
  console.log("Next: run vespermcp split/export on the fused dataset");
3414
4055
  }
4056
/**
 * CLI handler for `vespermcp status`.
 *
 * Scans a directory tree for `*.lineage.json` records and prints:
 *   1. a per-dataset summary table (version count, last modified, last actor),
 *   2. a row-count trend per dataset (from `output.schema_after.rows`),
 *   3. dtype-change warnings between adjacent versions,
 *   4. recent errors collected from lineage steps and the error log file.
 *
 * @param {string[]} args - raw CLI args; supports `--dir <path>` (scan root,
 *   defaults to `<dataRoot>/lineage`) and `--max-depth <n>` (recursion depth,
 *   default 4).
 * @returns {Promise<void>} output is written to stdout only.
 */
async function runStatusCli(args) {
  // chalk / cli-table3 are loaded lazily so the MCP server path never pays
  // for them; both are ESM default exports.
  const [{ default: chalk }, { default: Table }] = await Promise.all([
    import("chalk"),
    import("cli-table3"),
  ]);
  // Read the value following a `--flag value` pair, if present.
  const getArgValue = (name) => {
    const idx = args.indexOf(name);
    return idx >= 0 && idx + 1 < args.length ? args[idx + 1] : undefined;
  };
  const defaultDir = path.join(dataRoot, "lineage");
  const scanDir = path.resolve(getArgValue("--dir") || defaultDir);
  const maxDepthRaw = getArgValue("--max-depth");
  const maxDepthParsed = maxDepthRaw !== undefined ? Number(maxDepthRaw) : 4;
  // Reject NaN/negative depths; fall back to the default of 4.
  const maxDepth = Number.isFinite(maxDepthParsed) && maxDepthParsed >= 0
    ? Math.floor(maxDepthParsed)
    : 4;
  if (!fs.existsSync(scanDir)) {
    console.log(`Lineage directory not found: ${scanDir}`);
    console.log("Tip: use --dir <path> to scan a custom location.");
    return;
  }
  const lineageFiles = walkFilesRecursive(scanDir, maxDepth)
    .filter((p) => p.toLowerCase().endsWith(".lineage.json"));
  const records = [];
  for (const filePath of lineageFiles) {
    try {
      const data = JSON.parse(fs.readFileSync(filePath, "utf-8"));
      if (isLineageRecordShape(data)) {
        records.push(data);
      }
    }
    catch {
      // ignore malformed files
    }
  }
  if (records.length === 0) {
    console.log("No lineage records found.");
    console.log("Tip: default scan is ~/.vesper/lineage. Use --dir <path> for project-local lineage files.");
    return;
  }
  // Versions of a record ordered oldest -> newest, without mutating the record.
  const versionsAsc = (record) =>
    [...(record.versions || [])].sort((a, b) => (a.version || 0) - (b.version || 0));
  // Sort a copy once by dataset id and reuse for both display tables
  // (Array#sort mutates, so never sort `records` itself).
  const byDatasetId = [...records].sort((a, b) =>
    (a.dataset_id_base || "").localeCompare(b.dataset_id_base || ""));
  const allVersions = records.flatMap((r) => r.versions || []);
  const sevenDaysAgo = Date.now() - 7 * 24 * 60 * 60 * 1000;
  // Date.parse of a missing timestamp yields NaN, which fails the >= check.
  const operationsLast7d = allVersions
    .filter((v) => Date.parse(v.created_at || "") >= sevenDaysAgo).length;
  console.log(chalk.bold.cyan("\nVesper Lineage Status"));
  console.log(chalk.gray(`Scan dir: ${scanDir}`));
  console.log(chalk.gray(`Max depth: ${maxDepth}`));
  console.log(chalk.gray(`Lineage records: ${records.length}`));
  console.log(chalk.gray(`Total operations: ${allVersions.length} (${operationsLast7d} in last 7 days)\n`));
  // --- 1. Per-dataset summary ---
  const perDatasetTable = new Table({
    head: ["Dataset", "Versions", "Last Modified", "Last Actor"],
    colWidths: [34, 10, 28, 28],
    wordWrap: true,
  });
  for (const record of byDatasetId) {
    const sorted = versionsAsc(record);
    const last = sorted[sorted.length - 1];
    const actor = last?.triggered_by?.agent_id || last?.triggered_by?.pipeline_id || "-";
    perDatasetTable.push([
      record.dataset_id_base,
      String(sorted.length),
      last?.created_at || "-",
      actor,
    ]);
  }
  console.log(chalk.bold("Per-dataset summary"));
  console.log(perDatasetTable.toString());
  // --- 2. Row-count trend ---
  const trendTable = new Table({
    head: ["Dataset", "Rows Trend", "Details"],
    colWidths: [34, 14, 52],
    wordWrap: true,
  });
  for (const record of byDatasetId) {
    // Prefer the post-operation snapshot row count; fall back to output.rows.
    const series = versionsAsc(record)
      .map((v) => ({ version: v.version, rows: v.output?.schema_after?.rows ?? v.output?.rows }))
      .filter((x) => typeof x.rows === "number");
    if (series.length < 2) {
      trendTable.push([record.dataset_id_base, "-", "insufficient row snapshots"]);
      continue;
    }
    const firstRows = series[0].rows;
    const lastRows = series[series.length - 1].rows;
    const trend = lastRows > firstRows
      ? chalk.green("growing")
      : lastRows < firstRows
        ? chalk.yellow("shrinking")
        : "flat";
    const details = series.map((x) => `v${x.version}:${x.rows}`).join(" -> ");
    trendTable.push([record.dataset_id_base, trend, details]);
  }
  console.log(chalk.bold("\nQuality trend (schema_after.rows)"));
  console.log(trendTable.toString());
  // --- 3. Dtype warnings between adjacent versions ---
  const dtypeWarnings = [];
  for (const record of records) {
    const sorted = versionsAsc(record);
    for (let i = 1; i < sorted.length; i++) {
      const prev = sorted[i - 1];
      const curr = sorted[i];
      // A version may only carry a "before" snapshot; use it as a fallback.
      const prevSchema = prev.output?.schema_after || prev.output?.schema_before;
      const currSchema = curr.output?.schema_after || curr.output?.schema_before;
      const diff = diffSchemaMaps(
        prevSchema?.columns || [],
        currSchema?.columns || [],
        prevSchema?.dtypes || {},
        currSchema?.dtypes || {},
      );
      if (diff.changed_dtypes.length > 0) {
        dtypeWarnings.push({
          dataset: record.dataset_id_base,
          from: prev.version,
          to: curr.version,
          // Cap at 4 changes per version pair to keep the table readable.
          changes: diff.changed_dtypes.slice(0, 4).map((d) => `${d.column}:${d.from}->${d.to}`),
        });
      }
    }
  }
  console.log(chalk.bold("\nDtype warnings"));
  if (dtypeWarnings.length === 0) {
    console.log(chalk.green("No dtype changes detected across adjacent versions."));
  }
  else {
    const warningTable = new Table({
      head: ["Dataset", "Versions", "Changed dtypes"],
      colWidths: [34, 14, 52],
      wordWrap: true,
    });
    for (const w of dtypeWarnings.slice(-20)) {
      warningTable.push([
        w.dataset,
        `v${w.from}->v${w.to}`,
        w.changes.join(", "),
      ]);
    }
    console.log(warningTable.toString());
  }
  // --- 4. Recent errors (lineage step errors + error log tail) ---
  const lineageErrors = [];
  for (const record of records) {
    for (const v of record.versions || []) {
      for (const step of v.steps || []) {
        // An error may be recorded under metrics.error or params.error.
        const errMsg = (typeof step.metrics?.error === "string" && step.metrics.error) ||
          (typeof step.params?.error === "string" && step.params.error) ||
          undefined;
        if (errMsg) {
          lineageErrors.push({
            at: step.at,
            where: `${record.dataset_id_base}/v${v.version}:${step.step}`,
            message: errMsg,
          });
        }
      }
    }
  }
  const logErrors = parseErrorLogLines(errorLogPath, 7).map((e) => ({
    at: e.at,
    where: "vesper_errors.log",
    message: e.message,
  }));
  // Show only the 20 most recently appended entries across both sources.
  const recentErrors = [...lineageErrors, ...logErrors].slice(-20);
  console.log(chalk.bold("\nRecent errors"));
  if (recentErrors.length === 0) {
    console.log(chalk.green("No recent lineage-linked errors found."));
  }
  else {
    const errTable = new Table({
      head: ["At", "Where", "Error"],
      colWidths: [28, 36, 46],
      wordWrap: true,
    });
    for (const e of recentErrors) {
      errTable.push([e.at || "-", e.where, e.message]);
    }
    console.log(errTable.toString());
  }
}
3415
4223
  async function runSetupWizard(silent = false) {
3416
4224
  if (!silent && process.stdin.isTTY) {
3417
4225
  const wizardCandidates = [
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vespermcp/mcp-server",
3
- "version": "1.2.27",
3
+ "version": "1.2.28",
4
4
  "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
5
5
  "type": "module",
6
6
  "main": "build/index.js",
@@ -37,6 +37,7 @@
37
37
  "setup": "node build/index.js --setup",
38
38
  "setup:silent": "node build/index.js --setup --silent",
39
39
  "refresh-index": "node scripts/refresh-index.cjs",
40
+ "telemetry:receiver": "tsx telemetry/lineage-receiver.ts",
40
41
  "test": "vitest",
41
42
  "start": "node build/index.js"
42
43
  },
@@ -79,9 +80,13 @@
79
80
  "ajv": "^8.17.1",
80
81
  "ajv-formats": "^3.0.1",
81
82
  "better-sqlite3": "^12.6.0",
83
+ "chalk": "^5.6.2",
84
+ "cli-table3": "^0.6.5",
85
+ "express": "^5.1.0",
82
86
  "inquirer": "^13.3.0",
83
87
  "lodash": "^4.17.21",
84
88
  "pdf-parse": "^2.4.5",
89
+ "pg": "^8.16.3",
85
90
  "uuid": "^13.0.0",
86
91
  "zod": "^4.3.5",
87
92
  "zod-to-json-schema": "^3.25.1"